From patchwork Tue Dec 31 23:45:06 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13924059 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 334451B4245 for ; Tue, 31 Dec 2024 23:45:06 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688708; cv=none; b=bAvfBjltOM3X3F8IodOjNgKNQioY/AliACheFkbC0cqRnKEkUbkadTMoEhw8hBmG1dvkJuUp6ModHl9at7+2QBLXm/BMmdFFjSBEGA4wgeQ36yAhcs36D+bYbIb2RTlZTK2O1ww83qmzXMmBQIa2QW11CGMPdrZ0oPbnng4YAmY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688708; c=relaxed/simple; bh=qzIXa/Rlk9CV2h1vN9erSeGTfGFZNUEwbRH4OpJFyDg=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=KiNFl/Np18KL69cC97W7WNCsbfIPLj5H1qceZ6hjDxYiA0Pq04Io6vLVIwA7rMQvG4whsTMgDrbLB2QcwbxVxAaM5EqbAdZk99BjqvE4m2MbZgE+CTqGXcUlZO1G5wVkwkyZin9iuN5fBHTLZlU0N/F7BaZ3GcHmL5Y4LvIzhAQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=XpQ3MVOm; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="XpQ3MVOm" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B5CF0C4CED2; Tue, 31 Dec 2024 23:45:06 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688706; bh=qzIXa/Rlk9CV2h1vN9erSeGTfGFZNUEwbRH4OpJFyDg=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=XpQ3MVOmD3f2wWRU6xkPyEscaM3ebB1XRTjh7My+gZ4vDO+koWbewvCmbZp6g2ZRT 
XrMgigYepYTsLNrQoTXSp+UGzrcC5HJcSmDwVPmAHdOELnHU9ykLzULLhDIVNagvIV Ba6CN2htzfll8MOulZuD518RtSL2/l30rrI6NKpGAUQBRL5L2RP9tg4F3DIz+dXgPn tWn2OSEBTtgeenXGjF7Fato6fDP698XhUCJXagOBl6dhlwkVpUz3rPHC6DK591P1EV o5lRstsgSnQZ9a0pYJpBPhUJhAxV61hwpaAu0qrVxXpofo9iIaJ/JQd9Yu8UE1rWwo ILNgP5GrAe50w== Date: Tue, 31 Dec 2024 15:45:06 -0800 Subject: [PATCH 01/11] xfs_io: display rtgroup number in verbose fsrefs output From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <173568777884.2709794.12247373333506534863.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Display the rtgroup number in the verbose fsrefcounts output. Signed-off-by: "Darrick J. Wong" --- io/fsrefcounts.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/io/fsrefcounts.c b/io/fsrefcounts.c index ad1f26dfde3ec3..9127f536da382e 100644 --- a/io/fsrefcounts.c +++ b/io/fsrefcounts.c @@ -13,6 +13,7 @@ static cmdinfo_t fsrefcounts_cmd; static dev_t xfs_data_dev; +static dev_t xfs_rt_dev; static void fsrefcounts_help(void) @@ -119,7 +120,7 @@ dump_refcounts_verbose( unsigned long long i; struct xfs_getfsrefs *p; int agno; - off_t agoff, bperag; + off_t agoff, bperag, bperrtg; int boff_w, aoff_w, tot_w, agno_w, own_w; int nr_w, dev_w; char bbuf[40], abuf[40], obuf[40]; @@ -132,6 +133,7 @@ dump_refcounts_verbose( nr_w = 4; tot_w = MINTOT_WIDTH; bperag = (off_t)fsgeo->agblocks * (off_t)fsgeo->blocksize; + bperrtg = bytes_per_rtgroup(fsgeo); sunit = (fsgeo->sunit * fsgeo->blocksize); swidth = (fsgeo->swidth * fsgeo->blocksize); @@ -173,6 +175,13 @@ dump_refcounts_verbose( "(%lld..%lld)", (long long)BTOBBT(agoff), (long long)BTOBBT(agoff + p->fcr_length - 1)); + } else 
if (p->fcr_device == xfs_rt_dev && fsgeo->rgcount > 0) { + agno = p->fcr_physical / bperrtg; + agoff = p->fcr_physical - (agno * bperrtg); + snprintf(abuf, sizeof(abuf), + "(%lld..%lld)", + (long long)BTOBBT(agoff), + (long long)BTOBBT(agoff + p->fcr_length - 1)); } else abuf[0] = 0; aoff_w = max(aoff_w, strlen(abuf)); @@ -231,6 +240,16 @@ dump_refcounts_verbose( snprintf(gbuf, sizeof(gbuf), "%lld", (long long)agno); + } else if (p->fcr_device == xfs_rt_dev && fsgeo->rgcount > 0) { + agno = p->fcr_physical / bperrtg; + agoff = p->fcr_physical - (agno * bperrtg); + snprintf(abuf, sizeof(abuf), + "(%lld..%lld)", + (long long)BTOBBT(agoff), + (long long)BTOBBT(agoff + p->fcr_length - 1)); + snprintf(gbuf, sizeof(gbuf), + "%lld", + (long long)agno); } else { abuf[0] = 0; gbuf[0] = 0; @@ -420,6 +439,7 @@ fsrefcounts_f( } fs = fs_table_lookup(file->name, FS_MOUNT_POINT); xfs_data_dev = fs ? fs->fs_datadev : 0; + xfs_rt_dev = fs ? fs->fs_rtdev : 0; head->fch_count = map_size; do { From patchwork Tue Dec 31 23:45:21 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13924060 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E135E1B0425 for ; Tue, 31 Dec 2024 23:45:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688723; cv=none; b=DlA/QyEYMqBzEaKAT+cVK+tBevV/wmZtkaO+B7zY4BrubRWkwJ6uxwHve7rX9I7WMfvgBdBUa8VCKEXPZ7nSTqqRx0DwUJsNAAPydYGWHErUG8wQyvjUZNf4jd+bm7m/O2nk1HZvYpx4ICiKgicpKPr4X10z7G6dMZsg2O1wqoE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688723; c=relaxed/simple; bh=OblL/dLkuUwoe8muFK3ADuTLxL7PFJ/3UKqXXvCuB4M=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=eFzrWC9lhfkzFesqKF7wG9zIN2wcRRsmLyg7UviL7Wtku0f4efL4LxDd8RaW4FpGnr9MOjiRGG6jJVHuRAb5X1mNnFWpQGgeGMfoGOGKDS7/JEA1BW/BkgdzuP+pDd5ijlXdognj0UjGLYC7PdQn63xmB6n5zyJamaQz5MTK5AQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=GTRwPXk3; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="GTRwPXk3" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 6A663C4CED2; Tue, 31 Dec 2024 23:45:22 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688722; bh=OblL/dLkuUwoe8muFK3ADuTLxL7PFJ/3UKqXXvCuB4M=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=GTRwPXk3N8JmQalXD7xpNwTWQTZakoMUkQrMBhT5jQee6ACVTeva3kYoK7UJuLOh2 Xk77UlawVl1e2l3TcDKI6Huw6lDB8XOaoRigV0/NUdXuRf7HqBQxEGA3wJg+VhOzDn owHCt1Uk67P8zKbHuq5bLUwRJl2kroaL7fuAeLb7l6JDBLngUepxCfvD2NRAzP2R6u 
Zz3iChRfChOMhOchtYJU9bBzUdjMLYQ31O9WmZz5OVgQurxLF7YL7RlkyBxJetiQLq dRu7zPqt0L29o0aqnDNDI458tXA+UXH0USdECG9XO8/FidhZtQ6ywE/7wZ5daUumIb Kz0IjJg8K9MeQ== Date: Tue, 31 Dec 2024 15:45:21 -0800 Subject: [PATCH 02/11] xfs: add an ioctl to map free space into a file From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <173568777900.2709794.4653807334110054797.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a new ioctl to map free physical space into a file, at the same file offset as if the file were a sparse image of the physical device backing the filesystem. The intent here is to use this to prototype a free space defragmentation tool. Signed-off-by: "Darrick J. Wong" --- include/xfs_trace.h | 4 ++ libxfs/libxfs_priv.h | 9 ++++ libxfs/xfs_alloc.c | 88 +++++++++++++++++++++++++++++++++++++++ libxfs/xfs_alloc.h | 3 + libxfs/xfs_fs.h | 14 ++++++ man/man2/ioctl_xfs_map_freesp.2 | 76 ++++++++++++++++++++++++++++++++++ 6 files changed, 194 insertions(+) create mode 100644 man/man2/ioctl_xfs_map_freesp.2 diff --git a/include/xfs_trace.h b/include/xfs_trace.h index 7778366c5e3319..178497c8770d37 100644 --- a/include/xfs_trace.h +++ b/include/xfs_trace.h @@ -26,6 +26,8 @@ #define trace_xfs_alloc_exact_done(a) ((void) 0) #define trace_xfs_alloc_exact_notfound(a) ((void) 0) #define trace_xfs_alloc_exact_error(a) ((void) 0) +#define trace_xfs_alloc_find_freesp(...) ((void) 0) +#define trace_xfs_alloc_find_freesp_done(...) 
((void) 0) #define trace_xfs_alloc_near_first(a) ((void) 0) #define trace_xfs_alloc_near_greater(a) ((void) 0) #define trace_xfs_alloc_near_lesser(a) ((void) 0) @@ -197,6 +199,8 @@ #define trace_xfs_bmap_pre_update(a,b,c,d) ((void) 0) #define trace_xfs_bmap_post_update(a,b,c,d) ((void) 0) +#define trace_xfs_bmapi_freesp(...) ((void) 0) +#define trace_xfs_bmapi_freesp_done(...) ((void) 0) #define trace_xfs_bunmap(a,b,c,d,e) ((void) 0) #define trace_xfs_read_extent(a,b,c,d) ((void) 0) diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h index ac2f64a9a75d82..932a45d734d460 100644 --- a/libxfs/libxfs_priv.h +++ b/libxfs/libxfs_priv.h @@ -446,6 +446,15 @@ xfs_buf_readahead( #define xfs_filestream_new_ag(ip,ag) (0) #define xfs_filestream_select_ag(...) (-ENOSYS) +struct xfs_trans; + +static inline int +xfs_rtallocate_extent(struct xfs_trans *tp, xfs_rtxnum_t start, + xfs_rtxlen_t maxlen, xfs_rtxlen_t *len, xfs_rtxnum_t *rtx) +{ + return -EOPNOTSUPP; +} + #define xfs_trans_inode_buf(tp, bp) ((void) 0) /* quota bits */ diff --git a/libxfs/xfs_alloc.c b/libxfs/xfs_alloc.c index 9aebe7227a6148..e21b694420e309 100644 --- a/libxfs/xfs_alloc.c +++ b/libxfs/xfs_alloc.c @@ -4164,3 +4164,91 @@ xfs_extfree_intent_destroy_cache(void) kmem_cache_destroy(xfs_extfree_item_cache); xfs_extfree_item_cache = NULL; } + +/* + * Find the next chunk of free space in @pag starting at @agbno and going no + * higher than @end_agbno. Set @agbno and @len to whatever free space we find, + * or to @end_agbno if we find no space. 
+ */ +int +xfs_alloc_find_freesp( + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_agblock_t *agbno, + xfs_agblock_t end_agbno, + xfs_extlen_t *len) +{ + struct xfs_mount *mp = pag_mount(pag); + struct xfs_btree_cur *cur; + struct xfs_buf *agf_bp = NULL; + xfs_agblock_t found_agbno; + xfs_extlen_t found_len; + int found; + int error; + + trace_xfs_alloc_find_freesp(pag_group(pag), *agbno, + end_agbno - *agbno); + + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) + return error; + + cur = xfs_bnobt_init_cursor(mp, tp, agf_bp, pag); + + /* Try to find a free extent that starts before here. */ + error = xfs_alloc_lookup_le(cur, *agbno, 0, &found); + if (error) + goto out_cur; + if (found) { + error = xfs_alloc_get_rec(cur, &found_agbno, &found_len, + &found); + if (error) + goto out_cur; + if (XFS_IS_CORRUPT(mp, !found)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto out_cur; + } + + if (found_agbno + found_len > *agbno) + goto found; + } + + /* Examine the next record if free extent not in range. */ + error = xfs_btree_increment(cur, 0, &found); + if (error) + goto out_cur; + if (!found) + goto next_ag; + + error = xfs_alloc_get_rec(cur, &found_agbno, &found_len, &found); + if (error) + goto out_cur; + if (XFS_IS_CORRUPT(mp, !found)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto out_cur; + } + + if (found_agbno >= end_agbno) + goto next_ag; + +found: + /* Found something, so update the mapping. */ + trace_xfs_alloc_find_freesp_done(pag_group(pag), found_agbno, + found_len); + if (found_agbno < *agbno) { + found_len -= *agbno - found_agbno; + found_agbno = *agbno; + } + *len = found_len; + *agbno = found_agbno; + goto out_cur; +next_ag: + /* Found nothing, so advance the cursor beyond the end of the range. 
*/ + *agbno = end_agbno; + *len = 0; +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} diff --git a/libxfs/xfs_alloc.h b/libxfs/xfs_alloc.h index 50ef79a1ed41a1..069077d9ad2f8c 100644 --- a/libxfs/xfs_alloc.h +++ b/libxfs/xfs_alloc.h @@ -286,5 +286,8 @@ void xfs_extfree_intent_destroy_cache(void); xfs_failaddr_t xfs_validate_ag_length(struct xfs_buf *bp, uint32_t seqno, uint32_t length); +int xfs_alloc_find_freesp(struct xfs_trans *tp, struct xfs_perag *pag, + xfs_agblock_t *agbno, xfs_agblock_t end_agbno, + xfs_extlen_t *len); #endif /* __XFS_ALLOC_H__ */ diff --git a/libxfs/xfs_fs.h b/libxfs/xfs_fs.h index 936f719236944f..f4128dbdf3b9a2 100644 --- a/libxfs/xfs_fs.h +++ b/libxfs/xfs_fs.h @@ -1087,6 +1087,19 @@ xfs_getfsrefs_advance( /* fcr_flags values - returned for each non-header segment */ #define FCR_OF_LAST (1U << 0) /* last record in the dataset */ +/* map free space to file */ + +/* + * XFS_IOC_MAP_FREESP maps all the free physical space in the filesystem into + * the file at the same offsets. This ioctl requires CAP_SYS_ADMIN. + */ +struct xfs_map_freesp { + __s64 offset; /* disk address to map, in bytes */ + __s64 len; /* length in bytes */ + __u64 flags; /* must be zero */ + __u64 pad; /* must be zero */ +}; + /* * ioctl commands that are used by Linux filesystems */ @@ -1127,6 +1140,7 @@ xfs_getfsrefs_advance( #define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) #define XFS_IOC_GETFSREFCOUNTS _IOWR('X', 66, struct xfs_getfsrefs_head) +#define XFS_IOC_MAP_FREESP _IOW ('X', 67, struct xfs_map_freesp) /* * ioctl commands that replace IRIX syssgi()'s diff --git a/man/man2/ioctl_xfs_map_freesp.2 b/man/man2/ioctl_xfs_map_freesp.2 new file mode 100644 index 00000000000000..ecd2d08f3fdeee --- /dev/null +++ b/man/man2/ioctl_xfs_map_freesp.2 @@ -0,0 +1,76 @@ +.\" Copyright (c) 2023-2025 Oracle. All rights reserved. 
+.\" +.\" %%%LICENSE_START(GPLv2+_DOC_FULL) +.\" SPDX-License-Identifier: GPL-2.0-or-later +.\" %%%LICENSE_END +.TH IOCTL-XFS-MAP-FREESP 2 2023-11-17 "XFS" +.SH NAME +ioctl_xfs_map_freesp \- map free space into a file +.SH SYNOPSIS +.br +.B #include <xfs/xfs.h> +.PP +.BI "int ioctl(int " fd ", XFS_IOC_MAP_FREESP, struct xfs_map_freesp *" arg ); +.SH DESCRIPTION +Maps free space into the sparse ranges of a regular file. +This ioctl uses +.B struct xfs_map_freesp +to specify the range of free space to be mapped: +.PP +.in +4n +.nf +struct xfs_map_freesp { + __s64 offset; + __s64 len; + __u64 flags; + __u64 pad; +}; +.fi +.in +.PP +.I offset +is the physical disk address, in bytes, of the start of the range to scan. +Each free space extent in this range will be mapped to the file if the +corresponding range of the file is sparse. +.PP +.I len +is the number of bytes in the range to scan. +.PP +.I flags +must be zero; there are no flags defined yet. +.PP +.I pad +must be zero. +.SH RETURN VALUE +On error, \-1 is returned, and +.I errno +is set to indicate the error. +.PP +.SH ERRORS +Error codes can be one of, but are not limited to, the following: +.TP +.B EFAULT +The kernel was not able to copy from the userspace buffer. +.TP +.B EFSBADCRC +Metadata checksum validation failed while performing the query. +.TP +.B EFSCORRUPTED +Metadata corruption was encountered while performing the query. +.TP +.B EINVAL +One of the arguments was not valid, +or the file was not sparse. +.TP +.B EIO +An I/O error was encountered while performing the query. +.TP +.B ENOMEM +There was insufficient memory to perform the query. +.TP +.B ENOSPC +There was insufficient disk space to commit the space mappings. +.SH CONFORMING TO +This API is specific to XFS filesystem on the Linux kernel. +.SH SEE ALSO +.BR ioctl (2) From patchwork Tue Dec 31 23:45:37 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13924061 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 75D8729415 for ; Tue, 31 Dec 2024 23:45:38 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688738; cv=none; b=rqQQjRC8bsFjOHtQ4sGQfgysmmf1HTTPbe0z+3P3b0vSlWo4cCDMPkPQDvAz26Mr2jZ3o9EG/0vSkLe6S58UJ3Dwumm45lAsHiPNoIr1MGLaq2TsM1woXbekZcmqpHqzo1aivw13huDrtJ4cdtk6v+eLYlczfnpx2axUpT46aRw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688738; c=relaxed/simple; bh=wkOu/OjpGNRpHmsdGyDgsuU1iDkLeXtZC/kTTPFr8IE=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=GFZ/iHu9L22NSd9HuZTIuk3td84IjyhB4JSNtnQdBsExOMnBviaG87anPcamH+49xu+f7NZ+9oJzw0TNBf1izAheTdpz+mYZ+IqX846dDd64l5LKoFpxGM3nSLgTmJAniMUU0lH31gkEV1VNEuBqknlj38lCgXLe91aWtI0TfEs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=VM0J0dN9; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="VM0J0dN9" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 042C4C4CED2; Tue, 31 Dec 2024 23:45:37 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688738; bh=wkOu/OjpGNRpHmsdGyDgsuU1iDkLeXtZC/kTTPFr8IE=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=VM0J0dN9tgWVDE6mjrQZ7bdN5DmmU4H6nut+81pxQleWtGzdIcidigcLGEVVIJBRn CLrsnGWcOdjcYB6sbEmEvrt4iJ2MAwWZSppi8Gjejvx1JQsEtqjHFSZxVmc+2iKd+p T2jGUt6UIWVyv+ti1U7jNA3Ta4IxyrMlGYDa/RmpW6a+qzMWavESXV++qPaPQDo62U 
YBnrM6QC5iqutavyKk1o3HCw93v6aBfozJnBAAT4PmrUfODCrcRBXQUSFhUPLwJart 5DJ+O9D6Ov0IjFMQebW/UHUJQMjeAMkB2SbN7f8WSssC8akQ3gTy8jmdlMUNGHwDhr xPfFJ36gJXYCg== Date: Tue, 31 Dec 2024 15:45:37 -0800 Subject: [PATCH 03/11] xfs_io: support using XFS_IOC_MAP_FREESP to map free space From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <173568777915.2709794.7308361997137954140.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a command to call XFS_IOC_MAP_FREESP. This is experimental code to see if we can build a free space defragmenter out of this. Signed-off-by: "Darrick J. Wong" --- io/prealloc.c | 35 +++++++++++++++++++++++++++++++++++ man/man8/xfs_io.8 | 8 +++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/io/prealloc.c b/io/prealloc.c index 8e968c9f2455d5..b7004697a045c5 100644 --- a/io/prealloc.c +++ b/io/prealloc.c @@ -41,6 +41,7 @@ static cmdinfo_t fcollapse_cmd; static cmdinfo_t finsert_cmd; static cmdinfo_t fzero_cmd; static cmdinfo_t funshare_cmd; +static cmdinfo_t fmapfree_cmd; static int offset_length( @@ -377,6 +378,30 @@ funshare_f( return 0; } +static int +fmapfree_f( + int argc, + char **argv) +{ + struct xfs_flock64 segment; + struct xfs_map_freesp args = { }; + + if (!offset_length(argv[1], argv[2], &segment)) { + exitcode = 1; + return 0; + } + + args.offset = segment.l_start; + args.len = segment.l_len; + + if (ioctl(file->fd, XFS_IOC_MAP_FREESP, &args)) { + perror("XFS_IOC_MAP_FREESP"); + exitcode = 1; + return 0; + } + return 0; +} + void prealloc_init(void) { @@ -489,4 +514,14 @@ prealloc_init(void) funshare_cmd.oneline = _("unshares shared blocks within the range"); add_command(&funshare_cmd); + + 
fmapfree_cmd.name = "fmapfree"; + fmapfree_cmd.cfunc = fmapfree_f; + fmapfree_cmd.argmin = 2; + fmapfree_cmd.argmax = 2; + fmapfree_cmd.flags = CMD_NOMAP_OK | CMD_FOREIGN_OK; + fmapfree_cmd.args = _("off len"); + fmapfree_cmd.oneline = + _("maps free space into a file"); + add_command(&fmapfree_cmd); } diff --git a/man/man8/xfs_io.8 b/man/man8/xfs_io.8 index 37ad497c771051..c4d09ce07f597b 100644 --- a/man/man8/xfs_io.8 +++ b/man/man8/xfs_io.8 @@ -519,8 +519,14 @@ .SH FILE I/O COMMANDS .BR fallocate (2) manual page to create the hole by shifting data blocks. .TP +.BI fmapfree " offset length" +Maps free physical space into the file by calling XFS_IOC_MAP_FREESP as +described in the +.BR ioctl_xfs_map_freesp (2) +manual page. +.TP .BI fpunch " offset length" -Punches (de-allocates) blocks in the file by calling fallocate with +Punches (de-allocates) blocks in the file by calling fallocate with the FALLOC_FL_PUNCH_HOLE flag as described in the .BR fallocate (2) manual page. From patchwork Tue Dec 31 23:45:53 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13924062 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1972C29415 for ; Tue, 31 Dec 2024 23:45:53 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688754; cv=none; b=Pj1JxvaPojElbrWUooEkSoAoWJ5mtsgtYVg3ZOZPgtJJAfvku4gcJYg6ElMluHBdzd9Y72z5ZNNY6mUxWhIL/2oKm7UmcIdmo3r7Eky5s1XZdoeHVAk5BXx7DjRXZYF3v0kTS0R8g9fmVcYgFZ3wtvJsIC0emJkdpYEs8GBuu/M= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688754; c=relaxed/simple; bh=vOvBSt6CEDdc7vD9vhQlu0A0DHO4gYI/wqqWBlY5kl8=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=DJdeaX4/n2R64Gc25FCKneG8I4lJnuGeIBni8HtZQ/WtTtL8qCNa7tAfr+BZTlNYpVF/YJU1sxe0JjqhPP+dkBAyx2I4NzjHcBFBtK1gmVaIio/kS1S5zJ3XWKe1LaYLnmbsZzUQEkArSYc8fElpiRXh1R05/TWw6D/NCh8r3/Q= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=X9RABNsS; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="X9RABNsS" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 9CCF7C4CED2; Tue, 31 Dec 2024 23:45:53 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688753; bh=vOvBSt6CEDdc7vD9vhQlu0A0DHO4gYI/wqqWBlY5kl8=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=X9RABNsSmfahNP6afRiw8Ym4CBbUeOc++DP/GGcNtzkBlEOF/Eqohx+j7o7AAbrHS 4dBjLEvrUbNpUD4uTjeNlb3U3A78QCjpX2IzEmLqUQfg71kYZVKIjGwFNNoMCnlliu zxcHJudUCdaX26pkfjnUBTUMp3ZjM79tfow00V/Qwocm0tyu/Q4d4wp2bsFsLRBdKD 
Blro3s08zJiQK2UVSKEXSIXxpQNe9EHptiDOaGWaecBlG8KfarKapIfk/3FAaZozMU AIGwpWjJRz796BI/72eUAZdZpiE0JYxlkLgWwvg/Oeah0fX7WNshbpcFLfjN/k5uzH Ye3G+6H4E45Xg== Date: Tue, 31 Dec 2024 15:45:53 -0800 Subject: [PATCH 04/11] xfs_db: get and put blocks on the AGFL From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <173568777931.2709794.2210883689118756992.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a new xfs_db command to let people add and remove blocks from an AGFL. This isn't really related to rmap btree reconstruction, other than enabling debugging code to mess around with the AGFL to exercise various odd scenarios. Signed-off-by: "Darrick J. Wong" --- db/agfl.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++ libxfs/libxfs_api_defs.h | 4 + man/man8/xfs_db.8 | 11 ++ 3 files changed, 308 insertions(+), 4 deletions(-) diff --git a/db/agfl.c b/db/agfl.c index f0f3f21a64d12c..cf5a2407f6b6d8 100644 --- a/db/agfl.c +++ b/db/agfl.c @@ -15,13 +15,14 @@ #include "output.h" #include "init.h" #include "agfl.h" +#include "libfrog/bitmap.h" static int agfl_bno_size(void *obj, int startoff); static int agfl_f(int argc, char **argv); static void agfl_help(void); static const cmdinfo_t agfl_cmd = - { "agfl", NULL, agfl_f, 0, 1, 1, N_("[agno]"), + { "agfl", NULL, agfl_f, 0, -1, 1, N_("[agno] [-g nr] [-p nr]"), N_("set address to agfl block"), agfl_help }; const field_t agfl_hfld[] = { { @@ -77,10 +78,280 @@ agfl_help(void) " for each allocation group. 
This acts as a reserved pool of space\n" " separate from the general filesystem freespace (not used for user data).\n" "\n" +" -g quantity\tRemove this many blocks from the AGFL.\n" +" -p quantity\tAdd this many blocks to the AGFL.\n" +"\n" )); } +struct dump_info { + struct xfs_perag *pag; + bool leak; +}; + +/* Return blocks freed from the AGFL to the free space btrees. */ +static int +free_grabbed( + uint64_t start, + uint64_t length, + void *data) +{ + struct dump_info *di = data; + struct xfs_perag *pag = di->pag; + struct xfs_mount *mp = pag_mount(pag); + struct xfs_trans *tp; + struct xfs_buf *agf_bp; + int error; + + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); + if (error) + return error; + + error = -libxfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) + goto out_cancel; + + error = -libxfs_free_extent(tp, pag, start, length, &XFS_RMAP_OINFO_AG, + XFS_AG_RESV_AGFL); + if (error) + goto out_cancel; + + return -libxfs_trans_commit(tp); + +out_cancel: + libxfs_trans_cancel(tp); + return error; +} + +/* Report blocks freed from the AGFL. */ +static int +dump_grabbed( + uint64_t start, + uint64_t length, + void *data) +{ + struct dump_info *di = data; + const char *fmt; + + if (length == 1) + fmt = di->leak ? _("agfl %u: leaked agbno %u\n") : + _("agfl %u: removed agbno %u\n"); + else + fmt = di->leak ? _("agfl %u: leaked agbno %u-%u\n") : + _("agfl %u: removed agbno %u-%u\n"); + + printf(fmt, pag_agno(di->pag), (unsigned int)start, + (unsigned int)(start + length - 1)); + return 0; +} + +/* Remove blocks from the AGFL. 
*/ +static int +agfl_get( + struct xfs_perag *pag, + int quantity) +{ + struct dump_info di = { + .pag = pag, + .leak = quantity < 0, + }; + struct xfs_agf *agf; + struct xfs_buf *agf_bp; + struct xfs_trans *tp; + struct bitmap *grabbed; + const unsigned int agfl_size = libxfs_agfl_size(pag_mount(pag)); + unsigned int i; + int error; + + if (!quantity) + return 0; + + if (di.leak) + quantity = -quantity; + quantity = min(quantity, agfl_size); + + error = bitmap_alloc(&grabbed); + if (error) + goto out; + + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, quantity, 0, + 0, &tp); + if (error) + goto out_bitmap; + + error = -libxfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) + goto out_cancel; + + agf = agf_bp->b_addr; + quantity = min(quantity, be32_to_cpu(agf->agf_flcount)); + + for (i = 0; i < quantity; i++) { + xfs_agblock_t agbno; + + error = -libxfs_alloc_get_freelist(pag, tp, agf_bp, &agbno, 0); + if (error) + goto out_cancel; + + if (agbno == NULLAGBLOCK) { + error = ENOSPC; + goto out_cancel; + } + + error = bitmap_set(grabbed, agbno, 1); + if (error) + goto out_cancel; + } + + error = -libxfs_trans_commit(tp); + if (error) + goto out_bitmap; + + error = bitmap_iterate(grabbed, dump_grabbed, &di); + if (error) + goto out_bitmap; + + if (!di.leak) { + error = bitmap_iterate(grabbed, free_grabbed, &di); + if (error) + goto out_bitmap; + } + + bitmap_free(&grabbed); + return 0; + +out_cancel: + libxfs_trans_cancel(tp); +out_bitmap: + bitmap_free(&grabbed); +out: + if (error) + printf(_("agfl %u: %s\n"), pag_agno(pag), strerror(error)); + return error; +} + +/* Add blocks to the AGFL. 
*/ +static int +agfl_put( + struct xfs_perag *pag, + int quantity) +{ + struct xfs_alloc_arg args = { + .mp = pag_mount(pag), + .alignment = 1, + .minlen = 1, + .prod = 1, + .resv = XFS_AG_RESV_AGFL, + .oinfo = XFS_RMAP_OINFO_AG, + }; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf; + struct xfs_trans *tp; + xfs_fsblock_t target; + const unsigned int agfl_size = libxfs_agfl_size(pag_mount(pag)); + unsigned int i; + bool eoag = quantity < 0; + int error; + + if (!quantity) + return 0; + + if (eoag) + quantity = -quantity; + quantity = min(quantity, agfl_size); + + error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, quantity, 0, + 0, &tp); + if (error) + return error; + args.tp = tp; + + error = -libxfs_alloc_read_agf(pag, tp, 0, &args.agbp); + if (error) + goto out_cancel; + + agf = args.agbp->b_addr; + args.maxlen = min(quantity, agfl_size - be32_to_cpu(agf->agf_flcount)); + + if (eoag) + target = xfs_agbno_to_fsb(pag, + be32_to_cpu(agf->agf_length) - 1); + else + target = xfs_agbno_to_fsb(pag, 0); + + error = -libxfs_alloc_read_agfl(pag, tp, &agfl_bp); + if (error) + goto out_cancel; + + error = -libxfs_alloc_vextent_near_bno(&args, target); + if (error) + goto out_cancel; + + if (args.agbno == NULLAGBLOCK) { + error = ENOSPC; + goto out_cancel; + } + + for (i = 0; i < args.len; i++) { + error = -libxfs_alloc_put_freelist(pag, tp, args.agbp, + agfl_bp, args.agbno + i, 0); + if (error) + goto out_cancel; + } + + if (i == 1) + printf(_("agfl %u: added agbno %u\n"), pag_agno(pag), + args.agbno); + else if (i > 1) + printf(_("agfl %u: added agbno %u-%u\n"), pag_agno(pag), + args.agbno, args.agbno + i - 1); + + error = -libxfs_trans_commit(tp); + if (error) + goto out; + + return 0; + +out_cancel: + libxfs_trans_cancel(tp); +out: + if (error) + printf(_("agfl %u: %s\n"), pag_agno(pag), strerror(error)); + return error; +} + +static void +agfl_adjust( + struct xfs_mount *mp, + xfs_agnumber_t agno, + int gblocks, + int pblocks) +{ + struct xfs_perag *pag; + int 
error; + + if (!expert_mode) { + printf(_("AGFL get/put only supported in expert mode.\n")); + exitcode = 1; + return; + } + + pag = libxfs_perag_get(mp, agno); + + error = agfl_get(pag, gblocks); + if (error) + goto out_pag; + + error = agfl_put(pag, pblocks); + +out_pag: + libxfs_perag_put(pag); + if (error) + exitcode = 1; +} + static int agfl_f( int argc, @@ -88,9 +359,25 @@ agfl_f( { xfs_agnumber_t agno; char *p; + int c; + int gblocks = 0, pblocks = 0; - if (argc > 1) { - agno = (xfs_agnumber_t)strtoul(argv[1], &p, 0); + while ((c = getopt(argc, argv, "g:p:")) != -1) { + switch (c) { + case 'g': + gblocks = atoi(optarg); + break; + case 'p': + pblocks = atoi(optarg); + break; + default: + agfl_help(); + return 1; + } + } + + if (argc > optind) { + agno = (xfs_agnumber_t)strtoul(argv[optind], &p, 0); if (*p != '\0' || agno >= mp->m_sb.sb_agcount) { dbprintf(_("bad allocation group number %s\n"), argv[1]); return 0; @@ -98,6 +385,10 @@ agfl_f( cur_agno = agno; } else if (cur_agno == NULLAGNUMBER) cur_agno = 0; + + if (gblocks || pblocks) + agfl_adjust(mp, cur_agno, gblocks, pblocks); + ASSERT(typtab[TYP_AGFL].typnm == TYP_AGFL); set_cur(&typtab[TYP_AGFL], XFS_AG_DADDR(mp, cur_agno, XFS_AGFL_DADDR(mp)), diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h index 530feef2a47db8..76f55515bb41f7 100644 --- a/libxfs/libxfs_api_defs.h +++ b/libxfs/libxfs_api_defs.h @@ -31,8 +31,12 @@ #define xfs_allocbt_maxrecs libxfs_allocbt_maxrecs #define xfs_allocbt_stage_cursor libxfs_allocbt_stage_cursor #define xfs_alloc_fix_freelist libxfs_alloc_fix_freelist +#define xfs_alloc_get_freelist libxfs_alloc_get_freelist #define xfs_alloc_min_freelist libxfs_alloc_min_freelist +#define xfs_alloc_put_freelist libxfs_alloc_put_freelist #define xfs_alloc_read_agf libxfs_alloc_read_agf +#define xfs_alloc_read_agfl libxfs_alloc_read_agfl +#define xfs_alloc_vextent_near_bno libxfs_alloc_vextent_near_bno #define xfs_alloc_vextent_start_ag libxfs_alloc_vextent_start_ag #define 
xfs_ascii_ci_hashname libxfs_ascii_ci_hashname diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8 index 553adff758bc02..4217e9932dd775 100644 --- a/man/man8/xfs_db.8 +++ b/man/man8/xfs_db.8 @@ -182,10 +182,19 @@ .SH COMMANDS .IR agno . If no argument is given, use the current allocation group. .TP -.BI "agfl [" agno ] +.BI "agfl [" agno "] [\-g " " quantity" "] [\-p " quantity ] Set current address to the AGFL block for allocation group .IR agno . If no argument is given, use the current allocation group. +If the +.B -g +option is specified with a positive quantity, remove that many blocks from the +AGFL and put them in the free space btrees. +If the quantity is negative, remove the blocks and leak them. +If the +.B -p +option is specified, add that many blocks to the AGFL. +If the quantity is negative, the blocks are selected from the end of the AG. .TP .BI "agi [" agno ] Set current address to the AGI block for allocation group From patchwork Tue Dec 31 23:46:08 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13924063 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C64801B0425 for ; Tue, 31 Dec 2024 23:46:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688769; cv=none; b=MkO4HuR0xs4mVf9jW2jZFK3TBH6pma+7BsDRnuZ4jzpXBYHTG1nhuv9hVRWuzP18FiMIg9W+b/F1B6BCeHd+u/U50IT+8ulsyW5bWn5IRITtGw2ddJIWoT0XxxAc4TC6BZQ79rc9TRfMH6kYwZbu5PFf93IMRKePzXRQJKUd5y8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688769; c=relaxed/simple; bh=1+NZiWgwyJUOsLUN9Do6nttifOytV9UyLZ8BjPgN8UE=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=mjlN7PpJRWKZIQSZ99Z1mEtHbdTlQ9wzJeJqTs3JGq72azDkam2dbLf7p+meiqeY6l1mLgr4xbgJwp05WjlrfDHI9hKdU+FnIWFRG4mqgqeLnvK73tqWFRSnjTTT36FqiNUmiJ2B+HBZk7Mi2h+8yZvg7ym3yZrSpGhS5afqVFk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=bvtJzsvK; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="bvtJzsvK" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 3A8EFC4CED2; Tue, 31 Dec 2024 23:46:09 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688769; bh=1+NZiWgwyJUOsLUN9Do6nttifOytV9UyLZ8BjPgN8UE=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=bvtJzsvKwKABlVPI/jNUr5ayvys0UrCRnQJHenGvI4apPLjqn5+2xPV4MhObvdlQ+ Vq8etMzEKeS0tlyEvilFPZMQhe3o/LD8GhqVwiLVuA9lW75Fhd1rQbNiZyTYcmLwXt 2ICAKZX0hPuHLI8Pk6Je4pAkEBRQZCu9vqqSp8ULuq4PPD095QrFoIujb72DmSRGQJ 
5rx6qLNDeyQHdDF7k64DlNnylCxkvIcL78MbzROE0oNQT6M7lnvmyzLhPCvzWyYjeT q1j/Rq6T7pC53P1oyU4xhpBjLsw7uCzXZSLwI7Pi1IVtXml4aedofrS9VywIkiSEHn qzIu5ijzwi00g== Date: Tue, 31 Dec 2024 15:46:08 -0800 Subject: [PATCH 05/11] xfs_spaceman: implement clearing free space From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <173568777946.2709794.13757066684932445846.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong First attempt at evacuating all the used blocks from part of a filesystem. Signed-off-by: "Darrick J. Wong" --- libfrog/Makefile | 5 libfrog/clearspace.c | 3294 +++++++++++++++++++++++++++++++++++++++++++++++ libfrog/clearspace.h | 79 + man/man8/xfs_spaceman.8 | 17 spaceman/Makefile | 2 spaceman/clearfree.c | 171 ++ spaceman/init.c | 1 spaceman/space.h | 2 8 files changed, 3570 insertions(+), 1 deletion(-) create mode 100644 libfrog/clearspace.c create mode 100644 libfrog/clearspace.h create mode 100644 spaceman/clearfree.c diff --git a/libfrog/Makefile b/libfrog/Makefile index 4da427789411a6..91c99822002347 100644 --- a/libfrog/Makefile +++ b/libfrog/Makefile @@ -65,6 +65,11 @@ workqueue.h LSRCFILES += gen_crc32table.c +ifeq ($(HAVE_GETFSMAP),yes) +CFILES+=clearspace.c +HFILES+=clearspace.h +endif + LDIRT = gen_crc32table crc32table.h default: ltdepend $(LTLIBRARY) diff --git a/libfrog/clearspace.c b/libfrog/clearspace.c new file mode 100644 index 00000000000000..0b6ef8f1b15015 --- /dev/null +++ b/libfrog/clearspace.c @@ -0,0 +1,3294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2021-2025 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include +#include "paths.h" +#include "fsgeom.h" +#include "logging.h" +#include "bulkstat.h" +#include "bitmap.h" +#include "file_exchange.h" +#include "clearspace.h" +#include "handle.h" + +/* + * Filesystem Space Balloons + * ========================= + * + * NOTE: Due to the evolving identity of this code, the "space_fd" or "space + * file" in the codebase are the same as the balloon file in this introduction. + * The introduction was written much later than the code. + * + * The goal of this code is to create a balloon file that is mapped to a range + * of the physical space that is managed by a filesystem. There are several + * uses envisioned for balloon files: + * + * 1. Defragmenting free space. Once the balloon is created, freeing it leaves + * a large chunk of contiguous free space ready for reallocation. + * + * 2. Shrinking the filesystem. If the balloon is inflated at the end of the + * filesystem, the file can be handed to the shrink code. The shrink code + * can then reduce the filesystem size by the size of the balloon. + * + * 3. Constraining usage of underlying thin provisioning pools. The space + * assigned to a balloon can be DISCARDed, which prevents the filesystem + * from using that space until the balloon is freed. This can be done more + * efficiently with the standard fallocate call, unless the balloon must + * target specific LBA ranges. + * + * Inflating a balloon is performed in five phases: claiming unused space; + * freezing used space; migrating file mappings away from frozen space; moving + * inodes; and rebuilding metadata elsewhere. + * + * Claiming Unused Space + * --------------------- + * + * The first step of inflating a file balloon is to define the range of + * physical space to be added to the balloon and claim as much of the free + * space inside that range as possible. 
Dirty data are flushed to disk and + * the block and inode garbage collectors are run to remove any speculative + * preallocations that might be occupying space in the target range. + * + * Second, the new XFS_IOC_MAP_FREESP ioctl is used to map free space in the + * target range to the balloon file. This step will be repeated after every + * space-clearing step below to capture that cleared space. Concurrent writer + * threads will (hopefully) be allocated space outside the target range. + * + * Freezing Used Space + * ------------------- + * + * The second phase of inflating the balloon is to freeze as much of the + * allocated space within the target range as possible. The purpose of this + * step is to grab a second reference to the used space, thereby preventing it + * from being reused elsewhere. + * + * Freezing of a physical space extent starts by using GETFSMAP to find the + * file owner of the space, and opening the file by handle. The fsmap record + * is used to create a FICLONERANGE request to link the file range into a work + * file. Once the reflink is made, any subsequent writes to any of the owners + * of that space are staged via copy on write. The balloon file prevents the + * copy on write from being staged within the target range. The frozen space + * mapping is moved from the work file to the balloon file, where it remains + * until the balloon file is freed. + * + * If reflink is not supported on the filesystem, used space cannot be frozen. + * This phase is skipped. + * + * Migrating File Mappings + * ----------------------- + * + * Once the balloon file has been populated with as much of the target range as + * possible, it is time to remap file ranges that point to the frozen space. + * + * It is advantageous to remap as many blocks as can be done with as few system + * calls as possible to avoid fragmenting files. 
Furthermore, it is preferable + * to remap heavily shared extents before lightly shared extents to preserve + * reflinks when possible. The new GETFSREFCOUNTS call is used to rank + * physical space extents by size and sharing factor so that the library always + * tries to relocate the highest ranking space extent. + * + * Once a space extent has been selected for relocation, it is reflinked from + * the balloon file into the work file. Next, fallocate is called with the + * FALLOC_FL_UNSHARE_RANGE mode to persist a new copy of the file data and + * update the mapping in the work file. The GETFSMAP call is used to find the + * remaining owners of the target space. For each owner, FIDEDUPERANGE is + * used to change the owner file's mapping to the space in the work file if the + * owner has not been changed. + * + * If the filesystem does not support reflink, FIDEDUPERANGE will not be + * available. Fortunately, there will only be one owner of the frozen space. + * The file range contents are instead copied through the page cache to the + * work file, and EXCHANGE_RANGE is used to swap the mappings if the owner + * file has not been modified. + * + * When the only remaining owner of the space is the balloon file, return to + * the GETFSREFCOUNTS step to find a new target. This phase is complete when + * there are no more targets. + * + * Moving Inodes + * ------------- + * + * NOTE: This part is not written. + * + * When GETFSMAP tells us about an inode chunk, it is necessary to move the + * inodes allocated in that inode chunk to a new chunk. The first step is to + * create a new donor file whose inode record is not in the target range. This + * file must be created in a donor directory. Next, the file contents should + * be cloned, either via FICLONE for regular files or by copying the directory + * entries for directories. The caller must ensure that no programs write to + * the victim inode while this process is ongoing.
+ * + * Finally, the new inode must be mapped into the same points in the directory + * tree as the old inode. For each parent pointer accessible by the file, + * perform a RENAME_EXCHANGE operation to update the directory entry. One + * obvious flaw of this method is that we cannot specify (parent, name, child) + * pairs to renameat, which means that the rename does the wrong thing if + * either directory is updated concurrently. + * + * If parent pointers are not available, this phase could be performed slowly + * by iterating all directories looking for entries of interest and swapping + * them. + * + * It is required that the caller guarantee that other applications cannot + * update the filesystem concurrently. + * + * Rebuilding Metadata + * ------------------- + * + * The final phase identifies filesystem metadata occupying the target range + * and uses the online filesystem repair facility to rebuild the metadata + * structures. Assuming that the balloon file now maps most of the space in + * the target range, the new structures should be located outside of the target + * range. This phase runs in a loop until there is no more metadata to + * relocate or no progress can be made on relocating metadata. + * + * Limitations and Bugs + * -------------------- + * + * - This code must be able to find the owners of a range of physical space. + * If GETFSMAP does not return owner information, this code cannot succeed. + * In other words, reverse mapping must be enabled. + * + * - We cannot freeze EOF blocks because the FICLONERANGE code does not allow + * us to remap an EOF block into the middle of the balloon file. I think we + * actually succeed at reflinking the EOF block into the work file during the + * freeze step, but we need to dedupe/exchange the real owners' mappings + * without waiting for the freeze step. OTOH, we /also/ want to freeze as + * much space as quickly as we can. 
+ * + * - Freeze cannot use FICLONERANGE to reflink unwritten extents into the work + * file because FICLONERANGE ignores unwritten extents. We could create the + * work file as a sparse file and use EXCHANGE_RANGE to swap the unwritten + * extent with the hole, extend EOF to be allocunit aligned, and use + * EXCHANGE_RANGE to move it to the balloon file. That first exchange must + * be careful to sample the owner file's bulkstat data, re-measure the file + * range to confirm that the unwritten extent is still the one we want, and + * only exchange if the owner file has not changed. + * + * - csp_buffercopy seems to hang if pread returns zero bytes read. Do we dare + * use copy_file_range for this instead? + * + * - None of this code knows how to move inodes. Phase 4 is entirely + * speculative fiction rooted in Dave Chinner's earlier implementation. + * + * - Does this work for realtime files? Even for large rt extent sizes? + */ + +/* VFS helpers */ + +/* Remap the file range described by @fcr into fd, or return an errno. */ +static inline int +clonerange(int fd, struct file_clone_range *fcr) +{ + int ret; + + ret = ioctl(fd, FICLONERANGE, fcr); + if (ret) + return errno; + + return 0; +} + +/* + * Deduplicate part of fd into the file range described by fdr. If the + * operation succeeded, we set @same to whether or not we deduped the data and + * return zero. If not, return an errno. + */ +static inline int +deduperange(int fd, struct file_dedupe_range *fdr, bool *same) +{ + struct file_dedupe_range_info *info = &fdr->info[0]; + int ret; + + assert(fdr->dest_count == 1); + *same = false; + + ret = ioctl(fd, FIDEDUPERANGE, fdr); + if (ret) + return errno; + + if (info->status < 0) + return -info->status; + + if (info->status == FILE_DEDUPE_RANGE_DIFFERS) + return 0; + + /* The kernel should never dedupe more than it was asked.
*/ + assert(fdr->src_length >= info->bytes_deduped); + + *same = true; + return 0; +} + +/* Space clearing operation control */ + +#define QUERY_BATCH_SIZE 1024 + +struct clearspace_tgt { + unsigned long long start; + unsigned long long length; + unsigned long long owners; + unsigned long long prio; + unsigned long long evacuated; + bool try_again; +}; + +struct clearspace_req { + struct xfs_fd *xfd; + + /* all the blocks that we've tried to clear */ + struct bitmap *visited; + + /* stat buffer of the open file */ + struct stat statbuf; + struct stat temp_statbuf; + struct stat space_statbuf; + + /* handle to this filesystem */ + void *fshandle; + size_t fshandle_sz; + + /* physical storage that we want to clear */ + unsigned long long start; + unsigned long long length; + dev_t dev; + + /* convenience variable */ + bool realtime:1; + bool use_reflink:1; + bool can_evac_metadata:1; + + /* + * The "space capture" file. Each extent in this file must be mapped + * to the same byte offset as the byte address of the physical space. 
+ */ + int space_fd; + + /* work file for migrating file data */ + int work_fd; + + /* preallocated buffers for queries */ + struct getbmapx *bhead; + struct fsmap_head *mhead; + struct xfs_getfsrefs_head *rhead; + + /* buffer for copying data */ + char *buf; + + /* buffer for deduping data */ + struct file_dedupe_range *fdr; + + /* tracing mask and indent level */ + unsigned int trace_mask; + unsigned int trace_indent; +}; + +static inline bool +csp_is_internal_owner( + const struct clearspace_req *req, + unsigned long long owner) +{ + return owner == req->temp_statbuf.st_ino || + owner == req->space_statbuf.st_ino; +} + +/* Debugging stuff */ + +static const struct csp_errstr { + unsigned int mask; + const char *tag; +} errtags[] = { + { CSP_TRACE_FREEZE, "freeze" }, + { CSP_TRACE_GRAB, "grab" }, + { CSP_TRACE_PREP, "prep" }, + { CSP_TRACE_TARGET, "target" }, + { CSP_TRACE_DEDUPE, "dedupe" }, + { CSP_TRACE_EXCHANGE, "exchange_range" }, + { CSP_TRACE_XREBUILD, "rebuild" }, + { CSP_TRACE_EFFICACY, "efficacy" }, + { CSP_TRACE_SETUP, "setup" }, + { CSP_TRACE_DUMPFILE, "dumpfile" }, + { CSP_TRACE_BITMAP, "bitmap" }, + + /* prioritize high level functions over low level queries for tagging */ + { CSP_TRACE_FSMAP, "fsmap" }, + { CSP_TRACE_FSREFS, "fsrefs" }, + { CSP_TRACE_BMAPX, "bmapx" }, + { CSP_TRACE_FALLOC, "falloc" }, + { CSP_TRACE_STATUS, "status" }, + { 0, NULL }, +}; + +static void +csp_debug( + struct clearspace_req *req, + unsigned int mask, + const char *func, + int line, + const char *format, + ...) 
+{ + const struct csp_errstr *et = errtags; + bool debug = (req->trace_mask & ~CSP_TRACE_STATUS); + int indent = req->trace_indent; + va_list args; + + if ((req->trace_mask & mask) != mask) + return; + + if (debug) { + while (indent > 0) { + fprintf(stderr, " "); + indent--; + } + + for (; et->tag; et++) { + if (et->mask & mask) { + fprintf(stderr, "%s: ", et->tag); + break; + } + } + } + + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + + if (debug) + fprintf(stderr, " (line %d)\n", line); + else + fprintf(stderr, "\n"); + fflush(stderr); +} + +#define trace_freeze(req, format, ...) \ + csp_debug((req), CSP_TRACE_FREEZE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_grabfree(req, format, ...) \ + csp_debug((req), CSP_TRACE_GRAB, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_fsmap(req, format, ...) \ + csp_debug((req), CSP_TRACE_FSMAP, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_fsmap_rec(req, mask, mrec) \ + while (!csp_is_internal_owner((req), (mrec)->fmr_owner)) { \ + csp_debug((req), (mask) | CSP_TRACE_FSMAP, __func__, __LINE__, \ +"fsmap phys 0x%llx owner 0x%llx offset 0x%llx bytecount 0x%llx flags 0x%x", \ + (unsigned long long)(mrec)->fmr_physical, \ + (unsigned long long)(mrec)->fmr_owner, \ + (unsigned long long)(mrec)->fmr_offset, \ + (unsigned long long)(mrec)->fmr_length, \ + (mrec)->fmr_flags); \ + break; \ + } + +#define trace_fsrefs(req, format, ...) \ + csp_debug((req), CSP_TRACE_FSREFS, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_fsrefs_rec(req, mask, rrec) \ + csp_debug((req), (mask) | CSP_TRACE_FSREFS, __func__, __LINE__, \ +"fsref phys 0x%llx bytecount 0x%llx owners %llu flags 0x%x", \ + (unsigned long long)(rrec)->fcr_physical, \ + (unsigned long long)(rrec)->fcr_length, \ + (unsigned long long)(rrec)->fcr_owners, \ + (rrec)->fcr_flags) + +#define trace_bmapx(req, format, ...) 
\ + csp_debug((req), CSP_TRACE_BMAPX, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_bmapx_rec(req, mask, brec) \ + csp_debug((req), (mask) | CSP_TRACE_BMAPX, __func__, __LINE__, \ +"bmapx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%x", \ + (unsigned long long)BBTOB((brec)->bmv_offset), \ + (unsigned long long)BBTOB((brec)->bmv_length), \ + (unsigned long long)BBTOB((brec)->bmv_block), \ + (brec)->bmv_oflags) + +#define trace_prep(req, format, ...) \ + csp_debug((req), CSP_TRACE_PREP, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_target(req, format, ...) \ + csp_debug((req), CSP_TRACE_TARGET, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_dedupe(req, format, ...) \ + csp_debug((req), CSP_TRACE_DEDUPE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_falloc(req, format, ...) \ + csp_debug((req), CSP_TRACE_FALLOC, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_exchange(req, format, ...) \ + csp_debug((req), CSP_TRACE_EXCHANGE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_xrebuild(req, format, ...) \ + csp_debug((req), CSP_TRACE_XREBUILD, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_setup(req, format, ...) \ + csp_debug((req), CSP_TRACE_SETUP, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_status(req, format, ...) \ + csp_debug((req), CSP_TRACE_STATUS, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_dumpfile(req, format, ...) \ + csp_debug((req), CSP_TRACE_DUMPFILE, __func__, __LINE__, format, __VA_ARGS__) + +#define trace_bitmap(req, format, ...) \ + csp_debug((req), CSP_TRACE_BITMAP, __func__, __LINE__, format, __VA_ARGS__) + +/* VFS Iteration helpers */ + +static inline void +start_spacefd_iter(struct clearspace_req *req) +{ + req->trace_indent++; +} + +static inline void +end_spacefd_iter(struct clearspace_req *req) +{ + req->trace_indent--; +} + +/* + * Iterate each hole in the space-capture file. 
Returns 1 if holepos/length + * has been set to a hole; 0 if there aren't any holes left, or -1 for error. + */ +static inline int +spacefd_hole_iter( + const struct clearspace_req *req, + loff_t *holepos, + loff_t *length) +{ + loff_t end = req->start + req->length; + loff_t h; + loff_t d; + + if (*length == 0) + d = req->start; + else + d = *holepos + *length; + if (d >= end) + return 0; + + h = lseek(req->space_fd, d, SEEK_HOLE); + if (h < 0) { + perror(_("finding start of hole in space capture file")); + return h; + } + if (h >= end) + return 0; + + d = lseek(req->space_fd, h, SEEK_DATA); + if (d < 0 && errno == ENXIO) + d = end; + if (d < 0) { + perror(_("finding end of hole in space capture file")); + return d; + } + if (d > end) + d = end; + + *holepos = h; + *length = d - h; + return 1; +} + +/* + * Iterate each written region in the space-capture file. Returns 1 if + * datapos/length have been set to a data area; 0 if there isn't any data left, + * or -1 for error. + */ +static int +spacefd_data_iter( + const struct clearspace_req *req, + loff_t *datapos, + loff_t *length) +{ + loff_t end = req->start + req->length; + loff_t d; + loff_t h; + + if (*length == 0) + h = req->start; + else + h = *datapos + *length; + if (h >= end) + return 0; + + d = lseek(req->space_fd, h, SEEK_DATA); + if (d < 0 && errno == ENXIO) + return 0; + if (d < 0) { + perror(_("finding start of data in space capture file")); + return d; + } + if (d >= end) + return 0; + + h = lseek(req->space_fd, d, SEEK_HOLE); + if (h < 0) { + perror(_("finding end of data in space capture file")); + return h; + } + if (h > end) + h = end; + + *datapos = d; + *length = h - d; + return 1; +} + +/* Filesystem space usage queries */ + +/* Allocate the structures needed for a fsmap query. 
*/ +static void +start_fsmap_query( + struct clearspace_req *req, + dev_t dev, + unsigned long long physical, + unsigned long long length) +{ + struct fsmap_head *mhead = req->mhead; + + assert(req->mhead->fmh_count == 0); + memset(mhead, 0, sizeof(struct fsmap_head)); + mhead->fmh_count = QUERY_BATCH_SIZE; + mhead->fmh_keys[0].fmr_device = dev; + mhead->fmh_keys[0].fmr_physical = physical; + mhead->fmh_keys[1].fmr_device = dev; + mhead->fmh_keys[1].fmr_physical = physical + length; + mhead->fmh_keys[1].fmr_owner = ULLONG_MAX; + mhead->fmh_keys[1].fmr_flags = UINT_MAX; + mhead->fmh_keys[1].fmr_offset = ULLONG_MAX; + + trace_fsmap(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx", + major(dev), minor(dev), + (unsigned long long)physical, + (unsigned long long)length, + (unsigned long long)mhead->fmh_keys[1].fmr_physical); + req->trace_indent++; +} + +static inline void +end_fsmap_query( + struct clearspace_req *req) +{ + req->trace_indent--; + req->mhead->fmh_count = 0; +} + +/* Set us up for the next run_fsmap_query, or return false. */ +static inline bool +advance_fsmap_cursor(struct fsmap_head *mhead) +{ + struct fsmap *mrec; + + mrec = &mhead->fmh_recs[mhead->fmh_entries - 1]; + if (mrec->fmr_flags & FMR_OF_LAST) + return false; + + fsmap_advance(mhead); + return true; +} + +/* + * Run a GETFSMAP query. Returns 1 if there are rows, 0 if there are no rows, + * or -1 for error. 
+ */ +static inline int +run_fsmap_query( + struct clearspace_req *req) +{ + struct fsmap_head *mhead = req->mhead; + int ret; + + if (mhead->fmh_entries > 0 && !advance_fsmap_cursor(mhead)) + return 0; + + trace_fsmap(req, + "ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx", + major(mhead->fmh_keys[0].fmr_device), + minor(mhead->fmh_keys[0].fmr_device), + (unsigned long long)mhead->fmh_keys[0].fmr_physical, + (unsigned long long)mhead->fmh_keys[0].fmr_length, + (unsigned long long)mhead->fmh_keys[1].fmr_physical); + + ret = ioctl(req->xfd->fd, FS_IOC_GETFSMAP, mhead); + if (ret) { + perror(_("querying fsmap data")); + return -1; + } + + if (!(mhead->fmh_oflags & FMH_OF_DEV_T)) { + fprintf(stderr, _("fsmap does not return dev_t.\n")); + return -1; + } + + if (mhead->fmh_entries == 0) + return 0; + + return 1; +} + +#define for_each_fsmap_row(req, rec) \ + for ((rec) = (req)->mhead->fmh_recs; \ + (rec) < (req)->mhead->fmh_recs + (req)->mhead->fmh_entries; \ + (rec)++) + +/* Allocate the structures needed for a fsrefcounts query. 
*/ +static void +start_fsrefs_query( + struct clearspace_req *req, + dev_t dev, + unsigned long long physical, + unsigned long long length) +{ + struct xfs_getfsrefs_head *rhead = req->rhead; + + assert(req->rhead->fch_count == 0); + memset(rhead, 0, sizeof(struct xfs_getfsrefs_head)); + rhead->fch_count = QUERY_BATCH_SIZE; + rhead->fch_keys[0].fcr_device = dev; + rhead->fch_keys[0].fcr_physical = physical; + rhead->fch_keys[1].fcr_device = dev; + rhead->fch_keys[1].fcr_physical = physical + length; + rhead->fch_keys[1].fcr_owners = ULLONG_MAX; + rhead->fch_keys[1].fcr_flags = UINT_MAX; + + trace_fsrefs(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx", + major(dev), minor(dev), + (unsigned long long)physical, + (unsigned long long)length, + (unsigned long long)rhead->fch_keys[1].fcr_physical); + req->trace_indent++; +} + +static inline void +end_fsrefs_query( + struct clearspace_req *req) +{ + req->trace_indent--; + req->rhead->fch_count = 0; +} + +/* Set us up for the next run_fsrefs_query, or return false. */ +static inline bool +advance_fsrefs_query(struct xfs_getfsrefs_head *rhead) +{ + struct xfs_getfsrefs *rrec; + + rrec = &rhead->fch_recs[rhead->fch_entries - 1]; + if (rrec->fcr_flags & FCR_OF_LAST) + return false; + + xfs_getfsrefs_advance(rhead); + return true; +} + +/* + * Run a GETFSREFCOUNTS query. Returns 1 if there are rows, 0 if there are + * no rows, or -1 for error. 
+ */ +static inline int +run_fsrefs_query( + struct clearspace_req *req) +{ + struct xfs_getfsrefs_head *rhead = req->rhead; + int ret; + + if (rhead->fch_entries > 0 && !advance_fsrefs_query(rhead)) + return 0; + + trace_fsrefs(req, + "ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx", + major(rhead->fch_keys[0].fcr_device), + minor(rhead->fch_keys[0].fcr_device), + (unsigned long long)rhead->fch_keys[0].fcr_physical, + (unsigned long long)rhead->fch_keys[0].fcr_length, + (unsigned long long)rhead->fch_keys[1].fcr_physical); + + ret = ioctl(req->xfd->fd, XFS_IOC_GETFSREFCOUNTS, rhead); + if (ret) { + perror(_("querying refcount data")); + return -1; + } + + if (!(rhead->fch_oflags & FCH_OF_DEV_T)) { + fprintf(stderr, _("fsrefcounts does not return dev_t.\n")); + return -1; + } + + if (rhead->fch_entries == 0) + return 0; + + return 1; +} + +#define for_each_fsref_row(req, rec) \ + for ((rec) = (req)->rhead->fch_recs; \ + (rec) < (req)->rhead->fch_recs + (req)->rhead->fch_entries; \ + (rec)++) + +/* Allocate the structures needed for a bmapx query. */ +static void +start_bmapx_query( + struct clearspace_req *req, + unsigned int fork, + unsigned long long pos, + unsigned long long length) +{ + struct getbmapx *bhead = req->bhead; + + assert(fork == BMV_IF_ATTRFORK || fork == BMV_IF_COWFORK || !fork); + assert(req->bhead->bmv_count == 0); + + memset(bhead, 0, sizeof(struct getbmapx)); + bhead[0].bmv_offset = BTOBB(pos); + bhead[0].bmv_length = BTOBB(length); + bhead[0].bmv_count = QUERY_BATCH_SIZE + 1; + bhead[0].bmv_iflags = fork | BMV_IF_PREALLOC | BMV_IF_DELALLOC; + + trace_bmapx(req, "%s pos 0x%llx bytecount 0x%llx", + fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? 
"attr" : "data", + (unsigned long long)BBTOB(bhead[0].bmv_offset), + (unsigned long long)BBTOB(bhead[0].bmv_length)); + req->trace_indent++; +} + +static inline void +end_bmapx_query( + struct clearspace_req *req) +{ + req->trace_indent--; + req->bhead->bmv_count = 0; +} + +/* Set us up for the next run_bmapx_query, or return false. */ +static inline bool +advance_bmapx_query(struct getbmapx *bhead) +{ + struct getbmapx *brec; + unsigned long long next_offset; + unsigned long long end = bhead->bmv_offset + bhead->bmv_length; + + brec = &bhead[bhead->bmv_entries]; + if (brec->bmv_oflags & BMV_OF_LAST) + return false; + + next_offset = brec->bmv_offset + brec->bmv_length; + if (next_offset > end) + return false; + + bhead->bmv_offset = next_offset; + bhead->bmv_length = end - next_offset; + return true; +} + +/* + * Run a GETBMAPX query. Returns 1 if there are rows, 0 if there are no rows, + * or -1 for error. + */ +static inline int +run_bmapx_query( + struct clearspace_req *req, + int fd) +{ + struct getbmapx *bhead = req->bhead; + unsigned int fork; + int ret; + + if (bhead->bmv_entries > 0 && !advance_bmapx_query(bhead)) + return 0; + + fork = bhead[0].bmv_iflags & (BMV_IF_COWFORK | BMV_IF_ATTRFORK); + trace_bmapx(req, "ioctl %s pos 0x%llx bytecount 0x%llx", + fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? 
"attr" : "data", + (unsigned long long)BBTOB(bhead[0].bmv_offset), + (unsigned long long)BBTOB(bhead[0].bmv_length)); + + ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead); + if (ret) { + perror(_("querying bmapx data")); + return -1; + } + + if (bhead->bmv_entries == 0) + return 0; + + return 1; +} + +#define for_each_bmapx_row(req, rec) \ + for ((rec) = (req)->bhead + 1; \ + (rec) < (req)->bhead + 1 + (req)->bhead->bmv_entries; \ + (rec)++) + +static inline void +csp_dump_bmapx_row( + struct clearspace_req *req, + unsigned int nr, + const struct getbmapx *brec) +{ + if (brec->bmv_block == -1) { + trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx hole", + nr, + (unsigned long long)BBTOB(brec->bmv_offset), + (unsigned long long)BBTOB(brec->bmv_length)); + return; + } + + if (brec->bmv_block == -2) { + trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx delalloc", + nr, + (unsigned long long)BBTOB(brec->bmv_offset), + (unsigned long long)BBTOB(brec->bmv_length)); + return; + } + + trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx phys 0x%llx flags 0x%x", + nr, + (unsigned long long)BBTOB(brec->bmv_offset), + (unsigned long long)BBTOB(brec->bmv_length), + (unsigned long long)BBTOB(brec->bmv_block), + brec->bmv_oflags); +} + +static inline void +csp_dump_bmapx( + struct clearspace_req *req, + int fd, + unsigned int indent, + const char *tag) +{ + unsigned int nr; + int ret; + + trace_dumpfile(req, "DUMP BMAP OF DATA FORK %s", tag); + start_bmapx_query(req, 0, req->start, req->length); + nr = 0; + while ((ret = run_bmapx_query(req, fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + csp_dump_bmapx_row(req, nr++, brec); + if (nr > 10) + goto dump_cow; + } + } + +dump_cow: + end_bmapx_query(req); + trace_dumpfile(req, "DUMP BMAP OF COW FORK %s", tag); + start_bmapx_query(req, BMV_IF_COWFORK, req->start, req->length); + nr = 0; + while ((ret = run_bmapx_query(req, fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + csp_dump_bmapx_row(req, 
nr++, brec); + if (nr > 10) + goto dump_attr; + } + } + +dump_attr: + end_bmapx_query(req); + trace_dumpfile(req, "DUMP BMAP OF ATTR FORK %s", tag); + start_bmapx_query(req, BMV_IF_ATTRFORK, req->start, req->length); + nr = 0; + while ((ret = run_bmapx_query(req, fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + csp_dump_bmapx_row(req, nr++, brec); + if (nr > 10) + goto stop; + } + } + +stop: + end_bmapx_query(req); + trace_dumpfile(req, "DONE DUMPING %s", tag); +} + +/* Return the first bmapx for the given file range. */ +static int +bmapx_one( + struct clearspace_req *req, + int fd, + unsigned long long pos, + unsigned long long length, + struct getbmapx *brec) +{ + struct getbmapx bhead[2]; + int ret; + + memset(bhead, 0, sizeof(struct getbmapx) * 2); + bhead[0].bmv_offset = BTOBB(pos); + bhead[0].bmv_length = BTOBB(length); + bhead[0].bmv_count = 2; + bhead[0].bmv_iflags = BMV_IF_PREALLOC | BMV_IF_DELALLOC; + + ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead); + if (ret) { + perror(_("simple bmapx query")); + return -1; + } + + if (bhead->bmv_entries > 0) { + memcpy(brec, &bhead[1], sizeof(struct getbmapx)); + return 0; + } + + memset(brec, 0, sizeof(struct getbmapx)); + brec->bmv_offset = pos; + brec->bmv_block = -1; /* hole */ + brec->bmv_length = length; + return 0; +} + +/* Constrain space map records. 
*/ +static void +__trim_fsmap( + uint64_t start, + uint64_t length, + struct fsmap *fsmap) +{ + unsigned long long delta, end; + bool need_off; + + need_off = !(fsmap->fmr_flags & (FMR_OF_EXTENT_MAP | + FMR_OF_SPECIAL_OWNER)); + + if (fsmap->fmr_physical < start) { + delta = start - fsmap->fmr_physical; + fsmap->fmr_physical = start; + fsmap->fmr_length -= delta; + if (need_off) + fsmap->fmr_offset += delta; + } + + end = fsmap->fmr_physical + fsmap->fmr_length; + if (end > start + length) { + delta = end - (start + length); + fsmap->fmr_length -= delta; + } +} + +static inline void +trim_target_fsmap(const struct clearspace_tgt *tgt, struct fsmap *fsmap) +{ + return __trim_fsmap(tgt->start, tgt->length, fsmap); +} + +static inline void +trim_request_fsmap(const struct clearspace_req *req, struct fsmap *fsmap) +{ + return __trim_fsmap(req->start, req->length, fsmap); +} + +/* Actual space clearing code */ + +/* + * Map all the free space in the region that we're clearing to the space + * catcher file. + */ +static int +csp_grab_free_space( + struct clearspace_req *req) +{ + struct xfs_map_freesp args = { + .offset = req->start, + .len = req->length, + }; + int ret; + + trace_grabfree(req, "start 0x%llx length 0x%llx", + (unsigned long long)req->start, + (unsigned long long)req->length); + + ret = ioctl(req->space_fd, XFS_IOC_MAP_FREESP, &args); + if (ret) { + perror(_("map free space to space capture file")); + return -1; + } + + return 0; +} + +/* + * Rank a refcount record. We prefer to tackle highly shared and longer + * extents first. + */ +static inline unsigned long long +csp_space_prio( + const struct xfs_fsop_geom *g, + const struct xfs_getfsrefs *p) +{ + unsigned long long blocks = p->fcr_length / g->blocksize; + unsigned long long ret = blocks * p->fcr_owners; + + if (ret < blocks || ret < p->fcr_owners) + return UINT64_MAX; + return ret; +} + +/* Make the current refcount record the clearing target if desirable. 
*/ +static void +csp_adjust_target( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct xfs_getfsrefs *rec, + unsigned long long prio) +{ + if (prio < target->prio) + return; + if (prio == target->prio && + rec->fcr_length <= target->length) + return; + + /* Ignore results that go beyond the end of what we wanted. */ + if (rec->fcr_physical >= req->start + req->length) + return; + + /* Ignore regions that we already tried to clear. */ + if (bitmap_test(req->visited, rec->fcr_physical, rec->fcr_length)) + return; + + trace_target(req, + "set target, prio 0x%llx -> 0x%llx phys 0x%llx bytecount 0x%llx", + target->prio, prio, + (unsigned long long)rec->fcr_physical, + (unsigned long long)rec->fcr_length); + + target->start = rec->fcr_physical; + target->length = rec->fcr_length; + target->owners = rec->fcr_owners; + target->prio = prio; +} + +/* + * Decide if this refcount record maps to extents that are sufficiently + * interesting to target. + */ +static int +csp_evaluate_refcount( + struct clearspace_req *req, + const struct xfs_getfsrefs *rrec, + struct clearspace_tgt *target) +{ + const struct xfs_fsop_geom *fsgeom = &req->xfd->fsgeom; + unsigned long long prio = csp_space_prio(fsgeom, rrec); + int ret; + + if (rrec->fcr_device != req->dev) + return 0; + + if (prio < target->prio) + return 0; + + /* + * XFS only supports sharing data blocks. If there's more than one + * owner, we know that we can easily move the blocks. + */ + if (rrec->fcr_owners > 1) { + csp_adjust_target(req, target, rrec, prio); + return 0; + } + + /* + * Otherwise, this extent has single owners. Walk the fsmap records to + * figure out if they're movable or not. 
+ */ + start_fsmap_query(req, rrec->fcr_device, rrec->fcr_physical, + rrec->fcr_length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + uint64_t next_phys = 0; + + for_each_fsmap_row(req, mrec) { + struct xfs_getfsrefs fake_rec = { }; + + trace_fsmap_rec(req, CSP_TRACE_TARGET, mrec); + + if (mrec->fmr_device != rrec->fcr_device) + continue; + if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER) + continue; + if (csp_is_internal_owner(req, mrec->fmr_owner)) + continue; + + /* + * If the space has become shared since the fsrefs + * query, just skip this record. We might come back to + * it in a later iteration. + */ + if (mrec->fmr_physical < next_phys) + continue; + + /* Fake enough of a fsrefs to calculate the priority. */ + fake_rec.fcr_physical = mrec->fmr_physical; + fake_rec.fcr_length = mrec->fmr_length; + fake_rec.fcr_owners = 1; + prio = csp_space_prio(fsgeom, &fake_rec); + + /* Target unwritten extents first; they're cheap. */ + if (mrec->fmr_flags & FMR_OF_PREALLOC) + prio |= (1ULL << 63); + + csp_adjust_target(req, target, &fake_rec, prio); + + next_phys = mrec->fmr_physical + mrec->fmr_length; + } + } + end_fsmap_query(req); + + return ret; +} + +/* + * Given a range of storage to search, find the most appealing target for space + * clearing. If nothing suitable is found, the target will be zeroed. + */ +static int +csp_find_target( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + int ret; + + memset(target, 0, sizeof(struct clearspace_tgt)); + + start_fsrefs_query(req, req->dev, req->start, req->length); + while ((ret = run_fsrefs_query(req)) > 0) { + struct xfs_getfsrefs *rrec; + + for_each_fsref_row(req, rrec) { + trace_fsrefs_rec(req, CSP_TRACE_TARGET, rrec); + ret = csp_evaluate_refcount(req, rrec, target); + if (ret) { + end_fsrefs_query(req); + return ret; + } + } + } + end_fsrefs_query(req); + + if (target->length != 0) { + /* + * Mark this extent visited so that we won't try again this + * round. 
+ */ + trace_bitmap(req, "set filedata start 0x%llx length 0x%llx", + target->start, target->length); + ret = bitmap_set(req->visited, target->start, target->length); + if (ret) { + perror(_("marking file extent visited")); + return ret; + } + } + + return 0; +} + +/* Try to evacuate blocks by using online repair. */ +static int +csp_evac_file_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec, + int fd, + const struct xfs_bulkstat *bulkstat) +{ + struct xfs_scrub_metadata scrub = { + .sm_type = XFS_SCRUB_TYPE_PROBE, + .sm_flags = XFS_SCRUB_IFLAG_REPAIR | + XFS_SCRUB_IFLAG_FORCE_REBUILD, + }; + struct xfs_fd *xfd = req->xfd; + int ret; + + trace_xrebuild(req, + "ino 0x%llx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%llx", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_physical, + (unsigned long long)mrec->fmr_length, + (unsigned long long)mrec->fmr_flags); + + if (fd == -1) { + scrub.sm_ino = mrec->fmr_owner; + scrub.sm_gen = bulkstat->bs_gen; + fd = xfd->fd; + } + + if (mrec->fmr_flags & FMR_OF_ATTR_FORK) { + if (mrec->fmr_flags & FMR_OF_EXTENT_MAP) + scrub.sm_type = XFS_SCRUB_TYPE_BMBTA; + else + scrub.sm_type = XFS_SCRUB_TYPE_XATTR; + } else if (mrec->fmr_flags & FMR_OF_EXTENT_MAP) { + scrub.sm_type = XFS_SCRUB_TYPE_BMBTD; + } else if (S_ISLNK(bulkstat->bs_mode)) { + scrub.sm_type = XFS_SCRUB_TYPE_SYMLINK; + } else if (S_ISDIR(bulkstat->bs_mode)) { + scrub.sm_type = XFS_SCRUB_TYPE_DIR; + } + + if (scrub.sm_type == XFS_SCRUB_TYPE_PROBE) + return 0; + + trace_xrebuild(req, "ino 0x%llx gen 0x%x type %u", + (unsigned long long)mrec->fmr_owner, + (unsigned int)bulkstat->bs_gen, + (unsigned int)scrub.sm_type); + + ret = ioctl(fd, XFS_IOC_SCRUB_METADATA, &scrub); + if (ret) { + fprintf(stderr, + _("evacuating inode 0x%llx metadata type %u: %s\n"), + (unsigned long long)mrec->fmr_owner, + scrub.sm_type, strerror(errno)); + return -1; + } + + 
target->evacuated++; + return 0; +} + +/* + * Open an inode via handle. Returns a file descriptor, -2 if the file is + * gone, or -1 on error. + */ +static int +csp_open_by_handle( + struct clearspace_req *req, + int oflags, + uint64_t ino, + uint32_t gen) +{ + struct xfs_handle handle = { }; + struct xfs_fsop_handlereq hreq = { + .oflags = oflags | O_NOATIME | O_NOFOLLOW | + O_NOCTTY | O_LARGEFILE, + .ihandle = &handle, + .ihandlen = sizeof(handle), + }; + int ret; + + memcpy(&handle.ha_fsid, req->fshandle, sizeof(handle.ha_fsid)); + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_ino = ino; + handle.ha_fid.fid_gen = gen; + + /* + * Since we extracted the fshandle from the open file instead of using + * path_to_fshandle, the fsid cache doesn't know about the fshandle. + * Construct the open by handle request manually. + */ + ret = ioctl(req->xfd->fd, XFS_IOC_OPEN_BY_HANDLE, &hreq); + if (ret < 0) { + if (errno == ENOENT || errno == EINVAL) + return -2; + + fprintf(stderr, _("open inode 0x%llx: %s\n"), + (unsigned long long)ino, + strerror(errno)); + return -1; + } + + return ret; +} + +/* + * Open a file for evacuation. Returns a positive errno on error; a fd in @fd + * if the caller is supposed to do something; or @fd == -1 if there's nothing + * further to do. + */ +static int +csp_evac_open( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec, + struct xfs_bulkstat *bulkstat, + int oflags, + int *fd) +{ + struct xfs_bulkstat __bs; + int target_fd; + int ret; + + *fd = -1; + + if (csp_is_internal_owner(req, mrec->fmr_owner) || + (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)) + goto nothing_to_do; + + if (bulkstat == NULL) + bulkstat = &__bs; + + /* + * Snapshot this file so that we can perform a fresh-only exchange. + * For other types of files we just skip to the evacuation step. 
+ */ + ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, bulkstat); + if (ret) { + if (ret == ENOENT || ret == EINVAL) + goto nothing_to_do; + + fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"), + (unsigned long long)mrec->fmr_owner, + strerror(ret)); + return ret; + } + + /* + * If we get stats for a different inode, the file may have been freed + * out from under us and there's nothing to do. + */ + if (bulkstat->bs_ino != mrec->fmr_owner) + goto nothing_to_do; + + /* + * We're only allowed to open regular files and directories via handle + * so jump to online rebuild for all other file types. + */ + if (!S_ISREG(bulkstat->bs_mode) && !S_ISDIR(bulkstat->bs_mode)) + return csp_evac_file_metadata(req, target, mrec, -1, + bulkstat); + + if (S_ISDIR(bulkstat->bs_mode)) + oflags = O_RDONLY; + + target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner, + bulkstat->bs_gen); + if (target_fd == -2) + goto nothing_to_do; + if (target_fd < 0) + return -target_fd; + + /* + * Exchange only works for regular file data blocks. If that isn't the + * case, our only recourse is online rebuild. + */ + if (S_ISDIR(bulkstat->bs_mode) || + (mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) { + int ret2; + + ret = csp_evac_file_metadata(req, target, mrec, target_fd, + bulkstat); + ret2 = close(target_fd); + if (!ret && ret2) + ret = ret2; + return ret; + } + + *fd = target_fd; + return 0; + +nothing_to_do: + target->try_again = true; + return 0; +} + +/* Unshare the space in the work file that we're using for deduplication. 
*/
/*
 * Unshare [@start, @start + @length) of the work file with
 * FALLOC_FL_UNSHARE_RANGE, fsync it, and then verify through the block map
 * that none of the work file's extents landed inside the range being cleared.
 *
 * Returns 0 on success, or nonzero if fallocate/fsync failed or an extent of
 * the work file overlaps the clearing range.
 */
static int
csp_unshare_workfile(
	struct clearspace_req	*req,
	unsigned long long	start,
	unsigned long long	length)
{
	int			ret;

	trace_falloc(req, "funshare workfd pos 0x%llx bytecount 0x%llx",
			start, length);

	ret = fallocate(req->work_fd, FALLOC_FL_UNSHARE_RANGE, start, length);
	if (ret) {
		perror(_("unsharing work file"));
		return ret;
	}

	ret = fsync(req->work_fd);
	if (ret) {
		perror(_("syncing work file"));
		return ret;
	}

	/* Make sure we didn't get any space within the clearing range. */
	start_bmapx_query(req, 0, start, length);
	while ((ret = run_bmapx_query(req, req->work_fd)) > 0) {
		struct getbmapx	*brec;

		for_each_bmapx_row(req, brec) {
			unsigned long long	p, l;

			trace_bmapx_rec(req, CSP_TRACE_FALLOC, brec);

			/*
			 * Negative block numbers mark holes (-1) and delalloc
			 * reservations (-2), which occupy no physical space.
			 */
			if (brec->bmv_block < 0)
				continue;

			p = BBTOB(brec->bmv_block);
			l = BBTOB(brec->bmv_length);

			/*
			 * [p, p + l) only intrudes if it ends after the start
			 * of the clearing range and begins before its end; an
			 * extent that ends exactly at req->start is fine.
			 */
			if (p + l <= req->start || p >= req->start + req->length)
				continue;

			trace_prep(req,
	"workfd has extent inside clearing range, phys 0x%llx fsbcount 0x%llx",
					p, l);
			end_bmapx_query(req);
			return -1;
		}
	}
	end_bmapx_query(req);

	return 0;
}

/* Try to deduplicate every block in the fdr request, if we can.
*/ +static int +csp_evac_dedupe_loop( + struct clearspace_req *req, + struct clearspace_tgt *target, + unsigned long long ino, + int max_reqlen) +{ + struct file_dedupe_range *fdr = req->fdr; + struct file_dedupe_range_info *info = &fdr->info[0]; + loff_t last_unshare_off = -1; + int ret; + + while (fdr->src_length > 0) { + struct getbmapx brec; + bool same; + unsigned int old_reqlen = fdr->src_length; + + if (max_reqlen && fdr->src_length > max_reqlen) + fdr->src_length = max_reqlen; + + trace_dedupe(req, "ino 0x%llx pos 0x%llx bytecount 0x%llx", + ino, + (unsigned long long)info->dest_offset, + (unsigned long long)fdr->src_length); + + ret = bmapx_one(req, req->work_fd, fdr->src_offset, + fdr->src_length, &brec); + if (ret) + return ret; + + trace_dedupe(req, "workfd pos 0x%llx phys 0x%llx", + (unsigned long long)fdr->src_offset, + (unsigned long long)BBTOB(brec.bmv_block)); + + ret = deduperange(req->work_fd, fdr, &same); + if (ret == ENOSPC && last_unshare_off < fdr->src_offset) { + req->trace_indent++; + trace_dedupe(req, "funshare workfd at phys 0x%llx", + (unsigned long long)fdr->src_offset); + /* + * If we ran out of space, it's possible that we have + * reached the maximum sharing factor of the blocks in + * the work file. Try unsharing the range of the work + * file to get a singly-owned range and loop again. + */ + ret = csp_unshare_workfile(req, fdr->src_offset, + fdr->src_length); + req->trace_indent--; + if (ret) + return ret; + + ret = fsync(req->work_fd); + if (ret) { + perror(_("sync after unshare work file")); + return ret; + } + + last_unshare_off = fdr->src_offset; + fdr->src_length = old_reqlen; + continue; + } + if (ret == EINVAL) { + /* + * If we can't dedupe get the block, it's possible that + * src_fd was punched or truncated out from under us. + * Treat this the same way we would if the contents + * didn't match. 
+ */ + trace_dedupe(req, "cannot evac space, moving on", 0); + same = false; + ret = 0; + } + if (ret) { + fprintf(stderr, _("evacuating inode 0x%llx: %s\n"), + ino, strerror(ret)); + return ret; + } + + if (same) { + req->trace_indent++; + trace_dedupe(req, + "evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx", + ino, + (unsigned long long)info->dest_offset, + (unsigned long long)info->bytes_deduped); + req->trace_indent--; + + target->evacuated++; + } else { + req->trace_indent++; + trace_dedupe(req, + "failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx", + ino, + (unsigned long long)info->dest_offset, + (unsigned long long)fdr->src_length); + req->trace_indent--; + + target->try_again = true; + + /* + * If we aren't single-stepping the deduplication, + * stop early so that the caller goes into single-step + * mode. + */ + if (!max_reqlen) { + fdr->src_length = old_reqlen; + return 0; + } + + /* Contents changed, move on to the next block. */ + info->bytes_deduped = fdr->src_length; + } + fdr->src_length = old_reqlen; + + fdr->src_offset += info->bytes_deduped; + info->dest_offset += info->bytes_deduped; + fdr->src_length -= info->bytes_deduped; + } + + return 0; +} + +/* + * Evacuate one fsmapping by using dedupe to remap data stored in the target + * range to a copy stored in the work file. + */ +static int +csp_evac_dedupe_fsmap( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec) +{ + struct file_dedupe_range *fdr = req->fdr; + struct file_dedupe_range_info *info = &fdr->info[0]; + bool can_single_step; + int target_fd; + int ret, ret2; + + if (mrec->fmr_device != req->dev) { + fprintf(stderr, _("wrong fsmap device in results.\n")); + return -1; + } + + ret = csp_evac_open(req, target, mrec, NULL, O_RDONLY, &target_fd); + if (ret || target_fd < 0) + return ret; + + /* + * Use dedupe to try to shift the target file's mappings to use the + * copy of the data that's in the work file. 
+ */ + fdr->src_offset = mrec->fmr_physical; + fdr->src_length = mrec->fmr_length; + fdr->dest_count = 1; + info->dest_fd = target_fd; + info->dest_offset = mrec->fmr_offset; + + can_single_step = mrec->fmr_length > req->xfd->fsgeom.blocksize; + + /* First we try to do the entire thing all at once. */ + ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner, 0); + if (ret) + goto out_fd; + + /* If there's any work left, try again one block at a time. */ + if (can_single_step && fdr->src_length > 0) { + ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner, + req->xfd->fsgeom.blocksize); + if (ret) + goto out_fd; + } + +out_fd: + ret2 = close(target_fd); + if (!ret && ret2) + ret = ret2; + return ret; +} + +/* + * Evacuate a prealloc fsmapping by using exchangerange to move the + * preallocation to the work file. + */ +static int +csp_evac_exchange_prealloc( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec) +{ + struct xfs_bulkstat bulkstat; + struct xfs_commit_range xcr; + struct getbmapx brec; + int target_fd; + int ret, ret2; + + if (mrec->fmr_device != req->dev) { + fprintf(stderr, _("wrong fsmap device in results.\n")); + return -1; + } + + ret = csp_evac_open(req, target, mrec, &bulkstat, O_RDWR, &target_fd); + if (ret || target_fd < 0) + return ret; + + ret = xfrog_commitrange_prep(&xcr, target_fd, mrec->fmr_offset, + req->work_fd, mrec->fmr_offset, mrec->fmr_length); + if (ret) { + perror(_("preparing for commit")); + goto out_fd; + } + + /* + * Now that we've snapshotted target_fd, check that the mapping we're + * after is still one large preallocation. If it isn't, then we tell + * the caller to try again. + */ + ret = bmapx_one(req, target_fd, mrec->fmr_offset, mrec->fmr_length, + &brec); + if (ret) + return ret; + + trace_exchange(req, + "targetfd pos 0x%llx offset 0x%llx phys 0x%llx len 0x%llx prealloc? 
%d", + (unsigned long long)mrec->fmr_offset, + (unsigned long long)BBTOB(brec.bmv_offset), + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)BBTOB(brec.bmv_length), + !!(brec.bmv_oflags & BMV_IF_PREALLOC)); + + if (BBTOB(brec.bmv_offset) > mrec->fmr_offset || + BBTOB(brec.bmv_offset + brec.bmv_length) < + mrec->fmr_offset + mrec->fmr_length || + !(brec.bmv_oflags & BMV_IF_PREALLOC)) { + req->trace_indent++; + trace_exchange(req, + "failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx", + bulkstat.bs_ino, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_length); + req->trace_indent--; + target->try_again = true; + goto out_fd; + } + + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("truncating work file")); + goto out_fd; + } + + /* + * Create a preallocation in the work file to match the one in the + * file that we're evacuating. + */ + ret = fallocate(req->work_fd, 0, mrec->fmr_offset, mrec->fmr_length); + if (ret) { + fprintf(stderr, + _("copying target file preallocation to work file: %s\n"), + strerror(ret)); + goto out_fd; + } + + ret = bmapx_one(req, req->work_fd, mrec->fmr_offset, mrec->fmr_length, + &brec); + if (ret) + return ret; + + trace_exchange(req, "workfd pos 0x%llx off 0x%llx phys 0x%llx", + (unsigned long long)mrec->fmr_offset, + (unsigned long long)BBTOB(brec.bmv_offset), + (unsigned long long)BBTOB(brec.bmv_block)); + + /* + * Exchange the mappings, with the freshness check enabled. This + * should result in the target file being switched to new blocks unless + * it has changed, in which case we bounce out and find a new target. 
+ */ + ret = xfrog_commitrange(target_fd, &xcr, 0); + if (ret) { + if (ret == EBUSY) { + req->trace_indent++; + trace_exchange(req, + "failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx", + bulkstat.bs_ino, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_length); + req->trace_indent--; + target->try_again = true; + } else { + fprintf(stderr, + _("exchanging target and work file contents: %s\n"), + strerror(ret)); + } + goto out_fd; + } + + req->trace_indent++; + trace_exchange(req, + "evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx", + bulkstat.bs_ino, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_length); + req->trace_indent--; + target->evacuated++; + +out_fd: + ret2 = close(target_fd); + if (!ret && ret2) + ret = ret2; + return ret; +} + +/* Use deduplication to remap data extents away from where we're clearing. */ +static int +csp_evac_dedupe( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + int ret; + + start_fsmap_query(req, req->dev, target->start, target->length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + trace_fsmap_rec(req, CSP_TRACE_DEDUPE, mrec); + trim_target_fsmap(target, mrec); + + req->trace_indent++; + if (mrec->fmr_flags & FMR_OF_PREALLOC) + ret = csp_evac_exchange_prealloc(req, target, + mrec); + else + ret = csp_evac_dedupe_fsmap(req, target, mrec); + req->trace_indent--; + if (ret) + goto out; + + ret = csp_grab_free_space(req); + if (ret) + goto out; + } + } + +out: + end_fsmap_query(req); + if (ret) + trace_dedupe(req, "ret %d", ret); + return ret; +} + +#define BUFFERCOPY_BUFSZ 65536 + +/* + * Use a memory buffer to copy part of src_fd to dst_fd, or return an errno. 
*/ +static int +csp_buffercopy( + struct clearspace_req *req, + int src_fd, + loff_t src_off, + int dst_fd, + loff_t dst_off, + loff_t len) +{ + int ret = 0; + + while (len > 0) { + size_t count = min(BUFFERCOPY_BUFSZ, len); + ssize_t bytes_read, bytes_written; + + bytes_read = pread(src_fd, req->buf, count, src_off); + if (bytes_read < 0) { + ret = errno; + break; + } + + bytes_written = pwrite(dst_fd, req->buf, bytes_read, dst_off); + if (bytes_written < 0) { + ret = errno; + break; + } + + src_off += bytes_written; + dst_off += bytes_written; + len -= bytes_written; + } + + return ret; +} + +/* + * Prepare the work file to assist in evacuating file data by copying the + * contents of the frozen space into the work file. + */ +static int +csp_prepare_for_dedupe( + struct clearspace_req *req) +{ + struct file_clone_range fcr; + struct stat statbuf; + loff_t datapos = 0; + loff_t length = 0; + int ret; + + ret = fstat(req->space_fd, &statbuf); + if (ret) { + perror(_("space capture file")); + return ret; + } + + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("truncate work file")); + return ret; + } + + ret = ftruncate(req->work_fd, statbuf.st_size); + if (ret) { + perror(_("reset work file")); + return ret; + } + + /* Make a working copy of the frozen file data. 
*/ + start_spacefd_iter(req); + while ((ret = spacefd_data_iter(req, &datapos, &length)) > 0) { + trace_prep(req, "clone spacefd data 0x%llx length 0x%llx", + (long long)datapos, (long long)length); + + fcr.src_fd = req->space_fd; + fcr.src_offset = datapos; + fcr.src_length = length; + fcr.dest_offset = datapos; + + ret = clonerange(req->work_fd, &fcr); + if (ret == ENOSPC) { + req->trace_indent++; + trace_prep(req, + "falling back to buffered copy at 0x%llx", + (long long)datapos); + req->trace_indent--; + ret = csp_buffercopy(req, req->space_fd, datapos, + req->work_fd, datapos, length); + } + if (ret) { + perror( + _("copying space capture file contents to work file")); + return ret; + } + } + end_spacefd_iter(req); + if (ret < 0) + return ret; + + /* + * Unshare the work file so that it contains an identical copy of the + * contents of the space capture file but mapped to different blocks. + * This is key to using dedupe to migrate file space away from the + * requested region. + */ + req->trace_indent++; + ret = csp_unshare_workfile(req, req->start, req->length); + req->trace_indent--; + return ret; +} + +/* + * Evacuate one fsmapping by using dedupe to remap data stored in the target + * range to a copy stored in the work file. 
+ */ +static int +csp_evac_exchange_fsmap( + struct clearspace_req *req, + struct clearspace_tgt *target, + const struct fsmap *mrec) +{ + struct xfs_bulkstat bulkstat; + struct xfs_commit_range xcr; + struct getbmapx brec; + int target_fd; + int ret, ret2; + + if (mrec->fmr_device != req->dev) { + fprintf(stderr, _("wrong fsmap device in results.\n")); + return -1; + } + + ret = csp_evac_open(req, target, mrec, &bulkstat, O_RDWR, &target_fd); + if (ret || target_fd < 0) + return ret; + + ret = xfrog_commitrange_prep(&xcr, target_fd, mrec->fmr_offset, + req->work_fd, mrec->fmr_offset, mrec->fmr_length); + if (ret) { + perror(_("preparing for commit")); + goto out_fd; + } + + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("truncating work file")); + goto out_fd; + } + + /* + * Copy the data from the original file to the work file. We assume + * that the work file will end up with different data blocks and that + * they're outside of the requested range. + */ + ret = csp_buffercopy(req, target_fd, mrec->fmr_offset, req->work_fd, + mrec->fmr_offset, mrec->fmr_length); + if (ret) { + fprintf(stderr, _("copying target file to work file: %s\n"), + strerror(ret)); + goto out_fd; + } + + ret = fsync(req->work_fd); + if (ret) { + perror(_("flush work file for fiexchange")); + goto out_fd; + } + + ret = bmapx_one(req, req->work_fd, mrec->fmr_offset, mrec->fmr_length, + &brec); + if (ret) + return ret; + + trace_exchange(req, "workfd pos 0x%llx phys 0x%llx", + (unsigned long long)mrec->fmr_offset, + (unsigned long long)BBTOB(brec.bmv_block)); + + /* + * Exchange the mappings, with the freshness check enabled. This + * should result in the target file being switched to new blocks unless + * it has changed, in which case we bounce out and find a new target. 
+ */ + ret = xfrog_commitrange(target_fd, &xcr, 0); + if (ret) { + if (ret == EBUSY) { + req->trace_indent++; + trace_exchange(req, + "failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx", + bulkstat.bs_ino, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_length); + req->trace_indent--; + target->try_again = true; + } else { + fprintf(stderr, + _("exchanging target and work file contents: %s\n"), + strerror(ret)); + } + goto out_fd; + } + + req->trace_indent++; + trace_exchange(req, + "evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx", + bulkstat.bs_ino, + (unsigned long long)mrec->fmr_offset, + (unsigned long long)mrec->fmr_length); + req->trace_indent--; + target->evacuated++; + +out_fd: + ret2 = close(target_fd); + if (!ret && ret2) + ret = ret2; + return ret; +} + +/* + * Try to evacuate all data blocks in the target region by copying the contents + * to a new file and exchanging the extents. + */ +static int +csp_evac_exchange( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + int ret; + + start_fsmap_query(req, req->dev, target->start, target->length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + trace_fsmap_rec(req, CSP_TRACE_EXCHANGE, mrec); + trim_target_fsmap(target, mrec); + + req->trace_indent++; + ret = csp_evac_exchange_fsmap(req, target, mrec); + req->trace_indent--; + if (ret) + goto out; + + ret = csp_grab_free_space(req); + if (ret) + goto out; + } + } +out: + end_fsmap_query(req); + if (ret) + trace_exchange(req, "ret %d", ret); + return ret; +} + +/* Try to evacuate blocks by using online repair to rebuild AG metadata. 
*/ +static int +csp_evac_ag_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target, + uint32_t agno, + uint32_t mask) +{ + struct xfs_scrub_metadata scrub = { + .sm_flags = XFS_SCRUB_IFLAG_REPAIR | + XFS_SCRUB_IFLAG_FORCE_REBUILD, + }; + unsigned int i; + int ret; + + trace_xrebuild(req, "agno 0x%x mask 0x%x", + (unsigned int)agno, + (unsigned int)mask); + + for (i = XFS_SCRUB_TYPE_AGFL; i < XFS_SCRUB_TYPE_REFCNTBT; i++) { + + if (!(mask & (1U << i))) + continue; + + scrub.sm_type = i; + + req->trace_indent++; + trace_xrebuild(req, "agno %u type %u", + (unsigned int)agno, + (unsigned int)scrub.sm_type); + req->trace_indent--; + + ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub); + if (ret) { + if (errno == ENOENT || errno == ENOSPC) + continue; + fprintf(stderr, _("rebuilding ag %u type %u: %s\n"), + (unsigned int)agno, scrub.sm_type, + strerror(errno)); + return -1; + } + + target->evacuated++; + + ret = csp_grab_free_space(req); + if (ret) + return ret; + } + + return 0; +} + +/* Compute a scrub mask for a fsmap special owner. */ +static uint32_t +fsmap_owner_to_scrub_mask(__u64 owner) +{ + switch (owner) { + case XFS_FMR_OWN_FREE: + case XFS_FMR_OWN_UNKNOWN: + case XFS_FMR_OWN_FS: + case XFS_FMR_OWN_LOG: + /* can't move these */ + return 0; + case XFS_FMR_OWN_AG: + return (1U << XFS_SCRUB_TYPE_BNOBT) | + (1U << XFS_SCRUB_TYPE_CNTBT) | + (1U << XFS_SCRUB_TYPE_AGFL) | + (1U << XFS_SCRUB_TYPE_RMAPBT); + case XFS_FMR_OWN_INOBT: + return (1U << XFS_SCRUB_TYPE_INOBT) | + (1U << XFS_SCRUB_TYPE_FINOBT); + case XFS_FMR_OWN_REFC: + return (1U << XFS_SCRUB_TYPE_REFCNTBT); + case XFS_FMR_OWN_INODES: + case XFS_FMR_OWN_COW: + /* don't know how to get rid of these */ + return 0; + case XFS_FMR_OWN_DEFECTIVE: + /* good, get rid of it */ + return 0; + default: + return 0; + } +} + +/* Try to clear all per-AG metadata from the requested range. 
*/ +static int +csp_evac_fs_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target, + bool *cleared_anything) +{ + uint32_t curr_agno = -1U; + uint32_t curr_mask = 0; + int ret = 0; + + if (req->realtime) + return 0; + + start_fsmap_query(req, req->dev, target->start, target->length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + uint64_t daddr; + uint32_t agno; + uint32_t mask; + + if (mrec->fmr_device != req->dev) + continue; + if (!(mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)) + continue; + + /* Ignore regions that we already tried to clear. */ + if (bitmap_test(req->visited, mrec->fmr_physical, + mrec->fmr_length)) + continue; + + mask = fsmap_owner_to_scrub_mask(mrec->fmr_owner); + if (!mask) + continue; + + trace_fsmap_rec(req, CSP_TRACE_XREBUILD, mrec); + + daddr = BTOBB(mrec->fmr_physical); + agno = cvt_daddr_to_agno(req->xfd, daddr); + + trace_xrebuild(req, + "agno 0x%x -> 0x%x mask 0x%x owner %lld", + curr_agno, agno, curr_mask, + (unsigned long long)mrec->fmr_owner); + + if (curr_agno == -1U) { + curr_agno = agno; + } else if (curr_agno != agno) { + ret = csp_evac_ag_metadata(req, target, + curr_agno, curr_mask); + if (ret) + goto out; + + *cleared_anything = true; + curr_agno = agno; + curr_mask = 0; + } + + /* Put this on the list and try to clear it once. 
*/ + curr_mask |= mask; + ret = bitmap_set(req->visited, mrec->fmr_physical, + mrec->fmr_length); + if (ret) { + perror(_("marking metadata extent visited")); + goto out; + } + } + } + + if (curr_agno != -1U && curr_mask != 0) { + ret = csp_evac_ag_metadata(req, target, curr_agno, curr_mask); + if (ret) + goto out; + *cleared_anything = true; + } + + if (*cleared_anything) + trace_bitmap(req, "set metadata start 0x%llx length 0x%llx", + target->start, target->length); + +out: + end_fsmap_query(req); + if (ret) + trace_xrebuild(req, "ret %d", ret); + return ret; +} + +/* + * Check that at least the start of the mapping was frozen into the work file + * at the correct offset. Set @len to the number of bytes that were frozen. + * Returns -1 for error, zero if written extents are waiting to be mapped into + * the space capture file, or 1 if there's nothing to transfer to the space + * capture file. + */ +enum freeze_outcome { + FREEZE_FAILED = -1, + FREEZE_DONE, + FREEZE_SKIP, +}; + +static enum freeze_outcome +csp_freeze_check_outcome( + struct clearspace_req *req, + const struct fsmap *mrec, + unsigned long long *len) +{ + struct getbmapx brec; + int ret; + + *len = 0; + + ret = bmapx_one(req, req->work_fd, 0, mrec->fmr_length, &brec); + if (ret) + return FREEZE_FAILED; + + trace_freeze(req, + "check if workfd pos 0x0 phys 0x%llx len 0x%llx maps to phys 0x%llx len 0x%llx", + (unsigned long long)mrec->fmr_physical, + (unsigned long long)mrec->fmr_length, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)BBTOB(brec.bmv_length)); + + /* freeze of an unwritten extent punches a hole in the work file. */ + if ((mrec->fmr_flags & FMR_OF_PREALLOC) && brec.bmv_block == -1) { + *len = min(mrec->fmr_length, BBTOB(brec.bmv_length)); + return FREEZE_SKIP; + } + + /* + * freeze of a written extent must result in the same physical space + * being mapped into the work file. 
+ */ + if (!(mrec->fmr_flags & FMR_OF_PREALLOC) && + BBTOB(brec.bmv_block) == mrec->fmr_physical) { + *len = min(mrec->fmr_length, BBTOB(brec.bmv_length)); + return FREEZE_DONE; + } + + /* + * We didn't find what we were looking for, which implies that the + * mapping changed out from under us. Punch out everything that could + * have been mapped into the work file. Set @len to zero and return so + * that we try again with the next mapping. + */ + trace_falloc(req, "reset workfd isize 0x0", 0); + + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("resetting work file after failed freeze")); + return FREEZE_FAILED; + } + + return FREEZE_SKIP; +} + +/* + * Open a file to try to freeze whatever data is in the requested range. + * + * Returns nonzero on error. Returns zero and a file descriptor in @fd if the + * caller is supposed to do something; or returns zero and @fd == -1 if there's + * nothing to freeze. + */ +static int +csp_freeze_open( + struct clearspace_req *req, + const struct fsmap *mrec, + int *fd) +{ + struct xfs_bulkstat bulkstat; + int oflags = O_RDWR; + int target_fd; + int ret; + + *fd = -1; + + ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, &bulkstat); + if (ret) { + if (ret == ENOENT || ret == EINVAL) + return 0; + + fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"), + (unsigned long long)mrec->fmr_owner, + strerror(errno)); + return ret; + } + + /* + * If we get stats for a different inode, the file may have been freed + * out from under us and there's nothing to do. + */ + if (bulkstat.bs_ino != mrec->fmr_owner) + return 0; + + /* Skip anything we can't freeze. 
*/ + if (!S_ISREG(bulkstat.bs_mode) && !S_ISDIR(bulkstat.bs_mode)) + return 0; + + if (S_ISDIR(bulkstat.bs_mode)) + oflags = O_RDONLY; + + target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner, + bulkstat.bs_gen); + if (target_fd == -2) + return 0; + if (target_fd < 0) + return target_fd; + + /* + * Skip mappings for directories, xattr data, and block mapping btree + * blocks. We still have to close the file though. + */ + if (S_ISDIR(bulkstat.bs_mode) || + (mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) { + return close(target_fd); + } + + *fd = target_fd; + return 0; +} + +static inline uint64_t rounddown_64(uint64_t x, uint64_t y) +{ + return (x / y) * y; +} + +/* + * Deal with a frozen extent containing a partially written EOF block. Either + * we use funshare to get src_fd to release the block, or we reduce the length + * of the frozen extent by one block. + */ +static int +csp_freeze_unaligned_eofblock( + struct clearspace_req *req, + int src_fd, + const struct fsmap *mrec, + unsigned long long *frozen_len) +{ + struct getbmapx brec; + struct stat statbuf; + loff_t work_offset, length; + int ret; + + ret = fstat(req->work_fd, &statbuf); + if (ret) { + perror(_("statting work file")); + return ret; + } + + /* + * The frozen extent is less than the size of the work file, which + * means that we're already block aligned. + */ + if (*frozen_len <= statbuf.st_size) + return 0; + + /* The frozen extent does not contain a partially written EOF block. */ + if (statbuf.st_size % statbuf.st_blksize == 0) + return 0; + + /* + * Unshare what we think is a partially written EOF block of the + * original file, to try to force it to release that block. 
+ */ + work_offset = rounddown_64(statbuf.st_size, statbuf.st_blksize); + length = statbuf.st_size - work_offset; + + trace_freeze(req, + "unaligned eofblock 0x%llx work_size 0x%llx blksize 0x%x work_offset 0x%llx work_length 0x%llx", + *frozen_len, statbuf.st_size, statbuf.st_blksize, + work_offset, length); + + ret = fallocate(src_fd, FALLOC_FL_UNSHARE_RANGE, + mrec->fmr_offset + work_offset, length); + if (ret) { + perror(_("unsharing original file")); + return ret; + } + + ret = fsync(src_fd); + if (ret) { + perror(_("flushing original file")); + return ret; + } + + ret = bmapx_one(req, req->work_fd, work_offset, length, &brec); + if (ret) + return ret; + + if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) { + fprintf(stderr, + _("work file offset 0x%llx maps to phys 0x%llx, expected 0x%llx"), + (unsigned long long)work_offset, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)mrec->fmr_physical); + return -1; + } + + /* + * If the block is still shared, there must be other owners of this + * block. Round down the frozen length and we'll come back to it + * eventually. + */ + if (brec.bmv_oflags & BMV_OF_SHARED) { + *frozen_len = work_offset; + return 0; + } + + /* + * Not shared anymore, so increase the size of the file to the next + * block boundary so that we can reflink it into the space capture + * file. + */ + ret = ftruncate(req->work_fd, + BBTOB(brec.bmv_length) + BBTOB(brec.bmv_offset)); + if (ret) { + perror(_("expanding work file")); + return ret; + } + + /* Double-check that we didn't lose the block. 
*/ + ret = bmapx_one(req, req->work_fd, work_offset, length, &brec); + if (ret) + return ret; + + if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) { + fprintf(stderr, + _("work file offset 0x%llx maps to phys 0x%llx, should be 0x%llx"), + (unsigned long long)work_offset, + (unsigned long long)BBTOB(brec.bmv_block), + (unsigned long long)mrec->fmr_physical); + return -1; + } + + return 0; +} + +/* + * Given a fsmap, try to reflink the physical space into the space capture + * file. + */ +static int +csp_freeze_req_fsmap( + struct clearspace_req *req, + unsigned long long *cursor, + const struct fsmap *mrec) +{ + struct fsmap short_mrec; + struct file_clone_range fcr = { }; + unsigned long long frozen_len; + enum freeze_outcome outcome; + int src_fd; + int ret, ret2; + + if (mrec->fmr_device != req->dev) { + fprintf(stderr, _("wrong fsmap device in results.\n")); + return -1; + } + + /* Ignore mappings for our secret files. */ + if (csp_is_internal_owner(req, mrec->fmr_owner)) + return 0; + + /* Ignore mappings before the cursor. */ + if (mrec->fmr_physical + mrec->fmr_length < *cursor) + return 0; + + /* Jump past mappings for metadata. */ + if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER) + goto skip; + + /* + * Open this file so that we can try to freeze its data blocks. + * For other types of files we just skip to the evacuation step. + */ + ret = csp_freeze_open(req, mrec, &src_fd); + if (ret) + return ret; + if (src_fd < 0) + goto skip; + + /* + * If the cursor is in the middle of this mapping, increase the start + * of the mapping to start at the cursor. 
+ */ + if (mrec->fmr_physical < *cursor) { + unsigned long long delta = *cursor - mrec->fmr_physical; + + short_mrec = *mrec; + short_mrec.fmr_physical = *cursor; + short_mrec.fmr_offset += delta; + short_mrec.fmr_length -= delta; + + mrec = &short_mrec; + } + + req->trace_indent++; + if (mrec->fmr_length == 0) { + trace_freeze(req, "skipping zero-length freeze", 0); + goto out_fd; + } + + /* + * Reflink the mapping from the source file into the empty work file so + * that a write will be written elsewhere. The only way to reflink a + * partially written EOF block is if the kernel can reset the work file + * size so that the post-EOF part of the block remains post-EOF. If we + * can't do that, we're sunk. If the mapping is unwritten, we'll leave + * a hole in the work file. + */ + ret = ftruncate(req->work_fd, 0); + if (ret) { + perror(_("truncating work file for freeze")); + goto out_fd; + } + + fcr.src_fd = src_fd; + fcr.src_offset = mrec->fmr_offset; + fcr.src_length = mrec->fmr_length; + fcr.dest_offset = 0; + + trace_freeze(req, + "reflink ino 0x%llx offset 0x%llx bytecount 0x%llx into workfd", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)fcr.src_offset, + (unsigned long long)fcr.src_length); + + ret = clonerange(req->work_fd, &fcr); + if (ret == EINVAL) { + /* + * If that didn't work, try reflinking to EOF and picking out + * whatever pieces we want. + */ + fcr.src_length = 0; + + trace_freeze(req, + "reflink ino 0x%llx offset 0x%llx to EOF into workfd", + (unsigned long long)mrec->fmr_owner, + (unsigned long long)fcr.src_offset); + + ret = clonerange(req->work_fd, &fcr); + } + if (ret == EINVAL) { + /* + * If we still can't get the block, it's possible that src_fd + * was punched or truncated out from under us, so we just move + * on to the next fsmap. 
+ */ + trace_freeze(req, "cannot freeze space, moving on", 0); + ret = 0; + goto out_fd; + } + if (ret) { + fprintf(stderr, _("freezing space to work file: %s\n"), + strerror(ret)); + goto out_fd; + } + + req->trace_indent++; + outcome = csp_freeze_check_outcome(req, mrec, &frozen_len); + req->trace_indent--; + switch (outcome) { + case FREEZE_FAILED: + ret = -1; + goto out_fd; + case FREEZE_SKIP: + *cursor += frozen_len; + goto out_fd; + case FREEZE_DONE: + break; + } + + /* + * If we tried reflinking to EOF to capture a partially written EOF + * block in the work file, we need to unshare the end of the source + * file before we try to reflink the frozen space into the space + * capture file. + */ + if (fcr.src_length == 0) { + ret = csp_freeze_unaligned_eofblock(req, src_fd, mrec, + &frozen_len); + if (ret) + goto out_fd; + } + + /* + * We've frozen the mapping by reflinking it into the work file and + * confirmed that the work file has the space we wanted. Now we need + * to map the same extent into the space capture file. If reflink + * fails because we're out of space, fall back to EXCHANGE_RANGE. The + * end goal is to populate the space capture file; we don't care about + * the contents of the work file. 
+ */ + fcr.src_fd = req->work_fd; + fcr.src_offset = 0; + fcr.dest_offset = mrec->fmr_physical; + fcr.src_length = frozen_len; + + trace_freeze(req, "reflink phys 0x%llx len 0x%llx to spacefd", + (unsigned long long)mrec->fmr_physical, + (unsigned long long)mrec->fmr_length); + + ret = clonerange(req->space_fd, &fcr); + if (ret == ENOSPC) { + struct xfs_exchange_range fxr; + + xfrog_exchangerange_prep(&fxr, mrec->fmr_physical, req->work_fd, + mrec->fmr_physical, frozen_len); + ret = xfrog_exchangerange(req->space_fd, &fxr, 0); + } + if (ret) { + fprintf(stderr, _("freezing space to space capture file: %s\n"), + strerror(ret)); + goto out_fd; + } + + *cursor += frozen_len; +out_fd: + ret2 = close(src_fd); + if (!ret && ret2) + ret = ret2; + req->trace_indent--; + if (ret) + trace_freeze(req, "ret %d", ret); + return ret; +skip: + *cursor += mrec->fmr_length; + return 0; +} + +/* + * Try to freeze all the space in the requested range against overwrites. + * + * For each file data fsmap within each hole in the part of the space capture + * file corresponding to the requested range, try to reflink the space into the + * space capture file so that any subsequent writes to the original owner are + * CoW and nobody else can allocate the space. If we cannot use reflink to + * freeze all the space, we cannot proceed with the clearing. 
+ */ +static int +csp_freeze_req_range( + struct clearspace_req *req) +{ + unsigned long long cursor = req->start; + loff_t holepos = 0; + loff_t length = 0; + int ret; + + ret = ftruncate(req->space_fd, req->start + req->length); + if (ret) { + perror(_("setting up space capture file")); + return ret; + } + + if (!req->use_reflink) + return 0; + + start_spacefd_iter(req); + while ((ret = spacefd_hole_iter(req, &holepos, &length)) > 0) { + trace_freeze(req, "spacefd hole 0x%llx length 0x%llx", + (long long)holepos, (long long)length); + + start_fsmap_query(req, req->dev, holepos, length); + while ((ret = run_fsmap_query(req)) > 0) { + struct fsmap *mrec; + + for_each_fsmap_row(req, mrec) { + trace_fsmap_rec(req, CSP_TRACE_FREEZE, mrec); + trim_request_fsmap(req, mrec); + ret = csp_freeze_req_fsmap(req, &cursor, mrec); + if (ret) { + end_fsmap_query(req); + goto out; + } + } + } + end_fsmap_query(req); + } +out: + end_spacefd_iter(req); + return ret; +} + +/* + * Dump all speculative preallocations, COW staging blocks, and inactive inodes + * to try to free up as much space as we can. + */ +static int +csp_collect_garbage( + struct clearspace_req *req) +{ + struct xfs_fs_eofblocks eofb = { + .eof_version = XFS_EOFBLOCKS_VERSION, + .eof_flags = XFS_EOF_FLAGS_SYNC, + }; + int ret; + + ret = ioctl(req->xfd->fd, XFS_IOC_FREE_EOFBLOCKS, &eofb); + if (ret) { + perror(_("xfs garbage collector")); + return -1; + } + + return 0; +} + +static int +csp_prepare( + struct clearspace_req *req) +{ + blkcnt_t old_blocks = 0; + int ret; + + /* + * Empty out CoW forks and speculative post-EOF preallocations before + * starting the clearing process. This may be somewhat overkill. + */ + ret = syncfs(req->xfd->fd); + if (ret) { + perror(_("syncing filesystem")); + return ret; + } + + ret = csp_collect_garbage(req); + if (ret) + return ret; + + /* + * Set up the space capture file as a large sparse file mirroring the + * physical space that we want to defragment. 
+ */ + ret = ftruncate(req->space_fd, req->start + req->length); + if (ret) { + perror(_("setting up space capture file")); + return ret; + } + + /* + * If we don't have reflink, just grab the free space and move on to + * copying and exchanging file contents. + */ + if (!req->use_reflink) + return csp_grab_free_space(req); + + /* + * Try to freeze as much of the requested range as we can, grab the + * free space in that range, and run freeze again to pick up anything + * that may have been allocated while all that was going on. + */ + do { + struct stat statbuf; + + ret = csp_freeze_req_range(req); + if (ret) + return ret; + + ret = csp_grab_free_space(req); + if (ret) + return ret; + + ret = fstat(req->space_fd, &statbuf); + if (ret) + return ret; + + if (old_blocks == statbuf.st_blocks) + break; + old_blocks = statbuf.st_blocks; + } while (1); + + /* + * If reflink is enabled, our strategy is to dedupe to free blocks in + * the area that we're clearing without making any user-visible changes + * to the file contents. For all the written file data blocks in area + * we're clearing, make an identical copy in the work file that is + * backed by blocks that are not in the clearing area. + */ + return csp_prepare_for_dedupe(req); +} + +/* Set up the target to clear all metadata from the given range. */ +static inline void +csp_target_metadata( + struct clearspace_req *req, + struct clearspace_tgt *target) +{ + target->start = req->start; + target->length = req->length; + target->prio = 0; + target->evacuated = 0; + target->owners = 0; + target->try_again = false; +} + +/* + * Loop through the space to find the most appealing part of the device to + * clear, then try to evacuate everything within. 
+ */ +int +clearspace_run( + struct clearspace_req *req) +{ + struct clearspace_tgt target; + const struct csp_errstr *es; + bool cleared_anything; + int ret; + + if (req->trace_mask) { + fprintf(stderr, "debug flags 0x%x:", req->trace_mask); + for (es = errtags; es->tag; es++) { + if (req->trace_mask & es->mask) + fprintf(stderr, " %s", es->tag); + } + fprintf(stderr, "\n"); + } + + req->trace_indent = 0; + trace_status(req, + _("Clearing dev %u:%u physical 0x%llx bytecount 0x%llx."), + major(req->dev), minor(req->dev), + req->start, req->length); + + if (req->trace_mask & ~CSP_TRACE_STATUS) + trace_status(req, "reflink? %d evac_metadata? %d", + req->use_reflink, req->can_evac_metadata); + + ret = bitmap_alloc(&req->visited); + if (ret) { + perror(_("allocating visited bitmap")); + return ret; + } + + ret = csp_prepare(req); + if (ret) + goto out_bitmap; + + /* Evacuate as many file blocks as we can. */ + do { + ret = csp_find_target(req, &target); + if (ret) + goto out_bitmap; + + if (target.length == 0) + break; + + trace_target(req, + "phys 0x%llx len 0x%llx owners 0x%llx prio 0x%llx", + target.start, target.length, + target.owners, target.prio); + + if (req->use_reflink) + ret = csp_evac_dedupe(req, &target); + else + ret = csp_evac_exchange(req, &target); + if (ret) + goto out_bitmap; + + trace_status(req, _("Evacuated %llu file items."), + target.evacuated); + } while (target.evacuated > 0 || target.try_again); + + if (!req->can_evac_metadata) + goto out_bitmap; + + /* Evacuate as many AG metadata blocks as we can. */ + do { + csp_target_metadata(req, &target); + + ret = csp_evac_fs_metadata(req, &target, &cleared_anything); + if (ret) + goto out_bitmap; + + trace_status(req, "evacuated %llu metadata items", + target.evacuated); + } while (target.evacuated > 0 && cleared_anything); + +out_bitmap: + bitmap_free(&req->visited); + return ret; +} + +/* How much space did we actually clear? 
*/ +int +clearspace_efficacy( + struct clearspace_req *req, + unsigned long long *cleared_bytes) +{ + unsigned long long cleared = 0; + int ret; + + start_bmapx_query(req, 0, req->start, req->length); + while ((ret = run_bmapx_query(req, req->space_fd)) > 0) { + struct getbmapx *brec; + + for_each_bmapx_row(req, brec) { + if (brec->bmv_block == -1) + continue; + + trace_bmapx_rec(req, CSP_TRACE_EFFICACY, brec); + + if (brec->bmv_offset != brec->bmv_block) { + fprintf(stderr, + _("space capture file mapped incorrectly\n")); + end_bmapx_query(req); + return -1; + } + cleared += BBTOB(brec->bmv_length); + } + } + end_bmapx_query(req); + if (ret) + return ret; + + *cleared_bytes = cleared; + return 0; +} + +/* + * Create a temporary file on the same volume (data/rt) that we're trying to + * clear free space on. + */ +static int +csp_open_tempfile( + struct clearspace_req *req, + struct stat *statbuf) +{ + struct fsxattr fsx; + int fd, ret; + + fd = openat(req->xfd->fd, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600); + if (fd < 0) { + perror(_("opening temp file")); + return -1; + } + + /* Make sure we got the same filesystem as the open file. */ + ret = fstat(fd, statbuf); + if (ret) { + perror(_("stat temp file")); + goto fail; + } + if (statbuf->st_dev != req->statbuf.st_dev) { + fprintf(stderr, + _("Cannot create temp file on same fs as open file.\n")); + goto fail; + } + + /* Ensure this file targets the correct data/rt device. 
*/ + ret = ioctl(fd, FS_IOC_FSGETXATTR, &fsx); + if (ret) { + perror(_("FSGETXATTR temp file")); + goto fail; + } + + if (!!(fsx.fsx_xflags & FS_XFLAG_REALTIME) != req->realtime) { + if (req->realtime) + fsx.fsx_xflags |= FS_XFLAG_REALTIME; + else + fsx.fsx_xflags &= ~FS_XFLAG_REALTIME; + + ret = ioctl(fd, FS_IOC_FSSETXATTR, &fsx); + if (ret) { + perror(_("FSSETXATTR temp file")); + goto fail; + } + } + + trace_setup(req, "opening temp inode 0x%llx as fd %d", + (unsigned long long)statbuf->st_ino, fd); + + return fd; +fail: + close(fd); + return -1; +} + +/* Extract fshandle from the open file. */ +static int +csp_install_file( + struct clearspace_req *req, + struct xfs_fd *xfd) +{ + void *handle; + size_t handle_sz; + int ret; + + ret = fstat(xfd->fd, &req->statbuf); + if (ret) + return ret; + + if (!S_ISDIR(req->statbuf.st_mode)) { + errno = -ENOTDIR; + return -1; + } + + ret = fd_to_handle(xfd->fd, &handle, &handle_sz); + if (ret) + return ret; + + ret = handle_to_fshandle(handle, handle_sz, &req->fshandle, + &req->fshandle_sz); + if (ret) + return ret; + + free_handle(handle, handle_sz); + req->xfd = xfd; + return 0; +} + +/* Decide if we can use online repair to evacuate metadata. */ +static void +csp_detect_evac_metadata( + struct clearspace_req *req) +{ + struct xfs_scrub_metadata scrub = { + .sm_type = XFS_SCRUB_TYPE_PROBE, + .sm_flags = XFS_SCRUB_IFLAG_REPAIR | + XFS_SCRUB_IFLAG_FORCE_REBUILD, + }; + int ret; + + ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub); + if (ret) + return; + + /* + * We'll try to evacuate metadata if the probe works. This doesn't + * guarantee success; it merely means that the kernel call exists. + */ + req->can_evac_metadata = true; +} + +/* Detect XFS_IOC_MAP_FREESP; this is critical for grabbing free space! 
*/ +static int +csp_detect_map_freesp( + struct clearspace_req *req) +{ + struct xfs_map_freesp args = { + .offset = 0, + .len = 1, + }; + int ret; + + /* + * A single-byte fallocate request will succeed without doing anything + * to the filesystem. + */ + ret = ioctl(req->work_fd, XFS_IOC_MAP_FREESP, &args); + if (!ret) + return 0; + + if (errno == EOPNOTSUPP) { + fprintf(stderr, + _("Filesystem does not support XFS_IOC_MAP_FREESP\n")); + return -1; + } + + perror(_("test XFS_IOC_MAP_FREESP on work file")); + return -1; +} + +/* + * Assemble operation information to clear the physical space in part of a + * filesystem. + */ +int +clearspace_init( + struct clearspace_req **reqp, + const struct clearspace_init *attrs) +{ + struct clearspace_req *req; + int ret; + + req = calloc(1, sizeof(struct clearspace_req)); + if (!req) { + perror(_("malloc clearspace")); + return -1; + } + + req->work_fd = -1; + req->space_fd = -1; + req->trace_mask = attrs->trace_mask; + + req->realtime = attrs->is_realtime; + req->dev = attrs->dev; + req->start = attrs->start; + req->length = attrs->length; + + ret = csp_install_file(req, attrs->xfd); + if (ret) { + perror(attrs->fname); + goto fail; + } + + csp_detect_evac_metadata(req); + + req->work_fd = csp_open_tempfile(req, &req->temp_statbuf); + if (req->work_fd < 0) + goto fail; + + req->space_fd = csp_open_tempfile(req, &req->space_statbuf); + if (req->space_fd < 0) + goto fail; + + ret = csp_detect_map_freesp(req); + if (ret) + goto fail; + + req->mhead = calloc(1, fsmap_sizeof(QUERY_BATCH_SIZE)); + if (!req->mhead) { + perror(_("opening fs mapping query")); + goto fail; + } + + req->rhead = calloc(1, xfs_getfsrefs_sizeof(QUERY_BATCH_SIZE)); + if (!req->rhead) { + perror(_("opening refcount query")); + goto fail; + } + + req->bhead = calloc(QUERY_BATCH_SIZE + 1, sizeof(struct getbmapx)); + if (!req->bhead) { + perror(_("opening file mapping query")); + goto fail; + } + + req->buf = malloc(BUFFERCOPY_BUFSZ); + if (!req->buf) { + 
perror(_("allocating file copy buffer")); + goto fail; + } + + req->fdr = calloc(1, sizeof(struct file_dedupe_range) + + sizeof(struct file_dedupe_range_info)); + if (!req->fdr) { + perror(_("allocating dedupe control buffer")); + goto fail; + } + + req->use_reflink = req->xfd->fsgeom.flags & XFS_FSOP_GEOM_FLAGS_REFLINK; + + *reqp = req; + return 0; +fail: + clearspace_free(&req); + return -1; +} + +#ifdef CLEARSPACE_DEBUG +static void +csp_dump_fd( + struct clearspace_req *req, + int fd, + const char *tag) +{ + struct stat sb; + struct getbmapx *brec; + unsigned long i = 0; + int ret; + + ret = fstat(fd, &sb); + if (ret) { + perror("fstat"); + return; + } + + printf("CLEARFREE DUMP ino 0x%llx: %s\n", + (unsigned long long)sb.st_ino, tag); + start_bmapx_query(req, 0, 0, sb.st_size); + while ((ret = run_bmapx_query(req, fd)) > 0) { + for_each_bmapx_row(req, brec) { + char *delim = ""; + + printf("[%lu]: startoff 0x%llx ", + i++, BBTOB(brec->bmv_offset)); + + if (brec->bmv_block == -1) + printf("startblock hole "); + else if (brec->bmv_block == -2) + printf("startblock delalloc "); + else + printf("startblock 0x%llx ", + BBTOB(brec->bmv_block)); + printf("blockcount 0x%llx flags [", + BBTOB(brec->bmv_length)); + if (brec->bmv_oflags & BMV_OF_PREALLOC) { + printf("%sprealloc", delim); + delim = ", "; + } + if (brec->bmv_oflags & BMV_OF_DELALLOC) { + printf("%sdelalloc", delim); + delim = ", "; + } + if (brec->bmv_oflags & BMV_OF_SHARED) { + printf("%sshared", delim); + delim = ", "; + } + printf("]\n"); + } + } + end_bmapx_query(req); +} + +/* Dump the space file and work file contents. */ +void +clearspace_dump( + struct clearspace_req *req) +{ + csp_dump_fd(req, req->space_fd, "space file"); + csp_dump_fd(req, req->work_fd, "work file"); +} +#endif /* CLEARSPACE_DEBUG */ + +/* Free all resources associated with a space clearing request. 
*/ +int +clearspace_free( + struct clearspace_req **reqp) +{ + struct clearspace_req *req = *reqp; + int ret = 0; + + if (!req) + return 0; + + *reqp = NULL; + free(req->fdr); + free(req->buf); + free(req->bhead); + free(req->rhead); + free(req->mhead); + + if (req->space_fd >= 0) { + ret = close(req->space_fd); + if (ret) + perror(_("closing space capture file")); + } + + if (req->work_fd >= 0) { + int ret2 = close(req->work_fd); + + if (ret2) { + perror(_("closing work file")); + if (!ret && ret2) + ret = ret2; + } + } + + if (req->fshandle) + free_handle(req->fshandle, req->fshandle_sz); + free(req); + return ret; +} diff --git a/libfrog/clearspace.h b/libfrog/clearspace.h new file mode 100644 index 00000000000000..d75545752b1fbf --- /dev/null +++ b/libfrog/clearspace.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2025 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __LIBFROG_CLEARSPACE_H__ +#define __LIBFROG_CLEARSPACE_H__ + +#undef CLEARSPACE_DEBUG + +struct clearspace_req; + +struct clearspace_init { + /* Open file and its pathname */ + struct xfs_fd *xfd; + const char *fname; + + /* Which device do we want? */ + bool is_realtime; + dev_t dev; + + /* Range of device to clear. 
*/ + unsigned long long start; + unsigned long long length; + + unsigned int trace_mask; +}; + +int clearspace_init(struct clearspace_req **reqp, + const struct clearspace_init *init); +int clearspace_free(struct clearspace_req **reqp); + +int clearspace_run(struct clearspace_req *req); + +#ifdef CLEARSPACE_DEBUG +void clearspace_dump(struct clearspace_req *req); +#else +# define clearspace_dump(req) ((void)0) +#endif +int clearspace_efficacy(struct clearspace_req *req, + unsigned long long *cleared_bytes); + +/* Debugging levels */ + +#define CSP_TRACE_FREEZE (1U << 0) +#define CSP_TRACE_GRAB (1U << 1) +#define CSP_TRACE_FSMAP (1U << 2) +#define CSP_TRACE_FSREFS (1U << 3) +#define CSP_TRACE_BMAPX (1U << 4) +#define CSP_TRACE_PREP (1U << 5) +#define CSP_TRACE_TARGET (1U << 6) +#define CSP_TRACE_DEDUPE (1U << 7) +#define CSP_TRACE_FALLOC (1U << 8) +#define CSP_TRACE_EXCHANGE (1U << 9) +#define CSP_TRACE_XREBUILD (1U << 10) +#define CSP_TRACE_EFFICACY (1U << 11) +#define CSP_TRACE_SETUP (1U << 12) +#define CSP_TRACE_STATUS (1U << 13) +#define CSP_TRACE_DUMPFILE (1U << 14) +#define CSP_TRACE_BITMAP (1U << 15) + +#define CSP_TRACE_ALL (CSP_TRACE_FREEZE | \ + CSP_TRACE_GRAB | \ + CSP_TRACE_FSMAP | \ + CSP_TRACE_FSREFS | \ + CSP_TRACE_BMAPX | \ + CSP_TRACE_PREP | \ + CSP_TRACE_TARGET | \ + CSP_TRACE_DEDUPE | \ + CSP_TRACE_FALLOC | \ + CSP_TRACE_EXCHANGE | \ + CSP_TRACE_XREBUILD | \ + CSP_TRACE_EFFICACY | \ + CSP_TRACE_SETUP | \ + CSP_TRACE_STATUS | \ + CSP_TRACE_DUMPFILE | \ + CSP_TRACE_BITMAP) + +#endif /* __LIBFROG_CLEARSPACE_H__ */ diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index 7d2d1ff94eeb55..a326b9a6486296 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -25,6 +25,23 @@ .SH OPTIONS .SH COMMANDS .TP +.BI "clearfree [ \-n nr ] [ \-r ] [ \-v mask ] " start " " length +Try to clear the specified physical range in the filesystem. +The +.B start +and +.B length +arguments must be given in units of bytes. 
+If the +.B -n +option is given, run the clearing algorithm this many times. +If the +.B -r +option is given, clear the realtime device. +If the +.B -v +option is given, print what's happening every step of the way. +.TP .BI "freesp [ \-dgrs ] [-a agno]... [ \-b | \-e bsize | \-h bsize | \-m factor ]" With no arguments, .B freesp diff --git a/spaceman/Makefile b/spaceman/Makefile index 358db9edf5cb73..b9eead8340cec1 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -27,7 +27,7 @@ LLDLIBS += $(LIBEDITLINE) $(LIBTERMCAP) endif ifeq ($(HAVE_GETFSMAP),yes) -CFILES += freesp.c +CFILES += freesp.c clearfree.c endif default: depend $(LTCOMMAND) diff --git a/spaceman/clearfree.c b/spaceman/clearfree.c new file mode 100644 index 00000000000000..6d686f805855dc --- /dev/null +++ b/spaceman/clearfree.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2021-2025 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "platform_defs.h" +#include "command.h" +#include "init.h" +#include "libfrog/paths.h" +#include "input.h" +#include "libfrog/fsgeom.h" +#include "libfrog/clearspace.h" +#include "handle.h" +#include "space.h" + +static void +clearfree_help(void) +{ + printf(_( +"Evacuate the contents of the given range of physical storage in the filesystem" +"\n" +" -n -- Run the space clearing algorithm this many times.\n" +" -r -- clear space on the realtime device.\n" +" -v -- verbosity level, or \"all\" to print everything.\n" +"\n" +"The start and length arguments are required, and must be specified in units\n" +"of bytes.\n" +"\n")); +} + +static int +clearfree_f( + int argc, + char **argv) +{ + struct clearspace_init attrs = { + .xfd = &file->xfd, + .fname = file->name, + }; + struct clearspace_req *req = NULL; + unsigned long long cleared; + unsigned long arg; + long long lnum; + unsigned int i, nr = 1; + int c, ret; + + while ((c = getopt(argc, argv, "n:rv:")) != EOF) { + switch (c) { + case 'n': + errno = 0; + arg = 
strtoul(optarg, NULL, 0); + if (errno) { + perror(optarg); + return 1; + } + if (arg > UINT_MAX) + arg = UINT_MAX; + nr = arg; + break; + case 'r': /* rt device */ + attrs.is_realtime = true; + break; + case 'v': /* Verbose output */ + if (!strcmp(optarg, "all")) { + attrs.trace_mask = CSP_TRACE_ALL; + } else { + errno = 0; + attrs.trace_mask = strtoul(optarg, NULL, 0); + if (errno) { + perror(optarg); + return 1; + } + } + break; + default: + exitcode = 1; + clearfree_help(); + return 0; + } + } + + if (attrs.trace_mask) + attrs.trace_mask |= CSP_TRACE_STATUS; + + if (argc != optind + 2) { + clearfree_help(); + goto fail; + } + + if (attrs.is_realtime) { + if (file->xfd.fsgeom.rtblocks == 0) { + fprintf(stderr, _("No realtime volume present.\n")); + goto fail; + } + attrs.dev = file->fs_path.fs_rtdev; + } else { + attrs.dev = file->fs_path.fs_datadev; + } + + lnum = cvtnum(file->xfd.fsgeom.blocksize, file->xfd.fsgeom.sectsize, + argv[optind]); + if (lnum < 0) { + fprintf(stderr, _("Bad clearfree start sector %s.\n"), + argv[optind]); + goto fail; + } + attrs.start = lnum; + + lnum = cvtnum(file->xfd.fsgeom.blocksize, file->xfd.fsgeom.sectsize, + argv[optind + 1]); + if (lnum < 0) { + fprintf(stderr, _("Bad clearfree length %s.\n"), + argv[optind + 1]); + goto fail; + } + attrs.length = lnum; + + ret = clearspace_init(&req, &attrs); + if (ret) + goto fail; + + for (i = 0; i < nr; i++) { + ret = clearspace_run(req); + if (ret) + goto out_clearspace; + } + + ret = clearspace_efficacy(req, &cleared); + if (ret) + goto out_clearspace; + + printf(_("Cleared 0x%llx bytes (%.1f%%) from 0x%llx to 0x%llx.\n"), + cleared, 100.0 * cleared / attrs.length, attrs.start, + attrs.start + attrs.length); + + if (!cleared) + clearspace_dump(req); + + ret = clearspace_free(&req); + if (ret) + goto fail; + + fshandle_destroy(); + return 0; + +out_clearspace: + clearspace_dump(req); + clearspace_free(&req); +fail: + fshandle_destroy(); + exitcode = 1; + return 1; +} + +static struct 
cmdinfo clearfree_cmd = { + .name = "clearfree", + .cfunc = clearfree_f, + .argmin = 0, + .argmax = -1, + .flags = CMD_FLAG_ONESHOT, + .args = "[-n runs] [-r] [-v mask] start length", + .help = clearfree_help, +}; + +void +clearfree_init(void) +{ + clearfree_cmd.oneline = _("clear free space in the filesystem"); + + add_command(&clearfree_cmd); +} diff --git a/spaceman/init.c b/spaceman/init.c index cf1ff3cbb0ee8d..bce62dec47f2c8 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -35,6 +35,7 @@ init_commands(void) trim_init(); freesp_init(); health_init(); + clearfree_init(); } static int diff --git a/spaceman/space.h b/spaceman/space.h index 28fa35a3047957..509e923375f42f 100644 --- a/spaceman/space.h +++ b/spaceman/space.h @@ -31,8 +31,10 @@ extern void quit_init(void); extern void trim_init(void); #ifdef HAVE_GETFSMAP extern void freesp_init(void); +extern void clearfree_init(void); #else # define freesp_init() do { } while (0) +# define clearfree_init() do { } while(0) #endif extern void info_init(void); extern void health_init(void); From patchwork Tue Dec 31 23:46:24 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13924064 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6C9EF29415 for ; Tue, 31 Dec 2024 23:46:25 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688785; cv=none; b=Vi/cL+HnsUC/6L7YL1Mq7VnK5CkKJPnp/Znr45ljAxrgwzW9G/3dphYgzVuPkvFkdicvrvjN++UWXa7wkCCq03LcJwU5BXiWL/IpIfRxZaC5FActFsMW4VNjkgRPmAt+jxMtCfEXDaBhD8aawxdx6XC0lYGfPLELT4xwY+/N3JU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688785; c=relaxed/simple; bh=0OHgJQOD3GvFa4LoUEWO7SZOsMOatkSscug1oz4E2mM=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=Xl+aA53ZmQxxlr4i+ZV1EnjWiOWbUJu1XvjBeOeie3x6bx9CL/fzo64BXTOYERaXQJTPIeU79dy9JPNQaU43ushDHfAwoyAEQZjzi1E8N1rA22Gh8IVecsgcyL1ahXd6kSdLAVgu+/+wXOo+VB9rpOqAYc/6OfJdEP5M4r2HjFo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=uCglMnVT; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="uCglMnVT" Received: by smtp.kernel.org (Postfix) with ESMTPSA id EFEBCC4CED2; Tue, 31 Dec 2024 23:46:24 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688785; bh=0OHgJQOD3GvFa4LoUEWO7SZOsMOatkSscug1oz4E2mM=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=uCglMnVTFmlLzYhZGP9KEcYvPlvdPOGMPbDW0TQrXZJED1LNRs0rKf3AT6aAGZyqZ p2LBDTO2zYIL3b/n6r6lkk+vXIatQd/uqF5MbwcwZ6opornaF2cfloM19es52jsovM I0pomdMtgWFOZPvfCsovKpV8+d0bcyUq9BpF+eKAXKaxefRrZnd3OeMD9pZgUwYNge 
aSVx/kgMZrZ5WGJN3xOLiZyq1t9v30N7qvKvQuTDrmjvqALUMLBXASSlMBDIdGp4vW 2v9pMMf2LptNrjDOgeSfAQJQOVNNENQaW0cV7YZOLS537Rb6WxHiVfeFyx7mGFbgVw lgnkQzEMRWm2w== Date: Tue, 31 Dec 2024 15:46:24 -0800 Subject: [PATCH 06/11] spaceman: physically move a regular inode From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: dchinner@redhat.com, linux-xfs@vger.kernel.org Message-ID: <173568777962.2709794.5863683217799517220.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner To be able to shrink a filesystem, we need to be able to physically move an inode and all its data and metadata from its current location to a new AG. Add a command to spaceman to allow an inode to be moved to a new AG. This new command is not intended to be a perfect solution. I am not trying to handle atomic movement of open files - this is intended to be run as a maintenance operation on an idle filesystem. If root filesystems are the target, then this should be run via a rescue environment that is not executing directly on the root fs. With those caveats in place, we can do the entire inode move as a set of non-destructive operations finalised by an atomic inode swap without needing any special kernel support. To ensure we move metadata such as BMBT blocks even if we don't need to move data, we clone the data to a new inode that we've allocated in the destination AG. This will result in new bmbt blocks being allocated in the new location even though the data is not copied. Attributes need to be copied one at a time from the original inode. If data needs to be moved, then we use fallocate(UNSHARE) to create a private copy of the range of data that needs to be moved in the new inode.
This will be allocated in the destination AG by normal allocation policy. Once the new inode has been finalised, use RENAME_EXCHANGE to swap it into place and unlink the original inode to free up all the resources it still pins. There are many optimisations still possible to speed this up, but the goal here is "functional" rather than "optimal". Performance can be optimised once all the parts for a "empty the tail of the filesystem before shrink" operation are implemented and solidly tested. This functionality has been smoke tested by creating a 32MB data file with 4k extents and several hundred attributes: $ cat test.sh fname=/mnt/scratch/foo xfs_io -f -c "pwrite 0 32m" -c sync $fname for (( i=0; i < 4096 ; i++ )); do xfs_io -c "fpunch $((i * 8))k 4k" $fname done for (( i=0; i < 100 ; i++ )); do setfattr -n user.blah.$i.$i.blah -v blah.$i.$i.blah $fname setfattr -n user.foo.$i.$i.foo -v $i.cantbele.$i.ve.$i.tsnotbutter $fname done for (( i=0; i < 100 ; i++ )); do setfattr -n security.baz.$i.$i.baz -v wotchul$i$iookinat $fname done xfs_io -c stat -c "bmap -vp" -c "bmap -avp" $fname xfs_spaceman -c "move_inode -a 22" /mnt/scratch/foo xfs_io -c stat -c "bmap -vp" -c "bmap -avp" $fname $ and the output looks something like: $ sudo ./test.sh .... fd.path = "/mnt/scratch/foo" fd.flags = non-sync,non-direct,read-write stat.ino = 133 /mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: hole 8 1: [8..15]: 208..215 0 (208..215) 8 000000 2: [16..23]: hole 8 3: [24..31]: 224..231 0 (224..231) 8 000000 .... 
8189: [65512..65519]: 65712..65719 0 (65712..65719) 8 000000 8190: [65520..65527]: hole 8 8191: [65528..65535]: 65728..65735 0 (65728..65735) 8 000000 mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: 392..399 0 (392..399) 8 000000 1: [8..15]: 408..415 0 (408..415) 8 000000 2: [16..23]: 424..431 0 (424..431) 8 000000 3: [24..31]: 456..463 0 (456..463) 8 000000 move mnt /mnt/scratch, path /mnt/scratch/foo, agno 22 fd.path = "/mnt/scratch/foo" fd.flags = non-sync,non-direct,read-write stat.ino = 47244651475 .... /mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: hole 8 1: [8..15]: 47244763192..47244763199 22 (123112..123119) 8 000000 2: [16..23]: hole 8 3: [24..31]: 47244763208..47244763215 22 (123128..123135) 8 000000 .... 8189: [65512..65519]: 47244828808..47244828815 22 (188728..188735) 8 000000 8190: [65520..65527]: hole 8 8191: [65528..65535]: 47244828824..47244828831 22 (188744..188751) 8 000000 /mnt/scratch/foo: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..7]: 47244763176..47244763183 22 (123096..123103) 8 000000 $ Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: "Darrick J. Wong" --- man/man8/xfs_spaceman.8 | 4 spaceman/Makefile | 3 spaceman/init.c | 1 spaceman/move_inode.c | 562 +++++++++++++++++++++++++++++++++++++++++++++++ spaceman/space.h | 1 5 files changed, 570 insertions(+), 1 deletion(-) create mode 100644 spaceman/move_inode.c diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index a326b9a6486296..f898a8bbe840ea 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -146,6 +146,10 @@ .SH COMMANDS .TP .BR "help [ " command " ]" Display a brief description of one or all commands. +.TP +.BI "move_inode \-a agno" +Move the currently open file into the specified allocation group. + .TP .BI "prealloc [ \-u id ] [ \-g id ] [ -p id ] [ \-m minlen ] [ \-s ]" Removes speculative preallocation. 
diff --git a/spaceman/Makefile b/spaceman/Makefile
index b9eead8340cec1..9d080b67de9a22 100644
--- a/spaceman/Makefile
+++ b/spaceman/Makefile
@@ -14,11 +14,12 @@ CFILES = \
 	health.c \
 	info.c \
 	init.c \
+	move_inode.c \
 	prealloc.c \
 	trim.c
 LSRCFILES = xfs_info.sh
 
-LLDLIBS = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG)
+LLDLIBS = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG) $(LIBHANDLE)
 LTDEPENDENCIES = $(LIBHANDLE) $(LIBXCMD) $(LIBFROG)
 LLDFLAGS = -static
 
diff --git a/spaceman/init.c b/spaceman/init.c
index bce62dec47f2c8..dbeebcf97b9fb2 100644
--- a/spaceman/init.c
+++ b/spaceman/init.c
@@ -36,6 +36,7 @@ init_commands(void)
 	freesp_init();
 	health_init();
 	clearfree_init();
+	move_inode_init();
 }
 
 static int
diff --git a/spaceman/move_inode.c b/spaceman/move_inode.c
new file mode 100644
index 00000000000000..b7d71ee7a46dc6
--- /dev/null
+++ b/spaceman/move_inode.c
@@ -0,0 +1,638 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+
+#include "libxfs.h"
+#include "libfrog/fsgeom.h"
+#include "command.h"
+#include "init.h"
+#include "libfrog/paths.h"
+#include "space.h"
+#include "input.h"
+#include "handle.h"
+
+#include <linux/fiemap.h>
+#include <linux/falloc.h>
+#include <attr/attributes.h>
+
+static cmdinfo_t move_inode_cmd;
+
+/*
+ * We can't entirely use O_TMPFILE here because we want to use RENAME_EXCHANGE
+ * to swap the inode once rebuild is complete. Hence the new file has to be
+ * somewhere in the namespace for rename to act upon. Hence we use a normal
+ * open(O_CREAT) for now.
+ *
+ * This could potentially use O_TMPFILE to rebuild the entire inode, then use
+ * a linkat()/renameat2() pair to add it to the namespace then atomically
+ * replace the original.
+ *
+ * On success, returns 0 with *tmpfd set to the open file and *tmpfile set to
+ * a malloc'd copy of its path, which the caller must free.  On failure,
+ * returns a negative errno and leaves *tmpfile and *tmpfd untouched.
+ */
+static int
+create_tmpfile(
+	const char	*mnt,
+	struct xfs_fd	*xfd,
+	xfs_agnumber_t	agno,
+	char		**tmpfile,
+	int		*tmpfd)
+{
+	char		name[PATH_MAX + 1];
+	char		*tmpname;
+	mode_t		mask;
+	int		fd;
+	int		i;
+	int		ret;
+
+	/* construct tmpdir */
+	mask = umask(0);
+
+	snprintf(name, PATH_MAX, "%s/.spaceman", mnt);
+	ret = mkdir(name, 0700);
+	if (ret) {
+		if (errno != EEXIST) {
+			/* capture errno before stdio can overwrite it */
+			ret = -errno;
+			fprintf(stderr, _("could not create tmpdir: %s: %s\n"),
+					name, strerror(-ret));
+			goto out_cleanup;
+		}
+	}
+
+	/* loop creating directories until we get one in the right AG */
+	for (i = 0; i < xfd->fsgeom.agcount; i++) {
+		struct stat	st;
+
+		snprintf(name, PATH_MAX, "%s/.spaceman/dir%d", mnt, i);
+		ret = mkdir(name, 0700);
+		if (ret) {
+			if (errno != EEXIST) {
+				ret = -errno;
+				fprintf(stderr,
+					_("cannot create tmpdir: %s: %s\n"),
+					name, strerror(-ret));
+				goto out_cleanup_dir;
+			}
+		}
+		ret = lstat(name, &st);
+		if (ret) {
+			ret = -errno;
+			fprintf(stderr, _("cannot stat tmpdir: %s: %s\n"),
+				name, strerror(-ret));
+			rmdir(name);
+			goto out_cleanup_dir;
+		}
+		if (cvt_ino_to_agno(xfd, st.st_ino) == agno)
+			break;
+
+		/* remove directory in wrong AG */
+		rmdir(name);
+	}
+
+	if (i == xfd->fsgeom.agcount) {
+		/*
+		 * Nothing landed in the selected AG! Must have been skipped
+		 * because the AG is out of space.
+		 */
+		fprintf(stderr, _("Cannot create AG tmpdir.\n"));
+		ret = -ENOSPC;
+		goto out_cleanup_dir;
+	}
+
+	/* create tmpfile */
+	snprintf(name, PATH_MAX, "%s/.spaceman/dir%d/tmpfile.%d", mnt, i, getpid());
+	fd = open(name, O_CREAT|O_EXCL|O_RDWR, 0700);
+	if (fd < 0) {
+		/* fail here rather than returning success with fd == -1 */
+		ret = -errno;
+		fprintf(stderr, _("cannot create tmpfile: %s: %s\n"),
+			name, strerror(-ret));
+		goto out_cleanup_agdir;
+	}
+
+	tmpname = strdup(name);
+	if (!tmpname) {
+		ret = -ENOMEM;
+		fprintf(stderr, _("%s: cannot allocate tmpfile name.\n"),
+			progname);
+		close(fd);
+		unlink(name);
+		goto out_cleanup_agdir;
+	}
+
+	/* return name and fd */
+	(void)umask(mask);
+	*tmpfd = fd;
+	*tmpfile = tmpname;
+
+	return 0;
+
+out_cleanup_agdir:
+	/* best effort: the rmdir() result is ignored if the dir is not empty */
+	snprintf(name, PATH_MAX, "%s/.spaceman/dir%d", mnt, i);
+	rmdir(name);
+out_cleanup_dir:
+	snprintf(name, PATH_MAX, "%s/.spaceman", mnt);
+	rmdir(name);
+out_cleanup:
+	(void)umask(mask);
+	return ret;
+}
+
+/*
+ * Read the value of the extended attribute @name in namespace @attr_ns from
+ * the file referenced by @hdl into @attrbuf.  *attrlen is the buffer size on
+ * entry and is set from ops.am_length on return.  Returns 0 or negative errno.
+ *
+ * NOTE(review): the per-operation status in ops.am_error is not examined;
+ * confirm that attr_multi_by_handle() reports per-attr failures via its
+ * return value.
+ */
+static int
+get_attr(
+	void		*hdl,
+	size_t		hlen,
+	char		*name,
+	void		*attrbuf,
+	int		*attrlen,
+	int		attr_ns)
+{
+	struct xfs_attr_multiop	ops = {
+		.am_opcode	= ATTR_OP_GET,
+		.am_attrname	= name,
+		.am_attrvalue	= attrbuf,
+		.am_length	= *attrlen,
+		.am_flags	= attr_ns,
+	};
+	int		ret;
+
+	ret = attr_multi_by_handle(hdl, hlen, &ops, 1, 0);
+	if (ret < 0) {
+		ret = -errno;
+		fprintf(stderr, _("attr_multi_by_handle(GET): %s\n"),
+			strerror(-ret));
+		return ret;
+	}
+	*attrlen = ops.am_length;
+	return 0;
+}
+
+/*
+ * Create the extended attribute @name in namespace @attr_ns on the file
+ * referenced by @hdl with the @attrlen byte value in @attrbuf.  ATTR_CREATE is
+ * always set on the operation.  Returns 0 or a negative errno.
+ */
+static int
+set_attr(
+	void		*hdl,
+	size_t		hlen,
+	char		*name,
+	void		*attrbuf,
+	int		attrlen,
+	int		attr_ns)
+{
+	struct xfs_attr_multiop	ops = {
+		.am_opcode	= ATTR_OP_SET,
+		.am_attrname	= name,
+		.am_attrvalue	= attrbuf,
+		.am_length	= attrlen,
+		.am_flags	= ATTR_CREATE | attr_ns,
+	};
+	int		ret;
+
+	ret = attr_multi_by_handle(hdl, hlen, &ops, 1, 0);
+	if (ret < 0) {
+		ret = -errno;
+		fprintf(stderr, _("attr_multi_by_handle(SET): %s\n"),
+			strerror(-ret));
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Copy all the attributes from the original source file into the replacement
+ * destination.
+ *
+ * Oh the humanity of deprecated Irix compatible attr interfaces that are more
+ * functional and useful than their native Linux replacements!
+ *
+ * Returns 0 on success or a negative errno from the first failure.
+ */
+static int
+copy_attrs(
+	int			srcfd,
+	int			dstfd,
+	int			attr_ns)
+{
+	void			*shdl;
+	void			*dhdl;
+	size_t			shlen;
+	size_t			dhlen;
+	attrlist_cursor_t	cursor;
+	attrlist_t		*alist;
+	struct attrlist_ent	*ent;
+	char			alistbuf[XATTR_LIST_MAX];
+	char			attrbuf[XATTR_SIZE_MAX];
+	int			attrlen;
+	int			error;
+	int			i;
+
+	memset(&cursor, 0, sizeof(cursor));
+
+	/*
+	 * All this handle based stuff is hoop jumping to avoid:
+	 *
+	 * a) deprecated API warnings because attr_list, attr_get and attr_set
+	 *    have been deprecated hence throw compiler warnings; and
+	 *
+	 * b) listxattr() failing hard if there are more than 64kB worth of attr
+	 *    names on the inode so is unusable.
+	 *
+	 * That leaves libhandle as the only usable interface for iterating all
+	 * xattrs on an inode reliably. Lucky for us, libhandle is part of
+	 * xfsprogs, so this hoop jump isn't going to get ripped out from under
+	 * us any time soon.
+	 */
+	error = fd_to_handle(srcfd, (void **)&shdl, &shlen);
+	if (error) {
+		error = -errno;
+		fprintf(stderr, _("fd_to_handle(shdl): %s\n"),
+			strerror(-error));
+		return error;
+	}
+	error = fd_to_handle(dstfd, (void **)&dhdl, &dhlen);
+	if (error) {
+		error = -errno;
+		fprintf(stderr, _("fd_to_handle(dhdl): %s\n"),
+			strerror(-error));
+		goto out_free_shdl;
+	}
+
+	/* loop to iterate all xattrs */
+	error = attr_list_by_handle(shdl, shlen, alistbuf,
+				XATTR_LIST_MAX, attr_ns, &cursor);
+	if (error) {
+		error = -errno;
+		fprintf(stderr, _("attr_list_by_handle(shdl): %s\n"),
+			strerror(-error));
+	}
+	while (!error) {
+		alist = (attrlist_t *)alistbuf;
+
+		/*
+		 * We loop one attr at a time for initial implementation
+		 * simplicity. attr_multi_by_handle() can retrieve and set
+		 * multiple attrs in a single call, but that is more complex.
+		 * Get it working first, then optimise.
+		 */
+		for (i = 0; i < alist->al_count; i++) {
+			ent = ATTR_ENTRY(alist, i);
+
+			/* get xattr (val, len) from name */
+			attrlen = XATTR_SIZE_MAX;
+			error = get_attr(shdl, shlen, ent->a_name, attrbuf,
+						&attrlen, attr_ns);
+			if (error)
+				break;
+
+			/*
+			 * set xattr (val, len) to name; set_attr() adds
+			 * ATTR_CREATE itself.
+			 */
+			error = set_attr(dhdl, dhlen, ent->a_name, attrbuf,
+						attrlen, attr_ns);
+			if (error)
+				break;
+		}
+
+		/*
+		 * Stop on a failed copy before fetching the next batch, or the
+		 * next listing call would overwrite the error and hide it.
+		 */
+		if (error || !alist->al_more)
+			break;
+		error = attr_list_by_handle(shdl, shlen, alistbuf,
+				XATTR_LIST_MAX, attr_ns, &cursor);
+		if (error)
+			error = -errno;
+	}
+
+	free_handle(dhdl, dhlen);
+out_free_shdl:
+	free_handle(shdl, shlen);
+	return error;
+}
+
+/*
+ * scan the range of the new file for data that isn't in the destination AG
+ * and unshare it to create a new copy of it in the current target location
+ * of the new file.
+ */
+#define EXTENT_BATCH	32
+static int
+unshare_data(
+	struct xfs_fd	*xfd,
+	int		destfd,
+	xfs_agnumber_t	agno)
+{
+	int		ret;
+	struct fiemap	*fiemap;
+	int		done = 0;
+	int		fiemap_flags = FIEMAP_FLAG_SYNC;
+	int		i;
+	int		map_size;
+	__u64		last_logical = 0;	/* last extent offset handled */
+	off_t		range_end = -1LL;	/* mapping end*/
+
+	/* fiemap loop over extents */
+	map_size = sizeof(struct fiemap) +
+			(EXTENT_BATCH * sizeof(struct fiemap_extent));
+	fiemap = malloc(map_size);
+	if (!fiemap) {
+		fprintf(stderr, _("%s: malloc of %d bytes failed.\n"),
+			progname, map_size);
+		return -ENOMEM;
+	}
+
+	while (!done) {
+		memset(fiemap, 0, map_size);
+		fiemap->fm_flags = fiemap_flags;
+		fiemap->fm_start = last_logical;
+		fiemap->fm_length = range_end - last_logical;
+		fiemap->fm_extent_count = EXTENT_BATCH;
+
+		ret = ioctl(destfd, FS_IOC_FIEMAP, (unsigned long)fiemap);
+		if (ret < 0) {
+			ret = -errno;
+			fprintf(stderr, "%s: ioctl(FS_IOC_FIEMAP): %s\n",
+				progname, strerror(-ret));
+			goto out_free;
+		}
+
+		/* No more extents to map, exit */
+		if (!fiemap->fm_mapped_extents)
+			break;
+
+		for (i = 0; i < fiemap->fm_mapped_extents; i++) {
+			struct fiemap_extent	*extent;
+			xfs_agnumber_t		this_agno;
+
+			extent = &fiemap->fm_extents[i];
+			this_agno = cvt_daddr_to_agno(xfd,
+					cvt_btobbt(extent->fe_physical));
+
+			/*
+			 * If extent not in dst AG, unshare whole extent to
+			 * trigger reallocation of the extent to be local to
+			 * the current inode.
+			 */
+			if (this_agno != agno) {
+				ret = fallocate(destfd, FALLOC_FL_UNSHARE_RANGE,
+					extent->fe_logical, extent->fe_length);
+				if (ret) {
+					ret = -errno;
+					fprintf(stderr,
+						"%s: fallocate(UNSHARE): %s\n",
+						progname, strerror(-ret));
+					goto out_free;
+				}
+			}
+
+			last_logical = extent->fe_logical + extent->fe_length;
+
+			/* Kernel has told us there are no more extents */
+			if (extent->fe_flags & FIEMAP_EXTENT_LAST) {
+				done = 1;
+				break;
+			}
+		}
+	}
+	ret = 0;
+
+out_free:
+	/* every exit path funnels through here so the map is never leaked */
+	free(fiemap);
+	return ret;
+}
+
+/*
+ * Exchange the inodes at the two paths indicated after first ensuring that the
+ * owners, permissions and timestamps are set correctly in the tmpfile.
+ */
+static int
+exchange_inodes(
+	struct xfs_fd	*xfd,
+	int		tmpfd,
+	const char	*tmpfile,
+	const char	*path)
+{
+	struct timespec	ts[2];
+	struct stat	st;
+	int		ret;
+
+	ret = fstat(xfd->fd, &st);
+	if (ret)
+		return -errno;
+
+	/* set user ids */
+	ret = fchown(tmpfd, st.st_uid, st.st_gid);
+	if (ret)
+		return -errno;
+
+	/* set permissions */
+	ret = fchmod(tmpfd, st.st_mode);
+	if (ret)
+		return -errno;
+
+	/* set timestamps */
+	ts[0] = st.st_atim;
+	ts[1] = st.st_mtim;
+	ret = futimens(tmpfd, ts);
+	if (ret)
+		return -errno;
+
+	/* exchange the two inodes */
+	ret = renameat2(AT_FDCWD, tmpfile, AT_FDCWD, path, RENAME_EXCHANGE);
+	if (ret)
+		return -errno;
+	return 0;
+}
+
+/*
+ * Move the regular file at @path, open as xfd->fd, into AG @agno by building
+ * a replacement in a tmpfile created in that AG and then exchanging the two.
+ * Returns 0 on success or a negative errno.
+ */
+static int
+move_file_to_ag(
+	const char	*mnt,
+	const char	*path,
+	struct xfs_fd	*xfd,
+	xfs_agnumber_t	agno)
+{
+	int		ret;
+	int		tmpfd = -1;
+	char		*tmpfile = NULL;
+
+	fprintf(stderr, "move mnt %s, path %s, agno %d\n", mnt, path, agno);
+
+	/* create temporary file in agno */
+	ret = create_tmpfile(mnt, xfd, agno, &tmpfile, &tmpfd);
+	if (ret)
+		return ret;
+
+	/*
+	 * clone data to tempfile.  Convert the ioctl failure to an errno here;
+	 * testing for ret == -1 at out_cleanup would also match -EPERM
+	 * returned by the helpers below.
+	 */
+	ret = ioctl(tmpfd, FICLONE, xfd->fd);
+	if (ret) {
+		ret = -errno;
+		goto out_cleanup;
+	}
+
+	/* copy system attributes to tempfile */
+	ret = copy_attrs(xfd->fd, tmpfd, ATTR_ROOT);
+	if (ret)
+		goto out_cleanup;
+
+	/* copy user attributes to tempfile */
+	ret = copy_attrs(xfd->fd, tmpfd, 0);
+	if (ret)
+		goto out_cleanup;
+
+	/* unshare data to move it */
+	ret = unshare_data(xfd, tmpfd, agno);
+	if (ret)
+		goto out_cleanup;
+
+	/* swap the inodes over */
+	ret = exchange_inodes(xfd, tmpfd, tmpfile, path);
+
+out_cleanup:
+	/*
+	 * After a successful exchange the tmpfile name refers to the original
+	 * inode, so this unlink frees it; on failure it removes the tmpfile.
+	 */
+	close(tmpfd);
+	if (tmpfile)
+		unlink(tmpfile);
+	free(tmpfile);
+
+	return ret;
+}
+
+/*
+ * move_inode command: parse "-a agno" and move the currently open regular
+ * file into that AG.  Failures are reported via exitcode; the function
+ * itself returns 0 except on usage errors.
+ */
+static int
+move_inode_f(
+	int			argc,
+	char			**argv)
+{
+	void			*fshandle;
+	size_t			fshdlen;
+	xfs_agnumber_t		agno = 0;
+	struct stat		st;
+	int			ret;
+	int			c;
+
+	while ((c = getopt(argc, argv, "a:")) != EOF) {
+		switch (c) {
+		case 'a':
+			agno = cvt_u32(optarg, 10);
+			if (errno) {
+				fprintf(stderr, _("bad agno value %s\n"),
+					optarg);
+				return command_usage(&move_inode_cmd);
+			}
+			break;
+		default:
+			return command_usage(&move_inode_cmd);
+		}
+	}
+
+	if (optind != argc)
+		return command_usage(&move_inode_cmd);
+
+	if (agno >= file->xfd.fsgeom.agcount) {
+		fprintf(stderr,
+_("Destination AG %d does not exist. Filesystem only has %d AGs\n"),
+			agno, file->xfd.fsgeom.agcount);
+		exitcode = 1;
+		return 0;
+	}
+
+	/* this is so we can use fd_to_handle() later on */
+	ret = path_to_fshandle(file->fs_path.fs_dir, &fshandle, &fshdlen);
+	if (ret < 0) {
+		fprintf(stderr, _("Cannot get fshandle for mount %s: %s\n"),
+			file->fs_path.fs_dir, strerror(errno));
+		goto exit_fail;
+	}
+
+	ret = fstat(file->xfd.fd, &st);
+	if (ret) {
+		fprintf(stderr, _("stat(%s) failed: %s\n"),
+			file->name, strerror(errno));
+		goto exit_fail;
+	}
+
+	if (S_ISREG(st.st_mode)) {
+		ret = move_file_to_ag(file->fs_path.fs_dir, file->name,
+				&file->xfd, agno);
+	} else {
+		fprintf(stderr, _("Unsupported: %s is not a regular file.\n"),
+			file->name);
+		goto exit_fail;
+	}
+
+	if (ret) {
+		fprintf(stderr, _("Failed to move inode to AG %d: %s\n"),
+			agno, strerror(-ret));
+		goto exit_fail;
+	}
+	fshandle_destroy();
+	return 0;
+
+exit_fail:
+	fshandle_destroy();
+	exitcode = 1;
+	return 0;
+}
+
+static void
+move_inode_help(void)
+{
+	printf(_(
+"\n"
+"Physically move an inode into a new allocation group\n"
+"\n"
+" -a agno -- destination AG agno for the current open file\n"
+"\n"));
+
+}
+
+void
+move_inode_init(void)
+{
+	move_inode_cmd.name = "move_inode";
+	move_inode_cmd.altname = "mvino";
+	move_inode_cmd.cfunc = move_inode_f;
+	move_inode_cmd.argmin = 2;
+	move_inode_cmd.argmax = 2;
+	move_inode_cmd.args = "-a agno";
+	move_inode_cmd.flags = CMD_FLAG_ONESHOT;
+	move_inode_cmd.oneline = _("Move an inode into a new AG.");
+	move_inode_cmd.help = move_inode_help;
+
+	add_command(&move_inode_cmd);
+}
diff --git a/spaceman/space.h b/spaceman/space.h
index 509e923375f42f..96c3c356f13fec 100644
--- a/spaceman/space.h
+++ b/spaceman/space.h
@@ -38,5 +38,6 @@ extern void clearfree_init(void);
 #endif
 extern void info_init(void);
 extern void health_init(void);
+void move_inode_init(void);
 
 #endif /* XFS_SPACEMAN_SPACE_H_ */

From patchwork Tue Dec 31 23:46:40 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13924065 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0E42529415 for ; Tue, 31 Dec 2024 23:46:40 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688802; cv=none; b=Ds1dIvA6dyf1DCS5ljysqmSIXG35Bzp1n5RHOq7l9wQGkB5z2wPx6nE+J9xJu2A/zRNrAc3ohnGEUON8TXPQVMaijY2AOZG1S/L4glq1OjP5OU9xF7T/paCZ2NtLljLBFO2Ku5LktvaSkVkDFRAbaC0HPe8+m9B5Awn95xW+Xx4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688802; c=relaxed/simple; bh=IfOhOqc/kJ43To3u3e21xv6uM0GT8r3eB4uo0tqF76Y=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=QOXsNVzp9LHb0xEr0BSIiHsz1uZSscXEp9kYjPcrdR9DaIgOY4gurNqfLNsJzMTLbsQ1wZcKHDIk48skoedfr2AOFsewLtwWe4bSeGJNOptH56aVaRUpuUxESYo8Ctbl8OA1qfL39FrqXkbNEmsDA5d8JGqQEj/0nbCl7kQIQh0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ikNxpCmM; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ikNxpCmM" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 924E4C4CED2; Tue, 31 Dec 2024 23:46:40 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688800; bh=IfOhOqc/kJ43To3u3e21xv6uM0GT8r3eB4uo0tqF76Y=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=ikNxpCmM59sV2Bc+nbEvLw0GfU+bpO2lgO2E3+Y7aJVDhEvvGJJTFkbCvZDdEg/Vy +Vljz5SQQ95q52vBlgvZudUWb2Y8hxQu1i38pOzfzgcnI0PsCiZFWG4lotaDxvA37z 
zvw/YaEi3+pRu59k46g+N525y7nLz0XuMVNB8GKJIqxYbSTIgZVkU947NZhH7I+RcT HBFblOKaz07f7FOaJnmWNvfgxssinyJULx4uXsnOY/M4oHvAyIMMss8XV4rndnHBRZ efClJ0FfgJpVSouPTQB/4ESHMPxBMKAbBTV1mH/RewpjsWXuvmcL83S9gHJglUVb0O RxVMgZgrzJ8Zg== Date: Tue, 31 Dec 2024 15:46:40 -0800 Subject: [PATCH 07/11] spaceman: find owners of space in an AG From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: dchinner@redhat.com, linux-xfs@vger.kernel.org Message-ID: <173568777977.2709794.10470936364792801206.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner Before we can move inodes for a shrink operation, we have to find all the inodes that own space in the AG(s) we want to empty. This implementation uses FS_IOC_GETFSMAP on the assumption that filesystems to be shrunk have reverse mapping enabled as it is the only way to identify inode related metadata that userspace is unable to see or influence (e.g. BMBT blocks) that may be located in the specific AG. We can use GETFSMAP to identify both inodes to be moved (via XFS_FMR_OWN_INODES records) and inodes with just data and/or metadata to be moved. Once we have identified all the inodes to be moved, we have to map them to paths so that we can use renameat2() to exchange the directory entries pointing at the moved inode atomically. We also need to record inodes with hard links and all of the paths to the inode so that hard links can be recreated appropriately. This requires a directory tree walk to discover the paths (until parent pointers are a thing). Hence for filesystems that aren't reverse mapping enabled, we can eventually use this pass to discover inodes with visible data and metadata that need to be moved. 
As we resolve the paths to the inodes to be moved, output the information to stdout so that it can be acted upon by other utilities. This results in a command that acts similar to find but with a physical location filter rather than an inode metadata filter. Again, this is not meant to be an optimal implementation. It shouldn't suck, but there is plenty of scope for performance optimisation, especially with a multithreaded and/or async directory traversal/parent pointer path resolution process to hide access latencies. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: "Darrick J. Wong" --- libfrog/fsgeom.h | 19 ++ libfrog/radix-tree.c | 2 libfrog/radix-tree.h | 2 man/man8/xfs_spaceman.8 | 11 + spaceman/Makefile | 1 spaceman/find_owner.c | 481 +++++++++++++++++++++++++++++++++++++++++++++++ spaceman/init.c | 4 spaceman/space.h | 2 8 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 spaceman/find_owner.c diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h index b851b9bbf36a58..679046077cba84 100644 --- a/libfrog/fsgeom.h +++ b/libfrog/fsgeom.h @@ -97,6 +97,25 @@ cvt_ino_to_agino( return ino & ((1ULL << xfd->aginolog) - 1); } +/* Convert an AG block to an AG inode number. */ +static inline uint32_t +cvt_agbno_to_agino( + const struct xfs_fd *xfd, + xfs_agblock_t agbno) +{ + return agbno << xfd->inopblog; +} + +/* Calculate the number of inodes in a byte range */ +static inline uint32_t +cvt_b_to_inode_count( + const struct xfs_fd *xfd, + uint64_t bytes) +{ + return (bytes >> xfd->blocklog) << xfd->inopblog; +} + + /* * Convert a linear fs block offset number into bytes. 
This is the runtime * equivalent of XFS_FSB_TO_B, which means that it is /not/ for segmented fsbno diff --git a/libfrog/radix-tree.c b/libfrog/radix-tree.c index 261fc2487de97f..788d11612e290f 100644 --- a/libfrog/radix-tree.c +++ b/libfrog/radix-tree.c @@ -377,6 +377,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned int height, shift; struct radix_tree_node *slot; + ASSERT(tag < RADIX_TREE_MAX_TAGS); + height = root->height; if (index > radix_tree_maxindex(height)) return NULL; diff --git a/libfrog/radix-tree.h b/libfrog/radix-tree.h index 0a4e3bb4f9defc..73f41a9d902a26 100644 --- a/libfrog/radix-tree.h +++ b/libfrog/radix-tree.h @@ -28,7 +28,7 @@ do { \ } while (0) #ifdef RADIX_TREE_TAGS -#define RADIX_TREE_MAX_TAGS 2 +#define RADIX_TREE_MAX_TAGS 3 #endif int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index f898a8bbe840ea..6fef6949aa6c8b 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -41,6 +41,14 @@ .SH COMMANDS If the .B -v option is given, print what's happening every step of the way. +.TP +.BI "find_owner \-a agno" +Create an internal structure to map physical space in the given allocation +group to file paths. +This enables space reorganization on a mounted filesystem by enabling +us to find files. +Unclear why we can't just use FSMAP and BULKSTAT to open by handle. + .TP .BI "freesp [ \-dgrs ] [-a agno]... [ \-b | \-e bsize | \-h bsize | \-m factor ]" With no arguments, @@ -195,6 +203,9 @@ .SH COMMANDS .B print Display a list of all open files. .TP +.B resolve_owner +Resolves space in the filesystem to file paths, maybe? +.TP .B quit Exit .BR xfs_spaceman . 
diff --git a/spaceman/Makefile b/spaceman/Makefile index 9d080b67de9a22..b35ab1dbd2f440 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -11,6 +11,7 @@ HFILES = \ space.h CFILES = \ file.c \ + find_owner.c \ health.c \ info.c \ init.c \ diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c new file mode 100644 index 00000000000000..7a656d80d21217 --- /dev/null +++ b/spaceman/find_owner.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2017 Oracle. + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. + */ + +#include "libxfs.h" +#include +#include "libfrog/fsgeom.h" +#include "libfrog/radix-tree.h" +#include "command.h" +#include "init.h" +#include "libfrog/paths.h" +#include +#include "space.h" +#include "input.h" + +static cmdinfo_t find_owner_cmd; +static cmdinfo_t resolve_owner_cmd; + +#define NR_EXTENTS 128 + +static RADIX_TREE(inode_tree, 0); +#define MOVE_INODE 0 +#define MOVE_BLOCKS 1 +#define INODE_PATH 2 +int inode_count; +int inode_paths; + +static void +track_inode_chunks( + struct xfs_fd *xfd, + xfs_agnumber_t agno, + uint64_t physaddr, + uint64_t length) +{ + xfs_agblock_t agbno = cvt_b_to_agbno(xfd, physaddr); + uint64_t first_ino = cvt_agino_to_ino(xfd, agno, + cvt_agbno_to_agino(xfd, agbno)); + uint64_t num_inodes = cvt_b_to_inode_count(xfd, length); + int i; + + printf(_("AG %d\tInode Range to move: 0x%llx - 0x%llx (length 0x%llx)\n"), + agno, + (unsigned long long)first_ino, + (unsigned long long)first_ino + num_inodes - 1, + (unsigned long long)length); + + for (i = 0; i < num_inodes; i++) { + if (!radix_tree_lookup(&inode_tree, first_ino + i)) { + radix_tree_insert(&inode_tree, first_ino + i, + (void *)first_ino + i); + inode_count++; + } + radix_tree_tag_set(&inode_tree, first_ino + i, MOVE_INODE); + } +} + +static void +track_inode( + struct xfs_fd *xfd, + xfs_agnumber_t agno, + uint64_t owner, + uint64_t physaddr, + uint64_t length) +{ + if (radix_tree_tag_get(&inode_tree, owner, 
MOVE_BLOCKS)) + return; + + printf(_("AG %d\tInode 0x%llx: blocks to move to move: 0x%llx - 0x%llx\n"), + agno, + (unsigned long long)owner, + (unsigned long long)physaddr, + (unsigned long long)physaddr + length - 1); + if (!radix_tree_lookup(&inode_tree, owner)) { + radix_tree_insert(&inode_tree, owner, (void *)owner); + inode_count++; + } + radix_tree_tag_set(&inode_tree, owner, MOVE_BLOCKS); +} + +static void +scan_ag( + xfs_agnumber_t agno) +{ + struct fsmap_head *fsmap; + struct fsmap *extent; + struct fsmap *l, *h; + struct fsmap *p; + struct xfs_fd *xfd = &file->xfd; + int ret; + int i; + + fsmap = malloc(fsmap_sizeof(NR_EXTENTS)); + if (!fsmap) { + fprintf(stderr, _("%s: fsmap malloc failed.\n"), progname); + exitcode = 1; + return; + } + + memset(fsmap, 0, sizeof(*fsmap)); + fsmap->fmh_count = NR_EXTENTS; + l = fsmap->fmh_keys; + h = fsmap->fmh_keys + 1; + l->fmr_physical = cvt_agbno_to_b(xfd, agno, 0); + h->fmr_physical = cvt_agbno_to_b(xfd, agno + 1, 0); + l->fmr_device = h->fmr_device = file->fs_path.fs_datadev; + h->fmr_owner = ULLONG_MAX; + h->fmr_flags = UINT_MAX; + h->fmr_offset = ULLONG_MAX; + + while (true) { + printf("Inode count %d\n", inode_count); + ret = ioctl(xfd->fd, FS_IOC_GETFSMAP, fsmap); + if (ret < 0) { + fprintf(stderr, _("%s: FS_IOC_GETFSMAP [\"%s\"]: %s\n"), + progname, file->name, strerror(errno)); + free(fsmap); + exitcode = 1; + return; + } + + /* No more extents to map, exit */ + if (!fsmap->fmh_entries) + break; + + /* + * Walk the extents, ignore everything except inode chunks + * and inode owned blocks. + */ + for (i = 0, extent = fsmap->fmh_recs; + i < fsmap->fmh_entries; + i++, extent++) { + if (extent->fmr_flags & FMR_OF_SPECIAL_OWNER) { + if (extent->fmr_owner != XFS_FMR_OWN_INODES) + continue; + /* + * This extent contains inodes that need to be + * moved into another AG. Convert the extent to + * a range of inode numbers and track them all. 
+ */ + track_inode_chunks(xfd, agno, + extent->fmr_physical, + extent->fmr_length); + + continue; + } + + /* + * Extent is owned by an inode that may be located + * anywhere in the filesystem, not just this AG. + */ + track_inode(xfd, agno, extent->fmr_owner, + extent->fmr_physical, + extent->fmr_length); + } + + p = &fsmap->fmh_recs[fsmap->fmh_entries - 1]; + if (p->fmr_flags & FMR_OF_LAST) + break; + fsmap_advance(fsmap); + } + + free(fsmap); +} + +/* + * find inodes that own physical space in a given AG. + */ +static int +find_owner_f( + int argc, + char **argv) +{ + xfs_agnumber_t agno = -1; + int c; + + while ((c = getopt(argc, argv, "a:")) != EOF) { + switch (c) { + case 'a': + agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad agno value %s\n"), + optarg); + return command_usage(&find_owner_cmd); + } + break; + default: + return command_usage(&find_owner_cmd); + } + } + + if (optind != argc) + return command_usage(&find_owner_cmd); + + if (agno == -1 || agno >= file->xfd.fsgeom.agcount) { + fprintf(stderr, +_("Destination AG %d does not exist. Filesystem only has %d AGs\n"), + agno, file->xfd.fsgeom.agcount); + exitcode = 1; + return 0; + } + + /* + * Check that rmap is enabled so that GETFSMAP is actually useful. + */ + if (!(file->xfd.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_RMAPBT)) { + fprintf(stderr, +_("Filesystem at %s does not have reverse mapping enabled. 
Aborting.\n"), + file->fs_path.fs_dir); + exitcode = 1; + return 0; + } + + scan_ag(agno); + return 0; +} + +static void +find_owner_help(void) +{ + printf(_( +"\n" +"Find inodes owning physical blocks in a given AG.\n" +"\n" +" -a agno -- Scan the given AG agno.\n" +"\n")); + +} + +void +find_owner_init(void) +{ + find_owner_cmd.name = "find_owner"; + find_owner_cmd.altname = "fown"; + find_owner_cmd.cfunc = find_owner_f; + find_owner_cmd.argmin = 2; + find_owner_cmd.argmax = 2; + find_owner_cmd.args = "-a agno"; + find_owner_cmd.flags = CMD_FLAG_ONESHOT; + find_owner_cmd.oneline = _("Find inodes owning physical blocks in a given AG"); + find_owner_cmd.help = find_owner_help; + + add_command(&find_owner_cmd); +} + +/* + * for each dirent we get returned, look up the inode tree to see if it is an + * inode we need to process. If it is, then replace the entry in the tree with + * a structure containing the current path and mark the entry as resolved. + */ +struct inode_path { + uint64_t ino; + struct list_head path_list; + uint32_t link_count; + char path[1]; +}; + +static int +resolve_owner_cb( + const char *path, + const struct stat *stat, + int status, + struct FTW *data) +{ + struct inode_path *ipath, *slot_ipath; + int pathlen; + void **slot; + + /* + * Lookup the slot rather than the entry so we can replace the contents + * without another lookup later on. + */ + slot = radix_tree_lookup_slot(&inode_tree, stat->st_ino); + if (!slot || *slot == NULL) + return 0; + + /* Could not get stat data? Fail! */ + if (status == FTW_NS) { + fprintf(stderr, +_("Failed to obtain stat(2) information from path %s. Aborting\n"), + path); + return -EPERM; + } + + /* Allocate a new inode path and record the path in it. 
*/ + pathlen = strlen(path); + ipath = calloc(1, sizeof(*ipath) + pathlen + 1); + if (!ipath) { + fprintf(stderr, +_("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), + path, stat->st_ino, strerror(ENOMEM)); + return -ENOMEM; + } + INIT_LIST_HEAD(&ipath->path_list); + memcpy(&ipath->path[0], path, pathlen); + ipath->ino = stat->st_ino; + + /* + * If the slot contains the inode number we just looked up, then we + * haven't recorded a path for it yet. If that is the case, we just + * set the link count of the path to 1 and replace the slot contents + * with our new_ipath. + */ + if (stat->st_ino == (uint64_t)*slot) { + ipath->link_count = 1; + *slot = ipath; + radix_tree_tag_set(&inode_tree, stat->st_ino, INODE_PATH); + inode_paths++; + return 0; + } + + /* + * Multiple hard links to this inode. The slot already contains an + * ipath pointer, so we add the new ipath to the tail of the list held + * by the slot's ipath and bump the link count of the slot's ipath to + * keep track of how many hard links the inode has. + */ + slot_ipath = *slot; + slot_ipath->link_count++; + list_add_tail(&ipath->path_list, &slot_ipath->path_list); + return 0; +} + +/* + * This should be parallelised - pass subdirs off to a work queue, have the + * work queue processes subdirs, queueing more subdirs to work on. + */ +static int +walk_mount( + const char *mntpt) +{ + int ret; + + ret = nftw(mntpt, resolve_owner_cb, + 100, FTW_PHYS | FTW_MOUNT | FTW_DEPTH); + if (ret) + return -errno; + return 0; +} + +static int +list_inode_paths(void) +{ + struct inode_path *ipath; + uint64_t idx = 0; + int ret; + + do { + bool move_blocks; + bool move_inode; + + ret = radix_tree_gang_lookup_tag(&inode_tree, (void **)&ipath, + idx, 1, INODE_PATH); + if (!ret) + break; + idx = ipath->ino + 1; + + /* Grab status tags and remove from tree. 
*/ + move_blocks = radix_tree_tag_get(&inode_tree, ipath->ino, + MOVE_BLOCKS); + move_inode = radix_tree_tag_get(&inode_tree, ipath->ino, + MOVE_INODE); + radix_tree_delete(&inode_tree, ipath->ino); + + /* Print the initial path with inode number and state. */ + printf("0x%.16llx\t%s\t%s\t%8d\t%s\n", + (unsigned long long)ipath->ino, + move_blocks ? "BLOCK" : "---", + move_inode ? "INODE" : "---", + ipath->link_count, ipath->path); + ipath->link_count--; + + /* Walk all the hard link paths and emit them. */ + while (!list_empty(&ipath->path_list)) { + struct inode_path *hpath; + + hpath = list_first_entry(&ipath->path_list, + struct inode_path, path_list); + list_del(&hpath->path_list); + ipath->link_count--; + + printf("\t\t\t\t\t%s\n", hpath->path); + } + if (ipath->link_count) { + printf(_("Link count anomaly: %d paths left over\n"), + ipath->link_count); + } + free(ipath); + } while (true); + + /* + * Any inodes remaining in the tree at this point indicate inodes whose + * paths were not found. This will be unlinked but still open inodes or + * lost inodes due to corruptions. Either way, a shrink will not succeed + * until these inodes are removed from the filesystem. + */ + idx = 0; + do { + uint64_t ino; + + + ret = radix_tree_gang_lookup(&inode_tree, (void **)&ino, idx, 1); + if (!ret) { + if (idx != 0) + ret = -EBUSY; + break; + } + idx = ino + 1; + printf(_("No path found for inode 0x%llx!\n"), + (unsigned long long)ino); + radix_tree_delete(&inode_tree, ino); + } while (true); + + return ret; +} + +/* + * Resolve inode numbers to paths via a directory tree walk. + */ +static int +resolve_owner_f( + int argc, + char **argv) +{ + int ret; + + if (!inode_tree.rnode) { + fprintf(stderr, +_("Inode list has not been populated. 
No inodes to resolve.\n")); + return 0; + } + + ret = walk_mount(file->fs_path.fs_dir); + if (ret) { + fprintf(stderr, +_("Failed to resolve all paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + exitcode = 1; + return 0; + } + + ret = list_inode_paths(); + if (ret) { + fprintf(stderr, +_("Failed to list all resolved paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + exitcode = 1; + return 0; + } + return 0; +} + +static void +resolve_owner_help(void) +{ + printf(_( +"\n" +"Resolve inodes owning physical blocks in a given AG.\n" +"This requires the find_owner command to be run first to populate the table\n" +"of inodes that need to have their paths resolved.\n" +"\n")); + +} + +void +resolve_owner_init(void) +{ + resolve_owner_cmd.name = "resolve_owner"; + resolve_owner_cmd.altname = "rown"; + resolve_owner_cmd.cfunc = resolve_owner_f; + resolve_owner_cmd.argmin = 0; + resolve_owner_cmd.argmax = 0; + resolve_owner_cmd.args = ""; + resolve_owner_cmd.flags = CMD_FLAG_ONESHOT; + resolve_owner_cmd.oneline = _("Resolve patches to inodes owning physical blocks in a given AG"); + resolve_owner_cmd.help = resolve_owner_help; + + add_command(&resolve_owner_cmd); +} diff --git a/spaceman/init.c b/spaceman/init.c index dbeebcf97b9fb2..8b0af14e566dc8 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -10,6 +10,7 @@ #include "input.h" #include "init.h" #include "libfrog/paths.h" +#include "libfrog/radix-tree.h" #include "space.h" char *progname; @@ -37,6 +38,8 @@ init_commands(void) health_init(); clearfree_init(); move_inode_init(); + find_owner_init(); + resolve_owner_init(); } static int @@ -71,6 +74,7 @@ init( setlocale(LC_ALL, ""); bindtextdomain(PACKAGE, LOCALEDIR); textdomain(PACKAGE); + radix_tree_init(); fs_table_initialise(0, NULL, 0, NULL); while ((c = getopt(argc, argv, "c:p:V")) != EOF) { diff --git a/spaceman/space.h b/spaceman/space.h index 96c3c356f13fec..cffb1882153a18 100644 --- a/spaceman/space.h +++ 
b/spaceman/space.h @@ -39,5 +39,7 @@ extern void clearfree_init(void); extern void info_init(void); extern void health_init(void); void move_inode_init(void); +void find_owner_init(void); +void resolve_owner_init(void); #endif /* XFS_SPACEMAN_SPACE_H_ */ From patchwork Tue Dec 31 23:46:55 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13924066 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5535D29415 for ; Tue, 31 Dec 2024 23:46:56 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688816; cv=none; b=lOmmQApyXKuiaDb1SKBCSSHZtSuCG4mXP5cbWyzLFK1Veq9uXK5To2yxhLOB9DwYWOe0dq1DX8WhtjD3tH+eaUWcXSi+2ad4pzAH7gClSJCTOkAlwwWvNzkL8wMtC5Ui2P4+Wlzn2j56Qcq4WieNnrrRi3x1u96ofVdXZ5fbOeY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688816; c=relaxed/simple; bh=Bu7yhGg97khaNKSU1dyKpsfZL2ifbUjEhTpf9bSXmHU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=VhgU9PeKBGdqr3axfz1pMzAJUFwlmHqfzDdvu8+8SUuIjMVbyrB3lyecLkHTaKkCxKDMYiSMNWc1O/RsAy1TK3WMzpqQ17tjLo5cVZUHSMxVtUo3UuHDb2nVEVbXakrFXt7A9s4Jtjku1uf/KE5raCmIlYSwo/ND548karkCF9E= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=UqND8uuE; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="UqND8uuE" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 30BF9C4CED2; Tue, 31 Dec 2024 23:46:56 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; 
c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688816; bh=Bu7yhGg97khaNKSU1dyKpsfZL2ifbUjEhTpf9bSXmHU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=UqND8uuEGBXiUeG/S6SXKvhx11g8kZzcYy4ClHNWmdJ0nOhogVXbdpUr+0aGTgOBj 66bQC2bkrNKXZJmegQicWVMrpaLnbS/199HhIg+8dBiskM0DAs1wHgunusf1kFcZrh 5+9ugHCjbvMem5aU1FOXSEGCB0wC1NOyffF92yUjOnib1X775B8EpGLrgyh6fJDYRL DUxtrijdxnB1LL575fApLiPJGYhbDFYZ47+/DhYxGDh9XnxqKcAH5V+8BoXjNHm0Gu 4lxMseQlNZ1XY0v2MPMsqRpTCfRZ2Ku6S+cJCzdjh/xmyM6Qisw//SXOCgVcpQicY7 Z9D0w247GuVQA== Date: Tue, 31 Dec 2024 15:46:55 -0800 Subject: [PATCH 08/11] xfs_spaceman: wrap radix tree accesses in find_owner.c From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <173568777993.2709794.5469351937184897707.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Wrap the raw radix tree accesses here so that we can provide an alternate implementation on platforms where radix tree indices cannot store a full 64-bit inode number. Signed-off-by: "Darrick J. 
Wong" --- spaceman/Makefile | 1 spaceman/find_owner.c | 76 +++++++++------------------------ spaceman/relocation.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++ spaceman/relocation.h | 46 ++++++++++++++++++++ 4 files changed, 183 insertions(+), 54 deletions(-) create mode 100644 spaceman/relocation.c create mode 100644 spaceman/relocation.h diff --git a/spaceman/Makefile b/spaceman/Makefile index b35ab1dbd2f440..8980208285f610 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -17,6 +17,7 @@ CFILES = \ init.c \ move_inode.c \ prealloc.c \ + relocation.c \ trim.c LSRCFILES = xfs_info.sh diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c index 7a656d80d21217..80b239f9ac5de8 100644 --- a/spaceman/find_owner.c +++ b/spaceman/find_owner.c @@ -15,19 +15,13 @@ #include #include "space.h" #include "input.h" +#include "relocation.h" static cmdinfo_t find_owner_cmd; static cmdinfo_t resolve_owner_cmd; #define NR_EXTENTS 128 -static RADIX_TREE(inode_tree, 0); -#define MOVE_INODE 0 -#define MOVE_BLOCKS 1 -#define INODE_PATH 2 -int inode_count; -int inode_paths; - static void track_inode_chunks( struct xfs_fd *xfd, @@ -39,7 +33,7 @@ track_inode_chunks( uint64_t first_ino = cvt_agino_to_ino(xfd, agno, cvt_agbno_to_agino(xfd, agbno)); uint64_t num_inodes = cvt_b_to_inode_count(xfd, length); - int i; + uint64_t i; printf(_("AG %d\tInode Range to move: 0x%llx - 0x%llx (length 0x%llx)\n"), agno, @@ -47,14 +41,8 @@ track_inode_chunks( (unsigned long long)first_ino + num_inodes - 1, (unsigned long long)length); - for (i = 0; i < num_inodes; i++) { - if (!radix_tree_lookup(&inode_tree, first_ino + i)) { - radix_tree_insert(&inode_tree, first_ino + i, - (void *)first_ino + i); - inode_count++; - } - radix_tree_tag_set(&inode_tree, first_ino + i, MOVE_INODE); - } + for (i = 0; i < num_inodes; i++) + set_reloc_iflag(first_ino + i, MOVE_INODE); } static void @@ -65,7 +53,7 @@ track_inode( uint64_t physaddr, uint64_t length) { - if (radix_tree_tag_get(&inode_tree, 
owner, MOVE_BLOCKS)) + if (test_reloc_iflag(owner, MOVE_BLOCKS)) return; printf(_("AG %d\tInode 0x%llx: blocks to move to move: 0x%llx - 0x%llx\n"), @@ -73,11 +61,8 @@ track_inode( (unsigned long long)owner, (unsigned long long)physaddr, (unsigned long long)physaddr + length - 1); - if (!radix_tree_lookup(&inode_tree, owner)) { - radix_tree_insert(&inode_tree, owner, (void *)owner); - inode_count++; - } - radix_tree_tag_set(&inode_tree, owner, MOVE_BLOCKS); + + set_reloc_iflag(owner, MOVE_BLOCKS); } static void @@ -111,7 +96,7 @@ scan_ag( h->fmr_offset = ULLONG_MAX; while (true) { - printf("Inode count %d\n", inode_count); + printf("Inode count %llu\n", get_reloc_count()); ret = ioctl(xfd->fd, FS_IOC_GETFSMAP, fsmap); if (ret < 0) { fprintf(stderr, _("%s: FS_IOC_GETFSMAP [\"%s\"]: %s\n"), @@ -245,18 +230,6 @@ find_owner_init(void) add_command(&find_owner_cmd); } -/* - * for each dirent we get returned, look up the inode tree to see if it is an - * inode we need to process. If it is, then replace the entry in the tree with - * a structure containing the current path and mark the entry as resolved. - */ -struct inode_path { - uint64_t ino; - struct list_head path_list; - uint32_t link_count; - char path[1]; -}; - static int resolve_owner_cb( const char *path, @@ -266,14 +239,14 @@ resolve_owner_cb( { struct inode_path *ipath, *slot_ipath; int pathlen; - void **slot; + struct inode_path **slot; /* * Lookup the slot rather than the entry so we can replace the contents * without another lookup later on. */ - slot = radix_tree_lookup_slot(&inode_tree, stat->st_ino); - if (!slot || *slot == NULL) + slot = get_reloc_ipath_slot(stat->st_ino); + if (!slot) return 0; /* Could not get stat data? Fail! */ @@ -303,11 +276,10 @@ _("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), * set the link count of the path to 1 and replace the slot contents * with our new_ipath. 
*/ - if (stat->st_ino == (uint64_t)*slot) { + if (*slot == UNLINKED_IPATH) { ipath->link_count = 1; *slot = ipath; - radix_tree_tag_set(&inode_tree, stat->st_ino, INODE_PATH); - inode_paths++; + set_reloc_iflag(stat->st_ino, INODE_PATH); return 0; } @@ -351,18 +323,15 @@ list_inode_paths(void) bool move_blocks; bool move_inode; - ret = radix_tree_gang_lookup_tag(&inode_tree, (void **)&ipath, - idx, 1, INODE_PATH); - if (!ret) + ipath = get_next_reloc_ipath(idx); + if (!ipath) break; idx = ipath->ino + 1; /* Grab status tags and remove from tree. */ - move_blocks = radix_tree_tag_get(&inode_tree, ipath->ino, - MOVE_BLOCKS); - move_inode = radix_tree_tag_get(&inode_tree, ipath->ino, - MOVE_INODE); - radix_tree_delete(&inode_tree, ipath->ino); + move_blocks = test_reloc_iflag(ipath->ino, MOVE_BLOCKS); + move_inode = test_reloc_iflag(ipath->ino, MOVE_INODE); + forget_reloc_ino(ipath->ino); /* Print the initial path with inode number and state. */ printf("0x%.16llx\t%s\t%s\t%8d\t%s\n", @@ -400,9 +369,8 @@ list_inode_paths(void) do { uint64_t ino; - - ret = radix_tree_gang_lookup(&inode_tree, (void **)&ino, idx, 1); - if (!ret) { + ino = get_next_reloc_unlinked(idx); + if (!ino) { if (idx != 0) ret = -EBUSY; break; @@ -410,7 +378,7 @@ list_inode_paths(void) idx = ino + 1; printf(_("No path found for inode 0x%llx!\n"), (unsigned long long)ino); - radix_tree_delete(&inode_tree, ino); + forget_reloc_ino(ino); } while (true); return ret; @@ -426,7 +394,7 @@ resolve_owner_f( { int ret; - if (!inode_tree.rnode) { + if (!is_reloc_populated()) { fprintf(stderr, _("Inode list has not been populated. No inodes to resolve.\n")); return 0; diff --git a/spaceman/relocation.c b/spaceman/relocation.c new file mode 100644 index 00000000000000..7c7d9a2b4b236f --- /dev/null +++ b/spaceman/relocation.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. 
+ */ + +#include "libxfs.h" +#include "libfrog/fsgeom.h" +#include "libfrog/radix-tree.h" +#include "libfrog/paths.h" +#include "command.h" +#include "init.h" +#include "space.h" +#include "input.h" +#include "relocation.h" +#include "handle.h" + +static unsigned long long inode_count; +static unsigned long long inode_paths; + +unsigned long long +get_reloc_count(void) +{ + return inode_count; +} + +static RADIX_TREE(relocation_data, 0); + +bool +is_reloc_populated(void) +{ + return relocation_data.rnode != NULL; +} + +bool +test_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + return radix_tree_tag_get(&relocation_data, ino, flag); +} + +void +set_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + if (!radix_tree_lookup(&relocation_data, ino)) { + radix_tree_insert(&relocation_data, ino, UNLINKED_IPATH); + if (flag != INODE_PATH) + inode_count++; + } + if (flag == INODE_PATH) + inode_paths++; + + radix_tree_tag_set(&relocation_data, ino, flag); +} + +struct inode_path * +get_next_reloc_ipath( + uint64_t ino) +{ + struct inode_path *ipath; + int ret; + + ret = radix_tree_gang_lookup_tag(&relocation_data, (void **)&ipath, + ino, 1, INODE_PATH); + if (!ret) + return NULL; + return ipath; +} + +uint64_t +get_next_reloc_unlinked( + uint64_t ino) +{ + uint64_t next_ino; + int ret; + + ret = radix_tree_gang_lookup(&relocation_data, (void **)&next_ino, ino, + 1); + if (!ret) + return 0; + return next_ino; +} + +/* + * Return a pointer to a pointer where the caller can read or write a pointer + * to an inode path structure. + * + * The pointed-to pointer will be set to UNLINKED_IPATH if there is no ipath + * associated with this inode but the inode has been flagged for relocation. + * + * Returns NULL if the inode is not flagged for relocation. 
+ */ +struct inode_path ** +get_reloc_ipath_slot( + uint64_t ino) +{ + struct inode_path **slot; + + slot = (struct inode_path **)radix_tree_lookup_slot(&relocation_data, + ino); + if (!slot || *slot == NULL) + return NULL; + return slot; +} + +void +forget_reloc_ino( + uint64_t ino) +{ + radix_tree_delete(&relocation_data, ino); +} diff --git a/spaceman/relocation.h b/spaceman/relocation.h new file mode 100644 index 00000000000000..f05a871915da42 --- /dev/null +++ b/spaceman/relocation.h @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Red Hat, Inc. + * All Rights Reserved. + */ +#ifndef XFS_SPACEMAN_RELOCATION_H_ +#define XFS_SPACEMAN_RELOCATION_H_ + +bool is_reloc_populated(void); +unsigned long long get_reloc_count(void); + +/* + * Tags for the relocation_data tree that indicate what it contains and the + * discovery information that needed to be stored. + */ +#define MOVE_INODE 0 +#define MOVE_BLOCKS 1 +#define INODE_PATH 2 + +bool test_reloc_iflag(uint64_t ino, unsigned int flag); +void set_reloc_iflag(uint64_t ino, unsigned int flag); +struct inode_path *get_next_reloc_ipath(uint64_t ino); +uint64_t get_next_reloc_unlinked(uint64_t ino); +struct inode_path **get_reloc_ipath_slot(uint64_t ino); +void forget_reloc_ino(uint64_t ino); + +/* + * When the entry in the relocation_data tree is tagged with INODE_PATH, the + * entry contains a structure that tracks the discovered paths to the inode. If + * the inode has multiple hard links, then we chain each individual path found + * via the path_list and record the number of paths in the link_count entry. + */ +struct inode_path { + uint64_t ino; + struct list_head path_list; + uint32_t link_count; + char path[1]; +}; + +/* + * Sentinel value for inodes that we have to move but haven't yet found a path + * to. 
+ */ +#define UNLINKED_IPATH ((struct inode_path *)1) + +#endif /* XFS_SPACEMAN_RELOCATION_H_ */ From patchwork Tue Dec 31 23:47:11 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13924067 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 435161B0425 for ; Tue, 31 Dec 2024 23:47:11 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688832; cv=none; b=piYMfNZKpTd4Jl/g32B5qVTOK9255SnJc+ZcBQs93WAK8mcxwql1/IvrjO3LeDORBiHztfd7zzVOHcQv/Z8QVpuhDIh1SsWwIuZs8oG621zDPpPt8ryGhVbMec4QZXGh7/9lSODR1dNO63iToz0BuKJOldMqHyBbgN9OdsXs4PY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688832; c=relaxed/simple; bh=nGZ/kamyUCLHyi1ECY6lZfxmMHTKO1Gup9Zj5nSqy8k=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=Hba2fJBpYn7/KuwMHkInf8ccHQPCQ0JaqtyYvdRBI1Gr0nueJuWR1iJWbgA9ey8kLXXX4DMMZNUDJL4ihnCW0OCc5IPrtcByTOt195v+VAnL/NZZxEY5UXNSfbB7ty3p4iODhpvZlQAQc4xZnsxlm61r5VHr3bb3KkLTSHbn0Yc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=SZHyPBou; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="SZHyPBou" Received: by smtp.kernel.org (Postfix) with ESMTPSA id C2907C4CED2; Tue, 31 Dec 2024 23:47:11 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688831; bh=nGZ/kamyUCLHyi1ECY6lZfxmMHTKO1Gup9Zj5nSqy8k=; 
h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=SZHyPBoutiinawu/S0l9guhpypHzS+5HgVkgD9pQJf7Gsuie/ilktEIlGwWjjJj0K BtBS6o61N01y9IZWq6ecu34rBbAUZPVIyNwTStZjHkBFFLuM/pPbdEHC8avaLHQvwM qdM4SZn84ttqnLI7Tm1KruoLvJHhcF37OGLQ636AVAkjPiGgrzOjjdSJUGtJM0wpFP ZH3Td92jSscoQhI/Pn4QOSoz5wSIg0JX2Ci8IB84C5Bimfm1GvLBs0JTqXV9U8+oPd PdN0Mvmdu3VaFHAgKE7pwDdnInddNHvzhOn2RuO9K0zhw77Og3PD5Pzs/HetVkB15M bI5qjw1u2SrbA== Date: Tue, 31 Dec 2024 15:47:11 -0800 Subject: [PATCH 09/11] xfs_spaceman: port relocation structure to 32-bit systems From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <173568778008.2709794.12371752300604120680.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong We can't use the radix tree to store relocation information on 32-bit systems because unsigned longs are not large enough to hold 64-bit inodes. Use an avl64 tree instead. Signed-off-by: "Darrick J. 
Wong" --- configure.ac | 1 include/builddefs.in | 1 m4/package_libcdev.m4 | 20 +++++ spaceman/Makefile | 4 + spaceman/relocation.c | 203 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 229 insertions(+) diff --git a/configure.ac b/configure.ac index 224d1d3930bf2f..1f7fec838e1239 100644 --- a/configure.ac +++ b/configure.ac @@ -212,6 +212,7 @@ fi AC_MANUAL_FORMAT AC_HAVE_LIBURCU_ATOMIC64 +AC_USE_RADIX_TREE_FOR_INUMS AC_CONFIG_FILES([include/builddefs]) AC_OUTPUT diff --git a/include/builddefs.in b/include/builddefs.in index ac43b6412c8cbb..bb022c36627a72 100644 --- a/include/builddefs.in +++ b/include/builddefs.in @@ -114,6 +114,7 @@ CROND_DIR = @crond_dir@ HAVE_UDEV = @have_udev@ UDEV_RULE_DIR = @udev_rule_dir@ HAVE_LIBURCU_ATOMIC64 = @have_liburcu_atomic64@ +USE_RADIX_TREE_FOR_INUMS = @use_radix_tree_for_inums@ GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall # -Wbitwise -Wno-transparent-union -Wno-old-initializer -Wno-decl diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4 index 4ef7e8f67a3ba6..9e48273250244c 100644 --- a/m4/package_libcdev.m4 +++ b/m4/package_libcdev.m4 @@ -255,3 +255,23 @@ AC_DEFUN([AC_PACKAGE_CHECK_LTO], AC_SUBST(lto_cflags) AC_SUBST(lto_ldflags) ]) + +# +# Check if the radix tree index (unsigned long) is large enough to hold a +# 64-bit inode number +# +AC_DEFUN([AC_USE_RADIX_TREE_FOR_INUMS], + [ AC_MSG_CHECKING([if radix tree can store XFS inums]) + AC_LINK_IFELSE([AC_LANG_PROGRAM([[ +#include +#include +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + ]], [[ + typedef uint64_t xfs_ino_t; + + BUILD_BUG_ON(sizeof(unsigned long) < sizeof(xfs_ino_t)); + return 0; + ]])],[use_radix_tree_for_inums=yes + AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)]) + AC_SUBST(use_radix_tree_for_inums) + ]) diff --git a/spaceman/Makefile b/spaceman/Makefile index 8980208285f610..d9d55245ffc47a 100644 --- a/spaceman/Makefile +++ b/spaceman/Makefile @@ -33,6 +33,10 @@ ifeq ($(HAVE_GETFSMAP),yes) CFILES += 
freesp.c clearfree.c endif +ifeq ($(USE_RADIX_TREE_FOR_INUMS),yes) +LCFLAGS += -DUSE_RADIX_TREE_FOR_INUMS +endif + default: depend $(LTCOMMAND) include $(BUILDRULES) diff --git a/spaceman/relocation.c b/spaceman/relocation.c index 7c7d9a2b4b236f..1c0db6a1dab465 100644 --- a/spaceman/relocation.c +++ b/spaceman/relocation.c @@ -6,7 +6,11 @@ #include "libxfs.h" #include "libfrog/fsgeom.h" +#ifdef USE_RADIX_TREE_FOR_INUMS #include "libfrog/radix-tree.h" +#else +#include "libfrog/avl64.h" +#endif /* USE_RADIX_TREE_FOR_INUMS */ #include "libfrog/paths.h" #include "command.h" #include "init.h" @@ -24,6 +28,7 @@ get_reloc_count(void) return inode_count; } +#ifdef USE_RADIX_TREE_FOR_INUMS static RADIX_TREE(relocation_data, 0); bool @@ -112,3 +117,201 @@ forget_reloc_ino( { radix_tree_delete(&relocation_data, ino); } +#else +struct reloc_node { + struct avl64node node; + uint64_t ino; + struct inode_path *ipath; + unsigned int flags; +}; + +static uint64_t +reloc_start( + struct avl64node *node) +{ + struct reloc_node *rln; + + rln = container_of(node, struct reloc_node, node); + return rln->ino; +} + +static uint64_t +reloc_end( + struct avl64node *node) +{ + struct reloc_node *rln; + + rln = container_of(node, struct reloc_node, node); + return rln->ino + 1; +} + +static struct avl64ops reloc_ops = { + reloc_start, + reloc_end, +}; + +static struct avl64tree_desc relocation_data = { + .avl_ops = &reloc_ops, +}; + +bool +is_reloc_populated(void) +{ + return relocation_data.avl_firstino != NULL; +} + +static inline struct reloc_node * +reloc_lookup( + uint64_t ino) +{ + avl64node_t *node; + + node = avl64_find(&relocation_data, ino); + if (!node) + return NULL; + + return container_of(node, struct reloc_node, node); +} + +static inline struct reloc_node * +reloc_insert( + uint64_t ino) +{ + struct reloc_node *rln; + avl64node_t *node; + + rln = malloc(sizeof(struct reloc_node)); + if (!rln) + return NULL; + + rln->node.avl_nextino = NULL; + rln->ino = ino; + rln->ipath = 
UNLINKED_IPATH; + rln->flags = 0; + + node = avl64_insert(&relocation_data, &rln->node); + if (node == NULL) { + free(rln); + return NULL; + } + + return rln; +} + +bool +test_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + struct reloc_node *rln; + + rln = reloc_lookup(ino); + if (!rln) + return false; + + return rln->flags & flag; +} + +void +set_reloc_iflag( + uint64_t ino, + unsigned int flag) +{ + struct reloc_node *rln; + + rln = reloc_lookup(ino); + if (!rln) { + rln = reloc_insert(ino); + if (!rln) + abort(); + if (flag != INODE_PATH) + inode_count++; + } + if (flag == INODE_PATH) + inode_paths++; + + rln->flags |= flag; +} + +#define avl_for_each_range_safe(pos, n, l, first, last) \ + for (pos = (first), n = pos->avl_nextino, l = (last)->avl_nextino; \ + pos != (l); \ + pos = n, n = pos ? pos->avl_nextino : NULL) + +struct inode_path * +get_next_reloc_ipath( + uint64_t ino) +{ + struct avl64node *firstn; + struct avl64node *lastn; + struct avl64node *pos; + struct avl64node *n; + struct avl64node *l; + struct reloc_node *rln; + + avl64_findranges(&relocation_data, ino - 1, -1ULL, &firstn, &lastn); + if (firstn == NULL && lastn == NULL) + return NULL; + + avl_for_each_range_safe(pos, n, l, firstn, lastn) { + rln = container_of(pos, struct reloc_node, node); + + if (rln->flags & INODE_PATH) + return rln->ipath; + } + + return NULL; +} + +uint64_t +get_next_reloc_unlinked( + uint64_t ino) +{ + struct avl64node *firstn; + struct avl64node *lastn; + struct avl64node *pos; + struct avl64node *n; + struct avl64node *l; + struct reloc_node *rln; + + avl64_findranges(&relocation_data, ino - 1, -1ULL, &firstn, &lastn); + if (firstn == NULL && lastn == NULL) + return 0; + + avl_for_each_range_safe(pos, n, l, firstn, lastn) { + rln = container_of(pos, struct reloc_node, node); + + if (!(rln->flags & INODE_PATH)) + return rln->ino; + } + + return 0; +} + +struct inode_path ** +get_reloc_ipath_slot( + uint64_t ino) +{ + struct reloc_node *rln; + + rln = 
reloc_lookup(ino); + if (!rln) + return NULL; + + return &rln->ipath; +} + +void +forget_reloc_ino( + uint64_t ino) +{ + struct reloc_node *rln; + + rln = reloc_lookup(ino); + if (!rln) + return; + + avl64_delete(&relocation_data, &rln->node); + free(rln); +} +#endif /* USE_RADIX_TREE_FOR_INUMS */ From patchwork Tue Dec 31 23:47:26 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13924068 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E3F6E1B0425 for ; Tue, 31 Dec 2024 23:47:27 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688848; cv=none; b=JTXXbe4ddadKr1IpgB3WZOFN2SpsrKmuRClnMFCYnHVGQLcm+cIQNlqP6uBQr/QJnQubMQgfOsAw4mY8enyYbmHqIhsuLblTfj1iBHdmG95RTLXLYOYhLnlpClAbeXxzyEeanxOilaKBU3DPd/FcesqLh01mfvrL4AJ9yBIgRUQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688848; c=relaxed/simple; bh=IHVX4mtHF1Z2cmiGOUmbgoniSzCpLT8LsSyAKCnGOf4=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=IsDhfjFoOjwGCfSEzoVZWEt/kIv+QdjdDewOY2QvLyhucmBDitGQTzWkhaT9910yEaBBiR5rq5eeLX7dPG6snaxuOItVCSe9ZweJ92hbwdKLba7ZKM9pAQdJ0qgD8EACA4VyUBs4WxXMoKJjGU0Y3G/Sa9ONlsnfQ5pF0yfRcqc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=lJEFATeS; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="lJEFATeS" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 637E5C4CED2; Tue, 31 Dec 2024 23:47:27 
+0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688847; bh=IHVX4mtHF1Z2cmiGOUmbgoniSzCpLT8LsSyAKCnGOf4=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=lJEFATeSfjyjp1mE5ViRu8tcGE+4Q8gwEMwe8a5pafO1oRHt6HzmR6CxIG4YUp9Uv RRL6q8EtABHNDo1S62+PHqU/8TeGzV5MK0tGyzKDp4M2cr5M6kOkTwM4ejsj+Bjd88 8Tz4otLJYsoYCpg9tlGghXw1dAYLrtQyi/xr24+ViWXQCoZ5LCzGEudFlGA9RMxV7C lIhEjYCWsUDTMjbN0OFoE7BYKAN//4gH45O2Gj6kr6lyvYcq89yKcdR0Y9biWHGOSv a7oclcImVRYgqlplwgEijiqyunzeV/2z84Z5Hu4SzZ2oOOKnff/QM5HvkfcUox4SMt dFb6CmaK1hn7g== Date: Tue, 31 Dec 2024 15:47:26 -0800 Subject: [PATCH 10/11] spaceman: relocate the contents of an AG From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: dchinner@redhat.com, linux-xfs@vger.kernel.org Message-ID: <173568778023.2709794.3863189992037454598.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner Shrinking a filesystem needs to first remove all the active user data and metadata from the AGs that are going to be lopped off the filesystem. Before we can do this, we have to relocate this information to a region of the filesystem that is going to be retained. We have a function to move an inode and all its related information to a specific AG, we have functions to find the owners of all the information in an AG and we can find their paths. This gives us all the information we need to relocate all the objects in an AG we are going to remove via shrinking. Firstly we scan the AG to be emptied to find the inodes that need to be relocated, then we scan the directory structure to find all the paths to those inodes that need to be moved. Then we iterate over all the inodes to be moved attempting to move them to the lowest numbered AGs. 
When the destination AG fills up, we'll get ENOSPC from the moving code and this is a trigger to bump the destination AG and retry the move. If we haven't moved all the inodes and their data by the time the destination reaches the source AG, then the entire operation will fail with ENOSPC - there is not enough room in the filesystem to empty the selected AG in preparation for a shrink. This, once again, is not intended as an optimal or even guaranteed way of emptying an AG for shrink. It simply provides the basic algorithm and mechanisms we need to perform a shrink operation. Improvements and optimisations will come in time, but we can't get to an optimal solution without first having basic functionality in place. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: "Darrick J. Wong" --- libfrog/fsgeom.h | 10 ++ man/man8/xfs_spaceman.8 | 8 ++ spaceman/find_owner.c | 32 +++--- spaceman/init.c | 1 spaceman/move_inode.c | 7 + spaceman/relocation.c | 234 +++++++++++++++++++++++++++++++++++++++++++++++ spaceman/relocation.h | 5 + spaceman/space.h | 1 8 files changed, 280 insertions(+), 18 deletions(-) diff --git a/libfrog/fsgeom.h b/libfrog/fsgeom.h index 679046077cba84..3fe642be6dc9ae 100644 --- a/libfrog/fsgeom.h +++ b/libfrog/fsgeom.h @@ -196,6 +196,16 @@ cvt_daddr_to_agno( return cvt_bb_to_off_fsbt(xfd, daddr) / xfd->fsgeom.agblocks; } +/* Convert sparse filesystem block to AG Number */ +static inline uint32_t +cvt_fsb_to_agno( + struct xfs_fd *xfd, + uint64_t fsbno) +{ + return fsbno >> xfd->agblklog; +} + + /* Convert sector number to AG block number. */ static inline uint32_t cvt_daddr_to_agbno( diff --git a/man/man8/xfs_spaceman.8 b/man/man8/xfs_spaceman.8 index 6fef6949aa6c8b..b6488810cfab30 100644 --- a/man/man8/xfs_spaceman.8 +++ b/man/man8/xfs_spaceman.8 @@ -202,9 +202,17 @@ .SH COMMANDS .TP .B print Display a list of all open files. 
+.TP +.BI "relocate \-a agno [ \-h agno ]" +Empty out the given allocation group by moving file data elsewhere. +The +.B -h +option specifies the highest allocation group into which we can move data. + .TP .B resolve_owner Resolves space in the filesystem to file paths, maybe? + .TP .B quit Exit diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c index 80b239f9ac5de8..8e93145539a227 100644 --- a/spaceman/find_owner.c +++ b/spaceman/find_owner.c @@ -9,10 +9,10 @@ #include #include "libfrog/fsgeom.h" #include "libfrog/radix-tree.h" -#include "command.h" -#include "init.h" #include "libfrog/paths.h" #include +#include "command.h" +#include "init.h" #include "space.h" #include "input.h" #include "relocation.h" @@ -65,8 +65,8 @@ track_inode( set_reloc_iflag(owner, MOVE_BLOCKS); } -static void -scan_ag( +int +find_relocation_targets( xfs_agnumber_t agno) { struct fsmap_head *fsmap; @@ -80,8 +80,7 @@ scan_ag( fsmap = malloc(fsmap_sizeof(NR_EXTENTS)); if (!fsmap) { fprintf(stderr, _("%s: fsmap malloc failed.\n"), progname); - exitcode = 1; - return; + return -ENOMEM; } memset(fsmap, 0, sizeof(*fsmap)); @@ -102,8 +101,7 @@ scan_ag( fprintf(stderr, _("%s: FS_IOC_GETFSMAP [\"%s\"]: %s\n"), progname, file->name, strerror(errno)); free(fsmap); - exitcode = 1; - return; + return -errno; } /* No more extents to map, exit */ @@ -148,6 +146,7 @@ scan_ag( } free(fsmap); + return 0; } /* @@ -159,6 +158,7 @@ find_owner_f( char **argv) { xfs_agnumber_t agno = -1; + int ret; int c; while ((c = getopt(argc, argv, "a:")) != EOF) { @@ -198,7 +198,9 @@ _("Filesystem at %s does not have reverse mapping enabled. Aborting.\n"), return 0; } - scan_ag(agno); + ret = find_relocation_targets(agno); + if (ret) + exitcode = 1; return 0; } @@ -299,8 +301,8 @@ _("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), * This should be parallelised - pass subdirs off to a work queue, have the * work queue processes subdirs, queueing more subdirs to work on. 
*/ -static int -walk_mount( +int +resolve_target_paths( const char *mntpt) { int ret; @@ -361,9 +363,9 @@ list_inode_paths(void) /* * Any inodes remaining in the tree at this point indicate inodes whose - * paths were not found. This will be unlinked but still open inodes or - * lost inodes due to corruptions. Either way, a shrink will not succeed - * until these inodes are removed from the filesystem. + * paths were not found. This will be free inodes or unlinked but still + * open inodes. Either way, a shrink will not succeed until these inodes + * are removed from the filesystem. */ idx = 0; do { @@ -400,7 +402,7 @@ _("Inode list has not been populated. No inodes to resolve.\n")); return 0; } - ret = walk_mount(file->fs_path.fs_dir); + ret = resolve_target_paths(file->fs_path.fs_dir); if (ret) { fprintf(stderr, _("Failed to resolve all paths from mount point %s: %s\n"), diff --git a/spaceman/init.c b/spaceman/init.c index 8b0af14e566dc8..cfe1b96fb66cd1 100644 --- a/spaceman/init.c +++ b/spaceman/init.c @@ -40,6 +40,7 @@ init_commands(void) move_inode_init(); find_owner_init(); resolve_owner_init(); + relocate_init(); } static int diff --git a/spaceman/move_inode.c b/spaceman/move_inode.c index b7d71ee7a46dc6..ab3c12f5de987b 100644 --- a/spaceman/move_inode.c +++ b/spaceman/move_inode.c @@ -12,6 +12,7 @@ #include "space.h" #include "input.h" #include "handle.h" +#include "relocation.h" #include #include @@ -404,8 +405,8 @@ exchange_inodes( return 0; } -static int -move_file_to_ag( +int +relocate_file_to_ag( const char *mnt, const char *path, struct xfs_fd *xfd, @@ -511,7 +512,7 @@ _("Destination AG %d does not exist. 
Filesystem only has %d AGs\n"), } if (S_ISREG(st.st_mode)) { - ret = move_file_to_ag(file->fs_path.fs_dir, file->name, + ret = relocate_file_to_ag(file->fs_path.fs_dir, file->name, &file->xfd, agno); } else { fprintf(stderr, _("Unsupported: %s is not a regular file.\n"), diff --git a/spaceman/relocation.c b/spaceman/relocation.c index 1c0db6a1dab465..7b125cc0ae12b0 100644 --- a/spaceman/relocation.c +++ b/spaceman/relocation.c @@ -315,3 +315,237 @@ forget_reloc_ino( free(rln); } #endif /* USE_RADIX_TREE_FOR_INUMS */ + +static struct cmdinfo relocate_cmd; + +static int +relocate_targets_to_ag( + const char *mnt, + xfs_agnumber_t dst_agno) +{ + struct inode_path *ipath; + uint64_t idx = 0; + int ret = 0; + + do { + struct xfs_fd xfd = {0}; + struct stat st; + + /* lookup first relocation target */ + ipath = get_next_reloc_ipath(idx); + if (!ipath) + break; + + /* XXX: don't handle hard link cases yet */ + if (ipath->link_count > 1) { + fprintf(stderr, + "FIXME! Skipping hardlinked inode at path %s\n", + ipath->path); + goto next; + } + + + ret = stat(ipath->path, &st); + if (ret) { + fprintf(stderr, _("stat(%s) failed: %s\n"), + ipath->path, strerror(errno)); + goto next; + } + + if (!S_ISREG(st.st_mode)) { + fprintf(stderr, + _("FIXME! Skipping %s: not a regular file.\n"), + ipath->path); + goto next; + } + + ret = xfd_open(&xfd, ipath->path, O_RDONLY); + if (ret) { + fprintf(stderr, _("xfd_open(%s) failed: %s\n"), + ipath->path, strerror(-ret)); + goto next; + } + + /* move to destination AG */ + ret = relocate_file_to_ag(mnt, ipath->path, &xfd, dst_agno); + xfd_close(&xfd); + + /* + * If the destination AG has run out of space, we do not remove + * this inode from relocation data so it will be immediately + * retried in the next AG. Other errors will be fatal. 
+ */ + if (ret < 0) + return ret; +next: + /* remove from relocation data */ + idx = ipath->ino + 1; + forget_reloc_ino(ipath->ino); + } while (ret != -ENOSPC); + + return ret; +} + +static int +relocate_targets( + const char *mnt, + xfs_agnumber_t highest_agno) +{ + xfs_agnumber_t dst_agno = 0; + int ret; + + for (dst_agno = 0; dst_agno <= highest_agno; dst_agno++) { + ret = relocate_targets_to_ag(mnt, dst_agno); + if (ret == -ENOSPC) + continue; + break; + } + return ret; +} + +/* + * Relocate all the user objects in an AG to lower numbered AGs. + */ +static int +relocate_f( + int argc, + char **argv) +{ + xfs_agnumber_t target_agno = -1; + xfs_agnumber_t highest_agno = -1; + xfs_agnumber_t log_agno; + void *fshandle; + size_t fshdlen; + int c; + int ret; + + while ((c = getopt(argc, argv, "a:h:")) != EOF) { + switch (c) { + case 'a': + target_agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad target agno value %s\n"), + optarg); + return command_usage(&relocate_cmd); + } + break; + case 'h': + highest_agno = cvt_u32(optarg, 10); + if (errno) { + fprintf(stderr, _("bad highest agno value %s\n"), + optarg); + return command_usage(&relocate_cmd); + } + break; + default: + return command_usage(&relocate_cmd); + } + } + + if (optind != argc) + return command_usage(&relocate_cmd); + + if (target_agno == -1) { + fprintf(stderr, _("Target AG must be specified!\n")); + return command_usage(&relocate_cmd); + } + + log_agno = cvt_fsb_to_agno(&file->xfd, file->xfd.fsgeom.logstart); + if (target_agno <= log_agno) { + fprintf(stderr, +_("Target AG %d must be higher than the journal AG (AG %d). Aborting.\n"), + target_agno, log_agno); + goto out_fail; + } + + if (target_agno >= file->xfd.fsgeom.agcount) { + fprintf(stderr, +_("Target AG %d does not exist. 
Filesystem only has %d AGs\n"), + target_agno, file->xfd.fsgeom.agcount); + goto out_fail; + } + + if (highest_agno == -1) + highest_agno = target_agno - 1; + + if (highest_agno >= target_agno) { + fprintf(stderr, +_("Highest destination AG %d must be less than target AG %d. Aborting.\n"), + highest_agno, target_agno); + goto out_fail; + } + + if (is_reloc_populated()) { + fprintf(stderr, +_("Relocation data populated from previous commands. Aborting.\n")); + goto out_fail; + } + + /* this is so we can use fd_to_handle() later on */ + ret = path_to_fshandle(file->fs_path.fs_dir, &fshandle, &fshdlen); + if (ret < 0) { + fprintf(stderr, _("Cannot get fshandle for mount %s: %s\n"), + file->fs_path.fs_dir, strerror(errno)); + goto out_fail; + } + + ret = find_relocation_targets(target_agno); + if (ret) { + fprintf(stderr, +_("Failure during target discovery. Aborting.\n")); + goto out_fail; + } + + ret = resolve_target_paths(file->fs_path.fs_dir); + if (ret) { + fprintf(stderr, +_("Failed to resolve all paths from mount point %s: %s\n"), + file->fs_path.fs_dir, strerror(-ret)); + goto out_fail; + } + + ret = relocate_targets(file->fs_path.fs_dir, highest_agno); + if (ret) { + fprintf(stderr, +_("Failed to relocate all targets out of AG %d: %s\n"), + target_agno, strerror(-ret)); + goto out_fail; + } + + return 0; +out_fail: + exitcode = 1; + return 0; +} + +static void +relocate_help(void) +{ + printf(_( +"\n" +"Relocate all the user data and metadata in an AG.\n" +"\n" +"This function will discover all the relocatable objects in a single AG and\n" +"move them to a lower AG as preparation for a shrink operation.\n" +"\n" +" -a Allocation group to empty\n" +" -h Highest target AG allowed to relocate into\n" +"\n")); + +} + +void +relocate_init(void) +{ + relocate_cmd.name = "relocate"; + relocate_cmd.altname = "relocate"; + relocate_cmd.cfunc = relocate_f; + relocate_cmd.argmin = 2; + relocate_cmd.argmax = 4; + relocate_cmd.args = "-a agno [-h agno]"; + 
relocate_cmd.flags = CMD_FLAG_ONESHOT; + relocate_cmd.oneline = _("Relocate data in an AG."); + relocate_cmd.help = relocate_help; + + add_command(&relocate_cmd); +} diff --git a/spaceman/relocation.h b/spaceman/relocation.h index f05a871915da42..d4c71b7bb7f054 100644 --- a/spaceman/relocation.h +++ b/spaceman/relocation.h @@ -43,4 +43,9 @@ struct inode_path { */ #define UNLINKED_IPATH ((struct inode_path *)1) +int find_relocation_targets(xfs_agnumber_t agno); +int relocate_file_to_ag(const char *mnt, const char *path, struct xfs_fd *xfd, + xfs_agnumber_t agno); +int resolve_target_paths(const char *mntpt); + #endif /* XFS_SPACEMAN_RELOCATION_H_ */ diff --git a/spaceman/space.h b/spaceman/space.h index cffb1882153a18..8c2b3e5464dee6 100644 --- a/spaceman/space.h +++ b/spaceman/space.h @@ -41,5 +41,6 @@ extern void health_init(void); void move_inode_init(void); void find_owner_init(void); void resolve_owner_init(void); +void relocate_init(void); #endif /* XFS_SPACEMAN_SPACE_H_ */ From patchwork Tue Dec 31 23:47:42 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13924069 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F56429415 for ; Tue, 31 Dec 2024 23:47:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688863; cv=none; b=mY2XTFJmcH4HzwC+7ZBMTio+5W9KgcrMNPpkYa7cYUS6LT6isPJcit5Q2CalkHWAah2xBdR2V2TaefpHLqJMLYLeN7/xGx9DanlzMMZe+PUXGFplR3ffC1KQEcddtUCO0RoFIJe4i9UtImIU3zAIxZVFHa0nTT1M7rKQidrlkbE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1735688863; c=relaxed/simple; bh=SbPd8swfPJMeGGMpudiTW/TIcFWVSrXUcajeqF1HtnU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=oOMk6RVyUpxpqHPwa9h+ZnyJVQ/wkO76F1DIDOJRQ8DHtlTFtqMPHofTaGNPW6SXbuayXHOGO0xGxcmgM1WpRitep6rh0zrPq0EemUPSvMpEimuLXXhaflNpNwZYTllUW/x43623RN1HEgp6cLDvJ7ztkso6iLmE3qSdaVDoCIA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=fzT05Imw; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="fzT05Imw" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 0E7BBC4CED2; Tue, 31 Dec 2024 23:47:43 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1735688863; bh=SbPd8swfPJMeGGMpudiTW/TIcFWVSrXUcajeqF1HtnU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=fzT05ImwRte+Aa49f0+DzfM9Wia2Df3xI8ATCFXhMxCVg1xd0fJFYlVVE5Fswd+b7 d/+t1EWxMnrJkjR8hmcT8QzKheQ90nJt0bjNW2bpq0vxOV5edo5QqxUYdJvgzz9Eq5 Q5BMEpbUqV+6IcFRkCWDpSQqm0jbZRgOAk2QWpfQbifnnNFEN2Mxo+AxSoYlM2b2Lw 
2NwHJxtRmz6Sop75F7Bgfkf9tUWmOhNYqJwhNmJIOkwLV+EsPCJoSxWqp8vJBoRLPl UZrYdRi6llOjdZj6+CSHFDlja9jVFcdh+UOD50w+Gv5Q7s0tKXnqOfRVHr8p1gd4zw Lkzapmd8v6AQg== Date: Tue, 31 Dec 2024 15:47:42 -0800 Subject: [PATCH 11/11] spaceman: move inodes with hardlinks From: "Darrick J. Wong" To: aalbersh@kernel.org, djwong@kernel.org Cc: dchinner@redhat.com, linux-xfs@vger.kernel.org Message-ID: <173568778039.2709794.14611363506119987915.stgit@frogsfrogsfrogs> In-Reply-To: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> References: <173568777852.2709794.6356870909327619205.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Dave Chinner When an inode to be moved to a different AG has multiple hard links, we need to "move" all the hard links, too. To do this, we need to create temporary hardlinks to the new file, and then use rename exchange to swap all the hardlinks that point to the old inode with new hardlinks that point to the new inode. We already know that an inode has hard links via the path discovery, and we can check it against the link count that is reported for the inode before we start building the link farm. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: "Darrick J. Wong" --- spaceman/find_owner.c | 13 +---- spaceman/move_inode.c | 119 +++++++++++++++++++++++++++++++++++++++++++++---- spaceman/relocation.c | 35 ++++++++++---- spaceman/relocation.h | 6 ++ 4 files changed, 140 insertions(+), 33 deletions(-) diff --git a/spaceman/find_owner.c b/spaceman/find_owner.c index 8e93145539a227..1984d0ee7ca5f6 100644 --- a/spaceman/find_owner.c +++ b/spaceman/find_owner.c @@ -240,7 +240,6 @@ resolve_owner_cb( struct FTW *data) { struct inode_path *ipath, *slot_ipath; - int pathlen; struct inode_path **slot; /* @@ -260,17 +259,9 @@ _("Failed to obtain stat(2) information from path %s. 
Aborting\n"), } /* Allocate a new inode path and record the path in it. */ - pathlen = strlen(path); - ipath = calloc(1, sizeof(*ipath) + pathlen + 1); - if (!ipath) { - fprintf(stderr, -_("Aborting: Storing path %s for inode 0x%lx failed: %s\n"), - path, stat->st_ino, strerror(ENOMEM)); + ipath = ipath_alloc(path, stat); + if (!ipath) return -ENOMEM; - } - INIT_LIST_HEAD(&ipath->path_list); - memcpy(&ipath->path[0], path, pathlen); - ipath->ino = stat->st_ino; /* * If the slot contains the inode number we just looked up, then we diff --git a/spaceman/move_inode.c b/spaceman/move_inode.c index ab3c12f5de987b..3a182929579e45 100644 --- a/spaceman/move_inode.c +++ b/spaceman/move_inode.c @@ -36,12 +36,14 @@ create_tmpfile( struct xfs_fd *xfd, xfs_agnumber_t agno, char **tmpfile, - int *tmpfd) + int *tmpfd, + int link_count) { char name[PATH_MAX + 1]; + char linkname[PATH_MAX + 1]; mode_t mask; int fd; - int i; + int i, j; int ret; /* construct tmpdir */ @@ -105,14 +107,36 @@ create_tmpfile( fprintf(stderr, _("cannot create tmpfile: %s: %s\n"), name, strerror(errno)); ret = -errno; + goto out_cleanup_dir; } + /* Create hard links to temporary file. 
*/ + for (j = link_count; j > 1; i--) { + snprintf(linkname, PATH_MAX, "%s/.spaceman/dir%d/tmpfile.%d.hardlink.%d", mnt, i, getpid(), j); + ret = link(name, linkname); + if (ret < 0) { + fprintf(stderr, _("cannot create hardlink: %s: %s\n"), + linkname, strerror(errno)); + ret = -errno; + goto out_cleanup_links; + } + } + + /* return name and fd */ (void)umask(mask); *tmpfd = fd; *tmpfile = strdup(name); return 0; + +out_cleanup_links: + for (; j <= link_count; j++) { + snprintf(linkname, PATH_MAX, "%s/.spaceman/dir%d/tmpfile.%d.hardlink.%d", mnt, i, getpid(), j); + unlink(linkname); + } + close(fd); + unlink(name); out_cleanup_dir: snprintf(name, PATH_MAX, "%s/.spaceman", mnt); rmdir(name); @@ -405,21 +429,53 @@ exchange_inodes( return 0; } +static int +exchange_hardlinks( + struct inode_path *ipath, + const char *tmpfile) +{ + char linkname[PATH_MAX]; + struct inode_path *linkpath; + int i = 2; + int ret; + + list_for_each_entry(linkpath, &ipath->path_list, path_list) { + if (i++ > ipath->link_count) { + fprintf(stderr, "ipath link count mismatch!\n"); + return 0; + } + + snprintf(linkname, PATH_MAX, "%s.hardlink.%d", tmpfile, i); + ret = renameat2(AT_FDCWD, linkname, + AT_FDCWD, linkpath->path, RENAME_EXCHANGE); + if (ret) { + fprintf(stderr, + "failed to exchange hard link %s with %s: %s\n", + linkname, linkpath->path, strerror(errno)); + return -errno; + } + } + return 0; +} + int relocate_file_to_ag( const char *mnt, - const char *path, + struct inode_path *ipath, struct xfs_fd *xfd, xfs_agnumber_t agno) { int ret; int tmpfd = -1; char *tmpfile = NULL; + int i; - fprintf(stderr, "move mnt %s, path %s, agno %d\n", mnt, path, agno); + fprintf(stderr, "move mnt %s, path %s, agno %d\n", + mnt, ipath->path, agno); /* create temporary file in agno */ - ret = create_tmpfile(mnt, xfd, agno, &tmpfile, &tmpfd); + ret = create_tmpfile(mnt, xfd, agno, &tmpfile, &tmpfd, + ipath->link_count); if (ret) return ret; @@ -444,12 +500,28 @@ relocate_file_to_ag( goto out_cleanup; 
/* swap the inodes over */ - ret = exchange_inodes(xfd, tmpfd, tmpfile, path); + ret = exchange_inodes(xfd, tmpfd, tmpfile, ipath->path); + if (ret) + goto out_cleanup; + + /* swap the hard links over */ + ret = exchange_hardlinks(ipath, tmpfile); + if (ret) + goto out_cleanup; out_cleanup: if (ret == -1) ret = -errno; + /* remove old hard links */ + for (i = 2; i <= ipath->link_count; i++) { + char linkname[PATH_MAX + 256]; // anti-warning-crap + + snprintf(linkname, PATH_MAX + 256, "%s.hardlink.%d", tmpfile, i); + unlink(linkname); + } + + /* remove tmpfile */ close(tmpfd); if (tmpfile) unlink(tmpfile); @@ -458,11 +530,32 @@ relocate_file_to_ag( return ret; } +static int +build_ipath( + const char *path, + struct stat *st, + struct inode_path **ipathp) +{ + struct inode_path *ipath; + + *ipathp = NULL; + + ipath = ipath_alloc(path, st); + if (!ipath) + return -ENOMEM; + + /* we only move a single path with move_inode */ + ipath->link_count = 1; + *ipathp = ipath; + return 0; +} + static int move_inode_f( int argc, char **argv) { + struct inode_path *ipath = NULL; void *fshandle; size_t fshdlen; xfs_agnumber_t agno = 0; @@ -511,24 +604,30 @@ _("Destination AG %d does not exist. 
Filesystem only has %d AGs\n"), goto exit_fail; } - if (S_ISREG(st.st_mode)) { - ret = relocate_file_to_ag(file->fs_path.fs_dir, file->name, - &file->xfd, agno); - } else { + if (!S_ISREG(st.st_mode)) { fprintf(stderr, _("Unsupported: %s is not a regular file.\n"), file->name); goto exit_fail; } + ret = build_ipath(file->name, &st, &ipath); + if (ret) + goto exit_fail; + + ret = relocate_file_to_ag(file->fs_path.fs_dir, ipath, + &file->xfd, agno); if (ret) { fprintf(stderr, _("Failed to move inode to AG %d: %s\n"), agno, strerror(-ret)); goto exit_fail; } + free(ipath); fshandle_destroy(); return 0; exit_fail: + if (ipath) + free(ipath); fshandle_destroy(); exitcode = 1; return 0; diff --git a/spaceman/relocation.c b/spaceman/relocation.c index 7b125cc0ae12b0..b0960272168510 100644 --- a/spaceman/relocation.c +++ b/spaceman/relocation.c @@ -318,6 +318,30 @@ forget_reloc_ino( static struct cmdinfo relocate_cmd; +struct inode_path * +ipath_alloc( + const char *path, + const struct stat *stat) +{ + struct inode_path *ipath; + int pathlen = strlen(path); + + /* Allocate a new inode path and record the path in it. */ + ipath = calloc(1, sizeof(*ipath) + pathlen + 1); + if (!ipath) { + fprintf(stderr, +_("Failed to allocate ipath %s for inode 0x%llx failed: %s\n"), + path, (unsigned long long)stat->st_ino, + strerror(-errno)); + return NULL; + } + INIT_LIST_HEAD(&ipath->path_list); + memcpy(&ipath->path[0], path, pathlen); + ipath->ino = stat->st_ino; + + return ipath; +} + static int relocate_targets_to_ag( const char *mnt, @@ -336,15 +360,6 @@ relocate_targets_to_ag( if (!ipath) break; - /* XXX: don't handle hard link cases yet */ - if (ipath->link_count > 1) { - fprintf(stderr, - "FIXME! 
Skipping hardlinked inode at path %s\n", - ipath->path); - goto next; - } - - ret = stat(ipath->path, &st); if (ret) { fprintf(stderr, _("stat(%s) failed: %s\n"), @@ -367,7 +382,7 @@ relocate_targets_to_ag( } /* move to destination AG */ - ret = relocate_file_to_ag(mnt, ipath->path, &xfd, dst_agno); + ret = relocate_file_to_ag(mnt, ipath, &xfd, dst_agno); xfd_close(&xfd); /* diff --git a/spaceman/relocation.h b/spaceman/relocation.h index d4c71b7bb7f054..2c807aa678ec5b 100644 --- a/spaceman/relocation.h +++ b/spaceman/relocation.h @@ -43,9 +43,11 @@ struct inode_path { */ #define UNLINKED_IPATH ((struct inode_path *)1) +struct inode_path *ipath_alloc(const char *path, const struct stat *st); + int find_relocation_targets(xfs_agnumber_t agno); -int relocate_file_to_ag(const char *mnt, const char *path, struct xfs_fd *xfd, - xfs_agnumber_t agno); +int relocate_file_to_ag(const char *mnt, struct inode_path *ipath, + struct xfs_fd *xfd, xfs_agnumber_t agno); int resolve_target_paths(const char *mntpt); #endif /* XFS_SPACEMAN_RELOCATION_H_ */