From patchwork Sun Dec 31 20:32:04 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507426 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0D125F9C3 for ; Sun, 31 Dec 2023 20:32:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="q8DN3NO1" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 7B0A2C433C8; Sun, 31 Dec 2023 20:32:04 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704054724; bh=m3szYfdgQK+ZIZkd39dtjVBSFq1uvu7K0mY6f4eqx8w=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=q8DN3NO1zTovLYMYwfzSUHjYDyIZwNw1M1IxgHd6cWcyU+TDPtcPcrcDS2NGPhhQh A4uy2FADLa5RRmqqmN0nFqx/rv3Fh+QOl80nSGztHahaGeL2bH0vZ6ecC//XSbaaM4 HcYz++zFiWgL+VhlhUeW7tPEpFrdmP2REF/l86Xp64/kHsn1GRaEU5uEjDLzXgM6kF 1a/G+bkbnkJC8YlBkdBPgNIO0Vhj1f2EepWXTWiK8CLWUVkoyYMGj93vmugCrU+S0T 6Rn7k/eJkU0YeIcNcDtspKp6rm4qHgKjdWNASSgsKys9IyAk1+y6ZKFgW4KEtJiC1r KVuFLoqpPUiXg== Date: Sun, 31 Dec 2023 12:32:04 -0800 Subject: [PATCH 1/3] xfs: support preallocating and copying content into temporary files From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404834301.1752917.3615563381322048344.stgit@frogsfrogsfrogs> In-Reply-To: <170404834278.1752917.3964733922134331052.stgit@frogsfrogsfrogs> References: <170404834278.1752917.3964733922134331052.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Create the routines we need to preallocate space in a temporary ondisk file and then copy the contents of an xfile into the tempfile. The upcoming rtsummary repair feature will construct the contents of a realtime summary file in memory, after which it will want to copy all that into the ondisk temporary file before atomically committing the new rtsummary contents. Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/tempfile.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/tempfile.h | 15 ++++ fs/xfs/scrub/trace.h | 39 +++++++++ 3 files changed, 251 insertions(+) diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 5f4a931b1967c..936107d083545 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -14,14 +14,18 @@ #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_quota.h" +#include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_dir2.h" #include "xfs_xchgrange.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/repair.h" #include "scrub/trace.h" #include "scrub/tempfile.h" +#include "scrub/xfile.h" /* * Create a temporary file for reconstructing metadata, with the intention of @@ -249,3 +253,196 @@ xrep_tempfile_rele( xchk_irele(sc, sc->tempip); sc->tempip = NULL; } + +/* + * Make sure that the given range of the data fork of the temporary file is + * mapped to written blocks. The caller must ensure that both inodes are + * joined to the transaction. + */ +int +xrep_tempfile_prealloc( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t end = off + len; + int error; + + ASSERT(sc->tempip != NULL); + ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); + + for (; off < end; off = map.br_startoff + map.br_blockcount) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + if (xfs_bmap_is_written_extent(&map)) + continue; + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* + * Make sure this block has a real zeroed extent allocated to + * it. + */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map, + &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map); + + /* Commit new extent and all deferred work. */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + } + + return 0; +} + +/* + * Write data to each block of a file. The given range of the tempfile's data + * fork must already be populated with written extents. + */ +int +xrep_tempfile_copyin( + struct xfs_scrub *sc, + xfs_fileoff_t off, + xfs_filblks_t len, + xrep_tempfile_copyin_fn prep_fn, + void *data) +{ + LIST_HEAD(buffers_list); + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_fileoff_t flush_mask; + xfs_fileoff_t end = off + len; + loff_t pos = XFS_FSB_TO_B(mp, off); + int error = 0; + + ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode)); + + /* Flush buffers to disk every 512K */ + flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1; + + for (; off < end; off++, pos += mp->m_sb.sb_blocksize) { + struct xfs_bmbt_irec map; + int nmaps = 1; + + /* Read block mapping for this file block. */ + error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0); + if (error) + goto out_err; + if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) { + error = -EFSCORRUPTED; + goto out_err; + } + + /* Get the metadata buffer for this offset in the file. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp); + if (error) + goto out_err; + + trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map); + + /* Read in a block's worth of data from the xfile. */ + error = prep_fn(sc, bp, data); + if (error) { + xfs_trans_brelse(sc->tp, bp); + goto out_err; + } + + /* Queue buffer, and flush if we have too much dirty data. */ + xfs_buf_delwri_queue_here(bp, &buffers_list); + xfs_trans_brelse(sc->tp, bp); + + if (!(off & flush_mask)) { + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + } + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + error = xfs_buf_delwri_submit(&buffers_list); + if (error) + goto out_err; + + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + error = -EIO; + goto out_err; + } + + return 0; + +out_err: + xfs_buf_delwri_cancel(&buffers_list); + return error; +} + +/* + * Set the temporary file's size. Caller must join the tempfile to the scrub + * transaction and is responsible for adjusting block mappings as needed. + */ +int +xrep_tempfile_set_isize( + struct xfs_scrub *sc, + unsigned long long isize) +{ + if (sc->tempip->i_disk_size == isize) + return 0; + + sc->tempip->i_disk_size = isize; + i_size_write(VFS_I(sc->tempip), isize); + return xrep_tempfile_roll_trans(sc); +} + +/* + * Roll a repair transaction involving the temporary file. Caller must join + * both the temporary file and the file being scrubbed to the transaction. + * This function return with both inodes joined to a new scrub transaction, + * or the usual negative errno. + */ +int +xrep_tempfile_roll_trans( + struct xfs_scrub *sc) +{ + int error; + + xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE); + error = xrep_roll_trans(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + return 0; +} diff --git a/fs/xfs/scrub/tempfile.h b/fs/xfs/scrub/tempfile.h index e165e0a3faf63..7980f9c4de552 100644 --- a/fs/xfs/scrub/tempfile.h +++ b/fs/xfs/scrub/tempfile.h @@ -17,6 +17,21 @@ void xrep_tempfile_iounlock(struct xfs_scrub *sc); void xrep_tempfile_ilock(struct xfs_scrub *sc); bool xrep_tempfile_ilock_nowait(struct xfs_scrub *sc); void xrep_tempfile_iunlock(struct xfs_scrub *sc); + +int xrep_tempfile_prealloc(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len); + +enum xfs_blft; + +typedef int (*xrep_tempfile_copyin_fn)(struct xfs_scrub *sc, + struct xfs_buf *bp, void *data); + +int xrep_tempfile_copyin(struct xfs_scrub *sc, xfs_fileoff_t off, + xfs_filblks_t len, xrep_tempfile_copyin_fn fn, void *data); + +int xrep_tempfile_set_isize(struct xfs_scrub *sc, unsigned long long isize); + +int xrep_tempfile_roll_trans(struct xfs_scrub *sc); #else static inline void xrep_tempfile_iolock_both(struct xfs_scrub *sc) { diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 691c91c9a5853..73ddaaadd2414 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2396,6 +2396,45 @@ TRACE_EVENT(xrep_tempfile_create, __entry->temp_inum) ); +DECLARE_EVENT_CLASS(xrep_tempfile_class, + TP_PROTO(struct xfs_scrub *sc, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(sc, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->tempip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); +#define DEFINE_XREP_TEMPFILE_EVENT(name) \ +DEFINE_EVENT(xrep_tempfile_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int whichfork, \ + struct xfs_bmbt_irec *irec), \ + TP_ARGS(sc, whichfork, irec)) +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_prealloc); +DEFINE_XREP_TEMPFILE_EVENT(xrep_tempfile_copyin); + TRACE_EVENT(xreap_ifork_extent, TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *ip, int whichfork, const struct xfs_bmbt_irec *irec), From patchwork Sun Dec 31 20:32:19 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507427 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6C491F9C4 for ; Sun, 31 Dec 2023 20:32:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="i/3dptzk" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 05EB4C433C8; Sun, 31 Dec 2023 20:32:20 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704054740; bh=b+xljRIxzTW50Sf30CqyFqYARWKDUCumrpYdO48oSAs=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=i/3dptzkWVN+PwAYfOuxzX3ixNGn+/S0wc8TBLFjRC6RLdisyj6nvbzbvoShmdO+f S9ncMk0esBdPtmF0f8ihWZtBGVO/wOZFyi4+GDjUqVYgxAARDcQHfggmLJqRgvQNj0 PPyZ+QX8MLUEddqKJQ5Rol8IvE5N2o7RTHlBHQgP0KymnL766Z2ZUsZOBXNEA8P9IP 0qHpuAqyNSYhxQvPLa+JW6QUISRxnhW+YGYflep6NGJ66YCE3G7yxLo8dSqPuMWcDH L6A51SQQXvstNPH/qgRaN8+NeRzz3HreEYUZ1M6hu8QCkruxpe6sHkyki6mb4XpKvw lhGyxmfHYHOqQ== Date: Sun, 31 Dec 2023 12:32:19 -0800 Subject: [PATCH 2/3] xfs: teach the tempfile to support atomic extent swapping From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404834317.1752917.12716897349084707493.stgit@frogsfrogsfrogs> In-Reply-To: <170404834278.1752917.3964733922134331052.stgit@frogsfrogsfrogs> References: <170404834278.1752917.3964733922134331052.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Create some new routines to exchange the contents of a temporary file created to stage a repair with another ondisk file. This will be used by the realtime summary repair function to commit atomically the new rtsummary data, which will be staged in the tempfile. The rest of XFS coordinates access to the realtime metadata inodes solely through the ILOCK. For repair to hold its exclusive access to the realtime summary file, it has to allocate a single large transaction and roll it repeatedly throughout the repair while holding the ILOCK. In turn, this means that for now there's only a partial swapext implementation for the temporary file, because we can only work within an existing transaction. Hence the only tempswap functions needed here are to estimate the resource requirements of swapext between, reserve more space/quota to an existing transaction, and kick off the actual swap. The rest will be added in a later patch in preparation for repairing xattrs and directories. Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/scrub.c | 11 ++- fs/xfs/scrub/scrub.h | 7 ++ fs/xfs/scrub/tempfile.c | 204 +++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/tempswap.h | 21 +++++ fs/xfs/scrub/trace.h | 1 5 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 fs/xfs/scrub/tempswap.h diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 51bcb21325cd3..afc82f1e40ffb 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -149,14 +149,15 @@ xchk_probe( /* Scrub setup and teardown */ +#define FSGATES_MASK (XCHK_FSGATES_ALL | XREP_FSGATES_ALL) static inline void xchk_fsgates_disable( struct xfs_scrub *sc) { - if (!(sc->flags & XCHK_FSGATES_ALL)) + if (!(sc->flags & FSGATES_MASK)) return; - trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); + trace_xchk_fsgates_disable(sc, sc->flags & FSGATES_MASK); if (sc->flags & XCHK_FSGATES_DRAIN) xfs_drain_wait_disable(); @@ -170,8 +171,12 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_RMAP) xfs_rmap_hook_disable(); - sc->flags &= ~XCHK_FSGATES_ALL; + if (sc->flags & XREP_FSGATES_ATOMIC_XCHG) + xfs_xchg_range_rele_log_assist(sc->mp); + + sc->flags &= ~FSGATES_MASK; } +#undef FSGATES_MASK /* Free all the resources and finish the transactions. */ STATIC int diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5f0e8e350295e..48b2fb8271499 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -131,6 +131,7 @@ struct xfs_scrub { #define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ #define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */ #define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */ +#define XREP_FSGATES_ATOMIC_XCHG (1U << 29) /* uses atomic file content exchange */ #define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ @@ -145,6 +146,12 @@ struct xfs_scrub { XCHK_FSGATES_DIRENTS | \ XCHK_FSGATES_RMAP) +/* + * The sole XREP_FSGATES* flag reflects a log intent item that is protected + * by a log-incompat feature flag. No code patching in use here. + */ +#define XREP_FSGATES_ALL (XREP_FSGATES_ATOMIC_XCHG) + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/tempfile.c b/fs/xfs/scrub/tempfile.c index 936107d083545..a1736a3556a7d 100644 --- a/fs/xfs/scrub/tempfile.c +++ b/fs/xfs/scrub/tempfile.c @@ -19,12 +19,14 @@ #include "xfs_trans_space.h" #include "xfs_dir2.h" #include "xfs_xchgrange.h" +#include "xfs_swapext.h" #include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/trace.h" #include "scrub/tempfile.h" +#include "scrub/tempswap.h" #include "scrub/xfile.h" /* @@ -446,3 +448,205 @@ xrep_tempfile_roll_trans( xfs_trans_ijoin(sc->tp, sc->tempip, 0); return 0; } + +/* Enable atomic extent swapping. */ +int +xrep_tempswap_grab_log_assist( + struct xfs_scrub *sc) +{ + bool need_rele = false; + int error; + + if (sc->flags & XREP_FSGATES_ATOMIC_XCHG) + return 0; + + error = xfs_xchg_range_grab_log_assist(sc->mp, true, &need_rele); + if (error) + return error; + if (!need_rele) { + ASSERT(need_rele); + return -EOPNOTSUPP; + } + + trace_xchk_fsgates_enable(sc, XREP_FSGATES_ATOMIC_XCHG); + + sc->flags |= XREP_FSGATES_ATOMIC_XCHG; + return 0; +} + +/* + * Fill out the swapext request in preparation for swapping the contents of a + * metadata file that we've rebuilt in the temp file. + */ +STATIC int +xrep_tempswap_prep_request( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempswap *tx) +{ + struct xfs_swapext_req *req = &tx->req; + + memset(tx, 0, sizeof(struct xrep_tempswap)); + + /* COW forks don't exist on disk. */ + if (whichfork == XFS_COW_FORK) { + ASSERT(0); + return -EINVAL; + } + + /* Both files should have the relevant forks. */ + if (!xfs_ifork_ptr(sc->ip, whichfork) || + !xfs_ifork_ptr(sc->tempip, whichfork)) { + ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); + ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL); + return -EINVAL; + } + + /* Swap all mappings in both forks. */ + req->ip1 = sc->tempip; + req->ip2 = sc->ip; + req->startoff1 = 0; + req->startoff2 = 0; + req->whichfork = whichfork; + req->blockcount = XFS_MAX_FILEOFF; + req->req_flags = XFS_SWAP_REQ_LOGGED; + + /* Always swap sizes when we're swapping data fork mappings. */ + if (whichfork == XFS_DATA_FORK) + req->req_flags |= XFS_SWAP_REQ_SET_SIZES; + + /* + * If we're repairing symlinks, xattrs, or directories, always try to + * convert ip2 to short format after swapping. + */ + if (whichfork == XFS_ATTR_FORK || S_ISDIR(VFS_I(sc->ip)->i_mode) || + S_ISLNK(VFS_I(sc->ip)->i_mode)) + req->req_flags |= XFS_SWAP_REQ_CVT_INO2_SF; + + return 0; +} + +/* + * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip + * this if quota enforcement is disabled or if both inodes' dquots are the + * same. The qretry structure must be initialized to zeroes before the first + * call to this function. + */ +STATIC int +xrep_tempswap_reserve_quota( + struct xfs_scrub *sc, + const struct xrep_tempswap *tx) +{ + struct xfs_trans *tp = sc->tp; + const struct xfs_swapext_req *req = &tx->req; + int64_t ddelta, rdelta; + int error; + + /* + * Don't bother with a quota reservation if we're not enforcing them + * or the two inodes have the same dquots. + */ + if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || + (req->ip1->i_udquot == req->ip2->i_udquot && + req->ip1->i_gdquot == req->ip2->i_gdquot && + req->ip1->i_pdquot == req->ip2->i_pdquot)) + return 0; + + /* + * Quota reservation for each file comes from two sources. First, we + * need to account for any net gain in mapped blocks during the swap. + * Second, we need reservation for the gross gain in mapped blocks so + * that we don't trip over any quota block reservation assertions. We + * must reserve the gross gain because the quota code subtracts from + * bcount the number of blocks that we unmap; it does not add that + * quantity back to the quota block reservation. + */ + ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount); + rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount); + error = xfs_trans_reserve_quota_nblks(tp, req->ip1, + ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount, + true); + if (error) + return error; + + ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount); + rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount); + return xfs_trans_reserve_quota_nblks(tp, req->ip2, + ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount, + true); +} + +/* + * Prepare an existing transaction for a swap. + * + * This function fills out the swapext request and resource estimation + * structures in preparation for swapping the contents of a metadata file that + * has been rebuilt in the temp file. Next, it reserves space and quota for + * the transaction. + * + * The caller must hold ILOCK_EXCL of the scrub target file and the temporary + * file. The caller must join both inodes to the transaction with no unlock + * flags, and is responsible for dropping both ILOCKs when appropriate. Only + * use this when those ILOCKs cannot be dropped. + */ +int +xrep_tempswap_trans_reserve( + struct xfs_scrub *sc, + int whichfork, + struct xrep_tempswap *tx) +{ + int error; + + ASSERT(sc->tp != NULL); + ASSERT(xfs_isilocked(sc->ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_isilocked(sc->tempip, XFS_ILOCK_EXCL)); + + error = xrep_tempswap_prep_request(sc, whichfork, tx); + if (error) + return error; + + error = xfs_swapext_estimate(&tx->req); + if (error) + return error; + + error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0); + if (error) + return error; + + return xrep_tempswap_reserve_quota(sc, tx); +} + +/* + * Swap forks between the file being repaired and the temporary file. Returns + * with both inodes locked and joined to a clean scrub transaction. + */ +int +xrep_tempswap_contents( + struct xfs_scrub *sc, + struct xrep_tempswap *tx) +{ + int error; + + ASSERT(sc->flags & XREP_FSGATES_ATOMIC_XCHG); + + xfs_swapext(sc->tp, &tx->req); + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * If we swapped the ondisk sizes of two metadata files, we must swap + * the incore sizes as well. Since online fsck doesn't use swapext on + * the data forks of user-accessible files, the two sizes are always + * the same, so we don't need to log the inodes. + */ + if (tx->req.req_flags & XFS_SWAP_REQ_SET_SIZES) { + loff_t temp; + + temp = i_size_read(VFS_I(sc->ip)); + i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); + i_size_write(VFS_I(sc->tempip), temp); + } + + return 0; +} diff --git a/fs/xfs/scrub/tempswap.h b/fs/xfs/scrub/tempswap.h new file mode 100644 index 0000000000000..e8f8a6e3c8861 --- /dev/null +++ b/fs/xfs/scrub/tempswap.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_TEMPSWAP_H__ +#define __XFS_SCRUB_TEMPSWAP_H__ + +#ifdef CONFIG_XFS_ONLINE_REPAIR +struct xrep_tempswap { + struct xfs_swapext_req req; +}; + +int xrep_tempswap_grab_log_assist(struct xfs_scrub *sc); +int xrep_tempswap_trans_reserve(struct xfs_scrub *sc, int whichfork, + struct xrep_tempswap *ti); + +int xrep_tempswap_contents(struct xfs_scrub *sc, struct xrep_tempswap *ti); +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_TEMPFILE_H__ */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 73ddaaadd2414..1f06c1ace5902 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -125,6 +125,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \ { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \ + { XREP_FSGATES_ATOMIC_XCHG, "fsgates_atomic_swapext" }, \ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } From patchwork Sun Dec 31 20:32:35 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13507428 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D04D2F9C4 for ; Sun, 31 Dec 2023 20:32:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="jJhiM7/r" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 9EB85C433C8; Sun, 31 Dec 2023 20:32:35 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1704054755; bh=UeDbbbgmA4/rNg/PZ411qrMeAEEN6osytpWWRlDVRP0=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=jJhiM7/r9t6BwD2DWf3iK/AkN8QyDF7G+bq5ki3b/AmDucUBffmT/gUvSULocfqm7 HFctEbmDNgODbohof9oz0ZFK5FS8YOiLgQ32Fvqs9+aDYRrvHhGxobEv9EC7U6IzU1 Ta7Sf6geKGOOzF3/29XqpGMPKY3w3HoH3FlHg4ChNHSbeTvnHsPbFY7s3qwM3gLvuJ pvteyuVK1UGC36M0Kg4SEK4Y0uHpjShwQsnpVoJcoebtuc6+2Dk4/Xe2nR96KK+pXx xBf8AnkC4zVKhOJJsD1g3t/kr/F9qNMQtZHC+2ENW9rULZtAzCDPgxPpMJG7O7nIF4 o+VQ/JjbI0lHA== Date: Sun, 31 Dec 2023 12:32:35 -0800 Subject: [PATCH 3/3] xfs: online repair of realtime summaries From: "Darrick J. Wong" To: djwong@kernel.org Cc: linux-xfs@vger.kernel.org Message-ID: <170404834333.1752917.11852698726055066260.stgit@frogsfrogsfrogs> In-Reply-To: <170404834278.1752917.3964733922134331052.stgit@frogsfrogsfrogs> References: <170404834278.1752917.3964733922134331052.stgit@frogsfrogsfrogs> User-Agent: StGit/0.19 Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Repair the realtime summary data by constructing a new rtsummary file in the scrub temporary file, then atomically swapping the contents. Signed-off-by: Darrick J. Wong --- fs/xfs/Makefile | 1 fs/xfs/scrub/common.c | 1 fs/xfs/scrub/repair.h | 3 + fs/xfs/scrub/rtsummary.c | 33 ++++--- fs/xfs/scrub/rtsummary.h | 37 ++++++++ fs/xfs/scrub/rtsummary_repair.c | 177 +++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 3 - 7 files changed, 239 insertions(+), 16 deletions(-) create mode 100644 fs/xfs/scrub/rtsummary.h create mode 100644 fs/xfs/scrub/rtsummary_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 9ce43c3037d2c..62e38f70c304b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -213,6 +213,7 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ + rtsummary_repair.o \ ) xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 78ffd6137d498..c16cd9774f525 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -31,6 +31,7 @@ #include "xfs_ag.h" #include "xfs_error.h" #include "xfs_quota.h" +#include "xfs_swapext.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 38aa5c9649d71..06125d0a2c602 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -126,8 +126,10 @@ int xrep_fscounters(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); +int xrep_rtsummary(struct xfs_scrub *sc); #else # define xrep_rtbitmap xrep_notsupported +# define xrep_rtsummary xrep_notsupported #endif /* CONFIG_XFS_RT */ #ifdef CONFIG_XFS_QUOTA @@ -212,6 +214,7 @@ xrep_setup_nothing( #define xrep_quotacheck xrep_notsupported #define xrep_nlinks xrep_notsupported #define xrep_fscounters xrep_notsupported +#define xrep_rtsummary xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index b0d90426a5cb8..5d1622203c8a9 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -16,10 +16,14 @@ #include "xfs_rtbitmap.h" #include "xfs_bit.h" #include "xfs_bmap.h" +#include "xfs_swapext.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/xfile.h" +#include "scrub/repair.h" +#include "scrub/tempswap.h" +#include "scrub/rtsummary.h" /* * Realtime Summary @@ -31,18 +35,6 @@ * (potentially large) amount of data in pageable memory. */ -struct xchk_rtsummary { - struct xfs_rtalloc_args args; - - uint64_t rextents; - uint64_t rbmblocks; - uint64_t rsumsize; - unsigned int rsumlevels; - - /* Memory buffer for the summary comparison. */ - union xfs_suminfo_raw words[]; -}; - /* Set us up to check the rtsummary file. */ int xchk_setup_rtsummary( @@ -59,6 +51,12 @@ xchk_setup_rtsummary( return -ENOMEM; sc->buf = rts; + if (xchk_could_repair(sc)) { + error = xrep_setup_rtsummary(sc, rts); + if (error) + return error; + } + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -69,7 +67,7 @@ xchk_setup_rtsummary( if (error) return error; - error = xchk_trans_alloc(sc, 0); + error = xchk_trans_alloc(sc, rts->resblks); if (error) return error; @@ -134,7 +132,7 @@ xfsum_store( sumoff << XFS_WORDLOG); } -static inline int +inline int xfsum_copyout( struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, @@ -361,7 +359,12 @@ xchk_rtsummary( error = xchk_rtsum_compare(sc); out_rbm: - /* Unlock the rtbitmap since we're done with it. */ + /* + * Unlock the rtbitmap since we're done with it. All other writers of + * the rt free space metadata grab the bitmap and summary ILOCKs in + * that order, so we're still protected against allocation activities + * even if we continue on to the repair function. + */ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } diff --git a/fs/xfs/scrub/rtsummary.h b/fs/xfs/scrub/rtsummary.h new file mode 100644 index 0000000000000..8bcffd53fc2e2 --- /dev/null +++ b/fs/xfs/scrub/rtsummary.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_RTSUMMARY_H__ +#define __XFS_SCRUB_RTSUMMARY_H__ + +struct xchk_rtsummary { +#ifdef CONFIG_XFS_ONLINE_REPAIR + struct xrep_tempswap tempswap; +#endif + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + unsigned int resblks; + + /* suminfo position of xfile as we write buffers to disk. */ + xfs_rtsumoff_t prep_wordoff; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + +int xfsum_copyout(struct xfs_scrub *sc, xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, unsigned int nr_words); + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtsummary(struct xfs_scrub *sc, struct xchk_rtsummary *rts); +#else +# define xrep_setup_rtsummary(sc, rts) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTSUMMARY_H__ */ diff --git a/fs/xfs/scrub/rtsummary_repair.c b/fs/xfs/scrub/rtsummary_repair.c new file mode 100644 index 0000000000000..058c5ebabf9a2 --- /dev/null +++ b/fs/xfs/scrub/rtsummary_repair.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtalloc.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_swapext.h" +#include "xfs_rtbitmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/tempfile.h" +#include "scrub/tempswap.h" +#include "scrub/reap.h" +#include "scrub/xfile.h" +#include "scrub/rtsummary.h" + +/* Set us up to repair the rtsummary file. */ +int +xrep_setup_rtsummary( + struct xfs_scrub *sc, + struct xchk_rtsummary *rts) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks; + int error; + + error = xrep_tempfile_create(sc, S_IFREG); + if (error) + return error; + + /* + * If we're doing a repair, we reserve enough blocks to write out a + * completely new summary file, plus twice as many blocks as we would + * need if we can only allocate one block per data fork mapping. This + * should cover the preallocation of the temporary file and swapping + * the extent mappings. + * + * We cannot use xfs_swapext_estimate because we have not yet + * constructed the replacement rtsummary and therefore do not know how + * many extents it will use. By the time we do, we will have a dirty + * transaction (which we cannot drop because we cannot drop the + * rtsummary ILOCK) and cannot ask for more reservation. + */ + blocks = XFS_B_TO_FSB(mp, mp->m_rsumsize); + blocks += xfs_bmbt_calc_size(mp, blocks) * 2; + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rts->resblks += blocks; + + /* + * Grab support for atomic extent swapping before we allocate any + * transactions or grab ILOCKs. + */ + return xrep_tempswap_grab_log_assist(sc); +} + +static int +xrep_rtsummary_prep_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp, + void *data) +{ + struct xchk_rtsummary *rts = data; + struct xfs_mount *mp = sc->mp; + union xfs_suminfo_raw *ondisk; + int error; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + rts->args.sumbp = bp; + ondisk = xfs_rsumblock_infoptr(&rts->args, 0); + rts->args.sumbp = NULL; + + bp->b_ops = &xfs_rtbuf_ops; + + error = xfsum_copyout(sc, rts->prep_wordoff, ondisk, mp->m_blockwsize); + if (error) + return error; + + rts->prep_wordoff += mp->m_blockwsize; + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_RTSUMMARY_BUF); + return 0; +} + +/* Repair the realtime summary. */ +int +xrep_rtsummary( + struct xfs_scrub *sc) +{ + struct xchk_rtsummary *rts = sc->buf; + struct xfs_mount *mp = sc->mp; + xfs_filblks_t rsumblocks; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + /* Walk away if we disagree on the size of the rt bitmap. */ + if (rts->rbmblocks != mp->m_sb.sb_rbmblocks) + return 0; + + /* Make sure any problems with the fork are fixed. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + /* + * Try to take ILOCK_EXCL of the temporary file. We had better be the + * only ones holding onto this inode, but we can't block while holding + * the rtsummary file's ILOCK_EXCL. + */ + while (!xrep_tempfile_ilock_nowait(sc)) { + if (xchk_should_terminate(sc, &error)) + return error; + delay(1); + } + + /* Make sure we have space allocated for the entire summary file. */ + rsumblocks = XFS_B_TO_FSB(mp, rts->rsumsize); + xfs_trans_ijoin(sc->tp, sc->ip, 0); + xfs_trans_ijoin(sc->tp, sc->tempip, 0); + error = xrep_tempfile_prealloc(sc, 0, rsumblocks); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* Copy the rtsummary file that we generated. */ + error = xrep_tempfile_copyin(sc, 0, rsumblocks, + xrep_rtsummary_prep_buf, rts); + if (error) + return error; + error = xrep_tempfile_set_isize(sc, rts->rsumsize); + if (error) + return error; + + /* + * Now swap the extents. Nothing in repair uses the temporary buffer, + * so we can reuse it for the tempfile swapext information. + */ + error = xrep_tempswap_trans_reserve(sc, XFS_DATA_FORK, &rts->tempswap); + if (error) + return error; + + error = xrep_tempswap_contents(sc, &rts->tempswap); + if (error) + return error; + + /* Reset incore state and blow out the summary cache. */ + if (mp->m_rsum_cache) + memset(mp->m_rsum_cache, 0xFF, mp->m_sb.sb_rbmblocks); + + mp->m_rsumlevels = rts->rsumlevels; + mp->m_rsumsize = rts->rsumsize; + + /* Free the old rtsummary blocks if they're not in use. */ + return xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index afc82f1e40ffb..9af91874e58b9 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -18,6 +18,7 @@ #include "xfs_buf_xfile.h" #include "xfs_rmap.h" #include "xfs_xchgrange.h" +#include "xfs_swapext.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -357,7 +358,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .repair = xrep_notsupported, + .repair = xrep_rtsummary, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS,