@@ -51,6 +51,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_refcount.o \
xfs_refcount_btree.o \
xfs_sb.o \
+ xfs_swapext.o \
xfs_symlink_remote.o \
xfs_trans_inode.o \
xfs_trans_resv.o \
@@ -158,6 +158,13 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
{ BMAP_ATTRFORK, "ATTR" }, \
{ BMAP_COWFORK, "COW" }
+/* Return true if the extent is an allocated extent, written or not. */
+static inline bool xfs_bmap_is_mapped_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
/*
* Return true if the extent is a real, allocated extent, or false if it is a
@@ -165,10 +172,8 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
*/
static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
{
- return irec->br_state != XFS_EXT_UNWRITTEN &&
- irec->br_startblock != HOLESTARTBLOCK &&
- irec->br_startblock != DELAYSTARTBLOCK &&
- !isnullstartblock(irec->br_startblock);
+ return xfs_bmap_is_mapped_extent(irec) &&
+ irec->br_state != XFS_EXT_UNWRITTEN;
}
/*
@@ -184,6 +184,7 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
[XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
[XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_SWAPEXT] = &xfs_swapext_defer_type,
};
/*
@@ -18,6 +18,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_SWAPEXT,
XFS_DEFER_OPS_TYPE_MAX,
};
@@ -65,6 +66,7 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_swapext_defer_type;
/*
* Deferred operation freezer. This structure enables a dfops user to detach
new file mode 100644
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_quota.h"
+#include "xfs_swapext.h"
+#include "xfs_trace.h"
+
+/* Information to help us reset reflink flag / CoW fork state after a swap. */
+
+/* Are we swapping the data fork? */
+#define XFS_SX_REFLINK_DATAFORK (1U << 0)
+
+/* Can we swap the flags? */
+#define XFS_SX_REFLINK_SWAPFLAGS (1U << 1)
+
+/* Previous state of the two inodes' reflink flags. */
+#define XFS_SX_REFLINK_IP1_REFLINK (1U << 2)
+#define XFS_SX_REFLINK_IP2_REFLINK (1U << 3)
+
+
+/*
+ * Prepare both inodes' reflink state for an extent swap, and return our
+ * findings so that xfs_swapext_reflink_finish can deal with the aftermath.
+ */
+unsigned int
+xfs_swapext_reflink_prep(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2,
+ int whichfork,
+ xfs_fileoff_t startoff1,
+ xfs_fileoff_t startoff2,
+ xfs_filblks_t blockcount)
+{
+ struct xfs_mount *mp = ip1->i_mount;
+ unsigned int rs = 0;
+
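+	/*
+	 * Only the data fork carries the reflink flag, so attr fork swaps
+	 * have no reflink state to prepare.
+	 */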
+ if (whichfork != XFS_DATA_FORK)
+ return 0;
+
+ /*
+ * If either file has shared blocks and we're swapping data forks, we
+ * must flag the other file as having shared blocks so that we get the
+ * shared-block rmap functions if we need to fix up the rmaps. The
+ * flags will be switched for real by xfs_swapext_reflink_finish.
+ */
+ if (xfs_is_reflink_inode(ip1))
+ rs |= XFS_SX_REFLINK_IP1_REFLINK;
+ if (xfs_is_reflink_inode(ip2))
+ rs |= XFS_SX_REFLINK_IP2_REFLINK;
+
+ if (rs & XFS_SX_REFLINK_IP1_REFLINK)
+ ip2->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+ if (rs & XFS_SX_REFLINK_IP2_REFLINK)
+ ip1->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+
+ /*
+	 * If exactly one of the two files had the reflink flag set before, and
+	 * we're swapping the entirety of both files, then we can exchange the
+	 * reflink flags at the end.  Otherwise, the reflink flag propagates
+	 * from each file to the other, as set up above.
+	 *
+	 * Note that we've only set the _REFLINK bits of the reflink state, so
+	 * we can cheat and use hweight32 for the reflink flag test.
+	 */
+ if (hweight32(rs) == 1 && startoff1 == 0 && startoff2 == 0 &&
+ blockcount == XFS_B_TO_FSB(mp, ip1->i_d.di_size) &&
+ blockcount == XFS_B_TO_FSB(mp, ip2->i_d.di_size))
+ rs |= XFS_SX_REFLINK_SWAPFLAGS;
+
+ rs |= XFS_SX_REFLINK_DATAFORK;
+ return rs;
+}
+
+/*
+ * If the reflink flag is set on either inode, make sure it has an incore CoW
+ * fork, since all reflink inodes must have one.  If there's a CoW fork and it
+ * has extents in it, make sure the inodes are tagged appropriately so that
+ * speculative preallocations can be GC'd if we run low on space.
+ */
+static inline void
+xfs_swapext_ensure_cowfork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *cfork;
+
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+
+ cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ if (!cfork)
+ return;
+ if (cfork->if_bytes > 0)
+ xfs_inode_set_cowblocks_tag(ip);
+ else
+ xfs_inode_clear_cowblocks_tag(ip);
+}
+
+/*
+ * Set both inodes' ondisk reflink flags to their final state and ensure that
+ * the incore state is ready to go.
+ */
+void
+xfs_swapext_reflink_finish(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2,
+ unsigned int rs)
+{
+ if (!(rs & XFS_SX_REFLINK_DATAFORK))
+ return;
+
+ if (rs & XFS_SX_REFLINK_SWAPFLAGS) {
+ /* Exchange the reflink inode flags and log them. */
+ ip1->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (rs & XFS_SX_REFLINK_IP2_REFLINK)
+ ip1->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+
+ ip2->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (rs & XFS_SX_REFLINK_IP1_REFLINK)
+ ip2->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+
+ xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+ }
+
+ xfs_swapext_ensure_cowfork(ip1);
+ xfs_swapext_ensure_cowfork(ip2);
+}
+
+/* Schedule an atomic extent swap. */
+static inline void
+xfs_swapext_schedule(
+ struct xfs_trans *tp,
+ struct xfs_swapext_intent *sxi)
+{
+ trace_xfs_swapext_defer(tp->t_mountp, sxi);
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_SWAPEXT, &sxi->si_list);
+}
+
+/* Reschedule an atomic extent swap on behalf of log recovery. */
+void
+xfs_swapext_reschedule(
+ struct xfs_trans *tp,
+ const struct xfs_swapext_intent *sxi)
+{
+ struct xfs_swapext_intent *new_sxi;
+
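+	/* NOFS allocation: we can be called while a transaction is running. */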
+ new_sxi = kmem_alloc(sizeof(struct xfs_swapext_intent), KM_NOFS);
+ memcpy(new_sxi, sxi, sizeof(*new_sxi));
+ INIT_LIST_HEAD(&new_sxi->si_list);
+
+ xfs_swapext_schedule(tp, new_sxi);
+}
+
+/*
+ * Adjust the on-disk inode size upwards if needed so that we never map extents
+ * into the file past EOF. This is crucial so that log recovery won't get
+ * confused by the sudden appearance of post-eof extents.
+ */
+STATIC void
+xfs_swapext_update_size(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ xfs_fsize_t new_isize)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsize_t len;
+
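+	/* A negative size means the caller doesn't want the size changed. */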
+ if (new_isize < 0)
+ return;
+
+ len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
+ new_isize);
+
+ if (len <= ip->i_d.di_size)
+ return;
+
+ trace_xfs_swapext_update_inode_size(ip, len);
+
+ ip->i_d.di_size = len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/* Do we have more work to do to finish this operation? */
+bool
+xfs_swapext_has_more_work(
+ struct xfs_swapext_intent *sxi)
+{
+ return sxi->si_blockcount > 0;
+}
+
+/* Finish one extent swap, possibly logging more work for later. */
+int
+xfs_swapext_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_swapext_intent *sxi)
+{
+ struct xfs_bmbt_irec irec1, irec2;
+ int whichfork;
+ int nimaps;
+ int bmap_flags;
+ int error;
+
+ whichfork = (sxi->si_flags & XFS_SWAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bmap_flags = xfs_bmapi_aflag(whichfork);
+
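+	/*
+	 * Each pass through this loop swaps at most one extent.  Ranges that
+	 * already point at the same physical space are skipped via the
+	 * continue below; otherwise we break out after remapping one extent
+	 * so that the caller can log our progress.
+	 */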
+ while (sxi->si_blockcount > 0) {
+ int64_t ip1_delta = 0, ip2_delta = 0;
+
+ /* Read extent from the first file */
+ nimaps = 1;
+ error = xfs_bmapi_read(sxi->si_ip1, sxi->si_startoff1,
+ sxi->si_blockcount, &irec1, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec1.br_startblock == DELAYSTARTBLOCK ||
+ irec1.br_startoff != sxi->si_startoff1) {
+ /*
+			 * The caller flushed both inodes and we hold the
+			 * ILOCKs for both, so we should never fail to find a
+			 * mapping, see a delalloc extent, or get back a
+			 * mapping that doesn't start where we asked.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /* Read extent from the second file */
+ nimaps = 1;
+ error = xfs_bmapi_read(sxi->si_ip2, sxi->si_startoff2,
+ irec1.br_blockcount, &irec2, &nimaps,
+ bmap_flags);
+ if (error)
+ return error;
+ if (nimaps != 1 ||
+ irec2.br_startblock == DELAYSTARTBLOCK ||
+ irec2.br_startoff != sxi->si_startoff2) {
+ /*
+			 * The caller flushed both inodes and we hold the
+			 * ILOCKs for both, so we should never fail to find a
+			 * mapping, see a delalloc extent, or get back a
+			 * mapping that doesn't start where we asked.
+ */
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ /*
+ * We can only swap as many blocks as the smaller of the two
+ * extent maps.
+ */
+ irec1.br_blockcount = min(irec1.br_blockcount,
+ irec2.br_blockcount);
+
+ trace_xfs_swapext_extent1(sxi->si_ip1, &irec1);
+ trace_xfs_swapext_extent2(sxi->si_ip2, &irec2);
+
+ /*
+ * Two extents mapped to the same physical block must not have
+ * different states; that's filesystem corruption. Move on to
+ * the next extent if they're both holes or both the same
+ * physical extent.
+ */
+ if (irec1.br_startblock == irec2.br_startblock) {
+ if (irec1.br_state != irec2.br_state)
+ return -EFSCORRUPTED;
+
+ sxi->si_startoff1 += irec1.br_blockcount;
+ sxi->si_startoff2 += irec1.br_blockcount;
+ sxi->si_blockcount -= irec1.br_blockcount;
+ continue;
+ }
+
+ /* Update quota accounting. */
+ if (xfs_bmap_is_mapped_extent(&irec1)) {
+ ip1_delta -= irec1.br_blockcount;
+ ip2_delta += irec1.br_blockcount;
+ }
+ if (xfs_bmap_is_mapped_extent(&irec2)) {
+ ip1_delta += irec2.br_blockcount;
+ ip2_delta -= irec2.br_blockcount;
+ }
+
+ if (ip1_delta)
+ xfs_trans_mod_dquot_byino(tp, sxi->si_ip1,
+ XFS_TRANS_DQ_BCOUNT, ip1_delta);
+ if (ip2_delta)
+ xfs_trans_mod_dquot_byino(tp, sxi->si_ip2,
+ XFS_TRANS_DQ_BCOUNT, ip2_delta);
+
+ /* Remove both mappings. */
+ xfs_bmap_unmap_extent(tp, sxi->si_ip1, whichfork, &irec1);
+ xfs_bmap_unmap_extent(tp, sxi->si_ip2, whichfork, &irec2);
+
+ /*
+ * Re-add both mappings. We swap the file offsets between the
+ * two maps and add the opposite map, which has the effect of
+		 * filling the logical offsets we just unmapped, but with
+ * the physical mapping information swapped.
+ */
+ swap(irec1.br_startoff, irec2.br_startoff);
+ xfs_bmap_map_extent(tp, sxi->si_ip1, whichfork, &irec2);
+ xfs_bmap_map_extent(tp, sxi->si_ip2, whichfork, &irec1);
+
+ /* Make sure we're not mapping extents past EOF. */
+ if (whichfork == XFS_DATA_FORK) {
+ xfs_swapext_update_size(tp, sxi->si_ip1, &irec2,
+ sxi->si_isize1);
+ xfs_swapext_update_size(tp, sxi->si_ip2, &irec1,
+ sxi->si_isize2);
+ }
+
+ /*
+ * Advance our cursor and exit. The caller (either defer ops
+		 * or log recovery) will log the SXD item, and if si_blockcount
+ * is nonzero, it will log a new SXI item for the remainder
+ * and call us back.
+ */
+ sxi->si_startoff1 += irec1.br_blockcount;
+ sxi->si_startoff2 += irec1.br_blockcount;
+ sxi->si_blockcount -= irec1.br_blockcount;
+ break;
+ }
+
+ /*
+	 * If we've reached the end of the swap operation and the caller
+ * wanted us to exchange the sizes, do that now.
+ */
+ if (sxi->si_blockcount == 0 &&
+ (sxi->si_flags & XFS_SWAP_EXTENT_SET_SIZES)) {
+ sxi->si_ip1->i_d.di_size = sxi->si_isize1;
+ sxi->si_ip2->i_d.di_size = sxi->si_isize2;
+ xfs_trans_log_inode(tp, sxi->si_ip1, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, sxi->si_ip2, XFS_ILOG_CORE);
+ }
+
+ if (xfs_swapext_has_more_work(sxi))
+ trace_xfs_swapext_defer(tp->t_mountp, sxi);
+ return 0;
+}
+
+static void
+xfs_swapext_init_intent(
+ struct xfs_swapext_intent *sxi,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2,
+ int whichfork,
+ xfs_fileoff_t startoff1,
+ xfs_fileoff_t startoff2,
+ xfs_filblks_t blockcount,
+ unsigned int flags)
+{
+ INIT_LIST_HEAD(&sxi->si_list);
+ sxi->si_flags = 0;
+ if (whichfork == XFS_ATTR_FORK)
+ sxi->si_flags |= XFS_SWAP_EXTENT_ATTR_FORK;
+ sxi->si_isize1 = sxi->si_isize2 = -1;
+ if (whichfork == XFS_DATA_FORK && (flags & XFS_SWAPEXT_SET_SIZES)) {
+ sxi->si_flags |= XFS_SWAP_EXTENT_SET_SIZES;
+ sxi->si_isize1 = ip2->i_d.di_size;
+ sxi->si_isize2 = ip1->i_d.di_size;
+ }
+ sxi->si_ip1 = ip1;
+ sxi->si_ip2 = ip2;
+ sxi->si_startoff1 = startoff1;
+ sxi->si_startoff2 = startoff2;
+ sxi->si_blockcount = blockcount;
+}
+
+/*
+ * Atomically swap a range of extents from one inode to another.
+ *
+ * The caller must ensure that both inodes are joined to the transaction and
+ * ILOCKed; they will still be joined to the transaction at exit.
+ */
+int
+xfs_swapext_atomic(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2,
+ int whichfork,
+ xfs_fileoff_t startoff1,
+ xfs_fileoff_t startoff2,
+ xfs_filblks_t blockcount,
+ unsigned int flags)
+{
+ struct xfs_swapext_intent *sxi;
+ unsigned int state;
+ int error;
+
+ ASSERT(xfs_isilocked(ip1, XFS_ILOCK_EXCL));
+ ASSERT(xfs_isilocked(ip2, XFS_ILOCK_EXCL));
+ ASSERT(whichfork != XFS_COW_FORK);
+ ASSERT(whichfork == XFS_DATA_FORK || !(flags & XFS_SWAPEXT_SET_SIZES));
+
+ state = xfs_swapext_reflink_prep(ip1, ip2, whichfork, startoff1,
+ startoff2, blockcount);
+
+ sxi = kmem_alloc(sizeof(struct xfs_swapext_intent), KM_NOFS);
+ xfs_swapext_init_intent(sxi, ip1, ip2, whichfork, startoff1, startoff2,
+ blockcount, flags);
+ xfs_swapext_schedule(*tpp, sxi);
+
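+	/*
+	 * Finish all deferred work now, including the swap itself, so that
+	 * the reflink flag fixup below sees the final mapping state.
+	 */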
+ error = xfs_defer_finish(tpp);
+ if (error)
+ return error;
+
+ xfs_swapext_reflink_finish(*tpp, ip1, ip2, state);
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SWAPEXT_H_
+#define __XFS_SWAPEXT_H_ 1
+
+/*
+ * In-core information about an extent swap request between ranges of two
+ * inodes.
+ */
+struct xfs_swapext_intent {
+ /* List of other incore deferred work. */
+ struct list_head si_list;
+
+ /* The two inodes we're swapping. */
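+	/*
+	 * While a deferred swap is frozen, the pointers are replaced by bare
+	 * inode numbers; see xfs_swapext_freeze_item and
+	 * xfs_swapext_thaw_item.
+	 */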
+ union {
+ struct xfs_inode *si_ip1;
+ xfs_ino_t si_ino1;
+ };
+ union {
+ struct xfs_inode *si_ip2;
+ xfs_ino_t si_ino2;
+ };
+
+ /* File offset range information. */
+ xfs_fileoff_t si_startoff1;
+ xfs_fileoff_t si_startoff2;
+ xfs_filblks_t si_blockcount;
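+
+	/* XFS_SWAP_EXTENT_* state flags, logged as se_flags in the SXI. */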
+ uint64_t si_flags;
+
+ /* Set these file sizes after the operation, unless negative. */
+ xfs_fsize_t si_isize1;
+ xfs_fsize_t si_isize2;
+};
+
+bool xfs_swapext_has_more_work(struct xfs_swapext_intent *sxi);
+
+unsigned int xfs_swapext_reflink_prep(struct xfs_inode *ip1,
+ struct xfs_inode *ip2, int whichfork, xfs_fileoff_t startoff1,
+ xfs_fileoff_t startoff2, xfs_filblks_t blockcount);
+void xfs_swapext_reflink_finish(struct xfs_trans *tp, struct xfs_inode *ip1,
+ struct xfs_inode *ip2, unsigned int reflink_state);
+
+void xfs_swapext_reschedule(struct xfs_trans *tp,
+ const struct xfs_swapext_intent *sxi_state);
+int xfs_swapext_finish_one(struct xfs_trans *tp,
+ struct xfs_swapext_intent *sxi_state);
+
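+/* Exchange the file sizes of the two inodes once the swap is done. */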
+#define XFS_SWAPEXT_SET_SIZES (1U << 0)
+int xfs_swapext_atomic(struct xfs_trans **tpp, struct xfs_inode *ip1,
+ struct xfs_inode *ip2, int whichfork, xfs_fileoff_t startoff1,
+ xfs_fileoff_t startoff2, xfs_filblks_t blockcount,
+ unsigned int flags);
+
+#endif /* __XFS_SWAPEXT_H_ */
@@ -16,9 +16,11 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_swapext_item.h"
+#include "xfs_swapext.h"
#include "xfs_log.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_error.h"
#include "xfs_log_priv.h"
@@ -205,6 +207,240 @@ static const struct xfs_item_ops xfs_sxd_item_ops = {
.iop_release = xfs_sxd_item_release,
};
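+
+/* Create an SXD item, tie it to the SXI, and add it to the transaction. */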
+static struct xfs_sxd_log_item *
+xfs_trans_get_sxd(
+ struct xfs_trans *tp,
+ struct xfs_sxi_log_item *ilip)
+{
+ struct xfs_sxd_log_item *dlip;
+
+ dlip = kmem_zone_zalloc(xfs_sxd_zone, 0);
+ xfs_log_item_init(tp->t_mountp, &dlip->sxd_item, XFS_LI_SXD,
+ &xfs_sxd_item_ops);
+ dlip->sxd_intent_log_item = ilip;
+ dlip->sxd_format.sxd_sxi_id = ilip->sxi_format.sxi_id;
+
+ xfs_trans_add_item(tp, &dlip->sxd_item);
+ return dlip;
+}
+
+/*
+ * Finish a swapext update and log it to the SXD.  Note that the
+ * transaction is marked dirty regardless of whether the swapext update
+ * succeeds or fails, in order to support the SXI/SXD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_swapext_update(
+ struct xfs_trans *tp,
+ struct xfs_sxd_log_item *dlip,
+ struct xfs_swapext_intent *sxi)
+{
+ int error;
+
+ error = xfs_swapext_finish_one(tp, sxi);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the SXI and frees the SXD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &dlip->sxd_item.li_flags);
+
+ return error;
+}
+
+/* Sort swapext intents by inode. */
+static int
+xfs_swapext_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_swapext_intent *sa;
+ struct xfs_swapext_intent *sb;
+
+ sa = container_of(a, struct xfs_swapext_intent, si_list);
+ sb = container_of(b, struct xfs_swapext_intent, si_list);
+	return sa->si_ip1->i_ino - sb->si_ip1->i_ino;
+}
+
+/* Get an SXI. */
+STATIC void *
+xfs_swapext_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_sxi_log_item *ilip;
+
+ ASSERT(count == XFS_SXI_MAX_FAST_EXTENTS);
+ ASSERT(tp != NULL);
+
+ ilip = xfs_sxi_init(tp->t_mountp);
+ ASSERT(ilip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &ilip->sxi_item);
+ return ilip;
+}
+
+/* Log swapext updates in the intent item. */
+STATIC void
+xfs_swapext_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_sxi_log_item *ilip = intent;
+ struct xfs_swapext_intent *sxi;
+ struct xfs_swap_extent *se;
+
+ ASSERT(!test_bit(XFS_LI_DIRTY, &ilip->sxi_item.li_flags));
+
+ sxi = container_of(item, struct xfs_swapext_intent, si_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &ilip->sxi_item.li_flags);
+
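+	/* Copy the incore intent state into the ondisk log format. */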
+ se = &ilip->sxi_format.sxi_extent;
+ se->se_inode1 = sxi->si_ip1->i_ino;
+ se->se_inode2 = sxi->si_ip2->i_ino;
+ se->se_startoff1 = sxi->si_startoff1;
+ se->se_startoff2 = sxi->si_startoff2;
+ se->se_blockcount = sxi->si_blockcount;
+ se->se_isize1 = sxi->si_isize1;
+ se->se_isize2 = sxi->si_isize2;
+ se->se_flags = sxi->si_flags;
+}
+
+/* Get an SXD so we can process all the deferred swapext updates. */
+STATIC void *
+xfs_swapext_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_sxd(tp, intent);
+}
+
+/* Process a deferred swapext update. */
+STATIC int
+xfs_swapext_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_swapext_intent *sxi;
+ int error;
+
+ sxi = container_of(item, struct xfs_swapext_intent, si_list);
+
+ /*
+ * Swap one more extent between the two files. If there's still more
+ * work to do, we want to requeue ourselves after all other pending
+ * deferred operations have finished. This includes all of the dfops
+ * that we queued directly as well as any new ones created in the
+ * process of finishing the others. Doing so prevents us from queuing
+ * a large number of SXI log items in kernel memory, which in turn
+ * prevents us from pinning the tail of the log (while logging those
+ * new SXI items) until the first SXI items can be processed.
+ */
+ error = xfs_trans_log_finish_swapext_update(tp, done_item, sxi);
+ if (!error && xfs_swapext_has_more_work(sxi))
+ return -EMULTIHOP;
+
+ kmem_free(sxi);
+ return error;
+}
+
+/* Abort all pending SXIs. */
+STATIC void
+xfs_swapext_abort_intent(
+ void *intent)
+{
+ xfs_sxi_release(intent);
+}
+
+/* Cancel a deferred swapext update. */
+STATIC void
+xfs_swapext_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_swapext_intent *sxi;
+
+ sxi = container_of(item, struct xfs_swapext_intent, si_list);
+ kmem_free(sxi);
+}
+
+/* Prepare a deferred swapext item for freezing by detaching the inodes. */
+STATIC int
+xfs_swapext_freeze_item(
+ struct xfs_defer_freezer *freezer,
+ struct list_head *item)
+{
+ struct xfs_swapext_intent *sxi;
+ struct xfs_inode *ip;
+ int error;
+
+ sxi = container_of(item, struct xfs_swapext_intent, si_list);
+
+ ip = sxi->si_ip1;
+ error = xfs_defer_freezer_ijoin(freezer, ip);
+ if (error)
+ return error;
+ sxi->si_ino1 = ip->i_ino;
+
+ ip = sxi->si_ip2;
+ error = xfs_defer_freezer_ijoin(freezer, ip);
+ if (error)
+ return error;
+ sxi->si_ino2 = ip->i_ino;
+
+ return 0;
+}
+
+/* Thaw a deferred swapext item by reattaching the inodes. */
+STATIC int
+xfs_swapext_thaw_item(
+ struct xfs_defer_freezer *freezer,
+ struct list_head *item)
+{
+ struct xfs_swapext_intent *sxi;
+ struct xfs_inode *ip;
+
+ sxi = container_of(item, struct xfs_swapext_intent, si_list);
+
+ ip = xfs_defer_freezer_igrab(freezer, sxi->si_ino1);
+ if (!ip)
+ return -EFSCORRUPTED;
+ sxi->si_ip1 = ip;
+
+ ip = xfs_defer_freezer_igrab(freezer, sxi->si_ino2);
+ if (!ip)
+ return -EFSCORRUPTED;
+ sxi->si_ip2 = ip;
+
+ return 0;
+}
+
+const struct xfs_defer_op_type xfs_swapext_defer_type = {
+ .max_items = XFS_SXI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_swapext_diff_items,
+ .create_intent = xfs_swapext_create_intent,
+ .abort_intent = xfs_swapext_abort_intent,
+ .log_item = xfs_swapext_log_item,
+ .create_done = xfs_swapext_create_done,
+ .finish_item = xfs_swapext_finish_item,
+ .cancel_item = xfs_swapext_cancel_item,
+ .freeze_item = xfs_swapext_freeze_item,
+ .thaw_item = xfs_swapext_thaw_item,
+};
+
/*
* Process a swapext update intent item that was recovered from the log.
* We need to update some inode's bmbt.
@@ -215,7 +451,105 @@ xfs_sxi_recover(
struct xfs_defer_freezer **dffp,
struct xfs_sxi_log_item *ilip)
{
- return -EFSCORRUPTED;
+ struct xfs_swapext_intent sxi;
+ struct xfs_swap_extent *se;
+ struct xfs_sxd_log_item *dlip;
+ struct xfs_trans *tp;
+ int error = 0;
+
+ ASSERT(!test_bit(XFS_SXI_RECOVERED, &ilip->sxi_flags));
+
+ /*
+ * First check the validity of the extent described by the
+ * SXI. If anything is bad, then toss the SXI.
+ */
+ se = &ilip->sxi_format.sxi_extent;
+ if (se->se_blockcount == 0 ||
+ ilip->sxi_format.__pad != 0 ||
+ !xfs_verify_ino(mp, se->se_inode1) ||
+ !xfs_verify_ino(mp, se->se_inode2) ||
+ (se->se_flags & ~XFS_SWAP_EXTENT_FLAGS) ||
+ ((se->se_flags & XFS_SWAP_EXTENT_SET_SIZES) &&
+ (se->se_isize1 < 0 || se->se_isize2 < 0))) {
+ /*
+ * This will pull the SXI from the AIL and
+ * free the memory associated with it.
+ */
+ set_bit(XFS_SXI_RECOVERED, &ilip->sxi_flags);
+ xfs_sxi_release(ilip);
+ return -EFSCORRUPTED;
+ }
+
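+	/*
+	 * Use the same itruncate reservation as the other intent recovery
+	 * functions; adding one mapping to a data fork bmbt is assumed to be
+	 * the worst case here.
+	 */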
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ return error;
+
+ dlip = xfs_trans_get_sxd(tp, ilip);
+ memset(&sxi, 0, sizeof(sxi));
+ INIT_LIST_HEAD(&sxi.si_list);
+
+ /* Grab both inodes and lock them. */
+ error = xfs_iget(mp, tp, se->se_inode1, 0, 0, &sxi.si_ip1);
+ if (error)
+ goto out_fail;
+	error = xfs_iget(mp, tp, se->se_inode2, 0, 0, &sxi.si_ip2);
+	if (error) {
+		/* si_ip1 was never locked, so release it directly. */
+		xfs_irele(sxi.si_ip1);
+		sxi.si_ip1 = NULL;
+		goto out_fail;
+	}
+
+ xfs_lock_two_inodes(sxi.si_ip1, XFS_ILOCK_EXCL,
+ sxi.si_ip2, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, sxi.si_ip1, 0);
+ xfs_trans_ijoin(tp, sxi.si_ip2, 0);
+
+ /*
+ * Set IRECOVERY to prevent trimming of post-eof extents and freeing of
+ * unlinked inodes until we're totally done processing files.
+ */
+ if (VFS_I(sxi.si_ip1)->i_nlink == 0)
+ xfs_iflags_set(sxi.si_ip1, XFS_IRECOVERY);
+ if (VFS_I(sxi.si_ip2)->i_nlink == 0)
+ xfs_iflags_set(sxi.si_ip2, XFS_IRECOVERY);
+
+ /*
+ * Construct the rest of our in-core swapext intent state so that we
+ * can call the deferred operation functions to continue the work.
+ */
+ sxi.si_flags = se->se_flags;
+ sxi.si_startoff1 = se->se_startoff1;
+ sxi.si_startoff2 = se->se_startoff2;
+ sxi.si_blockcount = se->se_blockcount;
+ sxi.si_isize1 = se->se_isize1;
+ sxi.si_isize2 = se->se_isize2;
+ error = xfs_trans_log_finish_swapext_update(tp, dlip, &sxi);
+ if (error)
+ goto out_fail;
+
+ /*
+ * If there's more extent swapping to be done, we have to schedule that
+ * as a separate deferred operation to be run after we've finished
+ * replaying all of the intents we recovered from the log.
+ */
+ if (xfs_swapext_has_more_work(&sxi))
+ xfs_swapext_reschedule(tp, &sxi);
+
+ set_bit(XFS_SXI_RECOVERED, &ilip->sxi_flags);
+ error = xlog_recover_trans_commit(tp, dffp);
+ goto out_rele;
+
+out_fail:
+ xfs_trans_cancel(tp);
+out_rele:
+ if (sxi.si_ip2) {
+ xfs_iunlock(sxi.si_ip2, XFS_ILOCK_EXCL);
+ xfs_irele(sxi.si_ip2);
+ }
+ if (sxi.si_ip1) {
+ xfs_iunlock(sxi.si_ip1, XFS_ILOCK_EXCL);
+ xfs_irele(sxi.si_ip1);
+ }
+	return error;
}
/*
@@ -30,6 +30,7 @@
#include "xfs_fsmap.h"
#include "xfs_btree_staging.h"
#include "xfs_icache.h"
+#include "xfs_swapext.h"
/*
* We include this last to have the helpers above available for the trace
@@ -37,6 +37,7 @@ struct xfs_trans_res;
struct xfs_inobt_rec_incore;
union xfs_btree_ptr;
struct xfs_eofblocks;
+struct xfs_swapext_intent;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -3207,6 +3208,9 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+DEFINE_INODE_IREC_EVENT(xfs_swapext_extent1);
+DEFINE_INODE_IREC_EVENT(xfs_swapext_extent2);
+DEFINE_ITRUNC_EVENT(xfs_swapext_update_inode_size);
/* fsmap traces */
DECLARE_EVENT_CLASS(xfs_fsmap_class,
@@ -3836,6 +3840,51 @@ DEFINE_NAMESPACE_EVENT(xfs_imeta_dir_created);
DEFINE_NAMESPACE_EVENT(xfs_imeta_dir_unlinked);
DEFINE_NAMESPACE_EVENT(xfs_imeta_dir_zap);
+#define XFS_SWAPEXT_FLAGS \
+ { XFS_SWAP_EXTENT_ATTR_FORK, "ATTRFORK" }, \
+ { XFS_SWAP_EXTENT_SET_SIZES, "SETSIZES" }
+
+TRACE_EVENT(xfs_swapext_defer,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_swapext_intent *sxi),
+ TP_ARGS(mp, sxi),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino1)
+ __field(xfs_ino_t, ino2)
+ __field(uint64_t, flags)
+ __field(xfs_fileoff_t, startoff1)
+ __field(xfs_fileoff_t, startoff2)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_fsize_t, isize1)
+ __field(xfs_fsize_t, isize2)
+ __field(xfs_fsize_t, new_isize1)
+ __field(xfs_fsize_t, new_isize2)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ino1 = sxi->si_ip1->i_ino;
+ __entry->ino2 = sxi->si_ip2->i_ino;
+ __entry->flags = sxi->si_flags;
+ __entry->startoff1 = sxi->si_startoff1;
+ __entry->startoff2 = sxi->si_startoff2;
+ __entry->blockcount = sxi->si_blockcount;
+ __entry->isize1 = sxi->si_ip1->i_d.di_size;
+ __entry->isize2 = sxi->si_ip2->i_d.di_size;
+ __entry->new_isize1 = sxi->si_isize1;
+ __entry->new_isize2 = sxi->si_isize2;
+ ),
+ TP_printk("dev %d:%d ino1 0x%llx isize1 %lld ino2 0x%llx isize2 %lld flags (%s) startoff1 %llu startoff2 %llu blockcount %llu newisize1 %lld newisize2 %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino1, __entry->isize1,
+ __entry->ino2, __entry->isize2,
+ __print_flags(__entry->flags, "|", XFS_SWAPEXT_FLAGS),
+ __entry->startoff1,
+ __entry->startoff2,
+ __entry->blockcount,
+ __entry->new_isize1, __entry->new_isize2)
+);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH