@@ -325,6 +325,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
}
+/*
+ * Report whether this file's allocation unit is larger than one fs block:
+ * true only for a realtime inode on a mount whose sb_rextsize exceeds 1.
+ */
+static inline bool xfs_inode_has_bigallocunit(struct xfs_inode *ip)
+{
+	if (!XFS_IS_REALTIME_INODE(ip))
+		return false;
+
+	return ip->i_mount->m_sb.sb_rextsize > 1;
+}
+
/* Always set the child's GID to this value, even if the parent is setgid. */
#define CRED_FORCE_GID (1U << 0)
struct cred {
@@ -29,6 +29,7 @@
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
#include "xfs_symlink_remote.h"
+#include "xfs_rtbitmap.h"
struct kmem_cache *xfs_swapext_intent_cache;
@@ -131,6 +132,102 @@ sxi_advance(
sxi->sxi_blockcount -= irec->br_blockcount;
}
+#ifdef DEBUG
+/*
+ * If we're going to do a BUI-only extent swap, ensure that all mappings are
+ * aligned to the realtime extent size.
+ *
+ * Walks the requested range of both files one mapping at a time.  Returns 0
+ * if no check is required or every mapping passes, -EINVAL (after tripping
+ * an ASSERT) for the first misaligned mapping, or whatever error
+ * xfs_bmapi_read() reports.
+ */
+static inline int
+xfs_swapext_check_rt_extents(
+	struct xfs_mount		*mp,
+	const struct xfs_swapext_req	*req)
+{
+	struct xfs_bmbt_irec		irec1, irec2;
+	xfs_fileoff_t			startoff1 = req->startoff1;
+	xfs_fileoff_t			startoff2 = req->startoff2;
+	xfs_filblks_t			blockcount = req->blockcount;
+	uint32_t			mod;
+	int				nimaps;
+	int				error;
+
+	/* xattrs don't live on the rt device */
+	if (req->whichfork == XFS_ATTR_FORK)
+		return 0;
+
+	/*
+	 * Caller got permission to use SXI log items, so log recovery will
+	 * finish the swap and not leave us with partially swapped rt extents
+	 * exposed to userspace.
+	 */
+	if (req->req_flags & XFS_SWAP_REQ_LOGGED)
+		return 0;
+
+	/*
+	 * Allocation units must be fully mapped to a file range.  For files
+	 * with a single-fsblock allocation unit, this is trivial.
+	 */
+	if (!xfs_inode_has_bigallocunit(req->ip2))
+		return 0;
+
+	/*
+	 * For multi-fsblock allocation units, we must check the alignment of
+	 * every single mapping.
+	 */
+	while (blockcount > 0) {
+		/* Read extent from the first file */
+		nimaps = 1;
+		error = xfs_bmapi_read(req->ip1, startoff1, blockcount,
+				&irec1, &nimaps, 0);
+		if (error)
+			return error;
+		ASSERT(nimaps == 1);
+
+		/*
+		 * Read extent from the second file, asking for no more than
+		 * the length of the first file's mapping.
+		 */
+		nimaps = 1;
+		error = xfs_bmapi_read(req->ip2, startoff2,
+				irec1.br_blockcount, &irec2, &nimaps,
+				0);
+		if (error)
+			return error;
+		ASSERT(nimaps == 1);
+
+		/*
+		 * We can only swap as many blocks as the smaller of the two
+		 * extent maps.
+		 */
+		irec1.br_blockcount = min(irec1.br_blockcount,
+					  irec2.br_blockcount);
+
+		/* Both mappings must be aligned to the realtime extent size. */
+		mod = xfs_rtb_to_rtxoff(mp, irec1.br_startoff);
+		if (mod) {
+			ASSERT(mod == 0);
+			return -EINVAL;
+		}
+
+		/* Check the second file's mapping, not the first one again. */
+		mod = xfs_rtb_to_rtxoff(mp, irec2.br_startoff);
+		if (mod) {
+			ASSERT(mod == 0);
+			return -EINVAL;
+		}
+
+		/* The clamped length is shared by both mappings. */
+		mod = xfs_rtb_to_rtxoff(mp, irec1.br_blockcount);
+		if (mod) {
+			ASSERT(mod == 0);
+			return -EINVAL;
+		}
+
+		startoff1 += irec1.br_blockcount;
+		startoff2 += irec1.br_blockcount;
+		blockcount -= irec1.br_blockcount;
+	}
+
+	return 0;
+}
+#else
+# define xfs_swapext_check_rt_extents(mp, req)		(0)
+#endif
+
/* Check all extents to make sure we can actually swap them. */
int
xfs_swapext_check_extents(
@@ -150,12 +247,7 @@ xfs_swapext_check_extents(
ifp2->if_format == XFS_DINODE_FMT_LOCAL)
return -EINVAL;
- /* We don't support realtime data forks yet. */
- if (!XFS_IS_REALTIME_INODE(req->ip1))
- return 0;
- if (req->whichfork == XFS_ATTR_FORK)
- return 0;
- return -EINVAL;
+ return xfs_swapext_check_rt_extents(mp, req);
}
#ifdef CONFIG_XFS_QUOTA
@@ -196,6 +288,8 @@ xfs_swapext_can_skip_mapping(
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec)
{
+ struct xfs_mount *mp = sxi->sxi_ip1->i_mount;
+
/* Do not skip this mapping if the caller did not tell us to. */
if (!(sxi->sxi_flags & XFS_SWAP_EXT_INO1_WRITTEN))
return false;
@@ -208,10 +302,63 @@ xfs_swapext_can_skip_mapping(
* The mapping is unwritten or a hole. It cannot be a delalloc
* reservation because we already excluded those. It cannot be an
* unwritten extent with dirty page cache because we flushed the page
- * cache. We don't support realtime files yet, so we needn't (yet)
- * deal with them.
+ * cache. For files where the allocation unit is 1FSB (files on the
+ * data dev, rt files if the extent size is 1FSB), we can safely
+ * skip this mapping.
*/
- return true;
+ if (!xfs_inode_has_bigallocunit(sxi->sxi_ip1))
+ return true;
+
+ /*
+ * For a realtime file with a multi-fsb allocation unit, the decision
+ * is trickier because we can only swap full allocation units.
+ * Unwritten mappings can appear in the middle of an rtx if the rtx is
+ * partially written, but they can also appear for preallocations.
+ *
+ * If the mapping is a hole, skip it entirely. Holes should align with
+ * rtx boundaries.
+ */
+ if (!xfs_bmap_is_real_extent(irec))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten.
+ *
+ * - If the beginning is not aligned to an rtx, trim the end of the
+ * mapping so that it does not cross an rtx boundary, and swap it.
+ *
+ * - If both ends are aligned to an rtx, skip the entire mapping.
+ */
+ if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
+ xfs_fileoff_t new_end;
+
+ new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
+ irec->br_blockcount = min(irec->br_blockcount,
+ new_end - irec->br_startoff);
+ return false;
+ }
+ if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten, start on an rtx
+ * boundary, and do not end on an rtx boundary.
+ *
+ * - If the mapping is longer than one rtx, trim the end of the mapping
+ * down to an rtx boundary and skip it.
+ *
+ * - The mapping is shorter than one rtx. Swap it.
+ */
+ if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
+ xfs_fileoff_t new_end;
+
+ new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
+ mp->m_sb.sb_rextsize);
+ irec->br_blockcount = new_end - irec->br_startoff;
+ return true;
+ }
+
+ return false;
}
/*