@@ -142,6 +142,108 @@ sxi_advance(
sxi->sxi_blockcount -= irec->br_blockcount;
}
+#ifdef DEBUG
+static inline bool
+xfs_swapext_need_rt_conversion(
+ const struct xfs_swapext_req *req)
+{
+ struct xfs_inode *ip = req->ip2;
+ struct xfs_mount *mp = ip->i_mount;
+
+ /* xattrs don't live on the rt device */
+ if (req->whichfork == XFS_ATTR_FORK)
+ return false;
+
+ /*
+ * Caller got permission to use logged swapext, so log recovery will
+ * finish the swap and not leave us with partially swapped rt extents
+ * exposed to userspace.
+ */
+ if (req->req_flags & XFS_SWAP_REQ_LOGGED)
+ return false;
+
+ /*
+ * If we can't use log intent items at all, the only supported
+ * operation is full fork swaps.
+ */
+ if (!xfs_swapext_supported(mp))
+ return false;
+
+ /* Conversion is only needed for realtime files with big rt extents */
+ return xfs_inode_has_bigrtextents(ip);
+}
+
+static inline int
+xfs_swapext_check_rt_extents(
+ struct xfs_mount *mp,
+ const struct xfs_swapext_req *req)
+{
+ struct xfs_bmbt_irec irec1, irec2;
+ xfs_fileoff_t startoff1 = req->startoff1;
+ xfs_fileoff_t startoff2 = req->startoff2;
+ xfs_filblks_t blockcount = req->blockcount;
+ uint32_t mod;
+ int nimaps;
+ int error;
+
+ if (!xfs_swapext_need_rt_conversion(req))
+ return 0;
+
+ while (blockcount > 0) {
+ /* Read extent from the first file */
+ nimaps = 1;
+ error = xfs_bmapi_read(req->ip1, startoff1, blockcount,
+ &irec1, &nimaps, 0);
+ if (error)
+ return error;
+ ASSERT(nimaps == 1);
+
+ /* Read extent from the second file */
+ nimaps = 1;
+ error = xfs_bmapi_read(req->ip2, startoff2,
+ irec1.br_blockcount, &irec2, &nimaps,
+ 0);
+ if (error)
+ return error;
+ ASSERT(nimaps == 1);
+
+ /*
+ * We can only swap as many blocks as the smaller of the two
+ * extent maps.
+ */
+ irec1.br_blockcount = min(irec1.br_blockcount,
+ irec2.br_blockcount);
+
+ /* Both mappings must be aligned to the realtime extent size. */
+ div_u64_rem(irec1.br_startoff, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EINVAL;
+ }
+
+ div_u64_rem(irec2.br_startoff, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EINVAL;
+ }
+
+ div_u64_rem(irec1.br_blockcount, mp->m_sb.sb_rextsize, &mod);
+ if (mod) {
+ ASSERT(mod == 0);
+ return -EINVAL;
+ }
+
+ startoff1 += irec1.br_blockcount;
+ startoff2 += irec1.br_blockcount;
+ blockcount -= irec1.br_blockcount;
+ }
+
+ return 0;
+}
+#else
+# define xfs_swapext_check_rt_extents(mp, req) (0)
+#endif
+
/* Check all extents to make sure we can actually swap them. */
int
xfs_swapext_check_extents(
@@ -161,12 +263,7 @@ xfs_swapext_check_extents(
ifp2->if_format == XFS_DINODE_FMT_LOCAL)
return -EINVAL;
- /* We don't support realtime data forks yet. */
- if (!XFS_IS_REALTIME_INODE(req->ip1))
- return 0;
- if (req->whichfork == XFS_ATTR_FORK)
- return 0;
- return -EINVAL;
+ return xfs_swapext_check_rt_extents(mp, req);
}
#ifdef CONFIG_XFS_QUOTA
@@ -207,6 +304,8 @@ xfs_swapext_can_skip_mapping(
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec)
{
+ struct xfs_mount *mp = sxi->sxi_ip1->i_mount;
+
/* Do not skip this mapping if the caller did not tell us to. */
if (!(sxi->sxi_flags & XFS_SWAP_EXT_INO1_WRITTEN))
return false;
@@ -219,10 +318,62 @@ xfs_swapext_can_skip_mapping(
* The mapping is unwritten or a hole. It cannot be a delalloc
* reservation because we already excluded those. It cannot be an
* unwritten extent with dirty page cache because we flushed the page
- * cache. We don't support realtime files yet, so we needn't (yet)
- * deal with them.
+ * cache. For files where the allocation unit is 1FSB (files on the
+ * data dev, rt files if the extent size is 1FSB), we can safely
+ * skip this mapping.
*/
- return true;
+ if (!xfs_inode_has_bigrtextents(sxi->sxi_ip1))
+ return true;
+
+ /*
+ * For a realtime file with a multi-fsb allocation unit, the decision
+ * is trickier because we can only swap full allocation units.
+ * Unwritten mappings can appear in the middle of an rtx if the rtx is
+ * partially written, but they can also appear for preallocations.
+ *
+ * If the mapping is a hole, skip it entirely. Holes should align with
+ * rtx boundaries.
+ */
+ if (!xfs_bmap_is_real_extent(irec))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten.
+ *
+ * - If the beginning is not aligned to an rtx, trim the end of the
+ * mapping so that it does not cross an rtx boundary, and swap it.
+ *
+ * - If both ends are aligned to an rtx, skip the entire mapping.
+ */
+ if (!isaligned_64(irec->br_startoff, mp->m_sb.sb_rextsize)) {
+ xfs_fileoff_t new_end;
+
+ new_end = roundup_64(irec->br_startoff, mp->m_sb.sb_rextsize);
+ irec->br_blockcount = new_end - irec->br_startoff;
+ return false;
+ }
+ if (isaligned_64(irec->br_blockcount, mp->m_sb.sb_rextsize))
+ return true;
+
+ /*
+ * All mappings below this point are unwritten, start on an rtx
+ * boundary, and do not end on an rtx boundary.
+ *
+ * - If the mapping is longer than one rtx, trim the end of the mapping
+ * down to an rtx boundary and skip it.
+ *
+ * - The mapping is shorter than one rtx. Swap it.
+ */
+ if (irec->br_blockcount > mp->m_sb.sb_rextsize) {
+ xfs_fileoff_t new_end;
+
+ new_end = rounddown_64(irec->br_startoff + irec->br_blockcount,
+ mp->m_sb.sb_rextsize);
+ irec->br_blockcount = new_end - irec->br_startoff;
+ return true;
+ }
+
+ return false;
}
/*
@@ -13,12 +13,11 @@
* This can be done to individual file extents by using the block mapping log
* intent items introduced with reflink and rmap; or to entire file ranges
* using swapext log intent items to track the overall progress across multiple
- * extent mappings. Realtime is not supported yet.
+ * extent mappings.
*/
static inline bool xfs_swapext_supported(struct xfs_mount *mp)
{
- return (xfs_has_reflink(mp) || xfs_has_rmapbt(mp)) &&
- !xfs_has_realtime(mp);
+ return xfs_has_reflink(mp) || xfs_has_rmapbt(mp);
}
/*
@@ -989,7 +989,7 @@ xfs_free_file_space(
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/* We can only free complete realtime extents. */
- if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ if (xfs_inode_has_bigrtextents(ip)) {
startoffset_fsb = roundup_64(startoffset_fsb,
mp->m_sb.sb_rextsize);
endoffset_fsb = rounddown_64(endoffset_fsb,
@@ -293,6 +293,11 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_NREXT64;
}
+static inline bool xfs_inode_has_bigrtextents(struct xfs_inode *ip)
+{
+ return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -21,6 +21,7 @@
#include "xfs_sb.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
+#include "xfs_trace.h"
/*
* Read and return the summary information for a given extent size,
@@ -1461,3 +1462,161 @@ xfs_rtpick_extent(
*pick = b;
return 0;
}
+
+/*
+ * Decide if this is an unwritten extent that isn't aligned to an rt extent
+ * boundary. If it is, shorten the mapping so that we're ready to convert
+ * everything up to the next rt extent to a zeroed written extent. If not,
+ * return false.
+ */
+static inline bool
+xfs_rtfile_want_conversion(
+ struct xfs_mount *mp,
+ struct xfs_bmbt_irec *irec)
+{
+ xfs_fileoff_t rext_next;
+ uint32_t modoff, modcnt;
+
+ if (irec->br_state != XFS_EXT_UNWRITTEN)
+ return false;
+
+ div_u64_rem(irec->br_startoff, mp->m_sb.sb_rextsize, &modoff);
+ if (modoff == 0) {
+ uint64_t rexts = div_u64_rem(irec->br_blockcount,
+ mp->m_sb.sb_rextsize, &modcnt);
+
+ if (rexts > 0) {
+ /*
+ * Unwritten mapping starts at an rt extent boundary
+ * and is longer than one rt extent. Round the length
+ * down to the nearest extent but don't select it for
+ * conversion.
+ */
+ irec->br_blockcount -= modcnt;
+ modcnt = 0;
+ }
+
+ /* Unwritten mapping is perfectly aligned, do not convert. */
+ if (modcnt == 0)
+ return false;
+ }
+
+ /*
+ * Unaligned and unwritten; trim to the current rt extent and select it
+ * for conversion.
+ */
+ rext_next = (irec->br_startoff - modoff) + mp->m_sb.sb_rextsize;
+ xfs_trim_extent(irec, irec->br_startoff, rext_next - irec->br_startoff);
+ return true;
+}
+
+/*
+ * Find an unwritten extent in the given file range, zero it, and convert the
+ * mapping to written. Adjust the scan cursor on the way out.
+ */
+STATIC int
+xfs_rtfile_convert_one(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *offp,
+ xfs_fileoff_t endoff)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+ int nmap;
+ int error;
+
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 1);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Read the mapping. If we find an unwritten extent that isn't aligned
+ * to an rt extent boundary...
+ */
+retry:
+ nmap = 1;
+ error = xfs_bmapi_read(ip, *offp, endoff - *offp, &irec, &nmap, 0);
+ if (error)
+ goto out_cancel;
+ ASSERT(nmap == 1);
+ ASSERT(irec.br_startoff == *offp);
+ if (!xfs_rtfile_want_conversion(mp, &irec)) {
+ *offp = irec.br_startoff + irec.br_blockcount;
+ if (*offp >= endoff)
+ goto out_cancel;
+ goto retry;
+ }
+
+ /*
+ * ...make sure this partially unwritten rt extent gets converted to a
+ * zeroed written extent that we can remap.
+ */
+ nmap = 1;
+ error = xfs_bmapi_write(tp, ip, irec.br_startoff, irec.br_blockcount,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &irec, &nmap);
+ if (error)
+ goto out_cancel;
+ ASSERT(nmap == 1);
+ if (irec.br_state != XFS_EXT_NORM) {
+ ASSERT(0);
+ error = -EIO;
+ goto out_cancel;
+ }
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ *offp = irec.br_startoff + irec.br_blockcount;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * For all realtime extents backing the given range of a file, search for
+ * unwritten mappings that do not cover a full rt extent and convert them
+ * to zeroed written mappings. The goal is to end up with one mapping per rt
+ * extent so that we can perform a remapping operation. Callers must ensure
+ * that there are no dirty pages in the given range.
+ */
+int
+xfs_rtfile_convert_unwritten(
+ struct xfs_inode *ip,
+ loff_t pos,
+ uint64_t len)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+ xfs_fileoff_t endoff;
+ int error;
+
+ if (mp->m_sb.sb_rextsize == 1)
+ return 0;
+
+ off = rounddown_64(XFS_B_TO_FSBT(mp, pos), mp->m_sb.sb_rextsize);
+ endoff = roundup_64(XFS_B_TO_FSB(mp, pos + len), mp->m_sb.sb_rextsize);
+
+ trace_xfs_rtfile_convert_unwritten(ip, pos, len);
+
+ while (off < endoff) {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ error = xfs_rtfile_convert_one(ip, &off, endoff);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
@@ -140,6 +140,8 @@ int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
bool *is_free);
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
+int xfs_rtfile_convert_unwritten(struct xfs_inode *ip, loff_t pos,
+ uint64_t len);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
@@ -164,6 +166,7 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
# define xfs_rtunmount_inodes(m)
+# define xfs_rtfile_convert_unwritten(ip, pos, len) (0)
#endif /* CONFIG_XFS_RT */
#endif /* __XFS_RTALLOC_H__ */
@@ -1525,7 +1525,7 @@ DEFINE_IMAP_EVENT(xfs_iomap_alloc);
DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count),
TP_ARGS(ip, offset, count),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1533,7 +1533,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__field(loff_t, isize)
__field(loff_t, disize)
__field(loff_t, offset)
- __field(size_t, count)
+ __field(u64, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1544,7 +1544,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__entry->count = count;
),
TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
- "pos 0x%llx bytecount 0x%zx",
+ "pos 0x%llx bytecount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
@@ -1555,7 +1555,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
#define DEFINE_SIMPLE_IO_EVENT(name) \
DEFINE_EVENT(xfs_simple_io_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), \
TP_ARGS(ip, offset, count))
DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
@@ -3749,6 +3749,9 @@ TRACE_EVENT(xfs_ioctl_clone,
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
+#ifdef CONFIG_XFS_RT
+DEFINE_SIMPLE_IO_EVENT(xfs_rtfile_convert_unwritten);
+#endif /* CONFIG_XFS_RT */
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
@@ -27,6 +27,7 @@
#include "xfs_sb.h"
#include "xfs_icache.h"
#include "xfs_log.h"
+#include "xfs_rtalloc.h"
#include <linux/fsnotify.h>
/*
@@ -391,7 +392,7 @@ xfs_file_xchg_range(
goto out_err;
/* Prepare and then exchange file contents. */
- error = xfs_xchg_range_prep(file1, file2, fxr);
+ error = xfs_xchg_range_prep(file1, file2, fxr, priv_flags);
if (error)
goto out_unlock;
@@ -770,12 +771,58 @@ xfs_swap_extent_forks(
return 0;
}
+/*
+ * There may be partially written rt extents lurking in the ranges to be
+ * swapped. According to the rules for realtime files with big rt extents, we
+ * must guarantee that an outside observer (an IO thread, realistically) never
+ * can see multiple physical rt extents mapped to the same logical file rt
+ * extent. The deferred bmap log intent items that we use under the hood
+ * operate on single block mappings and not rt extents, which means we must
+ * have a strategy to ensure that log recovery after a failure won't stop in
+ * the middle of an rt extent.
+ *
+ * The preferred strategy is to use deferred extent swap log intent items to
+ * track the status of the overall swap operation so that we can complete the
+ * work during crash recovery. If that isn't possible, we fall back to
+ * requiring the selected mappings in both forks to be aligned to rt extent
+ * boundaries. As an aside, the old fork swap routine didn't have this
+ * requirement, but at an extreme cost in flexibility (full files only, and no
+ * support if rmapbt is enabled).
+ */
+static bool
+xfs_xchg_range_need_rt_conversion(
+ struct xfs_inode *ip,
+ unsigned int xchg_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * Caller got permission to use logged swapext, so log recovery will
+ * finish the swap and not leave us with partially swapped rt extents
+ * exposed to userspace.
+ */
+ if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
+ return false;
+
+ /*
+ * If we can't use log intent items at all, the only supported
+ * operation is full fork swaps, so no conversions are needed.
+ * The range requirements are enforced by the swapext code itself.
+ */
+ if (!xfs_swapext_supported(mp))
+ return false;
+
+ /* Conversion is only needed for realtime files with big rt extents */
+ return xfs_inode_has_bigrtextents(ip);
+}
+
/* Prepare two files to have their data exchanged. */
int
xfs_xchg_range_prep(
struct file *file1,
struct file *file2,
- struct xfs_exch_range *fxr)
+ struct xfs_exch_range *fxr,
+ unsigned int xchg_flags)
{
struct xfs_inode *ip1 = XFS_I(file_inode(file1));
struct xfs_inode *ip2 = XFS_I(file_inode(file2));
@@ -839,6 +886,19 @@ xfs_xchg_range_prep(
return error;
}
+ /* Convert unwritten sub-extent mappings if required. */
+ if (xfs_xchg_range_need_rt_conversion(ip2, xchg_flags)) {
+ error = xfs_rtfile_convert_unwritten(ip2, fxr->file2_offset,
+ fxr->length);
+ if (error)
+ return error;
+
+ error = xfs_rtfile_convert_unwritten(ip1, fxr->file1_offset,
+ fxr->length);
+ if (error)
+ return error;
+ }
+
return 0;
}
@@ -1056,6 +1116,15 @@ xfs_xchg_range(
if (xchg_flags & XFS_XCHG_RANGE_LOGGED)
req.req_flags |= XFS_SWAP_REQ_LOGGED;
+ /*
+ * Round the request length up to the nearest fundamental unit of
+ * allocation. The prep function already checked that the request
+ * offsets and length in @fxr are safe to round up.
+ */
+ if (XFS_IS_REALTIME_INODE(ip2))
+ req.blockcount = roundup_64(req.blockcount,
+ mp->m_sb.sb_rextsize);
+
error = xfs_xchg_range_estimate(&req);
if (error)
return error;
@@ -51,6 +51,6 @@ void xfs_xchg_range_rele_log_assist(struct xfs_mount *mp);
int xfs_xchg_range(struct xfs_inode *ip1, struct xfs_inode *ip2,
const struct xfs_exch_range *fxr, unsigned int xchg_flags);
int xfs_xchg_range_prep(struct file *file1, struct file *file2,
- struct xfs_exch_range *fxr);
+ struct xfs_exch_range *fxr, unsigned int xchg_flags);
#endif /* __XFS_XCHGRANGE_H__ */