@@ -536,7 +536,11 @@ struct xfs_scrub_metadata {
*/
#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
-#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
+/* i: Allow scrub to freeze the filesystem to perform global scans. */
+#define XFS_SCRUB_IFLAG_FREEZE_OK (1 << 8)
+
+#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR | \
+ XFS_SCRUB_IFLAG_FREEZE_OK)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
XFS_SCRUB_OFLAG_PREEN | \
XFS_SCRUB_OFLAG_XFAIL | \
@@ -590,9 +590,13 @@ xfs_scrub_trans_alloc(
struct xfs_scrub_context *sc,
uint resblks)
{
+ uint flags = 0;
+
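+ /*
+  * Scrub holds the freeze, which means the superblock write counter
+  * has already been stopped and sb_start_intwrite would block forever.
+  * Use NO_WRITECOUNT so that our own repair transactions can proceed
+  * while we hold the freeze.
+  */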
+ if (sc->fs_frozen)
+ flags |= XFS_TRANS_NO_WRITECOUNT;
if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
- resblks, 0, 0, &sc->tp);
+ resblks, 0, flags, &sc->tp);
return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}
@@ -944,3 +948,84 @@ xfs_scrub_iput(
trace_xfs_scrub_iput_now(ip, __return_address);
iput(VFS_I(ip));
}
+
+/*
+ * Exclusive Filesystem Access During Scrub and Repair
+ * ===================================================
+ *
+ * While most scrub activity can occur while the filesystem is live, there
+ * are certain scenarios where we cannot tolerate concurrent metadata updates.
+ * We therefore must freeze the filesystem against all other changes.
+ *
+ * The typical scenarios envisioned for scrub freezes are (a) to lock out all
+ * other filesystem changes in order to check the global summary counters,
+ * and (b) anything else that requires unusual behavioral semantics.
+ *
+ * The typical scenarios envisioned for repair freezes are (a) to avoid ABBA
+ * deadlocks when we need to take locks in an unusual order; or (b) to update
+ * global filesystem state. For example, reconstruction of a damaged reverse
+ * mapping btree requires us to hold the AG header locks while scanning
+ * inodes, which goes against the usual inode -> AG header locking order.
+ *
+ * A note about inode reclaim: when we freeze the filesystem, users can't
+ * modify things and periodic background reclaim of speculative preallocations
+ * and copy-on-write staging extents is stopped. However, the scrub/repair
+ * thread must be careful about evicting an inode from memory -- if the
+ * eviction would require a transaction, we must defer the iput until after
+ * the scrub freeze ends. The reasons for this are twofold: first, the
+ * scrub/repair thread already holds a transaction and XFS can't nest
+ * transactions; and second, we froze the fs precisely to prevent
+ * modifications that we can't control directly.
+ *
+ * Userspace is prevented from freezing or thawing the filesystem during a
+ * repair freeze by the ->freeze_super and ->thaw_super superblock operations,
+ * which use the m_scrub_freeze mutex to block any changes to the freeze
+ * state while a repair freeze is running. It only makes sense to run one
+ * scrub/repair freeze at a time, so a single mutex suffices.
+ *
+ * Scrub/repair freezes cannot be initiated during a regular freeze because
+ * freeze_super does not allow nested freezes. Repair activity that does not
+ * require a repair freeze is likewise prevented from running during a regular
+ * freeze because transaction allocation blocks on the regular freeze. We
+ * assume that the only other users of XFS_TRANS_NO_WRITECOUNT transactions
+ * either aren't modifying space metadata in a way that would affect repair,
+ * or can be inhibited while repair runs.
+ *
+ * Note that thaw_super and freeze_super can call deactivate_locked_super
+ * which can free the xfs_mount. This can happen if someone freezes the block
+ * device, unmounts the filesystem, and thaws the block device. Therefore, we
+ * must be careful about who gets to unlock the repair freeze mutex. See the
+ * comments in xfs_fs_put_super.
+ */
+
+/* Start a scrub/repair freeze. */
+int
+xfs_scrub_fs_freeze(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
+ if (!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_FREEZE_OK))
+ return -EUSERS;
+
+ mutex_lock(&sc->mp->m_scrub_freeze);
+ error = freeze_super(sc->mp->m_super);
+ if (error) {
+ mutex_unlock(&sc->mp->m_scrub_freeze);
+ return error;
+ }
+ sc->fs_frozen = true;
+ return 0;
+}
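+
+/*
+ * A sketch of how a hypothetical repair caller might use this (the scan
+ * step is illustrative; no caller is wired up in this hunk):
+ *
+ *	error = xfs_scrub_fs_freeze(sc);
+ *	if (error)
+ *		return error;
+ *	error = xfs_repair_grab_all_ag_headers(sc);
+ *	...scan inodes while every AG header is held...
+ *
+ * The freeze is released by xfs_scrub_fs_thaw from xfs_scrub_teardown.
+ */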
+
+/* Release a scrub/repair freeze and iput all the deferred inodes. */
+int
+xfs_scrub_fs_thaw(
+ struct xfs_scrub_context *sc)
+{
+ int error;
+
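+ /*
+  * Clear fs_frozen before thawing so that scrub transactions allocated
+  * after this point take the write count like any other transaction.
+  */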
+ sc->fs_frozen = false;
+ error = thaw_super(sc->mp->m_super);
+ mutex_unlock(&sc->mp->m_scrub_freeze);
+ return error;
+}
@@ -141,5 +141,7 @@ static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm)
int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc);
int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
void xfs_scrub_iput(struct xfs_scrub_context *sc, struct xfs_inode *ip);
+int xfs_scrub_fs_freeze(struct xfs_scrub_context *sc);
+int xfs_scrub_fs_thaw(struct xfs_scrub_context *sc);
#endif /* __XFS_SCRUB_COMMON_H__ */
@@ -1161,3 +1161,24 @@ xfs_repair_ino_dqattach(
return error;
}
+
+/* Read all AG headers and attach to this transaction. */
+int
+xfs_repair_grab_all_ag_headers(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi;
+ struct xfs_buf *agf;
+ struct xfs_buf *agfl;
+ xfs_agnumber_t agno;
+ int error = 0;
+
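+ /*
+  * Reading the headers attaches each locked AG header buffer to sc->tp,
+  * so all AG headers remain locked until the transaction is committed
+  * or cancelled.
+  */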
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_scrub_ag_read_headers(sc, agno, &agi, &agf, &agfl);
+ if (error)
+ break;
+ }
+
+ return error;
+}
@@ -95,6 +95,7 @@ int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
struct xfs_buf *agfl_bp);
void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
+int xfs_repair_grab_all_ag_headers(struct xfs_scrub_context *sc);
/* Metadata repairers */
@@ -182,6 +182,8 @@ xfs_scrub_teardown(
struct xfs_inode *ip_in,
int error)
{
+ int err2;
+
xfs_scrub_ag_free(sc, &sc->sa);
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
@@ -199,6 +201,12 @@ xfs_scrub_teardown(
iput(VFS_I(sc->ip));
sc->ip = NULL;
}
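+ /*
+  * If we froze the fs, release the freeze before processing the
+  * deferred iputs below, since evicting those inodes may require
+  * allocating transactions.
+  */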
+ if (sc->fs_frozen) {
+ err2 = xfs_scrub_fs_thaw(sc);
+ if (!error && err2)
+ error = err2;
+ sc->fs_frozen = false;
+ }
xfs_scrub_iput_deferred(sc);
if (sc->has_quotaofflock)
mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
@@ -65,6 +65,12 @@ struct xfs_scrub_context {
bool try_harder;
bool has_quotaofflock;
+ /*
+ * Do we own the current scrub freeze? It is critical that we
+ * release it before exiting to userspace!
+ */
+ bool fs_frozen;
+
/*
* List of inodes which cannot be released (by scrub) until after the
* scrub operation concludes because we'd have to do some work to the
@@ -193,6 +193,12 @@ typedef struct xfs_mount {
unsigned int *m_errortag;
struct xfs_kobj m_errortag_kobj;
#endif
+ /*
+ * Only allow one thread to initiate a repair freeze at a time. We
+ * also use this to block userspace from changing the freeze state
+ * while a repair freeze is in progress.
+ */
+ struct mutex m_scrub_freeze;
} xfs_mount_t;
/*
@@ -1445,6 +1445,42 @@ xfs_fs_unfreeze(
return 0;
}
+/*
+ * Don't let userspace freeze while scrub has the filesystem frozen. Note
+ * that freeze_super can free the xfs_mount, so we must be careful to recheck
+ * XFS_M before trying to access anything in the xfs_mount afterwards.
+ */
+STATIC int
+xfs_fs_freeze_super(
+ struct super_block *sb)
+{
+ int error;
+
+ mutex_lock(&XFS_M(sb)->m_scrub_freeze);
+ error = freeze_super(sb);
+ if (XFS_M(sb))
+ mutex_unlock(&XFS_M(sb)->m_scrub_freeze);
+ return error;
+}
+
+/*
+ * Don't let userspace thaw while scrub has the filesystem frozen. Note that
+ * thaw_super can free the xfs_mount, so we must be careful to recheck XFS_M
+ * before trying to access anything in the xfs_mount afterwards.
+ */
+STATIC int
+xfs_fs_thaw_super(
+ struct super_block *sb)
+{
+ int error;
+
+ mutex_lock(&XFS_M(sb)->m_scrub_freeze);
+ error = thaw_super(sb);
+ if (XFS_M(sb))
+ mutex_unlock(&XFS_M(sb)->m_scrub_freeze);
+ return error;
+}
+
STATIC int
xfs_fs_show_options(
struct seq_file *m,
@@ -1582,6 +1618,7 @@ xfs_mount_alloc(
INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
spin_lock_init(&mp->m_perag_lock);
mutex_init(&mp->m_growlock);
+ mutex_init(&mp->m_scrub_freeze);
atomic_set(&mp->m_active_trans, 0);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
@@ -1768,6 +1805,7 @@ xfs_fs_fill_super(
out_free_fsname:
sb->s_fs_info = NULL;
xfs_free_fsname(mp);
+ mutex_destroy(&mp->m_scrub_freeze);
kfree(mp);
out:
return error;
@@ -1800,6 +1838,19 @@ xfs_fs_put_super(
sb->s_fs_info = NULL;
xfs_free_fsname(mp);
+ /*
+ * fs freeze takes an active reference to the filesystem and fs thaw
+ * drops it. If a filesystem on a frozen (dm) block device is
+ * unmounted before the block device is thawed, we can end up tearing
+ * down the super from within thaw_super when the device is thawed.
+ * xfs_fs_thaw_super grabbed the scrub/repair freeze mutex before calling
+ * thaw_super, so we must avoid destroying a locked mutex. At this point
+ * we know we're the only user of the filesystem, so we can safely
+ * unlock the mutex here if it's still locked.
+ */
+ if (mutex_is_locked(&mp->m_scrub_freeze))
+ mutex_unlock(&mp->m_scrub_freeze);
+ mutex_destroy(&mp->m_scrub_freeze);
kfree(mp);
}
@@ -1846,6 +1897,8 @@ static const struct super_operations xfs_super_operations = {
.show_options = xfs_fs_show_options,
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
+ .freeze_super = xfs_fs_freeze_super,
+ .thaw_super = xfs_fs_thaw_super,
};
static struct file_system_type xfs_fs_type = {
@@ -314,9 +314,12 @@ xfs_trans_alloc(
/*
* Zero-reservation ("empty") transactions can't modify anything, so
- * they're allowed to run while we're frozen.
+ * they're allowed to run while we're frozen. Scrub is also allowed to
+ * freeze the filesystem in order to obtain exclusive access, in which
+ * case its own transactions may run while frozen.
*/
WARN_ON(resp->tr_logres > 0 &&
+ !mutex_is_locked(&mp->m_scrub_freeze) &&
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
atomic_inc(&mp->m_active_trans);