@@ -132,6 +132,8 @@ xfs_initialize_rtgroups(
#ifdef __KERNEL__
/* Place kernel structure only init below this point. */
spin_lock_init(&rtg->rtg_state_lock);
+ xfs_drain_init(&rtg->rtg_intents);
+
#endif /* __KERNEL__ */
/* first new rtg is fully initialized */
@@ -183,6 +185,7 @@ xfs_free_rtgroups(
spin_unlock(&mp->m_rtgroup_lock);
ASSERT(rtg);
XFS_IS_CORRUPT(rtg->rtg_mount, atomic_read(&rtg->rtg_ref) != 0);
+ xfs_drain_free(&rtg->rtg_intents);
call_rcu(&rtg->rcu_head, __xfs_free_rtgroups);
}
@@ -37,6 +37,15 @@ struct xfs_rtgroup {
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
spinlock_t rtg_state_lock;
+
+ /*
+ * We use xfs_drain to track the number of deferred log intent items
+ * that have been queued (but not yet processed) so that waiters (e.g.
+ * scrub) will not lock resources when other threads are in the middle
+ * of processing a chain of intent items only to find momentary
+ * inconsistencies.
+ */
+ struct xfs_drain rtg_intents;
#endif /* __KERNEL__ */
};
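
For reference, the drain object embedded above is just an atomic counter paired with
a waitqueue; the trace event added at the end of this patch reads
rtg_intents.dr_count directly. A rough sketch of the primitive, based on the
existing xfs_drain code and not part of this diff:

struct xfs_drain {
	atomic_t		dr_count;	/* pending work items */
	struct wait_queue_head	dr_waiters;	/* waits for dr_count to hit zero */
};

static inline void xfs_drain_init(struct xfs_drain *dr)
{
	atomic_set(&dr->dr_count, 0);
	init_waitqueue_head(&dr->dr_waiters);
}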
@@ -757,12 +757,78 @@ xchk_rt_unlock(
}
#ifdef CONFIG_XFS_RT
+/* Take the ILOCKs of all the rt group metadata inodes and wait for intents to drain. */
+static int
+xchk_rtgroup_lock(
+ struct xfs_scrub *sc,
+ struct xchk_rt *sr,
+ unsigned int rtglock_flags)
+{
+ int error = 0;
+
+ ASSERT(sr->rtg != NULL);
+
+ /*
+ * If we're /only/ locking the rtbitmap in shared mode, then we're
+ * obviously not trying to compare records in two metadata inodes.
+ * There's no need to drain intents here because the caller (most
+ * likely the rgsuper scanner) doesn't need that level of consistency.
+ */
+ if (rtglock_flags == XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_rtgroup_lock(NULL, sr->rtg, rtglock_flags);
+ sr->rtlock_flags = rtglock_flags;
+ return 0;
+ }
+
+ do {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ xfs_rtgroup_lock(NULL, sr->rtg, rtglock_flags);
+
+ /*
+ * Decide if the rt group is quiet enough for all metadata to
+ * be consistent with each other. Regular file IO doesn't get
+ * to lock all the rt inodes at the same time, which means that
+ * there could be other threads in the middle of processing a
+ * chain of deferred ops.
+ *
+ * We just locked all the metadata inodes for this rt group;
+ * now take a look to see if there are any intents in progress.
+ * If there are, drop the rt group inode locks and wait for the
+ * intents to drain. Since we hold the rt group inode locks
+ * for the duration of the scrub, this is the only time we have
+ * to sample the intents counter; any threads increasing it
+ * after this point can't possibly be in the middle of a chain
+ * of rt metadata updates.
+ *
+ * Obviously, this should be slanted against scrub and in favor
+ * of runtime threads.
+ */
+ if (!xfs_rtgroup_intents_busy(sr->rtg)) {
+ sr->rtlock_flags = rtglock_flags;
+ return 0;
+ }
+
+ xfs_rtgroup_unlock(sr->rtg, rtglock_flags);
+
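+		/*
+		 * The drain hook isn't armed, so we can't wait here for the
+		 * intent count to drop.  Return -ECHRNG so that the scrub
+		 * frontend can retry this operation with XCHK_FSHOOKS_DRAIN
+		 * enabled; the xchk_setup_rgbitmap hunk below arms the hook
+		 * via xchk_fshooks_enable().
+		 */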
+ if (!(sc->flags & XCHK_FSHOOKS_DRAIN))
+ return -ECHRNG;
+ error = xfs_rtgroup_drain_intents(sr->rtg);
+ if (error == -ERESTARTSYS)
+ error = -EINTR;
+ } while (!error);
+
+ return error;
+}
+
/*
* For scrubbing a realtime group, grab all the in-core resources we'll need to
* check the metadata, which means taking the ILOCK of the realtime group's
- * metadata inodes. Callers must not join these inodes to the transaction with
- * non-zero lockflags or concurrency problems will result. The @rtglock_flags
- * argument takes XFS_RTGLOCK_* flags.
+ * metadata inodes and draining any running intent chains. Callers must not
+ * join these inodes to the transaction with non-zero lockflags or concurrency
+ * problems will result. The @rtglock_flags argument takes XFS_RTGLOCK_*
+ * flags.
*/
int
xchk_rtgroup_init(
@@ -778,9 +844,7 @@ xchk_rtgroup_init(
if (!sr->rtg)
return -ENOENT;
- xfs_rtgroup_lock(NULL, sr->rtg, rtglock_flags);
- sr->rtlock_flags = rtglock_flags;
- return 0;
+ return xchk_rtgroup_lock(sc, sr, rtglock_flags);
}
/*
@@ -26,6 +26,9 @@ xchk_setup_rgbitmap(
{
int error;
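+	/*
+	 * Arm the intent drain hook before taking any locks if this scrub
+	 * request needs to wait for deferred work to finish; see
+	 * xchk_rtgroup_lock(), which returns -ECHRNG when the hook is
+	 * needed but not enabled.
+	 */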
+ if (xchk_need_fshook_drain(sc))
+ xchk_fshooks_enable(sc, XCHK_FSHOOKS_DRAIN);
+
error = xchk_trans_alloc(sc, 0);
if (error)
return error;
@@ -369,6 +369,7 @@ xfs_bmap_update_get_group(
rgno = xfs_rtb_to_rgno(mp, bi->bi_bmap.br_startblock);
bi->bi_rtg = xfs_rtgroup_get(mp, rgno);
+ xfs_rtgroup_bump_intents(bi->bi_rtg);
} else {
bi->bi_rtg = NULL;
}
@@ -395,8 +396,10 @@ xfs_bmap_update_put_group(
struct xfs_bmap_intent *bi)
{
if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) {
- if (xfs_has_rtgroups(bi->bi_owner->i_mount))
+ if (xfs_has_rtgroups(bi->bi_owner->i_mount)) {
+ xfs_rtgroup_drop_intents(bi->bi_rtg);
xfs_rtgroup_put(bi->bi_rtg);
+ }
return;
}
@@ -11,6 +11,7 @@
#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_trace.h"
+#include "xfs_rtgroup.h"
/*
* Use a static key here to reduce the overhead of xfs_drain_drop. If the
@@ -119,3 +120,43 @@ xfs_perag_intents_busy(
{
return xfs_drain_busy(&pag->pag_intents);
}
+
+#ifdef CONFIG_XFS_RT
+/* Add an item to the pending count. */
+void
+xfs_rtgroup_bump_intents(
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_rtgroup_bump_intents(rtg, __return_address);
+ xfs_drain_bump(&rtg->rtg_intents);
+}
+
+/* Remove an item from the pending count. */
+void
+xfs_rtgroup_drop_intents(
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_rtgroup_drop_intents(rtg, __return_address);
+ xfs_drain_drop(&rtg->rtg_intents);
+}
+
+/*
+ * Wait for the pending intent count for this realtime group to hit zero.
+ * Callers must not hold any rt metadata inode locks.
+ */
+int
+xfs_rtgroup_drain_intents(
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_rtgroup_wait_intents(rtg, __return_address);
+ return xfs_drain_wait(&rtg->rtg_intents);
+}
+
+/* Might someone else be processing intents for this rt group? */
+bool
+xfs_rtgroup_intents_busy(
+ struct xfs_rtgroup *rtg)
+{
+ return xfs_drain_busy(&rtg->rtg_intents);
+}
+#endif /* CONFIG_XFS_RT */
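
The rt group wrappers above delegate to the same generic drain helpers that the
perag code earlier in this file uses. Roughly, and leaving out the static key
that gates the wakeup (see the comment at the top of xfs_drain.c), the
underlying operations behave like this sketch, which is not part of this diff:

static inline bool xfs_drain_busy(struct xfs_drain *dr)
{
	return atomic_read(&dr->dr_count) > 0;
}

static inline void xfs_drain_bump(struct xfs_drain *dr)
{
	atomic_inc(&dr->dr_count);
}

static inline void xfs_drain_drop(struct xfs_drain *dr)
{
	/* The real helper only performs the wakeup once a waiter has enabled it. */
	if (atomic_dec_and_test(&dr->dr_count))
		wake_up(&dr->dr_waiters);
}

static inline int xfs_drain_wait(struct xfs_drain *dr)
{
	/* Returns -ERESTARTSYS if the waiting thread is killed. */
	return wait_event_killable(dr->dr_waiters, !xfs_drain_busy(dr));
}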
@@ -7,6 +7,7 @@
#define XFS_DRAIN_H_
struct xfs_perag;
+struct xfs_rtgroup;
#ifdef CONFIG_XFS_DRAIN_INTENTS
/*
@@ -60,12 +61,27 @@ void xfs_drain_wait_enable(void);
* All functions that create work items must increment the intent counter as
* soon as the item is added to the transaction and cannot drop the counter
* until the item is finished or cancelled.
+ *
+ * The same principles apply to realtime groups because the rt metadata inode
+ * ILOCKs are not held across transaction rolls.
*/
void xfs_perag_bump_intents(struct xfs_perag *pag);
void xfs_perag_drop_intents(struct xfs_perag *pag);
int xfs_perag_drain_intents(struct xfs_perag *pag);
bool xfs_perag_intents_busy(struct xfs_perag *pag);
+
+#ifdef CONFIG_XFS_RT
+void xfs_rtgroup_bump_intents(struct xfs_rtgroup *rtg);
+void xfs_rtgroup_drop_intents(struct xfs_rtgroup *rtg);
+
+int xfs_rtgroup_drain_intents(struct xfs_rtgroup *rtg);
+bool xfs_rtgroup_intents_busy(struct xfs_rtgroup *rtg);
+#else
+static inline void xfs_rtgroup_bump_intents(struct xfs_rtgroup *rtg) { }
+static inline void xfs_rtgroup_drop_intents(struct xfs_rtgroup *rtg) { }
+#endif /* CONFIG_XFS_RT */
+
#else
struct xfs_drain { /* empty */ };
@@ -75,6 +91,9 @@ struct xfs_drain { /* empty */ };
static inline void xfs_perag_bump_intents(struct xfs_perag *pag) { }
static inline void xfs_perag_drop_intents(struct xfs_perag *pag) { }
+static inline void xfs_rtgroup_bump_intents(struct xfs_rtgroup *rtg) { }
+static inline void xfs_rtgroup_drop_intents(struct xfs_rtgroup *rtg) { }
+
#endif /* CONFIG_XFS_DRAIN_INTENTS */
#endif /* XFS_DRAIN_H_ */
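
To make the counter protocol described above concrete, here is a minimal sketch
of how an rt-aware deferred work item pairs the bump and drop, modeled on the
xfs_bmap_update_{get,put}_group() hunks elsewhere in this patch. The struct and
function names are illustrative only:

struct example_intent {
	struct xfs_rtgroup	*ei_rtg;	/* rt group holding the extent */
	xfs_rtblock_t		ei_startblock;
};

/* Called as soon as the work item is added to a transaction. */
static void
example_intent_get_group(
	struct xfs_mount	*mp,
	struct example_intent	*ei)
{
	xfs_rgnumber_t		rgno;

	rgno = xfs_rtb_to_rgno(mp, ei->ei_startblock);
	ei->ei_rtg = xfs_rtgroup_get(mp, rgno);

	/*
	 * Count this work item so that scrub will wait for it; the count
	 * must be held across any transaction rolls.
	 */
	xfs_rtgroup_bump_intents(ei->ei_rtg);
}

/* Called once the work item has been finished or cancelled. */
static void
example_intent_put_group(
	struct example_intent	*ei)
{
	xfs_rtgroup_drop_intents(ei->ei_rtg);
	xfs_rtgroup_put(ei->ei_rtg);
}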
@@ -491,6 +491,7 @@ xfs_extent_free_get_group(
rgno = xfs_rtb_to_rgno(mp, xefi->xefi_startblock);
xefi->xefi_rtg = xfs_rtgroup_get(mp, rgno);
+ xfs_rtgroup_bump_intents(xefi->xefi_rtg);
return;
}
@@ -505,6 +506,7 @@ xfs_extent_free_put_group(
struct xfs_extent_free_item *xefi)
{
if (xfs_efi_is_realtime(xefi)) {
+ xfs_rtgroup_drop_intents(xefi->xefi_rtg);
xfs_rtgroup_put(xefi->xefi_rtg);
return;
}
@@ -13,6 +13,7 @@ struct xfs_ail;
struct xfs_quotainfo;
struct xfs_da_geometry;
struct xfs_perag;
+struct xfs_rtgroup;
/* dynamic preallocation free space thresholds, 5% down to 1% */
enum {
@@ -400,6 +400,7 @@ xfs_rmap_update_get_group(
rgno = xfs_rtb_to_rgno(mp, ri->ri_bmap.br_startblock);
ri->ri_rtg = xfs_rtgroup_get(mp, rgno);
+ xfs_rtgroup_bump_intents(ri->ri_rtg);
return;
}
@@ -414,6 +415,7 @@ xfs_rmap_update_put_group(
struct xfs_rmap_intent *ri)
{
if (ri->ri_realtime) {
+ xfs_rtgroup_drop_intents(ri->ri_rtg);
xfs_rtgroup_put(ri->ri_rtg);
return;
}
@@ -4872,6 +4872,38 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_bump_intents);
DEFINE_PERAG_INTENTS_EVENT(xfs_perag_drop_intents);
DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_rtgroup_intents_class,
+ TP_PROTO(struct xfs_rtgroup *rtg, void *caller_ip),
+ TP_ARGS(rtg, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(long, nr_intents)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg->rtg_mount->m_super->s_dev;
+ __entry->rtdev = rtg->rtg_mount->m_rtdev_targp->bt_dev;
+ __entry->nr_intents = atomic_read(&rtg->rtg_intents.dr_count);
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d intents %ld caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->nr_intents,
+ __entry->caller_ip)
+);
+
+#define DEFINE_RTGROUP_INTENTS_EVENT(name) \
+DEFINE_EVENT(xfs_rtgroup_intents_class, name, \
+ TP_PROTO(struct xfs_rtgroup *rtg, void *caller_ip), \
+ TP_ARGS(rtg, caller_ip))
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_bump_intents);
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_drop_intents);
+DEFINE_RTGROUP_INTENTS_EVENT(xfs_rtgroup_wait_intents);
+#endif /* CONFIG_XFS_RT */
+
#endif /* CONFIG_XFS_DRAIN_INTENTS */
TRACE_EVENT(xfs_swapext_overhead,