@@ -344,6 +344,26 @@ xfs_check_delalloc(
#define xfs_check_delalloc(ip, whichfork) do { } while (0)
#endif
+/*
+ * Decide if we're going to throttle frontend threads that are queueing
+ * inodes for inactivation, so that they can't overwhelm the background
+ * workers with inodes and OOM the machine.
+ */
+static inline bool
+xfs_inodegc_want_throttle(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+
+ /* Throttle if memory reclaim anywhere has triggered us. */
+ if (atomic_read(&mp->m_inodegc_reclaim) > 0) {
+ trace_xfs_inodegc_throttle_mempressure(mp);
+ return true;
+ }
+
+ return false;
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -357,6 +377,7 @@ xfs_inode_mark_reclaimable(
struct xfs_perag *pag;
unsigned int tag;
bool need_inactive;
+ bool flush_inodegc = false;
need_inactive = xfs_inode_needs_inactive(ip);
if (!need_inactive) {
@@ -392,6 +413,7 @@ xfs_inode_mark_reclaimable(
trace_xfs_inode_set_need_inactive(ip);
ip->i_flags |= XFS_NEED_INACTIVE;
tag = XFS_ICI_INODEGC_TAG;
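+ /* Sample the throttle decision now; we act on it after unlocking. */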
+ flush_inodegc = xfs_inodegc_want_throttle(pag);
} else {
trace_xfs_inode_set_reclaimable(ip);
ip->i_flags |= XFS_IRECLAIMABLE;
@@ -404,13 +426,7 @@ xfs_inode_mark_reclaimable(
spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
- /*
- * Wait for the background inodegc worker if it's running so that the
- * frontend can't overwhelm the background workers with inodes and OOM
- * the machine. We'll improve this with feedback from the rest of the
- * system in subsequent patches.
- */
- if (need_inactive && flush_work(&mp->m_inodegc_work.work))
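+ /* Throttle the frontend by waiting for the inodegc worker to finish. */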
+ if (flush_inodegc && flush_work(&mp->m_inodegc_work.work))
trace_xfs_inodegc_throttled(mp, __return_address);
}
@@ -1796,6 +1812,12 @@ xfs_inodegc_worker(
trace_xfs_inodegc_worker(mp, __return_address);
xfs_icwalk(mp, XFS_ICWALK_INODEGC, NULL);
}
+
+ /*
+ * We inactivated all the inodes we could, so clear the memory-pressure
+ * votes and stop throttling new inactivations.
+ */
+ atomic_set(&mp->m_inodegc_reclaim, 0);
}
/*
@@ -1837,6 +1859,75 @@ xfs_inodegc_start(
xfs_inodegc_queue(mp);
}
+/*
+ * Register a phony shrinker so that we can speed up background inodegc and
+ * throttle new inodegc queuing when there's memory pressure. Inactivation
+ * does not itself free any memory but it does make inodes reclaimable, which
+ * eventually frees memory. The count function, seek value, and batch value
+ * are crafted to trigger the scan function on the second round of scanning
+ * after queued inodes are first noticed.
+ */
+#define XFS_INODEGC_SHRINK_COUNT (1UL << DEF_PRIORITY)
+#define XFS_INODEGC_SHRINK_BATCH (LONG_MAX)
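+
+/*
+ * As of this writing, a seek value of zero makes do_shrink_slab() use a
+ * scan delta of freeable / 2 per poke, so the first poke defers half of
+ * XFS_INODEGC_SHRINK_COUNT and the second poke accumulates the full count,
+ * which is enough to trigger the scan function.
+ */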
+
+static unsigned long
+xfs_inodegc_shrink_count(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp;
+
+ mp = container_of(shrink, struct xfs_mount, m_inodegc_shrink);
+
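+ /* Expose work only while some AG has inodes queued for inactivation. */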
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG))
+ return XFS_INODEGC_SHRINK_COUNT;
+
+ return 0;
+}
+
+static unsigned long
+xfs_inodegc_shrink_scan(
+ struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct xfs_mount *mp;
+
+ /*
+ * Inode inactivation work requires NOFS allocations, so don't make
+ * things worse if the caller is itself in a NOFS allocation context.
+ */
+ if (!(sc->gfp_mask & __GFP_FS))
+ return SHRINK_STOP;
+
+ mp = container_of(shrink, struct xfs_mount, m_inodegc_shrink);
+
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) {
+ trace_xfs_inodegc_requeue_mempressure(mp, sc->nr_to_scan,
+ __return_address);
+
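+ /*
+ * Vote for throttling of new frontend queuing and expedite the
+ * background worker.
+ */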
+ atomic_inc(&mp->m_inodegc_reclaim);
+ mod_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0);
+ }
+
+ return 0;
+}
+
+/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
+int
+xfs_inodegc_register_shrinker(
+ struct xfs_mount *mp)
+{
+ struct shrinker *shrink = &mp->m_inodegc_shrink;
+
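+ /* Zero seeks and a huge batch give the two-poke behavior noted above. */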
+ shrink->count_objects = xfs_inodegc_shrink_count;
+ shrink->scan_objects = xfs_inodegc_shrink_scan;
+ shrink->seeks = 0;
+ shrink->flags = SHRINKER_NONSLAB;
+ shrink->batch = XFS_INODEGC_SHRINK_BATCH;
+
+ return register_shrinker(shrink);
+}
+
/* XFS Inode Cache Walking Code */
/*
@@ -78,5 +78,6 @@ void xfs_inodegc_worker(struct work_struct *work);
void xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
+int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
#endif
@@ -766,6 +766,10 @@ xfs_mountfs(
goto out_free_perag;
}
+ error = xfs_inodegc_register_shrinker(mp);
+ if (error)
+ goto out_fail_wait;
+
/*
* Log's mount-time initialization. The first part of recovery can place
* some items on the AIL, to be handled when recovery is finished or
@@ -776,7 +780,7 @@ xfs_mountfs(
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
if (error) {
xfs_warn(mp, "log mount failed");
- goto out_fail_wait;
+ goto out_inodegc_shrink;
}
/* Make sure the summary counts are ok. */
@@ -970,6 +974,8 @@ xfs_mountfs(
xfs_unmount_flush_inodes(mp);
out_log_dealloc:
xfs_log_mount_cancel(mp);
+ out_inodegc_shrink:
+ unregister_shrinker(&mp->m_inodegc_shrink);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_buftarg_drain(mp->m_logdev_targp);
@@ -1050,6 +1056,7 @@ xfs_unmountfs(
#if defined(DEBUG)
xfs_errortag_clearall(mp);
#endif
+ unregister_shrinker(&mp->m_inodegc_shrink);
xfs_free_perag(mp);
xfs_errortag_del(mp);
@@ -192,6 +192,7 @@ typedef struct xfs_mount {
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct delayed_work m_inodegc_work; /* background inode inactive */
+ struct shrinker m_inodegc_shrink;
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
struct xfs_kobj m_error_meta_kobj;
@@ -219,6 +220,12 @@ typedef struct xfs_mount {
uint32_t m_generation;
struct mutex m_growlock; /* growfs mutex */
+ /*
+ * How many times has memory reclaim poked us since the last time the
+ * background inodegc worker ran? Nonzero means frontend threads throttle.
+ */
+ atomic_t m_inodegc_reclaim;
+
#ifdef DEBUG
/*
* Frequency with which errors are injected. Replaces xfs_etest; the
@@ -1847,6 +1847,7 @@ static int xfs_init_fs_context(
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_inodegc_work, xfs_inodegc_worker);
+ atomic_set(&mp->m_inodegc_reclaim, 0);
mp->m_kobj.kobject.kset = xfs_kset;
/*
* We don't create the finobt per-ag space reservation until after log
@@ -193,6 +193,25 @@ DEFINE_FS_EVENT(xfs_inodegc_worker);
DEFINE_FS_EVENT(xfs_inodegc_throttled);
DEFINE_FS_EVENT(xfs_fs_sync_fs);
+TRACE_EVENT(xfs_inodegc_requeue_mempressure,
+ TP_PROTO(struct xfs_mount *mp, unsigned long nr, void *caller_ip),
+ TP_ARGS(mp, nr, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, nr)
+ __field(void *, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->nr = nr;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d nr_to_scan %lu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr,
+ __entry->caller_ip)
+);
+
DECLARE_EVENT_CLASS(xfs_gc_queue_class,
TP_PROTO(struct xfs_mount *mp, unsigned int delay_ms),
TP_ARGS(mp, delay_ms),
@@ -214,6 +233,22 @@ DEFINE_EVENT(xfs_gc_queue_class, name, \
TP_ARGS(mp, delay_ms))
DEFINE_GC_QUEUE_EVENT(xfs_inodegc_queue);
+TRACE_EVENT(xfs_inodegc_throttle_mempressure,
+ TP_PROTO(struct xfs_mount *mp),
+ TP_ARGS(mp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, votes)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->votes = atomic_read(&mp->m_inodegc_reclaim);
+ ),
+ TP_printk("dev %d:%d votes %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->votes)
+);
+
DECLARE_EVENT_CLASS(xfs_ag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
TP_ARGS(mp, agno),