| Message ID | 155968502066.1657646.3694276570612406995.stgit@magnolia (mailing list archive) |
|---|---|
| State | Superseded, archived |
| Series | xfs: refactor and improve inode iteration |
On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@oracle.com> > > Create a parallel iwalk implementation and switch quotacheck to use it. > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > --- Interesting.. is there any commonality here with the ktask mechanism that's been in progress? I've not followed the details, but I thought it was a similar idea. The last post I see for that is here: https://marc.info/?l=linux-mm&m=154143701122927&w=2 That aside, this all looks mostly fine to me. A few random thoughts.. > fs/xfs/Makefile | 1 > fs/xfs/xfs_globals.c | 3 + > fs/xfs/xfs_iwalk.c | 76 ++++++++++++++++++++++++++++++- > fs/xfs/xfs_iwalk.h | 2 + > fs/xfs/xfs_pwork.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/xfs/xfs_pwork.h | 50 ++++++++++++++++++++ > fs/xfs/xfs_qm.c | 2 - > fs/xfs/xfs_sysctl.h | 6 ++ > fs/xfs/xfs_sysfs.c | 40 ++++++++++++++++ > fs/xfs/xfs_trace.h | 18 +++++++ > 10 files changed, 317 insertions(+), 3 deletions(-) > create mode 100644 fs/xfs/xfs_pwork.c > create mode 100644 fs/xfs/xfs_pwork.h > > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > index 74d30ef0dbce..48940a27d4aa 100644 > --- a/fs/xfs/Makefile > +++ b/fs/xfs/Makefile > @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ > xfs_message.o \ > xfs_mount.o \ > xfs_mru_cache.o \ > + xfs_pwork.o \ > xfs_reflink.o \ > xfs_stats.o \ > xfs_super.o \ > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c > index d0d377384120..4f93f2c4dc38 100644 > --- a/fs/xfs/xfs_globals.c > +++ b/fs/xfs/xfs_globals.c > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = { > .fstrm_timer = { 1, 30*100, 3600*100}, > .eofb_timer = { 1, 300, 3600*24}, > .cowb_timer = { 1, 1800, 3600*24}, > +#ifdef DEBUG > + .pwork_threads = { 0, 0, NR_CPUS }, > +#endif > }; > > struct xfs_globals xfs_globals = { > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c > index 8595258b5001..71ee1628aa70 100644 > --- a/fs/xfs/xfs_iwalk.c > +++ b/fs/xfs/xfs_iwalk.c > @@ -21,6 +21,7 @@ > #include "xfs_health.h" > #include "xfs_trans.h" > #include "xfs_iwalk.h" > +#include "xfs_pwork.h" > > /* > * Walking Inodes in the Filesystem > @@ -46,6 +47,9 @@ > */ > > struct xfs_iwalk_ag { > + /* parallel work control data; will be null if single threaded */ > + struct xfs_pwork pwork; > + > struct xfs_mount *mp; > struct xfs_trans *tp; > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs( > trace_xfs_iwalk_ag_rec(mp, agno, irec); > > for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { > + if (xfs_pwork_want_abort(&iwag->pwork)) > + return 0; > + > /* Skip if this inode is free */ > if (XFS_INOBT_MASK(j) & irec->ir_free) > continue; > @@ -360,7 +367,7 @@ xfs_iwalk_ag( > agino = XFS_INO_TO_AGINO(mp, iwag->startino); > error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > > - while (!error && has_more) { > + while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) { > struct xfs_inobt_rec_incore *irec; > > cond_resched(); > @@ -409,7 +416,7 @@ xfs_iwalk_ag( > xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > > /* Walk any records left behind in the cache. 
*/ > - if (iwag->nr_recs == 0 || error) > + if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork)) > return error; > > return xfs_iwalk_ag_recs(iwag); > @@ -465,6 +472,7 @@ xfs_iwalk( > .iwalk_fn = iwalk_fn, > .data = data, > .startino = startino, > + .pwork = XFS_PWORK_SINGLE_THREADED, > }; > xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > int error; > @@ -486,3 +494,67 @@ xfs_iwalk( > xfs_iwalk_free(&iwag); > return error; > } > + > +/* Run per-thread iwalk work. */ > +static int > +xfs_iwalk_ag_work( > + struct xfs_mount *mp, > + struct xfs_pwork *pwork) > +{ > + struct xfs_iwalk_ag *iwag; > + int error; > + > + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); > + error = xfs_iwalk_alloc(iwag); > + if (error) > + goto out; In most cases this will never fail, but the error path if it does looks slightly painful. I was thinking if we could move this up into xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when failure is imminent... > + > + error = xfs_iwalk_ag(iwag); > + xfs_iwalk_free(iwag); > +out: > + kmem_free(iwag); > + return error; > +} > + > +/* > + * Walk all the inodes in the filesystem using multiple threads to process each > + * AG. > + */ > +int > +xfs_iwalk_threaded( > + struct xfs_mount *mp, > + xfs_ino_t startino, > + xfs_iwalk_fn iwalk_fn, > + unsigned int max_prefetch, > + void *data) > +{ > + struct xfs_pwork_ctl pctl; > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > + unsigned int nr_threads; > + int error; > + > + ASSERT(agno < mp->m_sb.sb_agcount); > + > + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); > + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", > + nr_threads); > + if (error) > + return error; > + > + for (; agno < mp->m_sb.sb_agcount; agno++) { > + struct xfs_iwalk_ag *iwag; > + > + iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); > + iwag->mp = mp; > + iwag->tp = NULL; > + iwag->iwalk_fn = iwalk_fn; > + iwag->data = data; > + iwag->startino = startino; > + iwag->recs = NULL; > + xfs_iwalk_set_prefetch(iwag, max_prefetch); > + xfs_pwork_queue(&pctl, &iwag->pwork); > + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); > + } ... but this is only bound by the number of AGs and so could result in a large number of allocations. FWIW, I wouldn't expect that to be a problem in the common case. I'm more thinking about the case of a specially crafted filesystem designed to cause problems on mount. > + > + return xfs_pwork_destroy(&pctl); > +} > diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h > index 45b1baabcd2d..40233a05a766 100644 > --- a/fs/xfs/xfs_iwalk.h > +++ b/fs/xfs/xfs_iwalk.h > @@ -14,5 +14,7 @@ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, > > int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, > xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); > +int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, > + xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); > > #endif /* __XFS_IWALK_H__ */ > diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c > new file mode 100644 > index 000000000000..19605a3a2482 > --- /dev/null > +++ b/fs/xfs/xfs_pwork.c > @@ -0,0 +1,122 @@ > +// SPDX-License-Identifier: GPL-2.0+ > +/* > + * Copyright (C) 2019 Oracle. All Rights Reserved. > + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> > + */ > +#include "xfs.h" > +#include "xfs_fs.h" > +#include "xfs_shared.h" > +#include "xfs_format.h" > +#include "xfs_log_format.h" > +#include "xfs_trans_resv.h" > +#include "xfs_mount.h" > +#include "xfs_trace.h" > +#include "xfs_sysctl.h" > +#include "xfs_pwork.h" > + > +/* > + * Parallel Work Queue > + * =================== > + * > + * Abstract away the details of running a large and "obviously" parallelizable > + * task across multiple CPUs. Callers initialize the pwork control object with > + * a desired level of parallelization and a work function. Next, they embed > + * struct xfs_pwork in whatever structure they use to pass work context to a > + * worker thread and queue that pwork. The work function will be passed the > + * pwork item when it is run (from process context) and any returned error will > + * cause all threads to abort. > + * FYI minor whitespace damage (trailing space) on the line above. > + * This is the rough equivalent of the xfsprogs workqueue code, though we can't > + * reuse that name here. > + */ > + > +/* Invoke our caller's function. */ > +static void > +xfs_pwork_work( > + struct work_struct *work) > +{ > + struct xfs_pwork *pwork; > + struct xfs_pwork_ctl *pctl; > + int error; > + > + pwork = container_of(work, struct xfs_pwork, work); > + pctl = pwork->pctl; > + error = pctl->work_fn(pctl->mp, pwork); > + if (error && !pctl->error) > + pctl->error = error; > +} > + > +/* > + * Set up control data for parallel work. @work_fn is the function that will > + * be called. @tag will be written into the kernel threads. @nr_threads is > + * the level of parallelism desired, or 0 for no limit. > + */ > +int > +xfs_pwork_init( > + struct xfs_mount *mp, > + struct xfs_pwork_ctl *pctl, > + xfs_pwork_work_fn work_fn, > + const char *tag, > + unsigned int nr_threads) > +{ > +#ifdef DEBUG > + if (xfs_globals.pwork_threads > 0) > + nr_threads = xfs_globals.pwork_threads; > +#endif Hmm, it might be useful to have the ability to force the no limit case from the debug knob. Can we use -1 or something here for "disabled?" > + trace_xfs_pwork_init(mp, nr_threads, current->pid); > + > + pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, > + current->pid); > + if (!pctl->wq) > + return -ENOMEM; > + pctl->work_fn = work_fn; > + pctl->error = 0; > + pctl->mp = mp; > + > + return 0; > +} > + > +/* Queue some parallel work. */ > +void > +xfs_pwork_queue( > + struct xfs_pwork_ctl *pctl, > + struct xfs_pwork *pwork) > +{ > + INIT_WORK(&pwork->work, xfs_pwork_work); > + pwork->pctl = pctl; > + queue_work(pctl->wq, &pwork->work); > +} > + > +/* Wait for the work to finish and tear down the control structure. */ > +int > +xfs_pwork_destroy( > + struct xfs_pwork_ctl *pctl) > +{ > + destroy_workqueue(pctl->wq); > + pctl->wq = NULL; > + return pctl->error; > +} > + > +/* > + * Return the amount of parallelism that the data device can handle, or 0 for > + * no limit. > + */ > +unsigned int > +xfs_pwork_guess_datadev_parallelism( > + struct xfs_mount *mp) > +{ > + struct xfs_buftarg *btp = mp->m_ddev_targp; > + int iomin; > + int ioopt; > + > + if (blk_queue_nonrot(btp->bt_bdev->bd_queue)) > + return num_online_cpus(); > + if (mp->m_sb.sb_width && mp->m_sb.sb_unit) > + return mp->m_sb.sb_width / mp->m_sb.sb_unit; > + iomin = bdev_io_min(btp->bt_bdev); > + ioopt = bdev_io_opt(btp->bt_bdev); > + if (iomin && ioopt) > + return ioopt / iomin; > + > + return 1; Have you collected any performance data related to these heuristics? 
I assume the feature is generally a win, but this also seems like we have a large window of variance here. E.g., an SSD on a server with hundreds of CPUs will enable as many threads as CPUs, but a single xTB spindle on the same box may run single threaded (a quick check of a few local devices all return an optimal I/O size of 0). Is there really no benefit parallelizing some of that work in the spinning rust case? What about in the other direction where we might have a ton of threads for inodes across AGs that all happen to be in the same project quota, for example? Brian > +} > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h > new file mode 100644 > index 000000000000..e0c1354a2d8c > --- /dev/null > +++ b/fs/xfs/xfs_pwork.h > @@ -0,0 +1,50 @@ > +// SPDX-License-Identifier: GPL-2.0+ > +/* > + * Copyright (C) 2019 Oracle. All Rights Reserved. > + * Author: Darrick J. Wong <darrick.wong@oracle.com> > + */ > +#ifndef __XFS_PWORK_H__ > +#define __XFS_PWORK_H__ > + > +struct xfs_pwork; > +struct xfs_mount; > + > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); > + > +/* > + * Parallel work coordination structure. > + */ > +struct xfs_pwork_ctl { > + struct workqueue_struct *wq; > + struct xfs_mount *mp; > + xfs_pwork_work_fn work_fn; > + int error; > +}; > + > +/* > + * Embed this parallel work control item inside your own work structure, > + * then queue work with it. > + */ > +struct xfs_pwork { > + struct work_struct work; > + struct xfs_pwork_ctl *pctl; > +}; > + > +#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } > + > +/* Have we been told to abort? */ > +static inline bool > +xfs_pwork_want_abort( > + struct xfs_pwork *pwork) > +{ > + return pwork->pctl && pwork->pctl->error; > +} > + > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, > + xfs_pwork_work_fn work_fn, const char *tag, > + unsigned int nr_threads); > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); > + > +#endif /* __XFS_PWORK_H__ */ > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c > index a5b2260406a8..e4f3785f7a64 100644 > --- a/fs/xfs/xfs_qm.c > +++ b/fs/xfs/xfs_qm.c > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck( > flags |= XFS_PQUOTA_CHKD; > } > > - error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL); > + error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL); > if (error) > goto error_return; > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h > index ad7f9be13087..b555e045e2f4 100644 > --- a/fs/xfs/xfs_sysctl.h > +++ b/fs/xfs/xfs_sysctl.h > @@ -37,6 +37,9 @@ typedef struct xfs_param { > xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. 
*/ > xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ > xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ > +#ifdef DEBUG > + xfs_sysctl_val_t pwork_threads; /* Parallel workqueue thread count */ > +#endif > } xfs_param_t; > > /* > @@ -82,6 +85,9 @@ enum { > extern xfs_param_t xfs_params; > > struct xfs_globals { > +#ifdef DEBUG > + int pwork_threads; /* parallel workqueue threads */ > +#endif > int log_recovery_delay; /* log recovery delay (secs) */ > int mount_delay; /* mount setup delay (secs) */ > bool bug_on_assert; /* BUG() the kernel on assert failure */ > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c > index cabda13f3c64..910e6b9cb1a7 100644 > --- a/fs/xfs/xfs_sysfs.c > +++ b/fs/xfs/xfs_sysfs.c > @@ -206,11 +206,51 @@ always_cow_show( > } > XFS_SYSFS_ATTR_RW(always_cow); > > +#ifdef DEBUG > +/* > + * Override how many threads the parallel work queue is allowed to create. > + * This has to be a debug-only global (instead of an errortag) because one of > + * the main users of parallel workqueues is mount time quotacheck. > + */ > +STATIC ssize_t > +pwork_threads_store( > + struct kobject *kobject, > + const char *buf, > + size_t count) > +{ > + int ret; > + int val; > + > + ret = kstrtoint(buf, 0, &val); > + if (ret) > + return ret; > + > + if (val < 0 || val > NR_CPUS) > + return -EINVAL; > + > + xfs_globals.pwork_threads = val; > + > + return count; > +} > + > +STATIC ssize_t > +pwork_threads_show( > + struct kobject *kobject, > + char *buf) > +{ > + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); > +} > +XFS_SYSFS_ATTR_RW(pwork_threads); > +#endif /* DEBUG */ > + > static struct attribute *xfs_dbg_attrs[] = { > ATTR_LIST(bug_on_assert), > ATTR_LIST(log_recovery_delay), > ATTR_LIST(mount_delay), > ATTR_LIST(always_cow), > +#ifdef DEBUG > + ATTR_LIST(pwork_threads), > +#endif > NULL, > }; > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index f9bb1d50bc0e..658cbade1998 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec, > __entry->startino, __entry->freemask) > ) > > +TRACE_EVENT(xfs_pwork_init, > + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), > + TP_ARGS(mp, nr_threads, pid), > + TP_STRUCT__entry( > + __field(dev_t, dev) > + __field(unsigned int, nr_threads) > + __field(pid_t, pid) > + ), > + TP_fast_assign( > + __entry->dev = mp->m_super->s_dev; > + __entry->nr_threads = nr_threads; > + __entry->pid = pid; > + ), > + TP_printk("dev %d:%d nr_threads %u pid %u", > + MAJOR(__entry->dev), MINOR(__entry->dev), > + __entry->nr_threads, __entry->pid) > +) > + > #endif /* _TRACE_XFS_H */ > > #undef TRACE_INCLUDE_PATH >
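For readers following along, the calling pattern for the pwork API under review is roughly the following. This is a condensed sketch based on xfs_iwalk_threaded() and the declarations in xfs_pwork.h from the patch above; struct xfs_example_work, xfs_example_work_fn and xfs_example_run are hypothetical stand-ins for a caller's own work item and driver, not code from the patch.

	/* Hypothetical caller-defined work item embedding struct xfs_pwork. */
	struct xfs_example_work {
		struct xfs_pwork	pwork;	/* pwork hands this back to us */
		xfs_agnumber_t		agno;	/* per-item context */
	};

	/* Hypothetical work function; a nonzero return aborts the other items. */
	static int
	xfs_example_work_fn(
		struct xfs_mount	*mp,
		struct xfs_pwork	*pwork)
	{
		struct xfs_example_work	*ew;

		ew = container_of(pwork, struct xfs_example_work, pwork);
		if (xfs_pwork_want_abort(pwork))
			goto out;
		/* ... do the per-AG work for ew->agno here ... */
	out:
		kmem_free(ew);
		return 0;
	}

	static int
	xfs_example_run(
		struct xfs_mount	*mp)
	{
		struct xfs_pwork_ctl	pctl;
		xfs_agnumber_t		agno;
		unsigned int		nr_threads;
		int			error;

		nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
		error = xfs_pwork_init(mp, &pctl, xfs_example_work_fn,
				"xfs_example", nr_threads);
		if (error)
			return error;

		for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
			struct xfs_example_work	*ew;

			ew = kmem_alloc(sizeof(*ew), KM_SLEEP);
			ew->agno = agno;
			xfs_pwork_queue(&pctl, &ew->pwork);
		}

		/* Waits for all queued items; returns the first recorded error. */
		return xfs_pwork_destroy(&pctl);
	}

The key points are that the caller embeds struct xfs_pwork in its own context structure, the work function recovers that context via container_of(), and a nonzero return from any work function makes xfs_pwork_want_abort() return true for the items that run afterwards.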
On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote: > On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > Create a parallel iwalk implementation and switch quotacheck to use it. > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > --- > > Interesting.. is there any commonality here with the ktask mechanism > that's been in progress? I've not followed the details, but I thought it > was a similar idea. The last post I see for that is here: > > https://marc.info/?l=linux-mm&m=154143701122927&w=2 Yes, xfs_pwork is... the result of ktask still not landing upstream after a couple of years. :( > That aside, this all looks mostly fine to me. A few random thoughts.. > > > fs/xfs/Makefile | 1 > > fs/xfs/xfs_globals.c | 3 + > > fs/xfs/xfs_iwalk.c | 76 ++++++++++++++++++++++++++++++- > > fs/xfs/xfs_iwalk.h | 2 + > > fs/xfs/xfs_pwork.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > fs/xfs/xfs_pwork.h | 50 ++++++++++++++++++++ > > fs/xfs/xfs_qm.c | 2 - > > fs/xfs/xfs_sysctl.h | 6 ++ > > fs/xfs/xfs_sysfs.c | 40 ++++++++++++++++ > > fs/xfs/xfs_trace.h | 18 +++++++ > > 10 files changed, 317 insertions(+), 3 deletions(-) > > create mode 100644 fs/xfs/xfs_pwork.c > > create mode 100644 fs/xfs/xfs_pwork.h > > > > > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > > index 74d30ef0dbce..48940a27d4aa 100644 > > --- a/fs/xfs/Makefile > > +++ b/fs/xfs/Makefile > > @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ > > xfs_message.o \ > > xfs_mount.o \ > > xfs_mru_cache.o \ > > + xfs_pwork.o \ > > xfs_reflink.o \ > > xfs_stats.o \ > > xfs_super.o \ > > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c > > index d0d377384120..4f93f2c4dc38 100644 > > --- a/fs/xfs/xfs_globals.c > > +++ b/fs/xfs/xfs_globals.c > > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = { > > .fstrm_timer = { 1, 30*100, 3600*100}, > > .eofb_timer = { 1, 300, 3600*24}, > > .cowb_timer = { 1, 1800, 3600*24}, > > +#ifdef DEBUG > > + .pwork_threads = { 0, 0, NR_CPUS }, > > +#endif > > }; > > > > struct xfs_globals xfs_globals = { > > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c > > index 8595258b5001..71ee1628aa70 100644 > > --- a/fs/xfs/xfs_iwalk.c > > +++ b/fs/xfs/xfs_iwalk.c > > @@ -21,6 +21,7 @@ > > #include "xfs_health.h" > > #include "xfs_trans.h" > > #include "xfs_iwalk.h" > > +#include "xfs_pwork.h" > > > > /* > > * Walking Inodes in the Filesystem > > @@ -46,6 +47,9 @@ > > */ > > > > struct xfs_iwalk_ag { > > + /* parallel work control data; will be null if single threaded */ > > + struct xfs_pwork pwork; > > + > > struct xfs_mount *mp; > > struct xfs_trans *tp; > > > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs( > > trace_xfs_iwalk_ag_rec(mp, agno, irec); > > > > for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { > > + if (xfs_pwork_want_abort(&iwag->pwork)) > > + return 0; > > + > > /* Skip if this inode is free */ > > if (XFS_INOBT_MASK(j) & irec->ir_free) > > continue; > > @@ -360,7 +367,7 @@ xfs_iwalk_ag( > > agino = XFS_INO_TO_AGINO(mp, iwag->startino); > > error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > > > > - while (!error && has_more) { > > + while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) { > > struct xfs_inobt_rec_incore *irec; > > > > cond_resched(); > > @@ -409,7 +416,7 @@ xfs_iwalk_ag( > > xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > > > > /* Walk any records left behind in the cache. 
*/ > > - if (iwag->nr_recs == 0 || error) > > + if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork)) > > return error; > > > > return xfs_iwalk_ag_recs(iwag); > > @@ -465,6 +472,7 @@ xfs_iwalk( > > .iwalk_fn = iwalk_fn, > > .data = data, > > .startino = startino, > > + .pwork = XFS_PWORK_SINGLE_THREADED, > > }; > > xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > int error; > > @@ -486,3 +494,67 @@ xfs_iwalk( > > xfs_iwalk_free(&iwag); > > return error; > > } > > + > > +/* Run per-thread iwalk work. */ > > +static int > > +xfs_iwalk_ag_work( > > + struct xfs_mount *mp, > > + struct xfs_pwork *pwork) > > +{ > > + struct xfs_iwalk_ag *iwag; > > + int error; > > + > > + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); > > + error = xfs_iwalk_alloc(iwag); > > + if (error) > > + goto out; > > In most cases this will never fail, but the error path if it does looks > slightly painful. I was thinking if we could move this up into > xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when > failure is imminent... > > > + > > + error = xfs_iwalk_ag(iwag); > > + xfs_iwalk_free(iwag); > > +out: > > + kmem_free(iwag); > > + return error; > > +} > > + > > +/* > > + * Walk all the inodes in the filesystem using multiple threads to process each > > + * AG. > > + */ > > +int > > +xfs_iwalk_threaded( > > + struct xfs_mount *mp, > > + xfs_ino_t startino, > > + xfs_iwalk_fn iwalk_fn, > > + unsigned int max_prefetch, > > + void *data) > > +{ > > + struct xfs_pwork_ctl pctl; > > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > + unsigned int nr_threads; > > + int error; > > + > > + ASSERT(agno < mp->m_sb.sb_agcount); > > + > > + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); > > + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", > > + nr_threads); > > + if (error) > > + return error; > > + > > + for (; agno < mp->m_sb.sb_agcount; agno++) { > > + struct xfs_iwalk_ag *iwag; > > + > > + iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); > > + iwag->mp = mp; > > + iwag->tp = NULL; > > + iwag->iwalk_fn = iwalk_fn; > > + iwag->data = data; > > + iwag->startino = startino; > > + iwag->recs = NULL; > > + xfs_iwalk_set_prefetch(iwag, max_prefetch); > > + xfs_pwork_queue(&pctl, &iwag->pwork); > > + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); > > + } > > ... but this is only bound by the number of AGs and so could result in a > large number of allocations. FWIW, I wouldn't expect that to be a > problem in the common case. I'm more thinking about the case of a > specially crafted filesystem designed to cause problems on mount. <nod> I thought about that, and decided that it wasn't a good idea to for each of the queued (but not processing) work items to be sitting on a bunch of memory because that memory can't be put to useful work. That's why I put it in xfs_iwalk_ag_work. Also that would necessitate iwalk feeding a destructor to pwork so that it can deal with work items that were queued but never actually run. 
> > > + > > + return xfs_pwork_destroy(&pctl); > > +} > > diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h > > index 45b1baabcd2d..40233a05a766 100644 > > --- a/fs/xfs/xfs_iwalk.h > > +++ b/fs/xfs/xfs_iwalk.h > > @@ -14,5 +14,7 @@ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, > > > > int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, > > xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); > > +int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, > > + xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); > > > > #endif /* __XFS_IWALK_H__ */ > > diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c > > new file mode 100644 > > index 000000000000..19605a3a2482 > > --- /dev/null > > +++ b/fs/xfs/xfs_pwork.c > > @@ -0,0 +1,122 @@ > > +// SPDX-License-Identifier: GPL-2.0+ > > +/* > > + * Copyright (C) 2019 Oracle. All Rights Reserved. > > + * Author: Darrick J. Wong <darrick.wong@oracle.com> > > + */ > > +#include "xfs.h" > > +#include "xfs_fs.h" > > +#include "xfs_shared.h" > > +#include "xfs_format.h" > > +#include "xfs_log_format.h" > > +#include "xfs_trans_resv.h" > > +#include "xfs_mount.h" > > +#include "xfs_trace.h" > > +#include "xfs_sysctl.h" > > +#include "xfs_pwork.h" > > + > > +/* > > + * Parallel Work Queue > > + * =================== > > + * > > + * Abstract away the details of running a large and "obviously" parallelizable > > + * task across multiple CPUs. Callers initialize the pwork control object with > > + * a desired level of parallelization and a work function. Next, they embed > > + * struct xfs_pwork in whatever structure they use to pass work context to a > > + * worker thread and queue that pwork. The work function will be passed the > > + * pwork item when it is run (from process context) and any returned error will > > + * cause all threads to abort. > > + * > > FYI minor whitespace damage (trailing space) on the line above. Fixed. > > + * This is the rough equivalent of the xfsprogs workqueue code, though we can't > > + * reuse that name here. > > + */ > > + > > +/* Invoke our caller's function. */ > > +static void > > +xfs_pwork_work( > > + struct work_struct *work) > > +{ > > + struct xfs_pwork *pwork; > > + struct xfs_pwork_ctl *pctl; > > + int error; > > + > > + pwork = container_of(work, struct xfs_pwork, work); > > + pctl = pwork->pctl; > > + error = pctl->work_fn(pctl->mp, pwork); > > + if (error && !pctl->error) > > + pctl->error = error; > > +} > > + > > +/* > > + * Set up control data for parallel work. @work_fn is the function that will > > + * be called. @tag will be written into the kernel threads. @nr_threads is > > + * the level of parallelism desired, or 0 for no limit. > > + */ > > +int > > +xfs_pwork_init( > > + struct xfs_mount *mp, > > + struct xfs_pwork_ctl *pctl, > > + xfs_pwork_work_fn work_fn, > > + const char *tag, > > + unsigned int nr_threads) > > +{ > > +#ifdef DEBUG > > + if (xfs_globals.pwork_threads > 0) > > + nr_threads = xfs_globals.pwork_threads; > > +#endif > > Hmm, it might be useful to have the ability to force the no limit case > from the debug knob. Can we use -1 or something here for "disabled?" Ok. > > + trace_xfs_pwork_init(mp, nr_threads, current->pid); > > + > > + pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, > > + current->pid); > > + if (!pctl->wq) > > + return -ENOMEM; > > + pctl->work_fn = work_fn; > > + pctl->error = 0; > > + pctl->mp = mp; > > + > > + return 0; > > +} > > + > > +/* Queue some parallel work. 
*/ > > +void > > +xfs_pwork_queue( > > + struct xfs_pwork_ctl *pctl, > > + struct xfs_pwork *pwork) > > +{ > > + INIT_WORK(&pwork->work, xfs_pwork_work); > > + pwork->pctl = pctl; > > + queue_work(pctl->wq, &pwork->work); > > +} > > + > > +/* Wait for the work to finish and tear down the control structure. */ > > +int > > +xfs_pwork_destroy( > > + struct xfs_pwork_ctl *pctl) > > +{ > > + destroy_workqueue(pctl->wq); > > + pctl->wq = NULL; > > + return pctl->error; > > +} > > + > > +/* > > + * Return the amount of parallelism that the data device can handle, or 0 for > > + * no limit. > > + */ > > +unsigned int > > +xfs_pwork_guess_datadev_parallelism( > > + struct xfs_mount *mp) > > +{ > > + struct xfs_buftarg *btp = mp->m_ddev_targp; > > + int iomin; > > + int ioopt; > > + > > + if (blk_queue_nonrot(btp->bt_bdev->bd_queue)) > > + return num_online_cpus(); > > + if (mp->m_sb.sb_width && mp->m_sb.sb_unit) > > + return mp->m_sb.sb_width / mp->m_sb.sb_unit; > > + iomin = bdev_io_min(btp->bt_bdev); > > + ioopt = bdev_io_opt(btp->bt_bdev); > > + if (iomin && ioopt) > > + return ioopt / iomin; > > + > > + return 1; > > Have you collected any performance data related to these heuristics? Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a single SSD, ~10% on a 4-way raid0). That wasn't really all that awesome, so I recorded a flame graph (see below) to find where the remaining overhead is. A lot of it was in xfs_iget, and I also noticed that deferred inode inactivation sped it up a little more. > I > assume the feature is generally a win, but this also seems like we have > a large window of variance here. E.g., an SSD on a server with hundreds > of CPUs will enable as many threads as CPUs, but a single xTB spindle on > the same box may run single threaded (a quick check of a few local > devices all return an optimal I/O size of 0). Is there really no benefit > parallelizing some of that work in the spinning rust case? Single-spindle spinning rust got worse even with 2 threads because the heads ping-pong between AGs. It's not horrible with enough readahead, but it turns into a disaster when I disabled readahead, unsurprisingly. > What about in the other direction where we might have a ton of threads > for inodes across AGs that all happen to be in the same project quota, > for example? I collected a flame graph... https://djwong.org/docs/quotacheck.svg It turned out that the inode setup overhead in xfs_iget is so high that locking the dquot has negligible overhead. When I "fixed" quotacheck to read the quota information straight from the inode cluster buffer if the inode wasn't in memory, the runtime dropped by 60% but Dave warned me not to take us back to inode buffer aliasing hell. I also noted that if booting with mem=512M the memory reclamation overhead totally fries us regardless of parallelisation. --D > > Brian > > > +} > > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h > > new file mode 100644 > > index 000000000000..e0c1354a2d8c > > --- /dev/null > > +++ b/fs/xfs/xfs_pwork.h > > @@ -0,0 +1,50 @@ > > +// SPDX-License-Identifier: GPL-2.0+ > > +/* > > + * Copyright (C) 2019 Oracle. All Rights Reserved. > > + * Author: Darrick J. Wong <darrick.wong@oracle.com> > > + */ > > +#ifndef __XFS_PWORK_H__ > > +#define __XFS_PWORK_H__ > > + > > +struct xfs_pwork; > > +struct xfs_mount; > > + > > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); > > + > > +/* > > + * Parallel work coordination structure. 
> > + */ > > +struct xfs_pwork_ctl { > > + struct workqueue_struct *wq; > > + struct xfs_mount *mp; > > + xfs_pwork_work_fn work_fn; > > + int error; > > +}; > > + > > +/* > > + * Embed this parallel work control item inside your own work structure, > > + * then queue work with it. > > + */ > > +struct xfs_pwork { > > + struct work_struct work; > > + struct xfs_pwork_ctl *pctl; > > +}; > > + > > +#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } > > + > > +/* Have we been told to abort? */ > > +static inline bool > > +xfs_pwork_want_abort( > > + struct xfs_pwork *pwork) > > +{ > > + return pwork->pctl && pwork->pctl->error; > > +} > > + > > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, > > + xfs_pwork_work_fn work_fn, const char *tag, > > + unsigned int nr_threads); > > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); > > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); > > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); > > + > > +#endif /* __XFS_PWORK_H__ */ > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c > > index a5b2260406a8..e4f3785f7a64 100644 > > --- a/fs/xfs/xfs_qm.c > > +++ b/fs/xfs/xfs_qm.c > > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck( > > flags |= XFS_PQUOTA_CHKD; > > } > > > > - error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL); > > + error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL); > > if (error) > > goto error_return; > > > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h > > index ad7f9be13087..b555e045e2f4 100644 > > --- a/fs/xfs/xfs_sysctl.h > > +++ b/fs/xfs/xfs_sysctl.h > > @@ -37,6 +37,9 @@ typedef struct xfs_param { > > xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ > > xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ > > xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ > > +#ifdef DEBUG > > + xfs_sysctl_val_t pwork_threads; /* Parallel workqueue thread count */ > > +#endif > > } xfs_param_t; > > > > /* > > @@ -82,6 +85,9 @@ enum { > > extern xfs_param_t xfs_params; > > > > struct xfs_globals { > > +#ifdef DEBUG > > + int pwork_threads; /* parallel workqueue threads */ > > +#endif > > int log_recovery_delay; /* log recovery delay (secs) */ > > int mount_delay; /* mount setup delay (secs) */ > > bool bug_on_assert; /* BUG() the kernel on assert failure */ > > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c > > index cabda13f3c64..910e6b9cb1a7 100644 > > --- a/fs/xfs/xfs_sysfs.c > > +++ b/fs/xfs/xfs_sysfs.c > > @@ -206,11 +206,51 @@ always_cow_show( > > } > > XFS_SYSFS_ATTR_RW(always_cow); > > > > +#ifdef DEBUG > > +/* > > + * Override how many threads the parallel work queue is allowed to create. > > + * This has to be a debug-only global (instead of an errortag) because one of > > + * the main users of parallel workqueues is mount time quotacheck. 
> > + */ > > +STATIC ssize_t > > +pwork_threads_store( > > + struct kobject *kobject, > > + const char *buf, > > + size_t count) > > +{ > > + int ret; > > + int val; > > + > > + ret = kstrtoint(buf, 0, &val); > > + if (ret) > > + return ret; > > + > > + if (val < 0 || val > NR_CPUS) > > + return -EINVAL; > > + > > + xfs_globals.pwork_threads = val; > > + > > + return count; > > +} > > + > > +STATIC ssize_t > > +pwork_threads_show( > > + struct kobject *kobject, > > + char *buf) > > +{ > > + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); > > +} > > +XFS_SYSFS_ATTR_RW(pwork_threads); > > +#endif /* DEBUG */ > > + > > static struct attribute *xfs_dbg_attrs[] = { > > ATTR_LIST(bug_on_assert), > > ATTR_LIST(log_recovery_delay), > > ATTR_LIST(mount_delay), > > ATTR_LIST(always_cow), > > +#ifdef DEBUG > > + ATTR_LIST(pwork_threads), > > +#endif > > NULL, > > }; > > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > > index f9bb1d50bc0e..658cbade1998 100644 > > --- a/fs/xfs/xfs_trace.h > > +++ b/fs/xfs/xfs_trace.h > > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec, > > __entry->startino, __entry->freemask) > > ) > > > > +TRACE_EVENT(xfs_pwork_init, > > + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), > > + TP_ARGS(mp, nr_threads, pid), > > + TP_STRUCT__entry( > > + __field(dev_t, dev) > > + __field(unsigned int, nr_threads) > > + __field(pid_t, pid) > > + ), > > + TP_fast_assign( > > + __entry->dev = mp->m_super->s_dev; > > + __entry->nr_threads = nr_threads; > > + __entry->pid = pid; > > + ), > > + TP_printk("dev %d:%d nr_threads %u pid %u", > > + MAJOR(__entry->dev), MINOR(__entry->dev), > > + __entry->nr_threads, __entry->pid) > > +) > > + > > #endif /* _TRACE_XFS_H */ > > > > #undef TRACE_INCLUDE_PATH > >
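One possible shape for the debug-knob tweak agreed just above, where a negative pwork_threads value would force the "no limit" case: this is a sketch only, assuming -1 means "unbounded" and 0 keeps the caller's guess; the sysfs store and the xfs_params bounds quoted earlier would also need to accept -1. It is not the actual follow-up patch.

	int
	xfs_pwork_init(
		struct xfs_mount	*mp,
		struct xfs_pwork_ctl	*pctl,
		xfs_pwork_work_fn	work_fn,
		const char		*tag,
		unsigned int		nr_threads)
	{
	#ifdef DEBUG
		/*
		 * Sketch of the knob semantics discussed above: a negative
		 * value forces the unbounded case (max_active == 0), a
		 * positive value forces that exact thread count, and zero
		 * leaves the caller's guess alone.
		 */
		if (xfs_globals.pwork_threads < 0)
			nr_threads = 0;
		else if (xfs_globals.pwork_threads > 0)
			nr_threads = xfs_globals.pwork_threads;
	#endif
		trace_xfs_pwork_init(mp, nr_threads, current->pid);

		pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads,
				tag, current->pid);
		if (!pctl->wq)
			return -ENOMEM;
		pctl->work_fn = work_fn;
		pctl->error = 0;
		pctl->mp = mp;

		return 0;
	}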
On Mon, Jun 10, 2019 at 06:10:20PM -0700, Darrick J. Wong wrote: > On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote: > > On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote: > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > Create a parallel iwalk implementation and switch quotacheck to use it. > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > --- > > > > Interesting.. is there any commonality here with the ktask mechanism > > that's been in progress? I've not followed the details, but I thought it > > was a similar idea. The last post I see for that is here: > > > > https://marc.info/?l=linux-mm&m=154143701122927&w=2 > > Yes, xfs_pwork is... the result of ktask still not landing upstream > after a couple of years. :( > Heh, Ok. We could always port over to it if it ever does land. > > That aside, this all looks mostly fine to me. A few random thoughts.. > > > > > fs/xfs/Makefile | 1 > > > fs/xfs/xfs_globals.c | 3 + > > > fs/xfs/xfs_iwalk.c | 76 ++++++++++++++++++++++++++++++- > > > fs/xfs/xfs_iwalk.h | 2 + > > > fs/xfs/xfs_pwork.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > > fs/xfs/xfs_pwork.h | 50 ++++++++++++++++++++ > > > fs/xfs/xfs_qm.c | 2 - > > > fs/xfs/xfs_sysctl.h | 6 ++ > > > fs/xfs/xfs_sysfs.c | 40 ++++++++++++++++ > > > fs/xfs/xfs_trace.h | 18 +++++++ > > > 10 files changed, 317 insertions(+), 3 deletions(-) > > > create mode 100644 fs/xfs/xfs_pwork.c > > > create mode 100644 fs/xfs/xfs_pwork.h > > > > > > > > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > > > index 74d30ef0dbce..48940a27d4aa 100644 > > > --- a/fs/xfs/Makefile > > > +++ b/fs/xfs/Makefile > > > @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ > > > xfs_message.o \ > > > xfs_mount.o \ > > > xfs_mru_cache.o \ > > > + xfs_pwork.o \ > > > xfs_reflink.o \ > > > xfs_stats.o \ > > > xfs_super.o \ > > > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c > > > index d0d377384120..4f93f2c4dc38 100644 > > > --- a/fs/xfs/xfs_globals.c > > > +++ b/fs/xfs/xfs_globals.c > > > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = { > > > .fstrm_timer = { 1, 30*100, 3600*100}, > > > .eofb_timer = { 1, 300, 3600*24}, > > > .cowb_timer = { 1, 1800, 3600*24}, > > > +#ifdef DEBUG > > > + .pwork_threads = { 0, 0, NR_CPUS }, > > > +#endif > > > }; > > > > > > struct xfs_globals xfs_globals = { > > > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c > > > index 8595258b5001..71ee1628aa70 100644 > > > --- a/fs/xfs/xfs_iwalk.c > > > +++ b/fs/xfs/xfs_iwalk.c > > > @@ -21,6 +21,7 @@ > > > #include "xfs_health.h" > > > #include "xfs_trans.h" > > > #include "xfs_iwalk.h" > > > +#include "xfs_pwork.h" > > > > > > /* > > > * Walking Inodes in the Filesystem > > > @@ -46,6 +47,9 @@ > > > */ > > > > > > struct xfs_iwalk_ag { > > > + /* parallel work control data; will be null if single threaded */ > > > + struct xfs_pwork pwork; > > > + > > > struct xfs_mount *mp; > > > struct xfs_trans *tp; > > > > > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs( > > > trace_xfs_iwalk_ag_rec(mp, agno, irec); > > > > > > for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { > > > + if (xfs_pwork_want_abort(&iwag->pwork)) > > > + return 0; > > > + > > > /* Skip if this inode is free */ > > > if (XFS_INOBT_MASK(j) & irec->ir_free) > > > continue; > > > @@ -360,7 +367,7 @@ xfs_iwalk_ag( > > > agino = XFS_INO_TO_AGINO(mp, iwag->startino); > > > error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > > > > > > - while (!error && has_more) { > > > + while (!error && has_more && 
!xfs_pwork_want_abort(&iwag->pwork)) { > > > struct xfs_inobt_rec_incore *irec; > > > > > > cond_resched(); > > > @@ -409,7 +416,7 @@ xfs_iwalk_ag( > > > xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > > > > > > /* Walk any records left behind in the cache. */ > > > - if (iwag->nr_recs == 0 || error) > > > + if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork)) > > > return error; > > > > > > return xfs_iwalk_ag_recs(iwag); > > > @@ -465,6 +472,7 @@ xfs_iwalk( > > > .iwalk_fn = iwalk_fn, > > > .data = data, > > > .startino = startino, > > > + .pwork = XFS_PWORK_SINGLE_THREADED, > > > }; > > > xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > > int error; > > > @@ -486,3 +494,67 @@ xfs_iwalk( > > > xfs_iwalk_free(&iwag); > > > return error; > > > } > > > + > > > +/* Run per-thread iwalk work. */ > > > +static int > > > +xfs_iwalk_ag_work( > > > + struct xfs_mount *mp, > > > + struct xfs_pwork *pwork) > > > +{ > > > + struct xfs_iwalk_ag *iwag; > > > + int error; > > > + > > > + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); > > > + error = xfs_iwalk_alloc(iwag); > > > + if (error) > > > + goto out; > > > > In most cases this will never fail, but the error path if it does looks > > slightly painful. I was thinking if we could move this up into > > xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when > > failure is imminent... > > > > > + > > > + error = xfs_iwalk_ag(iwag); > > > + xfs_iwalk_free(iwag); > > > +out: > > > + kmem_free(iwag); > > > + return error; > > > +} > > > + > > > +/* > > > + * Walk all the inodes in the filesystem using multiple threads to process each > > > + * AG. > > > + */ > > > +int > > > +xfs_iwalk_threaded( > > > + struct xfs_mount *mp, > > > + xfs_ino_t startino, > > > + xfs_iwalk_fn iwalk_fn, > > > + unsigned int max_prefetch, > > > + void *data) > > > +{ > > > + struct xfs_pwork_ctl pctl; > > > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > > + unsigned int nr_threads; > > > + int error; > > > + > > > + ASSERT(agno < mp->m_sb.sb_agcount); > > > + > > > + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); > > > + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", > > > + nr_threads); > > > + if (error) > > > + return error; > > > + > > > + for (; agno < mp->m_sb.sb_agcount; agno++) { > > > + struct xfs_iwalk_ag *iwag; > > > + > > > + iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); > > > + iwag->mp = mp; > > > + iwag->tp = NULL; > > > + iwag->iwalk_fn = iwalk_fn; > > > + iwag->data = data; > > > + iwag->startino = startino; > > > + iwag->recs = NULL; > > > + xfs_iwalk_set_prefetch(iwag, max_prefetch); > > > + xfs_pwork_queue(&pctl, &iwag->pwork); > > > + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); > > > + } > > > > ... but this is only bound by the number of AGs and so could result in a > > large number of allocations. FWIW, I wouldn't expect that to be a > > problem in the common case. I'm more thinking about the case of a > > specially crafted filesystem designed to cause problems on mount. > > <nod> I thought about that, and decided that it wasn't a good idea to > for each of the queued (but not processing) work items to be sitting on > a bunch of memory because that memory can't be put to useful work. > That's why I put it in xfs_iwalk_ag_work. > Yep. > Also that would necessitate iwalk feeding a destructor to pwork so that > it can deal with work items that were queued but never actually run. > What's the scenario for queued jobs that never run? 
We have to run the work item to free the iwag, so I'd assume we could free the record buffer in the same place if we wanted to (which we don't :P). > > > > > + > > > + return xfs_pwork_destroy(&pctl); > > > +} ... > > > +/* > > > + * Return the amount of parallelism that the data device can handle, or 0 for > > > + * no limit. > > > + */ > > > +unsigned int > > > +xfs_pwork_guess_datadev_parallelism( > > > + struct xfs_mount *mp) > > > +{ > > > + struct xfs_buftarg *btp = mp->m_ddev_targp; > > > + int iomin; > > > + int ioopt; > > > + > > > + if (blk_queue_nonrot(btp->bt_bdev->bd_queue)) > > > + return num_online_cpus(); > > > + if (mp->m_sb.sb_width && mp->m_sb.sb_unit) > > > + return mp->m_sb.sb_width / mp->m_sb.sb_unit; > > > + iomin = bdev_io_min(btp->bt_bdev); > > > + ioopt = bdev_io_opt(btp->bt_bdev); > > > + if (iomin && ioopt) > > > + return ioopt / iomin; > > > + > > > + return 1; > > > > Have you collected any performance data related to these heuristics? > > Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a > single SSD, ~10% on a 4-way raid0). That wasn't really all that > awesome, so I recorded a flame graph (see below) to find where the > remaining overhead is. A lot of it was in xfs_iget, and I also noticed > that deferred inode inactivation sped it up a little more. > So if I follow the graphic correctly, you have 4 walker threads running the quotacheck. The majority of overhead is the inode memory allocation followed by inode buffer lookup and then dquot lookup slightly behind that. If this is an SSD, the heuristic presumably set the thread count based on the CPU count, right? Hmm, I can't tell from the image what happens down in xfs_buf_find(). Do you have granular enough data to see whether these are buffer cache hits or misses? > > I > > assume the feature is generally a win, but this also seems like we have > > a large window of variance here. E.g., an SSD on a server with hundreds > > of CPUs will enable as many threads as CPUs, but a single xTB spindle on > > the same box may run single threaded (a quick check of a few local > > devices all return an optimal I/O size of 0). Is there really no benefit > > parallelizing some of that work in the spinning rust case? > > Single-spindle spinning rust got worse even with 2 threads because the > heads ping-pong between AGs. It's not horrible with enough readahead, > but it turns into a disaster when I disabled readahead, unsurprisingly. > Ok. Is that tempered by multi-spindle devices? What about a raid5/6 like device where we have a stripe unit/width set, but the device itself may have concurrency characteristics more like a single spindle as opposed to something like raid0? > > What about in the other direction where we might have a ton of threads > > for inodes across AGs that all happen to be in the same project quota, > > for example? > > I collected a flame graph... https://djwong.org/docs/quotacheck.svg > I'm specifically thinking about a few random systems I've used recently with hundreds of CPUs. I'm not sure those boxes actually have SSDs, but if one did (and with enough AGs), I'm curious how this algorithm would behave under those conditions. I guess this all boils down to trying to understand if/what breakdown conditions might exist given the different possibilies allowed by the current heuristic. 
I'm wondering if we should either restrict this heuristic to enable concurrency specifically in the environments we've shown it to have benefits and/or with some kind of reasonable cap to limit unknown boundary conditions. E.g., the case of hundreds of AGs and hundreds of CPUs on SSD seems like it could go really well (a big flash raid0) or potentially really bad (one big SSD with a poorly configured fs). Thoughts? > It turned out that the inode setup overhead in xfs_iget is so high that > locking the dquot has negligible overhead. When I "fixed" quotacheck to > read the quota information straight from the inode cluster buffer if the > inode wasn't in memory, the runtime dropped by 60% but Dave warned me > not to take us back to inode buffer aliasing hell. I also noted that > if booting with mem=512M the memory reclamation overhead totally fries > us regardless of parallelisation. > I'm not familiar with the buffer aliasing problem.. I'm guessing this is talking about risk of inconsistent in-core inodes with inode buffers..? In any event, I agree that it's not worth risking integrity or overcomplicating things for something like quotacheck. Brian > --D > > > > > Brian > > > > > +} > > > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h > > > new file mode 100644 > > > index 000000000000..e0c1354a2d8c > > > --- /dev/null > > > +++ b/fs/xfs/xfs_pwork.h > > > @@ -0,0 +1,50 @@ > > > +// SPDX-License-Identifier: GPL-2.0+ > > > +/* > > > + * Copyright (C) 2019 Oracle. All Rights Reserved. > > > + * Author: Darrick J. Wong <darrick.wong@oracle.com> > > > + */ > > > +#ifndef __XFS_PWORK_H__ > > > +#define __XFS_PWORK_H__ > > > + > > > +struct xfs_pwork; > > > +struct xfs_mount; > > > + > > > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); > > > + > > > +/* > > > + * Parallel work coordination structure. > > > + */ > > > +struct xfs_pwork_ctl { > > > + struct workqueue_struct *wq; > > > + struct xfs_mount *mp; > > > + xfs_pwork_work_fn work_fn; > > > + int error; > > > +}; > > > + > > > +/* > > > + * Embed this parallel work control item inside your own work structure, > > > + * then queue work with it. > > > + */ > > > +struct xfs_pwork { > > > + struct work_struct work; > > > + struct xfs_pwork_ctl *pctl; > > > +}; > > > + > > > +#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } > > > + > > > +/* Have we been told to abort? 
*/ > > > +static inline bool > > > +xfs_pwork_want_abort( > > > + struct xfs_pwork *pwork) > > > +{ > > > + return pwork->pctl && pwork->pctl->error; > > > +} > > > + > > > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, > > > + xfs_pwork_work_fn work_fn, const char *tag, > > > + unsigned int nr_threads); > > > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); > > > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); > > > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); > > > + > > > +#endif /* __XFS_PWORK_H__ */ > > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c > > > index a5b2260406a8..e4f3785f7a64 100644 > > > --- a/fs/xfs/xfs_qm.c > > > +++ b/fs/xfs/xfs_qm.c > > > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck( > > > flags |= XFS_PQUOTA_CHKD; > > > } > > > > > > - error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL); > > > + error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL); > > > if (error) > > > goto error_return; > > > > > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h > > > index ad7f9be13087..b555e045e2f4 100644 > > > --- a/fs/xfs/xfs_sysctl.h > > > +++ b/fs/xfs/xfs_sysctl.h > > > @@ -37,6 +37,9 @@ typedef struct xfs_param { > > > xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ > > > xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ > > > xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ > > > +#ifdef DEBUG > > > + xfs_sysctl_val_t pwork_threads; /* Parallel workqueue thread count */ > > > +#endif > > > } xfs_param_t; > > > > > > /* > > > @@ -82,6 +85,9 @@ enum { > > > extern xfs_param_t xfs_params; > > > > > > struct xfs_globals { > > > +#ifdef DEBUG > > > + int pwork_threads; /* parallel workqueue threads */ > > > +#endif > > > int log_recovery_delay; /* log recovery delay (secs) */ > > > int mount_delay; /* mount setup delay (secs) */ > > > bool bug_on_assert; /* BUG() the kernel on assert failure */ > > > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c > > > index cabda13f3c64..910e6b9cb1a7 100644 > > > --- a/fs/xfs/xfs_sysfs.c > > > +++ b/fs/xfs/xfs_sysfs.c > > > @@ -206,11 +206,51 @@ always_cow_show( > > > } > > > XFS_SYSFS_ATTR_RW(always_cow); > > > > > > +#ifdef DEBUG > > > +/* > > > + * Override how many threads the parallel work queue is allowed to create. > > > + * This has to be a debug-only global (instead of an errortag) because one of > > > + * the main users of parallel workqueues is mount time quotacheck. 
> > > + */ > > > +STATIC ssize_t > > > +pwork_threads_store( > > > + struct kobject *kobject, > > > + const char *buf, > > > + size_t count) > > > +{ > > > + int ret; > > > + int val; > > > + > > > + ret = kstrtoint(buf, 0, &val); > > > + if (ret) > > > + return ret; > > > + > > > + if (val < 0 || val > NR_CPUS) > > > + return -EINVAL; > > > + > > > + xfs_globals.pwork_threads = val; > > > + > > > + return count; > > > +} > > > + > > > +STATIC ssize_t > > > +pwork_threads_show( > > > + struct kobject *kobject, > > > + char *buf) > > > +{ > > > + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); > > > +} > > > +XFS_SYSFS_ATTR_RW(pwork_threads); > > > +#endif /* DEBUG */ > > > + > > > static struct attribute *xfs_dbg_attrs[] = { > > > ATTR_LIST(bug_on_assert), > > > ATTR_LIST(log_recovery_delay), > > > ATTR_LIST(mount_delay), > > > ATTR_LIST(always_cow), > > > +#ifdef DEBUG > > > + ATTR_LIST(pwork_threads), > > > +#endif > > > NULL, > > > }; > > > > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > > > index f9bb1d50bc0e..658cbade1998 100644 > > > --- a/fs/xfs/xfs_trace.h > > > +++ b/fs/xfs/xfs_trace.h > > > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec, > > > __entry->startino, __entry->freemask) > > > ) > > > > > > +TRACE_EVENT(xfs_pwork_init, > > > + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), > > > + TP_ARGS(mp, nr_threads, pid), > > > + TP_STRUCT__entry( > > > + __field(dev_t, dev) > > > + __field(unsigned int, nr_threads) > > > + __field(pid_t, pid) > > > + ), > > > + TP_fast_assign( > > > + __entry->dev = mp->m_super->s_dev; > > > + __entry->nr_threads = nr_threads; > > > + __entry->pid = pid; > > > + ), > > > + TP_printk("dev %d:%d nr_threads %u pid %u", > > > + MAJOR(__entry->dev), MINOR(__entry->dev), > > > + __entry->nr_threads, __entry->pid) > > > +) > > > + > > > #endif /* _TRACE_XFS_H */ > > > > > > #undef TRACE_INCLUDE_PATH > > >
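To illustrate the kind of cap being discussed above, here is a sketch of the heuristic with an arbitrary upper bound. XFS_PWORK_MAX_THREADS is a made-up constant and no particular cap was agreed on in this thread; the branch structure otherwise mirrors the patch.

	/* Hypothetical upper bound on automatic parallelism; not in the patch. */
	#define XFS_PWORK_MAX_THREADS	128

	unsigned int
	xfs_pwork_guess_datadev_parallelism(
		struct xfs_mount	*mp)
	{
		struct xfs_buftarg	*btp = mp->m_ddev_targp;
		unsigned int		threads;
		int			iomin;
		int			ioopt;

		if (blk_queue_nonrot(btp->bt_bdev->bd_queue)) {
			threads = num_online_cpus();
		} else if (mp->m_sb.sb_width && mp->m_sb.sb_unit) {
			threads = mp->m_sb.sb_width / mp->m_sb.sb_unit;
		} else {
			iomin = bdev_io_min(btp->bt_bdev);
			ioopt = bdev_io_opt(btp->bt_bdev);
			if (iomin && ioopt)
				threads = ioopt / iomin;
			else
				threads = 1;
		}

		/* Clamp the guess on very large machines. */
		return min_t(unsigned int, threads, XFS_PWORK_MAX_THREADS);
	}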
On Tue, Jun 11, 2019 at 09:13:40AM -0400, Brian Foster wrote: > On Mon, Jun 10, 2019 at 06:10:20PM -0700, Darrick J. Wong wrote: > > On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote: > > > On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote: > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > Create a parallel iwalk implementation and switch quotacheck to use it. > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > --- > > > > > > Interesting.. is there any commonality here with the ktask mechanism > > > that's been in progress? I've not followed the details, but I thought it > > > was a similar idea. The last post I see for that is here: > > > > > > https://marc.info/?l=linux-mm&m=154143701122927&w=2 > > > > Yes, xfs_pwork is... the result of ktask still not landing upstream > > after a couple of years. :( > > > > Heh, Ok. We could always port over to it if it ever does land. > > > > That aside, this all looks mostly fine to me. A few random thoughts.. > > > > > > > fs/xfs/Makefile | 1 > > > > fs/xfs/xfs_globals.c | 3 + > > > > fs/xfs/xfs_iwalk.c | 76 ++++++++++++++++++++++++++++++- > > > > fs/xfs/xfs_iwalk.h | 2 + > > > > fs/xfs/xfs_pwork.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > > > fs/xfs/xfs_pwork.h | 50 ++++++++++++++++++++ > > > > fs/xfs/xfs_qm.c | 2 - > > > > fs/xfs/xfs_sysctl.h | 6 ++ > > > > fs/xfs/xfs_sysfs.c | 40 ++++++++++++++++ > > > > fs/xfs/xfs_trace.h | 18 +++++++ > > > > 10 files changed, 317 insertions(+), 3 deletions(-) > > > > create mode 100644 fs/xfs/xfs_pwork.c > > > > create mode 100644 fs/xfs/xfs_pwork.h > > > > > > > > > > > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > > > > index 74d30ef0dbce..48940a27d4aa 100644 > > > > --- a/fs/xfs/Makefile > > > > +++ b/fs/xfs/Makefile > > > > @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ > > > > xfs_message.o \ > > > > xfs_mount.o \ > > > > xfs_mru_cache.o \ > > > > + xfs_pwork.o \ > > > > xfs_reflink.o \ > > > > xfs_stats.o \ > > > > xfs_super.o \ > > > > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c > > > > index d0d377384120..4f93f2c4dc38 100644 > > > > --- a/fs/xfs/xfs_globals.c > > > > +++ b/fs/xfs/xfs_globals.c > > > > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = { > > > > .fstrm_timer = { 1, 30*100, 3600*100}, > > > > .eofb_timer = { 1, 300, 3600*24}, > > > > .cowb_timer = { 1, 1800, 3600*24}, > > > > +#ifdef DEBUG > > > > + .pwork_threads = { 0, 0, NR_CPUS }, > > > > +#endif > > > > }; > > > > > > > > struct xfs_globals xfs_globals = { > > > > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c > > > > index 8595258b5001..71ee1628aa70 100644 > > > > --- a/fs/xfs/xfs_iwalk.c > > > > +++ b/fs/xfs/xfs_iwalk.c > > > > @@ -21,6 +21,7 @@ > > > > #include "xfs_health.h" > > > > #include "xfs_trans.h" > > > > #include "xfs_iwalk.h" > > > > +#include "xfs_pwork.h" > > > > > > > > /* > > > > * Walking Inodes in the Filesystem > > > > @@ -46,6 +47,9 @@ > > > > */ > > > > > > > > struct xfs_iwalk_ag { > > > > + /* parallel work control data; will be null if single threaded */ > > > > + struct xfs_pwork pwork; > > > > + > > > > struct xfs_mount *mp; > > > > struct xfs_trans *tp; > > > > > > > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs( > > > > trace_xfs_iwalk_ag_rec(mp, agno, irec); > > > > > > > > for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { > > > > + if (xfs_pwork_want_abort(&iwag->pwork)) > > > > + return 0; > > > > + > > > > /* Skip if this inode is free */ > > > > if (XFS_INOBT_MASK(j) & irec->ir_free) > > > > continue; > > > 
> @@ -360,7 +367,7 @@ xfs_iwalk_ag( > > > > agino = XFS_INO_TO_AGINO(mp, iwag->startino); > > > > error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > > > > > > > > - while (!error && has_more) { > > > > + while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) { > > > > struct xfs_inobt_rec_incore *irec; > > > > > > > > cond_resched(); > > > > @@ -409,7 +416,7 @@ xfs_iwalk_ag( > > > > xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > > > > > > > > /* Walk any records left behind in the cache. */ > > > > - if (iwag->nr_recs == 0 || error) > > > > + if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork)) > > > > return error; > > > > > > > > return xfs_iwalk_ag_recs(iwag); > > > > @@ -465,6 +472,7 @@ xfs_iwalk( > > > > .iwalk_fn = iwalk_fn, > > > > .data = data, > > > > .startino = startino, > > > > + .pwork = XFS_PWORK_SINGLE_THREADED, > > > > }; > > > > xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > > > int error; > > > > @@ -486,3 +494,67 @@ xfs_iwalk( > > > > xfs_iwalk_free(&iwag); > > > > return error; > > > > } > > > > + > > > > +/* Run per-thread iwalk work. */ > > > > +static int > > > > +xfs_iwalk_ag_work( > > > > + struct xfs_mount *mp, > > > > + struct xfs_pwork *pwork) > > > > +{ > > > > + struct xfs_iwalk_ag *iwag; > > > > + int error; > > > > + > > > > + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); > > > > + error = xfs_iwalk_alloc(iwag); > > > > + if (error) > > > > + goto out; > > > > > > In most cases this will never fail, but the error path if it does looks > > > slightly painful. I was thinking if we could move this up into > > > xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when > > > failure is imminent... > > > > > > > + > > > > + error = xfs_iwalk_ag(iwag); > > > > + xfs_iwalk_free(iwag); > > > > +out: > > > > + kmem_free(iwag); > > > > + return error; > > > > +} > > > > + > > > > +/* > > > > + * Walk all the inodes in the filesystem using multiple threads to process each > > > > + * AG. > > > > + */ > > > > +int > > > > +xfs_iwalk_threaded( > > > > + struct xfs_mount *mp, > > > > + xfs_ino_t startino, > > > > + xfs_iwalk_fn iwalk_fn, > > > > + unsigned int max_prefetch, > > > > + void *data) > > > > +{ > > > > + struct xfs_pwork_ctl pctl; > > > > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > > > + unsigned int nr_threads; > > > > + int error; > > > > + > > > > + ASSERT(agno < mp->m_sb.sb_agcount); > > > > + > > > > + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); > > > > + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", > > > > + nr_threads); > > > > + if (error) > > > > + return error; > > > > + > > > > + for (; agno < mp->m_sb.sb_agcount; agno++) { > > > > + struct xfs_iwalk_ag *iwag; > > > > + > > > > + iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); > > > > + iwag->mp = mp; > > > > + iwag->tp = NULL; > > > > + iwag->iwalk_fn = iwalk_fn; > > > > + iwag->data = data; > > > > + iwag->startino = startino; > > > > + iwag->recs = NULL; > > > > + xfs_iwalk_set_prefetch(iwag, max_prefetch); > > > > + xfs_pwork_queue(&pctl, &iwag->pwork); > > > > + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); > > > > + } > > > > > > ... but this is only bound by the number of AGs and so could result in a > > > large number of allocations. FWIW, I wouldn't expect that to be a > > > problem in the common case. I'm more thinking about the case of a > > > specially crafted filesystem designed to cause problems on mount. 
> > > > <nod> I thought about that, and decided that it wasn't a good idea to > > for each of the queued (but not processing) work items to be sitting on > > a bunch of memory because that memory can't be put to useful work. > > That's why I put it in xfs_iwalk_ag_work. > > > > Yep. > > > Also that would necessitate iwalk feeding a destructor to pwork so that > > it can deal with work items that were queued but never actually run. > > > > What's the scenario for queued jobs that never run? We have to run the > work item to free the iwag, so I'd assume we could free the record > buffer in the same place if we wanted to (which we don't :P). Doh! That was a thinko on my part. pwork always runs the work item, even if pctl->error != 0; it's up to the individual work item to check if we're aborted and run for the exits, which _iwalk_ag_work doesn't do. Erugh, I'll fix the documentation and _iwalk_ag_work. > > > > > > > + > > > > + return xfs_pwork_destroy(&pctl); > > > > +} > ... > > > > +/* > > > > + * Return the amount of parallelism that the data device can handle, or 0 for > > > > + * no limit. > > > > + */ > > > > +unsigned int > > > > +xfs_pwork_guess_datadev_parallelism( > > > > + struct xfs_mount *mp) > > > > +{ > > > > + struct xfs_buftarg *btp = mp->m_ddev_targp; > > > > + int iomin; > > > > + int ioopt; > > > > + > > > > + if (blk_queue_nonrot(btp->bt_bdev->bd_queue)) > > > > + return num_online_cpus(); > > > > + if (mp->m_sb.sb_width && mp->m_sb.sb_unit) > > > > + return mp->m_sb.sb_width / mp->m_sb.sb_unit; > > > > + iomin = bdev_io_min(btp->bt_bdev); > > > > + ioopt = bdev_io_opt(btp->bt_bdev); > > > > + if (iomin && ioopt) > > > > + return ioopt / iomin; > > > > + > > > > + return 1; > > > > > > Have you collected any performance data related to these heuristics? > > > > Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a > > single SSD, ~10% on a 4-way raid0). That wasn't really all that > > awesome, so I recorded a flame graph (see below) to find where the > > remaining overhead is. A lot of it was in xfs_iget, and I also noticed > > that deferred inode inactivation sped it up a little more. > > > > So if I follow the graphic correctly, you have 4 walker threads running > the quotacheck. The majority of overhead is the inode memory allocation > followed by inode buffer lookup and then dquot lookup slightly behind > that. If this is an SSD, the heuristic presumably set the thread count > based on the CPU count, right? Right. > Hmm, I can't tell from the image what happens down in xfs_buf_find(). Do > you have granular enough data to see whether these are buffer cache hits > or misses? I /think/ they're mostly hits, though I admit it's hard to read the 24000 pixel wide output, even after clicking on xfs_buf_read_map to zoom in a bit. > > > I > > > assume the feature is generally a win, but this also seems like we have > > > a large window of variance here. E.g., an SSD on a server with hundreds > > > of CPUs will enable as many threads as CPUs, but a single xTB spindle on > > > the same box may run single threaded (a quick check of a few local > > > devices all return an optimal I/O size of 0). Is there really no benefit > > > parallelizing some of that work in the spinning rust case? > > > > Single-spindle spinning rust got worse even with 2 threads because the > > heads ping-pong between AGs. It's not horrible with enough readahead, > > but it turns into a disaster when I disabled readahead, unsurprisingly. > > > > Ok. Is that tempered by multi-spindle devices? 
What about a raid5/6 like > device where we have a stripe unit/width set, but the device itself may > have concurrency characteristics more like a single spindle as opposed > to something like raid0? I'd have to study a wider variety of disk raid setups. I'm under the impression that the inode allocation code tries to allocate contiguous runs of inode clusters which will fill up stripe units (su), and then (if there's still space) it'll keep going into the next stripe? Hmm, I wonder how well inodes get distributed across stripes... > > > What about in the other direction where we might have a ton of threads > > > for inodes across AGs that all happen to be in the same project quota, > > > for example? > > > > I collected a flame graph... https://djwong.org/docs/quotacheck.svg > > > > I'm specifically thinking about a few random systems I've used recently > with hundreds of CPUs. I'm not sure those boxes actually have SSDs, but > if one did (and with enough AGs), I'm curious how this algorithm would > behave under those conditions. > > I guess this all boils down to trying to understand if/what breakdown > conditions might exist given the different possibilies allowed by the > current heuristic. I'm wondering if we should either restrict this > heuristic to enable concurrency specifically in the environments we've > shown it to have benefits and/or with some kind of reasonable cap to > limit unknown boundary conditions. E.g., the case of hundreds of AGs and > hundreds of CPUs on SSD seems like it could go really well (a big flash > raid0) or potentially really bad (one big SSD with a poorly configured > fs). Thoughts? <nod> I wish we had a better way for the block layer to give us even a rough estimate of how many iops a device can handle... or even the ability to help us train ourselves after the fact by measuring average latency over time vs. the last (say) 2s so that we could back off if we thought the device was getting congested. [Hmm, Johannes merged some "pressure stall information" thing a couple of releases back; I wonder if we can make use of that... I'll have a look and report back.] I suppose we could cap the number of cpus at some arbitrary high number for now (128?) and see how long it takes to hit scalability problems because of it, but we could also just let things scale and see who complains about quotacheck getting slower on their 500 CPU system with a nonrotational slate stone. I suspect "one big SSD with a poorly configured fs" is going to notice problems everywhere anyway. Admittedly this is yet another one of those cases where I turn something on and ... the bottleneck just shows up somewhere else now. I hadn't realized that iget overhead is so high. > > It turned out that the inode setup overhead in xfs_iget is so high that > > locking the dquot has negligible overhead. When I "fixed" quotacheck to > > read the quota information straight from the inode cluster buffer if the > > inode wasn't in memory, the runtime dropped by 60% but Dave warned me > > not to take us back to inode buffer aliasing hell. I also noted that > > if booting with mem=512M the memory reclamation overhead totally fries > > us regardless of parallelisation. > > > > I'm not familiar with the buffer aliasing problem.. I'm guessing this is > talking about risk of inconsistent in-core inodes with inode buffers..? > In any event, I agree that it's not worth risking integrity or > overcomplicating things for something like quotacheck. Correct. 
--D > Brian > > > --D > > > > > > > > Brian > > > > > > > +} > > > > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h > > > > new file mode 100644 > > > > index 000000000000..e0c1354a2d8c > > > > --- /dev/null > > > > +++ b/fs/xfs/xfs_pwork.h > > > > @@ -0,0 +1,50 @@ > > > > +// SPDX-License-Identifier: GPL-2.0+ > > > > +/* > > > > + * Copyright (C) 2019 Oracle. All Rights Reserved. > > > > + * Author: Darrick J. Wong <darrick.wong@oracle.com> > > > > + */ > > > > +#ifndef __XFS_PWORK_H__ > > > > +#define __XFS_PWORK_H__ > > > > + > > > > +struct xfs_pwork; > > > > +struct xfs_mount; > > > > + > > > > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); > > > > + > > > > +/* > > > > + * Parallel work coordination structure. > > > > + */ > > > > +struct xfs_pwork_ctl { > > > > + struct workqueue_struct *wq; > > > > + struct xfs_mount *mp; > > > > + xfs_pwork_work_fn work_fn; > > > > + int error; > > > > +}; > > > > + > > > > +/* > > > > + * Embed this parallel work control item inside your own work structure, > > > > + * then queue work with it. > > > > + */ > > > > +struct xfs_pwork { > > > > + struct work_struct work; > > > > + struct xfs_pwork_ctl *pctl; > > > > +}; > > > > + > > > > +#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } > > > > + > > > > +/* Have we been told to abort? */ > > > > +static inline bool > > > > +xfs_pwork_want_abort( > > > > + struct xfs_pwork *pwork) > > > > +{ > > > > + return pwork->pctl && pwork->pctl->error; > > > > +} > > > > + > > > > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, > > > > + xfs_pwork_work_fn work_fn, const char *tag, > > > > + unsigned int nr_threads); > > > > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); > > > > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); > > > > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); > > > > + > > > > +#endif /* __XFS_PWORK_H__ */ > > > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c > > > > index a5b2260406a8..e4f3785f7a64 100644 > > > > --- a/fs/xfs/xfs_qm.c > > > > +++ b/fs/xfs/xfs_qm.c > > > > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck( > > > > flags |= XFS_PQUOTA_CHKD; > > > > } > > > > > > > > - error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL); > > > > + error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL); > > > > if (error) > > > > goto error_return; > > > > > > > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h > > > > index ad7f9be13087..b555e045e2f4 100644 > > > > --- a/fs/xfs/xfs_sysctl.h > > > > +++ b/fs/xfs/xfs_sysctl.h > > > > @@ -37,6 +37,9 @@ typedef struct xfs_param { > > > > xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. 
*/ > > > > xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ > > > > xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ > > > > +#ifdef DEBUG > > > > + xfs_sysctl_val_t pwork_threads; /* Parallel workqueue thread count */ > > > > +#endif > > > > } xfs_param_t; > > > > > > > > /* > > > > @@ -82,6 +85,9 @@ enum { > > > > extern xfs_param_t xfs_params; > > > > > > > > struct xfs_globals { > > > > +#ifdef DEBUG > > > > + int pwork_threads; /* parallel workqueue threads */ > > > > +#endif > > > > int log_recovery_delay; /* log recovery delay (secs) */ > > > > int mount_delay; /* mount setup delay (secs) */ > > > > bool bug_on_assert; /* BUG() the kernel on assert failure */ > > > > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c > > > > index cabda13f3c64..910e6b9cb1a7 100644 > > > > --- a/fs/xfs/xfs_sysfs.c > > > > +++ b/fs/xfs/xfs_sysfs.c > > > > @@ -206,11 +206,51 @@ always_cow_show( > > > > } > > > > XFS_SYSFS_ATTR_RW(always_cow); > > > > > > > > +#ifdef DEBUG > > > > +/* > > > > + * Override how many threads the parallel work queue is allowed to create. > > > > + * This has to be a debug-only global (instead of an errortag) because one of > > > > + * the main users of parallel workqueues is mount time quotacheck. > > > > + */ > > > > +STATIC ssize_t > > > > +pwork_threads_store( > > > > + struct kobject *kobject, > > > > + const char *buf, > > > > + size_t count) > > > > +{ > > > > + int ret; > > > > + int val; > > > > + > > > > + ret = kstrtoint(buf, 0, &val); > > > > + if (ret) > > > > + return ret; > > > > + > > > > + if (val < 0 || val > NR_CPUS) > > > > + return -EINVAL; > > > > + > > > > + xfs_globals.pwork_threads = val; > > > > + > > > > + return count; > > > > +} > > > > + > > > > +STATIC ssize_t > > > > +pwork_threads_show( > > > > + struct kobject *kobject, > > > > + char *buf) > > > > +{ > > > > + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); > > > > +} > > > > +XFS_SYSFS_ATTR_RW(pwork_threads); > > > > +#endif /* DEBUG */ > > > > + > > > > static struct attribute *xfs_dbg_attrs[] = { > > > > ATTR_LIST(bug_on_assert), > > > > ATTR_LIST(log_recovery_delay), > > > > ATTR_LIST(mount_delay), > > > > ATTR_LIST(always_cow), > > > > +#ifdef DEBUG > > > > + ATTR_LIST(pwork_threads), > > > > +#endif > > > > NULL, > > > > }; > > > > > > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > > > > index f9bb1d50bc0e..658cbade1998 100644 > > > > --- a/fs/xfs/xfs_trace.h > > > > +++ b/fs/xfs/xfs_trace.h > > > > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec, > > > > __entry->startino, __entry->freemask) > > > > ) > > > > > > > > +TRACE_EVENT(xfs_pwork_init, > > > > + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), > > > > + TP_ARGS(mp, nr_threads, pid), > > > > + TP_STRUCT__entry( > > > > + __field(dev_t, dev) > > > > + __field(unsigned int, nr_threads) > > > > + __field(pid_t, pid) > > > > + ), > > > > + TP_fast_assign( > > > > + __entry->dev = mp->m_super->s_dev; > > > > + __entry->nr_threads = nr_threads; > > > > + __entry->pid = pid; > > > > + ), > > > > + TP_printk("dev %d:%d nr_threads %u pid %u", > > > > + MAJOR(__entry->dev), MINOR(__entry->dev), > > > > + __entry->nr_threads, __entry->pid) > > > > +) > > > > + > > > > #endif /* _TRACE_XFS_H */ > > > > > > > > #undef TRACE_INCLUDE_PATH > > > >
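To make the abort semantics discussed above concrete, here is a minimal sketch (an assumption about the shape of the promised follow-up, not the author's actual fix) of how xfs_iwalk_ag_work could honor xfs_pwork_want_abort() before allocating a record buffer or walking the AG, since pwork still runs every queued item even after pctl->error is set:

/*
 * Hedged sketch only: bail out of the per-AG work item as soon as another
 * thread has reported an error, instead of allocating the record buffer
 * and walking the AG for nothing.  The out: path still frees the iwag, so
 * queued-but-aborted items do not leak.
 */
static int
xfs_iwalk_ag_work(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct xfs_iwalk_ag	*iwag;
	int			error = 0;

	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);

	/* Another work item failed; run for the exits without doing work. */
	if (xfs_pwork_want_abort(pwork))
		goto out;

	error = xfs_iwalk_alloc(iwag);
	if (error)
		goto out;

	error = xfs_iwalk_ag(iwag);
	xfs_iwalk_free(iwag);
out:
	kmem_free(iwag);
	return error;
}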
On Tue, Jun 11, 2019 at 08:29:26AM -0700, Darrick J. Wong wrote: > On Tue, Jun 11, 2019 at 09:13:40AM -0400, Brian Foster wrote: > > On Mon, Jun 10, 2019 at 06:10:20PM -0700, Darrick J. Wong wrote: > > > On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote: > > > > On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote: > > > > > From: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > > > Create a parallel iwalk implementation and switch quotacheck to use it. > > > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > --- > > > > > > > > Interesting.. is there any commonality here with the ktask mechanism > > > > that's been in progress? I've not followed the details, but I thought it > > > > was a similar idea. The last post I see for that is here: > > > > > > > > https://marc.info/?l=linux-mm&m=154143701122927&w=2 > > > > > > Yes, xfs_pwork is... the result of ktask still not landing upstream > > > after a couple of years. :( > > > > > > > Heh, Ok. We could always port over to it if it ever does land. > > > > > > That aside, this all looks mostly fine to me. A few random thoughts.. > > > > > > > > > fs/xfs/Makefile | 1 > > > > > fs/xfs/xfs_globals.c | 3 + > > > > > fs/xfs/xfs_iwalk.c | 76 ++++++++++++++++++++++++++++++- > > > > > fs/xfs/xfs_iwalk.h | 2 + > > > > > fs/xfs/xfs_pwork.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > > > > fs/xfs/xfs_pwork.h | 50 ++++++++++++++++++++ > > > > > fs/xfs/xfs_qm.c | 2 - > > > > > fs/xfs/xfs_sysctl.h | 6 ++ > > > > > fs/xfs/xfs_sysfs.c | 40 ++++++++++++++++ > > > > > fs/xfs/xfs_trace.h | 18 +++++++ > > > > > 10 files changed, 317 insertions(+), 3 deletions(-) > > > > > create mode 100644 fs/xfs/xfs_pwork.c > > > > > create mode 100644 fs/xfs/xfs_pwork.h > > > > > > > > > > > > > > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > > > > > index 74d30ef0dbce..48940a27d4aa 100644 > > > > > --- a/fs/xfs/Makefile > > > > > +++ b/fs/xfs/Makefile > > > > > @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ > > > > > xfs_message.o \ > > > > > xfs_mount.o \ > > > > > xfs_mru_cache.o \ > > > > > + xfs_pwork.o \ > > > > > xfs_reflink.o \ > > > > > xfs_stats.o \ > > > > > xfs_super.o \ > > > > > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c > > > > > index d0d377384120..4f93f2c4dc38 100644 > > > > > --- a/fs/xfs/xfs_globals.c > > > > > +++ b/fs/xfs/xfs_globals.c > > > > > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = { > > > > > .fstrm_timer = { 1, 30*100, 3600*100}, > > > > > .eofb_timer = { 1, 300, 3600*24}, > > > > > .cowb_timer = { 1, 1800, 3600*24}, > > > > > +#ifdef DEBUG > > > > > + .pwork_threads = { 0, 0, NR_CPUS }, > > > > > +#endif > > > > > }; > > > > > > > > > > struct xfs_globals xfs_globals = { > > > > > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c > > > > > index 8595258b5001..71ee1628aa70 100644 > > > > > --- a/fs/xfs/xfs_iwalk.c > > > > > +++ b/fs/xfs/xfs_iwalk.c > > > > > @@ -21,6 +21,7 @@ > > > > > #include "xfs_health.h" > > > > > #include "xfs_trans.h" > > > > > #include "xfs_iwalk.h" > > > > > +#include "xfs_pwork.h" > > > > > > > > > > /* > > > > > * Walking Inodes in the Filesystem > > > > > @@ -46,6 +47,9 @@ > > > > > */ > > > > > > > > > > struct xfs_iwalk_ag { > > > > > + /* parallel work control data; will be null if single threaded */ > > > > > + struct xfs_pwork pwork; > > > > > + > > > > > struct xfs_mount *mp; > > > > > struct xfs_trans *tp; > > > > > > > > > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs( > > > > > trace_xfs_iwalk_ag_rec(mp, agno, irec); > > > > 
> > > > > > for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { > > > > > + if (xfs_pwork_want_abort(&iwag->pwork)) > > > > > + return 0; > > > > > + > > > > > /* Skip if this inode is free */ > > > > > if (XFS_INOBT_MASK(j) & irec->ir_free) > > > > > continue; > > > > > @@ -360,7 +367,7 @@ xfs_iwalk_ag( > > > > > agino = XFS_INO_TO_AGINO(mp, iwag->startino); > > > > > error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > > > > > > > > > > - while (!error && has_more) { > > > > > + while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) { > > > > > struct xfs_inobt_rec_incore *irec; > > > > > > > > > > cond_resched(); > > > > > @@ -409,7 +416,7 @@ xfs_iwalk_ag( > > > > > xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > > > > > > > > > > /* Walk any records left behind in the cache. */ > > > > > - if (iwag->nr_recs == 0 || error) > > > > > + if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork)) > > > > > return error; > > > > > > > > > > return xfs_iwalk_ag_recs(iwag); > > > > > @@ -465,6 +472,7 @@ xfs_iwalk( > > > > > .iwalk_fn = iwalk_fn, > > > > > .data = data, > > > > > .startino = startino, > > > > > + .pwork = XFS_PWORK_SINGLE_THREADED, > > > > > }; > > > > > xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > > > > int error; > > > > > @@ -486,3 +494,67 @@ xfs_iwalk( > > > > > xfs_iwalk_free(&iwag); > > > > > return error; > > > > > } > > > > > + > > > > > +/* Run per-thread iwalk work. */ > > > > > +static int > > > > > +xfs_iwalk_ag_work( > > > > > + struct xfs_mount *mp, > > > > > + struct xfs_pwork *pwork) > > > > > +{ > > > > > + struct xfs_iwalk_ag *iwag; > > > > > + int error; > > > > > + > > > > > + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); > > > > > + error = xfs_iwalk_alloc(iwag); > > > > > + if (error) > > > > > + goto out; > > > > > > > > In most cases this will never fail, but the error path if it does looks > > > > slightly painful. I was thinking if we could move this up into > > > > xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when > > > > failure is imminent... > > > > > > > > > + > > > > > + error = xfs_iwalk_ag(iwag); > > > > > + xfs_iwalk_free(iwag); > > > > > +out: > > > > > + kmem_free(iwag); > > > > > + return error; > > > > > +} > > > > > + > > > > > +/* > > > > > + * Walk all the inodes in the filesystem using multiple threads to process each > > > > > + * AG. 
> > > > > + */ > > > > > +int > > > > > +xfs_iwalk_threaded( > > > > > + struct xfs_mount *mp, > > > > > + xfs_ino_t startino, > > > > > + xfs_iwalk_fn iwalk_fn, > > > > > + unsigned int max_prefetch, > > > > > + void *data) > > > > > +{ > > > > > + struct xfs_pwork_ctl pctl; > > > > > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > > > > > + unsigned int nr_threads; > > > > > + int error; > > > > > + > > > > > + ASSERT(agno < mp->m_sb.sb_agcount); > > > > > + > > > > > + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); > > > > > + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", > > > > > + nr_threads); > > > > > + if (error) > > > > > + return error; > > > > > + > > > > > + for (; agno < mp->m_sb.sb_agcount; agno++) { > > > > > + struct xfs_iwalk_ag *iwag; > > > > > + > > > > > + iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); > > > > > + iwag->mp = mp; > > > > > + iwag->tp = NULL; > > > > > + iwag->iwalk_fn = iwalk_fn; > > > > > + iwag->data = data; > > > > > + iwag->startino = startino; > > > > > + iwag->recs = NULL; > > > > > + xfs_iwalk_set_prefetch(iwag, max_prefetch); > > > > > + xfs_pwork_queue(&pctl, &iwag->pwork); > > > > > + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); > > > > > + } > > > > > > > > ... but this is only bound by the number of AGs and so could result in a > > > > large number of allocations. FWIW, I wouldn't expect that to be a > > > > problem in the common case. I'm more thinking about the case of a > > > > specially crafted filesystem designed to cause problems on mount. > > > > > > <nod> I thought about that, and decided that it wasn't a good idea to > > > for each of the queued (but not processing) work items to be sitting on > > > a bunch of memory because that memory can't be put to useful work. > > > That's why I put it in xfs_iwalk_ag_work. > > > > > > > Yep. > > > > > Also that would necessitate iwalk feeding a destructor to pwork so that > > > it can deal with work items that were queued but never actually run. > > > > > > > What's the scenario for queued jobs that never run? We have to run the > > work item to free the iwag, so I'd assume we could free the record > > buffer in the same place if we wanted to (which we don't :P). > > Doh! That was a thinko on my part. pwork always runs the work item, > even if pctl->error != 0; it's up to the individual work item to check > if we're aborted and run for the exits, which _iwalk_ag_work doesn't do. > > Erugh, I'll fix the documentation and _iwalk_ag_work. > > > > > > > > > > + > > > > > + return xfs_pwork_destroy(&pctl); > > > > > +} > > ... > > > > > +/* > > > > > + * Return the amount of parallelism that the data device can handle, or 0 for > > > > > + * no limit. > > > > > + */ > > > > > +unsigned int > > > > > +xfs_pwork_guess_datadev_parallelism( > > > > > + struct xfs_mount *mp) > > > > > +{ > > > > > + struct xfs_buftarg *btp = mp->m_ddev_targp; > > > > > + int iomin; > > > > > + int ioopt; > > > > > + > > > > > + if (blk_queue_nonrot(btp->bt_bdev->bd_queue)) > > > > > + return num_online_cpus(); > > > > > + if (mp->m_sb.sb_width && mp->m_sb.sb_unit) > > > > > + return mp->m_sb.sb_width / mp->m_sb.sb_unit; > > > > > + iomin = bdev_io_min(btp->bt_bdev); > > > > > + ioopt = bdev_io_opt(btp->bt_bdev); > > > > > + if (iomin && ioopt) > > > > > + return ioopt / iomin; > > > > > + > > > > > + return 1; > > > > > > > > Have you collected any performance data related to these heuristics? 
> > > > > > Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a > > > single SSD, ~10% on a 4-way raid0). That wasn't really all that > > > awesome, so I recorded a flame graph (see below) to find where the > > > remaining overhead is. A lot of it was in xfs_iget, and I also noticed > > > that deferred inode inactivation sped it up a little more. > > > > > > > So if I follow the graphic correctly, you have 4 walker threads running > > the quotacheck. The majority of overhead is the inode memory allocation > > followed by inode buffer lookup and then dquot lookup slightly behind > > that. If this is an SSD, the heuristic presumably set the thread count > > based on the CPU count, right? > > Right. > > > Hmm, I can't tell from the image what happens down in xfs_buf_find(). Do > > you have granular enough data to see whether these are buffer cache hits > > or misses? > > I /think/ they're mostly hits, though I admit it's hard to read the > 24000 pixel wide output, even after clicking on xfs_buf_read_map to zoom > in a bit. > Ok, I guess it would be nice to see the source of that overhead for future reference. Maybe perf or something could show it better. > > > > I > > > > assume the feature is generally a win, but this also seems like we have > > > > a large window of variance here. E.g., an SSD on a server with hundreds > > > > of CPUs will enable as many threads as CPUs, but a single xTB spindle on > > > > the same box may run single threaded (a quick check of a few local > > > > devices all return an optimal I/O size of 0). Is there really no benefit > > > > parallelizing some of that work in the spinning rust case? > > > > > > Single-spindle spinning rust got worse even with 2 threads because the > > > heads ping-pong between AGs. It's not horrible with enough readahead, > > > but it turns into a disaster when I disabled readahead, unsurprisingly. > > > > > > > Ok. Is that tempered by multi-spindle devices? What about a raid5/6 like > > device where we have a stripe unit/width set, but the device itself may > > have concurrency characteristics more like a single spindle as opposed > > to something like raid0? > > I'd have to study a wider variety of disk raid setups. I'm under the > impression that the inode allocation code tries to allocate contiguous > runs of inode clusters which will fill up stripe units (su), and then > (if there's still space) it'll keep going into the next stripe? > That sounds about right to me. Note that I'm not suggesting we necessarily have to test every possible storage config to land this mechanism. I think it would be reasonable to to restrict the heuristic to conservatively enable cases we have tested so far (i.e., nonrotational with a handful of threads) and expand it from there over time as we have opportunity to test more configs. > Hmm, I wonder how well inodes get distributed across stripes... > > > > > What about in the other direction where we might have a ton of threads > > > > for inodes across AGs that all happen to be in the same project quota, > > > > for example? > > > > > > I collected a flame graph... https://djwong.org/docs/quotacheck.svg > > > > > > > I'm specifically thinking about a few random systems I've used recently > > with hundreds of CPUs. I'm not sure those boxes actually have SSDs, but > > if one did (and with enough AGs), I'm curious how this algorithm would > > behave under those conditions. 
> > > > I guess this all boils down to trying to understand if/what breakdown > > conditions might exist given the different possibilies allowed by the > > current heuristic. I'm wondering if we should either restrict this > > heuristic to enable concurrency specifically in the environments we've > > shown it to have benefits and/or with some kind of reasonable cap to > > limit unknown boundary conditions. E.g., the case of hundreds of AGs and > > hundreds of CPUs on SSD seems like it could go really well (a big flash > > raid0) or potentially really bad (one big SSD with a poorly configured > > fs). Thoughts? > > <nod> I wish we had a better way for the block layer to give us even a > rough estimate of how many iops a device can handle... or even the > ability to help us train ourselves after the fact by measuring average > latency over time vs. the last (say) 2s so that we could back off if we > thought the device was getting congested. > Yeah, we're kind of limited by the lack of detailed information on the storage setup. > [Hmm, Johannes merged some "pressure stall information" thing a couple > of releases back; I wonder if we can make use of that... I'll have a > look and report back.] > > I suppose we could cap the number of cpus at some arbitrary high number > for now (128?) and see how long it takes to hit scalability problems > because of it, but we could also just let things scale and see who > complains about quotacheck getting slower on their 500 CPU system with a > nonrotational slate stone. I suspect "one big SSD with a poorly > configured fs" is going to notice problems everywhere anyway. > Indeed, we could do that. I was thinking about a similar approach but from the opposite direction. I.e., rather than set a potential performance landmine (assuming there's some config out there that this would thrash), cap things at a few threads to start and expand out from there as dictated by testing, etc. As it is, quotacheck has been single threaded forever (AFAIA) anyways. I suppose with either approach we could also consider enabling the sysfs knob on production environments such that users have either the ability to speed up a currently slow quotacheck or recover from one that is thrashing due to too many workers (depending on which approach we take). We'd probably have to think a little bit harder about the interface if we wanted to do that, though. One obvious issue is that the thread count for one filesystem might be irrelevant for another, and quotacheck is a mount time operation. :/ I dunno.. I'm kind of curious to test this against some of the filesets I'm generating for the near allocation mode rework stuff. I only have a couple days before heading off for vacation though so realistically it might be a couple weeks before I have a chance to do that. It sounds like Dave has reviewed at least some of this work so far as well. Maybe he or others have some thoughts on how to approach the concurrency heuristic..? > Admittedly this is yet another one of those cases where I turn something > on and ... the bottleneck just shows up somewhere else now. I hadn't > realized that iget overhead is so high. > Same. Brian > > > It turned out that the inode setup overhead in xfs_iget is so high that > > > locking the dquot has negligible overhead. When I "fixed" quotacheck to > > > read the quota information straight from the inode cluster buffer if the > > > inode wasn't in memory, the runtime dropped by 60% but Dave warned me > > > not to take us back to inode buffer aliasing hell. 
I also noted that > > > if booting with mem=512M the memory reclamation overhead totally fries > > > us regardless of parallelisation. > > > > > > > I'm not familiar with the buffer aliasing problem.. I'm guessing this is > > talking about risk of inconsistent in-core inodes with inode buffers..? > > In any event, I agree that it's not worth risking integrity or > > overcomplicating things for something like quotacheck. > > Correct. > > --D > > > Brian > > > > > --D > > > > > > > > > > > Brian > > > > > > > > > +} > > > > > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h > > > > > new file mode 100644 > > > > > index 000000000000..e0c1354a2d8c > > > > > --- /dev/null > > > > > +++ b/fs/xfs/xfs_pwork.h > > > > > @@ -0,0 +1,50 @@ > > > > > +// SPDX-License-Identifier: GPL-2.0+ > > > > > +/* > > > > > + * Copyright (C) 2019 Oracle. All Rights Reserved. > > > > > + * Author: Darrick J. Wong <darrick.wong@oracle.com> > > > > > + */ > > > > > +#ifndef __XFS_PWORK_H__ > > > > > +#define __XFS_PWORK_H__ > > > > > + > > > > > +struct xfs_pwork; > > > > > +struct xfs_mount; > > > > > + > > > > > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); > > > > > + > > > > > +/* > > > > > + * Parallel work coordination structure. > > > > > + */ > > > > > +struct xfs_pwork_ctl { > > > > > + struct workqueue_struct *wq; > > > > > + struct xfs_mount *mp; > > > > > + xfs_pwork_work_fn work_fn; > > > > > + int error; > > > > > +}; > > > > > + > > > > > +/* > > > > > + * Embed this parallel work control item inside your own work structure, > > > > > + * then queue work with it. > > > > > + */ > > > > > +struct xfs_pwork { > > > > > + struct work_struct work; > > > > > + struct xfs_pwork_ctl *pctl; > > > > > +}; > > > > > + > > > > > +#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } > > > > > + > > > > > +/* Have we been told to abort? */ > > > > > +static inline bool > > > > > +xfs_pwork_want_abort( > > > > > + struct xfs_pwork *pwork) > > > > > +{ > > > > > + return pwork->pctl && pwork->pctl->error; > > > > > +} > > > > > + > > > > > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, > > > > > + xfs_pwork_work_fn work_fn, const char *tag, > > > > > + unsigned int nr_threads); > > > > > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); > > > > > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); > > > > > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); > > > > > + > > > > > +#endif /* __XFS_PWORK_H__ */ > > > > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c > > > > > index a5b2260406a8..e4f3785f7a64 100644 > > > > > --- a/fs/xfs/xfs_qm.c > > > > > +++ b/fs/xfs/xfs_qm.c > > > > > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck( > > > > > flags |= XFS_PQUOTA_CHKD; > > > > > } > > > > > > > > > > - error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL); > > > > > + error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL); > > > > > if (error) > > > > > goto error_return; > > > > > > > > > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h > > > > > index ad7f9be13087..b555e045e2f4 100644 > > > > > --- a/fs/xfs/xfs_sysctl.h > > > > > +++ b/fs/xfs/xfs_sysctl.h > > > > > @@ -37,6 +37,9 @@ typedef struct xfs_param { > > > > > xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. 
*/ > > > > > xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ > > > > > xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ > > > > > +#ifdef DEBUG > > > > > + xfs_sysctl_val_t pwork_threads; /* Parallel workqueue thread count */ > > > > > +#endif > > > > > } xfs_param_t; > > > > > > > > > > /* > > > > > @@ -82,6 +85,9 @@ enum { > > > > > extern xfs_param_t xfs_params; > > > > > > > > > > struct xfs_globals { > > > > > +#ifdef DEBUG > > > > > + int pwork_threads; /* parallel workqueue threads */ > > > > > +#endif > > > > > int log_recovery_delay; /* log recovery delay (secs) */ > > > > > int mount_delay; /* mount setup delay (secs) */ > > > > > bool bug_on_assert; /* BUG() the kernel on assert failure */ > > > > > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c > > > > > index cabda13f3c64..910e6b9cb1a7 100644 > > > > > --- a/fs/xfs/xfs_sysfs.c > > > > > +++ b/fs/xfs/xfs_sysfs.c > > > > > @@ -206,11 +206,51 @@ always_cow_show( > > > > > } > > > > > XFS_SYSFS_ATTR_RW(always_cow); > > > > > > > > > > +#ifdef DEBUG > > > > > +/* > > > > > + * Override how many threads the parallel work queue is allowed to create. > > > > > + * This has to be a debug-only global (instead of an errortag) because one of > > > > > + * the main users of parallel workqueues is mount time quotacheck. > > > > > + */ > > > > > +STATIC ssize_t > > > > > +pwork_threads_store( > > > > > + struct kobject *kobject, > > > > > + const char *buf, > > > > > + size_t count) > > > > > +{ > > > > > + int ret; > > > > > + int val; > > > > > + > > > > > + ret = kstrtoint(buf, 0, &val); > > > > > + if (ret) > > > > > + return ret; > > > > > + > > > > > + if (val < 0 || val > NR_CPUS) > > > > > + return -EINVAL; > > > > > + > > > > > + xfs_globals.pwork_threads = val; > > > > > + > > > > > + return count; > > > > > +} > > > > > + > > > > > +STATIC ssize_t > > > > > +pwork_threads_show( > > > > > + struct kobject *kobject, > > > > > + char *buf) > > > > > +{ > > > > > + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); > > > > > +} > > > > > +XFS_SYSFS_ATTR_RW(pwork_threads); > > > > > +#endif /* DEBUG */ > > > > > + > > > > > static struct attribute *xfs_dbg_attrs[] = { > > > > > ATTR_LIST(bug_on_assert), > > > > > ATTR_LIST(log_recovery_delay), > > > > > ATTR_LIST(mount_delay), > > > > > ATTR_LIST(always_cow), > > > > > +#ifdef DEBUG > > > > > + ATTR_LIST(pwork_threads), > > > > > +#endif > > > > > NULL, > > > > > }; > > > > > > > > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > > > > > index f9bb1d50bc0e..658cbade1998 100644 > > > > > --- a/fs/xfs/xfs_trace.h > > > > > +++ b/fs/xfs/xfs_trace.h > > > > > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec, > > > > > __entry->startino, __entry->freemask) > > > > > ) > > > > > > > > > > +TRACE_EVENT(xfs_pwork_init, > > > > > + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), > > > > > + TP_ARGS(mp, nr_threads, pid), > > > > > + TP_STRUCT__entry( > > > > > + __field(dev_t, dev) > > > > > + __field(unsigned int, nr_threads) > > > > > + __field(pid_t, pid) > > > > > + ), > > > > > + TP_fast_assign( > > > > > + __entry->dev = mp->m_super->s_dev; > > > > > + __entry->nr_threads = nr_threads; > > > > > + __entry->pid = pid; > > > > > + ), > > > > > + TP_printk("dev %d:%d nr_threads %u pid %u", > > > > > + MAJOR(__entry->dev), MINOR(__entry->dev), > > > > > + __entry->nr_threads, __entry->pid) > > > > > +) > > > > > + > > > > > #endif /* _TRACE_XFS_H */ > > > > > > > > > > #undef TRACE_INCLUDE_PATH > 
> > > >
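As a rough illustration of the capping idea floated in the exchange above, the parallelism guess could clamp the nonrotational case and leave untested configurations single threaded. This is a sketch under stated assumptions: the cap of 128 is the arbitrary number mentioned in the discussion, and falling back to one thread for everything else reflects the suggestion to enable only tested cases, not settled policy.

/*
 * Illustrative sketch only; not the patch's actual heuristic.  Keep the
 * nonrotational case but clamp it to an arbitrary cap, and stay single
 * threaded everywhere else until more storage configurations are tested.
 */
unsigned int
xfs_pwork_guess_datadev_parallelism(
	struct xfs_mount	*mp)
{
	struct xfs_buftarg	*btp = mp->m_ddev_targp;

	/* SSDs and similar: scale with CPUs, but bound the thread count. */
	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
		return min_t(unsigned int, num_online_cpus(), 128);

	/* Rotational and unknown devices: stay single threaded for now. */
	return 1;
}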
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 74d30ef0dbce..48940a27d4aa 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_pwork.o \ xfs_reflink.o \ xfs_stats.o \ xfs_super.o \ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index d0d377384120..4f93f2c4dc38 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -31,6 +31,9 @@ xfs_param_t xfs_params = { .fstrm_timer = { 1, 30*100, 3600*100}, .eofb_timer = { 1, 300, 3600*24}, .cowb_timer = { 1, 1800, 3600*24}, +#ifdef DEBUG + .pwork_threads = { 0, 0, NR_CPUS }, +#endif }; struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 8595258b5001..71ee1628aa70 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -21,6 +21,7 @@ #include "xfs_health.h" #include "xfs_trans.h" #include "xfs_iwalk.h" +#include "xfs_pwork.h" /* * Walking Inodes in the Filesystem @@ -46,6 +47,9 @@ */ struct xfs_iwalk_ag { + /* parallel work control data; will be null if single threaded */ + struct xfs_pwork pwork; + struct xfs_mount *mp; struct xfs_trans *tp; @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs( trace_xfs_iwalk_ag_rec(mp, agno, irec); for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { + if (xfs_pwork_want_abort(&iwag->pwork)) + return 0; + /* Skip if this inode is free */ if (XFS_INOBT_MASK(j) & irec->ir_free) continue; @@ -360,7 +367,7 @@ xfs_iwalk_ag( agino = XFS_INO_TO_AGINO(mp, iwag->startino); error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); - while (!error && has_more) { + while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) { struct xfs_inobt_rec_incore *irec; cond_resched(); @@ -409,7 +416,7 @@ xfs_iwalk_ag( xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); /* Walk any records left behind in the cache. */ - if (iwag->nr_recs == 0 || error) + if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork)) return error; return xfs_iwalk_ag_recs(iwag); @@ -465,6 +472,7 @@ xfs_iwalk( .iwalk_fn = iwalk_fn, .data = data, .startino = startino, + .pwork = XFS_PWORK_SINGLE_THREADED, }; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -486,3 +494,67 @@ xfs_iwalk( xfs_iwalk_free(&iwag); return error; } + +/* Run per-thread iwalk work. */ +static int +xfs_iwalk_ag_work( + struct xfs_mount *mp, + struct xfs_pwork *pwork) +{ + struct xfs_iwalk_ag *iwag; + int error; + + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); + error = xfs_iwalk_alloc(iwag); + if (error) + goto out; + + error = xfs_iwalk_ag(iwag); + xfs_iwalk_free(iwag); +out: + kmem_free(iwag); + return error; +} + +/* + * Walk all the inodes in the filesystem using multiple threads to process each + * AG. 
+ */ +int +xfs_iwalk_threaded( + struct xfs_mount *mp, + xfs_ino_t startino, + xfs_iwalk_fn iwalk_fn, + unsigned int max_prefetch, + void *data) +{ + struct xfs_pwork_ctl pctl; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + unsigned int nr_threads; + int error; + + ASSERT(agno < mp->m_sb.sb_agcount); + + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", + nr_threads); + if (error) + return error; + + for (; agno < mp->m_sb.sb_agcount; agno++) { + struct xfs_iwalk_ag *iwag; + + iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); + iwag->mp = mp; + iwag->tp = NULL; + iwag->iwalk_fn = iwalk_fn; + iwag->data = data; + iwag->startino = startino; + iwag->recs = NULL; + xfs_iwalk_set_prefetch(iwag, max_prefetch); + xfs_pwork_queue(&pctl, &iwag->pwork); + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + } + + return xfs_pwork_destroy(&pctl); +} diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h index 45b1baabcd2d..40233a05a766 100644 --- a/fs/xfs/xfs_iwalk.h +++ b/fs/xfs/xfs_iwalk.h @@ -14,5 +14,7 @@ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); +int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, + xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); #endif /* __XFS_IWALK_H__ */ diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c new file mode 100644 index 000000000000..19605a3a2482 --- /dev/null +++ b/fs/xfs/xfs_pwork.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trace.h" +#include "xfs_sysctl.h" +#include "xfs_pwork.h" + +/* + * Parallel Work Queue + * =================== + * + * Abstract away the details of running a large and "obviously" parallelizable + * task across multiple CPUs. Callers initialize the pwork control object with + * a desired level of parallelization and a work function. Next, they embed + * struct xfs_pwork in whatever structure they use to pass work context to a + * worker thread and queue that pwork. The work function will be passed the + * pwork item when it is run (from process context) and any returned error will + * cause all threads to abort. + * + * This is the rough equivalent of the xfsprogs workqueue code, though we can't + * reuse that name here. + */ + +/* Invoke our caller's function. */ +static void +xfs_pwork_work( + struct work_struct *work) +{ + struct xfs_pwork *pwork; + struct xfs_pwork_ctl *pctl; + int error; + + pwork = container_of(work, struct xfs_pwork, work); + pctl = pwork->pctl; + error = pctl->work_fn(pctl->mp, pwork); + if (error && !pctl->error) + pctl->error = error; +} + +/* + * Set up control data for parallel work. @work_fn is the function that will + * be called. @tag will be written into the kernel threads. @nr_threads is + * the level of parallelism desired, or 0 for no limit. 
+ */ +int +xfs_pwork_init( + struct xfs_mount *mp, + struct xfs_pwork_ctl *pctl, + xfs_pwork_work_fn work_fn, + const char *tag, + unsigned int nr_threads) +{ +#ifdef DEBUG + if (xfs_globals.pwork_threads > 0) + nr_threads = xfs_globals.pwork_threads; +#endif + trace_xfs_pwork_init(mp, nr_threads, current->pid); + + pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, + current->pid); + if (!pctl->wq) + return -ENOMEM; + pctl->work_fn = work_fn; + pctl->error = 0; + pctl->mp = mp; + + return 0; +} + +/* Queue some parallel work. */ +void +xfs_pwork_queue( + struct xfs_pwork_ctl *pctl, + struct xfs_pwork *pwork) +{ + INIT_WORK(&pwork->work, xfs_pwork_work); + pwork->pctl = pctl; + queue_work(pctl->wq, &pwork->work); +} + +/* Wait for the work to finish and tear down the control structure. */ +int +xfs_pwork_destroy( + struct xfs_pwork_ctl *pctl) +{ + destroy_workqueue(pctl->wq); + pctl->wq = NULL; + return pctl->error; +} + +/* + * Return the amount of parallelism that the data device can handle, or 0 for + * no limit. + */ +unsigned int +xfs_pwork_guess_datadev_parallelism( + struct xfs_mount *mp) +{ + struct xfs_buftarg *btp = mp->m_ddev_targp; + int iomin; + int ioopt; + + if (blk_queue_nonrot(btp->bt_bdev->bd_queue)) + return num_online_cpus(); + if (mp->m_sb.sb_width && mp->m_sb.sb_unit) + return mp->m_sb.sb_width / mp->m_sb.sb_unit; + iomin = bdev_io_min(btp->bt_bdev); + ioopt = bdev_io_opt(btp->bt_bdev); + if (iomin && ioopt) + return ioopt / iomin; + + return 1; +} diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h new file mode 100644 index 000000000000..e0c1354a2d8c --- /dev/null +++ b/fs/xfs/xfs_pwork.h @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_PWORK_H__ +#define __XFS_PWORK_H__ + +struct xfs_pwork; +struct xfs_mount; + +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); + +/* + * Parallel work coordination structure. + */ +struct xfs_pwork_ctl { + struct workqueue_struct *wq; + struct xfs_mount *mp; + xfs_pwork_work_fn work_fn; + int error; +}; + +/* + * Embed this parallel work control item inside your own work structure, + * then queue work with it. + */ +struct xfs_pwork { + struct work_struct work; + struct xfs_pwork_ctl *pctl; +}; + +#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } + +/* Have we been told to abort? 
*/ +static inline bool +xfs_pwork_want_abort( + struct xfs_pwork *pwork) +{ + return pwork->pctl && pwork->pctl->error; +} + +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, + xfs_pwork_work_fn work_fn, const char *tag, + unsigned int nr_threads); +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); + +#endif /* __XFS_PWORK_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a5b2260406a8..e4f3785f7a64 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck( flags |= XFS_PQUOTA_CHKD; } - error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL); + error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL); if (error) goto error_return; diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index ad7f9be13087..b555e045e2f4 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -37,6 +37,9 @@ typedef struct xfs_param { xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ +#ifdef DEBUG + xfs_sysctl_val_t pwork_threads; /* Parallel workqueue thread count */ +#endif } xfs_param_t; /* @@ -82,6 +85,9 @@ enum { extern xfs_param_t xfs_params; struct xfs_globals { +#ifdef DEBUG + int pwork_threads; /* parallel workqueue threads */ +#endif int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index cabda13f3c64..910e6b9cb1a7 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -206,11 +206,51 @@ always_cow_show( } XFS_SYSFS_ATTR_RW(always_cow); +#ifdef DEBUG +/* + * Override how many threads the parallel work queue is allowed to create. + * This has to be a debug-only global (instead of an errortag) because one of + * the main users of parallel workqueues is mount time quotacheck. 
+ */ +STATIC ssize_t +pwork_threads_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > NR_CPUS) + return -EINVAL; + + xfs_globals.pwork_threads = val; + + return count; +} + +STATIC ssize_t +pwork_threads_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); +} +XFS_SYSFS_ATTR_RW(pwork_threads); +#endif /* DEBUG */ + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), ATTR_LIST(always_cow), +#ifdef DEBUG + ATTR_LIST(pwork_threads), +#endif NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f9bb1d50bc0e..658cbade1998 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec, __entry->startino, __entry->freemask) ) +TRACE_EVENT(xfs_pwork_init, + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), + TP_ARGS(mp, nr_threads, pid), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, nr_threads) + __field(pid_t, pid) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->nr_threads = nr_threads; + __entry->pid = pid; + ), + TP_printk("dev %d:%d nr_threads %u pid %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_threads, __entry->pid) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH
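For reference, a hypothetical caller of the pwork API could look like the sketch below, following the pattern described in the xfs_pwork.h comment: embed struct xfs_pwork in a caller-defined work item, queue one item per unit of work, and wait in xfs_pwork_destroy(). The xfs_example_* names and the per-AG payload are invented for illustration; only the xfs_pwork_* calls, kmem_alloc/kmem_free, and the mount fields come from the patch.

/* Hypothetical work item embedding the pwork control structure. */
struct xfs_example_work {
	struct xfs_pwork	pwork;	/* embedded control item */
	xfs_agnumber_t		agno;	/* caller's private context */
};

static int
xfs_example_work_fn(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct xfs_example_work	*ew;

	ew = container_of(pwork, struct xfs_example_work, pwork);
	if (xfs_pwork_want_abort(pwork))
		goto out;
	/* ...process AG ew->agno here... */
out:
	kmem_free(ew);
	return 0;
}

static int
xfs_example_run(
	struct xfs_mount	*mp)
{
	struct xfs_pwork_ctl	pctl;
	xfs_agnumber_t		agno;
	int			error;

	error = xfs_pwork_init(mp, &pctl, xfs_example_work_fn, "xfs_example",
			xfs_pwork_guess_datadev_parallelism(mp));
	if (error)
		return error;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		struct xfs_example_work	*ew;

		ew = kmem_alloc(sizeof(*ew), KM_SLEEP);
		ew->agno = agno;
		xfs_pwork_queue(&pctl, &ew->pwork);
	}

	/* Waits for all queued items and returns the first reported error. */
	return xfs_pwork_destroy(&pctl);
}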