
[08/10] xfs: multithreaded iwalk implementation

Message ID 155968502066.1657646.3694276570612406995.stgit@magnolia
State Superseded
Series xfs: refactor and improve inode iteration

Commit Message

Darrick J. Wong June 4, 2019, 9:50 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Create a parallel iwalk implementation and switch quotacheck to use it.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/Makefile      |    1 
 fs/xfs/xfs_globals.c |    3 +
 fs/xfs/xfs_iwalk.c   |   76 ++++++++++++++++++++++++++++++-
 fs/xfs/xfs_iwalk.h   |    2 +
 fs/xfs/xfs_pwork.c   |  122 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_pwork.h   |   50 ++++++++++++++++++++
 fs/xfs/xfs_qm.c      |    2 -
 fs/xfs/xfs_sysctl.h  |    6 ++
 fs/xfs/xfs_sysfs.c   |   40 ++++++++++++++++
 fs/xfs/xfs_trace.h   |   18 +++++++
 10 files changed, 317 insertions(+), 3 deletions(-)
 create mode 100644 fs/xfs/xfs_pwork.c
 create mode 100644 fs/xfs/xfs_pwork.h

Comments

Brian Foster June 10, 2019, 7:40 p.m. UTC | #1
On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Create a parallel iwalk implementation and switch quotacheck to use it.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

Interesting.. is there any commonality here with the ktask mechanism
that's been in progress? I've not followed the details, but I thought it
was a similar idea. The last post I see for that is here:

https://marc.info/?l=linux-mm&m=154143701122927&w=2

That aside, this all looks mostly fine to me. A few random thoughts..

>  fs/xfs/Makefile      |    1 
>  fs/xfs/xfs_globals.c |    3 +
>  fs/xfs/xfs_iwalk.c   |   76 ++++++++++++++++++++++++++++++-
>  fs/xfs/xfs_iwalk.h   |    2 +
>  fs/xfs/xfs_pwork.c   |  122 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_pwork.h   |   50 ++++++++++++++++++++
>  fs/xfs/xfs_qm.c      |    2 -
>  fs/xfs/xfs_sysctl.h  |    6 ++
>  fs/xfs/xfs_sysfs.c   |   40 ++++++++++++++++
>  fs/xfs/xfs_trace.h   |   18 +++++++
>  10 files changed, 317 insertions(+), 3 deletions(-)
>  create mode 100644 fs/xfs/xfs_pwork.c
>  create mode 100644 fs/xfs/xfs_pwork.h
> 
> 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> index 74d30ef0dbce..48940a27d4aa 100644
> --- a/fs/xfs/Makefile
> +++ b/fs/xfs/Makefile
> @@ -84,6 +84,7 @@ xfs-y				+= xfs_aops.o \
>  				   xfs_message.o \
>  				   xfs_mount.o \
>  				   xfs_mru_cache.o \
> +				   xfs_pwork.o \
>  				   xfs_reflink.o \
>  				   xfs_stats.o \
>  				   xfs_super.o \
> diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
> index d0d377384120..4f93f2c4dc38 100644
> --- a/fs/xfs/xfs_globals.c
> +++ b/fs/xfs/xfs_globals.c
> @@ -31,6 +31,9 @@ xfs_param_t xfs_params = {
>  	.fstrm_timer	= {	1,		30*100,		3600*100},
>  	.eofb_timer	= {	1,		300,		3600*24},
>  	.cowb_timer	= {	1,		1800,		3600*24},
> +#ifdef DEBUG
> +	.pwork_threads	= {	0,		0,		NR_CPUS	},
> +#endif
>  };
>  
>  struct xfs_globals xfs_globals = {
> diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> index 8595258b5001..71ee1628aa70 100644
> --- a/fs/xfs/xfs_iwalk.c
> +++ b/fs/xfs/xfs_iwalk.c
> @@ -21,6 +21,7 @@
>  #include "xfs_health.h"
>  #include "xfs_trans.h"
>  #include "xfs_iwalk.h"
> +#include "xfs_pwork.h"
>  
>  /*
>   * Walking Inodes in the Filesystem
> @@ -46,6 +47,9 @@
>   */
>  
>  struct xfs_iwalk_ag {
> +	/* parallel work control data; will be null if single threaded */
> +	struct xfs_pwork		pwork;
> +
>  	struct xfs_mount		*mp;
>  	struct xfs_trans		*tp;
>  
> @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs(
>  		trace_xfs_iwalk_ag_rec(mp, agno, irec);
>  
>  		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
> +			if (xfs_pwork_want_abort(&iwag->pwork))
> +				return 0;
> +
>  			/* Skip if this inode is free */
>  			if (XFS_INOBT_MASK(j) & irec->ir_free)
>  				continue;
> @@ -360,7 +367,7 @@ xfs_iwalk_ag(
>  	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
>  	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
>  
> -	while (!error && has_more) {
> +	while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) {
>  		struct xfs_inobt_rec_incore	*irec;
>  
>  		cond_resched();
> @@ -409,7 +416,7 @@ xfs_iwalk_ag(
>  	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
>  
>  	/* Walk any records left behind in the cache. */
> -	if (iwag->nr_recs == 0 || error)
> +	if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork))
>  		return error;
>  
>  	return xfs_iwalk_ag_recs(iwag);
> @@ -465,6 +472,7 @@ xfs_iwalk(
>  		.iwalk_fn	= iwalk_fn,
>  		.data		= data,
>  		.startino	= startino,
> +		.pwork		= XFS_PWORK_SINGLE_THREADED,
>  	};
>  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
>  	int			error;
> @@ -486,3 +494,67 @@ xfs_iwalk(
>  	xfs_iwalk_free(&iwag);
>  	return error;
>  }
> +
> +/* Run per-thread iwalk work. */
> +static int
> +xfs_iwalk_ag_work(
> +	struct xfs_mount	*mp,
> +	struct xfs_pwork	*pwork)
> +{
> +	struct xfs_iwalk_ag	*iwag;
> +	int			error;
> +
> +	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
> +	error = xfs_iwalk_alloc(iwag);
> +	if (error)
> +		goto out;

In most cases this will never fail, but the error path if it does looks
slightly painful. I was thinking if we could move this up into
xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when
failure is imminent...
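
A rough sketch of that restructuring (illustrative only, not part of the
posted patch; it assumes xfs_iwalk_alloc()/xfs_iwalk_free() keep the
signatures used earlier in this series):

        int
        xfs_iwalk_threaded(
                struct xfs_mount        *mp,
                xfs_ino_t               startino,
                xfs_iwalk_fn            iwalk_fn,
                unsigned int            max_prefetch,
                void                    *data)
        {
                struct xfs_pwork_ctl    pctl;
                xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, startino);
                int                     error, ret;

                error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
                                xfs_pwork_guess_datadev_parallelism(mp));
                if (error)
                        return error;

                for (; agno < mp->m_sb.sb_agcount; agno++) {
                        struct xfs_iwalk_ag     *iwag;

                        iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
                        iwag->mp = mp;
                        iwag->tp = NULL;
                        iwag->iwalk_fn = iwalk_fn;
                        iwag->data = data;
                        iwag->startino = startino;
                        iwag->recs = NULL;
                        xfs_iwalk_set_prefetch(iwag, max_prefetch);
                        /*
                         * Allocate the record buffer before queueing so an
                         * ENOMEM stops the loop here instead of surfacing
                         * from a worker thread.
                         */
                        error = xfs_iwalk_alloc(iwag);
                        if (error) {
                                kmem_free(iwag);
                                break;
                        }
                        xfs_pwork_queue(&pctl, &iwag->pwork);
                        startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
                }

                /* flush whatever was queued; report the first error seen */
                ret = xfs_pwork_destroy(&pctl);
                return error ? error : ret;
        }

xfs_iwalk_ag_work() would then drop its own xfs_iwalk_alloc() call and
leave freeing of iwag->recs to xfs_iwalk_free() as it does now.
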

> +
> +	error = xfs_iwalk_ag(iwag);
> +	xfs_iwalk_free(iwag);
> +out:
> +	kmem_free(iwag);
> +	return error;
> +}
> +
> +/*
> + * Walk all the inodes in the filesystem using multiple threads to process each
> + * AG.
> + */
> +int
> +xfs_iwalk_threaded(
> +	struct xfs_mount	*mp,
> +	xfs_ino_t		startino,
> +	xfs_iwalk_fn		iwalk_fn,
> +	unsigned int		max_prefetch,
> +	void			*data)
> +{
> +	struct xfs_pwork_ctl	pctl;
> +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> +	unsigned int		nr_threads;
> +	int			error;
> +
> +	ASSERT(agno < mp->m_sb.sb_agcount);
> +
> +	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
> +	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
> +			nr_threads);
> +	if (error)
> +		return error;
> +
> +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> +		struct xfs_iwalk_ag	*iwag;
> +
> +		iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
> +		iwag->mp = mp;
> +		iwag->tp = NULL;
> +		iwag->iwalk_fn = iwalk_fn;
> +		iwag->data = data;
> +		iwag->startino = startino;
> +		iwag->recs = NULL;
> +		xfs_iwalk_set_prefetch(iwag, max_prefetch);
> +		xfs_pwork_queue(&pctl, &iwag->pwork);
> +		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> +	}

... but this is only bound by the number of AGs and so could result in a
large number of allocations. FWIW, I wouldn't expect that to be a
problem in the common case. I'm more thinking about the case of a
specially crafted filesystem designed to cause problems on mount.

> +
> +	return xfs_pwork_destroy(&pctl);
> +}
> diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
> index 45b1baabcd2d..40233a05a766 100644
> --- a/fs/xfs/xfs_iwalk.h
> +++ b/fs/xfs/xfs_iwalk.h
> @@ -14,5 +14,7 @@ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
>  
>  int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
>  		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
> +int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
> +		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
>  
>  #endif /* __XFS_IWALK_H__ */
> diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
> new file mode 100644
> index 000000000000..19605a3a2482
> --- /dev/null
> +++ b/fs/xfs/xfs_pwork.c
> @@ -0,0 +1,122 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> + */
> +#include "xfs.h"
> +#include "xfs_fs.h"
> +#include "xfs_shared.h"
> +#include "xfs_format.h"
> +#include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
> +#include "xfs_trace.h"
> +#include "xfs_sysctl.h"
> +#include "xfs_pwork.h"
> +
> +/*
> + * Parallel Work Queue
> + * ===================
> + *
> + * Abstract away the details of running a large and "obviously" parallelizable
> + * task across multiple CPUs.  Callers initialize the pwork control object with
> + * a desired level of parallelization and a work function.  Next, they embed
> + * struct xfs_pwork in whatever structure they use to pass work context to a
> + * worker thread and queue that pwork.  The work function will be passed the
> + * pwork item when it is run (from process context) and any returned error will
> + * cause all threads to abort.
> + * 

FYI minor whitespace damage (trailing space) on the line above.

> + * This is the rough equivalent of the xfsprogs workqueue code, though we can't
> + * reuse that name here.
> + */
> +
> +/* Invoke our caller's function. */
> +static void
> +xfs_pwork_work(
> +	struct work_struct	*work)
> +{
> +	struct xfs_pwork	*pwork;
> +	struct xfs_pwork_ctl	*pctl;
> +	int			error;
> +
> +	pwork = container_of(work, struct xfs_pwork, work);
> +	pctl = pwork->pctl;
> +	error = pctl->work_fn(pctl->mp, pwork);
> +	if (error && !pctl->error)
> +		pctl->error = error;
> +}
> +
> +/*
> + * Set up control data for parallel work.  @work_fn is the function that will
> + * be called.  @tag will be written into the kernel threads.  @nr_threads is
> + * the level of parallelism desired, or 0 for no limit.
> + */
> +int
> +xfs_pwork_init(
> +	struct xfs_mount	*mp,
> +	struct xfs_pwork_ctl	*pctl,
> +	xfs_pwork_work_fn	work_fn,
> +	const char		*tag,
> +	unsigned int		nr_threads)
> +{
> +#ifdef DEBUG
> +	if (xfs_globals.pwork_threads > 0)
> +		nr_threads = xfs_globals.pwork_threads;
> +#endif

Hmm, it might be useful to have the ability to force the no limit case
from the debug knob. Can we use -1 or something here for "disabled?"
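
One way to express that (illustrative sketch only; pwork_threads_store()
would also have to accept -1, e.g. by relaxing its range check to
"val < -1 || val > NR_CPUS"):

        /* inside the existing #ifdef DEBUG block in xfs_pwork_init() */
        if (xfs_globals.pwork_threads < 0)
                nr_threads = 0;         /* 0 == no limit for alloc_workqueue() */
        else if (xfs_globals.pwork_threads > 0)
                nr_threads = xfs_globals.pwork_threads;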

> +	trace_xfs_pwork_init(mp, nr_threads, current->pid);
> +
> +	pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
> +			current->pid);
> +	if (!pctl->wq)
> +		return -ENOMEM;
> +	pctl->work_fn = work_fn;
> +	pctl->error = 0;
> +	pctl->mp = mp;
> +
> +	return 0;
> +}
> +
> +/* Queue some parallel work. */
> +void
> +xfs_pwork_queue(
> +	struct xfs_pwork_ctl	*pctl,
> +	struct xfs_pwork	*pwork)
> +{
> +	INIT_WORK(&pwork->work, xfs_pwork_work);
> +	pwork->pctl = pctl;
> +	queue_work(pctl->wq, &pwork->work);
> +}
> +
> +/* Wait for the work to finish and tear down the control structure. */
> +int
> +xfs_pwork_destroy(
> +	struct xfs_pwork_ctl	*pctl)
> +{
> +	destroy_workqueue(pctl->wq);
> +	pctl->wq = NULL;
> +	return pctl->error;
> +}
> +
> +/*
> + * Return the amount of parallelism that the data device can handle, or 0 for
> + * no limit.
> + */
> +unsigned int
> +xfs_pwork_guess_datadev_parallelism(
> +	struct xfs_mount	*mp)
> +{
> +	struct xfs_buftarg	*btp = mp->m_ddev_targp;
> +	int			iomin;
> +	int			ioopt;
> +
> +	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
> +		return num_online_cpus();
> +	if (mp->m_sb.sb_width && mp->m_sb.sb_unit)
> +		return mp->m_sb.sb_width / mp->m_sb.sb_unit;
> +	iomin = bdev_io_min(btp->bt_bdev);
> +	ioopt = bdev_io_opt(btp->bt_bdev);
> +	if (iomin && ioopt)
> +		return ioopt / iomin;
> +
> +	return 1;

Have you collected any performance data related to these heuristics? I
assume the feature is generally a win, but this also seems like we have
a large window of variance here. E.g., an SSD on a server with hundreds
of CPUs will enable as many threads as CPUs, but a single xTB spindle on
the same box may run single threaded (a quick check of a few local
devices all return an optimal I/O size of 0). Is there really no benefit
parallelizing some of that work in the spinning rust case? What about in
the other direction where we might have a ton of threads for inodes
across AGs that all happen to be in the same project quota, for example?

Brian

> +}
> diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
> new file mode 100644
> index 000000000000..e0c1354a2d8c
> --- /dev/null
> +++ b/fs/xfs/xfs_pwork.h
> @@ -0,0 +1,50 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> + */
> +#ifndef __XFS_PWORK_H__
> +#define __XFS_PWORK_H__
> +
> +struct xfs_pwork;
> +struct xfs_mount;
> +
> +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
> +
> +/*
> + * Parallel work coordination structure.
> + */
> +struct xfs_pwork_ctl {
> +	struct workqueue_struct	*wq;
> +	struct xfs_mount	*mp;
> +	xfs_pwork_work_fn	work_fn;
> +	int			error;
> +};
> +
> +/*
> + * Embed this parallel work control item inside your own work structure,
> + * then queue work with it.
> + */
> +struct xfs_pwork {
> +	struct work_struct	work;
> +	struct xfs_pwork_ctl	*pctl;
> +};
> +
> +#define XFS_PWORK_SINGLE_THREADED	{ .pctl = NULL }
> +
> +/* Have we been told to abort? */
> +static inline bool
> +xfs_pwork_want_abort(
> +	struct xfs_pwork	*pwork)
> +{
> +	return pwork->pctl && pwork->pctl->error;
> +}
> +
> +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
> +		xfs_pwork_work_fn work_fn, const char *tag,
> +		unsigned int nr_threads);
> +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
> +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
> +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
> +
> +#endif /* __XFS_PWORK_H__ */
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index a5b2260406a8..e4f3785f7a64 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck(
>  		flags |= XFS_PQUOTA_CHKD;
>  	}
>  
> -	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
> +	error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL);
>  	if (error)
>  		goto error_return;
>  
> diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> index ad7f9be13087..b555e045e2f4 100644
> --- a/fs/xfs/xfs_sysctl.h
> +++ b/fs/xfs/xfs_sysctl.h
> @@ -37,6 +37,9 @@ typedef struct xfs_param {
>  	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
>  	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
>  	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
> +#ifdef DEBUG
> +	xfs_sysctl_val_t pwork_threads;	/* Parallel workqueue thread count */
> +#endif
>  } xfs_param_t;
>  
>  /*
> @@ -82,6 +85,9 @@ enum {
>  extern xfs_param_t	xfs_params;
>  
>  struct xfs_globals {
> +#ifdef DEBUG
> +	int	pwork_threads;		/* parallel workqueue threads */
> +#endif
>  	int	log_recovery_delay;	/* log recovery delay (secs) */
>  	int	mount_delay;		/* mount setup delay (secs) */
>  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> index cabda13f3c64..910e6b9cb1a7 100644
> --- a/fs/xfs/xfs_sysfs.c
> +++ b/fs/xfs/xfs_sysfs.c
> @@ -206,11 +206,51 @@ always_cow_show(
>  }
>  XFS_SYSFS_ATTR_RW(always_cow);
>  
> +#ifdef DEBUG
> +/*
> + * Override how many threads the parallel work queue is allowed to create.
> + * This has to be a debug-only global (instead of an errortag) because one of
> + * the main users of parallel workqueues is mount time quotacheck.
> + */
> +STATIC ssize_t
> +pwork_threads_store(
> +	struct kobject	*kobject,
> +	const char	*buf,
> +	size_t		count)
> +{
> +	int		ret;
> +	int		val;
> +
> +	ret = kstrtoint(buf, 0, &val);
> +	if (ret)
> +		return ret;
> +
> +	if (val < 0 || val > NR_CPUS)
> +		return -EINVAL;
> +
> +	xfs_globals.pwork_threads = val;
> +
> +	return count;
> +}
> +
> +STATIC ssize_t
> +pwork_threads_show(
> +	struct kobject	*kobject,
> +	char		*buf)
> +{
> +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
> +}
> +XFS_SYSFS_ATTR_RW(pwork_threads);
> +#endif /* DEBUG */
> +
>  static struct attribute *xfs_dbg_attrs[] = {
>  	ATTR_LIST(bug_on_assert),
>  	ATTR_LIST(log_recovery_delay),
>  	ATTR_LIST(mount_delay),
>  	ATTR_LIST(always_cow),
> +#ifdef DEBUG
> +	ATTR_LIST(pwork_threads),
> +#endif
>  	NULL,
>  };
>  
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index f9bb1d50bc0e..658cbade1998 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
>  		  __entry->startino, __entry->freemask)
>  )
>  
> +TRACE_EVENT(xfs_pwork_init,
> +	TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
> +	TP_ARGS(mp, nr_threads, pid),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(unsigned int, nr_threads)
> +		__field(pid_t, pid)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->nr_threads = nr_threads;
> +		__entry->pid = pid;
> +	),
> +	TP_printk("dev %d:%d nr_threads %u pid %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->nr_threads, __entry->pid)
> +)
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
>
Darrick J. Wong June 11, 2019, 1:10 a.m. UTC | #2
On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote:
> On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Create a parallel iwalk implementation and switch quotacheck to use it.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> Interesting.. is there any commonality here with the ktask mechanism
> that's been in progress? I've not followed the details, but I thought it
> was a similar idea. The last post I see for that is here:
> 
> https://marc.info/?l=linux-mm&m=154143701122927&w=2

Yes, xfs_pwork is... the result of ktask still not landing upstream
after a couple of years. :(

> That aside, this all looks mostly fine to me. A few random thoughts..
> 
> >  fs/xfs/Makefile      |    1 
> >  fs/xfs/xfs_globals.c |    3 +
> >  fs/xfs/xfs_iwalk.c   |   76 ++++++++++++++++++++++++++++++-
> >  fs/xfs/xfs_iwalk.h   |    2 +
> >  fs/xfs/xfs_pwork.c   |  122 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_pwork.h   |   50 ++++++++++++++++++++
> >  fs/xfs/xfs_qm.c      |    2 -
> >  fs/xfs/xfs_sysctl.h  |    6 ++
> >  fs/xfs/xfs_sysfs.c   |   40 ++++++++++++++++
> >  fs/xfs/xfs_trace.h   |   18 +++++++
> >  10 files changed, 317 insertions(+), 3 deletions(-)
> >  create mode 100644 fs/xfs/xfs_pwork.c
> >  create mode 100644 fs/xfs/xfs_pwork.h
> > 
> > 
> > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > index 74d30ef0dbce..48940a27d4aa 100644
> > --- a/fs/xfs/Makefile
> > +++ b/fs/xfs/Makefile
> > @@ -84,6 +84,7 @@ xfs-y				+= xfs_aops.o \
> >  				   xfs_message.o \
> >  				   xfs_mount.o \
> >  				   xfs_mru_cache.o \
> > +				   xfs_pwork.o \
> >  				   xfs_reflink.o \
> >  				   xfs_stats.o \
> >  				   xfs_super.o \
> > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
> > index d0d377384120..4f93f2c4dc38 100644
> > --- a/fs/xfs/xfs_globals.c
> > +++ b/fs/xfs/xfs_globals.c
> > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = {
> >  	.fstrm_timer	= {	1,		30*100,		3600*100},
> >  	.eofb_timer	= {	1,		300,		3600*24},
> >  	.cowb_timer	= {	1,		1800,		3600*24},
> > +#ifdef DEBUG
> > +	.pwork_threads	= {	0,		0,		NR_CPUS	},
> > +#endif
> >  };
> >  
> >  struct xfs_globals xfs_globals = {
> > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> > index 8595258b5001..71ee1628aa70 100644
> > --- a/fs/xfs/xfs_iwalk.c
> > +++ b/fs/xfs/xfs_iwalk.c
> > @@ -21,6 +21,7 @@
> >  #include "xfs_health.h"
> >  #include "xfs_trans.h"
> >  #include "xfs_iwalk.h"
> > +#include "xfs_pwork.h"
> >  
> >  /*
> >   * Walking Inodes in the Filesystem
> > @@ -46,6 +47,9 @@
> >   */
> >  
> >  struct xfs_iwalk_ag {
> > +	/* parallel work control data; will be null if single threaded */
> > +	struct xfs_pwork		pwork;
> > +
> >  	struct xfs_mount		*mp;
> >  	struct xfs_trans		*tp;
> >  
> > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs(
> >  		trace_xfs_iwalk_ag_rec(mp, agno, irec);
> >  
> >  		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
> > +			if (xfs_pwork_want_abort(&iwag->pwork))
> > +				return 0;
> > +
> >  			/* Skip if this inode is free */
> >  			if (XFS_INOBT_MASK(j) & irec->ir_free)
> >  				continue;
> > @@ -360,7 +367,7 @@ xfs_iwalk_ag(
> >  	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
> >  	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
> >  
> > -	while (!error && has_more) {
> > +	while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) {
> >  		struct xfs_inobt_rec_incore	*irec;
> >  
> >  		cond_resched();
> > @@ -409,7 +416,7 @@ xfs_iwalk_ag(
> >  	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
> >  
> >  	/* Walk any records left behind in the cache. */
> > -	if (iwag->nr_recs == 0 || error)
> > +	if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork))
> >  		return error;
> >  
> >  	return xfs_iwalk_ag_recs(iwag);
> > @@ -465,6 +472,7 @@ xfs_iwalk(
> >  		.iwalk_fn	= iwalk_fn,
> >  		.data		= data,
> >  		.startino	= startino,
> > +		.pwork		= XFS_PWORK_SINGLE_THREADED,
> >  	};
> >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> >  	int			error;
> > @@ -486,3 +494,67 @@ xfs_iwalk(
> >  	xfs_iwalk_free(&iwag);
> >  	return error;
> >  }
> > +
> > +/* Run per-thread iwalk work. */
> > +static int
> > +xfs_iwalk_ag_work(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_pwork	*pwork)
> > +{
> > +	struct xfs_iwalk_ag	*iwag;
> > +	int			error;
> > +
> > +	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
> > +	error = xfs_iwalk_alloc(iwag);
> > +	if (error)
> > +		goto out;
> 
> In most cases this will never fail, but the error path if it does looks
> slightly painful. I was thinking if we could move this up into
> xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when
> failure is imminent...
> 
> > +
> > +	error = xfs_iwalk_ag(iwag);
> > +	xfs_iwalk_free(iwag);
> > +out:
> > +	kmem_free(iwag);
> > +	return error;
> > +}
> > +
> > +/*
> > + * Walk all the inodes in the filesystem using multiple threads to process each
> > + * AG.
> > + */
> > +int
> > +xfs_iwalk_threaded(
> > +	struct xfs_mount	*mp,
> > +	xfs_ino_t		startino,
> > +	xfs_iwalk_fn		iwalk_fn,
> > +	unsigned int		max_prefetch,
> > +	void			*data)
> > +{
> > +	struct xfs_pwork_ctl	pctl;
> > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > +	unsigned int		nr_threads;
> > +	int			error;
> > +
> > +	ASSERT(agno < mp->m_sb.sb_agcount);
> > +
> > +	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
> > +	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
> > +			nr_threads);
> > +	if (error)
> > +		return error;
> > +
> > +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> > +		struct xfs_iwalk_ag	*iwag;
> > +
> > +		iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
> > +		iwag->mp = mp;
> > +		iwag->tp = NULL;
> > +		iwag->iwalk_fn = iwalk_fn;
> > +		iwag->data = data;
> > +		iwag->startino = startino;
> > +		iwag->recs = NULL;
> > +		xfs_iwalk_set_prefetch(iwag, max_prefetch);
> > +		xfs_pwork_queue(&pctl, &iwag->pwork);
> > +		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> > +	}
> 
> ... but this is only bound by the number of AGs and so could result in a
> large number of allocations. FWIW, I wouldn't expect that to be a
> problem in the common case. I'm more thinking about the case of a
> specially crafted filesystem designed to cause problems on mount.

<nod> I thought about that, and decided that it wasn't a good idea for
each of the queued (but not processing) work items to be sitting on a
bunch of memory because that memory can't be put to useful work.
That's why I put it in xfs_iwalk_ag_work.

Also that would necessitate iwalk feeding a destructor to pwork so that
it can deal with work items that were queued but never actually run.
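
(For context, the destructor plumbing being alluded to would look roughly
like the hypothetical sketch below; none of these fields exist in the
posted patch.)

        /* hypothetical hook for tearing down a work item that never ran */
        typedef void (*xfs_pwork_fini_fn)(struct xfs_pwork *pwork);

        struct xfs_pwork_ctl {
                struct workqueue_struct *wq;
                struct xfs_mount        *mp;
                xfs_pwork_work_fn       work_fn;
                xfs_pwork_fini_fn       fini_fn;        /* hypothetical */
                int                     error;
        };

with xfs_iwalk_threaded() supplying a helper that frees iwag->recs and
the iwag itself.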

> 
> > +
> > +	return xfs_pwork_destroy(&pctl);
> > +}
> > diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
> > index 45b1baabcd2d..40233a05a766 100644
> > --- a/fs/xfs/xfs_iwalk.h
> > +++ b/fs/xfs/xfs_iwalk.h
> > @@ -14,5 +14,7 @@ typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
> >  
> >  int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
> >  		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
> > +int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
> > +		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
> >  
> >  #endif /* __XFS_IWALK_H__ */
> > diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
> > new file mode 100644
> > index 000000000000..19605a3a2482
> > --- /dev/null
> > +++ b/fs/xfs/xfs_pwork.c
> > @@ -0,0 +1,122 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +/*
> > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > + */
> > +#include "xfs.h"
> > +#include "xfs_fs.h"
> > +#include "xfs_shared.h"
> > +#include "xfs_format.h"
> > +#include "xfs_log_format.h"
> > +#include "xfs_trans_resv.h"
> > +#include "xfs_mount.h"
> > +#include "xfs_trace.h"
> > +#include "xfs_sysctl.h"
> > +#include "xfs_pwork.h"
> > +
> > +/*
> > + * Parallel Work Queue
> > + * ===================
> > + *
> > + * Abstract away the details of running a large and "obviously" parallelizable
> > + * task across multiple CPUs.  Callers initialize the pwork control object with
> > + * a desired level of parallelization and a work function.  Next, they embed
> > + * struct xfs_pwork in whatever structure they use to pass work context to a
> > + * worker thread and queue that pwork.  The work function will be passed the
> > + * pwork item when it is run (from process context) and any returned error will
> > + * cause all threads to abort.
> > + * 
> 
> FYI minor whitespace damage (trailing space) on the line above.

Fixed.

> > + * This is the rough equivalent of the xfsprogs workqueue code, though we can't
> > + * reuse that name here.
> > + */
> > +
> > +/* Invoke our caller's function. */
> > +static void
> > +xfs_pwork_work(
> > +	struct work_struct	*work)
> > +{
> > +	struct xfs_pwork	*pwork;
> > +	struct xfs_pwork_ctl	*pctl;
> > +	int			error;
> > +
> > +	pwork = container_of(work, struct xfs_pwork, work);
> > +	pctl = pwork->pctl;
> > +	error = pctl->work_fn(pctl->mp, pwork);
> > +	if (error && !pctl->error)
> > +		pctl->error = error;
> > +}
> > +
> > +/*
> > + * Set up control data for parallel work.  @work_fn is the function that will
> > + * be called.  @tag will be written into the kernel threads.  @nr_threads is
> > + * the level of parallelism desired, or 0 for no limit.
> > + */
> > +int
> > +xfs_pwork_init(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_pwork_ctl	*pctl,
> > +	xfs_pwork_work_fn	work_fn,
> > +	const char		*tag,
> > +	unsigned int		nr_threads)
> > +{
> > +#ifdef DEBUG
> > +	if (xfs_globals.pwork_threads > 0)
> > +		nr_threads = xfs_globals.pwork_threads;
> > +#endif
> 
> Hmm, it might be useful to have the ability to force the no limit case
> from the debug knob. Can we use -1 or something here for "disabled?"

Ok.

> > +	trace_xfs_pwork_init(mp, nr_threads, current->pid);
> > +
> > +	pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
> > +			current->pid);
> > +	if (!pctl->wq)
> > +		return -ENOMEM;
> > +	pctl->work_fn = work_fn;
> > +	pctl->error = 0;
> > +	pctl->mp = mp;
> > +
> > +	return 0;
> > +}
> > +
> > +/* Queue some parallel work. */
> > +void
> > +xfs_pwork_queue(
> > +	struct xfs_pwork_ctl	*pctl,
> > +	struct xfs_pwork	*pwork)
> > +{
> > +	INIT_WORK(&pwork->work, xfs_pwork_work);
> > +	pwork->pctl = pctl;
> > +	queue_work(pctl->wq, &pwork->work);
> > +}
> > +
> > +/* Wait for the work to finish and tear down the control structure. */
> > +int
> > +xfs_pwork_destroy(
> > +	struct xfs_pwork_ctl	*pctl)
> > +{
> > +	destroy_workqueue(pctl->wq);
> > +	pctl->wq = NULL;
> > +	return pctl->error;
> > +}
> > +
> > +/*
> > + * Return the amount of parallelism that the data device can handle, or 0 for
> > + * no limit.
> > + */
> > +unsigned int
> > +xfs_pwork_guess_datadev_parallelism(
> > +	struct xfs_mount	*mp)
> > +{
> > +	struct xfs_buftarg	*btp = mp->m_ddev_targp;
> > +	int			iomin;
> > +	int			ioopt;
> > +
> > +	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
> > +		return num_online_cpus();
> > +	if (mp->m_sb.sb_width && mp->m_sb.sb_unit)
> > +		return mp->m_sb.sb_width / mp->m_sb.sb_unit;
> > +	iomin = bdev_io_min(btp->bt_bdev);
> > +	ioopt = bdev_io_opt(btp->bt_bdev);
> > +	if (iomin && ioopt)
> > +		return ioopt / iomin;
> > +
> > +	return 1;
> 
> Have you collected any performance data related to these heuristics?

Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a
single SSD, ~10% on a 4-way raid0).  That wasn't really all that
awesome, so I recorded a flame graph (see below) to find where the
remaining overhead is.  A lot of it was in xfs_iget, and I also noticed
that deferred inode inactivation sped it up a little more.

> I
> assume the feature is generally a win, but this also seems like we have
> a large window of variance here. E.g., an SSD on a server with hundreds
> of CPUs will enable as many threads as CPUs, but a single xTB spindle on
> the same box may run single threaded (a quick check of a few local
> devices all return an optimal I/O size of 0). Is there really no benefit
> parallelizing some of that work in the spinning rust case?

Single-spindle spinning rust got worse even with 2 threads because the
heads ping-pong between AGs.  It's not horrible with enough readahead,
but it turns into a disaster when I disabled readahead, unsurprisingly.

> What about in the other direction where we might have a ton of threads
> for inodes across AGs that all happen to be in the same project quota,
> for example?

I collected a flame graph... https://djwong.org/docs/quotacheck.svg

It turned out that the inode setup overhead in xfs_iget is so high that
locking the dquot has negligible overhead.  When I "fixed" quotacheck to
read the quota information straight from the inode cluster buffer if the
inode wasn't in memory, the runtime dropped by 60% but Dave warned me
not to take us back to inode buffer aliasing hell.  I also noted that
if booting with mem=512M the memory reclamation overhead totally fries
us regardless of parallelisation.

--D

> 
> Brian
> 
> > +}
> > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
> > new file mode 100644
> > index 000000000000..e0c1354a2d8c
> > --- /dev/null
> > +++ b/fs/xfs/xfs_pwork.h
> > @@ -0,0 +1,50 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +/*
> > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > + */
> > +#ifndef __XFS_PWORK_H__
> > +#define __XFS_PWORK_H__
> > +
> > +struct xfs_pwork;
> > +struct xfs_mount;
> > +
> > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
> > +
> > +/*
> > + * Parallel work coordination structure.
> > + */
> > +struct xfs_pwork_ctl {
> > +	struct workqueue_struct	*wq;
> > +	struct xfs_mount	*mp;
> > +	xfs_pwork_work_fn	work_fn;
> > +	int			error;
> > +};
> > +
> > +/*
> > + * Embed this parallel work control item inside your own work structure,
> > + * then queue work with it.
> > + */
> > +struct xfs_pwork {
> > +	struct work_struct	work;
> > +	struct xfs_pwork_ctl	*pctl;
> > +};
> > +
> > +#define XFS_PWORK_SINGLE_THREADED	{ .pctl = NULL }
> > +
> > +/* Have we been told to abort? */
> > +static inline bool
> > +xfs_pwork_want_abort(
> > +	struct xfs_pwork	*pwork)
> > +{
> > +	return pwork->pctl && pwork->pctl->error;
> > +}
> > +
> > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
> > +		xfs_pwork_work_fn work_fn, const char *tag,
> > +		unsigned int nr_threads);
> > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
> > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
> > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
> > +
> > +#endif /* __XFS_PWORK_H__ */
> > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > index a5b2260406a8..e4f3785f7a64 100644
> > --- a/fs/xfs/xfs_qm.c
> > +++ b/fs/xfs/xfs_qm.c
> > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck(
> >  		flags |= XFS_PQUOTA_CHKD;
> >  	}
> >  
> > -	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > +	error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL);
> >  	if (error)
> >  		goto error_return;
> >  
> > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> > index ad7f9be13087..b555e045e2f4 100644
> > --- a/fs/xfs/xfs_sysctl.h
> > +++ b/fs/xfs/xfs_sysctl.h
> > @@ -37,6 +37,9 @@ typedef struct xfs_param {
> >  	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
> >  	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
> >  	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
> > +#ifdef DEBUG
> > +	xfs_sysctl_val_t pwork_threads;	/* Parallel workqueue thread count */
> > +#endif
> >  } xfs_param_t;
> >  
> >  /*
> > @@ -82,6 +85,9 @@ enum {
> >  extern xfs_param_t	xfs_params;
> >  
> >  struct xfs_globals {
> > +#ifdef DEBUG
> > +	int	pwork_threads;		/* parallel workqueue threads */
> > +#endif
> >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> >  	int	mount_delay;		/* mount setup delay (secs) */
> >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> > index cabda13f3c64..910e6b9cb1a7 100644
> > --- a/fs/xfs/xfs_sysfs.c
> > +++ b/fs/xfs/xfs_sysfs.c
> > @@ -206,11 +206,51 @@ always_cow_show(
> >  }
> >  XFS_SYSFS_ATTR_RW(always_cow);
> >  
> > +#ifdef DEBUG
> > +/*
> > + * Override how many threads the parallel work queue is allowed to create.
> > + * This has to be a debug-only global (instead of an errortag) because one of
> > + * the main users of parallel workqueues is mount time quotacheck.
> > + */
> > +STATIC ssize_t
> > +pwork_threads_store(
> > +	struct kobject	*kobject,
> > +	const char	*buf,
> > +	size_t		count)
> > +{
> > +	int		ret;
> > +	int		val;
> > +
> > +	ret = kstrtoint(buf, 0, &val);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (val < 0 || val > NR_CPUS)
> > +		return -EINVAL;
> > +
> > +	xfs_globals.pwork_threads = val;
> > +
> > +	return count;
> > +}
> > +
> > +STATIC ssize_t
> > +pwork_threads_show(
> > +	struct kobject	*kobject,
> > +	char		*buf)
> > +{
> > +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
> > +}
> > +XFS_SYSFS_ATTR_RW(pwork_threads);
> > +#endif /* DEBUG */
> > +
> >  static struct attribute *xfs_dbg_attrs[] = {
> >  	ATTR_LIST(bug_on_assert),
> >  	ATTR_LIST(log_recovery_delay),
> >  	ATTR_LIST(mount_delay),
> >  	ATTR_LIST(always_cow),
> > +#ifdef DEBUG
> > +	ATTR_LIST(pwork_threads),
> > +#endif
> >  	NULL,
> >  };
> >  
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index f9bb1d50bc0e..658cbade1998 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
> >  		  __entry->startino, __entry->freemask)
> >  )
> >  
> > +TRACE_EVENT(xfs_pwork_init,
> > +	TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
> > +	TP_ARGS(mp, nr_threads, pid),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(unsigned int, nr_threads)
> > +		__field(pid_t, pid)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->nr_threads = nr_threads;
> > +		__entry->pid = pid;
> > +	),
> > +	TP_printk("dev %d:%d nr_threads %u pid %u",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > +		  __entry->nr_threads, __entry->pid)
> > +)
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> >
Brian Foster June 11, 2019, 1:13 p.m. UTC | #3
On Mon, Jun 10, 2019 at 06:10:20PM -0700, Darrick J. Wong wrote:
> On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote:
> > On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Create a parallel iwalk implementation and switch quotacheck to use it.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > 
> > Interesting.. is there any commonality here with the ktask mechanism
> > that's been in progress? I've not followed the details, but I thought it
> > was a similar idea. The last post I see for that is here:
> > 
> > https://marc.info/?l=linux-mm&m=154143701122927&w=2
> 
> Yes, xfs_pwork is... the result of ktask still not landing upstream
> after a couple of years. :(
> 

Heh, Ok. We could always port over to it if it ever does land.

> > That aside, this all looks mostly fine to me. A few random thoughts..
> > 
> > >  fs/xfs/Makefile      |    1 
> > >  fs/xfs/xfs_globals.c |    3 +
> > >  fs/xfs/xfs_iwalk.c   |   76 ++++++++++++++++++++++++++++++-
> > >  fs/xfs/xfs_iwalk.h   |    2 +
> > >  fs/xfs/xfs_pwork.c   |  122 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_pwork.h   |   50 ++++++++++++++++++++
> > >  fs/xfs/xfs_qm.c      |    2 -
> > >  fs/xfs/xfs_sysctl.h  |    6 ++
> > >  fs/xfs/xfs_sysfs.c   |   40 ++++++++++++++++
> > >  fs/xfs/xfs_trace.h   |   18 +++++++
> > >  10 files changed, 317 insertions(+), 3 deletions(-)
> > >  create mode 100644 fs/xfs/xfs_pwork.c
> > >  create mode 100644 fs/xfs/xfs_pwork.h
> > > 
> > > 
> > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > > index 74d30ef0dbce..48940a27d4aa 100644
> > > --- a/fs/xfs/Makefile
> > > +++ b/fs/xfs/Makefile
> > > @@ -84,6 +84,7 @@ xfs-y				+= xfs_aops.o \
> > >  				   xfs_message.o \
> > >  				   xfs_mount.o \
> > >  				   xfs_mru_cache.o \
> > > +				   xfs_pwork.o \
> > >  				   xfs_reflink.o \
> > >  				   xfs_stats.o \
> > >  				   xfs_super.o \
> > > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
> > > index d0d377384120..4f93f2c4dc38 100644
> > > --- a/fs/xfs/xfs_globals.c
> > > +++ b/fs/xfs/xfs_globals.c
> > > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = {
> > >  	.fstrm_timer	= {	1,		30*100,		3600*100},
> > >  	.eofb_timer	= {	1,		300,		3600*24},
> > >  	.cowb_timer	= {	1,		1800,		3600*24},
> > > +#ifdef DEBUG
> > > +	.pwork_threads	= {	0,		0,		NR_CPUS	},
> > > +#endif
> > >  };
> > >  
> > >  struct xfs_globals xfs_globals = {
> > > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> > > index 8595258b5001..71ee1628aa70 100644
> > > --- a/fs/xfs/xfs_iwalk.c
> > > +++ b/fs/xfs/xfs_iwalk.c
> > > @@ -21,6 +21,7 @@
> > >  #include "xfs_health.h"
> > >  #include "xfs_trans.h"
> > >  #include "xfs_iwalk.h"
> > > +#include "xfs_pwork.h"
> > >  
> > >  /*
> > >   * Walking Inodes in the Filesystem
> > > @@ -46,6 +47,9 @@
> > >   */
> > >  
> > >  struct xfs_iwalk_ag {
> > > +	/* parallel work control data; will be null if single threaded */
> > > +	struct xfs_pwork		pwork;
> > > +
> > >  	struct xfs_mount		*mp;
> > >  	struct xfs_trans		*tp;
> > >  
> > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs(
> > >  		trace_xfs_iwalk_ag_rec(mp, agno, irec);
> > >  
> > >  		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
> > > +			if (xfs_pwork_want_abort(&iwag->pwork))
> > > +				return 0;
> > > +
> > >  			/* Skip if this inode is free */
> > >  			if (XFS_INOBT_MASK(j) & irec->ir_free)
> > >  				continue;
> > > @@ -360,7 +367,7 @@ xfs_iwalk_ag(
> > >  	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
> > >  	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
> > >  
> > > -	while (!error && has_more) {
> > > +	while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) {
> > >  		struct xfs_inobt_rec_incore	*irec;
> > >  
> > >  		cond_resched();
> > > @@ -409,7 +416,7 @@ xfs_iwalk_ag(
> > >  	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
> > >  
> > >  	/* Walk any records left behind in the cache. */
> > > -	if (iwag->nr_recs == 0 || error)
> > > +	if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork))
> > >  		return error;
> > >  
> > >  	return xfs_iwalk_ag_recs(iwag);
> > > @@ -465,6 +472,7 @@ xfs_iwalk(
> > >  		.iwalk_fn	= iwalk_fn,
> > >  		.data		= data,
> > >  		.startino	= startino,
> > > +		.pwork		= XFS_PWORK_SINGLE_THREADED,
> > >  	};
> > >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > >  	int			error;
> > > @@ -486,3 +494,67 @@ xfs_iwalk(
> > >  	xfs_iwalk_free(&iwag);
> > >  	return error;
> > >  }
> > > +
> > > +/* Run per-thread iwalk work. */
> > > +static int
> > > +xfs_iwalk_ag_work(
> > > +	struct xfs_mount	*mp,
> > > +	struct xfs_pwork	*pwork)
> > > +{
> > > +	struct xfs_iwalk_ag	*iwag;
> > > +	int			error;
> > > +
> > > +	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
> > > +	error = xfs_iwalk_alloc(iwag);
> > > +	if (error)
> > > +		goto out;
> > 
> > In most cases this will never fail, but the error path if it does looks
> > slightly painful. I was thinking if we could move this up into
> > xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when
> > failure is imminent...
> > 
> > > +
> > > +	error = xfs_iwalk_ag(iwag);
> > > +	xfs_iwalk_free(iwag);
> > > +out:
> > > +	kmem_free(iwag);
> > > +	return error;
> > > +}
> > > +
> > > +/*
> > > + * Walk all the inodes in the filesystem using multiple threads to process each
> > > + * AG.
> > > + */
> > > +int
> > > +xfs_iwalk_threaded(
> > > +	struct xfs_mount	*mp,
> > > +	xfs_ino_t		startino,
> > > +	xfs_iwalk_fn		iwalk_fn,
> > > +	unsigned int		max_prefetch,
> > > +	void			*data)
> > > +{
> > > +	struct xfs_pwork_ctl	pctl;
> > > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > > +	unsigned int		nr_threads;
> > > +	int			error;
> > > +
> > > +	ASSERT(agno < mp->m_sb.sb_agcount);
> > > +
> > > +	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
> > > +	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
> > > +			nr_threads);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> > > +		struct xfs_iwalk_ag	*iwag;
> > > +
> > > +		iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
> > > +		iwag->mp = mp;
> > > +		iwag->tp = NULL;
> > > +		iwag->iwalk_fn = iwalk_fn;
> > > +		iwag->data = data;
> > > +		iwag->startino = startino;
> > > +		iwag->recs = NULL;
> > > +		xfs_iwalk_set_prefetch(iwag, max_prefetch);
> > > +		xfs_pwork_queue(&pctl, &iwag->pwork);
> > > +		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> > > +	}
> > 
> > ... but this is only bound by the number of AGs and so could result in a
> > large number of allocations. FWIW, I wouldn't expect that to be a
> > problem in the common case. I'm more thinking about the case of a
> > specially crafted filesystem designed to cause problems on mount.
> 
> <nod> I thought about that, and decided that it wasn't a good idea for
> each of the queued (but not processing) work items to be sitting on a
> bunch of memory because that memory can't be put to useful work.
> That's why I put it in xfs_iwalk_ag_work.
> 

Yep.

> Also that would necessitate iwalk feeding a destructor to pwork so that
> it can deal with work items that were queued but never actually run.
> 

What's the scenario for queued jobs that never run? We have to run the
work item to free the iwag, so I'd assume we could free the record
buffer in the same place if we wanted to (which we don't :P).

> > 
> > > +
> > > +	return xfs_pwork_destroy(&pctl);
> > > +}
...
> > > +/*
> > > + * Return the amount of parallelism that the data device can handle, or 0 for
> > > + * no limit.
> > > + */
> > > +unsigned int
> > > +xfs_pwork_guess_datadev_parallelism(
> > > +	struct xfs_mount	*mp)
> > > +{
> > > +	struct xfs_buftarg	*btp = mp->m_ddev_targp;
> > > +	int			iomin;
> > > +	int			ioopt;
> > > +
> > > +	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
> > > +		return num_online_cpus();
> > > +	if (mp->m_sb.sb_width && mp->m_sb.sb_unit)
> > > +		return mp->m_sb.sb_width / mp->m_sb.sb_unit;
> > > +	iomin = bdev_io_min(btp->bt_bdev);
> > > +	ioopt = bdev_io_opt(btp->bt_bdev);
> > > +	if (iomin && ioopt)
> > > +		return ioopt / iomin;
> > > +
> > > +	return 1;
> > 
> > Have you collected any performance data related to these heuristics?
> 
> Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a
> single SSD, ~10% on a 4-way raid0).  That wasn't really all that
> awesome, so I recorded a flame graph (see below) to find where the
> remaining overhead is.  A lot of it was in xfs_iget, and I also noticed
> that deferred inode inactivation sped it up a little more.
> 

So if I follow the graphic correctly, you have 4 walker threads running
the quotacheck. The majority of overhead is the inode memory allocation
followed by inode buffer lookup and then dquot lookup slightly behind
that. If this is an SSD, the heuristic presumably set the thread count
based on the CPU count, right?

Hmm, I can't tell from the image what happens down in xfs_buf_find(). Do
you have granular enough data to see whether these are buffer cache hits
or misses?

> > I
> > assume the feature is generally a win, but this also seems like we have
> > a large window of variance here. E.g., an SSD on a server with hundreds
> > of CPUs will enable as many threads as CPUs, but a single xTB spindle on
> > the same box may run single threaded (a quick check of a few local
> > devices all return an optimal I/O size of 0). Is there really no benefit
> > parallelizing some of that work in the spinning rust case?
> 
> Single-spindle spinning rust got worse even with 2 threads because the
> heads ping-pong between AGs.  It's not horrible with enough readahead,
> but it turns into a disaster when I disabled readahead, unsurprisingly.
> 

Ok. Is that tempered by multi-spindle devices? What about a raid5/6 like
device where we have a stripe unit/width set, but the device itself may
have concurrency characteristics more like a single spindle as opposed
to something like raid0?
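
(Illustrative numbers, not from the thread: a hypothetical 6-drive RAID-5
made with su=64k,sw=5 on 4k filesystem blocks would have sb_unit = 16 and
sb_width = 80, so the sb_width / sb_unit branch above would pick
80 / 16 = 5 threads, even though such an array may not sustain five
independent seek streams.)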

> > What about in the other direction where we might have a ton of threads
> > for inodes across AGs that all happen to be in the same project quota,
> > for example?
> 
> I collected a flame graph... https://djwong.org/docs/quotacheck.svg
> 

I'm specifically thinking about a few random systems I've used recently
with hundreds of CPUs. I'm not sure those boxes actually have SSDs, but
if one did (and with enough AGs), I'm curious how this algorithm would
behave under those conditions.

I guess this all boils down to trying to understand if/what breakdown
conditions might exist given the different possibilities allowed by the
current heuristic. I'm wondering if we should either restrict this
heuristic to enable concurrency specifically in the environments we've
shown it to have benefits and/or with some kind of reasonable cap to
limit unknown boundary conditions. E.g., the case of hundreds of AGs and
hundreds of CPUs on SSD seems like it could go really well (a big flash
raid0) or potentially really bad (one big SSD with a poorly configured
fs). Thoughts?
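
One illustrative way to express such a cap (XFS_PWORK_MAX_THREADS is a
hypothetical constant, not something proposed in this patch):

        #define XFS_PWORK_MAX_THREADS   8       /* hypothetical ceiling */

        if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
                return min_t(unsigned int, num_online_cpus(),
                                XFS_PWORK_MAX_THREADS);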

> It turned out that the inode setup overhead in xfs_iget is so high that
> locking the dquot has negligible overhead.  When I "fixed" quotacheck to
> read the quota information straight from the inode cluster buffer if the
> inode wasn't in memory, the runtime dropped by 60% but Dave warned me
> not to take us back to inode buffer aliasing hell.  I also noted that
> if booting with mem=512M the memory reclamation overhead totally fries
> us regardless of parallelisation.
> 

I'm not familiar with the buffer aliasing problem.. I'm guessing this is
talking about risk of inconsistent in-core inodes with inode buffers..?
In any event, I agree that it's not worth risking integrity or
overcomplicating things for something like quotacheck.

Brian

> --D
> 
> > 
> > Brian
> > 
> > > +}
> > > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
> > > new file mode 100644
> > > index 000000000000..e0c1354a2d8c
> > > --- /dev/null
> > > +++ b/fs/xfs/xfs_pwork.h
> > > @@ -0,0 +1,50 @@
> > > +// SPDX-License-Identifier: GPL-2.0+
> > > +/*
> > > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > > + */
> > > +#ifndef __XFS_PWORK_H__
> > > +#define __XFS_PWORK_H__
> > > +
> > > +struct xfs_pwork;
> > > +struct xfs_mount;
> > > +
> > > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
> > > +
> > > +/*
> > > + * Parallel work coordination structure.
> > > + */
> > > +struct xfs_pwork_ctl {
> > > +	struct workqueue_struct	*wq;
> > > +	struct xfs_mount	*mp;
> > > +	xfs_pwork_work_fn	work_fn;
> > > +	int			error;
> > > +};
> > > +
> > > +/*
> > > + * Embed this parallel work control item inside your own work structure,
> > > + * then queue work with it.
> > > + */
> > > +struct xfs_pwork {
> > > +	struct work_struct	work;
> > > +	struct xfs_pwork_ctl	*pctl;
> > > +};
> > > +
> > > +#define XFS_PWORK_SINGLE_THREADED	{ .pctl = NULL }
> > > +
> > > +/* Have we been told to abort? */
> > > +static inline bool
> > > +xfs_pwork_want_abort(
> > > +	struct xfs_pwork	*pwork)
> > > +{
> > > +	return pwork->pctl && pwork->pctl->error;
> > > +}
> > > +
> > > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
> > > +		xfs_pwork_work_fn work_fn, const char *tag,
> > > +		unsigned int nr_threads);
> > > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
> > > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
> > > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
> > > +
> > > +#endif /* __XFS_PWORK_H__ */
> > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > > index a5b2260406a8..e4f3785f7a64 100644
> > > --- a/fs/xfs/xfs_qm.c
> > > +++ b/fs/xfs/xfs_qm.c
> > > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck(
> > >  		flags |= XFS_PQUOTA_CHKD;
> > >  	}
> > >  
> > > -	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > > +	error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > >  	if (error)
> > >  		goto error_return;
> > >  
> > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> > > index ad7f9be13087..b555e045e2f4 100644
> > > --- a/fs/xfs/xfs_sysctl.h
> > > +++ b/fs/xfs/xfs_sysctl.h
> > > @@ -37,6 +37,9 @@ typedef struct xfs_param {
> > >  	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
> > >  	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
> > >  	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
> > > +#ifdef DEBUG
> > > +	xfs_sysctl_val_t pwork_threads;	/* Parallel workqueue thread count */
> > > +#endif
> > >  } xfs_param_t;
> > >  
> > >  /*
> > > @@ -82,6 +85,9 @@ enum {
> > >  extern xfs_param_t	xfs_params;
> > >  
> > >  struct xfs_globals {
> > > +#ifdef DEBUG
> > > +	int	pwork_threads;		/* parallel workqueue threads */
> > > +#endif
> > >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> > >  	int	mount_delay;		/* mount setup delay (secs) */
> > >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> > > index cabda13f3c64..910e6b9cb1a7 100644
> > > --- a/fs/xfs/xfs_sysfs.c
> > > +++ b/fs/xfs/xfs_sysfs.c
> > > @@ -206,11 +206,51 @@ always_cow_show(
> > >  }
> > >  XFS_SYSFS_ATTR_RW(always_cow);
> > >  
> > > +#ifdef DEBUG
> > > +/*
> > > + * Override how many threads the parallel work queue is allowed to create.
> > > + * This has to be a debug-only global (instead of an errortag) because one of
> > > + * the main users of parallel workqueues is mount time quotacheck.
> > > + */
> > > +STATIC ssize_t
> > > +pwork_threads_store(
> > > +	struct kobject	*kobject,
> > > +	const char	*buf,
> > > +	size_t		count)
> > > +{
> > > +	int		ret;
> > > +	int		val;
> > > +
> > > +	ret = kstrtoint(buf, 0, &val);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	if (val < 0 || val > NR_CPUS)
> > > +		return -EINVAL;
> > > +
> > > +	xfs_globals.pwork_threads = val;
> > > +
> > > +	return count;
> > > +}
> > > +
> > > +STATIC ssize_t
> > > +pwork_threads_show(
> > > +	struct kobject	*kobject,
> > > +	char		*buf)
> > > +{
> > > +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
> > > +}
> > > +XFS_SYSFS_ATTR_RW(pwork_threads);
> > > +#endif /* DEBUG */
> > > +
> > >  static struct attribute *xfs_dbg_attrs[] = {
> > >  	ATTR_LIST(bug_on_assert),
> > >  	ATTR_LIST(log_recovery_delay),
> > >  	ATTR_LIST(mount_delay),
> > >  	ATTR_LIST(always_cow),
> > > +#ifdef DEBUG
> > > +	ATTR_LIST(pwork_threads),
> > > +#endif
> > >  	NULL,
> > >  };
> > >  
> > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > index f9bb1d50bc0e..658cbade1998 100644
> > > --- a/fs/xfs/xfs_trace.h
> > > +++ b/fs/xfs/xfs_trace.h
> > > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
> > >  		  __entry->startino, __entry->freemask)
> > >  )
> > >  
> > > +TRACE_EVENT(xfs_pwork_init,
> > > +	TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
> > > +	TP_ARGS(mp, nr_threads, pid),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(unsigned int, nr_threads)
> > > +		__field(pid_t, pid)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = mp->m_super->s_dev;
> > > +		__entry->nr_threads = nr_threads;
> > > +		__entry->pid = pid;
> > > +	),
> > > +	TP_printk("dev %d:%d nr_threads %u pid %u",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > +		  __entry->nr_threads, __entry->pid)
> > > +)
> > > +
> > >  #endif /* _TRACE_XFS_H */
> > >  
> > >  #undef TRACE_INCLUDE_PATH
> > >
Darrick J. Wong June 11, 2019, 3:29 p.m. UTC | #4
On Tue, Jun 11, 2019 at 09:13:40AM -0400, Brian Foster wrote:
> On Mon, Jun 10, 2019 at 06:10:20PM -0700, Darrick J. Wong wrote:
> > On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote:
> > > On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Create a parallel iwalk implementation and switch quotacheck to use it.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > 
> > > Interesting.. is there any commonality here with the ktask mechanism
> > > that's been in progress? I've not followed the details, but I thought it
> > > was a similar idea. The last post I see for that is here:
> > > 
> > > https://marc.info/?l=linux-mm&m=154143701122927&w=2
> > 
> > Yes, xfs_pwork is... the result of ktask still not landing upstream
> > after a couple of years. :(
> > 
> 
> Heh, Ok. We could always port over to it if it ever does land.
> 
> > > That aside, this all looks mostly fine to me. A few random thoughts..
> > > 
> > > >  fs/xfs/Makefile      |    1 
> > > >  fs/xfs/xfs_globals.c |    3 +
> > > >  fs/xfs/xfs_iwalk.c   |   76 ++++++++++++++++++++++++++++++-
> > > >  fs/xfs/xfs_iwalk.h   |    2 +
> > > >  fs/xfs/xfs_pwork.c   |  122 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/xfs_pwork.h   |   50 ++++++++++++++++++++
> > > >  fs/xfs/xfs_qm.c      |    2 -
> > > >  fs/xfs/xfs_sysctl.h  |    6 ++
> > > >  fs/xfs/xfs_sysfs.c   |   40 ++++++++++++++++
> > > >  fs/xfs/xfs_trace.h   |   18 +++++++
> > > >  10 files changed, 317 insertions(+), 3 deletions(-)
> > > >  create mode 100644 fs/xfs/xfs_pwork.c
> > > >  create mode 100644 fs/xfs/xfs_pwork.h
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > > > index 74d30ef0dbce..48940a27d4aa 100644
> > > > --- a/fs/xfs/Makefile
> > > > +++ b/fs/xfs/Makefile
> > > > @@ -84,6 +84,7 @@ xfs-y				+= xfs_aops.o \
> > > >  				   xfs_message.o \
> > > >  				   xfs_mount.o \
> > > >  				   xfs_mru_cache.o \
> > > > +				   xfs_pwork.o \
> > > >  				   xfs_reflink.o \
> > > >  				   xfs_stats.o \
> > > >  				   xfs_super.o \
> > > > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
> > > > index d0d377384120..4f93f2c4dc38 100644
> > > > --- a/fs/xfs/xfs_globals.c
> > > > +++ b/fs/xfs/xfs_globals.c
> > > > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = {
> > > >  	.fstrm_timer	= {	1,		30*100,		3600*100},
> > > >  	.eofb_timer	= {	1,		300,		3600*24},
> > > >  	.cowb_timer	= {	1,		1800,		3600*24},
> > > > +#ifdef DEBUG
> > > > +	.pwork_threads	= {	0,		0,		NR_CPUS	},
> > > > +#endif
> > > >  };
> > > >  
> > > >  struct xfs_globals xfs_globals = {
> > > > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> > > > index 8595258b5001..71ee1628aa70 100644
> > > > --- a/fs/xfs/xfs_iwalk.c
> > > > +++ b/fs/xfs/xfs_iwalk.c
> > > > @@ -21,6 +21,7 @@
> > > >  #include "xfs_health.h"
> > > >  #include "xfs_trans.h"
> > > >  #include "xfs_iwalk.h"
> > > > +#include "xfs_pwork.h"
> > > >  
> > > >  /*
> > > >   * Walking Inodes in the Filesystem
> > > > @@ -46,6 +47,9 @@
> > > >   */
> > > >  
> > > >  struct xfs_iwalk_ag {
> > > > +	/* parallel work control data; will be null if single threaded */
> > > > +	struct xfs_pwork		pwork;
> > > > +
> > > >  	struct xfs_mount		*mp;
> > > >  	struct xfs_trans		*tp;
> > > >  
> > > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs(
> > > >  		trace_xfs_iwalk_ag_rec(mp, agno, irec);
> > > >  
> > > >  		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
> > > > +			if (xfs_pwork_want_abort(&iwag->pwork))
> > > > +				return 0;
> > > > +
> > > >  			/* Skip if this inode is free */
> > > >  			if (XFS_INOBT_MASK(j) & irec->ir_free)
> > > >  				continue;
> > > > @@ -360,7 +367,7 @@ xfs_iwalk_ag(
> > > >  	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
> > > >  	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
> > > >  
> > > > -	while (!error && has_more) {
> > > > +	while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) {
> > > >  		struct xfs_inobt_rec_incore	*irec;
> > > >  
> > > >  		cond_resched();
> > > > @@ -409,7 +416,7 @@ xfs_iwalk_ag(
> > > >  	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
> > > >  
> > > >  	/* Walk any records left behind in the cache. */
> > > > -	if (iwag->nr_recs == 0 || error)
> > > > +	if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork))
> > > >  		return error;
> > > >  
> > > >  	return xfs_iwalk_ag_recs(iwag);
> > > > @@ -465,6 +472,7 @@ xfs_iwalk(
> > > >  		.iwalk_fn	= iwalk_fn,
> > > >  		.data		= data,
> > > >  		.startino	= startino,
> > > > +		.pwork		= XFS_PWORK_SINGLE_THREADED,
> > > >  	};
> > > >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > > >  	int			error;
> > > > @@ -486,3 +494,67 @@ xfs_iwalk(
> > > >  	xfs_iwalk_free(&iwag);
> > > >  	return error;
> > > >  }
> > > > +
> > > > +/* Run per-thread iwalk work. */
> > > > +static int
> > > > +xfs_iwalk_ag_work(
> > > > +	struct xfs_mount	*mp,
> > > > +	struct xfs_pwork	*pwork)
> > > > +{
> > > > +	struct xfs_iwalk_ag	*iwag;
> > > > +	int			error;
> > > > +
> > > > +	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
> > > > +	error = xfs_iwalk_alloc(iwag);
> > > > +	if (error)
> > > > +		goto out;
> > > 
> > > In most cases this will never fail, but the error path if it does looks
> > > slightly painful. I was thinking if we could move this up into
> > > xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when
> > > failure is imminent...
> > > 
> > > > +
> > > > +	error = xfs_iwalk_ag(iwag);
> > > > +	xfs_iwalk_free(iwag);
> > > > +out:
> > > > +	kmem_free(iwag);
> > > > +	return error;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Walk all the inodes in the filesystem using multiple threads to process each
> > > > + * AG.
> > > > + */
> > > > +int
> > > > +xfs_iwalk_threaded(
> > > > +	struct xfs_mount	*mp,
> > > > +	xfs_ino_t		startino,
> > > > +	xfs_iwalk_fn		iwalk_fn,
> > > > +	unsigned int		max_prefetch,
> > > > +	void			*data)
> > > > +{
> > > > +	struct xfs_pwork_ctl	pctl;
> > > > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > > > +	unsigned int		nr_threads;
> > > > +	int			error;
> > > > +
> > > > +	ASSERT(agno < mp->m_sb.sb_agcount);
> > > > +
> > > > +	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
> > > > +	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
> > > > +			nr_threads);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> > > > +		struct xfs_iwalk_ag	*iwag;
> > > > +
> > > > +		iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
> > > > +		iwag->mp = mp;
> > > > +		iwag->tp = NULL;
> > > > +		iwag->iwalk_fn = iwalk_fn;
> > > > +		iwag->data = data;
> > > > +		iwag->startino = startino;
> > > > +		iwag->recs = NULL;
> > > > +		xfs_iwalk_set_prefetch(iwag, max_prefetch);
> > > > +		xfs_pwork_queue(&pctl, &iwag->pwork);
> > > > +		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> > > > +	}
> > > 
> > > ... but this is only bound by the number of AGs and so could result in a
> > > large number of allocations. FWIW, I wouldn't expect that to be a
> > > problem in the common case. I'm more thinking about the case of a
> > > specially crafted filesystem designed to cause problems on mount.
> > 
> > <nod> I thought about that, and decided that it wasn't a good idea to
> > for each of the queued (but not processing) work items to be sitting on
> > a bunch of memory because that memory can't be put to useful work.
> > That's why I put it in xfs_iwalk_ag_work.
> > 
> 
> Yep.
> 
> > Also that would necessitate iwalk feeding a destructor to pwork so that
> > it can deal with work items that were queued but never actually run.
> > 
> 
> What's the scenario for queued jobs that never run? We have to run the
> work item to free the iwag, so I'd assume we could free the record
> buffer in the same place if we wanted to (which we don't :P).

Doh!  That was a thinko on my part.  pwork always runs the work item,
even if pctl->error != 0; it's up to the individual work item to check
if we're aborted and run for the exits, which _iwalk_ag_work doesn't do.

Erugh, I'll fix the documentation and _iwalk_ag_work.
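
(For reference, roughly the shape I have in mind for the respin -- an
untested sketch, not the final patch: check for the abort before doing
any real work, so queued-but-aborted items just clean up and bail.)

/* Sketch only: run per-thread iwalk work, bailing early on abort. */
static int
xfs_iwalk_ag_work(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct xfs_iwalk_ag	*iwag;
	int			error = 0;

	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);

	/* Run for the exits if another worker already hit an error. */
	if (xfs_pwork_want_abort(pwork))
		goto out;

	error = xfs_iwalk_alloc(iwag);
	if (error)
		goto out;

	error = xfs_iwalk_ag(iwag);
	xfs_iwalk_free(iwag);
out:
	kmem_free(iwag);
	return error;
}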

> > > 
> > > > +
> > > > +	return xfs_pwork_destroy(&pctl);
> > > > +}
> ...
> > > > +/*
> > > > + * Return the amount of parallelism that the data device can handle, or 0 for
> > > > + * no limit.
> > > > + */
> > > > +unsigned int
> > > > +xfs_pwork_guess_datadev_parallelism(
> > > > +	struct xfs_mount	*mp)
> > > > +{
> > > > +	struct xfs_buftarg	*btp = mp->m_ddev_targp;
> > > > +	int			iomin;
> > > > +	int			ioopt;
> > > > +
> > > > +	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
> > > > +		return num_online_cpus();
> > > > +	if (mp->m_sb.sb_width && mp->m_sb.sb_unit)
> > > > +		return mp->m_sb.sb_width / mp->m_sb.sb_unit;
> > > > +	iomin = bdev_io_min(btp->bt_bdev);
> > > > +	ioopt = bdev_io_opt(btp->bt_bdev);
> > > > +	if (iomin && ioopt)
> > > > +		return ioopt / iomin;
> > > > +
> > > > +	return 1;
> > > 
> > > Have you collected any performance data related to these heuristics?
> > 
> > Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a
> > single SSD, ~10% on a 4-way raid0).  That wasn't really all that
> > awesome, so I recorded a flame graph (see below) to find where the
> > remaining overhead is.  A lot of it was in xfs_iget, and I also noticed
> > that deferred inode inactivation sped it up a little more.
> > 
> 
> So if I follow the graphic correctly, you have 4 walker threads running
> the quotacheck. The majority of overhead is the inode memory allocation
> followed by inode buffer lookup and then dquot lookup slightly behind
> that. If this is an SSD, the heuristic presumably set the thread count
> based on the CPU count, right?

Right.

> Hmm, I can't tell from the image what happens down in xfs_buf_find(). Do
> you have granular enough data to see whether these are buffer cache hits
> or misses?

I /think/ they're mostly hits, though I admit it's hard to read the
24000 pixel wide output, even after clicking on xfs_buf_read_map to zoom
in a bit.

> > > I
> > > assume the feature is generally a win, but this also seems like we have
> > > a large window of variance here. E.g., an SSD on a server with hundreds
> > > of CPUs will enable as many threads as CPUs, but a single xTB spindle on
> > > the same box may run single threaded (a quick check of a few local
> > > devices all return an optimal I/O size of 0). Is there really no benefit
> > > parallelizing some of that work in the spinning rust case?
> > 
> > Single-spindle spinning rust got worse even with 2 threads because the
> > heads ping-pong between AGs.  It's not horrible with enough readahead,
> > but it turns into a disaster when I disabled readahead, unsurprisingly.
> > 
> 
> Ok. Is that tempered by multi-spindle devices? What about a raid5/6 like
> device where we have a stripe unit/width set, but the device itself may
> have concurrency characteristics more like a single spindle as opposed
> to something like raid0?

I'd have to study a wider variety of disk raid setups.  I'm under the
impression that the inode allocation code tries to allocate contiguous
runs of inode clusters which will fill up stripe units (su), and then
(if there's still space) it'll keep going into the next stripe?

Hmm, I wonder how well inodes get distributed across stripes...

> > > What about in the other direction where we might have a ton of threads
> > > for inodes across AGs that all happen to be in the same project quota,
> > > for example?
> > 
> > I collected a flame graph... https://djwong.org/docs/quotacheck.svg
> > 
> 
> I'm specifically thinking about a few random systems I've used recently
> with hundreds of CPUs. I'm not sure those boxes actually have SSDs, but
> if one did (and with enough AGs), I'm curious how this algorithm would
> behave under those conditions.
>
> I guess this all boils down to trying to understand if/what breakdown
> conditions might exist given the different possibilities allowed by the
> current heuristic. I'm wondering if we should either restrict this
> heuristic to enable concurrency specifically in the environments we've
> shown it to have benefits and/or with some kind of reasonable cap to
> limit unknown boundary conditions. E.g., the case of hundreds of AGs and
> hundreds of CPUs on SSD seems like it could go really well (a big flash
> raid0) or potentially really bad (one big SSD with a poorly configured
> fs). Thoughts?

<nod> I wish we had a better way for the block layer to give us even a
rough estimate of how many iops a device can handle... or even the
ability to help us train ourselves after the fact by measuring average
latency over time vs. the last (say) 2s so that we could back off if we
thought the device was getting congested.

[Hmm, Johannes merged some "pressure stall information" thing a couple
of releases back; I wonder if we can make use of that... I'll have a
look and report back.]

I suppose we could cap the number of cpus at some arbitrary high number
for now (128?) and see how long it takes to hit scalability problems
because of it, but we could also just let things scale and see who
complains about quotacheck getting slower on their 500 CPU system with a
nonrotational slate stone.  I suspect "one big SSD with a poorly
configured fs" is going to notice problems everywhere anyway.
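
(If we went the cap route it would be something as dumb as this on top
of the existing guess -- purely illustrative; the 128 and the macro name
are made up:)

#define XFS_PWORK_MAX_THREADS	128

	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
	nr_threads = min_t(unsigned int, nr_threads, XFS_PWORK_MAX_THREADS);
	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
			nr_threads);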

Admittedly this is yet another one of those cases where I turn something
on and ... the bottleneck just shows up somewhere else now.  I hadn't
realized that iget overhead is so high.

> > It turned out that the inode setup overhead in xfs_iget is so high that
> > locking the dquot has negligible overhead.  When I "fixed" quotacheck to
> > read the quota information straight from the inode cluster buffer if the
> > inode wasn't in memory, the runtime dropped by 60% but Dave warned me
> > not to take us back to inode buffer aliasing hell.  I also noted that
> > if booting with mem=512M the memory reclamation overhead totally fries
> > us regardless of parallelisation.
> > 
> 
> I'm not familiar with the buffer aliasing problem.. I'm guessing this is
> talking about risk of inconsistent in-core inodes with inode buffers..?
> In any event, I agree that it's not worth risking integrity or
> overcomplicating things for something like quotacheck.

Correct.

--D

> Brian
> 
> > --D
> > 
> > > 
> > > Brian
> > > 
> > > > +}
> > > > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
> > > > new file mode 100644
> > > > index 000000000000..e0c1354a2d8c
> > > > --- /dev/null
> > > > +++ b/fs/xfs/xfs_pwork.h
> > > > @@ -0,0 +1,50 @@
> > > > +// SPDX-License-Identifier: GPL-2.0+
> > > > +/*
> > > > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > > > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > > > + */
> > > > +#ifndef __XFS_PWORK_H__
> > > > +#define __XFS_PWORK_H__
> > > > +
> > > > +struct xfs_pwork;
> > > > +struct xfs_mount;
> > > > +
> > > > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
> > > > +
> > > > +/*
> > > > + * Parallel work coordination structure.
> > > > + */
> > > > +struct xfs_pwork_ctl {
> > > > +	struct workqueue_struct	*wq;
> > > > +	struct xfs_mount	*mp;
> > > > +	xfs_pwork_work_fn	work_fn;
> > > > +	int			error;
> > > > +};
> > > > +
> > > > +/*
> > > > + * Embed this parallel work control item inside your own work structure,
> > > > + * then queue work with it.
> > > > + */
> > > > +struct xfs_pwork {
> > > > +	struct work_struct	work;
> > > > +	struct xfs_pwork_ctl	*pctl;
> > > > +};
> > > > +
> > > > +#define XFS_PWORK_SINGLE_THREADED	{ .pctl = NULL }
> > > > +
> > > > +/* Have we been told to abort? */
> > > > +static inline bool
> > > > +xfs_pwork_want_abort(
> > > > +	struct xfs_pwork	*pwork)
> > > > +{
> > > > +	return pwork->pctl && pwork->pctl->error;
> > > > +}
> > > > +
> > > > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
> > > > +		xfs_pwork_work_fn work_fn, const char *tag,
> > > > +		unsigned int nr_threads);
> > > > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
> > > > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
> > > > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
> > > > +
> > > > +#endif /* __XFS_PWORK_H__ */
> > > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > > > index a5b2260406a8..e4f3785f7a64 100644
> > > > --- a/fs/xfs/xfs_qm.c
> > > > +++ b/fs/xfs/xfs_qm.c
> > > > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck(
> > > >  		flags |= XFS_PQUOTA_CHKD;
> > > >  	}
> > > >  
> > > > -	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > > > +	error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > > >  	if (error)
> > > >  		goto error_return;
> > > >  
> > > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> > > > index ad7f9be13087..b555e045e2f4 100644
> > > > --- a/fs/xfs/xfs_sysctl.h
> > > > +++ b/fs/xfs/xfs_sysctl.h
> > > > @@ -37,6 +37,9 @@ typedef struct xfs_param {
> > > >  	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
> > > >  	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
> > > >  	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
> > > > +#ifdef DEBUG
> > > > +	xfs_sysctl_val_t pwork_threads;	/* Parallel workqueue thread count */
> > > > +#endif
> > > >  } xfs_param_t;
> > > >  
> > > >  /*
> > > > @@ -82,6 +85,9 @@ enum {
> > > >  extern xfs_param_t	xfs_params;
> > > >  
> > > >  struct xfs_globals {
> > > > +#ifdef DEBUG
> > > > +	int	pwork_threads;		/* parallel workqueue threads */
> > > > +#endif
> > > >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> > > >  	int	mount_delay;		/* mount setup delay (secs) */
> > > >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > > > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> > > > index cabda13f3c64..910e6b9cb1a7 100644
> > > > --- a/fs/xfs/xfs_sysfs.c
> > > > +++ b/fs/xfs/xfs_sysfs.c
> > > > @@ -206,11 +206,51 @@ always_cow_show(
> > > >  }
> > > >  XFS_SYSFS_ATTR_RW(always_cow);
> > > >  
> > > > +#ifdef DEBUG
> > > > +/*
> > > > + * Override how many threads the parallel work queue is allowed to create.
> > > > + * This has to be a debug-only global (instead of an errortag) because one of
> > > > + * the main users of parallel workqueues is mount time quotacheck.
> > > > + */
> > > > +STATIC ssize_t
> > > > +pwork_threads_store(
> > > > +	struct kobject	*kobject,
> > > > +	const char	*buf,
> > > > +	size_t		count)
> > > > +{
> > > > +	int		ret;
> > > > +	int		val;
> > > > +
> > > > +	ret = kstrtoint(buf, 0, &val);
> > > > +	if (ret)
> > > > +		return ret;
> > > > +
> > > > +	if (val < 0 || val > NR_CPUS)
> > > > +		return -EINVAL;
> > > > +
> > > > +	xfs_globals.pwork_threads = val;
> > > > +
> > > > +	return count;
> > > > +}
> > > > +
> > > > +STATIC ssize_t
> > > > +pwork_threads_show(
> > > > +	struct kobject	*kobject,
> > > > +	char		*buf)
> > > > +{
> > > > +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
> > > > +}
> > > > +XFS_SYSFS_ATTR_RW(pwork_threads);
> > > > +#endif /* DEBUG */
> > > > +
> > > >  static struct attribute *xfs_dbg_attrs[] = {
> > > >  	ATTR_LIST(bug_on_assert),
> > > >  	ATTR_LIST(log_recovery_delay),
> > > >  	ATTR_LIST(mount_delay),
> > > >  	ATTR_LIST(always_cow),
> > > > +#ifdef DEBUG
> > > > +	ATTR_LIST(pwork_threads),
> > > > +#endif
> > > >  	NULL,
> > > >  };
> > > >  
> > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > > index f9bb1d50bc0e..658cbade1998 100644
> > > > --- a/fs/xfs/xfs_trace.h
> > > > +++ b/fs/xfs/xfs_trace.h
> > > > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
> > > >  		  __entry->startino, __entry->freemask)
> > > >  )
> > > >  
> > > > +TRACE_EVENT(xfs_pwork_init,
> > > > +	TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
> > > > +	TP_ARGS(mp, nr_threads, pid),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(unsigned int, nr_threads)
> > > > +		__field(pid_t, pid)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = mp->m_super->s_dev;
> > > > +		__entry->nr_threads = nr_threads;
> > > > +		__entry->pid = pid;
> > > > +	),
> > > > +	TP_printk("dev %d:%d nr_threads %u pid %u",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __entry->nr_threads, __entry->pid)
> > > > +)
> > > > +
> > > >  #endif /* _TRACE_XFS_H */
> > > >  
> > > >  #undef TRACE_INCLUDE_PATH
> > > >
Brian Foster June 11, 2019, 5 p.m. UTC | #5
On Tue, Jun 11, 2019 at 08:29:26AM -0700, Darrick J. Wong wrote:
> On Tue, Jun 11, 2019 at 09:13:40AM -0400, Brian Foster wrote:
> > On Mon, Jun 10, 2019 at 06:10:20PM -0700, Darrick J. Wong wrote:
> > > On Mon, Jun 10, 2019 at 03:40:13PM -0400, Brian Foster wrote:
> > > > On Tue, Jun 04, 2019 at 02:50:20PM -0700, Darrick J. Wong wrote:
> > > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > 
> > > > > Create a parallel iwalk implementation and switch quotacheck to use it.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > ---
> > > > 
> > > > Interesting.. is there any commonality here with the ktask mechanism
> > > > that's been in progress? I've not followed the details, but I thought it
> > > > was a similar idea. The last post I see for that is here:
> > > > 
> > > > https://marc.info/?l=linux-mm&m=154143701122927&w=2
> > > 
> > > Yes, xfs_pwork is... the result of ktask still not landing upstream
> > > after a couple of years. :(
> > > 
> > 
> > Heh, Ok. We could always port over to it if it ever does land.
> > 
> > > > That aside, this all looks mostly fine to me. A few random thoughts..
> > > > 
> > > > >  fs/xfs/Makefile      |    1 
> > > > >  fs/xfs/xfs_globals.c |    3 +
> > > > >  fs/xfs/xfs_iwalk.c   |   76 ++++++++++++++++++++++++++++++-
> > > > >  fs/xfs/xfs_iwalk.h   |    2 +
> > > > >  fs/xfs/xfs_pwork.c   |  122 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/xfs/xfs_pwork.h   |   50 ++++++++++++++++++++
> > > > >  fs/xfs/xfs_qm.c      |    2 -
> > > > >  fs/xfs/xfs_sysctl.h  |    6 ++
> > > > >  fs/xfs/xfs_sysfs.c   |   40 ++++++++++++++++
> > > > >  fs/xfs/xfs_trace.h   |   18 +++++++
> > > > >  10 files changed, 317 insertions(+), 3 deletions(-)
> > > > >  create mode 100644 fs/xfs/xfs_pwork.c
> > > > >  create mode 100644 fs/xfs/xfs_pwork.h
> > > > > 
> > > > > 
> > > > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
> > > > > index 74d30ef0dbce..48940a27d4aa 100644
> > > > > --- a/fs/xfs/Makefile
> > > > > +++ b/fs/xfs/Makefile
> > > > > @@ -84,6 +84,7 @@ xfs-y				+= xfs_aops.o \
> > > > >  				   xfs_message.o \
> > > > >  				   xfs_mount.o \
> > > > >  				   xfs_mru_cache.o \
> > > > > +				   xfs_pwork.o \
> > > > >  				   xfs_reflink.o \
> > > > >  				   xfs_stats.o \
> > > > >  				   xfs_super.o \
> > > > > diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
> > > > > index d0d377384120..4f93f2c4dc38 100644
> > > > > --- a/fs/xfs/xfs_globals.c
> > > > > +++ b/fs/xfs/xfs_globals.c
> > > > > @@ -31,6 +31,9 @@ xfs_param_t xfs_params = {
> > > > >  	.fstrm_timer	= {	1,		30*100,		3600*100},
> > > > >  	.eofb_timer	= {	1,		300,		3600*24},
> > > > >  	.cowb_timer	= {	1,		1800,		3600*24},
> > > > > +#ifdef DEBUG
> > > > > +	.pwork_threads	= {	0,		0,		NR_CPUS	},
> > > > > +#endif
> > > > >  };
> > > > >  
> > > > >  struct xfs_globals xfs_globals = {
> > > > > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> > > > > index 8595258b5001..71ee1628aa70 100644
> > > > > --- a/fs/xfs/xfs_iwalk.c
> > > > > +++ b/fs/xfs/xfs_iwalk.c
> > > > > @@ -21,6 +21,7 @@
> > > > >  #include "xfs_health.h"
> > > > >  #include "xfs_trans.h"
> > > > >  #include "xfs_iwalk.h"
> > > > > +#include "xfs_pwork.h"
> > > > >  
> > > > >  /*
> > > > >   * Walking Inodes in the Filesystem
> > > > > @@ -46,6 +47,9 @@
> > > > >   */
> > > > >  
> > > > >  struct xfs_iwalk_ag {
> > > > > +	/* parallel work control data; will be null if single threaded */
> > > > > +	struct xfs_pwork		pwork;
> > > > > +
> > > > >  	struct xfs_mount		*mp;
> > > > >  	struct xfs_trans		*tp;
> > > > >  
> > > > > @@ -200,6 +204,9 @@ xfs_iwalk_ag_recs(
> > > > >  		trace_xfs_iwalk_ag_rec(mp, agno, irec);
> > > > >  
> > > > >  		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
> > > > > +			if (xfs_pwork_want_abort(&iwag->pwork))
> > > > > +				return 0;
> > > > > +
> > > > >  			/* Skip if this inode is free */
> > > > >  			if (XFS_INOBT_MASK(j) & irec->ir_free)
> > > > >  				continue;
> > > > > @@ -360,7 +367,7 @@ xfs_iwalk_ag(
> > > > >  	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
> > > > >  	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
> > > > >  
> > > > > -	while (!error && has_more) {
> > > > > +	while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) {
> > > > >  		struct xfs_inobt_rec_incore	*irec;
> > > > >  
> > > > >  		cond_resched();
> > > > > @@ -409,7 +416,7 @@ xfs_iwalk_ag(
> > > > >  	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
> > > > >  
> > > > >  	/* Walk any records left behind in the cache. */
> > > > > -	if (iwag->nr_recs == 0 || error)
> > > > > +	if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork))
> > > > >  		return error;
> > > > >  
> > > > >  	return xfs_iwalk_ag_recs(iwag);
> > > > > @@ -465,6 +472,7 @@ xfs_iwalk(
> > > > >  		.iwalk_fn	= iwalk_fn,
> > > > >  		.data		= data,
> > > > >  		.startino	= startino,
> > > > > +		.pwork		= XFS_PWORK_SINGLE_THREADED,
> > > > >  	};
> > > > >  	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > > > >  	int			error;
> > > > > @@ -486,3 +494,67 @@ xfs_iwalk(
> > > > >  	xfs_iwalk_free(&iwag);
> > > > >  	return error;
> > > > >  }
> > > > > +
> > > > > +/* Run per-thread iwalk work. */
> > > > > +static int
> > > > > +xfs_iwalk_ag_work(
> > > > > +	struct xfs_mount	*mp,
> > > > > +	struct xfs_pwork	*pwork)
> > > > > +{
> > > > > +	struct xfs_iwalk_ag	*iwag;
> > > > > +	int			error;
> > > > > +
> > > > > +	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
> > > > > +	error = xfs_iwalk_alloc(iwag);
> > > > > +	if (error)
> > > > > +		goto out;
> > > > 
> > > > In most cases this will never fail, but the error path if it does looks
> > > > slightly painful. I was thinking if we could move this up into
> > > > xfs_iwalk_threaded() so we wouldn't continue to queue work jobs when
> > > > failure is imminent...
> > > > 
> > > > > +
> > > > > +	error = xfs_iwalk_ag(iwag);
> > > > > +	xfs_iwalk_free(iwag);
> > > > > +out:
> > > > > +	kmem_free(iwag);
> > > > > +	return error;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Walk all the inodes in the filesystem using multiple threads to process each
> > > > > + * AG.
> > > > > + */
> > > > > +int
> > > > > +xfs_iwalk_threaded(
> > > > > +	struct xfs_mount	*mp,
> > > > > +	xfs_ino_t		startino,
> > > > > +	xfs_iwalk_fn		iwalk_fn,
> > > > > +	unsigned int		max_prefetch,
> > > > > +	void			*data)
> > > > > +{
> > > > > +	struct xfs_pwork_ctl	pctl;
> > > > > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > > > > +	unsigned int		nr_threads;
> > > > > +	int			error;
> > > > > +
> > > > > +	ASSERT(agno < mp->m_sb.sb_agcount);
> > > > > +
> > > > > +	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
> > > > > +	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
> > > > > +			nr_threads);
> > > > > +	if (error)
> > > > > +		return error;
> > > > > +
> > > > > +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> > > > > +		struct xfs_iwalk_ag	*iwag;
> > > > > +
> > > > > +		iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
> > > > > +		iwag->mp = mp;
> > > > > +		iwag->tp = NULL;
> > > > > +		iwag->iwalk_fn = iwalk_fn;
> > > > > +		iwag->data = data;
> > > > > +		iwag->startino = startino;
> > > > > +		iwag->recs = NULL;
> > > > > +		xfs_iwalk_set_prefetch(iwag, max_prefetch);
> > > > > +		xfs_pwork_queue(&pctl, &iwag->pwork);
> > > > > +		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> > > > > +	}
> > > > 
> > > > ... but this is only bound by the number of AGs and so could result in a
> > > > large number of allocations. FWIW, I wouldn't expect that to be a
> > > > problem in the common case. I'm more thinking about the case of a
> > > > specially crafted filesystem designed to cause problems on mount.
> > > 
> > > <nod> I thought about that, and decided that it wasn't a good idea to
> > > for each of the queued (but not processing) work items to be sitting on
> > > a bunch of memory because that memory can't be put to useful work.
> > > That's why I put it in xfs_iwalk_ag_work.
> > > 
> > 
> > Yep.
> > 
> > > Also that would necessitate iwalk feeding a destructor to pwork so that
> > > it can deal with work items that were queued but never actually run.
> > > 
> > 
> > What's the scenario for queued jobs that never run? We have to run the
> > work item to free the iwag, so I'd assume we could free the record
> > buffer in the same place if we wanted to (which we don't :P).
> 
> Doh!  That was a thinko on my part.  pwork always runs the work item,
> even if pctl->error != 0; it's up to the individual work item to check
> if we're aborted and run for the exits, which _iwalk_ag_work doesn't do.
> 
> Erugh, I'll fix the documentation and _iwalk_ag_work.
> 
> > > > 
> > > > > +
> > > > > +	return xfs_pwork_destroy(&pctl);
> > > > > +}
> > ...
> > > > > +/*
> > > > > + * Return the amount of parallelism that the data device can handle, or 0 for
> > > > > + * no limit.
> > > > > + */
> > > > > +unsigned int
> > > > > +xfs_pwork_guess_datadev_parallelism(
> > > > > +	struct xfs_mount	*mp)
> > > > > +{
> > > > > +	struct xfs_buftarg	*btp = mp->m_ddev_targp;
> > > > > +	int			iomin;
> > > > > +	int			ioopt;
> > > > > +
> > > > > +	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
> > > > > +		return num_online_cpus();
> > > > > +	if (mp->m_sb.sb_width && mp->m_sb.sb_unit)
> > > > > +		return mp->m_sb.sb_width / mp->m_sb.sb_unit;
> > > > > +	iomin = bdev_io_min(btp->bt_bdev);
> > > > > +	ioopt = bdev_io_opt(btp->bt_bdev);
> > > > > +	if (iomin && ioopt)
> > > > > +		return ioopt / iomin;
> > > > > +
> > > > > +	return 1;
> > > > 
> > > > Have you collected any performance data related to these heuristics?
> > > 
> > > Yeah, the quotacheck runtime reduces by 5-10% on my SSDs (~5% on a
> > > single SSD, ~10% on a 4-way raid0).  That wasn't really all that
> > > awesome, so I recorded a flame graph (see below) to find where the
> > > remaining overhead is.  A lot of it was in xfs_iget, and I also noticed
> > > that deferred inode inactivation sped it up a little more.
> > > 
> > 
> > So if I follow the graphic correctly, you have 4 walker threads running
> > the quotacheck. The majority of overhead is the inode memory allocation
> > followed by inode buffer lookup and then dquot lookup slightly behind
> > that. If this is an SSD, the heuristic presumably set the thread count
> > based on the CPU count, right?
> 
> Right.
> 
> > Hmm, I can't tell from the image what happens down in xfs_buf_find(). Do
> > you have granular enough data to see whether these are buffer cache hits
> > or misses?
> 
> I /think/ they're mostly hits, though I admit it's hard to read the
> 24000 pixel wide output, even after clicking on xfs_buf_read_map to zoom
> in a bit.
> 

Ok, I guess it would be nice to see the source of that overhead for
future reference. Maybe perf or something could show it better.

> > > > I
> > > > assume the feature is generally a win, but this also seems like we have
> > > > a large window of variance here. E.g., an SSD on a server with hundreds
> > > > of CPUs will enable as many threads as CPUs, but a single xTB spindle on
> > > > the same box may run single threaded (a quick check of a few local
> > > > devices all return an optimal I/O size of 0). Is there really no benefit
> > > > parallelizing some of that work in the spinning rust case?
> > > 
> > > Single-spindle spinning rust got worse even with 2 threads because the
> > > heads ping-pong between AGs.  It's not horrible with enough readahead,
> > > but it turns into a disaster when I disabled readahead, unsurprisingly.
> > > 
> > 
> > Ok. Is that tempered by multi-spindle devices? What about a raid5/6 like
> > device where we have a stripe unit/width set, but the device itself may
> > have concurrency characteristics more like a single spindle as opposed
> > to something like raid0?
> 
> I'd have to study a wider variety of disk raid setups.  I'm under the
> impression that the inode allocation code tries to allocate contiguous
> runs of inode clusters which will fill up stripe units (su), and then
> (if there's still space) it'll keep going into the next stripe?
> 

That sounds about right to me.

Note that I'm not suggesting we necessarily have to test every possible
storage config to land this mechanism. I think it would be reasonable to
restrict the heuristic to conservatively enable cases we have tested
so far (i.e., nonrotational with a handful of threads) and expand it
from there over time as we have opportunity to test more configs.
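
Something along these lines is what I'm thinking (just a sketch of the
conservative variant; the cap of 4 is arbitrary and only for
illustration):

unsigned int
xfs_pwork_guess_datadev_parallelism(
	struct xfs_mount	*mp)
{
	struct xfs_buftarg	*btp = mp->m_ddev_targp;

	/* Only fan out on nonrotational storage we've actually tested... */
	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
		return min_t(unsigned int, num_online_cpus(), 4);

	/* ...and keep everything else single threaded for now. */
	return 1;
}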

> Hmm, I wonder how well inodes get distributed across stripes...
> 
> > > > What about in the other direction where we might have a ton of threads
> > > > for inodes across AGs that all happen to be in the same project quota,
> > > > for example?
> > > 
> > > I collected a flame graph... https://djwong.org/docs/quotacheck.svg
> > > 
> > 
> > I'm specifically thinking about a few random systems I've used recently
> > with hundreds of CPUs. I'm not sure those boxes actually have SSDs, but
> > if one did (and with enough AGs), I'm curious how this algorithm would
> > behave under those conditions.
> >
> > I guess this all boils down to trying to understand if/what breakdown
> > conditions might exist given the different possibilities allowed by the
> > current heuristic. I'm wondering if we should either restrict this
> > heuristic to enable concurrency specifically in the environments we've
> > shown it to have benefits and/or with some kind of reasonable cap to
> > limit unknown boundary conditions. E.g., the case of hundreds of AGs and
> > hundreds of CPUs on SSD seems like it could go really well (a big flash
> > raid0) or potentially really bad (one big SSD with a poorly configured
> > fs). Thoughts?
> 
> <nod> I wish we had a better way for the block layer to give us even a
> rough estimate of how many iops a device can handle... or even the
> ability to help us train ourselves after the fact by measuring average
> latency over time vs. the last (say) 2s so that we could back off if we
> thought the device was getting congested.
> 

Yeah, we're kind of limited by the lack of detailed information on the
storage setup.

> [Hmm, Johannes merged some "pressure stall information" thing a couple
> of releases back; I wonder if we can make use of that... I'll have a
> look and report back.]
> 
> I suppose we could cap the number of cpus at some arbitrary high number
> for now (128?) and see how long it takes to hit scalability problems
> because of it, but we could also just let things scale and see who
> complains about quotacheck getting slower on their 500 CPU system with a
> nonrotational slate stone.  I suspect "one big SSD with a poorly
> configured fs" is going to notice problems everywhere anyway.
> 

Indeed, we could do that. I was thinking about a similar approach but
from the opposite direction. I.e., rather than set a potential
performance landmine (assuming there's some config out there that this
would thrash), cap things at a few threads to start and expand out from
there as dictated by testing, etc. As it is, quotacheck has been
single threaded forever (AFAIA) anyways.

I suppose with either approach we could also consider enabling the sysfs
knob on production environments such that users have the ability either
to speed up a currently slow quotacheck or to recover from one that is
thrashing due to too many workers (depending on which approach we take).
We'd probably have to think a little bit harder about the interface if
we wanted to do that, though. One obvious issue is that the thread count
for one filesystem might be irrelevant for another, and quotacheck is a
mount time operation. :/

I dunno.. I'm kind of curious to test this against some of the filesets
I'm generating for the near allocation mode rework stuff. I only have a
couple days before heading off for vacation though so realistically it
might be a couple weeks before I have a chance to do that. It sounds
like Dave has reviewed at least some of this work so far as well. Maybe
he or others have some thoughts on how to approach the concurrency
heuristic..?

> Admittedly this is yet another one of those cases where I turn something
> on and ... the bottleneck just shows up somewhere else now.  I hadn't
> realized that iget overhead is so high.
> 

Same.

Brian

> > > It turned out that the inode setup overhead in xfs_iget is so high that
> > > locking the dquot has negligible overhead.  When I "fixed" quotacheck to
> > > read the quota information straight from the inode cluster buffer if the
> > > inode wasn't in memory, the runtime dropped by 60% but Dave warned me
> > > not to take us back to inode buffer aliasing hell.  I also noted that
> > > if booting with mem=512M the memory reclamation overhead totally fries
> > > us regardless of parallelisation.
> > > 
> > 
> > I'm not familiar with the buffer aliasing problem.. I'm guessing this is
> > talking about risk of inconsistent in-core inodes with inode buffers..?
> > In any event, I agree that it's not worth risking integrity or
> > overcomplicating things for something like quotacheck.
> 
> Correct.
> 
> --D
> 
> > Brian
> > 
> > > --D
> > > 
> > > > 
> > > > Brian
> > > > 
> > > > > +}
> > > > > diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
> > > > > new file mode 100644
> > > > > index 000000000000..e0c1354a2d8c
> > > > > --- /dev/null
> > > > > +++ b/fs/xfs/xfs_pwork.h
> > > > > @@ -0,0 +1,50 @@
> > > > > +// SPDX-License-Identifier: GPL-2.0+
> > > > > +/*
> > > > > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > > > > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > + */
> > > > > +#ifndef __XFS_PWORK_H__
> > > > > +#define __XFS_PWORK_H__
> > > > > +
> > > > > +struct xfs_pwork;
> > > > > +struct xfs_mount;
> > > > > +
> > > > > +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
> > > > > +
> > > > > +/*
> > > > > + * Parallel work coordination structure.
> > > > > + */
> > > > > +struct xfs_pwork_ctl {
> > > > > +	struct workqueue_struct	*wq;
> > > > > +	struct xfs_mount	*mp;
> > > > > +	xfs_pwork_work_fn	work_fn;
> > > > > +	int			error;
> > > > > +};
> > > > > +
> > > > > +/*
> > > > > + * Embed this parallel work control item inside your own work structure,
> > > > > + * then queue work with it.
> > > > > + */
> > > > > +struct xfs_pwork {
> > > > > +	struct work_struct	work;
> > > > > +	struct xfs_pwork_ctl	*pctl;
> > > > > +};
> > > > > +
> > > > > +#define XFS_PWORK_SINGLE_THREADED	{ .pctl = NULL }
> > > > > +
> > > > > +/* Have we been told to abort? */
> > > > > +static inline bool
> > > > > +xfs_pwork_want_abort(
> > > > > +	struct xfs_pwork	*pwork)
> > > > > +{
> > > > > +	return pwork->pctl && pwork->pctl->error;
> > > > > +}
> > > > > +
> > > > > +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
> > > > > +		xfs_pwork_work_fn work_fn, const char *tag,
> > > > > +		unsigned int nr_threads);
> > > > > +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
> > > > > +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
> > > > > +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
> > > > > +
> > > > > +#endif /* __XFS_PWORK_H__ */
> > > > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > > > > index a5b2260406a8..e4f3785f7a64 100644
> > > > > --- a/fs/xfs/xfs_qm.c
> > > > > +++ b/fs/xfs/xfs_qm.c
> > > > > @@ -1305,7 +1305,7 @@ xfs_qm_quotacheck(
> > > > >  		flags |= XFS_PQUOTA_CHKD;
> > > > >  	}
> > > > >  
> > > > > -	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > > > > +	error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL);
> > > > >  	if (error)
> > > > >  		goto error_return;
> > > > >  
> > > > > diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> > > > > index ad7f9be13087..b555e045e2f4 100644
> > > > > --- a/fs/xfs/xfs_sysctl.h
> > > > > +++ b/fs/xfs/xfs_sysctl.h
> > > > > @@ -37,6 +37,9 @@ typedef struct xfs_param {
> > > > >  	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
> > > > >  	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
> > > > >  	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
> > > > > +#ifdef DEBUG
> > > > > +	xfs_sysctl_val_t pwork_threads;	/* Parallel workqueue thread count */
> > > > > +#endif
> > > > >  } xfs_param_t;
> > > > >  
> > > > >  /*
> > > > > @@ -82,6 +85,9 @@ enum {
> > > > >  extern xfs_param_t	xfs_params;
> > > > >  
> > > > >  struct xfs_globals {
> > > > > +#ifdef DEBUG
> > > > > +	int	pwork_threads;		/* parallel workqueue threads */
> > > > > +#endif
> > > > >  	int	log_recovery_delay;	/* log recovery delay (secs) */
> > > > >  	int	mount_delay;		/* mount setup delay (secs) */
> > > > >  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> > > > > diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> > > > > index cabda13f3c64..910e6b9cb1a7 100644
> > > > > --- a/fs/xfs/xfs_sysfs.c
> > > > > +++ b/fs/xfs/xfs_sysfs.c
> > > > > @@ -206,11 +206,51 @@ always_cow_show(
> > > > >  }
> > > > >  XFS_SYSFS_ATTR_RW(always_cow);
> > > > >  
> > > > > +#ifdef DEBUG
> > > > > +/*
> > > > > + * Override how many threads the parallel work queue is allowed to create.
> > > > > + * This has to be a debug-only global (instead of an errortag) because one of
> > > > > + * the main users of parallel workqueues is mount time quotacheck.
> > > > > + */
> > > > > +STATIC ssize_t
> > > > > +pwork_threads_store(
> > > > > +	struct kobject	*kobject,
> > > > > +	const char	*buf,
> > > > > +	size_t		count)
> > > > > +{
> > > > > +	int		ret;
> > > > > +	int		val;
> > > > > +
> > > > > +	ret = kstrtoint(buf, 0, &val);
> > > > > +	if (ret)
> > > > > +		return ret;
> > > > > +
> > > > > +	if (val < 0 || val > NR_CPUS)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	xfs_globals.pwork_threads = val;
> > > > > +
> > > > > +	return count;
> > > > > +}
> > > > > +
> > > > > +STATIC ssize_t
> > > > > +pwork_threads_show(
> > > > > +	struct kobject	*kobject,
> > > > > +	char		*buf)
> > > > > +{
> > > > > +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
> > > > > +}
> > > > > +XFS_SYSFS_ATTR_RW(pwork_threads);
> > > > > +#endif /* DEBUG */
> > > > > +
> > > > >  static struct attribute *xfs_dbg_attrs[] = {
> > > > >  	ATTR_LIST(bug_on_assert),
> > > > >  	ATTR_LIST(log_recovery_delay),
> > > > >  	ATTR_LIST(mount_delay),
> > > > >  	ATTR_LIST(always_cow),
> > > > > +#ifdef DEBUG
> > > > > +	ATTR_LIST(pwork_threads),
> > > > > +#endif
> > > > >  	NULL,
> > > > >  };
> > > > >  
> > > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > > > index f9bb1d50bc0e..658cbade1998 100644
> > > > > --- a/fs/xfs/xfs_trace.h
> > > > > +++ b/fs/xfs/xfs_trace.h
> > > > > @@ -3556,6 +3556,24 @@ TRACE_EVENT(xfs_iwalk_ag_rec,
> > > > >  		  __entry->startino, __entry->freemask)
> > > > >  )
> > > > >  
> > > > > +TRACE_EVENT(xfs_pwork_init,
> > > > > +	TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
> > > > > +	TP_ARGS(mp, nr_threads, pid),
> > > > > +	TP_STRUCT__entry(
> > > > > +		__field(dev_t, dev)
> > > > > +		__field(unsigned int, nr_threads)
> > > > > +		__field(pid_t, pid)
> > > > > +	),
> > > > > +	TP_fast_assign(
> > > > > +		__entry->dev = mp->m_super->s_dev;
> > > > > +		__entry->nr_threads = nr_threads;
> > > > > +		__entry->pid = pid;
> > > > > +	),
> > > > > +	TP_printk("dev %d:%d nr_threads %u pid %u",
> > > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > > +		  __entry->nr_threads, __entry->pid)
> > > > > +)
> > > > > +
> > > > >  #endif /* _TRACE_XFS_H */
> > > > >  
> > > > >  #undef TRACE_INCLUDE_PATH
> > > > >
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 74d30ef0dbce..48940a27d4aa 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -84,6 +84,7 @@  xfs-y				+= xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
+				   xfs_pwork.o \
 				   xfs_reflink.o \
 				   xfs_stats.o \
 				   xfs_super.o \
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index d0d377384120..4f93f2c4dc38 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -31,6 +31,9 @@  xfs_param_t xfs_params = {
 	.fstrm_timer	= {	1,		30*100,		3600*100},
 	.eofb_timer	= {	1,		300,		3600*24},
 	.cowb_timer	= {	1,		1800,		3600*24},
+#ifdef DEBUG
+	.pwork_threads	= {	0,		0,		NR_CPUS	},
+#endif
 };
 
 struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 8595258b5001..71ee1628aa70 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -21,6 +21,7 @@ 
 #include "xfs_health.h"
 #include "xfs_trans.h"
 #include "xfs_iwalk.h"
+#include "xfs_pwork.h"
 
 /*
  * Walking Inodes in the Filesystem
@@ -46,6 +47,9 @@ 
  */
 
 struct xfs_iwalk_ag {
+	/* parallel work control data; will be null if single threaded */
+	struct xfs_pwork		pwork;
+
 	struct xfs_mount		*mp;
 	struct xfs_trans		*tp;
 
@@ -200,6 +204,9 @@  xfs_iwalk_ag_recs(
 		trace_xfs_iwalk_ag_rec(mp, agno, irec);
 
 		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+			if (xfs_pwork_want_abort(&iwag->pwork))
+				return 0;
+
 			/* Skip if this inode is free */
 			if (XFS_INOBT_MASK(j) & irec->ir_free)
 				continue;
@@ -360,7 +367,7 @@  xfs_iwalk_ag(
 	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
 	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
 
-	while (!error && has_more) {
+	while (!error && has_more && !xfs_pwork_want_abort(&iwag->pwork)) {
 		struct xfs_inobt_rec_incore	*irec;
 
 		cond_resched();
@@ -409,7 +416,7 @@  xfs_iwalk_ag(
 	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
 
 	/* Walk any records left behind in the cache. */
-	if (iwag->nr_recs == 0 || error)
+	if (iwag->nr_recs == 0 || error || xfs_pwork_want_abort(&iwag->pwork))
 		return error;
 
 	return xfs_iwalk_ag_recs(iwag);
@@ -465,6 +472,7 @@  xfs_iwalk(
 		.iwalk_fn	= iwalk_fn,
 		.data		= data,
 		.startino	= startino,
+		.pwork		= XFS_PWORK_SINGLE_THREADED,
 	};
 	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
 	int			error;
@@ -486,3 +494,67 @@  xfs_iwalk(
 	xfs_iwalk_free(&iwag);
 	return error;
 }
+
+/* Run per-thread iwalk work. */
+static int
+xfs_iwalk_ag_work(
+	struct xfs_mount	*mp,
+	struct xfs_pwork	*pwork)
+{
+	struct xfs_iwalk_ag	*iwag;
+	int			error;
+
+	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
+	error = xfs_iwalk_alloc(iwag);
+	if (error)
+		goto out;
+
+	error = xfs_iwalk_ag(iwag);
+	xfs_iwalk_free(iwag);
+out:
+	kmem_free(iwag);
+	return error;
+}
+
+/*
+ * Walk all the inodes in the filesystem using multiple threads to process each
+ * AG.
+ */
+int
+xfs_iwalk_threaded(
+	struct xfs_mount	*mp,
+	xfs_ino_t		startino,
+	xfs_iwalk_fn		iwalk_fn,
+	unsigned int		max_prefetch,
+	void			*data)
+{
+	struct xfs_pwork_ctl	pctl;
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
+	unsigned int		nr_threads;
+	int			error;
+
+	ASSERT(agno < mp->m_sb.sb_agcount);
+
+	nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
+	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
+			nr_threads);
+	if (error)
+		return error;
+
+	for (; agno < mp->m_sb.sb_agcount; agno++) {
+		struct xfs_iwalk_ag	*iwag;
+
+		iwag = kmem_alloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
+		iwag->mp = mp;
+		iwag->tp = NULL;
+		iwag->iwalk_fn = iwalk_fn;
+		iwag->data = data;
+		iwag->startino = startino;
+		iwag->recs = NULL;
+		xfs_iwalk_set_prefetch(iwag, max_prefetch);
+		xfs_pwork_queue(&pctl, &iwag->pwork);
+		startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+	}
+
+	return xfs_pwork_destroy(&pctl);
+}
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 45b1baabcd2d..40233a05a766 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -14,5 +14,7 @@  typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
 
 int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
 		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
+int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
+		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
 
 #endif /* __XFS_IWALK_H__ */
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
new file mode 100644
index 000000000000..19605a3a2482
--- /dev/null
+++ b/fs/xfs/xfs_pwork.c
@@ -0,0 +1,122 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_sysctl.h"
+#include "xfs_pwork.h"
+
+/*
+ * Parallel Work Queue
+ * ===================
+ *
+ * Abstract away the details of running a large and "obviously" parallelizable
+ * task across multiple CPUs.  Callers initialize the pwork control object with
+ * a desired level of parallelization and a work function.  Next, they embed
+ * struct xfs_pwork in whatever structure they use to pass work context to a
+ * worker thread and queue that pwork.  The work function will be passed the
+ * pwork item when it is run (from process context) and any returned error will
+ * cause all threads to abort.
+ * 
+ * This is the rough equivalent of the xfsprogs workqueue code, though we can't
+ * reuse that name here.
+ */
+
+/* Invoke our caller's function. */
+static void
+xfs_pwork_work(
+	struct work_struct	*work)
+{
+	struct xfs_pwork	*pwork;
+	struct xfs_pwork_ctl	*pctl;
+	int			error;
+
+	pwork = container_of(work, struct xfs_pwork, work);
+	pctl = pwork->pctl;
+	error = pctl->work_fn(pctl->mp, pwork);
+	if (error && !pctl->error)
+		pctl->error = error;
+}
+
+/*
+ * Set up control data for parallel work.  @work_fn is the function that will
+ * be called.  @tag will be written into the kernel threads.  @nr_threads is
+ * the level of parallelism desired, or 0 for no limit.
+ */
+int
+xfs_pwork_init(
+	struct xfs_mount	*mp,
+	struct xfs_pwork_ctl	*pctl,
+	xfs_pwork_work_fn	work_fn,
+	const char		*tag,
+	unsigned int		nr_threads)
+{
+#ifdef DEBUG
+	if (xfs_globals.pwork_threads > 0)
+		nr_threads = xfs_globals.pwork_threads;
+#endif
+	trace_xfs_pwork_init(mp, nr_threads, current->pid);
+
+	pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
+			current->pid);
+	if (!pctl->wq)
+		return -ENOMEM;
+	pctl->work_fn = work_fn;
+	pctl->error = 0;
+	pctl->mp = mp;
+
+	return 0;
+}
+
+/* Queue some parallel work. */
+void
+xfs_pwork_queue(
+	struct xfs_pwork_ctl	*pctl,
+	struct xfs_pwork	*pwork)
+{
+	INIT_WORK(&pwork->work, xfs_pwork_work);
+	pwork->pctl = pctl;
+	queue_work(pctl->wq, &pwork->work);
+}
+
+/* Wait for the work to finish and tear down the control structure. */
+int
+xfs_pwork_destroy(
+	struct xfs_pwork_ctl	*pctl)
+{
+	destroy_workqueue(pctl->wq);
+	pctl->wq = NULL;
+	return pctl->error;
+}
+
+/*
+ * Return the amount of parallelism that the data device can handle, or 0 for
+ * no limit.
+ */
+unsigned int
+xfs_pwork_guess_datadev_parallelism(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buftarg	*btp = mp->m_ddev_targp;
+	int			iomin;
+	int			ioopt;
+
+	if (blk_queue_nonrot(btp->bt_bdev->bd_queue))
+		return num_online_cpus();
+	if (mp->m_sb.sb_width && mp->m_sb.sb_unit)
+		return mp->m_sb.sb_width / mp->m_sb.sb_unit;
+	iomin = bdev_io_min(btp->bt_bdev);
+	ioopt = bdev_io_opt(btp->bt_bdev);
+	if (iomin && ioopt)
+		return ioopt / iomin;
+
+	return 1;
+}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
new file mode 100644
index 000000000000..e0c1354a2d8c
--- /dev/null
+++ b/fs/xfs/xfs_pwork.h
@@ -0,0 +1,50 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_PWORK_H__
+#define __XFS_PWORK_H__
+
+struct xfs_pwork;
+struct xfs_mount;
+
+typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
+
+/*
+ * Parallel work coordination structure.
+ */
+struct xfs_pwork_ctl {
+	struct workqueue_struct	*wq;
+	struct xfs_mount	*mp;
+	xfs_pwork_work_fn	work_fn;
+	int			error;
+};
+
+/*
+ * Embed this parallel work control item inside your own work structure,
+ * then queue work with it.
+ */
+struct xfs_pwork {
+	struct work_struct	work;
+	struct xfs_pwork_ctl	*pctl;
+};
+
+#define XFS_PWORK_SINGLE_THREADED	{ .pctl = NULL }
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_want_abort(
+	struct xfs_pwork	*pwork)
+{
+	return pwork->pctl && pwork->pctl->error;
+}
+
+int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
+		xfs_pwork_work_fn work_fn, const char *tag,
+		unsigned int nr_threads);
+void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
+int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
+unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
+
+#endif /* __XFS_PWORK_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index a5b2260406a8..e4f3785f7a64 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1305,7 +1305,7 @@  xfs_qm_quotacheck(
 		flags |= XFS_PQUOTA_CHKD;
 	}
 
-	error = xfs_iwalk(mp, NULL, 0, xfs_qm_dqusage_adjust, 0, NULL);
+	error = xfs_iwalk_threaded(mp, 0, xfs_qm_dqusage_adjust, 0, NULL);
 	if (error)
 		goto error_return;
 
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index ad7f9be13087..b555e045e2f4 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -37,6 +37,9 @@  typedef struct xfs_param {
 	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
 	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
 	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
+#ifdef DEBUG
+	xfs_sysctl_val_t pwork_threads;	/* Parallel workqueue thread count */
+#endif
 } xfs_param_t;
 
 /*
@@ -82,6 +85,9 @@  enum {
 extern xfs_param_t	xfs_params;
 
 struct xfs_globals {
+#ifdef DEBUG
+	int	pwork_threads;		/* parallel workqueue threads */
+#endif
 	int	log_recovery_delay;	/* log recovery delay (secs) */
 	int	mount_delay;		/* mount setup delay (secs) */
 	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cabda13f3c64..910e6b9cb1a7 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -206,11 +206,51 @@  always_cow_show(
 }
 XFS_SYSFS_ATTR_RW(always_cow);
 
+#ifdef DEBUG
+/*
+ * Override how many threads the parallel work queue is allowed to create.
+ * This has to be a debug-only global (instead of an errortag) because one of
+ * the main users of parallel workqueues is mount time quotacheck.
+ */
+STATIC ssize_t
+pwork_threads_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	int		ret;
+	int		val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < 0 || val > NR_CPUS)
+		return -EINVAL;
+
+	xfs_globals.pwork_threads = val;
+
+	return count;
+}
+
+STATIC ssize_t
+pwork_threads_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+}
+XFS_SYSFS_ATTR_RW(pwork_threads);
+#endif /* DEBUG */
+
 static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(bug_on_assert),
 	ATTR_LIST(log_recovery_delay),
 	ATTR_LIST(mount_delay),
 	ATTR_LIST(always_cow),
+#ifdef DEBUG
+	ATTR_LIST(pwork_threads),
+#endif
 	NULL,
 };
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f9bb1d50bc0e..658cbade1998 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3556,6 +3556,24 @@  TRACE_EVENT(xfs_iwalk_ag_rec,
 		  __entry->startino, __entry->freemask)
 )
 
+TRACE_EVENT(xfs_pwork_init,
+	TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
+	TP_ARGS(mp, nr_threads, pid),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned int, nr_threads)
+		__field(pid_t, pid)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->nr_threads = nr_threads;
+		__entry->pid = pid;
+	),
+	TP_printk("dev %d:%d nr_threads %u pid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->nr_threads, __entry->pid)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH