
[01/10] xfs: create simplified inode walk function

Message ID 155968497450.1657646.15305138327955918345.stgit@magnolia (mailing list archive)
State Superseded, archived
Series xfs: refactor and improve inode iteration

Commit Message

Darrick J. Wong June 4, 2019, 9:49 p.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Create a new iterator function to simplify walking inodes in an XFS
filesystem.  This new iterator will replace the existing open-coded
walking that goes on in various places.
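
For illustration, a walk over every inode in the filesystem would look
roughly like this (hypothetical callback; the real call sites are
converted later in this series):

/* Count each allocated inode in the filesystem. */
STATIC int
xfs_count_one_inode(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	uint64_t		*count = data;

	(*count)++;
	return 0;	/* XFS_IWALK_ABORT here would end the walk early */
}

	uint64_t		count = 0;
	int			error;

	error = xfs_iwalk(mp, NULL, 0, xfs_count_one_inode, 0, &count);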

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/Makefile                  |    1 
 fs/xfs/libxfs/xfs_ialloc_btree.c |   31 +++
 fs/xfs/libxfs/xfs_ialloc_btree.h |    3 
 fs/xfs/xfs_itable.c              |    5 
 fs/xfs/xfs_itable.h              |    8 +
 fs/xfs/xfs_iwalk.c               |  400 ++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_iwalk.h               |   18 ++
 fs/xfs/xfs_trace.h               |   40 ++++
 8 files changed, 502 insertions(+), 4 deletions(-)
 create mode 100644 fs/xfs/xfs_iwalk.c
 create mode 100644 fs/xfs/xfs_iwalk.h

Comments

Brian Foster June 10, 2019, 1:58 p.m. UTC | #1
On Tue, Jun 04, 2019 at 02:49:34PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Create a new iterator function to simplify walking inodes in an XFS
> filesystem.  This new iterator will replace the existing open-coded
> walking that goes on in various places.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/Makefile                  |    1 
>  fs/xfs/libxfs/xfs_ialloc_btree.c |   31 +++
>  fs/xfs/libxfs/xfs_ialloc_btree.h |    3 
>  fs/xfs/xfs_itable.c              |    5 
>  fs/xfs/xfs_itable.h              |    8 +
>  fs/xfs/xfs_iwalk.c               |  400 ++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_iwalk.h               |   18 ++
>  fs/xfs/xfs_trace.h               |   40 ++++
>  8 files changed, 502 insertions(+), 4 deletions(-)
>  create mode 100644 fs/xfs/xfs_iwalk.c
>  create mode 100644 fs/xfs/xfs_iwalk.h
> 
> 
...
> diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
> index ac4b65da4c2b..cb7eac2f51c0 100644
> --- a/fs/xfs/libxfs/xfs_ialloc_btree.c
> +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
> @@ -564,6 +564,34 @@ xfs_inobt_max_size(
>  					XFS_INODES_PER_CHUNK);
>  }
>  
> +/* Read AGI and create inobt cursor. */
> +int
> +xfs_inobt_cur(
> +	struct xfs_mount	*mp,
> +	struct xfs_trans	*tp,
> +	xfs_agnumber_t		agno,
> +	struct xfs_btree_cur	**curpp,
> +	struct xfs_buf		**agi_bpp)
> +{
> +	struct xfs_btree_cur	*cur;
> +	int			error;
> +
> +	ASSERT(*agi_bpp == NULL);
> +

FYI, the xfs_inobt_count_blocks() caller doesn't initialize the pointer
according to the assert.
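
Presumably the fix is just to start that caller out with a NULL buffer
pointer before building the cursor, e.g. (untested sketch, parameter
names approximate):

	struct xfs_btree_cur	*cur = NULL;
	struct xfs_buf		*agi_bp = NULL;	/* required by the ASSERT */
	int			error;

	error = xfs_inobt_cur(mp, tp, agno, &cur, &agi_bp);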

> +	error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
> +	if (error)
> +		return error;
> +
> +	cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, XFS_BTNUM_INO);
> +	if (!cur) {
> +		xfs_trans_brelse(tp, *agi_bpp);
> +		*agi_bpp = NULL;
> +		return -ENOMEM;
> +	}
> +	*curpp = cur;
> +	return 0;
> +}
> +
>  static int
>  xfs_inobt_count_blocks(
>  	struct xfs_mount	*mp,
...
> diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> new file mode 100644
> index 000000000000..3e6c06e69c75
> --- /dev/null
> +++ b/fs/xfs/xfs_iwalk.c
> @@ -0,0 +1,400 @@
...
> +/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
> +STATIC int
> +xfs_iwalk_ag(
> +	struct xfs_iwalk_ag		*iwag)
> +{
> +	struct xfs_mount		*mp = iwag->mp;
> +	struct xfs_trans		*tp = iwag->tp;
> +	struct xfs_buf			*agi_bp = NULL;
> +	struct xfs_btree_cur		*cur = NULL;
> +	xfs_agnumber_t			agno;
> +	xfs_agino_t			agino;
> +	int				has_more;
> +	int				error = 0;
> +
> +	/* Set up our cursor at the right place in the inode btree. */
> +	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
> +	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
> +	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
> +
> +	while (!error && has_more) {
> +		struct xfs_inobt_rec_incore	*irec;
> +
> +		cond_resched();
> +
> +		/* Fetch the inobt record. */
> +		irec = &iwag->recs[iwag->nr_recs];
> +		error = xfs_inobt_get_rec(cur, irec, &has_more);
> +		if (error || !has_more)
> +			break;
> +
> +		/* No allocated inodes in this chunk; skip it. */
> +		if (irec->ir_freecount == irec->ir_count) {
> +			error = xfs_btree_increment(cur, 0, &has_more);
> +			if (error)
> +				break;
> +			continue;
> +		}
> +
> +		/*
> +		 * Start readahead for this inode chunk in anticipation of
> +		 * walking the inodes.
> +		 */
> +		xfs_bulkstat_ichunk_ra(mp, agno, irec);
> +
> +		/*
> +		 * If there's space in the buffer for more records, increment
> +		 * the btree cursor and grab more.
> +		 */
> +		if (++iwag->nr_recs < iwag->sz_recs) {
> +			error = xfs_btree_increment(cur, 0, &has_more);
> +			if (error || !has_more)
> +				break;
> +			continue;
> +		}
> +
> +		/*
> +		 * Otherwise, we need to save cursor state and run the callback
> +		 * function on the cached records.  The run_callbacks function
> +		 * is supposed to return a cursor pointing to the record where
> +		 * we would be if we had been able to increment like above.
> +		 */
> +		error = xfs_iwalk_run_callbacks(iwag, xfs_iwalk_ag_recs, agno,
> +				&cur, &agi_bp, &has_more);
> +	}
> +
> +	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
> +
> +	/* Walk any records left behind in the cache. */
> +	if (iwag->nr_recs == 0 || error)
> +		return error;
> +
> +	return xfs_iwalk_ag_recs(iwag);

Hmm, I find the above pattern to process the leftover records a bit
confusing because of how it is open coded. Could we find a way to reuse
xfs_iwalk_run_callbacks() in both cases so it looks more obvious? For
example, pass a flag to indicate whether the callback helper should
recreate the cursor for continued processing. FWIW, it looks like
has_more already reflects that state in the current logic above.
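
Roughly (untested sketch, keeping the helper's current signature and
letting has_more decide whether the cursor gets rebuilt):

	while (!error && has_more) {
		...
		error = xfs_iwalk_run_callbacks(iwag, xfs_iwalk_ag_recs, agno,
				&cur, &agi_bp, &has_more);
	}

	if (iwag->nr_recs == 0 || error)
		goto out;

	/* has_more is zero here, so the helper won't re-create the cursor. */
	error = xfs_iwalk_run_callbacks(iwag, xfs_iwalk_ag_recs, agno,
			&cur, &agi_bp, &has_more);
out:
	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
	return error;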

> +}
> +
> +/*
> + * Given the number of inodes to prefetch, set the number of inobt records that
> + * we cache in memory, which controls the number of inodes we try to read
> + * ahead.
> + *
> + * If no max prefetch was given, default to 4096 bytes' worth of inobt records;
> + * this should be plenty of inodes to read ahead.  This number (256 inobt
> + * records) was chosen so that the cache is never more than a single memory
> + * page.
> + */
> +static inline void
> +xfs_iwalk_set_prefetch(
> +	struct xfs_iwalk_ag	*iwag,
> +	unsigned int		max_prefetch)
> +{
> +	if (max_prefetch)
> +		iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) /
> +					XFS_INODES_PER_CHUNK;
> +	else
> +		iwag->sz_recs = 4096 / sizeof(struct xfs_inobt_rec_incore);
> +

Perhaps this should use PAGE_SIZE or a related macro?

Brian

> +	/*
> +	 * Allocate enough space to prefetch at least two records so that we
> +	 * can cache both the inobt record where the iwalk started and the next
> +	 * record.  This simplifies the AG inode walk loop setup code.
> +	 */
> +	iwag->sz_recs = max_t(unsigned int, iwag->sz_recs, 2);
> +}
> +
> +/*
> + * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
> + * will be called for each allocated inode, being passed the inode's number and
> + * @data.  @max_prefetch controls how many inobt records' worth of inodes we
> + * try to readahead.
> + */
> +int
> +xfs_iwalk(
> +	struct xfs_mount	*mp,
> +	struct xfs_trans	*tp,
> +	xfs_ino_t		startino,
> +	xfs_iwalk_fn		iwalk_fn,
> +	unsigned int		max_prefetch,
> +	void			*data)
> +{
> +	struct xfs_iwalk_ag	iwag = {
> +		.mp		= mp,
> +		.tp		= tp,
> +		.iwalk_fn	= iwalk_fn,
> +		.data		= data,
> +		.startino	= startino,
> +	};
> +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> +	int			error;
> +
> +	ASSERT(agno < mp->m_sb.sb_agcount);
> +
> +	xfs_iwalk_set_prefetch(&iwag, max_prefetch);
> +	error = xfs_iwalk_alloc(&iwag);
> +	if (error)
> +		return error;
> +
> +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> +		error = xfs_iwalk_ag(&iwag);
> +		if (error)
> +			break;
> +		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> +	}
> +
> +	xfs_iwalk_free(&iwag);
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
> new file mode 100644
> index 000000000000..45b1baabcd2d
> --- /dev/null
> +++ b/fs/xfs/xfs_iwalk.h
> @@ -0,0 +1,18 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> + */
> +#ifndef __XFS_IWALK_H__
> +#define __XFS_IWALK_H__
> +
> +/* Walk all inodes in the filesystem starting from @startino. */
> +typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
> +			    xfs_ino_t ino, void *data);
> +/* Return value (for xfs_iwalk_fn) that aborts the walk immediately. */
> +#define XFS_IWALK_ABORT	(1)
> +
> +int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
> +		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
> +
> +#endif /* __XFS_IWALK_H__ */
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 2464ea351f83..f9bb1d50bc0e 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -3516,6 +3516,46 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
>  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
>  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
>  
> +TRACE_EVENT(xfs_iwalk_ag,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> +		 xfs_agino_t startino),
> +	TP_ARGS(mp, agno, startino),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agino_t, startino)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->startino = startino;
> +	),
> +	TP_printk("dev %d:%d agno %d startino %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> +		  __entry->startino)
> +)
> +
> +TRACE_EVENT(xfs_iwalk_ag_rec,
> +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> +		 struct xfs_inobt_rec_incore *irec),
> +	TP_ARGS(mp, agno, irec),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_agnumber_t, agno)
> +		__field(xfs_agino_t, startino)
> +		__field(uint64_t, freemask)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->agno = agno;
> +		__entry->startino = irec->ir_startino;
> +		__entry->freemask = irec->ir_free;
> +	),
> +	TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> +		  __entry->startino, __entry->freemask)
> +)
> +
>  #endif /* _TRACE_XFS_H */
>  
>  #undef TRACE_INCLUDE_PATH
>
Darrick J. Wong June 10, 2019, 4:59 p.m. UTC | #2
On Mon, Jun 10, 2019 at 09:58:19AM -0400, Brian Foster wrote:
> On Tue, Jun 04, 2019 at 02:49:34PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Create a new iterator function to simplify walking inodes in an XFS
> > filesystem.  This new iterator will replace the existing open-coded
> > walking that goes on in various places.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/Makefile                  |    1 
> >  fs/xfs/libxfs/xfs_ialloc_btree.c |   31 +++
> >  fs/xfs/libxfs/xfs_ialloc_btree.h |    3 
> >  fs/xfs/xfs_itable.c              |    5 
> >  fs/xfs/xfs_itable.h              |    8 +
> >  fs/xfs/xfs_iwalk.c               |  400 ++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_iwalk.h               |   18 ++
> >  fs/xfs/xfs_trace.h               |   40 ++++
> >  8 files changed, 502 insertions(+), 4 deletions(-)
> >  create mode 100644 fs/xfs/xfs_iwalk.c
> >  create mode 100644 fs/xfs/xfs_iwalk.h
> > 
> > 
> ...
> > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
> > index ac4b65da4c2b..cb7eac2f51c0 100644
> > --- a/fs/xfs/libxfs/xfs_ialloc_btree.c
> > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
> > @@ -564,6 +564,34 @@ xfs_inobt_max_size(
> >  					XFS_INODES_PER_CHUNK);
> >  }
> >  
> > +/* Read AGI and create inobt cursor. */
> > +int
> > +xfs_inobt_cur(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_trans	*tp,
> > +	xfs_agnumber_t		agno,
> > +	struct xfs_btree_cur	**curpp,
> > +	struct xfs_buf		**agi_bpp)
> > +{
> > +	struct xfs_btree_cur	*cur;
> > +	int			error;
> > +
> > +	ASSERT(*agi_bpp == NULL);
> > +
> 
> FYI, the xfs_inobt_count_blocks() caller doesn't initialize the pointer
> according to the assert.

AgAGkgjwepth, there's a gcc plugin that makes all uninitialized stack
variables zero now, and so I never see these things anymore. :(

Thanks for catching this.

> 
> > +	error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
> > +	if (error)
> > +		return error;
> > +
> > +	cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, XFS_BTNUM_INO);
> > +	if (!cur) {
> > +		xfs_trans_brelse(tp, *agi_bpp);
> > +		*agi_bpp = NULL;
> > +		return -ENOMEM;
> > +	}
> > +	*curpp = cur;
> > +	return 0;
> > +}
> > +
> >  static int
> >  xfs_inobt_count_blocks(
> >  	struct xfs_mount	*mp,
> ...
> > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
> > new file mode 100644
> > index 000000000000..3e6c06e69c75
> > --- /dev/null
> > +++ b/fs/xfs/xfs_iwalk.c
> > @@ -0,0 +1,400 @@
> ...
> > +/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
> > +STATIC int
> > +xfs_iwalk_ag(
> > +	struct xfs_iwalk_ag		*iwag)
> > +{
> > +	struct xfs_mount		*mp = iwag->mp;
> > +	struct xfs_trans		*tp = iwag->tp;
> > +	struct xfs_buf			*agi_bp = NULL;
> > +	struct xfs_btree_cur		*cur = NULL;
> > +	xfs_agnumber_t			agno;
> > +	xfs_agino_t			agino;
> > +	int				has_more;
> > +	int				error = 0;
> > +
> > +	/* Set up our cursor at the right place in the inode btree. */
> > +	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
> > +	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
> > +	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
> > +
> > +	while (!error && has_more) {
> > +		struct xfs_inobt_rec_incore	*irec;
> > +
> > +		cond_resched();
> > +
> > +		/* Fetch the inobt record. */
> > +		irec = &iwag->recs[iwag->nr_recs];
> > +		error = xfs_inobt_get_rec(cur, irec, &has_more);
> > +		if (error || !has_more)
> > +			break;
> > +
> > +		/* No allocated inodes in this chunk; skip it. */
> > +		if (irec->ir_freecount == irec->ir_count) {
> > +			error = xfs_btree_increment(cur, 0, &has_more);
> > +			if (error)
> > +				break;
> > +			continue;
> > +		}
> > +
> > +		/*
> > +		 * Start readahead for this inode chunk in anticipation of
> > +		 * walking the inodes.
> > +		 */
> > +		xfs_bulkstat_ichunk_ra(mp, agno, irec);
> > +
> > +		/*
> > +		 * If there's space in the buffer for more records, increment
> > +		 * the btree cursor and grab more.
> > +		 */
> > +		if (++iwag->nr_recs < iwag->sz_recs) {
> > +			error = xfs_btree_increment(cur, 0, &has_more);
> > +			if (error || !has_more)
> > +				break;
> > +			continue;
> > +		}
> > +
> > +		/*
> > +		 * Otherwise, we need to save cursor state and run the callback
> > +		 * function on the cached records.  The run_callbacks function
> > +		 * is supposed to return a cursor pointing to the record where
> > +		 * we would be if we had been able to increment like above.
> > +		 */
> > +		error = xfs_iwalk_run_callbacks(iwag, xfs_iwalk_ag_recs, agno,
> > +				&cur, &agi_bp, &has_more);
> > +	}
> > +
> > +	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
> > +
> > +	/* Walk any records left behind in the cache. */
> > +	if (iwag->nr_recs == 0 || error)
> > +		return error;
> > +
> > +	return xfs_iwalk_ag_recs(iwag);
> 
> Hmm, I find the above pattern to process the leftover records a bit
> confusing because of how it is open coded. Could we find a way to reuse
> xfs_iwalk_run_callbacks() in both cases so it looks more obvious? For
> example, pass a flag to indicate whether the callback helper should
> recreate the cursor for continued processing. FWIW, it looks like
> has_more already reflects that state in the current logic above.

Yeah, I think this can be done without making the function too
incohesive.

> > +}
> > +
> > +/*
> > + * Given the number of inodes to prefetch, set the number of inobt records that
> > + * we cache in memory, which controls the number of inodes we try to read
> > + * ahead.
> > + *
> > + * If no max prefetch was given, default to 4096 bytes' worth of inobt records;
> > + * this should be plenty of inodes to read ahead.  This number (256 inobt
> > + * records) was chosen so that the cache is never more than a single memory
> > + * page.
> > + */
> > +static inline void
> > +xfs_iwalk_set_prefetch(
> > +	struct xfs_iwalk_ag	*iwag,
> > +	unsigned int		max_prefetch)
> > +{
> > +	if (max_prefetch)
> > +		iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) /
> > +					XFS_INODES_PER_CHUNK;
> > +	else
> > +		iwag->sz_recs = 4096 / sizeof(struct xfs_inobt_rec_incore);
> > +
> 
> Perhaps this should use PAGE_SIZE or a related macro?

It did in the previous revision, but Dave pointed out that sz_recs then
becomes quite large on a system with 64k pages...

65536 bytes / 16 bytes per inobt record = 4096 records
4096 records * 64 inodes per record = 262144 inodes
262144 inodes * 512 bytes per inode = 128MB of inode readahead

I could extend the comment to explain why we don't use PAGE_SIZE...

/*
 * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
 * constrain the amount of inode readahead to 16k inodes regardless of CPU:
 *
 * 4096 bytes / 16 bytes per inobt record = 256 inobt records
 * 256 inobt records * 64 inodes per record = 16384 inodes
 * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
 */

--D

> Brian
> 
> > +	/*
> > +	 * Allocate enough space to prefetch at least two records so that we
> > +	 * can cache both the inobt record where the iwalk started and the next
> > +	 * record.  This simplifies the AG inode walk loop setup code.
> > +	 */
> > +	iwag->sz_recs = max_t(unsigned int, iwag->sz_recs, 2);
> > +}
> > +
> > +/*
> > + * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
> > + * will be called for each allocated inode, being passed the inode's number and
> > + * @data.  @max_prefetch controls how many inobt records' worth of inodes we
> > + * try to readahead.
> > + */
> > +int
> > +xfs_iwalk(
> > +	struct xfs_mount	*mp,
> > +	struct xfs_trans	*tp,
> > +	xfs_ino_t		startino,
> > +	xfs_iwalk_fn		iwalk_fn,
> > +	unsigned int		max_prefetch,
> > +	void			*data)
> > +{
> > +	struct xfs_iwalk_ag	iwag = {
> > +		.mp		= mp,
> > +		.tp		= tp,
> > +		.iwalk_fn	= iwalk_fn,
> > +		.data		= data,
> > +		.startino	= startino,
> > +	};
> > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > +	int			error;
> > +
> > +	ASSERT(agno < mp->m_sb.sb_agcount);
> > +
> > +	xfs_iwalk_set_prefetch(&iwag, max_prefetch);
> > +	error = xfs_iwalk_alloc(&iwag);
> > +	if (error)
> > +		return error;
> > +
> > +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> > +		error = xfs_iwalk_ag(&iwag);
> > +		if (error)
> > +			break;
> > +		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> > +	}
> > +
> > +	xfs_iwalk_free(&iwag);
> > +	return error;
> > +}
> > diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
> > new file mode 100644
> > index 000000000000..45b1baabcd2d
> > --- /dev/null
> > +++ b/fs/xfs/xfs_iwalk.h
> > @@ -0,0 +1,18 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +/*
> > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > + */
> > +#ifndef __XFS_IWALK_H__
> > +#define __XFS_IWALK_H__
> > +
> > +/* Walk all inodes in the filesystem starting from @startino. */
> > +typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
> > +			    xfs_ino_t ino, void *data);
> > +/* Return value (for xfs_iwalk_fn) that aborts the walk immediately. */
> > +#define XFS_IWALK_ABORT	(1)
> > +
> > +int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
> > +		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
> > +
> > +#endif /* __XFS_IWALK_H__ */
> > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > index 2464ea351f83..f9bb1d50bc0e 100644
> > --- a/fs/xfs/xfs_trace.h
> > +++ b/fs/xfs/xfs_trace.h
> > @@ -3516,6 +3516,46 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
> >  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
> >  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
> >  
> > +TRACE_EVENT(xfs_iwalk_ag,
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > +		 xfs_agino_t startino),
> > +	TP_ARGS(mp, agno, startino),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(xfs_agino_t, startino)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->agno = agno;
> > +		__entry->startino = startino;
> > +	),
> > +	TP_printk("dev %d:%d agno %d startino %u",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> > +		  __entry->startino)
> > +)
> > +
> > +TRACE_EVENT(xfs_iwalk_ag_rec,
> > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > +		 struct xfs_inobt_rec_incore *irec),
> > +	TP_ARGS(mp, agno, irec),
> > +	TP_STRUCT__entry(
> > +		__field(dev_t, dev)
> > +		__field(xfs_agnumber_t, agno)
> > +		__field(xfs_agino_t, startino)
> > +		__field(uint64_t, freemask)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->dev = mp->m_super->s_dev;
> > +		__entry->agno = agno;
> > +		__entry->startino = irec->ir_startino;
> > +		__entry->freemask = irec->ir_free;
> > +	),
> > +	TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
> > +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> > +		  __entry->startino, __entry->freemask)
> > +)
> > +
> >  #endif /* _TRACE_XFS_H */
> >  
> >  #undef TRACE_INCLUDE_PATH
> >
Brian Foster June 10, 2019, 5:55 p.m. UTC | #3
On Mon, Jun 10, 2019 at 09:59:09AM -0700, Darrick J. Wong wrote:
> On Mon, Jun 10, 2019 at 09:58:19AM -0400, Brian Foster wrote:
> > On Tue, Jun 04, 2019 at 02:49:34PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Create a new iterator function to simplify walking inodes in an XFS
> > > filesystem.  This new iterator will replace the existing open-coded
> > > walking that goes on in various places.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/Makefile                  |    1 
> > >  fs/xfs/libxfs/xfs_ialloc_btree.c |   31 +++
> > >  fs/xfs/libxfs/xfs_ialloc_btree.h |    3 
> > >  fs/xfs/xfs_itable.c              |    5 
> > >  fs/xfs/xfs_itable.h              |    8 +
> > >  fs/xfs/xfs_iwalk.c               |  400 ++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/xfs_iwalk.h               |   18 ++
> > >  fs/xfs/xfs_trace.h               |   40 ++++
> > >  8 files changed, 502 insertions(+), 4 deletions(-)
> > >  create mode 100644 fs/xfs/xfs_iwalk.c
> > >  create mode 100644 fs/xfs/xfs_iwalk.h
> > > 
> > > 
> > ...
> > > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
> > > index ac4b65da4c2b..cb7eac2f51c0 100644
> > > --- a/fs/xfs/libxfs/xfs_ialloc_btree.c
> > > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
...
> > > +}
> > > +
> > > +/*
> > > + * Given the number of inodes to prefetch, set the number of inobt records that
> > > + * we cache in memory, which controls the number of inodes we try to read
> > > + * ahead.
> > > + *
> > > + * If no max prefetch was given, default to 4096 bytes' worth of inobt records;
> > > + * this should be plenty of inodes to read ahead.  This number (256 inobt
> > > + * records) was chosen so that the cache is never more than a single memory
> > > + * page.
> > > + */
> > > +static inline void
> > > +xfs_iwalk_set_prefetch(
> > > +	struct xfs_iwalk_ag	*iwag,
> > > +	unsigned int		max_prefetch)
> > > +{
> > > +	if (max_prefetch)
> > > +		iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) /
> > > +					XFS_INODES_PER_CHUNK;
> > > +	else
> > > +		iwag->sz_recs = 4096 / sizeof(struct xfs_inobt_rec_incore);
> > > +
> > 
> > Perhaps this should use PAGE_SIZE or a related macro?
> 
> It did in the previous revision, but Dave pointed out that sz_recs then
> becomes quite large on a system with 64k pages...
> 
> 65536 bytes / 16 bytes per inobt record = 4096 records
> 4096 records * 64 inodes per record = 262144 inodes
> 262144 inodes * 512 bytes per inode = 128MB of inode readahead
> 

Ok, the comment just gave me the impression the intent was to fill a
single page.

> I could extend the comment to explain why we don't use PAGE_SIZE...
> 

Sounds good, though what I think would be better is to define an
IWALK_DEFAULT_RECS or some such somewhere and put the calculation
details with that.

Though now that you point out the readahead thing, aren't we at risk of
a similar problem for users who happen to pass a really large userspace
buffer? Should we cap the kernel allocation/readahead window in all
cases and not just the default case?

Brian

> /*
>  * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
>  * constrain the amount of inode readahead to 16k inodes regardless of CPU:
>  *
>  * 4096 bytes / 16 bytes per inobt record = 256 inobt records
>  * 256 inobt records * 64 inodes per record = 16384 inodes
>  * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
>  */
> 
> --D
> 
> > Brian
> > 
> > > +	/*
> > > +	 * Allocate enough space to prefetch at least two records so that we
> > > +	 * can cache both the inobt record where the iwalk started and the next
> > > +	 * record.  This simplifies the AG inode walk loop setup code.
> > > +	 */
> > > +	iwag->sz_recs = max_t(unsigned int, iwag->sz_recs, 2);
> > > +}
> > > +
> > > +/*
> > > + * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
> > > + * will be called for each allocated inode, being passed the inode's number and
> > > + * @data.  @max_prefetch controls how many inobt records' worth of inodes we
> > > + * try to readahead.
> > > + */
> > > +int
> > > +xfs_iwalk(
> > > +	struct xfs_mount	*mp,
> > > +	struct xfs_trans	*tp,
> > > +	xfs_ino_t		startino,
> > > +	xfs_iwalk_fn		iwalk_fn,
> > > +	unsigned int		max_prefetch,
> > > +	void			*data)
> > > +{
> > > +	struct xfs_iwalk_ag	iwag = {
> > > +		.mp		= mp,
> > > +		.tp		= tp,
> > > +		.iwalk_fn	= iwalk_fn,
> > > +		.data		= data,
> > > +		.startino	= startino,
> > > +	};
> > > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > > +	int			error;
> > > +
> > > +	ASSERT(agno < mp->m_sb.sb_agcount);
> > > +
> > > +	xfs_iwalk_set_prefetch(&iwag, max_prefetch);
> > > +	error = xfs_iwalk_alloc(&iwag);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> > > +		error = xfs_iwalk_ag(&iwag);
> > > +		if (error)
> > > +			break;
> > > +		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> > > +	}
> > > +
> > > +	xfs_iwalk_free(&iwag);
> > > +	return error;
> > > +}
> > > diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
> > > new file mode 100644
> > > index 000000000000..45b1baabcd2d
> > > --- /dev/null
> > > +++ b/fs/xfs/xfs_iwalk.h
> > > @@ -0,0 +1,18 @@
> > > +// SPDX-License-Identifier: GPL-2.0+
> > > +/*
> > > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > > + */
> > > +#ifndef __XFS_IWALK_H__
> > > +#define __XFS_IWALK_H__
> > > +
> > > +/* Walk all inodes in the filesystem starting from @startino. */
> > > +typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
> > > +			    xfs_ino_t ino, void *data);
> > > +/* Return value (for xfs_iwalk_fn) that aborts the walk immediately. */
> > > +#define XFS_IWALK_ABORT	(1)
> > > +
> > > +int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
> > > +		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
> > > +
> > > +#endif /* __XFS_IWALK_H__ */
> > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > index 2464ea351f83..f9bb1d50bc0e 100644
> > > --- a/fs/xfs/xfs_trace.h
> > > +++ b/fs/xfs/xfs_trace.h
> > > @@ -3516,6 +3516,46 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
> > >  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
> > >  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
> > >  
> > > +TRACE_EVENT(xfs_iwalk_ag,
> > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > > +		 xfs_agino_t startino),
> > > +	TP_ARGS(mp, agno, startino),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(xfs_agnumber_t, agno)
> > > +		__field(xfs_agino_t, startino)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = mp->m_super->s_dev;
> > > +		__entry->agno = agno;
> > > +		__entry->startino = startino;
> > > +	),
> > > +	TP_printk("dev %d:%d agno %d startino %u",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> > > +		  __entry->startino)
> > > +)
> > > +
> > > +TRACE_EVENT(xfs_iwalk_ag_rec,
> > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > > +		 struct xfs_inobt_rec_incore *irec),
> > > +	TP_ARGS(mp, agno, irec),
> > > +	TP_STRUCT__entry(
> > > +		__field(dev_t, dev)
> > > +		__field(xfs_agnumber_t, agno)
> > > +		__field(xfs_agino_t, startino)
> > > +		__field(uint64_t, freemask)
> > > +	),
> > > +	TP_fast_assign(
> > > +		__entry->dev = mp->m_super->s_dev;
> > > +		__entry->agno = agno;
> > > +		__entry->startino = irec->ir_startino;
> > > +		__entry->freemask = irec->ir_free;
> > > +	),
> > > +	TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
> > > +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> > > +		  __entry->startino, __entry->freemask)
> > > +)
> > > +
> > >  #endif /* _TRACE_XFS_H */
> > >  
> > >  #undef TRACE_INCLUDE_PATH
> > >
Darrick J. Wong June 10, 2019, 11:11 p.m. UTC | #4
On Mon, Jun 10, 2019 at 01:55:10PM -0400, Brian Foster wrote:
> On Mon, Jun 10, 2019 at 09:59:09AM -0700, Darrick J. Wong wrote:
> > On Mon, Jun 10, 2019 at 09:58:19AM -0400, Brian Foster wrote:
> > > On Tue, Jun 04, 2019 at 02:49:34PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Create a new iterator function to simplify walking inodes in an XFS
> > > > filesystem.  This new iterator will replace the existing open-coded
> > > > walking that goes on in various places.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  fs/xfs/Makefile                  |    1 
> > > >  fs/xfs/libxfs/xfs_ialloc_btree.c |   31 +++
> > > >  fs/xfs/libxfs/xfs_ialloc_btree.h |    3 
> > > >  fs/xfs/xfs_itable.c              |    5 
> > > >  fs/xfs/xfs_itable.h              |    8 +
> > > >  fs/xfs/xfs_iwalk.c               |  400 ++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/xfs_iwalk.h               |   18 ++
> > > >  fs/xfs/xfs_trace.h               |   40 ++++
> > > >  8 files changed, 502 insertions(+), 4 deletions(-)
> > > >  create mode 100644 fs/xfs/xfs_iwalk.c
> > > >  create mode 100644 fs/xfs/xfs_iwalk.h
> > > > 
> > > > 
> > > ...
> > > > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
> > > > index ac4b65da4c2b..cb7eac2f51c0 100644
> > > > --- a/fs/xfs/libxfs/xfs_ialloc_btree.c
> > > > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
> ...
> > > > +}
> > > > +
> > > > +/*
> > > > + * Given the number of inodes to prefetch, set the number of inobt records that
> > > > + * we cache in memory, which controls the number of inodes we try to read
> > > > + * ahead.
> > > > + *
> > > > + * If no max prefetch was given, default to 4096 bytes' worth of inobt records;
> > > > + * this should be plenty of inodes to read ahead.  This number (256 inobt
> > > > + * records) was chosen so that the cache is never more than a single memory
> > > > + * page.
> > > > + */
> > > > +static inline void
> > > > +xfs_iwalk_set_prefetch(
> > > > +	struct xfs_iwalk_ag	*iwag,
> > > > +	unsigned int		max_prefetch)
> > > > +{
> > > > +	if (max_prefetch)
> > > > +		iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) /
> > > > +					XFS_INODES_PER_CHUNK;
> > > > +	else
> > > > +		iwag->sz_recs = 4096 / sizeof(struct xfs_inobt_rec_incore);
> > > > +
> > > 
> > > Perhaps this should use PAGE_SIZE or a related macro?
> > 
> > It did in the previous revision, but Dave pointed out that sz_recs then
> > becomes quite large on a system with 64k pages...
> > 
> > 65536 bytes / 16 bytes per inobt record = 4096 records
> > 4096 records * 64 inodes per record = 262144 inodes
> > 262144 inodes * 512 bytes per inode = 128MB of inode readahead
> > 
> 
> Ok, the comment just gave me the impression the intent was to fill a
> single page.
> 
> > I could extend the comment to explain why we don't use PAGE_SIZE...
> > 
> 
> Sounds good, though what I think would be better is to define a
> IWALK_DEFAULT_RECS or some such somewhere and put the calculation
> details with that.
> 
> Though now that you point out the readahead thing, aren't we at risk of
> a similar problem for users who happen to pass a really large userspace
> buffer? Should we cap the kernel allocation/readahead window in all
> cases and not just the default case?

Hmm, that's right, we don't want to let userspace arbitrarily determine
the size of the buffer, and I think the current implementation caps
the readahead at ... oh, PAGE_SIZE / sizeof(xfs_inogrp_t).

Oh, right, and in the V1 patchset Dave said that we should constrain
readahead even further.

--D

> 
> Brian
> 
> > /*
> >  * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
> >  * constrain the amount of inode readahead to 16k inodes regardless of CPU:
> >  *
> >  * 4096 bytes / 16 bytes per inobt record = 256 inobt records
> >  * 256 inobt records * 64 inodes per record = 16384 inodes
> >  * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
> >  */
> > 
> > --D
> > 
> > > Brian
> > > 
> > > > +	/*
> > > > +	 * Allocate enough space to prefetch at least two records so that we
> > > > +	 * can cache both the inobt record where the iwalk started and the next
> > > > +	 * record.  This simplifies the AG inode walk loop setup code.
> > > > +	 */
> > > > +	iwag->sz_recs = max_t(unsigned int, iwag->sz_recs, 2);
> > > > +}
> > > > +
> > > > +/*
> > > > + * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
> > > > + * will be called for each allocated inode, being passed the inode's number and
> > > > + * @data.  @max_prefetch controls how many inobt records' worth of inodes we
> > > > + * try to readahead.
> > > > + */
> > > > +int
> > > > +xfs_iwalk(
> > > > +	struct xfs_mount	*mp,
> > > > +	struct xfs_trans	*tp,
> > > > +	xfs_ino_t		startino,
> > > > +	xfs_iwalk_fn		iwalk_fn,
> > > > +	unsigned int		max_prefetch,
> > > > +	void			*data)
> > > > +{
> > > > +	struct xfs_iwalk_ag	iwag = {
> > > > +		.mp		= mp,
> > > > +		.tp		= tp,
> > > > +		.iwalk_fn	= iwalk_fn,
> > > > +		.data		= data,
> > > > +		.startino	= startino,
> > > > +	};
> > > > +	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
> > > > +	int			error;
> > > > +
> > > > +	ASSERT(agno < mp->m_sb.sb_agcount);
> > > > +
> > > > +	xfs_iwalk_set_prefetch(&iwag, max_prefetch);
> > > > +	error = xfs_iwalk_alloc(&iwag);
> > > > +	if (error)
> > > > +		return error;
> > > > +
> > > > +	for (; agno < mp->m_sb.sb_agcount; agno++) {
> > > > +		error = xfs_iwalk_ag(&iwag);
> > > > +		if (error)
> > > > +			break;
> > > > +		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
> > > > +	}
> > > > +
> > > > +	xfs_iwalk_free(&iwag);
> > > > +	return error;
> > > > +}
> > > > diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
> > > > new file mode 100644
> > > > index 000000000000..45b1baabcd2d
> > > > --- /dev/null
> > > > +++ b/fs/xfs/xfs_iwalk.h
> > > > @@ -0,0 +1,18 @@
> > > > +// SPDX-License-Identifier: GPL-2.0+
> > > > +/*
> > > > + * Copyright (C) 2019 Oracle.  All Rights Reserved.
> > > > + * Author: Darrick J. Wong <darrick.wong@oracle.com>
> > > > + */
> > > > +#ifndef __XFS_IWALK_H__
> > > > +#define __XFS_IWALK_H__
> > > > +
> > > > +/* Walk all inodes in the filesystem starting from @startino. */
> > > > +typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
> > > > +			    xfs_ino_t ino, void *data);
> > > > +/* Return value (for xfs_iwalk_fn) that aborts the walk immediately. */
> > > > +#define XFS_IWALK_ABORT	(1)
> > > > +
> > > > +int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
> > > > +		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
> > > > +
> > > > +#endif /* __XFS_IWALK_H__ */
> > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > > index 2464ea351f83..f9bb1d50bc0e 100644
> > > > --- a/fs/xfs/xfs_trace.h
> > > > +++ b/fs/xfs/xfs_trace.h
> > > > @@ -3516,6 +3516,46 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
> > > >  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
> > > >  DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
> > > >  
> > > > +TRACE_EVENT(xfs_iwalk_ag,
> > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > > > +		 xfs_agino_t startino),
> > > > +	TP_ARGS(mp, agno, startino),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_agnumber_t, agno)
> > > > +		__field(xfs_agino_t, startino)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = mp->m_super->s_dev;
> > > > +		__entry->agno = agno;
> > > > +		__entry->startino = startino;
> > > > +	),
> > > > +	TP_printk("dev %d:%d agno %d startino %u",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> > > > +		  __entry->startino)
> > > > +)
> > > > +
> > > > +TRACE_EVENT(xfs_iwalk_ag_rec,
> > > > +	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
> > > > +		 struct xfs_inobt_rec_incore *irec),
> > > > +	TP_ARGS(mp, agno, irec),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_agnumber_t, agno)
> > > > +		__field(xfs_agino_t, startino)
> > > > +		__field(uint64_t, freemask)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = mp->m_super->s_dev;
> > > > +		__entry->agno = agno;
> > > > +		__entry->startino = irec->ir_startino;
> > > > +		__entry->freemask = irec->ir_free;
> > > > +	),
> > > > +	TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
> > > > +		  __entry->startino, __entry->freemask)
> > > > +)
> > > > +
> > > >  #endif /* _TRACE_XFS_H */
> > > >  
> > > >  #undef TRACE_INCLUDE_PATH
> > > >
Dave Chinner June 11, 2019, 10:33 p.m. UTC | #5
On Mon, Jun 10, 2019 at 04:11:34PM -0700, Darrick J. Wong wrote:
> On Mon, Jun 10, 2019 at 01:55:10PM -0400, Brian Foster wrote:
> > > I could extend the comment to explain why we don't use PAGE_SIZE...
> > > 
> > 
> > Sounds good, though what I think would be better is to define an
> > IWALK_DEFAULT_RECS or some such somewhere and put the calculation
> > details with that.
> > 
> > Though now that you point out the readahead thing, aren't we at risk of
> > a similar problem for users who happen to pass a really large userspace
> > buffer? Should we cap the kernel allocation/readahead window in all
> > cases and not just the default case?
> 
> Hmm, that's right, we don't want to let userspace arbitrarily determine
> the size of the buffer, and I think the current implementation caps
> the readahead at ... oh, PAGE_SIZE / sizeof(xfs_inogrp_t).
> 
> Oh, right, and in the V1 patchset Dave said that we should constrain
> readahead even further.

Right, I should explain a bit further why, too - it's about
performance.  I've found that a user buffer size of ~1024 inodes is
generally enough to max out performance of bulkstat. i.e. somewhere
around 1000 inodes per syscall is enough to mostly amortise all of
the cost of syscall, setup, readahead, etc vs the CPU overhead of
copying all the inodes into the user buffer.

Once the user buffer goes over a few thousand inodes, performance
then starts to tail back off - we don't get any gains from trying to
bulkstat tens of thousands of inodes at a time, especially under
memory pressure because that can push us into readahead and buffer
cache thrashing.

> > > /*
> > >  * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
> > >  * constrain the amount of inode readahead to 16k inodes regardless of CPU:
> > >  *
> > >  * 4096 bytes / 16 bytes per inobt record = 256 inobt records
> > >  * 256 inobt records * 64 inodes per record = 16384 inodes
> > >  * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
> > >  */

Hence I suspect that even this is overkill - it makes no sense to
have a huge readahead window when there has been no measurable
performance benefit to doing large inode count bulkstat syscalls.

And, FWIW, readahead probably should also be capped at what the user
buffer can hold - no point in reading 16k inodes when the output
buffer can only fit 1000 inodes...
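
i.e. something like this, where icount is the number of slots in the
user buffer (bounds illustrative):

	/* No more readahead than the user buffer can absorb... */
	max_prefetch = min(max_prefetch, icount);

	/* ...and no more than the ~1024 inodes that max out bulkstat. */
	max_prefetch = min_t(unsigned int, max_prefetch, 1024);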

Cheers,

Dave.
Darrick J. Wong June 11, 2019, 11:05 p.m. UTC | #6
On Wed, Jun 12, 2019 at 08:33:41AM +1000, Dave Chinner wrote:
> On Mon, Jun 10, 2019 at 04:11:34PM -0700, Darrick J. Wong wrote:
> > On Mon, Jun 10, 2019 at 01:55:10PM -0400, Brian Foster wrote:
> > > > I could extend the comment to explain why we don't use PAGE_SIZE...
> > > > 
> > > 
> > > Sounds good, though what I think would be better is to define an
> > > IWALK_DEFAULT_RECS or some such somewhere and put the calculation
> > > details with that.
> > > 
> > > Though now that you point out the readahead thing, aren't we at risk of
> > > a similar problem for users who happen to pass a really large userspace
> > > buffer? Should we cap the kernel allocation/readahead window in all
> > > cases and not just the default case?
> > 
> > Hmm, that's right, we don't want to let userspace arbitrarily determine
> > the size of the buffer, and I think the current implementation caps
> > the readahead at ... oh, PAGE_SIZE / sizeof(xfs_inogrp_t).
> > 
> > Oh, right, and in the V1 patchset Dave said that we should constrain
> > readahead even further.
> 
> Right, I should explain a bit further why, too - it's about
> performance.  I've found that a user buffer size of ~1024 inodes is
> generally enough to max out performance of bulkstat. i.e. somewhere
> around 1000 inodes per syscall is enough to mostly amortise all of
> the cost of syscall, setup, readahead, etc vs the CPU overhead of
> copying all the inodes into the user buffer.
> 
> Once the user buffer goes over a few thousand inodes, performance
> then starts to tail back off - we don't get any gains from trying to
> bulkstat tens of thousands of inodes at a time, especially under
> memory pressure because that can push us into readahead and buffer
> cache thrashing.

<nod> I don't mind setting the max inobt record cache buffer size to a
smaller value (1024 bytes == 4096 inodes readahead?) so we can get a
little farther into future hardware scalability (or decreases in syscall
performance :P).

I guess the question here is how to relate the number of inodes the user
asked for to how many inobt records we have to read to find that many
allocated inodes?  Or in other words, what's the average ir_freecount
across all the inobt records?

Note that this is technically a decrease since the old code would
reserve 16K for this purpose...

> > > > /*
> > > >  * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
> > > >  * constrain the amount of inode readahead to 16k inodes regardless of CPU:
> > > >  *
> > > >  * 4096 bytes / 16 bytes per inobt record = 256 inobt records
> > > >  * 256 inobt records * 64 inodes per record = 16384 inodes
> > > >  * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
> > > >  */
> 
> Hence I suspect that even this is overkill - it makes no sense to
> have a huge readahead window when there has been no measurable
> performance benefit to doing large inode count bulkstat syscalls.
> 
> And, FWIW, readahead probably should also be capped at what the user
> buffer can hold - no point in reading 16k inodes when the output
> buffer can only fit 1000 inodes...

It already is -- the icount parameter from userspace is (eventually) fed
to xfs_iwalk_set_prefetch().

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Brian Foster June 12, 2019, 12:13 p.m. UTC | #7
On Tue, Jun 11, 2019 at 04:05:14PM -0700, Darrick J. Wong wrote:
> On Wed, Jun 12, 2019 at 08:33:41AM +1000, Dave Chinner wrote:
> > On Mon, Jun 10, 2019 at 04:11:34PM -0700, Darrick J. Wong wrote:
> > > On Mon, Jun 10, 2019 at 01:55:10PM -0400, Brian Foster wrote:
> > > > > I could extend the comment to explain why we don't use PAGE_SIZE...
> > > > > 
> > > > 
> > > > Sounds good, though what I think would be better is to define an
> > > > IWALK_DEFAULT_RECS or some such somewhere and put the calculation
> > > > details with that.
> > > > 
> > > > Though now that you point out the readahead thing, aren't we at risk of
> > > > a similar problem for users who happen to pass a really large userspace
> > > > buffer? Should we cap the kernel allocation/readahead window in all
> > > > cases and not just the default case?
> > > 
> > > Hmm, that's right, we don't want to let userspace arbitrarily determine
> > > the size of the buffer, and I think the current implementation caps
> > > the readahead at ... oh, PAGE_SIZE / sizeof(xfs_inogrp_t).
> > > 
> > > Oh, right, and in the V1 patchset Dave said that we should constrain
> > > readahead even further.
> > 
> > Right, I should explain a bit further why, too - it's about
> > performance.  I've found that a user buffer size of ~1024 inodes is
> > generally enough to max out performance of bulkstat. i.e. somewhere
> > around 1000 inodes per syscall is enough to mostly amortise all of
> > the cost of syscall, setup, readahead, etc vs the CPU overhead of
> > copying all the inodes into the user buffer.
> > 
> > Once the user buffer goes over a few thousand inodes, performance
> > then starts to tail back off - we don't get any gains from trying to
> > bulkstat tens of thousands of inodes at a time, especially under
> > memory pressure because that can push us into readahead and buffer
> > cache thrashing.
> 
> <nod> I don't mind setting the max inobt record cache buffer size to a
> smaller value (1024 bytes == 4096 inodes readahead?) so we can get a
> little farther into future hardware scalability (or decreases in syscall
> performance :P).
> 

The 1k baseline presumably applies to the current code. Taking a closer
look at the current code, we unconditionally allocate a 4 page record
buffer and start to fill it. For every record we grab, we issue
readahead on the underlying clusters.

Hmm, that seems like generally what this patchset is doing aside from
the more variable record buffer allocation. I'm fine with changing
things like record buffer allocation, readahead semantics, etc. given
Dave's practical analysis above, but TBH I don't think that should all
be part of the same patch. IMO, this rework patch should maintain as
close as possible to current behavior and a subsequent patches in the
series can tweak record buffer size and whatnot to improve readahead
logic. That makes this all easier to review, discuss and maintain in the
event of regression.

> I guess the question here is how to relate the number of inodes the user
> asked for to how many inobt records we have to read to find that many
> allocated inodes?  Or in other words, what's the average ir_freecount
> across all the inobt records?
> 

The current code basically looks like it allocates an oversized buffer
and hopes for the best with regard to readahead. If we took a similar
approach in terms of overestimating the buffer size (assuming not all
inode records are fully allocated), I suppose we could also track the
number of cluster readaheads issued and govern the collect/drain
sequences of the record buffer based on that..? But again, I think we
should have this as a separate "xfs: make iwalk readahead smarter ..."
patch that documents Dave's analysis above, perhaps includes some
numbers, etc..

> Note that this is technically a decrease since the old code would
> reserve 16K for this purpose...
> 

Indeed.

Brian

> > > > > /*
> > > > >  * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
> > > > >  * constrain the amount of inode readahead to 16k inodes regardless of CPU:
> > > > >  *
> > > > >  * 4096 bytes / 16 bytes per inobt record = 256 inobt records
> > > > >  * 256 inobt records * 64 inodes per record = 16384 inodes
> > > > >  * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
> > > > >  */
> > 
> > Hence I suspect that even this is overkill - it makes no sense to
> > have a huge readahead window when there has been no measurable
> > performance benefit to doing large inode count bulkstat syscalls.
> > 
> > And, FWIW, readahead probably should also be capped at what the user
> > buffer can hold - no point in reading 16k inodes when the output
> > buffer can only fit 1000 inodes...
> 
> It already is -- the icount parameter from userspace is (eventually) fed
> to xfs_iwalk_set_prefetch().
> 
> --D
> 
> > Cheers,
> > 
> > Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
Darrick J. Wong June 12, 2019, 4:53 p.m. UTC | #8
On Wed, Jun 12, 2019 at 08:13:10AM -0400, Brian Foster wrote:
> On Tue, Jun 11, 2019 at 04:05:14PM -0700, Darrick J. Wong wrote:
> > On Wed, Jun 12, 2019 at 08:33:41AM +1000, Dave Chinner wrote:
> > > On Mon, Jun 10, 2019 at 04:11:34PM -0700, Darrick J. Wong wrote:
> > > > On Mon, Jun 10, 2019 at 01:55:10PM -0400, Brian Foster wrote:
> > > > > > I could extend the comment to explain why we don't use PAGE_SIZE...
> > > > > > 
> > > > > 
> > > > > Sounds good, though what I think would be better is to define an
> > > > > IWALK_DEFAULT_RECS or some such somewhere and put the calculation
> > > > > details with that.
> > > > > 
> > > > > Though now that you point out the readahead thing, aren't we at risk of
> > > > > a similar problem for users who happen to pass a really large userspace
> > > > > buffer? Should we cap the kernel allocation/readahead window in all
> > > > > cases and not just the default case?
> > > > 
> > > > Hmm, that's right, we don't want to let userspace arbitrarily determine
> > > > the size of the buffer, and I think the current implementation caps
> > > > the readahead at ... oh, PAGE_SIZE / sizeof(xfs_inogrp_t).
> > > > 
> > > > Oh, right, and in the V1 patchset Dave said that we should constrain
> > > > readahead even further.
> > > 
> > > Right, I should explain a bit further why, too - it's about
> > > performance.  I've found that a user buffer size of ~1024 inodes is
> > > generally enough to max out performance of bulkstat. i.e. somewhere
> > > around 1000 inodes per syscall is enough to mostly amortise all of
> > > the cost of syscall, setup, readahead, etc vs the CPU overhead of
> > > copying all the inodes into the user buffer.
> > > 
> > > Once the user buffer goes over a few thousand inodes, performance
> > > then starts to tail back off - we don't get any gains from trying to
> > > bulkstat tens of thousands of inodes at a time, especially under
> > > memory pressure because that can push us into readahead and buffer
> > > cache thrashing.
> > 
> > <nod> I don't mind setting the max inobt record cache buffer size to a
> > smaller value (1024 bytes == 4096 inodes readahead?) so we can get a
> > little farther into future hardware scalability (or decreases in syscall
> > performance :P).
> > 
> 
> The 1k baseline presumably applies to the current code. Taking a closer
> look at the current code, we unconditionally allocate a 4 page record
> buffer and start to fill it. For every record we grab, we issue
> readahead on the underlying clusters.
> 
> Hmm, that seems like generally what this patchset is doing aside from
> the more variable record buffer allocation. I'm fine with changing
> things like record buffer allocation, readahead semantics, etc. given
> Dave's practical analysis above, but TBH I don't think that should all
> be part of the same patch. IMO, this rework patch should stay as
> close as possible to current behavior, and subsequent patches in the
> series can tweak record buffer size and whatnot to improve readahead
> logic. That makes this all easier to review, discuss and maintain in the
> event of regression.

This also got me thinking that I should look up what INUMBERS does,
which is that it uses the icount to determine the cache size, up to a
maximum of PAGE_SIZE.  Since INUMBERS doesn't issue readahead I think
it's fine to preserve that behavior too... and probably with a separate
xfs_inobt_walk_set_prefetch function.
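
A sketch of that split (sizing purely by record count, no readahead,
capped at a page's worth of records):

static inline void
xfs_inobt_walk_set_prefetch(
	struct xfs_iwalk_ag	*iwag,
	unsigned int		max_prefetch)
{
	unsigned int		max_recs = PAGE_SIZE /
					sizeof(struct xfs_inobt_rec_incore);

	if (!max_prefetch || max_prefetch > max_recs)
		max_prefetch = max_recs;

	iwag->sz_recs = max_t(unsigned int, max_prefetch, 2);
}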

> > I guess the question here is how to relate the number of inodes the user
> > asked for to how many inobt records we have to read to find that many
> > allocated inodes?  Or in other words, what's the average ir_freecount
> > across all the inobt records?
> > 
> 
> The current code basically looks like it allocates an oversized buffer
> and hopes for the best with regard to readahead. If we took a similar
> approach in terms of overestimating the buffer size (assuming not all
> inode records are fully allocated), I suppose we could also track the
> number of cluster readaheads issued and govern the collect/drain
> sequences of the record buffer based on that..? But again, I think we
> should have this as a separate "xfs: make iwalk readahead smarter ..."
> patch that documents Dave's analysis above, perhaps includes some
> numbers, etc..

Ok.

--D

> > Note that this is technically a decrease since the old code would
> > reserve 16K for this purpose...
> > 
> 
> Indeed.
> 
> Brian
> 
> > > > > > /*
> > > > > >  * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
> > > > > >  * constrain the amount of inode readahead to 16k inodes regardless of CPU:
> > > > > >  *
> > > > > >  * 4096 bytes / 16 bytes per inobt record = 256 inobt records
> > > > > >  * 256 inobt records * 64 inodes per record = 16384 inodes
> > > > > >  * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
> > > > > >  */
> > > 
> > > Hence I suspect that even this is overkill - it makes no sense to
> > > have a huge readahead window when there has been no measurable
> > > performance benefit to doing large inode count bulkstat syscalls.
> > > 
> > > And, FWIW, readahead probably should also be capped at what the user
> > > buffer can hold - no point in reading 16k inodes when the output
> > > buffer can only fit 1000 inodes...
> > 
> > It already is -- the icount parameter from userspace is (eventually) fed
> > to xfs_iwalk-set_prefetch.
> > 
> > --D
> > 
> > > Cheers,
> > > 
> > > Dave.
> > > -- 
> > > Dave Chinner
> > > david@fromorbit.com
Darrick J. Wong June 12, 2019, 5:54 p.m. UTC | #9
On Tue, Jun 11, 2019 at 04:05:14PM -0700, Darrick J. Wong wrote:
> On Wed, Jun 12, 2019 at 08:33:41AM +1000, Dave Chinner wrote:
> > On Mon, Jun 10, 2019 at 04:11:34PM -0700, Darrick J. Wong wrote:
> > > On Mon, Jun 10, 2019 at 01:55:10PM -0400, Brian Foster wrote:
> > > > > I could extend the comment to explain why we don't use PAGE_SIZE...
> > > > > 
> > > > 
> > > > Sounds good, though what I think would be better is to define an
> > > > IWALK_DEFAULT_RECS or some such somewhere and put the calculation
> > > > details with that.
> > > > 
> > > > Though now that you point out the readahead thing, aren't we at risk of
> > > > a similar problem for users who happen to pass a really large userspace
> > > > buffer? Should we cap the kernel allocation/readahead window in all
> > > > cases and not just the default case?
> > > 
> > > Hmm, that's right, we don't want to let userspace arbitrarily determine
> > > the size of the buffer, and I think the current implementation caps
> > > the readahead at ... oh, PAGE_SIZE / sizeof(xfs_inogrp_t).
> > > 
> > > Oh, right, and in the V1 patchset Dave said that we should constrain
> > > readahead even further.
> > 
> > Right, I should explain a bit further why, too - it's about
> > performance.  I've found that a user buffer size of ~1024 inodes is
> > generally enough to max out performance of bulkstat. i.e. somewhere
> > around 1000 inodes per syscall is enough to mostly amortise all of
> > the cost of syscall, setup, readahead, etc vs the CPU overhead of
> > copying all the inodes into the user buffer.
> > 
> > Once the user buffer goes over a few thousand inodes, performance
> > then starts to tail back off - we don't get any gains from trying to
> > bulkstat tens of thousands of inodes at a time, especially under
> > memory pressure because that can push us into readahead and buffer
> > cache thrashing.
> 
> <nod> I don't mind setting the max inobt record cache buffer size to a
> smaller value (1024 bytes == 4096 inodes readahead?) so we can get a
> little farther into future hardware scalability (or decreases in syscall
> performance :P).
> 
> I guess the question here is how to relate the number of inodes the user
> asked for to how many inobt records we have to read to find that many
> allocated inodes?  Or in other words, what's the average ir_freecount
> across all the inobt records?

Partial results computed by using xfs_db to dump all the inobt records
in the fs to look at average freecounts, and the number of inobt records
with zero freecount:

Development workstation / and /home:
4678 total, zerofree 72.49%, avg freecount 4.24
107940 total, zerofree 94.84%, avg freecount 0.78

This laptop /, /home, and /boot:
4613 total, zerofree 80.73%, avg freecount 3.11
49660 total, zerofree 99.54%, avg freecount 0.04
10 total, zerofree 20.00%, avg freecount 27.40

Backup server / and /home:
3897 total, zerofree 22.99%, avg freecount 27.08
55995 total, zerofree 99.87%, avg freecount 0.01

(Note that the root fs is nearly out of space now, thanks journald...)

xfstests host / and $TEST_DIR:
1834 total, zerofree 76.28%, avg freecount 3.31
20097 total, zerofree 83.41%, avg freecount 3.62

The script I used to generate these reports is attached.  From this
admittedly cursory output I "conclude" that bulkstat could get away with
prefetching (icount / 48) inobt records up to a max of 1000 inodes.
A more conservative estimate would be (icount / 32) inobt records.
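
To make that concrete, here's a rough sketch of what such scaling might
look like (a hypothetical helper, not part of this patch; the /32
divisor and the ~1000-inode cap are only the estimates above):

/*
 * Hypothetical sketch: size the inobt record cache from the number of
 * inodes the caller asked for.  Use the conservative estimate of one
 * inobt record per 32 allocated inodes, cap the readahead window at
 * roughly 1000 inodes' worth of records, and always keep at least two
 * records so the walk setup code still works.
 */
static inline unsigned int
xfs_iwalk_prefetch_sketch(
	unsigned int		icount)
{
	unsigned int		inobt_records = (icount + 31) / 32;

	inobt_records = min(inobt_records, 1000U / XFS_INODES_PER_CHUNK);
	return max(inobt_records, 2U);
}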

--D

> 
> Note that this is technically a decrease since the old code would
> reserve 16K for this purpose...
> 
> > > > > /*
> > > > >  * Note: We hardcode 4096 here (instead of, say, PAGE_SIZE) because we want to
> > > > >  * constrain the amount of inode readahead to 16k inodes regardless of CPU:
> > > > >  *
> > > > >  * 4096 bytes / 16 bytes per inobt record = 256 inobt records
> > > > >  * 256 inobt records * 64 inodes per record = 16384 inodes
> > > > >  * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead
> > > > >  */
> > 
> > Hence I suspect that even this is overkill - it makes no sense to
> > have a huge readahead window when there has been no measurable
> > performance benefit to doing large inode count bulkstat syscalls.
> > 
> > And, FWIW, readahead probably should also be capped at what the user
> > buffer can hold - no point in reading 16k inodes when the output
> > buffer can only fit 1000 inodes...
> 
> It already is -- the icount parameter from userspace is (eventually) fed
> to xfs_iwalk_set_prefetch.
> 
> --D
> 
> > Cheers,
> > 
> > Dave.
> > -- 
> > Dave Chinner
> > david@fromorbit.com
diff mbox series

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 91831975363b..74d30ef0dbce 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -80,6 +80,7 @@  xfs-y				+= xfs_aops.o \
 				   xfs_iops.o \
 				   xfs_inode.o \
 				   xfs_itable.o \
+				   xfs_iwalk.o \
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index ac4b65da4c2b..cb7eac2f51c0 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -564,6 +564,34 @@  xfs_inobt_max_size(
 					XFS_INODES_PER_CHUNK);
 }
 
+/* Read AGI and create inobt cursor. */
+int
+xfs_inobt_cur(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp)
+{
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	ASSERT(*agi_bpp == NULL);
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
+	if (error)
+		return error;
+
+	cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, XFS_BTNUM_INO);
+	if (!cur) {
+		xfs_trans_brelse(tp, *agi_bpp);
+		*agi_bpp = NULL;
+		return -ENOMEM;
+	}
+	*curpp = cur;
+	return 0;
+}
+
 static int
 xfs_inobt_count_blocks(
 	struct xfs_mount	*mp,
@@ -576,11 +604,10 @@  xfs_inobt_count_blocks(
 	struct xfs_btree_cur	*cur;
 	int			error;
 
-	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	error = xfs_inobt_cur(mp, tp, agno, &cur, &agbp);
 	if (error)
 		return error;
 
-	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
 	error = xfs_btree_count_blocks(cur, tree_blocks);
 	xfs_btree_del_cursor(cur, error);
 	xfs_trans_brelse(tp, agbp);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index ebdd0c6b8766..1bc44b4a2b6c 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -64,5 +64,8 @@  int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
 		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
 extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
+int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
+		xfs_agnumber_t agno, struct xfs_btree_cur **curpp,
+		struct xfs_buf **agi_bpp);
 
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index eef307cf90a7..3ca1c454afe6 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -19,6 +19,7 @@ 
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_health.h"
+#include "xfs_iwalk.h"
 
 /*
  * Return stat information for one inode.
@@ -161,7 +162,7 @@  xfs_bulkstat_one(
  * Loop over all clusters in a chunk for a given incore inode allocation btree
  * record.  Do a readahead if there are any allocated inodes in that cluster.
  */
-STATIC void
+void
 xfs_bulkstat_ichunk_ra(
 	struct xfs_mount		*mp,
 	xfs_agnumber_t			agno,
@@ -195,7 +196,7 @@  xfs_bulkstat_ichunk_ra(
  * are some left allocated, update the data for the pointed-to record as well as
  * return the count of grabbed inodes.
  */
-STATIC int
+int
 xfs_bulkstat_grab_ichunk(
 	struct xfs_btree_cur		*cur,	/* btree cursor */
 	xfs_agino_t			agino,	/* starting inode of chunk */
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 8a822285b671..369e3f159d4e 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -84,4 +84,12 @@  xfs_inumbers(
 	void			__user *buffer, /* buffer with inode info */
 	inumbers_fmt_pf		formatter);
 
+/* Temporarily needed while we refactor functions. */
+struct xfs_btree_cur;
+struct xfs_inobt_rec_incore;
+void xfs_bulkstat_ichunk_ra(struct xfs_mount *mp, xfs_agnumber_t agno,
+		struct xfs_inobt_rec_incore *irec);
+int xfs_bulkstat_grab_ichunk(struct xfs_btree_cur *cur, xfs_agino_t agino,
+		int *icount, struct xfs_inobt_rec_incore *irec);
+
 #endif	/* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
new file mode 100644
index 000000000000..3e6c06e69c75
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.c
@@ -0,0 +1,400 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_health.h"
+#include "xfs_trans.h"
+#include "xfs_iwalk.h"
+
+/*
+ * Walking Inodes in the Filesystem
+ * ================================
+ *
+ * This iterator function walks a subset of filesystem inodes in increasing
+ * order from @startino until there are no more inodes.  For each allocated
+ * inode it finds, it calls a walk function with the relevant inode number and
+ * a pointer to caller-provided data.  The walk function can return the usual
+ * negative error code to stop the iteration; 0 to continue the iteration; or
+ * XFS_IWALK_ABORT to stop the iteration.  This return value is returned to the
+ * caller.
+ *
+ * Internally, we allow the walk function to do anything, which means that we
+ * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
+ * therefore cache the inobt records in kernel memory and only call the walk
+ * function when our memory buffer is full.  @nr_recs is the number of records
+ * that we've cached, and @sz_recs is the size of our cache.
+ *
+ * It is the responsibility of the walk function to ensure it accesses
+ * allocated inodes, as the inobt records may be stale by the time they are
+ * acted upon.
+ */
+
+struct xfs_iwalk_ag {
+	struct xfs_mount		*mp;
+	struct xfs_trans		*tp;
+
+	/* Where do we start the traversal? */
+	xfs_ino_t			startino;
+
+	/* Array of inobt records we cache. */
+	struct xfs_inobt_rec_incore	*recs;
+
+	/* Number of entries allocated for the @recs array. */
+	unsigned int			sz_recs;
+
+	/* Number of entries in the @recs array that are in use. */
+	unsigned int			nr_recs;
+
+	/* Inode walk function and data pointer. */
+	xfs_iwalk_fn			iwalk_fn;
+	void				*data;
+};
+
+/* Allocate memory for a walk. */
+STATIC int
+xfs_iwalk_alloc(
+	struct xfs_iwalk_ag	*iwag)
+{
+	size_t			size;
+
+	ASSERT(iwag->recs == NULL);
+	iwag->nr_recs = 0;
+
+	/* Allocate a prefetch buffer for inobt records. */
+	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
+	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+	if (iwag->recs == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Free memory we allocated for a walk. */
+STATIC void
+xfs_iwalk_free(
+	struct xfs_iwalk_ag	*iwag)
+{
+	kmem_free(iwag->recs);
+}
+
+/* For each inuse inode in each cached inobt record, call our function. */
+STATIC int
+xfs_iwalk_ag_recs(
+	struct xfs_iwalk_ag		*iwag)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	xfs_ino_t			ino;
+	unsigned int			i, j;
+	xfs_agnumber_t			agno;
+	int				error;
+
+	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+	for (i = 0; i < iwag->nr_recs; i++) {
+		struct xfs_inobt_rec_incore	*irec = &iwag->recs[i];
+
+		trace_xfs_iwalk_ag_rec(mp, agno, irec);
+
+		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+			/* Skip if this inode is free */
+			if (XFS_INOBT_MASK(j) & irec->ir_free)
+				continue;
+
+			/* Otherwise call our function. */
+			ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
+			error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
+			if (error)
+				return error;
+		}
+	}
+
+	return 0;
+}
+
+/* Delete cursor and let go of AGI. */
+static inline void
+xfs_iwalk_del_inobt(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp,
+	int			error)
+{
+	if (*curpp) {
+		xfs_btree_del_cursor(*curpp, error);
+		*curpp = NULL;
+	}
+	if (*agi_bpp) {
+		xfs_trans_brelse(tp, *agi_bpp);
+		*agi_bpp = NULL;
+	}
+}
+
+/*
+ * Set ourselves up for walking inobt records starting from a given point in
+ * the filesystem.
+ *
+ * If caller passed in a nonzero start inode number, load the record from the
+ * inobt and make the record look like all the inodes before agino are free so
+ * that we skip them, and then move the cursor to the next inobt record.  This
+ * is how we support starting an iwalk in the middle of an inode chunk.
+ *
+ * If the caller passed in a start number of zero, move the cursor to the first
+ * inobt record.
+ *
+ * The caller is responsible for cleaning up the cursor and buffer pointer
+ * regardless of the error status.
+ */
+STATIC int
+xfs_iwalk_ag_start(
+	struct xfs_iwalk_ag	*iwag,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp,
+	int			*has_more)
+{
+	struct xfs_mount	*mp = iwag->mp;
+	struct xfs_trans	*tp = iwag->tp;
+	int			icount;
+	int			error;
+
+	/* Set up a fresh cursor and empty the inobt cache. */
+	iwag->nr_recs = 0;
+	error = xfs_inobt_cur(mp, tp, agno, curpp, agi_bpp);
+	if (error)
+		return error;
+
+	/* Starting at the beginning of the AG?  That's easy! */
+	if (agino == 0)
+		return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
+
+	/*
+	 * Otherwise, we have to grab the inobt record where we left off, stuff
+	 * the record into our cache, and then see if there are more records.
+	 * We require a lookup cache of at least two elements so that we don't
+	 * have to deal with tearing down the cursor to walk the records.
+	 */
+	error = xfs_bulkstat_grab_ichunk(*curpp, agino - 1, &icount,
+			&iwag->recs[iwag->nr_recs]);
+	if (error)
+		return error;
+	if (icount)
+		iwag->nr_recs++;
+
+	/*
+	 * set_prefetch is supposed to give us a large enough inobt record
+	 * cache that grab_ichunk can stage a partial first record and the loop
+	 * body can cache a record without having to check for cache space
+	 * until after it reads an inobt record.
+	 */
+	ASSERT(iwag->nr_recs < iwag->sz_recs);
+
+	return xfs_btree_increment(*curpp, 0, has_more);
+}
+
+typedef int (*xfs_iwalk_ag_recs_fn)(struct xfs_iwalk_ag *iwag);
+
+/*
+ * The inobt record cache is full, so preserve the inobt cursor state and
+ * run callbacks on the cached inobt records.  When we're done, restore the
+ * cursor state to wherever the cursor would have been had the cache not been
+ * full (and therefore we could've just incremented the cursor).
+ */
+STATIC int
+xfs_iwalk_run_callbacks(
+	struct xfs_iwalk_ag		*iwag,
+	xfs_iwalk_ag_recs_fn		walk_ag_recs_fn,
+	xfs_agnumber_t			agno,
+	struct xfs_btree_cur		**curpp,
+	struct xfs_buf			**agi_bpp,
+	int				*has_more)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	struct xfs_inobt_rec_incore	*irec;
+	xfs_agino_t			restart;
+	int				error;
+
+	ASSERT(iwag->nr_recs == iwag->sz_recs);
+
+	/* Delete cursor but remember the last record we cached... */
+	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+	irec = &iwag->recs[iwag->nr_recs - 1];
+	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+	error = walk_ag_recs_fn(iwag);
+	if (error)
+		return error;
+
+	/* ...empty the cache... */
+	iwag->nr_recs = 0;
+
+	/* ...and recreate the cursor just past where we left off. */
+	error = xfs_inobt_cur(mp, tp, agno, curpp, agi_bpp);
+	if (error)
+		return error;
+
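+	/*
+	 * @restart is the last inode of the last record we walked, so this
+	 * _GE lookup positions the new cursor at the first inobt record
+	 * after the ones that have already been cached and processed.
+	 */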
+	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+}
+
+/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
+STATIC int
+xfs_iwalk_ag(
+	struct xfs_iwalk_ag		*iwag)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	struct xfs_buf			*agi_bp = NULL;
+	struct xfs_btree_cur		*cur = NULL;
+	xfs_agnumber_t			agno;
+	xfs_agino_t			agino;
+	int				has_more;
+	int				error = 0;
+
+	/* Set up our cursor at the right place in the inode btree. */
+	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
+	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
+
+	while (!error && has_more) {
+		struct xfs_inobt_rec_incore	*irec;
+
+		cond_resched();
+
+		/* Fetch the inobt record. */
+		irec = &iwag->recs[iwag->nr_recs];
+		error = xfs_inobt_get_rec(cur, irec, &has_more);
+		if (error || !has_more)
+			break;
+
+		/* No allocated inodes in this chunk; skip it. */
+		if (irec->ir_freecount == irec->ir_count) {
+			error = xfs_btree_increment(cur, 0, &has_more);
+			if (error)
+				break;
+			continue;
+		}
+
+		/*
+		 * Start readahead for this inode chunk in anticipation of
+		 * walking the inodes.
+		 */
+		xfs_bulkstat_ichunk_ra(mp, agno, irec);
+
+		/*
+		 * If there's space in the buffer for more records, increment
+		 * the btree cursor and grab more.
+		 */
+		if (++iwag->nr_recs < iwag->sz_recs) {
+			error = xfs_btree_increment(cur, 0, &has_more);
+			if (error || !has_more)
+				break;
+			continue;
+		}
+
+		/*
+		 * Otherwise, we need to save cursor state and run the callback
+		 * function on the cached records.  The run_callbacks function
+		 * is supposed to return a cursor pointing to the record where
+		 * we would be if we had been able to increment like above.
+		 */
+		error = xfs_iwalk_run_callbacks(iwag, xfs_iwalk_ag_recs, agno,
+				&cur, &agi_bp, &has_more);
+	}
+
+	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+
+	/* Walk any records left behind in the cache. */
+	if (iwag->nr_recs == 0 || error)
+		return error;
+
+	return xfs_iwalk_ag_recs(iwag);
+}
+
+/*
+ * Given the number of inodes to prefetch, set the number of inobt records that
+ * we cache in memory, which controls the number of inodes we try to read
+ * ahead.
+ *
+ * If no max prefetch was given, default to 4096 bytes' worth of inobt records;
+ * this should be plenty of inodes to read ahead.  This number (256 inobt
+ * records) was chosen so that the cache is never more than a single memory
+ * page.
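+ *
+ * To spell out the sizing math (the 512 bytes per inode figure is a
+ * rough assumption about typical inode size, not a constant):
+ *
+ *   4096 bytes / 16 bytes per inobt record = 256 inobt records
+ *   256 inobt records * 64 inodes per record = 16384 inodes
+ *   16384 inodes * 512 bytes per inode = 8MB of inode readahead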
+ */
+static inline void
+xfs_iwalk_set_prefetch(
+	struct xfs_iwalk_ag	*iwag,
+	unsigned int		max_prefetch)
+{
+	if (max_prefetch)
+		iwag->sz_recs = round_up(max_prefetch, XFS_INODES_PER_CHUNK) /
+					XFS_INODES_PER_CHUNK;
+	else
+		iwag->sz_recs = 4096 / sizeof(struct xfs_inobt_rec_incore);
+
+	/*
+	 * Allocate enough space to prefetch at least two records so that we
+	 * can cache both the inobt record where the iwalk started and the next
+	 * record.  This simplifies the AG inode walk loop setup code.
+	 */
+	iwag->sz_recs = max_t(unsigned int, iwag->sz_recs, 2);
+}
+
+/*
+ * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
+ * will be called for each allocated inode, being passed the inode's number and
+ * @data.  @max_prefetch controls how many inobt records' worth of inodes we
+ * try to readahead.
+ */
+int
+xfs_iwalk(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		startino,
+	xfs_iwalk_fn		iwalk_fn,
+	unsigned int		max_prefetch,
+	void			*data)
+{
+	struct xfs_iwalk_ag	iwag = {
+		.mp		= mp,
+		.tp		= tp,
+		.iwalk_fn	= iwalk_fn,
+		.data		= data,
+		.startino	= startino,
+	};
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
+	int			error;
+
+	ASSERT(agno < mp->m_sb.sb_agcount);
+
+	xfs_iwalk_set_prefetch(&iwag, max_prefetch);
+	error = xfs_iwalk_alloc(&iwag);
+	if (error)
+		return error;
+
+	for (; agno < mp->m_sb.sb_agcount; agno++) {
+		error = xfs_iwalk_ag(&iwag);
+		if (error)
+			break;
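+		/* Ratchet the start point to the start of the next AG. */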
+		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+	}
+
+	xfs_iwalk_free(&iwag);
+	return error;
+}
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
new file mode 100644
index 000000000000..45b1baabcd2d
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.h
@@ -0,0 +1,18 @@ 
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_IWALK_H__
+#define __XFS_IWALK_H__
+
+/* Walk all inodes in the filesystem starting from @startino. */
+typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+			    xfs_ino_t ino, void *data);
+/* Return value (for xfs_iwalk_fn) that aborts the walk immediately. */
+#define XFS_IWALK_ABORT	(1)
+
+int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
+		xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data);
+
+#endif /* __XFS_IWALK_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2464ea351f83..f9bb1d50bc0e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3516,6 +3516,46 @@  DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
 DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
 DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
 
+TRACE_EVENT(xfs_iwalk_ag,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agino_t startino),
+	TP_ARGS(mp, agno, startino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, startino)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startino = startino;
+	),
+	TP_printk("dev %d:%d agno %d startino %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+		  __entry->startino)
+)
+
+TRACE_EVENT(xfs_iwalk_ag_rec,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_inobt_rec_incore *irec),
+	TP_ARGS(mp, agno, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, startino)
+		__field(uint64_t, freemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startino = irec->ir_startino;
+		__entry->freemask = irec->ir_free;
+	),
+	TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+		  __entry->startino, __entry->freemask)
+)
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
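
For reference, here is a minimal sketch of how a caller might use the
new iterator (a hypothetical example, not part of the patch; it relies
only on the xfs_iwalk() and xfs_iwalk_fn declarations from xfs_iwalk.h
above, and the example names are made up):

/* Hypothetical walk function: count every allocated inode we visit. */
static int
xfs_example_count_one(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	uint64_t		*count = data;

	(*count)++;
	return 0;	/* negative errno or XFS_IWALK_ABORT stops the walk */
}

/* Walk the whole filesystem from inode 0 with the default prefetch. */
static int
xfs_example_count_inodes(
	struct xfs_mount	*mp,
	uint64_t		*count)
{
	*count = 0;
	return xfs_iwalk(mp, NULL, 0, xfs_example_count_one, 0, count);
}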