diff mbox series

[4/5] xfs: speed up directory bestfree block scanning

Message ID 20181024225716.19459-5-david@fromorbit.com (mailing list archive)
State Superseded
Headers show
Series xfs: speed up large directory modifications | expand

Commit Message

Dave Chinner Oct. 24, 2018, 10:57 p.m. UTC
From: Dave Chinner <dchinner@redhat.com>

When running a "create millions inodes in a directory" test
recently, I noticed we were spending a huge amount of time
converting freespace block headers from disk format to in-memory
format:

 31.47%  [kernel]  [k] xfs_dir2_node_addname
 17.86%  [kernel]  [k] xfs_dir3_free_hdr_from_disk
  3.55%  [kernel]  [k] xfs_dir3_free_bests_p

We shouldn't be hitting the best free block scanning code so hard
when doing sequential directory creates, and it turns out there's
a highly suboptimal loop searching the the best free array in
the freespace block - it decodes the block header before checking
each entry inside a loop, instead of decoding the header once before
running the entry search loop.

This makes a massive difference to create rates. Profile now looks
like this:

  13.15%  [kernel]  [k] xfs_dir2_node_addname
   3.52%  [kernel]  [k] xfs_dir3_leaf_check_int
   3.11%  [kernel]  [k] xfs_log_commit_cil

And the wall time/average file create rate differences are
just as stark:

		create time(sec) / rate (files/s)
File count	     vanilla		    patched
  10k		   0.54 / 18.5k		   0.53 / 18.9k
  20k		   1.10	/ 18.1k		   1.05 / 19.0k
 100k		   4.21	/ 23.8k		   3.91 / 25.6k
 200k		   9.66	/ 20,7k		   7.37 / 27.1k
   1M		  86.61	/ 11.5k		  48.26 / 20.7k
   2M		 206.13	/  9.7k		 129.71 / 15.4k

The larger the directory, the bigger the performance improvement.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/libxfs/xfs_dir2_node.c | 83 ++++++++++++++---------------------
 1 file changed, 32 insertions(+), 51 deletions(-)

Comments

Christoph Hellwig Oct. 26, 2018, 10:24 a.m. UTC | #1
> @@ -1767,11 +1767,10 @@ xfs_dir2_node_find_freeblk(
>  		fbp = fblk->bp;
>  		free = fbp->b_addr;
>  		findex = fblk->index;
> +		bests = dp->d_ops->free_bests_p(free);
> +		dp->d_ops->free_hdr_from_disk(&freehdr, free);
>  		if (findex >= 0) {
>  			/* caller already found the freespace for us. */
> -			bests = dp->d_ops->free_bests_p(free);
> -			dp->d_ops->free_hdr_from_disk(&freehdr, free);
> -

This hunk just undoes a move in the previous patch, might be better
idea to just keep it as before there.

> +	for ( ; fbno < lastfbno; fbno++) {
> +		/* If we don't have a freeblock in hand, get the next one. */
>  		if (fbp == NULL) {
> +			/* If it's ifbno we already looked at it. */
> +			if (fbno == ifbno)
> +				continue;
> +

The only case where we have a fbp here is if we had a fblk passed in,
but it it did have the index set to -1.  But as far as I can tell
searching that again doesn't make any sense at all, so I'd apply
something like this in top of your patch (some of this also seems
to be in your next patch, so independent of the logic change might
be worth moving over here):

diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 6a6572c5602e..d1fb0ff38584 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -1752,10 +1752,7 @@ xfs_dir2_node_find_freeblk(
 	struct xfs_buf		*fbp = NULL;
 	int			findex;
 	xfs_dir2_db_t		lastfbno;
-	xfs_dir2_db_t		ifbno = -1;
-	xfs_dir2_db_t		dbno = -1;
 	xfs_dir2_db_t		fbno = -1;
-	xfs_dir2_free_t		*free = NULL;
 	struct xfs_dir3_icfree_hdr freehdr;
 	__be16			*bests = NULL;
 	xfs_fileoff_t		fo;
@@ -1768,28 +1765,29 @@ xfs_dir2_node_find_freeblk(
 	 */
 	if (fblk) {
 		fbp = fblk->bp;
-		free = fbp->b_addr;
 		findex = fblk->index;
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
+		bests = dp->d_ops->free_bests_p(fbp->b_addr);
+		dp->d_ops->free_hdr_from_disk(&freehdr, fbp->b_addr);
 		if (findex >= 0) {
 			/* caller already found the freespace for us. */
 			ASSERT(findex < freehdr.nvalid);
 			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
 			ASSERT(be16_to_cpu(bests[findex]) >= length);
-			dbno = freehdr.firstdb + findex;
-			goto out;
+			goto found;
 		}
 
 		/*
 		 * The data block looked at didn't have enough room.
-		 * We'll start at the beginning of the freespace entries.
+		 * We'll start searching after this entry.
 		 */
-		ifbno = fblk->blkno;
-		fbno = ifbno;
+		fbno = fblk->blkno + 1;
+
+		xfs_trans_brelse(tp, fbp);
+		fblk->bp = NULL;
+	} else {
+		/* If we haven't got a search start block, set it now */
+		fbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
 	}
-	ASSERT(dbno == -1);
-	findex = 0;
 
 	/*
 	 * If we don't have a data block yet, we're going to scan the freespace
@@ -1801,64 +1799,50 @@ xfs_dir2_node_find_freeblk(
 		return error;
 	lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
 
-	/* If we haven't get a search start block, set it now */
-	if (fbno == -1)
-		fbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
-
 	/*
-	 * While we haven't identified a data block, search the freeblock
-	 * data for a data block with enough free space in it.
+	 * While we haven't identified a data block, search the freeblock data
+	 * for a data block with enough free space in it.
 	 */
 	for ( ; fbno < lastfbno; fbno++) {
-		/* If we don't have a freeblock in hand, get the next one. */
-		if (fbp == NULL) {
-			/* If it's ifbno we already looked at it. */
-			if (fbno == ifbno)
-				continue;
-
-			/*
-			 * Read the block.  There can be holes in the freespace
-			 * blocks, so this might not succeed.  This should be
-			 * really rare, so there's no reason to avoid it.
-			 */
-			error = xfs_dir2_free_try_read(tp, dp,
-					xfs_dir2_db_to_da(args->geo, fbno),
-					&fbp);
-			if (error)
-				return error;
-			if (!fbp)
-				continue;
+		/*
+		 * Read the block.  There can be holes in the freespace blocks,
+		 * so this might not succeed.  This should be really rare, so
+		 * there's no reason to avoid it.
+		 */
+		error = xfs_dir2_free_try_read(tp, dp,
+				xfs_dir2_db_to_da(args->geo, fbno), &fbp);
+		if (error)
+			return error;
+		if (!fbp)
+			continue;
 
-			findex = 0;
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		}
+		bests = dp->d_ops->free_bests_p(fbp->b_addr);
+		dp->d_ops->free_hdr_from_disk(&freehdr, fbp->b_addr);
 
 		/* Scan the free entry array for a large enough free space. */
-		do {
-			if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
-			    be16_to_cpu(bests[findex]) >= length) {
-				dbno = freehdr.firstdb + findex;
-				goto out;
-			}
-		} while (++findex < freehdr.nvalid);
+		for (findex = 0; findex < freehdr.nvalid; findex++) {
+			u16		best = be16_to_cpu(bests[findex]);
+
+			if (best != NULLDATAOFF && best >= length)
+				goto found;
+		}
 
 		/* Didn't find free space, go on to next free block */
 		xfs_trans_brelse(tp, fbp);
-		fbp = NULL;
-		if (fblk)
-			fblk->bp = NULL;
 	}
 
-out:
-	*dbnop = dbno;
+	*dbnop = -1;
+	*fbpp = NULL;
+	*findexp = 0;
+	return 0;
+
+found:
+	*dbnop = freehdr.firstdb + findex;
 	*fbpp = fbp;
 	*findexp = findex;
 	return 0;
 }
 
-
 /*
  * Add the data entry for a node-format directory name addition.
  * The leaf entry is added in xfs_dir2_leafn_add.
Dave Chinner Oct. 26, 2018, 10:58 a.m. UTC | #2
On Fri, Oct 26, 2018 at 03:24:24AM -0700, Christoph Hellwig wrote:
> > @@ -1767,11 +1767,10 @@ xfs_dir2_node_find_freeblk(
> >  		fbp = fblk->bp;
> >  		free = fbp->b_addr;
> >  		findex = fblk->index;
> > +		bests = dp->d_ops->free_bests_p(free);
> > +		dp->d_ops->free_hdr_from_disk(&freehdr, free);
> >  		if (findex >= 0) {
> >  			/* caller already found the freespace for us. */
> > -			bests = dp->d_ops->free_bests_p(free);
> > -			dp->d_ops->free_hdr_from_disk(&freehdr, free);
> > -
> 
> This hunk just undoes a move in the previous patch, might be better
> idea to just keep it as before there.
> 
> > +	for ( ; fbno < lastfbno; fbno++) {
> > +		/* If we don't have a freeblock in hand, get the next one. */
> >  		if (fbp == NULL) {
> > +			/* If it's ifbno we already looked at it. */
> > +			if (fbno == ifbno)
> > +				continue;
> > +
> 
> The only case where we have a fbp here is if we had a fblk passed in,
> but it it did have the index set to -1.  But as far as I can tell
> searching that again doesn't make any sense at all, so I'd apply
> something like this in top of your patch (some of this also seems
> to be in your next patch, so independent of the logic change might
> be worth moving over here):

So you've done a bunch of the rework that already in the next patch
in the series, plus a "fbno = fblk->blkno + 1;" logic change. 

Perhaps put this logic change at end of the series, rather than in
the middle where it breaks the rest of the changes?

Cheers,

Dave.
Christoph Hellwig Oct. 26, 2018, 11:56 a.m. UTC | #3
On Fri, Oct 26, 2018 at 09:58:42PM +1100, Dave Chinner wrote:
> > The only case where we have a fbp here is if we had a fblk passed in,
> > but it it did have the index set to -1.  But as far as I can tell
> > searching that again doesn't make any sense at all, so I'd apply
> > something like this in top of your patch (some of this also seems
> > to be in your next patch, so independent of the logic change might
> > be worth moving over here):
> 
> So you've done a bunch of the rework that already in the next patch
> in the series, plus a "fbno = fblk->blkno + 1;" logic change. 

I don't think this is a new logic change, as we start at fbno
already (both in the existing code and with your patch), but we got
here because that block did not contain a suitable free space.

That being said with the reverse search in the next patch the + 1
is pointless as that code changes again.  But many of the other changes
in this patch or your next patch (which I hadn't looked at yet when
I did this) should probably be in this one, otherwise we just create
churn.
Christoph Hellwig Oct. 26, 2018, 12:12 p.m. UTC | #4
On Fri, Oct 26, 2018 at 04:56:23AM -0700, Christoph Hellwig wrote:
> I don't think this is a new logic change, as we start at fbno
> already (both in the existing code and with your patch), but we got
> here because that block did not contain a suitable free space.
> 
> That being said with the reverse search in the next patch the + 1
> is pointless as that code changes again.  But many of the other changes
> in this patch or your next patch (which I hadn't looked at yet when
> I did this) should probably be in this one, otherwise we just create
> churn.

Or in other words, we could apply something like this (entirely
based on your next patch) here:

diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 6a6572c5602e..919d6597fb19 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -1750,7 +1750,7 @@ xfs_dir2_node_find_freeblk(
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_trans	*tp = args->trans;
 	struct xfs_buf		*fbp = NULL;
-	int			findex;
+	int			findex = 0;
 	xfs_dir2_db_t		lastfbno;
 	xfs_dir2_db_t		ifbno = -1;
 	xfs_dir2_db_t		dbno = -1;
@@ -1778,7 +1778,7 @@ xfs_dir2_node_find_freeblk(
 			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
 			ASSERT(be16_to_cpu(bests[findex]) >= length);
 			dbno = freehdr.firstdb + findex;
-			goto out;
+			goto found_block;
 		}
 
 		/*
@@ -1787,9 +1787,9 @@ xfs_dir2_node_find_freeblk(
 		 */
 		ifbno = fblk->blkno;
 		fbno = ifbno;
+		xfs_trans_brelse(tp, fbp);
+		fblk->bp = NULL;
 	}
-	ASSERT(dbno == -1);
-	findex = 0;
 
 	/*
 	 * If we don't have a data block yet, we're going to scan the freespace
@@ -1810,48 +1810,42 @@ xfs_dir2_node_find_freeblk(
 	 * data for a data block with enough free space in it.
 	 */
 	for ( ; fbno < lastfbno; fbno++) {
-		/* If we don't have a freeblock in hand, get the next one. */
-		if (fbp == NULL) {
-			/* If it's ifbno we already looked at it. */
-			if (fbno == ifbno)
-				continue;
+		/* If it's ifbno we already looked at it. */
+		if (fbno == ifbno)
+			continue;
 
-			/*
-			 * Read the block.  There can be holes in the freespace
-			 * blocks, so this might not succeed.  This should be
-			 * really rare, so there's no reason to avoid it.
-			 */
-			error = xfs_dir2_free_try_read(tp, dp,
-					xfs_dir2_db_to_da(args->geo, fbno),
-					&fbp);
-			if (error)
-				return error;
-			if (!fbp)
-				continue;
+		/*
+		 * Read the block.  There can be holes in the freespace
+		 * blocks, so this might not succeed.  This should be
+		 * really rare, so there's no reason to avoid it.
+		 */
+		error = xfs_dir2_free_try_read(tp, dp,
+				xfs_dir2_db_to_da(args->geo, fbno),
+				&fbp);
+		if (error)
+			return error;
+		if (!fbp)
+			continue;
 
-			findex = 0;
-			free = fbp->b_addr;
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		}
+		findex = 0;
+		free = fbp->b_addr;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
 
 		/* Scan the free entry array for a large enough free space. */
-		do {
+		for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
 			if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
 			    be16_to_cpu(bests[findex]) >= length) {
 				dbno = freehdr.firstdb + findex;
-				goto out;
+				goto found_block;
 			}
-		} while (++findex < freehdr.nvalid);
+		}
 
 		/* Didn't find free space, go on to next free block */
 		xfs_trans_brelse(tp, fbp);
-		fbp = NULL;
-		if (fblk)
-			fblk->bp = NULL;
 	}
 
-out:
+found_block:
 	*dbnop = dbno;
 	*fbpp = fbp;
 	*findexp = findex;
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 1ecf41e380fb..621f63de075c 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -1754,7 +1754,7 @@  xfs_dir2_node_find_freeblk(
 	xfs_dir2_db_t		fbno = -1;
 	xfs_dir2_free_t		*free = NULL;
 	struct xfs_dir3_icfree_hdr freehdr;
-	__be16			*bests;
+	__be16			*bests = NULL;
 	xfs_fileoff_t		fo;
 	int			error;
 
@@ -1767,11 +1767,10 @@  xfs_dir2_node_find_freeblk(
 		fbp = fblk->bp;
 		free = fbp->b_addr;
 		findex = fblk->index;
+		bests = dp->d_ops->free_bests_p(free);
+		dp->d_ops->free_hdr_from_disk(&freehdr, free);
 		if (findex >= 0) {
 			/* caller already found the freespace for us. */
-			bests = dp->d_ops->free_bests_p(free);
-			dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
 			ASSERT(findex < freehdr.nvalid);
 			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
 			ASSERT(be16_to_cpu(bests[findex]) >= length);
@@ -1805,29 +1804,19 @@  xfs_dir2_node_find_freeblk(
 
 	/*
 	 * While we haven't identified a data block, search the freeblock
-	 * data for a good data block.  If we find a null freeblock entry,
-	 * indicating a hole in the data blocks, remember that.
+	 * data for a data block with enough free space in it.
 	 */
-	while (dbno == -1) {
-		/*
-		 * If we don't have a freeblock in hand, get the next one.
-		 */
+	for ( ; fbno < lastfbno; fbno++) {
+		/* If we don't have a freeblock in hand, get the next one. */
 		if (fbp == NULL) {
+			/* If it's ifbno we already looked at it. */
+			if (fbno == ifbno)
+				continue;
+
 			/*
-			 * If it's ifbno we already looked at it.
-			 */
-			if (++fbno == ifbno)
-				fbno++;
-			/*
-			 * If it's off the end we're done.
-			 */
-			if (fbno >= lastfbno)
-				break;
-			/*
-			 * Read the block.  There can be holes in the
-			 * freespace blocks, so this might not succeed.
-			 * This should be really rare, so there's no reason
-			 * to avoid it.
+			 * Read the block.  There can be holes in the freespace
+			 * blocks, so this might not succeed.  This should be
+			 * really rare, so there's no reason to avoid it.
 			 */
 			error = xfs_dir2_free_try_read(tp, dp,
 					xfs_dir2_db_to_da(args->geo, fbno),
@@ -1836,37 +1825,29 @@  xfs_dir2_node_find_freeblk(
 				return error;
 			if (!fbp)
 				continue;
-			free = fbp->b_addr;
+
 			findex = 0;
+			free = fbp->b_addr;
+			bests = dp->d_ops->free_bests_p(free);
+			dp->d_ops->free_hdr_from_disk(&freehdr, free);
 		}
-		/*
-		 * Look at the current free entry.  Is it good enough?
-		 *
-		 * The bests initialisation should be where the bufer is read in
-		 * the above branch. But gcc is too stupid to realise that bests
-		 * and the freehdr are actually initialised if they are placed
-		 * there, so we have to do it here to avoid warnings. Blech.
-		 */
-		bests = dp->d_ops->free_bests_p(free);
-		dp->d_ops->free_hdr_from_disk(&freehdr, free);
-		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
-		    be16_to_cpu(bests[findex]) >= length)
-			dbno = freehdr.firstdb + findex;
-		else {
-			/*
-			 * Are we done with the freeblock?
-			 */
-			if (++findex == freehdr.nvalid) {
-				/*
-				 * Drop the block.
-				 */
-				xfs_trans_brelse(tp, fbp);
-				fbp = NULL;
-				if (fblk && fblk->bp)
-					fblk->bp = NULL;
+
+		/* Scan the free entry array for a large enough free space. */
+		do {
+			if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+			    be16_to_cpu(bests[findex]) >= length) {
+				dbno = freehdr.firstdb + findex;
+				goto out;
 			}
-		}
+		} while (++findex < freehdr.nvalid);
+
+		/* Didn't find free space, go on to next free block */
+		xfs_trans_brelse(tp, fbp);
+		fbp = NULL;
+		if (fblk)
+			fblk->bp = NULL;
 	}
+
 out:
 	*dbnop = dbno;
 	*fbpp = fbp;