
[v3,1/8] repair: turn bad inode list into array

Message ID 20210330142531.19809-2-hsiangkao@aol.com (mailing list archive)
State New, archived
Series repair: Phase 6 performance improvements

Commit Message

Gao Xiang March 30, 2021, 2:25 p.m. UTC
From: Gao Xiang <hsiangkao@redhat.com>

Just use an array and reallocate it one element at a time here (not
sure whether bulk allocation would be more effective.)

Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---
 repair/dir2.c | 34 +++++++++++++++++-----------------
 repair/dir2.h |  2 +-
 2 files changed, 18 insertions(+), 18 deletions(-)

Comments

Darrick J. Wong March 30, 2021, 6:46 p.m. UTC | #1
On Tue, Mar 30, 2021 at 10:25:24PM +0800, Gao Xiang wrote:
> From: Gao Xiang <hsiangkao@redhat.com>
> 
> Just use an array and reallocate it one element at a time here (not
> sure whether bulk allocation would be more effective.)

I'm not sure either.  The list of bad directories is likely to be
sparse, so perhaps the libfrog bitmap isn't going to beat a realloc
array for space efficiency... and reusing the slab array might be
overkill for an array-backed bitmap?

Eh, you know what?  I don't think the bad directory bitmap is hot enough
to justify broadening this into an algorithmic review of data
structures.  Incremental space efficiency is good enough for me.

> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
>  repair/dir2.c | 34 +++++++++++++++++-----------------
>  repair/dir2.h |  2 +-
>  2 files changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/repair/dir2.c b/repair/dir2.c
> index eabdb4f2d497..b6a8a5c40ae4 100644
> --- a/repair/dir2.c
> +++ b/repair/dir2.c
> @@ -20,40 +20,40 @@
>   * Known bad inode list.  These are seen when the leaf and node
>   * block linkages are incorrect.
>   */
> -typedef struct dir2_bad {
> -	xfs_ino_t	ino;
> -	struct dir2_bad	*next;
> -} dir2_bad_t;
> +struct dir2_bad {
> +	unsigned int	nr;

We could have more than 4 billion bad directories.

> +	xfs_ino_t	*itab;
> +};
>  
> -static dir2_bad_t *dir2_bad_list;
> +static struct dir2_bad	dir2_bad;
>  
>  static void
>  dir2_add_badlist(
>  	xfs_ino_t	ino)
>  {
> -	dir2_bad_t	*l;
> +	xfs_ino_t	*itab;
>  
> -	if ((l = malloc(sizeof(dir2_bad_t))) == NULL) {
> +	itab = realloc(dir2_bad.itab, (dir2_bad.nr + 1) * sizeof(xfs_ino_t));

Minor quibble: you could improve the efficiency of this by tracking both
the array size and fill count so that you only have to expand the array
by powers of two at a time.  The glibc heap is faster than it once was,
but you could help it along by only asking for memory in MMAP_THRESHOLD
chunks.  Or possibly even pagesize chunks.
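
As a rough sketch of that doubling approach (not taken from the patch; the
"slots" capacity field and the bare abort() on allocation failure are
illustrative placeholders for what xfs_repair actually does):

/*
 * Sketch: grow the bad-inode array by powers of two so realloc()
 * runs O(log n) times instead of once per insertion.
 */
#include <stdint.h>
#include <stdlib.h>

typedef uint64_t xfs_ino_t;	/* stand-in for the libxfs typedef */

static struct {
	unsigned long	nr;	/* entries in use */
	unsigned long	slots;	/* allocated capacity */
	xfs_ino_t	*itab;
} dir2_bad;

static void
dir2_add_badlist(
	xfs_ino_t	ino)
{
	if (dir2_bad.nr == dir2_bad.slots) {
		unsigned long	slots;
		xfs_ino_t	*itab;

		slots = dir2_bad.slots ? dir2_bad.slots * 2 : 64;
		itab = realloc(dir2_bad.itab, slots * sizeof(xfs_ino_t));
		if (!itab)
			abort();	/* xfs_repair would call do_error() */
		dir2_bad.itab = itab;
		dir2_bad.slots = slots;
	}
	dir2_bad.itab[dir2_bad.nr++] = ino;
}

Growing in MMAP_THRESHOLD-sized or pagesize chunks would only change the
growth rule above, not the insertion path.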

--D

> +	if (!itab) {
>  		do_error(
>  _("malloc failed (%zu bytes) dir2_add_badlist:ino %" PRIu64 "\n"),
> -			sizeof(dir2_bad_t), ino);
> +			sizeof(xfs_ino_t), ino);
>  		exit(1);
>  	}
> -	l->next = dir2_bad_list;
> -	dir2_bad_list = l;
> -	l->ino = ino;
> +	itab[dir2_bad.nr++] = ino;
> +	dir2_bad.itab = itab;
>  }
>  
> -int
> +bool
>  dir2_is_badino(
>  	xfs_ino_t	ino)
>  {
> -	dir2_bad_t	*l;
> +	unsigned int i;
>  
> -	for (l = dir2_bad_list; l; l = l->next)
> -		if (l->ino == ino)
> -			return 1;
> -	return 0;
> +	for (i = 0; i < dir2_bad.nr; ++i)
> +		if (dir2_bad.itab[i] == ino)
> +			return true;
> +	return false;
>  }
>  
>  /*
> diff --git a/repair/dir2.h b/repair/dir2.h
> index 5795aac5eaab..af4cfb1da329 100644
> --- a/repair/dir2.h
> +++ b/repair/dir2.h
> @@ -27,7 +27,7 @@ process_sf_dir2_fixi8(
>  	struct xfs_dir2_sf_hdr	*sfp,
>  	xfs_dir2_sf_entry_t	**next_sfep);
>  
> -int
> +bool
>  dir2_is_badino(
>  	xfs_ino_t	ino);
>  
> -- 
> 2.20.1
>
Dave Chinner March 30, 2021, 10:18 p.m. UTC | #2
On Tue, Mar 30, 2021 at 10:25:24PM +0800, Gao Xiang wrote:
> From: Gao Xiang <hsiangkao@redhat.com>
> 
> Just use an array and reallocate it one element at a time here (not
> sure whether bulk allocation would be more effective.)

Did you profile repairing a filesystem with lots of broken
directories? Optimisations like this really need to be profile
guided and the impact documented. That way reviewers can actually
see the benefit the change brings to the table....

> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
>  repair/dir2.c | 34 +++++++++++++++++-----------------
>  repair/dir2.h |  2 +-
>  2 files changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/repair/dir2.c b/repair/dir2.c
> index eabdb4f2d497..b6a8a5c40ae4 100644
> --- a/repair/dir2.c
> +++ b/repair/dir2.c
> @@ -20,40 +20,40 @@
>   * Known bad inode list.  These are seen when the leaf and node
>   * block linkages are incorrect.
>   */
> -typedef struct dir2_bad {
> -	xfs_ino_t	ino;
> -	struct dir2_bad	*next;
> -} dir2_bad_t;
> +struct dir2_bad {
> +	unsigned int	nr;
> +	xfs_ino_t	*itab;
> +};
>  
> -static dir2_bad_t *dir2_bad_list;
> +static struct dir2_bad	dir2_bad;
>  
>  static void
>  dir2_add_badlist(
>  	xfs_ino_t	ino)
>  {
> -	dir2_bad_t	*l;
> +	xfs_ino_t	*itab;
>  
> -	if ((l = malloc(sizeof(dir2_bad_t))) == NULL) {
> +	itab = realloc(dir2_bad.itab, (dir2_bad.nr + 1) * sizeof(xfs_ino_t));
> +	if (!itab) {
>  		do_error(
>  _("malloc failed (%zu bytes) dir2_add_badlist:ino %" PRIu64 "\n"),
> -			sizeof(dir2_bad_t), ino);
> +			sizeof(xfs_ino_t), ino);
>  		exit(1);
>  	}
> -	l->next = dir2_bad_list;
> -	dir2_bad_list = l;
> -	l->ino = ino;
> +	itab[dir2_bad.nr++] = ino;
> +	dir2_bad.itab = itab;
>  }
>  
> -int
> +bool
>  dir2_is_badino(
>  	xfs_ino_t	ino)
>  {
> -	dir2_bad_t	*l;
> +	unsigned int i;
>  
> -	for (l = dir2_bad_list; l; l = l->next)
> -		if (l->ino == ino)
> -			return 1;
> -	return 0;
> +	for (i = 0; i < dir2_bad.nr; ++i)
> +		if (dir2_bad.itab[i] == ino)
> +			return true;
> +	return false;

This ignores the problem with this code: it is an O(n * N) search
that gets done under an exclusive lock. Changing this to an array
doesn't improve the efficiency of the algorithm at all. It might
slightly reduce the magnitude of N, but modern CPU prefetchers detect
linked list walks like this, so they are almost as fast as sequential
array walks. Hence this change will gain us relatively little when we
have millions of bad inodes to search.

IOWs, the scalability problem that needs to be solved here is not
"replace a linked list", it is "replace an O(n * N) search
algorithm". We should address the algorithmic problem, not the code
implementation issue.

That means we need to replace the linear search with a different
algorithm, not rework the data structure used to do a linear search.
We want this search to be reduced to O(n * log N) or O(n). We really
don't care about memory usage or even the overhead of per-object
memory allocation - we already do that and it isn't a performance
limitation, so optimising for memory allocation reductions is
optimising the wrong thing.

Replacing the linked list with an AVL tree or radix tree will make
the search O(log N), giving us the desired reduction in search
overhead to O(n * log N) and, more importantly, a significant
reduction in lock hold times.
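
To make the shape of that concrete, a rough sketch of an O(log N)
membership test; this uses a sorted array with bsearch(3) as a stand-in
for the AVL or radix tree, and the names only mirror the patch for
illustration. Insertion here is still O(N) because of the element
shuffle, which a real tree would avoid:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint64_t xfs_ino_t;

static struct {
	unsigned long	nr;
	xfs_ino_t	*itab;
} dir2_bad;

static int
badino_cmp(const void *a, const void *b)
{
	xfs_ino_t	ia = *(const xfs_ino_t *)a;
	xfs_ino_t	ib = *(const xfs_ino_t *)b;

	return ia < ib ? -1 : ia > ib;
}

/* O(log N) lookup instead of walking every entry under the lock. */
static bool
dir2_is_badino(
	xfs_ino_t	ino)
{
	return bsearch(&ino, dir2_bad.itab, dir2_bad.nr,
			sizeof(xfs_ino_t), badino_cmp) != NULL;
}

/* Insert keeping the array sorted; a tree would avoid the O(N) shuffle. */
static void
dir2_add_badlist(
	xfs_ino_t	ino)
{
	unsigned long	i;
	xfs_ino_t	*itab;

	itab = realloc(dir2_bad.itab, (dir2_bad.nr + 1) * sizeof(xfs_ino_t));
	if (!itab)
		abort();	/* placeholder for do_error() */
	dir2_bad.itab = itab;

	for (i = dir2_bad.nr; i > 0 && itab[i - 1] > ino; i--)
		itab[i] = itab[i - 1];
	itab[i] = ino;
	dir2_bad.nr++;
}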

Cheers,

Dave.
Gao Xiang March 30, 2021, 11:02 p.m. UTC | #3
On Wed, Mar 31, 2021 at 09:18:58AM +1100, Dave Chinner wrote:
> On Tue, Mar 30, 2021 at 10:25:24PM +0800, Gao Xiang wrote:
> > From: Gao Xiang <hsiangkao@redhat.com>
> > 
> > Just use an array and reallocate it one element at a time here (not
> > sure whether bulk allocation would be more effective.)
> 
> Did you profile repairing a filesystem with lots of broken
> directories? Optimisations like this really need to be profile
> guided and the impact documented. That way reviewers can actually
> see the benefit the change brings to the table....

Nope, I will do that then (since I'm not confident about the target
performance, tbh.)

> 
> > Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> > ---
> >  repair/dir2.c | 34 +++++++++++++++++-----------------
> >  repair/dir2.h |  2 +-
> >  2 files changed, 18 insertions(+), 18 deletions(-)
> > 
> > diff --git a/repair/dir2.c b/repair/dir2.c
> > index eabdb4f2d497..b6a8a5c40ae4 100644
> > --- a/repair/dir2.c
> > +++ b/repair/dir2.c
> > @@ -20,40 +20,40 @@
> >   * Known bad inode list.  These are seen when the leaf and node
> >   * block linkages are incorrect.
> >   */
> > -typedef struct dir2_bad {
> > -	xfs_ino_t	ino;
> > -	struct dir2_bad	*next;
> > -} dir2_bad_t;
> > +struct dir2_bad {
> > +	unsigned int	nr;
> > +	xfs_ino_t	*itab;
> > +};
> >  
> > -static dir2_bad_t *dir2_bad_list;
> > +static struct dir2_bad	dir2_bad;
> >  
> >  static void
> >  dir2_add_badlist(
> >  	xfs_ino_t	ino)
> >  {
> > -	dir2_bad_t	*l;
> > +	xfs_ino_t	*itab;
> >  
> > -	if ((l = malloc(sizeof(dir2_bad_t))) == NULL) {
> > +	itab = realloc(dir2_bad.itab, (dir2_bad.nr + 1) * sizeof(xfs_ino_t));
> > +	if (!itab) {
> >  		do_error(
> >  _("malloc failed (%zu bytes) dir2_add_badlist:ino %" PRIu64 "\n"),
> > -			sizeof(dir2_bad_t), ino);
> > +			sizeof(xfs_ino_t), ino);
> >  		exit(1);
> >  	}
> > -	l->next = dir2_bad_list;
> > -	dir2_bad_list = l;
> > -	l->ino = ino;
> > +	itab[dir2_bad.nr++] = ino;
> > +	dir2_bad.itab = itab;
> >  }
> >  
> > -int
> > +bool
> >  dir2_is_badino(
> >  	xfs_ino_t	ino)
> >  {
> > -	dir2_bad_t	*l;
> > +	unsigned int i;
> >  
> > -	for (l = dir2_bad_list; l; l = l->next)
> > -		if (l->ino == ino)
> > -			return 1;
> > -	return 0;
> > +	for (i = 0; i < dir2_bad.nr; ++i)
> > +		if (dir2_bad.itab[i] == ino)
> > +			return true;
> > +	return false;
> 
> This ignores the problem with this code: it is an O(n * N) search
> that gets done under an exclusive lock. Changing this to an array
> doesn't improve the efficiency of the algorithm at all. It might
> slightly reduce the magnitude of N, but modern CPU prefetchers detect
> linked list walks like this, so they are almost as fast as sequential
> array walks. Hence this change will gain us relatively little when we
> have millions of bad inodes to search.
> 
> IOWs, the scalability problem that needs to be solved here is not
> "replace a linked list", it is "replace an O(n * N) search
> algorithm". We should address the algorithmic problem, not the code
> implementation issue.
> 
> That means we need to replace the linear search with a different
> algorithm, not rework the data structure used to do a linear search.
> We want this search to be reduced to O(n * log N) or O(n). We really
> don't care about memory usage or even the overhead of per-object
> memory allocation - we already do that and it isn't a performance
> limitation, so optimising for memory allocation reductions is
> optimising the wrong thing.
> 
> Replacing the linked list with an AVL tree or radix tree will make
> the search O(log N), giving us the desired reduction in search
> overhead to O(n * log N) and, more importantly, a significant
> reduction in lock hold times.

Obviously, I agree with your idea. A radix tree indexed by inode number
may waste space. I didn't notice that the repair code already had some
AVL infrastructure; let me try to use an AVL tree instead.

But anyway, I'm not sure whether this is a critical problem, or whether
users really run into a bad-inode bomb in practice. I used an array here
just because of a previous review suggestion.

Thanks,
Gao Xiang

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Gao Xiang March 30, 2021, 11:05 p.m. UTC | #4
Hi Darrick,

On Tue, Mar 30, 2021 at 11:46:08AM -0700, Darrick J. Wong wrote:
> On Tue, Mar 30, 2021 at 10:25:24PM +0800, Gao Xiang wrote:
> > From: Gao Xiang <hsiangkao@redhat.com>
> > 
> > Just use an array and reallocate it one element at a time here (not
> > sure whether bulk allocation would be more effective.)
> 
> I'm not sure either.  The list of bad directories is likely to be
> sparse, so perhaps the libfrog bitmap isn't going to beat a realloc
> array for space efficiency... and reusing the slab array might be
> overkill for an array-backed bitmap?
> 
> Eh, you know what?  I don't think the bad directory bitmap is hot enough
> to justify broadening this into an algorithmic review of data
> structures.  Incremental space efficiency is good enough for me.

I'm not sure it's a good choice, tbh; let me try to use a balanced
binary tree instead, as Dave suggested.

Thanks,
Gao Xiang

Patch

diff --git a/repair/dir2.c b/repair/dir2.c
index eabdb4f2d497..b6a8a5c40ae4 100644
--- a/repair/dir2.c
+++ b/repair/dir2.c
@@ -20,40 +20,40 @@ 
  * Known bad inode list.  These are seen when the leaf and node
  * block linkages are incorrect.
  */
-typedef struct dir2_bad {
-	xfs_ino_t	ino;
-	struct dir2_bad	*next;
-} dir2_bad_t;
+struct dir2_bad {
+	unsigned int	nr;
+	xfs_ino_t	*itab;
+};
 
-static dir2_bad_t *dir2_bad_list;
+static struct dir2_bad	dir2_bad;
 
 static void
 dir2_add_badlist(
 	xfs_ino_t	ino)
 {
-	dir2_bad_t	*l;
+	xfs_ino_t	*itab;
 
-	if ((l = malloc(sizeof(dir2_bad_t))) == NULL) {
+	itab = realloc(dir2_bad.itab, (dir2_bad.nr + 1) * sizeof(xfs_ino_t));
+	if (!itab) {
 		do_error(
 _("malloc failed (%zu bytes) dir2_add_badlist:ino %" PRIu64 "\n"),
-			sizeof(dir2_bad_t), ino);
+			sizeof(xfs_ino_t), ino);
 		exit(1);
 	}
-	l->next = dir2_bad_list;
-	dir2_bad_list = l;
-	l->ino = ino;
+	itab[dir2_bad.nr++] = ino;
+	dir2_bad.itab = itab;
 }
 
-int
+bool
 dir2_is_badino(
 	xfs_ino_t	ino)
 {
-	dir2_bad_t	*l;
+	unsigned int i;
 
-	for (l = dir2_bad_list; l; l = l->next)
-		if (l->ino == ino)
-			return 1;
-	return 0;
+	for (i = 0; i < dir2_bad.nr; ++i)
+		if (dir2_bad.itab[i] == ino)
+			return true;
+	return false;
 }
 
 /*
diff --git a/repair/dir2.h b/repair/dir2.h
index 5795aac5eaab..af4cfb1da329 100644
--- a/repair/dir2.h
+++ b/repair/dir2.h
@@ -27,7 +27,7 @@  process_sf_dir2_fixi8(
 	struct xfs_dir2_sf_hdr	*sfp,
 	xfs_dir2_sf_entry_t	**next_sfep);
 
-int
+bool
 dir2_is_badino(
 	xfs_ino_t	ino);