diff mbox series

[2/2] btrfs: add size class stats to sysfs

Message ID 3e95d7d8a42fa8969f415fc03ad999de3d29a196.1674679476.git.boris@bur.io (mailing list archive)
State New, archived
Headers show
Series btrfs: block group size class load fixes | expand

Commit Message

Boris Burkov Jan. 25, 2023, 8:50 p.m. UTC
Make it possible to see the distribution of size classes for block
groups. Helpful for testing and debugging the allocator w.r.t. to size
classes.

Signed-off-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

Comments

David Sterba Jan. 27, 2023, 1:23 p.m. UTC | #1
On Wed, Jan 25, 2023 at 12:50:33PM -0800, Boris Burkov wrote:
> Make it possible to see the distribution of size classes for block
> groups. Helpful for testing and debugging the allocator w.r.t. to size
> classes.

Please note the sysfs file path.

> Signed-off-by: Boris Burkov <boris@bur.io>
> ---
>  fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 39 insertions(+)
> 
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index 108aa3876186..e1ae4d2323d6 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -9,6 +9,7 @@
>  #include <linux/spinlock.h>
>  #include <linux/completion.h>
>  #include <linux/bug.h>
> +#include <linux/list.h>
>  #include <crypto/hash.h>
>  #include "messages.h"
>  #include "ctree.h"
> @@ -778,6 +779,42 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
>  	return len;
>  }
>  
> +static ssize_t btrfs_size_classes_show(struct kobject *kobj,
> +				       struct kobj_attribute *a, char *buf)
> +{
> +	struct btrfs_space_info *sinfo = to_space_info(kobj);
> +	struct btrfs_block_group *bg;
> +	int none = 0;
> +	int small = 0;
> +	int medium = 0;
> +	int large = 0;

For simple counters please use unsigned types.

> +	int i;
> +
> +	down_read(&sinfo->groups_sem);

This is a big lock and reading the sysfs repeatedly could block space
reservations. I think RCU works for the block group list and the
size_class is a simple read so the synchronization can be lightweight.

> +	for (i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {

	for (int = 0; ...)

> +		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
> +			if (!btrfs_block_group_should_use_size_class(bg))
> +				continue;
> +			switch (bg->size_class) {
> +			case BTRFS_BG_SZ_NONE:
> +				none++;
> +				break;
> +			case BTRFS_BG_SZ_SMALL:
> +				small++;
> +				break;
> +			case BTRFS_BG_SZ_MEDIUM:
> +				medium++;
> +				break;
> +			case BTRFS_BG_SZ_LARGE:
> +				large++;
> +				break;
> +			}
> +		}
> +	}
> +	up_read(&sinfo->groups_sem);
> +	return sysfs_emit(buf, "%d %d %d %d\n", none, small, medium, large);

This is lacks the types in the output, so this should be like

	"none %u\n"
	"small %u\n"
	...

For stats we can group the values in one file.

> +}
> +
>  #ifdef CONFIG_BTRFS_DEBUG
>  /*
>   * Request chunk allocation with current chunk size.
> @@ -835,6 +872,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
>  SPACE_INFO_ATTR(disk_used);
>  SPACE_INFO_ATTR(disk_total);
>  BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
> +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
>  
>  static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
>  						     struct kobj_attribute *a,
> @@ -887,6 +925,7 @@ static struct attribute *space_info_attrs[] = {
>  	BTRFS_ATTR_PTR(space_info, disk_total),
>  	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
>  	BTRFS_ATTR_PTR(space_info, chunk_size),
> +	BTRFS_ATTR_PTR(space_info, size_classes),
>  #ifdef CONFIG_BTRFS_DEBUG
>  	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
>  #endif
> -- 
> 2.38.1
Boris Burkov Jan. 27, 2023, 9:43 p.m. UTC | #2
On Fri, Jan 27, 2023 at 02:23:45PM +0100, David Sterba wrote:
> On Wed, Jan 25, 2023 at 12:50:33PM -0800, Boris Burkov wrote:
> > Make it possible to see the distribution of size classes for block
> > groups. Helpful for testing and debugging the allocator w.r.t. to size
> > classes.
> 
> Please note the sysfs file path.
> 
> > Signed-off-by: Boris Burkov <boris@bur.io>
> > ---
> >  fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 39 insertions(+)
> > 
> > diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> > index 108aa3876186..e1ae4d2323d6 100644
> > --- a/fs/btrfs/sysfs.c
> > +++ b/fs/btrfs/sysfs.c
> > @@ -9,6 +9,7 @@
> >  #include <linux/spinlock.h>
> >  #include <linux/completion.h>
> >  #include <linux/bug.h>
> > +#include <linux/list.h>
> >  #include <crypto/hash.h>
> >  #include "messages.h"
> >  #include "ctree.h"
> > @@ -778,6 +779,42 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
> >  	return len;
> >  }
> >  
> > +static ssize_t btrfs_size_classes_show(struct kobject *kobj,
> > +				       struct kobj_attribute *a, char *buf)
> > +{
> > +	struct btrfs_space_info *sinfo = to_space_info(kobj);
> > +	struct btrfs_block_group *bg;
> > +	int none = 0;
> > +	int small = 0;
> > +	int medium = 0;
> > +	int large = 0;
> 
> For simple counters please use unsigned types.
> 
> > +	int i;
> > +
> > +	down_read(&sinfo->groups_sem);
> 
> This is a big lock and reading the sysfs repeatedly could block space
> reservations. I think RCU works for the block group list and the
> size_class is a simple read so the synchronization can be lightweight.

I believe space reservations only hold the read lock. The write lock is
needed only to remove or add block groups, so this shouldn't slow down
reservations. Also, FWIW, raid_bytes_show() uses the same
locking/iteration pattern.

I am not sure how to definitely safely concurrently iterate the block
groups without taking the lock. Are you suggesting I should just drop
the locking, and it won't crash but might be inaccurate? Or is there
some other RCU trick I am missing? I don't believe we use any RCU
specific methods when deleting from the list.

Sending a v3 with the rest of your review changes.

Thanks,
Boris

> 
> > +	for (i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {
> 
> 	for (int = 0; ...)
> 
> > +		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
> > +			if (!btrfs_block_group_should_use_size_class(bg))
> > +				continue;
> > +			switch (bg->size_class) {
> > +			case BTRFS_BG_SZ_NONE:
> > +				none++;
> > +				break;
> > +			case BTRFS_BG_SZ_SMALL:
> > +				small++;
> > +				break;
> > +			case BTRFS_BG_SZ_MEDIUM:
> > +				medium++;
> > +				break;
> > +			case BTRFS_BG_SZ_LARGE:
> > +				large++;
> > +				break;
> > +			}
> > +		}
> > +	}
> > +	up_read(&sinfo->groups_sem);
> > +	return sysfs_emit(buf, "%d %d %d %d\n", none, small, medium, large);
> 
> This is lacks the types in the output, so this should be like
> 
> 	"none %u\n"
> 	"small %u\n"
> 	...
> 
> For stats we can group the values in one file.
> 
> > +}
> > +
> >  #ifdef CONFIG_BTRFS_DEBUG
> >  /*
> >   * Request chunk allocation with current chunk size.
> > @@ -835,6 +872,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
> >  SPACE_INFO_ATTR(disk_used);
> >  SPACE_INFO_ATTR(disk_total);
> >  BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
> > +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
> >  
> >  static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
> >  						     struct kobj_attribute *a,
> > @@ -887,6 +925,7 @@ static struct attribute *space_info_attrs[] = {
> >  	BTRFS_ATTR_PTR(space_info, disk_total),
> >  	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
> >  	BTRFS_ATTR_PTR(space_info, chunk_size),
> > +	BTRFS_ATTR_PTR(space_info, size_classes),
> >  #ifdef CONFIG_BTRFS_DEBUG
> >  	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
> >  #endif
> > -- 
> > 2.38.1
David Sterba Feb. 7, 2023, 3:08 p.m. UTC | #3
On Fri, Jan 27, 2023 at 01:43:19PM -0800, Boris Burkov wrote:
> On Fri, Jan 27, 2023 at 02:23:45PM +0100, David Sterba wrote:
> > On Wed, Jan 25, 2023 at 12:50:33PM -0800, Boris Burkov wrote:
> > > Make it possible to see the distribution of size classes for block
> > > groups. Helpful for testing and debugging the allocator w.r.t. to size
> > > classes.
> > 
> > Please note the sysfs file path.
> > 
> > > Signed-off-by: Boris Burkov <boris@bur.io>
> > > ---
> > >  fs/btrfs/sysfs.c | 39 +++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 39 insertions(+)
> > > 
> > > diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> > > index 108aa3876186..e1ae4d2323d6 100644
> > > --- a/fs/btrfs/sysfs.c
> > > +++ b/fs/btrfs/sysfs.c
> > > @@ -9,6 +9,7 @@
> > >  #include <linux/spinlock.h>
> > >  #include <linux/completion.h>
> > >  #include <linux/bug.h>
> > > +#include <linux/list.h>
> > >  #include <crypto/hash.h>
> > >  #include "messages.h"
> > >  #include "ctree.h"
> > > @@ -778,6 +779,42 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
> > >  	return len;
> > >  }
> > >  
> > > +static ssize_t btrfs_size_classes_show(struct kobject *kobj,
> > > +				       struct kobj_attribute *a, char *buf)
> > > +{
> > > +	struct btrfs_space_info *sinfo = to_space_info(kobj);
> > > +	struct btrfs_block_group *bg;
> > > +	int none = 0;
> > > +	int small = 0;
> > > +	int medium = 0;
> > > +	int large = 0;
> > 
> > For simple counters please use unsigned types.
> > 
> > > +	int i;
> > > +
> > > +	down_read(&sinfo->groups_sem);
> > 
> > This is a big lock and reading the sysfs repeatedly could block space
> > reservations. I think RCU works for the block group list and the
> > size_class is a simple read so the synchronization can be lightweight.
> 
> I believe space reservations only hold the read lock. The write lock is
> needed only to remove or add block groups, so this shouldn't slow down
> reservations. Also, FWIW, raid_bytes_show() uses the same
> locking/iteration pattern.

It does use the same lock but it does not lock all space infos, only the
one it's supposed to print the numbers.

> I am not sure how to definitely safely concurrently iterate the block
> groups without taking the lock. Are you suggesting I should just drop
> the locking, and it won't crash but might be inaccurate? Or is there
> some other RCU trick I am missing? I don't believe we use any RCU
> specific methods when deleting from the list.

I don't see the RCU for block groups so some kind of locking needs to be
done, if we don't insist on accurate numbers the mutex can be taken but
eventually yielded (rwsem_is_contended or need_resched).
diff mbox series

Patch

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 108aa3876186..e1ae4d2323d6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -9,6 +9,7 @@ 
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/bug.h>
+#include <linux/list.h>
 #include <crypto/hash.h>
 #include "messages.h"
 #include "ctree.h"
@@ -778,6 +779,42 @@  static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
 	return len;
 }
 
+static ssize_t btrfs_size_classes_show(struct kobject *kobj,
+				       struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_space_info *sinfo = to_space_info(kobj);
+	struct btrfs_block_group *bg;
+	int none = 0;
+	int small = 0;
+	int medium = 0;
+	int large = 0;
+	int i;
+
+	down_read(&sinfo->groups_sem);
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; ++i) {
+		list_for_each_entry(bg, &sinfo->block_groups[i], list) {
+			if (!btrfs_block_group_should_use_size_class(bg))
+				continue;
+			switch (bg->size_class) {
+			case BTRFS_BG_SZ_NONE:
+				none++;
+				break;
+			case BTRFS_BG_SZ_SMALL:
+				small++;
+				break;
+			case BTRFS_BG_SZ_MEDIUM:
+				medium++;
+				break;
+			case BTRFS_BG_SZ_LARGE:
+				large++;
+				break;
+			}
+		}
+	}
+	up_read(&sinfo->groups_sem);
+	return sysfs_emit(buf, "%d %d %d %d\n", none, small, medium, large);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 /*
  * Request chunk allocation with current chunk size.
@@ -835,6 +872,7 @@  SPACE_INFO_ATTR(bytes_zone_unusable);
 SPACE_INFO_ATTR(disk_used);
 SPACE_INFO_ATTR(disk_total);
 BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store);
+BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show);
 
 static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
 						     struct kobj_attribute *a,
@@ -887,6 +925,7 @@  static struct attribute *space_info_attrs[] = {
 	BTRFS_ATTR_PTR(space_info, disk_total),
 	BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
 	BTRFS_ATTR_PTR(space_info, chunk_size),
+	BTRFS_ATTR_PTR(space_info, size_classes),
 #ifdef CONFIG_BTRFS_DEBUG
 	BTRFS_ATTR_PTR(space_info, force_chunk_alloc),
 #endif