diff mbox series

btrfs: qgroup: add sysfs interface for debug

Message ID 20200619015946.65638-1-wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs: qgroup: add sysfs interface for debug | expand

Commit Message

Qu Wenruo June 19, 2020, 1:59 a.m. UTC
This patch will add the following sysfs interface:
/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rfer
/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/excl
/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_rfer
/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_excl
/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/lim_flags
 ^^^ Above are already in "btrfs qgroup show" command output ^^^

/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_data
/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_pertrans
/sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_prealloc

The last 3 rsv related members are not visible to users, but can be very
useful to debug qgroup limit related bugs.

Also, to avoid '/' used in <qgroup_id>, the seperator between qgroup
level and qgroup id is changed to '_'.

The interface is not hidden behind 'debug' as I want this interface to
be included into production build so we could have an easier life to
debug qgroup rsv related bugs.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/ctree.h  |   1 +
 fs/btrfs/qgroup.c |  38 ++++++++----
 fs/btrfs/qgroup.h |  12 ++++
 fs/btrfs/sysfs.c  | 149 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/sysfs.h  |   6 ++
 5 files changed, 194 insertions(+), 12 deletions(-)

Comments

Qu Wenruo June 19, 2020, 7:24 a.m. UTC | #1
On 2020/6/19 上午9:59, Qu Wenruo wrote:
> This patch will add the following sysfs interface:
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rfer
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/excl
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_rfer
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_excl
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/lim_flags
>  ^^^ Above are already in "btrfs qgroup show" command output ^^^
> 
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_data
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_pertrans
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_prealloc
> 
> The last 3 rsv related members are not visible to users, but can be very
> useful to debug qgroup limit related bugs.
> 
> Also, to avoid '/' used in <qgroup_id>, the seperator between qgroup
> level and qgroup id is changed to '_'.
> 
> The interface is not hidden behind 'debug' as I want this interface to
> be included into production build so we could have an easier life to
> debug qgroup rsv related bugs.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/ctree.h  |   1 +
>  fs/btrfs/qgroup.c |  38 ++++++++----
>  fs/btrfs/qgroup.h |  12 ++++
>  fs/btrfs/sysfs.c  | 149 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/sysfs.h  |   6 ++
>  5 files changed, 194 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index d8301bf240e0..7576dfe39841 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -779,6 +779,7 @@ struct btrfs_fs_info {
>  	u32 thread_pool_size;
>  
>  	struct kobject *space_info_kobj;
> +	struct kobject *qgroup_kobj;
>  
>  	u64 total_pinned;
>  
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index 74eb98479109..04fdd42f0eb5 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -22,6 +22,7 @@
>  #include "extent_io.h"
>  #include "qgroup.h"
>  #include "block-group.h"
> +#include "sysfs.h"
>  
>  /* TODO XXX FIXME
>   *  - subvol delete -> delete when ref goes to 0? delete limits also?
> @@ -192,38 +193,47 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
>  	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
>  	struct rb_node *parent = NULL;
>  	struct btrfs_qgroup *qgroup;
> +	int ret;
>  
>  	while (*p) {
>  		parent = *p;
>  		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
>  
> -		if (qgroup->qgroupid < qgroupid)
> +		if (qgroup->qgroupid < qgroupid) {
>  			p = &(*p)->rb_left;
> -		else if (qgroup->qgroupid > qgroupid)
> +		} else if (qgroup->qgroupid > qgroupid) {
>  			p = &(*p)->rb_right;
> -		else
> +		} else {
>  			return qgroup;
> +		}

Oh, extra brackets forgot to remove during debug.

Will address them in next update.

Thanks,
Qu

>  	}
>  
>  	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
>  	if (!qgroup)
>  		return ERR_PTR(-ENOMEM);
> -
>  	qgroup->qgroupid = qgroupid;
>  	INIT_LIST_HEAD(&qgroup->groups);
>  	INIT_LIST_HEAD(&qgroup->members);
>  	INIT_LIST_HEAD(&qgroup->dirty);
>  
> +	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
> +	if (ret < 0) {
> +		kfree(qgroup);
> +		return ERR_PTR(ret);
> +	}
> +
>  	rb_link_node(&qgroup->node, parent, p);
>  	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
>  
>  	return qgroup;
>  }
>  
> -static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
> +static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
> +			    struct btrfs_qgroup *qgroup)
>  {
>  	struct btrfs_qgroup_list *list;
>  
> +	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
>  	list_del(&qgroup->dirty);
>  	while (!list_empty(&qgroup->groups)) {
>  		list = list_first_entry(&qgroup->groups,
> @@ -252,7 +262,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
>  		return -ENOENT;
>  
>  	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
> -	__del_qgroup_rb(qgroup);
> +	__del_qgroup_rb(fs_info, qgroup);
>  	return 0;
>  }
>  
> @@ -351,6 +361,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
>  		goto out;
>  	}
>  
> +	ret = btrfs_sysfs_add_qgroups(fs_info);
> +	if (ret < 0)
> +		goto out;
>  	/* default this to quota off, in case no status key is found */
>  	fs_info->qgroup_flags = 0;
>  
> @@ -500,16 +513,12 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
>  		ulist_free(fs_info->qgroup_ulist);
>  		fs_info->qgroup_ulist = NULL;
>  		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
> +		btrfs_sysfs_del_qgroups(fs_info);
>  	}
>  
>  	return ret < 0 ? ret : 0;
>  }
>  
> -static u64 btrfs_qgroup_subvolid(u64 qgroupid)
> -{
> -	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
> -}
> -
>  /*
>   * Called in close_ctree() when quota is still enabled.  This verifies we don't
>   * leak some reserved space.
> @@ -562,7 +571,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
>  	while ((n = rb_first(&fs_info->qgroup_tree))) {
>  		qgroup = rb_entry(n, struct btrfs_qgroup, node);
>  		rb_erase(n, &fs_info->qgroup_tree);
> -		__del_qgroup_rb(qgroup);
> +		__del_qgroup_rb(fs_info, qgroup);
>  	}
>  	/*
>  	 * We call btrfs_free_qgroup_config() when unmounting
> @@ -571,6 +580,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
>  	 */
>  	ulist_free(fs_info->qgroup_ulist);
>  	fs_info->qgroup_ulist = NULL;
> +	btrfs_sysfs_del_qgroups(fs_info);
>  }
>  
>  static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
> @@ -943,6 +953,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
>  		goto out;
>  	}
>  
> +	ret = btrfs_sysfs_add_qgroups(fs_info);
> +	if (ret < 0)
> +		goto out;
>  	/*
>  	 * 1 for quota root item
>  	 * 1 for BTRFS_QGROUP_STATUS item
> @@ -1089,6 +1102,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
>  		fs_info->qgroup_ulist = NULL;
>  		if (trans)
>  			btrfs_end_transaction(trans);
> +		btrfs_sysfs_del_qgroups(fs_info);
>  	}
>  	mutex_unlock(&fs_info->qgroup_ioctl_lock);
>  	return ret;
> diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
> index 3be5198a3719..728ffea7de48 100644
> --- a/fs/btrfs/qgroup.h
> +++ b/fs/btrfs/qgroup.h
> @@ -8,6 +8,7 @@
>  
>  #include <linux/spinlock.h>
>  #include <linux/rbtree.h>
> +#include <linux/kobject.h>
>  #include "ulist.h"
>  #include "delayed-ref.h"
>  
> @@ -223,8 +224,19 @@ struct btrfs_qgroup {
>  	 */
>  	u64 old_refcnt;
>  	u64 new_refcnt;
> +
> +	/*
> +	 * Sysfs kobjectid
> +	 */
> +	struct kobject kobj;
> +	struct completion kobj_unregister;
>  };
>  
> +static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
> +{
> +	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
> +}
> +
>  /*
>   * For qgroup event trace points only
>   */
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index a39bff64ff24..8468c0a22695 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -19,6 +19,7 @@
>  #include "volumes.h"
>  #include "space-info.h"
>  #include "block-group.h"
> +#include "qgroup.h"
>  
>  struct btrfs_feature_attr {
>  	struct kobj_attribute kobj_attr;
> @@ -1455,6 +1456,154 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
>  	return error;
>  }
>  
> +#define QGROUP_ATTR(_member)						\
> +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj,	\
> +				      struct kobj_attribute *a, char *buf) \
> +{									\
> +	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
> +	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
> +	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
> +			struct btrfs_qgroup, kobj);			\
> +	u64 val;							\
> +									\
> +	spin_lock(&fs_info->qgroup_lock);				\
> +	val = qgroup->_member;						\
> +	spin_unlock(&fs_info->qgroup_lock);				\
> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
> +}									\
> +BTRFS_ATTR(qgroup, _member, btrfs_qgroup_show_##_member)
> +
> +#define QGROUP_RSV_ATTR(_name, _type)					\
> +static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj,\
> +				      struct kobj_attribute *a, char *buf) \
> +{									\
> +	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
> +	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
> +	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
> +			struct btrfs_qgroup, kobj);			\
> +	u64 val;							\
> +									\
> +	spin_lock(&fs_info->qgroup_lock);				\
> +	val = qgroup->rsv.values[_type];					\
> +	spin_unlock(&fs_info->qgroup_lock);				\
> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
> +}									\
> +BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
> +
> +QGROUP_ATTR(rfer);
> +QGROUP_ATTR(excl);
> +QGROUP_ATTR(max_rfer);
> +QGROUP_ATTR(max_excl);
> +QGROUP_ATTR(lim_flags);
> +QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
> +QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
> +QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
> +
> +static struct attribute *qgroup_attrs[] = {
> +	BTRFS_ATTR_PTR(qgroup, rfer),
> +	BTRFS_ATTR_PTR(qgroup, excl),
> +	BTRFS_ATTR_PTR(qgroup, max_rfer),
> +	BTRFS_ATTR_PTR(qgroup, max_excl),
> +	BTRFS_ATTR_PTR(qgroup, lim_flags),
> +	BTRFS_ATTR_PTR(qgroup, rsv_data),
> +	BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
> +	BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
> +	NULL
> +};
> +ATTRIBUTE_GROUPS(qgroup);
> +static void qgroup_release(struct kobject *kobj)
> +{
> +	struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup,
> +			kobj);
> +	memset(&qgroup->kobj, 0, sizeof(*kobj));
> +	complete(&qgroup->kobj_unregister);
> +}
> +
> +static struct kobj_type qgroup_ktype = {
> +	.sysfs_ops = &kobj_sysfs_ops,
> +	.release = qgroup_release,
> +	.default_groups = qgroup_groups,
> +};
> +
> +/*
> + * Needed string buffer size for qgroup, including tailing \0
> + *
> + * This includes U48_MAX + 1 + U16_MAX + 1.
> + * U48_MAX in dec can be 15 digits at, and U16_MAX can be 6 digits.
> + * Rounded up to 32 to provide some buffer.
> + */
> +#define QGROUP_STR_LEN	32
> +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup)
> +{
> +	struct kobject *qgroups_kobj = fs_info->qgroup_kobj;
> +	int ret;
> +
> +	init_completion(&qgroup->kobj_unregister);
> +	ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
> +			"%u_%llu", (u16)btrfs_qgroup_level(qgroup->qgroupid),
> +			btrfs_qgroup_subvolid(qgroup->qgroupid));
> +	if (ret < 0)
> +		kobject_put(&qgroup->kobj);
> +
> +	return ret;
> +}
> +
> +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_qgroup *qgroup;
> +	struct btrfs_qgroup *next;
> +
> +	rbtree_postorder_for_each_entry_safe(qgroup, next,
> +			&fs_info->qgroup_tree, node) {
> +		if (qgroup->kobj.state_initialized) {
> +			kobject_del(&qgroup->kobj);
> +			kobject_put(&qgroup->kobj);
> +			wait_for_completion(&qgroup->kobj_unregister);
> +		}
> +	}
> +	kobject_del(fs_info->qgroup_kobj);
> +	kobject_put(fs_info->qgroup_kobj);
> +	fs_info->qgroup_kobj = NULL;
> +}
> +
> +/* Called when qgroup get initialized, thus there is no need for extra lock. */
> +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
> +{
> +	struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
> +	struct btrfs_qgroup *qgroup;
> +	struct btrfs_qgroup *next;
> +	int ret = 0;
> +
> +	ASSERT(fsid_kobj);
> +	if (fs_info->qgroup_kobj)
> +		return 0;
> +
> +	fs_info->qgroup_kobj = kobject_create_and_add("qgroups", fsid_kobj);
> +	if (!fs_info->qgroup_kobj) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	rbtree_postorder_for_each_entry_safe(qgroup, next,
> +			&fs_info->qgroup_tree, node) {
> +		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +out:
> +	if (ret < 0)
> +		btrfs_sysfs_del_qgroups(fs_info);
> +	return ret;
> +}
> +
> +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup)
> +{
> +	kobject_del(&qgroup->kobj);
> +	kobject_put(&qgroup->kobj);
> +	wait_for_completion(&qgroup->kobj_unregister);
> +}
>  
>  /*
>   * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
> diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
> index 718a26c97833..1e27a9c94c84 100644
> --- a/fs/btrfs/sysfs.h
> +++ b/fs/btrfs/sysfs.h
> @@ -36,4 +36,10 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
>  void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
>  void btrfs_sysfs_update_devid(struct btrfs_device *device);
>  
> +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup);
> +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
> +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
> +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup);
>  #endif
>
David Sterba June 19, 2020, 9:39 a.m. UTC | #2
On Fri, Jun 19, 2020 at 09:59:46AM +0800, Qu Wenruo wrote:
> This patch will add the following sysfs interface:
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rfer
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/excl
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_rfer
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_excl
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/lim_flags
>  ^^^ Above are already in "btrfs qgroup show" command output ^^^
> 
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_data
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_pertrans
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_prealloc
> 
> The last 3 rsv related members are not visible to users, but can be very
> useful to debug qgroup limit related bugs.
> 
> Also, to avoid '/' used in <qgroup_id>, the seperator between qgroup
> level and qgroup id is changed to '_'.
> 
> The interface is not hidden behind 'debug' as I want this interface to
> be included into production build so we could have an easier life to
> debug qgroup rsv related bugs.

But why do you want to export it to sysfs at all?
Qu Wenruo June 19, 2020, 9:52 a.m. UTC | #3
On 2020/6/19 下午5:39, David Sterba wrote:
> On Fri, Jun 19, 2020 at 09:59:46AM +0800, Qu Wenruo wrote:
>> This patch will add the following sysfs interface:
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rfer
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/excl
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_rfer
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_excl
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/lim_flags
>>  ^^^ Above are already in "btrfs qgroup show" command output ^^^
>>
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_data
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_pertrans
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_prealloc
>>
>> The last 3 rsv related members are not visible to users, but can be very
>> useful to debug qgroup limit related bugs.
>>
>> Also, to avoid '/' used in <qgroup_id>, the seperator between qgroup
>> level and qgroup id is changed to '_'.
>>
>> The interface is not hidden behind 'debug' as I want this interface to
>> be included into production build so we could have an easier life to
>> debug qgroup rsv related bugs.
> 
> But why do you want to export it to sysfs at all?
> 
There is an internal report where user is not that co-operative to do
more experiments, but insists on providing more debugging info.

And since they don't want to unset qgroup limit, nor unmount their root
fs to make sure the latest qgroup data rsv safenet catches leakage, the
last method to debug strange early EDQUOT is to export rsv info to user
space.

And, for most users, the new interface won't bother anyone, but when
things go wrong and the user is not cooperative, such interface can save
us a lot of time.

Thanks,
Qu
David Sterba June 25, 2020, 8:21 p.m. UTC | #4
On Fri, Jun 19, 2020 at 09:59:46AM +0800, Qu Wenruo wrote:
> This patch will add the following sysfs interface:
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rfer
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/excl
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_rfer
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_excl
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/lim_flags
>  ^^^ Above are already in "btrfs qgroup show" command output ^^^
> 
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_data
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_pertrans
> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_prealloc
> 
> The last 3 rsv related members are not visible to users, but can be very
> useful to debug qgroup limit related bugs.

I think debugging should not be the only usecase. The qgroups
information can be accessed via ioctls or parsing output of the 'btrfs
qgroup' commands. For that reason I suggest to pick better names of the
fields, rfer, excl are not self explanatory and we can't change them in
the structures. But we can for the sysfs interface.

> Also, to avoid '/' used in <qgroup_id>, the seperator between qgroup
> level and qgroup id is changed to '_'.

This seems fine.

> The interface is not hidden behind 'debug' as I want this interface to
> be included into production build so we could have an easier life to
> debug qgroup rsv related bugs.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/ctree.h  |   1 +
>  fs/btrfs/qgroup.c |  38 ++++++++----
>  fs/btrfs/qgroup.h |  12 ++++
>  fs/btrfs/sysfs.c  | 149 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/sysfs.h  |   6 ++
>  5 files changed, 194 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index d8301bf240e0..7576dfe39841 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -779,6 +779,7 @@ struct btrfs_fs_info {
>  	u32 thread_pool_size;
>  
>  	struct kobject *space_info_kobj;
> +	struct kobject *qgroup_kobj;
>  
>  	u64 total_pinned;
>  
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index 74eb98479109..04fdd42f0eb5 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -22,6 +22,7 @@
>  #include "extent_io.h"
>  #include "qgroup.h"
>  #include "block-group.h"
> +#include "sysfs.h"
>  
>  /* TODO XXX FIXME
>   *  - subvol delete -> delete when ref goes to 0? delete limits also?
> @@ -192,38 +193,47 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
>  	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
>  	struct rb_node *parent = NULL;
>  	struct btrfs_qgroup *qgroup;
> +	int ret;
>  
>  	while (*p) {
>  		parent = *p;
>  		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
>  
> -		if (qgroup->qgroupid < qgroupid)
> +		if (qgroup->qgroupid < qgroupid) {
>  			p = &(*p)->rb_left;
> -		else if (qgroup->qgroupid > qgroupid)
> +		} else if (qgroup->qgroupid > qgroupid) {
>  			p = &(*p)->rb_right;
> -		else
> +		} else {
>  			return qgroup;
> +		}
>  	}
>  
>  	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
>  	if (!qgroup)
>  		return ERR_PTR(-ENOMEM);
> -
>  	qgroup->qgroupid = qgroupid;
>  	INIT_LIST_HEAD(&qgroup->groups);
>  	INIT_LIST_HEAD(&qgroup->members);
>  	INIT_LIST_HEAD(&qgroup->dirty);
>  
> +	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
> +	if (ret < 0) {
> +		kfree(qgroup);
> +		return ERR_PTR(ret);
> +	}
> +
>  	rb_link_node(&qgroup->node, parent, p);
>  	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
>  
>  	return qgroup;
>  }
>  
> -static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
> +static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
> +			    struct btrfs_qgroup *qgroup)
>  {
>  	struct btrfs_qgroup_list *list;
>  
> +	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
>  	list_del(&qgroup->dirty);
>  	while (!list_empty(&qgroup->groups)) {
>  		list = list_first_entry(&qgroup->groups,
> @@ -252,7 +262,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
>  		return -ENOENT;
>  
>  	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
> -	__del_qgroup_rb(qgroup);
> +	__del_qgroup_rb(fs_info, qgroup);
>  	return 0;
>  }
>  
> @@ -351,6 +361,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
>  		goto out;
>  	}
>  
> +	ret = btrfs_sysfs_add_qgroups(fs_info);
> +	if (ret < 0)
> +		goto out;
>  	/* default this to quota off, in case no status key is found */
>  	fs_info->qgroup_flags = 0;
>  
> @@ -500,16 +513,12 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
>  		ulist_free(fs_info->qgroup_ulist);
>  		fs_info->qgroup_ulist = NULL;
>  		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
> +		btrfs_sysfs_del_qgroups(fs_info);
>  	}
>  
>  	return ret < 0 ? ret : 0;
>  }
>  
> -static u64 btrfs_qgroup_subvolid(u64 qgroupid)
> -{
> -	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
> -}
> -
>  /*
>   * Called in close_ctree() when quota is still enabled.  This verifies we don't
>   * leak some reserved space.
> @@ -562,7 +571,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
>  	while ((n = rb_first(&fs_info->qgroup_tree))) {
>  		qgroup = rb_entry(n, struct btrfs_qgroup, node);
>  		rb_erase(n, &fs_info->qgroup_tree);
> -		__del_qgroup_rb(qgroup);
> +		__del_qgroup_rb(fs_info, qgroup);
>  	}
>  	/*
>  	 * We call btrfs_free_qgroup_config() when unmounting
> @@ -571,6 +580,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
>  	 */
>  	ulist_free(fs_info->qgroup_ulist);
>  	fs_info->qgroup_ulist = NULL;
> +	btrfs_sysfs_del_qgroups(fs_info);
>  }
>  
>  static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
> @@ -943,6 +953,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
>  		goto out;
>  	}
>  
> +	ret = btrfs_sysfs_add_qgroups(fs_info);
> +	if (ret < 0)
> +		goto out;
>  	/*
>  	 * 1 for quota root item
>  	 * 1 for BTRFS_QGROUP_STATUS item
> @@ -1089,6 +1102,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
>  		fs_info->qgroup_ulist = NULL;
>  		if (trans)
>  			btrfs_end_transaction(trans);
> +		btrfs_sysfs_del_qgroups(fs_info);
>  	}
>  	mutex_unlock(&fs_info->qgroup_ioctl_lock);
>  	return ret;
> diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
> index 3be5198a3719..728ffea7de48 100644
> --- a/fs/btrfs/qgroup.h
> +++ b/fs/btrfs/qgroup.h
> @@ -8,6 +8,7 @@
>  
>  #include <linux/spinlock.h>
>  #include <linux/rbtree.h>
> +#include <linux/kobject.h>
>  #include "ulist.h"
>  #include "delayed-ref.h"
>  
> @@ -223,8 +224,19 @@ struct btrfs_qgroup {
>  	 */
>  	u64 old_refcnt;
>  	u64 new_refcnt;
> +
> +	/*
> +	 * Sysfs kobjectid
> +	 */
> +	struct kobject kobj;
> +	struct completion kobj_unregister;

Why do you add the unregister completion to each qgroup? There's a
pattern where the parent directory wait's until all its
files/directories are released but I'm not sure if we need it here.

The size of each qgroup would increase by about 100 bytes, which is not
much but it adds up.

>  };
>  
> +static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
> +{
> +	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
> +}
> +
>  /*
>   * For qgroup event trace points only
>   */
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index a39bff64ff24..8468c0a22695 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -19,6 +19,7 @@
>  #include "volumes.h"
>  #include "space-info.h"
>  #include "block-group.h"
> +#include "qgroup.h"
>  
>  struct btrfs_feature_attr {
>  	struct kobj_attribute kobj_attr;
> @@ -1455,6 +1456,154 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
>  	return error;
>  }
>  
> +#define QGROUP_ATTR(_member)						\
> +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj,	\
> +				      struct kobj_attribute *a, char *buf) \
> +{									\
> +	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
> +	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
> +	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
> +			struct btrfs_qgroup, kobj);			\
> +	u64 val;							\
> +									\
> +	spin_lock(&fs_info->qgroup_lock);				\
> +	val = qgroup->_member;						\
> +	spin_unlock(&fs_info->qgroup_lock);				\
> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
> +}									\
> +BTRFS_ATTR(qgroup, _member, btrfs_qgroup_show_##_member)

The macros need to follow the patterns we already have in sysfs.c,
please have a look at eg. global_rsv_size_show or SPACE_INFO_ATTR. It
reads the main pointers and calls btrfs_show_u64.

> +
> +#define QGROUP_RSV_ATTR(_name, _type)					\
> +static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj,\
> +				      struct kobj_attribute *a, char *buf) \
> +{									\
> +	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
> +	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
> +	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
> +			struct btrfs_qgroup, kobj);			\
> +	u64 val;							\
> +									\
> +	spin_lock(&fs_info->qgroup_lock);				\
> +	val = qgroup->rsv.values[_type];					\
> +	spin_unlock(&fs_info->qgroup_lock);				\
> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
> +}									\
> +BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
> +
> +QGROUP_ATTR(rfer);
> +QGROUP_ATTR(excl);
> +QGROUP_ATTR(max_rfer);
> +QGROUP_ATTR(max_excl);
> +QGROUP_ATTR(lim_flags);
> +QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
> +QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
> +QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
> +
> +static struct attribute *qgroup_attrs[] = {
> +	BTRFS_ATTR_PTR(qgroup, rfer),
> +	BTRFS_ATTR_PTR(qgroup, excl),
> +	BTRFS_ATTR_PTR(qgroup, max_rfer),
> +	BTRFS_ATTR_PTR(qgroup, max_excl),
> +	BTRFS_ATTR_PTR(qgroup, lim_flags),
> +	BTRFS_ATTR_PTR(qgroup, rsv_data),
> +	BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
> +	BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
> +	NULL
> +};
> +ATTRIBUTE_GROUPS(qgroup);
> +static void qgroup_release(struct kobject *kobj)
> +{
> +	struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup,
> +			kobj);
> +	memset(&qgroup->kobj, 0, sizeof(*kobj));
> +	complete(&qgroup->kobj_unregister);
> +}
> +
> +static struct kobj_type qgroup_ktype = {
> +	.sysfs_ops = &kobj_sysfs_ops,
> +	.release = qgroup_release,
> +	.default_groups = qgroup_groups,
> +};
> +
> +/*
> + * Needed string buffer size for qgroup, including tailing \0
> + *
> + * This includes U48_MAX + 1 + U16_MAX + 1.

What is U48_MAX?

> + * U48_MAX in dec can be 15 digits at, and U16_MAX can be 6 digits.
> + * Rounded up to 32 to provide some buffer.
> + */
> +#define QGROUP_STR_LEN	32

Unused define

> +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup)
> +{
> +	struct kobject *qgroups_kobj = fs_info->qgroup_kobj;
> +	int ret;
> +
> +	init_completion(&qgroup->kobj_unregister);
> +	ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
> +			"%u_%llu", (u16)btrfs_qgroup_level(qgroup->qgroupid),

%u does not match u16

> +			btrfs_qgroup_subvolid(qgroup->qgroupid));
> +	if (ret < 0)
> +		kobject_put(&qgroup->kobj);
> +
> +	return ret;
> +}
> +
> +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
> +{
> +	struct btrfs_qgroup *qgroup;
> +	struct btrfs_qgroup *next;
> +
> +	rbtree_postorder_for_each_entry_safe(qgroup, next,
> +			&fs_info->qgroup_tree, node) {
> +		if (qgroup->kobj.state_initialized) {
> +			kobject_del(&qgroup->kobj);
> +			kobject_put(&qgroup->kobj);
> +			wait_for_completion(&qgroup->kobj_unregister);
> +		}
> +	}
> +	kobject_del(fs_info->qgroup_kobj);
> +	kobject_put(fs_info->qgroup_kobj);
> +	fs_info->qgroup_kobj = NULL;
> +}
> +
> +/* Called when qgroup get initialized, thus there is no need for extra lock. */
> +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
> +{
> +	struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
> +	struct btrfs_qgroup *qgroup;
> +	struct btrfs_qgroup *next;
> +	int ret = 0;
> +
> +	ASSERT(fsid_kobj);
> +	if (fs_info->qgroup_kobj)
> +		return 0;
> +
> +	fs_info->qgroup_kobj = kobject_create_and_add("qgroups", fsid_kobj);
> +	if (!fs_info->qgroup_kobj) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	rbtree_postorder_for_each_entry_safe(qgroup, next,
> +			&fs_info->qgroup_tree, node) {
> +		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +out:
> +	if (ret < 0)
> +		btrfs_sysfs_del_qgroups(fs_info);
> +	return ret;
> +}
> +
> +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup)
> +{
> +	kobject_del(&qgroup->kobj);
> +	kobject_put(&qgroup->kobj);
> +	wait_for_completion(&qgroup->kobj_unregister);
> +}
>  
>  /*
>   * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
> diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
> index 718a26c97833..1e27a9c94c84 100644
> --- a/fs/btrfs/sysfs.h
> +++ b/fs/btrfs/sysfs.h
> @@ -36,4 +36,10 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
>  void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
>  void btrfs_sysfs_update_devid(struct btrfs_device *device);
>  
> +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup);
> +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
> +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
> +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
> +				struct btrfs_qgroup *qgroup);
>  #endif
> -- 
> 2.27.0
Qu Wenruo June 25, 2020, 11:21 p.m. UTC | #5
On 2020/6/26 上午4:21, David Sterba wrote:
> On Fri, Jun 19, 2020 at 09:59:46AM +0800, Qu Wenruo wrote:
>> This patch will add the following sysfs interface:
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rfer
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/excl
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_rfer
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_excl
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/lim_flags
>>  ^^^ Above are already in "btrfs qgroup show" command output ^^^
>>
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_data
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_pertrans
>> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_prealloc
>>
>> The last 3 rsv related members are not visible to users, but can be very
>> useful to debug qgroup limit related bugs.
> 
> I think debugging should not be the only usecase. The qgroups
> information can be accessed via ioctls or parsing output of the 'btrfs
> qgroup' commands. For that reason I suggest to pick better names of the
> fields, rfer, excl are not self explanatory and we can't change them in
> the structures. But we can for the sysfs interface.

Sure, I would go "reference" and "exclusive" then.

Although for rsv_* they are really for debug purpose, do they need
rename too?

> 
>> Also, to avoid '/' used in <qgroup_id>, the seperator between qgroup
>> level and qgroup id is changed to '_'.
> 
> This seems fine.
> 
>> The interface is not hidden behind 'debug' as I want this interface to
>> be included into production build so we could have an easier life to
>> debug qgroup rsv related bugs.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
...
>>  	u64 new_refcnt;
>> +
>> +	/*
>> +	 * Sysfs kobjectid
>> +	 */
>> +	struct kobject kobj;
>> +	struct completion kobj_unregister;
> 
> Why do you add the unregister completion to each qgroup? There's a
> pattern where the parent directory wait's until all its
> files/directories are released but I'm not sure if we need it here.

It looks like I'm a little paranoid here.
Since for qgroup we always release all qgroups then the parent
directory, the wait is not needed in theory.

> 
> The size of each qgroup would increase by about 100 bytes, which is not
> much but it adds up.
> 
>>  };
>>  
...
>> +#define QGROUP_ATTR(_member)						\
>> +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj,	\
>> +				      struct kobj_attribute *a, char *buf) \
>> +{									\
>> +	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
>> +	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
>> +	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
>> +			struct btrfs_qgroup, kobj);			\
>> +	u64 val;							\
>> +									\
>> +	spin_lock(&fs_info->qgroup_lock);				\
>> +	val = qgroup->_member;						\
>> +	spin_unlock(&fs_info->qgroup_lock);				\
>> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
>> +}									\
>> +BTRFS_ATTR(qgroup, _member, btrfs_qgroup_show_##_member)
> 
> The macros need to follow the patterns we already have in sysfs.c,
> please have a look at eg. global_rsv_size_show or SPACE_INFO_ATTR. It
> reads the main pointers and calls btrfs_show_u64.

Oh, that's great, no need to bother the locking part.

But do I really need to follow the to_space_info() macro? Those macros
really bothers me, as it's not that clear what we're converting from.


> 
>> +
>> +#define QGROUP_RSV_ATTR(_name, _type)					\
>> +static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj,\
>> +				      struct kobj_attribute *a, char *buf) \
>> +{									\
>> +	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
>> +	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
>> +	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
>> +			struct btrfs_qgroup, kobj);			\
>> +	u64 val;							\
>> +									\
>> +	spin_lock(&fs_info->qgroup_lock);				\
>> +	val = qgroup->rsv.values[_type];					\
>> +	spin_unlock(&fs_info->qgroup_lock);				\
>> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
>> +}									\
>> +BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
>> +
>> +QGROUP_ATTR(rfer);
>> +QGROUP_ATTR(excl);
>> +QGROUP_ATTR(max_rfer);
>> +QGROUP_ATTR(max_excl);
>> +QGROUP_ATTR(lim_flags);
>> +QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
>> +QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
>> +QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
>> +
>> +static struct attribute *qgroup_attrs[] = {
>> +	BTRFS_ATTR_PTR(qgroup, rfer),
>> +	BTRFS_ATTR_PTR(qgroup, excl),
>> +	BTRFS_ATTR_PTR(qgroup, max_rfer),
>> +	BTRFS_ATTR_PTR(qgroup, max_excl),
>> +	BTRFS_ATTR_PTR(qgroup, lim_flags),
>> +	BTRFS_ATTR_PTR(qgroup, rsv_data),
>> +	BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
>> +	BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
>> +	NULL
>> +};
>> +ATTRIBUTE_GROUPS(qgroup);
>> +static void qgroup_release(struct kobject *kobj)
>> +{
>> +	struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup,
>> +			kobj);
>> +	memset(&qgroup->kobj, 0, sizeof(*kobj));
>> +	complete(&qgroup->kobj_unregister);
>> +}
>> +
>> +static struct kobj_type qgroup_ktype = {
>> +	.sysfs_ops = &kobj_sysfs_ops,
>> +	.release = qgroup_release,
>> +	.default_groups = qgroup_groups,
>> +};
>> +
>> +/*
>> + * Needed string buffer size for qgroup, including tailing \0
>> + *
>> + * This includes U48_MAX + 1 + U16_MAX + 1.
> 
> What is U48_MAX?

For qgroup id, the upper 16 bits are for level and the the lower 48 bit
are for subvolume id.
So here we use U48 here.

But since the define is unused, it shouldn't be a problem any more.

> 
>> + * U48_MAX in dec can be 15 digits at, and U16_MAX can be 6 digits.
>> + * Rounded up to 32 to provide some buffer.
>> + */
>> +#define QGROUP_STR_LEN	32
> 
> Unused define
> 
>> +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
>> +				struct btrfs_qgroup *qgroup)
>> +{
>> +	struct kobject *qgroups_kobj = fs_info->qgroup_kobj;
>> +	int ret;
>> +
>> +	init_completion(&qgroup->kobj_unregister);
>> +	ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
>> +			"%u_%llu", (u16)btrfs_qgroup_level(qgroup->qgroupid),
> 
> %u does not match u16

But my compiler doesn't complain about this.

Thanks,
Qu

> 
>> +			btrfs_qgroup_subvolid(qgroup->qgroupid));
>> +	if (ret < 0)
>> +		kobject_put(&qgroup->kobj);
>> +
>> +	return ret;
>> +}
>> +
>> +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
>> +{
>> +	struct btrfs_qgroup *qgroup;
>> +	struct btrfs_qgroup *next;
>> +
>> +	rbtree_postorder_for_each_entry_safe(qgroup, next,
>> +			&fs_info->qgroup_tree, node) {
>> +		if (qgroup->kobj.state_initialized) {
>> +			kobject_del(&qgroup->kobj);
>> +			kobject_put(&qgroup->kobj);
>> +			wait_for_completion(&qgroup->kobj_unregister);
>> +		}
>> +	}
>> +	kobject_del(fs_info->qgroup_kobj);
>> +	kobject_put(fs_info->qgroup_kobj);
>> +	fs_info->qgroup_kobj = NULL;
>> +}
>> +
>> +/* Called when qgroup get initialized, thus there is no need for extra lock. */
>> +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
>> +{
>> +	struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
>> +	struct btrfs_qgroup *qgroup;
>> +	struct btrfs_qgroup *next;
>> +	int ret = 0;
>> +
>> +	ASSERT(fsid_kobj);
>> +	if (fs_info->qgroup_kobj)
>> +		return 0;
>> +
>> +	fs_info->qgroup_kobj = kobject_create_and_add("qgroups", fsid_kobj);
>> +	if (!fs_info->qgroup_kobj) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +	rbtree_postorder_for_each_entry_safe(qgroup, next,
>> +			&fs_info->qgroup_tree, node) {
>> +		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
>> +		if (ret < 0)
>> +			goto out;
>> +	}
>> +
>> +out:
>> +	if (ret < 0)
>> +		btrfs_sysfs_del_qgroups(fs_info);
>> +	return ret;
>> +}
>> +
>> +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
>> +				struct btrfs_qgroup *qgroup)
>> +{
>> +	kobject_del(&qgroup->kobj);
>> +	kobject_put(&qgroup->kobj);
>> +	wait_for_completion(&qgroup->kobj_unregister);
>> +}
>>  
>>  /*
>>   * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
>> diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
>> index 718a26c97833..1e27a9c94c84 100644
>> --- a/fs/btrfs/sysfs.h
>> +++ b/fs/btrfs/sysfs.h
>> @@ -36,4 +36,10 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
>>  void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
>>  void btrfs_sysfs_update_devid(struct btrfs_device *device);
>>  
>> +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
>> +				struct btrfs_qgroup *qgroup);
>> +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
>> +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
>> +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
>> +				struct btrfs_qgroup *qgroup);
>>  #endif
>> -- 
>> 2.27.0
David Sterba June 26, 2020, 10:46 a.m. UTC | #6
It does not pass even the self tests and qgroup creation:

[   26.275106] Btrfs loaded, crc32c=crc32c-intel, debug=on, integrity-checker=on, ref-verify=on
[   26.278075] BTRFS: selftest: sectorsize: 4096  nodesize: 4096
[   26.279861] BTRFS: selftest: running btrfs free space cache tests
[   26.281735] BTRFS: selftest: running extent only tests
[   26.283317] BTRFS: selftest: running bitmap only tests
[   26.284966] BTRFS: selftest: running bitmap and extent tests
[   26.286842] BTRFS: selftest: running space stealing from bitmap to extent tests
[   26.289587] BTRFS: selftest: running extent buffer operation tests
[   26.291507] BTRFS: selftest: running btrfs_split_item tests
[   26.293401] BTRFS: selftest: running extent I/O tests
[   26.295059] BTRFS: selftest: running find delalloc tests
[   26.475777] BTRFS: selftest: running find_first_clear_extent_bit test
[   26.477812] BTRFS: selftest: running extent buffer bitmap tests
[   26.499493] BTRFS: selftest: running inode tests
[   26.501164] BTRFS: selftest: running btrfs_get_extent tests
[   26.503599] BTRFS: selftest: running hole first btrfs_get_extent test
[   26.505971] BTRFS: selftest: running outstanding_extents tests
[   26.508136] BTRFS: selftest: running qgroup tests
[   26.509820] BTRFS: selftest: running qgroup add/remove tests
[   26.511566] BUG: sleeping function called from invalid context at mm/slab.h:567
[   26.514058] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 117, name: modprobe
[   26.516671] 2 locks held by modprobe/117:
[   26.517980]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
[   26.521114]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
[   26.524120] CPU: 1 PID: 117 Comm: modprobe Not tainted 5.8.0-rc2-default+ #1154
[   26.526439] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
[   26.529612] Call Trace:
[   26.530731]  dump_stack+0x78/0xa0
[   26.531983]  ___might_sleep.cold+0xa6/0xf9
[   26.533290]  ? kobject_set_name_vargs+0x1e/0x90
[   26.534674]  __kmalloc_track_caller+0x143/0x340
[   26.536122]  kvasprintf+0x64/0xc0
[   26.537257]  kobject_set_name_vargs+0x1e/0x90
[   26.538588]  kobject_init_and_add+0x5d/0xa0
[   26.539878]  ? lockdep_init_map_waits+0x4d/0x200
[   26.541398]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
[   26.543146]  add_qgroup_rb+0xb0/0x140 [btrfs]
[   26.544556]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
[   26.546006]  test_no_shared_qgroup.isra.0+0x66/0x2b0 [btrfs]
[   26.547583]  btrfs_test_qgroups+0x1da/0x220 [btrfs]
[   26.549015]  btrfs_run_sanity_tests.cold+0x5c/0xd5 [btrfs]
[   26.550759]  ? 0xffffffffc0518000
[   26.551911]  init_btrfs_fs+0xf1/0x159 [btrfs]
[   26.553306]  do_one_initcall+0x63/0x320
[   26.554609]  ? rcu_read_lock_sched_held+0x5d/0x90
[   26.564925]  ? do_init_module+0x23/0x220
[   26.566180]  ? kmem_cache_alloc_trace+0x17b/0x2f0
[   26.567721]  do_init_module+0x5c/0x220
[   26.568978]  load_module+0xed8/0x1490
[   26.570252]  ? __do_sys_finit_module+0xbf/0xe0
[   26.571618]  __do_sys_finit_module+0xbf/0xe0
[   26.572955]  do_syscall_64+0x50/0xe0
[   26.574221]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   26.575812] RIP: 0033:0x7fc269fa6649
[   26.577191] Code: Bad RIP value.
[   26.578341] RSP: 002b:00007ffffdc63a88 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
[   26.580890] RAX: ffffffffffffffda RBX: 000055c8d63eba50 RCX: 00007fc269fa6649
[   26.582850] RDX: 0000000000000000 RSI: 000055c8d63c49d2 RDI: 000000000000000a
[   26.584887] RBP: 0000000000040000 R08: 0000000000000000 R09: 000055c8d63eedf0
[   26.586923] R10: 000000000000000a R11: 0000000000000246 R12: 000055c8d63c49d2
[   26.589341] R13: 000055c8d63e6f10 R14: 0000000000000000 R15: 000055c8d63ee530
[   26.591720]
[   26.592542] =============================
[   26.593931] [ BUG: Invalid wait context ]
[   26.595453] 5.8.0-rc2-default+ #1154 Tainted: G        W
[   26.597191] -----------------------------
[   26.598478] modprobe/117 is trying to lock:
[   26.599887] ffffffff8a11e370 (kernfs_mutex){+.+.}-{3:3}, at: kernfs_add_one+0x23/0x150
[   26.602366] other info that might help us debug this:
[   26.604184] context-{4:4}
[   26.605379] 2 locks held by modprobe/117:
[   26.606868]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
[   26.610376]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
[   26.613764] stack backtrace:
[   26.615038] CPU: 1 PID: 117 Comm: modprobe Tainted: G        W         5.8.0-rc2-default+ #1154
[   26.618100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
[   26.621610] Call Trace:
[   26.622731]  dump_stack+0x78/0xa0
[   26.624089]  __lock_acquire.cold+0xa0/0x16c
[   26.625593]  lock_acquire+0xa3/0x440
[   26.626951]  ? kernfs_add_one+0x23/0x150
[   26.628531]  __mutex_lock+0xa0/0xaf0
[   26.629958]  ? kernfs_add_one+0x23/0x150
[   26.631496]  ? kernfs_add_one+0x23/0x150
[   26.633669]  ? kernfs_add_one+0x23/0x150
[   26.635813]  kernfs_add_one+0x23/0x150
[   26.637438]  kernfs_create_dir_ns+0x58/0x80
[   26.638987]  sysfs_create_dir_ns+0x70/0xd0
[   26.640328]  ? rcu_read_lock_sched_held+0x5d/0x90
[   26.641799]  kobject_add_internal+0xbb/0x2d0
[   26.643162]  kobject_init_and_add+0x71/0xa0
[   26.644452]  ? lockdep_init_map_waits+0x4d/0x200
[   26.645889]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
[   26.647602]  add_qgroup_rb+0xb0/0x140 [btrfs]
[   26.649053]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
[   26.650521]  test_no_shared_qgroup.isra.0+0x66/0x2b0 [btrfs]
[   26.652225]  btrfs_test_qgroups+0x1da/0x220 [btrfs]
[   26.653787]  btrfs_run_sanity_tests.cold+0x5c/0xd5 [btrfs]
[   26.655338]  ? 0xffffffffc0518000
[   26.656510]  init_btrfs_fs+0xf1/0x159 [btrfs]
[   26.657905]  do_one_initcall+0x63/0x320
[   26.659167]  ? rcu_read_lock_sched_held+0x5d/0x90
[   26.660563]  ? do_init_module+0x23/0x220
[   26.661841]  ? kmem_cache_alloc_trace+0x17b/0x2f0
[   26.663226]  do_init_module+0x5c/0x220
[   26.664428]  load_module+0xed8/0x1490
[   26.665607]  ? __do_sys_finit_module+0xbf/0xe0
[   26.666952]  __do_sys_finit_module+0xbf/0xe0
[   26.668253]  do_syscall_64+0x50/0xe0
[   26.669422]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   26.670981] RIP: 0033:0x7fc269fa6649

And creating a qgoup fires the same warning:

[  145.501257] BUG: sleeping function called from invalid context at mm/slab.h:567
[  145.506681] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 152, name: btrfs
[  145.521473] INFO: lockdep is turned off.
[  145.523000] CPU: 2 PID: 152 Comm: btrfs Tainted: G        W         5.8.0-rc2-default+ #1154
[  145.526231] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
[  145.530130] Call Trace:
[  145.531296]  dump_stack+0x78/0xa0
[  145.532703]  ___might_sleep.cold+0xa6/0xf9
[  145.534287]  ? kobject_set_name_vargs+0x1e/0x90
[  145.536066]  __kmalloc_track_caller+0x143/0x340
[  145.537824]  kvasprintf+0x64/0xc0
[  145.539157]  kobject_set_name_vargs+0x1e/0x90
[  145.540769]  kobject_init_and_add+0x5d/0xa0
[  145.542348]  ? lockdep_init_map_waits+0x4d/0x200
[  145.544092]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
[  145.545966]  add_qgroup_rb+0xb0/0x140 [btrfs]
[  145.547654]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
[  145.549266]  btrfs_ioctl+0x17bd/0x2540 [btrfs]
[  145.550739]  ? handle_mm_fault+0x732/0xa30
[  145.552305]  ? up_read+0x18/0x240
[  145.553623]  ? ksys_ioctl+0x68/0xa0
[  145.555029]  ksys_ioctl+0x68/0xa0
[  145.556451]  __x64_sys_ioctl+0x16/0x20
[  145.557898]  do_syscall_64+0x50/0xe0
[  145.559206]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  145.560785] RIP: 0033:0x7f13619797b7
[  145.562035] Code: Bad RIP value.
[  145.563248] RSP: 002b:00007fff11e80428 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
[  145.566129] RAX: ffffffffffffffda RBX: 00007fff11e805e8 RCX: 00007f13619797b7
[  145.568255] RDX: 00007fff11e80440 RSI: 000000004010942a RDI: 0000000000000003
[  145.570208] RBP: 0000000000000001 R08: 000055cc5da232a0 R09: 00007f1361a43a40
[  145.572201] R10: fffffffffffff486 R11: 0000000000000206 R12: 00007fff11e80440
[  145.574242] R13: 0000000000000003 R14: 0000000000000000 R15: 000055cc5d9d31ac
Qu Wenruo June 26, 2020, 11:09 a.m. UTC | #7
On 2020/6/26 下午6:46, David Sterba wrote:
> It does not pass even the self tests and qgroup creation:

Very strange, as it passed my local tests.

> 
> [   26.275106] Btrfs loaded, crc32c=crc32c-intel, debug=on, integrity-checker=on, ref-verify=on
> [   26.278075] BTRFS: selftest: sectorsize: 4096  nodesize: 4096
> [   26.279861] BTRFS: selftest: running btrfs free space cache tests
> [   26.281735] BTRFS: selftest: running extent only tests
> [   26.283317] BTRFS: selftest: running bitmap only tests
> [   26.284966] BTRFS: selftest: running bitmap and extent tests
> [   26.286842] BTRFS: selftest: running space stealing from bitmap to extent tests
> [   26.289587] BTRFS: selftest: running extent buffer operation tests
> [   26.291507] BTRFS: selftest: running btrfs_split_item tests
> [   26.293401] BTRFS: selftest: running extent I/O tests
> [   26.295059] BTRFS: selftest: running find delalloc tests
> [   26.475777] BTRFS: selftest: running find_first_clear_extent_bit test
> [   26.477812] BTRFS: selftest: running extent buffer bitmap tests
> [   26.499493] BTRFS: selftest: running inode tests
> [   26.501164] BTRFS: selftest: running btrfs_get_extent tests
> [   26.503599] BTRFS: selftest: running hole first btrfs_get_extent test
> [   26.505971] BTRFS: selftest: running outstanding_extents tests
> [   26.508136] BTRFS: selftest: running qgroup tests
> [   26.509820] BTRFS: selftest: running qgroup add/remove tests
> [   26.511566] BUG: sleeping function called from invalid context at mm/slab.h:567
> [   26.514058] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 117, name: modprobe
> [   26.516671] 2 locks held by modprobe/117:
> [   26.517980]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
> [   26.521114]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
> [   26.524120] CPU: 1 PID: 117 Comm: modprobe Not tainted 5.8.0-rc2-default+ #1154
> [   26.526439] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
> [   26.529612] Call Trace:
> [   26.530731]  dump_stack+0x78/0xa0
> [   26.531983]  ___might_sleep.cold+0xa6/0xf9
> [   26.533290]  ? kobject_set_name_vargs+0x1e/0x90
> [   26.534674]  __kmalloc_track_caller+0x143/0x340
> [   26.536122]  kvasprintf+0x64/0xc0

But according to the call trace, it's indeed allocating the memory.

And my test machine has lockdep enabled, not sure why this is not working.

But thanks anyway for the report,
Qu

> [   26.537257]  kobject_set_name_vargs+0x1e/0x90
> [   26.538588]  kobject_init_and_add+0x5d/0xa0
> [   26.539878]  ? lockdep_init_map_waits+0x4d/0x200
> [   26.541398]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
> [   26.543146]  add_qgroup_rb+0xb0/0x140 [btrfs]
> [   26.544556]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
> [   26.546006]  test_no_shared_qgroup.isra.0+0x66/0x2b0 [btrfs]
> [   26.547583]  btrfs_test_qgroups+0x1da/0x220 [btrfs]
> [   26.549015]  btrfs_run_sanity_tests.cold+0x5c/0xd5 [btrfs]
> [   26.550759]  ? 0xffffffffc0518000
> [   26.551911]  init_btrfs_fs+0xf1/0x159 [btrfs]
> [   26.553306]  do_one_initcall+0x63/0x320
> [   26.554609]  ? rcu_read_lock_sched_held+0x5d/0x90
> [   26.564925]  ? do_init_module+0x23/0x220
> [   26.566180]  ? kmem_cache_alloc_trace+0x17b/0x2f0
> [   26.567721]  do_init_module+0x5c/0x220
> [   26.568978]  load_module+0xed8/0x1490
> [   26.570252]  ? __do_sys_finit_module+0xbf/0xe0
> [   26.571618]  __do_sys_finit_module+0xbf/0xe0
> [   26.572955]  do_syscall_64+0x50/0xe0
> [   26.574221]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   26.575812] RIP: 0033:0x7fc269fa6649
> [   26.577191] Code: Bad RIP value.
> [   26.578341] RSP: 002b:00007ffffdc63a88 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
> [   26.580890] RAX: ffffffffffffffda RBX: 000055c8d63eba50 RCX: 00007fc269fa6649
> [   26.582850] RDX: 0000000000000000 RSI: 000055c8d63c49d2 RDI: 000000000000000a
> [   26.584887] RBP: 0000000000040000 R08: 0000000000000000 R09: 000055c8d63eedf0
> [   26.586923] R10: 000000000000000a R11: 0000000000000246 R12: 000055c8d63c49d2
> [   26.589341] R13: 000055c8d63e6f10 R14: 0000000000000000 R15: 000055c8d63ee530
> [   26.591720]
> [   26.592542] =============================
> [   26.593931] [ BUG: Invalid wait context ]
> [   26.595453] 5.8.0-rc2-default+ #1154 Tainted: G        W
> [   26.597191] -----------------------------
> [   26.598478] modprobe/117 is trying to lock:
> [   26.599887] ffffffff8a11e370 (kernfs_mutex){+.+.}-{3:3}, at: kernfs_add_one+0x23/0x150
> [   26.602366] other info that might help us debug this:
> [   26.604184] context-{4:4}
> [   26.605379] 2 locks held by modprobe/117:
> [   26.606868]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
> [   26.610376]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
> [   26.613764] stack backtrace:
> [   26.615038] CPU: 1 PID: 117 Comm: modprobe Tainted: G        W         5.8.0-rc2-default+ #1154
> [   26.618100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
> [   26.621610] Call Trace:
> [   26.622731]  dump_stack+0x78/0xa0
> [   26.624089]  __lock_acquire.cold+0xa0/0x16c
> [   26.625593]  lock_acquire+0xa3/0x440
> [   26.626951]  ? kernfs_add_one+0x23/0x150
> [   26.628531]  __mutex_lock+0xa0/0xaf0
> [   26.629958]  ? kernfs_add_one+0x23/0x150
> [   26.631496]  ? kernfs_add_one+0x23/0x150
> [   26.633669]  ? kernfs_add_one+0x23/0x150
> [   26.635813]  kernfs_add_one+0x23/0x150
> [   26.637438]  kernfs_create_dir_ns+0x58/0x80
> [   26.638987]  sysfs_create_dir_ns+0x70/0xd0
> [   26.640328]  ? rcu_read_lock_sched_held+0x5d/0x90
> [   26.641799]  kobject_add_internal+0xbb/0x2d0
> [   26.643162]  kobject_init_and_add+0x71/0xa0
> [   26.644452]  ? lockdep_init_map_waits+0x4d/0x200
> [   26.645889]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
> [   26.647602]  add_qgroup_rb+0xb0/0x140 [btrfs]
> [   26.649053]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
> [   26.650521]  test_no_shared_qgroup.isra.0+0x66/0x2b0 [btrfs]
> [   26.652225]  btrfs_test_qgroups+0x1da/0x220 [btrfs]
> [   26.653787]  btrfs_run_sanity_tests.cold+0x5c/0xd5 [btrfs]
> [   26.655338]  ? 0xffffffffc0518000
> [   26.656510]  init_btrfs_fs+0xf1/0x159 [btrfs]
> [   26.657905]  do_one_initcall+0x63/0x320
> [   26.659167]  ? rcu_read_lock_sched_held+0x5d/0x90
> [   26.660563]  ? do_init_module+0x23/0x220
> [   26.661841]  ? kmem_cache_alloc_trace+0x17b/0x2f0
> [   26.663226]  do_init_module+0x5c/0x220
> [   26.664428]  load_module+0xed8/0x1490
> [   26.665607]  ? __do_sys_finit_module+0xbf/0xe0
> [   26.666952]  __do_sys_finit_module+0xbf/0xe0
> [   26.668253]  do_syscall_64+0x50/0xe0
> [   26.669422]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   26.670981] RIP: 0033:0x7fc269fa6649
> 
> And creating a qgoup fires the same warning:
> 
> [  145.501257] BUG: sleeping function called from invalid context at mm/slab.h:567
> [  145.506681] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 152, name: btrfs
> [  145.521473] INFO: lockdep is turned off.
> [  145.523000] CPU: 2 PID: 152 Comm: btrfs Tainted: G        W         5.8.0-rc2-default+ #1154
> [  145.526231] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
> [  145.530130] Call Trace:
> [  145.531296]  dump_stack+0x78/0xa0
> [  145.532703]  ___might_sleep.cold+0xa6/0xf9
> [  145.534287]  ? kobject_set_name_vargs+0x1e/0x90
> [  145.536066]  __kmalloc_track_caller+0x143/0x340
> [  145.537824]  kvasprintf+0x64/0xc0
> [  145.539157]  kobject_set_name_vargs+0x1e/0x90
> [  145.540769]  kobject_init_and_add+0x5d/0xa0
> [  145.542348]  ? lockdep_init_map_waits+0x4d/0x200
> [  145.544092]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
> [  145.545966]  add_qgroup_rb+0xb0/0x140 [btrfs]
> [  145.547654]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
> [  145.549266]  btrfs_ioctl+0x17bd/0x2540 [btrfs]
> [  145.550739]  ? handle_mm_fault+0x732/0xa30
> [  145.552305]  ? up_read+0x18/0x240
> [  145.553623]  ? ksys_ioctl+0x68/0xa0
> [  145.555029]  ksys_ioctl+0x68/0xa0
> [  145.556451]  __x64_sys_ioctl+0x16/0x20
> [  145.557898]  do_syscall_64+0x50/0xe0
> [  145.559206]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  145.560785] RIP: 0033:0x7f13619797b7
> [  145.562035] Code: Bad RIP value.
> [  145.563248] RSP: 002b:00007fff11e80428 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
> [  145.566129] RAX: ffffffffffffffda RBX: 00007fff11e805e8 RCX: 00007f13619797b7
> [  145.568255] RDX: 00007fff11e80440 RSI: 000000004010942a RDI: 0000000000000003
> [  145.570208] RBP: 0000000000000001 R08: 000055cc5da232a0 R09: 00007f1361a43a40
> [  145.572201] R10: fffffffffffff486 R11: 0000000000000206 R12: 00007fff11e80440
> [  145.574242] R13: 0000000000000003 R14: 0000000000000000 R15: 000055cc5d9d31ac
>
David Sterba June 26, 2020, 11:14 a.m. UTC | #8
On Fri, Jun 26, 2020 at 07:21:31AM +0800, Qu Wenruo wrote:
> 
> 
> On 2020/6/26 上午4:21, David Sterba wrote:
> > On Fri, Jun 19, 2020 at 09:59:46AM +0800, Qu Wenruo wrote:
> >> This patch will add the following sysfs interface:
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rfer
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/excl
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_rfer
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/max_excl
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/lim_flags
> >>  ^^^ Above are already in "btrfs qgroup show" command output ^^^
> >>
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_data
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_pertrans
> >> /sys/fs/btrfs/<UUID>/qgroups/<qgroup_id>/rsv_meta_prealloc
> >>
> >> The last 3 rsv related members are not visible to users, but can be very
> >> useful to debug qgroup limit related bugs.
> > 
> > I think debugging should not be the only usecase. The qgroups
> > information can be accessed via ioctls or parsing output of the 'btrfs
> > qgroup' commands. For that reason I suggest to pick better names of the
> > fields, rfer, excl are not self explanatory and we can't change them in
> > the structures. But we can for the sysfs interface.
> 
> Sure, I would go "reference" and "exclusive" then.

Yes please.

> Although for rsv_* they are really for debug purpose, do they need
> rename too?

I've checked the sysfs directory where the reserves global reserve is
exported and '_rsv' is used so I think it's fine for qgoups too.

> >>  	u64 new_refcnt;
> >> +
> >> +	/*
> >> +	 * Sysfs kobjectid
> >> +	 */
> >> +	struct kobject kobj;
> >> +	struct completion kobj_unregister;
> > 
> > Why do you add the unregister completion to each qgroup? There's a
> > pattern where the parent directory wait's until all its
> > files/directories are released but I'm not sure if we need it here.
> 
> It looks like I'm a little paranoid here.
> Since for qgroup we always release all qgroups then the parent
> directory, the wait is not needed in theory.

Then let's remove it to save the bytes.

The wait/completion is used only for kobjects that are embedded into
other structures:

$ grep 'struct kobject[^*]*$' *.h
space-info.h:   struct kobject kobj;
volumes.h:      struct kobject devid_kobj;
volumes.h:      struct kobject fsid_kobj;

$ grep complete sysfs.c
        complete(&fs_devs->kobj_unregister);
        complete(&device->kobj_unregister);

The space infos are handled in another way.

> > 
> > The size of each qgroup would increase by about 100 bytes, which is not
> > much but it adds up.
> > 
> >>  };
> >>  
> ...
> >> +#define QGROUP_ATTR(_member)						\
> >> +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj,	\
> >> +				      struct kobj_attribute *a, char *buf) \
> >> +{									\
> >> +	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
> >> +	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
> >> +	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
> >> +			struct btrfs_qgroup, kobj);			\
> >> +	u64 val;							\
> >> +									\
> >> +	spin_lock(&fs_info->qgroup_lock);				\
> >> +	val = qgroup->_member;						\
> >> +	spin_unlock(&fs_info->qgroup_lock);				\
> >> +	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
> >> +}									\
> >> +BTRFS_ATTR(qgroup, _member, btrfs_qgroup_show_##_member)
> > 
> > The macros need to follow the patterns we already have in sysfs.c,
> > please have a look at eg. global_rsv_size_show or SPACE_INFO_ATTR. It
> > reads the main pointers and calls btrfs_show_u64.
> 
> Oh, that's great, no need to bother the locking part.
> 
> But do I really need to follow the to_space_info() macro? Those macros
> really bothers me, as it's not that clear what we're converting from.

From should be always the kobject. Add helpers that will make the code
straightforward, ie. not necessary to declare 3 variables if this could
be hidding in the helper. And the helper does not need to be a macro,
static inline is fine too.

> >> +static struct kobj_type qgroup_ktype = {
> >> +	.sysfs_ops = &kobj_sysfs_ops,
> >> +	.release = qgroup_release,
> >> +	.default_groups = qgroup_groups,
> >> +};
> >> +
> >> +/*
> >> + * Needed string buffer size for qgroup, including tailing \0
> >> + *
> >> + * This includes U48_MAX + 1 + U16_MAX + 1.
> > 
> > What is U48_MAX?
> 
> For qgroup id, the upper 16 bits are for level and the the lower 48 bit
> are for subvolume id.
> So here we use U48 here.

Yeah I know about the 16/48 split, but there's no such type as u48.

> But since the define is unused, it shouldn't be a problem any more.
> 
> > 
> >> + * U48_MAX in dec can be 15 digits at, and U16_MAX can be 6 digits.
> >> + * Rounded up to 32 to provide some buffer.
> >> + */
> >> +#define QGROUP_STR_LEN	32
> > 
> > Unused define
> > 
> >> +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
> >> +				struct btrfs_qgroup *qgroup)
> >> +{
> >> +	struct kobject *qgroups_kobj = fs_info->qgroup_kobj;
> >> +	int ret;
> >> +
> >> +	init_completion(&qgroup->kobj_unregister);
> >> +	ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
> >> +			"%u_%llu", (u16)btrfs_qgroup_level(qgroup->qgroupid),
> > 
> > %u does not match u16
> 
> But my compiler doesn't complain about this.

That is very weak argument. There are several things just on that line
that would take time to explain why they work and touch parts of C that
are not considered trivial. That compiler can deal with that is great,
but we as humans reading the code would appreciate if the code is clear
not not depending on C language subtleties.

- btrfs_qgroup_level returns u64
- is type cast to a narrower type, the cast will basically do & 0xFFFF
  but we know there's no information loss 64bit -> 16bit from how the
  value is calculated in btrfs_qgroup_level
- the u16 type is narrower than int and appears in variable argument
  list of printf, so there's type promotion to int
- the format is unsigned int, so the (signed) int value is cast to
  unsigned and we get the correct result in the end

Now compare to:

- btrfs_qgroup_level returns u16 (without cast)
- printf format is %hu

The format matches the data.
David Sterba June 26, 2020, 11:40 a.m. UTC | #9
On Fri, Jun 26, 2020 at 07:09:41PM +0800, Qu Wenruo wrote:
> > [   26.508136] BTRFS: selftest: running qgroup tests
> > [   26.509820] BTRFS: selftest: running qgroup add/remove tests
> > [   26.511566] BUG: sleeping function called from invalid context at mm/slab.h:567
> > [   26.514058] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 117, name: modprobe
> > [   26.516671] 2 locks held by modprobe/117:
> > [   26.517980]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
> > [   26.521114]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
> > [   26.524120] CPU: 1 PID: 117 Comm: modprobe Not tainted 5.8.0-rc2-default+ #1154
> > [   26.526439] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
> > [   26.529612] Call Trace:
> > [   26.530731]  dump_stack+0x78/0xa0
> > [   26.531983]  ___might_sleep.cold+0xa6/0xf9
> > [   26.533290]  ? kobject_set_name_vargs+0x1e/0x90
> > [   26.534674]  __kmalloc_track_caller+0x143/0x340
> > [   26.536122]  kvasprintf+0x64/0xc0
> 
> But according to the call trace, it's indeed allocating the memory.
> 
> And my test machine has lockdep enabled, not sure why this is not working.

CONFIG_DEBUG_ATOMIC_SLEEP
Qu Wenruo June 26, 2020, 11:40 a.m. UTC | #10
On 2020/6/26 下午7:09, Qu Wenruo wrote:
> 
> 
> On 2020/6/26 下午6:46, David Sterba wrote:
>> It does not pass even the self tests and qgroup creation:
> 
> Very strange, as it passed my local tests.
> 
>>
>> [   26.275106] Btrfs loaded, crc32c=crc32c-intel, debug=on, integrity-checker=on, ref-verify=on
>> [   26.278075] BTRFS: selftest: sectorsize: 4096  nodesize: 4096
>> [   26.279861] BTRFS: selftest: running btrfs free space cache tests
>> [   26.281735] BTRFS: selftest: running extent only tests
>> [   26.283317] BTRFS: selftest: running bitmap only tests
>> [   26.284966] BTRFS: selftest: running bitmap and extent tests
>> [   26.286842] BTRFS: selftest: running space stealing from bitmap to extent tests
>> [   26.289587] BTRFS: selftest: running extent buffer operation tests
>> [   26.291507] BTRFS: selftest: running btrfs_split_item tests
>> [   26.293401] BTRFS: selftest: running extent I/O tests
>> [   26.295059] BTRFS: selftest: running find delalloc tests
>> [   26.475777] BTRFS: selftest: running find_first_clear_extent_bit test
>> [   26.477812] BTRFS: selftest: running extent buffer bitmap tests
>> [   26.499493] BTRFS: selftest: running inode tests
>> [   26.501164] BTRFS: selftest: running btrfs_get_extent tests
>> [   26.503599] BTRFS: selftest: running hole first btrfs_get_extent test
>> [   26.505971] BTRFS: selftest: running outstanding_extents tests
>> [   26.508136] BTRFS: selftest: running qgroup tests
>> [   26.509820] BTRFS: selftest: running qgroup add/remove tests
>> [   26.511566] BUG: sleeping function called from invalid context at mm/slab.h:567
>> [   26.514058] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 117, name: modprobe
>> [   26.516671] 2 locks held by modprobe/117:
>> [   26.517980]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
>> [   26.521114]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
>> [   26.524120] CPU: 1 PID: 117 Comm: modprobe Not tainted 5.8.0-rc2-default+ #1154
>> [   26.526439] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
>> [   26.529612] Call Trace:
>> [   26.530731]  dump_stack+0x78/0xa0
>> [   26.531983]  ___might_sleep.cold+0xa6/0xf9
>> [   26.533290]  ? kobject_set_name_vargs+0x1e/0x90
>> [   26.534674]  __kmalloc_track_caller+0x143/0x340
>> [   26.536122]  kvasprintf+0x64/0xc0
> 
> But according to the call trace, it's indeed allocating the memory.
> 
> And my test machine has lockdep enabled, not sure why this is not working.

Damn it, my previous tests triggered something during selftest and
disabled lockdep during development, that's why it never triggered
lockdep warning.

And after that happening, never rebooted the VM kernel.

Would learn to reboot before doing final test...

Thanks,
Qu
> 
> But thanks anyway for the report,
> Qu
> 
>> [   26.537257]  kobject_set_name_vargs+0x1e/0x90
>> [   26.538588]  kobject_init_and_add+0x5d/0xa0
>> [   26.539878]  ? lockdep_init_map_waits+0x4d/0x200
>> [   26.541398]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
>> [   26.543146]  add_qgroup_rb+0xb0/0x140 [btrfs]
>> [   26.544556]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
>> [   26.546006]  test_no_shared_qgroup.isra.0+0x66/0x2b0 [btrfs]
>> [   26.547583]  btrfs_test_qgroups+0x1da/0x220 [btrfs]
>> [   26.549015]  btrfs_run_sanity_tests.cold+0x5c/0xd5 [btrfs]
>> [   26.550759]  ? 0xffffffffc0518000
>> [   26.551911]  init_btrfs_fs+0xf1/0x159 [btrfs]
>> [   26.553306]  do_one_initcall+0x63/0x320
>> [   26.554609]  ? rcu_read_lock_sched_held+0x5d/0x90
>> [   26.564925]  ? do_init_module+0x23/0x220
>> [   26.566180]  ? kmem_cache_alloc_trace+0x17b/0x2f0
>> [   26.567721]  do_init_module+0x5c/0x220
>> [   26.568978]  load_module+0xed8/0x1490
>> [   26.570252]  ? __do_sys_finit_module+0xbf/0xe0
>> [   26.571618]  __do_sys_finit_module+0xbf/0xe0
>> [   26.572955]  do_syscall_64+0x50/0xe0
>> [   26.574221]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> [   26.575812] RIP: 0033:0x7fc269fa6649
>> [   26.577191] Code: Bad RIP value.
>> [   26.578341] RSP: 002b:00007ffffdc63a88 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
>> [   26.580890] RAX: ffffffffffffffda RBX: 000055c8d63eba50 RCX: 00007fc269fa6649
>> [   26.582850] RDX: 0000000000000000 RSI: 000055c8d63c49d2 RDI: 000000000000000a
>> [   26.584887] RBP: 0000000000040000 R08: 0000000000000000 R09: 000055c8d63eedf0
>> [   26.586923] R10: 000000000000000a R11: 0000000000000246 R12: 000055c8d63c49d2
>> [   26.589341] R13: 000055c8d63e6f10 R14: 0000000000000000 R15: 000055c8d63ee530
>> [   26.591720]
>> [   26.592542] =============================
>> [   26.593931] [ BUG: Invalid wait context ]
>> [   26.595453] 5.8.0-rc2-default+ #1154 Tainted: G        W
>> [   26.597191] -----------------------------
>> [   26.598478] modprobe/117 is trying to lock:
>> [   26.599887] ffffffff8a11e370 (kernfs_mutex){+.+.}-{3:3}, at: kernfs_add_one+0x23/0x150
>> [   26.602366] other info that might help us debug this:
>> [   26.604184] context-{4:4}
>> [   26.605379] 2 locks held by modprobe/117:
>> [   26.606868]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
>> [   26.610376]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
>> [   26.613764] stack backtrace:
>> [   26.615038] CPU: 1 PID: 117 Comm: modprobe Tainted: G        W         5.8.0-rc2-default+ #1154
>> [   26.618100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
>> [   26.621610] Call Trace:
>> [   26.622731]  dump_stack+0x78/0xa0
>> [   26.624089]  __lock_acquire.cold+0xa0/0x16c
>> [   26.625593]  lock_acquire+0xa3/0x440
>> [   26.626951]  ? kernfs_add_one+0x23/0x150
>> [   26.628531]  __mutex_lock+0xa0/0xaf0
>> [   26.629958]  ? kernfs_add_one+0x23/0x150
>> [   26.631496]  ? kernfs_add_one+0x23/0x150
>> [   26.633669]  ? kernfs_add_one+0x23/0x150
>> [   26.635813]  kernfs_add_one+0x23/0x150
>> [   26.637438]  kernfs_create_dir_ns+0x58/0x80
>> [   26.638987]  sysfs_create_dir_ns+0x70/0xd0
>> [   26.640328]  ? rcu_read_lock_sched_held+0x5d/0x90
>> [   26.641799]  kobject_add_internal+0xbb/0x2d0
>> [   26.643162]  kobject_init_and_add+0x71/0xa0
>> [   26.644452]  ? lockdep_init_map_waits+0x4d/0x200
>> [   26.645889]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
>> [   26.647602]  add_qgroup_rb+0xb0/0x140 [btrfs]
>> [   26.649053]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
>> [   26.650521]  test_no_shared_qgroup.isra.0+0x66/0x2b0 [btrfs]
>> [   26.652225]  btrfs_test_qgroups+0x1da/0x220 [btrfs]
>> [   26.653787]  btrfs_run_sanity_tests.cold+0x5c/0xd5 [btrfs]
>> [   26.655338]  ? 0xffffffffc0518000
>> [   26.656510]  init_btrfs_fs+0xf1/0x159 [btrfs]
>> [   26.657905]  do_one_initcall+0x63/0x320
>> [   26.659167]  ? rcu_read_lock_sched_held+0x5d/0x90
>> [   26.660563]  ? do_init_module+0x23/0x220
>> [   26.661841]  ? kmem_cache_alloc_trace+0x17b/0x2f0
>> [   26.663226]  do_init_module+0x5c/0x220
>> [   26.664428]  load_module+0xed8/0x1490
>> [   26.665607]  ? __do_sys_finit_module+0xbf/0xe0
>> [   26.666952]  __do_sys_finit_module+0xbf/0xe0
>> [   26.668253]  do_syscall_64+0x50/0xe0
>> [   26.669422]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> [   26.670981] RIP: 0033:0x7fc269fa6649
>>
>> And creating a qgoup fires the same warning:
>>
>> [  145.501257] BUG: sleeping function called from invalid context at mm/slab.h:567
>> [  145.506681] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 152, name: btrfs
>> [  145.521473] INFO: lockdep is turned off.
>> [  145.523000] CPU: 2 PID: 152 Comm: btrfs Tainted: G        W         5.8.0-rc2-default+ #1154
>> [  145.526231] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
>> [  145.530130] Call Trace:
>> [  145.531296]  dump_stack+0x78/0xa0
>> [  145.532703]  ___might_sleep.cold+0xa6/0xf9
>> [  145.534287]  ? kobject_set_name_vargs+0x1e/0x90
>> [  145.536066]  __kmalloc_track_caller+0x143/0x340
>> [  145.537824]  kvasprintf+0x64/0xc0
>> [  145.539157]  kobject_set_name_vargs+0x1e/0x90
>> [  145.540769]  kobject_init_and_add+0x5d/0xa0
>> [  145.542348]  ? lockdep_init_map_waits+0x4d/0x200
>> [  145.544092]  btrfs_sysfs_add_one_qgroup+0x72/0xa0 [btrfs]
>> [  145.545966]  add_qgroup_rb+0xb0/0x140 [btrfs]
>> [  145.547654]  btrfs_create_qgroup+0x80/0xf0 [btrfs]
>> [  145.549266]  btrfs_ioctl+0x17bd/0x2540 [btrfs]
>> [  145.550739]  ? handle_mm_fault+0x732/0xa30
>> [  145.552305]  ? up_read+0x18/0x240
>> [  145.553623]  ? ksys_ioctl+0x68/0xa0
>> [  145.555029]  ksys_ioctl+0x68/0xa0
>> [  145.556451]  __x64_sys_ioctl+0x16/0x20
>> [  145.557898]  do_syscall_64+0x50/0xe0
>> [  145.559206]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> [  145.560785] RIP: 0033:0x7f13619797b7
>> [  145.562035] Code: Bad RIP value.
>> [  145.563248] RSP: 002b:00007fff11e80428 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
>> [  145.566129] RAX: ffffffffffffffda RBX: 00007fff11e805e8 RCX: 00007f13619797b7
>> [  145.568255] RDX: 00007fff11e80440 RSI: 000000004010942a RDI: 0000000000000003
>> [  145.570208] RBP: 0000000000000001 R08: 000055cc5da232a0 R09: 00007f1361a43a40
>> [  145.572201] R10: fffffffffffff486 R11: 0000000000000206 R12: 00007fff11e80440
>> [  145.574242] R13: 0000000000000003 R14: 0000000000000000 R15: 000055cc5d9d31ac
>>
>
Qu Wenruo June 26, 2020, 11:43 a.m. UTC | #11
On 2020/6/26 下午7:40, David Sterba wrote:
> On Fri, Jun 26, 2020 at 07:09:41PM +0800, Qu Wenruo wrote:
>>> [   26.508136] BTRFS: selftest: running qgroup tests
>>> [   26.509820] BTRFS: selftest: running qgroup add/remove tests
>>> [   26.511566] BUG: sleeping function called from invalid context at mm/slab.h:567
>>> [   26.514058] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 117, name: modprobe
>>> [   26.516671] 2 locks held by modprobe/117:
>>> [   26.517980]  #0: ffff968162761a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_create_qgroup+0x29/0xf0 [btrfs]
>>> [   26.521114]  #1: ffff968162761960 (&fs_info->qgroup_lock){+.+.}-{2:2}, at: btrfs_create_qgroup+0x75/0xf0 [btrfs]
>>> [   26.524120] CPU: 1 PID: 117 Comm: modprobe Not tainted 5.8.0-rc2-default+ #1154
>>> [   26.526439] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
>>> [   26.529612] Call Trace:
>>> [   26.530731]  dump_stack+0x78/0xa0
>>> [   26.531983]  ___might_sleep.cold+0xa6/0xf9
>>> [   26.533290]  ? kobject_set_name_vargs+0x1e/0x90
>>> [   26.534674]  __kmalloc_track_caller+0x143/0x340
>>> [   26.536122]  kvasprintf+0x64/0xc0
>>
>> But according to the call trace, it's indeed allocating the memory.
>>
>> And my test machine has lockdep enabled, not sure why this is not working.
> 
> CONFIG_DEBUG_ATOMIC_SLEEP
> 
Oh... You're right, that got disabled...

Thanks,
Qu
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d8301bf240e0..7576dfe39841 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -779,6 +779,7 @@  struct btrfs_fs_info {
 	u32 thread_pool_size;
 
 	struct kobject *space_info_kobj;
+	struct kobject *qgroup_kobj;
 
 	u64 total_pinned;
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 74eb98479109..04fdd42f0eb5 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -22,6 +22,7 @@ 
 #include "extent_io.h"
 #include "qgroup.h"
 #include "block-group.h"
+#include "sysfs.h"
 
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -192,38 +193,47 @@  static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
 	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
 	struct rb_node *parent = NULL;
 	struct btrfs_qgroup *qgroup;
+	int ret;
 
 	while (*p) {
 		parent = *p;
 		qgroup = rb_entry(parent, struct btrfs_qgroup, node);
 
-		if (qgroup->qgroupid < qgroupid)
+		if (qgroup->qgroupid < qgroupid) {
 			p = &(*p)->rb_left;
-		else if (qgroup->qgroupid > qgroupid)
+		} else if (qgroup->qgroupid > qgroupid) {
 			p = &(*p)->rb_right;
-		else
+		} else {
 			return qgroup;
+		}
 	}
 
 	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
 	if (!qgroup)
 		return ERR_PTR(-ENOMEM);
-
 	qgroup->qgroupid = qgroupid;
 	INIT_LIST_HEAD(&qgroup->groups);
 	INIT_LIST_HEAD(&qgroup->members);
 	INIT_LIST_HEAD(&qgroup->dirty);
 
+	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+	if (ret < 0) {
+		kfree(qgroup);
+		return ERR_PTR(ret);
+	}
+
 	rb_link_node(&qgroup->node, parent, p);
 	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
 
 	return qgroup;
 }
 
-static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
+			    struct btrfs_qgroup *qgroup)
 {
 	struct btrfs_qgroup_list *list;
 
+	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
 	list_del(&qgroup->dirty);
 	while (!list_empty(&qgroup->groups)) {
 		list = list_first_entry(&qgroup->groups,
@@ -252,7 +262,7 @@  static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
 		return -ENOENT;
 
 	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
-	__del_qgroup_rb(qgroup);
+	__del_qgroup_rb(fs_info, qgroup);
 	return 0;
 }
 
@@ -351,6 +361,9 @@  int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 		goto out;
 	}
 
+	ret = btrfs_sysfs_add_qgroups(fs_info);
+	if (ret < 0)
+		goto out;
 	/* default this to quota off, in case no status key is found */
 	fs_info->qgroup_flags = 0;
 
@@ -500,16 +513,12 @@  int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 		ulist_free(fs_info->qgroup_ulist);
 		fs_info->qgroup_ulist = NULL;
 		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+		btrfs_sysfs_del_qgroups(fs_info);
 	}
 
 	return ret < 0 ? ret : 0;
 }
 
-static u64 btrfs_qgroup_subvolid(u64 qgroupid)
-{
-	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
-}
-
 /*
  * Called in close_ctree() when quota is still enabled.  This verifies we don't
  * leak some reserved space.
@@ -562,7 +571,7 @@  void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 	while ((n = rb_first(&fs_info->qgroup_tree))) {
 		qgroup = rb_entry(n, struct btrfs_qgroup, node);
 		rb_erase(n, &fs_info->qgroup_tree);
-		__del_qgroup_rb(qgroup);
+		__del_qgroup_rb(fs_info, qgroup);
 	}
 	/*
 	 * We call btrfs_free_qgroup_config() when unmounting
@@ -571,6 +580,7 @@  void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 	 */
 	ulist_free(fs_info->qgroup_ulist);
 	fs_info->qgroup_ulist = NULL;
+	btrfs_sysfs_del_qgroups(fs_info);
 }
 
 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
@@ -943,6 +953,9 @@  int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 		goto out;
 	}
 
+	ret = btrfs_sysfs_add_qgroups(fs_info);
+	if (ret < 0)
+		goto out;
 	/*
 	 * 1 for quota root item
 	 * 1 for BTRFS_QGROUP_STATUS item
@@ -1089,6 +1102,7 @@  int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 		fs_info->qgroup_ulist = NULL;
 		if (trans)
 			btrfs_end_transaction(trans);
+		btrfs_sysfs_del_qgroups(fs_info);
 	}
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 3be5198a3719..728ffea7de48 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -8,6 +8,7 @@ 
 
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
+#include <linux/kobject.h>
 #include "ulist.h"
 #include "delayed-ref.h"
 
@@ -223,8 +224,19 @@  struct btrfs_qgroup {
 	 */
 	u64 old_refcnt;
 	u64 new_refcnt;
+
+	/*
+	 * Sysfs kobjectid
+	 */
+	struct kobject kobj;
+	struct completion kobj_unregister;
 };
 
+static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+	return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
 /*
  * For qgroup event trace points only
  */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a39bff64ff24..8468c0a22695 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -19,6 +19,7 @@ 
 #include "volumes.h"
 #include "space-info.h"
 #include "block-group.h"
+#include "qgroup.h"
 
 struct btrfs_feature_attr {
 	struct kobj_attribute kobj_attr;
@@ -1455,6 +1456,154 @@  int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
 	return error;
 }
 
+#define QGROUP_ATTR(_member)						\
+static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj,	\
+				      struct kobj_attribute *a, char *buf) \
+{									\
+	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
+	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
+	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
+			struct btrfs_qgroup, kobj);			\
+	u64 val;							\
+									\
+	spin_lock(&fs_info->qgroup_lock);				\
+	val = qgroup->_member;						\
+	spin_unlock(&fs_info->qgroup_lock);				\
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
+}									\
+BTRFS_ATTR(qgroup, _member, btrfs_qgroup_show_##_member)
+
+#define QGROUP_RSV_ATTR(_name, _type)					\
+static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj,\
+				      struct kobj_attribute *a, char *buf) \
+{									\
+	struct kobject *fsid_kobj = qgroup_kobj->parent->parent;	\
+	struct btrfs_fs_info *fs_info = to_fs_info(fsid_kobj);		\
+	struct btrfs_qgroup *qgroup = container_of(qgroup_kobj,		\
+			struct btrfs_qgroup, kobj);			\
+	u64 val;							\
+									\
+	spin_lock(&fs_info->qgroup_lock);				\
+	val = qgroup->rsv.values[_type];					\
+	spin_unlock(&fs_info->qgroup_lock);				\
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);		\
+}									\
+BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name)
+
+QGROUP_ATTR(rfer);
+QGROUP_ATTR(excl);
+QGROUP_ATTR(max_rfer);
+QGROUP_ATTR(max_excl);
+QGROUP_ATTR(lim_flags);
+QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA);
+QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS);
+QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC);
+
+static struct attribute *qgroup_attrs[] = {
+	BTRFS_ATTR_PTR(qgroup, rfer),
+	BTRFS_ATTR_PTR(qgroup, excl),
+	BTRFS_ATTR_PTR(qgroup, max_rfer),
+	BTRFS_ATTR_PTR(qgroup, max_excl),
+	BTRFS_ATTR_PTR(qgroup, lim_flags),
+	BTRFS_ATTR_PTR(qgroup, rsv_data),
+	BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans),
+	BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc),
+	NULL
+};
+ATTRIBUTE_GROUPS(qgroup);
+static void qgroup_release(struct kobject *kobj)
+{
+	struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup,
+			kobj);
+	memset(&qgroup->kobj, 0, sizeof(*kobj));
+	complete(&qgroup->kobj_unregister);
+}
+
+static struct kobj_type qgroup_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = qgroup_release,
+	.default_groups = qgroup_groups,
+};
+
+/*
+ * Needed string buffer size for qgroup, including tailing \0
+ *
+ * This includes U48_MAX + 1 + U16_MAX + 1.
+ * U48_MAX in dec can be 15 digits at, and U16_MAX can be 6 digits.
+ * Rounded up to 32 to provide some buffer.
+ */
+#define QGROUP_STR_LEN	32
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+				struct btrfs_qgroup *qgroup)
+{
+	struct kobject *qgroups_kobj = fs_info->qgroup_kobj;
+	int ret;
+
+	init_completion(&qgroup->kobj_unregister);
+	ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj,
+			"%u_%llu", (u16)btrfs_qgroup_level(qgroup->qgroupid),
+			btrfs_qgroup_subvolid(qgroup->qgroupid));
+	if (ret < 0)
+		kobject_put(&qgroup->kobj);
+
+	return ret;
+}
+
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup *next;
+
+	rbtree_postorder_for_each_entry_safe(qgroup, next,
+			&fs_info->qgroup_tree, node) {
+		if (qgroup->kobj.state_initialized) {
+			kobject_del(&qgroup->kobj);
+			kobject_put(&qgroup->kobj);
+			wait_for_completion(&qgroup->kobj_unregister);
+		}
+	}
+	kobject_del(fs_info->qgroup_kobj);
+	kobject_put(fs_info->qgroup_kobj);
+	fs_info->qgroup_kobj = NULL;
+}
+
+/* Called when qgroup get initialized, thus there is no need for extra lock. */
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
+{
+	struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
+	struct btrfs_qgroup *qgroup;
+	struct btrfs_qgroup *next;
+	int ret = 0;
+
+	ASSERT(fsid_kobj);
+	if (fs_info->qgroup_kobj)
+		return 0;
+
+	fs_info->qgroup_kobj = kobject_create_and_add("qgroups", fsid_kobj);
+	if (!fs_info->qgroup_kobj) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	rbtree_postorder_for_each_entry_safe(qgroup, next,
+			&fs_info->qgroup_tree, node) {
+		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
+		if (ret < 0)
+			goto out;
+	}
+
+out:
+	if (ret < 0)
+		btrfs_sysfs_del_qgroups(fs_info);
+	return ret;
+}
+
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+				struct btrfs_qgroup *qgroup)
+{
+	kobject_del(&qgroup->kobj);
+	kobject_put(&qgroup->kobj);
+	wait_for_completion(&qgroup->kobj_unregister);
+}
 
 /*
  * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 718a26c97833..1e27a9c94c84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -36,4 +36,10 @@  int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
 void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
 void btrfs_sysfs_update_devid(struct btrfs_device *device);
 
+int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info,
+				struct btrfs_qgroup *qgroup);
+void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
+				struct btrfs_qgroup *qgroup);
 #endif