diff mbox series

[v2,1/3] quota: add quota in-memory format support

Message ID 20221121142854.91109-2-lczerner@redhat.com (mailing list archive)
State New, archived
Headers show
Series shmem: user and group quota support for tmpfs | expand

Commit Message

Lukas Czerner Nov. 21, 2022, 2:28 p.m. UTC
In memory quota format relies on quota infrastructure to store dquot
information for us. While conventional quota formats for file systems
with persistent storage can load quota information into dquot from the
storage on-demand and hence quota dquot shrinker can free any dquot that
is not currently being used, it must be avoided here. Otherwise we can
lose valuable information, user provided limits, because there is no
persistent storage to load the information from afterwards.

One information that in-memory quota format needs to keep track of is a
sorted list of ids for each quota type. This is done by utilizing an rb
tree which root is stored in mem_dqinfo->dqi_priv for each quota type.

This format can be used to support quota on file system without persistent
storage such as tmpfs.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/quota/Kconfig           |   8 ++
 fs/quota/Makefile          |   1 +
 fs/quota/dquot.c           |   3 +
 fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
 include/linux/quota.h      |   7 +-
 include/uapi/linux/quota.h |   1 +
 6 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 fs/quota/quota_mem.c

Comments

Darrick J. Wong Nov. 21, 2022, 5:48 p.m. UTC | #1
On Mon, Nov 21, 2022 at 03:28:52PM +0100, Lukas Czerner wrote:
> In memory quota format relies on quota infrastructure to store dquot
> information for us. While conventional quota formats for file systems
> with persistent storage can load quota information into dquot from the
> storage on-demand and hence quota dquot shrinker can free any dquot that
> is not currently being used, it must be avoided here. Otherwise we can
> lose valuable information, user provided limits, because there is no
> persistent storage to load the information from afterwards.
> 
> One information that in-memory quota format needs to keep track of is a
> sorted list of ids for each quota type. This is done by utilizing an rb
> tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> 
> This format can be used to support quota on file system without persistent
> storage such as tmpfs.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> ---
>  fs/quota/Kconfig           |   8 ++
>  fs/quota/Makefile          |   1 +
>  fs/quota/dquot.c           |   3 +
>  fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
>  include/linux/quota.h      |   7 +-
>  include/uapi/linux/quota.h |   1 +
>  6 files changed, 279 insertions(+), 1 deletion(-)
>  create mode 100644 fs/quota/quota_mem.c
> 
> diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
> index b59cd172b5f9..8ea9656ca37b 100644
> --- a/fs/quota/Kconfig
> +++ b/fs/quota/Kconfig
> @@ -67,6 +67,14 @@ config QFMT_V2
>  	  also supports 64-bit inode and block quota limits. If you need this
>  	  functionality say Y here.
>  
> +config QFMT_MEM
> +	tristate "Quota in-memory format support "
> +	depends on QUOTA
> +	help
> +	  This config option enables kernel support for in-memory quota
> +	  format support. Useful to support quota on file system without
> +	  permanent storage. If you need this functionality say Y here.
> +
>  config QUOTACTL
>  	bool
>  	default n
> diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> index 9160639daffa..935be3f7b731 100644
> --- a/fs/quota/Makefile
> +++ b/fs/quota/Makefile
> @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
>  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
>  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
>  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 0427b44bfee5..f1a7a03632a2 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  	spin_lock(&dq_list_lock);
>  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
>  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> +			continue;
>  		remove_dquot_hash(dquot);
>  		remove_free_dquot(dquot);
>  		remove_inuse(dquot);
> diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> new file mode 100644
> index 000000000000..7d5e82122143
> --- /dev/null
> +++ b/fs/quota/quota_mem.c
> @@ -0,0 +1,260 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * In memory quota format relies on quota infrastructure to store dquot
> + * information for us. While conventional quota formats for file systems
> + * with persistent storage can load quota information into dquot from the
> + * storage on-demand and hence quota dquot shrinker can free any dquot
> + * that is not currently being used, it must be avoided here. Otherwise we
> + * can lose valuable information, user provided limits, because there is
> + * no persistent storage to load the information from afterwards.

Hmm.  dquots can't /ever/ be reclaimed?  struct dquot is ~256 bytes, and
assuming 32-bit uids, the upper bound on dquot usage is 2^(32+8) bytes
== 1TB of memory usage?  Once allocated, you'd have to reboot the whole
machine to get that memory back?

Would it be wise to "persist" dquot contents to a (private) tmpfs file
to facilitate incore dquot reclaim?  The tmpfs file data can be paged
out, or even punched if all the dquot records in that page go back to
default settings.

--D

> + *
> + * One information that in-memory quota format needs to keep track of is
> + * a sorted list of ids for each quota type. This is done by utilizing
> + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> + * type.
> + *
> + * This format can be used to support quota on file system without persistent
> + * storage such as tmpfs.
> + */
> +#include <linux/errno.h>
> +#include <linux/fs.h>
> +#include <linux/mount.h>
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/rbtree.h>
> +
> +#include <linux/quotaops.h>
> +#include <linux/quota.h>
> +
> +MODULE_AUTHOR("Lukas Czerner");
> +MODULE_DESCRIPTION("Quota in-memory format support");
> +MODULE_LICENSE("GPL");
> +
> +/*
> + * The following constants define the amount of time given a user
> + * before the soft limits are treated as hard limits (usually resulting
> + * in an allocation failure). The timer is started when the user crosses
> + * their soft limit, it is reset when they go below their soft limit.
> + */
> +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> +
> +struct quota_id {
> +	struct rb_node	node;
> +	qid_t		id;
> +};
> +
> +static int mem_check_quota_file(struct super_block *sb, int type)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 1;
> +}
> +
> +/*
> + * There is no real quota file. Just allocate rb_root for quota ids and
> + * set limits
> + */
> +static int mem_read_file_info(struct super_block *sb, int type)
> +{
> +	struct quota_info *dqopt = sb_dqopt(sb);
> +	struct mem_dqinfo *info = &dqopt->info[type];
> +	int ret = 0;
> +
> +	down_read(&dqopt->dqio_sem);
> +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> +	if (!info->dqi_priv) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Used space is stored as unsigned 64-bit value in bytes but
> +	 * quota core supports only signed 64-bit values so use that
> +	 * as a limit
> +	 */
> +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> +
> +	info->dqi_bgrace = MAX_DQ_TIME;
> +	info->dqi_igrace = MAX_IQ_TIME;
> +	info->dqi_flags = 0;
> +
> +out_unlock:
> +	up_read(&dqopt->dqio_sem);
> +	return ret;
> +}
> +
> +static int mem_write_file_info(struct super_block *sb, int type)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 0;
> +}
> +
> +/*
> + * Free all the quota_id entries in the rb tree and rb_root.
> + */
> +static int mem_free_file_info(struct super_block *sb, int type)
> +{
> +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> +	struct rb_root *root = info->dqi_priv;
> +	struct quota_id *entry;
> +	struct rb_node *node;
> +
> +	info->dqi_priv = NULL;
> +	node = rb_first(root);
> +	while (node) {
> +		entry = rb_entry(node, struct quota_id, node);
> +		node = rb_next(&entry->node);
> +
> +		rb_erase(&entry->node, root);
> +		kfree(entry);
> +	}
> +
> +	kfree(root);
> +	return 0;
> +}
> +
> +/*
> + * There is no real quota file, nothing to read. Just insert the id in
> + * the rb tree.
> + */
> +static int mem_read_dquot(struct dquot *dquot)
> +{
> +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> +	struct rb_node *parent = NULL, *new_node = NULL;
> +	struct quota_id *new_entry, *entry;
> +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> +	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
> +	int ret = 0;
> +
> +	down_write(&dqopt->dqio_sem);
> +
> +	while (*n) {
> +		parent = *n;
> +		entry = rb_entry(parent, struct quota_id, node);
> +
> +		if (id < entry->id)
> +			n = &(*n)->rb_left;
> +		else if (id > entry->id)
> +			n = &(*n)->rb_right;
> +		else
> +			goto out_unlock;
> +	}
> +
> +	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
> +	if (!new_entry) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	new_entry->id = id;
> +	new_node = &new_entry->node;
> +	rb_link_node(new_node, parent, n);
> +	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
> +	dquot->dq_off = 1;
> +	/*
> +	 * Make sure dquot is never released by a shrinker because we
> +	 * rely on quota infrastructure to store mem_dqblk in dquot.
> +	 */
> +	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
> +	set_bit(DQ_FAKE_B, &dquot->dq_flags);
> +
> +out_unlock:
> +	up_write(&dqopt->dqio_sem);
> +	return ret;
> +}
> +
> +static int mem_write_dquot(struct dquot *dquot)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 0;
> +}
> +
> +static int mem_release_dquot(struct dquot *dquot)
> +{
> +	/*
> +	 * Everything is in memory only, release once we're done with
> +	 * quota via mem_free_file_info().
> +	 */
> +	return 0;
> +}
> +
> +static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
> +{
> +	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
> +	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
> +	qid_t id = from_kqid(&init_user_ns, *qid);
> +	struct quota_info *dqopt = sb_dqopt(sb);
> +	struct quota_id *entry = NULL;
> +	int ret = 0;
> +
> +	down_read(&dqopt->dqio_sem);
> +	while (node) {
> +		entry = rb_entry(node, struct quota_id, node);
> +
> +		if (id < entry->id)
> +			node = node->rb_left;
> +		else if (id > entry->id)
> +			node = node->rb_right;
> +		else
> +			goto got_next_id;
> +	}
> +
> +	if (!entry) {
> +		ret = -ENOENT;
> +		goto out_unlock;
> +	}
> +
> +	if (id > entry->id) {
> +		node = rb_next(&entry->node);
> +		if (!node) {
> +			ret = -ENOENT;
> +			goto out_unlock;
> +		}
> +		entry = rb_entry(node, struct quota_id, node);
> +	}
> +
> +got_next_id:
> +	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
> +out_unlock:
> +	up_read(&dqopt->dqio_sem);
> +	return ret;
> +}
> +
> +static const struct quota_format_ops mem_format_ops = {
> +	.check_quota_file	= mem_check_quota_file,
> +	.read_file_info		= mem_read_file_info,
> +	.write_file_info	= mem_write_file_info,
> +	.free_file_info		= mem_free_file_info,
> +	.read_dqblk		= mem_read_dquot,
> +	.commit_dqblk		= mem_write_dquot,
> +	.release_dqblk		= mem_release_dquot,
> +	.get_next_id		= mem_get_next_id,
> +};
> +
> +static struct quota_format_type mem_quota_format = {
> +	.qf_fmt_id	= QFMT_MEM_ONLY,
> +	.qf_ops		= &mem_format_ops,
> +	.qf_owner	= THIS_MODULE
> +};
> +
> +static int __init init_mem_quota_format(void)
> +{
> +	return register_quota_format(&mem_quota_format);
> +}
> +
> +static void __exit exit_mem_quota_format(void)
> +{
> +	unregister_quota_format(&mem_quota_format);
> +}
> +
> +module_init(init_mem_quota_format);
> +module_exit(exit_mem_quota_format);
> diff --git a/include/linux/quota.h b/include/linux/quota.h
> index fd692b4a41d5..4398e05c8b72 100644
> --- a/include/linux/quota.h
> +++ b/include/linux/quota.h
> @@ -285,7 +285,11 @@ static inline void dqstats_dec(unsigned int type)
>  #define DQ_FAKE_B	3	/* no limits only usage */
>  #define DQ_READ_B	4	/* dquot was read into memory */
>  #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
> -#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
> +#define DQ_NO_SHRINK_B	6	/* modified dquot (not DQ_FAKE_B) is never to
> +				 * be released by a shrinker. It should remain
> +				 * in memory until quotas are being disabled on
> +				 * unmount. */
> +#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
>  				 * for the mask of entries set via SETQUOTA\
>  				 * quotactl. They are set under dq_data_lock\
>  				 * and the quota format handling dquot can\
> @@ -536,6 +540,7 @@ struct quota_module_name {
>  	{QFMT_VFS_OLD, "quota_v1"},\
>  	{QFMT_VFS_V0, "quota_v2"},\
>  	{QFMT_VFS_V1, "quota_v2"},\
> +	{QFMT_MEM_ONLY, "quota_mem"},\
>  	{0, NULL}}
>  
>  #endif /* _QUOTA_ */
> diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
> index f17c9636a859..ee9d2bad00c7 100644
> --- a/include/uapi/linux/quota.h
> +++ b/include/uapi/linux/quota.h
> @@ -77,6 +77,7 @@
>  #define	QFMT_VFS_V0 2
>  #define QFMT_OCFS2 3
>  #define	QFMT_VFS_V1 4
> +#define	QFMT_MEM_ONLY 5
>  
>  /* Size of block in which space limits are passed through the quota
>   * interface */
> -- 
> 2.38.1
>
Lukas Czerner Nov. 22, 2022, 9:04 a.m. UTC | #2
On Mon, Nov 21, 2022 at 09:48:18AM -0800, Darrick J. Wong wrote:
> On Mon, Nov 21, 2022 at 03:28:52PM +0100, Lukas Czerner wrote:
> > In memory quota format relies on quota infrastructure to store dquot
> > information for us. While conventional quota formats for file systems
> > with persistent storage can load quota information into dquot from the
> > storage on-demand and hence quota dquot shrinker can free any dquot that
> > is not currently being used, it must be avoided here. Otherwise we can
> > lose valuable information, user provided limits, because there is no
> > persistent storage to load the information from afterwards.
> > 
> > One information that in-memory quota format needs to keep track of is a
> > sorted list of ids for each quota type. This is done by utilizing an rb
> > tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> > 
> > This format can be used to support quota on file system without persistent
> > storage such as tmpfs.
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > ---
> >  fs/quota/Kconfig           |   8 ++
> >  fs/quota/Makefile          |   1 +
> >  fs/quota/dquot.c           |   3 +
> >  fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
> >  include/linux/quota.h      |   7 +-
> >  include/uapi/linux/quota.h |   1 +
> >  6 files changed, 279 insertions(+), 1 deletion(-)
> >  create mode 100644 fs/quota/quota_mem.c
> > 
> > diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
> > index b59cd172b5f9..8ea9656ca37b 100644
> > --- a/fs/quota/Kconfig
> > +++ b/fs/quota/Kconfig
> > @@ -67,6 +67,14 @@ config QFMT_V2
> >  	  also supports 64-bit inode and block quota limits. If you need this
> >  	  functionality say Y here.
> >  
> > +config QFMT_MEM
> > +	tristate "Quota in-memory format support "
> > +	depends on QUOTA
> > +	help
> > +	  This config option enables kernel support for in-memory quota
> > +	  format support. Useful to support quota on file system without
> > +	  permanent storage. If you need this functionality say Y here.
> > +
> >  config QUOTACTL
> >  	bool
> >  	default n
> > diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> > index 9160639daffa..935be3f7b731 100644
> > --- a/fs/quota/Makefile
> > +++ b/fs/quota/Makefile
> > @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
> >  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
> >  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
> >  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> > +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> > diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> > index 0427b44bfee5..f1a7a03632a2 100644
> > --- a/fs/quota/dquot.c
> > +++ b/fs/quota/dquot.c
> > @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  	spin_lock(&dq_list_lock);
> >  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
> >  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> > +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> > +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> > +			continue;
> >  		remove_dquot_hash(dquot);
> >  		remove_free_dquot(dquot);
> >  		remove_inuse(dquot);
> > diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> > new file mode 100644
> > index 000000000000..7d5e82122143
> > --- /dev/null
> > +++ b/fs/quota/quota_mem.c
> > @@ -0,0 +1,260 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * In memory quota format relies on quota infrastructure to store dquot
> > + * information for us. While conventional quota formats for file systems
> > + * with persistent storage can load quota information into dquot from the
> > + * storage on-demand and hence quota dquot shrinker can free any dquot
> > + * that is not currently being used, it must be avoided here. Otherwise we
> > + * can lose valuable information, user provided limits, because there is
> > + * no persistent storage to load the information from afterwards.
> 
> Hmm.  dquots can't /ever/ be reclaimed?  struct dquot is ~256 bytes, and
> assuming 32-bit uids, the upper bound on dquot usage is 2^(32+8) bytes
> == 1TB of memory usage?  Once allocated, you'd have to reboot the whole
> machine to get that memory back?

Hi Darrick,

maybe there are some improvements to the documentation to be made. The
dquots will be freed on unmount as they normally would be. Also, only
dquots containing actual user-modified limits — that is, only dquots that
are not DQ_FAKE_B — are prevented from being reclaimed by the shrinker;
see the condition in dqcache_shrink_scan().

> 
> Would it be wise to "persist" dquot contents to a (private) tmpfs file
> to facilitate incore dquot reclaim?  The tmpfs file data can be paged
> out, or even punched if all the dquot records in that page go back to
> default settings.

The dquot will be flagged as DQ_FAKE_B once the limits are set to 0. But
when I think about it, this poses a problem with the default quota limits,
because the limits would revert to the defaults once the dquot is
reclaimed and then allocated again. This can be solved by implementing a
custom .set_dqblk().

Other than this problem, does this address your concern about dquot
reclaim?

Thanks!
-Lukas

> 
> --D
> 
> > + *
> > + * One information that in-memory quota format needs to keep track of is
> > + * a sorted list of ids for each quota type. This is done by utilizing
> > + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> > + * type.
> > + *
> > + * This format can be used to support quota on file system without persistent
> > + * storage such as tmpfs.
> > + */
> > +#include <linux/errno.h>
> > +#include <linux/fs.h>
> > +#include <linux/mount.h>
> > +#include <linux/kernel.h>
> > +#include <linux/init.h>
> > +#include <linux/module.h>
> > +#include <linux/slab.h>
> > +#include <linux/rbtree.h>
> > +
> > +#include <linux/quotaops.h>
> > +#include <linux/quota.h>
> > +
> > +MODULE_AUTHOR("Lukas Czerner");
> > +MODULE_DESCRIPTION("Quota in-memory format support");
> > +MODULE_LICENSE("GPL");
> > +
> > +/*
> > + * The following constants define the amount of time given a user
> > + * before the soft limits are treated as hard limits (usually resulting
> > + * in an allocation failure). The timer is started when the user crosses
> > + * their soft limit, it is reset when they go below their soft limit.
> > + */
> > +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> > +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> > +
> > +struct quota_id {
> > +	struct rb_node	node;
> > +	qid_t		id;
> > +};
> > +
> > +static int mem_check_quota_file(struct super_block *sb, int type)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 1;
> > +}
> > +
> > +/*
> > + * There is no real quota file. Just allocate rb_root for quota ids and
> > + * set limits
> > + */
> > +static int mem_read_file_info(struct super_block *sb, int type)
> > +{
> > +	struct quota_info *dqopt = sb_dqopt(sb);
> > +	struct mem_dqinfo *info = &dqopt->info[type];
> > +	int ret = 0;
> > +
> > +	down_read(&dqopt->dqio_sem);
> > +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> > +	if (!info->dqi_priv) {
> > +		ret = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/*
> > +	 * Used space is stored as unsigned 64-bit value in bytes but
> > +	 * quota core supports only signed 64-bit values so use that
> > +	 * as a limit
> > +	 */
> > +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> > +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> > +
> > +	info->dqi_bgrace = MAX_DQ_TIME;
> > +	info->dqi_igrace = MAX_IQ_TIME;
> > +	info->dqi_flags = 0;
> > +
> > +out_unlock:
> > +	up_read(&dqopt->dqio_sem);
> > +	return ret;
> > +}
> > +
> > +static int mem_write_file_info(struct super_block *sb, int type)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Free all the quota_id entries in the rb tree and rb_root.
> > + */
> > +static int mem_free_file_info(struct super_block *sb, int type)
> > +{
> > +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> > +	struct rb_root *root = info->dqi_priv;
> > +	struct quota_id *entry;
> > +	struct rb_node *node;
> > +
> > +	info->dqi_priv = NULL;
> > +	node = rb_first(root);
> > +	while (node) {
> > +		entry = rb_entry(node, struct quota_id, node);
> > +		node = rb_next(&entry->node);
> > +
> > +		rb_erase(&entry->node, root);
> > +		kfree(entry);
> > +	}
> > +
> > +	kfree(root);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * There is no real quota file, nothing to read. Just insert the id in
> > + * the rb tree.
> > + */
> > +static int mem_read_dquot(struct dquot *dquot)
> > +{
> > +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> > +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> > +	struct rb_node *parent = NULL, *new_node = NULL;
> > +	struct quota_id *new_entry, *entry;
> > +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> > +	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
> > +	int ret = 0;
> > +
> > +	down_write(&dqopt->dqio_sem);
> > +
> > +	while (*n) {
> > +		parent = *n;
> > +		entry = rb_entry(parent, struct quota_id, node);
> > +
> > +		if (id < entry->id)
> > +			n = &(*n)->rb_left;
> > +		else if (id > entry->id)
> > +			n = &(*n)->rb_right;
> > +		else
> > +			goto out_unlock;
> > +	}
> > +
> > +	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
> > +	if (!new_entry) {
> > +		ret = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> > +
> > +	new_entry->id = id;
> > +	new_node = &new_entry->node;
> > +	rb_link_node(new_node, parent, n);
> > +	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
> > +	dquot->dq_off = 1;
> > +	/*
> > +	 * Make sure dquot is never released by a shrinker because we
> > +	 * rely on quota infrastructure to store mem_dqblk in dquot.
> > +	 */
> > +	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
> > +	set_bit(DQ_FAKE_B, &dquot->dq_flags);
> > +
> > +out_unlock:
> > +	up_write(&dqopt->dqio_sem);
> > +	return ret;
> > +}
> > +
> > +static int mem_write_dquot(struct dquot *dquot)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 0;
> > +}
> > +
> > +static int mem_release_dquot(struct dquot *dquot)
> > +{
> > +	/*
> > +	 * Everything is in memory only, release once we're done with
> > +	 * quota via mem_free_file_info().
> > +	 */
> > +	return 0;
> > +}
> > +
> > +static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
> > +{
> > +	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
> > +	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
> > +	qid_t id = from_kqid(&init_user_ns, *qid);
> > +	struct quota_info *dqopt = sb_dqopt(sb);
> > +	struct quota_id *entry = NULL;
> > +	int ret = 0;
> > +
> > +	down_read(&dqopt->dqio_sem);
> > +	while (node) {
> > +		entry = rb_entry(node, struct quota_id, node);
> > +
> > +		if (id < entry->id)
> > +			node = node->rb_left;
> > +		else if (id > entry->id)
> > +			node = node->rb_right;
> > +		else
> > +			goto got_next_id;
> > +	}
> > +
> > +	if (!entry) {
> > +		ret = -ENOENT;
> > +		goto out_unlock;
> > +	}
> > +
> > +	if (id > entry->id) {
> > +		node = rb_next(&entry->node);
> > +		if (!node) {
> > +			ret = -ENOENT;
> > +			goto out_unlock;
> > +		}
> > +		entry = rb_entry(node, struct quota_id, node);
> > +	}
> > +
> > +got_next_id:
> > +	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
> > +out_unlock:
> > +	up_read(&dqopt->dqio_sem);
> > +	return ret;
> > +}
> > +
> > +static const struct quota_format_ops mem_format_ops = {
> > +	.check_quota_file	= mem_check_quota_file,
> > +	.read_file_info		= mem_read_file_info,
> > +	.write_file_info	= mem_write_file_info,
> > +	.free_file_info		= mem_free_file_info,
> > +	.read_dqblk		= mem_read_dquot,
> > +	.commit_dqblk		= mem_write_dquot,
> > +	.release_dqblk		= mem_release_dquot,
> > +	.get_next_id		= mem_get_next_id,
> > +};
> > +
> > +static struct quota_format_type mem_quota_format = {
> > +	.qf_fmt_id	= QFMT_MEM_ONLY,
> > +	.qf_ops		= &mem_format_ops,
> > +	.qf_owner	= THIS_MODULE
> > +};
> > +
> > +static int __init init_mem_quota_format(void)
> > +{
> > +	return register_quota_format(&mem_quota_format);
> > +}
> > +
> > +static void __exit exit_mem_quota_format(void)
> > +{
> > +	unregister_quota_format(&mem_quota_format);
> > +}
> > +
> > +module_init(init_mem_quota_format);
> > +module_exit(exit_mem_quota_format);
> > diff --git a/include/linux/quota.h b/include/linux/quota.h
> > index fd692b4a41d5..4398e05c8b72 100644
> > --- a/include/linux/quota.h
> > +++ b/include/linux/quota.h
> > @@ -285,7 +285,11 @@ static inline void dqstats_dec(unsigned int type)
> >  #define DQ_FAKE_B	3	/* no limits only usage */
> >  #define DQ_READ_B	4	/* dquot was read into memory */
> >  #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
> > -#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
> > +#define DQ_NO_SHRINK_B	6	/* modified dquot (not DQ_FAKE_B) is never to
> > +				 * be released by a shrinker. It should remain
> > +				 * in memory until quotas are being disabled on
> > +				 * unmount. */
> > +#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
> >  				 * for the mask of entries set via SETQUOTA\
> >  				 * quotactl. They are set under dq_data_lock\
> >  				 * and the quota format handling dquot can\
> > @@ -536,6 +540,7 @@ struct quota_module_name {
> >  	{QFMT_VFS_OLD, "quota_v1"},\
> >  	{QFMT_VFS_V0, "quota_v2"},\
> >  	{QFMT_VFS_V1, "quota_v2"},\
> > +	{QFMT_MEM_ONLY, "quota_mem"},\
> >  	{0, NULL}}
> >  
> >  #endif /* _QUOTA_ */
> > diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
> > index f17c9636a859..ee9d2bad00c7 100644
> > --- a/include/uapi/linux/quota.h
> > +++ b/include/uapi/linux/quota.h
> > @@ -77,6 +77,7 @@
> >  #define	QFMT_VFS_V0 2
> >  #define QFMT_OCFS2 3
> >  #define	QFMT_VFS_V1 4
> > +#define	QFMT_MEM_ONLY 5
> >  
> >  /* Size of block in which space limits are passed through the quota
> >   * interface */
> > -- 
> > 2.38.1
> > 
>
Christoph Hellwig Nov. 22, 2022, 12:59 p.m. UTC | #3
On Mon, Nov 21, 2022 at 09:48:18AM -0800, Darrick J. Wong wrote:
> Would it be wise to "persist" dquot contents to a (private) tmpfs file
> to facilitate incore dquot reclaim?  The tmpfs file data can be paged
> out, or even punched if all the dquot records in that page go back to
> default settings.

That seems like a good idea for memory usage, but I think this might
also make the code much simpler, as that just requires fairly trivial
quota_read and quota_write methods in the shmem code instead of new
support for an in-memory quota file.
Lukas Czerner Nov. 22, 2022, 2:21 p.m. UTC | #4
On Tue, Nov 22, 2022 at 04:59:11AM -0800, Christoph Hellwig wrote:
> On Mon, Nov 21, 2022 at 09:48:18AM -0800, Darrick J. Wong wrote:
> > Would it be wise to "persist" dquot contents to a (private) tmpfs file
> > to facilitate incore dquot reclaim?  The tmpfs file data can be paged
> > out, or even punched if all the dquot records in that page go back to
> > default settings.
> 
> That seems like a good idea for memory usage, but I think this might
> also make the code much simpler, as that just requires fairly trivial
> quota_read and quota_write methods in the shmem code instead of new
> support for an in-memory quota file.

You mean like the implementation in the v1 ?

-Lukas
Brian Foster Nov. 22, 2022, 3:23 p.m. UTC | #5
On Tue, Nov 22, 2022 at 10:04:48AM +0100, Lukas Czerner wrote:
> On Mon, Nov 21, 2022 at 09:48:18AM -0800, Darrick J. Wong wrote:
> > On Mon, Nov 21, 2022 at 03:28:52PM +0100, Lukas Czerner wrote:
> > > In memory quota format relies on quota infrastructure to store dquot
> > > information for us. While conventional quota formats for file systems
> > > with persistent storage can load quota information into dquot from the
> > > storage on-demand and hence quota dquot shrinker can free any dquot that
> > > is not currently being used, it must be avoided here. Otherwise we can
> > > lose valuable information, user provided limits, because there is no
> > > persistent storage to load the information from afterwards.
> > > 
> > > One information that in-memory quota format needs to keep track of is a
> > > sorted list of ids for each quota type. This is done by utilizing an rb
> > > tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> > > 
> > > This format can be used to support quota on file system without persistent
> > > storage such as tmpfs.
> > > 
> > > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > > ---
> > >  fs/quota/Kconfig           |   8 ++
> > >  fs/quota/Makefile          |   1 +
> > >  fs/quota/dquot.c           |   3 +
> > >  fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
> > >  include/linux/quota.h      |   7 +-
> > >  include/uapi/linux/quota.h |   1 +
> > >  6 files changed, 279 insertions(+), 1 deletion(-)
> > >  create mode 100644 fs/quota/quota_mem.c
> > > 
> > > diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
> > > index b59cd172b5f9..8ea9656ca37b 100644
> > > --- a/fs/quota/Kconfig
> > > +++ b/fs/quota/Kconfig
> > > @@ -67,6 +67,14 @@ config QFMT_V2
> > >  	  also supports 64-bit inode and block quota limits. If you need this
> > >  	  functionality say Y here.
> > >  
> > > +config QFMT_MEM
> > > +	tristate "Quota in-memory format support "
> > > +	depends on QUOTA
> > > +	help
> > > +	  This config option enables kernel support for in-memory quota
> > > +	  format support. Useful to support quota on file system without
> > > +	  permanent storage. If you need this functionality say Y here.
> > > +
> > >  config QUOTACTL
> > >  	bool
> > >  	default n
> > > diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> > > index 9160639daffa..935be3f7b731 100644
> > > --- a/fs/quota/Makefile
> > > +++ b/fs/quota/Makefile
> > > @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
> > >  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
> > >  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
> > >  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> > > +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> > > diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> > > index 0427b44bfee5..f1a7a03632a2 100644
> > > --- a/fs/quota/dquot.c
> > > +++ b/fs/quota/dquot.c
> > > @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> > >  	spin_lock(&dq_list_lock);
> > >  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
> > >  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> > > +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> > > +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> > > +			continue;
> > >  		remove_dquot_hash(dquot);
> > >  		remove_free_dquot(dquot);
> > >  		remove_inuse(dquot);
> > > diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> > > new file mode 100644
> > > index 000000000000..7d5e82122143
> > > --- /dev/null
> > > +++ b/fs/quota/quota_mem.c
> > > @@ -0,0 +1,260 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/*
> > > + * In memory quota format relies on quota infrastructure to store dquot
> > > + * information for us. While conventional quota formats for file systems
> > > + * with persistent storage can load quota information into dquot from the
> > > + * storage on-demand and hence quota dquot shrinker can free any dquot
> > > + * that is not currently being used, it must be avoided here. Otherwise we
> > > + * can lose valuable information, user provided limits, because there is
> > > + * no persistent storage to load the information from afterwards.
> > 
> > Hmm.  dquots can't /ever/ be reclaimed?  struct dquot is ~256 bytes, and
> > assuming 32-bit uids, the upper bound on dquot usage is 2^(32+8) bytes
> > == 1TB of memory usage?  Once allocated, you'd have to reboot the whole
> > machine to get that memory back?
> 
> Hi Darrick,
> 
> maybe there are some improvements to the documentation to be made. The
> dquots will be freed on unmount as it would normaly. Also only dquots
> containing actual user modified limits, so only dquots that are not
> DQ_FAKE_B are prevented to be reclaimed by a shrinker see the condition in
> dqcache_shrink_scan().
> 
> > 
> > Would it be wise to "persist" dquot contents to a (private) tmpfs file
> > to facilitate incore dquot reclaim?  The tmpfs file data can be paged
> > out, or even punched if all the dquot records in that page go back to
> > default settings.
> 
> The dquot will be flagged as DQ_FAKE_B once the limits are set to 0. But
> when I think about it this pose a problem with the default quota limits
> because that would change the limits to the defaults once the dquot is
> reclaimed and then allocated again. This can be solved by making a
> custom .set_dqblk().
> 

Hi Lukas,

I'm a little confused... does the above mean the dquot limit would have
to be explicitly set to 0 by the admin in order to be reclaimed, even
though that limit would be initialized to some non-zero value via the
mount option? If so, wouldn't we want the ability to reclaim a dquot
when the usage counts go down to zero (i.e., so the user/group/whatever
for the dquot no longer has any tmpfs inode/block footprint), assuming
the limit hasn't also been modified from the initial defaults?

Brian

> Other than this problem, does this address your concern about dquot
> reclaim?
> 
> Thanks!
> -Lukas
> 
> > 
> > --D
> > 
> > > + *
> > > + * One information that in-memory quota format needs to keep track of is
> > > + * a sorted list of ids for each quota type. This is done by utilizing
> > > + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> > > + * type.
> > > + *
> > > + * This format can be used to support quota on file system without persistent
> > > + * storage such as tmpfs.
> > > + */
> > > +#include <linux/errno.h>
> > > +#include <linux/fs.h>
> > > +#include <linux/mount.h>
> > > +#include <linux/kernel.h>
> > > +#include <linux/init.h>
> > > +#include <linux/module.h>
> > > +#include <linux/slab.h>
> > > +#include <linux/rbtree.h>
> > > +
> > > +#include <linux/quotaops.h>
> > > +#include <linux/quota.h>
> > > +
> > > +MODULE_AUTHOR("Lukas Czerner");
> > > +MODULE_DESCRIPTION("Quota in-memory format support");
> > > +MODULE_LICENSE("GPL");
> > > +
> > > +/*
> > > + * The following constants define the amount of time given a user
> > > + * before the soft limits are treated as hard limits (usually resulting
> > > + * in an allocation failure). The timer is started when the user crosses
> > > + * their soft limit, it is reset when they go below their soft limit.
> > > + */
> > > +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> > > +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> > > +
> > > +struct quota_id {
> > > +	struct rb_node	node;
> > > +	qid_t		id;
> > > +};
> > > +
> > > +static int mem_check_quota_file(struct super_block *sb, int type)
> > > +{
> > > +	/* There is no real quota file, nothing to do */
> > > +	return 1;
> > > +}
> > > +
> > > +/*
> > > + * There is no real quota file. Just allocate rb_root for quota ids and
> > > + * set limits
> > > + */
> > > +static int mem_read_file_info(struct super_block *sb, int type)
> > > +{
> > > +	struct quota_info *dqopt = sb_dqopt(sb);
> > > +	struct mem_dqinfo *info = &dqopt->info[type];
> > > +	int ret = 0;
> > > +
> > > +	down_read(&dqopt->dqio_sem);
> > > +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> > > +		ret = -EINVAL;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> > > +	if (!info->dqi_priv) {
> > > +		ret = -ENOMEM;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	/*
> > > +	 * Used space is stored as unsigned 64-bit value in bytes but
> > > +	 * quota core supports only signed 64-bit values so use that
> > > +	 * as a limit
> > > +	 */
> > > +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> > > +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> > > +
> > > +	info->dqi_bgrace = MAX_DQ_TIME;
> > > +	info->dqi_igrace = MAX_IQ_TIME;
> > > +	info->dqi_flags = 0;
> > > +
> > > +out_unlock:
> > > +	up_read(&dqopt->dqio_sem);
> > > +	return ret;
> > > +}
> > > +
> > > +static int mem_write_file_info(struct super_block *sb, int type)
> > > +{
> > > +	/* There is no real quota file, nothing to do */
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Free all the quota_id entries in the rb tree and rb_root.
> > > + */
> > > +static int mem_free_file_info(struct super_block *sb, int type)
> > > +{
> > > +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> > > +	struct rb_root *root = info->dqi_priv;
> > > +	struct quota_id *entry;
> > > +	struct rb_node *node;
> > > +
> > > +	info->dqi_priv = NULL;
> > > +	node = rb_first(root);
> > > +	while (node) {
> > > +		entry = rb_entry(node, struct quota_id, node);
> > > +		node = rb_next(&entry->node);
> > > +
> > > +		rb_erase(&entry->node, root);
> > > +		kfree(entry);
> > > +	}
> > > +
> > > +	kfree(root);
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * There is no real quota file, nothing to read. Just insert the id in
> > > + * the rb tree.
> > > + */
> > > +static int mem_read_dquot(struct dquot *dquot)
> > > +{
> > > +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> > > +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> > > +	struct rb_node *parent = NULL, *new_node = NULL;
> > > +	struct quota_id *new_entry, *entry;
> > > +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> > > +	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
> > > +	int ret = 0;
> > > +
> > > +	down_write(&dqopt->dqio_sem);
> > > +
> > > +	while (*n) {
> > > +		parent = *n;
> > > +		entry = rb_entry(parent, struct quota_id, node);
> > > +
> > > +		if (id < entry->id)
> > > +			n = &(*n)->rb_left;
> > > +		else if (id > entry->id)
> > > +			n = &(*n)->rb_right;
> > > +		else
> > > +			goto out_unlock;
> > > +	}
> > > +
> > > +	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
> > > +	if (!new_entry) {
> > > +		ret = -ENOMEM;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	new_entry->id = id;
> > > +	new_node = &new_entry->node;
> > > +	rb_link_node(new_node, parent, n);
> > > +	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
> > > +	dquot->dq_off = 1;
> > > +	/*
> > > +	 * Make sure dquot is never released by a shrinker because we
> > > +	 * rely on quota infrastructure to store mem_dqblk in dquot.
> > > +	 */
> > > +	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
> > > +	set_bit(DQ_FAKE_B, &dquot->dq_flags);
> > > +
> > > +out_unlock:
> > > +	up_write(&dqopt->dqio_sem);
> > > +	return ret;
> > > +}
> > > +
> > > +static int mem_write_dquot(struct dquot *dquot)
> > > +{
> > > +	/* There is no real quota file, nothing to do */
> > > +	return 0;
> > > +}
> > > +
> > > +static int mem_release_dquot(struct dquot *dquot)
> > > +{
> > > +	/*
> > > +	 * Everything is in memory only, release once we're done with
> > > +	 * quota via mem_free_file_info().
> > > +	 */
> > > +	return 0;
> > > +}
> > > +
> > > +static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
> > > +{
> > > +	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
> > > +	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
> > > +	qid_t id = from_kqid(&init_user_ns, *qid);
> > > +	struct quota_info *dqopt = sb_dqopt(sb);
> > > +	struct quota_id *entry = NULL;
> > > +	int ret = 0;
> > > +
> > > +	down_read(&dqopt->dqio_sem);
> > > +	while (node) {
> > > +		entry = rb_entry(node, struct quota_id, node);
> > > +
> > > +		if (id < entry->id)
> > > +			node = node->rb_left;
> > > +		else if (id > entry->id)
> > > +			node = node->rb_right;
> > > +		else
> > > +			goto got_next_id;
> > > +	}
> > > +
> > > +	if (!entry) {
> > > +		ret = -ENOENT;
> > > +		goto out_unlock;
> > > +	}
> > > +
> > > +	if (id > entry->id) {
> > > +		node = rb_next(&entry->node);
> > > +		if (!node) {
> > > +			ret = -ENOENT;
> > > +			goto out_unlock;
> > > +		}
> > > +		entry = rb_entry(node, struct quota_id, node);
> > > +	}
> > > +
> > > +got_next_id:
> > > +	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
> > > +out_unlock:
> > > +	up_read(&dqopt->dqio_sem);
> > > +	return ret;
> > > +}
> > > +
> > > +static const struct quota_format_ops mem_format_ops = {
> > > +	.check_quota_file	= mem_check_quota_file,
> > > +	.read_file_info		= mem_read_file_info,
> > > +	.write_file_info	= mem_write_file_info,
> > > +	.free_file_info		= mem_free_file_info,
> > > +	.read_dqblk		= mem_read_dquot,
> > > +	.commit_dqblk		= mem_write_dquot,
> > > +	.release_dqblk		= mem_release_dquot,
> > > +	.get_next_id		= mem_get_next_id,
> > > +};
> > > +
> > > +static struct quota_format_type mem_quota_format = {
> > > +	.qf_fmt_id	= QFMT_MEM_ONLY,
> > > +	.qf_ops		= &mem_format_ops,
> > > +	.qf_owner	= THIS_MODULE
> > > +};
> > > +
> > > +static int __init init_mem_quota_format(void)
> > > +{
> > > +	return register_quota_format(&mem_quota_format);
> > > +}
> > > +
> > > +static void __exit exit_mem_quota_format(void)
> > > +{
> > > +	unregister_quota_format(&mem_quota_format);
> > > +}
> > > +
> > > +module_init(init_mem_quota_format);
> > > +module_exit(exit_mem_quota_format);
> > > diff --git a/include/linux/quota.h b/include/linux/quota.h
> > > index fd692b4a41d5..4398e05c8b72 100644
> > > --- a/include/linux/quota.h
> > > +++ b/include/linux/quota.h
> > > @@ -285,7 +285,11 @@ static inline void dqstats_dec(unsigned int type)
> > >  #define DQ_FAKE_B	3	/* no limits only usage */
> > >  #define DQ_READ_B	4	/* dquot was read into memory */
> > >  #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
> > > -#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
> > > +#define DQ_NO_SHRINK_B	6	/* modified dquot (not DQ_FAKE_B) is never to
> > > +				 * be released by a shrinker. It should remain
> > > +				 * in memory until quotas are being disabled on
> > > +				 * unmount. */
> > > +#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
> > >  				 * for the mask of entries set via SETQUOTA\
> > >  				 * quotactl. They are set under dq_data_lock\
> > >  				 * and the quota format handling dquot can\
> > > @@ -536,6 +540,7 @@ struct quota_module_name {
> > >  	{QFMT_VFS_OLD, "quota_v1"},\
> > >  	{QFMT_VFS_V0, "quota_v2"},\
> > >  	{QFMT_VFS_V1, "quota_v2"},\
> > > +	{QFMT_MEM_ONLY, "quota_mem"},\
> > >  	{0, NULL}}
> > >  
> > >  #endif /* _QUOTA_ */
> > > diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
> > > index f17c9636a859..ee9d2bad00c7 100644
> > > --- a/include/uapi/linux/quota.h
> > > +++ b/include/uapi/linux/quota.h
> > > @@ -77,6 +77,7 @@
> > >  #define	QFMT_VFS_V0 2
> > >  #define QFMT_OCFS2 3
> > >  #define	QFMT_VFS_V1 4
> > > +#define	QFMT_MEM_ONLY 5
> > >  
> > >  /* Size of block in which space limits are passed through the quota
> > >   * interface */
> > > -- 
> > > 2.38.1
> > > 
> > 
> 
>
Christoph Hellwig Nov. 23, 2022, 7:58 a.m. UTC | #6
On Tue, Nov 22, 2022 at 03:21:17PM +0100, Lukas Czerner wrote:
> > That seems like a good idea for memory usage, but I think this might
> > also make the code much simpler, as that just requires fairly trivial
> > quota_read and quota_write methods in the shmem code instead of new
> > support for an in-memory quota file.
> 
> You mean like the implementation in the v1 ?

Having now found it: yes.
Lukas Czerner Nov. 23, 2022, 8:36 a.m. UTC | #7
On Tue, Nov 22, 2022 at 11:58:33PM -0800, Christoph Hellwig wrote:
> On Tue, Nov 22, 2022 at 03:21:17PM +0100, Lukas Czerner wrote:
> > > That seems like a good idea for memory usage, but I think this might
> > > also make the code much simpler, as that just requires fairly trivial
> > > quota_read and quota_write methods in the shmem code instead of new
> > > support for an in-memory quota file.
> > 
> > You mean like the implementation in the v1 ?
> 
> Having now found it: yes.
> 

Jan,

do you have any argument for this, since it was your suggestion?

I also think that the implementation is much simpler with in-memory
dquots because we will avoid all the hassle with creating and
maintaining a quota file in a proper format. It's not just reads and
writes, it's the entire machinery behind it in quota_v2.c and quota_tree.c.

But it is true that even with only user-modified dquots being
non-reclaimable until unmount it could theoretically represent a
substantial memory consumption. Although I do wonder if this problem
is even real. How many user/group ids would you expect an extremely
heavy quota user to have the limits set for? 1k, 10k, a million, or
even more? Do you know?

-Lukas
Lukas Czerner Nov. 23, 2022, 9:52 a.m. UTC | #8
On Tue, Nov 22, 2022 at 10:23:57AM -0500, Brian Foster wrote:
> On Tue, Nov 22, 2022 at 10:04:48AM +0100, Lukas Czerner wrote:
> > On Mon, Nov 21, 2022 at 09:48:18AM -0800, Darrick J. Wong wrote:
> > > On Mon, Nov 21, 2022 at 03:28:52PM +0100, Lukas Czerner wrote:
> > > > In memory quota format relies on quota infrastructure to store dquot
> > > > information for us. While conventional quota formats for file systems
> > > > with persistent storage can load quota information into dquot from the
> > > > storage on-demand and hence quota dquot shrinker can free any dquot that
> > > > is not currently being used, it must be avoided here. Otherwise we can
> > > > lose valuable information, user provided limits, because there is no
> > > > persistent storage to load the information from afterwards.
> > > > 
> > > > One information that in-memory quota format needs to keep track of is a
> > > > sorted list of ids for each quota type. This is done by utilizing an rb
> > > > tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> > > > 
> > > > This format can be used to support quota on file system without persistent
> > > > storage such as tmpfs.
> > > > 
> > > > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > > > ---
> > > >  fs/quota/Kconfig           |   8 ++
> > > >  fs/quota/Makefile          |   1 +
> > > >  fs/quota/dquot.c           |   3 +
> > > >  fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
> > > >  include/linux/quota.h      |   7 +-
> > > >  include/uapi/linux/quota.h |   1 +
> > > >  6 files changed, 279 insertions(+), 1 deletion(-)
> > > >  create mode 100644 fs/quota/quota_mem.c
> > > > 
> > > > diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
> > > > index b59cd172b5f9..8ea9656ca37b 100644
> > > > --- a/fs/quota/Kconfig
> > > > +++ b/fs/quota/Kconfig
> > > > @@ -67,6 +67,14 @@ config QFMT_V2
> > > >  	  also supports 64-bit inode and block quota limits. If you need this
> > > >  	  functionality say Y here.
> > > >  
> > > > +config QFMT_MEM
> > > > +	tristate "Quota in-memory format support "
> > > > +	depends on QUOTA
> > > > +	help
> > > > +	  This config option enables kernel support for in-memory quota
> > > > +	  format support. Useful to support quota on file system without
> > > > +	  permanent storage. If you need this functionality say Y here.
> > > > +
> > > >  config QUOTACTL
> > > >  	bool
> > > >  	default n
> > > > diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> > > > index 9160639daffa..935be3f7b731 100644
> > > > --- a/fs/quota/Makefile
> > > > +++ b/fs/quota/Makefile
> > > > @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
> > > >  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
> > > >  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
> > > >  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> > > > +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> > > > diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> > > > index 0427b44bfee5..f1a7a03632a2 100644
> > > > --- a/fs/quota/dquot.c
> > > > +++ b/fs/quota/dquot.c
> > > > @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> > > >  	spin_lock(&dq_list_lock);
> > > >  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
> > > >  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> > > > +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> > > > +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> > > > +			continue;
> > > >  		remove_dquot_hash(dquot);
> > > >  		remove_free_dquot(dquot);
> > > >  		remove_inuse(dquot);
> > > > diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> > > > new file mode 100644
> > > > index 000000000000..7d5e82122143
> > > > --- /dev/null
> > > > +++ b/fs/quota/quota_mem.c
> > > > @@ -0,0 +1,260 @@
> > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > +/*
> > > > + * In memory quota format relies on quota infrastructure to store dquot
> > > > + * information for us. While conventional quota formats for file systems
> > > > + * with persistent storage can load quota information into dquot from the
> > > > + * storage on-demand and hence quota dquot shrinker can free any dquot
> > > > + * that is not currently being used, it must be avoided here. Otherwise we
> > > > + * can lose valuable information, user provided limits, because there is
> > > > + * no persistent storage to load the information from afterwards.
> > > 
> > > Hmm.  dquots can't /ever/ be reclaimed?  struct dquot is ~256 bytes, and
> > > assuming 32-bit uids, the upper bound on dquot usage is 2^(32+8) bytes
> > > == 1TB of memory usage?  Once allocated, you'd have to reboot the whole
> > > machine to get that memory back?
> > 
> > Hi Darrick,
> > 
> > maybe there are some improvements to the documentation to be made. The
> > dquots will be freed on unmount as it would normaly. Also only dquots
> > containing actual user modified limits, so only dquots that are not
> > DQ_FAKE_B are prevented to be reclaimed by a shrinker see the condition in
> > dqcache_shrink_scan().
> > 
> > > 
> > > Would it be wise to "persist" dquot contents to a (private) tmpfs file
> > > to facilitate incore dquot reclaim?  The tmpfs file data can be paged
> > > out, or even punched if all the dquot records in that page go back to
> > > default settings.
> > 
> > The dquot will be flagged as DQ_FAKE_B once the limits are set to 0. But
> > when I think about it this pose a problem with the default quota limits
> > because that would change the limits to the defaults once the dquot is
> > reclaimed and then allocated again. This can be solved by making a
> > custom .set_dqblk().
> > 
> 
> Hi Lukas,
> 
> I'm a little confused.. does the above mean the dquot limit would have
> to be explicitly set to 0 by the admin in order to be reclaimed, even
> though that limit would be initialized to some non-zero value via the
> mount option? If so, wouldn't we want the ability to reclaim a dquot
> when the usage counts go down to zero (i.e., so the user/group/whatever
> for the dquot no longer has any tmpfs inode/block footprint), assuming
> the limit hasn't also been modified from the initial defaults?

By creating a custom ->set_dqblk() in shmem we can make sure that the
dquot is non-reclaimable (set DQ_NO_SHRINK_B) *only* if the limits have
been set by the user to anything other than the defaults (defaults being
either 0, or the value specified by the mount option). Also
DQ_NO_SHRINK_B can't be set on ->dqblk_read() and the condition in
dqcache_shrink_scan() would only test DQ_NO_SHRINK_B. Does it make more
sense to you?

This is something I'd have to do for v3. Sorry for the confusion.

Thanks!
-Lukas

> 
> Brian
> 
> > Other than this problem, does this address your concern about dquot
> > reclaim?
> > 
> > Thanks!
> > -Lukas
> > 
> > > 
> > > --D
> > > 
> > > > + *
> > > > + * One information that in-memory quota format needs to keep track of is
> > > > + * a sorted list of ids for each quota type. This is done by utilizing
> > > > + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> > > > + * type.
> > > > + *
> > > > + * This format can be used to support quota on file system without persistent
> > > > + * storage such as tmpfs.
> > > > + */
> > > > +#include <linux/errno.h>
> > > > +#include <linux/fs.h>
> > > > +#include <linux/mount.h>
> > > > +#include <linux/kernel.h>
> > > > +#include <linux/init.h>
> > > > +#include <linux/module.h>
> > > > +#include <linux/slab.h>
> > > > +#include <linux/rbtree.h>
> > > > +
> > > > +#include <linux/quotaops.h>
> > > > +#include <linux/quota.h>
> > > > +
> > > > +MODULE_AUTHOR("Lukas Czerner");
> > > > +MODULE_DESCRIPTION("Quota in-memory format support");
> > > > +MODULE_LICENSE("GPL");
> > > > +
> > > > +/*
> > > > + * The following constants define the amount of time given a user
> > > > + * before the soft limits are treated as hard limits (usually resulting
> > > > + * in an allocation failure). The timer is started when the user crosses
> > > > + * their soft limit, it is reset when they go below their soft limit.
> > > > + */
> > > > +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> > > > +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> > > > +
> > > > +struct quota_id {
> > > > +	struct rb_node	node;
> > > > +	qid_t		id;
> > > > +};
> > > > +
> > > > +static int mem_check_quota_file(struct super_block *sb, int type)
> > > > +{
> > > > +	/* There is no real quota file, nothing to do */
> > > > +	return 1;
> > > > +}
> > > > +
> > > > +/*
> > > > + * There is no real quota file. Just allocate rb_root for quota ids and
> > > > + * set limits
> > > > + */
> > > > +static int mem_read_file_info(struct super_block *sb, int type)
> > > > +{
> > > > +	struct quota_info *dqopt = sb_dqopt(sb);
> > > > +	struct mem_dqinfo *info = &dqopt->info[type];
> > > > +	int ret = 0;
> > > > +
> > > > +	down_read(&dqopt->dqio_sem);
> > > > +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> > > > +		ret = -EINVAL;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> > > > +	if (!info->dqi_priv) {
> > > > +		ret = -ENOMEM;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * Used space is stored as unsigned 64-bit value in bytes but
> > > > +	 * quota core supports only signed 64-bit values so use that
> > > > +	 * as a limit
> > > > +	 */
> > > > +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> > > > +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> > > > +
> > > > +	info->dqi_bgrace = MAX_DQ_TIME;
> > > > +	info->dqi_igrace = MAX_IQ_TIME;
> > > > +	info->dqi_flags = 0;
> > > > +
> > > > +out_unlock:
> > > > +	up_read(&dqopt->dqio_sem);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int mem_write_file_info(struct super_block *sb, int type)
> > > > +{
> > > > +	/* There is no real quota file, nothing to do */
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Free all the quota_id entries in the rb tree and rb_root.
> > > > + */
> > > > +static int mem_free_file_info(struct super_block *sb, int type)
> > > > +{
> > > > +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> > > > +	struct rb_root *root = info->dqi_priv;
> > > > +	struct quota_id *entry;
> > > > +	struct rb_node *node;
> > > > +
> > > > +	info->dqi_priv = NULL;
> > > > +	node = rb_first(root);
> > > > +	while (node) {
> > > > +		entry = rb_entry(node, struct quota_id, node);
> > > > +		node = rb_next(&entry->node);
> > > > +
> > > > +		rb_erase(&entry->node, root);
> > > > +		kfree(entry);
> > > > +	}
> > > > +
> > > > +	kfree(root);
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * There is no real quota file, nothing to read. Just insert the id in
> > > > + * the rb tree.
> > > > + */
> > > > +static int mem_read_dquot(struct dquot *dquot)
> > > > +{
> > > > +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> > > > +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> > > > +	struct rb_node *parent = NULL, *new_node = NULL;
> > > > +	struct quota_id *new_entry, *entry;
> > > > +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> > > > +	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
> > > > +	int ret = 0;
> > > > +
> > > > +	down_write(&dqopt->dqio_sem);
> > > > +
> > > > +	while (*n) {
> > > > +		parent = *n;
> > > > +		entry = rb_entry(parent, struct quota_id, node);
> > > > +
> > > > +		if (id < entry->id)
> > > > +			n = &(*n)->rb_left;
> > > > +		else if (id > entry->id)
> > > > +			n = &(*n)->rb_right;
> > > > +		else
> > > > +			goto out_unlock;
> > > > +	}
> > > > +
> > > > +	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
> > > > +	if (!new_entry) {
> > > > +		ret = -ENOMEM;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	new_entry->id = id;
> > > > +	new_node = &new_entry->node;
> > > > +	rb_link_node(new_node, parent, n);
> > > > +	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
> > > > +	dquot->dq_off = 1;
> > > > +	/*
> > > > +	 * Make sure dquot is never released by a shrinker because we
> > > > +	 * rely on quota infrastructure to store mem_dqblk in dquot.
> > > > +	 */
> > > > +	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
> > > > +	set_bit(DQ_FAKE_B, &dquot->dq_flags);
> > > > +
> > > > +out_unlock:
> > > > +	up_write(&dqopt->dqio_sem);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int mem_write_dquot(struct dquot *dquot)
> > > > +{
> > > > +	/* There is no real quota file, nothing to do */
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int mem_release_dquot(struct dquot *dquot)
> > > > +{
> > > > +	/*
> > > > +	 * Everything is in memory only, release once we're done with
> > > > +	 * quota via mem_free_file_info().
> > > > +	 */
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
> > > > +{
> > > > +	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
> > > > +	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
> > > > +	qid_t id = from_kqid(&init_user_ns, *qid);
> > > > +	struct quota_info *dqopt = sb_dqopt(sb);
> > > > +	struct quota_id *entry = NULL;
> > > > +	int ret = 0;
> > > > +
> > > > +	down_read(&dqopt->dqio_sem);
> > > > +	while (node) {
> > > > +		entry = rb_entry(node, struct quota_id, node);
> > > > +
> > > > +		if (id < entry->id)
> > > > +			node = node->rb_left;
> > > > +		else if (id > entry->id)
> > > > +			node = node->rb_right;
> > > > +		else
> > > > +			goto got_next_id;
> > > > +	}
> > > > +
> > > > +	if (!entry) {
> > > > +		ret = -ENOENT;
> > > > +		goto out_unlock;
> > > > +	}
> > > > +
> > > > +	if (id > entry->id) {
> > > > +		node = rb_next(&entry->node);
> > > > +		if (!node) {
> > > > +			ret = -ENOENT;
> > > > +			goto out_unlock;
> > > > +		}
> > > > +		entry = rb_entry(node, struct quota_id, node);
> > > > +	}
> > > > +
> > > > +got_next_id:
> > > > +	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
> > > > +out_unlock:
> > > > +	up_read(&dqopt->dqio_sem);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static const struct quota_format_ops mem_format_ops = {
> > > > +	.check_quota_file	= mem_check_quota_file,
> > > > +	.read_file_info		= mem_read_file_info,
> > > > +	.write_file_info	= mem_write_file_info,
> > > > +	.free_file_info		= mem_free_file_info,
> > > > +	.read_dqblk		= mem_read_dquot,
> > > > +	.commit_dqblk		= mem_write_dquot,
> > > > +	.release_dqblk		= mem_release_dquot,
> > > > +	.get_next_id		= mem_get_next_id,
> > > > +};
> > > > +
> > > > +static struct quota_format_type mem_quota_format = {
> > > > +	.qf_fmt_id	= QFMT_MEM_ONLY,
> > > > +	.qf_ops		= &mem_format_ops,
> > > > +	.qf_owner	= THIS_MODULE
> > > > +};
> > > > +
> > > > +static int __init init_mem_quota_format(void)
> > > > +{
> > > > +	return register_quota_format(&mem_quota_format);
> > > > +}
> > > > +
> > > > +static void __exit exit_mem_quota_format(void)
> > > > +{
> > > > +	unregister_quota_format(&mem_quota_format);
> > > > +}
> > > > +
> > > > +module_init(init_mem_quota_format);
> > > > +module_exit(exit_mem_quota_format);
> > > > diff --git a/include/linux/quota.h b/include/linux/quota.h
> > > > index fd692b4a41d5..4398e05c8b72 100644
> > > > --- a/include/linux/quota.h
> > > > +++ b/include/linux/quota.h
> > > > @@ -285,7 +285,11 @@ static inline void dqstats_dec(unsigned int type)
> > > >  #define DQ_FAKE_B	3	/* no limits only usage */
> > > >  #define DQ_READ_B	4	/* dquot was read into memory */
> > > >  #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
> > > > -#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
> > > > +#define DQ_NO_SHRINK_B	6	/* modified dquot (not DQ_FAKE_B) is never to
> > > > +				 * be released by a shrinker. It should remain
> > > > +				 * in memory until quotas are being disabled on
> > > > +				 * unmount. */
> > > > +#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
> > > >  				 * for the mask of entries set via SETQUOTA\
> > > >  				 * quotactl. They are set under dq_data_lock\
> > > >  				 * and the quota format handling dquot can\
> > > > @@ -536,6 +540,7 @@ struct quota_module_name {
> > > >  	{QFMT_VFS_OLD, "quota_v1"},\
> > > >  	{QFMT_VFS_V0, "quota_v2"},\
> > > >  	{QFMT_VFS_V1, "quota_v2"},\
> > > > +	{QFMT_MEM_ONLY, "quota_mem"},\
> > > >  	{0, NULL}}
> > > >  
> > > >  #endif /* _QUOTA_ */
> > > > diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
> > > > index f17c9636a859..ee9d2bad00c7 100644
> > > > --- a/include/uapi/linux/quota.h
> > > > +++ b/include/uapi/linux/quota.h
> > > > @@ -77,6 +77,7 @@
> > > >  #define	QFMT_VFS_V0 2
> > > >  #define QFMT_OCFS2 3
> > > >  #define	QFMT_VFS_V1 4
> > > > +#define	QFMT_MEM_ONLY 5
> > > >  
> > > >  /* Size of block in which space limits are passed through the quota
> > > >   * interface */
> > > > -- 
> > > > 2.38.1
> > > > 
> > > 
> > 
> > 
>
Brian Foster Nov. 23, 2022, 12:32 p.m. UTC | #9
On Wed, Nov 23, 2022 at 10:52:27AM +0100, Lukas Czerner wrote:
> On Tue, Nov 22, 2022 at 10:23:57AM -0500, Brian Foster wrote:
> > On Tue, Nov 22, 2022 at 10:04:48AM +0100, Lukas Czerner wrote:
> > > On Mon, Nov 21, 2022 at 09:48:18AM -0800, Darrick J. Wong wrote:
> > > > On Mon, Nov 21, 2022 at 03:28:52PM +0100, Lukas Czerner wrote:
> > > > > In memory quota format relies on quota infrastructure to store dquot
> > > > > information for us. While conventional quota formats for file systems
> > > > > with persistent storage can load quota information into dquot from the
> > > > > storage on-demand and hence quota dquot shrinker can free any dquot that
> > > > > is not currently being used, it must be avoided here. Otherwise we can
> > > > > lose valuable information, user provided limits, because there is no
> > > > > persistent storage to load the information from afterwards.
> > > > > 
> > > > > One information that in-memory quota format needs to keep track of is a
> > > > > sorted list of ids for each quota type. This is done by utilizing an rb
> > > > > tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> > > > > 
> > > > > This format can be used to support quota on file system without persistent
> > > > > storage such as tmpfs.
> > > > > 
> > > > > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > > > > ---
> > > > >  fs/quota/Kconfig           |   8 ++
> > > > >  fs/quota/Makefile          |   1 +
> > > > >  fs/quota/dquot.c           |   3 +
> > > > >  fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
> > > > >  include/linux/quota.h      |   7 +-
> > > > >  include/uapi/linux/quota.h |   1 +
> > > > >  6 files changed, 279 insertions(+), 1 deletion(-)
> > > > >  create mode 100644 fs/quota/quota_mem.c
> > > > > 
> > > > > diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
> > > > > index b59cd172b5f9..8ea9656ca37b 100644
> > > > > --- a/fs/quota/Kconfig
> > > > > +++ b/fs/quota/Kconfig
> > > > > @@ -67,6 +67,14 @@ config QFMT_V2
> > > > >  	  also supports 64-bit inode and block quota limits. If you need this
> > > > >  	  functionality say Y here.
> > > > >  
> > > > > +config QFMT_MEM
> > > > > +	tristate "Quota in-memory format support "
> > > > > +	depends on QUOTA
> > > > > +	help
> > > > > +	  This config option enables kernel support for in-memory quota
> > > > > +	  format support. Useful to support quota on file system without
> > > > > +	  permanent storage. If you need this functionality say Y here.
> > > > > +
> > > > >  config QUOTACTL
> > > > >  	bool
> > > > >  	default n
> > > > > diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> > > > > index 9160639daffa..935be3f7b731 100644
> > > > > --- a/fs/quota/Makefile
> > > > > +++ b/fs/quota/Makefile
> > > > > @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
> > > > >  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
> > > > >  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
> > > > >  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> > > > > +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> > > > > diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> > > > > index 0427b44bfee5..f1a7a03632a2 100644
> > > > > --- a/fs/quota/dquot.c
> > > > > +++ b/fs/quota/dquot.c
> > > > > @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> > > > >  	spin_lock(&dq_list_lock);
> > > > >  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
> > > > >  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> > > > > +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> > > > > +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> > > > > +			continue;
> > > > >  		remove_dquot_hash(dquot);
> > > > >  		remove_free_dquot(dquot);
> > > > >  		remove_inuse(dquot);
> > > > > diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> > > > > new file mode 100644
> > > > > index 000000000000..7d5e82122143
> > > > > --- /dev/null
> > > > > +++ b/fs/quota/quota_mem.c
> > > > > @@ -0,0 +1,260 @@
> > > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > > +/*
> > > > > + * In memory quota format relies on quota infrastructure to store dquot
> > > > > + * information for us. While conventional quota formats for file systems
> > > > > + * with persistent storage can load quota information into dquot from the
> > > > > + * storage on-demand and hence quota dquot shrinker can free any dquot
> > > > > + * that is not currently being used, it must be avoided here. Otherwise we
> > > > > + * can lose valuable information, user provided limits, because there is
> > > > > + * no persistent storage to load the information from afterwards.
> > > > 
> > > > Hmm.  dquots can't /ever/ be reclaimed?  struct dquot is ~256 bytes, and
> > > > assuming 32-bit uids, the upper bound on dquot usage is 2^(32+8) bytes
> > > > == 1TB of memory usage?  Once allocated, you'd have to reboot the whole
> > > > machine to get that memory back?
> > > 
> > > Hi Darrick,
> > > 
> > > maybe there are some improvements to the documentation to be made. The
> > > dquots will be freed on unmount as it would normaly. Also only dquots
> > > containing actual user modified limits, so only dquots that are not
> > > DQ_FAKE_B are prevented to be reclaimed by a shrinker see the condition in
> > > dqcache_shrink_scan().
> > > 
> > > > 
> > > > Would it be wise to "persist" dquot contents to a (private) tmpfs file
> > > > to facilitate incore dquot reclaim?  The tmpfs file data can be paged
> > > > out, or even punched if all the dquot records in that page go back to
> > > > default settings.
> > > 
> > > The dquot will be flagged as DQ_FAKE_B once the limits are set to 0. But
> > > when I think about it this pose a problem with the default quota limits
> > > because that would change the limits to the defaults once the dquot is
> > > reclaimed and then allocated again. This can be solved by making a
> > > custom .set_dqblk().
> > > 
> > 
> > Hi Lukas,
> > 
> > I'm a little confused.. does the above mean the dquot limit would have
> > to be explicitly set to 0 by the admin in order to be reclaimed, even
> > though that limit would be initialized to some non-zero value via the
> > mount option? If so, wouldn't we want the ability to reclaim a dquot
> > when the usage counts go down to zero (i.e., so the user/group/whatever
> > for the dquot no longer has any tmpfs inode/block footprint), assuming
> > the limit hasn't also been modified from the initial defaults?
> 
> By creating a custom ->set_dqblk() in shmem we can make sure that the
> dquot is non-reclaimable (set DQ_NO_SHRINK_B) *only* if the limits have
> been set by the user to anything other than the defaults (defaults being
> either 0, or value specified by the mount option). Also
> DQ_NO_SHRINK_B can't be set on ->dqblk_read() and the condition in
> dqcache_shrink_scan() would only test DQ_NO_SHRINK_B. Does it make more
> sense to you?
> 

Ok. Yes, I think I get the general idea. Thanks for the info.

Brian

> This is something I'd have to do for v3. Sorry for the confusion.
> 
> Thanks!
> -Lukas
> 
> > 
> > Brian
> > 
> > > Other than this problem, does this address your concern about dquot
> > > reclaim?
> > > 
> > > Thanks!
> > > -Lukas
> > > 
> > > > 
> > > > --D
> > > > 
> > > > > + *
> > > > > + * One information that in-memory quota format needs to keep track of is
> > > > > + * a sorted list of ids for each quota type. This is done by utilizing
> > > > > + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> > > > > + * type.
> > > > > + *
> > > > > + * This format can be used to support quota on file system without persistent
> > > > > + * storage such as tmpfs.
> > > > > + */
> > > > > +#include <linux/errno.h>
> > > > > +#include <linux/fs.h>
> > > > > +#include <linux/mount.h>
> > > > > +#include <linux/kernel.h>
> > > > > +#include <linux/init.h>
> > > > > +#include <linux/module.h>
> > > > > +#include <linux/slab.h>
> > > > > +#include <linux/rbtree.h>
> > > > > +
> > > > > +#include <linux/quotaops.h>
> > > > > +#include <linux/quota.h>
> > > > > +
> > > > > +MODULE_AUTHOR("Lukas Czerner");
> > > > > +MODULE_DESCRIPTION("Quota in-memory format support");
> > > > > +MODULE_LICENSE("GPL");
> > > > > +
> > > > > +/*
> > > > > + * The following constants define the amount of time given a user
> > > > > + * before the soft limits are treated as hard limits (usually resulting
> > > > > + * in an allocation failure). The timer is started when the user crosses
> > > > > + * their soft limit, it is reset when they go below their soft limit.
> > > > > + */
> > > > > +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> > > > > +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> > > > > +
> > > > > +struct quota_id {
> > > > > +	struct rb_node	node;
> > > > > +	qid_t		id;
> > > > > +};
> > > > > +
> > > > > +static int mem_check_quota_file(struct super_block *sb, int type)
> > > > > +{
> > > > > +	/* There is no real quota file, nothing to do */
> > > > > +	return 1;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * There is no real quota file. Just allocate rb_root for quota ids and
> > > > > + * set limits
> > > > > + */
> > > > > +static int mem_read_file_info(struct super_block *sb, int type)
> > > > > +{
> > > > > +	struct quota_info *dqopt = sb_dqopt(sb);
> > > > > +	struct mem_dqinfo *info = &dqopt->info[type];
> > > > > +	int ret = 0;
> > > > > +
> > > > > +	down_read(&dqopt->dqio_sem);
> > > > > +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> > > > > +		ret = -EINVAL;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +
> > > > > +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> > > > > +	if (!info->dqi_priv) {
> > > > > +		ret = -ENOMEM;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +
> > > > > +	/*
> > > > > +	 * Used space is stored as unsigned 64-bit value in bytes but
> > > > > +	 * quota core supports only signed 64-bit values so use that
> > > > > +	 * as a limit
> > > > > +	 */
> > > > > +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> > > > > +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> > > > > +
> > > > > +	info->dqi_bgrace = MAX_DQ_TIME;
> > > > > +	info->dqi_igrace = MAX_IQ_TIME;
> > > > > +	info->dqi_flags = 0;
> > > > > +
> > > > > +out_unlock:
> > > > > +	up_read(&dqopt->dqio_sem);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int mem_write_file_info(struct super_block *sb, int type)
> > > > > +{
> > > > > +	/* There is no real quota file, nothing to do */
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Free all the quota_id entries in the rb tree and rb_root.
> > > > > + */
> > > > > +static int mem_free_file_info(struct super_block *sb, int type)
> > > > > +{
> > > > > +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> > > > > +	struct rb_root *root = info->dqi_priv;
> > > > > +	struct quota_id *entry;
> > > > > +	struct rb_node *node;
> > > > > +
> > > > > +	info->dqi_priv = NULL;
> > > > > +	node = rb_first(root);
> > > > > +	while (node) {
> > > > > +		entry = rb_entry(node, struct quota_id, node);
> > > > > +		node = rb_next(&entry->node);
> > > > > +
> > > > > +		rb_erase(&entry->node, root);
> > > > > +		kfree(entry);
> > > > > +	}
> > > > > +
> > > > > +	kfree(root);
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * There is no real quota file, nothing to read. Just insert the id in
> > > > > + * the rb tree.
> > > > > + */
> > > > > +static int mem_read_dquot(struct dquot *dquot)
> > > > > +{
> > > > > +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> > > > > +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> > > > > +	struct rb_node *parent = NULL, *new_node = NULL;
> > > > > +	struct quota_id *new_entry, *entry;
> > > > > +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> > > > > +	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
> > > > > +	int ret = 0;
> > > > > +
> > > > > +	down_write(&dqopt->dqio_sem);
> > > > > +
> > > > > +	while (*n) {
> > > > > +		parent = *n;
> > > > > +		entry = rb_entry(parent, struct quota_id, node);
> > > > > +
> > > > > +		if (id < entry->id)
> > > > > +			n = &(*n)->rb_left;
> > > > > +		else if (id > entry->id)
> > > > > +			n = &(*n)->rb_right;
> > > > > +		else
> > > > > +			goto out_unlock;
> > > > > +	}
> > > > > +
> > > > > +	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
> > > > > +	if (!new_entry) {
> > > > > +		ret = -ENOMEM;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +
> > > > > +	new_entry->id = id;
> > > > > +	new_node = &new_entry->node;
> > > > > +	rb_link_node(new_node, parent, n);
> > > > > +	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
> > > > > +	dquot->dq_off = 1;
> > > > > +	/*
> > > > > +	 * Make sure dquot is never released by a shrinker because we
> > > > > +	 * rely on quota infrastructure to store mem_dqblk in dquot.
> > > > > +	 */
> > > > > +	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
> > > > > +	set_bit(DQ_FAKE_B, &dquot->dq_flags);
> > > > > +
> > > > > +out_unlock:
> > > > > +	up_write(&dqopt->dqio_sem);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int mem_write_dquot(struct dquot *dquot)
> > > > > +{
> > > > > +	/* There is no real quota file, nothing to do */
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int mem_release_dquot(struct dquot *dquot)
> > > > > +{
> > > > > +	/*
> > > > > +	 * Everything is in memory only, release once we're done with
> > > > > +	 * quota via mem_free_file_info().
> > > > > +	 */
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
> > > > > +{
> > > > > +	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
> > > > > +	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
> > > > > +	qid_t id = from_kqid(&init_user_ns, *qid);
> > > > > +	struct quota_info *dqopt = sb_dqopt(sb);
> > > > > +	struct quota_id *entry = NULL;
> > > > > +	int ret = 0;
> > > > > +
> > > > > +	down_read(&dqopt->dqio_sem);
> > > > > +	while (node) {
> > > > > +		entry = rb_entry(node, struct quota_id, node);
> > > > > +
> > > > > +		if (id < entry->id)
> > > > > +			node = node->rb_left;
> > > > > +		else if (id > entry->id)
> > > > > +			node = node->rb_right;
> > > > > +		else
> > > > > +			goto got_next_id;
> > > > > +	}
> > > > > +
> > > > > +	if (!entry) {
> > > > > +		ret = -ENOENT;
> > > > > +		goto out_unlock;
> > > > > +	}
> > > > > +
> > > > > +	if (id > entry->id) {
> > > > > +		node = rb_next(&entry->node);
> > > > > +		if (!node) {
> > > > > +			ret = -ENOENT;
> > > > > +			goto out_unlock;
> > > > > +		}
> > > > > +		entry = rb_entry(node, struct quota_id, node);
> > > > > +	}
> > > > > +
> > > > > +got_next_id:
> > > > > +	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
> > > > > +out_unlock:
> > > > > +	up_read(&dqopt->dqio_sem);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static const struct quota_format_ops mem_format_ops = {
> > > > > +	.check_quota_file	= mem_check_quota_file,
> > > > > +	.read_file_info		= mem_read_file_info,
> > > > > +	.write_file_info	= mem_write_file_info,
> > > > > +	.free_file_info		= mem_free_file_info,
> > > > > +	.read_dqblk		= mem_read_dquot,
> > > > > +	.commit_dqblk		= mem_write_dquot,
> > > > > +	.release_dqblk		= mem_release_dquot,
> > > > > +	.get_next_id		= mem_get_next_id,
> > > > > +};
> > > > > +
> > > > > +static struct quota_format_type mem_quota_format = {
> > > > > +	.qf_fmt_id	= QFMT_MEM_ONLY,
> > > > > +	.qf_ops		= &mem_format_ops,
> > > > > +	.qf_owner	= THIS_MODULE
> > > > > +};
> > > > > +
> > > > > +static int __init init_mem_quota_format(void)
> > > > > +{
> > > > > +	return register_quota_format(&mem_quota_format);
> > > > > +}
> > > > > +
> > > > > +static void __exit exit_mem_quota_format(void)
> > > > > +{
> > > > > +	unregister_quota_format(&mem_quota_format);
> > > > > +}
> > > > > +
> > > > > +module_init(init_mem_quota_format);
> > > > > +module_exit(exit_mem_quota_format);
> > > > > diff --git a/include/linux/quota.h b/include/linux/quota.h
> > > > > index fd692b4a41d5..4398e05c8b72 100644
> > > > > --- a/include/linux/quota.h
> > > > > +++ b/include/linux/quota.h
> > > > > @@ -285,7 +285,11 @@ static inline void dqstats_dec(unsigned int type)
> > > > >  #define DQ_FAKE_B	3	/* no limits only usage */
> > > > >  #define DQ_READ_B	4	/* dquot was read into memory */
> > > > >  #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
> > > > > -#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
> > > > > +#define DQ_NO_SHRINK_B	6	/* modified dquot (not DQ_FAKE_B) is never to
> > > > > +				 * be released by a shrinker. It should remain
> > > > > +				 * in memory until quotas are being disabled on
> > > > > +				 * unmount. */
> > > > > +#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
> > > > >  				 * for the mask of entries set via SETQUOTA\
> > > > >  				 * quotactl. They are set under dq_data_lock\
> > > > >  				 * and the quota format handling dquot can\
> > > > > @@ -536,6 +540,7 @@ struct quota_module_name {
> > > > >  	{QFMT_VFS_OLD, "quota_v1"},\
> > > > >  	{QFMT_VFS_V0, "quota_v2"},\
> > > > >  	{QFMT_VFS_V1, "quota_v2"},\
> > > > > +	{QFMT_MEM_ONLY, "quota_mem"},\
> > > > >  	{0, NULL}}
> > > > >  
> > > > >  #endif /* _QUOTA_ */
> > > > > diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
> > > > > index f17c9636a859..ee9d2bad00c7 100644
> > > > > --- a/include/uapi/linux/quota.h
> > > > > +++ b/include/uapi/linux/quota.h
> > > > > @@ -77,6 +77,7 @@
> > > > >  #define	QFMT_VFS_V0 2
> > > > >  #define QFMT_OCFS2 3
> > > > >  #define	QFMT_VFS_V1 4
> > > > > +#define	QFMT_MEM_ONLY 5
> > > > >  
> > > > >  /* Size of block in which space limits are passed through the quota
> > > > >   * interface */
> > > > > -- 
> > > > > 2.38.1
> > > > > 
> > > > 
> > > 
> > > 
> > 
>
Brian Foster Nov. 23, 2022, 12:37 p.m. UTC | #10
On Wed, Nov 23, 2022 at 09:36:15AM +0100, Lukas Czerner wrote:
> On Tue, Nov 22, 2022 at 11:58:33PM -0800, Christoph Hellwig wrote:
> > On Tue, Nov 22, 2022 at 03:21:17PM +0100, Lukas Czerner wrote:
> > > > That seems like a good idea for memory usage, but I think this might
> > > > also make the code much simpler, as that just requires fairly trivial
> > > > quota_read and quota_write methods in the shmem code instead of new
> > > > support for an in-memory quota file.
> > > 
> > > You mean like the implementation in the v1 ?
> > 
> > Having now found it: yes.
> > 
> 
> Jan,
> 
> do you have any argument for this, since it was your suggestion?
> 
> I also think that the implementation is much simpler with in-memory
> dquots because we will avoid all the hassle with creating and
> maintaining quota file in a proper format. It's not just reads and
> writes; it's the entire machinery behind it in quota_v2.c and quota_tree.c.
> 
> But it is true that even with only user modified dquots being
> non-reclaimable until unmount it could theoreticaly represent a
> substantial memory consumption. Although I do wonder if this problem
> is even real. How many user/group ids would you expect extremely heavy
> quota user would have the limits set for? 1k, 10k, million, or even
> more? Do you know?
> 

I don't know this code well enough to have a strong opinion on the v1
vs. v2 approach in general, but FWIW it does seem to me that the benefit
of v1 from a memory savings perspective is perhaps overstated. AFAICT,
tmpfs already pins inodes/dentries (notably larger than dquots) in-core
for the lifetime of the inode, so it's not like we'll be saving much
memory from dquots that are actually in-use. I think this dquot memory
should be limited indirectly by the max inode restriction, as well.

That means the potential wastage is measured in dquots that are no
longer referenced, but have previously had a non-default quota limit set
by the admin, right? Even with the v1 approach, I don't think it's wise
to just push such otherwise unused dquots into swap space indefinitely.

Perhaps a reasonable approach to the memory usage issue is to just cap
the number of dquots that are allowed to have custom limits on tmpfs?
E.g., to echo Lukas above.. if there was a cap of something like 512-1k
custom quota limits, would that really be a problem for quota users on
tmpfs? Other users would still be covered by the default mount-time
limits. Of course, you could always make such a cap flexible as a % of
tmpfs size, or configurable via mount option, etc. Just a thought.

Brian

> -Lukas
> 
>
Jan Kara Nov. 23, 2022, 5:07 p.m. UTC | #11
On Mon 21-11-22 15:28:52, Lukas Czerner wrote:
> In memory quota format relies on quota infrastructure to store dquot
> information for us. While conventional quota formats for file systems
> with persistent storage can load quota information into dquot from the
> storage on-demand and hence quota dquot shrinker can free any dquot that
> is not currently being used, it must be avoided here. Otherwise we can
> lose valuable information, user provided limits, because there is no
> persistent storage to load the information from afterwards.
> 
> One information that in-memory quota format needs to keep track of is a
> sorted list of ids for each quota type. This is done by utilizing an rb
> tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> 
> This format can be used to support quota on file system without persistent
> storage such as tmpfs.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>

I was thinking about this somewhat and sketching some code on my computer
to make things even simpler. See suggestions below.

> diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> index 9160639daffa..935be3f7b731 100644
> --- a/fs/quota/Makefile
> +++ b/fs/quota/Makefile
> @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
>  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
>  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
>  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o

So I wouldn't go for new generic quota format. Instead I'd define &
register private quota format in mm/shmem.c like:

static struct quota_format_type shmem_quota_format = {
        .qf_fmt_id      = QFMT_SHMEM,
        .qf_ops         = &shmem_quota_format_ops,
        .qf_owner       = THIS_MODULE
};

OCFS2 already does the very same thing so you can take some inspiration
from it. Also all the ops will be private to shmem.

> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 0427b44bfee5..f1a7a03632a2 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  	spin_lock(&dq_list_lock);
>  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
>  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> +			continue;
>  		remove_dquot_hash(dquot);
>  		remove_free_dquot(dquot);
>  		remove_inuse(dquot);

I'd leave dquot reclaim alone. See below how to avoid losing usage
information / limits.

> diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> new file mode 100644
> index 000000000000..7d5e82122143
> --- /dev/null
> +++ b/fs/quota/quota_mem.c
> @@ -0,0 +1,260 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * In memory quota format relies on quota infrastructure to store dquot
> + * information for us. While conventional quota formats for file systems
> + * with persistent storage can load quota information into dquot from the
> + * storage on-demand and hence quota dquot shrinker can free any dquot
> + * that is not currently being used, it must be avoided here. Otherwise we
> + * can lose valuable information, user provided limits, because there is
> + * no persistent storage to load the information from afterwards.
> + *
> + * One information that in-memory quota format needs to keep track of is
> + * a sorted list of ids for each quota type. This is done by utilizing
> + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> + * type.
> + *
> + * This format can be used to support quota on file system without persistent
> + * storage such as tmpfs.
> + */
> +#include <linux/errno.h>
> +#include <linux/fs.h>
> +#include <linux/mount.h>
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/rbtree.h>
> +
> +#include <linux/quotaops.h>
> +#include <linux/quota.h>
> +
> +MODULE_AUTHOR("Lukas Czerner");
> +MODULE_DESCRIPTION("Quota in-memory format support");
> +MODULE_LICENSE("GPL");
> +
> +/*
> + * The following constants define the amount of time given a user
> + * before the soft limits are treated as hard limits (usually resulting
> + * in an allocation failure). The timer is started when the user crosses
> + * their soft limit, it is reset when they go below their soft limit.
> + */
> +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */

These would then become shmem private defaults.

> +struct quota_id {
> +	struct rb_node	node;
> +	qid_t		id;
> +};

Instead of this I'd define:

struct shmem_dquot {
        struct rb_node node;
        qid_t id;
	qsize_t bhardlimit;
	qsize_t bsoftlimit;
	qsize_t ihardlimit;
	qsize_t isoftlimit;
};

It would be kept in the rbtree like you do with quota_id but it will also be
used as the ultimate "persistent" storage of quota information when a dquot gets
reclaimed. We don't need to store grace times or usage information because
if there is non-zero usage, the dquot is referenced from the inode and thus
cannot be reclaimed.

> +static int mem_check_quota_file(struct super_block *sb, int type)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 1;
> +}
> +
> +/*
> + * There is no real quota file. Just allocate rb_root for quota ids and
> + * set limits
> + */
> +static int mem_read_file_info(struct super_block *sb, int type)
> +{
> +	struct quota_info *dqopt = sb_dqopt(sb);
> +	struct mem_dqinfo *info = &dqopt->info[type];
> +	int ret = 0;
> +
> +	down_read(&dqopt->dqio_sem);

No need for dqio_sem here...

> +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}

Also this check is not needed.

> +
> +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> +	if (!info->dqi_priv) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Used space is stored as unsigned 64-bit value in bytes but
> +	 * quota core supports only signed 64-bit values so use that
> +	 * as a limit
> +	 */
> +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> +
> +	info->dqi_bgrace = MAX_DQ_TIME;
> +	info->dqi_igrace = MAX_IQ_TIME;
> +	info->dqi_flags = 0;
> +
> +out_unlock:
> +	up_read(&dqopt->dqio_sem);
> +	return ret;
> +}
> +
> +static int mem_write_file_info(struct super_block *sb, int type)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 0;
> +}
> +
> +/*
> + * Free all the quota_id entries in the rb tree and rb_root.
> + */
> +static int mem_free_file_info(struct super_block *sb, int type)
> +{
> +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> +	struct rb_root *root = info->dqi_priv;
> +	struct quota_id *entry;
> +	struct rb_node *node;
> +
> +	info->dqi_priv = NULL;
> +	node = rb_first(root);
> +	while (node) {
> +		entry = rb_entry(node, struct quota_id, node);
> +		node = rb_next(&entry->node);
> +
> +		rb_erase(&entry->node, root);
> +		kfree(entry);
> +	}
> +
> +	kfree(root);
> +	return 0;
> +}

These should be all ops that are needed for the quota format. So quota
format ops can be just:

static const struct quota_format_ops shmem_format_ops = {
        .check_quota_file       = shmem_check_quota_file,
        .read_file_info         = shmem_read_file_info,
        .write_file_info        = shmem_write_file_info,
        .free_file_info         = shmem_free_file_info,
};

We deal with the remaining operations by diverting quota callbacks earlier in
filesystem hooks like:

static const struct dquot_operations shmem_quota_operations = {
	.acquire_dquot		= shmem_acquire_dquot,
	.release_dquot		= shmem_release_dquot,
	.alloc_dquot		= dquot_alloc,
	.destroy_dquot		= dquot_destroy,
	.write_info		= <do nothing>,
	.mark_dirty		= <do nothing>,
	.get_next_id		= shmem_get_next_id,
};

Now shmem_get_next_id() will basically do what you do in mem_get_next_id();
you just need to wrap it in a sb_has_quota_active() check as
dquot_get_next_id() does.

shmem_acquire_dquot() will do what you do in mem_read_dquot(), just if we
find the id in the rbtree, we copy limits into the dquot structure.

shmem_release_dquot() will just copy current limits to the structure in the
rbtree. It can also verify there's no usage. It can also just delete the
structure from the rbtree if the limits match the default ones.

And that should be it.

> +/*
> + * There is no real quota file, nothing to read. Just insert the id in
> + * the rb tree.
> + */
> +static int mem_read_dquot(struct dquot *dquot)
> +{
> +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> +	struct rb_node *parent = NULL, *new_node = NULL;
> +	struct quota_id *new_entry, *entry;
> +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> +	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
> +	int ret = 0;
> +
> +	down_write(&dqopt->dqio_sem);
> +
> +	while (*n) {
> +		parent = *n;
> +		entry = rb_entry(parent, struct quota_id, node);
> +
> +		if (id < entry->id)
> +			n = &(*n)->rb_left;
> +		else if (id > entry->id)
> +			n = &(*n)->rb_right;
> +		else
> +			goto out_unlock;
> +	}
> +
> +	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
> +	if (!new_entry) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	new_entry->id = id;
> +	new_node = &new_entry->node;
> +	rb_link_node(new_node, parent, n);
> +	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
> +	dquot->dq_off = 1;
> +	/*
> +	 * Make sure dquot is never released by a shrinker because we
> +	 * rely on quota infrastructure to store mem_dqblk in dquot.
> +	 */
> +	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
> +	set_bit(DQ_FAKE_B, &dquot->dq_flags);
> +
> +out_unlock:
> +	up_write(&dqopt->dqio_sem);
> +	return ret;
> +}
> +
> +static int mem_write_dquot(struct dquot *dquot)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 0;
> +}
> +
...

> +static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
> +{
> +	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
> +	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
> +	qid_t id = from_kqid(&init_user_ns, *qid);
> +	struct quota_info *dqopt = sb_dqopt(sb);
> +	struct quota_id *entry = NULL;
> +	int ret = 0;
> +
> +	down_read(&dqopt->dqio_sem);
> +	while (node) {
> +		entry = rb_entry(node, struct quota_id, node);
> +
> +		if (id < entry->id)
> +			node = node->rb_left;
> +		else if (id > entry->id)
> +			node = node->rb_right;
> +		else
> +			goto got_next_id;
> +	}
> +
> +	if (!entry) {
> +		ret = -ENOENT;
> +		goto out_unlock;
> +	}
> +
> +	if (id > entry->id) {
> +		node = rb_next(&entry->node);
> +		if (!node) {
> +			ret = -ENOENT;
> +			goto out_unlock;
> +		}
> +		entry = rb_entry(node, struct quota_id, node);
> +	}
> +
> +got_next_id:
> +	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
> +out_unlock:
> +	up_read(&dqopt->dqio_sem);
> +	return ret;
> +}

								Honza
Darrick J. Wong Nov. 23, 2022, 6:09 p.m. UTC | #12
On Wed, Nov 23, 2022 at 09:36:15AM +0100, Lukas Czerner wrote:
> On Tue, Nov 22, 2022 at 11:58:33PM -0800, Christoph Hellwig wrote:
> > On Tue, Nov 22, 2022 at 03:21:17PM +0100, Lukas Czerner wrote:
> > > > That seems like a good idea for memory usage, but I think this might
> > > > also make the code much simpler, as that just requires fairly trivial
> > > > quota_read and quota_write methods in the shmem code instead of new
> > > > support for an in-memory quota file.
> > > 
> > > You mean like the implementation in the v1 ?
> > 
> > Having now found it: yes.
> > 
> 
> Jan,
> 
> do you have any argument for this, since it was your suggestion?
> 
> I also think that the implementation is much simpler with in-memory
> dquots because we will avoid all the hassle with creating and
> maintaining quota file in a proper format. It's not just reads and
> writes it's the entire machinery befind it in quota_v2.c and quota_tree.c.
> 
> But it is true that even with only user modified dquots being
> non-reclaimable until unmount it could theoreticaly represent a
> substantial memory consumption. Although I do wonder if this problem
> is even real. How many user/group ids would you expect extremely heavy
> quota user would have the limits set for? 1k, 10k, million, or even
> more? Do you know?

The last time I checked, some of our container schedulers will heap
~1000 containers onto a single host(!!) at a time.  Assuming that a
container with a single container might map ~10 uids from the global
namespace, that's easily 10,000 at a time.  If the container runtime
only reuses global uid namespace when it runs out of namespace (i.e. it
doesn't immediately recycle them) then you could actually get up in the
millions or billions pretty easily.  The dquot counters would drop to
zero so you might still be able to reclaim the old ones, though it
sounds like you'd have to unset any per-dquot limits to get it to do
that.

That said, fsx in fstests will make all sorts of chown/chgrp calls,
which has lead to problems with the XFS quota files reaching their
maximum size (~580M per quota type) and filling up the whole fs.

--D

> -Lukas
>
Lukas Czerner Nov. 25, 2022, 9:30 a.m. UTC | #13
On Wed, Nov 23, 2022 at 06:07:39PM +0100, Jan Kara wrote:
> On Mon 21-11-22 15:28:52, Lukas Czerner wrote:
> > In memory quota format relies on quota infrastructure to store dquot
> > information for us. While conventional quota formats for file systems
> > with persistent storage can load quota information into dquot from the
> > storage on-demand and hence quota dquot shrinker can free any dquot that
> > is not currently being used, it must be avoided here. Otherwise we can
> > lose valuable information, user provided limits, because there is no
> > persistent storage to load the information from afterwards.
> > 
> > One information that in-memory quota format needs to keep track of is a
> > sorted list of ids for each quota type. This is done by utilizing an rb
> > tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> > 
> > This format can be used to support quota on file system without persistent
> > storage such as tmpfs.
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> 
> I was thinking about this somewhat and sketching some code on my computer
> to make things even simpler. See suggestions below.

Thanks Honzo.

> 
> > diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> > index 9160639daffa..935be3f7b731 100644
> > --- a/fs/quota/Makefile
> > +++ b/fs/quota/Makefile
> > @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
> >  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
> >  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
> >  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> > +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> 
> So I wouldn't go for new generic quota format. Instead I'd define &
> register private quota format in mm/shmem.c like:
> 
> static struct quota_format_type shmem_quota_format = {
>         .qf_fmt_id      = QFMT_SHMEM,
>         .qf_ops         = &shmem_quota_format_ops,
>         .qf_owner       = THIS_MODULE
> };
> 
> OCFS2 already does the very same thing so you can take some inspiration
> from it. Also all the ops will be private to shmem.

Ok.

> 
> > diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> > index 0427b44bfee5..f1a7a03632a2 100644
> > --- a/fs/quota/dquot.c
> > +++ b/fs/quota/dquot.c
> > @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  	spin_lock(&dq_list_lock);
> >  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
> >  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> > +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> > +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> > +			continue;
> >  		remove_dquot_hash(dquot);
> >  		remove_free_dquot(dquot);
> >  		remove_inuse(dquot);
> 
> I'd leave dquot reclaim alone. See below how to avoid loosing usage
> information / limits.

Yes, if the idea is to keep the version of the limits per id in shmem.c
then this won't be needed.

> 
> > diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> > new file mode 100644
> > index 000000000000..7d5e82122143
> > --- /dev/null
> > +++ b/fs/quota/quota_mem.c
> > @@ -0,0 +1,260 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * In memory quota format relies on quota infrastructure to store dquot
> > + * information for us. While conventional quota formats for file systems
> > + * with persistent storage can load quota information into dquot from the
> > + * storage on-demand and hence quota dquot shrinker can free any dquot
> > + * that is not currently being used, it must be avoided here. Otherwise we
> > + * can lose valuable information, user provided limits, because there is
> > + * no persistent storage to load the information from afterwards.
> > + *
> > + * One information that in-memory quota format needs to keep track of is
> > + * a sorted list of ids for each quota type. This is done by utilizing
> > + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> > + * type.
> > + *
> > + * This format can be used to support quota on file system without persistent
> > + * storage such as tmpfs.
> > + */
> > +#include <linux/errno.h>
> > +#include <linux/fs.h>
> > +#include <linux/mount.h>
> > +#include <linux/kernel.h>
> > +#include <linux/init.h>
> > +#include <linux/module.h>
> > +#include <linux/slab.h>
> > +#include <linux/rbtree.h>
> > +
> > +#include <linux/quotaops.h>
> > +#include <linux/quota.h>
> > +
> > +MODULE_AUTHOR("Lukas Czerner");
> > +MODULE_DESCRIPTION("Quota in-memory format support");
> > +MODULE_LICENSE("GPL");
> > +
> > +/*
> > + * The following constants define the amount of time given a user
> > + * before the soft limits are treated as hard limits (usually resulting
> > + * in an allocation failure). The timer is started when the user crosses
> > + * their soft limit, it is reset when they go below their soft limit.
> > + */
> > +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> > +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> 
> These would then become shmem private defaults.
> 
> > +struct quota_id {
> > +	struct rb_node	node;
> > +	qid_t		id;
> > +};
> 
> Instead of this I'd define:
> 
> struct shmem_dquot {
>         struct rb_node node;
>         qid_t id;
> 	qsize_t bhardlimit;
> 	qsize_t bsoftlimit;
> 	qsize_t ihardlimit;
> 	qsize_t isoftlimit;
> };
> 
> It would be kept in rbtree like you do with quota_id but it will be also
> used as ultimate "persistent" storage of quota information when dquot gets
> reclaimed. We don't need to store grace times or usage information because
> if there is non-zero usage, dquot is referenced from the inode and thus
> cannot be reclaimed.

Ok, this approach will duplicate the limits, but has the advantage of having
a much smaller footprint than the entire dquot, so in case we don't have any
usage in the dquot we can safely reclaim it if needed without losing user
provided limits. Probably a worthwhile trade-off.

Or perhaps we can eliminate the duplication when we store it in the tree on
->destroy_dquot() and free it after we load the limits into dquot on
->acquire_dquot().

> 
> > +static int mem_check_quota_file(struct super_block *sb, int type)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 1;
> > +}
> > +
> > +/*
> > + * There is no real quota file. Just allocate rb_root for quota ids and
> > + * set limits
> > + */
> > +static int mem_read_file_info(struct super_block *sb, int type)
> > +{
> > +	struct quota_info *dqopt = sb_dqopt(sb);
> > +	struct mem_dqinfo *info = &dqopt->info[type];
> > +	int ret = 0;
> > +
> > +	down_read(&dqopt->dqio_sem);
> 
> No need for dqio_sem here...
> 
> > +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> 
> Also this check is not needed.
> 
> > +
> > +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> > +	if (!info->dqi_priv) {
> > +		ret = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/*
> > +	 * Used space is stored as unsigned 64-bit value in bytes but
> > +	 * quota core supports only signed 64-bit values so use that
> > +	 * as a limit
> > +	 */
> > +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> > +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> > +
> > +	info->dqi_bgrace = MAX_DQ_TIME;
> > +	info->dqi_igrace = MAX_IQ_TIME;
> > +	info->dqi_flags = 0;
> > +
> > +out_unlock:
> > +	up_read(&dqopt->dqio_sem);
> > +	return ret;
> > +}
> > +
> > +static int mem_write_file_info(struct super_block *sb, int type)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Free all the quota_id entries in the rb tree and rb_root.
> > + */
> > +static int mem_free_file_info(struct super_block *sb, int type)
> > +{
> > +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> > +	struct rb_root *root = info->dqi_priv;
> > +	struct quota_id *entry;
> > +	struct rb_node *node;
> > +
> > +	info->dqi_priv = NULL;
> > +	node = rb_first(root);
> > +	while (node) {
> > +		entry = rb_entry(node, struct quota_id, node);
> > +		node = rb_next(&entry->node);
> > +
> > +		rb_erase(&entry->node, root);
> > +		kfree(entry);
> > +	}
> > +
> > +	kfree(root);
> > +	return 0;
> > +}
> 
> These should be all ops that are needed for the quota format. So quota
> format ops can be just:
> 
> static const struct quota_format_ops shmem_format_ops = {
>         .check_quota_file       = shmem_check_quota_file,
>         .read_file_info         = shmem_read_file_info,
>         .write_file_info        = shmem_write_file_info,
>         .free_file_info         = shmem_free_file_info,
> };
> 
> We deal with remaining operations by diverting quota callbacks earlier in
> filesystem hooks like:
> 
> static const struct dquot_operations shmem_quota_operations = {
> 	.acquire_dquot		= shmem_acquire_dquot,
> 	..release_dquot		= shmem_release_dquot,
> 	.alloc_dquot		= dquot_alloc,
> 	.destroy_dquot		= dquot_destroy,
> 	.write_info		= <do nothing>,
> 	.mark_dirty		= <do nothing>,
> 	.get_next_id		= shmem_get_next_id,
> };
> 
> Now shmem_get_next_id() will basically do what you do in mem_get_next_id()
> just you need to wrap it in sb_has_quota_active() check as
> dquot_get_next_id() does.
> 
> shmem_acquire_dquot() will do what you do in mem_read_dquot(), just if we
> find the id in the rbtree, we copy limits into the dquot structure.
> 
> shmem_release_dquot() will just copy current limits to the structure in the
> rbtree. It can also verify there's no usage. It can also just delete the
> structure from the rbtree if the limits match the default ones.
> 
> And that should be it.

Makes sense. Thanks for the suggestions. I'll work on v3.

-Lukas

> 
> > +/*
> > + * There is no real quota file, nothing to read. Just insert the id in
> > + * the rb tree.
> > + */
> > +static int mem_read_dquot(struct dquot *dquot)
> > +{
> > +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> > +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> > +	struct rb_node *parent = NULL, *new_node = NULL;
> > +	struct quota_id *new_entry, *entry;
> > +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> > +	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
> > +	int ret = 0;
> > +
> > +	down_write(&dqopt->dqio_sem);
> > +
> > +	while (*n) {
> > +		parent = *n;
> > +		entry = rb_entry(parent, struct quota_id, node);
> > +
> > +		if (id < entry->id)
> > +			n = &(*n)->rb_left;
> > +		else if (id > entry->id)
> > +			n = &(*n)->rb_right;
> > +		else
> > +			goto out_unlock;
> > +	}
> > +
> > +	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
> > +	if (!new_entry) {
> > +		ret = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> > +
> > +	new_entry->id = id;
> > +	new_node = &new_entry->node;
> > +	rb_link_node(new_node, parent, n);
> > +	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
> > +	dquot->dq_off = 1;
> > +	/*
> > +	 * Make sure dquot is never released by a shrinker because we
> > +	 * rely on quota infrastructure to store mem_dqblk in dquot.
> > +	 */
> > +	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
> > +	set_bit(DQ_FAKE_B, &dquot->dq_flags);
> > +
> > +out_unlock:
> > +	up_write(&dqopt->dqio_sem);
> > +	return ret;
> > +}
> > +
> > +static int mem_write_dquot(struct dquot *dquot)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 0;
> > +}
> > +
> ...
> 
> > +static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
> > +{
> > +	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
> > +	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
> > +	qid_t id = from_kqid(&init_user_ns, *qid);
> > +	struct quota_info *dqopt = sb_dqopt(sb);
> > +	struct quota_id *entry = NULL;
> > +	int ret = 0;
> > +
> > +	down_read(&dqopt->dqio_sem);
> > +	while (node) {
> > +		entry = rb_entry(node, struct quota_id, node);
> > +
> > +		if (id < entry->id)
> > +			node = node->rb_left;
> > +		else if (id > entry->id)
> > +			node = node->rb_right;
> > +		else
> > +			goto got_next_id;
> > +	}
> > +
> > +	if (!entry) {
> > +		ret = -ENOENT;
> > +		goto out_unlock;
> > +	}
> > +
> > +	if (id > entry->id) {
> > +		node = rb_next(&entry->node);
> > +		if (!node) {
> > +			ret = -ENOENT;
> > +			goto out_unlock;
> > +		}
> > +		entry = rb_entry(node, struct quota_id, node);
> > +	}
> > +
> > +got_next_id:
> > +	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
> > +out_unlock:
> > +	up_read(&dqopt->dqio_sem);
> > +	return ret;
> > +}
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
>
Jan Kara Nov. 28, 2022, 10:03 a.m. UTC | #14
On Fri 25-11-22 10:30:10, Lukas Czerner wrote:
> On Wed, Nov 23, 2022 at 06:07:39PM +0100, Jan Kara wrote:
> > On Mon 21-11-22 15:28:52, Lukas Czerner wrote:
> > Instead of this I'd define:
> > 
> > struct shmem_dquot {
> >         struct rb_node node;
> >         qid_t id;
> > 	qsize_t bhardlimit;
> > 	qsize_t bsoftlimit;
> > 	qsize_t ihardlimit;
> > 	qsize_t isoftlimit;
> > };
> > 
> > It would be kept in rbtree like you do with quota_id but it will be also
> > used as ultimate "persistent" storage of quota information when dquot gets
> > reclaimed. We don't need to store grace times or usage information because
> > if there is non-zero usage, dquot is referenced from the inode and thus
> > cannot be reclaimed.
> 
> Ok, this approach will duplicate the limits, but has advantage of having
> much smaller footprint than entire dquot so in case we don't have any
> usage in dquot we can safely reclaim it if needed without loosing user
> provided limits. Probably a worthwhile trade-off.
> 
> Or perhaps we can eliminate the duplicity when we store it in the tree on
> ->destroy_dquot() and free it after we load the limits into dquot on
> ->acquire_dquo().

We could but I don't think it's worth the hassle. In particular because
->release_dquot() must not fail (there's no simple way to recover from such
failure). I don't say it is impossible to overcome but just not worth the
saved memory IMHO.

								Honza
Christian Brauner Nov. 29, 2022, 11:21 a.m. UTC | #15
On Mon, Nov 21, 2022 at 03:28:52PM +0100, Lukas Czerner wrote:
> In memory quota format relies on quota infrastructure to store dquot
> information for us. While conventional quota formats for file systems
> with persistent storage can load quota information into dquot from the
> storage on-demand and hence quota dquot shrinker can free any dquot that
> is not currently being used, it must be avoided here. Otherwise we can
> lose valuable information, user provided limits, because there is no
> persistent storage to load the information from afterwards.
> 
> One information that in-memory quota format needs to keep track of is a
> sorted list of ids for each quota type. This is done by utilizing an rb
> tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> 
> This format can be used to support quota on file system without persistent
> storage such as tmpfs.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> ---
>  fs/quota/Kconfig           |   8 ++
>  fs/quota/Makefile          |   1 +
>  fs/quota/dquot.c           |   3 +
>  fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
>  include/linux/quota.h      |   7 +-
>  include/uapi/linux/quota.h |   1 +
>  6 files changed, 279 insertions(+), 1 deletion(-)
>  create mode 100644 fs/quota/quota_mem.c
> 
> diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
> index b59cd172b5f9..8ea9656ca37b 100644
> --- a/fs/quota/Kconfig
> +++ b/fs/quota/Kconfig
> @@ -67,6 +67,14 @@ config QFMT_V2
>  	  also supports 64-bit inode and block quota limits. If you need this
>  	  functionality say Y here.
>  
> +config QFMT_MEM
> +	tristate "Quota in-memory format support "
> +	depends on QUOTA
> +	help
> +	  This config option enables kernel support for in-memory quota
> +	  format support. Useful to support quota on file system without
> +	  permanent storage. If you need this functionality say Y here.
> +
>  config QUOTACTL
>  	bool
>  	default n
> diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> index 9160639daffa..935be3f7b731 100644
> --- a/fs/quota/Makefile
> +++ b/fs/quota/Makefile
> @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
>  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
>  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
>  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 0427b44bfee5..f1a7a03632a2 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
>  	spin_lock(&dq_list_lock);
>  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
>  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> +			continue;
>  		remove_dquot_hash(dquot);
>  		remove_free_dquot(dquot);
>  		remove_inuse(dquot);
> diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> new file mode 100644
> index 000000000000..7d5e82122143
> --- /dev/null
> +++ b/fs/quota/quota_mem.c
> @@ -0,0 +1,260 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * In memory quota format relies on quota infrastructure to store dquot
> + * information for us. While conventional quota formats for file systems
> + * with persistent storage can load quota information into dquot from the
> + * storage on-demand and hence quota dquot shrinker can free any dquot
> + * that is not currently being used, it must be avoided here. Otherwise we
> + * can lose valuable information, user provided limits, because there is
> + * no persistent storage to load the information from afterwards.
> + *
> + * One information that in-memory quota format needs to keep track of is
> + * a sorted list of ids for each quota type. This is done by utilizing
> + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> + * type.
> + *
> + * This format can be used to support quota on file system without persistent
> + * storage such as tmpfs.
> + */
> +#include <linux/errno.h>
> +#include <linux/fs.h>
> +#include <linux/mount.h>
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/rbtree.h>
> +
> +#include <linux/quotaops.h>
> +#include <linux/quota.h>
> +
> +MODULE_AUTHOR("Lukas Czerner");
> +MODULE_DESCRIPTION("Quota in-memory format support");
> +MODULE_LICENSE("GPL");
> +
> +/*
> + * The following constants define the amount of time given a user
> + * before the soft limits are treated as hard limits (usually resulting
> + * in an allocation failure). The timer is started when the user crosses
> + * their soft limit, it is reset when they go below their soft limit.
> + */
> +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> +
> +struct quota_id {
> +	struct rb_node	node;
> +	qid_t		id;
> +};
> +
> +static int mem_check_quota_file(struct super_block *sb, int type)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 1;
> +}
> +
> +/*
> + * There is no real quota file. Just allocate rb_root for quota ids and
> + * set limits
> + */
> +static int mem_read_file_info(struct super_block *sb, int type)
> +{
> +	struct quota_info *dqopt = sb_dqopt(sb);
> +	struct mem_dqinfo *info = &dqopt->info[type];
> +	int ret = 0;
> +
> +	down_read(&dqopt->dqio_sem);
> +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> +	if (!info->dqi_priv) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * Used space is stored as unsigned 64-bit value in bytes but
> +	 * quota core supports only signed 64-bit values so use that
> +	 * as a limit
> +	 */
> +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> +
> +	info->dqi_bgrace = MAX_DQ_TIME;
> +	info->dqi_igrace = MAX_IQ_TIME;
> +	info->dqi_flags = 0;
> +
> +out_unlock:
> +	up_read(&dqopt->dqio_sem);
> +	return ret;
> +}
> +
> +static int mem_write_file_info(struct super_block *sb, int type)
> +{
> +	/* There is no real quota file, nothing to do */
> +	return 0;
> +}
> +
> +/*
> + * Free all the quota_id entries in the rb tree and rb_root.
> + */
> +static int mem_free_file_info(struct super_block *sb, int type)
> +{
> +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> +	struct rb_root *root = info->dqi_priv;
> +	struct quota_id *entry;
> +	struct rb_node *node;
> +
> +	info->dqi_priv = NULL;
> +	node = rb_first(root);
> +	while (node) {
> +		entry = rb_entry(node, struct quota_id, node);
> +		node = rb_next(&entry->node);
> +
> +		rb_erase(&entry->node, root);
> +		kfree(entry);
> +	}
> +
> +	kfree(root);
> +	return 0;
> +}
> +
> +/*
> + * There is no real quota file, nothing to read. Just insert the id in
> + * the rb tree.
> + */
> +static int mem_read_dquot(struct dquot *dquot)
> +{
> +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> +	struct rb_node *parent = NULL, *new_node = NULL;
> +	struct quota_id *new_entry, *entry;
> +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);

Hey Lukas,

tmpfs instances can be mounted inside of mount namespaces owned by user
namespaces as is the case in unprivileged containers. An easy example is:

unshare --mount --user --map-root
mount -t tmpfs tmpfs /mnt

This tmpfs instance will be mounted with sb->s_user_ns set to the
userns just created during the unshare call and not to init_user_ns. So
this means that the filesystem idmapping isn't a 1:1 mapping. This needs
to be taken into account:

qid_t id = from_kqid(sb->s_user_ns, dquot->dq_id);

similar below.

But dquot_load_quota_sb() which you use in a later patch is restricted
to the init_user_ns which means that your patch as it stands is only
useable for tmpfs instances mounted in the init_user_ns.

If that's intentional then the code above is probably fine but if it's
not then you need preliminary patches to support quotas from filesystems
mountable in non-initial user namespaces.

Enabling this shouldn't be a big deal as it mostly involves updating
callsites to account for sb->s_user_ns when reading and writing quotas.
I've looked at that a while ago but there was no filesystem with quota
support that was also mountable in a user namespaces. Idmapped mounts
are already taken care of.
Lukas Czerner Nov. 29, 2022, 1:11 p.m. UTC | #16
On Tue, Nov 29, 2022 at 12:21:33PM +0100, Christian Brauner wrote:
> On Mon, Nov 21, 2022 at 03:28:52PM +0100, Lukas Czerner wrote:
> > In memory quota format relies on quota infrastructure to store dquot
> > information for us. While conventional quota formats for file systems
> > with persistent storage can load quota information into dquot from the
> > storage on-demand and hence quota dquot shrinker can free any dquot that
> > is not currently being used, it must be avoided here. Otherwise we can
> > lose valuable information, user provided limits, because there is no
> > persistent storage to load the information from afterwards.
> > 
> > One information that in-memory quota format needs to keep track of is a
> > sorted list of ids for each quota type. This is done by utilizing an rb
> > tree which root is stored in mem_dqinfo->dqi_priv for each quota type.
> > 
> > This format can be used to support quota on file system without persistent
> > storage such as tmpfs.
> > 
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > ---
> >  fs/quota/Kconfig           |   8 ++
> >  fs/quota/Makefile          |   1 +
> >  fs/quota/dquot.c           |   3 +
> >  fs/quota/quota_mem.c       | 260 +++++++++++++++++++++++++++++++++++++
> >  include/linux/quota.h      |   7 +-
> >  include/uapi/linux/quota.h |   1 +
> >  6 files changed, 279 insertions(+), 1 deletion(-)
> >  create mode 100644 fs/quota/quota_mem.c
> > 
> > diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
> > index b59cd172b5f9..8ea9656ca37b 100644
> > --- a/fs/quota/Kconfig
> > +++ b/fs/quota/Kconfig
> > @@ -67,6 +67,14 @@ config QFMT_V2
> >  	  also supports 64-bit inode and block quota limits. If you need this
> >  	  functionality say Y here.
> >  
> > +config QFMT_MEM
> > +	tristate "Quota in-memory format support "
> > +	depends on QUOTA
> > +	help
> > +	  This config option enables kernel support for in-memory quota
> > +	  format support. Useful to support quota on file system without
> > +	  permanent storage. If you need this functionality say Y here.
> > +
> >  config QUOTACTL
> >  	bool
> >  	default n
> > diff --git a/fs/quota/Makefile b/fs/quota/Makefile
> > index 9160639daffa..935be3f7b731 100644
> > --- a/fs/quota/Makefile
> > +++ b/fs/quota/Makefile
> > @@ -5,3 +5,4 @@ obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
> >  obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
> >  obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
> >  obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
> > +obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
> > diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> > index 0427b44bfee5..f1a7a03632a2 100644
> > --- a/fs/quota/dquot.c
> > +++ b/fs/quota/dquot.c
> > @@ -736,6 +736,9 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >  	spin_lock(&dq_list_lock);
> >  	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
> >  		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
> > +		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
> > +		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
> > +			continue;
> >  		remove_dquot_hash(dquot);
> >  		remove_free_dquot(dquot);
> >  		remove_inuse(dquot);
> > diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
> > new file mode 100644
> > index 000000000000..7d5e82122143
> > --- /dev/null
> > +++ b/fs/quota/quota_mem.c
> > @@ -0,0 +1,260 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * In memory quota format relies on quota infrastructure to store dquot
> > + * information for us. While conventional quota formats for file systems
> > + * with persistent storage can load quota information into dquot from the
> > + * storage on-demand and hence quota dquot shrinker can free any dquot
> > + * that is not currently being used, it must be avoided here. Otherwise we
> > + * can lose valuable information, user provided limits, because there is
> > + * no persistent storage to load the information from afterwards.
> > + *
> > + * One information that in-memory quota format needs to keep track of is
> > + * a sorted list of ids for each quota type. This is done by utilizing
> > + * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
> > + * type.
> > + *
> > + * This format can be used to support quota on file system without persistent
> > + * storage such as tmpfs.
> > + */
> > +#include <linux/errno.h>
> > +#include <linux/fs.h>
> > +#include <linux/mount.h>
> > +#include <linux/kernel.h>
> > +#include <linux/init.h>
> > +#include <linux/module.h>
> > +#include <linux/slab.h>
> > +#include <linux/rbtree.h>
> > +
> > +#include <linux/quotaops.h>
> > +#include <linux/quota.h>
> > +
> > +MODULE_AUTHOR("Lukas Czerner");
> > +MODULE_DESCRIPTION("Quota in-memory format support");
> > +MODULE_LICENSE("GPL");
> > +
> > +/*
> > + * The following constants define the amount of time given a user
> > + * before the soft limits are treated as hard limits (usually resulting
> > + * in an allocation failure). The timer is started when the user crosses
> > + * their soft limit, it is reset when they go below their soft limit.
> > + */
> > +#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
> > +#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
> > +
> > +struct quota_id {
> > +	struct rb_node	node;
> > +	qid_t		id;
> > +};
> > +
> > +static int mem_check_quota_file(struct super_block *sb, int type)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 1;
> > +}
> > +
> > +/*
> > + * There is no real quota file. Just allocate rb_root for quota ids and
> > + * set limits
> > + */
> > +static int mem_read_file_info(struct super_block *sb, int type)
> > +{
> > +	struct quota_info *dqopt = sb_dqopt(sb);
> > +	struct mem_dqinfo *info = &dqopt->info[type];
> > +	int ret = 0;
> > +
> > +	down_read(&dqopt->dqio_sem);
> > +	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
> > +	if (!info->dqi_priv) {
> > +		ret = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> > +
> > +	/*
> > +	 * Used space is stored as unsigned 64-bit value in bytes but
> > +	 * quota core supports only signed 64-bit values so use that
> > +	 * as a limit
> > +	 */
> > +	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
> > +	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
> > +
> > +	info->dqi_bgrace = MAX_DQ_TIME;
> > +	info->dqi_igrace = MAX_IQ_TIME;
> > +	info->dqi_flags = 0;
> > +
> > +out_unlock:
> > +	up_read(&dqopt->dqio_sem);
> > +	return ret;
> > +}
> > +
> > +static int mem_write_file_info(struct super_block *sb, int type)
> > +{
> > +	/* There is no real quota file, nothing to do */
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Free all the quota_id entries in the rb tree and rb_root.
> > + */
> > +static int mem_free_file_info(struct super_block *sb, int type)
> > +{
> > +	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
> > +	struct rb_root *root = info->dqi_priv;
> > +	struct quota_id *entry;
> > +	struct rb_node *node;
> > +
> > +	info->dqi_priv = NULL;
> > +	node = rb_first(root);
> > +	while (node) {
> > +		entry = rb_entry(node, struct quota_id, node);
> > +		node = rb_next(&entry->node);
> > +
> > +		rb_erase(&entry->node, root);
> > +		kfree(entry);
> > +	}
> > +
> > +	kfree(root);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * There is no real quota file, nothing to read. Just insert the id in
> > + * the rb tree.
> > + */
> > +static int mem_read_dquot(struct dquot *dquot)
> > +{
> > +	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
> > +	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
> > +	struct rb_node *parent = NULL, *new_node = NULL;
> > +	struct quota_id *new_entry, *entry;
> > +	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
> 
> Hey Lukas,
> 
> tmpfs instances can be mounted inside of mount namespaces owned by user
> namespaces as is the case in unprivileged containers. An easy example is:
> 
> unshare --mount --user --map-root
> mount -t tmpfs tmpfs /mnt
> 
> This tmpfs instance will be mounted with sb->s_user_ns set to the
> userns just created during the unshare call and not to init_user_ns. So
> this means that the filesystem idmapping isn't a 1:1 mapping. This needs
> to be taken into account:
> 
> qid_t id = from_kqid(sb->s_user_ns, dquot->dq_id);
> 
> similar below.
> 
> But dquot_load_quota_sb() which you use in a later patch is restricted
> to the init_user_ns which means that your patch as it stands is only
> usable for tmpfs instances mounted in the init_user_ns.
> 
> If that's intentional then the code above is probably fine but if it's
> not then you need preliminary patches to support quotas from filesystems
> mountable in non-initial user namespaces.
> 
> Enabling this shouldn't be a big deal as it mostly involves updating
> callsites to account for sb->s_user_ns when reading and writing quotas.
> I've looked at that a while ago but there was no filesystem with quota
> support that was also mountable in a user namespace. Idmapped mounts
> are already taken care of.
> 

Hi Christian,

that's a good point, thank you for bringing that to my attention; I
didn't think of that at all. I'll have to think about whether it makes
sense to enable it outside init_user_ns as well. Can't think of why not
atm.

Thanks!
-Lukas
diff mbox series

Patch

diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index b59cd172b5f9..8ea9656ca37b 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -67,6 +67,14 @@  config QFMT_V2
 	  also supports 64-bit inode and block quota limits. If you need this
 	  functionality say Y here.
 
+config QFMT_MEM
+	tristate "Quota in-memory format support "
+	depends on QUOTA
+	help
+	  This config option enables kernel support for in-memory quota
+	  format support. Useful to support quota on file system without
+	  permanent storage. If you need this functionality say Y here.
+
 config QUOTACTL
 	bool
 	default n
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 9160639daffa..935be3f7b731 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -5,3 +5,4 @@  obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
 obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o kqid.o
 obj-$(CONFIG_QUOTA_NETLINK_INTERFACE)	+= netlink.o
+obj-$(CONFIG_QFMT_MEM)		+= quota_mem.o
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0427b44bfee5..f1a7a03632a2 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -736,6 +736,9 @@  dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 	spin_lock(&dq_list_lock);
 	while (!list_empty(&free_dquots) && sc->nr_to_scan) {
 		dquot = list_first_entry(&free_dquots, struct dquot, dq_free);
+		if (test_bit(DQ_NO_SHRINK_B, &dquot->dq_flags) &&
+		    !test_bit(DQ_FAKE_B, &dquot->dq_flags))
+			continue;
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
diff --git a/fs/quota/quota_mem.c b/fs/quota/quota_mem.c
new file mode 100644
index 000000000000..7d5e82122143
--- /dev/null
+++ b/fs/quota/quota_mem.c
@@ -0,0 +1,260 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * In memory quota format relies on quota infrastructure to store dquot
+ * information for us. While conventional quota formats for file systems
+ * with persistent storage can load quota information into dquot from the
+ * storage on-demand and hence quota dquot shrinker can free any dquot
+ * that is not currently being used, it must be avoided here. Otherwise we
+ * can lose valuable information, user provided limits, because there is
+ * no persistent storage to load the information from afterwards.
+ *
+ * One information that in-memory quota format needs to keep track of is
+ * a sorted list of ids for each quota type. This is done by utilizing
+ * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota
+ * type.
+ *
+ * This format can be used to support quota on file system without persistent
+ * storage such as tmpfs.
+ */
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+#include <linux/quotaops.h>
+#include <linux/quota.h>
+
+MODULE_AUTHOR("Lukas Czerner");
+MODULE_DESCRIPTION("Quota in-memory format support");
+MODULE_LICENSE("GPL");
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit, it is reset when they go below their soft limit.
+ */
+#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
+#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
+
+struct quota_id {
+	struct rb_node	node;
+	qid_t		id;
+};
+
+static int mem_check_quota_file(struct super_block *sb, int type)
+{
+	/* There is no real quota file, nothing to do */
+	return 1;
+}
+
+/*
+ * There is no real quota file. Just allocate rb_root for quota ids and
+ * set limits
+ */
+static int mem_read_file_info(struct super_block *sb, int type)
+{
+	struct quota_info *dqopt = sb_dqopt(sb);
+	struct mem_dqinfo *info = &dqopt->info[type];
+	int ret = 0;
+
+	down_read(&dqopt->dqio_sem);
+	if (info->dqi_fmt_id != QFMT_MEM_ONLY) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS);
+	if (!info->dqi_priv) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	/*
+	 * Used space is stored as unsigned 64-bit value in bytes but
+	 * quota core supports only signed 64-bit values so use that
+	 * as a limit
+	 */
+	info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
+	info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
+
+	info->dqi_bgrace = MAX_DQ_TIME;
+	info->dqi_igrace = MAX_IQ_TIME;
+	info->dqi_flags = 0;
+
+out_unlock:
+	up_read(&dqopt->dqio_sem);
+	return ret;
+}
+
+static int mem_write_file_info(struct super_block *sb, int type)
+{
+	/* There is no real quota file, nothing to do */
+	return 0;
+}
+
+/*
+ * Free all the quota_id entries in the rb tree and rb_root.
+ */
+static int mem_free_file_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = &sb_dqopt(sb)->info[type];
+	struct rb_root *root = info->dqi_priv;
+	struct quota_id *entry;
+	struct rb_node *node;
+
+	info->dqi_priv = NULL;
+	node = rb_first(root);
+	while (node) {
+		entry = rb_entry(node, struct quota_id, node);
+		node = rb_next(&entry->node);
+
+		rb_erase(&entry->node, root);
+		kfree(entry);
+	}
+
+	kfree(root);
+	return 0;
+}
+
+/*
+ * There is no real quota file, nothing to read. Just insert the id in
+ * the rb tree.
+ */
+static int mem_read_dquot(struct dquot *dquot)
+{
+	struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type);
+	struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node;
+	struct rb_node *parent = NULL, *new_node = NULL;
+	struct quota_id *new_entry, *entry;
+	qid_t id = from_kqid(&init_user_ns, dquot->dq_id);
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+	int ret = 0;
+
+	down_write(&dqopt->dqio_sem);
+
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct quota_id, node);
+
+		if (id < entry->id)
+			n = &(*n)->rb_left;
+		else if (id > entry->id)
+			n = &(*n)->rb_right;
+		else
+			goto out_unlock;
+	}
+
+	new_entry = kmalloc(sizeof(struct quota_id), GFP_NOFS);
+	if (!new_entry) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	new_entry->id = id;
+	new_node = &new_entry->node;
+	rb_link_node(new_node, parent, n);
+	rb_insert_color(new_node, (struct rb_root *)info->dqi_priv);
+	dquot->dq_off = 1;
+	/*
+	 * Make sure dquot is never released by a shrinker because we
+	 * rely on quota infrastructure to store mem_dqblk in dquot.
+	 */
+	set_bit(DQ_NO_SHRINK_B, &dquot->dq_flags);
+	set_bit(DQ_FAKE_B, &dquot->dq_flags);
+
+out_unlock:
+	up_write(&dqopt->dqio_sem);
+	return ret;
+}
+
+static int mem_write_dquot(struct dquot *dquot)
+{
+	/* There is no real quota file, nothing to do */
+	return 0;
+}
+
+static int mem_release_dquot(struct dquot *dquot)
+{
+	/*
+	 * Everything is in memory only, release once we're done with
+	 * quota via mem_free_file_info().
+	 */
+	return 0;
+}
+
+static int mem_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, qid->type);
+	struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node;
+	qid_t id = from_kqid(&init_user_ns, *qid);
+	struct quota_info *dqopt = sb_dqopt(sb);
+	struct quota_id *entry = NULL;
+	int ret = 0;
+
+	down_read(&dqopt->dqio_sem);
+	while (node) {
+		entry = rb_entry(node, struct quota_id, node);
+
+		if (id < entry->id)
+			node = node->rb_left;
+		else if (id > entry->id)
+			node = node->rb_right;
+		else
+			goto got_next_id;
+	}
+
+	if (!entry) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (id > entry->id) {
+		node = rb_next(&entry->node);
+		if (!node) {
+			ret = -ENOENT;
+			goto out_unlock;
+		}
+		entry = rb_entry(node, struct quota_id, node);
+	}
+
+got_next_id:
+	*qid = make_kqid(&init_user_ns, qid->type, entry->id);
+out_unlock:
+	up_read(&dqopt->dqio_sem);
+	return ret;
+}
+
+static const struct quota_format_ops mem_format_ops = {
+	.check_quota_file	= mem_check_quota_file,
+	.read_file_info		= mem_read_file_info,
+	.write_file_info	= mem_write_file_info,
+	.free_file_info		= mem_free_file_info,
+	.read_dqblk		= mem_read_dquot,
+	.commit_dqblk		= mem_write_dquot,
+	.release_dqblk		= mem_release_dquot,
+	.get_next_id		= mem_get_next_id,
+};
+
+static struct quota_format_type mem_quota_format = {
+	.qf_fmt_id	= QFMT_MEM_ONLY,
+	.qf_ops		= &mem_format_ops,
+	.qf_owner	= THIS_MODULE
+};
+
+static int __init init_mem_quota_format(void)
+{
+	return register_quota_format(&mem_quota_format);
+}
+
+static void __exit exit_mem_quota_format(void)
+{
+	unregister_quota_format(&mem_quota_format);
+}
+
+module_init(init_mem_quota_format);
+module_exit(exit_mem_quota_format);
diff --git a/include/linux/quota.h b/include/linux/quota.h
index fd692b4a41d5..4398e05c8b72 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -285,7 +285,11 @@  static inline void dqstats_dec(unsigned int type)
 #define DQ_FAKE_B	3	/* no limits only usage */
 #define DQ_READ_B	4	/* dquot was read into memory */
 #define DQ_ACTIVE_B	5	/* dquot is active (dquot_release not called) */
-#define DQ_LASTSET_B	6	/* Following 6 bits (see QIF_) are reserved\
+#define DQ_NO_SHRINK_B	6	/* modified dquot (not DQ_FAKE_B) is never to
+				 * be released by a shrinker. It should remain
+				 * in memory until quotas are being disabled on
+				 * unmount. */
+#define DQ_LASTSET_B	7	/* Following 6 bits (see QIF_) are reserved\
 				 * for the mask of entries set via SETQUOTA\
 				 * quotactl. They are set under dq_data_lock\
 				 * and the quota format handling dquot can\
@@ -536,6 +540,7 @@  struct quota_module_name {
 	{QFMT_VFS_OLD, "quota_v1"},\
 	{QFMT_VFS_V0, "quota_v2"},\
 	{QFMT_VFS_V1, "quota_v2"},\
+	{QFMT_MEM_ONLY, "quota_mem"},\
 	{0, NULL}}
 
 #endif /* _QUOTA_ */
diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
index f17c9636a859..ee9d2bad00c7 100644
--- a/include/uapi/linux/quota.h
+++ b/include/uapi/linux/quota.h
@@ -77,6 +77,7 @@ 
 #define	QFMT_VFS_V0 2
 #define QFMT_OCFS2 3
 #define	QFMT_VFS_V1 4
+#define	QFMT_MEM_ONLY 5
 
 /* Size of block in which space limits are passed through the quota
  * interface */