diff mbox

[RFC,2/3] ceph: quotas: support for ceph.quota.max_files

Message ID 20170906141224.11813-3-lhenriques@suse.com (mailing list archive)
State New, archived
Headers show

Commit Message

Luis Henriques Sept. 6, 2017, 2:12 p.m. UTC
This patch adds support for the max_files quota.  It hooks into all the
ceph functions that add new filesystem objects that need to be checked
against the quota limit.  -EDQUOT is returned when this limit is hit.

Note that we're not checking quotas on ceph_link().  ceph_link doesn't
really create a new inode,  and since the MDS doesn't update the directory
statistics when a new (hard) link is created (only with symlinks), they
are not accounted as a new file.

Signed-off-by: Luis Henriques <lhenriques@suse.com>
---
 fs/ceph/dir.c   | 11 +++++++++++
 fs/ceph/file.c  |  4 +++-
 fs/ceph/quota.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/super.h |  1 +
 4 files changed, 57 insertions(+), 1 deletion(-)

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Yan, Zheng Sept. 7, 2017, 2:22 p.m. UTC | #1
> On 6 Sep 2017, at 22:12, Luis Henriques <lhenriques@suse.com> wrote:
> 
> This patch adds support for the max_files quota.  It hooks into all the
> ceph functions that add new filesystem objects that need to be checked
> against the quota limit.  -EDQUOT is returned when this limit is hit.
> 
> Note that we're not checking quotas on ceph_link().  ceph_link doesn't
> really create a new inode,  and since the MDS doesn't update the directory
> statistics when a new (hard) link is created (only with symlinks), they
> are not accounted as a new file.
> 
> Signed-off-by: Luis Henriques <lhenriques@suse.com>
> ---
> fs/ceph/dir.c   | 11 +++++++++++
> fs/ceph/file.c  |  4 +++-
> fs/ceph/quota.c | 42 ++++++++++++++++++++++++++++++++++++++++++
> fs/ceph/super.h |  1 +
> 4 files changed, 57 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index ef7240ace576..fb6adcf0ff51 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -815,6 +815,9 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
> 	if (ceph_snap(dir) != CEPH_NOSNAP)
> 		return -EROFS;
> 
> +	if (ceph_quota_is_quota_files_exceeded(dir))
> +		return -EDQUOT;
> +
> 	err = ceph_pre_init_acls(dir, &mode, &acls);
> 	if (err < 0)
> 		return err;
> @@ -868,6 +871,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
> 	if (ceph_snap(dir) != CEPH_NOSNAP)
> 		return -EROFS;
> 
> +	if (ceph_quota_is_quota_files_exceeded(dir))
> +		return -EDQUOT;
> +
> 	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
> 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
> 	if (IS_ERR(req)) {
> @@ -917,6 +923,11 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
> 		goto out;
> 	}
> 
> +	if (ceph_quota_is_quota_files_exceeded(dir)) {
> +		err = -EDQUOT;
> +		goto out;
> +	}
> +
> 	mode |= S_IFDIR;
> 	err = ceph_pre_init_acls(dir, &mode, &acls);
> 	if (err < 0)
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 3d48c415f3cb..708a9b841382 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -370,7 +370,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> 	struct ceph_mds_request *req;
> 	struct dentry *dn;
> 	struct ceph_acls_info acls = {};
> -       int mask;
> +	int mask;
> 	int err;
> 
> 	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
> @@ -381,6 +381,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> 		return -ENAMETOOLONG;
> 
> 	if (flags & O_CREAT) {
> +		if (ceph_quota_is_quota_files_exceeded(dir))
> +			return -EDQUOT;
> 		err = ceph_pre_init_acls(dir, &mode, &acls);
> 		if (err < 0)
> 			return err;
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index c02d73a8d167..1bd02658f16a 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -57,3 +57,45 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
> 
> 	iput(inode);
> }
> +
> +bool ceph_quota_is_quota_files_exceeded(struct inode *inode)
> +{
> +	struct ceph_inode_info *ci;
> +	struct dentry *next, *parent;
> +	u64 max_files;
> +	u64 rentries = 0;
> +	unsigned seq;
> +	bool result = false;
> +
> +	WARN_ON(!S_ISDIR(inode->i_mode));
> +
> +retry:
> +	seq = read_seqbegin(&rename_lock);
> +	ci = ceph_inode(inode);
> +	next = d_find_any_alias(inode);
> +
> +	while (true) {
> +		spin_lock(&ci->i_ceph_lock);
> +		max_files = ci->i_max_files;
> +		rentries = ci->i_rfiles + ci->i_rsubdirs;
> +		spin_unlock(&ci->i_ceph_lock);
> +
> +		if ((max_files && (rentries >= max_files)) || IS_ROOT(next))
> +			break;
> +
> +		parent = dget_parent(next);
> +		ci = ceph_inode(d_inode(parent));
> +		dput(next);
> +		next = parent;
> +	}
> +
> +	dput(next);
> +
> +	if (read_seqretry(&rename_lock, seq))
> +		goto retry;
> +
> +	if (max_files && (rentries >= max_files))
> +		result = true;

This bottom-up dentry traversal code worries me. I vaguely remember that bottom-up
dentry traversal in kernel is discouraged. Then there are multiples clients modifying
the filesystem at the same time, the rename_lock does not help. That's why user space
code Client::get_quota_root() checks dentry lease and does lookup parent. I’m not sure
if we can do the same operations in kernel, because locking is much more complex in
kernel.

For the long term, I prefer unifying quota and snapshot implementation. The inode
trace in MClientReply contains information about which quota realm the inode belongs
to. So client can find quota information easily. (This requires bigger change for both
mds and client)

Regards
Yan, Zheng



> +
> +	return result;
> +}
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 50c96ea7dc7c..ef131107dbf6 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1011,5 +1011,6 @@ extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
> extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
> 			      struct ceph_mds_session *session,
> 			      struct ceph_msg *msg);
> +extern bool ceph_quota_is_quota_files_exceeded(struct inode *inode);
> 
> #endif /* _FS_CEPH_SUPER_H */

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Luis Henriques Sept. 8, 2017, 8:33 a.m. UTC | #2
"Yan, Zheng" <zyan@redhat.com> writes:

>> On 6 Sep 2017, at 22:12, Luis Henriques <lhenriques@suse.com> wrote:
<snip>
> This bottom-up dentry traversal code worries me. I vaguely remember that bottom-up
> dentry traversal in kernel is discouraged. Then there are multiples clients modifying
> the filesystem at the same time, the rename_lock does not help. That's why user space
> code Client::get_quota_root() checks dentry lease and does lookup parent. I’m not sure
> if we can do the same operations in kernel, because locking is much more complex in
> kernel.

So, you're saying that in addition to using the rename_lock (for local
renames), that loop will also need to do something similar to what's
being done already in function ceph_d_revalidate.  I.e., it needs to
validate the lease (as in function dentry_lease_is_valid) and send a
CEPH_MDS_OP_LOOKUP if a dentry is invalid.  Or am I missing something?

> For the long term, I prefer unifying quota and snapshot implementation. The inode
> trace in MClientReply contains information about which quota realm the inode belongs
> to. So client can find quota information easily. (This requires bigger change for both
> mds and client)

My motivation for trying to bring the kernel client a bit closer to
the fuse client was that there are currently valid use-cases for this
quota implementation, with all its limitations.

Now, I completely agree that ideally the core quota implementation
should be moved to the MDS.  This would simplify the clients side,
and, above all, would remove the limitation of requiring clients
cooperation.

Obviously, I would be more than happy to help on the kernel client
side of this solution.  But I'm afraid that the real hard work would
be on the MDS code, where things such as multi-MDS and dir
fragmentation would make this solution quite complex.

Cheers,
Yan, Zheng Sept. 8, 2017, 10:43 a.m. UTC | #3
> On 8 Sep 2017, at 16:33, Luis Henriques <lhenriques@suse.com> wrote:
> 
> "Yan, Zheng" <zyan@redhat.com> writes:
> 
>>> On 6 Sep 2017, at 22:12, Luis Henriques <lhenriques@suse.com> wrote:
> <snip>
>> This bottom-up dentry traversal code worries me. I vaguely remember that bottom-up
>> dentry traversal in kernel is discouraged. Then there are multiples clients modifying
>> the filesystem at the same time, the rename_lock does not help. That's why user space
>> code Client::get_quota_root() checks dentry lease and does lookup parent. I’m not sure
>> if we can do the same operations in kernel, because locking is much more complex in
>> kernel.
> 
> So, you're saying that in addition to using the rename_lock (for local
> renames), that loop will also need to do something similar to what's
> being done already in function ceph_d_revalidate.  I.e., it needs to
> validate the lease (as in function dentry_lease_is_valid) and send a
> CEPH_MDS_OP_LOOKUP if a dentry is invalid.  Or am I missing something?

yes

> 
>> For the long term, I prefer unifying quota and snapshot implementation. The inode
>> trace in MClientReply contains information about which quota realm the inode belongs
>> to. So client can find quota information easily. (This requires bigger change for both
>> mds and client)
> 
> My motivation for trying to bring the kernel client a bit closer to
> the fuse client was that there are currently valid use-cases for this
> quota implementation, with all its limitations.
> 
> Now, I completely agree that ideally the core quota implementation
> should be moved to the MDS.  This would simplify the clients side,
> and, above all, would remove the limitation of requiring clients
> cooperation.
> 
> Obviously, I would be more than happy to help on the kernel client
> side of this solution.  But I'm afraid that the real hard work would
> be on the MDS code, where things such as multi-MDS and dir
> fragmentation would make this solution quite complex.

I will start to hammer this code soon, please wait a few days.

Regards
Yan, Zheng

> 
> Cheers,
> -- 
> Luís

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ef7240ace576..fb6adcf0ff51 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -815,6 +815,9 @@  static int ceph_mknod(struct inode *dir, struct dentry *dentry,
 	if (ceph_snap(dir) != CEPH_NOSNAP)
 		return -EROFS;
 
+	if (ceph_quota_is_quota_files_exceeded(dir))
+		return -EDQUOT;
+
 	err = ceph_pre_init_acls(dir, &mode, &acls);
 	if (err < 0)
 		return err;
@@ -868,6 +871,9 @@  static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 	if (ceph_snap(dir) != CEPH_NOSNAP)
 		return -EROFS;
 
+	if (ceph_quota_is_quota_files_exceeded(dir))
+		return -EDQUOT;
+
 	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
 	if (IS_ERR(req)) {
@@ -917,6 +923,11 @@  static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto out;
 	}
 
+	if (ceph_quota_is_quota_files_exceeded(dir)) {
+		err = -EDQUOT;
+		goto out;
+	}
+
 	mode |= S_IFDIR;
 	err = ceph_pre_init_acls(dir, &mode, &acls);
 	if (err < 0)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3d48c415f3cb..708a9b841382 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -370,7 +370,7 @@  int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	struct ceph_mds_request *req;
 	struct dentry *dn;
 	struct ceph_acls_info acls = {};
-       int mask;
+	int mask;
 	int err;
 
 	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -381,6 +381,8 @@  int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		return -ENAMETOOLONG;
 
 	if (flags & O_CREAT) {
+		if (ceph_quota_is_quota_files_exceeded(dir))
+			return -EDQUOT;
 		err = ceph_pre_init_acls(dir, &mode, &acls);
 		if (err < 0)
 			return err;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index c02d73a8d167..1bd02658f16a 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -57,3 +57,45 @@  void ceph_handle_quota(struct ceph_mds_client *mdsc,
 
 	iput(inode);
 }
+
+bool ceph_quota_is_quota_files_exceeded(struct inode *inode)
+{
+	struct ceph_inode_info *ci;
+	struct dentry *next, *parent;
+	u64 max_files;
+	u64 rentries = 0;
+	unsigned seq;
+	bool result = false;
+
+	WARN_ON(!S_ISDIR(inode->i_mode));
+
+retry:
+	seq = read_seqbegin(&rename_lock);
+	ci = ceph_inode(inode);
+	next = d_find_any_alias(inode);
+
+	while (true) {
+		spin_lock(&ci->i_ceph_lock);
+		max_files = ci->i_max_files;
+		rentries = ci->i_rfiles + ci->i_rsubdirs;
+		spin_unlock(&ci->i_ceph_lock);
+
+		if ((max_files && (rentries >= max_files)) || IS_ROOT(next))
+			break;
+
+		parent = dget_parent(next);
+		ci = ceph_inode(d_inode(parent));
+		dput(next);
+		next = parent;
+	}
+
+	dput(next);
+
+	if (read_seqretry(&rename_lock, seq))
+		goto retry;
+
+	if (max_files && (rentries >= max_files))
+		result = true;
+
+	return result;
+}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 50c96ea7dc7c..ef131107dbf6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1011,5 +1011,6 @@  extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
 extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
 			      struct ceph_mds_session *session,
 			      struct ceph_msg *msg);
+extern bool ceph_quota_is_quota_files_exceeded(struct inode *inode);
 
 #endif /* _FS_CEPH_SUPER_H */