diff mbox series

[v2,2/2] ceph: quota: fix quota subdir mounts

Message ID 20190312142019.30936-3-lhenriques@suse.com (mailing list archive)
State New, archived
Headers show
Series fix quota subdir mounts | expand

Commit Message

Luis Henriques March 12, 2019, 2:20 p.m. UTC
The CephFS kernel client does not enforce quotas set in a directory that
isn't visible from the mount point.  For example, given the path
'/dir1/dir2', if quotas are set in 'dir1' and the filesystem is mounted with

  mount -t ceph <server>:<port>:/dir1/ /mnt

then the client won't be able to access 'dir1' inode, even if 'dir2' belongs
to a quota realm that points to it.

This patch fixes this issue by simply doing an MDS LOOKUPINO operation for
unknown inodes.  Any inode reference obtained this way will be added to a
list in ceph_mds_client, and will only be released when the filesystem is
umounted.

Link: https://tracker.ceph.com/issues/38482
Reported-by: Hendrik Peyerl <hpeyerl@plusline.net>
Signed-off-by: Luis Henriques <lhenriques@suse.com>
---
 fs/ceph/mds_client.c |  15 ++++++
 fs/ceph/mds_client.h |   2 +
 fs/ceph/quota.c      | 106 +++++++++++++++++++++++++++++++++++++++----
 fs/ceph/super.h      |   2 +
 4 files changed, 115 insertions(+), 10 deletions(-)

Comments

Yan, Zheng March 18, 2019, 9:01 a.m. UTC | #1
On Tue, Mar 12, 2019 at 10:22 PM Luis Henriques <lhenriques@suse.com> wrote:
>
> The CephFS kernel client does not enforce quotas set in a directory that
> isn't visible from the mount point.  For example, given the path
> '/dir1/dir2', if quotas are set in 'dir1' and the filesystem is mounted with
>
>   mount -t ceph <server>:<port>:/dir1/ /mnt
>
> then the client won't be able to access 'dir1' inode, even if 'dir2' belongs
> to a quota realm that points to it.
>
> This patch fixes this issue by simply doing an MDS LOOKUPINO operation for
> unknown inodes.  Any inode reference obtained this way will be added to a
> list in ceph_mds_client, and will only be released when the filesystem is
> umounted.
>
> Link: https://tracker.ceph.com/issues/38482
> Reported-by: Hendrik Peyerl <hpeyerl@plusline.net>
> Signed-off-by: Luis Henriques <lhenriques@suse.com>
> ---
>  fs/ceph/mds_client.c |  15 ++++++
>  fs/ceph/mds_client.h |   2 +
>  fs/ceph/quota.c      | 106 +++++++++++++++++++++++++++++++++++++++----
>  fs/ceph/super.h      |   2 +
>  4 files changed, 115 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 163fc74bf221..1dc24c3525fe 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -3656,6 +3656,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>         mdsc->max_sessions = 0;
>         mdsc->stopping = 0;
>         atomic64_set(&mdsc->quotarealms_count, 0);
> +       INIT_LIST_HEAD(&mdsc->quotarealms_inodes_list);
> +       spin_lock_init(&mdsc->quotarealms_inodes_lock);
>         mdsc->last_snap_seq = 0;
>         init_rwsem(&mdsc->snap_rwsem);
>         mdsc->snap_realms = RB_ROOT;
> @@ -3726,6 +3728,8 @@ static void wait_requests(struct ceph_mds_client *mdsc)
>   */
>  void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>  {
> +       struct ceph_inode_info *ci;
> +
>         dout("pre_umount\n");
>         mdsc->stopping = 1;
>
> @@ -3738,6 +3742,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>          * their inode/dcache refs
>          */
>         ceph_msgr_flush();
> +       /*
> +        * It's now safe to clean quotarealms_inode_list without holding
> +        * mdsc->quotarealms_inodes_lock
> +        */
> +       while (!list_empty(&mdsc->quotarealms_inodes_list)) {
> +               ci = list_first_entry(&mdsc->quotarealms_inodes_list,
> +                                     struct ceph_inode_info,
> +                                     i_quotarealms_inode_item);
> +               list_del(&ci->i_quotarealms_inode_item);
> +               iput(&ci->vfs_inode);
> +       }
>  }
>
>  /*
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 729da155ebf0..58968fb338ec 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -329,6 +329,8 @@ struct ceph_mds_client {
>         int                     stopping;      /* true if shutting down */
>
>         atomic64_t              quotarealms_count; /* # realms with quota */
> +       struct list_head        quotarealms_inodes_list;
> +       spinlock_t              quotarealms_inodes_lock;
>
>         /*
>          * snap_rwsem will cover cap linkage into snaprealms, and
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 9455d3aef0c3..d1ab1b331c0d 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -22,7 +22,16 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
>  static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> -       return atomic64_read(&mdsc->quotarealms_count) > 0;
> +       struct super_block *sb = mdsc->fsc->sb;
> +
> +       if (atomic64_read(&mdsc->quotarealms_count) > 0)
> +               return true;
> +       /* if root is the real CephFS root, we don't have quota realms */
> +       if (sb->s_root->d_inode &&
> +           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +               return false;
> +       /* otherwise, we can't know for sure */
> +       return true;
>  }
>
>  void ceph_handle_quota(struct ceph_mds_client *mdsc,
> @@ -68,6 +77,37 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
>         iput(inode);
>  }
>
> +/*
> + * This function will try to lookup a realm inode.  If the inode is found
> + * (through an MDS LOOKUPINO operation), the realm->inode will be updated and
> + * the inode will also be added to an mdsc list which will be freed only when
> + * the filesystem is umounted.
> + */
> +static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
> +                                            struct super_block *sb,
> +                                            struct ceph_snap_realm *realm)
> +{
> +       struct inode *in;
> +
> +       in = ceph_lookup_inode(sb, realm->ino);
> +       if (IS_ERR(in)) {
> +               pr_warn("Can't lookup inode %llx (err: %ld)\n",
> +                       realm->ino, PTR_ERR(in));
> +               return in;
> +       }
> +
> +       spin_lock(&mdsc->quotarealms_inodes_lock);
> +       list_add(&ceph_inode(in)->i_quotarealms_inode_item,
> +                &mdsc->quotarealms_inodes_list);
> +       spin_unlock(&mdsc->quotarealms_inodes_lock);
> +
> +       spin_lock(&realm->inodes_with_caps_lock);
> +       realm->inode = in;
> +       spin_unlock(&realm->inodes_with_caps_lock);
> +
> +       return in;
> +}
> +
>  /*
>   * This function walks through the snaprealm for an inode and returns the
>   * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
> @@ -76,9 +116,15 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
>   *
>   * Note that the caller is responsible for calling ceph_put_snap_realm() on the
>   * returned realm.
> + *
> + * Callers of this function need to hold mdsc->snap_rwsem.  However, if there's
> + * a need to do an inode lookup, this rwsem will be temporarily dropped.  Hence
> + * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false'
> + * this function will return -EAGAIN; otherwise, the snaprealms walk-through
> + * will be restarted.
>   */
>  static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
> -                                              struct inode *inode)
> +                                              struct inode *inode, bool retry)
>  {
>         struct ceph_inode_info *ci = NULL;
>         struct ceph_snap_realm *realm, *next;
> @@ -88,6 +134,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
>         if (ceph_snap(inode) != CEPH_NOSNAP)
>                 return NULL;
>
> +restart:
>         realm = ceph_inode(inode)->i_snap_realm;
>         if (realm)
>                 ceph_get_snap_realm(mdsc, realm);
> @@ -95,11 +142,25 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
>                 pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
>                                    "null i_snap_realm\n", ceph_vinop(inode));
>         while (realm) {
> +               bool has_inode;
> +
>                 spin_lock(&realm->inodes_with_caps_lock);
> -               in = realm->inode ? igrab(realm->inode) : NULL;
> +               has_inode = realm->inode;
> +               in = has_inode ? igrab(realm->inode) : NULL;
>                 spin_unlock(&realm->inodes_with_caps_lock);
> -               if (!in)
> +               if (has_inode && !in)
>                         break;
> +               if (!in) {
> +                       up_read(&mdsc->snap_rwsem);
> +                       in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
> +                       down_read(&mdsc->snap_rwsem);
> +                       if (IS_ERR(in))
> +                               break;
> +                       ceph_put_snap_realm(mdsc, realm);
> +                       if (!retry)
> +                               return ERR_PTR(-EAGAIN);
> +                       goto restart;
> +               }
>
>                 ci = ceph_inode(in);
>                 has_quota = __ceph_has_any_quota(ci);
> @@ -125,9 +186,22 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
>         struct ceph_snap_realm *old_realm, *new_realm;
>         bool is_same;
>
> +restart:
> +       /*
> +        * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem.
> +        * However, get_quota_realm may drop it temporarily.  By setting the
> +        * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
> +        * dropped and we can then restart the whole operation.
> +        */
>         down_read(&mdsc->snap_rwsem);
> -       old_realm = get_quota_realm(mdsc, old);
> -       new_realm = get_quota_realm(mdsc, new);
> +       old_realm = get_quota_realm(mdsc, old, true);
> +       new_realm = get_quota_realm(mdsc, new, false);
> +       if (PTR_ERR(new_realm) == -EAGAIN) {
> +               up_read(&mdsc->snap_rwsem);
> +               if (old_realm)
> +                       ceph_put_snap_realm(mdsc, old_realm);
> +               goto restart;
> +       }
>         is_same = (old_realm == new_realm);
>         up_read(&mdsc->snap_rwsem);
>
> @@ -166,6 +240,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
>                 return false;
>
>         down_read(&mdsc->snap_rwsem);
> +restart:
>         realm = ceph_inode(inode)->i_snap_realm;
>         if (realm)
>                 ceph_get_snap_realm(mdsc, realm);
> @@ -173,12 +248,23 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
>                 pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
>                                    "null i_snap_realm\n", ceph_vinop(inode));
>         while (realm) {
> +               bool has_inode;
> +
>                 spin_lock(&realm->inodes_with_caps_lock);
> -               in = realm->inode ? igrab(realm->inode) : NULL;
> +               has_inode = realm->inode;
> +               in = has_inode ? igrab(realm->inode) : NULL;
>                 spin_unlock(&realm->inodes_with_caps_lock);
> -               if (!in)
> +               if (has_inode && !in)
>                         break;
> -
> +               if (!in) {
> +                       up_read(&mdsc->snap_rwsem);
> +                       in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
> +                       down_read(&mdsc->snap_rwsem);
> +                       if (IS_ERR(in))
> +                               break;
> +                       ceph_put_snap_realm(mdsc, realm);
> +                       goto restart;
> +               }
>                 ci = ceph_inode(in);
>                 spin_lock(&ci->i_ceph_lock);
>                 if (op == QUOTA_CHECK_MAX_FILES_OP) {
> @@ -314,7 +400,7 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
>         bool is_updated = false;
>
>         down_read(&mdsc->snap_rwsem);
> -       realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
> +       realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
>         up_read(&mdsc->snap_rwsem);
>         if (!realm)
>                 return false;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index ce51e98b08ec..cc7766aeb73b 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -375,6 +375,8 @@ struct ceph_inode_info {
>         struct list_head i_snap_realm_item;
>         struct list_head i_snap_flush_item;
>
> +       struct list_head i_quotarealms_inode_item;
> +
>         struct work_struct i_wb_work;  /* writeback work */
>         struct work_struct i_pg_inv_work;  /* page invalidation work */
>

After reading the code carefully. I feel a little uncomfortable with
the "lookup_ino" in get_quota_realm.  how about populating directories
above the 'mount subdir' during mounting (similar to cifs_get_root ).

Regards
Yan, Zheng
Gregory Farnum March 18, 2019, 9:05 a.m. UTC | #2
On Mon, Mar 18, 2019 at 2:32 PM Yan, Zheng <ukernel@gmail.com> wrote:
> After reading the code carefully. I feel a little uncomfortable with
> the "lookup_ino" in get_quota_realm.  how about populating directories
> above the 'mount subdir' during mounting (similar to cifs_get_root ).

Isn't that going to be a problem for any clients which have restricted
filesystem access permissions? They may not be able to see all the
directories above their mount point.
-Greg
Yan, Zheng March 18, 2019, 9:12 a.m. UTC | #3
On Mon, Mar 18, 2019 at 5:06 PM Gregory Farnum <gfarnum@redhat.com> wrote:
>
> On Mon, Mar 18, 2019 at 2:32 PM Yan, Zheng <ukernel@gmail.com> wrote:
> > After reading the code carefully. I feel a little uncomfortable with
> > the "lookup_ino" in get_quota_realm.  how about populating directories
> > above the 'mount subdir' during mounting (similar to cifs_get_root ).
>
> Isn't that going to be a problem for any clients which have restricted
> filesystem access permissions? They may not be able to see all the
> directories above their mount point.
> -Greg

using lookup_ino to get inode above the "mount subdir" has the same problem
Luis Henriques March 18, 2019, 10:55 a.m. UTC | #4
"Yan, Zheng" <ukernel@gmail.com> writes:

> On Mon, Mar 18, 2019 at 5:06 PM Gregory Farnum <gfarnum@redhat.com> wrote:
>>
>> On Mon, Mar 18, 2019 at 2:32 PM Yan, Zheng <ukernel@gmail.com> wrote:
>> > After reading the code carefully. I feel a little uncomfortable with
>> > the "lookup_ino" in get_quota_realm.  how about populating directories
>> > above the 'mount subdir' during mounting (similar to cifs_get_root ).

Wouldn't it be a problem if the directory layout (or, in this case, the
snaprealm layout) change during the mount lifetime?  In that case we
would need to do this lookup anyway.

>>
>> Isn't that going to be a problem for any clients which have
>>restricted filesystem access permissions? They may not be able to see
>>all the directories above their mount point.  -Greg
>
> using lookup_ino to get inode above the "mount subdir" has the same problem
>

In this case I believe we get an -EPERM from the MDS.  And then the
client simply falls back to the 'default' behaviour, which is to allow
the user to create/write to files as if there were no quotas set.

Cheers,
Gregory Farnum March 18, 2019, 11:19 a.m. UTC | #5
On Mon, Mar 18, 2019 at 4:25 PM Luis Henriques <lhenriques@suse.com> wrote:
>
> "Yan, Zheng" <ukernel@gmail.com> writes:
>
> > On Mon, Mar 18, 2019 at 5:06 PM Gregory Farnum <gfarnum@redhat.com> wrote:
> >>
> >> On Mon, Mar 18, 2019 at 2:32 PM Yan, Zheng <ukernel@gmail.com> wrote:
> >> > After reading the code carefully. I feel a little uncomfortable with
> >> > the "lookup_ino" in get_quota_realm.  how about populating directories
> >> > above the 'mount subdir' during mounting (similar to cifs_get_root ).
>
> Wouldn't it be a problem if the directory layout (or, in this case, the
> snaprealm layout) change during the mount lifetime?  In that case we
> would need to do this lookup anyway.
>
> >>
> >> Isn't that going to be a problem for any clients which have
> >>restricted filesystem access permissions? They may not be able to see
> >>all the directories above their mount point.  -Greg
> >
> > using lookup_ino to get inode above the "mount subdir" has the same problem
> >
>
> In this case I believe we get an -EPERM from the MDS.  And then the
> client simply falls back to the 'default' behaviour, which is to allow
> the user to create/write to files as if there were no quotas set.

Right, but a client is a lot more likely to have access to
/home/<username>/ than they are to also be able to look at /, and
/home.

So if they can do a lookup on /home/<username>/ they get their quota,
but if they have to look it up from the root it's lost even though the
client has permission to see all the things that contain their quota
state... :/

>
> Cheers,
> --
> Luis
Yan, Zheng March 18, 2019, 12:55 p.m. UTC | #6
On Mon, Mar 18, 2019 at 6:55 PM Luis Henriques <lhenriques@suse.com> wrote:
>
> "Yan, Zheng" <ukernel@gmail.com> writes:
>
> > On Mon, Mar 18, 2019 at 5:06 PM Gregory Farnum <gfarnum@redhat.com> wrote:
> >>
> >> On Mon, Mar 18, 2019 at 2:32 PM Yan, Zheng <ukernel@gmail.com> wrote:
> >> > After reading the code carefully. I feel a little uncomfortable with
> >> > the "lookup_ino" in get_quota_realm.  how about populating directories
> >> > above the 'mount subdir' during mounting (similar to cifs_get_root ).
>
> Wouldn't it be a problem if the directory layout (or, in this case, the
> snaprealm layout) change during the mount lifetime?  In that case we
> would need to do this lookup anyway.
>

right

> >>
> >> Isn't that going to be a problem for any clients which have
> >>restricted filesystem access permissions? They may not be able to see
> >>all the directories above their mount point.  -Greg
> >
> > using lookup_ino to get inode above the "mount subdir" has the same problem
> >
>
> In this case I believe we get an -EPERM from the MDS.  And then the
> client simply falls back to the 'default' behaviour, which is to allow
> the user to create/write to files as if there were no quotas set.
>

fair enough

> Cheers,
> --
> Luis
Yan, Zheng March 18, 2019, 1:06 p.m. UTC | #7
On Tue, Mar 12, 2019 at 10:22 PM Luis Henriques <lhenriques@suse.com> wrote:
>
> The CephFS kernel client does not enforce quotas set in a directory that
> isn't visible from the mount point.  For example, given the path
> '/dir1/dir2', if quotas are set in 'dir1' and the filesystem is mounted with
>
>   mount -t ceph <server>:<port>:/dir1/ /mnt
>
> then the client won't be able to access 'dir1' inode, even if 'dir2' belongs
> to a quota realm that points to it.
>
> This patch fixes this issue by simply doing an MDS LOOKUPINO operation for
> unknown inodes.  Any inode reference obtained this way will be added to a
> list in ceph_mds_client, and will only be released when the filesystem is
> umounted.
>
> Link: https://tracker.ceph.com/issues/38482
> Reported-by: Hendrik Peyerl <hpeyerl@plusline.net>
> Signed-off-by: Luis Henriques <lhenriques@suse.com>
> ---
>  fs/ceph/mds_client.c |  15 ++++++
>  fs/ceph/mds_client.h |   2 +
>  fs/ceph/quota.c      | 106 +++++++++++++++++++++++++++++++++++++++----
>  fs/ceph/super.h      |   2 +
>  4 files changed, 115 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 163fc74bf221..1dc24c3525fe 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -3656,6 +3656,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>         mdsc->max_sessions = 0;
>         mdsc->stopping = 0;
>         atomic64_set(&mdsc->quotarealms_count, 0);
> +       INIT_LIST_HEAD(&mdsc->quotarealms_inodes_list);
> +       spin_lock_init(&mdsc->quotarealms_inodes_lock);
>         mdsc->last_snap_seq = 0;
>         init_rwsem(&mdsc->snap_rwsem);
>         mdsc->snap_realms = RB_ROOT;
> @@ -3726,6 +3728,8 @@ static void wait_requests(struct ceph_mds_client *mdsc)
>   */
>  void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>  {
> +       struct ceph_inode_info *ci;
> +
>         dout("pre_umount\n");
>         mdsc->stopping = 1;
>
> @@ -3738,6 +3742,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>          * their inode/dcache refs
>          */
>         ceph_msgr_flush();
> +       /*
> +        * It's now safe to clean quotarealms_inode_list without holding
> +        * mdsc->quotarealms_inodes_lock
> +        */
> +       while (!list_empty(&mdsc->quotarealms_inodes_list)) {
> +               ci = list_first_entry(&mdsc->quotarealms_inodes_list,
> +                                     struct ceph_inode_info,
> +                                     i_quotarealms_inode_item);
> +               list_del(&ci->i_quotarealms_inode_item);
> +               iput(&ci->vfs_inode);
> +       }
>  }
>
>  /*
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 729da155ebf0..58968fb338ec 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -329,6 +329,8 @@ struct ceph_mds_client {
>         int                     stopping;      /* true if shutting down */
>
>         atomic64_t              quotarealms_count; /* # realms with quota */
> +       struct list_head        quotarealms_inodes_list;
> +       spinlock_t              quotarealms_inodes_lock;
>
>         /*
>          * snap_rwsem will cover cap linkage into snaprealms, and
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 9455d3aef0c3..d1ab1b331c0d 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -22,7 +22,16 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
>  static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> -       return atomic64_read(&mdsc->quotarealms_count) > 0;
> +       struct super_block *sb = mdsc->fsc->sb;
> +
> +       if (atomic64_read(&mdsc->quotarealms_count) > 0)
> +               return true;
> +       /* if root is the real CephFS root, we don't have quota realms */
> +       if (sb->s_root->d_inode &&
> +           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +               return false;
> +       /* otherwise, we can't know for sure */
> +       return true;
>  }
>
>  void ceph_handle_quota(struct ceph_mds_client *mdsc,
> @@ -68,6 +77,37 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
>         iput(inode);
>  }
>
> +/*
> + * This function will try to lookup a realm inode.  If the inode is found
> + * (through an MDS LOOKUPINO operation), the realm->inode will be updated and
> + * the inode will also be added to an mdsc list which will be freed only when
> + * the filesystem is umounted.
> + */
> +static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
> +                                            struct super_block *sb,
> +                                            struct ceph_snap_realm *realm)
> +{
> +       struct inode *in;
> +
> +       in = ceph_lookup_inode(sb, realm->ino);
> +       if (IS_ERR(in)) {
> +               pr_warn("Can't lookup inode %llx (err: %ld)\n",
> +                       realm->ino, PTR_ERR(in));
> +               return in;
> +       }
> +
> +       spin_lock(&mdsc->quotarealms_inodes_lock);
> +       list_add(&ceph_inode(in)->i_quotarealms_inode_item,
> +                &mdsc->quotarealms_inodes_list);
> +       spin_unlock(&mdsc->quotarealms_inodes_lock);
> +
Multiple threads can call this function for the same inode at the same
time. need to handle this. Besides, client needs to record lookupino
error. Otherwise, client may repeatedly send useless request.


> +       spin_lock(&realm->inodes_with_caps_lock);
> +       realm->inode = in;

reply of lookup_ino should alreadly set realm->inode

> +       spin_unlock(&realm->inodes_with_caps_lock);
> +
> +       return in;
> +}
> +
>  /*
>   * This function walks through the snaprealm for an inode and returns the
>   * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
> @@ -76,9 +116,15 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
>   *
>   * Note that the caller is responsible for calling ceph_put_snap_realm() on the
>   * returned realm.
> + *
> + * Callers of this function need to hold mdsc->snap_rwsem.  However, if there's
> + * a need to do an inode lookup, this rwsem will be temporarily dropped.  Hence
> + * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false'
> + * this function will return -EAGAIN; otherwise, the snaprealms walk-through
> + * will be restarted.
>   */
>  static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
> -                                              struct inode *inode)
> +                                              struct inode *inode, bool retry)
>  {
>         struct ceph_inode_info *ci = NULL;
>         struct ceph_snap_realm *realm, *next;
> @@ -88,6 +134,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
>         if (ceph_snap(inode) != CEPH_NOSNAP)
>                 return NULL;
>
> +restart:
>         realm = ceph_inode(inode)->i_snap_realm;
>         if (realm)
>                 ceph_get_snap_realm(mdsc, realm);
> @@ -95,11 +142,25 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
>                 pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
>                                    "null i_snap_realm\n", ceph_vinop(inode));
>         while (realm) {
> +               bool has_inode;
> +
>                 spin_lock(&realm->inodes_with_caps_lock);
> -               in = realm->inode ? igrab(realm->inode) : NULL;
> +               has_inode = realm->inode;
> +               in = has_inode ? igrab(realm->inode) : NULL;
>                 spin_unlock(&realm->inodes_with_caps_lock);
> -               if (!in)
> +               if (has_inode && !in)
>                         break;
> +               if (!in) {
> +                       up_read(&mdsc->snap_rwsem);
> +                       in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
> +                       down_read(&mdsc->snap_rwsem);
> +                       if (IS_ERR(in))
> +                               break;
> +                       ceph_put_snap_realm(mdsc, realm);
> +                       if (!retry)
> +                               return ERR_PTR(-EAGAIN);
> +                       goto restart;
> +               }
>
>                 ci = ceph_inode(in);
>                 has_quota = __ceph_has_any_quota(ci);
> @@ -125,9 +186,22 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
>         struct ceph_snap_realm *old_realm, *new_realm;
>         bool is_same;
>
> +restart:
> +       /*
> +        * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem.
> +        * However, get_quota_realm may drop it temporarily.  By setting the
> +        * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
> +        * dropped and we can then restart the whole operation.
> +        */
>         down_read(&mdsc->snap_rwsem);
> -       old_realm = get_quota_realm(mdsc, old);
> -       new_realm = get_quota_realm(mdsc, new);
> +       old_realm = get_quota_realm(mdsc, old, true);
> +       new_realm = get_quota_realm(mdsc, new, false);
> +       if (PTR_ERR(new_realm) == -EAGAIN) {
> +               up_read(&mdsc->snap_rwsem);
> +               if (old_realm)
> +                       ceph_put_snap_realm(mdsc, old_realm);
> +               goto restart;
> +       }
>         is_same = (old_realm == new_realm);
>         up_read(&mdsc->snap_rwsem);
>
> @@ -166,6 +240,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
>                 return false;
>
>         down_read(&mdsc->snap_rwsem);
> +restart:
>         realm = ceph_inode(inode)->i_snap_realm;
>         if (realm)
>                 ceph_get_snap_realm(mdsc, realm);
> @@ -173,12 +248,23 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
>                 pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
>                                    "null i_snap_realm\n", ceph_vinop(inode));
>         while (realm) {
> +               bool has_inode;
> +
>                 spin_lock(&realm->inodes_with_caps_lock);
> -               in = realm->inode ? igrab(realm->inode) : NULL;
> +               has_inode = realm->inode;
> +               in = has_inode ? igrab(realm->inode) : NULL;
>                 spin_unlock(&realm->inodes_with_caps_lock);
> -               if (!in)
> +               if (has_inode && !in)
>                         break;
> -
> +               if (!in) {
> +                       up_read(&mdsc->snap_rwsem);
> +                       in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
> +                       down_read(&mdsc->snap_rwsem);
> +                       if (IS_ERR(in))
> +                               break;
> +                       ceph_put_snap_realm(mdsc, realm);
> +                       goto restart;
> +               }
>                 ci = ceph_inode(in);
>                 spin_lock(&ci->i_ceph_lock);
>                 if (op == QUOTA_CHECK_MAX_FILES_OP) {
> @@ -314,7 +400,7 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
>         bool is_updated = false;
>
>         down_read(&mdsc->snap_rwsem);
> -       realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
> +       realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
>         up_read(&mdsc->snap_rwsem);
>         if (!realm)
>                 return false;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index ce51e98b08ec..cc7766aeb73b 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -375,6 +375,8 @@ struct ceph_inode_info {
>         struct list_head i_snap_realm_item;
>         struct list_head i_snap_flush_item;
>
> +       struct list_head i_quotarealms_inode_item;
> +
>         struct work_struct i_wb_work;  /* writeback work */
>         struct work_struct i_pg_inv_work;  /* page invalidation work */
>
Luis Henriques March 19, 2019, 4:42 p.m. UTC | #8
"Yan, Zheng" <ukernel@gmail.com> writes:

> On Tue, Mar 12, 2019 at 10:22 PM Luis Henriques <lhenriques@suse.com> wrote:
...
>> +static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
>> +                                            struct super_block *sb,
>> +                                            struct ceph_snap_realm *realm)
>> +{
>> +       struct inode *in;
>> +
>> +       in = ceph_lookup_inode(sb, realm->ino);
>> +       if (IS_ERR(in)) {
>> +               pr_warn("Can't lookup inode %llx (err: %ld)\n",
>> +                       realm->ino, PTR_ERR(in));
>> +               return in;
>> +       }
>> +
>> +       spin_lock(&mdsc->quotarealms_inodes_lock);
>> +       list_add(&ceph_inode(in)->i_quotarealms_inode_item,
>> +                &mdsc->quotarealms_inodes_list);
>> +       spin_unlock(&mdsc->quotarealms_inodes_lock);
>> +
> Multiple threads can call this function for the same inode at the same
> time. need to handle this. Besides, client needs to record lookupino
> error. Otherwise, client may repeatedly send useless request.

Good point.  So, the only way I see to fix this is to drop the
mdsc->quotarealms_inodes_list and instead use an ordered list/tree of
structs that would either point to the corresponding ceph inode or to
NULL if there was an error in the lookup:

	struct ceph_realm_inode {
                u64 ino;
		struct ceph_inode_info *ci;
		spinlock_t lock;
		unsigned long timeout;
	}

The 'timeout' field would be used to try to do the lookup again if the
error occurred long time ago.

The code would then create a new struct for the realm->ino (if one is
not found in the mdsc list), lock it and do the lookupino; if there's a
struct already on the list, it either means there's a lookupino in
progress or there was an error in the last lookup.

This sounds overly complicated so I may be missing the obvious simple
fix.  Any ideas?

>> +       spin_lock(&realm->inodes_with_caps_lock);
>> +       realm->inode = in;
>
> reply of lookup_ino should alreadly set realm->inode

Yes, of course.  This was silly.

Cheers,
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 163fc74bf221..1dc24c3525fe 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3656,6 +3656,8 @@  int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	mdsc->max_sessions = 0;
 	mdsc->stopping = 0;
 	atomic64_set(&mdsc->quotarealms_count, 0);
+	INIT_LIST_HEAD(&mdsc->quotarealms_inodes_list);
+	spin_lock_init(&mdsc->quotarealms_inodes_lock);
 	mdsc->last_snap_seq = 0;
 	init_rwsem(&mdsc->snap_rwsem);
 	mdsc->snap_realms = RB_ROOT;
@@ -3726,6 +3728,8 @@  static void wait_requests(struct ceph_mds_client *mdsc)
  */
 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 {
+	struct ceph_inode_info *ci;
+
 	dout("pre_umount\n");
 	mdsc->stopping = 1;
 
@@ -3738,6 +3742,17 @@  void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	 * their inode/dcache refs
 	 */
 	ceph_msgr_flush();
+	/*
+	 * It's now safe to clean quotarealms_inode_list without holding
+	 * mdsc->quotarealms_inodes_lock
+	 */
+	while (!list_empty(&mdsc->quotarealms_inodes_list)) {
+		ci = list_first_entry(&mdsc->quotarealms_inodes_list,
+				      struct ceph_inode_info,
+				      i_quotarealms_inode_item);
+		list_del(&ci->i_quotarealms_inode_item);
+		iput(&ci->vfs_inode);
+	}
 }
 
 /*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 729da155ebf0..58968fb338ec 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -329,6 +329,8 @@  struct ceph_mds_client {
 	int                     stopping;      /* true if shutting down */
 
 	atomic64_t		quotarealms_count; /* # realms with quota */
+	struct list_head	quotarealms_inodes_list;
+	spinlock_t		quotarealms_inodes_lock;
 
 	/*
 	 * snap_rwsem will cover cap linkage into snaprealms, and
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 9455d3aef0c3..d1ab1b331c0d 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -22,7 +22,16 @@  void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
 static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-	return atomic64_read(&mdsc->quotarealms_count) > 0;
+	struct super_block *sb = mdsc->fsc->sb;
+
+	if (atomic64_read(&mdsc->quotarealms_count) > 0)
+		return true;
+	/* if root is the real CephFS root, we don't have quota realms */
+	if (sb->s_root->d_inode &&
+	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+		return false;
+	/* otherwise, we can't know for sure */
+	return true;
 }
 
 void ceph_handle_quota(struct ceph_mds_client *mdsc,
@@ -68,6 +77,37 @@  void ceph_handle_quota(struct ceph_mds_client *mdsc,
 	iput(inode);
 }
 
+/*
+ * This function will try to lookup a realm inode.  If the inode is found
+ * (through an MDS LOOKUPINO operation), the realm->inode will be updated and
+ * the inode will also be added to an mdsc list which will be freed only when
+ * the filesystem is umounted.
+ */
+static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
+					     struct super_block *sb,
+					     struct ceph_snap_realm *realm)
+{
+	struct inode *in;
+
+	in = ceph_lookup_inode(sb, realm->ino);
+	if (IS_ERR(in)) {
+		pr_warn("Can't lookup inode %llx (err: %ld)\n",
+			realm->ino, PTR_ERR(in));
+		return in;
+	}
+
+	spin_lock(&mdsc->quotarealms_inodes_lock);
+	list_add(&ceph_inode(in)->i_quotarealms_inode_item,
+		 &mdsc->quotarealms_inodes_list);
+	spin_unlock(&mdsc->quotarealms_inodes_lock);
+
+	spin_lock(&realm->inodes_with_caps_lock);
+	realm->inode = in;
+	spin_unlock(&realm->inodes_with_caps_lock);
+
+	return in;
+}
+
 /*
  * This function walks through the snaprealm for an inode and returns the
  * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
@@ -76,9 +116,15 @@  void ceph_handle_quota(struct ceph_mds_client *mdsc,
  *
  * Note that the caller is responsible for calling ceph_put_snap_realm() on the
  * returned realm.
+ *
+ * Callers of this function need to hold mdsc->snap_rwsem.  However, if there's
+ * a need to do an inode lookup, this rwsem will be temporarily dropped.  Hence
+ * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false'
+ * this function will return -EAGAIN; otherwise, the snaprealms walk-through
+ * will be restarted.
  */
 static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
-					       struct inode *inode)
+					       struct inode *inode, bool retry)
 {
 	struct ceph_inode_info *ci = NULL;
 	struct ceph_snap_realm *realm, *next;
@@ -88,6 +134,7 @@  static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return NULL;
 
+restart:
 	realm = ceph_inode(inode)->i_snap_realm;
 	if (realm)
 		ceph_get_snap_realm(mdsc, realm);
@@ -95,11 +142,25 @@  static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
 		pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
 				   "null i_snap_realm\n", ceph_vinop(inode));
 	while (realm) {
+		bool has_inode;
+
 		spin_lock(&realm->inodes_with_caps_lock);
-		in = realm->inode ? igrab(realm->inode) : NULL;
+		has_inode = realm->inode;
+		in = has_inode ? igrab(realm->inode) : NULL;
 		spin_unlock(&realm->inodes_with_caps_lock);
-		if (!in)
+		if (has_inode && !in)
 			break;
+		if (!in) {
+			up_read(&mdsc->snap_rwsem);
+			in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+			down_read(&mdsc->snap_rwsem);
+			if (IS_ERR(in))
+				break;
+			ceph_put_snap_realm(mdsc, realm);
+			if (!retry)
+				return ERR_PTR(-EAGAIN);
+			goto restart;
+		}
 
 		ci = ceph_inode(in);
 		has_quota = __ceph_has_any_quota(ci);
@@ -125,9 +186,22 @@  bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
 	struct ceph_snap_realm *old_realm, *new_realm;
 	bool is_same;
 
+restart:
+	/*
+	 * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem.
+	 * However, get_quota_realm may drop it temporarily.  By setting the
+	 * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
+	 * dropped and we can then restart the whole operation.
+	 */
 	down_read(&mdsc->snap_rwsem);
-	old_realm = get_quota_realm(mdsc, old);
-	new_realm = get_quota_realm(mdsc, new);
+	old_realm = get_quota_realm(mdsc, old, true);
+	new_realm = get_quota_realm(mdsc, new, false);
+	if (PTR_ERR(new_realm) == -EAGAIN) {
+		up_read(&mdsc->snap_rwsem);
+		if (old_realm)
+			ceph_put_snap_realm(mdsc, old_realm);
+		goto restart;
+	}
 	is_same = (old_realm == new_realm);
 	up_read(&mdsc->snap_rwsem);
 
@@ -166,6 +240,7 @@  static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
 		return false;
 
 	down_read(&mdsc->snap_rwsem);
+restart:
 	realm = ceph_inode(inode)->i_snap_realm;
 	if (realm)
 		ceph_get_snap_realm(mdsc, realm);
@@ -173,12 +248,23 @@  static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
 		pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
 				   "null i_snap_realm\n", ceph_vinop(inode));
 	while (realm) {
+		bool has_inode;
+
 		spin_lock(&realm->inodes_with_caps_lock);
-		in = realm->inode ? igrab(realm->inode) : NULL;
+		has_inode = realm->inode;
+		in = has_inode ? igrab(realm->inode) : NULL;
 		spin_unlock(&realm->inodes_with_caps_lock);
-		if (!in)
+		if (has_inode && !in)
 			break;
-
+		if (!in) {
+			up_read(&mdsc->snap_rwsem);
+			in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+			down_read(&mdsc->snap_rwsem);
+			if (IS_ERR(in))
+				break;
+			ceph_put_snap_realm(mdsc, realm);
+			goto restart;
+		}
 		ci = ceph_inode(in);
 		spin_lock(&ci->i_ceph_lock);
 		if (op == QUOTA_CHECK_MAX_FILES_OP) {
@@ -314,7 +400,7 @@  bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
 	bool is_updated = false;
 
 	down_read(&mdsc->snap_rwsem);
-	realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
+	realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
 	up_read(&mdsc->snap_rwsem);
 	if (!realm)
 		return false;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ce51e98b08ec..cc7766aeb73b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -375,6 +375,8 @@  struct ceph_inode_info {
 	struct list_head i_snap_realm_item;
 	struct list_head i_snap_flush_item;
 
+	struct list_head i_quotarealms_inode_item;
+
 	struct work_struct i_wb_work;  /* writeback work */
 	struct work_struct i_pg_inv_work;  /* page invalidation work */