diff mbox series

[1/2] ceph: issue getattr/lookup reqs to MDSes in an aggregative pattern

Message ID 20190225122924.25827-1-xxhdx1985126@gmail.com (mailing list archive)
State New, archived
Headers show
Series [1/2] ceph: issue getattr/lookup reqs to MDSes in an aggregative pattern | expand

Commit Message

Xuehan Xu Feb. 25, 2019, 12:29 p.m. UTC
From: Xuehan Xu <xuxuehan@360.cn>

Instead of issuing a new getattr/lookup req to MDSes for each getattr/lookup
op, only issue a new one if there is no inflight req that requires the same
caps as the current getattr/lookup op.

Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
---
 fs/ceph/caps.c       |   1 +
 fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
 fs/ceph/inode.c      |  55 +++++++++++++----
 fs/ceph/mds_client.c |  30 +++++++++-
 fs/ceph/mds_client.h |  15 ++++-
 fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/super.h      |  14 +++++
 7 files changed, 304 insertions(+), 39 deletions(-)

Comments

Xuehan Xu Feb. 25, 2019, 12:31 p.m. UTC | #1
Hi, Zheng Yan.

I've just modified the code to solve the issue that you mentioned the
other day. Please take a look :-) Thanks.

On Mon, 25 Feb 2019 at 20:29, <xxhdx1985126@gmail.com> wrote:
>
> From: Xuehan Xu <xuxuehan@360.cn>
>
> Instead of issue a new getattr/lookup req to MDSes for each getattr/lookup
> op, issue a new one if there is no inflight req that requires that same caps
> as the current getattr/lookup op.
>
> Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
> ---
>  fs/ceph/caps.c       |   1 +
>  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
>  fs/ceph/inode.c      |  55 +++++++++++++----
>  fs/ceph/mds_client.c |  30 +++++++++-
>  fs/ceph/mds_client.h |  15 ++++-
>  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/super.h      |  14 +++++
>  7 files changed, 304 insertions(+), 39 deletions(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index bba28a5034ba..5a177a3c283f 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
>                 goto out;
>
>         ci->i_flushing_caps &= ~cleaned;
> +       ci->last_flush_ack_tid = flush_tid;
>
>         spin_lock(&mdsc->cap_dirty_lock);
>
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 82928cea0209..6f17c0319158 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>                                 }
>                         }
>                 }
> +               dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
>         }
>         spin_unlock(&dentry->d_lock);
>
> @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>                                          CEPH_MDS_LEASE_RENEW, seq);
>                 ceph_put_mds_session(session);
>         }
> -       dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> +       dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
>         return valid;
>  }
>
> @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
>         if (!valid) {
>                 struct ceph_mds_client *mdsc =
>                         ceph_sb_to_client(dir->i_sb)->mdsc;
> -               struct ceph_mds_request *req;
> +               struct ceph_mds_request *req = NULL;
> +               struct ceph_inode_info* cdir = ceph_inode(dir);
>                 int op, err;
>                 u32 mask;
>
>                 if (flags & LOOKUP_RCU)
>                         return -ECHILD;
> +               mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> +               if (ceph_security_xattr_wanted(dir))
> +                       mask |= CEPH_CAP_XATTR_SHARED;
>
>                 op = ceph_snap(dir) == CEPH_SNAPDIR ?
>                         CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> -               req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> -               if (!IS_ERR(req)) {
> -                       req->r_dentry = dget(dentry);
> -                       req->r_num_caps = 2;
> -                       req->r_parent = dir;
> +               if (op == CEPH_MDS_OP_LOOKUP) {
> +                       mutex_lock(&cdir->lookups_inflight_lock);
> +                       dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> +                                       dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> +                       req = __search_inode_getattr_or_lookup(cdir, mask, true);
> +               }
> +               if (req && op == CEPH_MDS_OP_LOOKUP) {
> +                       dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> +                                       dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> +                       ceph_mdsc_get_request(req);
> +                       mutex_unlock(&cdir->lookups_inflight_lock);
> +                       err = ceph_mdsc_wait_for_request(req);
> +                       dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> +                                       dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> +               } else {
>
> -                       mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> -                       if (ceph_security_xattr_wanted(dir))
> -                               mask |= CEPH_CAP_XATTR_SHARED;
> -                       req->r_args.getattr.mask = cpu_to_le32(mask);
> +                       req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> +                       if (op == CEPH_MDS_OP_LOOKUP) {
> +                               if (!IS_ERR(req)) {
> +                                       unsigned long res = 0;
> +                                       req->r_dentry = dget(dentry);
> +                                       req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> +                                       req->r_num_caps = 2;
> +                                       req->r_parent = dir;
> +                                       req->r_args.getattr.mask = cpu_to_le32(mask);
> +                                       res = __register_inode_getattr_or_lookup(cdir, req, true);
> +                                       if (IS_ERR_VALUE(res)) {
> +                                               mutex_unlock(&cdir->lookups_inflight_lock);
> +                                               goto out;
> +                                       }
> +                                       dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> +                                                       dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> +                               }
> +                               mutex_unlock(&cdir->lookups_inflight_lock);
> +                       }
> +                       if (IS_ERR(req))
> +                               goto out;
>
>                         err = ceph_mdsc_do_request(mdsc, NULL, req);
> -                       switch (err) {
> -                       case 0:
> -                               if (d_really_is_positive(dentry) &&
> -                                   d_inode(dentry) == req->r_target_inode)
> -                                       valid = 1;
> -                               break;
> -                       case -ENOENT:
> -                               if (d_really_is_negative(dentry))
> -                                       valid = 1;
> -                               /* Fallthrough */
> -                       default:
> -                               break;
> +                       if (op == CEPH_MDS_OP_LOOKUP) {
> +                               mutex_lock(&cdir->lookups_inflight_lock);
> +                               __unregister_inode_getattr_or_lookup(cdir, req, true);
> +                               dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> +                                               dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> +                               mutex_unlock(&cdir->lookups_inflight_lock);
>                         }
> -                       ceph_mdsc_put_request(req);
> -                       dout("d_revalidate %p lookup result=%d\n",
> -                            dentry, err);
>                 }
> +               switch (err) {
> +               case 0:
> +                       if (d_really_is_positive(dentry) &&
> +                           d_inode(dentry) == req->r_target_inode)
> +                               valid = 1;
> +                       break;
> +               case -ENOENT:
> +                       if (d_really_is_negative(dentry))
> +                               valid = 1;
> +                       /* Fallthrough */
> +               default:
> +                       break;
> +               }
> +               ceph_mdsc_put_request(req);
> +               dout("d_revalidate %p lookup result=%d\n",
> +                               dentry, err);
>         }
>
> +out:
>         dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
>         if (valid) {
>                 ceph_dentry_lru_touch(dentry);
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 9d1f34d46627..8705f0645a24 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>         dout("alloc_inode %p\n", &ci->vfs_inode);
>
>         spin_lock_init(&ci->i_ceph_lock);
> +       mutex_init(&ci->getattrs_inflight_lock);
> +       mutex_init(&ci->lookups_inflight_lock);
>
>         ci->i_version = 0;
>         ci->i_inline_version = 0;
> @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>         ci->i_xattrs.index_version = 0;
>
>         ci->i_caps = RB_ROOT;
> +       ci->getattrs_inflight = RB_ROOT;
> +       ci->lookups_inflight = RB_ROOT;
>         ci->i_auth_cap = NULL;
>         ci->i_dirty_caps = 0;
>         ci->i_flushing_caps = 0;
> +       ci->last_flush_ack_tid = 0;
>         INIT_LIST_HEAD(&ci->i_dirty_item);
>         INIT_LIST_HEAD(&ci->i_flushing_item);
>         ci->i_prealloc_cap_flush = NULL;
> @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
>          * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
>          * we expect a negative dentry.
>          */
> +       dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>         if (!tgt_vino && d_really_is_positive(dentry))
>                 return;
> -
> +       dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>         if (tgt_vino && (d_really_is_negative(dentry) ||
>                         !ceph_ino_compare(d_inode(dentry), tgt_vino)))
>                 return;
> @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>         struct ceph_mds_request *req;
>         int mode;
>         int err;
> +       struct ceph_inode_info* cinode = ceph_inode(inode);
>
>         if (ceph_snap(inode) == CEPH_SNAPDIR) {
>                 dout("do_getattr inode %p SNAPDIR\n", inode);
> @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>         if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
>                 return 0;
>
> -       mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> -       req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> -       if (IS_ERR(req))
> -               return PTR_ERR(req);
> -       req->r_inode = inode;
> -       ihold(inode);
> -       req->r_num_caps = 1;
> -       req->r_args.getattr.mask = cpu_to_le32(mask);
> -       req->r_locked_page = locked_page;
> -       err = ceph_mdsc_do_request(mdsc, NULL, req);
> +       mutex_lock(&cinode->getattrs_inflight_lock);
> +       dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> +       req = __search_inode_getattr_or_lookup(cinode, mask, false);
> +       if (req) {
> +               dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> +               ceph_mdsc_get_request(req);
> +               mutex_unlock(&cinode->getattrs_inflight_lock);
> +               err = ceph_mdsc_wait_for_request(req);
> +               dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> +       } else {
> +               mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> +               req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> +               if (!IS_ERR(req)) {
> +                       unsigned long res = 0;
> +                       req->r_inode = inode;
> +                       req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> +                       ihold(inode);
> +                       req->r_num_caps = 1;
> +                       req->r_args.getattr.mask = cpu_to_le32(mask);
> +                       req->r_locked_page = locked_page;
> +                       res = __register_inode_getattr_or_lookup(cinode, req, false);
> +                       if (IS_ERR_VALUE(res)) {
> +                               mutex_unlock(&cinode->getattrs_inflight_lock);
> +                               return res;
> +                       }
> +                       dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> +               }
> +               mutex_unlock(&cinode->getattrs_inflight_lock);
> +               if (IS_ERR(req))
> +                       return PTR_ERR(req);
> +               err = ceph_mdsc_do_request(mdsc, NULL, req);
> +               mutex_lock(&cinode->getattrs_inflight_lock);
> +               __unregister_inode_getattr_or_lookup(cinode, req, false);
> +               dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> +               mutex_unlock(&cinode->getattrs_inflight_lock);
> +       }
>         if (locked_page && err == 0) {
>                 u64 inline_version = req->r_reply_info.targeti.inline_version;
>                 if (inline_version == 0) {
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 163fc74bf221..5c6e00f2fd0a 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
>         return 0;
>  }
>
> +void ceph_mds_requests_tree_release(struct kref* kref)
> +{
> +       struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> +
> +       BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> +       kfree(mrtree);
> +}
> +
>  /*
>   * Create an mds request.
>   */
> @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>         req->r_fmode = -1;
>         kref_init(&req->r_kref);
>         RB_CLEAR_NODE(&req->r_node);
> +       RB_CLEAR_NODE(&req->getattr_lookup_node);
>         INIT_LIST_HEAD(&req->r_wait);
> +       init_completion(&req->batch_op_completion);
>         init_completion(&req->r_completion);
>         init_completion(&req->r_safe_completion);
>         INIT_LIST_HEAD(&req->r_unsafe_item);
> @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>         mutex_unlock(&mdsc->mutex);
>  }
>
> +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> +{
> +       int err = 0;
> +       long timeleft = wait_for_completion_killable_timeout(
> +                       &req->batch_op_completion,
> +                       ceph_timeout_jiffies(req->r_timeout));
> +       if (timeleft > 0)
> +               err = 0;
> +       else if (!timeleft)
> +               err = -EIO;  /* timed out */
> +       else
> +               err = timeleft;  /* killed */
> +       if (!err)
> +               return err;
> +       return req->batch_op_err;
> +}
> +
>  /*
>   * Synchrously perform an mds request.  Take care of all of the
>   * session setup, forwarding, retry details.
> @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>         } else {
>                 err = req->r_err;
>         }
> -
> +       req->batch_op_err = err;
> +       complete_all(&req->batch_op_completion);
>  out:
>         mutex_unlock(&mdsc->mutex);
>         dout("do_request %p done, result %d\n", req, err);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 729da155ebf0..41727677643e 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
>  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
>                                                  struct ceph_mds_request *req);
>
> +struct mds_request_tree {
> +       int mask;
> +       struct kref ref;
> +       struct rb_root mds_requests;
> +       struct rb_node parent_tree_node;
> +};
> +
> +extern void ceph_mds_requests_tree_release(struct kref* ref);
> +
>  /*
>   * an in-flight mds request
>   */
>  struct ceph_mds_request {
>         u64 r_tid;                   /* transaction id */
> +       u64 last_caps_flush_ack_tid;
>         struct rb_node r_node;
> +       struct rb_node getattr_lookup_node;
>         struct ceph_mds_client *r_mdsc;
>
>         int r_op;                    /* mds op code */
> @@ -264,7 +275,7 @@ struct ceph_mds_request {
>         struct ceph_msg  *r_reply;
>         struct ceph_mds_reply_info_parsed r_reply_info;
>         struct page *r_locked_page;
> -       int r_err;
> +       int r_err, batch_op_err;
>
>         unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
>         unsigned long r_started;  /* start time to measure timeout against */
> @@ -287,6 +298,7 @@ struct ceph_mds_request {
>
>         struct kref       r_kref;
>         struct list_head  r_wait;
> +       struct completion batch_op_completion;
>         struct completion r_completion;
>         struct completion r_safe_completion;
>         ceph_mds_request_callback_t r_callback;
> @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
>  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>                                      struct ceph_mds_request *req);
> +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
>  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>                                 struct inode *dir,
>                                 struct ceph_mds_request *req);
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index da2cd8e89062..857fbbc0b768 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
>         destroy_caches();
>  }
>
> +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +                                         struct ceph_mds_request* req,
> +                                         bool is_lookup)
> +{
> +       struct rb_node *node = NULL;
> +       struct mds_request_tree* tmp = NULL;
> +       if (!is_lookup)
> +               node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> +       else
> +               node = inode->lookups_inflight.rb_node;
> +
> +       while (node)
> +       {
> +               tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> +
> +               if (tmp->mask > req->r_args.getattr.mask)
> +                       node = node->rb_left;
> +               else if (tmp->mask < req->r_args.getattr.mask)
> +                       node = node->rb_right;
> +               else
> +                       break;
> +       }
> +
> +       if (!node)
> +               BUG();
> +
> +       rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> +
> +       if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> +               if (!is_lookup)
> +                       rb_erase(node, &inode->getattrs_inflight);
> +               else
> +                       rb_erase(node, &inode->lookups_inflight);
> +               kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> +       }
> +}
> +
> +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +                                       struct ceph_mds_request* req,
> +                                       bool is_lookup)
> +{
> +       struct rb_node **p = NULL, *parent = NULL;
> +       struct mds_request_tree *tmp = NULL;
> +       struct ceph_mds_request* tmp_req = NULL;
> +       struct mds_request_tree* mrtree = NULL;
> +
> +       if (!is_lookup)
> +               p = &ci->getattrs_inflight.rb_node;
> +       else
> +               p = &ci->lookups_inflight.rb_node;
> +
> +       while (*p) {
> +               parent = *p;
> +               tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> +               if (req->r_args.getattr.mask < tmp->mask)
> +                       p = &(*p)->rb_left;
> +               else if (req->r_args.getattr.mask > tmp->mask)
> +                       p = &(*p)->rb_right;
> +               else
> +                       break;
> +       }
> +
> +       if (!(*p)) {
> +               mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> +
> +               if (!mrtree)
> +                       return -ENOMEM;
> +
> +               mrtree->mds_requests = RB_ROOT;
> +               mrtree->mask = req->r_args.getattr.mask;
> +               kref_init(&mrtree->ref);
> +               rb_link_node(&mrtree->parent_tree_node, parent, p);
> +               if (!is_lookup) {
> +                       rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> +               } else {
> +                       rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> +               }
> +       } else {
> +               mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> +       }
> +
> +       p = &mrtree->mds_requests.rb_node;
> +       parent = NULL;
> +       while (*p) {
> +               parent = *p;
> +               tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> +               if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> +                       p = &(*p)->rb_left;
> +               else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> +                       p = &(*p)->rb_right;
> +               else
> +                       BUG();
> +       }
> +
> +       rb_link_node(&req->getattr_lookup_node, parent, p);
> +       rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> +       return 0;
> +}
> +
> +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +                                     int mask,
> +                                     bool is_lookup)
> +{
> +       struct rb_node *node = NULL;
> +       if (!is_lookup)
> +               node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> +       else
> +               node = inode->lookups_inflight.rb_node;
> +
> +       while (node)
> +       {
> +               struct mds_request_tree* tmp = NULL;
> +               tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> +
> +               if (tmp->mask > mask) {
> +                       node = node->rb_left;
> +               } else if (tmp->mask < mask) {
> +                       node = node->rb_right;
> +               } else {
> +                       struct rb_node* inner_node = tmp->mds_requests.rb_node;
> +                       while (inner_node) {
> +                               struct ceph_mds_request* req = NULL;
> +
> +                               req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> +                               if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> +                                       inner_node = inner_node->rb_left;
> +                               } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> +                                       inner_node = inner_node->rb_right;
> +                               } else
> +                                       return req; /* Found it */
> +                       }
> +                       return NULL;
> +               }
> +       }
> +       return NULL;
> +}
>  module_init(init_ceph);
>  module_exit(exit_ceph);
>
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index dfb64a5211b6..abf761f2a122 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -296,6 +296,8 @@ struct ceph_inode_info {
>         struct ceph_vino i_vino;   /* ceph ino + snap */
>
>         spinlock_t i_ceph_lock;
> +       struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> +       struct rb_root getattrs_inflight, lookups_inflight;
>
>         u64 i_version;
>         u64 i_inline_version;
> @@ -329,6 +331,7 @@ struct ceph_inode_info {
>         struct rb_root i_caps;           /* cap list */
>         struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
>         unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> +       u64 last_flush_ack_tid;
>         struct list_head i_dirty_item, i_flushing_item;
>         /* we need to track cap writeback on a per-cap-bit basis, to allow
>          * overlapping, pipelined cap flushes to the mds.  we can probably
> @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
>                                 u64 time_warp_seq, struct timespec64 *ctime,
>                                 struct timespec64 *mtime,
>                                 struct timespec64 *atime);
> +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +                                              struct ceph_mds_request* req,
> +                                              bool is_lookup);
> +
> +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +                                                struct ceph_mds_request* req,
> +                                                bool is_lookup);
> +
> +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +                                            int mask,
> +                                            bool is_lookup);
>  extern int ceph_fill_trace(struct super_block *sb,
>                            struct ceph_mds_request *req);
>  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> --
> 2.19.1
>
Jeff Layton Feb. 26, 2019, 11:54 p.m. UTC | #2
On Mon, 2019-02-25 at 20:29 +0800, xxhdx1985126@gmail.com wrote:
> From: Xuehan Xu <xuxuehan@360.cn>
> 
> Instead of issue a new getattr/lookup req to MDSes for each getattr/lookup
> op, issue a new one if there is no inflight req that requires that same caps
> as the current getattr/lookup op.
> 
> Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
> ---
>  fs/ceph/caps.c       |   1 +
>  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
>  fs/ceph/inode.c      |  55 +++++++++++++----
>  fs/ceph/mds_client.c |  30 +++++++++-
>  fs/ceph/mds_client.h |  15 ++++-
>  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/super.h      |  14 +++++
>  7 files changed, 304 insertions(+), 39 deletions(-)
> 

The description on this patch could be fleshed out a bit.

How much does this help performance and on what workloads? This is a lot
of code to add without much justification.

Looking over the patch, the basic idea is to implement a cache (in
rbtrees) of getattrs and lookups in flight and just wait on those if
there already is one that matches what we need. I agree with Zheng that
this could end up violating the strict cache coherency guarantees that
ceph tries to provide.

You're relying on an assumption that if you issue a getattr/lookup while
another one is already in flight, then it's legitimate to use the result
of the initial one for the one issued later. That's not necessarily the
case. I suspect this would be possible, even from a single client:

- client1/task1 issues stat(), server performs getattr
but reply gets delayed for a bit

- client2 does write() + fsync()

- client1/task2 issues stat(), finds the entry in cache from task1 and
ends up waiting for first getattr to complete

- getattr reply comes in, and both stat calls get back the same
attributes, showing pre-write size/mtime

In this case, the synchronous write by client2 was provably done before
the second getattr, but the second stat() call wouldn't see the new
size.

The read handling patch may suffer from similar issues, though I haven't
gone over that one in as much detail.

> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index bba28a5034ba..5a177a3c283f 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
>  		goto out;
>  
>  	ci->i_flushing_caps &= ~cleaned;
> +	ci->last_flush_ack_tid = flush_tid;
>  
>  	spin_lock(&mdsc->cap_dirty_lock);
>  
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 82928cea0209..6f17c0319158 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>  				}
>  			}
>  		}
> +		dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
>  	}
>  	spin_unlock(&dentry->d_lock);
>  
> @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>  					 CEPH_MDS_LEASE_RENEW, seq);
>  		ceph_put_mds_session(session);
>  	}
> -	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> +	dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
>  	return valid;
>  }
>  
> @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
>  	if (!valid) {
>  		struct ceph_mds_client *mdsc =
>  			ceph_sb_to_client(dir->i_sb)->mdsc;
> -		struct ceph_mds_request *req;
> +		struct ceph_mds_request *req = NULL;
> +		struct ceph_inode_info* cdir = ceph_inode(dir);
>  		int op, err;
>  		u32 mask;
>  
>  		if (flags & LOOKUP_RCU)
>  			return -ECHILD;
> +		mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> +		if (ceph_security_xattr_wanted(dir))
> +			mask |= CEPH_CAP_XATTR_SHARED;
>  
>  		op = ceph_snap(dir) == CEPH_SNAPDIR ?
>  			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> -		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> -		if (!IS_ERR(req)) {
> -			req->r_dentry = dget(dentry);
> -			req->r_num_caps = 2;
> -			req->r_parent = dir;
> +		if (op == CEPH_MDS_OP_LOOKUP) {
> +			mutex_lock(&cdir->lookups_inflight_lock);
> +			dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n", 
> +					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> +			req = __search_inode_getattr_or_lookup(cdir, mask, true);
> +		}
> +		if (req && op == CEPH_MDS_OP_LOOKUP) {
> +			dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
> +					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> +			ceph_mdsc_get_request(req);
> +			mutex_unlock(&cdir->lookups_inflight_lock);
> +			err = ceph_mdsc_wait_for_request(req);
> +			dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> +					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> +		} else {
>  
> -			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> -			if (ceph_security_xattr_wanted(dir))
> -				mask |= CEPH_CAP_XATTR_SHARED;
> -			req->r_args.getattr.mask = cpu_to_le32(mask);
> +			req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> +			if (op == CEPH_MDS_OP_LOOKUP) {
> +				if (!IS_ERR(req)) {
> +					unsigned long res = 0;
> +					req->r_dentry = dget(dentry);
> +					req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> +					req->r_num_caps = 2;
> +					req->r_parent = dir;
> +					req->r_args.getattr.mask = cpu_to_le32(mask);
> +					res = __register_inode_getattr_or_lookup(cdir, req, true);
> +					if (IS_ERR_VALUE(res)) {
> +						mutex_unlock(&cdir->lookups_inflight_lock);
> +						goto out;
> +					}
> +					dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
> +							dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> +				}
> +				mutex_unlock(&cdir->lookups_inflight_lock);
> +			}
> +			if (IS_ERR(req))
> +				goto out;
>  
>  			err = ceph_mdsc_do_request(mdsc, NULL, req);
> -			switch (err) {
> -			case 0:
> -				if (d_really_is_positive(dentry) &&
> -				    d_inode(dentry) == req->r_target_inode)
> -					valid = 1;
> -				break;
> -			case -ENOENT:
> -				if (d_really_is_negative(dentry))
> -					valid = 1;
> -				/* Fallthrough */
> -			default:
> -				break;
> +			if (op == CEPH_MDS_OP_LOOKUP) {
> +				mutex_lock(&cdir->lookups_inflight_lock);
> +				__unregister_inode_getattr_or_lookup(cdir, req, true);
> +				dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n", 
> +						dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> +				mutex_unlock(&cdir->lookups_inflight_lock);
>  			}
> -			ceph_mdsc_put_request(req);
> -			dout("d_revalidate %p lookup result=%d\n",
> -			     dentry, err);
>  		}
> +		switch (err) {
> +		case 0:
> +			if (d_really_is_positive(dentry) &&
> +			    d_inode(dentry) == req->r_target_inode)
> +				valid = 1;
> +			break;
> +		case -ENOENT:
> +			if (d_really_is_negative(dentry))
> +				valid = 1;
> +			/* Fallthrough */
> +		default:
> +			break;
> +		}
> +		ceph_mdsc_put_request(req);
> +		dout("d_revalidate %p lookup result=%d\n",
> +   				dentry, err);
>  	}
>  
> +out:
>  	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
>  	if (valid) {
>  		ceph_dentry_lru_touch(dentry);

d_revalidate is changed to use the cache, but not lookup. Why?

> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 9d1f34d46627..8705f0645a24 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	dout("alloc_inode %p\n", &ci->vfs_inode);
>  
>  	spin_lock_init(&ci->i_ceph_lock);
> +	mutex_init(&ci->getattrs_inflight_lock);
> +	mutex_init(&ci->lookups_inflight_lock);
>  
>  	ci->i_version = 0;
>  	ci->i_inline_version = 0;
> @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_xattrs.index_version = 0;
>  
>  	ci->i_caps = RB_ROOT;
> +	ci->getattrs_inflight = RB_ROOT;
> +	ci->lookups_inflight = RB_ROOT;
>  	ci->i_auth_cap = NULL;
>  	ci->i_dirty_caps = 0;
>  	ci->i_flushing_caps = 0;
> +	ci->last_flush_ack_tid = 0;
>  	INIT_LIST_HEAD(&ci->i_dirty_item);
>  	INIT_LIST_HEAD(&ci->i_flushing_item);
>  	ci->i_prealloc_cap_flush = NULL;
> @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
>  	 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
>  	 * we expect a negative dentry.
>  	 */
> +	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>  	if (!tgt_vino && d_really_is_positive(dentry))
>  		return;
> -
> +	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>  	if (tgt_vino && (d_really_is_negative(dentry) ||
>  			!ceph_ino_compare(d_inode(dentry), tgt_vino)))
>  		return;
> @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>  	struct ceph_mds_request *req;
>  	int mode;
>  	int err;
> +	struct ceph_inode_info* cinode = ceph_inode(inode);
>  
>  	if (ceph_snap(inode) == CEPH_SNAPDIR) {
>  		dout("do_getattr inode %p SNAPDIR\n", inode);
> @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>  	if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
>  		return 0;
>  
> -	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> -	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> -	if (IS_ERR(req))
> -		return PTR_ERR(req);
> -	req->r_inode = inode;
> -	ihold(inode);
> -	req->r_num_caps = 1;
> -	req->r_args.getattr.mask = cpu_to_le32(mask);
> -	req->r_locked_page = locked_page;
> -	err = ceph_mdsc_do_request(mdsc, NULL, req);
> +	mutex_lock(&cinode->getattrs_inflight_lock);
> +	dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> +	req = __search_inode_getattr_or_lookup(cinode, mask, false);
> +	if (req) {
> +		dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> +		ceph_mdsc_get_request(req);
> +		mutex_unlock(&cinode->getattrs_inflight_lock);
> +		err = ceph_mdsc_wait_for_request(req);
> +		dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> +	} else {
> +		mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> +		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> +		if (!IS_ERR(req)) {
> +			unsigned long res = 0;
> +			req->r_inode = inode;
> +			req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> +			ihold(inode);
> +			req->r_num_caps = 1;
> +			req->r_args.getattr.mask = cpu_to_le32(mask);
> +			req->r_locked_page = locked_page;
> +			res = __register_inode_getattr_or_lookup(cinode, req, false);
> +			if (IS_ERR_VALUE(res)) {
> +				mutex_unlock(&cinode->getattrs_inflight_lock);
> +				return res;
> +			}
> +			dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> +		}
> +		mutex_unlock(&cinode->getattrs_inflight_lock);
> +		if (IS_ERR(req))
> +			return PTR_ERR(req);
> +		err = ceph_mdsc_do_request(mdsc, NULL, req);
> +		mutex_lock(&cinode->getattrs_inflight_lock);
> +		__unregister_inode_getattr_or_lookup(cinode, req, false);
> +		dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> +		mutex_unlock(&cinode->getattrs_inflight_lock);
> +	}
>  	if (locked_page && err == 0) {
>  		u64 inline_version = req->r_reply_info.targeti.inline_version;
>  		if (inline_version == 0) {
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 163fc74bf221..5c6e00f2fd0a 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
>  	return 0;
>  }
>  
> +void ceph_mds_requests_tree_release(struct kref* kref)
> +{
> +	struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> +
> +	BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> +	kfree(mrtree);
> +}
> +
>  /*
>   * Create an mds request.
>   */
> @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>  	req->r_fmode = -1;
>  	kref_init(&req->r_kref);
>  	RB_CLEAR_NODE(&req->r_node);
> +	RB_CLEAR_NODE(&req->getattr_lookup_node);
>  	INIT_LIST_HEAD(&req->r_wait);
> +	init_completion(&req->batch_op_completion);
>  	init_completion(&req->r_completion);
>  	init_completion(&req->r_safe_completion);
>  	INIT_LIST_HEAD(&req->r_unsafe_item);
> @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>  	mutex_unlock(&mdsc->mutex);
>  }
>  
> +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> +{
> +	int err = 0;
> +	long timeleft = wait_for_completion_killable_timeout(
> +			&req->batch_op_completion,
> +			ceph_timeout_jiffies(req->r_timeout));
> +	if (timeleft > 0)
> +		err = 0;
> +	else if (!timeleft)
> +		err = -EIO;  /* timed out */
> +	else
> +		err = timeleft;  /* killed */
> +	if (!err)
> +		return err;
> +	return req->batch_op_err;
> +}
> +
>  /*
>   * Synchrously perform an mds request.  Take care of all of the
>   * session setup, forwarding, retry details.
> @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>  	} else {
>  		err = req->r_err;
>  	}
> -
> +	req->batch_op_err = err;
> +	complete_all(&req->batch_op_completion);
>  out:
>  	mutex_unlock(&mdsc->mutex);
>  	dout("do_request %p done, result %d\n", req, err);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 729da155ebf0..41727677643e 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
>  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
>  						 struct ceph_mds_request *req);
>  
> +struct mds_request_tree {
> +	int mask;
> +	struct kref ref;
> +	struct rb_root mds_requests;
> +	struct rb_node parent_tree_node;
> +};
> +
> +extern void ceph_mds_requests_tree_release(struct kref* ref);
> +
>  /*
>   * an in-flight mds request
>   */
>  struct ceph_mds_request {
>  	u64 r_tid;                   /* transaction id */
> +	u64 last_caps_flush_ack_tid;
>  	struct rb_node r_node;
> +	struct rb_node getattr_lookup_node;
>  	struct ceph_mds_client *r_mdsc;
>  
>  	int r_op;                    /* mds op code */
> @@ -264,7 +275,7 @@ struct ceph_mds_request {
>  	struct ceph_msg  *r_reply;
>  	struct ceph_mds_reply_info_parsed r_reply_info;
>  	struct page *r_locked_page;
> -	int r_err;
> +	int r_err, batch_op_err;
>  
>  	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
>  	unsigned long r_started;  /* start time to measure timeout against */
> @@ -287,6 +298,7 @@ struct ceph_mds_request {
>  
>  	struct kref       r_kref;
>  	struct list_head  r_wait;
> +	struct completion batch_op_completion;
>  	struct completion r_completion;
>  	struct completion r_safe_completion;
>  	ceph_mds_request_callback_t r_callback;
> @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
>  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>  				     struct ceph_mds_request *req);
> +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
>  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>  				struct inode *dir,
>  				struct ceph_mds_request *req);
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index da2cd8e89062..857fbbc0b768 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
>  	destroy_caches();
>  }
>  
> +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +					  struct ceph_mds_request* req,
> +					  bool is_lookup)
> +{
> +	struct rb_node *node = NULL;
> +	struct mds_request_tree* tmp = NULL;
> +	if (!is_lookup)
> +		node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> +	else
> +		node = inode->lookups_inflight.rb_node;
> +	
> +	while (node)
> +	{
> +		tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> +		
> +		if (tmp->mask > req->r_args.getattr.mask)
> +			node = node->rb_left;
> +		else if (tmp->mask < req->r_args.getattr.mask)
> +			node = node->rb_right;
> +		else
> +			break;
> +	}
> +
> +	if (!node)
> +		BUG();
> +
> +	rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> +	
> +	if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> +		if (!is_lookup)
> +			rb_erase(node, &inode->getattrs_inflight);
> +		else
> +			rb_erase(node, &inode->lookups_inflight);
> +		kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> +	}
> +}
> +
> +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +					struct ceph_mds_request* req,
> +					bool is_lookup)
> +{
> +	struct rb_node **p = NULL, *parent = NULL;
> +	struct mds_request_tree *tmp = NULL;
> +	struct ceph_mds_request* tmp_req = NULL;
> +	struct mds_request_tree* mrtree = NULL;
> +
> +	if (!is_lookup)
> +		p = &ci->getattrs_inflight.rb_node;
> +	else
> +		p = &ci->lookups_inflight.rb_node;
> +	
> +	while (*p) {
> +		parent = *p;
> +		tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> +		if (req->r_args.getattr.mask < tmp->mask)
> +			p = &(*p)->rb_left;
> +		else if (req->r_args.getattr.mask > tmp->mask)
> +			p = &(*p)->rb_right;
> +		else
> +			break;
> +	}
> +	
> +	if (!(*p)) {
> +		mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> +
> +		if (!mrtree)
> +			return -ENOMEM;
> +
> +		mrtree->mds_requests = RB_ROOT;
> +		mrtree->mask = req->r_args.getattr.mask;
> +		kref_init(&mrtree->ref);
> +		rb_link_node(&mrtree->parent_tree_node, parent, p);
> +		if (!is_lookup) {
> +			rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> +		} else {
> +			rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> +		}
> +	} else {
> +		mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> +	}
> +
> +	p = &mrtree->mds_requests.rb_node;
> +	parent = NULL;
> +	while (*p) {
> +		parent = *p;
> +		tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> +		if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> +			p = &(*p)->rb_left;
> +		else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> +			p = &(*p)->rb_right;
> +		else
> +			BUG();
> +	}
> +
> +	rb_link_node(&req->getattr_lookup_node, parent, p);
> +	rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> +	return 0;
> +}
> +
> +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +				      int mask,
> +				      bool is_lookup)
> +{
> +	struct rb_node *node = NULL;
> +	if (!is_lookup)
> +		node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> +	else
> +		node = inode->lookups_inflight.rb_node;
> +	
> +	while (node)
> +	{
> +		struct mds_request_tree* tmp = NULL;
> +		tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> +		
> +		if (tmp->mask > mask) {
> +			node = node->rb_left;
> +		} else if (tmp->mask < mask) {
> +			node = node->rb_right;
> +		} else {
> +			struct rb_node* inner_node = tmp->mds_requests.rb_node;
> +			while (inner_node) {
> +				struct ceph_mds_request* req = NULL;
> +
> +				req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> +				if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> +					inner_node = inner_node->rb_left;
> +				} else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> +					inner_node = inner_node->rb_right;
> +				} else
> +					return req; /* Found it */
> +			}
> +			return NULL;
> +		}
> +	}
> +	return NULL;
> +}
>  module_init(init_ceph);
>  module_exit(exit_ceph);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index dfb64a5211b6..abf761f2a122 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -296,6 +296,8 @@ struct ceph_inode_info {
>  	struct ceph_vino i_vino;   /* ceph ino + snap */
>  
>  	spinlock_t i_ceph_lock;
> +	struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> +	struct rb_root getattrs_inflight, lookups_inflight;

If you do end up pursuing this further, I think it might be better to
use a spinlock to protect the trees.

Take the lock, search the tree and if there isn't one, drop the lock,
allocate, take it and search again, and insert the one you allocated if
no conflicting structure was inserted in the meantime.

In addition to cutting down on context switches when these things are
contended, that would also make it easier to hide the locking inside the
cache search/register/unregister routines.

>  
>  	u64 i_version;
>  	u64 i_inline_version;
> @@ -329,6 +331,7 @@ struct ceph_inode_info {
>  	struct rb_root i_caps;           /* cap list */
>  	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
>  	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> +	u64 last_flush_ack_tid;
>  	struct list_head i_dirty_item, i_flushing_item;
>  	/* we need to track cap writeback on a per-cap-bit basis, to allow
>  	 * overlapping, pipelined cap flushes to the mds.  we can probably
> @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
>  				u64 time_warp_seq, struct timespec64 *ctime,
>  				struct timespec64 *mtime,
>  				struct timespec64 *atime);
> +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +					       struct ceph_mds_request* req,
> +					       bool is_lookup);
> +
> +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +						 struct ceph_mds_request* req,
> +						 bool is_lookup);
> +
> +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +					     int mask,
> +					     bool is_lookup);
>  extern int ceph_fill_trace(struct super_block *sb,
>  			   struct ceph_mds_request *req);
>  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
Xuehan Xu Feb. 27, 2019, 3:31 a.m. UTC | #3
On Wed, 27 Feb 2019 at 07:54, Jeff Layton <jlayton@redhat.com> wrote:
>
> On Mon, 2019-02-25 at 20:29 +0800, xxhdx1985126@gmail.com wrote:
> > From: Xuehan Xu <xuxuehan@360.cn>
> >
> > Instead of issue a new getattr/lookup req to MDSes for each getattr/lookup
> > op, issue a new one if there is no inflight req that requires that same caps
> > as the current getattr/lookup op.
> >
> > Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
> > ---
> >  fs/ceph/caps.c       |   1 +
> >  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
> >  fs/ceph/inode.c      |  55 +++++++++++++----
> >  fs/ceph/mds_client.c |  30 +++++++++-
> >  fs/ceph/mds_client.h |  15 ++++-
> >  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
> >  fs/ceph/super.h      |  14 +++++
> >  7 files changed, 304 insertions(+), 39 deletions(-)
> >
>
> The description on this patch could be fleshed out a bit.
>
> How much does this help performance and on what workloads? This is a lot
> of code to add without much justification.

Hi, Jeff and Zheng, thanks for your review and help:-) I'm sorry that
I forgot to add the "Fixes" tag in the description of the patches.
These two patches are supposed to solve the issue:
http://tracker.ceph.com/issues/36609.

Basically, the scenario we are facing now is as follows:
     We have several dozen web servers, all of which mount the
same directory for their PHP scripts and ask cephfs for the
latest file content whenever one of those scripts is modified. To improve
the web servers' efficiency, each of them runs tens of threads to
serve web requests; so, as you can see, when a script is modified,
there would be thousands of threads asking for the same file, which
would lead to thousands of getattr/lookup and read reqs to the cephfs
cluster and put huge pressure on both MDSes and OSDs.

>
> Looking over the patch, the basic idea is to implement a cache (in
> rbtrees) of getattrs and lookups in flight and just wait on those if
> there already is one that matches what we need. I agree with Zheng that
> this could end up violating the strict cache coherency guarantees that
> ceph tries to provide.
>
> You're relying on an assumption that if you issue a getattr/lookup while
> another one is already in flight, then it's legitimate to use the result
> of the initial one for the one issued later. That's not necessarily the
> case. I suspect this would be possible, even from a single client:
>
> - client1/task1 issues stat(), server performs getattr
> but reply gets delayed for a bit
>
> - client2 does write() + fsync()
>
> - client1/task2 issues stat(), finds the entry in cache from task1 and
> ends up waiting for first getattr to complete
>
> - getattr reply comes in, and both stat calls get back the same
> attributes, showing pre-write size/mtime
>
> In this case, the synchronous write by client2 was provably done before
> the second getattr, but the second stat() call wouldn't see the new
> size.

I understand that this is a big problem. If my understanding is right,
this problem may exist only in the read handling patch.
Here's what I think:
    First, I think the problem would come up only when client1/task2
issues stat() after client2's write()+sync() has finished, since,
only in this case, a pre-write size/mtime seen by client1/task2 would
cause errors. And a pre-write size/mtime would be seen by
client1/task2 only when client1/task1's stat() is processed before
client2's write()+sync(). However, when client1/task1's stat() is
processed successfully, the inode's filelock should be in states like
LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
move the inode to states like LOCK_MIX, which would cause a cap
revoke from client1. And only after client1 has acked the revoke can
the MDS go on processing the metadata modification caused by client2's
write()+fsync(). Also, the MDS sends messages to the same client in the
order those messages were created. So, when client1/task2
issues stat(), the result of the previous one must already have been
processed by client1 and can't be seen by client1/task2, which means this
problem wouldn't occur. Am I right about this? I don't know if I'm
missing anything.

However, as for the read handling patch, this problem does exist, and
it's really confusing me.

>
> The read handling patch may suffer from similar issues, though I haven't
> gone over that one in as much detail.
>
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index bba28a5034ba..5a177a3c283f 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> >               goto out;
> >
> >       ci->i_flushing_caps &= ~cleaned;
> > +     ci->last_flush_ack_tid = flush_tid;
> >
> >       spin_lock(&mdsc->cap_dirty_lock);
> >
> > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > index 82928cea0209..6f17c0319158 100644
> > --- a/fs/ceph/dir.c
> > +++ b/fs/ceph/dir.c
> > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> >                               }
> >                       }
> >               }
> > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> >       }
> >       spin_unlock(&dentry->d_lock);
> >
> > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> >                                        CEPH_MDS_LEASE_RENEW, seq);
> >               ceph_put_mds_session(session);
> >       }
> > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> >       return valid;
> >  }
> >
> > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> >       if (!valid) {
> >               struct ceph_mds_client *mdsc =
> >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> > -             struct ceph_mds_request *req;
> > +             struct ceph_mds_request *req = NULL;
> > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> >               int op, err;
> >               u32 mask;
> >
> >               if (flags & LOOKUP_RCU)
> >                       return -ECHILD;
> > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > +             if (ceph_security_xattr_wanted(dir))
> > +                     mask |= CEPH_CAP_XATTR_SHARED;
> >
> >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > -             if (!IS_ERR(req)) {
> > -                     req->r_dentry = dget(dentry);
> > -                     req->r_num_caps = 2;
> > -                     req->r_parent = dir;
> > +             if (op == CEPH_MDS_OP_LOOKUP) {
> > +                     mutex_lock(&cdir->lookups_inflight_lock);
> > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> > +             }
> > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > +                     ceph_mdsc_get_request(req);
> > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> > +                     err = ceph_mdsc_wait_for_request(req);
> > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > +             } else {
> >
> > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > -                     if (ceph_security_xattr_wanted(dir))
> > -                             mask |= CEPH_CAP_XATTR_SHARED;
> > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > +                             if (!IS_ERR(req)) {
> > +                                     unsigned long res = 0;
> > +                                     req->r_dentry = dget(dentry);
> > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> > +                                     req->r_num_caps = 2;
> > +                                     req->r_parent = dir;
> > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> > +                                     if (IS_ERR_VALUE(res)) {
> > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> > +                                             goto out;
> > +                                     }
> > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > +                             }
> > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > +                     }
> > +                     if (IS_ERR(req))
> > +                             goto out;
> >
> >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > -                     switch (err) {
> > -                     case 0:
> > -                             if (d_really_is_positive(dentry) &&
> > -                                 d_inode(dentry) == req->r_target_inode)
> > -                                     valid = 1;
> > -                             break;
> > -                     case -ENOENT:
> > -                             if (d_really_is_negative(dentry))
> > -                                     valid = 1;
> > -                             /* Fallthrough */
> > -                     default:
> > -                             break;
> > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > +                             mutex_lock(&cdir->lookups_inflight_lock);
> > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> >                       }
> > -                     ceph_mdsc_put_request(req);
> > -                     dout("d_revalidate %p lookup result=%d\n",
> > -                          dentry, err);
> >               }
> > +             switch (err) {
> > +             case 0:
> > +                     if (d_really_is_positive(dentry) &&
> > +                         d_inode(dentry) == req->r_target_inode)
> > +                             valid = 1;
> > +                     break;
> > +             case -ENOENT:
> > +                     if (d_really_is_negative(dentry))
> > +                             valid = 1;
> > +                     /* Fallthrough */
> > +             default:
> > +                     break;
> > +             }
> > +             ceph_mdsc_put_request(req);
> > +             dout("d_revalidate %p lookup result=%d\n",
> > +                             dentry, err);
> >       }
> >
> > +out:
> >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> >       if (valid) {
> >               ceph_dentry_lru_touch(dentry);
>
> d_revalidate is changed to use the cache, but not lookup. Why?

Multiple identical lookups would also be sent out in the scenario mentioned
above, so we added a cache for lookups, too. I'm not sure I'm understanding
your question correctly.

>
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 9d1f34d46627..8705f0645a24 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >       dout("alloc_inode %p\n", &ci->vfs_inode);
> >
> >       spin_lock_init(&ci->i_ceph_lock);
> > +     mutex_init(&ci->getattrs_inflight_lock);
> > +     mutex_init(&ci->lookups_inflight_lock);
> >
> >       ci->i_version = 0;
> >       ci->i_inline_version = 0;
> > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >       ci->i_xattrs.index_version = 0;
> >
> >       ci->i_caps = RB_ROOT;
> > +     ci->getattrs_inflight = RB_ROOT;
> > +     ci->lookups_inflight = RB_ROOT;
> >       ci->i_auth_cap = NULL;
> >       ci->i_dirty_caps = 0;
> >       ci->i_flushing_caps = 0;
> > +     ci->last_flush_ack_tid = 0;
> >       INIT_LIST_HEAD(&ci->i_dirty_item);
> >       INIT_LIST_HEAD(&ci->i_flushing_item);
> >       ci->i_prealloc_cap_flush = NULL;
> > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> >        * we expect a negative dentry.
> >        */
> > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> >       if (!tgt_vino && d_really_is_positive(dentry))
> >               return;
> > -
> > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> >       if (tgt_vino && (d_really_is_negative(dentry) ||
> >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> >               return;
> > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> >       struct ceph_mds_request *req;
> >       int mode;
> >       int err;
> > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> >
> >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> >               dout("do_getattr inode %p SNAPDIR\n", inode);
> > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> >               return 0;
> >
> > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > -     if (IS_ERR(req))
> > -             return PTR_ERR(req);
> > -     req->r_inode = inode;
> > -     ihold(inode);
> > -     req->r_num_caps = 1;
> > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> > -     req->r_locked_page = locked_page;
> > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > +     mutex_lock(&cinode->getattrs_inflight_lock);
> > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> > +     if (req) {
> > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > +             ceph_mdsc_get_request(req);
> > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > +             err = ceph_mdsc_wait_for_request(req);
> > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > +     } else {
> > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > +             if (!IS_ERR(req)) {
> > +                     unsigned long res = 0;
> > +                     req->r_inode = inode;
> > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> > +                     ihold(inode);
> > +                     req->r_num_caps = 1;
> > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > +                     req->r_locked_page = locked_page;
> > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> > +                     if (IS_ERR_VALUE(res)) {
> > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> > +                             return res;
> > +                     }
> > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > +             }
> > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > +             if (IS_ERR(req))
> > +                     return PTR_ERR(req);
> > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> > +             mutex_lock(&cinode->getattrs_inflight_lock);
> > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > +     }
> >       if (locked_page && err == 0) {
> >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> >               if (inline_version == 0) {
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 163fc74bf221..5c6e00f2fd0a 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> >       return 0;
> >  }
> >
> > +void ceph_mds_requests_tree_release(struct kref* kref)
> > +{
> > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> > +
> > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> > +     kfree(mrtree);
> > +}
> > +
> >  /*
> >   * Create an mds request.
> >   */
> > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> >       req->r_fmode = -1;
> >       kref_init(&req->r_kref);
> >       RB_CLEAR_NODE(&req->r_node);
> > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> >       INIT_LIST_HEAD(&req->r_wait);
> > +     init_completion(&req->batch_op_completion);
> >       init_completion(&req->r_completion);
> >       init_completion(&req->r_safe_completion);
> >       INIT_LIST_HEAD(&req->r_unsafe_item);
> > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> >       mutex_unlock(&mdsc->mutex);
> >  }
> >
> > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> > +{
> > +     int err = 0;
> > +     long timeleft = wait_for_completion_killable_timeout(
> > +                     &req->batch_op_completion,
> > +                     ceph_timeout_jiffies(req->r_timeout));
> > +     if (timeleft > 0)
> > +             err = 0;
> > +     else if (!timeleft)
> > +             err = -EIO;  /* timed out */
> > +     else
> > +             err = timeleft;  /* killed */
> > +     if (!err)
> > +             return err;
> > +     return req->batch_op_err;
> > +}
> > +
> >  /*
> >   * Synchrously perform an mds request.  Take care of all of the
> >   * session setup, forwarding, retry details.
> > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> >       } else {
> >               err = req->r_err;
> >       }
> > -
> > +     req->batch_op_err = err;
> > +     complete_all(&req->batch_op_completion);
> >  out:
> >       mutex_unlock(&mdsc->mutex);
> >       dout("do_request %p done, result %d\n", req, err);
> > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > index 729da155ebf0..41727677643e 100644
> > --- a/fs/ceph/mds_client.h
> > +++ b/fs/ceph/mds_client.h
> > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> >                                                struct ceph_mds_request *req);
> >
> > +struct mds_request_tree {
> > +     int mask;
> > +     struct kref ref;
> > +     struct rb_root mds_requests;
> > +     struct rb_node parent_tree_node;
> > +};
> > +
> > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> > +
> >  /*
> >   * an in-flight mds request
> >   */
> >  struct ceph_mds_request {
> >       u64 r_tid;                   /* transaction id */
> > +     u64 last_caps_flush_ack_tid;
> >       struct rb_node r_node;
> > +     struct rb_node getattr_lookup_node;
> >       struct ceph_mds_client *r_mdsc;
> >
> >       int r_op;                    /* mds op code */
> > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> >       struct ceph_msg  *r_reply;
> >       struct ceph_mds_reply_info_parsed r_reply_info;
> >       struct page *r_locked_page;
> > -     int r_err;
> > +     int r_err, batch_op_err;
> >
> >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> >       unsigned long r_started;  /* start time to measure timeout against */
> > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> >
> >       struct kref       r_kref;
> >       struct list_head  r_wait;
> > +     struct completion batch_op_completion;
> >       struct completion r_completion;
> >       struct completion r_safe_completion;
> >       ceph_mds_request_callback_t r_callback;
> > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> >                                    struct ceph_mds_request *req);
> > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> >                               struct inode *dir,
> >                               struct ceph_mds_request *req);
> > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > index da2cd8e89062..857fbbc0b768 100644
> > --- a/fs/ceph/super.c
> > +++ b/fs/ceph/super.c
> > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> >       destroy_caches();
> >  }
> >
> > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > +                                       struct ceph_mds_request* req,
> > +                                       bool is_lookup)
> > +{
> > +     struct rb_node *node = NULL;
> > +     struct mds_request_tree* tmp = NULL;
> > +     if (!is_lookup)
> > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > +     else
> > +             node = inode->lookups_inflight.rb_node;
> > +
> > +     while (node)
> > +     {
> > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > +
> > +             if (tmp->mask > req->r_args.getattr.mask)
> > +                     node = node->rb_left;
> > +             else if (tmp->mask < req->r_args.getattr.mask)
> > +                     node = node->rb_right;
> > +             else
> > +                     break;
> > +     }
> > +
> > +     if (!node)
> > +             BUG();
> > +
> > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> > +
> > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> > +             if (!is_lookup)
> > +                     rb_erase(node, &inode->getattrs_inflight);
> > +             else
> > +                     rb_erase(node, &inode->lookups_inflight);
> > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> > +     }
> > +}
> > +
> > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > +                                     struct ceph_mds_request* req,
> > +                                     bool is_lookup)
> > +{
> > +     struct rb_node **p = NULL, *parent = NULL;
> > +     struct mds_request_tree *tmp = NULL;
> > +     struct ceph_mds_request* tmp_req = NULL;
> > +     struct mds_request_tree* mrtree = NULL;
> > +
> > +     if (!is_lookup)
> > +             p = &ci->getattrs_inflight.rb_node;
> > +     else
> > +             p = &ci->lookups_inflight.rb_node;
> > +
> > +     while (*p) {
> > +             parent = *p;
> > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> > +             if (req->r_args.getattr.mask < tmp->mask)
> > +                     p = &(*p)->rb_left;
> > +             else if (req->r_args.getattr.mask > tmp->mask)
> > +                     p = &(*p)->rb_right;
> > +             else
> > +                     break;
> > +     }
> > +
> > +     if (!(*p)) {
> > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> > +
> > +             if (!mrtree)
> > +                     return -ENOMEM;
> > +
> > +             mrtree->mds_requests = RB_ROOT;
> > +             mrtree->mask = req->r_args.getattr.mask;
> > +             kref_init(&mrtree->ref);
> > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> > +             if (!is_lookup) {
> > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> > +             } else {
> > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> > +             }
> > +     } else {
> > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> > +     }
> > +
> > +     p = &mrtree->mds_requests.rb_node;
> > +     parent = NULL;
> > +     while (*p) {
> > +             parent = *p;
> > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> > +                     p = &(*p)->rb_left;
> > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> > +                     p = &(*p)->rb_right;
> > +             else
> > +                     BUG();
> > +     }
> > +
> > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> > +     return 0;
> > +}
> > +
> > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > +                                   int mask,
> > +                                   bool is_lookup)
> > +{
> > +     struct rb_node *node = NULL;
> > +     if (!is_lookup)
> > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > +     else
> > +             node = inode->lookups_inflight.rb_node;
> > +
> > +     while (node)
> > +     {
> > +             struct mds_request_tree* tmp = NULL;
> > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > +
> > +             if (tmp->mask > mask) {
> > +                     node = node->rb_left;
> > +             } else if (tmp->mask < mask) {
> > +                     node = node->rb_right;
> > +             } else {
> > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> > +                     while (inner_node) {
> > +                             struct ceph_mds_request* req = NULL;
> > +
> > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> > +                                     inner_node = inner_node->rb_left;
> > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> > +                                     inner_node = inner_node->rb_right;
> > +                             } else
> > +                                     return req; /* Found it */
> > +                     }
> > +                     return NULL;
> > +             }
> > +     }
> > +     return NULL;
> > +}
> >  module_init(init_ceph);
> >  module_exit(exit_ceph);
> >
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index dfb64a5211b6..abf761f2a122 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> >       struct ceph_vino i_vino;   /* ceph ino + snap */
> >
> >       spinlock_t i_ceph_lock;
> > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> > +     struct rb_root getattrs_inflight, lookups_inflight;
>
> If you do end up pursuing this further, I think it might be better to
> use a spinlock to protect the trees.
>
> Take the lock, search the tree and if there isn't one, drop the lock,
> allocate, take it and search again, and insert the one you allocated if
> no conflicting structure was inserted in the meantime.
>
> In addition to cutting down on context switches when these things are
> contended, that would also make it easier to hide the locking inside the
> cache search/register/unregister routines.

You are right. At first, I tried to use a spinlock, but that caused
errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
know how that occurred. Do you have any clue about that?

I can't thank you enough for the help:-) Thanks very much:-)
>
> >
> >       u64 i_version;
> >       u64 i_inline_version;
> > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> >       struct rb_root i_caps;           /* cap list */
> >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> > +     u64 last_flush_ack_tid;
> >       struct list_head i_dirty_item, i_flushing_item;
> >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> >        * overlapping, pipelined cap flushes to the mds.  we can probably
> > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> >                               u64 time_warp_seq, struct timespec64 *ctime,
> >                               struct timespec64 *mtime,
> >                               struct timespec64 *atime);
> > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > +                                            struct ceph_mds_request* req,
> > +                                            bool is_lookup);
> > +
> > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > +                                              struct ceph_mds_request* req,
> > +                                              bool is_lookup);
> > +
> > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > +                                          int mask,
> > +                                          bool is_lookup);
> >  extern int ceph_fill_trace(struct super_block *sb,
> >                          struct ceph_mds_request *req);
> >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
>
> --
> Jeff Layton <jlayton@redhat.com>
>
Jeff Layton Feb. 27, 2019, 12:08 p.m. UTC | #4
On Wed, 2019-02-27 at 11:31 +0800, Xuehan Xu wrote:
> On Wed, 27 Feb 2019 at 07:54, Jeff Layton <jlayton@redhat.com> wrote:
> > On Mon, 2019-02-25 at 20:29 +0800, xxhdx1985126@gmail.com wrote:
> > > From: Xuehan Xu <xuxuehan@360.cn>
> > > 
> > > Instead of issue a new getattr/lookup req to MDSes for each getattr/lookup
> > > op, issue a new one if there is no inflight req that requires that same caps
> > > as the current getattr/lookup op.
> > > 
> > > Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
> > > ---
> > >  fs/ceph/caps.c       |   1 +
> > >  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
> > >  fs/ceph/inode.c      |  55 +++++++++++++----
> > >  fs/ceph/mds_client.c |  30 +++++++++-
> > >  fs/ceph/mds_client.h |  15 ++++-
> > >  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
> > >  fs/ceph/super.h      |  14 +++++
> > >  7 files changed, 304 insertions(+), 39 deletions(-)
> > > 
> > 
> > The description on this patch could be fleshed out a bit.
> > 
> > How much does this help performance and on what workloads? This is a lot
> > of code to add without much justification.
> 
> Hi, Jeff and Zheng, thanks for your review and help:-) I'm sorry that
> I forgot to add the "Fixes" tag in the description of the patches.
> These two patches is supposed to solve the issue:
> http://tracker.ceph.com/issues/36609.
> 

Have you measured the performance to know how much of an impact this
has?

> Basically, the scenario we are facing now is as follows:
>      We have several dozens of web servers all of which mounted the
> same directory for their php scripts and would ask cephfs for the
> latest file content if some of those scripts is modified. To improve
> the web servers' efficiency, each of them runs tens of threads to
> serve web requests, so, as you can see, when some scripts is modified,
> there would be thousands of threads asking for the same file which
> would lead to thousands of getattr/lookup and read reqs to the cephfs
> cluster and cause huge pressure on both MDSes and OSDs.
> 

How often are these files changing?

Between changes to the files, the web server clients should all end up
with read-only caps that allow them to cache the inode attributes. When
there is a change to the file, those will need to be recalled of course
and you might see more activity on the MDS, but that's not necessarily a
problem.

> > Looking over the patch, the basic idea is to implement a cache (in
> > rbtrees) of getattrs and lookups in flight and just wait on those if
> > there already is one that matches what we need. I agree with Zheng that
> > this could end up violating the strict cache coherency guarantees that
> > ceph tries to provide.
> > 
> > You're relying on an assumption that if you issue a getattr/lookup while
> > another one is already in flight, then it's legitimate to use the result
> > of the initial one for the one issued later. That's not necessarily the
> > case. I suspect this would be possible, even from a single client:
> > 
> > - client1/task1 issues stat(), server performs getattr
> > but reply gets delayed for a bit
> > 
> > - client2 does write() + fsync()
> > 
> > - client1/task2 issues stat(), finds the entry in cache from task1 and
> > ends up waiting for first getattr to complete
> > 
> > - getattr reply comes in, and both stat calls get back the same
> > attributes, showing pre-write size/mtime
> > 
> > In this case, the synchronous write by client2 was provably done before
> > the second getattr, but the second stat() call wouldn't see the new
> > size.
> 
> I understand that this is a big problem. If what I think is right,
> this problem may be only with the read handling patch.
> Here's what I think:
>     First, I think the problem would come up only when client1/task2
> issues stat() after the finish of client2's write()+sync(), since,
> only in this case, a pre-write size/mtime seen by client1/task2 would
> cause errors. And a pre-write size/mtime would be seen by
> client1/task2 only when client1/task1's stat() is processed before
> client2's write()+sync(). However, when client1/task1's stat() is
> processed successfully, the inode's filelock should be in states like
> LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
> move the inode's to states like LOCK_MIX which would cause a cap
> revoke from client1. And it's only when client1 acked the revoke can
> the MDS go on processing the metadata modification caused by client2's
> write()+fsync(). Also, MDS sends messages to the same client in the
> same order of the creation of those messages. So, when client1/task2
> issues stat(), the result of the previous one must have been processed
> by client1 and can't be seen by client1/task2, which means this
> problem wouldn't occur. Am I right about this? I don't know if I'm
> missing anything.
> 

There is never a guarantee that the MDS will give you the caps you've
requested, so I don't think you can rely on client1 having to process a
cap revoke to get you out of this situation.

IOW, I believe this is possible:

- client1 requests caps to do a stat(), doesn't get them (maybe because
client2 is actively writing?)
- client1 issues GETATTR to MDS, which gets processed there, but reply
gets delayed
- client2 does write()+fsync()
- client1 issues another stat(), ends up waiting on the first GETATTR,
which won't contain the most up to date information

No cap revoke to client1 here since it never got any in the first place.

> However, as for the read handling patch, this problem do exists, and
> it's really confusing me.
> 

The cap handling code in cephfs is extremely complex. My hat is off to
you for even attempting to understand the MDS LOCK_* handling code.

> > The read handling patch may suffer from similar issues, though I haven't
> > gone over that one in as much detail.
> > 
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index bba28a5034ba..5a177a3c283f 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> > >               goto out;
> > > 
> > >       ci->i_flushing_caps &= ~cleaned;
> > > +     ci->last_flush_ack_tid = flush_tid;
> > > 
> > >       spin_lock(&mdsc->cap_dirty_lock);
> > > 
> > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > index 82928cea0209..6f17c0319158 100644
> > > --- a/fs/ceph/dir.c
> > > +++ b/fs/ceph/dir.c
> > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > >                               }
> > >                       }
> > >               }
> > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> > >       }
> > >       spin_unlock(&dentry->d_lock);
> > > 
> > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > >                                        CEPH_MDS_LEASE_RENEW, seq);
> > >               ceph_put_mds_session(session);
> > >       }
> > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> > >       return valid;
> > >  }
> > > 
> > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> > >       if (!valid) {
> > >               struct ceph_mds_client *mdsc =
> > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> > > -             struct ceph_mds_request *req;
> > > +             struct ceph_mds_request *req = NULL;
> > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> > >               int op, err;
> > >               u32 mask;
> > > 
> > >               if (flags & LOOKUP_RCU)
> > >                       return -ECHILD;
> > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > +             if (ceph_security_xattr_wanted(dir))
> > > +                     mask |= CEPH_CAP_XATTR_SHARED;
> > > 
> > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > -             if (!IS_ERR(req)) {
> > > -                     req->r_dentry = dget(dentry);
> > > -                     req->r_num_caps = 2;
> > > -                     req->r_parent = dir;
> > > +             if (op == CEPH_MDS_OP_LOOKUP) {
> > > +                     mutex_lock(&cdir->lookups_inflight_lock);
> > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> > > +             }
> > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > +                     ceph_mdsc_get_request(req);
> > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> > > +                     err = ceph_mdsc_wait_for_request(req);
> > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > +             } else {
> > > 
> > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > -                     if (ceph_security_xattr_wanted(dir))
> > > -                             mask |= CEPH_CAP_XATTR_SHARED;
> > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > +                             if (!IS_ERR(req)) {
> > > +                                     unsigned long res = 0;
> > > +                                     req->r_dentry = dget(dentry);
> > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> > > +                                     req->r_num_caps = 2;
> > > +                                     req->r_parent = dir;
> > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> > > +                                     if (IS_ERR_VALUE(res)) {
> > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > +                                             goto out;
> > > +                                     }
> > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > +                             }
> > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > +                     }
> > > +                     if (IS_ERR(req))
> > > +                             goto out;
> > > 
> > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > -                     switch (err) {
> > > -                     case 0:
> > > -                             if (d_really_is_positive(dentry) &&
> > > -                                 d_inode(dentry) == req->r_target_inode)
> > > -                                     valid = 1;
> > > -                             break;
> > > -                     case -ENOENT:
> > > -                             if (d_really_is_negative(dentry))
> > > -                                     valid = 1;
> > > -                             /* Fallthrough */
> > > -                     default:
> > > -                             break;
> > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > +                             mutex_lock(&cdir->lookups_inflight_lock);
> > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > >                       }
> > > -                     ceph_mdsc_put_request(req);
> > > -                     dout("d_revalidate %p lookup result=%d\n",
> > > -                          dentry, err);
> > >               }
> > > +             switch (err) {
> > > +             case 0:
> > > +                     if (d_really_is_positive(dentry) &&
> > > +                         d_inode(dentry) == req->r_target_inode)
> > > +                             valid = 1;
> > > +                     break;
> > > +             case -ENOENT:
> > > +                     if (d_really_is_negative(dentry))
> > > +                             valid = 1;
> > > +                     /* Fallthrough */
> > > +             default:
> > > +                     break;
> > > +             }
> > > +             ceph_mdsc_put_request(req);
> > > +             dout("d_revalidate %p lookup result=%d\n",
> > > +                             dentry, err);
> > >       }
> > > 
> > > +out:
> > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> > >       if (valid) {
> > >               ceph_dentry_lru_touch(dentry);
> > 
> > d_revalidate is changed to use the cache, but not lookup. Why?
> 
> Multiple same lookups would also be sent out in our scenario mentioned
> above, so we add a cache for lookups, too. I don't know if I'm getting
> your question correctly.
> 

d_revalidate only checks the validity of an existing dentry. To get a
dentry in the first place, you have to use the directory inode's lookup
or readdir operations. lookup isn't changed here and I was wondering why.

Now that I think about it though, you probably need to do that anyway,
as the VFS code already does something similar when there are multiple
lookups for the same dentry in flight.

> > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > index 9d1f34d46627..8705f0645a24 100644
> > > --- a/fs/ceph/inode.c
> > > +++ b/fs/ceph/inode.c
> > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > >       dout("alloc_inode %p\n", &ci->vfs_inode);
> > > 
> > >       spin_lock_init(&ci->i_ceph_lock);
> > > +     mutex_init(&ci->getattrs_inflight_lock);
> > > +     mutex_init(&ci->lookups_inflight_lock);
> > > 
> > >       ci->i_version = 0;
> > >       ci->i_inline_version = 0;
> > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > >       ci->i_xattrs.index_version = 0;
> > > 
> > >       ci->i_caps = RB_ROOT;
> > > +     ci->getattrs_inflight = RB_ROOT;
> > > +     ci->lookups_inflight = RB_ROOT;
> > >       ci->i_auth_cap = NULL;
> > >       ci->i_dirty_caps = 0;
> > >       ci->i_flushing_caps = 0;
> > > +     ci->last_flush_ack_tid = 0;
> > >       INIT_LIST_HEAD(&ci->i_dirty_item);
> > >       INIT_LIST_HEAD(&ci->i_flushing_item);
> > >       ci->i_prealloc_cap_flush = NULL;
> > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> > >        * we expect a negative dentry.
> > >        */
> > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > >       if (!tgt_vino && d_really_is_positive(dentry))
> > >               return;
> > > -
> > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > >       if (tgt_vino && (d_really_is_negative(dentry) ||
> > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> > >               return;
> > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > >       struct ceph_mds_request *req;
> > >       int mode;
> > >       int err;
> > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> > > 
> > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > >               dout("do_getattr inode %p SNAPDIR\n", inode);
> > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> > >               return 0;
> > > 
> > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > -     if (IS_ERR(req))
> > > -             return PTR_ERR(req);
> > > -     req->r_inode = inode;
> > > -     ihold(inode);
> > > -     req->r_num_caps = 1;
> > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > -     req->r_locked_page = locked_page;
> > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > +     mutex_lock(&cinode->getattrs_inflight_lock);
> > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> > > +     if (req) {
> > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > +             ceph_mdsc_get_request(req);
> > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > +             err = ceph_mdsc_wait_for_request(req);
> > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > +     } else {
> > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > +             if (!IS_ERR(req)) {
> > > +                     unsigned long res = 0;
> > > +                     req->r_inode = inode;
> > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> > > +                     ihold(inode);
> > > +                     req->r_num_caps = 1;
> > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > +                     req->r_locked_page = locked_page;
> > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> > > +                     if (IS_ERR_VALUE(res)) {
> > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > +                             return res;
> > > +                     }
> > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > +             }
> > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > +             if (IS_ERR(req))
> > > +                     return PTR_ERR(req);
> > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > +             mutex_lock(&cinode->getattrs_inflight_lock);
> > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > +     }
> > >       if (locked_page && err == 0) {
> > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> > >               if (inline_version == 0) {
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index 163fc74bf221..5c6e00f2fd0a 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> > >       return 0;
> > >  }
> > > 
> > > +void ceph_mds_requests_tree_release(struct kref* kref)
> > > +{
> > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> > > +
> > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> > > +     kfree(mrtree);
> > > +}
> > > +
> > >  /*
> > >   * Create an mds request.
> > >   */
> > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> > >       req->r_fmode = -1;
> > >       kref_init(&req->r_kref);
> > >       RB_CLEAR_NODE(&req->r_node);
> > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> > >       INIT_LIST_HEAD(&req->r_wait);
> > > +     init_completion(&req->batch_op_completion);
> > >       init_completion(&req->r_completion);
> > >       init_completion(&req->r_safe_completion);
> > >       INIT_LIST_HEAD(&req->r_unsafe_item);
> > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > >       mutex_unlock(&mdsc->mutex);
> > >  }
> > > 
> > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> > > +{
> > > +     int err = 0;
> > > +     long timeleft = wait_for_completion_killable_timeout(
> > > +                     &req->batch_op_completion,
> > > +                     ceph_timeout_jiffies(req->r_timeout));
> > > +     if (timeleft > 0)
> > > +             err = 0;
> > > +     else if (!timeleft)
> > > +             err = -EIO;  /* timed out */
> > > +     else
> > > +             err = timeleft;  /* killed */
> > > +     if (!err)
> > > +             return err;
> > > +     return req->batch_op_err;
> > > +}
> > > +
> > >  /*
> > >   * Synchrously perform an mds request.  Take care of all of the
> > >   * session setup, forwarding, retry details.
> > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > >       } else {
> > >               err = req->r_err;
> > >       }
> > > -
> > > +     req->batch_op_err = err;
> > > +     complete_all(&req->batch_op_completion);
> > >  out:
> > >       mutex_unlock(&mdsc->mutex);
> > >       dout("do_request %p done, result %d\n", req, err);
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index 729da155ebf0..41727677643e 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> > >                                                struct ceph_mds_request *req);
> > > 
> > > +struct mds_request_tree {
> > > +     int mask;
> > > +     struct kref ref;
> > > +     struct rb_root mds_requests;
> > > +     struct rb_node parent_tree_node;
> > > +};
> > > +
> > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> > > +
> > >  /*
> > >   * an in-flight mds request
> > >   */
> > >  struct ceph_mds_request {
> > >       u64 r_tid;                   /* transaction id */
> > > +     u64 last_caps_flush_ack_tid;
> > >       struct rb_node r_node;
> > > +     struct rb_node getattr_lookup_node;
> > >       struct ceph_mds_client *r_mdsc;
> > > 
> > >       int r_op;                    /* mds op code */
> > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> > >       struct ceph_msg  *r_reply;
> > >       struct ceph_mds_reply_info_parsed r_reply_info;
> > >       struct page *r_locked_page;
> > > -     int r_err;
> > > +     int r_err, batch_op_err;
> > > 
> > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> > >       unsigned long r_started;  /* start time to measure timeout against */
> > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> > > 
> > >       struct kref       r_kref;
> > >       struct list_head  r_wait;
> > > +     struct completion batch_op_completion;
> > >       struct completion r_completion;
> > >       struct completion r_safe_completion;
> > >       ceph_mds_request_callback_t r_callback;
> > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > >                                    struct ceph_mds_request *req);
> > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > >                               struct inode *dir,
> > >                               struct ceph_mds_request *req);
> > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > index da2cd8e89062..857fbbc0b768 100644
> > > --- a/fs/ceph/super.c
> > > +++ b/fs/ceph/super.c
> > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> > >       destroy_caches();
> > >  }
> > > 
> > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > +                                       struct ceph_mds_request* req,
> > > +                                       bool is_lookup)
> > > +{
> > > +     struct rb_node *node = NULL;
> > > +     struct mds_request_tree* tmp = NULL;
> > > +     if (!is_lookup)
> > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > +     else
> > > +             node = inode->lookups_inflight.rb_node;
> > > +
> > > +     while (node)
> > > +     {
> > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > +
> > > +             if (tmp->mask > req->r_args.getattr.mask)
> > > +                     node = node->rb_left;
> > > +             else if (tmp->mask < req->r_args.getattr.mask)
> > > +                     node = node->rb_right;
> > > +             else
> > > +                     break;
> > > +     }
> > > +
> > > +     if (!node)
> > > +             BUG();
> > > +
> > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> > > +
> > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> > > +             if (!is_lookup)
> > > +                     rb_erase(node, &inode->getattrs_inflight);
> > > +             else
> > > +                     rb_erase(node, &inode->lookups_inflight);
> > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> > > +     }
> > > +}
> > > +
> > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > +                                     struct ceph_mds_request* req,
> > > +                                     bool is_lookup)
> > > +{
> > > +     struct rb_node **p = NULL, *parent = NULL;
> > > +     struct mds_request_tree *tmp = NULL;
> > > +     struct ceph_mds_request* tmp_req = NULL;
> > > +     struct mds_request_tree* mrtree = NULL;
> > > +
> > > +     if (!is_lookup)
> > > +             p = &ci->getattrs_inflight.rb_node;
> > > +     else
> > > +             p = &ci->lookups_inflight.rb_node;
> > > +
> > > +     while (*p) {
> > > +             parent = *p;
> > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> > > +             if (req->r_args.getattr.mask < tmp->mask)
> > > +                     p = &(*p)->rb_left;
> > > +             else if (req->r_args.getattr.mask > tmp->mask)
> > > +                     p = &(*p)->rb_right;
> > > +             else
> > > +                     break;
> > > +     }
> > > +
> > > +     if (!(*p)) {
> > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> > > +
> > > +             if (!mrtree)
> > > +                     return -ENOMEM;
> > > +
> > > +             mrtree->mds_requests = RB_ROOT;
> > > +             mrtree->mask = req->r_args.getattr.mask;
> > > +             kref_init(&mrtree->ref);
> > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> > > +             if (!is_lookup) {
> > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> > > +             } else {
> > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> > > +             }
> > > +     } else {
> > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> > > +     }
> > > +
> > > +     p = &mrtree->mds_requests.rb_node;
> > > +     parent = NULL;
> > > +     while (*p) {
> > > +             parent = *p;
> > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> > > +                     p = &(*p)->rb_left;
> > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> > > +                     p = &(*p)->rb_right;
> > > +             else
> > > +                     BUG();
> > > +     }
> > > +
> > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> > > +     return 0;
> > > +}
> > > +
> > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > +                                   int mask,
> > > +                                   bool is_lookup)
> > > +{
> > > +     struct rb_node *node = NULL;
> > > +     if (!is_lookup)
> > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > +     else
> > > +             node = inode->lookups_inflight.rb_node;
> > > +
> > > +     while (node)
> > > +     {
> > > +             struct mds_request_tree* tmp = NULL;
> > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > +
> > > +             if (tmp->mask > mask) {
> > > +                     node = node->rb_left;
> > > +             } else if (tmp->mask < mask) {
> > > +                     node = node->rb_right;
> > > +             } else {
> > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> > > +                     while (inner_node) {
> > > +                             struct ceph_mds_request* req = NULL;
> > > +
> > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> > > +                                     inner_node = inner_node->rb_left;
> > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> > > +                                     inner_node = inner_node->rb_right;
> > > +                             } else
> > > +                                     return req; /* Found it */
> > > +                     }
> > > +                     return NULL;
> > > +             }
> > > +     }
> > > +     return NULL;
> > > +}
> > >  module_init(init_ceph);
> > >  module_exit(exit_ceph);
> > > 
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index dfb64a5211b6..abf761f2a122 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> > >       struct ceph_vino i_vino;   /* ceph ino + snap */
> > > 
> > >       spinlock_t i_ceph_lock;
> > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> > > +     struct rb_root getattrs_inflight, lookups_inflight;
> > 
> > If you do end up pursuing this further, I think it might be better to
> > use a spinlock to protect the trees.
> > 
> > Take the lock, search the tree and if there isn't one, drop the lock,
> > allocate, take it and search again, and insert the one you allocated if
> > no conflicting structure was inserted in the meantime.
> > 
> > In addition to cutting down on context switches when these things are
> > contended, that would also make it easier to hide the locking inside the
> > cache search/register/unregister routines.
> 
> You are right. At first, I tried to use spinlock, but that caused
> errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
> know how that occurred. Do you have any clue about that?
> 
> I can't thank you enough for the help:-) Thanks very much:-)


Often that means you forgot to unlock something, or maybe hit some lock
inversion? Without more analysis it's impossible to know. :)

> > >       u64 i_version;
> > >       u64 i_inline_version;
> > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> > >       struct rb_root i_caps;           /* cap list */
> > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> > > +     u64 last_flush_ack_tid;
> > >       struct list_head i_dirty_item, i_flushing_item;
> > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> > >        * overlapping, pipelined cap flushes to the mds.  we can probably
> > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> > >                               u64 time_warp_seq, struct timespec64 *ctime,
> > >                               struct timespec64 *mtime,
> > >                               struct timespec64 *atime);
> > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > +                                            struct ceph_mds_request* req,
> > > +                                            bool is_lookup);
> > > +
> > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > +                                              struct ceph_mds_request* req,
> > > +                                              bool is_lookup);
> > > +
> > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > +                                          int mask,
> > > +                                          bool is_lookup);
> > >  extern int ceph_fill_trace(struct super_block *sb,
> > >                          struct ceph_mds_request *req);
> > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> >
Xuehan Xu Feb. 27, 2019, 3:13 p.m. UTC | #5
On Wed, 27 Feb 2019 at 20:08, Jeff Layton <jlayton@redhat.com> wrote:
>
> On Wed, 2019-02-27 at 11:31 +0800, Xuehan Xu wrote:
> > On Wed, 27 Feb 2019 at 07:54, Jeff Layton <jlayton@redhat.com> wrote:
> > > On Mon, 2019-02-25 at 20:29 +0800, xxhdx1985126@gmail.com wrote:
> > > > From: Xuehan Xu <xuxuehan@360.cn>
> > > >
> > > > Instead of issue a new getattr/lookup req to MDSes for each getattr/lookup
> > > > op, issue a new one if there is no inflight req that requires that same caps
> > > > as the current getattr/lookup op.
> > > >
> > > > Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
> > > > ---
> > > >  fs/ceph/caps.c       |   1 +
> > > >  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
> > > >  fs/ceph/inode.c      |  55 +++++++++++++----
> > > >  fs/ceph/mds_client.c |  30 +++++++++-
> > > >  fs/ceph/mds_client.h |  15 ++++-
> > > >  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/ceph/super.h      |  14 +++++
> > > >  7 files changed, 304 insertions(+), 39 deletions(-)
> > > >
> > >
> > > The description on this patch could be fleshed out a bit.
> > >
> > > How much does this help performance and on what workloads? This is a lot
> > > of code to add without much justification.
> >
> > Hi, Jeff and Zheng, thanks for your review and help:-) I'm sorry that
> > I forgot to add the "Fixes" tag in the description of the patches.
> > These two patches is supposed to solve the issue:
> > http://tracker.ceph.com/issues/36609.
> >
>
> Have you measured the performance to know how much of an impact this
> has?

Hi, Jeff. Thanks for your reply:-)
 In our case, those requests can take more than ten seconds to finish,
which is intolerable to web servers. Actually, every time this
scenario occurs, it makes our monitoring system wrongly recognize
those web servers as down and shift all web traffic to other IDCs.

>
> > Basically, the scenario we are facing now is as follows:
> >      We have several dozens of web servers all of which mounted the
> > same directory for their php scripts and would ask cephfs for the
> > latest file content if some of those scripts is modified. To improve
> > the web servers' efficiency, each of them runs tens of threads to
> > serve web requests, so, as you can see, when some scripts is modified,
> > there would be thousands of threads asking for the same file which
> > would lead to thousands of getattr/lookup and read reqs to the cephfs
> > cluster and cause huge pressure on both MDSes and OSDs.
> >
>
> How often are these files changing?
>
> Between changes to the files, the web server clients should all end up
> with read-only caps that allow them to cache the inode attributes. When
> there is a change to the file, those will need to be recalled of course
> and you might see more activity on the MDS, but that's not necessarily a
> problem.

File changes are not very frequent, maybe one or two times a week. But,
since each time files get changed the scenario mentioned above occurs,
this problem really makes us suffer.

>
> > > Looking over the patch, the basic idea is to implement a cache (in
> > > rbtrees) of getattrs and lookups in flight and just wait on those if
> > > there already is one that matches what we need. I agree with Zheng that
> > > this could end up violating the strict cache coherency guarantees that
> > > ceph tries to provide.
> > >
> > > You're relying on an assumption that if you issue a getattr/lookup while
> > > another one is already in flight, then it's legitimate to use the result
> > > of the initial one for the one issued later. That's not necessarily the
> > > case. I suspect this would be possible, even from a single client:
> > >
> > > - client1/task1 issues stat(), server performs getattr
> > > but reply gets delayed for a bit
> > >
> > > - client2 does write() + fsync()
> > >
> > > - client1/task2 issues stat(), finds the entry in cache from task1 and
> > > ends up waiting for first getattr to complete
> > >
> > > - getattr reply comes in, and both stat calls get back the same
> > > attributes, showing pre-write size/mtime
> > >
> > > In this case, the synchronous write by client2 was provably done before
> > > the second getattr, but the second stat() call wouldn't see the new
> > > size.
> >
> > I understand that this is a big problem. If what I think is right,
> > this problem may be only with the read handling patch.
> > Here's what I think:
> >     First, I think the problem would come up only when client1/task2
> > issues stat() after the finish of client2's write()+sync(), since,
> > only in this case, a pre-write size/mtime seen by client1/task2 would
> > cause errors. And a pre-write size/mtime would be seen by
> > client1/task2 only when client1/task1's stat() is processed before
> > client2's write()+sync(). However, when client1/task1's stat() is
> > processed successfully, the inode's filelock should be in states like
> > LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
> > move the inode's to states like LOCK_MIX which would cause a cap
> > revoke from client1. And it's only when client1 acked the revoke can
> > the MDS go on processing the metadata modification caused by client2's
> > write()+fsync(). Also, MDS sends messages to the same client in the
> > same order of the creation of those messages. So, when client1/task2
> > issues stat(), the result of the previous one must have been processed
> > by client1 and can't be seen by client1/task2, which means this
> > problem wouldn't occur. Am I right about this? I don't know if I'm
> > missing anything.
> >
>
> There is never a guarantee that the MDS will give you the caps you've
> requested, so I don't think you can rely on client1 having to process a
> cap revoke to get you out of this situation.
>
> IOW, I believe this is possible:
>
> - client1 requests caps to do a stat(), doesn't get them (maybe because
> client2 is actively writing?)
> - client1 issues GETATTR to MDS, which gets processed there, but reply
> gets delayed
> - client2 does write()+fsync()
> - client1 issues another stat(), ends up waiting on the first GETATTR,
> which won't contain the most up to date information
>
> No cap revoke to client1 here since it never got any in the first place.

Yes, you are right. But I think, in this case, the first GETATTR would
get processed after client2's write()+fsync(), which means it would
get the latest inode content. So the problem wouldn't occur, either.
Am I right?

>
> > However, as for the read handling patch, this problem does exist, and
> > it's really confusing me.
> >
>
> The cap handling code in cephfs is extremely complex. My hat is off to
> you for even attempting to understand the MDS LOCK_* handling code.
>
> > > The read handling patch may suffer from similar issues, though I haven't
> > > gone over that one in as much detail.
> > >
> > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > index bba28a5034ba..5a177a3c283f 100644
> > > > --- a/fs/ceph/caps.c
> > > > +++ b/fs/ceph/caps.c
> > > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> > > >               goto out;
> > > >
> > > >       ci->i_flushing_caps &= ~cleaned;
> > > > +     ci->last_flush_ack_tid = flush_tid;
> > > >
> > > >       spin_lock(&mdsc->cap_dirty_lock);
> > > >
> > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > index 82928cea0209..6f17c0319158 100644
> > > > --- a/fs/ceph/dir.c
> > > > +++ b/fs/ceph/dir.c
> > > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > >                               }
> > > >                       }
> > > >               }
> > > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> > > >       }
> > > >       spin_unlock(&dentry->d_lock);
> > > >
> > > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > >                                        CEPH_MDS_LEASE_RENEW, seq);
> > > >               ceph_put_mds_session(session);
> > > >       }
> > > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> > > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> > > >       return valid;
> > > >  }
> > > >
> > > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> > > >       if (!valid) {
> > > >               struct ceph_mds_client *mdsc =
> > > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> > > > -             struct ceph_mds_request *req;
> > > > +             struct ceph_mds_request *req = NULL;
> > > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> > > >               int op, err;
> > > >               u32 mask;
> > > >
> > > >               if (flags & LOOKUP_RCU)
> > > >                       return -ECHILD;
> > > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > +             if (ceph_security_xattr_wanted(dir))
> > > > +                     mask |= CEPH_CAP_XATTR_SHARED;
> > > >
> > > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> > > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> > > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > -             if (!IS_ERR(req)) {
> > > > -                     req->r_dentry = dget(dentry);
> > > > -                     req->r_num_caps = 2;
> > > > -                     req->r_parent = dir;
> > > > +             if (op == CEPH_MDS_OP_LOOKUP) {
> > > > +                     mutex_lock(&cdir->lookups_inflight_lock);
> > > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> > > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> > > > +             }
> > > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> > > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > +                     ceph_mdsc_get_request(req);
> > > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> > > > +                     err = ceph_mdsc_wait_for_request(req);
> > > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > +             } else {
> > > >
> > > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > -                     if (ceph_security_xattr_wanted(dir))
> > > > -                             mask |= CEPH_CAP_XATTR_SHARED;
> > > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > +                             if (!IS_ERR(req)) {
> > > > +                                     unsigned long res = 0;
> > > > +                                     req->r_dentry = dget(dentry);
> > > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> > > > +                                     req->r_num_caps = 2;
> > > > +                                     req->r_parent = dir;
> > > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> > > > +                                     if (IS_ERR_VALUE(res)) {
> > > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > +                                             goto out;
> > > > +                                     }
> > > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > +                             }
> > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > +                     }
> > > > +                     if (IS_ERR(req))
> > > > +                             goto out;
> > > >
> > > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > -                     switch (err) {
> > > > -                     case 0:
> > > > -                             if (d_really_is_positive(dentry) &&
> > > > -                                 d_inode(dentry) == req->r_target_inode)
> > > > -                                     valid = 1;
> > > > -                             break;
> > > > -                     case -ENOENT:
> > > > -                             if (d_really_is_negative(dentry))
> > > > -                                     valid = 1;
> > > > -                             /* Fallthrough */
> > > > -                     default:
> > > > -                             break;
> > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > +                             mutex_lock(&cdir->lookups_inflight_lock);
> > > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> > > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > >                       }
> > > > -                     ceph_mdsc_put_request(req);
> > > > -                     dout("d_revalidate %p lookup result=%d\n",
> > > > -                          dentry, err);
> > > >               }
> > > > +             switch (err) {
> > > > +             case 0:
> > > > +                     if (d_really_is_positive(dentry) &&
> > > > +                         d_inode(dentry) == req->r_target_inode)
> > > > +                             valid = 1;
> > > > +                     break;
> > > > +             case -ENOENT:
> > > > +                     if (d_really_is_negative(dentry))
> > > > +                             valid = 1;
> > > > +                     /* Fallthrough */
> > > > +             default:
> > > > +                     break;
> > > > +             }
> > > > +             ceph_mdsc_put_request(req);
> > > > +             dout("d_revalidate %p lookup result=%d\n",
> > > > +                             dentry, err);
> > > >       }
> > > >
> > > > +out:
> > > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> > > >       if (valid) {
> > > >               ceph_dentry_lru_touch(dentry);
> > >
> > > d_revalidate is changed to use the cache, but not lookup. Why?
> >
> > Multiple same lookups would also be sent out in our scenario mentioned
> > above, so we add a cache for lookups, too. I don't know if I'm getting
> > your question correctly.
> >
>
> d_revalidate only checks the validity of an existing dentry. To get a
> dentry in the first place, you have to use the directory inode's lookup
> readdir operations. lookup isn't changed here and I was wondering why.
>
> Now that I think about it though, you probably need to do that anyway,
> as the VFS code already does something similar when there are multiple
> lookups for the same dentry in flight.

It seems that commit 200fd27c8fa2ba8bb4529033967b69a7cbfa2c2e made the
ceph kernel module use lookup requests to revalidate dentries.

>
> > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > index 9d1f34d46627..8705f0645a24 100644
> > > > --- a/fs/ceph/inode.c
> > > > +++ b/fs/ceph/inode.c
> > > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > >       dout("alloc_inode %p\n", &ci->vfs_inode);
> > > >
> > > >       spin_lock_init(&ci->i_ceph_lock);
> > > > +     mutex_init(&ci->getattrs_inflight_lock);
> > > > +     mutex_init(&ci->lookups_inflight_lock);
> > > >
> > > >       ci->i_version = 0;
> > > >       ci->i_inline_version = 0;
> > > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > >       ci->i_xattrs.index_version = 0;
> > > >
> > > >       ci->i_caps = RB_ROOT;
> > > > +     ci->getattrs_inflight = RB_ROOT;
> > > > +     ci->lookups_inflight = RB_ROOT;
> > > >       ci->i_auth_cap = NULL;
> > > >       ci->i_dirty_caps = 0;
> > > >       ci->i_flushing_caps = 0;
> > > > +     ci->last_flush_ack_tid = 0;
> > > >       INIT_LIST_HEAD(&ci->i_dirty_item);
> > > >       INIT_LIST_HEAD(&ci->i_flushing_item);
> > > >       ci->i_prealloc_cap_flush = NULL;
> > > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> > > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> > > >        * we expect a negative dentry.
> > > >        */
> > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > >       if (!tgt_vino && d_really_is_positive(dentry))
> > > >               return;
> > > > -
> > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > >       if (tgt_vino && (d_really_is_negative(dentry) ||
> > > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> > > >               return;
> > > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > >       struct ceph_mds_request *req;
> > > >       int mode;
> > > >       int err;
> > > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> > > >
> > > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > > >               dout("do_getattr inode %p SNAPDIR\n", inode);
> > > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> > > >               return 0;
> > > >
> > > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > -     if (IS_ERR(req))
> > > > -             return PTR_ERR(req);
> > > > -     req->r_inode = inode;
> > > > -     ihold(inode);
> > > > -     req->r_num_caps = 1;
> > > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > -     req->r_locked_page = locked_page;
> > > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > +     mutex_lock(&cinode->getattrs_inflight_lock);
> > > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> > > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> > > > +     if (req) {
> > > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > +             ceph_mdsc_get_request(req);
> > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > +             err = ceph_mdsc_wait_for_request(req);
> > > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > +     } else {
> > > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > +             if (!IS_ERR(req)) {
> > > > +                     unsigned long res = 0;
> > > > +                     req->r_inode = inode;
> > > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> > > > +                     ihold(inode);
> > > > +                     req->r_num_caps = 1;
> > > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > +                     req->r_locked_page = locked_page;
> > > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> > > > +                     if (IS_ERR_VALUE(res)) {
> > > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > +                             return res;
> > > > +                     }
> > > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > +             }
> > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > +             if (IS_ERR(req))
> > > > +                     return PTR_ERR(req);
> > > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > +             mutex_lock(&cinode->getattrs_inflight_lock);
> > > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> > > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > +     }
> > > >       if (locked_page && err == 0) {
> > > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> > > >               if (inline_version == 0) {
> > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > index 163fc74bf221..5c6e00f2fd0a 100644
> > > > --- a/fs/ceph/mds_client.c
> > > > +++ b/fs/ceph/mds_client.c
> > > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> > > >       return 0;
> > > >  }
> > > >
> > > > +void ceph_mds_requests_tree_release(struct kref* kref)
> > > > +{
> > > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> > > > +
> > > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> > > > +     kfree(mrtree);
> > > > +}
> > > > +
> > > >  /*
> > > >   * Create an mds request.
> > > >   */
> > > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> > > >       req->r_fmode = -1;
> > > >       kref_init(&req->r_kref);
> > > >       RB_CLEAR_NODE(&req->r_node);
> > > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> > > >       INIT_LIST_HEAD(&req->r_wait);
> > > > +     init_completion(&req->batch_op_completion);
> > > >       init_completion(&req->r_completion);
> > > >       init_completion(&req->r_safe_completion);
> > > >       INIT_LIST_HEAD(&req->r_unsafe_item);
> > > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > >       mutex_unlock(&mdsc->mutex);
> > > >  }
> > > >
> > > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> > > > +{
> > > > +     int err = 0;
> > > > +     long timeleft = wait_for_completion_killable_timeout(
> > > > +                     &req->batch_op_completion,
> > > > +                     ceph_timeout_jiffies(req->r_timeout));
> > > > +     if (timeleft > 0)
> > > > +             err = 0;
> > > > +     else if (!timeleft)
> > > > +             err = -EIO;  /* timed out */
> > > > +     else
> > > > +             err = timeleft;  /* killed */
> > > > +     if (!err)
> > > > +             return err;
> > > > +     return req->batch_op_err;
> > > > +}
> > > > +
> > > >  /*
> > > >   * Synchrously perform an mds request.  Take care of all of the
> > > >   * session setup, forwarding, retry details.
> > > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > >       } else {
> > > >               err = req->r_err;
> > > >       }
> > > > -
> > > > +     req->batch_op_err = err;
> > > > +     complete_all(&req->batch_op_completion);
> > > >  out:
> > > >       mutex_unlock(&mdsc->mutex);
> > > >       dout("do_request %p done, result %d\n", req, err);
> > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > index 729da155ebf0..41727677643e 100644
> > > > --- a/fs/ceph/mds_client.h
> > > > +++ b/fs/ceph/mds_client.h
> > > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> > > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> > > >                                                struct ceph_mds_request *req);
> > > >
> > > > +struct mds_request_tree {
> > > > +     int mask;
> > > > +     struct kref ref;
> > > > +     struct rb_root mds_requests;
> > > > +     struct rb_node parent_tree_node;
> > > > +};
> > > > +
> > > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> > > > +
> > > >  /*
> > > >   * an in-flight mds request
> > > >   */
> > > >  struct ceph_mds_request {
> > > >       u64 r_tid;                   /* transaction id */
> > > > +     u64 last_caps_flush_ack_tid;
> > > >       struct rb_node r_node;
> > > > +     struct rb_node getattr_lookup_node;
> > > >       struct ceph_mds_client *r_mdsc;
> > > >
> > > >       int r_op;                    /* mds op code */
> > > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> > > >       struct ceph_msg  *r_reply;
> > > >       struct ceph_mds_reply_info_parsed r_reply_info;
> > > >       struct page *r_locked_page;
> > > > -     int r_err;
> > > > +     int r_err, batch_op_err;
> > > >
> > > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> > > >       unsigned long r_started;  /* start time to measure timeout against */
> > > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> > > >
> > > >       struct kref       r_kref;
> > > >       struct list_head  r_wait;
> > > > +     struct completion batch_op_completion;
> > > >       struct completion r_completion;
> > > >       struct completion r_safe_completion;
> > > >       ceph_mds_request_callback_t r_callback;
> > > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> > > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> > > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > >                                    struct ceph_mds_request *req);
> > > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> > > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > >                               struct inode *dir,
> > > >                               struct ceph_mds_request *req);
> > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > index da2cd8e89062..857fbbc0b768 100644
> > > > --- a/fs/ceph/super.c
> > > > +++ b/fs/ceph/super.c
> > > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> > > >       destroy_caches();
> > > >  }
> > > >
> > > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > +                                       struct ceph_mds_request* req,
> > > > +                                       bool is_lookup)
> > > > +{
> > > > +     struct rb_node *node = NULL;
> > > > +     struct mds_request_tree* tmp = NULL;
> > > > +     if (!is_lookup)
> > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > +     else
> > > > +             node = inode->lookups_inflight.rb_node;
> > > > +
> > > > +     while (node)
> > > > +     {
> > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > +
> > > > +             if (tmp->mask > req->r_args.getattr.mask)
> > > > +                     node = node->rb_left;
> > > > +             else if (tmp->mask < req->r_args.getattr.mask)
> > > > +                     node = node->rb_right;
> > > > +             else
> > > > +                     break;
> > > > +     }
> > > > +
> > > > +     if (!node)
> > > > +             BUG();
> > > > +
> > > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> > > > +
> > > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> > > > +             if (!is_lookup)
> > > > +                     rb_erase(node, &inode->getattrs_inflight);
> > > > +             else
> > > > +                     rb_erase(node, &inode->lookups_inflight);
> > > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> > > > +     }
> > > > +}
> > > > +
> > > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > +                                     struct ceph_mds_request* req,
> > > > +                                     bool is_lookup)
> > > > +{
> > > > +     struct rb_node **p = NULL, *parent = NULL;
> > > > +     struct mds_request_tree *tmp = NULL;
> > > > +     struct ceph_mds_request* tmp_req = NULL;
> > > > +     struct mds_request_tree* mrtree = NULL;
> > > > +
> > > > +     if (!is_lookup)
> > > > +             p = &ci->getattrs_inflight.rb_node;
> > > > +     else
> > > > +             p = &ci->lookups_inflight.rb_node;
> > > > +
> > > > +     while (*p) {
> > > > +             parent = *p;
> > > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> > > > +             if (req->r_args.getattr.mask < tmp->mask)
> > > > +                     p = &(*p)->rb_left;
> > > > +             else if (req->r_args.getattr.mask > tmp->mask)
> > > > +                     p = &(*p)->rb_right;
> > > > +             else
> > > > +                     break;
> > > > +     }
> > > > +
> > > > +     if (!(*p)) {
> > > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> > > > +
> > > > +             if (!mrtree)
> > > > +                     return -ENOMEM;
> > > > +
> > > > +             mrtree->mds_requests = RB_ROOT;
> > > > +             mrtree->mask = req->r_args.getattr.mask;
> > > > +             kref_init(&mrtree->ref);
> > > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> > > > +             if (!is_lookup) {
> > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> > > > +             } else {
> > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> > > > +             }
> > > > +     } else {
> > > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> > > > +     }
> > > > +
> > > > +     p = &mrtree->mds_requests.rb_node;
> > > > +     parent = NULL;
> > > > +     while (*p) {
> > > > +             parent = *p;
> > > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> > > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> > > > +                     p = &(*p)->rb_left;
> > > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> > > > +                     p = &(*p)->rb_right;
> > > > +             else
> > > > +                     BUG();
> > > > +     }
> > > > +
> > > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> > > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> > > > +     return 0;
> > > > +}
> > > > +
> > > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > +                                   int mask,
> > > > +                                   bool is_lookup)
> > > > +{
> > > > +     struct rb_node *node = NULL;
> > > > +     if (!is_lookup)
> > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > +     else
> > > > +             node = inode->lookups_inflight.rb_node;
> > > > +
> > > > +     while (node)
> > > > +     {
> > > > +             struct mds_request_tree* tmp = NULL;
> > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > +
> > > > +             if (tmp->mask > mask) {
> > > > +                     node = node->rb_left;
> > > > +             } else if (tmp->mask < mask) {
> > > > +                     node = node->rb_right;
> > > > +             } else {
> > > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> > > > +                     while (inner_node) {
> > > > +                             struct ceph_mds_request* req = NULL;
> > > > +
> > > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> > > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> > > > +                                     inner_node = inner_node->rb_left;
> > > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> > > > +                                     inner_node = inner_node->rb_right;
> > > > +                             } else
> > > > +                                     return req; /* Found it */
> > > > +                     }
> > > > +                     return NULL;
> > > > +             }
> > > > +     }
> > > > +     return NULL;
> > > > +}
> > > >  module_init(init_ceph);
> > > >  module_exit(exit_ceph);
> > > >
> > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > index dfb64a5211b6..abf761f2a122 100644
> > > > --- a/fs/ceph/super.h
> > > > +++ b/fs/ceph/super.h
> > > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> > > >       struct ceph_vino i_vino;   /* ceph ino + snap */
> > > >
> > > >       spinlock_t i_ceph_lock;
> > > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> > > > +     struct rb_root getattrs_inflight, lookups_inflight;
> > >
> > > If you do end up pursuing this further, I think it might be better to
> > > use a spinlock to protect the trees.
> > >
> > > Take the lock, search the tree and if there isn't one, drop the lock,
> > > allocate, take it and search again, and insert the one you allocated if
> > > no conflicting structure was inserted in the meantime.
> > >
> > > In addition to cutting down on context switches when these things are
> > > contended, that would also make it easier to hide the locking inside the
> > > cache search/register/unregister routines.
> >
> > You are right. At first, I tried to use spinlock, but that caused
> > errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
> > know how that occurred. Do you have any clew about that?
> >
> > I can't thank you enough for the help:-) Thanks very much:-)
>
>
> Often that means you forgot to unlock something, or maybe hit some lock
> inversion? Without more analysis it's impossible to know. :)

Thanks:-) I wouldn't try to debug that:-)
>
> > > >       u64 i_version;
> > > >       u64 i_inline_version;
> > > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> > > >       struct rb_root i_caps;           /* cap list */
> > > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> > > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> > > > +     u64 last_flush_ack_tid;
> > > >       struct list_head i_dirty_item, i_flushing_item;
> > > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> > > >        * overlapping, pipelined cap flushes to the mds.  we can probably
> > > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> > > >                               u64 time_warp_seq, struct timespec64 *ctime,
> > > >                               struct timespec64 *mtime,
> > > >                               struct timespec64 *atime);
> > > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > +                                            struct ceph_mds_request* req,
> > > > +                                            bool is_lookup);
> > > > +
> > > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > +                                              struct ceph_mds_request* req,
> > > > +                                              bool is_lookup);
> > > > +
> > > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > +                                          int mask,
> > > > +                                          bool is_lookup);
> > > >  extern int ceph_fill_trace(struct super_block *sb,
> > > >                          struct ceph_mds_request *req);
> > > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> > >
>
> --
> Jeff Layton <jlayton@redhat.com>
>
Xuehan Xu Feb. 27, 2019, 3:21 p.m. UTC | #6
On Wed, 27 Feb 2019 at 23:13, Xuehan Xu <xxhdx1985126@gmail.com> wrote:
>
> On Wed, 27 Feb 2019 at 20:08, Jeff Layton <jlayton@redhat.com> wrote:
> >
> > On Wed, 2019-02-27 at 11:31 +0800, Xuehan Xu wrote:
> > > On Wed, 27 Feb 2019 at 07:54, Jeff Layton <jlayton@redhat.com> wrote:
> > > > On Mon, 2019-02-25 at 20:29 +0800, xxhdx1985126@gmail.com wrote:
> > > > > From: Xuehan Xu <xuxuehan@360.cn>
> > > > >
> > > > > Instead of issuing a new getattr/lookup req to MDSes for each getattr/lookup
> > > > > op, issue a new one only if there is no inflight req that requires the same caps
> > > > > as the current getattr/lookup op.
> > > > >
> > > > > Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
> > > > > ---
> > > > >  fs/ceph/caps.c       |   1 +
> > > > >  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
> > > > >  fs/ceph/inode.c      |  55 +++++++++++++----
> > > > >  fs/ceph/mds_client.c |  30 +++++++++-
> > > > >  fs/ceph/mds_client.h |  15 ++++-
> > > > >  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/ceph/super.h      |  14 +++++
> > > > >  7 files changed, 304 insertions(+), 39 deletions(-)
> > > > >
> > > >
> > > > The description on this patch could be fleshed out a bit.
> > > >
> > > > How much does this help performance and on what workloads? This is a lot
> > > > of code to add without much justification.
> > >
> > > Hi, Jeff and Zheng, thanks for your review and help:-) I'm sorry that
> > > I forgot to add the "Fixes" tag in the description of the patches.
> > > These two patches is supposed to solve the issue:
> > > http://tracker.ceph.com/issues/36609.
> > >
> >
> > Have you measured the performance to know how much of an impact this
> > has?
>
> Hi, Jeff. Thanks for your reply:-)
>  In our case, those requests can take more than ten seconds to finish,
> which is intolerable to web servers. Actually, every time this
> scenario occurs, it makes our monitoring system wrongly recognize
> those web servers as down and shift all web traffic to other IDCs.
>
> >
> > > Basically, the scenario we are facing now is as follows:
> > >      We have several dozens of web servers all of which mounted the
> > > same directory for their php scripts and would ask cephfs for the
> > > latest file content if some of those scripts is modified. To improve
> > > the web servers' efficiency, each of them runs tens of threads to
> > > serve web requests, so, as you can see, when some scripts is modified,
> > > there would be thousands of threads asking for the same file which
> > > would lead to thousands of getattr/lookup and read reqs to the cephfs
> > > cluster and cause huge pressure on both MDSes and OSDs.
> > >
> >
> > How often are these files changing?
> >
> > Between changes to the files, the web server clients should all end up
> > with read-only caps that allow them to cache the inode attributes. When
> > there is a change to the file, those will need to be recalled of course
> > and you might see more activity on the MDS, but that's not necessarily a
> > problem.
>
> File changes are not very frequent, maybe once or twice a week. But,
> since the scenario mentioned above occurs each time the files change,
> this problem really makes us suffer.
>
> >
> > > > Looking over the patch, the basic idea is to implement a cache (in
> > > > rbtrees) of getattrs and lookups in flight and just wait on those if
> > > > there already is one that matches what we need. I agree with Zheng that
> > > > this could end up violating the strict cache coherency guarantees that
> > > > ceph tries to provide.
> > > >
> > > > You're relying on an assumption that if you issue a getattr/lookup while
> > > > another one is already in flight, then it's legitimate to use the result
> > > > of the initial one for the one issued later. That's not necessarily the
> > > > case. I suspect this would be possible, even from a single client:
> > > >
> > > > - client1/task1 issues stat(), server performs getattr
> > > > but reply gets delayed for a bit
> > > >
> > > > - client2 does write() + fsync()
> > > >
> > > > - client1/task2 issues stat(), finds the entry in cache from task1 and
> > > > ends up waiting for first getattr to complete
> > > >
> > > > - getattr reply comes in, and both stat calls get back the same
> > > > attributes, showing pre-write size/mtime
> > > >
> > > > In this case, the synchronous write by client2 was provably done before
> > > > the second getattr, but the second stat() call wouldn't see the new
> > > > size.
> > >
> > > I understand that this is a big problem. If what I think is right,
> > > this problem may be only with the read handling patch.
> > > Here's what I think:
> > >     First, I think the problem would come up only when client1/task2
> > > issues stat() after the finish of client2's write()+sync(), since,
> > > only in this case, a pre-write size/mtime seen by client1/task2 would
> > > cause errors. And a pre-write size/mtime would be seen by
> > > client1/task2 only when client1/task1's stat() is processed before
> > > client2's write()+sync(). However, when client1/task1's stat() is
> > > processed successfully, the inode's filelock should be in states like
> > > LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
> > > move the inode's to states like LOCK_MIX which would cause a cap
> > > revoke from client1. And only when client1 has acked the revoke can
> > > the MDS go on processing the metadata modification caused by client2's
> > > write()+fsync(). Also, the MDS sends messages to the same client in the
> > > same order of the creation of those messages. So, when client1/task2
> > > issues stat(), the result of the previous one must have been processed
> > > by client1 and can't be seen by client1/task2, which means this
> > > problem wouldn't occur. Am I right about this? I don't know if I'm
> > > missing anything.
> > >
> >
> > There is never a guarantee that the MDS will give you the caps you've
> > requested, so I don't think you can rely on client1 having to process a
> > cap revoke to get you out of this situation.
> >
> > IOW, I believe this is possible:
> >
> > - client1 requests caps to do a stat(), doesn't get them (maybe because
> > client2 is actively writing?)
> > - client1 issues GETATTR to MDS, which gets processed there, but reply
> > gets delayed
> > - client2 does write()+fsync()
> > - client1 issues another stat(), ends up waiting on the first GETATTR,
> > which won't contain the most up to date information
> >
> > No cap revoke to client1 here since it never got any in the first place.
>
> Yes, you are right. But I think, in this case, the first GETATTR would
> get processed after client2's write()+fsync(), which means it would
> get the latest inode content. So the problem wouldn't occur, either.
> Am I right?
>
> >
> > > However, as for the read handling patch, this problem do exists, and
> > > it's really confusing me.
> > >
> >
> > The cap handling code in cephfs is extremely complex. My hat is off to
> > you for even attempting to understand the MDS LOCK_* handling code.
> >
> > > > The read handling patch may suffer from similar issues, though I haven't
> > > > gone over that one in as much detail.
> > > >
> > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > index bba28a5034ba..5a177a3c283f 100644
> > > > > --- a/fs/ceph/caps.c
> > > > > +++ b/fs/ceph/caps.c
> > > > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> > > > >               goto out;
> > > > >
> > > > >       ci->i_flushing_caps &= ~cleaned;
> > > > > +     ci->last_flush_ack_tid = flush_tid;
> > > > >
> > > > >       spin_lock(&mdsc->cap_dirty_lock);
> > > > >
> > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > index 82928cea0209..6f17c0319158 100644
> > > > > --- a/fs/ceph/dir.c
> > > > > +++ b/fs/ceph/dir.c
> > > > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > >                               }
> > > > >                       }
> > > > >               }
> > > > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> > > > >       }
> > > > >       spin_unlock(&dentry->d_lock);
> > > > >
> > > > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > >                                        CEPH_MDS_LEASE_RENEW, seq);
> > > > >               ceph_put_mds_session(session);
> > > > >       }
> > > > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> > > > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> > > > >       return valid;
> > > > >  }
> > > > >
> > > > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> > > > >       if (!valid) {
> > > > >               struct ceph_mds_client *mdsc =
> > > > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> > > > > -             struct ceph_mds_request *req;
> > > > > +             struct ceph_mds_request *req = NULL;
> > > > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> > > > >               int op, err;
> > > > >               u32 mask;
> > > > >
> > > > >               if (flags & LOOKUP_RCU)
> > > > >                       return -ECHILD;
> > > > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > +             if (ceph_security_xattr_wanted(dir))
> > > > > +                     mask |= CEPH_CAP_XATTR_SHARED;
> > > > >
> > > > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> > > > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> > > > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > -             if (!IS_ERR(req)) {
> > > > > -                     req->r_dentry = dget(dentry);
> > > > > -                     req->r_num_caps = 2;
> > > > > -                     req->r_parent = dir;
> > > > > +             if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                     mutex_lock(&cdir->lookups_inflight_lock);
> > > > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> > > > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> > > > > +             }
> > > > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > +                     ceph_mdsc_get_request(req);
> > > > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > +                     err = ceph_mdsc_wait_for_request(req);
> > > > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > +             } else {
> > > > >
> > > > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > -                     if (ceph_security_xattr_wanted(dir))
> > > > > -                             mask |= CEPH_CAP_XATTR_SHARED;
> > > > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                             if (!IS_ERR(req)) {
> > > > > +                                     unsigned long res = 0;
> > > > > +                                     req->r_dentry = dget(dentry);
> > > > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> > > > > +                                     req->r_num_caps = 2;
> > > > > +                                     req->r_parent = dir;
> > > > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> > > > > +                                     if (IS_ERR_VALUE(res)) {
> > > > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > +                                             goto out;
> > > > > +                                     }
> > > > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > +                             }
> > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > +                     }
> > > > > +                     if (IS_ERR(req))
> > > > > +                             goto out;
> > > > >
> > > > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > -                     switch (err) {
> > > > > -                     case 0:
> > > > > -                             if (d_really_is_positive(dentry) &&
> > > > > -                                 d_inode(dentry) == req->r_target_inode)
> > > > > -                                     valid = 1;
> > > > > -                             break;
> > > > > -                     case -ENOENT:
> > > > > -                             if (d_really_is_negative(dentry))
> > > > > -                                     valid = 1;
> > > > > -                             /* Fallthrough */
> > > > > -                     default:
> > > > > -                             break;
> > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                             mutex_lock(&cdir->lookups_inflight_lock);
> > > > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> > > > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > >                       }
> > > > > -                     ceph_mdsc_put_request(req);
> > > > > -                     dout("d_revalidate %p lookup result=%d\n",
> > > > > -                          dentry, err);
> > > > >               }
> > > > > +             switch (err) {
> > > > > +             case 0:
> > > > > +                     if (d_really_is_positive(dentry) &&
> > > > > +                         d_inode(dentry) == req->r_target_inode)
> > > > > +                             valid = 1;
> > > > > +                     break;
> > > > > +             case -ENOENT:
> > > > > +                     if (d_really_is_negative(dentry))
> > > > > +                             valid = 1;
> > > > > +                     /* Fallthrough */
> > > > > +             default:
> > > > > +                     break;
> > > > > +             }
> > > > > +             ceph_mdsc_put_request(req);
> > > > > +             dout("d_revalidate %p lookup result=%d\n",
> > > > > +                             dentry, err);
> > > > >       }
> > > > >
> > > > > +out:
> > > > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> > > > >       if (valid) {
> > > > >               ceph_dentry_lru_touch(dentry);
> > > >
> > > > d_revalidate is changed to use the cache, but not lookup. Why?
> > >
> > > Multiple same lookups would also be sent out in our scenario mentioned
> > > above, so we add a cache for lookups, too. I don't know if I'm getting
> > > your question correctly.
> > >
> >
> > d_revalidate only checks the validity of an existing dentry. To get a
> > dentry in the first place, you have to use the directory inode's lookup
> > readdir operations. lookup isn't changed here and I was wondering why.
> >
> > Now that I think about it though, you probably need to do that anyway,
> > as the VFS code already does something similar when there are multiple
> > lookups for the same dentry in flight.
>
> It seems that commit 200fd27c8fa2ba8bb4529033967b69a7cbfa2c2e made the
> ceph kernel module use a lookup request to revalidate a dentry.

d_revalidate could be invoked in a "fast lookup" (RCU-walk), which is
allowed to be invoked from multiple threads concurrently, so it can
also send multiple identical lookup requests to the MDS.


>
> >
> > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > index 9d1f34d46627..8705f0645a24 100644
> > > > > --- a/fs/ceph/inode.c
> > > > > +++ b/fs/ceph/inode.c
> > > > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > >       dout("alloc_inode %p\n", &ci->vfs_inode);
> > > > >
> > > > >       spin_lock_init(&ci->i_ceph_lock);
> > > > > +     mutex_init(&ci->getattrs_inflight_lock);
> > > > > +     mutex_init(&ci->lookups_inflight_lock);
> > > > >
> > > > >       ci->i_version = 0;
> > > > >       ci->i_inline_version = 0;
> > > > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > >       ci->i_xattrs.index_version = 0;
> > > > >
> > > > >       ci->i_caps = RB_ROOT;
> > > > > +     ci->getattrs_inflight = RB_ROOT;
> > > > > +     ci->lookups_inflight = RB_ROOT;
> > > > >       ci->i_auth_cap = NULL;
> > > > >       ci->i_dirty_caps = 0;
> > > > >       ci->i_flushing_caps = 0;
> > > > > +     ci->last_flush_ack_tid = 0;
> > > > >       INIT_LIST_HEAD(&ci->i_dirty_item);
> > > > >       INIT_LIST_HEAD(&ci->i_flushing_item);
> > > > >       ci->i_prealloc_cap_flush = NULL;
> > > > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> > > > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> > > > >        * we expect a negative dentry.
> > > > >        */
> > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > >       if (!tgt_vino && d_really_is_positive(dentry))
> > > > >               return;
> > > > > -
> > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > >       if (tgt_vino && (d_really_is_negative(dentry) ||
> > > > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> > > > >               return;
> > > > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > >       struct ceph_mds_request *req;
> > > > >       int mode;
> > > > >       int err;
> > > > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> > > > >
> > > > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > > > >               dout("do_getattr inode %p SNAPDIR\n", inode);
> > > > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> > > > >               return 0;
> > > > >
> > > > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > -     if (IS_ERR(req))
> > > > > -             return PTR_ERR(req);
> > > > > -     req->r_inode = inode;
> > > > > -     ihold(inode);
> > > > > -     req->r_num_caps = 1;
> > > > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > -     req->r_locked_page = locked_page;
> > > > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > +     mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> > > > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> > > > > +     if (req) {
> > > > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > +             ceph_mdsc_get_request(req);
> > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +             err = ceph_mdsc_wait_for_request(req);
> > > > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > +     } else {
> > > > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > +             if (!IS_ERR(req)) {
> > > > > +                     unsigned long res = 0;
> > > > > +                     req->r_inode = inode;
> > > > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> > > > > +                     ihold(inode);
> > > > > +                     req->r_num_caps = 1;
> > > > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > +                     req->r_locked_page = locked_page;
> > > > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> > > > > +                     if (IS_ERR_VALUE(res)) {
> > > > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +                             return res;
> > > > > +                     }
> > > > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > +             }
> > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +             if (IS_ERR(req))
> > > > > +                     return PTR_ERR(req);
> > > > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > +             mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> > > > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +     }
> > > > >       if (locked_page && err == 0) {
> > > > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> > > > >               if (inline_version == 0) {
> > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > index 163fc74bf221..5c6e00f2fd0a 100644
> > > > > --- a/fs/ceph/mds_client.c
> > > > > +++ b/fs/ceph/mds_client.c
> > > > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> > > > >       return 0;
> > > > >  }
> > > > >
> > > > > +void ceph_mds_requests_tree_release(struct kref* kref)
> > > > > +{
> > > > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> > > > > +
> > > > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> > > > > +     kfree(mrtree);
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Create an mds request.
> > > > >   */
> > > > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> > > > >       req->r_fmode = -1;
> > > > >       kref_init(&req->r_kref);
> > > > >       RB_CLEAR_NODE(&req->r_node);
> > > > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> > > > >       INIT_LIST_HEAD(&req->r_wait);
> > > > > +     init_completion(&req->batch_op_completion);
> > > > >       init_completion(&req->r_completion);
> > > > >       init_completion(&req->r_safe_completion);
> > > > >       INIT_LIST_HEAD(&req->r_unsafe_item);
> > > > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > >       mutex_unlock(&mdsc->mutex);
> > > > >  }
> > > > >
> > > > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> > > > > +{
> > > > > +     int err = 0;
> > > > > +     long timeleft = wait_for_completion_killable_timeout(
> > > > > +                     &req->batch_op_completion,
> > > > > +                     ceph_timeout_jiffies(req->r_timeout));
> > > > > +     if (timeleft > 0)
> > > > > +             err = 0;
> > > > > +     else if (!timeleft)
> > > > > +             err = -EIO;  /* timed out */
> > > > > +     else
> > > > > +             err = timeleft;  /* killed */
> > > > > +     if (!err)
> > > > > +             return err;
> > > > > +     return req->batch_op_err;
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Synchrously perform an mds request.  Take care of all of the
> > > > >   * session setup, forwarding, retry details.
> > > > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > >       } else {
> > > > >               err = req->r_err;
> > > > >       }
> > > > > -
> > > > > +     req->batch_op_err = err;
> > > > > +     complete_all(&req->batch_op_completion);
> > > > >  out:
> > > > >       mutex_unlock(&mdsc->mutex);
> > > > >       dout("do_request %p done, result %d\n", req, err);
> > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > index 729da155ebf0..41727677643e 100644
> > > > > --- a/fs/ceph/mds_client.h
> > > > > +++ b/fs/ceph/mds_client.h
> > > > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> > > > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> > > > >                                                struct ceph_mds_request *req);
> > > > >
> > > > > +struct mds_request_tree {
> > > > > +     int mask;
> > > > > +     struct kref ref;
> > > > > +     struct rb_root mds_requests;
> > > > > +     struct rb_node parent_tree_node;
> > > > > +};
> > > > > +
> > > > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> > > > > +
> > > > >  /*
> > > > >   * an in-flight mds request
> > > > >   */
> > > > >  struct ceph_mds_request {
> > > > >       u64 r_tid;                   /* transaction id */
> > > > > +     u64 last_caps_flush_ack_tid;
> > > > >       struct rb_node r_node;
> > > > > +     struct rb_node getattr_lookup_node;
> > > > >       struct ceph_mds_client *r_mdsc;
> > > > >
> > > > >       int r_op;                    /* mds op code */
> > > > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> > > > >       struct ceph_msg  *r_reply;
> > > > >       struct ceph_mds_reply_info_parsed r_reply_info;
> > > > >       struct page *r_locked_page;
> > > > > -     int r_err;
> > > > > +     int r_err, batch_op_err;
> > > > >
> > > > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> > > > >       unsigned long r_started;  /* start time to measure timeout against */
> > > > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> > > > >
> > > > >       struct kref       r_kref;
> > > > >       struct list_head  r_wait;
> > > > > +     struct completion batch_op_completion;
> > > > >       struct completion r_completion;
> > > > >       struct completion r_safe_completion;
> > > > >       ceph_mds_request_callback_t r_callback;
> > > > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> > > > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> > > > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > >                                    struct ceph_mds_request *req);
> > > > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> > > > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > >                               struct inode *dir,
> > > > >                               struct ceph_mds_request *req);
> > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > > index da2cd8e89062..857fbbc0b768 100644
> > > > > --- a/fs/ceph/super.c
> > > > > +++ b/fs/ceph/super.c
> > > > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> > > > >       destroy_caches();
> > > > >  }
> > > > >
> > > > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > +                                       struct ceph_mds_request* req,
> > > > > +                                       bool is_lookup)
> > > > > +{
> > > > > +     struct rb_node *node = NULL;
> > > > > +     struct mds_request_tree* tmp = NULL;
> > > > > +     if (!is_lookup)
> > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > +     else
> > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > +
> > > > > +     while (node)
> > > > > +     {
> > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > +
> > > > > +             if (tmp->mask > req->r_args.getattr.mask)
> > > > > +                     node = node->rb_left;
> > > > > +             else if (tmp->mask < req->r_args.getattr.mask)
> > > > > +                     node = node->rb_right;
> > > > > +             else
> > > > > +                     break;
> > > > > +     }
> > > > > +
> > > > > +     if (!node)
> > > > > +             BUG();
> > > > > +
> > > > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> > > > > +
> > > > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> > > > > +             if (!is_lookup)
> > > > > +                     rb_erase(node, &inode->getattrs_inflight);
> > > > > +             else
> > > > > +                     rb_erase(node, &inode->lookups_inflight);
> > > > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> > > > > +     }
> > > > > +}
> > > > > +
> > > > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > +                                     struct ceph_mds_request* req,
> > > > > +                                     bool is_lookup)
> > > > > +{
> > > > > +     struct rb_node **p = NULL, *parent = NULL;
> > > > > +     struct mds_request_tree *tmp = NULL;
> > > > > +     struct ceph_mds_request* tmp_req = NULL;
> > > > > +     struct mds_request_tree* mrtree = NULL;
> > > > > +
> > > > > +     if (!is_lookup)
> > > > > +             p = &ci->getattrs_inflight.rb_node;
> > > > > +     else
> > > > > +             p = &ci->lookups_inflight.rb_node;
> > > > > +
> > > > > +     while (*p) {
> > > > > +             parent = *p;
> > > > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> > > > > +             if (req->r_args.getattr.mask < tmp->mask)
> > > > > +                     p = &(*p)->rb_left;
> > > > > +             else if (req->r_args.getattr.mask > tmp->mask)
> > > > > +                     p = &(*p)->rb_right;
> > > > > +             else
> > > > > +                     break;
> > > > > +     }
> > > > > +
> > > > > +     if (!(*p)) {
> > > > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> > > > > +
> > > > > +             if (!mrtree)
> > > > > +                     return -ENOMEM;
> > > > > +
> > > > > +             mrtree->mds_requests = RB_ROOT;
> > > > > +             mrtree->mask = req->r_args.getattr.mask;
> > > > > +             kref_init(&mrtree->ref);
> > > > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> > > > > +             if (!is_lookup) {
> > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> > > > > +             } else {
> > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> > > > > +             }
> > > > > +     } else {
> > > > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> > > > > +     }
> > > > > +
> > > > > +     p = &mrtree->mds_requests.rb_node;
> > > > > +     parent = NULL;
> > > > > +     while (*p) {
> > > > > +             parent = *p;
> > > > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> > > > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> > > > > +                     p = &(*p)->rb_left;
> > > > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> > > > > +                     p = &(*p)->rb_right;
> > > > > +             else
> > > > > +                     BUG();
> > > > > +     }
> > > > > +
> > > > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> > > > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> > > > > +     return 0;
> > > > > +}
> > > > > +
> > > > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > +                                   int mask,
> > > > > +                                   bool is_lookup)
> > > > > +{
> > > > > +     struct rb_node *node = NULL;
> > > > > +     if (!is_lookup)
> > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > +     else
> > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > +
> > > > > +     while (node)
> > > > > +     {
> > > > > +             struct mds_request_tree* tmp = NULL;
> > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > +
> > > > > +             if (tmp->mask > mask) {
> > > > > +                     node = node->rb_left;
> > > > > +             } else if (tmp->mask < mask) {
> > > > > +                     node = node->rb_right;
> > > > > +             } else {
> > > > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> > > > > +                     while (inner_node) {
> > > > > +                             struct ceph_mds_request* req = NULL;
> > > > > +
> > > > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> > > > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> > > > > +                                     inner_node = inner_node->rb_left;
> > > > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> > > > > +                                     inner_node = inner_node->rb_right;
> > > > > +                             } else
> > > > > +                                     return req; /* Found it */
> > > > > +                     }
> > > > > +                     return NULL;
> > > > > +             }
> > > > > +     }
> > > > > +     return NULL;
> > > > > +}
> > > > >  module_init(init_ceph);
> > > > >  module_exit(exit_ceph);
> > > > >
> > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > index dfb64a5211b6..abf761f2a122 100644
> > > > > --- a/fs/ceph/super.h
> > > > > +++ b/fs/ceph/super.h
> > > > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> > > > >       struct ceph_vino i_vino;   /* ceph ino + snap */
> > > > >
> > > > >       spinlock_t i_ceph_lock;
> > > > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> > > > > +     struct rb_root getattrs_inflight, lookups_inflight;
> > > >
> > > > If you do end up pursuing this further, I think it might be better to
> > > > use a spinlock to protect the trees.
> > > >
> > > > Take the lock, search the tree and if there isn't one, drop the lock,
> > > > allocate, take it and search again, and insert the one you allocated if
> > > > no conflicting structure was inserted in the meantime.
> > > >
> > > > In addition to cutting down on context switches when these things are
> > > > contended, that would also make it easier to hide the locking inside the
> > > > cache search/register/unregister routines.
> > >
> > > You are right. At first, I tried to use spinlock, but that caused
> > > errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
> > > know how that occurred. Do you have any clew about that?
> > >
> > > I can't thank you enough for the help:-) Thanks very much:-)
> >
> >
> > Often that means you forgot to unlock something, or maybe hit some lock
> > inversion? Without more analysis it's impossible to know. :)
>
> Thanks:-) I wouldn't try to debug that:-)
> >
> > > > >       u64 i_version;
> > > > >       u64 i_inline_version;
> > > > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> > > > >       struct rb_root i_caps;           /* cap list */
> > > > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> > > > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> > > > > +     u64 last_flush_ack_tid;
> > > > >       struct list_head i_dirty_item, i_flushing_item;
> > > > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> > > > >        * overlapping, pipelined cap flushes to the mds.  we can probably
> > > > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> > > > >                               u64 time_warp_seq, struct timespec64 *ctime,
> > > > >                               struct timespec64 *mtime,
> > > > >                               struct timespec64 *atime);
> > > > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > +                                            struct ceph_mds_request* req,
> > > > > +                                            bool is_lookup);
> > > > > +
> > > > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > +                                              struct ceph_mds_request* req,
> > > > > +                                              bool is_lookup);
> > > > > +
> > > > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > +                                          int mask,
> > > > > +                                          bool is_lookup);
> > > > >  extern int ceph_fill_trace(struct super_block *sb,
> > > > >                          struct ceph_mds_request *req);
> > > > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> > > >
> >
> > --
> > Jeff Layton <jlayton@redhat.com>
> >
Jeff Layton Feb. 27, 2019, 4:32 p.m. UTC | #7
On Wed, 2019-02-27 at 23:13 +0800, Xuehan Xu wrote:
> On Wed, 27 Feb 2019 at 20:08, Jeff Layton <jlayton@redhat.com> wrote:
> > On Wed, 2019-02-27 at 11:31 +0800, Xuehan Xu wrote:
> > > On Wed, 27 Feb 2019 at 07:54, Jeff Layton <jlayton@redhat.com> wrote:
> > > > On Mon, 2019-02-25 at 20:29 +0800, xxhdx1985126@gmail.com wrote:
> > > > > From: Xuehan Xu <xuxuehan@360.cn>
> > > > > 
> > > > > Instead of issuing a new getattr/lookup req to MDSes for each
> > > > > getattr/lookup op, issue a new one only if there is no inflight req
> > > > > that requires the same caps as the current getattr/lookup op.
> > > > > 
> > > > > Signed-off-by: Xuehan Xu <xuxuehan@360.cn>
> > > > > ---
> > > > >  fs/ceph/caps.c       |   1 +
> > > > >  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
> > > > >  fs/ceph/inode.c      |  55 +++++++++++++----
> > > > >  fs/ceph/mds_client.c |  30 +++++++++-
> > > > >  fs/ceph/mds_client.h |  15 ++++-
> > > > >  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
> > > > >  fs/ceph/super.h      |  14 +++++
> > > > >  7 files changed, 304 insertions(+), 39 deletions(-)
> > > > > 
> > > > 
> > > > The description on this patch could be fleshed out a bit.
> > > > 
> > > > How much does this help performance and on what workloads? This is a lot
> > > > of code to add without much justification.
> > > 
> > > Hi, Jeff and Zheng, thanks for your review and help:-) I'm sorry that
> > > I forgot to add the "Fixes" tag in the description of the patches.
> > > These two patches is supposed to solve the issue:
> > > http://tracker.ceph.com/issues/36609.
> > > 
> > 
> > Have you measured the performance to know how much of an impact this
> > has?
> 
> Hi, Jeff. Thanks for your reply:-)
>  In our case, those requests can take more than ten seconds to finish,
> which is intolerable to web servers. Actually, every time this
> scenario occurs, it makes our monitoring system wrongly recognize
> those web servers as down and shift all web traffic to other IDCs.
> 

Ouch, ok. It's understandable that you'd want that fixed and kudos for
taking a stab at it. The idea is not a bad one, but we'd need to
consider how to do it while maintaining strict cache coherency.

> > > Basically, the scenario we are facing now is as follows:
> > >      We have several dozens of web servers all of which mounted the
> > > same directory for their php scripts and would ask cephfs for the
> > > latest file content if some of those scripts are modified. To improve
> > > the web servers' efficiency, each of them runs tens of threads to
> > > serve web requests, so, as you can see, when some scripts are modified,
> > > there would be thousands of threads asking for the same file which
> > > would lead to thousands of getattr/lookup and read reqs to the cephfs
> > > cluster and cause huge pressure on both MDSes and OSDs.
> > > 
> > 
> > How often are these files changing?
> > 
> > Between changes to the files, the web server clients should all end up
> > with read-only caps that allow them to cache the inode attributes. When
> > there is a change to the file, those will need to be recalled of course
> > and you might see more activity on the MDS, but that's not necessarily a
> > problem.
> 
> File changes are not very frequent, maybe once or twice a week. But,
> since the scenario mentioned above occurs each time the files get changed,
> this problem really makes us suffer.
> 

I assume that you have verified that this patch does help in that
scenario?

So to make sure I understand the scenario in terms of caps:

Most clients have read caps for the files in question prior to changes
to these files (and probably leases on the directories and such). When
you go to update the files, all of those caps get recalled (which
probably also takes a little while).

After the file is written, a bunch of tasks on the clients issue stat()
calls in parallel, which become LOOKUPs and/or GETATTRs you end up with
a thundering herd problem on the MDS.

Is that about right?

If so, I don't see a way around that while preserving cache coherency
right offhand. I'd have to think about it.

> > > > Looking over the patch, the basic idea is to implement a cache (in
> > > > rbtrees) of getattrs and lookups in flight and just wait on those if
> > > > there already is one that matches what we need. I agree with Zheng that
> > > > this could end up violating the strict cache coherency guarantees that
> > > > ceph tries to provide.
> > > > 
> > > > You're relying on an assumption that if you issue a getattr/lookup while
> > > > another one is already in flight, then it's legitimate to use the result
> > > > of the initial one for the one issued later. That's not necessarily the
> > > > case. I suspect this would be possible, even from a single client:
> > > > 
> > > > - client1/task1 issues stat(), server performs getattr
> > > > but reply gets delayed for a bit
> > > > 
> > > > - client2 does write() + fsync()
> > > > 
> > > > - client1/task2 issues stat(), finds the entry in cache from task1 and
> > > > ends up waiting for first getattr to complete
> > > > 
> > > > - getattr reply comes in, and both stat calls get back the same
> > > > attributes, showing pre-write size/mtime
> > > > 
> > > > In this case, the synchronous write by client2 was provably done before
> > > > the second getattr, but the second stat() call wouldn't see the new
> > > > size.
> > > 
> > > I understand that this is a big problem. If what I think is right,
> > > this problem may be only with the read handling patch.
> > > Here's what I think:
> > >     First, I think the problem would come up only when client1/task2
> > > issues stat() after the finish of client2's write()+sync(), since,
> > > only in this case, a pre-write size/mtime seen by client1/task2 would
> > > cause errors. And a pre-write size/mtime would be seen by
> > > client1/task2 only when client1/task1's stat() is processed before
> > > client2's write()+sync(). However, when client1/task1's stat() is
> > > processed successfully, the inode's filelock should be in states like
> > > LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
> > > move the inode's to states like LOCK_MIX which would cause a cap
> > > revoke from client1. And it's only when client1 acked the revoke can
> > > the MDS go on processing the metadata modification caused by client2's
> > > write()+fsync(). Also, MDS sends messages to the same client in the
> > > same order of the creation of those messages. So, when client1/task2
> > > issues stat(), the result of the previous one must have been processed
> > > by client1 and can't be seen by client1/task2, which means this
> > > problem wouldn't occur. Am I right about this? I don't know if I'm
> > > missing anything.
> > > 
> > 
> > There is never a guarantee that the MDS will give you the caps you've
> > requested, so I don't think you can rely on client1 having to process a
> > cap revoke to get you out of this situation.
> > 
> > IOW, I believe this is possible:
> > 
> > - client1 requests caps to do a stat(), doesn't get them (maybe because
> > client2 is actively writing?)
> > - client1 issues GETATTR to MDS, which gets processed there, but reply
> > gets delayed
> > - client2 does write()+fsync()
> > - client1 issues another stat(), ends up waiting on the first GETATTR,
> > which won't contain the most up to date information
> > 
> > No cap revoke to client1 here since it never got any in the first place.
> 
> Yes, you are right. But I think, in this case, the first GETATTR would
> get processed after client2's write()+fsync(), which means it would
> get the latest inode content. So the problem wouldn't occur, either.
> Am I right?
> 

No. In this (hypothetical) case the GETATTR gets processed first. The
reply just gets delayed for a bit -- possibly in the sender's transmit
buffer or receiver's receive buffer...or, maybe some frames in the reply
get dropped somewhere and you have to retransmit (bad cable)? The
possibilities are endless when it comes to network filesystems. :)

The key point is that on a network filesystem there is always some delay
between an operation being performed on the server and the client seeing
the result. All you know is when you transmitted the request and when
you received the reply.

CephFS is no different here, but it tries to maintain strict cache
coherency through the cap handling.

> > > However, as for the read handling patch, this problem do exists, and
> > > it's really confusing me.
> > > 
> > 
> > The cap handling code in cephfs is extremely complex. My hat is off to
> > you for even attempting to understand the MDS LOCK_* handling code.
> > 
> > > > The read handling patch may suffer from similar issues, though I haven't
> > > > gone over that one in as much detail.
> > > > 
> > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > index bba28a5034ba..5a177a3c283f 100644
> > > > > --- a/fs/ceph/caps.c
> > > > > +++ b/fs/ceph/caps.c
> > > > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> > > > >               goto out;
> > > > > 
> > > > >       ci->i_flushing_caps &= ~cleaned;
> > > > > +     ci->last_flush_ack_tid = flush_tid;
> > > > > 
> > > > >       spin_lock(&mdsc->cap_dirty_lock);
> > > > > 
> > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > index 82928cea0209..6f17c0319158 100644
> > > > > --- a/fs/ceph/dir.c
> > > > > +++ b/fs/ceph/dir.c
> > > > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > >                               }
> > > > >                       }
> > > > >               }
> > > > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> > > > >       }
> > > > >       spin_unlock(&dentry->d_lock);
> > > > > 
> > > > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > >                                        CEPH_MDS_LEASE_RENEW, seq);
> > > > >               ceph_put_mds_session(session);
> > > > >       }
> > > > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> > > > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> > > > >       return valid;
> > > > >  }
> > > > > 
> > > > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> > > > >       if (!valid) {
> > > > >               struct ceph_mds_client *mdsc =
> > > > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> > > > > -             struct ceph_mds_request *req;
> > > > > +             struct ceph_mds_request *req = NULL;
> > > > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> > > > >               int op, err;
> > > > >               u32 mask;
> > > > > 
> > > > >               if (flags & LOOKUP_RCU)
> > > > >                       return -ECHILD;
> > > > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > +             if (ceph_security_xattr_wanted(dir))
> > > > > +                     mask |= CEPH_CAP_XATTR_SHARED;
> > > > > 
> > > > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> > > > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> > > > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > -             if (!IS_ERR(req)) {
> > > > > -                     req->r_dentry = dget(dentry);
> > > > > -                     req->r_num_caps = 2;
> > > > > -                     req->r_parent = dir;
> > > > > +             if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                     mutex_lock(&cdir->lookups_inflight_lock);
> > > > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> > > > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> > > > > +             }
> > > > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > +                     ceph_mdsc_get_request(req);
> > > > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > +                     err = ceph_mdsc_wait_for_request(req);
> > > > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > +             } else {
> > > > > 
> > > > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > -                     if (ceph_security_xattr_wanted(dir))
> > > > > -                             mask |= CEPH_CAP_XATTR_SHARED;
> > > > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                             if (!IS_ERR(req)) {
> > > > > +                                     unsigned long res = 0;
> > > > > +                                     req->r_dentry = dget(dentry);
> > > > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> > > > > +                                     req->r_num_caps = 2;
> > > > > +                                     req->r_parent = dir;
> > > > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> > > > > +                                     if (IS_ERR_VALUE(res)) {
> > > > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > +                                             goto out;
> > > > > +                                     }
> > > > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > +                             }
> > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > +                     }
> > > > > +                     if (IS_ERR(req))
> > > > > +                             goto out;
> > > > > 
> > > > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > -                     switch (err) {
> > > > > -                     case 0:
> > > > > -                             if (d_really_is_positive(dentry) &&
> > > > > -                                 d_inode(dentry) == req->r_target_inode)
> > > > > -                                     valid = 1;
> > > > > -                             break;
> > > > > -                     case -ENOENT:
> > > > > -                             if (d_really_is_negative(dentry))
> > > > > -                                     valid = 1;
> > > > > -                             /* Fallthrough */
> > > > > -                     default:
> > > > > -                             break;
> > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > +                             mutex_lock(&cdir->lookups_inflight_lock);
> > > > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> > > > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > >                       }
> > > > > -                     ceph_mdsc_put_request(req);
> > > > > -                     dout("d_revalidate %p lookup result=%d\n",
> > > > > -                          dentry, err);
> > > > >               }
> > > > > +             switch (err) {
> > > > > +             case 0:
> > > > > +                     if (d_really_is_positive(dentry) &&
> > > > > +                         d_inode(dentry) == req->r_target_inode)
> > > > > +                             valid = 1;
> > > > > +                     break;
> > > > > +             case -ENOENT:
> > > > > +                     if (d_really_is_negative(dentry))
> > > > > +                             valid = 1;
> > > > > +                     /* Fallthrough */
> > > > > +             default:
> > > > > +                     break;
> > > > > +             }
> > > > > +             ceph_mdsc_put_request(req);
> > > > > +             dout("d_revalidate %p lookup result=%d\n",
> > > > > +                             dentry, err);
> > > > >       }
> > > > > 
> > > > > +out:
> > > > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> > > > >       if (valid) {
> > > > >               ceph_dentry_lru_touch(dentry);
> > > > 
> > > > d_revalidate is changed to use the cache, but not lookup. Why?
> > > 
> > > Multiple identical lookups would also be sent out in the scenario
> > > mentioned above, so we added a cache for lookups, too. I don't know if
> > > I'm understanding your question correctly.
> > > 
> > 
> > d_revalidate only checks the validity of an existing dentry. To get a
> > dentry in the first place, you have to use the directory inode's lookup
> > readdir operations. lookup isn't changed here and I was wondering why.
> > 
> > Now that I think about it though, you probably need to do that anyway,
> > as the VFS code already does something similar when there are multiple
> > lookups for the same dentry in flight.
> 
> It seems that the commit 200fd27c8fa2ba8bb4529033967b69a7cbfa2c2e made
> ceph kernel module to use lookup request to revalidate dentry.
> 

Yes, but it doesn't dispatch through the local kernel's inode lookup
operation. ceph_lookup also does CEPH_MDS_OP_LOOKUP operations.

> > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > index 9d1f34d46627..8705f0645a24 100644
> > > > > --- a/fs/ceph/inode.c
> > > > > +++ b/fs/ceph/inode.c
> > > > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > >       dout("alloc_inode %p\n", &ci->vfs_inode);
> > > > > 
> > > > >       spin_lock_init(&ci->i_ceph_lock);
> > > > > +     mutex_init(&ci->getattrs_inflight_lock);
> > > > > +     mutex_init(&ci->lookups_inflight_lock);
> > > > > 
> > > > >       ci->i_version = 0;
> > > > >       ci->i_inline_version = 0;
> > > > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > >       ci->i_xattrs.index_version = 0;
> > > > > 
> > > > >       ci->i_caps = RB_ROOT;
> > > > > +     ci->getattrs_inflight = RB_ROOT;
> > > > > +     ci->lookups_inflight = RB_ROOT;
> > > > >       ci->i_auth_cap = NULL;
> > > > >       ci->i_dirty_caps = 0;
> > > > >       ci->i_flushing_caps = 0;
> > > > > +     ci->last_flush_ack_tid = 0;
> > > > >       INIT_LIST_HEAD(&ci->i_dirty_item);
> > > > >       INIT_LIST_HEAD(&ci->i_flushing_item);
> > > > >       ci->i_prealloc_cap_flush = NULL;
> > > > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> > > > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> > > > >        * we expect a negative dentry.
> > > > >        */
> > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > >       if (!tgt_vino && d_really_is_positive(dentry))
> > > > >               return;
> > > > > -
> > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > >       if (tgt_vino && (d_really_is_negative(dentry) ||
> > > > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> > > > >               return;
> > > > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > >       struct ceph_mds_request *req;
> > > > >       int mode;
> > > > >       int err;
> > > > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> > > > > 
> > > > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > > > >               dout("do_getattr inode %p SNAPDIR\n", inode);
> > > > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> > > > >               return 0;
> > > > > 
> > > > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > -     if (IS_ERR(req))
> > > > > -             return PTR_ERR(req);
> > > > > -     req->r_inode = inode;
> > > > > -     ihold(inode);
> > > > > -     req->r_num_caps = 1;
> > > > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > -     req->r_locked_page = locked_page;
> > > > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > +     mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> > > > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> > > > > +     if (req) {
> > > > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > +             ceph_mdsc_get_request(req);
> > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +             err = ceph_mdsc_wait_for_request(req);
> > > > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > +     } else {
> > > > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > +             if (!IS_ERR(req)) {
> > > > > +                     unsigned long res = 0;
> > > > > +                     req->r_inode = inode;
> > > > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> > > > > +                     ihold(inode);
> > > > > +                     req->r_num_caps = 1;
> > > > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > +                     req->r_locked_page = locked_page;
> > > > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> > > > > +                     if (IS_ERR_VALUE(res)) {
> > > > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +                             return res;
> > > > > +                     }
> > > > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > +             }
> > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +             if (IS_ERR(req))
> > > > > +                     return PTR_ERR(req);
> > > > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > +             mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> > > > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > +     }
> > > > >       if (locked_page && err == 0) {
> > > > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> > > > >               if (inline_version == 0) {
> > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > index 163fc74bf221..5c6e00f2fd0a 100644
> > > > > --- a/fs/ceph/mds_client.c
> > > > > +++ b/fs/ceph/mds_client.c
> > > > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> > > > >       return 0;
> > > > >  }
> > > > > 
> > > > > +void ceph_mds_requests_tree_release(struct kref* kref)
> > > > > +{
> > > > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> > > > > +
> > > > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> > > > > +     kfree(mrtree);
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Create an mds request.
> > > > >   */
> > > > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> > > > >       req->r_fmode = -1;
> > > > >       kref_init(&req->r_kref);
> > > > >       RB_CLEAR_NODE(&req->r_node);
> > > > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> > > > >       INIT_LIST_HEAD(&req->r_wait);
> > > > > +     init_completion(&req->batch_op_completion);
> > > > >       init_completion(&req->r_completion);
> > > > >       init_completion(&req->r_safe_completion);
> > > > >       INIT_LIST_HEAD(&req->r_unsafe_item);
> > > > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > >       mutex_unlock(&mdsc->mutex);
> > > > >  }
> > > > > 
> > > > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> > > > > +{
> > > > > +     int err = 0;
> > > > > +     long timeleft = wait_for_completion_killable_timeout(
> > > > > +                     &req->batch_op_completion,
> > > > > +                     ceph_timeout_jiffies(req->r_timeout));
> > > > > +     if (timeleft > 0)
> > > > > +             err = 0;
> > > > > +     else if (!timeleft)
> > > > > +             err = -EIO;  /* timed out */
> > > > > +     else
> > > > > +             err = timeleft;  /* killed */
> > > > > +     if (!err)
> > > > > +             return err;
> > > > > +     return req->batch_op_err;
> > > > > +}
> > > > > +
> > > > >  /*
> > > > >   * Synchrously perform an mds request.  Take care of all of the
> > > > >   * session setup, forwarding, retry details.
> > > > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > >       } else {
> > > > >               err = req->r_err;
> > > > >       }
> > > > > -
> > > > > +     req->batch_op_err = err;
> > > > > +     complete_all(&req->batch_op_completion);
> > > > >  out:
> > > > >       mutex_unlock(&mdsc->mutex);
> > > > >       dout("do_request %p done, result %d\n", req, err);
> > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > index 729da155ebf0..41727677643e 100644
> > > > > --- a/fs/ceph/mds_client.h
> > > > > +++ b/fs/ceph/mds_client.h
> > > > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> > > > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> > > > >                                                struct ceph_mds_request *req);
> > > > > 
> > > > > +struct mds_request_tree {
> > > > > +     int mask;
> > > > > +     struct kref ref;
> > > > > +     struct rb_root mds_requests;
> > > > > +     struct rb_node parent_tree_node;
> > > > > +};
> > > > > +
> > > > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> > > > > +
> > > > >  /*
> > > > >   * an in-flight mds request
> > > > >   */
> > > > >  struct ceph_mds_request {
> > > > >       u64 r_tid;                   /* transaction id */
> > > > > +     u64 last_caps_flush_ack_tid;
> > > > >       struct rb_node r_node;
> > > > > +     struct rb_node getattr_lookup_node;
> > > > >       struct ceph_mds_client *r_mdsc;
> > > > > 
> > > > >       int r_op;                    /* mds op code */
> > > > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> > > > >       struct ceph_msg  *r_reply;
> > > > >       struct ceph_mds_reply_info_parsed r_reply_info;
> > > > >       struct page *r_locked_page;
> > > > > -     int r_err;
> > > > > +     int r_err, batch_op_err;
> > > > > 
> > > > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> > > > >       unsigned long r_started;  /* start time to measure timeout against */
> > > > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> > > > > 
> > > > >       struct kref       r_kref;
> > > > >       struct list_head  r_wait;
> > > > > +     struct completion batch_op_completion;
> > > > >       struct completion r_completion;
> > > > >       struct completion r_safe_completion;
> > > > >       ceph_mds_request_callback_t r_callback;
> > > > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> > > > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> > > > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > >                                    struct ceph_mds_request *req);
> > > > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> > > > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > >                               struct inode *dir,
> > > > >                               struct ceph_mds_request *req);
> > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > > index da2cd8e89062..857fbbc0b768 100644
> > > > > --- a/fs/ceph/super.c
> > > > > +++ b/fs/ceph/super.c
> > > > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> > > > >       destroy_caches();
> > > > >  }
> > > > > 
> > > > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > +                                       struct ceph_mds_request* req,
> > > > > +                                       bool is_lookup)
> > > > > +{
> > > > > +     struct rb_node *node = NULL;
> > > > > +     struct mds_request_tree* tmp = NULL;
> > > > > +     if (!is_lookup)
> > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > +     else
> > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > +
> > > > > +     while (node)
> > > > > +     {
> > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > +
> > > > > +             if (tmp->mask > req->r_args.getattr.mask)
> > > > > +                     node = node->rb_left;
> > > > > +             else if (tmp->mask < req->r_args.getattr.mask)
> > > > > +                     node = node->rb_right;
> > > > > +             else
> > > > > +                     break;
> > > > > +     }
> > > > > +
> > > > > +     if (!node)
> > > > > +             BUG();
> > > > > +
> > > > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> > > > > +
> > > > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> > > > > +             if (!is_lookup)
> > > > > +                     rb_erase(node, &inode->getattrs_inflight);
> > > > > +             else
> > > > > +                     rb_erase(node, &inode->lookups_inflight);
> > > > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> > > > > +     }
> > > > > +}
> > > > > +
> > > > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > +                                     struct ceph_mds_request* req,
> > > > > +                                     bool is_lookup)
> > > > > +{
> > > > > +     struct rb_node **p = NULL, *parent = NULL;
> > > > > +     struct mds_request_tree *tmp = NULL;
> > > > > +     struct ceph_mds_request* tmp_req = NULL;
> > > > > +     struct mds_request_tree* mrtree = NULL;
> > > > > +
> > > > > +     if (!is_lookup)
> > > > > +             p = &ci->getattrs_inflight.rb_node;
> > > > > +     else
> > > > > +             p = &ci->lookups_inflight.rb_node;
> > > > > +
> > > > > +     while (*p) {
> > > > > +             parent = *p;
> > > > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> > > > > +             if (req->r_args.getattr.mask < tmp->mask)
> > > > > +                     p = &(*p)->rb_left;
> > > > > +             else if (req->r_args.getattr.mask > tmp->mask)
> > > > > +                     p = &(*p)->rb_right;
> > > > > +             else
> > > > > +                     break;
> > > > > +     }
> > > > > +
> > > > > +     if (!(*p)) {
> > > > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> > > > > +
> > > > > +             if (!mrtree)
> > > > > +                     return -ENOMEM;
> > > > > +
> > > > > +             mrtree->mds_requests = RB_ROOT;
> > > > > +             mrtree->mask = req->r_args.getattr.mask;
> > > > > +             kref_init(&mrtree->ref);
> > > > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> > > > > +             if (!is_lookup) {
> > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> > > > > +             } else {
> > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> > > > > +             }
> > > > > +     } else {
> > > > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> > > > > +     }
> > > > > +
> > > > > +     p = &mrtree->mds_requests.rb_node;
> > > > > +     parent = NULL;
> > > > > +     while (*p) {
> > > > > +             parent = *p;
> > > > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> > > > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> > > > > +                     p = &(*p)->rb_left;
> > > > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> > > > > +                     p = &(*p)->rb_right;
> > > > > +             else
> > > > > +                     BUG();
> > > > > +     }
> > > > > +
> > > > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> > > > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> > > > > +     return 0;
> > > > > +}
> > > > > +
> > > > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > +                                   int mask,
> > > > > +                                   bool is_lookup)
> > > > > +{
> > > > > +     struct rb_node *node = NULL;
> > > > > +     if (!is_lookup)
> > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > +     else
> > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > +
> > > > > +     while (node)
> > > > > +     {
> > > > > +             struct mds_request_tree* tmp = NULL;
> > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > +
> > > > > +             if (tmp->mask > mask) {
> > > > > +                     node = node->rb_left;
> > > > > +             } else if (tmp->mask < mask) {
> > > > > +                     node = node->rb_right;
> > > > > +             } else {
> > > > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> > > > > +                     while (inner_node) {
> > > > > +                             struct ceph_mds_request* req = NULL;
> > > > > +
> > > > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> > > > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> > > > > +                                     inner_node = inner_node->rb_left;
> > > > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> > > > > +                                     inner_node = inner_node->rb_right;
> > > > > +                             } else
> > > > > +                                     return req; /* Found it */
> > > > > +                     }
> > > > > +                     return NULL;
> > > > > +             }
> > > > > +     }
> > > > > +     return NULL;
> > > > > +}
> > > > >  module_init(init_ceph);
> > > > >  module_exit(exit_ceph);
> > > > > 
> > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > index dfb64a5211b6..abf761f2a122 100644
> > > > > --- a/fs/ceph/super.h
> > > > > +++ b/fs/ceph/super.h
> > > > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> > > > >       struct ceph_vino i_vino;   /* ceph ino + snap */
> > > > > 
> > > > >       spinlock_t i_ceph_lock;
> > > > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> > > > > +     struct rb_root getattrs_inflight, lookups_inflight;
> > > > 
> > > > If you do end up pursuing this further, I think it might be better to
> > > > use a spinlock to protect the trees.
> > > > 
> > > > Take the lock, search the tree and if there isn't one, drop the lock,
> > > > allocate, take it and search again, and insert the one you allocated if
> > > > no conflicting structure was inserted in the meantime.
> > > > 
> > > > In addition to cutting down on context switches when these things are
> > > > contended, that would also make it easier to hide the locking inside the
> > > > cache search/register/unregister routines.
> > > 
> > > You are right. At first, I tried to use a spinlock, but that caused
> > > errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
> > > know how that occurred. Do you have any clue about that?
> > > 
> > > I can't thank you enough for the help:-) Thanks very much:-)
> > 
> > Often that means you forgot to unlock something, or maybe hit some lock
> > inversion? Without more analysis it's impossible to know. :)
> 
> Thanks:-) I wouldn't try to debug that:-)
> > > > >       u64 i_version;
> > > > >       u64 i_inline_version;
> > > > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> > > > >       struct rb_root i_caps;           /* cap list */
> > > > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> > > > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> > > > > +     u64 last_flush_ack_tid;
> > > > >       struct list_head i_dirty_item, i_flushing_item;
> > > > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> > > > >        * overlapping, pipelined cap flushes to the mds.  we can probably
> > > > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> > > > >                               u64 time_warp_seq, struct timespec64 *ctime,
> > > > >                               struct timespec64 *mtime,
> > > > >                               struct timespec64 *atime);
> > > > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > +                                            struct ceph_mds_request* req,
> > > > > +                                            bool is_lookup);
> > > > > +
> > > > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > +                                              struct ceph_mds_request* req,
> > > > > +                                              bool is_lookup);
> > > > > +
> > > > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > +                                          int mask,
> > > > > +                                          bool is_lookup);
> > > > >  extern int ceph_fill_trace(struct super_block *sb,
> > > > >                          struct ceph_mds_request *req);
> > > > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> > 
> > --
> > Jeff Layton <jlayton@redhat.com>
> >
Xuehan Xu Feb. 28, 2019, 2:31 a.m. UTC | #8
> > Hi, Jeff. Thanks for your reply:-)
> >  In our case, those requests can take more than ten seconds to finish,
> > which is intolerable to web servers. Actually, every time this
> > scenario occurs, it makes our monitoring system wrongly recognize
> > those web servers as down and shift all web traffic to other IDCs.
> >
>
> Ouch, ok. It's understandable that you'd want that fixed and kudos for
> taking a stab at it. The idea is not a bad one, but we'd need to
> consider how to do it while maintaining strict cache coherency.
>
> > > > Basically, the scenario we are facing now is as follows:
> > > >      We have several dozens of web servers all of which mounted the
> > > > same directory for their php scripts and would ask cephfs for the
> > > > latest file content if some of those scripts is modified. To improve
> > > > the web servers' efficiency, each of them runs tens of threads to
> > > > serve web requests, so, as you can see, when some scripts is modified,
> > > > there would be thousands of threads asking for the same file which
> > > > would lead to thousands of getattr/lookup and read reqs to the cephfs
> > > > cluster and cause huge pressure on both MDSes and OSDs.
> > > >
> > >
> > > How often are these files changing?
> > >
> > > Between changes to the files, the web server clients should all end up
> > > with read-only caps that allow them to cache the inode attributes. When
> > > there is a change to the file, those will need to be recalled of course
> > > and you might see more activity on the MDS, but that's not necessarily a
> > > problem.
> >
> > File changes is not very often, maybe one or two times a week. But,
> > since each time files get changed the scenario mentioned above occurs,
> > this problem really makes us suffer.
> >
>
> I assume that you have verified that this patch does help in that
> scenario?
>

Yes:-) This patch can reduce the latency from more than ten seconds to
within 100 ms most of the time:-) It's only the cache coherency that's
still haunting us....

> So to make sure I understand the scenario in terms of caps:
>
> Most clients have read caps for the files in question prior to changes
> to these files (and probably leases on the directories and such). When
> you go to update the files, all of those caps get recalled (which
> probably also takes a little while).
>
> After the file is written, a bunch of tasks on the clients issue stat()
> calls in parallel, which become LOOKUPs and/or GETATTRs you end up with
> a thundering herd problem on the MDS.
>
> Is that about right?
>

Yes, that's right:-)

> If so, I don't see a way around that while preserving cache coherency
> right offhand. I'd have to think about it
>
> > > > > Looking over the patch, the basic idea is to implement a cache (in
> > > > > rbtrees) of getattrs and lookups in flight and just wait on those if
> > > > > there already is one that matches what we need. I agree with Zheng that
> > > > > this could end up violating the strict cache coherency guarantees that
> > > > > ceph tries to provide.
> > > > >
> > > > > You're relying on an assumption that if you issue a getattr/lookup while
> > > > > another one is already in flight, then it's legitimate to use the result
> > > > > of the initial one for the one issued later. That's not necessarily the
> > > > > case. I suspect this would be possible, even from a single client:
> > > > >
> > > > > - client1/task1 issues stat(), server performs getattr
> > > > > but reply gets delayed for a bit
> > > > >
> > > > > - client2 does write() + fsync()
> > > > >
> > > > > - client1/task2 issues stat(), finds the entry in cache from task1 and
> > > > > ends up waiting for first getattr to complete
> > > > >
> > > > > - getattr reply comes in, and both stat calls get back the same
> > > > > attributes, showing pre-write size/mtime
> > > > >
> > > > > In this case, the synchronous write by client2 was provably done before
> > > > > the second getattr, but the second stat() call wouldn't see the new
> > > > > size.
> > > >
> > > > I understand that this is a big problem. If what I think is right,
> > > > this problem may be only with the read handling patch.
> > > > Here's what I think:
> > > >     First, I think the problem would come up only when client1/task2
> > > > issues stat() after the finish of client2's write()+sync(), since,
> > > > only in this case, a pre-write size/mtime seen by client1/task2 would
> > > > cause errors. And a pre-write size/mtime would be seen by
> > > > client1/task2 only when client1/task1's stat() is processed before
> > > > client2's write()+sync(). However, when client1/task1's stat() is
> > > > processed successfully, the inode's filelock should be in states like
> > > > LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
> > > > move the inode's to states like LOCK_MIX which would cause a cap
> > > > revoke from client1. And it's only when client1 acked the revoke can
> > > > the MDS go on processing the metadata modification caused by client2's
> > > > write()+fsync(). Also, MDS sends messages to the same client in the
> > > > same order of the creation of those messages. So, when client1/task2
> > > > issues stat(), the result of the previous one must have been processed
> > > > by client1 and can't be seen by client1/task2, which means this
> > > > problem wouldn't occur. Am I right about this? I don't know if I'm
> > > > missing anything.
> > > >
> > >
> > > There is never a guarantee that the MDS will give you the caps you've
> > > requested, so I don't think you can rely on client1 having to process a
> > > cap revoke to get you out of this situation.
> > >
> > > IOW, I believe this is possible:
> > >
> > > - client1 requests caps to do a stat(), doesn't get them (maybe because
> > > client2 is actively writing?)
> > > - client1 issues GETATTR to MDS, which gets processed there, but reply
> > > gets delayed
> > > - client2 does write()+fsync()
> > > - client1 issues another stat(), ends up waiting on the first GETATTR,
> > > which won't contain the most up to date information
> > >
> > > No cap revoke to client1 here since it never got any in the first place.
> >
> > Yes, you are right. But I think, in this case, the first GETATTR would
> > get processed after client2's write()+fsync(), which means it would
> > get the latest inode content. So the problem wouldn't occur, either.
> > Am I right?
> >
>
> No. In this (hypothetical) case the GETATTR gets processed first. The
> reply just gets delayed for a bit -- possibly in the sender's transmit
> buffer or receiver's receive buffer...or, maybe some frames in the reply
> get dropped somewhere and you have to retransmit (bad cable)? The
> possibilities are endless when it comes to network filesystems. :)
>
> The key point is that on a network filesystem there is always some delay
> between an operation being performed on the server and the client seeing
> the result. All you know is when you transmitted the request and when
> you received the reply.
>
> CephFS is no different here, but it tries to maintain strict cache
> coherency through the cap handling.

Oh, I think I get your point. You mean that there are chances that,
even if the MDS processes the GETATTR reqs successfully and responds
to the client, it won't give the client the caps that they asked for;
so there is no guarantee that the finish of client2's write()+fsync()
would indicate the first GETATTR is already finished on client1's
side. Am I right?

>
> > > > However, as for the read handling patch, this problem do exists, and
> > > > it's really confusing me.
> > > >
> > >
> > > The cap handling code in cephfs is extremely complex. My hat is off to
> > > you for even attempting to understand the MDS LOCK_* handling code.
> > >
> > > > > The read handling patch may suffer from similar issues, though I haven't
> > > > > gone over that one in as much detail.
> > > > >
> > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > > index bba28a5034ba..5a177a3c283f 100644
> > > > > > --- a/fs/ceph/caps.c
> > > > > > +++ b/fs/ceph/caps.c
> > > > > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> > > > > >               goto out;
> > > > > >
> > > > > >       ci->i_flushing_caps &= ~cleaned;
> > > > > > +     ci->last_flush_ack_tid = flush_tid;
> > > > > >
> > > > > >       spin_lock(&mdsc->cap_dirty_lock);
> > > > > >
> > > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > > index 82928cea0209..6f17c0319158 100644
> > > > > > --- a/fs/ceph/dir.c
> > > > > > +++ b/fs/ceph/dir.c
> > > > > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > > >                               }
> > > > > >                       }
> > > > > >               }
> > > > > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> > > > > >       }
> > > > > >       spin_unlock(&dentry->d_lock);
> > > > > >
> > > > > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > > >                                        CEPH_MDS_LEASE_RENEW, seq);
> > > > > >               ceph_put_mds_session(session);
> > > > > >       }
> > > > > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> > > > > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> > > > > >       return valid;
> > > > > >  }
> > > > > >
> > > > > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> > > > > >       if (!valid) {
> > > > > >               struct ceph_mds_client *mdsc =
> > > > > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> > > > > > -             struct ceph_mds_request *req;
> > > > > > +             struct ceph_mds_request *req = NULL;
> > > > > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> > > > > >               int op, err;
> > > > > >               u32 mask;
> > > > > >
> > > > > >               if (flags & LOOKUP_RCU)
> > > > > >                       return -ECHILD;
> > > > > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > > +             if (ceph_security_xattr_wanted(dir))
> > > > > > +                     mask |= CEPH_CAP_XATTR_SHARED;
> > > > > >
> > > > > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> > > > > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> > > > > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > > -             if (!IS_ERR(req)) {
> > > > > > -                     req->r_dentry = dget(dentry);
> > > > > > -                     req->r_num_caps = 2;
> > > > > > -                     req->r_parent = dir;
> > > > > > +             if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > > +                     mutex_lock(&cdir->lookups_inflight_lock);
> > > > > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> > > > > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> > > > > > +             }
> > > > > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> > > > > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > > +                     ceph_mdsc_get_request(req);
> > > > > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > > +                     err = ceph_mdsc_wait_for_request(req);
> > > > > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > > +             } else {
> > > > > >
> > > > > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > > -                     if (ceph_security_xattr_wanted(dir))
> > > > > > -                             mask |= CEPH_CAP_XATTR_SHARED;
> > > > > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > > +                             if (!IS_ERR(req)) {
> > > > > > +                                     unsigned long res = 0;
> > > > > > +                                     req->r_dentry = dget(dentry);
> > > > > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> > > > > > +                                     req->r_num_caps = 2;
> > > > > > +                                     req->r_parent = dir;
> > > > > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> > > > > > +                                     if (IS_ERR_VALUE(res)) {
> > > > > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > > +                                             goto out;
> > > > > > +                                     }
> > > > > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > > +                             }
> > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > > +                     }
> > > > > > +                     if (IS_ERR(req))
> > > > > > +                             goto out;
> > > > > >
> > > > > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > -                     switch (err) {
> > > > > > -                     case 0:
> > > > > > -                             if (d_really_is_positive(dentry) &&
> > > > > > -                                 d_inode(dentry) == req->r_target_inode)
> > > > > > -                                     valid = 1;
> > > > > > -                             break;
> > > > > > -                     case -ENOENT:
> > > > > > -                             if (d_really_is_negative(dentry))
> > > > > > -                                     valid = 1;
> > > > > > -                             /* Fallthrough */
> > > > > > -                     default:
> > > > > > -                             break;
> > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > > +                             mutex_lock(&cdir->lookups_inflight_lock);
> > > > > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> > > > > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > >                       }
> > > > > > -                     ceph_mdsc_put_request(req);
> > > > > > -                     dout("d_revalidate %p lookup result=%d\n",
> > > > > > -                          dentry, err);
> > > > > >               }
> > > > > > +             switch (err) {
> > > > > > +             case 0:
> > > > > > +                     if (d_really_is_positive(dentry) &&
> > > > > > +                         d_inode(dentry) == req->r_target_inode)
> > > > > > +                             valid = 1;
> > > > > > +                     break;
> > > > > > +             case -ENOENT:
> > > > > > +                     if (d_really_is_negative(dentry))
> > > > > > +                             valid = 1;
> > > > > > +                     /* Fallthrough */
> > > > > > +             default:
> > > > > > +                     break;
> > > > > > +             }
> > > > > > +             ceph_mdsc_put_request(req);
> > > > > > +             dout("d_revalidate %p lookup result=%d\n",
> > > > > > +                             dentry, err);
> > > > > >       }
> > > > > >
> > > > > > +out:
> > > > > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> > > > > >       if (valid) {
> > > > > >               ceph_dentry_lru_touch(dentry);
> > > > >
> > > > > d_revalidate is changed to use the cache, but not lookup. Why?
> > > >
> > > > Multiple identical lookups would also be sent out in the scenario
> > > > mentioned above, so we added a cache for lookups, too. I don't know
> > > > if I'm understanding your question correctly.
> > > >
> > >
> > > d_revalidate only checks the validity of an existing dentry. To get a
> > > dentry in the first place, you have to use the directory inode's lookup
> > > readdir operations. lookup isn't changed here and I was wondering why.
> > >
> > > Now that I think about it though, you probably need to do that anyway,
> > > as the VFS code already does something similar when there are multiple
> > > lookups for the same dentry in flight.
> >
> > It seems that the commit 200fd27c8fa2ba8bb4529033967b69a7cbfa2c2e made
> > ceph kernel module to use lookup request to revalidate dentry.
> >
>
> Yes, but it doesn't dispatch through the local kernel's inode lookup
> operation. ceph_lookup also does CEPH_MDS_OP_LOOKUP operations.

I now get your point:-) Will try to fix that as well:-)

>
> > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > > index 9d1f34d46627..8705f0645a24 100644
> > > > > > --- a/fs/ceph/inode.c
> > > > > > +++ b/fs/ceph/inode.c
> > > > > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > > >       dout("alloc_inode %p\n", &ci->vfs_inode);
> > > > > >
> > > > > >       spin_lock_init(&ci->i_ceph_lock);
> > > > > > +     mutex_init(&ci->getattrs_inflight_lock);
> > > > > > +     mutex_init(&ci->lookups_inflight_lock);
> > > > > >
> > > > > >       ci->i_version = 0;
> > > > > >       ci->i_inline_version = 0;
> > > > > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > > >       ci->i_xattrs.index_version = 0;
> > > > > >
> > > > > >       ci->i_caps = RB_ROOT;
> > > > > > +     ci->getattrs_inflight = RB_ROOT;
> > > > > > +     ci->lookups_inflight = RB_ROOT;
> > > > > >       ci->i_auth_cap = NULL;
> > > > > >       ci->i_dirty_caps = 0;
> > > > > >       ci->i_flushing_caps = 0;
> > > > > > +     ci->last_flush_ack_tid = 0;
> > > > > >       INIT_LIST_HEAD(&ci->i_dirty_item);
> > > > > >       INIT_LIST_HEAD(&ci->i_flushing_item);
> > > > > >       ci->i_prealloc_cap_flush = NULL;
> > > > > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> > > > > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> > > > > >        * we expect a negative dentry.
> > > > > >        */
> > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > > >       if (!tgt_vino && d_really_is_positive(dentry))
> > > > > >               return;
> > > > > > -
> > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > > >       if (tgt_vino && (d_really_is_negative(dentry) ||
> > > > > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> > > > > >               return;
> > > > > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > >       struct ceph_mds_request *req;
> > > > > >       int mode;
> > > > > >       int err;
> > > > > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> > > > > >
> > > > > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > > > > >               dout("do_getattr inode %p SNAPDIR\n", inode);
> > > > > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> > > > > >               return 0;
> > > > > >
> > > > > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > > -     if (IS_ERR(req))
> > > > > > -             return PTR_ERR(req);
> > > > > > -     req->r_inode = inode;
> > > > > > -     ihold(inode);
> > > > > > -     req->r_num_caps = 1;
> > > > > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > -     req->r_locked_page = locked_page;
> > > > > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > +     mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> > > > > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> > > > > > +     if (req) {
> > > > > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > > +             ceph_mdsc_get_request(req);
> > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > +             err = ceph_mdsc_wait_for_request(req);
> > > > > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > > +     } else {
> > > > > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > > +             if (!IS_ERR(req)) {
> > > > > > +                     unsigned long res = 0;
> > > > > > +                     req->r_inode = inode;
> > > > > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> > > > > > +                     ihold(inode);
> > > > > > +                     req->r_num_caps = 1;
> > > > > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > +                     req->r_locked_page = locked_page;
> > > > > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> > > > > > +                     if (IS_ERR_VALUE(res)) {
> > > > > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > +                             return res;
> > > > > > +                     }
> > > > > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > > +             }
> > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > +             if (IS_ERR(req))
> > > > > > +                     return PTR_ERR(req);
> > > > > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > +             mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> > > > > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > +     }
> > > > > >       if (locked_page && err == 0) {
> > > > > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> > > > > >               if (inline_version == 0) {
> > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > index 163fc74bf221..5c6e00f2fd0a 100644
> > > > > > --- a/fs/ceph/mds_client.c
> > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> > > > > >       return 0;
> > > > > >  }
> > > > > >
> > > > > > +void ceph_mds_requests_tree_release(struct kref* kref)
> > > > > > +{
> > > > > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> > > > > > +
> > > > > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> > > > > > +     kfree(mrtree);
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * Create an mds request.
> > > > > >   */
> > > > > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> > > > > >       req->r_fmode = -1;
> > > > > >       kref_init(&req->r_kref);
> > > > > >       RB_CLEAR_NODE(&req->r_node);
> > > > > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> > > > > >       INIT_LIST_HEAD(&req->r_wait);
> > > > > > +     init_completion(&req->batch_op_completion);
> > > > > >       init_completion(&req->r_completion);
> > > > > >       init_completion(&req->r_safe_completion);
> > > > > >       INIT_LIST_HEAD(&req->r_unsafe_item);
> > > > > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > > >       mutex_unlock(&mdsc->mutex);
> > > > > >  }
> > > > > >
> > > > > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> > > > > > +{
> > > > > > +     int err = 0;
> > > > > > +     long timeleft = wait_for_completion_killable_timeout(
> > > > > > +                     &req->batch_op_completion,
> > > > > > +                     ceph_timeout_jiffies(req->r_timeout));
> > > > > > +     if (timeleft > 0)
> > > > > > +             err = 0;
> > > > > > +     else if (!timeleft)
> > > > > > +             err = -EIO;  /* timed out */
> > > > > > +     else
> > > > > > +             err = timeleft;  /* killed */
> > > > > > +     if (!err)
> > > > > > +             return err;
> > > > > > +     return req->batch_op_err;
> > > > > > +}
> > > > > > +
> > > > > >  /*
> > > > > >   * Synchrously perform an mds request.  Take care of all of the
> > > > > >   * session setup, forwarding, retry details.
> > > > > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > > >       } else {
> > > > > >               err = req->r_err;
> > > > > >       }
> > > > > > -
> > > > > > +     req->batch_op_err = err;
> > > > > > +     complete_all(&req->batch_op_completion);
> > > > > >  out:
> > > > > >       mutex_unlock(&mdsc->mutex);
> > > > > >       dout("do_request %p done, result %d\n", req, err);
> > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > index 729da155ebf0..41727677643e 100644
> > > > > > --- a/fs/ceph/mds_client.h
> > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> > > > > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> > > > > >                                                struct ceph_mds_request *req);
> > > > > >
> > > > > > +struct mds_request_tree {
> > > > > > +     int mask;
> > > > > > +     struct kref ref;
> > > > > > +     struct rb_root mds_requests;
> > > > > > +     struct rb_node parent_tree_node;
> > > > > > +};
> > > > > > +
> > > > > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> > > > > > +
> > > > > >  /*
> > > > > >   * an in-flight mds request
> > > > > >   */
> > > > > >  struct ceph_mds_request {
> > > > > >       u64 r_tid;                   /* transaction id */
> > > > > > +     u64 last_caps_flush_ack_tid;
> > > > > >       struct rb_node r_node;
> > > > > > +     struct rb_node getattr_lookup_node;
> > > > > >       struct ceph_mds_client *r_mdsc;
> > > > > >
> > > > > >       int r_op;                    /* mds op code */
> > > > > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> > > > > >       struct ceph_msg  *r_reply;
> > > > > >       struct ceph_mds_reply_info_parsed r_reply_info;
> > > > > >       struct page *r_locked_page;
> > > > > > -     int r_err;
> > > > > > +     int r_err, batch_op_err;
> > > > > >
> > > > > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> > > > > >       unsigned long r_started;  /* start time to measure timeout against */
> > > > > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> > > > > >
> > > > > >       struct kref       r_kref;
> > > > > >       struct list_head  r_wait;
> > > > > > +     struct completion batch_op_completion;
> > > > > >       struct completion r_completion;
> > > > > >       struct completion r_safe_completion;
> > > > > >       ceph_mds_request_callback_t r_callback;
> > > > > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> > > > > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> > > > > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > > >                                    struct ceph_mds_request *req);
> > > > > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> > > > > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > > >                               struct inode *dir,
> > > > > >                               struct ceph_mds_request *req);
> > > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > > > index da2cd8e89062..857fbbc0b768 100644
> > > > > > --- a/fs/ceph/super.c
> > > > > > +++ b/fs/ceph/super.c
> > > > > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> > > > > >       destroy_caches();
> > > > > >  }
> > > > > >
> > > > > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > > +                                       struct ceph_mds_request* req,
> > > > > > +                                       bool is_lookup)
> > > > > > +{
> > > > > > +     struct rb_node *node = NULL;
> > > > > > +     struct mds_request_tree* tmp = NULL;
> > > > > > +     if (!is_lookup)
> > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > > +     else
> > > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > > +
> > > > > > +     while (node)
> > > > > > +     {
> > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > > +
> > > > > > +             if (tmp->mask > req->r_args.getattr.mask)
> > > > > > +                     node = node->rb_left;
> > > > > > +             else if (tmp->mask < req->r_args.getattr.mask)
> > > > > > +                     node = node->rb_right;
> > > > > > +             else
> > > > > > +                     break;
> > > > > > +     }
> > > > > > +
> > > > > > +     if (!node)
> > > > > > +             BUG();
> > > > > > +
> > > > > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> > > > > > +
> > > > > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> > > > > > +             if (!is_lookup)
> > > > > > +                     rb_erase(node, &inode->getattrs_inflight);
> > > > > > +             else
> > > > > > +                     rb_erase(node, &inode->lookups_inflight);
> > > > > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> > > > > > +     }
> > > > > > +}
> > > > > > +
> > > > > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > > +                                     struct ceph_mds_request* req,
> > > > > > +                                     bool is_lookup)
> > > > > > +{
> > > > > > +     struct rb_node **p = NULL, *parent = NULL;
> > > > > > +     struct mds_request_tree *tmp = NULL;
> > > > > > +     struct ceph_mds_request* tmp_req = NULL;
> > > > > > +     struct mds_request_tree* mrtree = NULL;
> > > > > > +
> > > > > > +     if (!is_lookup)
> > > > > > +             p = &ci->getattrs_inflight.rb_node;
> > > > > > +     else
> > > > > > +             p = &ci->lookups_inflight.rb_node;
> > > > > > +
> > > > > > +     while (*p) {
> > > > > > +             parent = *p;
> > > > > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> > > > > > +             if (req->r_args.getattr.mask < tmp->mask)
> > > > > > +                     p = &(*p)->rb_left;
> > > > > > +             else if (req->r_args.getattr.mask > tmp->mask)
> > > > > > +                     p = &(*p)->rb_right;
> > > > > > +             else
> > > > > > +                     break;
> > > > > > +     }
> > > > > > +
> > > > > > +     if (!(*p)) {
> > > > > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> > > > > > +
> > > > > > +             if (!mrtree)
> > > > > > +                     return -ENOMEM;
> > > > > > +
> > > > > > +             mrtree->mds_requests = RB_ROOT;
> > > > > > +             mrtree->mask = req->r_args.getattr.mask;
> > > > > > +             kref_init(&mrtree->ref);
> > > > > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> > > > > > +             if (!is_lookup) {
> > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> > > > > > +             } else {
> > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> > > > > > +             }
> > > > > > +     } else {
> > > > > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> > > > > > +     }
> > > > > > +
> > > > > > +     p = &mrtree->mds_requests.rb_node;
> > > > > > +     parent = NULL;
> > > > > > +     while (*p) {
> > > > > > +             parent = *p;
> > > > > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> > > > > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> > > > > > +                     p = &(*p)->rb_left;
> > > > > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> > > > > > +                     p = &(*p)->rb_right;
> > > > > > +             else
> > > > > > +                     BUG();
> > > > > > +     }
> > > > > > +
> > > > > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> > > > > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> > > > > > +     return 0;
> > > > > > +}
> > > > > > +
> > > > > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > > +                                   int mask,
> > > > > > +                                   bool is_lookup)
> > > > > > +{
> > > > > > +     struct rb_node *node = NULL;
> > > > > > +     if (!is_lookup)
> > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > > +     else
> > > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > > +
> > > > > > +     while (node)
> > > > > > +     {
> > > > > > +             struct mds_request_tree* tmp = NULL;
> > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > > +
> > > > > > +             if (tmp->mask > mask) {
> > > > > > +                     node = node->rb_left;
> > > > > > +             } else if (tmp->mask < mask) {
> > > > > > +                     node = node->rb_right;
> > > > > > +             } else {
> > > > > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> > > > > > +                     while (inner_node) {
> > > > > > +                             struct ceph_mds_request* req = NULL;
> > > > > > +
> > > > > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> > > > > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> > > > > > +                                     inner_node = inner_node->rb_left;
> > > > > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> > > > > > +                                     inner_node = inner_node->rb_right;
> > > > > > +                             } else
> > > > > > +                                     return req; /* Found it */
> > > > > > +                     }
> > > > > > +                     return NULL;
> > > > > > +             }
> > > > > > +     }
> > > > > > +     return NULL;
> > > > > > +}
> > > > > >  module_init(init_ceph);
> > > > > >  module_exit(exit_ceph);
> > > > > >
> > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > index dfb64a5211b6..abf761f2a122 100644
> > > > > > --- a/fs/ceph/super.h
> > > > > > +++ b/fs/ceph/super.h
> > > > > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> > > > > >       struct ceph_vino i_vino;   /* ceph ino + snap */
> > > > > >
> > > > > >       spinlock_t i_ceph_lock;
> > > > > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> > > > > > +     struct rb_root getattrs_inflight, lookups_inflight;
> > > > >
> > > > > If you do end up pursuing this further, I think it might be better to
> > > > > use a spinlock to protect the trees.
> > > > >
> > > > > Take the lock, search the tree and if there isn't one, drop the lock,
> > > > > allocate, take it and search again, and insert the one you allocated if
> > > > > no conflicting structure was inserted in the meantime.
> > > > >
> > > > > In addition to cutting down on context switches when these things are
> > > > > contended, that would also make it easier to hide the locking inside the
> > > > > cache search/register/unregister routines.
> > > >
> > > > You are right. At first, I tried to use spinlock, but that caused
> > > > errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
> > > > know how that occurred. Do you have any clue about that?
> > > >
> > > > I can't thank you enough for the help:-) Thanks very much:-)
> > >
> > > Often that means you forgot to unlock something, or maybe hit some lock
> > > inversion? Without more analysis it's impossible to know. :)
> >
> > Thanks:-) I wouldn't try to debug that:-)
> > > > > >       u64 i_version;
> > > > > >       u64 i_inline_version;
> > > > > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> > > > > >       struct rb_root i_caps;           /* cap list */
> > > > > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> > > > > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> > > > > > +     u64 last_flush_ack_tid;
> > > > > >       struct list_head i_dirty_item, i_flushing_item;
> > > > > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> > > > > >        * overlapping, pipelined cap flushes to the mds.  we can probably
> > > > > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> > > > > >                               u64 time_warp_seq, struct timespec64 *ctime,
> > > > > >                               struct timespec64 *mtime,
> > > > > >                               struct timespec64 *atime);
> > > > > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > > +                                            struct ceph_mds_request* req,
> > > > > > +                                            bool is_lookup);
> > > > > > +
> > > > > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > > +                                              struct ceph_mds_request* req,
> > > > > > +                                              bool is_lookup);
> > > > > > +
> > > > > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > > +                                          int mask,
> > > > > > +                                          bool is_lookup);
> > > > > >  extern int ceph_fill_trace(struct super_block *sb,
> > > > > >                          struct ceph_mds_request *req);
> > > > > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> > >
> > > --
> > > Jeff Layton <jlayton@redhat.com>
> > >
>
> --
> Jeff Layton <jlayton@redhat.com>
>
Xuehan Xu Feb. 28, 2019, 6:09 a.m. UTC | #9
Hi, Jeff and Zheng. Do you think it's feasible to give users choices
over what kind of cache coherency they need, as is done in
glusterfs(https://lists.gluster.org/pipermail/gluster-devel/2015-September/046611.html)?

On Thu, 28 Feb 2019 at 10:31, Xuehan Xu <xxhdx1985126@gmail.com> wrote:
>
> > > Hi, Jeff. Thanks for your reply:-)
> > >  In our case, those requests can take more than ten seconds to finish,
> > > which is intolerable to web servers. Actually, every time this
> > > scenario occurs, it makes our monitoring system wrongly recognize
> > > those web servers as down and shift all web traffic to other IDCs.
> > >
> >
> > Ouch, ok. It's understandable that you'd want that fixed and kudos for
> > taking a stab at it. The idea is not a bad one, but we'd need to
> > consider how to do it while maintaining strict cache coherency.
> >
> > > > > Basically, the scenario we are facing now is as follows:
> > > > >      We have several dozens of web servers all of which mounted the
> > > > > same directory for their php scripts and would ask cephfs for the
> > > > > latest file content if some of those scripts is modified. To improve
> > > > > the web servers' efficiency, each of them runs tens of threads to
> > > > > serve web requests, so, as you can see, when some scripts is modified,
> > > > > there would be thousands of threads asking for the same file which
> > > > > would lead to thousands of getattr/lookup and read reqs to the cephfs
> > > > > cluster and cause huge pressure on both MDSes and OSDs.
> > > > >
> > > >
> > > > How often are these files changing?
> > > >
> > > > Between changes to the files, the web server clients should all end up
> > > > with read-only caps that allow them to cache the inode attributes. When
> > > > there is a change to the file, those will need to be recalled of course
> > > > and you might see more activity on the MDS, but that's not necessarily a
> > > > problem.
> > >
> > > File changes is not very often, maybe one or two times a week. But,
> > > since each time files get changed the scenario mentioned above occurs,
> > > this problem really makes us suffer.
> > >
> >
> > I assume that you have verified that this patch does help in that
> > scenario?
> >
>
> Yes:-) This patch can reduce the latency from more than ten seconds to
> within 100 ms most of the time:-) It's only the cache coherency that's
> still haunting us....
>
> > So to make sure I understand the scenario in terms of caps:
> >
> > Most clients have read caps for the files in question prior to changes
> > to these files (and probably leases on the directories and such). When
> > you go to update the files, all of those caps get recalled (which
> > probably also takes a little while).
> >
> > After the file is written, a bunch of tasks on the clients issue stat()
> > calls in parallel, which become LOOKUPs and/or GETATTRs you end up with
> > a thundering herd problem on the MDS.
> >
> > Is that about right?
> >
>
> Yes, that's right:-)
>
> > If so, I don't see a way around that while preserving cache coherency
> > right offhand. I'd have to think about it
> >
> > > > > > Looking over the patch, the basic idea is to implement a cache (in
> > > > > > rbtrees) of getattrs and lookups in flight and just wait on those if
> > > > > > there already is one that matches what we need. I agree with Zheng that
> > > > > > this could end up violating the strict cache coherency guarantees that
> > > > > > ceph tries to provide.
> > > > > >
> > > > > > You're relying on an assumption that if you issue a getattr/lookup while
> > > > > > another one is already in flight, then it's legitimate to use the result
> > > > > > of the initial one for the one issued later. That's not necessarily the
> > > > > > case. I suspect this would be possible, even from a single client:
> > > > > >
> > > > > > - client1/task1 issues stat(), server performs getattr
> > > > > > but reply gets delayed for a bit
> > > > > >
> > > > > > - client2 does write() + fsync()
> > > > > >
> > > > > > - client1/task2 issues stat(), finds the entry in cache from task1 and
> > > > > > ends up waiting for first getattr to complete
> > > > > >
> > > > > > - getattr reply comes in, and both stat calls get back the same
> > > > > > attributes, showing pre-write size/mtime
> > > > > >
> > > > > > In this case, the synchronous write by client2 was provably done before
> > > > > > the second getattr, but the second stat() call wouldn't see the new
> > > > > > size.
> > > > >
> > > > > I understand that this is a big problem. If what I think is right,
> > > > > this problem may be only with the read handling patch.
> > > > > Here's what I think:
> > > > >     First, I think the problem would come up only when client1/task2
> > > > > issues stat() after the finish of client2's write()+sync(), since,
> > > > > only in this case, a pre-write size/mtime seen by client1/task2 would
> > > > > cause errors. And a pre-write size/mtime would be seen by
> > > > > client1/task2 only when client1/task1's stat() is processed before
> > > > > client2's write()+sync(). However, when client1/task1's stat() is
> > > > > processed successfully, the inode's filelock should be in states like
> > > > > LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
> > > > > move the inode's to states like LOCK_MIX which would cause a cap
> > > > > revoke from client1. And it's only when client1 acked the revoke can
> > > > > the MDS go on processing the metadata modification caused by client2's
> > > > > write()+fsync(). Also, MDS sends messages to the same client in the
> > > > > same order of the creation of those messages. So, when client1/task2
> > > > > issues stat(), the result of the previous one must have been processed
> > > > > by client1 and can't be seen by client1/task2, which means this
> > > > > problem wouldn't occur. Am I right about this? I don't know if I'm
> > > > > missing anything.
> > > > >
> > > >
> > > > There is never a guarantee that the MDS will give you the caps you've
> > > > requested, so I don't think you can rely on client1 having to process a
> > > > cap revoke to get you out of this situation.
> > > >
> > > > IOW, I believe this is possible:
> > > >
> > > > - client1 requests caps to do a stat(), doesn't get them (maybe because
> > > > client2 is actively writing?)
> > > > - client1 issues GETATTR to MDS, which gets processed there, but reply
> > > > gets delayed
> > > > - client2 does write()+fsync()
> > > > - client1 issues another stat(), ends up waiting on the first GETATTR,
> > > > which won't contain the most up to date information
> > > >
> > > > No cap revoke to client1 here since it never got any in the first place.
> > >
> > > Yes, you are right. But I think, in this case, the first GETATTR would
> > > get processed after client2's write()+fsync(), which means it would
> > > get the latest inode content. So the problem wouldn't occur, either.
> > > Am I right?
> > >
> >
> > No. In this (hypothetical) case the GETATTR gets processed first. The
> > reply just gets delayed for a bit -- possibly in the sender's transmit
> > buffer or receiver's receive buffer...or, maybe some frames in the reply
> > get dropped somewhere and you have to retransmit (bad cable)? The
> > possibilities are endless when it comes to network filesystems. :)
> >
> > The key point is that on a network filesystem there is always some delay
> > between an operation being performed on the server and the client seeing
> > the result. All you know is when you transmitted the request and when
> > you received the reply.
> >
> > CephFS is no different here, but it tries to maintain strict cache
> > coherency through the cap handling.
>
> Oh, I think I get your point. You mean that there are chances that
> even if the MDS processes the GETATTR reqs successfully and responds to the
> client, it won't give the client the caps that they asked for; so
> there are no guarantee that the finish of client2's write()+fsync()
> would indicate the first GETATTR is already finished on the client1's
> side. Am I right?
>
> >
> > > > > However, as for the read handling patch, this problem do exists, and
> > > > > it's really confusing me.
> > > > >
> > > >
> > > > The cap handling code in cephfs is extremely complex. My hat is off to
> > > > you for even attempting to understand the MDS LOCK_* handling code.
> > > >
> > > > > > The read handling patch may suffer from similar issues, though I haven't
> > > > > > gone over that one in as much detail.
> > > > > >
> > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > > > index bba28a5034ba..5a177a3c283f 100644
> > > > > > > --- a/fs/ceph/caps.c
> > > > > > > +++ b/fs/ceph/caps.c
> > > > > > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> > > > > > >               goto out;
> > > > > > >
> > > > > > >       ci->i_flushing_caps &= ~cleaned;
> > > > > > > +     ci->last_flush_ack_tid = flush_tid;
> > > > > > >
> > > > > > >       spin_lock(&mdsc->cap_dirty_lock);
> > > > > > >
> > > > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > > > index 82928cea0209..6f17c0319158 100644
> > > > > > > --- a/fs/ceph/dir.c
> > > > > > > +++ b/fs/ceph/dir.c
> > > > > > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > > > >                               }
> > > > > > >                       }
> > > > > > >               }
> > > > > > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> > > > > > >       }
> > > > > > >       spin_unlock(&dentry->d_lock);
> > > > > > >
> > > > > > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> > > > > > >                                        CEPH_MDS_LEASE_RENEW, seq);
> > > > > > >               ceph_put_mds_session(session);
> > > > > > >       }
> > > > > > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> > > > > > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> > > > > > >       return valid;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> > > > > > >       if (!valid) {
> > > > > > >               struct ceph_mds_client *mdsc =
> > > > > > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> > > > > > > -             struct ceph_mds_request *req;
> > > > > > > +             struct ceph_mds_request *req = NULL;
> > > > > > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> > > > > > >               int op, err;
> > > > > > >               u32 mask;
> > > > > > >
> > > > > > >               if (flags & LOOKUP_RCU)
> > > > > > >                       return -ECHILD;
> > > > > > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > > > +             if (ceph_security_xattr_wanted(dir))
> > > > > > > +                     mask |= CEPH_CAP_XATTR_SHARED;
> > > > > > >
> > > > > > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> > > > > > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> > > > > > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > > > -             if (!IS_ERR(req)) {
> > > > > > > -                     req->r_dentry = dget(dentry);
> > > > > > > -                     req->r_num_caps = 2;
> > > > > > > -                     req->r_parent = dir;
> > > > > > > +             if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > > > +                     mutex_lock(&cdir->lookups_inflight_lock);
> > > > > > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> > > > > > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> > > > > > > +             }
> > > > > > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> > > > > > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > > > +                     ceph_mdsc_get_request(req);
> > > > > > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > > > +                     err = ceph_mdsc_wait_for_request(req);
> > > > > > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > > > +             } else {
> > > > > > >
> > > > > > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > > > > -                     if (ceph_security_xattr_wanted(dir))
> > > > > > > -                             mask |= CEPH_CAP_XATTR_SHARED;
> > > > > > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> > > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > > > +                             if (!IS_ERR(req)) {
> > > > > > > +                                     unsigned long res = 0;
> > > > > > > +                                     req->r_dentry = dget(dentry);
> > > > > > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> > > > > > > +                                     req->r_num_caps = 2;
> > > > > > > +                                     req->r_parent = dir;
> > > > > > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> > > > > > > +                                     if (IS_ERR_VALUE(res)) {
> > > > > > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > > > +                                             goto out;
> > > > > > > +                                     }
> > > > > > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> > > > > > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> > > > > > > +                             }
> > > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > > > +                     }
> > > > > > > +                     if (IS_ERR(req))
> > > > > > > +                             goto out;
> > > > > > >
> > > > > > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > > -                     switch (err) {
> > > > > > > -                     case 0:
> > > > > > > -                             if (d_really_is_positive(dentry) &&
> > > > > > > -                                 d_inode(dentry) == req->r_target_inode)
> > > > > > > -                                     valid = 1;
> > > > > > > -                             break;
> > > > > > > -                     case -ENOENT:
> > > > > > > -                             if (d_really_is_negative(dentry))
> > > > > > > -                                     valid = 1;
> > > > > > > -                             /* Fallthrough */
> > > > > > > -                     default:
> > > > > > > -                             break;
> > > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> > > > > > > +                             mutex_lock(&cdir->lookups_inflight_lock);
> > > > > > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> > > > > > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> > > > > > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> > > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> > > > > > >                       }
> > > > > > > -                     ceph_mdsc_put_request(req);
> > > > > > > -                     dout("d_revalidate %p lookup result=%d\n",
> > > > > > > -                          dentry, err);
> > > > > > >               }
> > > > > > > +             switch (err) {
> > > > > > > +             case 0:
> > > > > > > +                     if (d_really_is_positive(dentry) &&
> > > > > > > +                         d_inode(dentry) == req->r_target_inode)
> > > > > > > +                             valid = 1;
> > > > > > > +                     break;
> > > > > > > +             case -ENOENT:
> > > > > > > +                     if (d_really_is_negative(dentry))
> > > > > > > +                             valid = 1;
> > > > > > > +                     /* Fallthrough */
> > > > > > > +             default:
> > > > > > > +                     break;
> > > > > > > +             }
> > > > > > > +             ceph_mdsc_put_request(req);
> > > > > > > +             dout("d_revalidate %p lookup result=%d\n",
> > > > > > > +                             dentry, err);
> > > > > > >       }
> > > > > > >
> > > > > > > +out:
> > > > > > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> > > > > > >       if (valid) {
> > > > > > >               ceph_dentry_lru_touch(dentry);
> > > > > >
> > > > > > d_revalidate is changed to use the cache, but not lookup. Why?
> > > > >
> > > > > Multiple same lookups would also be sent out in our scenario mentioned
> > > > > above, so we add a cache for lookups, too. I don't know if I'm getting
> > > > > your question correctly.
> > > > >
> > > >
> > > > d_revalidate only checks the validity of an existing dentry. To get a
> > > > dentry in the first place, you have to use the directory inode's lookup
> > > > readdir operations. lookup isn't changed here and I was wondering why.
> > > >
> > > > Now that I think about it though, you probably need to do that anyway,
> > > > as the VFS code already does something similar when there are multiple
> > > > lookups for the same dentry in flight.
> > >
> > > It seems that the commit 200fd27c8fa2ba8bb4529033967b69a7cbfa2c2e made
> > > ceph kernel module to use lookup request to revalidate dentry.
> > >
> >
> > Yes, but it doesn't dispatch through the local kernel's inode lookup
> > operation. ceph_lookup also does CEPH_MDS_OP_LOOKUP operations.
>
> I now get your point:-) Will try to fix that as well:-)
>
> >
> > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > > > index 9d1f34d46627..8705f0645a24 100644
> > > > > > > --- a/fs/ceph/inode.c
> > > > > > > +++ b/fs/ceph/inode.c
> > > > > > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > > > >       dout("alloc_inode %p\n", &ci->vfs_inode);
> > > > > > >
> > > > > > >       spin_lock_init(&ci->i_ceph_lock);
> > > > > > > +     mutex_init(&ci->getattrs_inflight_lock);
> > > > > > > +     mutex_init(&ci->lookups_inflight_lock);
> > > > > > >
> > > > > > >       ci->i_version = 0;
> > > > > > >       ci->i_inline_version = 0;
> > > > > > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> > > > > > >       ci->i_xattrs.index_version = 0;
> > > > > > >
> > > > > > >       ci->i_caps = RB_ROOT;
> > > > > > > +     ci->getattrs_inflight = RB_ROOT;
> > > > > > > +     ci->lookups_inflight = RB_ROOT;
> > > > > > >       ci->i_auth_cap = NULL;
> > > > > > >       ci->i_dirty_caps = 0;
> > > > > > >       ci->i_flushing_caps = 0;
> > > > > > > +     ci->last_flush_ack_tid = 0;
> > > > > > >       INIT_LIST_HEAD(&ci->i_dirty_item);
> > > > > > >       INIT_LIST_HEAD(&ci->i_flushing_item);
> > > > > > >       ci->i_prealloc_cap_flush = NULL;
> > > > > > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> > > > > > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> > > > > > >        * we expect a negative dentry.
> > > > > > >        */
> > > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > > > >       if (!tgt_vino && d_really_is_positive(dentry))
> > > > > > >               return;
> > > > > > > -
> > > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> > > > > > >       if (tgt_vino && (d_really_is_negative(dentry) ||
> > > > > > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> > > > > > >               return;
> > > > > > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > > >       struct ceph_mds_request *req;
> > > > > > >       int mode;
> > > > > > >       int err;
> > > > > > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> > > > > > >
> > > > > > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> > > > > > >               dout("do_getattr inode %p SNAPDIR\n", inode);
> > > > > > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> > > > > > >               return 0;
> > > > > > >
> > > > > > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > > > -     if (IS_ERR(req))
> > > > > > > -             return PTR_ERR(req);
> > > > > > > -     req->r_inode = inode;
> > > > > > > -     ihold(inode);
> > > > > > > -     req->r_num_caps = 1;
> > > > > > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > > -     req->r_locked_page = locked_page;
> > > > > > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > > +     mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> > > > > > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> > > > > > > +     if (req) {
> > > > > > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > > > +             ceph_mdsc_get_request(req);
> > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > > +             err = ceph_mdsc_wait_for_request(req);
> > > > > > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > > > +     } else {
> > > > > > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> > > > > > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> > > > > > > +             if (!IS_ERR(req)) {
> > > > > > > +                     unsigned long res = 0;
> > > > > > > +                     req->r_inode = inode;
> > > > > > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> > > > > > > +                     ihold(inode);
> > > > > > > +                     req->r_num_caps = 1;
> > > > > > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> > > > > > > +                     req->r_locked_page = locked_page;
> > > > > > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> > > > > > > +                     if (IS_ERR_VALUE(res)) {
> > > > > > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > > +                             return res;
> > > > > > > +                     }
> > > > > > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> > > > > > > +             }
> > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > > +             if (IS_ERR(req))
> > > > > > > +                     return PTR_ERR(req);
> > > > > > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > > +             mutex_lock(&cinode->getattrs_inflight_lock);
> > > > > > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> > > > > > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> > > > > > > +     }
> > > > > > >       if (locked_page && err == 0) {
> > > > > > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> > > > > > >               if (inline_version == 0) {
> > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > index 163fc74bf221..5c6e00f2fd0a 100644
> > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> > > > > > >       return 0;
> > > > > > >  }
> > > > > > >
> > > > > > > +void ceph_mds_requests_tree_release(struct kref* kref)
> > > > > > > +{
> > > > > > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> > > > > > > +
> > > > > > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> > > > > > > +     kfree(mrtree);
> > > > > > > +}
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * Create an mds request.
> > > > > > >   */
> > > > > > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> > > > > > >       req->r_fmode = -1;
> > > > > > >       kref_init(&req->r_kref);
> > > > > > >       RB_CLEAR_NODE(&req->r_node);
> > > > > > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> > > > > > >       INIT_LIST_HEAD(&req->r_wait);
> > > > > > > +     init_completion(&req->batch_op_completion);
> > > > > > >       init_completion(&req->r_completion);
> > > > > > >       init_completion(&req->r_safe_completion);
> > > > > > >       INIT_LIST_HEAD(&req->r_unsafe_item);
> > > > > > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > > > >       mutex_unlock(&mdsc->mutex);
> > > > > > >  }
> > > > > > >
> > > > > > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> > > > > > > +{
> > > > > > > +     int err = 0;
> > > > > > > +     long timeleft = wait_for_completion_killable_timeout(
> > > > > > > +                     &req->batch_op_completion,
> > > > > > > +                     ceph_timeout_jiffies(req->r_timeout));
> > > > > > > +     if (timeleft > 0)
> > > > > > > +             err = 0;
> > > > > > > +     else if (!timeleft)
> > > > > > > +             err = -EIO;  /* timed out */
> > > > > > > +     else
> > > > > > > +             err = timeleft;  /* killed */
> > > > > > > +     if (!err)
> > > > > > > +             return err;
> > > > > > > +     return req->batch_op_err;
> > > > > > > +}
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * Synchrously perform an mds request.  Take care of all of the
> > > > > > >   * session setup, forwarding, retry details.
> > > > > > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > > > >       } else {
> > > > > > >               err = req->r_err;
> > > > > > >       }
> > > > > > > -
> > > > > > > +     req->batch_op_err = err;
> > > > > > > +     complete_all(&req->batch_op_completion);
> > > > > > >  out:
> > > > > > >       mutex_unlock(&mdsc->mutex);
> > > > > > >       dout("do_request %p done, result %d\n", req, err);
> > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > > index 729da155ebf0..41727677643e 100644
> > > > > > > --- a/fs/ceph/mds_client.h
> > > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> > > > > > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> > > > > > >                                                struct ceph_mds_request *req);
> > > > > > >
> > > > > > > +struct mds_request_tree {
> > > > > > > +     int mask;
> > > > > > > +     struct kref ref;
> > > > > > > +     struct rb_root mds_requests;
> > > > > > > +     struct rb_node parent_tree_node;
> > > > > > > +};
> > > > > > > +
> > > > > > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> > > > > > > +
> > > > > > >  /*
> > > > > > >   * an in-flight mds request
> > > > > > >   */
> > > > > > >  struct ceph_mds_request {
> > > > > > >       u64 r_tid;                   /* transaction id */
> > > > > > > +     u64 last_caps_flush_ack_tid;
> > > > > > >       struct rb_node r_node;
> > > > > > > +     struct rb_node getattr_lookup_node;
> > > > > > >       struct ceph_mds_client *r_mdsc;
> > > > > > >
> > > > > > >       int r_op;                    /* mds op code */
> > > > > > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> > > > > > >       struct ceph_msg  *r_reply;
> > > > > > >       struct ceph_mds_reply_info_parsed r_reply_info;
> > > > > > >       struct page *r_locked_page;
> > > > > > > -     int r_err;
> > > > > > > +     int r_err, batch_op_err;
> > > > > > >
> > > > > > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> > > > > > >       unsigned long r_started;  /* start time to measure timeout against */
> > > > > > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> > > > > > >
> > > > > > >       struct kref       r_kref;
> > > > > > >       struct list_head  r_wait;
> > > > > > > +     struct completion batch_op_completion;
> > > > > > >       struct completion r_completion;
> > > > > > >       struct completion r_safe_completion;
> > > > > > >       ceph_mds_request_callback_t r_callback;
> > > > > > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> > > > > > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> > > > > > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> > > > > > >                                    struct ceph_mds_request *req);
> > > > > > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> > > > > > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> > > > > > >                               struct inode *dir,
> > > > > > >                               struct ceph_mds_request *req);
> > > > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > > > > > index da2cd8e89062..857fbbc0b768 100644
> > > > > > > --- a/fs/ceph/super.c
> > > > > > > +++ b/fs/ceph/super.c
> > > > > > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> > > > > > >       destroy_caches();
> > > > > > >  }
> > > > > > >
> > > > > > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > > > +                                       struct ceph_mds_request* req,
> > > > > > > +                                       bool is_lookup)
> > > > > > > +{
> > > > > > > +     struct rb_node *node = NULL;
> > > > > > > +     struct mds_request_tree* tmp = NULL;
> > > > > > > +     if (!is_lookup)
> > > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > > > +     else
> > > > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > > > +
> > > > > > > +     while (node)
> > > > > > > +     {
> > > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > > > +
> > > > > > > +             if (tmp->mask > req->r_args.getattr.mask)
> > > > > > > +                     node = node->rb_left;
> > > > > > > +             else if (tmp->mask < req->r_args.getattr.mask)
> > > > > > > +                     node = node->rb_right;
> > > > > > > +             else
> > > > > > > +                     break;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     if (!node)
> > > > > > > +             BUG();
> > > > > > > +
> > > > > > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> > > > > > > +
> > > > > > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> > > > > > > +             if (!is_lookup)
> > > > > > > +                     rb_erase(node, &inode->getattrs_inflight);
> > > > > > > +             else
> > > > > > > +                     rb_erase(node, &inode->lookups_inflight);
> > > > > > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> > > > > > > +     }
> > > > > > > +}
> > > > > > > +
> > > > > > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > > > +                                     struct ceph_mds_request* req,
> > > > > > > +                                     bool is_lookup)
> > > > > > > +{
> > > > > > > +     struct rb_node **p = NULL, *parent = NULL;
> > > > > > > +     struct mds_request_tree *tmp = NULL;
> > > > > > > +     struct ceph_mds_request* tmp_req = NULL;
> > > > > > > +     struct mds_request_tree* mrtree = NULL;
> > > > > > > +
> > > > > > > +     if (!is_lookup)
> > > > > > > +             p = &ci->getattrs_inflight.rb_node;
> > > > > > > +     else
> > > > > > > +             p = &ci->lookups_inflight.rb_node;
> > > > > > > +
> > > > > > > +     while (*p) {
> > > > > > > +             parent = *p;
> > > > > > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> > > > > > > +             if (req->r_args.getattr.mask < tmp->mask)
> > > > > > > +                     p = &(*p)->rb_left;
> > > > > > > +             else if (req->r_args.getattr.mask > tmp->mask)
> > > > > > > +                     p = &(*p)->rb_right;
> > > > > > > +             else
> > > > > > > +                     break;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     if (!(*p)) {
> > > > > > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> > > > > > > +
> > > > > > > +             if (!mrtree)
> > > > > > > +                     return -ENOMEM;
> > > > > > > +
> > > > > > > +             mrtree->mds_requests = RB_ROOT;
> > > > > > > +             mrtree->mask = req->r_args.getattr.mask;
> > > > > > > +             kref_init(&mrtree->ref);
> > > > > > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> > > > > > > +             if (!is_lookup) {
> > > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> > > > > > > +             } else {
> > > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> > > > > > > +             }
> > > > > > > +     } else {
> > > > > > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     p = &mrtree->mds_requests.rb_node;
> > > > > > > +     parent = NULL;
> > > > > > > +     while (*p) {
> > > > > > > +             parent = *p;
> > > > > > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> > > > > > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> > > > > > > +                     p = &(*p)->rb_left;
> > > > > > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> > > > > > > +                     p = &(*p)->rb_right;
> > > > > > > +             else
> > > > > > > +                     BUG();
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> > > > > > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> > > > > > > +     return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > > > +                                   int mask,
> > > > > > > +                                   bool is_lookup)
> > > > > > > +{
> > > > > > > +     struct rb_node *node = NULL;
> > > > > > > +     if (!is_lookup)
> > > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> > > > > > > +     else
> > > > > > > +             node = inode->lookups_inflight.rb_node;
> > > > > > > +
> > > > > > > +     while (node)
> > > > > > > +     {
> > > > > > > +             struct mds_request_tree* tmp = NULL;
> > > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> > > > > > > +
> > > > > > > +             if (tmp->mask > mask) {
> > > > > > > +                     node = node->rb_left;
> > > > > > > +             } else if (tmp->mask < mask) {
> > > > > > > +                     node = node->rb_right;
> > > > > > > +             } else {
> > > > > > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> > > > > > > +                     while (inner_node) {
> > > > > > > +                             struct ceph_mds_request* req = NULL;
> > > > > > > +
> > > > > > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> > > > > > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> > > > > > > +                                     inner_node = inner_node->rb_left;
> > > > > > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> > > > > > > +                                     inner_node = inner_node->rb_right;
> > > > > > > +                             } else
> > > > > > > +                                     return req; /* Found it */
> > > > > > > +                     }
> > > > > > > +                     return NULL;
> > > > > > > +             }
> > > > > > > +     }
> > > > > > > +     return NULL;
> > > > > > > +}
> > > > > > >  module_init(init_ceph);
> > > > > > >  module_exit(exit_ceph);
> > > > > > >
> > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > > index dfb64a5211b6..abf761f2a122 100644
> > > > > > > --- a/fs/ceph/super.h
> > > > > > > +++ b/fs/ceph/super.h
> > > > > > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> > > > > > >       struct ceph_vino i_vino;   /* ceph ino + snap */
> > > > > > >
> > > > > > >       spinlock_t i_ceph_lock;
> > > > > > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> > > > > > > +     struct rb_root getattrs_inflight, lookups_inflight;
> > > > > >
> > > > > > If you do end up pursuing this further, I think it might be better to
> > > > > > use a spinlock to protect the trees.
> > > > > >
> > > > > > Take the lock, search the tree and if there isn't one, drop the lock,
> > > > > > allocate, take it and search again, and insert the one you allocated if
> > > > > > no conflicting structure was inserted in the meantime.
> > > > > >
> > > > > > In addition to cutting down on context switches when these things are
> > > > > > contended, that would also make it easier to hide the locking inside the
> > > > > > cache search/register/unregister routines.
> > > > >
> > > > > You are right. At first, I tried to use spinlock, but that caused
> > > > > errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
> > > > > know how that occurred. Do you have any clew about that?
> > > > >
> > > > > I can't thank you enough for the help:-) Thanks very much:-)
> > > >
> > > > Often that means you forgot to unlock something, or maybe hit some lock
> > > > inversion? Without more analysis it's impossible to know. :)
> > >
> > > Thanks:-) I wouldn't try to debug that:-)
> > > > > > >       u64 i_version;
> > > > > > >       u64 i_inline_version;
> > > > > > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> > > > > > >       struct rb_root i_caps;           /* cap list */
> > > > > > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> > > > > > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> > > > > > > +     u64 last_flush_ack_tid;
> > > > > > >       struct list_head i_dirty_item, i_flushing_item;
> > > > > > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> > > > > > >        * overlapping, pipelined cap flushes to the mds.  we can probably
> > > > > > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> > > > > > >                               u64 time_warp_seq, struct timespec64 *ctime,
> > > > > > >                               struct timespec64 *mtime,
> > > > > > >                               struct timespec64 *atime);
> > > > > > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > > > +                                            struct ceph_mds_request* req,
> > > > > > > +                                            bool is_lookup);
> > > > > > > +
> > > > > > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> > > > > > > +                                              struct ceph_mds_request* req,
> > > > > > > +                                              bool is_lookup);
> > > > > > > +
> > > > > > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> > > > > > > +                                          int mask,
> > > > > > > +                                          bool is_lookup);
> > > > > > >  extern int ceph_fill_trace(struct super_block *sb,
> > > > > > >                          struct ceph_mds_request *req);
> > > > > > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> > > >
> > > > --
> > > > Jeff Layton <jlayton@redhat.com>
> > > >
> >
> > --
> > Jeff Layton <jlayton@redhat.com>
> >
Xuehan Xu March 1, 2019, 9:08 a.m. UTC | #10
Hi, Jeff and Zheng.

I think we can make MDS notify all clients who want the As/Ls/Xs/Fs
caps of a specific inode about any modification to the corresponding
fields of that inode, before acking those modifications. And only
after all clients have acked this notification can the MDS ack the
corresponding modification. I think this can give clients a hint about
modification from other clients by which they can arrange their own
getattrs/lookups.

And similarly, for file data read/write, I think we can leverage the
OSD's watch/notify mechanism to achieve the same effect.

Do you think this is an option? Thanks:-)

On Thu, 28 Feb 2019 at 19:11, Xuehan Xu <xxhdx1985126@gmail.com> wrote:
>
>
>
> On Thu, Feb 28, 2019, 14:09 Xuehan Xu <xxhdx1985126@gmail.com wrote:
>>
>> Hi, Jeff and Zheng. Do you think it's feasible to give users choices
>> over what kind of cache coherency they need like what is done in
>> glusterfs(https://lists.gluster.org/pipermail/gluster-devel/2015-September/046611.html)?
>
>
> Or maybe we should make write clients notify other Fr clients about the update before responding to the writing process?
>>
>>
>> On Thu, 28 Feb 2019 at 10:31, Xuehan Xu <xxhdx1985126@gmail.com> wrote:
>> >
>> > > > Hi, Jeff. Thanks for your reply:-)
>> > > >  In our case, those requests can take more than ten seconds to finish,
>> > > > which is intolerable to web servers. Actually, every time this
>> > > > scenario occurs, it makes our monitoring system wrongly recognize
>> > > > those web servers as down and shift all web traffic to other IDCs.
>> > > >
>> > >
>> > > Ouch, ok. It's understandable that you'd want that fixed and kudos for
>> > > taking a stab at it. The idea is not a bad one, but we'd need to
>> > > consider how to do it while maintaining strict cache coherency.
>> > >
>> > > > > > Basically, the scenario we are facing now is as follows:
>> > > > > >      We have several dozens of web servers all of which mounted the
>> > > > > > same directory for their php scripts and would ask cephfs for the
>> > > > > > latest file content if some of those scripts is modified. To improve
>> > > > > > the web servers' efficiency, each of them runs tens of threads to
>> > > > > > serve web requests, so, as you can see, when some scripts is modified,
>> > > > > > there would be thousands of threads asking for the same file which
>> > > > > > would lead to thousands of getattr/lookup and read reqs to the cephfs
>> > > > > > cluster and cause huge pressure on both MDSes and OSDs.
>> > > > > >
>> > > > >
>> > > > > How often are these files changing?
>> > > > >
>> > > > > Between changes to the files, the web server clients should all end up
>> > > > > with read-only caps that allow them to cache the inode attributes. When
>> > > > > there is a change to the file, those will need to be recalled of course
>> > > > > and you might see more activity on the MDS, but that's not necessarily a
>> > > > > problem.
>> > > >
>> > > > File changes is not very often, maybe one or two times a week. But,
>> > > > since each time files get changed the scenario mentioned above occurs,
>> > > > this problem really makes us suffer.
>> > > >
>> > >
>> > > I assume that you have verified that this patch does help in that
>> > > scenario?
>> > >
>> >
>> > Yes:-) This patch can reduce the latency from more than ten seconds to
>> > within 100 ms most of the time:-) It's only the cache coherency that's
>> > still haunting us....
>> >
>> > > So to make sure I understand the scenario in terms of caps:
>> > >
>> > > Most clients have read caps for the files in question prior to changes
>> > > to these files (and probably leases on the directories and such). When
>> > > you go to update the files, all of those caps get recalled (which
>> > > probably also takes a little while).
>> > >
>> > > After the file is written, a bunch of tasks on the clients issue stat()
>> > > calls in parallel, which become LOOKUPs and/or GETATTRs you end up with
>> > > a thundering herd problem on the MDS.
>> > >
>> > > Is that about right?
>> > >
>> >
>> > Yes, that's right:-)
>> >
>> > > If so, I don't see a way around that while preserving cache coherency
>> > > right offhand. I'd have to think about it
>> > >
>> > > > > > > Looking over the patch, the basic idea is to implement a cache (in
>> > > > > > > rbtrees) of getattrs and lookups in flight and just wait on those if
>> > > > > > > there already is one that matches what we need. I agree with Zheng that
>> > > > > > > this could end up violating the strict cache coherency guarantees that
>> > > > > > > ceph tries to provide.
>> > > > > > >
>> > > > > > > You're relying on an assumption that if you issue a getattr/lookup while
>> > > > > > > another one is already in flight, then it's legitimate to use the result
>> > > > > > > of the initial one for the one issued later. That's not necessarily the
>> > > > > > > case. I suspect this would be possible, even from a single client:
>> > > > > > >
>> > > > > > > - client1/task1 issues stat(), server performs getattr
>> > > > > > > but reply gets delayed for a bit
>> > > > > > >
>> > > > > > > - client2 does write() + fsync()
>> > > > > > >
>> > > > > > > - client1/task2 issues stat(), finds the entry in cache from task1 and
>> > > > > > > ends up waiting for first getattr to complete
>> > > > > > >
>> > > > > > > - getattr reply comes in, and both stat calls get back the same
>> > > > > > > attributes, showing pre-write size/mtime
>> > > > > > >
>> > > > > > > In this case, the synchronous write by client2 was provably done before
>> > > > > > > the second getattr, but the second stat() call wouldn't see the new
>> > > > > > > size.
>> > > > > >
>> > > > > > I understand that this is a big problem. If what I think is right,
>> > > > > > this problem may exist only in the read handling patch.
>> > > > > > Here's what I think:
>> > > > > >     First, I think the problem would come up only when client1/task2
>> > > > > > issues stat() after the finish of client2's write()+fsync(), since,
>> > > > > > only in this case, a pre-write size/mtime seen by client1/task2 would
>> > > > > > cause errors. And a pre-write size/mtime would be seen by
>> > > > > > client1/task2 only when client1/task1's stat() is processed before
>> > > > > > client2's write()+fsync(). However, when client1/task1's stat() is
>> > > > > > processed successfully, the inode's filelock should be in states like
>> > > > > > LOCK_SYNC, so when processing client2's write()+fsync(), the MDS must
>> > > > > > move the inode's to states like LOCK_MIX which would cause a cap
>> > > > > > revoke from client1. And only after client1 has acked the revoke can
>> > > > > > the MDS go on processing the metadata modification caused by client2's
>> > > > > > write()+fsync(). Also, the MDS sends messages to the same client in the
>> > > > > > same order of the creation of those messages. So, when client1/task2
>> > > > > > issues stat(), the result of the previous one must have been processed
>> > > > > > by client1 and can't be seen by client1/task2, which means this
>> > > > > > problem wouldn't occur. Am I right about this? I don't know if I'm
>> > > > > > missing anything.
>> > > > > >
>> > > > >
>> > > > > There is never a guarantee that the MDS will give you the caps you've
>> > > > > requested, so I don't think you can rely on client1 having to process a
>> > > > > cap revoke to get you out of this situation.
>> > > > >
>> > > > > IOW, I believe this is possible:
>> > > > >
>> > > > > - client1 requests caps to do a stat(), doesn't get them (maybe because
>> > > > > client2 is actively writing?)
>> > > > > - client1 issues GETATTR to MDS, which gets processed there, but reply
>> > > > > gets delayed
>> > > > > - client2 does write()+fsync()
>> > > > > - client1 issues another stat(), ends up waiting on the first GETATTR,
>> > > > > which won't contain the most up to date information
>> > > > >
>> > > > > No cap revoke to client1 here since it never got any in the first place.
>> > > >
>> > > > Yes, you are right. But I think, in this case, the first GETATTR would
>> > > > get processed after client2's write()+fsync(), which means it would
>> > > > get the latest inode content. So the problem wouldn't occur, either.
>> > > > Am I right?
>> > > >
>> > >
>> > > No. In this (hypothetical) case the GETATTR gets processed first. The
>> > > reply just gets delayed for a bit -- possibly in the sender's transmit
>> > > buffer or receiver's receive buffer...or, maybe some frames in the reply
>> > > get dropped somewhere and you have to retransmit (bad cable)? The
>> > > possibilities are endless when it comes to network filesystems. :)
>> > >
>> > > The key point is that on a network filesystem there is always some delay
>> > > between an operation being performed on the server and the client seeing
>> > > the result. All you know is when you transmitted the request and when
>> > > you received the reply.
>> > >
>> > > CephFS is no different here, but it tries to maintain strict cache
>> > > coherency through the cap handling.
>> >
>> > Oh, I think I get your point. You mean that there are chances that
>> > even if MDS process the GETATTR reqs successfully and respond to the
>> > client, it won't give the client the caps that they asked for; so
>> > there is no guarantee that the finish of client2's write()+fsync()
>> > would indicate that the first GETATTR has already finished on client1's
>> > side. Am I right?
>> >
>> > >
>> > > > > > However, as for the read handling patch, this problem does exist, and
>> > > > > > it's really confusing me.
>> > > > > >
>> > > > >
>> > > > > The cap handling code in cephfs is extremely complex. My hat is off to
>> > > > > you for even attempting to understand the MDS LOCK_* handling code.
>> > > > >
>> > > > > > > The read handling patch may suffer from similar issues, though I haven't
>> > > > > > > gone over that one in as much detail.
>> > > > > > >
>> > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> > > > > > > > index bba28a5034ba..5a177a3c283f 100644
>> > > > > > > > --- a/fs/ceph/caps.c
>> > > > > > > > +++ b/fs/ceph/caps.c
>> > > > > > > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
>> > > > > > > >               goto out;
>> > > > > > > >
>> > > > > > > >       ci->i_flushing_caps &= ~cleaned;
>> > > > > > > > +     ci->last_flush_ack_tid = flush_tid;
>> > > > > > > >
>> > > > > > > >       spin_lock(&mdsc->cap_dirty_lock);
>> > > > > > > >
>> > > > > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
>> > > > > > > > index 82928cea0209..6f17c0319158 100644
>> > > > > > > > --- a/fs/ceph/dir.c
>> > > > > > > > +++ b/fs/ceph/dir.c
>> > > > > > > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>> > > > > > > >                               }
>> > > > > > > >                       }
>> > > > > > > >               }
>> > > > > > > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
>> > > > > > > >       }
>> > > > > > > >       spin_unlock(&dentry->d_lock);
>> > > > > > > >
>> > > > > > > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>> > > > > > > >                                        CEPH_MDS_LEASE_RENEW, seq);
>> > > > > > > >               ceph_put_mds_session(session);
>> > > > > > > >       }
>> > > > > > > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
>> > > > > > > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
>> > > > > > > >       return valid;
>> > > > > > > >  }
>> > > > > > > >
>> > > > > > > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
>> > > > > > > >       if (!valid) {
>> > > > > > > >               struct ceph_mds_client *mdsc =
>> > > > > > > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
>> > > > > > > > -             struct ceph_mds_request *req;
>> > > > > > > > +             struct ceph_mds_request *req = NULL;
>> > > > > > > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
>> > > > > > > >               int op, err;
>> > > > > > > >               u32 mask;
>> > > > > > > >
>> > > > > > > >               if (flags & LOOKUP_RCU)
>> > > > > > > >                       return -ECHILD;
>> > > > > > > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
>> > > > > > > > +             if (ceph_security_xattr_wanted(dir))
>> > > > > > > > +                     mask |= CEPH_CAP_XATTR_SHARED;
>> > > > > > > >
>> > > > > > > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
>> > > > > > > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
>> > > > > > > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
>> > > > > > > > -             if (!IS_ERR(req)) {
>> > > > > > > > -                     req->r_dentry = dget(dentry);
>> > > > > > > > -                     req->r_num_caps = 2;
>> > > > > > > > -                     req->r_parent = dir;
>> > > > > > > > +             if (op == CEPH_MDS_OP_LOOKUP) {
>> > > > > > > > +                     mutex_lock(&cdir->lookups_inflight_lock);
>> > > > > > > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
>> > > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
>> > > > > > > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
>> > > > > > > > +             }
>> > > > > > > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
>> > > > > > > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
>> > > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
>> > > > > > > > +                     ceph_mdsc_get_request(req);
>> > > > > > > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
>> > > > > > > > +                     err = ceph_mdsc_wait_for_request(req);
>> > > > > > > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
>> > > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
>> > > > > > > > +             } else {
>> > > > > > > >
>> > > > > > > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
>> > > > > > > > -                     if (ceph_security_xattr_wanted(dir))
>> > > > > > > > -                             mask |= CEPH_CAP_XATTR_SHARED;
>> > > > > > > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
>> > > > > > > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
>> > > > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
>> > > > > > > > +                             if (!IS_ERR(req)) {
>> > > > > > > > +                                     unsigned long res = 0;
>> > > > > > > > +                                     req->r_dentry = dget(dentry);
>> > > > > > > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
>> > > > > > > > +                                     req->r_num_caps = 2;
>> > > > > > > > +                                     req->r_parent = dir;
>> > > > > > > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
>> > > > > > > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
>> > > > > > > > +                                     if (IS_ERR_VALUE(res)) {
>> > > > > > > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
>> > > > > > > > +                                             goto out;
>> > > > > > > > +                                     }
>> > > > > > > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
>> > > > > > > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
>> > > > > > > > +                             }
>> > > > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
>> > > > > > > > +                     }
>> > > > > > > > +                     if (IS_ERR(req))
>> > > > > > > > +                             goto out;
>> > > > > > > >
>> > > > > > > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
>> > > > > > > > -                     switch (err) {
>> > > > > > > > -                     case 0:
>> > > > > > > > -                             if (d_really_is_positive(dentry) &&
>> > > > > > > > -                                 d_inode(dentry) == req->r_target_inode)
>> > > > > > > > -                                     valid = 1;
>> > > > > > > > -                             break;
>> > > > > > > > -                     case -ENOENT:
>> > > > > > > > -                             if (d_really_is_negative(dentry))
>> > > > > > > > -                                     valid = 1;
>> > > > > > > > -                             /* Fallthrough */
>> > > > > > > > -                     default:
>> > > > > > > > -                             break;
>> > > > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
>> > > > > > > > +                             mutex_lock(&cdir->lookups_inflight_lock);
>> > > > > > > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
>> > > > > > > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
>> > > > > > > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
>> > > > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
>> > > > > > > >                       }
>> > > > > > > > -                     ceph_mdsc_put_request(req);
>> > > > > > > > -                     dout("d_revalidate %p lookup result=%d\n",
>> > > > > > > > -                          dentry, err);
>> > > > > > > >               }
>> > > > > > > > +             switch (err) {
>> > > > > > > > +             case 0:
>> > > > > > > > +                     if (d_really_is_positive(dentry) &&
>> > > > > > > > +                         d_inode(dentry) == req->r_target_inode)
>> > > > > > > > +                             valid = 1;
>> > > > > > > > +                     break;
>> > > > > > > > +             case -ENOENT:
>> > > > > > > > +                     if (d_really_is_negative(dentry))
>> > > > > > > > +                             valid = 1;
>> > > > > > > > +                     /* Fallthrough */
>> > > > > > > > +             default:
>> > > > > > > > +                     break;
>> > > > > > > > +             }
>> > > > > > > > +             ceph_mdsc_put_request(req);
>> > > > > > > > +             dout("d_revalidate %p lookup result=%d\n",
>> > > > > > > > +                             dentry, err);
>> > > > > > > >       }
>> > > > > > > >
>> > > > > > > > +out:
>> > > > > > > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
>> > > > > > > >       if (valid) {
>> > > > > > > >               ceph_dentry_lru_touch(dentry);
>> > > > > > >
>> > > > > > > d_revalidate is changed to use the cache, but not lookup. Why?
>> > > > > >
>> > > > > > Multiple same lookups would also be sent out in our scenario mentioned
>> > > > > > above, so we add a cache for lookups, too. I don't know if I'm getting
>> > > > > > your question correctly.
>> > > > > >
>> > > > >
>> > > > > d_revalidate only checks the validity of an existing dentry. To get a
>> > > > > dentry in the first place, you have to use the directory inode's lookup
>> > > > > or readdir operations. lookup isn't changed here and I was wondering why.
>> > > > >
>> > > > > Now that I think about it though, you probably need to do that anyway,
>> > > > > as the VFS code already does something similar when there are multiple
>> > > > > lookups for the same dentry in flight.
>> > > >
>> > > > It seems that commit 200fd27c8fa2ba8bb4529033967b69a7cbfa2c2e made
>> > > > the ceph kernel module use lookup requests to revalidate dentries.
>> > > >
>> > >
>> > > Yes, but it doesn't dispatch through the local kernel's inode lookup
>> > > operation. ceph_lookup also does CEPH_MDS_OP_LOOKUP operations.
>> >
>> > I now get your point:-) Will try to fix that as well:-)
>> >
>> > >
>> > > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>> > > > > > > > index 9d1f34d46627..8705f0645a24 100644
>> > > > > > > > --- a/fs/ceph/inode.c
>> > > > > > > > +++ b/fs/ceph/inode.c
>> > > > > > > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>> > > > > > > >       dout("alloc_inode %p\n", &ci->vfs_inode);
>> > > > > > > >
>> > > > > > > >       spin_lock_init(&ci->i_ceph_lock);
>> > > > > > > > +     mutex_init(&ci->getattrs_inflight_lock);
>> > > > > > > > +     mutex_init(&ci->lookups_inflight_lock);
>> > > > > > > >
>> > > > > > > >       ci->i_version = 0;
>> > > > > > > >       ci->i_inline_version = 0;
>> > > > > > > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>> > > > > > > >       ci->i_xattrs.index_version = 0;
>> > > > > > > >
>> > > > > > > >       ci->i_caps = RB_ROOT;
>> > > > > > > > +     ci->getattrs_inflight = RB_ROOT;
>> > > > > > > > +     ci->lookups_inflight = RB_ROOT;
>> > > > > > > >       ci->i_auth_cap = NULL;
>> > > > > > > >       ci->i_dirty_caps = 0;
>> > > > > > > >       ci->i_flushing_caps = 0;
>> > > > > > > > +     ci->last_flush_ack_tid = 0;
>> > > > > > > >       INIT_LIST_HEAD(&ci->i_dirty_item);
>> > > > > > > >       INIT_LIST_HEAD(&ci->i_flushing_item);
>> > > > > > > >       ci->i_prealloc_cap_flush = NULL;
>> > > > > > > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
>> > > > > > > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
>> > > > > > > >        * we expect a negative dentry.
>> > > > > > > >        */
>> > > > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>> > > > > > > >       if (!tgt_vino && d_really_is_positive(dentry))
>> > > > > > > >               return;
>> > > > > > > > -
>> > > > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>> > > > > > > >       if (tgt_vino && (d_really_is_negative(dentry) ||
>> > > > > > > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
>> > > > > > > >               return;
>> > > > > > > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>> > > > > > > >       struct ceph_mds_request *req;
>> > > > > > > >       int mode;
>> > > > > > > >       int err;
>> > > > > > > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
>> > > > > > > >
>> > > > > > > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
>> > > > > > > >               dout("do_getattr inode %p SNAPDIR\n", inode);
>> > > > > > > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>> > > > > > > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
>> > > > > > > >               return 0;
>> > > > > > > >
>> > > > > > > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
>> > > > > > > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
>> > > > > > > > -     if (IS_ERR(req))
>> > > > > > > > -             return PTR_ERR(req);
>> > > > > > > > -     req->r_inode = inode;
>> > > > > > > > -     ihold(inode);
>> > > > > > > > -     req->r_num_caps = 1;
>> > > > > > > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
>> > > > > > > > -     req->r_locked_page = locked_page;
>> > > > > > > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
>> > > > > > > > +     mutex_lock(&cinode->getattrs_inflight_lock);
>> > > > > > > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
>> > > > > > > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
>> > > > > > > > +     if (req) {
>> > > > > > > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
>> > > > > > > > +             ceph_mdsc_get_request(req);
>> > > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
>> > > > > > > > +             err = ceph_mdsc_wait_for_request(req);
>> > > > > > > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
>> > > > > > > > +     } else {
>> > > > > > > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
>> > > > > > > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
>> > > > > > > > +             if (!IS_ERR(req)) {
>> > > > > > > > +                     unsigned long res = 0;
>> > > > > > > > +                     req->r_inode = inode;
>> > > > > > > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
>> > > > > > > > +                     ihold(inode);
>> > > > > > > > +                     req->r_num_caps = 1;
>> > > > > > > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
>> > > > > > > > +                     req->r_locked_page = locked_page;
>> > > > > > > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
>> > > > > > > > +                     if (IS_ERR_VALUE(res)) {
>> > > > > > > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
>> > > > > > > > +                             return res;
>> > > > > > > > +                     }
>> > > > > > > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
>> > > > > > > > +             }
>> > > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
>> > > > > > > > +             if (IS_ERR(req))
>> > > > > > > > +                     return PTR_ERR(req);
>> > > > > > > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
>> > > > > > > > +             mutex_lock(&cinode->getattrs_inflight_lock);
>> > > > > > > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
>> > > > > > > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
>> > > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
>> > > > > > > > +     }
>> > > > > > > >       if (locked_page && err == 0) {
>> > > > > > > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
>> > > > > > > >               if (inline_version == 0) {
>> > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> > > > > > > > index 163fc74bf221..5c6e00f2fd0a 100644
>> > > > > > > > --- a/fs/ceph/mds_client.c
>> > > > > > > > +++ b/fs/ceph/mds_client.c
>> > > > > > > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
>> > > > > > > >       return 0;
>> > > > > > > >  }
>> > > > > > > >
>> > > > > > > > +void ceph_mds_requests_tree_release(struct kref* kref)
>> > > > > > > > +{
>> > > > > > > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
>> > > > > > > > +
>> > > > > > > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
>> > > > > > > > +     kfree(mrtree);
>> > > > > > > > +}
>> > > > > > > > +
>> > > > > > > >  /*
>> > > > > > > >   * Create an mds request.
>> > > > > > > >   */
>> > > > > > > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>> > > > > > > >       req->r_fmode = -1;
>> > > > > > > >       kref_init(&req->r_kref);
>> > > > > > > >       RB_CLEAR_NODE(&req->r_node);
>> > > > > > > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
>> > > > > > > >       INIT_LIST_HEAD(&req->r_wait);
>> > > > > > > > +     init_completion(&req->batch_op_completion);
>> > > > > > > >       init_completion(&req->r_completion);
>> > > > > > > >       init_completion(&req->r_safe_completion);
>> > > > > > > >       INIT_LIST_HEAD(&req->r_unsafe_item);
>> > > > > > > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>> > > > > > > >       mutex_unlock(&mdsc->mutex);
>> > > > > > > >  }
>> > > > > > > >
>> > > > > > > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
>> > > > > > > > +{
>> > > > > > > > +     int err = 0;
>> > > > > > > > +     long timeleft = wait_for_completion_killable_timeout(
>> > > > > > > > +                     &req->batch_op_completion,
>> > > > > > > > +                     ceph_timeout_jiffies(req->r_timeout));
>> > > > > > > > +     if (timeleft > 0)
>> > > > > > > > +             err = 0;
>> > > > > > > > +     else if (!timeleft)
>> > > > > > > > +             err = -EIO;  /* timed out */
>> > > > > > > > +     else
>> > > > > > > > +             err = timeleft;  /* killed */
>> > > > > > > > +     if (!err)
>> > > > > > > > +             return err;
>> > > > > > > > +     return req->batch_op_err;
>> > > > > > > > +}
>> > > > > > > > +
>> > > > > > > >  /*
>> > > > > > > >   * Synchrously perform an mds request.  Take care of all of the
>> > > > > > > >   * session setup, forwarding, retry details.
>> > > > > > > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>> > > > > > > >       } else {
>> > > > > > > >               err = req->r_err;
>> > > > > > > >       }
>> > > > > > > > -
>> > > > > > > > +     req->batch_op_err = err;
>> > > > > > > > +     complete_all(&req->batch_op_completion);
>> > > > > > > >  out:
>> > > > > > > >       mutex_unlock(&mdsc->mutex);
>> > > > > > > >       dout("do_request %p done, result %d\n", req, err);
>> > > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> > > > > > > > index 729da155ebf0..41727677643e 100644
>> > > > > > > > --- a/fs/ceph/mds_client.h
>> > > > > > > > +++ b/fs/ceph/mds_client.h
>> > > > > > > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
>> > > > > > > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
>> > > > > > > >                                                struct ceph_mds_request *req);
>> > > > > > > >
>> > > > > > > > +struct mds_request_tree {
>> > > > > > > > +     int mask;
>> > > > > > > > +     struct kref ref;
>> > > > > > > > +     struct rb_root mds_requests;
>> > > > > > > > +     struct rb_node parent_tree_node;
>> > > > > > > > +};
>> > > > > > > > +
>> > > > > > > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
>> > > > > > > > +
>> > > > > > > >  /*
>> > > > > > > >   * an in-flight mds request
>> > > > > > > >   */
>> > > > > > > >  struct ceph_mds_request {
>> > > > > > > >       u64 r_tid;                   /* transaction id */
>> > > > > > > > +     u64 last_caps_flush_ack_tid;
>> > > > > > > >       struct rb_node r_node;
>> > > > > > > > +     struct rb_node getattr_lookup_node;
>> > > > > > > >       struct ceph_mds_client *r_mdsc;
>> > > > > > > >
>> > > > > > > >       int r_op;                    /* mds op code */
>> > > > > > > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
>> > > > > > > >       struct ceph_msg  *r_reply;
>> > > > > > > >       struct ceph_mds_reply_info_parsed r_reply_info;
>> > > > > > > >       struct page *r_locked_page;
>> > > > > > > > -     int r_err;
>> > > > > > > > +     int r_err, batch_op_err;
>> > > > > > > >
>> > > > > > > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
>> > > > > > > >       unsigned long r_started;  /* start time to measure timeout against */
>> > > > > > > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
>> > > > > > > >
>> > > > > > > >       struct kref       r_kref;
>> > > > > > > >       struct list_head  r_wait;
>> > > > > > > > +     struct completion batch_op_completion;
>> > > > > > > >       struct completion r_completion;
>> > > > > > > >       struct completion r_safe_completion;
>> > > > > > > >       ceph_mds_request_callback_t r_callback;
>> > > > > > > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
>> > > > > > > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
>> > > > > > > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>> > > > > > > >                                    struct ceph_mds_request *req);
>> > > > > > > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
>> > > > > > > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>> > > > > > > >                               struct inode *dir,
>> > > > > > > >                               struct ceph_mds_request *req);
>> > > > > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> > > > > > > > index da2cd8e89062..857fbbc0b768 100644
>> > > > > > > > --- a/fs/ceph/super.c
>> > > > > > > > +++ b/fs/ceph/super.c
>> > > > > > > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
>> > > > > > > >       destroy_caches();
>> > > > > > > >  }
>> > > > > > > >
>> > > > > > > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
>> > > > > > > > +                                       struct ceph_mds_request* req,
>> > > > > > > > +                                       bool is_lookup)
>> > > > > > > > +{
>> > > > > > > > +     struct rb_node *node = NULL;
>> > > > > > > > +     struct mds_request_tree* tmp = NULL;
>> > > > > > > > +     if (!is_lookup)
>> > > > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
>> > > > > > > > +     else
>> > > > > > > > +             node = inode->lookups_inflight.rb_node;
>> > > > > > > > +
>> > > > > > > > +     while (node)
>> > > > > > > > +     {
>> > > > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
>> > > > > > > > +
>> > > > > > > > +             if (tmp->mask > req->r_args.getattr.mask)
>> > > > > > > > +                     node = node->rb_left;
>> > > > > > > > +             else if (tmp->mask < req->r_args.getattr.mask)
>> > > > > > > > +                     node = node->rb_right;
>> > > > > > > > +             else
>> > > > > > > > +                     break;
>> > > > > > > > +     }
>> > > > > > > > +
>> > > > > > > > +     if (!node)
>> > > > > > > > +             BUG();
>> > > > > > > > +
>> > > > > > > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
>> > > > > > > > +
>> > > > > > > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
>> > > > > > > > +             if (!is_lookup)
>> > > > > > > > +                     rb_erase(node, &inode->getattrs_inflight);
>> > > > > > > > +             else
>> > > > > > > > +                     rb_erase(node, &inode->lookups_inflight);
>> > > > > > > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
>> > > > > > > > +     }
>> > > > > > > > +}
>> > > > > > > > +
>> > > > > > > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
>> > > > > > > > +                                     struct ceph_mds_request* req,
>> > > > > > > > +                                     bool is_lookup)
>> > > > > > > > +{
>> > > > > > > > +     struct rb_node **p = NULL, *parent = NULL;
>> > > > > > > > +     struct mds_request_tree *tmp = NULL;
>> > > > > > > > +     struct ceph_mds_request* tmp_req = NULL;
>> > > > > > > > +     struct mds_request_tree* mrtree = NULL;
>> > > > > > > > +
>> > > > > > > > +     if (!is_lookup)
>> > > > > > > > +             p = &ci->getattrs_inflight.rb_node;
>> > > > > > > > +     else
>> > > > > > > > +             p = &ci->lookups_inflight.rb_node;
>> > > > > > > > +
>> > > > > > > > +     while (*p) {
>> > > > > > > > +             parent = *p;
>> > > > > > > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
>> > > > > > > > +             if (req->r_args.getattr.mask < tmp->mask)
>> > > > > > > > +                     p = &(*p)->rb_left;
>> > > > > > > > +             else if (req->r_args.getattr.mask > tmp->mask)
>> > > > > > > > +                     p = &(*p)->rb_right;
>> > > > > > > > +             else
>> > > > > > > > +                     break;
>> > > > > > > > +     }
>> > > > > > > > +
>> > > > > > > > +     if (!(*p)) {
>> > > > > > > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
>> > > > > > > > +
>> > > > > > > > +             if (!mrtree)
>> > > > > > > > +                     return -ENOMEM;
>> > > > > > > > +
>> > > > > > > > +             mrtree->mds_requests = RB_ROOT;
>> > > > > > > > +             mrtree->mask = req->r_args.getattr.mask;
>> > > > > > > > +             kref_init(&mrtree->ref);
>> > > > > > > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
>> > > > > > > > +             if (!is_lookup) {
>> > > > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
>> > > > > > > > +             } else {
>> > > > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
>> > > > > > > > +             }
>> > > > > > > > +     } else {
>> > > > > > > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
>> > > > > > > > +     }
>> > > > > > > > +
>> > > > > > > > +     p = &mrtree->mds_requests.rb_node;
>> > > > > > > > +     parent = NULL;
>> > > > > > > > +     while (*p) {
>> > > > > > > > +             parent = *p;
>> > > > > > > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
>> > > > > > > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
>> > > > > > > > +                     p = &(*p)->rb_left;
>> > > > > > > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
>> > > > > > > > +                     p = &(*p)->rb_right;
>> > > > > > > > +             else
>> > > > > > > > +                     BUG();
>> > > > > > > > +     }
>> > > > > > > > +
>> > > > > > > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
>> > > > > > > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
>> > > > > > > > +     return 0;
>> > > > > > > > +}
>> > > > > > > > +
>> > > > > > > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
>> > > > > > > > +                                   int mask,
>> > > > > > > > +                                   bool is_lookup)
>> > > > > > > > +{
>> > > > > > > > +     struct rb_node *node = NULL;
>> > > > > > > > +     if (!is_lookup)
>> > > > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
>> > > > > > > > +     else
>> > > > > > > > +             node = inode->lookups_inflight.rb_node;
>> > > > > > > > +
>> > > > > > > > +     while (node)
>> > > > > > > > +     {
>> > > > > > > > +             struct mds_request_tree* tmp = NULL;
>> > > > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
>> > > > > > > > +
>> > > > > > > > +             if (tmp->mask > mask) {
>> > > > > > > > +                     node = node->rb_left;
>> > > > > > > > +             } else if (tmp->mask < mask) {
>> > > > > > > > +                     node = node->rb_right;
>> > > > > > > > +             } else {
>> > > > > > > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
>> > > > > > > > +                     while (inner_node) {
>> > > > > > > > +                             struct ceph_mds_request* req = NULL;
>> > > > > > > > +
>> > > > > > > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
>> > > > > > > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
>> > > > > > > > +                                     inner_node = inner_node->rb_left;
>> > > > > > > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
>> > > > > > > > +                                     inner_node = inner_node->rb_right;
>> > > > > > > > +                             } else
>> > > > > > > > +                                     return req; /* Found it */
>> > > > > > > > +                     }
>> > > > > > > > +                     return NULL;
>> > > > > > > > +             }
>> > > > > > > > +     }
>> > > > > > > > +     return NULL;
>> > > > > > > > +}
>> > > > > > > >  module_init(init_ceph);
>> > > > > > > >  module_exit(exit_ceph);
>> > > > > > > >
>> > > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> > > > > > > > index dfb64a5211b6..abf761f2a122 100644
>> > > > > > > > --- a/fs/ceph/super.h
>> > > > > > > > +++ b/fs/ceph/super.h
>> > > > > > > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
>> > > > > > > >       struct ceph_vino i_vino;   /* ceph ino + snap */
>> > > > > > > >
>> > > > > > > >       spinlock_t i_ceph_lock;
>> > > > > > > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
>> > > > > > > > +     struct rb_root getattrs_inflight, lookups_inflight;
>> > > > > > >
>> > > > > > > If you do end up pursuing this further, I think it might be better to
>> > > > > > > use a spinlock to protect the trees.
>> > > > > > >
>> > > > > > > Take the lock, search the tree and if there isn't one, drop the lock,
>> > > > > > > allocate, take it and search again, and insert the one you allocated if
>> > > > > > > no conflicting structure was inserted in the meantime.
>> > > > > > >
>> > > > > > > In addition to cutting down on context switches when these things are
>> > > > > > > contended, that would also make it easier to hide the locking inside the
>> > > > > > > cache search/register/unregister routines.
>> > > > > >
>> > > > > > You are right. At first, I tried to use spinlock, but that caused
>> > > > > > errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
>> > > > > > know how that occurred. Do you have any clew about that?
>> > > > > >
>> > > > > > I can't thank you enough for the help:-) Thanks very much:-)
>> > > > >
>> > > > > Often that means you forgot to unlock something, or maybe hit some lock
>> > > > > inversion? Without more analysis it's impossible to know. :)
>> > > >
>> > > > Thanks:-) I wouldn't try to debug that:-)
>> > > > > > > >       u64 i_version;
>> > > > > > > >       u64 i_inline_version;
>> > > > > > > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
>> > > > > > > >       struct rb_root i_caps;           /* cap list */
>> > > > > > > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
>> > > > > > > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
>> > > > > > > > +     u64 last_flush_ack_tid;
>> > > > > > > >       struct list_head i_dirty_item, i_flushing_item;
>> > > > > > > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
>> > > > > > > >        * overlapping, pipelined cap flushes to the mds.  we can probably
>> > > > > > > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
>> > > > > > > >                               u64 time_warp_seq, struct timespec64 *ctime,
>> > > > > > > >                               struct timespec64 *mtime,
>> > > > > > > >                               struct timespec64 *atime);
>> > > > > > > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
>> > > > > > > > +                                            struct ceph_mds_request* req,
>> > > > > > > > +                                            bool is_lookup);
>> > > > > > > > +
>> > > > > > > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
>> > > > > > > > +                                              struct ceph_mds_request* req,
>> > > > > > > > +                                              bool is_lookup);
>> > > > > > > > +
>> > > > > > > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
>> > > > > > > > +                                          int mask,
>> > > > > > > > +                                          bool is_lookup);
>> > > > > > > >  extern int ceph_fill_trace(struct super_block *sb,
>> > > > > > > >                          struct ceph_mds_request *req);
>> > > > > > > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
>> > > > >
>> > > > > --
>> > > > > Jeff Layton <jlayton@redhat.com>
>> > > > >
>> > >
>> > > --
>> > > Jeff Layton <jlayton@redhat.com>
>> > >
Yan, Zheng March 1, 2019, 3:36 p.m. UTC | #11
On Fri, Mar 1, 2019 at 5:08 PM Xuehan Xu <xxhdx1985126@gmail.com> wrote:
>
> Hi, Jeff and Zheng.
>
> I think we can make MDS notify all clients who want the As/Ls/Xs/Fs
> caps of a specific inode about any modification to the corresponding
> fields of that inode, before acking those modifications. And only
> after all clients have acked this notification can the MDS ack the
> corresponding modification. I think this can give clients a hint about
> modification from other clients by which they can arrange their only
> getattrs/lookups.
>

I think we can force the MDS to return the shared cap for getattr. Something like:

- If the getattr reply includes the shared cap, use the reply to satisfy all
pending getattrs. (Pin the shared cap until all pending getattrs have returned.)
- If the getattr reply does not include the shared cap and there are pending
getattrs, send a getattr request with a special flag. The flag makes the MDS
return the shared cap.



> And similarly, for file data read/write, I think we can leverage the
> OSD's watch/notify mechanism to achieve the same effect.
>
> Do you think this is an option? Thanks:-)
>
> On Thu, 28 Feb 2019 at 19:11, Xuehan Xu <xxhdx1985126@gmail.com> wrote:
> >
> >
> >
> > On Thu, Feb 28, 2019, 14:09 Xuehan Xu <xxhdx1985126@gmail.com wrote:
> >>
> >> Hi, Jeff and Zheng. Do you think it's feasible to give users choices
> >> over what kind of cache coherency they need like what is done in
> >> glusterfs(https://lists.gluster.org/pipermail/gluster-devel/2015-September/046611.html)?
> >
> >
> > Or maybe we should make write clients notify other Fr clients about the update before responding to the writing process?
> >>
> >>
> >> On Thu, 28 Feb 2019 at 10:31, Xuehan Xu <xxhdx1985126@gmail.com> wrote:
> >> >
> >> > > > Hi, Jeff. Thanks for your reply:-)
> >> > > >  In our case, those requests can take more than ten seconds to finish,
> >> > > > which is intolerable to web servers. Actually, every time this
> >> > > > scenario occurs, it makes our monitoring system wrongly recognize
> >> > > > those web servers as down and shift all web traffic to other IDCs.
> >> > > >
> >> > >
> >> > > Ouch, ok. It's understandable that you'd want that fixed and kudos for
> >> > > taking a stab at it. The idea is not a bad one, but we'd need to
> >> > > consider how to do it while maintaining strict cache coherency.
> >> > >
> >> > > > > > Basically, the scenario we are facing now is as follows:
> >> > > > > >      We have several dozens of web servers all of which mounted the
> >> > > > > > same directory for their php scripts and would ask cephfs for the
> >> > > > > > latest file content if some of those scripts is modified. To improve
> >> > > > > > the web servers' efficiency, each of them runs tens of threads to
> >> > > > > > serve web requests, so, as you can see, when some scripts is modified,
> >> > > > > > there would be thousands of threads asking for the same file which
> >> > > > > > would lead to thousands of getattr/lookup and read reqs to the cephfs
> >> > > > > > cluster and cause huge pressure on both MDSes and OSDs.
> >> > > > > >
> >> > > > >
> >> > > > > How often are these files changing?
> >> > > > >
> >> > > > > Between changes to the files, the web server clients should all end up
> >> > > > > with read-only caps that allow them to cache the inode attributes. When
> >> > > > > there is a change to the file, those will need to be recalled of course
> >> > > > > and you might see more activity on the MDS, but that's not necessarily a
> >> > > > > problem.
> >> > > >
> >> > > > File changes is not very often, maybe one or two times a week. But,
> >> > > > since each time files get changed the scenario mentioned above occurs,
> >> > > > this problem really makes us suffer.
> >> > > >
> >> > >
> >> > > I assume that you have verified that this patch does help in that
> >> > > scenario?
> >> > >
> >> >
> >> > Yes:-) This patch can reduce the latency from more than ten seconds to
> >> > within 100 ms most of the time:-) It's only the cache coherency that's
> >> > still haunting us....
> >> >
> >> > > So to make sure I understand the scenario in terms of caps:
> >> > >
> >> > > Most clients have read caps for the files in question prior to changes
> >> > > to these files (and probably leases on the directories and such). When
> >> > > you go to update the files, all of those caps get recalled (which
> >> > > probably also takes a little while).
> >> > >
> >> > > After the file is written, a bunch of tasks on the clients issue stat()
> >> > > calls in parallel, which become LOOKUPs and/or GETATTRs you end up with
> >> > > a thundering herd problem on the MDS.
> >> > >
> >> > > Is that about right?
> >> > >
> >> >
> >> > Yes, that's right:-)
> >> >
> >> > > If so, I don't see a way around that while preserving cache coherency
> >> > > right offhand. I'd have to think about it
> >> > >
> >> > > > > > > Looking over the patch, the basic idea is to implement a cache (in
> >> > > > > > > rbtrees) of getattrs and lookups in flight and just wait on those if
> >> > > > > > > there already is one that matches what we need. I agree with Zheng that
> >> > > > > > > this could end up violating the strict cache coherency guarantees that
> >> > > > > > > ceph tries to provide.
> >> > > > > > >
> >> > > > > > > You're relying on an assumption that if you issue a getattr/lookup while
> >> > > > > > > another one is already in flight, then it's legitimate to use the result
> >> > > > > > > of the initial one for the one issued later. That's not necessarily the
> >> > > > > > > case. I suspect this would be possible, even from a single client:
> >> > > > > > >
> >> > > > > > > - client1/task1 issues stat(), server performs getattr
> >> > > > > > > but reply gets delayed for a bit
> >> > > > > > >
> >> > > > > > > - client2 does write() + fsync()
> >> > > > > > >
> >> > > > > > > - client1/task2 issues stat(), finds the entry in cache from task1 and
> >> > > > > > > ends up waiting for first getattr to complete
> >> > > > > > >
> >> > > > > > > - getattr reply comes in, and both stat calls get back the same
> >> > > > > > > attributes, showing pre-write size/mtime
> >> > > > > > >
> >> > > > > > > In this case, the synchronous write by client2 was provably done before
> >> > > > > > > the second getattr, but the second stat() call wouldn't see the new
> >> > > > > > > size.
> >> > > > > >
> >> > > > > > I understand that this is a big problem. If what I think is right,
> >> > > > > > this problem may be only with the read handling patch.
> >> > > > > > Here's what I think:
> >> > > > > >     First, I think the problem would come up only when client1/task2
> >> > > > > > issues stat() after the finish of client2's write()+sync(), since,
> >> > > > > > only in this case, a pre-write size/mtime seen by client1/task2 would
> >> > > > > > cause errors. And a pre-write size/mtime would be seen by
> >> > > > > > client1/task2 only when client1/task1's stat() is processed before
> >> > > > > > client2's write()+sync(). However, when client1/task1's stat() is
> >> > > > > > processed successfully, the inode's filelock should be in states like
> >> > > > > > LOCK_SYNC, so when processing client2's write()+sync(), the MDS must
> >> > > > > > move the inode's to states like LOCK_MIX which would cause a cap
> >> > > > > > revoke from client1. And it's only when client1 acked the revoke can
> >> > > > > > the MDS go on processing the metadata modification caused by client2's
> >> > > > > > write()+fsync(). Also, MDS sends messages to the same client in the
> >> > > > > > same order of the creation of those messages. So, when client1/task2
> >> > > > > > issues stat(), the result of the previous one must have been processed
> >> > > > > > by client1 and can't be seen by client1/task2, which means this
> >> > > > > > problem wouldn't occur. Am I right about this? I don't know if I'm
> >> > > > > > missing anything.
> >> > > > > >
> >> > > > >
> >> > > > > There is never a guarantee that the MDS will give you the caps you've
> >> > > > > requested, so I don't think you can rely on client1 having to process a
> >> > > > > cap revoke to get you out of this situation.
> >> > > > >
> >> > > > > IOW, I believe this is possible:
> >> > > > >
> >> > > > > - client1 requests caps to do a stat(), doesn't get them (maybe because
> >> > > > > client2 is actively writing?)
> >> > > > > - client1 issues GETATTR to MDS, which gets processed there, but reply
> >> > > > > gets delayed
> >> > > > > - client2 does write()+fsync()
> >> > > > > - client1 issues another stat(), ends up waiting on the first GETATTR,
> >> > > > > which won't contain the most up to date information
> >> > > > >
> >> > > > > No cap revoke to client1 here since it never got any in the first place.
> >> > > >
> >> > > > Yes, you are right. But I think, in this case, the first GETATTR would
> >> > > > get processed after client2's write()+fsync(), which means it would
> >> > > > get the latest inode content. So the problem wouldn't occur, either.
> >> > > > Am I right?
> >> > > >
> >> > >
> >> > > No. In this (hypothetical) case the GETATTR gets processed first. The
> >> > > reply just gets delayed for a bit -- possibly in the sender's transmit
> >> > > buffer or receiver's receive buffer...or, maybe some frames in the reply
> >> > > get dropped somewhere and you have to retransmit (bad cable)? The
> >> > > possibilities are endless when it comes to network filesystems. :)
> >> > >
> >> > > The key point is that on a network filesystem there is always some delay
> >> > > between an operation being performed on the server and the client seeing
> >> > > the result. All you know is when you transmitted the request and when
> >> > > you received the reply.
> >> > >
> >> > > CephFS is no different here, but it tries to maintain strict cache
> >> > > coherency through the cap handling.
> >> >
> >> > Oh, I think I get your point. You mean that there are chances that
> >> > even if MDS process the GETATTR reqs successfully and respond to the
> >> > client, it won't give the client the caps that they asked for; so
> >> > there are no guarantee that the finish of client2's write()+fsync()
> >> > would indicate the first GETATTR is already finished on the client1's
> >> > side. Am I right?
> >> >
> >> > >
> >> > > > > > However, as for the read handling patch, this problem do exists, and
> >> > > > > > it's really confusing me.
> >> > > > > >
> >> > > > >
> >> > > > > The cap handling code in cephfs is extremely complex. My hat is off to
> >> > > > > you for even attempting to understand the MDS LOCK_* handling code.
> >> > > > >
> >> > > > > > > The read handling patch may suffer from similar issues, though I haven't
> >> > > > > > > gone over that one in as much detail.
> >> > > > > > >
> >> > > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> >> > > > > > > > index bba28a5034ba..5a177a3c283f 100644
> >> > > > > > > > --- a/fs/ceph/caps.c
> >> > > > > > > > +++ b/fs/ceph/caps.c
> >> > > > > > > > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> >> > > > > > > >               goto out;
> >> > > > > > > >
> >> > > > > > > >       ci->i_flushing_caps &= ~cleaned;
> >> > > > > > > > +     ci->last_flush_ack_tid = flush_tid;
> >> > > > > > > >
> >> > > > > > > >       spin_lock(&mdsc->cap_dirty_lock);
> >> > > > > > > >
> >> > > > > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> >> > > > > > > > index 82928cea0209..6f17c0319158 100644
> >> > > > > > > > --- a/fs/ceph/dir.c
> >> > > > > > > > +++ b/fs/ceph/dir.c
> >> > > > > > > > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> >> > > > > > > >                               }
> >> > > > > > > >                       }
> >> > > > > > > >               }
> >> > > > > > > > +             dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
> >> > > > > > > >       }
> >> > > > > > > >       spin_unlock(&dentry->d_lock);
> >> > > > > > > >
> >> > > > > > > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
> >> > > > > > > >                                        CEPH_MDS_LEASE_RENEW, seq);
> >> > > > > > > >               ceph_put_mds_session(session);
> >> > > > > > > >       }
> >> > > > > > > > -     dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> >> > > > > > > > +     dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
> >> > > > > > > >       return valid;
> >> > > > > > > >  }
> >> > > > > > > >
> >> > > > > > > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
> >> > > > > > > >       if (!valid) {
> >> > > > > > > >               struct ceph_mds_client *mdsc =
> >> > > > > > > >                       ceph_sb_to_client(dir->i_sb)->mdsc;
> >> > > > > > > > -             struct ceph_mds_request *req;
> >> > > > > > > > +             struct ceph_mds_request *req = NULL;
> >> > > > > > > > +             struct ceph_inode_info* cdir = ceph_inode(dir);
> >> > > > > > > >               int op, err;
> >> > > > > > > >               u32 mask;
> >> > > > > > > >
> >> > > > > > > >               if (flags & LOOKUP_RCU)
> >> > > > > > > >                       return -ECHILD;
> >> > > > > > > > +             mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> >> > > > > > > > +             if (ceph_security_xattr_wanted(dir))
> >> > > > > > > > +                     mask |= CEPH_CAP_XATTR_SHARED;
> >> > > > > > > >
> >> > > > > > > >               op = ceph_snap(dir) == CEPH_SNAPDIR ?
> >> > > > > > > >                       CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> >> > > > > > > > -             req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> >> > > > > > > > -             if (!IS_ERR(req)) {
> >> > > > > > > > -                     req->r_dentry = dget(dentry);
> >> > > > > > > > -                     req->r_num_caps = 2;
> >> > > > > > > > -                     req->r_parent = dir;
> >> > > > > > > > +             if (op == CEPH_MDS_OP_LOOKUP) {
> >> > > > > > > > +                     mutex_lock(&cdir->lookups_inflight_lock);
> >> > > > > > > > +                     dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n",
> >> > > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> >> > > > > > > > +                     req = __search_inode_getattr_or_lookup(cdir, mask, true);
> >> > > > > > > > +             }
> >> > > > > > > > +             if (req && op == CEPH_MDS_OP_LOOKUP) {
> >> > > > > > > > +                     dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> >> > > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> >> > > > > > > > +                     ceph_mdsc_get_request(req);
> >> > > > > > > > +                     mutex_unlock(&cdir->lookups_inflight_lock);
> >> > > > > > > > +                     err = ceph_mdsc_wait_for_request(req);
> >> > > > > > > > +                     dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> >> > > > > > > > +                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> >> > > > > > > > +             } else {
> >> > > > > > > >
> >> > > > > > > > -                     mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> >> > > > > > > > -                     if (ceph_security_xattr_wanted(dir))
> >> > > > > > > > -                             mask |= CEPH_CAP_XATTR_SHARED;
> >> > > > > > > > -                     req->r_args.getattr.mask = cpu_to_le32(mask);
> >> > > > > > > > +                     req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> >> > > > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> >> > > > > > > > +                             if (!IS_ERR(req)) {
> >> > > > > > > > +                                     unsigned long res = 0;
> >> > > > > > > > +                                     req->r_dentry = dget(dentry);
> >> > > > > > > > +                                     req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> >> > > > > > > > +                                     req->r_num_caps = 2;
> >> > > > > > > > +                                     req->r_parent = dir;
> >> > > > > > > > +                                     req->r_args.getattr.mask = cpu_to_le32(mask);
> >> > > > > > > > +                                     res = __register_inode_getattr_or_lookup(cdir, req, true);
> >> > > > > > > > +                                     if (IS_ERR_VALUE(res)) {
> >> > > > > > > > +                                             mutex_unlock(&cdir->lookups_inflight_lock);
> >> > > > > > > > +                                             goto out;
> >> > > > > > > > +                                     }
> >> > > > > > > > +                                     dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n",
> >> > > > > > > > +                                                     dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> >> > > > > > > > +                             }
> >> > > > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> >> > > > > > > > +                     }
> >> > > > > > > > +                     if (IS_ERR(req))
> >> > > > > > > > +                             goto out;
> >> > > > > > > >
> >> > > > > > > >                       err = ceph_mdsc_do_request(mdsc, NULL, req);
> >> > > > > > > > -                     switch (err) {
> >> > > > > > > > -                     case 0:
> >> > > > > > > > -                             if (d_really_is_positive(dentry) &&
> >> > > > > > > > -                                 d_inode(dentry) == req->r_target_inode)
> >> > > > > > > > -                                     valid = 1;
> >> > > > > > > > -                             break;
> >> > > > > > > > -                     case -ENOENT:
> >> > > > > > > > -                             if (d_really_is_negative(dentry))
> >> > > > > > > > -                                     valid = 1;
> >> > > > > > > > -                             /* Fallthrough */
> >> > > > > > > > -                     default:
> >> > > > > > > > -                             break;
> >> > > > > > > > +                     if (op == CEPH_MDS_OP_LOOKUP) {
> >> > > > > > > > +                             mutex_lock(&cdir->lookups_inflight_lock);
> >> > > > > > > > +                             __unregister_inode_getattr_or_lookup(cdir, req, true);
> >> > > > > > > > +                             dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> >> > > > > > > > +                                             dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> >> > > > > > > > +                             mutex_unlock(&cdir->lookups_inflight_lock);
> >> > > > > > > >                       }
> >> > > > > > > > -                     ceph_mdsc_put_request(req);
> >> > > > > > > > -                     dout("d_revalidate %p lookup result=%d\n",
> >> > > > > > > > -                          dentry, err);
> >> > > > > > > >               }
> >> > > > > > > > +             switch (err) {
> >> > > > > > > > +             case 0:
> >> > > > > > > > +                     if (d_really_is_positive(dentry) &&
> >> > > > > > > > +                         d_inode(dentry) == req->r_target_inode)
> >> > > > > > > > +                             valid = 1;
> >> > > > > > > > +                     break;
> >> > > > > > > > +             case -ENOENT:
> >> > > > > > > > +                     if (d_really_is_negative(dentry))
> >> > > > > > > > +                             valid = 1;
> >> > > > > > > > +                     /* Fallthrough */
> >> > > > > > > > +             default:
> >> > > > > > > > +                     break;
> >> > > > > > > > +             }
> >> > > > > > > > +             ceph_mdsc_put_request(req);
> >> > > > > > > > +             dout("d_revalidate %p lookup result=%d\n",
> >> > > > > > > > +                             dentry, err);
> >> > > > > > > >       }
> >> > > > > > > >
> >> > > > > > > > +out:
> >> > > > > > > >       dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
> >> > > > > > > >       if (valid) {
> >> > > > > > > >               ceph_dentry_lru_touch(dentry);
> >> > > > > > >
> >> > > > > > > d_revalidate is changed to use the cache, but not lookup. Why?
> >> > > > > >
> >> > > > > > Multiple same lookups would also be sent out in our scenario mentioned
> >> > > > > > above, so we add a cache for lookups, too. I don't know if I'm getting
> >> > > > > > your question correctly.
> >> > > > > >
> >> > > > >
> >> > > > > d_revalidate only checks the validity of an existing dentry. To get a
> >> > > > > dentry in the first place, you have to use the directory inode's lookup
> >> > > > > readdir operations. lookup isn't changed here and I was wondering why.
> >> > > > >
> >> > > > > Now that I think about it though, you probably need to do that anyway,
> >> > > > > as the VFS code already does something similar when there are multiple
> >> > > > > lookups for the same dentry in flight.
> >> > > >
> >> > > > It seems that the commit 200fd27c8fa2ba8bb4529033967b69a7cbfa2c2e made
> >> > > > ceph kernel module to use lookup request to revalidate dentry.
> >> > > >
> >> > >
> >> > > Yes, but it doesn't dispatch through the local kernel's inode lookup
> >> > > operation. ceph_lookup also does CEPH_MDS_OP_LOOKUP operations.
> >> >
> >> > I now get your point:-) Will try to fix that as well:-)
> >> >
> >> > >
> >> > > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> >> > > > > > > > index 9d1f34d46627..8705f0645a24 100644
> >> > > > > > > > --- a/fs/ceph/inode.c
> >> > > > > > > > +++ b/fs/ceph/inode.c
> >> > > > > > > > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >> > > > > > > >       dout("alloc_inode %p\n", &ci->vfs_inode);
> >> > > > > > > >
> >> > > > > > > >       spin_lock_init(&ci->i_ceph_lock);
> >> > > > > > > > +     mutex_init(&ci->getattrs_inflight_lock);
> >> > > > > > > > +     mutex_init(&ci->lookups_inflight_lock);
> >> > > > > > > >
> >> > > > > > > >       ci->i_version = 0;
> >> > > > > > > >       ci->i_inline_version = 0;
> >> > > > > > > > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> >> > > > > > > >       ci->i_xattrs.index_version = 0;
> >> > > > > > > >
> >> > > > > > > >       ci->i_caps = RB_ROOT;
> >> > > > > > > > +     ci->getattrs_inflight = RB_ROOT;
> >> > > > > > > > +     ci->lookups_inflight = RB_ROOT;
> >> > > > > > > >       ci->i_auth_cap = NULL;
> >> > > > > > > >       ci->i_dirty_caps = 0;
> >> > > > > > > >       ci->i_flushing_caps = 0;
> >> > > > > > > > +     ci->last_flush_ack_tid = 0;
> >> > > > > > > >       INIT_LIST_HEAD(&ci->i_dirty_item);
> >> > > > > > > >       INIT_LIST_HEAD(&ci->i_flushing_item);
> >> > > > > > > >       ci->i_prealloc_cap_flush = NULL;
> >> > > > > > > > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
> >> > > > > > > >        * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
> >> > > > > > > >        * we expect a negative dentry.
> >> > > > > > > >        */
> >> > > > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> >> > > > > > > >       if (!tgt_vino && d_really_is_positive(dentry))
> >> > > > > > > >               return;
> >> > > > > > > > -
> >> > > > > > > > +     dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
> >> > > > > > > >       if (tgt_vino && (d_really_is_negative(dentry) ||
> >> > > > > > > >                       !ceph_ino_compare(d_inode(dentry), tgt_vino)))
> >> > > > > > > >               return;
> >> > > > > > > > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> >> > > > > > > >       struct ceph_mds_request *req;
> >> > > > > > > >       int mode;
> >> > > > > > > >       int err;
> >> > > > > > > > +     struct ceph_inode_info* cinode = ceph_inode(inode);
> >> > > > > > > >
> >> > > > > > > >       if (ceph_snap(inode) == CEPH_SNAPDIR) {
> >> > > > > > > >               dout("do_getattr inode %p SNAPDIR\n", inode);
> >> > > > > > > > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> >> > > > > > > >       if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
> >> > > > > > > >               return 0;
> >> > > > > > > >
> >> > > > > > > > -     mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> >> > > > > > > > -     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> >> > > > > > > > -     if (IS_ERR(req))
> >> > > > > > > > -             return PTR_ERR(req);
> >> > > > > > > > -     req->r_inode = inode;
> >> > > > > > > > -     ihold(inode);
> >> > > > > > > > -     req->r_num_caps = 1;
> >> > > > > > > > -     req->r_args.getattr.mask = cpu_to_le32(mask);
> >> > > > > > > > -     req->r_locked_page = locked_page;
> >> > > > > > > > -     err = ceph_mdsc_do_request(mdsc, NULL, req);
> >> > > > > > > > +     mutex_lock(&cinode->getattrs_inflight_lock);
> >> > > > > > > > +     dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> >> > > > > > > > +     req = __search_inode_getattr_or_lookup(cinode, mask, false);
> >> > > > > > > > +     if (req) {
> >> > > > > > > > +             dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> >> > > > > > > > +             ceph_mdsc_get_request(req);
> >> > > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> >> > > > > > > > +             err = ceph_mdsc_wait_for_request(req);
> >> > > > > > > > +             dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> >> > > > > > > > +     } else {
> >> > > > > > > > +             mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> >> > > > > > > > +             req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> >> > > > > > > > +             if (!IS_ERR(req)) {
> >> > > > > > > > +                     unsigned long res = 0;
> >> > > > > > > > +                     req->r_inode = inode;
> >> > > > > > > > +                     req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> >> > > > > > > > +                     ihold(inode);
> >> > > > > > > > +                     req->r_num_caps = 1;
> >> > > > > > > > +                     req->r_args.getattr.mask = cpu_to_le32(mask);
> >> > > > > > > > +                     req->r_locked_page = locked_page;
> >> > > > > > > > +                     res = __register_inode_getattr_or_lookup(cinode, req, false);
> >> > > > > > > > +                     if (IS_ERR_VALUE(res)) {
> >> > > > > > > > +                             mutex_unlock(&cinode->getattrs_inflight_lock);
> >> > > > > > > > +                             return res;
> >> > > > > > > > +                     }
> >> > > > > > > > +                     dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> >> > > > > > > > +             }
> >> > > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> >> > > > > > > > +             if (IS_ERR(req))
> >> > > > > > > > +                     return PTR_ERR(req);
> >> > > > > > > > +             err = ceph_mdsc_do_request(mdsc, NULL, req);
> >> > > > > > > > +             mutex_lock(&cinode->getattrs_inflight_lock);
> >> > > > > > > > +             __unregister_inode_getattr_or_lookup(cinode, req, false);
> >> > > > > > > > +             dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> >> > > > > > > > +             mutex_unlock(&cinode->getattrs_inflight_lock);
> >> > > > > > > > +     }
> >> > > > > > > >       if (locked_page && err == 0) {
> >> > > > > > > >               u64 inline_version = req->r_reply_info.targeti.inline_version;
> >> > > > > > > >               if (inline_version == 0) {
> >> > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >> > > > > > > > index 163fc74bf221..5c6e00f2fd0a 100644
> >> > > > > > > > --- a/fs/ceph/mds_client.c
> >> > > > > > > > +++ b/fs/ceph/mds_client.c
> >> > > > > > > > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> >> > > > > > > >       return 0;
> >> > > > > > > >  }
> >> > > > > > > >
> >> > > > > > > > +void ceph_mds_requests_tree_release(struct kref* kref)
> >> > > > > > > > +{
> >> > > > > > > > +     struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> >> > > > > > > > +
> >> > > > > > > > +     BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> >> > > > > > > > +     kfree(mrtree);
> >> > > > > > > > +}
> >> > > > > > > > +
> >> > > > > > > >  /*
> >> > > > > > > >   * Create an mds request.
> >> > > > > > > >   */
> >> > > > > > > > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
> >> > > > > > > >       req->r_fmode = -1;
> >> > > > > > > >       kref_init(&req->r_kref);
> >> > > > > > > >       RB_CLEAR_NODE(&req->r_node);
> >> > > > > > > > +     RB_CLEAR_NODE(&req->getattr_lookup_node);
> >> > > > > > > >       INIT_LIST_HEAD(&req->r_wait);
> >> > > > > > > > +     init_completion(&req->batch_op_completion);
> >> > > > > > > >       init_completion(&req->r_completion);
> >> > > > > > > >       init_completion(&req->r_safe_completion);
> >> > > > > > > >       INIT_LIST_HEAD(&req->r_unsafe_item);
> >> > > > > > > > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> >> > > > > > > >       mutex_unlock(&mdsc->mutex);
> >> > > > > > > >  }
> >> > > > > > > >
> >> > > > > > > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> >> > > > > > > > +{
> >> > > > > > > > +     int err = 0;
> >> > > > > > > > +     long timeleft = wait_for_completion_killable_timeout(
> >> > > > > > > > +                     &req->batch_op_completion,
> >> > > > > > > > +                     ceph_timeout_jiffies(req->r_timeout));
> >> > > > > > > > +     if (timeleft > 0)
> >> > > > > > > > +             err = 0;
> >> > > > > > > > +     else if (!timeleft)
> >> > > > > > > > +             err = -EIO;  /* timed out */
> >> > > > > > > > +     else
> >> > > > > > > > +             err = timeleft;  /* killed */
> >> > > > > > > > +     if (!err)
> >> > > > > > > > +             return err;
> >> > > > > > > > +     return req->batch_op_err;
> >> > > > > > > > +}
> >> > > > > > > > +
> >> > > > > > > >  /*
> >> > > > > > > >   * Synchrously perform an mds request.  Take care of all of the
> >> > > > > > > >   * session setup, forwarding, retry details.
> >> > > > > > > > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> >> > > > > > > >       } else {
> >> > > > > > > >               err = req->r_err;
> >> > > > > > > >       }
> >> > > > > > > > -
> >> > > > > > > > +     req->batch_op_err = err;
> >> > > > > > > > +     complete_all(&req->batch_op_completion);
> >> > > > > > > >  out:
> >> > > > > > > >       mutex_unlock(&mdsc->mutex);
> >> > > > > > > >       dout("do_request %p done, result %d\n", req, err);
> >> > > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >> > > > > > > > index 729da155ebf0..41727677643e 100644
> >> > > > > > > > --- a/fs/ceph/mds_client.h
> >> > > > > > > > +++ b/fs/ceph/mds_client.h
> >> > > > > > > > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
> >> > > > > > > >  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
> >> > > > > > > >                                                struct ceph_mds_request *req);
> >> > > > > > > >
> >> > > > > > > > +struct mds_request_tree {
> >> > > > > > > > +     int mask;
> >> > > > > > > > +     struct kref ref;
> >> > > > > > > > +     struct rb_root mds_requests;
> >> > > > > > > > +     struct rb_node parent_tree_node;
> >> > > > > > > > +};
> >> > > > > > > > +
> >> > > > > > > > +extern void ceph_mds_requests_tree_release(struct kref* ref);
> >> > > > > > > > +
> >> > > > > > > >  /*
> >> > > > > > > >   * an in-flight mds request
> >> > > > > > > >   */
> >> > > > > > > >  struct ceph_mds_request {
> >> > > > > > > >       u64 r_tid;                   /* transaction id */
> >> > > > > > > > +     u64 last_caps_flush_ack_tid;
> >> > > > > > > >       struct rb_node r_node;
> >> > > > > > > > +     struct rb_node getattr_lookup_node;
> >> > > > > > > >       struct ceph_mds_client *r_mdsc;
> >> > > > > > > >
> >> > > > > > > >       int r_op;                    /* mds op code */
> >> > > > > > > > @@ -264,7 +275,7 @@ struct ceph_mds_request {
> >> > > > > > > >       struct ceph_msg  *r_reply;
> >> > > > > > > >       struct ceph_mds_reply_info_parsed r_reply_info;
> >> > > > > > > >       struct page *r_locked_page;
> >> > > > > > > > -     int r_err;
> >> > > > > > > > +     int r_err, batch_op_err;
> >> > > > > > > >
> >> > > > > > > >       unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
> >> > > > > > > >       unsigned long r_started;  /* start time to measure timeout against */
> >> > > > > > > > @@ -287,6 +298,7 @@ struct ceph_mds_request {
> >> > > > > > > >
> >> > > > > > > >       struct kref       r_kref;
> >> > > > > > > >       struct list_head  r_wait;
> >> > > > > > > > +     struct completion batch_op_completion;
> >> > > > > > > >       struct completion r_completion;
> >> > > > > > > >       struct completion r_safe_completion;
> >> > > > > > > >       ceph_mds_request_callback_t r_callback;
> >> > > > > > > > @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
> >> > > > > > > >  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
> >> > > > > > > >  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
> >> > > > > > > >                                    struct ceph_mds_request *req);
> >> > > > > > > > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
> >> > > > > > > >  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
> >> > > > > > > >                               struct inode *dir,
> >> > > > > > > >                               struct ceph_mds_request *req);
> >> > > > > > > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> >> > > > > > > > index da2cd8e89062..857fbbc0b768 100644
> >> > > > > > > > --- a/fs/ceph/super.c
> >> > > > > > > > +++ b/fs/ceph/super.c
> >> > > > > > > > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
> >> > > > > > > >       destroy_caches();
> >> > > > > > > >  }
> >> > > > > > > >
> >> > > > > > > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> >> > > > > > > > +                                       struct ceph_mds_request* req,
> >> > > > > > > > +                                       bool is_lookup)
> >> > > > > > > > +{
> >> > > > > > > > +     struct rb_node *node = NULL;
> >> > > > > > > > +     struct mds_request_tree* tmp = NULL;
> >> > > > > > > > +     if (!is_lookup)
> >> > > > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> >> > > > > > > > +     else
> >> > > > > > > > +             node = inode->lookups_inflight.rb_node;
> >> > > > > > > > +
> >> > > > > > > > +     while (node)
> >> > > > > > > > +     {
> >> > > > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> >> > > > > > > > +
> >> > > > > > > > +             if (tmp->mask > req->r_args.getattr.mask)
> >> > > > > > > > +                     node = node->rb_left;
> >> > > > > > > > +             else if (tmp->mask < req->r_args.getattr.mask)
> >> > > > > > > > +                     node = node->rb_right;
> >> > > > > > > > +             else
> >> > > > > > > > +                     break;
> >> > > > > > > > +     }
> >> > > > > > > > +
> >> > > > > > > > +     if (!node)
> >> > > > > > > > +             BUG();
> >> > > > > > > > +
> >> > > > > > > > +     rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> >> > > > > > > > +
> >> > > > > > > > +     if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> >> > > > > > > > +             if (!is_lookup)
> >> > > > > > > > +                     rb_erase(node, &inode->getattrs_inflight);
> >> > > > > > > > +             else
> >> > > > > > > > +                     rb_erase(node, &inode->lookups_inflight);
> >> > > > > > > > +             kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> >> > > > > > > > +     }
> >> > > > > > > > +}
> >> > > > > > > > +
> >> > > > > > > > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> >> > > > > > > > +                                     struct ceph_mds_request* req,
> >> > > > > > > > +                                     bool is_lookup)
> >> > > > > > > > +{
> >> > > > > > > > +     struct rb_node **p = NULL, *parent = NULL;
> >> > > > > > > > +     struct mds_request_tree *tmp = NULL;
> >> > > > > > > > +     struct ceph_mds_request* tmp_req = NULL;
> >> > > > > > > > +     struct mds_request_tree* mrtree = NULL;
> >> > > > > > > > +
> >> > > > > > > > +     if (!is_lookup)
> >> > > > > > > > +             p = &ci->getattrs_inflight.rb_node;
> >> > > > > > > > +     else
> >> > > > > > > > +             p = &ci->lookups_inflight.rb_node;
> >> > > > > > > > +
> >> > > > > > > > +     while (*p) {
> >> > > > > > > > +             parent = *p;
> >> > > > > > > > +             tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> >> > > > > > > > +             if (req->r_args.getattr.mask < tmp->mask)
> >> > > > > > > > +                     p = &(*p)->rb_left;
> >> > > > > > > > +             else if (req->r_args.getattr.mask > tmp->mask)
> >> > > > > > > > +                     p = &(*p)->rb_right;
> >> > > > > > > > +             else
> >> > > > > > > > +                     break;
> >> > > > > > > > +     }
> >> > > > > > > > +
> >> > > > > > > > +     if (!(*p)) {
> >> > > > > > > > +             mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> >> > > > > > > > +
> >> > > > > > > > +             if (!mrtree)
> >> > > > > > > > +                     return -ENOMEM;
> >> > > > > > > > +
> >> > > > > > > > +             mrtree->mds_requests = RB_ROOT;
> >> > > > > > > > +             mrtree->mask = req->r_args.getattr.mask;
> >> > > > > > > > +             kref_init(&mrtree->ref);
> >> > > > > > > > +             rb_link_node(&mrtree->parent_tree_node, parent, p);
> >> > > > > > > > +             if (!is_lookup) {
> >> > > > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> >> > > > > > > > +             } else {
> >> > > > > > > > +                     rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> >> > > > > > > > +             }
> >> > > > > > > > +     } else {
> >> > > > > > > > +             mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> >> > > > > > > > +     }
> >> > > > > > > > +
> >> > > > > > > > +     p = &mrtree->mds_requests.rb_node;
> >> > > > > > > > +     parent = NULL;
> >> > > > > > > > +     while (*p) {
> >> > > > > > > > +             parent = *p;
> >> > > > > > > > +             tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> >> > > > > > > > +             if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> >> > > > > > > > +                     p = &(*p)->rb_left;
> >> > > > > > > > +             else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> >> > > > > > > > +                     p = &(*p)->rb_right;
> >> > > > > > > > +             else
> >> > > > > > > > +                     BUG();
> >> > > > > > > > +     }
> >> > > > > > > > +
> >> > > > > > > > +     rb_link_node(&req->getattr_lookup_node, parent, p);
> >> > > > > > > > +     rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> >> > > > > > > > +     return 0;
> >> > > > > > > > +}
> >> > > > > > > > +
> >> > > > > > > > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> >> > > > > > > > +                                   int mask,
> >> > > > > > > > +                                   bool is_lookup)
> >> > > > > > > > +{
> >> > > > > > > > +     struct rb_node *node = NULL;
> >> > > > > > > > +     if (!is_lookup)
> >> > > > > > > > +             node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> >> > > > > > > > +     else
> >> > > > > > > > +             node = inode->lookups_inflight.rb_node;
> >> > > > > > > > +
> >> > > > > > > > +     while (node)
> >> > > > > > > > +     {
> >> > > > > > > > +             struct mds_request_tree* tmp = NULL;
> >> > > > > > > > +             tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> >> > > > > > > > +
> >> > > > > > > > +             if (tmp->mask > mask) {
> >> > > > > > > > +                     node = node->rb_left;
> >> > > > > > > > +             } else if (tmp->mask < mask) {
> >> > > > > > > > +                     node = node->rb_right;
> >> > > > > > > > +             } else {
> >> > > > > > > > +                     struct rb_node* inner_node = tmp->mds_requests.rb_node;
> >> > > > > > > > +                     while (inner_node) {
> >> > > > > > > > +                             struct ceph_mds_request* req = NULL;
> >> > > > > > > > +
> >> > > > > > > > +                             req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> >> > > > > > > > +                             if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> >> > > > > > > > +                                     inner_node = inner_node->rb_left;
> >> > > > > > > > +                             } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> >> > > > > > > > +                                     inner_node = inner_node->rb_right;
> >> > > > > > > > +                             } else
> >> > > > > > > > +                                     return req; /* Found it */
> >> > > > > > > > +                     }
> >> > > > > > > > +                     return NULL;
> >> > > > > > > > +             }
> >> > > > > > > > +     }
> >> > > > > > > > +     return NULL;
> >> > > > > > > > +}
> >> > > > > > > >  module_init(init_ceph);
> >> > > > > > > >  module_exit(exit_ceph);
> >> > > > > > > >
> >> > > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> >> > > > > > > > index dfb64a5211b6..abf761f2a122 100644
> >> > > > > > > > --- a/fs/ceph/super.h
> >> > > > > > > > +++ b/fs/ceph/super.h
> >> > > > > > > > @@ -296,6 +296,8 @@ struct ceph_inode_info {
> >> > > > > > > >       struct ceph_vino i_vino;   /* ceph ino + snap */
> >> > > > > > > >
> >> > > > > > > >       spinlock_t i_ceph_lock;
> >> > > > > > > > +     struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> >> > > > > > > > +     struct rb_root getattrs_inflight, lookups_inflight;
> >> > > > > > >
> >> > > > > > > If you do end up pursuing this further, I think it might be better to
> >> > > > > > > use a spinlock to protect the trees.
> >> > > > > > >
> >> > > > > > > Take the lock, search the tree and if there isn't one, drop the lock,
> >> > > > > > > allocate, take it and search again, and insert the one you allocated if
> >> > > > > > > no conflicting structure was inserted in the meantime.
> >> > > > > > >
> >> > > > > > > In addition to cutting down on context switches when these things are
> >> > > > > > > contended, that would also make it easier to hide the locking inside the
> >> > > > > > > cache search/register/unregister routines.
> >> > > > > >
> >> > > > > > You are right. At first, I tried to use spinlock, but that caused
> >> > > > > > errors like "BUG: soft lockup - CPU#1 stuck for XXXs", and I don't
> >> > > > > > know how that occurred. Do you have any clue about that?
> >> > > > > >
> >> > > > > > I can't thank you enough for the help:-) Thanks very much:-)
> >> > > > >
> >> > > > > Often that means you forgot to unlock something, or maybe hit some lock
> >> > > > > inversion? Without more analysis it's impossible to know. :)
> >> > > >
> >> > > > Thanks:-) I wouldn't try to debug that:-)
> >> > > > > > > >       u64 i_version;
> >> > > > > > > >       u64 i_inline_version;
> >> > > > > > > > @@ -329,6 +331,7 @@ struct ceph_inode_info {
> >> > > > > > > >       struct rb_root i_caps;           /* cap list */
> >> > > > > > > >       struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
> >> > > > > > > >       unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> >> > > > > > > > +     u64 last_flush_ack_tid;
> >> > > > > > > >       struct list_head i_dirty_item, i_flushing_item;
> >> > > > > > > >       /* we need to track cap writeback on a per-cap-bit basis, to allow
> >> > > > > > > >        * overlapping, pipelined cap flushes to the mds.  we can probably
> >> > > > > > > > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
> >> > > > > > > >                               u64 time_warp_seq, struct timespec64 *ctime,
> >> > > > > > > >                               struct timespec64 *mtime,
> >> > > > > > > >                               struct timespec64 *atime);
> >> > > > > > > > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> >> > > > > > > > +                                            struct ceph_mds_request* req,
> >> > > > > > > > +                                            bool is_lookup);
> >> > > > > > > > +
> >> > > > > > > > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> >> > > > > > > > +                                              struct ceph_mds_request* req,
> >> > > > > > > > +                                              bool is_lookup);
> >> > > > > > > > +
> >> > > > > > > > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> >> > > > > > > > +                                          int mask,
> >> > > > > > > > +                                          bool is_lookup);
> >> > > > > > > >  extern int ceph_fill_trace(struct super_block *sb,
> >> > > > > > > >                          struct ceph_mds_request *req);
> >> > > > > > > >  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
> >> > > > >
> >> > > > > --
> >> > > > > Jeff Layton <jlayton@redhat.com>
> >> > > > >
> >> > >
> >> > > --
> >> > > Jeff Layton <jlayton@redhat.com>
> >> > >
diff mbox series

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bba28a5034ba..5a177a3c283f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3360,6 +3360,7 @@  static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 		goto out;
 
 	ci->i_flushing_caps &= ~cleaned;
+	ci->last_flush_ack_tid = flush_tid;
 
 	spin_lock(&mdsc->cap_dirty_lock);
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82928cea0209..6f17c0319158 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1180,6 +1180,7 @@  static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
 				}
 			}
 		}
+		dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
 	}
 	spin_unlock(&dentry->d_lock);
 
@@ -1188,7 +1189,7 @@  static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
 					 CEPH_MDS_LEASE_RENEW, seq);
 		ceph_put_mds_session(session);
 	}
-	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+	dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
 	return valid;
 }
 
@@ -1256,46 +1257,85 @@  static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (!valid) {
 		struct ceph_mds_client *mdsc =
 			ceph_sb_to_client(dir->i_sb)->mdsc;
-		struct ceph_mds_request *req;
+		struct ceph_mds_request *req = NULL;
+		struct ceph_inode_info* cdir = ceph_inode(dir);
 		int op, err;
 		u32 mask;
 
 		if (flags & LOOKUP_RCU)
 			return -ECHILD;
+		mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+		if (ceph_security_xattr_wanted(dir))
+			mask |= CEPH_CAP_XATTR_SHARED;
 
 		op = ceph_snap(dir) == CEPH_SNAPDIR ?
 			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
-		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
-		if (!IS_ERR(req)) {
-			req->r_dentry = dget(dentry);
-			req->r_num_caps = 2;
-			req->r_parent = dir;
+		if (op == CEPH_MDS_OP_LOOKUP) {
+			mutex_lock(&cdir->lookups_inflight_lock);
+			dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n", 
+					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
+			req = __search_inode_getattr_or_lookup(cdir, mask, true);
+		}
+		if (req && op == CEPH_MDS_OP_LOOKUP) {
+			dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
+					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
+			ceph_mdsc_get_request(req);
+			mutex_unlock(&cdir->lookups_inflight_lock);
+			err = ceph_mdsc_wait_for_request(req);
+			dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
+					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
+		} else {
 
-			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
-			if (ceph_security_xattr_wanted(dir))
-				mask |= CEPH_CAP_XATTR_SHARED;
-			req->r_args.getattr.mask = cpu_to_le32(mask);
+			req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+			if (op == CEPH_MDS_OP_LOOKUP) {
+				if (!IS_ERR(req)) {
+					unsigned long res = 0;
+					req->r_dentry = dget(dentry);
+					req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
+					req->r_num_caps = 2;
+					req->r_parent = dir;
+					req->r_args.getattr.mask = cpu_to_le32(mask);
+					res = __register_inode_getattr_or_lookup(cdir, req, true);
+					if (IS_ERR_VALUE(res)) {
+						mutex_unlock(&cdir->lookups_inflight_lock);
+						goto out;
+					}
+					dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
+							dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
+				}
+				mutex_unlock(&cdir->lookups_inflight_lock);
+			}
+			if (IS_ERR(req))
+				goto out;
 
 			err = ceph_mdsc_do_request(mdsc, NULL, req);
-			switch (err) {
-			case 0:
-				if (d_really_is_positive(dentry) &&
-				    d_inode(dentry) == req->r_target_inode)
-					valid = 1;
-				break;
-			case -ENOENT:
-				if (d_really_is_negative(dentry))
-					valid = 1;
-				/* Fallthrough */
-			default:
-				break;
+			if (op == CEPH_MDS_OP_LOOKUP) {
+				mutex_lock(&cdir->lookups_inflight_lock);
+				__unregister_inode_getattr_or_lookup(cdir, req, true);
+				dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n", 
+						dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
+				mutex_unlock(&cdir->lookups_inflight_lock);
 			}
-			ceph_mdsc_put_request(req);
-			dout("d_revalidate %p lookup result=%d\n",
-			     dentry, err);
 		}
+		switch (err) {
+		case 0:
+			if (d_really_is_positive(dentry) &&
+			    d_inode(dentry) == req->r_target_inode)
+				valid = 1;
+			break;
+		case -ENOENT:
+			if (d_really_is_negative(dentry))
+				valid = 1;
+			/* Fallthrough */
+		default:
+			break;
+		}
+		ceph_mdsc_put_request(req);
+		dout("d_revalidate %p lookup result=%d\n",
+   				dentry, err);
 	}
 
+out:
 	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
 	if (valid) {
 		ceph_dentry_lru_touch(dentry);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9d1f34d46627..8705f0645a24 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -430,6 +430,8 @@  struct inode *ceph_alloc_inode(struct super_block *sb)
 	dout("alloc_inode %p\n", &ci->vfs_inode);
 
 	spin_lock_init(&ci->i_ceph_lock);
+	mutex_init(&ci->getattrs_inflight_lock);
+	mutex_init(&ci->lookups_inflight_lock);
 
 	ci->i_version = 0;
 	ci->i_inline_version = 0;
@@ -461,9 +463,12 @@  struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_xattrs.index_version = 0;
 
 	ci->i_caps = RB_ROOT;
+	ci->getattrs_inflight = RB_ROOT;
+	ci->lookups_inflight = RB_ROOT;
 	ci->i_auth_cap = NULL;
 	ci->i_dirty_caps = 0;
 	ci->i_flushing_caps = 0;
+	ci->last_flush_ack_tid = 0;
 	INIT_LIST_HEAD(&ci->i_dirty_item);
 	INIT_LIST_HEAD(&ci->i_flushing_item);
 	ci->i_prealloc_cap_flush = NULL;
@@ -1044,9 +1049,10 @@  static void update_dentry_lease(struct dentry *dentry,
 	 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
 	 * we expect a negative dentry.
 	 */
+	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
 	if (!tgt_vino && d_really_is_positive(dentry))
 		return;
-
+	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
 	if (tgt_vino && (d_really_is_negative(dentry) ||
 			!ceph_ino_compare(d_inode(dentry), tgt_vino)))
 		return;
@@ -2188,6 +2194,7 @@  int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	struct ceph_mds_request *req;
 	int mode;
 	int err;
+	struct ceph_inode_info* cinode = ceph_inode(inode);
 
 	if (ceph_snap(inode) == CEPH_SNAPDIR) {
 		dout("do_getattr inode %p SNAPDIR\n", inode);
@@ -2199,16 +2206,42 @@  int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
 		return 0;
 
-	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
-	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	req->r_inode = inode;
-	ihold(inode);
-	req->r_num_caps = 1;
-	req->r_args.getattr.mask = cpu_to_le32(mask);
-	req->r_locked_page = locked_page;
-	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	mutex_lock(&cinode->getattrs_inflight_lock);
+	dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
+	req = __search_inode_getattr_or_lookup(cinode, mask, false);
+	if (req) {
+		dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
+		ceph_mdsc_get_request(req);
+		mutex_unlock(&cinode->getattrs_inflight_lock);
+		err = ceph_mdsc_wait_for_request(req);
+		dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
+	} else {
+		mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
+		if (!IS_ERR(req)) {
+			unsigned long res = 0;
+			req->r_inode = inode;
+			req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
+			ihold(inode);
+			req->r_num_caps = 1;
+			req->r_args.getattr.mask = cpu_to_le32(mask);
+			req->r_locked_page = locked_page;
+			res = __register_inode_getattr_or_lookup(cinode, req, false);
+			if (IS_ERR_VALUE(res)) {
+				mutex_unlock(&cinode->getattrs_inflight_lock);
+				return res;
+			}
+			dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
+		}
+		mutex_unlock(&cinode->getattrs_inflight_lock);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		mutex_lock(&cinode->getattrs_inflight_lock);
+		__unregister_inode_getattr_or_lookup(cinode, req, false);
+		dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
+		mutex_unlock(&cinode->getattrs_inflight_lock);
+	}
 	if (locked_page && err == 0) {
 		u64 inline_version = req->r_reply_info.targeti.inline_version;
 		if (inline_version == 0) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 163fc74bf221..5c6e00f2fd0a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1815,6 +1815,14 @@  int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
 	return 0;
 }
 
+/* kref release callback: free a per-mask inflight-request tree. */
+void ceph_mds_requests_tree_release(struct kref* kref)
+{
+	struct mds_request_tree* mrtree;
+
+	mrtree = container_of(kref, struct mds_request_tree, ref);
+	/* every request must have been unregistered before the last put */
+	BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
+	kfree(mrtree);
+}
+
 /*
  * Create an mds request.
  */
@@ -1836,7 +1844,9 @@  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	req->r_fmode = -1;
 	kref_init(&req->r_kref);
 	RB_CLEAR_NODE(&req->r_node);
+	RB_CLEAR_NODE(&req->getattr_lookup_node);
 	INIT_LIST_HEAD(&req->r_wait);
+	init_completion(&req->batch_op_completion);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
@@ -2429,6 +2439,23 @@  void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
 	mutex_unlock(&mdsc->mutex);
 }
 
+/*
+ * Wait for an aggregated getattr/lookup request that another thread
+ * already has in flight for the same caps.
+ *
+ * Returns the aggregated request's result (batch_op_err) on normal
+ * completion, -EIO on timeout, or the negative wait error if killed.
+ */
+int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
+{
+	int err = 0;
+	long timeleft = wait_for_completion_killable_timeout(
+			&req->batch_op_completion,
+			ceph_timeout_jiffies(req->r_timeout));
+	if (timeleft > 0)
+		err = 0;
+	else if (!timeleft)
+		err = -EIO;  /* timed out */
+	else
+		err = timeleft;  /* killed */
+	/*
+	 * Only report the batched request's own result when the wait
+	 * itself succeeded; the previous check was inverted and returned
+	 * success even when the aggregated request had failed (and read
+	 * batch_op_err before it was guaranteed to be set on timeout).
+	 */
+	if (err)
+		return err;
+	return req->batch_op_err;
+}
+
 /*
  * Synchrously perform an mds request.  Take care of all of the
  * session setup, forwarding, retry details.
@@ -2501,7 +2528,8 @@  int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	} else {
 		err = req->r_err;
 	}
-
+	req->batch_op_err = err;
+	complete_all(&req->batch_op_completion);
 out:
 	mutex_unlock(&mdsc->mutex);
 	dout("do_request %p done, result %d\n", req, err);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 729da155ebf0..41727677643e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -207,12 +207,23 @@  typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
 typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
 						 struct ceph_mds_request *req);
 
+struct mds_request_tree {
+	int mask;
+	struct kref ref;
+	struct rb_root mds_requests;
+	struct rb_node parent_tree_node;
+};
+
+extern void ceph_mds_requests_tree_release(struct kref* ref);
+
 /*
  * an in-flight mds request
  */
 struct ceph_mds_request {
 	u64 r_tid;                   /* transaction id */
+	u64 last_caps_flush_ack_tid;
 	struct rb_node r_node;
+	struct rb_node getattr_lookup_node;
 	struct ceph_mds_client *r_mdsc;
 
 	int r_op;                    /* mds op code */
@@ -264,7 +275,7 @@  struct ceph_mds_request {
 	struct ceph_msg  *r_reply;
 	struct ceph_mds_reply_info_parsed r_reply_info;
 	struct page *r_locked_page;
-	int r_err;
+	int r_err, batch_op_err;
 
 	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
 	unsigned long r_started;  /* start time to measure timeout against */
@@ -287,6 +298,7 @@  struct ceph_mds_request {
 
 	struct kref       r_kref;
 	struct list_head  r_wait;
+	struct completion batch_op_completion;
 	struct completion r_completion;
 	struct completion r_safe_completion;
 	ceph_mds_request_callback_t r_callback;
@@ -425,6 +437,7 @@  extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
 				     struct ceph_mds_request *req);
+extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 				struct inode *dir,
 				struct ceph_mds_request *req);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index da2cd8e89062..857fbbc0b768 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1177,6 +1177,142 @@  static void __exit exit_ceph(void)
 	destroy_caches();
 }
 
+/*
+ * Remove @req from the per-inode tree of inflight getattr (or lookup)
+ * requests, releasing the per-mask subtree when it becomes empty.
+ * Caller must hold the corresponding *_inflight_lock.
+ */
+void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
+					  struct ceph_mds_request* req,
+					  bool is_lookup)
+{
+	struct rb_node *node = NULL;
+	struct mds_request_tree* tmp = NULL;
+
+	if (!is_lookup)
+		node = inode->getattrs_inflight.rb_node;  /* top of the tree */
+	else
+		node = inode->lookups_inflight.rb_node;
+
+	/* outer tree is keyed by the getattr/lookup caps mask */
+	while (node) {
+		tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
+
+		if (tmp->mask > req->r_args.getattr.mask)
+			node = node->rb_left;
+		else if (tmp->mask < req->r_args.getattr.mask)
+			node = node->rb_right;
+		else
+			break;
+	}
+
+	/* a registered request must have a matching per-mask subtree */
+	BUG_ON(!node);
+
+	rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
+
+	/* last request for this mask: unlink the subtree and drop its ref */
+	if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
+		if (!is_lookup)
+			rb_erase(node, &inode->getattrs_inflight);
+		else
+			rb_erase(node, &inode->lookups_inflight);
+		kref_put(&tmp->ref, ceph_mds_requests_tree_release);
+	}
+}
+
+/*
+ * Register @req in the per-inode tree of inflight getattr (or lookup)
+ * requests: an outer rb-tree keyed by caps mask, each node holding an
+ * inner rb-tree of requests keyed by last_caps_flush_ack_tid.
+ * Returns 0 on success or -ENOMEM.
+ * Caller must hold the corresponding *_inflight_lock.
+ */
+int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
+					struct ceph_mds_request* req,
+					bool is_lookup)
+{
+	struct rb_node **p = NULL, *parent = NULL;
+	struct mds_request_tree *tmp = NULL;
+	struct ceph_mds_request* tmp_req = NULL;
+	struct mds_request_tree* mrtree = NULL;
+
+	if (!is_lookup)
+		p = &ci->getattrs_inflight.rb_node;
+	else
+		p = &ci->lookups_inflight.rb_node;
+	
+	/* outer tree: find (or find the insertion point for) this mask */
+	while (*p) {
+		parent = *p;
+		tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
+		if (req->r_args.getattr.mask < tmp->mask)
+			p = &(*p)->rb_left;
+		else if (req->r_args.getattr.mask > tmp->mask)
+			p = &(*p)->rb_right;
+		else
+			break;
+	}
+	
+	/* no subtree for this mask yet: allocate and link one */
+	if (!(*p)) {
+		mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
+
+		if (!mrtree)
+			return -ENOMEM;
+
+		mrtree->mds_requests = RB_ROOT;
+		mrtree->mask = req->r_args.getattr.mask;
+		kref_init(&mrtree->ref);
+		rb_link_node(&mrtree->parent_tree_node, parent, p);
+		if (!is_lookup) {
+			rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
+		} else {
+			rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
+		}
+	} else {
+		mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
+	}
+
+	/* inner tree: insert keyed by the request's last_caps_flush_ack_tid */
+	p = &mrtree->mds_requests.rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+		tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
+		if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
+			p = &(*p)->rb_left;
+		else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
+			p = &(*p)->rb_right;
+		else
+			/*
+			 * NOTE(review): presumably unreachable because callers
+			 * search for a matching inflight request (same mask and
+			 * current last_flush_ack_tid) under the same mutex
+			 * before registering a new one — confirm that no path
+			 * can register two requests with equal tids, since
+			 * this would panic the kernel.
+			 */
+			BUG();
+	}
+
+	rb_link_node(&req->getattr_lookup_node, parent, p);
+	rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
+	return 0;
+}
+
+/*
+ * Look for an inflight getattr (or lookup) request on @inode that asked
+ * for exactly @mask and whose last_caps_flush_ack_tid matches the
+ * inode's current last_flush_ack_tid.  Returns the request or NULL.
+ * Caller must hold the corresponding *_inflight_lock.
+ */
+struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
+				      int mask,
+				      bool is_lookup)
+{
+	struct rb_node *n;
+
+	n = is_lookup ? inode->lookups_inflight.rb_node :
+			inode->getattrs_inflight.rb_node;
+
+	/* outer tree: keyed by the getattr/lookup caps mask */
+	while (n) {
+		struct mds_request_tree* mrtree =
+			rb_entry(n, struct mds_request_tree, parent_tree_node);
+
+		if (mask < mrtree->mask) {
+			n = n->rb_left;
+		} else if (mask > mrtree->mask) {
+			n = n->rb_right;
+		} else {
+			/* inner tree: keyed by last_caps_flush_ack_tid */
+			struct rb_node* in = mrtree->mds_requests.rb_node;
+
+			while (in) {
+				struct ceph_mds_request* req =
+					rb_entry(in, struct ceph_mds_request,
+						 getattr_lookup_node);
+
+				if (req->last_caps_flush_ack_tid >
+				    inode->last_flush_ack_tid)
+					in = in->rb_left;
+				else if (req->last_caps_flush_ack_tid <
+					 inode->last_flush_ack_tid)
+					in = in->rb_right;
+				else
+					return req; /* Found it */
+			}
+			return NULL;
+		}
+	}
+	return NULL;
+}
 module_init(init_ceph);
 module_exit(exit_ceph);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dfb64a5211b6..abf761f2a122 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -296,6 +296,8 @@  struct ceph_inode_info {
 	struct ceph_vino i_vino;   /* ceph ino + snap */
 
 	spinlock_t i_ceph_lock;
+	struct mutex getattrs_inflight_lock, lookups_inflight_lock;
+	struct rb_root getattrs_inflight, lookups_inflight;
 
 	u64 i_version;
 	u64 i_inline_version;
@@ -329,6 +331,7 @@  struct ceph_inode_info {
 	struct rb_root i_caps;           /* cap list */
 	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
 	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
+	u64 last_flush_ack_tid;
 	struct list_head i_dirty_item, i_flushing_item;
 	/* we need to track cap writeback on a per-cap-bit basis, to allow
 	 * overlapping, pipelined cap flushes to the mds.  we can probably
@@ -864,6 +867,17 @@  extern void ceph_fill_file_time(struct inode *inode, int issued,
 				u64 time_warp_seq, struct timespec64 *ctime,
 				struct timespec64 *mtime,
 				struct timespec64 *atime);
+extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
+					       struct ceph_mds_request* req,
+					       bool is_lookup);
+
+extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
+						 struct ceph_mds_request* req,
+						 bool is_lookup);
+
+extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
+					     int mask,
+					     bool is_lookup);
 extern int ceph_fill_trace(struct super_block *sb,
 			   struct ceph_mds_request *req);
 extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,