[v5,03/12] ceph: add infrastructure for waiting for async create to complete
diff mbox series

Message ID 20200219132526.17590-4-jlayton@kernel.org
State New
Headers show
Series
  • ceph: async directory operations support
Related show

Commit Message

Jeff Layton Feb. 19, 2020, 1:25 p.m. UTC
When we issue an async create, we must ensure that any later on-the-wire
requests involving it wait for the create reply.

Expand i_ceph_flags to be an unsigned long, and add a new bit that
MDS requests can wait on. If the bit is set in the inode when sending
caps, then don't send it and just return that it has been delayed.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c       | 13 ++++++++++++-
 fs/ceph/dir.c        |  2 +-
 fs/ceph/mds_client.c | 20 +++++++++++++++++++-
 fs/ceph/mds_client.h |  7 +++++++
 fs/ceph/super.h      |  4 +++-
 5 files changed, 42 insertions(+), 4 deletions(-)

Comments

Yan, Zheng Feb. 20, 2020, 3:32 a.m. UTC | #1
On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> When we issue an async create, we must ensure that any later on-the-wire
> requests involving it wait for the create reply.
>
> Expand i_ceph_flags to be an unsigned long, and add a new bit that
> MDS requests can wait on. If the bit is set in the inode when sending
> caps, then don't send it and just return that it has been delayed.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 13 ++++++++++++-
>  fs/ceph/dir.c        |  2 +-
>  fs/ceph/mds_client.c | 20 +++++++++++++++++++-
>  fs/ceph/mds_client.h |  7 +++++++
>  fs/ceph/super.h      |  4 +++-
>  5 files changed, 42 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index d05717397c2a..85e13aa359d2 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -511,7 +511,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>                                 struct ceph_inode_info *ci,
>                                 bool set_timeout)
>  {
> -       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
> +       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
>              ci->i_ceph_flags, ci->i_hold_caps_max);
>         if (!mdsc->stopping) {
>                 spin_lock(&mdsc->cap_delay_lock);
> @@ -1294,6 +1294,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>         int delayed = 0;
>         int ret;
>
> +       /* Don't send anything if it's still being created. Return delayed */
> +       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
> +               spin_unlock(&ci->i_ceph_lock);
> +               dout("%s async create in flight for %p\n", __func__, inode);
> +               return 1;
> +       }
> +

Maybe it's better to check this in ceph_check_caps().  Other callers
of __send_cap() shouldn't encounter async creating inode

>         held = cap->issued | cap->implemented;
>         revoking = cap->implemented & ~cap->issued;
>         retain &= ~revoking;
> @@ -2250,6 +2257,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>         if (datasync)
>                 goto out;
>
> +       ret = ceph_wait_on_async_create(inode);
> +       if (ret)
> +               goto out;
> +
>         dirty = try_flush_caps(inode, &flush_tid);
>         dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
>
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index a87274935a09..5b83bda57056 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -752,7 +752,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
>                 struct ceph_dentry_info *di = ceph_dentry(dentry);
>
>                 spin_lock(&ci->i_ceph_lock);
> -               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
> +               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
>                 if (strncmp(dentry->d_name.name,
>                             fsc->mount_options->snapdir_name,
>                             dentry->d_name.len) &&
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 94d18e643a3d..38eb9dd5062b 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2730,7 +2730,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
>  int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
>                               struct ceph_mds_request *req)
>  {
> -       int err;
> +       int err = 0;
>
>         /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
>         if (req->r_inode)
> @@ -2743,6 +2743,24 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
>                 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
>                                   CEPH_CAP_PIN);
>
> +       if (req->r_inode) {
> +               err = ceph_wait_on_async_create(req->r_inode);
> +               if (err) {
> +                       dout("%s: wait for async create returned: %d\n",
> +                            __func__, err);
> +                       return err;
> +               }
> +       }
> +
> +       if (!err && req->r_old_inode) {
> +               err = ceph_wait_on_async_create(req->r_old_inode);
> +               if (err) {
> +                       dout("%s: wait for async create returned: %d\n",
> +                            __func__, err);
> +                       return err;
> +               }
> +       }
> +
>         dout("submit_request on %p for inode %p\n", req, dir);
>         mutex_lock(&mdsc->mutex);
>         __register_request(mdsc, req, dir);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 95ac00e59e66..8043f2b439b1 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -538,4 +538,11 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
>  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>                           struct ceph_mds_session *session,
>                           int max_caps);
> +static inline int ceph_wait_on_async_create(struct inode *inode)
> +{
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> +                          TASK_INTERRUPTIBLE);
> +}
>  #endif
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 3430d7ffe8f7..bfb03adb4a08 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -316,7 +316,7 @@ struct ceph_inode_info {
>         u64 i_inline_version;
>         u32 i_time_warp_seq;
>
> -       unsigned i_ceph_flags;
> +       unsigned long i_ceph_flags;
>         atomic64_t i_release_count;
>         atomic64_t i_ordered_count;
>         atomic64_t i_complete_seq[2];
> @@ -524,6 +524,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
>  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
>  #define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
> +#define CEPH_ASYNC_CREATE_BIT  (13)      /* async create in flight for this */
> +#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
>
>  /*
>   * Masks of ceph inode work.
> --
> 2.24.1
>
Jeff Layton Feb. 20, 2020, 1:01 p.m. UTC | #2
On Thu, 2020-02-20 at 11:32 +0800, Yan, Zheng wrote:
> On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@kernel.org> wrote:
> > When we issue an async create, we must ensure that any later on-the-wire
> > requests involving it wait for the create reply.
> > 
> > Expand i_ceph_flags to be an unsigned long, and add a new bit that
> > MDS requests can wait on. If the bit is set in the inode when sending
> > caps, then don't send it and just return that it has been delayed.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/caps.c       | 13 ++++++++++++-
> >  fs/ceph/dir.c        |  2 +-
> >  fs/ceph/mds_client.c | 20 +++++++++++++++++++-
> >  fs/ceph/mds_client.h |  7 +++++++
> >  fs/ceph/super.h      |  4 +++-
> >  5 files changed, 42 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index d05717397c2a..85e13aa359d2 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -511,7 +511,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> >                                 struct ceph_inode_info *ci,
> >                                 bool set_timeout)
> >  {
> > -       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
> > +       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
> >              ci->i_ceph_flags, ci->i_hold_caps_max);
> >         if (!mdsc->stopping) {
> >                 spin_lock(&mdsc->cap_delay_lock);
> > @@ -1294,6 +1294,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> >         int delayed = 0;
> >         int ret;
> > 
> > +       /* Don't send anything if it's still being created. Return delayed */
> > +       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
> > +               spin_unlock(&ci->i_ceph_lock);
> > +               dout("%s async create in flight for %p\n", __func__, inode);
> > +               return 1;
> > +       }
> > +
> 
> Maybe it's better to check this in ceph_check_caps().  Other callers
> of __send_cap() shouldn't encounter async creating inode
> 

I've been looking, but what actually guarantees that?

Only ceph_check_caps calls it for UPDATE, but the other two callers call
it for FLUSH. I don't see what prevents the kernel from (e.g.) calling
write_inode before the create reply comes in, particularly if we just
create and then close the file.

As a side note, I still struggle with the fact that there seems to be no
coherent overall description of the cap protocol. What distinguishes a
FLUSH from an UPDATE, for instance? The MDS code and comments seem to
treat them somewhat interchangeably.


> >         held = cap->issued | cap->implemented;
> >         revoking = cap->implemented & ~cap->issued;
> >         retain &= ~revoking;
> > @@ -2250,6 +2257,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> >         if (datasync)
> >                 goto out;
> > 
> > +       ret = ceph_wait_on_async_create(inode);
> > +       if (ret)
> > +               goto out;
> > +
> >         dirty = try_flush_caps(inode, &flush_tid);
> >         dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
> > 
> > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > index a87274935a09..5b83bda57056 100644
> > --- a/fs/ceph/dir.c
> > +++ b/fs/ceph/dir.c
> > @@ -752,7 +752,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
> >                 struct ceph_dentry_info *di = ceph_dentry(dentry);
> > 
> >                 spin_lock(&ci->i_ceph_lock);
> > -               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
> > +               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
> >                 if (strncmp(dentry->d_name.name,
> >                             fsc->mount_options->snapdir_name,
> >                             dentry->d_name.len) &&
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 94d18e643a3d..38eb9dd5062b 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -2730,7 +2730,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
> >  int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> >                               struct ceph_mds_request *req)
> >  {
> > -       int err;
> > +       int err = 0;
> > 
> >         /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
> >         if (req->r_inode)
> > @@ -2743,6 +2743,24 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> >                 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
> >                                   CEPH_CAP_PIN);
> > 
> > +       if (req->r_inode) {
> > +               err = ceph_wait_on_async_create(req->r_inode);
> > +               if (err) {
> > +                       dout("%s: wait for async create returned: %d\n",
> > +                            __func__, err);
> > +                       return err;
> > +               }
> > +       }
> > +
> > +       if (!err && req->r_old_inode) {
> > +               err = ceph_wait_on_async_create(req->r_old_inode);
> > +               if (err) {
> > +                       dout("%s: wait for async create returned: %d\n",
> > +                            __func__, err);
> > +                       return err;
> > +               }
> > +       }
> > +
> >         dout("submit_request on %p for inode %p\n", req, dir);
> >         mutex_lock(&mdsc->mutex);
> >         __register_request(mdsc, req, dir);
> > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > index 95ac00e59e66..8043f2b439b1 100644
> > --- a/fs/ceph/mds_client.h
> > +++ b/fs/ceph/mds_client.h
> > @@ -538,4 +538,11 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
> >  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> >                           struct ceph_mds_session *session,
> >                           int max_caps);
> > +static inline int ceph_wait_on_async_create(struct inode *inode)
> > +{
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> > +                          TASK_INTERRUPTIBLE);
> > +}
> >  #endif
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 3430d7ffe8f7..bfb03adb4a08 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -316,7 +316,7 @@ struct ceph_inode_info {
> >         u64 i_inline_version;
> >         u32 i_time_warp_seq;
> > 
> > -       unsigned i_ceph_flags;
> > +       unsigned long i_ceph_flags;
> >         atomic64_t i_release_count;
> >         atomic64_t i_ordered_count;
> >         atomic64_t i_complete_seq[2];
> > @@ -524,6 +524,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> >  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
> >  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> >  #define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
> > +#define CEPH_ASYNC_CREATE_BIT  (13)      /* async create in flight for this */
> > +#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
> > 
> >  /*
> >   * Masks of ceph inode work.
> > --
> > 2.24.1
> >
Yan, Zheng Feb. 20, 2020, 1:33 p.m. UTC | #3
On Thu, Feb 20, 2020 at 9:01 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Thu, 2020-02-20 at 11:32 +0800, Yan, Zheng wrote:
> > On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > When we issue an async create, we must ensure that any later on-the-wire
> > > requests involving it wait for the create reply.
> > >
> > > Expand i_ceph_flags to be an unsigned long, and add a new bit that
> > > MDS requests can wait on. If the bit is set in the inode when sending
> > > caps, then don't send it and just return that it has been delayed.
> > >
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/caps.c       | 13 ++++++++++++-
> > >  fs/ceph/dir.c        |  2 +-
> > >  fs/ceph/mds_client.c | 20 +++++++++++++++++++-
> > >  fs/ceph/mds_client.h |  7 +++++++
> > >  fs/ceph/super.h      |  4 +++-
> > >  5 files changed, 42 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index d05717397c2a..85e13aa359d2 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -511,7 +511,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > >                                 struct ceph_inode_info *ci,
> > >                                 bool set_timeout)
> > >  {
> > > -       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
> > > +       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
> > >              ci->i_ceph_flags, ci->i_hold_caps_max);
> > >         if (!mdsc->stopping) {
> > >                 spin_lock(&mdsc->cap_delay_lock);
> > > @@ -1294,6 +1294,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> > >         int delayed = 0;
> > >         int ret;
> > >
> > > +       /* Don't send anything if it's still being created. Return delayed */
> > > +       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
> > > +               spin_unlock(&ci->i_ceph_lock);
> > > +               dout("%s async create in flight for %p\n", __func__, inode);
> > > +               return 1;
> > > +       }
> > > +
> >
> > Maybe it's better to check this in ceph_check_caps().  Other callers
> > of __send_cap() shouldn't encounter async creating inode
> >
>
> I've been looking, but what actually guarantees that?
>
> Only ceph_check_caps calls it for UPDATE, but the other two callers call
> it for FLUSH. I don't see what prevents the kernel from (e.g.) calling
> write_inode before the create reply comes in, particularly if we just
> create and then close the file.
>

I missed write_inode case. but make __send_cap() skip sending message
can cause problem. For example, if we skip a message that flush dirty
caps. call ceph_check_caps() again may not re-do the flush.

> As a side note, I still struggle with the fact that there seems to be no
> coherent overall description of the cap protocol. What distinguishes a
> FLUSH from an UPDATE, for instance? The MDS code and comments seem to
> treat them somewhat interchangeably.
>

UPDATE is super set of FLUSH, UPDATE can always replace FLUSH.

>
> > >         held = cap->issued | cap->implemented;
> > >         revoking = cap->implemented & ~cap->issued;
> > >         retain &= ~revoking;
> > > @@ -2250,6 +2257,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> > >         if (datasync)
> > >                 goto out;
> > >
> > > +       ret = ceph_wait_on_async_create(inode);
> > > +       if (ret)
> > > +               goto out;
> > > +
> > >         dirty = try_flush_caps(inode, &flush_tid);
> > >         dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
> > >
> > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > index a87274935a09..5b83bda57056 100644
> > > --- a/fs/ceph/dir.c
> > > +++ b/fs/ceph/dir.c
> > > @@ -752,7 +752,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
> > >                 struct ceph_dentry_info *di = ceph_dentry(dentry);
> > >
> > >                 spin_lock(&ci->i_ceph_lock);
> > > -               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
> > > +               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
> > >                 if (strncmp(dentry->d_name.name,
> > >                             fsc->mount_options->snapdir_name,
> > >                             dentry->d_name.len) &&
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index 94d18e643a3d..38eb9dd5062b 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -2730,7 +2730,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
> > >  int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > >                               struct ceph_mds_request *req)
> > >  {
> > > -       int err;
> > > +       int err = 0;
> > >
> > >         /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
> > >         if (req->r_inode)
> > > @@ -2743,6 +2743,24 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > >                 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
> > >                                   CEPH_CAP_PIN);
> > >
> > > +       if (req->r_inode) {
> > > +               err = ceph_wait_on_async_create(req->r_inode);
> > > +               if (err) {
> > > +                       dout("%s: wait for async create returned: %d\n",
> > > +                            __func__, err);
> > > +                       return err;
> > > +               }
> > > +       }
> > > +
> > > +       if (!err && req->r_old_inode) {
> > > +               err = ceph_wait_on_async_create(req->r_old_inode);
> > > +               if (err) {
> > > +                       dout("%s: wait for async create returned: %d\n",
> > > +                            __func__, err);
> > > +                       return err;
> > > +               }
> > > +       }
> > > +
> > >         dout("submit_request on %p for inode %p\n", req, dir);
> > >         mutex_lock(&mdsc->mutex);
> > >         __register_request(mdsc, req, dir);
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index 95ac00e59e66..8043f2b439b1 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -538,4 +538,11 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
> > >  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> > >                           struct ceph_mds_session *session,
> > >                           int max_caps);
> > > +static inline int ceph_wait_on_async_create(struct inode *inode)
> > > +{
> > > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > > +
> > > +       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> > > +                          TASK_INTERRUPTIBLE);
> > > +}
> > >  #endif
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 3430d7ffe8f7..bfb03adb4a08 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -316,7 +316,7 @@ struct ceph_inode_info {
> > >         u64 i_inline_version;
> > >         u32 i_time_warp_seq;
> > >
> > > -       unsigned i_ceph_flags;
> > > +       unsigned long i_ceph_flags;
> > >         atomic64_t i_release_count;
> > >         atomic64_t i_ordered_count;
> > >         atomic64_t i_complete_seq[2];
> > > @@ -524,6 +524,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> > >  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
> > >  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> > >  #define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
> > > +#define CEPH_ASYNC_CREATE_BIT  (13)      /* async create in flight for this */
> > > +#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
> > >
> > >  /*
> > >   * Masks of ceph inode work.
> > > --
> > > 2.24.1
> > >
>
> --
> Jeff Layton <jlayton@kernel.org>
>
Jeff Layton Feb. 20, 2020, 2:53 p.m. UTC | #4
On Thu, 2020-02-20 at 21:33 +0800, Yan, Zheng wrote:
> On Thu, Feb 20, 2020 at 9:01 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Thu, 2020-02-20 at 11:32 +0800, Yan, Zheng wrote:
> > > On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > When we issue an async create, we must ensure that any later on-the-wire
> > > > requests involving it wait for the create reply.
> > > > 
> > > > Expand i_ceph_flags to be an unsigned long, and add a new bit that
> > > > MDS requests can wait on. If the bit is set in the inode when sending
> > > > caps, then don't send it and just return that it has been delayed.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/ceph/caps.c       | 13 ++++++++++++-
> > > >  fs/ceph/dir.c        |  2 +-
> > > >  fs/ceph/mds_client.c | 20 +++++++++++++++++++-
> > > >  fs/ceph/mds_client.h |  7 +++++++
> > > >  fs/ceph/super.h      |  4 +++-
> > > >  5 files changed, 42 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > index d05717397c2a..85e13aa359d2 100644
> > > > --- a/fs/ceph/caps.c
> > > > +++ b/fs/ceph/caps.c
> > > > @@ -511,7 +511,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > > >                                 struct ceph_inode_info *ci,
> > > >                                 bool set_timeout)
> > > >  {
> > > > -       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
> > > > +       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
> > > >              ci->i_ceph_flags, ci->i_hold_caps_max);
> > > >         if (!mdsc->stopping) {
> > > >                 spin_lock(&mdsc->cap_delay_lock);
> > > > @@ -1294,6 +1294,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> > > >         int delayed = 0;
> > > >         int ret;
> > > > 
> > > > +       /* Don't send anything if it's still being created. Return delayed */
> > > > +       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
> > > > +               spin_unlock(&ci->i_ceph_lock);
> > > > +               dout("%s async create in flight for %p\n", __func__, inode);
> > > > +               return 1;
> > > > +       }
> > > > +
> > > 
> > > Maybe it's better to check this in ceph_check_caps().  Other callers
> > > of __send_cap() shouldn't encounter async creating inode
> > > 
> > 
> > I've been looking, but what actually guarantees that?
> > 
> > Only ceph_check_caps calls it for UPDATE, but the other two callers call
> > it for FLUSH. I don't see what prevents the kernel from (e.g.) calling
> > write_inode before the create reply comes in, particularly if we just
> > create and then close the file.
> > 
> 
> I missed write_inode case. but make __send_cap() skip sending message
> can cause problem. For example, if we skip a message that flush dirty
> caps. call ceph_check_caps() again may not re-do the flush.
> 

Ugh. Ok, so I guess we'll need to fix that first. I assume that making
sure the flush is redone after being delayed is the right thing to do
here?

> > As a side note, I still struggle with the fact that there seems to be no
> > coherent overall description of the cap protocol. What distinguishes a
> > FLUSH from an UPDATE, for instance? The MDS code and comments seem to
> > treat them somewhat interchangeably.
> > 
> 
> UPDATE is super set of FLUSH, UPDATE can always replace FLUSH.
> 

I'll toss this note onto my jumble of notes, for my (eventual) planned
document that describes the cap protocol.

> > > >         held = cap->issued | cap->implemented;
> > > >         revoking = cap->implemented & ~cap->issued;
> > > >         retain &= ~revoking;
> > > > @@ -2250,6 +2257,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> > > >         if (datasync)
> > > >                 goto out;
> > > > 
> > > > +       ret = ceph_wait_on_async_create(inode);
> > > > +       if (ret)
> > > > +               goto out;
> > > > +
> > > >         dirty = try_flush_caps(inode, &flush_tid);
> > > >         dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
> > > > 
> > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > index a87274935a09..5b83bda57056 100644
> > > > --- a/fs/ceph/dir.c
> > > > +++ b/fs/ceph/dir.c
> > > > @@ -752,7 +752,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
> > > >                 struct ceph_dentry_info *di = ceph_dentry(dentry);
> > > > 
> > > >                 spin_lock(&ci->i_ceph_lock);
> > > > -               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
> > > > +               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
> > > >                 if (strncmp(dentry->d_name.name,
> > > >                             fsc->mount_options->snapdir_name,
> > > >                             dentry->d_name.len) &&
> > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > index 94d18e643a3d..38eb9dd5062b 100644
> > > > --- a/fs/ceph/mds_client.c
> > > > +++ b/fs/ceph/mds_client.c
> > > > @@ -2730,7 +2730,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
> > > >  int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > > >                               struct ceph_mds_request *req)
> > > >  {
> > > > -       int err;
> > > > +       int err = 0;
> > > > 
> > > >         /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
> > > >         if (req->r_inode)
> > > > @@ -2743,6 +2743,24 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > > >                 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
> > > >                                   CEPH_CAP_PIN);
> > > > 
> > > > +       if (req->r_inode) {
> > > > +               err = ceph_wait_on_async_create(req->r_inode);
> > > > +               if (err) {
> > > > +                       dout("%s: wait for async create returned: %d\n",
> > > > +                            __func__, err);
> > > > +                       return err;
> > > > +               }
> > > > +       }
> > > > +
> > > > +       if (!err && req->r_old_inode) {
> > > > +               err = ceph_wait_on_async_create(req->r_old_inode);
> > > > +               if (err) {
> > > > +                       dout("%s: wait for async create returned: %d\n",
> > > > +                            __func__, err);
> > > > +                       return err;
> > > > +               }
> > > > +       }
> > > > +
> > > >         dout("submit_request on %p for inode %p\n", req, dir);
> > > >         mutex_lock(&mdsc->mutex);
> > > >         __register_request(mdsc, req, dir);
> > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > index 95ac00e59e66..8043f2b439b1 100644
> > > > --- a/fs/ceph/mds_client.h
> > > > +++ b/fs/ceph/mds_client.h
> > > > @@ -538,4 +538,11 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
> > > >  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> > > >                           struct ceph_mds_session *session,
> > > >                           int max_caps);
> > > > +static inline int ceph_wait_on_async_create(struct inode *inode)
> > > > +{
> > > > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > > > +
> > > > +       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> > > > +                          TASK_INTERRUPTIBLE);
> > > > +}
> > > >  #endif
> > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > index 3430d7ffe8f7..bfb03adb4a08 100644
> > > > --- a/fs/ceph/super.h
> > > > +++ b/fs/ceph/super.h
> > > > @@ -316,7 +316,7 @@ struct ceph_inode_info {
> > > >         u64 i_inline_version;
> > > >         u32 i_time_warp_seq;
> > > > 
> > > > -       unsigned i_ceph_flags;
> > > > +       unsigned long i_ceph_flags;
> > > >         atomic64_t i_release_count;
> > > >         atomic64_t i_ordered_count;
> > > >         atomic64_t i_complete_seq[2];
> > > > @@ -524,6 +524,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> > > >  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
> > > >  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> > > >  #define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
> > > > +#define CEPH_ASYNC_CREATE_BIT  (13)      /* async create in flight for this */
> > > > +#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
> > > > 
> > > >  /*
> > > >   * Masks of ceph inode work.
> > > > --
> > > > 2.24.1
> > > > 
> > 
> > --
> > Jeff Layton <jlayton@kernel.org>
> >
Jeff Layton Feb. 25, 2020, 7:45 p.m. UTC | #5
On Thu, 2020-02-20 at 09:53 -0500, Jeff Layton wrote:
> On Thu, 2020-02-20 at 21:33 +0800, Yan, Zheng wrote:
> > On Thu, Feb 20, 2020 at 9:01 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > On Thu, 2020-02-20 at 11:32 +0800, Yan, Zheng wrote:
> > > > On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > When we issue an async create, we must ensure that any later on-the-wire
> > > > > requests involving it wait for the create reply.
> > > > > 
> > > > > Expand i_ceph_flags to be an unsigned long, and add a new bit that
> > > > > MDS requests can wait on. If the bit is set in the inode when sending
> > > > > caps, then don't send it and just return that it has been delayed.
> > > > > 
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > > >  fs/ceph/caps.c       | 13 ++++++++++++-
> > > > >  fs/ceph/dir.c        |  2 +-
> > > > >  fs/ceph/mds_client.c | 20 +++++++++++++++++++-
> > > > >  fs/ceph/mds_client.h |  7 +++++++
> > > > >  fs/ceph/super.h      |  4 +++-
> > > > >  5 files changed, 42 insertions(+), 4 deletions(-)
> > > > > 
> > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > index d05717397c2a..85e13aa359d2 100644
> > > > > --- a/fs/ceph/caps.c
> > > > > +++ b/fs/ceph/caps.c
> > > > > @@ -511,7 +511,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > > > >                                 struct ceph_inode_info *ci,
> > > > >                                 bool set_timeout)
> > > > >  {
> > > > > -       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
> > > > > +       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
> > > > >              ci->i_ceph_flags, ci->i_hold_caps_max);
> > > > >         if (!mdsc->stopping) {
> > > > >                 spin_lock(&mdsc->cap_delay_lock);
> > > > > @@ -1294,6 +1294,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> > > > >         int delayed = 0;
> > > > >         int ret;
> > > > > 
> > > > > +       /* Don't send anything if it's still being created. Return delayed */
> > > > > +       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
> > > > > +               spin_unlock(&ci->i_ceph_lock);
> > > > > +               dout("%s async create in flight for %p\n", __func__, inode);
> > > > > +               return 1;
> > > > > +       }
> > > > > +
> > > > 
> > > > Maybe it's better to check this in ceph_check_caps().  Other callers
> > > > of __send_cap() shouldn't encounter async creating inode

I'm not sure that's the case, is it? Suppose we call ceph_check_caps
and it ends up delayed. We requeue the cap and then later someone calls
fsync() and we end up calling try_flush_caps even though we haven't
gotten the async create reply yet.

> > > 
> > > I've been looking, but what actually guarantees that?
> > > 
> > > Only ceph_check_caps calls it for UPDATE, but the other two callers call
> > > it for FLUSH. I don't see what prevents the kernel from (e.g.) calling
> > > write_inode before the create reply comes in, particularly if we just
> > > create and then close the file.
> > > 
> > 
> > I missed write_inode case. but make __send_cap() skip sending message
> > can cause problem. For example, if we skip a message that flush dirty
> > caps. call ceph_check_caps() again may not re-do the flush.
> > 
> 
> Ugh. Ok, so I guess we'll need to fix that first. I assume that making
> sure the flush is redone after being delayed is the right thing to do
> here?
> 

Hmm...looking at this more closely today.

__send_cap calls send_cap_msg, and that function does a number of
allocations which could fail. So if this is a problem, it's a problem
today, and we should fix it. There are 3 callers of __send_cap:

try_flush_caps : requeues the cap (and sets the timeouts) if __send_cap
returns non-zero. I think this one is (probably?) OK.

__kick_flushing_caps : just throws a pr_err if __send_cap returns non-
zero, but since the cap is already queued here, there should be no need
to requeue it.

ceph_check_caps : the cap is requeued iff it's delayed.

So...I'm not sure I fully understand your concern. AFAICT, the cap
should end up being queued if the send failed.

I think that's probably the best we can do here. If we end up trying to
flush caps and we haven't gotten the async reply yet, we don't really
have much of a choice other than to wait to flush.

Perhaps though, we ought to call __kick_flushing_caps when an async
create reply comes in just to ensure that we do flush in a timely
fashion once that does occur.

Thoughts?


> > > As a side note, I still struggle with the fact that there seems to be no
> > > coherent overall description of the cap protocol. What distinguishes a
> > > FLUSH from an UPDATE, for instance? The MDS code and comments seem to
> > > treat them somewhat interchangeably.
> > > 
> > 
> > UPDATE is super set of FLUSH, UPDATE can always replace FLUSH.
> > 
> 
> I'll toss this note onto my jumble of notes, for my (eventual) planned
> document that describes the cap protocol.
> 
> > > > >         held = cap->issued | cap->implemented;
> > > > >         revoking = cap->implemented & ~cap->issued;
> > > > >         retain &= ~revoking;
> > > > > @@ -2250,6 +2257,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> > > > >         if (datasync)
> > > > >                 goto out;
> > > > > 
> > > > > +       ret = ceph_wait_on_async_create(inode);
> > > > > +       if (ret)
> > > > > +               goto out;
> > > > > +
> > > > >         dirty = try_flush_caps(inode, &flush_tid);
> > > > >         dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
> > > > > 
> > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > index a87274935a09..5b83bda57056 100644
> > > > > --- a/fs/ceph/dir.c
> > > > > +++ b/fs/ceph/dir.c
> > > > > @@ -752,7 +752,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
> > > > >                 struct ceph_dentry_info *di = ceph_dentry(dentry);
> > > > > 
> > > > >                 spin_lock(&ci->i_ceph_lock);
> > > > > -               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
> > > > > +               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
> > > > >                 if (strncmp(dentry->d_name.name,
> > > > >                             fsc->mount_options->snapdir_name,
> > > > >                             dentry->d_name.len) &&
> > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > index 94d18e643a3d..38eb9dd5062b 100644
> > > > > --- a/fs/ceph/mds_client.c
> > > > > +++ b/fs/ceph/mds_client.c
> > > > > @@ -2730,7 +2730,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
> > > > >  int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > > > >                               struct ceph_mds_request *req)
> > > > >  {
> > > > > -       int err;
> > > > > +       int err = 0;
> > > > > 
> > > > >         /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
> > > > >         if (req->r_inode)
> > > > > @@ -2743,6 +2743,24 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > > > >                 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
> > > > >                                   CEPH_CAP_PIN);
> > > > > 
> > > > > +       if (req->r_inode) {
> > > > > +               err = ceph_wait_on_async_create(req->r_inode);
> > > > > +               if (err) {
> > > > > +                       dout("%s: wait for async create returned: %d\n",
> > > > > +                            __func__, err);
> > > > > +                       return err;
> > > > > +               }
> > > > > +       }
> > > > > +
> > > > > +       if (!err && req->r_old_inode) {
> > > > > +               err = ceph_wait_on_async_create(req->r_old_inode);
> > > > > +               if (err) {
> > > > > +                       dout("%s: wait for async create returned: %d\n",
> > > > > +                            __func__, err);
> > > > > +                       return err;
> > > > > +               }
> > > > > +       }
> > > > > +
> > > > >         dout("submit_request on %p for inode %p\n", req, dir);
> > > > >         mutex_lock(&mdsc->mutex);
> > > > >         __register_request(mdsc, req, dir);
> > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > index 95ac00e59e66..8043f2b439b1 100644
> > > > > --- a/fs/ceph/mds_client.h
> > > > > +++ b/fs/ceph/mds_client.h
> > > > > @@ -538,4 +538,11 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
> > > > >  extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> > > > >                           struct ceph_mds_session *session,
> > > > >                           int max_caps);
> > > > > +static inline int ceph_wait_on_async_create(struct inode *inode)
> > > > > +{
> > > > > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > > > > +
> > > > > +       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> > > > > +                          TASK_INTERRUPTIBLE);
> > > > > +}
> > > > >  #endif
> > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > index 3430d7ffe8f7..bfb03adb4a08 100644
> > > > > --- a/fs/ceph/super.h
> > > > > +++ b/fs/ceph/super.h
> > > > > @@ -316,7 +316,7 @@ struct ceph_inode_info {
> > > > >         u64 i_inline_version;
> > > > >         u32 i_time_warp_seq;
> > > > > 
> > > > > -       unsigned i_ceph_flags;
> > > > > +       unsigned long i_ceph_flags;
> > > > >         atomic64_t i_release_count;
> > > > >         atomic64_t i_ordered_count;
> > > > >         atomic64_t i_complete_seq[2];
> > > > > @@ -524,6 +524,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> > > > >  #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
> > > > >  #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> > > > >  #define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
> > > > > +#define CEPH_ASYNC_CREATE_BIT  (13)      /* async create in flight for this */
> > > > > +#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
> > > > > 
> > > > >  /*
> > > > >   * Masks of ceph inode work.
> > > > > --
> > > > > 2.24.1
> > > > > 
> > > 
> > > --
> > > Jeff Layton <jlayton@kernel.org>
> > >
Yan, Zheng Feb. 26, 2020, 2:10 p.m. UTC | #6
On 2/26/20 3:45 AM, Jeff Layton wrote:
> On Thu, 2020-02-20 at 09:53 -0500, Jeff Layton wrote:
>> On Thu, 2020-02-20 at 21:33 +0800, Yan, Zheng wrote:
>>> On Thu, Feb 20, 2020 at 9:01 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>> On Thu, 2020-02-20 at 11:32 +0800, Yan, Zheng wrote:
>>>>> On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>>>> When we issue an async create, we must ensure that any later on-the-wire
>>>>>> requests involving it wait for the create reply.
>>>>>>
>>>>>> Expand i_ceph_flags to be an unsigned long, and add a new bit that
>>>>>> MDS requests can wait on. If the bit is set in the inode when sending
>>>>>> caps, then don't send it and just return that it has been delayed.
>>>>>>
>>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>>>> ---
>>>>>>   fs/ceph/caps.c       | 13 ++++++++++++-
>>>>>>   fs/ceph/dir.c        |  2 +-
>>>>>>   fs/ceph/mds_client.c | 20 +++++++++++++++++++-
>>>>>>   fs/ceph/mds_client.h |  7 +++++++
>>>>>>   fs/ceph/super.h      |  4 +++-
>>>>>>   5 files changed, 42 insertions(+), 4 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>>>> index d05717397c2a..85e13aa359d2 100644
>>>>>> --- a/fs/ceph/caps.c
>>>>>> +++ b/fs/ceph/caps.c
>>>>>> @@ -511,7 +511,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
>>>>>>                                  struct ceph_inode_info *ci,
>>>>>>                                  bool set_timeout)
>>>>>>   {
>>>>>> -       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
>>>>>> +       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
>>>>>>               ci->i_ceph_flags, ci->i_hold_caps_max);
>>>>>>          if (!mdsc->stopping) {
>>>>>>                  spin_lock(&mdsc->cap_delay_lock);
>>>>>> @@ -1294,6 +1294,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
>>>>>>          int delayed = 0;
>>>>>>          int ret;
>>>>>>
>>>>>> +       /* Don't send anything if it's still being created. Return delayed */
>>>>>> +       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
>>>>>> +               spin_unlock(&ci->i_ceph_lock);
>>>>>> +               dout("%s async create in flight for %p\n", __func__, inode);
>>>>>> +               return 1;
>>>>>> +       }
>>>>>> +
>>>>>
>>>>> Maybe it's better to check this in ceph_check_caps().  Other callers
>>>>> of __send_cap() shouldn't encounter async creating inode
> 
> I'm not sure that's the case, is it? Suppose we call ceph_check_caps
> and it ends up delayed. We requeue the cap and then later someone calls
> fsync() and we end up calling try_flush_caps even though we haven't
> gotten the async create reply yet.

your patch adds a wait_on_async_create for the fsync case.

> 
>>>>
>>>> I've been looking, but what actually guarantees that?
>>>>
>>>> Only ceph_check_caps calls it for UPDATE, but the other two callers call
>>>> it for FLUSH. I don't see what prevents the kernel from (e.g.) calling
>>>> write_inode before the create reply comes in, particularly if we just
>>>> create and then close the file.
>>>>
>>>
>>> I missed write_inode case. but make __send_cap() skip sending message
>>> can cause problem. For example, if we skip a message that flush dirty
>>> caps. call ceph_check_caps() again may not re-do the flush.
>>>
>>
>> Ugh. Ok, so I guess we'll need to fix that first. I assume that making
>> sure the flush is redone after being delayed is the right thing to do
>> here?
>>
> 
> Hmm...looking at this more closely today.
> 
> __send_cap calls send_cap_msg, and that function does a number of
> allocations which could fail. So if this is a problem, it's a problem
> today, and we should fix it. There are 3 callers of __send_cap:
> 
> try_flush_caps : requeues the cap (and sets the timeouts) if __send_cap
> returns non-zero. I think this one is (probably?) OK.
> 
I think we can return error back to fsync() for this case.

> __kick_flushing_caps : just throws a pr_err if __send_cap returns non-
> zero, but since the cap is already queued here, there should be no need
> to requeue it.
>

This one is really problematic. ceph_early_kick_flushing_caps() needs to 
re-send flushes when the recovering mds is in the reconnect state. Otherwise, 
a flush may overwrite another client's new change.


> ceph_check_caps : the cap is requeued iff it's delayed.
> 
> So...I'm not sure I fully understand your concern. AFAICT, the cap
> should end up being queued if the send failed.

If ceph_check_caps flushed a dirty cap and it failed to send the msg, it 
needs to undo what __mark_caps_flushing() did

> 
> I think that's probably the best we can do here. If we end up trying to
> flush caps and we haven't gotten the async reply yet, we don't really
> have much of a choice other than to wait to flush.
> 

I think the best is to make send_cap_msg never fail. If free memory is 
low, make the memory allocation wait.

> Perhaps though, we ought to call __kick_flushing_caps when an async
> create reply comes in just to ensure that we do flush in a timely
> fashion once that does occur.
> 
> Thoughts?
> 
> 
>>>> As a side note, I still struggle with the fact that there seems to be no
>>>> coherent overall description of the cap protocol. What distinguishes a
>>>> FLUSH from an UPDATE, for instance? The MDS code and comments seem to
>>>> treat them somewhat interchangeably.
>>>>
>>>
>>> UPDATE is super set of FLUSH, UPDATE can always replace FLUSH.
>>>
>>
>> I'll toss this note onto my jumble of notes, for my (eventual) planned
>> document that describes the cap protocol.
>>
>>>>>>          held = cap->issued | cap->implemented;
>>>>>>          revoking = cap->implemented & ~cap->issued;
>>>>>>          retain &= ~revoking;
>>>>>> @@ -2250,6 +2257,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
>>>>>>          if (datasync)
>>>>>>                  goto out;
>>>>>>
>>>>>> +       ret = ceph_wait_on_async_create(inode);
>>>>>> +       if (ret)
>>>>>> +               goto out;
>>>>>> +
>>>>>>          dirty = try_flush_caps(inode, &flush_tid);
>>>>>>          dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
>>>>>>
>>>>>> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
>>>>>> index a87274935a09..5b83bda57056 100644
>>>>>> --- a/fs/ceph/dir.c
>>>>>> +++ b/fs/ceph/dir.c
>>>>>> @@ -752,7 +752,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
>>>>>>                  struct ceph_dentry_info *di = ceph_dentry(dentry);
>>>>>>
>>>>>>                  spin_lock(&ci->i_ceph_lock);
>>>>>> -               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
>>>>>> +               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
>>>>>>                  if (strncmp(dentry->d_name.name,
>>>>>>                              fsc->mount_options->snapdir_name,
>>>>>>                              dentry->d_name.len) &&
>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>> index 94d18e643a3d..38eb9dd5062b 100644
>>>>>> --- a/fs/ceph/mds_client.c
>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>> @@ -2730,7 +2730,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
>>>>>>   int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
>>>>>>                                struct ceph_mds_request *req)
>>>>>>   {
>>>>>> -       int err;
>>>>>> +       int err = 0;
>>>>>>
>>>>>>          /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
>>>>>>          if (req->r_inode)
>>>>>> @@ -2743,6 +2743,24 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
>>>>>>                  ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
>>>>>>                                    CEPH_CAP_PIN);
>>>>>>
>>>>>> +       if (req->r_inode) {
>>>>>> +               err = ceph_wait_on_async_create(req->r_inode);
>>>>>> +               if (err) {
>>>>>> +                       dout("%s: wait for async create returned: %d\n",
>>>>>> +                            __func__, err);
>>>>>> +                       return err;
>>>>>> +               }
>>>>>> +       }
>>>>>> +
>>>>>> +       if (!err && req->r_old_inode) {
>>>>>> +               err = ceph_wait_on_async_create(req->r_old_inode);
>>>>>> +               if (err) {
>>>>>> +                       dout("%s: wait for async create returned: %d\n",
>>>>>> +                            __func__, err);
>>>>>> +                       return err;
>>>>>> +               }
>>>>>> +       }
>>>>>> +
>>>>>>          dout("submit_request on %p for inode %p\n", req, dir);
>>>>>>          mutex_lock(&mdsc->mutex);
>>>>>>          __register_request(mdsc, req, dir);
>>>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>>>> index 95ac00e59e66..8043f2b439b1 100644
>>>>>> --- a/fs/ceph/mds_client.h
>>>>>> +++ b/fs/ceph/mds_client.h
>>>>>> @@ -538,4 +538,11 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
>>>>>>   extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
>>>>>>                            struct ceph_mds_session *session,
>>>>>>                            int max_caps);
>>>>>> +static inline int ceph_wait_on_async_create(struct inode *inode)
>>>>>> +{
>>>>>> +       struct ceph_inode_info *ci = ceph_inode(inode);
>>>>>> +
>>>>>> +       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
>>>>>> +                          TASK_INTERRUPTIBLE);
>>>>>> +}
>>>>>>   #endif
>>>>>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>>>>> index 3430d7ffe8f7..bfb03adb4a08 100644
>>>>>> --- a/fs/ceph/super.h
>>>>>> +++ b/fs/ceph/super.h
>>>>>> @@ -316,7 +316,7 @@ struct ceph_inode_info {
>>>>>>          u64 i_inline_version;
>>>>>>          u32 i_time_warp_seq;
>>>>>>
>>>>>> -       unsigned i_ceph_flags;
>>>>>> +       unsigned long i_ceph_flags;
>>>>>>          atomic64_t i_release_count;
>>>>>>          atomic64_t i_ordered_count;
>>>>>>          atomic64_t i_complete_seq[2];
>>>>>> @@ -524,6 +524,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
>>>>>>   #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
>>>>>>   #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
>>>>>>   #define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
>>>>>> +#define CEPH_ASYNC_CREATE_BIT  (13)      /* async create in flight for this */
>>>>>> +#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
>>>>>>
>>>>>>   /*
>>>>>>    * Masks of ceph inode work.
>>>>>> --
>>>>>> 2.24.1
>>>>>>
>>>>
>>>> --
>>>> Jeff Layton <jlayton@kernel.org>
>>>>
>
Jeff Layton Feb. 27, 2020, 8:06 p.m. UTC | #7
On Wed, 2020-02-26 at 22:10 +0800, Yan, Zheng wrote:
> On 2/26/20 3:45 AM, Jeff Layton wrote:
> > On Thu, 2020-02-20 at 09:53 -0500, Jeff Layton wrote:
> > > On Thu, 2020-02-20 at 21:33 +0800, Yan, Zheng wrote:
> > > > On Thu, Feb 20, 2020 at 9:01 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > On Thu, 2020-02-20 at 11:32 +0800, Yan, Zheng wrote:
> > > > > > On Wed, Feb 19, 2020 at 9:27 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > When we issue an async create, we must ensure that any later on-the-wire
> > > > > > > requests involving it wait for the create reply.
> > > > > > > 
> > > > > > > Expand i_ceph_flags to be an unsigned long, and add a new bit that
> > > > > > > MDS requests can wait on. If the bit is set in the inode when sending
> > > > > > > caps, then don't send it and just return that it has been delayed.
> > > > > > > 
> > > > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > > > ---
> > > > > > >   fs/ceph/caps.c       | 13 ++++++++++++-
> > > > > > >   fs/ceph/dir.c        |  2 +-
> > > > > > >   fs/ceph/mds_client.c | 20 +++++++++++++++++++-
> > > > > > >   fs/ceph/mds_client.h |  7 +++++++
> > > > > > >   fs/ceph/super.h      |  4 +++-
> > > > > > >   5 files changed, 42 insertions(+), 4 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > > > index d05717397c2a..85e13aa359d2 100644
> > > > > > > --- a/fs/ceph/caps.c
> > > > > > > +++ b/fs/ceph/caps.c
> > > > > > > @@ -511,7 +511,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
> > > > > > >                                  struct ceph_inode_info *ci,
> > > > > > >                                  bool set_timeout)
> > > > > > >   {
> > > > > > > -       dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
> > > > > > > +       dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
> > > > > > >               ci->i_ceph_flags, ci->i_hold_caps_max);
> > > > > > >          if (!mdsc->stopping) {
> > > > > > >                  spin_lock(&mdsc->cap_delay_lock);
> > > > > > > @@ -1294,6 +1294,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
> > > > > > >          int delayed = 0;
> > > > > > >          int ret;
> > > > > > > 
> > > > > > > +       /* Don't send anything if it's still being created. Return delayed */
> > > > > > > +       if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
> > > > > > > +               spin_unlock(&ci->i_ceph_lock);
> > > > > > > +               dout("%s async create in flight for %p\n", __func__, inode);
> > > > > > > +               return 1;
> > > > > > > +       }
> > > > > > > +
> > > > > > 
> > > > > > Maybe it's better to check this in ceph_check_caps().  Other callers
> > > > > > of __send_cap() shouldn't encounter async creating inode
> > 
> > I'm not sure that's the case, is it? Suppose we call ceph_check_caps
> > and it ends up delayed. We requeue the cap and then later someone calls
> > fsync() and we end up calling try_flush_caps even though we haven't
> > gotten the async create reply yet.
> 
> your patch adds a wait_on_async_create for fsync case.
> 
> > > > > I've been looking, but what actually guarantees that?
> > > > > 
> > > > > Only ceph_check_caps calls it for UPDATE, but the other two callers call
> > > > > it for FLUSH. I don't see what prevents the kernel from (e.g.) calling
> > > > > write_inode before the create reply comes in, particularly if we just
> > > > > create and then close the file.
> > > > > 
> > > > 
> > > > I missed write_inode case. but make __send_cap() skip sending message
> > > > can cause problem. For example, if we skip a message that flush dirty
> > > > caps. call ceph_check_caps() again may not re-do the flush.
> > > > 
> > > 
> > > Ugh. Ok, so I guess we'll need to fix that first. I assume that making
> > > sure the flush is redone after being delayed is the right thing to do
> > > here?
> > > 
> > 
> > Hmm...looking at this more closely today.
> > 
> > __send_cap calls send_cap_msg, and that function does a number of
> > allocations which could fail. So if this is a problem, it's a problem
> > today, and we should fix it. There are 3 callers of __send_cap:
> > 
> > try_flush_caps : requeues the cap (and sets the timeouts) if __send_cap
> > returns non-zero. I think this one is (probably?) OK.
> > 
> I think we can return error back to fsync() for this case.
> 

Yeah. For write_inode too, I suppose.

> > __kick_flushing_caps : just throws a pr_err if __send_cap returns non-
> > zero, but since the cap is already queued here, there should be no need
> > to requeue it.
> > 
> 
> This one is really problematic. ceph_early_kick_flushing_caps() needs to 
> re-send flushes when recovering mds is in reconnect state. Otherwise, 
> flush may overwrite other client's new change.
> 
> 

Ok.


> > ceph_check_caps : the cap is requeued iff it's delayed.
> > 
> > So...I'm not sure I fully understand your concern. AFAICT, the cap
> > should end up being queued if the send failed.
> 
> If ceph_check_caps flushed dirty cap and it failed to send msg. it need 
> to undo what __mark_caps_flushing() did
> 

Nasty

> > I think that's probably the best we can do here. If we end up trying to
> > flush caps and we haven't gotten the async reply yet, we don't really
> > have much of a choice other than to wait to flush.
> > 
> 
> I think the best is make send_cap_msg never fail. If free memory is 
> slow, make the memory allocation wait.
> 

Ugh.

Looking...I think we're probably ok on the xattr blob already. AFAICT,
that gets preallocated at the time that the setxattr is done.

The main problem is all of the allocations under the ceph_msg_new call
in send_cap_msg. Maybe we ought to be doing those at the time that the
cap is dirtied?

In fact, we already do some preallocation at what appear to be the right
points in the ceph_alloc_cap_flush() calls. Maybe we should do something
similar with ceph_msg_new()?

> > Perhaps though, we ought to call __kick_flushing_caps when an async
> > create reply comes in just to ensure that we do flush in a timely
> > fashion once that does occur.
> > 
> > Thoughts?
> > 
> > 
> > > > > As a side note, I still struggle with the fact that there seems to be no
> > > > > coherent overall description of the cap protocol. What distinguishes a
> > > > > FLUSH from an UPDATE, for instance? The MDS code and comments seem to
> > > > > treat them somewhat interchangeably.
> > > > > 
> > > > 
> > > > UPDATE is super set of FLUSH, UPDATE can always replace FLUSH.
> > > > 
> > > 
> > > I'll toss this note onto my jumble of notes, for my (eventual) planned
> > > document that describes the cap protocol.
> > > 
> > > > > > >          held = cap->issued | cap->implemented;
> > > > > > >          revoking = cap->implemented & ~cap->issued;
> > > > > > >          retain &= ~revoking;
> > > > > > > @@ -2250,6 +2257,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
> > > > > > >          if (datasync)
> > > > > > >                  goto out;
> > > > > > > 
> > > > > > > +       ret = ceph_wait_on_async_create(inode);
> > > > > > > +       if (ret)
> > > > > > > +               goto out;
> > > > > > > +
> > > > > > >          dirty = try_flush_caps(inode, &flush_tid);
> > > > > > >          dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
> > > > > > > 
> > > > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > > > index a87274935a09..5b83bda57056 100644
> > > > > > > --- a/fs/ceph/dir.c
> > > > > > > +++ b/fs/ceph/dir.c
> > > > > > > @@ -752,7 +752,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
> > > > > > >                  struct ceph_dentry_info *di = ceph_dentry(dentry);
> > > > > > > 
> > > > > > >                  spin_lock(&ci->i_ceph_lock);
> > > > > > > -               dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
> > > > > > > +               dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
> > > > > > >                  if (strncmp(dentry->d_name.name,
> > > > > > >                              fsc->mount_options->snapdir_name,
> > > > > > >                              dentry->d_name.len) &&
> > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > index 94d18e643a3d..38eb9dd5062b 100644
> > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > @@ -2730,7 +2730,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
> > > > > > >   int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > > > > > >                                struct ceph_mds_request *req)
> > > > > > >   {
> > > > > > > -       int err;
> > > > > > > +       int err = 0;
> > > > > > > 
> > > > > > >          /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
> > > > > > >          if (req->r_inode)
> > > > > > > @@ -2743,6 +2743,24 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
> > > > > > >                  ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
> > > > > > >                                    CEPH_CAP_PIN);
> > > > > > > 
> > > > > > > +       if (req->r_inode) {
> > > > > > > +               err = ceph_wait_on_async_create(req->r_inode);
> > > > > > > +               if (err) {
> > > > > > > +                       dout("%s: wait for async create returned: %d\n",
> > > > > > > +                            __func__, err);
> > > > > > > +                       return err;
> > > > > > > +               }
> > > > > > > +       }
> > > > > > > +
> > > > > > > +       if (!err && req->r_old_inode) {
> > > > > > > +               err = ceph_wait_on_async_create(req->r_old_inode);
> > > > > > > +               if (err) {
> > > > > > > +                       dout("%s: wait for async create returned: %d\n",
> > > > > > > +                            __func__, err);
> > > > > > > +                       return err;
> > > > > > > +               }
> > > > > > > +       }
> > > > > > > +
> > > > > > >          dout("submit_request on %p for inode %p\n", req, dir);
> > > > > > >          mutex_lock(&mdsc->mutex);
> > > > > > >          __register_request(mdsc, req, dir);
> > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > > index 95ac00e59e66..8043f2b439b1 100644
> > > > > > > --- a/fs/ceph/mds_client.h
> > > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > > @@ -538,4 +538,11 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
> > > > > > >   extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
> > > > > > >                            struct ceph_mds_session *session,
> > > > > > >                            int max_caps);
> > > > > > > +static inline int ceph_wait_on_async_create(struct inode *inode)
> > > > > > > +{
> > > > > > > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > > > > > > +
> > > > > > > +       return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
> > > > > > > +                          TASK_INTERRUPTIBLE);
> > > > > > > +}
> > > > > > >   #endif
> > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > > index 3430d7ffe8f7..bfb03adb4a08 100644
> > > > > > > --- a/fs/ceph/super.h
> > > > > > > +++ b/fs/ceph/super.h
> > > > > > > @@ -316,7 +316,7 @@ struct ceph_inode_info {
> > > > > > >          u64 i_inline_version;
> > > > > > >          u32 i_time_warp_seq;
> > > > > > > 
> > > > > > > -       unsigned i_ceph_flags;
> > > > > > > +       unsigned long i_ceph_flags;
> > > > > > >          atomic64_t i_release_count;
> > > > > > >          atomic64_t i_ordered_count;
> > > > > > >          atomic64_t i_complete_seq[2];
> > > > > > > @@ -524,6 +524,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
> > > > > > >   #define CEPH_I_ERROR_WRITE     (1 << 10) /* have seen write errors */
> > > > > > >   #define CEPH_I_ERROR_FILELOCK  (1 << 11) /* have seen file lock errors */
> > > > > > >   #define CEPH_I_ODIRECT         (1 << 12) /* inode in direct I/O mode */
> > > > > > > +#define CEPH_ASYNC_CREATE_BIT  (13)      /* async create in flight for this */
> > > > > > > +#define CEPH_I_ASYNC_CREATE    (1 << CEPH_ASYNC_CREATE_BIT)
> > > > > > > 
> > > > > > >   /*
> > > > > > >    * Masks of ceph inode work.
> > > > > > > --
> > > > > > > 2.24.1
> > > > > > > 
> > > > > 
> > > > > --
> > > > > Jeff Layton <jlayton@kernel.org>
> > > > >

Patch
diff mbox series

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d05717397c2a..85e13aa359d2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -511,7 +511,7 @@  static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
 				struct ceph_inode_info *ci,
 				bool set_timeout)
 {
-	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
 	     ci->i_ceph_flags, ci->i_hold_caps_max);
 	if (!mdsc->stopping) {
 		spin_lock(&mdsc->cap_delay_lock);
@@ -1294,6 +1294,13 @@  static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	int delayed = 0;
 	int ret;
 
+	/* Don't send anything if it's still being created. Return delayed */
+	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+		spin_unlock(&ci->i_ceph_lock);
+		dout("%s async create in flight for %p\n", __func__, inode);
+		return 1;
+	}
+
 	held = cap->issued | cap->implemented;
 	revoking = cap->implemented & ~cap->issued;
 	retain &= ~revoking;
@@ -2250,6 +2257,10 @@  int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	if (datasync)
 		goto out;
 
+	ret = ceph_wait_on_async_create(inode);
+	if (ret)
+		goto out;
+
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a87274935a09..5b83bda57056 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -752,7 +752,7 @@  static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		struct ceph_dentry_info *di = ceph_dentry(dentry);
 
 		spin_lock(&ci->i_ceph_lock);
-		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+		dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
 		if (strncmp(dentry->d_name.name,
 			    fsc->mount_options->snapdir_name,
 			    dentry->d_name.len) &&
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 94d18e643a3d..38eb9dd5062b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2730,7 +2730,7 @@  static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
 			      struct ceph_mds_request *req)
 {
-	int err;
+	int err = 0;
 
 	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
 	if (req->r_inode)
@@ -2743,6 +2743,24 @@  int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
 		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
 				  CEPH_CAP_PIN);
 
+	if (req->r_inode) {
+		err = ceph_wait_on_async_create(req->r_inode);
+		if (err) {
+			dout("%s: wait for async create returned: %d\n",
+			     __func__, err);
+			return err;
+		}
+	}
+
+	if (!err && req->r_old_inode) {
+		err = ceph_wait_on_async_create(req->r_old_inode);
+		if (err) {
+			dout("%s: wait for async create returned: %d\n",
+			     __func__, err);
+			return err;
+		}
+	}
+
 	dout("submit_request on %p for inode %p\n", req, dir);
 	mutex_lock(&mdsc->mutex);
 	__register_request(mdsc, req, dir);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 95ac00e59e66..8043f2b439b1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -538,4 +538,11 @@  extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
 extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
 			  struct ceph_mds_session *session,
 			  int max_caps);
+static inline int ceph_wait_on_async_create(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
+			   TASK_INTERRUPTIBLE);
+}
 #endif
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3430d7ffe8f7..bfb03adb4a08 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -316,7 +316,7 @@  struct ceph_inode_info {
 	u64 i_inline_version;
 	u32 i_time_warp_seq;
 
-	unsigned i_ceph_flags;
+	unsigned long i_ceph_flags;
 	atomic64_t i_release_count;
 	atomic64_t i_ordered_count;
 	atomic64_t i_complete_seq[2];
@@ -524,6 +524,8 @@  static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_ERROR_WRITE	(1 << 10) /* have seen write errors */
 #define CEPH_I_ERROR_FILELOCK	(1 << 11) /* have seen file lock errors */
 #define CEPH_I_ODIRECT		(1 << 12) /* inode in direct I/O mode */
+#define CEPH_ASYNC_CREATE_BIT	(13)	  /* async create in flight for this */
+#define CEPH_I_ASYNC_CREATE	(1 << CEPH_ASYNC_CREATE_BIT)
 
 /*
  * Masks of ceph inode work.