diff mbox series

[RFC,8/9] ceph: copy layout, max_size and truncate_size on successful sync create

Message ID 20200110205647.311023-9-jlayton@kernel.org (mailing list archive)
State New, archived
Headers show
Series ceph: add asynchronous create functionality | expand

Commit Message

Jeff Layton Jan. 10, 2020, 8:56 p.m. UTC
It doesn't do much good to do an asynchronous create unless we can do
I/O to it before the create reply comes in. That means we need layout
info the new file before we've gotten the response from the MDS.

All files created in a directory will initially inherit the same layout,
so copy off the requisite info from the first synchronous create in the
directory. Save it in the same fields in the directory inode, as those
are otherwise unsed for dir inodes. This means we need to be a bit
careful about only updating layout info on non-dir inodes.

Also, zero out the layout when we drop Dc caps in the dir.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c  | 24 ++++++++++++++++++++----
 fs/ceph/file.c  | 24 +++++++++++++++++++++++-
 fs/ceph/inode.c |  4 ++--
 3 files changed, 45 insertions(+), 7 deletions(-)

Comments

Yan, Zheng Jan. 13, 2020, 3:51 a.m. UTC | #1
On 1/11/20 4:56 AM, Jeff Layton wrote:
> It doesn't do much good to do an asynchronous create unless we can do
> I/O to it before the create reply comes in. That means we need layout
> info the new file before we've gotten the response from the MDS.
> 
> All files created in a directory will initially inherit the same layout,
> so copy off the requisite info from the first synchronous create in the
> directory. Save it in the same fields in the directory inode, as those
> are otherwise unsed for dir inodes. This means we need to be a bit
> careful about only updating layout info on non-dir inodes.
> 
> Also, zero out the layout when we drop Dc caps in the dir.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>   fs/ceph/caps.c  | 24 ++++++++++++++++++++----
>   fs/ceph/file.c  | 24 +++++++++++++++++++++++-
>   fs/ceph/inode.c |  4 ++--
>   3 files changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 7fc87b693ba4..b96fb1378479 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>   			return ret;
>   		}
>   
> -		if (S_ISREG(ci->vfs_inode.i_mode) &&
> +		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
>   		    ci->i_inline_version != CEPH_INLINE_NONE &&
>   		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
>   		    i_size_read(inode) > 0) {
> @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
>   	if (had & CEPH_CAP_FILE_RD)
>   		if (--ci->i_rd_ref == 0)
>   			last++;
> -	if (had & CEPH_CAP_FILE_CACHE)
> -		if (--ci->i_rdcache_ref == 0)
> +	if (had & CEPH_CAP_FILE_CACHE) {
> +		if (--ci->i_rdcache_ref == 0) {
>   			last++;
> +			/* Zero out layout if we lost CREATE caps */
> +			if (S_ISDIR(inode->i_mode) &&
> +			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
> +				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> +				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> +			}
> +		}
> +	}

should do this in __check_cap_issue

>   	if (had & CEPH_CAP_FILE_EXCL)
>   		if (--ci->i_fx_ref == 0)
>   			last++;
> @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode,
>   		ci->i_subdirs = extra_info->nsubdirs;
>   	}
>   
> -	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
> +	if (!S_ISDIR(inode->i_mode) &&
> +	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
>   		/* file layout may have changed */
>   		s64 old_pool = ci->i_layout.pool_id;
>   		struct ceph_string *old_ns;
> @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode,
>   		     ceph_cap_string(cap->issued),
>   		     ceph_cap_string(newcaps),
>   		     ceph_cap_string(revoking));
> +
> +		if (S_ISDIR(inode->i_mode) &&
> +		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
> +			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> +			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> +		}


same here

> +
>   		if (S_ISREG(inode->i_mode) &&
>   		    (revoking & used & CEPH_CAP_FILE_BUFFER))
>   			writeback = true;  /* initiate writeback; will delay ack */
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 1e6cdf2dfe90..d4d7a277faf1 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file)
>   	return err;
>   }
>   
> +/* Clone the layout from a synchronous create, if the dir now has Dc caps */
> +static void
> +copy_file_layout(struct inode *dst, struct inode *src)
> +{
> +	struct ceph_inode_info *cdst = ceph_inode(dst);
> +	struct ceph_inode_info *csrc = ceph_inode(src);
> +
> +	spin_lock(&cdst->i_ceph_lock);
> +	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
> +	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
> +		memcpy(&cdst->i_layout, &csrc->i_layout,
> +			sizeof(cdst->i_layout));
> +		rcu_assign_pointer(cdst->i_layout.pool_ns,
> +				   ceph_try_get_string(csrc->i_layout.pool_ns));
> +		cdst->i_max_size = csrc->i_max_size;
> +		cdst->i_truncate_size = csrc->i_truncate_size;
> +	}
> +	spin_unlock(&cdst->i_ceph_lock);
> +}
>   
>   /*
>    * Do a lookup + open with a single request.  If we get a non-existent
> @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>   	} else {
>   		dout("atomic_open finish_open on dn %p\n", dn);
>   		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> -			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
> +			struct inode *newino = d_inode(dentry);
> +
> +			copy_file_layout(dir, newino);
> +			ceph_init_inode_acls(newino, &as_ctx);
>   			file->f_mode |= FMODE_CREATED;
>   		}
>   		err = finish_open(file, dentry, ceph_open);
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 9cfc093fd273..8b51051b79b0 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
>   		ci->i_subdirs = le64_to_cpu(info->subdirs);
>   	}
>   
> -	if (new_version ||
> -	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +	if (!S_ISDIR(inode->i_mode) && (new_version ||
> +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
>   		s64 old_pool = ci->i_layout.pool_id;
>   		struct ceph_string *old_ns;
>   
>
Yan, Zheng Jan. 13, 2020, 9:01 a.m. UTC | #2
On 1/11/20 4:56 AM, Jeff Layton wrote:
> It doesn't do much good to do an asynchronous create unless we can do
> I/O to it before the create reply comes in. That means we need layout
> info the new file before we've gotten the response from the MDS.
> 
> All files created in a directory will initially inherit the same layout,
> so copy off the requisite info from the first synchronous create in the
> directory. Save it in the same fields in the directory inode, as those
> are otherwise unsed for dir inodes. This means we need to be a bit
> careful about only updating layout info on non-dir inodes.
> 
> Also, zero out the layout when we drop Dc caps in the dir.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>   fs/ceph/caps.c  | 24 ++++++++++++++++++++----
>   fs/ceph/file.c  | 24 +++++++++++++++++++++++-
>   fs/ceph/inode.c |  4 ++--
>   3 files changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 7fc87b693ba4..b96fb1378479 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>   			return ret;
>   		}
>   
> -		if (S_ISREG(ci->vfs_inode.i_mode) &&
> +		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
>   		    ci->i_inline_version != CEPH_INLINE_NONE &&
>   		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
>   		    i_size_read(inode) > 0) {
> @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
>   	if (had & CEPH_CAP_FILE_RD)
>   		if (--ci->i_rd_ref == 0)
>   			last++;
> -	if (had & CEPH_CAP_FILE_CACHE)
> -		if (--ci->i_rdcache_ref == 0)
> +	if (had & CEPH_CAP_FILE_CACHE) {
> +		if (--ci->i_rdcache_ref == 0) {
>   			last++;
> +			/* Zero out layout if we lost CREATE caps */
> +			if (S_ISDIR(inode->i_mode) &&
> +			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
> +				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> +				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> +			}
> +		}
> +	}
>   	if (had & CEPH_CAP_FILE_EXCL)
>   		if (--ci->i_fx_ref == 0)
>   			last++;
> @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode,
>   		ci->i_subdirs = extra_info->nsubdirs;
>   	}
>   
> -	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
> +	if (!S_ISDIR(inode->i_mode) &&
> +	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
>   		/* file layout may have changed */
>   		s64 old_pool = ci->i_layout.pool_id;
>   		struct ceph_string *old_ns;
> @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode,
>   		     ceph_cap_string(cap->issued),
>   		     ceph_cap_string(newcaps),
>   		     ceph_cap_string(revoking));
> +
> +		if (S_ISDIR(inode->i_mode) &&
> +		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
> +			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> +			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> +		}
> +
>   		if (S_ISREG(inode->i_mode) &&
>   		    (revoking & used & CEPH_CAP_FILE_BUFFER))
>   			writeback = true;  /* initiate writeback; will delay ack */
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 1e6cdf2dfe90..d4d7a277faf1 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file)
>   	return err;
>   }
>   
> +/* Clone the layout from a synchronous create, if the dir now has Dc caps */
> +static void
> +copy_file_layout(struct inode *dst, struct inode *src)
> +{
> +	struct ceph_inode_info *cdst = ceph_inode(dst);
> +	struct ceph_inode_info *csrc = ceph_inode(src);
> +
> +	spin_lock(&cdst->i_ceph_lock);
> +	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
> +	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
> +		memcpy(&cdst->i_layout, &csrc->i_layout,
> +			sizeof(cdst->i_layout));

directory's i_layout is used for other purpose. shouldn't modify it.

> +		rcu_assign_pointer(cdst->i_layout.pool_ns,
> +				   ceph_try_get_string(csrc->i_layout.pool_ns));
> +		cdst->i_max_size = csrc->i_max_size; > +		cdst->i_truncate_size = csrc->i_truncate_size;
no need to save above two. just set truncate size of new file to 
(u64)-1, set max_size of new file to its layout.stripe_unit;

max_size == layout.strip_unit ensure that client only write to the first 
object before its writeable range is persistent.

> +	}
> +	spin_unlock(&cdst->i_ceph_lock);
> +}
>   
>   /*
>    * Do a lookup + open with a single request.  If we get a non-existent
> @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>   	} else {
>   		dout("atomic_open finish_open on dn %p\n", dn);
>   		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> -			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
> +			struct inode *newino = d_inode(dentry);
> +
> +			copy_file_layout(dir, newino);
> +			ceph_init_inode_acls(newino, &as_ctx);
>   			file->f_mode |= FMODE_CREATED;
>   		}
>   		err = finish_open(file, dentry, ceph_open);
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 9cfc093fd273..8b51051b79b0 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
>   		ci->i_subdirs = le64_to_cpu(info->subdirs);
>   	}
>   
> -	if (new_version ||
> -	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> +	if (!S_ISDIR(inode->i_mode) && (new_version ||
> +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
>   		s64 old_pool = ci->i_layout.pool_id;
>   		struct ceph_string *old_ns;
>   
>
Jeff Layton Jan. 13, 2020, 1:26 p.m. UTC | #3
On Mon, 2020-01-13 at 11:51 +0800, Yan, Zheng wrote:
> On 1/11/20 4:56 AM, Jeff Layton wrote:
> > It doesn't do much good to do an asynchronous create unless we can do
> > I/O to it before the create reply comes in. That means we need layout
> > info the new file before we've gotten the response from the MDS.
> > 
> > All files created in a directory will initially inherit the same layout,
> > so copy off the requisite info from the first synchronous create in the
> > directory. Save it in the same fields in the directory inode, as those
> > are otherwise unsed for dir inodes. This means we need to be a bit
> > careful about only updating layout info on non-dir inodes.
> > 
> > Also, zero out the layout when we drop Dc caps in the dir.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >   fs/ceph/caps.c  | 24 ++++++++++++++++++++----
> >   fs/ceph/file.c  | 24 +++++++++++++++++++++++-
> >   fs/ceph/inode.c |  4 ++--
> >   3 files changed, 45 insertions(+), 7 deletions(-)
> > 
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 7fc87b693ba4..b96fb1378479 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >   			return ret;
> >   		}
> >   
> > -		if (S_ISREG(ci->vfs_inode.i_mode) &&
> > +		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
> >   		    ci->i_inline_version != CEPH_INLINE_NONE &&
> >   		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
> >   		    i_size_read(inode) > 0) {
> > @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
> >   	if (had & CEPH_CAP_FILE_RD)
> >   		if (--ci->i_rd_ref == 0)
> >   			last++;
> > -	if (had & CEPH_CAP_FILE_CACHE)
> > -		if (--ci->i_rdcache_ref == 0)
> > +	if (had & CEPH_CAP_FILE_CACHE) {
> > +		if (--ci->i_rdcache_ref == 0) {
> >   			last++;
> > +			/* Zero out layout if we lost CREATE caps */
> > +			if (S_ISDIR(inode->i_mode) &&
> > +			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
> > +				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> > +				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> > +			}
> > +		}
> > +	}
> 
> should do this in __check_cap_issue
> 

That function doesn't get called from the put codepath. Suppose one task
is setting up an async create and a Dc (DIR_CREATE) cap revoke races in.
We zero out the layout and then the inode has a bogus layout set in it.

We can't wipe the cached layout until all of the Dc references are put.

> >   	if (had & CEPH_CAP_FILE_EXCL)
> >   		if (--ci->i_fx_ref == 0)
> >   			last++;
> > @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode,
> >   		ci->i_subdirs = extra_info->nsubdirs;
> >   	}
> >   
> > -	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
> > +	if (!S_ISDIR(inode->i_mode) &&
> > +	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> >   		/* file layout may have changed */
> >   		s64 old_pool = ci->i_layout.pool_id;
> >   		struct ceph_string *old_ns;
> > @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode,
> >   		     ceph_cap_string(cap->issued),
> >   		     ceph_cap_string(newcaps),
> >   		     ceph_cap_string(revoking));
> > +
> > +		if (S_ISDIR(inode->i_mode) &&
> > +		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
> > +			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> > +			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> > +		}
> 
> same here
> 
> > +
> >   		if (S_ISREG(inode->i_mode) &&
> >   		    (revoking & used & CEPH_CAP_FILE_BUFFER))
> >   			writeback = true;  /* initiate writeback; will delay ack */
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 1e6cdf2dfe90..d4d7a277faf1 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file)
> >   	return err;
> >   }
> >   
> > +/* Clone the layout from a synchronous create, if the dir now has Dc caps */
> > +static void
> > +copy_file_layout(struct inode *dst, struct inode *src)
> > +{
> > +	struct ceph_inode_info *cdst = ceph_inode(dst);
> > +	struct ceph_inode_info *csrc = ceph_inode(src);
> > +
> > +	spin_lock(&cdst->i_ceph_lock);
> > +	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
> > +	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
> > +		memcpy(&cdst->i_layout, &csrc->i_layout,
> > +			sizeof(cdst->i_layout));
> > +		rcu_assign_pointer(cdst->i_layout.pool_ns,
> > +				   ceph_try_get_string(csrc->i_layout.pool_ns));
> > +		cdst->i_max_size = csrc->i_max_size;
> > +		cdst->i_truncate_size = csrc->i_truncate_size;
> > +	}
> > +	spin_unlock(&cdst->i_ceph_lock);
> > +}
> >   
> >   /*
> >    * Do a lookup + open with a single request.  If we get a non-existent
> > @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >   	} else {
> >   		dout("atomic_open finish_open on dn %p\n", dn);
> >   		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> > -			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
> > +			struct inode *newino = d_inode(dentry);
> > +
> > +			copy_file_layout(dir, newino);
> > +			ceph_init_inode_acls(newino, &as_ctx);
> >   			file->f_mode |= FMODE_CREATED;
> >   		}
> >   		err = finish_open(file, dentry, ceph_open);
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 9cfc093fd273..8b51051b79b0 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
> >   		ci->i_subdirs = le64_to_cpu(info->subdirs);
> >   	}
> >   
> > -	if (new_version ||
> > -	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > +	if (!S_ISDIR(inode->i_mode) && (new_version ||
> > +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
> >   		s64 old_pool = ci->i_layout.pool_id;
> >   		struct ceph_string *old_ns;
> >   
> >
Jeff Layton Jan. 13, 2020, 1:29 p.m. UTC | #4
On Mon, 2020-01-13 at 17:01 +0800, Yan, Zheng wrote:
> On 1/11/20 4:56 AM, Jeff Layton wrote:
> > It doesn't do much good to do an asynchronous create unless we can do
> > I/O to it before the create reply comes in. That means we need layout
> > info the new file before we've gotten the response from the MDS.
> > 
> > All files created in a directory will initially inherit the same layout,
> > so copy off the requisite info from the first synchronous create in the
> > directory. Save it in the same fields in the directory inode, as those
> > are otherwise unsed for dir inodes. This means we need to be a bit
> > careful about only updating layout info on non-dir inodes.
> > 
> > Also, zero out the layout when we drop Dc caps in the dir.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >   fs/ceph/caps.c  | 24 ++++++++++++++++++++----
> >   fs/ceph/file.c  | 24 +++++++++++++++++++++++-
> >   fs/ceph/inode.c |  4 ++--
> >   3 files changed, 45 insertions(+), 7 deletions(-)
> > 
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 7fc87b693ba4..b96fb1378479 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >   			return ret;
> >   		}
> >   
> > -		if (S_ISREG(ci->vfs_inode.i_mode) &&
> > +		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
> >   		    ci->i_inline_version != CEPH_INLINE_NONE &&
> >   		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
> >   		    i_size_read(inode) > 0) {
> > @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
> >   	if (had & CEPH_CAP_FILE_RD)
> >   		if (--ci->i_rd_ref == 0)
> >   			last++;
> > -	if (had & CEPH_CAP_FILE_CACHE)
> > -		if (--ci->i_rdcache_ref == 0)
> > +	if (had & CEPH_CAP_FILE_CACHE) {
> > +		if (--ci->i_rdcache_ref == 0) {
> >   			last++;
> > +			/* Zero out layout if we lost CREATE caps */
> > +			if (S_ISDIR(inode->i_mode) &&
> > +			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
> > +				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> > +				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> > +			}
> > +		}
> > +	}
> >   	if (had & CEPH_CAP_FILE_EXCL)
> >   		if (--ci->i_fx_ref == 0)
> >   			last++;
> > @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode,
> >   		ci->i_subdirs = extra_info->nsubdirs;
> >   	}
> >   
> > -	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
> > +	if (!S_ISDIR(inode->i_mode) &&
> > +	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> >   		/* file layout may have changed */
> >   		s64 old_pool = ci->i_layout.pool_id;
> >   		struct ceph_string *old_ns;
> > @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode,
> >   		     ceph_cap_string(cap->issued),
> >   		     ceph_cap_string(newcaps),
> >   		     ceph_cap_string(revoking));
> > +
> > +		if (S_ISDIR(inode->i_mode) &&
> > +		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
> > +			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> > +			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> > +		}
> > +
> >   		if (S_ISREG(inode->i_mode) &&
> >   		    (revoking & used & CEPH_CAP_FILE_BUFFER))
> >   			writeback = true;  /* initiate writeback; will delay ack */
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 1e6cdf2dfe90..d4d7a277faf1 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file)
> >   	return err;
> >   }
> >   
> > +/* Clone the layout from a synchronous create, if the dir now has Dc caps */
> > +static void
> > +copy_file_layout(struct inode *dst, struct inode *src)
> > +{
> > +	struct ceph_inode_info *cdst = ceph_inode(dst);
> > +	struct ceph_inode_info *csrc = ceph_inode(src);
> > +
> > +	spin_lock(&cdst->i_ceph_lock);
> > +	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
> > +	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
> > +		memcpy(&cdst->i_layout, &csrc->i_layout,
> > +			sizeof(cdst->i_layout));
> 
> directory's i_layout is used for other purpose. shouldn't modify it.
> 

Yeah, I realized that just recently. I'll add a new field for this.
Maybe we can at least union it with something that is unused in
directories.

> > +		rcu_assign_pointer(cdst->i_layout.pool_ns,
> > +				   ceph_try_get_string(csrc->i_layout.pool_ns));
> > +		cdst->i_max_size = csrc->i_max_size; > +		cdst->i_truncate_size = csrc->i_truncate_size;
> no need to save above two. just set truncate size of new file to 
> (u64)-1, set max_size of new file to its layout.stripe_unit;
> 
> max_size == layout.strip_unit ensure that client only write to the first 
> object before its writeable range is persistent.
> 

ACK, I saw that in your userland client patchset. I'll fix that up.

Thanks,

> > +	}
> > +	spin_unlock(&cdst->i_ceph_lock);
> > +}
> >   
> >   /*
> >    * Do a lookup + open with a single request.  If we get a non-existent
> > @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >   	} else {
> >   		dout("atomic_open finish_open on dn %p\n", dn);
> >   		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> > -			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
> > +			struct inode *newino = d_inode(dentry);
> > +
> > +			copy_file_layout(dir, newino);
> > +			ceph_init_inode_acls(newino, &as_ctx);
> >   			file->f_mode |= FMODE_CREATED;
> >   		}
> >   		err = finish_open(file, dentry, ceph_open);
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 9cfc093fd273..8b51051b79b0 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
> >   		ci->i_subdirs = le64_to_cpu(info->subdirs);
> >   	}
> >   
> > -	if (new_version ||
> > -	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > +	if (!S_ISDIR(inode->i_mode) && (new_version ||
> > +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
> >   		s64 old_pool = ci->i_layout.pool_id;
> >   		struct ceph_string *old_ns;
> >   
> >
Yan, Zheng Jan. 13, 2020, 2:56 p.m. UTC | #5
On 1/13/20 9:26 PM, Jeff Layton wrote:
> On Mon, 2020-01-13 at 11:51 +0800, Yan, Zheng wrote:
>> On 1/11/20 4:56 AM, Jeff Layton wrote:
>>> It doesn't do much good to do an asynchronous create unless we can do
>>> I/O to it before the create reply comes in. That means we need layout
>>> info the new file before we've gotten the response from the MDS.
>>>
>>> All files created in a directory will initially inherit the same layout,
>>> so copy off the requisite info from the first synchronous create in the
>>> directory. Save it in the same fields in the directory inode, as those
>>> are otherwise unsed for dir inodes. This means we need to be a bit
>>> careful about only updating layout info on non-dir inodes.
>>>
>>> Also, zero out the layout when we drop Dc caps in the dir.
>>>
>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>> ---
>>>    fs/ceph/caps.c  | 24 ++++++++++++++++++++----
>>>    fs/ceph/file.c  | 24 +++++++++++++++++++++++-
>>>    fs/ceph/inode.c |  4 ++--
>>>    3 files changed, 45 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>> index 7fc87b693ba4..b96fb1378479 100644
>>> --- a/fs/ceph/caps.c
>>> +++ b/fs/ceph/caps.c
>>> @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>>>    			return ret;
>>>    		}
>>>    
>>> -		if (S_ISREG(ci->vfs_inode.i_mode) &&
>>> +		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
>>>    		    ci->i_inline_version != CEPH_INLINE_NONE &&
>>>    		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
>>>    		    i_size_read(inode) > 0) {
>>> @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
>>>    	if (had & CEPH_CAP_FILE_RD)
>>>    		if (--ci->i_rd_ref == 0)
>>>    			last++;
>>> -	if (had & CEPH_CAP_FILE_CACHE)
>>> -		if (--ci->i_rdcache_ref == 0)
>>> +	if (had & CEPH_CAP_FILE_CACHE) {
>>> +		if (--ci->i_rdcache_ref == 0) {
>>>    			last++;
>>> +			/* Zero out layout if we lost CREATE caps */
>>> +			if (S_ISDIR(inode->i_mode) &&
>>> +			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
>>> +				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
>>> +				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
>>> +			}
>>> +		}
>>> +	}
>>
>> should do this in __check_cap_issue
>>
> 
> That function doesn't get called from the put codepath. Suppose one task
> is setting up an async create and a Dc (DIR_CREATE) cap revoke races in.
> We zero out the layout and then the inode has a bogus layout set in it.
> 
> We can't wipe the cached layout until all of the Dc references are put.

how about:

get_caps_for_async_create() return the layout.
pass the returned layout into ceph_finish_async_open()



> 
>>>    	if (had & CEPH_CAP_FILE_EXCL)
>>>    		if (--ci->i_fx_ref == 0)
>>>    			last++;
>>> @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode,
>>>    		ci->i_subdirs = extra_info->nsubdirs;
>>>    	}
>>>    
>>> -	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
>>> +	if (!S_ISDIR(inode->i_mode) &&
>>> +	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
>>>    		/* file layout may have changed */
>>>    		s64 old_pool = ci->i_layout.pool_id;
>>>    		struct ceph_string *old_ns;
>>> @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode,
>>>    		     ceph_cap_string(cap->issued),
>>>    		     ceph_cap_string(newcaps),
>>>    		     ceph_cap_string(revoking));
>>> +
>>> +		if (S_ISDIR(inode->i_mode) &&
>>> +		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
>>> +			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
>>> +			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
>>> +		}
>>
>> same here
>>
>>> +
>>>    		if (S_ISREG(inode->i_mode) &&
>>>    		    (revoking & used & CEPH_CAP_FILE_BUFFER))
>>>    			writeback = true;  /* initiate writeback; will delay ack */
>>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>>> index 1e6cdf2dfe90..d4d7a277faf1 100644
>>> --- a/fs/ceph/file.c
>>> +++ b/fs/ceph/file.c
>>> @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file)
>>>    	return err;
>>>    }
>>>    
>>> +/* Clone the layout from a synchronous create, if the dir now has Dc caps */
>>> +static void
>>> +copy_file_layout(struct inode *dst, struct inode *src)
>>> +{
>>> +	struct ceph_inode_info *cdst = ceph_inode(dst);
>>> +	struct ceph_inode_info *csrc = ceph_inode(src);
>>> +
>>> +	spin_lock(&cdst->i_ceph_lock);
>>> +	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
>>> +	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
>>> +		memcpy(&cdst->i_layout, &csrc->i_layout,
>>> +			sizeof(cdst->i_layout));
>>> +		rcu_assign_pointer(cdst->i_layout.pool_ns,
>>> +				   ceph_try_get_string(csrc->i_layout.pool_ns));
>>> +		cdst->i_max_size = csrc->i_max_size;
>>> +		cdst->i_truncate_size = csrc->i_truncate_size;
>>> +	}
>>> +	spin_unlock(&cdst->i_ceph_lock);
>>> +}
>>>    
>>>    /*
>>>     * Do a lookup + open with a single request.  If we get a non-existent
>>> @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>>    	} else {
>>>    		dout("atomic_open finish_open on dn %p\n", dn);
>>>    		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
>>> -			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
>>> +			struct inode *newino = d_inode(dentry);
>>> +
>>> +			copy_file_layout(dir, newino);
>>> +			ceph_init_inode_acls(newino, &as_ctx);
>>>    			file->f_mode |= FMODE_CREATED;
>>>    		}
>>>    		err = finish_open(file, dentry, ceph_open);
>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>> index 9cfc093fd273..8b51051b79b0 100644
>>> --- a/fs/ceph/inode.c
>>> +++ b/fs/ceph/inode.c
>>> @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
>>>    		ci->i_subdirs = le64_to_cpu(info->subdirs);
>>>    	}
>>>    
>>> -	if (new_version ||
>>> -	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
>>> +	if (!S_ISDIR(inode->i_mode) && (new_version ||
>>> +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
>>>    		s64 old_pool = ci->i_layout.pool_id;
>>>    		struct ceph_string *old_ns;
>>>    
>>>
>
Jeff Layton Jan. 13, 2020, 3:13 p.m. UTC | #6
On Mon, 2020-01-13 at 22:56 +0800, Yan, Zheng wrote:
> On 1/13/20 9:26 PM, Jeff Layton wrote:
> > On Mon, 2020-01-13 at 11:51 +0800, Yan, Zheng wrote:
> > > On 1/11/20 4:56 AM, Jeff Layton wrote:
> > > > It doesn't do much good to do an asynchronous create unless we can do
> > > > I/O to it before the create reply comes in. That means we need layout
> > > > info the new file before we've gotten the response from the MDS.
> > > > 
> > > > All files created in a directory will initially inherit the same layout,
> > > > so copy off the requisite info from the first synchronous create in the
> > > > directory. Save it in the same fields in the directory inode, as those
> > > > are otherwise unsed for dir inodes. This means we need to be a bit
> > > > careful about only updating layout info on non-dir inodes.
> > > > 
> > > > Also, zero out the layout when we drop Dc caps in the dir.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >    fs/ceph/caps.c  | 24 ++++++++++++++++++++----
> > > >    fs/ceph/file.c  | 24 +++++++++++++++++++++++-
> > > >    fs/ceph/inode.c |  4 ++--
> > > >    3 files changed, 45 insertions(+), 7 deletions(-)
> > > > 
> > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > index 7fc87b693ba4..b96fb1378479 100644
> > > > --- a/fs/ceph/caps.c
> > > > +++ b/fs/ceph/caps.c
> > > > @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > > >    			return ret;
> > > >    		}
> > > >    
> > > > -		if (S_ISREG(ci->vfs_inode.i_mode) &&
> > > > +		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
> > > >    		    ci->i_inline_version != CEPH_INLINE_NONE &&
> > > >    		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
> > > >    		    i_size_read(inode) > 0) {
> > > > @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
> > > >    	if (had & CEPH_CAP_FILE_RD)
> > > >    		if (--ci->i_rd_ref == 0)
> > > >    			last++;
> > > > -	if (had & CEPH_CAP_FILE_CACHE)
> > > > -		if (--ci->i_rdcache_ref == 0)
> > > > +	if (had & CEPH_CAP_FILE_CACHE) {
> > > > +		if (--ci->i_rdcache_ref == 0) {
> > > >    			last++;
> > > > +			/* Zero out layout if we lost CREATE caps */
> > > > +			if (S_ISDIR(inode->i_mode) &&
> > > > +			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
> > > > +				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> > > > +				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> > > > +			}
> > > > +		}
> > > > +	}
> > > 
> > > should do this in __check_cap_issue
> > > 
> > 
> > That function doesn't get called from the put codepath. Suppose one task
> > is setting up an async create and a Dc (DIR_CREATE) cap revoke races in.
> > We zero out the layout and then the inode has a bogus layout set in it.
> > 
> > We can't wipe the cached layout until all of the Dc references are put.
> 
> how about:
> 
> get_caps_for_async_create() return the layout.
> pass the returned layout into ceph_finish_async_open()
> 

That still sounds racy.

What guarantees the stability of the cached layout while we're copying
it in get_caps_for_async_create()? I guess we could protect the new
cached layout field with the i_ceph_lock.

Is that a real improvement? I'm not sure.

> 
> 
> > > >    	if (had & CEPH_CAP_FILE_EXCL)
> > > >    		if (--ci->i_fx_ref == 0)
> > > >    			last++;
> > > > @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode,
> > > >    		ci->i_subdirs = extra_info->nsubdirs;
> > > >    	}
> > > >    
> > > > -	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
> > > > +	if (!S_ISDIR(inode->i_mode) &&
> > > > +	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > > >    		/* file layout may have changed */
> > > >    		s64 old_pool = ci->i_layout.pool_id;
> > > >    		struct ceph_string *old_ns;
> > > > @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode,
> > > >    		     ceph_cap_string(cap->issued),
> > > >    		     ceph_cap_string(newcaps),
> > > >    		     ceph_cap_string(revoking));
> > > > +
> > > > +		if (S_ISDIR(inode->i_mode) &&
> > > > +		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
> > > > +			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
> > > > +			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
> > > > +		}
> > > 
> > > same here
> > > 
> > > > +
> > > >    		if (S_ISREG(inode->i_mode) &&
> > > >    		    (revoking & used & CEPH_CAP_FILE_BUFFER))
> > > >    			writeback = true;  /* initiate writeback; will delay ack */
> > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > index 1e6cdf2dfe90..d4d7a277faf1 100644
> > > > --- a/fs/ceph/file.c
> > > > +++ b/fs/ceph/file.c
> > > > @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file)
> > > >    	return err;
> > > >    }
> > > >    
> > > > +/* Clone the layout from a synchronous create, if the dir now has Dc caps */
> > > > +static void
> > > > +copy_file_layout(struct inode *dst, struct inode *src)
> > > > +{
> > > > +	struct ceph_inode_info *cdst = ceph_inode(dst);
> > > > +	struct ceph_inode_info *csrc = ceph_inode(src);
> > > > +
> > > > +	spin_lock(&cdst->i_ceph_lock);
> > > > +	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
> > > > +	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
> > > > +		memcpy(&cdst->i_layout, &csrc->i_layout,
> > > > +			sizeof(cdst->i_layout));
> > > > +		rcu_assign_pointer(cdst->i_layout.pool_ns,
> > > > +				   ceph_try_get_string(csrc->i_layout.pool_ns));
> > > > +		cdst->i_max_size = csrc->i_max_size;
> > > > +		cdst->i_truncate_size = csrc->i_truncate_size;
> > > > +	}
> > > > +	spin_unlock(&cdst->i_ceph_lock);
> > > > +}
> > > >    
> > > >    /*
> > > >     * Do a lookup + open with a single request.  If we get a non-existent
> > > > @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > >    	} else {
> > > >    		dout("atomic_open finish_open on dn %p\n", dn);
> > > >    		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
> > > > -			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
> > > > +			struct inode *newino = d_inode(dentry);
> > > > +
> > > > +			copy_file_layout(dir, newino);
> > > > +			ceph_init_inode_acls(newino, &as_ctx);
> > > >    			file->f_mode |= FMODE_CREATED;
> > > >    		}
> > > >    		err = finish_open(file, dentry, ceph_open);
> > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > index 9cfc093fd273..8b51051b79b0 100644
> > > > --- a/fs/ceph/inode.c
> > > > +++ b/fs/ceph/inode.c
> > > > @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
> > > >    		ci->i_subdirs = le64_to_cpu(info->subdirs);
> > > >    	}
> > > >    
> > > > -	if (new_version ||
> > > > -	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
> > > > +	if (!S_ISDIR(inode->i_mode) && (new_version ||
> > > > +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
> > > >    		s64 old_pool = ci->i_layout.pool_id;
> > > >    		struct ceph_string *old_ns;
> > > >    
> > > >
Yan, Zheng Jan. 13, 2020, 4:37 p.m. UTC | #7
On 1/13/20 11:13 PM, Jeff Layton wrote:
> On Mon, 2020-01-13 at 22:56 +0800, Yan, Zheng wrote:
>> On 1/13/20 9:26 PM, Jeff Layton wrote:
>>> On Mon, 2020-01-13 at 11:51 +0800, Yan, Zheng wrote:
>>>> On 1/11/20 4:56 AM, Jeff Layton wrote:
>>>>> It doesn't do much good to do an asynchronous create unless we can do
>>>>> I/O to it before the create reply comes in. That means we need layout
>>>>> info the new file before we've gotten the response from the MDS.
>>>>>
>>>>> All files created in a directory will initially inherit the same layout,
>>>>> so copy off the requisite info from the first synchronous create in the
>>>>> directory. Save it in the same fields in the directory inode, as those
>>>>> are otherwise unsed for dir inodes. This means we need to be a bit
>>>>> careful about only updating layout info on non-dir inodes.
>>>>>
>>>>> Also, zero out the layout when we drop Dc caps in the dir.
>>>>>
>>>>> Signed-off-by: Jeff Layton <jlayton@kernel.org>
>>>>> ---
>>>>>     fs/ceph/caps.c  | 24 ++++++++++++++++++++----
>>>>>     fs/ceph/file.c  | 24 +++++++++++++++++++++++-
>>>>>     fs/ceph/inode.c |  4 ++--
>>>>>     3 files changed, 45 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>>> index 7fc87b693ba4..b96fb1378479 100644
>>>>> --- a/fs/ceph/caps.c
>>>>> +++ b/fs/ceph/caps.c
>>>>> @@ -2847,7 +2847,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>>>>>     			return ret;
>>>>>     		}
>>>>>     
>>>>> -		if (S_ISREG(ci->vfs_inode.i_mode) &&
>>>>> +		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
>>>>>     		    ci->i_inline_version != CEPH_INLINE_NONE &&
>>>>>     		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
>>>>>     		    i_size_read(inode) > 0) {
>>>>> @@ -2944,9 +2944,17 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
>>>>>     	if (had & CEPH_CAP_FILE_RD)
>>>>>     		if (--ci->i_rd_ref == 0)
>>>>>     			last++;
>>>>> -	if (had & CEPH_CAP_FILE_CACHE)
>>>>> -		if (--ci->i_rdcache_ref == 0)
>>>>> +	if (had & CEPH_CAP_FILE_CACHE) {
>>>>> +		if (--ci->i_rdcache_ref == 0) {
>>>>>     			last++;
>>>>> +			/* Zero out layout if we lost CREATE caps */
>>>>> +			if (S_ISDIR(inode->i_mode) &&
>>>>> +			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
>>>>> +				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
>>>>> +				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
>>>>> +			}
>>>>> +		}
>>>>> +	}
>>>>
>>>> should do this in __check_cap_issue
>>>>
>>>
>>> That function doesn't get called from the put codepath. Suppose one task
>>> is setting up an async create and a Dc (DIR_CREATE) cap revoke races in.
>>> We zero out the layout and then the inode has a bogus layout set in it.
>>>
>>> We can't wipe the cached layout until all of the Dc references are put.
>>
>> how about:
>>
>> get_caps_for_async_create() return the layout.
>> pass the returned layout into ceph_finish_async_open()
>>
> 
> That still sounds racy.
> 
> What guarantees the stability of the cached layout while we're copying
> it in get_caps_for_async_create()? I guess we could protect the new
> cached layout field with the i_ceph_lock.
> 

Yes. copy cached layout while holding i_ceph_lock after getting Fsc caps.

Besides, we need to make sure ceph_finish_async_open get called before 
dropping Fsc.

> Is that a real improvement? I'm not sure.
> 
I think it's more clean.







>>
>>
>>>>>     	if (had & CEPH_CAP_FILE_EXCL)
>>>>>     		if (--ci->i_fx_ref == 0)
>>>>>     			last++;
>>>>> @@ -3264,7 +3272,8 @@ static void handle_cap_grant(struct inode *inode,
>>>>>     		ci->i_subdirs = extra_info->nsubdirs;
>>>>>     	}
>>>>>     
>>>>> -	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
>>>>> +	if (!S_ISDIR(inode->i_mode) &&
>>>>> +	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
>>>>>     		/* file layout may have changed */
>>>>>     		s64 old_pool = ci->i_layout.pool_id;
>>>>>     		struct ceph_string *old_ns;
>>>>> @@ -3336,6 +3345,13 @@ static void handle_cap_grant(struct inode *inode,
>>>>>     		     ceph_cap_string(cap->issued),
>>>>>     		     ceph_cap_string(newcaps),
>>>>>     		     ceph_cap_string(revoking));
>>>>> +
>>>>> +		if (S_ISDIR(inode->i_mode) &&
>>>>> +		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
>>>>> +			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
>>>>> +			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
>>>>> +		}
>>>>
>>>> same here
>>>>
>>>>> +
>>>>>     		if (S_ISREG(inode->i_mode) &&
>>>>>     		    (revoking & used & CEPH_CAP_FILE_BUFFER))
>>>>>     			writeback = true;  /* initiate writeback; will delay ack */
>>>>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>>>>> index 1e6cdf2dfe90..d4d7a277faf1 100644
>>>>> --- a/fs/ceph/file.c
>>>>> +++ b/fs/ceph/file.c
>>>>> @@ -430,6 +430,25 @@ int ceph_open(struct inode *inode, struct file *file)
>>>>>     	return err;
>>>>>     }
>>>>>     
>>>>> +/* Clone the layout from a synchronous create, if the dir now has Dc caps */
>>>>> +static void
>>>>> +copy_file_layout(struct inode *dst, struct inode *src)
>>>>> +{
>>>>> +	struct ceph_inode_info *cdst = ceph_inode(dst);
>>>>> +	struct ceph_inode_info *csrc = ceph_inode(src);
>>>>> +
>>>>> +	spin_lock(&cdst->i_ceph_lock);
>>>>> +	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
>>>>> +	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
>>>>> +		memcpy(&cdst->i_layout, &csrc->i_layout,
>>>>> +			sizeof(cdst->i_layout));
>>>>> +		rcu_assign_pointer(cdst->i_layout.pool_ns,
>>>>> +				   ceph_try_get_string(csrc->i_layout.pool_ns));
>>>>> +		cdst->i_max_size = csrc->i_max_size;
>>>>> +		cdst->i_truncate_size = csrc->i_truncate_size;
>>>>> +	}
>>>>> +	spin_unlock(&cdst->i_ceph_lock);
>>>>> +}
>>>>>     
>>>>>     /*
>>>>>      * Do a lookup + open with a single request.  If we get a non-existent
>>>>> @@ -518,7 +537,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>>>>     	} else {
>>>>>     		dout("atomic_open finish_open on dn %p\n", dn);
>>>>>     		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
>>>>> -			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
>>>>> +			struct inode *newino = d_inode(dentry);
>>>>> +
>>>>> +			copy_file_layout(dir, newino);
>>>>> +			ceph_init_inode_acls(newino, &as_ctx);
>>>>>     			file->f_mode |= FMODE_CREATED;
>>>>>     		}
>>>>>     		err = finish_open(file, dentry, ceph_open);
>>>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>>>> index 9cfc093fd273..8b51051b79b0 100644
>>>>> --- a/fs/ceph/inode.c
>>>>> +++ b/fs/ceph/inode.c
>>>>> @@ -848,8 +848,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
>>>>>     		ci->i_subdirs = le64_to_cpu(info->subdirs);
>>>>>     	}
>>>>>     
>>>>> -	if (new_version ||
>>>>> -	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
>>>>> +	if (!S_ISDIR(inode->i_mode) && (new_version ||
>>>>> +	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
>>>>>     		s64 old_pool = ci->i_layout.pool_id;
>>>>>     		struct ceph_string *old_ns;
>>>>>     
>>>>>
>
diff mbox series

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7fc87b693ba4..b96fb1378479 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2847,7 +2847,7 @@  int ceph_get_caps(struct file *filp, int need, int want,
 			return ret;
 		}
 
-		if (S_ISREG(ci->vfs_inode.i_mode) &&
+		if (!S_ISDIR(ci->vfs_inode.i_mode) &&
 		    ci->i_inline_version != CEPH_INLINE_NONE &&
 		    (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
 		    i_size_read(inode) > 0) {
@@ -2944,9 +2944,17 @@  void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 	if (had & CEPH_CAP_FILE_RD)
 		if (--ci->i_rd_ref == 0)
 			last++;
-	if (had & CEPH_CAP_FILE_CACHE)
-		if (--ci->i_rdcache_ref == 0)
+	if (had & CEPH_CAP_FILE_CACHE) {
+		if (--ci->i_rdcache_ref == 0) {
 			last++;
+			/* Zero out layout if we lost CREATE caps */
+			if (S_ISDIR(inode->i_mode) &&
+			    !(__ceph_caps_issued(ci, NULL) & CEPH_CAP_DIR_CREATE)) {
+				ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+				memset(&ci->i_layout, 0, sizeof(ci->i_layout));
+			}
+		}
+	}
 	if (had & CEPH_CAP_FILE_EXCL)
 		if (--ci->i_fx_ref == 0)
 			last++;
@@ -3264,7 +3272,8 @@  static void handle_cap_grant(struct inode *inode,
 		ci->i_subdirs = extra_info->nsubdirs;
 	}
 
-	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+	if (!S_ISDIR(inode->i_mode) &&
+	    (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
 		/* file layout may have changed */
 		s64 old_pool = ci->i_layout.pool_id;
 		struct ceph_string *old_ns;
@@ -3336,6 +3345,13 @@  static void handle_cap_grant(struct inode *inode,
 		     ceph_cap_string(cap->issued),
 		     ceph_cap_string(newcaps),
 		     ceph_cap_string(revoking));
+
+		if (S_ISDIR(inode->i_mode) &&
+		    (revoking & CEPH_CAP_DIR_CREATE) && !ci->i_rdcache_ref) {
+			ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+			memset(&ci->i_layout, 0, sizeof(ci->i_layout));
+		}
+
 		if (S_ISREG(inode->i_mode) &&
 		    (revoking & used & CEPH_CAP_FILE_BUFFER))
 			writeback = true;  /* initiate writeback; will delay ack */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1e6cdf2dfe90..d4d7a277faf1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -430,6 +430,25 @@  int ceph_open(struct inode *inode, struct file *file)
 	return err;
 }
 
+/* Clone the layout from a synchronous create, if the dir now has Dc caps */
+static void
+copy_file_layout(struct inode *dst, struct inode *src)
+{
+	struct ceph_inode_info *cdst = ceph_inode(dst);
+	struct ceph_inode_info *csrc = ceph_inode(src);
+
+	spin_lock(&cdst->i_ceph_lock);
+	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
+	    !ceph_file_layout_is_valid(&cdst->i_layout)) {
+		memcpy(&cdst->i_layout, &csrc->i_layout,
+			sizeof(cdst->i_layout));
+		rcu_assign_pointer(cdst->i_layout.pool_ns,
+				   ceph_try_get_string(csrc->i_layout.pool_ns));
+		cdst->i_max_size = csrc->i_max_size;
+		cdst->i_truncate_size = csrc->i_truncate_size;
+	}
+	spin_unlock(&cdst->i_ceph_lock);
+}
 
 /*
  * Do a lookup + open with a single request.  If we get a non-existent
@@ -518,7 +537,10 @@  int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	} else {
 		dout("atomic_open finish_open on dn %p\n", dn);
 		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
-			ceph_init_inode_acls(d_inode(dentry), &as_ctx);
+			struct inode *newino = d_inode(dentry);
+
+			copy_file_layout(dir, newino);
+			ceph_init_inode_acls(newino, &as_ctx);
 			file->f_mode |= FMODE_CREATED;
 		}
 		err = finish_open(file, dentry, ceph_open);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9cfc093fd273..8b51051b79b0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -848,8 +848,8 @@  int ceph_fill_inode(struct inode *inode, struct page *locked_page,
 		ci->i_subdirs = le64_to_cpu(info->subdirs);
 	}
 
-	if (new_version ||
-	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+	if (!S_ISDIR(inode->i_mode) && (new_version ||
+	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)))) {
 		s64 old_pool = ci->i_layout.pool_id;
 		struct ceph_string *old_ns;