[3/3] ceph: fix potential races in ceph_uninline_data

Message ID	20190710161154.26125-4-jlayton@kernel.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <ceph-devel-owner@kernel.org> From: Jeff Layton <jlayton@kernel.org> To: ceph-devel@vger.kernel.org Cc: zyan@redhat.com, idryomov@gmail.com, sage@redhat.com Subject: [PATCH 3/3] ceph: fix potential races in ceph_uninline_data Date: Wed, 10 Jul 2019 12:11:54 -0400 Message-Id: <20190710161154.26125-4-jlayton@kernel.org> In-Reply-To: <20190710161154.26125-1-jlayton@kernel.org> References: <20190710161154.26125-1-jlayton@kernel.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk
Series	ceph: fix races when uninlining data \| expand [0/3] ceph: fix races when uninlining data [1/3] ceph: make ceph_uninline_data take inode pointer [2/3] ceph: pass unlocked page to ceph_uninline_data [3/3] ceph: fix potential races in ceph_uninline_data

Message ID

20190710161154.26125-4-jlayton@kernel.org (mailing list archive)

State

New, archived

Headers

From: Jeff Layton <jlayton@kernel.org>
To: ceph-devel@vger.kernel.org
Cc: zyan@redhat.com, idryomov@gmail.com, sage@redhat.com
Subject: [PATCH 3/3] ceph: fix potential races in ceph_uninline_data
Date: Wed, 10 Jul 2019 12:11:54 -0400
Message-Id: <20190710161154.26125-4-jlayton@kernel.org>
In-Reply-To: <20190710161154.26125-1-jlayton@kernel.org>
References: <20190710161154.26125-1-jlayton@kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Sender: ceph-devel-owner@vger.kernel.org
Precedence: bulk

Series

ceph: fix races when uninlining data | expand

Commit Message

Jeff Layton July 10, 2019, 4:11 p.m. UTC

The current code will do the uninlining but it relies on the caller to
set the i_inline_version appropriately afterward. There are several
potential races here.

Protect against competing uninlining attempts by having the callers
take the i_truncate_mutex and then have them update the version
themselves before dropping it.

Other callers can then re-check the i_inline_version after acquiring the
mutex and if it has changed to CEPH_INLINE_NONE, they can just drop it
and do nothing.

Finally since we are doing a lockless check first in all cases, just
move that into ceph_uninline_data as well, and have the callers call
it unconditionally.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/addr.c | 33 ++++++++++++++++++++++++---------
 fs/ceph/file.c | 18 ++++++------------
 2 files changed, 30 insertions(+), 21 deletions(-)

Comments

Luis Henriques July 10, 2019, 6:03 p.m. UTC | #1

Jeff Layton <jlayton@kernel.org> writes:

> The current code will do the uninlining but it relies on the caller to
> set the i_inline_version appropriately afterward. There are several
> potential races here.
>
> Protect against competing uninlining attempts by having the callers
> take the i_truncate_mutex and then have them update the version
> themselves before dropping it.
>
> Other callers can then re-check the i_inline_version after acquiring the
> mutex and if it has changed to CEPH_INLINE_NONE, they can just drop it
> and do nothing.
>
> Finally since we are doing a lockless check first in all cases, just
> move that into ceph_uninline_data as well, and have the callers call
> it unconditionally.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/addr.c | 33 ++++++++++++++++++++++++---------
>  fs/ceph/file.c | 18 ++++++------------
>  2 files changed, 30 insertions(+), 21 deletions(-)
>
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 5f1e2b6577fb..e9700c997d12 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -1541,11 +1541,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  
>  	ceph_block_sigs(&oldset);
>  
> -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> -		err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> -		if (err < 0)
> -			goto out_free;
> -	}
> +	err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> +	if (err < 0)
> +		goto out_free;
>  
>  	if (off + PAGE_SIZE <= size)
>  		len = PAGE_SIZE;
> @@ -1593,7 +1591,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>  	    ci->i_inline_version != CEPH_INLINE_NONE) {
>  		int dirty;
>  		spin_lock(&ci->i_ceph_lock);
> -		ci->i_inline_version = CEPH_INLINE_NONE;
>  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
>  					       &prealloc_cf);
>  		spin_unlock(&ci->i_ceph_lock);
> @@ -1656,6 +1653,10 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
>  	}
>  }
>  
> +/*
> + * We borrow the i_truncate_mutex to serialize callers that may be racing to
> + * uninline the data.
> + */
>  int ceph_uninline_data(struct inode *inode, struct page *page)
>  {
>  	struct ceph_inode_info *ci = ceph_inode(inode);
> @@ -1665,15 +1666,23 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
>  	int err = 0;
>  	bool from_pagecache = false;
>  
> -	spin_lock(&ci->i_ceph_lock);
> -	inline_version = ci->i_inline_version;
> -	spin_unlock(&ci->i_ceph_lock);
> +	/* Do a lockless check first -- paired with i_ceph_lock for changes */
> +	inline_version = READ_ONCE(ci->i_inline_version);
>  
>  	dout("uninline_data %p %llx.%llx inline_version %llu\n",
>  	     inode, ceph_vinop(inode), inline_version);
>  
>  	if (inline_version == 1 || /* initial version, no data */
>  	    inline_version == CEPH_INLINE_NONE)
> +		return 0;

We may need to do the unlock_page(page) before returning.

> + + mutex_lock(&ci->i_truncate_mutex); + + /* Double check the version
> after taking mutex */ + spin_lock(&ci->i_ceph_lock); + inline_version
> = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + if
> (inline_version == CEPH_INLINE_NONE) goto out;
>  
>  	if (page) {
> @@ -1770,11 +1779,17 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
>  	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>  	if (!err)
>  		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> +	if (!err) {
> +		spin_lock(&ci->i_ceph_lock);
> +		inline_version = CEPH_INLINE_NONE;

Shouldn't this be ci->i_inline_version = CEPH_INLINE_NONE ?  Or maybe
both since the dout() below uses inline_version.

Cheers,

Jeff Layton July 10, 2019, 6:34 p.m. UTC | #2

On Wed, 2019-07-10 at 19:03 +0100, Luis Henriques wrote:
> Jeff Layton <jlayton@kernel.org> writes:
> 
> > The current code will do the uninlining but it relies on the caller to
> > set the i_inline_version appropriately afterward. There are several
> > potential races here.
> > 
> > Protect against competing uninlining attempts by having the callers
> > take the i_truncate_mutex and then have them update the version
> > themselves before dropping it.
> > 
> > Other callers can then re-check the i_inline_version after acquiring the
> > mutex and if it has changed to CEPH_INLINE_NONE, they can just drop it
> > and do nothing.
> > 
> > Finally since we are doing a lockless check first in all cases, just
> > move that into ceph_uninline_data as well, and have the callers call
> > it unconditionally.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/addr.c | 33 ++++++++++++++++++++++++---------
> >  fs/ceph/file.c | 18 ++++++------------
> >  2 files changed, 30 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > index 5f1e2b6577fb..e9700c997d12 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -1541,11 +1541,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
> >  
> >  	ceph_block_sigs(&oldset);
> >  
> > -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> > -		err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> > -		if (err < 0)
> > -			goto out_free;
> > -	}
> > +	err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> > +	if (err < 0)
> > +		goto out_free;
> >  
> >  	if (off + PAGE_SIZE <= size)
> >  		len = PAGE_SIZE;
> > @@ -1593,7 +1591,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
> >  	    ci->i_inline_version != CEPH_INLINE_NONE) {
> >  		int dirty;
> >  		spin_lock(&ci->i_ceph_lock);
> > -		ci->i_inline_version = CEPH_INLINE_NONE;
> >  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
> >  					       &prealloc_cf);
> >  		spin_unlock(&ci->i_ceph_lock);
> > @@ -1656,6 +1653,10 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
> >  	}
> >  }
> >  
> > +/*
> > + * We borrow the i_truncate_mutex to serialize callers that may be racing to
> > + * uninline the data.
> > + */
> >  int ceph_uninline_data(struct inode *inode, struct page *page)
> >  {
> >  	struct ceph_inode_info *ci = ceph_inode(inode);
> > @@ -1665,15 +1666,23 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
> >  	int err = 0;
> >  	bool from_pagecache = false;
> >  
> > -	spin_lock(&ci->i_ceph_lock);
> > -	inline_version = ci->i_inline_version;
> > -	spin_unlock(&ci->i_ceph_lock);
> > +	/* Do a lockless check first -- paired with i_ceph_lock for changes */
> > +	inline_version = READ_ONCE(ci->i_inline_version);
> >  
> >  	dout("uninline_data %p %llx.%llx inline_version %llu\n",
> >  	     inode, ceph_vinop(inode), inline_version);
> >  
> >  	if (inline_version == 1 || /* initial version, no data */
> >  	    inline_version == CEPH_INLINE_NONE)
> > +		return 0;
> 
> We may need to do the unlock_page(page) before returning.
> 

No, we shouldn't here, because of patch #2.

> > + + mutex_lock(&ci->i_truncate_mutex); + + /* Double check the version
> > after taking mutex */ + spin_lock(&ci->i_ceph_lock); + inline_version
> > = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + if
> > (inline_version == CEPH_INLINE_NONE) goto out;
> >  
> >  	if (page) {
> > @@ -1770,11 +1779,17 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
> >  	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> >  	if (!err)
> >  		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > +	if (!err) {
> > +		spin_lock(&ci->i_ceph_lock);
> > +		inline_version = CEPH_INLINE_NONE;
> 
> Shouldn't this be ci->i_inline_version = CEPH_INLINE_NONE ?  Or maybe
> both since the dout() below uses inline_version.
> 

Yes!

Jeff Layton July 10, 2019, 6:48 p.m. UTC | #3

On Wed, 2019-07-10 at 19:03 +0100, Luis Henriques wrote:
> Jeff Layton <jlayton@kernel.org> writes:
> 
> > The current code will do the uninlining but it relies on the caller to
> > set the i_inline_version appropriately afterward. There are several
> > potential races here.
> > 
> > Protect against competing uninlining attempts by having the callers
> > take the i_truncate_mutex and then have them update the version
> > themselves before dropping it.
> > 
> > Other callers can then re-check the i_inline_version after acquiring the
> > mutex and if it has changed to CEPH_INLINE_NONE, they can just drop it
> > and do nothing.
> > 
> > Finally since we are doing a lockless check first in all cases, just
> > move that into ceph_uninline_data as well, and have the callers call
> > it unconditionally.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/addr.c | 33 ++++++++++++++++++++++++---------
> >  fs/ceph/file.c | 18 ++++++------------
> >  2 files changed, 30 insertions(+), 21 deletions(-)
> > 
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > index 5f1e2b6577fb..e9700c997d12 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -1541,11 +1541,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
> >  
> >  	ceph_block_sigs(&oldset);
> >  
> > -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
> > -		err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> > -		if (err < 0)
> > -			goto out_free;
> > -	}
> > +	err = ceph_uninline_data(inode, off == 0 ? page : NULL);
> > +	if (err < 0)
> > +		goto out_free;
> >  
> >  	if (off + PAGE_SIZE <= size)
> >  		len = PAGE_SIZE;
> > @@ -1593,7 +1591,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
> >  	    ci->i_inline_version != CEPH_INLINE_NONE) {
> >  		int dirty;
> >  		spin_lock(&ci->i_ceph_lock);
> > -		ci->i_inline_version = CEPH_INLINE_NONE;
> >  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
> >  					       &prealloc_cf);
> >  		spin_unlock(&ci->i_ceph_lock);
> > @@ -1656,6 +1653,10 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
> >  	}
> >  }
> >  
> > +/*
> > + * We borrow the i_truncate_mutex to serialize callers that may be racing to
> > + * uninline the data.
> > + */
> >  int ceph_uninline_data(struct inode *inode, struct page *page)
> >  {
> >  	struct ceph_inode_info *ci = ceph_inode(inode);
> > @@ -1665,15 +1666,23 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
> >  	int err = 0;
> >  	bool from_pagecache = false;
> >  
> > -	spin_lock(&ci->i_ceph_lock);
> > -	inline_version = ci->i_inline_version;
> > -	spin_unlock(&ci->i_ceph_lock);
> > +	/* Do a lockless check first -- paired with i_ceph_lock for changes */
> > +	inline_version = READ_ONCE(ci->i_inline_version);
> >  
> >  	dout("uninline_data %p %llx.%llx inline_version %llu\n",
> >  	     inode, ceph_vinop(inode), inline_version);
> >  
> >  	if (inline_version == 1 || /* initial version, no data */
> >  	    inline_version == CEPH_INLINE_NONE)
> > +		return 0;
> 
> We may need to do the unlock_page(page) before returning.
> 

We don't need to do that, but if the value is 1, then the existing code
will reset it to CEPH_INLINE_NONE even though it doesn't inline
anything. I'll fix that up for v2.

Thanks for the review so far!

> > + + mutex_lock(&ci->i_truncate_mutex); + + /* Double check the version
> > after taking mutex */ + spin_lock(&ci->i_ceph_lock); + inline_version
> > = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + if
> > (inline_version == CEPH_INLINE_NONE) goto out;
> >  
> >  	if (page) {
> > @@ -1770,11 +1779,17 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
> >  	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
> >  	if (!err)
> >  		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> > +	if (!err) {
> > +		spin_lock(&ci->i_ceph_lock);
> > +		inline_version = CEPH_INLINE_NONE;
> 
> Shouldn't this be ci->i_inline_version = CEPH_INLINE_NONE ?  Or maybe
> both since the dout() below uses inline_version.
> 
> Cheers,

Luis Henriques July 11, 2019, 9:42 a.m. UTC | #4

"Jeff Layton" <jlayton@kernel.org> writes:

> On Wed, 2019-07-10 at 19:03 +0100, Luis Henriques wrote:
>> Jeff Layton <jlayton@kernel.org> writes:
>> 
>> > The current code will do the uninlining but it relies on the caller to
>> > set the i_inline_version appropriately afterward. There are several
>> > potential races here.
>> > 
>> > Protect against competing uninlining attempts by having the callers
>> > take the i_truncate_mutex and then have them update the version
>> > themselves before dropping it.
>> > 
>> > Other callers can then re-check the i_inline_version after acquiring the
>> > mutex and if it has changed to CEPH_INLINE_NONE, they can just drop it
>> > and do nothing.
>> > 
>> > Finally since we are doing a lockless check first in all cases, just
>> > move that into ceph_uninline_data as well, and have the callers call
>> > it unconditionally.
>> > 
>> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
>> > ---
>> >  fs/ceph/addr.c | 33 ++++++++++++++++++++++++---------
>> >  fs/ceph/file.c | 18 ++++++------------
>> >  2 files changed, 30 insertions(+), 21 deletions(-)
>> > 
>> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>> > index 5f1e2b6577fb..e9700c997d12 100644
>> > --- a/fs/ceph/addr.c
>> > +++ b/fs/ceph/addr.c
>> > @@ -1541,11 +1541,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>> >  
>> >  	ceph_block_sigs(&oldset);
>> >  
>> > -	if (ci->i_inline_version != CEPH_INLINE_NONE) {
>> > -		err = ceph_uninline_data(inode, off == 0 ? page : NULL);
>> > -		if (err < 0)
>> > -			goto out_free;
>> > -	}
>> > +	err = ceph_uninline_data(inode, off == 0 ? page : NULL);
>> > +	if (err < 0)
>> > +		goto out_free;
>> >  
>> >  	if (off + PAGE_SIZE <= size)
>> >  		len = PAGE_SIZE;
>> > @@ -1593,7 +1591,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
>> >  	    ci->i_inline_version != CEPH_INLINE_NONE) {
>> >  		int dirty;
>> >  		spin_lock(&ci->i_ceph_lock);
>> > -		ci->i_inline_version = CEPH_INLINE_NONE;
>> >  		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
>> >  					       &prealloc_cf);
>> >  		spin_unlock(&ci->i_ceph_lock);
>> > @@ -1656,6 +1653,10 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
>> >  	}
>> >  }
>> >  
>> > +/*
>> > + * We borrow the i_truncate_mutex to serialize callers that may be racing to
>> > + * uninline the data.
>> > + */
>> >  int ceph_uninline_data(struct inode *inode, struct page *page)
>> >  {
>> >  	struct ceph_inode_info *ci = ceph_inode(inode);
>> > @@ -1665,15 +1666,23 @@ int ceph_uninline_data(struct inode *inode, struct page *page)
>> >  	int err = 0;
>> >  	bool from_pagecache = false;
>> >  
>> > -	spin_lock(&ci->i_ceph_lock);
>> > -	inline_version = ci->i_inline_version;
>> > -	spin_unlock(&ci->i_ceph_lock);
>> > +	/* Do a lockless check first -- paired with i_ceph_lock for changes */
>> > +	inline_version = READ_ONCE(ci->i_inline_version);
>> >  
>> >  	dout("uninline_data %p %llx.%llx inline_version %llu\n",
>> >  	     inode, ceph_vinop(inode), inline_version);
>> >  
>> >  	if (inline_version == 1 || /* initial version, no data */
>> >  	    inline_version == CEPH_INLINE_NONE)
>> > +		return 0;
>> 
>> We may need to do the unlock_page(page) before returning.
>> 
>
> No, we shouldn't here, because of patch #2.

Doh!  Right, sorry.

Regarding your earlier inquire (on IRC) about copy_file_range handling
of inline data: a quick look shows that it does not really handle it.  I
guess that the right thing to do is to simply return -EOPNOTSUPP if
either src or dst inodes have i_inline_version = CEPH_INLINE_NONE.

In practice, this may happen already because due to:

	if (len < src_ci->i_layout.object_size)
        	return -EOPNOTSUPP;

but I couldn't dig all the details on how big the inlined data can be.
I could see client_max_inline_size being mentioned in the docs, but at a
first glance this isn't enforced by the MDS but only validated by the
fuse client (I could be wrong, as I just did a quick grep in the code).

Anyway, I'll soon send a patch to do an explicit check for inline data
in the cfr implementation.

Cheers,

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5f1e2b6577fb..e9700c997d12 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1541,11 +1541,9 @@  static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 
 	ceph_block_sigs(&oldset);
 
-	if (ci->i_inline_version != CEPH_INLINE_NONE) {
-		err = ceph_uninline_data(inode, off == 0 ? page : NULL);
-		if (err < 0)
-			goto out_free;
-	}
+	err = ceph_uninline_data(inode, off == 0 ? page : NULL);
+	if (err < 0)
+		goto out_free;
 
 	if (off + PAGE_SIZE <= size)
 		len = PAGE_SIZE;
@@ -1593,7 +1591,6 @@  static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 	    ci->i_inline_version != CEPH_INLINE_NONE) {
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
-		ci->i_inline_version = CEPH_INLINE_NONE;
 		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
 					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
@@ -1656,6 +1653,10 @@  void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
 	}
 }
 
+/*
+ * We borrow the i_truncate_mutex to serialize callers that may be racing to
+ * uninline the data.
+ */
 int ceph_uninline_data(struct inode *inode, struct page *page)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1665,15 +1666,23 @@  int ceph_uninline_data(struct inode *inode, struct page *page)
 	int err = 0;
 	bool from_pagecache = false;
 
-	spin_lock(&ci->i_ceph_lock);
-	inline_version = ci->i_inline_version;
-	spin_unlock(&ci->i_ceph_lock);
+	/* Do a lockless check first -- paired with i_ceph_lock for changes */
+	inline_version = READ_ONCE(ci->i_inline_version);
 
 	dout("uninline_data %p %llx.%llx inline_version %llu\n",
 	     inode, ceph_vinop(inode), inline_version);
 
 	if (inline_version == 1 || /* initial version, no data */
 	    inline_version == CEPH_INLINE_NONE)
+		return 0;
+
+	mutex_lock(&ci->i_truncate_mutex);
+
+	/* Double check the version after taking mutex */
+	spin_lock(&ci->i_ceph_lock);
+	inline_version = ci->i_inline_version;
+	spin_unlock(&ci->i_ceph_lock);
+	if (inline_version == CEPH_INLINE_NONE)
 		goto out;
 
 	if (page) {
@@ -1770,11 +1779,17 @@  int ceph_uninline_data(struct inode *inode, struct page *page)
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
+	if (!err) {
+		spin_lock(&ci->i_ceph_lock);
+		inline_version = CEPH_INLINE_NONE;
+		spin_unlock(&ci->i_ceph_lock);
+	}
 out_put:
 	ceph_osdc_put_request(req);
 	if (err == -ECANCELED)
 		err = 0;
 out:
+	mutex_unlock(&ci->i_truncate_mutex);
 	if (page) {
 		unlock_page(page);
 		if (from_pagecache)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7bb090fa99d3..3ff83135562c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1438,11 +1438,9 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	inode_inc_iversion_raw(inode);
 
-	if (ci->i_inline_version != CEPH_INLINE_NONE) {
-		err = ceph_uninline_data(inode, NULL);
-		if (err < 0)
-			goto out;
-	}
+	err = ceph_uninline_data(inode, NULL);
+	if (err < 0)
+		goto out;
 
 	/* FIXME: not complete since it doesn't account for being at quota */
 	if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) {
@@ -1513,7 +1511,6 @@  static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		int dirty;
 
 		spin_lock(&ci->i_ceph_lock);
-		ci->i_inline_version = CEPH_INLINE_NONE;
 		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
 					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
@@ -1762,11 +1759,9 @@  static long ceph_fallocate(struct file *file, int mode,
 		goto unlock;
 	}
 
-	if (ci->i_inline_version != CEPH_INLINE_NONE) {
-		ret = ceph_uninline_data(inode, NULL);
-		if (ret < 0)
-			goto unlock;
-	}
+	ret = ceph_uninline_data(inode, NULL);
+	if (ret < 0)
+		goto unlock;
 
 	size = i_size_read(inode);
 
@@ -1790,7 +1785,6 @@  static long ceph_fallocate(struct file *file, int mode,
 
 	if (!ret) {
 		spin_lock(&ci->i_ceph_lock);
-		ci->i_inline_version = CEPH_INLINE_NONE;
 		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
 					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);

[3/3] ceph: fix potential races in ceph_uninline_data

Commit Message

Comments

Patch