diff mbox series

ceph: try to queue a writeback if revoking fails

Message ID 20220428082949.11841-1-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series ceph: try to queue a writeback if revoking fails | expand

Commit Message

Xiubo Li April 28, 2022, 8:29 a.m. UTC
If the pagecache writeback has just finished and i_wrbuffer_ref
reaches zero, it will try to trigger ceph_check_caps(). But if
i_wrbuffer_ref is increased again by an mmap/cache write just
before ceph_check_caps() runs, then the Fwb revoke will fail.

We need to try to queue a writeback in this case instead of
waiting for the BDI's delayed work, which runs every 5 seconds,
to trigger the writeback.

URL: https://tracker.ceph.com/issues/55377
URL: https://tracker.ceph.com/issues/46904
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c  | 44 +++++++++++++++++++++++++++++++++++---------
 fs/ceph/super.h |  7 +++++++
 2 files changed, 42 insertions(+), 9 deletions(-)

Comments

Jeff Layton April 28, 2022, 11:10 a.m. UTC | #1
On Thu, 2022-04-28 at 16:29 +0800, Xiubo Li wrote:
> If the pagecaches writeback just finished and the i_wrbuffer_ref
> reaches zero it will try to trigger ceph_check_caps(). But if just
> before ceph_check_caps() the i_wrbuffer_ref could be increased
> again by mmap/cache write, then the Fwb revoke will fail.
> 
> We need to try to queue a writeback in this case instead of
> triggering the writeback by BDI's delayed work per 5 seconds.
> 
> URL: https://tracker.ceph.com/issues/55377
> URL: https://tracker.ceph.com/issues/46904
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/caps.c  | 44 +++++++++++++++++++++++++++++++++++---------
>  fs/ceph/super.h |  7 +++++++
>  2 files changed, 42 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 906c95d2a4ed..0c0c8f5ae3b3 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -1912,6 +1912,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  	struct rb_node *p;
>  	bool queue_invalidate = false;
>  	bool tried_invalidate = false;
> +	bool queue_writeback = false;
>  
>  	if (session)
>  		ceph_get_mds_session(session);
> @@ -2064,10 +2065,30 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		}
>  
>  		/* completed revocation? going down and there are no caps? */
> -		if (revoking && (revoking & cap_used) == 0) {
> -			dout("completed revocation of %s\n",
> -			     ceph_cap_string(cap->implemented & ~cap->issued));
> -			goto ack;
> +		if (revoking) {
> +			if ((revoking & cap_used) == 0) {
> +				dout("completed revocation of %s\n",
> +				      ceph_cap_string(cap->implemented & ~cap->issued));
> +				goto ack;
> +			}
> +
> +			/*
> +			 * If the "i_wrbuffer_ref" was increased by mmap or generic
> +			 * cache write just before the ceph_check_caps() is called,
> +			 * the Fb capability revoking will fail this time. Then we
> +			 * must wait for the BDI's delayed work to flush the dirty
> +			 * pages and to release the "i_wrbuffer_ref", which will cost
> +			 * at most 5 seconds. That means the MDS needs to wait at
> +			 * most 5 seconds to finished the Fb capability's revocation.
> +			 *
> +			 * Let's queue a writeback for it.
> +			 */
> +			if ((ci->i_last_caps &
> +			     (CEPH_CAP_FAKE_WRBUFFER | CEPH_CAP_FILE_BUFFER)) &&
> +			    ci->i_wrbuffer_ref && S_ISREG(inode->i_mode) &&
> +			    (revoking & CEPH_CAP_FILE_BUFFER)) {
> +				queue_writeback = true;
> +			}

Is i_last_caps really necessary? Its handling seems very complex and
it's not 100% clear to me what it's supposed to represent. I'm also not
crazy about the FAKE_WRBUFFER thing.

It seems to me that we ought to queue writeback anytime Fb is being
revoked and i_wrbuffer_ref is non-zero. Maybe something like this instead
would be simpler?

if (S_ISREG(inode->i_mode) && (revoking & CEPH_CAP_FILE_BUFFER) &&
    ci->i_wrbuffer_ref)
	queue_writeback = true;




>  		}
>  
>  		/* want more caps from mds? */
> @@ -2134,9 +2155,12 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>  		__cap_delay_requeue(mdsc, ci);
>  	}
>  
> +	ci->i_last_caps = 0;
>  	spin_unlock(&ci->i_ceph_lock);
>  
>  	ceph_put_mds_session(session);
> +	if (queue_writeback)
> +		ceph_queue_writeback(inode);
>  	if (queue_invalidate)
>  		ceph_queue_invalidate(inode);
>  }
> @@ -3084,16 +3108,16 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
>  		--ci->i_pin_ref;
>  	if (had & CEPH_CAP_FILE_RD)
>  		if (--ci->i_rd_ref == 0)
> -			last++;
> +			last |= CEPH_CAP_FILE_RD;
>  	if (had & CEPH_CAP_FILE_CACHE)
>  		if (--ci->i_rdcache_ref == 0)
> -			last++;
> +			last |= CEPH_CAP_FILE_CACHE;
>  	if (had & CEPH_CAP_FILE_EXCL)
>  		if (--ci->i_fx_ref == 0)
> -			last++;
> +			last |= CEPH_CAP_FILE_EXCL;
>  	if (had & CEPH_CAP_FILE_BUFFER) {
>  		if (--ci->i_wb_ref == 0) {
> -			last++;
> +			last |= CEPH_CAP_FILE_BUFFER;
>  			/* put the ref held by ceph_take_cap_refs() */
>  			put++;
>  			check_flushsnaps = true;
> @@ -3103,7 +3127,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
>  	}
>  	if (had & CEPH_CAP_FILE_WR) {
>  		if (--ci->i_wr_ref == 0) {
> -			last++;
> +			last |= CEPH_CAP_FILE_WR;
>  			check_flushsnaps = true;
>  			if (ci->i_wrbuffer_ref_head == 0 &&
>  			    ci->i_dirty_caps == 0 &&
> @@ -3131,6 +3155,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
>  			flushsnaps = 1;
>  		wake = 1;
>  	}
> +	ci->i_last_caps |= last;
>  	spin_unlock(&ci->i_ceph_lock);
>  
>  	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
> @@ -3193,6 +3218,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
>  	spin_lock(&ci->i_ceph_lock);
>  	ci->i_wrbuffer_ref -= nr;
>  	if (ci->i_wrbuffer_ref == 0) {
> +		ci->i_last_caps |= CEPH_CAP_FAKE_WRBUFFER;
>  		last = true;
>  		put++;
>  	}
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 73db7f6021f3..f275a41649af 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -362,6 +362,13 @@ struct ceph_inode_info {
>  	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
>  	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
>  
> +	/*
> +	 * The capabilities whose references reach to 0, and the bit
> +	 * (CEPH_CAP_BITS) is for i_wrbuffer_ref.
> +	 */
> +#define CEPH_CAP_FAKE_WRBUFFER (1 << CEPH_CAP_BITS)
> +	unsigned i_last_caps;
> +
>  	/*
>  	 * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty
>  	 * is protected by the mdsc->cap_dirty_lock, but each individual item
Xiubo Li April 28, 2022, 11:52 a.m. UTC | #2
On 4/28/22 7:10 PM, Jeff Layton wrote:
> On Thu, 2022-04-28 at 16:29 +0800, Xiubo Li wrote:
>> If the pagecaches writeback just finished and the i_wrbuffer_ref
>> reaches zero it will try to trigger ceph_check_caps(). But if just
>> before ceph_check_caps() the i_wrbuffer_ref could be increased
>> again by mmap/cache write, then the Fwb revoke will fail.
>>
>> We need to try to queue a writeback in this case instead of
>> triggering the writeback by BDI's delayed work per 5 seconds.
>>
>> URL: https://tracker.ceph.com/issues/55377
>> URL: https://tracker.ceph.com/issues/46904
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c  | 44 +++++++++++++++++++++++++++++++++++---------
>>   fs/ceph/super.h |  7 +++++++
>>   2 files changed, 42 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 906c95d2a4ed..0c0c8f5ae3b3 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -1912,6 +1912,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>>   	struct rb_node *p;
>>   	bool queue_invalidate = false;
>>   	bool tried_invalidate = false;
>> +	bool queue_writeback = false;
>>   
>>   	if (session)
>>   		ceph_get_mds_session(session);
>> @@ -2064,10 +2065,30 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>>   		}
>>   
>>   		/* completed revocation? going down and there are no caps? */
>> -		if (revoking && (revoking & cap_used) == 0) {
>> -			dout("completed revocation of %s\n",
>> -			     ceph_cap_string(cap->implemented & ~cap->issued));
>> -			goto ack;
>> +		if (revoking) {
>> +			if ((revoking & cap_used) == 0) {
>> +				dout("completed revocation of %s\n",
>> +				      ceph_cap_string(cap->implemented & ~cap->issued));
>> +				goto ack;
>> +			}
>> +
>> +			/*
>> +			 * If the "i_wrbuffer_ref" was increased by mmap or generic
>> +			 * cache write just before the ceph_check_caps() is called,
>> +			 * the Fb capability revoking will fail this time. Then we
>> +			 * must wait for the BDI's delayed work to flush the dirty
>> +			 * pages and to release the "i_wrbuffer_ref", which will cost
>> +			 * at most 5 seconds. That means the MDS needs to wait at
>> +			 * most 5 seconds to finished the Fb capability's revocation.
>> +			 *
>> +			 * Let's queue a writeback for it.
>> +			 */
>> +			if ((ci->i_last_caps &
>> +			     (CEPH_CAP_FAKE_WRBUFFER | CEPH_CAP_FILE_BUFFER)) &&
>> +			    ci->i_wrbuffer_ref && S_ISREG(inode->i_mode) &&
>> +			    (revoking & CEPH_CAP_FILE_BUFFER)) {
>> +				queue_writeback = true;
>> +			}
> Is i_last_caps really necessary? It's handling seems very complex and
> it's not 100% clear to me what it's supposed to represent. I'm also not
> crazy about the FAKE_WRBUFFER thing.
>
> It seems to me that we ought to queue writeback anytime Fb is being
> revoked and i_wrbuffer_ref is non 0. Maybe something like this instead
> would be simpler?
>
> if (S_ISREG(inode->i_mode) && (revoking & CEPH_CAP_FILE_BUFFER) &&
>      ci->i_wrbuffer_ref)
> 	queue_writeback = true;

I had just assumed that we should only do this when ceph_check_caps()
is called because the Fb caps ref reaches 0 in the mmap and generic
write cases.

Yeah, I think we should also queue the writeback in other cases.

I will fix it.

-- Xiubo

>
>
>>   		}
>>   
>>   		/* want more caps from mds? */
>> @@ -2134,9 +2155,12 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
>>   		__cap_delay_requeue(mdsc, ci);
>>   	}
>>   
>> +	ci->i_last_caps = 0;
>>   	spin_unlock(&ci->i_ceph_lock);
>>   
>>   	ceph_put_mds_session(session);
>> +	if (queue_writeback)
>> +		ceph_queue_writeback(inode);
>>   	if (queue_invalidate)
>>   		ceph_queue_invalidate(inode);
>>   }
>> @@ -3084,16 +3108,16 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
>>   		--ci->i_pin_ref;
>>   	if (had & CEPH_CAP_FILE_RD)
>>   		if (--ci->i_rd_ref == 0)
>> -			last++;
>> +			last |= CEPH_CAP_FILE_RD;
>>   	if (had & CEPH_CAP_FILE_CACHE)
>>   		if (--ci->i_rdcache_ref == 0)
>> -			last++;
>> +			last |= CEPH_CAP_FILE_CACHE;
>>   	if (had & CEPH_CAP_FILE_EXCL)
>>   		if (--ci->i_fx_ref == 0)
>> -			last++;
>> +			last |= CEPH_CAP_FILE_EXCL;
>>   	if (had & CEPH_CAP_FILE_BUFFER) {
>>   		if (--ci->i_wb_ref == 0) {
>> -			last++;
>> +			last |= CEPH_CAP_FILE_BUFFER;
>>   			/* put the ref held by ceph_take_cap_refs() */
>>   			put++;
>>   			check_flushsnaps = true;
>> @@ -3103,7 +3127,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
>>   	}
>>   	if (had & CEPH_CAP_FILE_WR) {
>>   		if (--ci->i_wr_ref == 0) {
>> -			last++;
>> +			last |= CEPH_CAP_FILE_WR;
>>   			check_flushsnaps = true;
>>   			if (ci->i_wrbuffer_ref_head == 0 &&
>>   			    ci->i_dirty_caps == 0 &&
>> @@ -3131,6 +3155,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
>>   			flushsnaps = 1;
>>   		wake = 1;
>>   	}
>> +	ci->i_last_caps |= last;
>>   	spin_unlock(&ci->i_ceph_lock);
>>   
>>   	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
>> @@ -3193,6 +3218,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
>>   	spin_lock(&ci->i_ceph_lock);
>>   	ci->i_wrbuffer_ref -= nr;
>>   	if (ci->i_wrbuffer_ref == 0) {
>> +		ci->i_last_caps |= CEPH_CAP_FAKE_WRBUFFER;
>>   		last = true;
>>   		put++;
>>   	}
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 73db7f6021f3..f275a41649af 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -362,6 +362,13 @@ struct ceph_inode_info {
>>   	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
>>   	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
>>   
>> +	/*
>> +	 * The capabilities whose references reach to 0, and the bit
>> +	 * (CEPH_CAP_BITS) is for i_wrbuffer_ref.
>> +	 */
>> +#define CEPH_CAP_FAKE_WRBUFFER (1 << CEPH_CAP_BITS)
>> +	unsigned i_last_caps;
>> +
>>   	/*
>>   	 * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty
>>   	 * is protected by the mdsc->cap_dirty_lock, but each individual item
diff mbox series

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 906c95d2a4ed..0c0c8f5ae3b3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1912,6 +1912,7 @@  void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct rb_node *p;
 	bool queue_invalidate = false;
 	bool tried_invalidate = false;
+	bool queue_writeback = false;
 
 	if (session)
 		ceph_get_mds_session(session);
@@ -2064,10 +2065,30 @@  void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		}
 
 		/* completed revocation? going down and there are no caps? */
-		if (revoking && (revoking & cap_used) == 0) {
-			dout("completed revocation of %s\n",
-			     ceph_cap_string(cap->implemented & ~cap->issued));
-			goto ack;
+		if (revoking) {
+			if ((revoking & cap_used) == 0) {
+				dout("completed revocation of %s\n",
+				      ceph_cap_string(cap->implemented & ~cap->issued));
+				goto ack;
+			}
+
+			/*
+			 * If the "i_wrbuffer_ref" was increased by mmap or generic
+			 * cache write just before the ceph_check_caps() is called,
+			 * the Fb capability revoking will fail this time. Then we
+			 * must wait for the BDI's delayed work to flush the dirty
+			 * pages and to release the "i_wrbuffer_ref", which will cost
+			 * at most 5 seconds. That means the MDS needs to wait at
+			 * most 5 seconds to finished the Fb capability's revocation.
+			 *
+			 * Let's queue a writeback for it.
+			 */
+			if ((ci->i_last_caps &
+			     (CEPH_CAP_FAKE_WRBUFFER | CEPH_CAP_FILE_BUFFER)) &&
+			    ci->i_wrbuffer_ref && S_ISREG(inode->i_mode) &&
+			    (revoking & CEPH_CAP_FILE_BUFFER)) {
+				queue_writeback = true;
+			}
 		}
 
 		/* want more caps from mds? */
@@ -2134,9 +2155,12 @@  void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		__cap_delay_requeue(mdsc, ci);
 	}
 
+	ci->i_last_caps = 0;
 	spin_unlock(&ci->i_ceph_lock);
 
 	ceph_put_mds_session(session);
+	if (queue_writeback)
+		ceph_queue_writeback(inode);
 	if (queue_invalidate)
 		ceph_queue_invalidate(inode);
 }
@@ -3084,16 +3108,16 @@  static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
 		--ci->i_pin_ref;
 	if (had & CEPH_CAP_FILE_RD)
 		if (--ci->i_rd_ref == 0)
-			last++;
+			last |= CEPH_CAP_FILE_RD;
 	if (had & CEPH_CAP_FILE_CACHE)
 		if (--ci->i_rdcache_ref == 0)
-			last++;
+			last |= CEPH_CAP_FILE_CACHE;
 	if (had & CEPH_CAP_FILE_EXCL)
 		if (--ci->i_fx_ref == 0)
-			last++;
+			last |= CEPH_CAP_FILE_EXCL;
 	if (had & CEPH_CAP_FILE_BUFFER) {
 		if (--ci->i_wb_ref == 0) {
-			last++;
+			last |= CEPH_CAP_FILE_BUFFER;
 			/* put the ref held by ceph_take_cap_refs() */
 			put++;
 			check_flushsnaps = true;
@@ -3103,7 +3127,7 @@  static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
 	}
 	if (had & CEPH_CAP_FILE_WR) {
 		if (--ci->i_wr_ref == 0) {
-			last++;
+			last |= CEPH_CAP_FILE_WR;
 			check_flushsnaps = true;
 			if (ci->i_wrbuffer_ref_head == 0 &&
 			    ci->i_dirty_caps == 0 &&
@@ -3131,6 +3155,7 @@  static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
 			flushsnaps = 1;
 		wake = 1;
 	}
+	ci->i_last_caps |= last;
 	spin_unlock(&ci->i_ceph_lock);
 
 	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
@@ -3193,6 +3218,7 @@  void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	spin_lock(&ci->i_ceph_lock);
 	ci->i_wrbuffer_ref -= nr;
 	if (ci->i_wrbuffer_ref == 0) {
+		ci->i_last_caps |= CEPH_CAP_FAKE_WRBUFFER;
 		last = true;
 		put++;
 	}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 73db7f6021f3..f275a41649af 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -362,6 +362,13 @@  struct ceph_inode_info {
 	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
 	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
 
+	/*
+	 * The capabilities whose references reach to 0, and the bit
+	 * (CEPH_CAP_BITS) is for i_wrbuffer_ref.
+	 */
+#define CEPH_CAP_FAKE_WRBUFFER (1 << CEPH_CAP_BITS)
+	unsigned i_last_caps;
+
 	/*
 	 * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty
 	 * is protected by the mdsc->cap_dirty_lock, but each individual item