[2/2] xfs: hard limit the background CIL push
diff mbox series

Message ID 20190909015159.19662-3-david@fromorbit.com
State New
Headers show
Series
  • xfs: hard limit background CIL push size
Related show

Commit Message

Dave Chinner Sept. 9, 2019, 1:51 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

In certain situations the background CIL push can be indefinitely
delayed. While we have workarounds from the obvious cases now, it
doesn't solve the underlying issue. This issue is that there is no
upper limit on the CIL where we will either force or wait for
a background push to start, hence allowing the CIL to grow without
bound until it consumes all log space.

To fix this, add a new wait queue to the CIL which allows background
pushes to wait for the CIL context to be switched out. This happens
when the push starts, so it will allow us to block incoming
transaction commit completion until the push has started. This will
only affect processes that are running modifications, and only when
the CIL threshold has been significantly overrun.

This has no apparent impact on performance, and doesn't even trigger
until over 45 million inodes had been created in a 16-way fsmark
test on a 2GB log. That was limiting at 64MB of log space used, so
the active CIL size is only about 3% of the total log in that case.
The concurrent removal of those files did not trigger the background
sleep at all.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log_cil.c  | 30 +++++++++++++++++++++++++-----
 fs/xfs/xfs_log_priv.h |  1 +
 2 files changed, 26 insertions(+), 5 deletions(-)

Comments

Darrick J. Wong Sept. 16, 2019, 4:42 p.m. UTC | #1
On Mon, Sep 09, 2019 at 11:51:59AM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> In certain situations the background CIL push can be indefinitely
> delayed. While we have workarounds from the obvious cases now, it
> doesn't solve the underlying issue. This issue is that there is no
> upper limit on the CIL where we will either force or wait for
> a background push to start, hence allowing the CIL to grow without
> bound until it consumes all log space.
> 
> To fix this, add a new wait queue to the CIL which allows background
> pushes to wait for the CIL context to be switched out. This happens
> when the push starts, so it will allow us to block incoming
> transaction commit completion until the push has started. This will
> only affect processes that are running modifications, and only when
> the CIL threshold has been significantly overrun.
> 
> This has no apparent impact on performance, and doesn't even trigger
> until over 45 million inodes had been created in a 16-way fsmark
> test on a 2GB log. That was limiting at 64MB of log space used, so
> the active CIL size is only about 3% of the total log in that case.
> The concurrent removal of those files did not trigger the background
> sleep at all.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log_cil.c  | 30 +++++++++++++++++++++++++-----
>  fs/xfs/xfs_log_priv.h |  1 +
>  2 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index ef652abd112c..eec9b32f5e08 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -670,6 +670,11 @@ xlog_cil_push(
>  	push_seq = cil->xc_push_seq;
>  	ASSERT(push_seq <= ctx->sequence);
>  
> +	/*
> +	 * Wake up any background push waiters now this context is being pushed.
> +	 */
> +	wake_up_all(&ctx->push_wait);
> +
>  	/*
>  	 * Check if we've anything to push. If there is nothing, then we don't
>  	 * move on to a new sequence number and so we have to be able to push
> @@ -746,6 +751,7 @@ xlog_cil_push(
>  	 */
>  	INIT_LIST_HEAD(&new_ctx->committing);
>  	INIT_LIST_HEAD(&new_ctx->busy_extents);
> +	init_waitqueue_head(&new_ctx->push_wait);
>  	new_ctx->sequence = ctx->sequence + 1;
>  	new_ctx->cil = cil;
>  	cil->xc_ctx = new_ctx;
> @@ -898,7 +904,7 @@ xlog_cil_push_work(
>   * checkpoint), but commit latency and memory usage limit this to a smaller
>   * size.
>   */
> -static void
> +static bool
>  xlog_cil_push_background(
>  	struct xlog	*log)
>  {
> @@ -915,14 +921,28 @@ xlog_cil_push_background(
>  	 * space available yet.
>  	 */
>  	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> -		return;
> +		return false;
>  
>  	spin_lock(&cil->xc_push_lock);
>  	if (cil->xc_push_seq < cil->xc_current_sequence) {
>  		cil->xc_push_seq = cil->xc_current_sequence;
>  		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
>  	}
> +
> +	/*
> +	 * If we are well over the space limit, throttle the work that is being
> +	 * done until the push work on this context has begun. This will prevent
> +	 * the CIL from violating maximum transaction size limits if the CIL
> +	 * push is delayed for some reason.
> +	 */
> +	if (cil->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log) * 2) {
> +		up_read(&cil->xc_ctx_lock);
> +		trace_printk("CIL space used %d", cil->xc_ctx->space_used);

Needs a real tracepoint before this drops RFC status.

> +		xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
> +		return true;
> +	}
>  	spin_unlock(&cil->xc_push_lock);
> +	return false;
>  
>  }
>  
> @@ -1038,9 +1058,8 @@ xfs_log_commit_cil(
>  		if (lip->li_ops->iop_committing)
>  			lip->li_ops->iop_committing(lip, xc_commit_lsn);
>  	}
> -	xlog_cil_push_background(log);
> -
> -	up_read(&cil->xc_ctx_lock);
> +	if (!xlog_cil_push_background(log))
> +		up_read(&cil->xc_ctx_lock);

Hmmmm... the return value here tell us if ctx_lock has been dropped.
/me wonders if this would be cleaner if xlog_cil_push_background
returned having called up_read...?

--D

>  }
>  
>  /*
> @@ -1199,6 +1218,7 @@ xlog_cil_init(
>  
>  	INIT_LIST_HEAD(&ctx->committing);
>  	INIT_LIST_HEAD(&ctx->busy_extents);
> +	init_waitqueue_head(&ctx->push_wait);
>  	ctx->sequence = 1;
>  	ctx->cil = cil;
>  	cil->xc_ctx = ctx;
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 187a43ffeaf7..466259fd1e4a 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -247,6 +247,7 @@ struct xfs_cil_ctx {
>  	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
>  	struct list_head	iclog_entry;
>  	struct list_head	committing;	/* ctx committing list */
> +	wait_queue_head_t	push_wait;	/* background push throttle */
>  	struct work_struct	discard_endio_work;
>  };
>  
> -- 
> 2.23.0.rc1
>
Dave Chinner Sept. 24, 2019, 10:36 p.m. UTC | #2
On Mon, Sep 16, 2019 at 09:42:55AM -0700, Darrick J. Wong wrote:
> On Mon, Sep 09, 2019 at 11:51:59AM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > In certain situations the background CIL push can be indefinitely
> > delayed. While we have workarounds from the obvious cases now, it
> > doesn't solve the underlying issue. This issue is that there is no
> > upper limit on the CIL where we will either force or wait for
> > a background push to start, hence allowing the CIL to grow without
> > bound until it consumes all log space.
> > 
> > To fix this, add a new wait queue to the CIL which allows background
> > pushes to wait for the CIL context to be switched out. This happens
> > when the push starts, so it will allow us to block incoming
> > transaction commit completion until the push has started. This will
> > only affect processes that are running modifications, and only when
> > the CIL threshold has been significantly overrun.
> > 
> > This has no apparent impact on performance, and doesn't even trigger
> > until over 45 million inodes had been created in a 16-way fsmark
> > test on a 2GB log. That was limiting at 64MB of log space used, so
> > the active CIL size is only about 3% of the total log in that case.
> > The concurrent removal of those files did not trigger the background
> > sleep at all.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_log_cil.c  | 30 +++++++++++++++++++++++++-----
> >  fs/xfs/xfs_log_priv.h |  1 +
> >  2 files changed, 26 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> > index ef652abd112c..eec9b32f5e08 100644
> > --- a/fs/xfs/xfs_log_cil.c
> > +++ b/fs/xfs/xfs_log_cil.c
> > @@ -670,6 +670,11 @@ xlog_cil_push(
> >  	push_seq = cil->xc_push_seq;
> >  	ASSERT(push_seq <= ctx->sequence);
> >  
> > +	/*
> > +	 * Wake up any background push waiters now this context is being pushed.
> > +	 */
> > +	wake_up_all(&ctx->push_wait);
> > +
> >  	/*
> >  	 * Check if we've anything to push. If there is nothing, then we don't
> >  	 * move on to a new sequence number and so we have to be able to push
> > @@ -746,6 +751,7 @@ xlog_cil_push(
> >  	 */
> >  	INIT_LIST_HEAD(&new_ctx->committing);
> >  	INIT_LIST_HEAD(&new_ctx->busy_extents);
> > +	init_waitqueue_head(&new_ctx->push_wait);
> >  	new_ctx->sequence = ctx->sequence + 1;
> >  	new_ctx->cil = cil;
> >  	cil->xc_ctx = new_ctx;
> > @@ -898,7 +904,7 @@ xlog_cil_push_work(
> >   * checkpoint), but commit latency and memory usage limit this to a smaller
> >   * size.
> >   */
> > -static void
> > +static bool
> >  xlog_cil_push_background(
> >  	struct xlog	*log)
> >  {
> > @@ -915,14 +921,28 @@ xlog_cil_push_background(
> >  	 * space available yet.
> >  	 */
> >  	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> > -		return;
> > +		return false;
> >  
> >  	spin_lock(&cil->xc_push_lock);
> >  	if (cil->xc_push_seq < cil->xc_current_sequence) {
> >  		cil->xc_push_seq = cil->xc_current_sequence;
> >  		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
> >  	}
> > +
> > +	/*
> > +	 * If we are well over the space limit, throttle the work that is being
> > +	 * done until the push work on this context has begun. This will prevent
> > +	 * the CIL from violating maximum transaction size limits if the CIL
> > +	 * push is delayed for some reason.
> > +	 */
> > +	if (cil->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log) * 2) {
> > +		up_read(&cil->xc_ctx_lock);
> > +		trace_printk("CIL space used %d", cil->xc_ctx->space_used);
> 
> Needs a real tracepoint before this drops RFC status.

Ok, that was just debugging stuff I forgot to remove, but I can turn
it into a real tracepoint if you want.

> 
> > +		xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
> > +		return true;
> > +	}
> >  	spin_unlock(&cil->xc_push_lock);
> > +	return false;
> >  
> >  }
> >  
> > @@ -1038,9 +1058,8 @@ xfs_log_commit_cil(
> >  		if (lip->li_ops->iop_committing)
> >  			lip->li_ops->iop_committing(lip, xc_commit_lsn);
> >  	}
> > -	xlog_cil_push_background(log);
> > -
> > -	up_read(&cil->xc_ctx_lock);
> > +	if (!xlog_cil_push_background(log))
> > +		up_read(&cil->xc_ctx_lock);
> 
> Hmmmm... the return value here tell us if ctx_lock has been dropped.
> /me wonders if this would be cleaner if xlog_cil_push_background
> returned having called up_read...?

I thought about that - was on the fence about what to do. I'll
change it to be unconditional.

Cheers,

Dave.
Darrick J. Wong Sept. 24, 2019, 10:41 p.m. UTC | #3
On Wed, Sep 25, 2019 at 08:36:25AM +1000, Dave Chinner wrote:
> On Mon, Sep 16, 2019 at 09:42:55AM -0700, Darrick J. Wong wrote:
> > On Mon, Sep 09, 2019 at 11:51:59AM +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > In certain situations the background CIL push can be indefinitely
> > > delayed. While we have workarounds from the obvious cases now, it
> > > doesn't solve the underlying issue. This issue is that there is no
> > > upper limit on the CIL where we will either force or wait for
> > > a background push to start, hence allowing the CIL to grow without
> > > bound until it consumes all log space.
> > > 
> > > To fix this, add a new wait queue to the CIL which allows background
> > > pushes to wait for the CIL context to be switched out. This happens
> > > when the push starts, so it will allow us to block incoming
> > > transaction commit completion until the push has started. This will
> > > only affect processes that are running modifications, and only when
> > > the CIL threshold has been significantly overrun.
> > > 
> > > This has no apparent impact on performance, and doesn't even trigger
> > > until over 45 million inodes had been created in a 16-way fsmark
> > > test on a 2GB log. That was limiting at 64MB of log space used, so
> > > the active CIL size is only about 3% of the total log in that case.
> > > The concurrent removal of those files did not trigger the background
> > > sleep at all.
> > > 
> > > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > > ---
> > >  fs/xfs/xfs_log_cil.c  | 30 +++++++++++++++++++++++++-----
> > >  fs/xfs/xfs_log_priv.h |  1 +
> > >  2 files changed, 26 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> > > index ef652abd112c..eec9b32f5e08 100644
> > > --- a/fs/xfs/xfs_log_cil.c
> > > +++ b/fs/xfs/xfs_log_cil.c
> > > @@ -670,6 +670,11 @@ xlog_cil_push(
> > >  	push_seq = cil->xc_push_seq;
> > >  	ASSERT(push_seq <= ctx->sequence);
> > >  
> > > +	/*
> > > +	 * Wake up any background push waiters now this context is being pushed.
> > > +	 */
> > > +	wake_up_all(&ctx->push_wait);
> > > +
> > >  	/*
> > >  	 * Check if we've anything to push. If there is nothing, then we don't
> > >  	 * move on to a new sequence number and so we have to be able to push
> > > @@ -746,6 +751,7 @@ xlog_cil_push(
> > >  	 */
> > >  	INIT_LIST_HEAD(&new_ctx->committing);
> > >  	INIT_LIST_HEAD(&new_ctx->busy_extents);
> > > +	init_waitqueue_head(&new_ctx->push_wait);
> > >  	new_ctx->sequence = ctx->sequence + 1;
> > >  	new_ctx->cil = cil;
> > >  	cil->xc_ctx = new_ctx;
> > > @@ -898,7 +904,7 @@ xlog_cil_push_work(
> > >   * checkpoint), but commit latency and memory usage limit this to a smaller
> > >   * size.
> > >   */
> > > -static void
> > > +static bool
> > >  xlog_cil_push_background(
> > >  	struct xlog	*log)
> > >  {
> > > @@ -915,14 +921,28 @@ xlog_cil_push_background(
> > >  	 * space available yet.
> > >  	 */
> > >  	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> > > -		return;
> > > +		return false;
> > >  
> > >  	spin_lock(&cil->xc_push_lock);
> > >  	if (cil->xc_push_seq < cil->xc_current_sequence) {
> > >  		cil->xc_push_seq = cil->xc_current_sequence;
> > >  		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
> > >  	}
> > > +
> > > +	/*
> > > +	 * If we are well over the space limit, throttle the work that is being
> > > +	 * done until the push work on this context has begun. This will prevent
> > > +	 * the CIL from violating maximum transaction size limits if the CIL
> > > +	 * push is delayed for some reason.
> > > +	 */
> > > +	if (cil->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log) * 2) {
> > > +		up_read(&cil->xc_ctx_lock);
> > > +		trace_printk("CIL space used %d", cil->xc_ctx->space_used);
> > 
> > Needs a real tracepoint before this drops RFC status.
> 
> Ok, that was just debugging stuff I forgot to remove, but I can turn
> it into a real tracepoint if you want.

<shrug> You could drop it too; I was just point out the trace_printk.

(For those of you following at home, trace_printk calls generate huge
debugging warnings at module load time.)

> > 
> > > +		xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
> > > +		return true;
> > > +	}
> > >  	spin_unlock(&cil->xc_push_lock);
> > > +	return false;
> > >  
> > >  }
> > >  
> > > @@ -1038,9 +1058,8 @@ xfs_log_commit_cil(
> > >  		if (lip->li_ops->iop_committing)
> > >  			lip->li_ops->iop_committing(lip, xc_commit_lsn);
> > >  	}
> > > -	xlog_cil_push_background(log);
> > > -
> > > -	up_read(&cil->xc_ctx_lock);
> > > +	if (!xlog_cil_push_background(log))
> > > +		up_read(&cil->xc_ctx_lock);
> > 
> > Hmmmm... the return value here tell us if ctx_lock has been dropped.
> > /me wonders if this would be cleaner if xlog_cil_push_background
> > returned having called up_read...?
> 
> I thought about that - was on the fence about what to do. I'll
> change it to be unconditional.

<nod>

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

Patch
diff mbox series

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ef652abd112c..eec9b32f5e08 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -670,6 +670,11 @@  xlog_cil_push(
 	push_seq = cil->xc_push_seq;
 	ASSERT(push_seq <= ctx->sequence);
 
+	/*
+	 * Wake up any background push waiters now this context is being pushed.
+	 */
+	wake_up_all(&ctx->push_wait);
+
 	/*
 	 * Check if we've anything to push. If there is nothing, then we don't
 	 * move on to a new sequence number and so we have to be able to push
@@ -746,6 +751,7 @@  xlog_cil_push(
 	 */
 	INIT_LIST_HEAD(&new_ctx->committing);
 	INIT_LIST_HEAD(&new_ctx->busy_extents);
+	init_waitqueue_head(&new_ctx->push_wait);
 	new_ctx->sequence = ctx->sequence + 1;
 	new_ctx->cil = cil;
 	cil->xc_ctx = new_ctx;
@@ -898,7 +904,7 @@  xlog_cil_push_work(
  * checkpoint), but commit latency and memory usage limit this to a smaller
  * size.
  */
-static void
+static bool
 xlog_cil_push_background(
 	struct xlog	*log)
 {
@@ -915,14 +921,28 @@  xlog_cil_push_background(
 	 * space available yet.
 	 */
 	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
-		return;
+		return false;
 
 	spin_lock(&cil->xc_push_lock);
 	if (cil->xc_push_seq < cil->xc_current_sequence) {
 		cil->xc_push_seq = cil->xc_current_sequence;
 		queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
 	}
+
+	/*
+	 * If we are well over the space limit, throttle the work that is being
+	 * done until the push work on this context has begun. This will prevent
+	 * the CIL from violating maximum transaction size limits if the CIL
+	 * push is delayed for some reason.
+	 */
+	if (cil->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log) * 2) {
+		up_read(&cil->xc_ctx_lock);
+		trace_printk("CIL space used %d", cil->xc_ctx->space_used);
+		xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
+		return true;
+	}
 	spin_unlock(&cil->xc_push_lock);
+	return false;
 
 }
 
@@ -1038,9 +1058,8 @@  xfs_log_commit_cil(
 		if (lip->li_ops->iop_committing)
 			lip->li_ops->iop_committing(lip, xc_commit_lsn);
 	}
-	xlog_cil_push_background(log);
-
-	up_read(&cil->xc_ctx_lock);
+	if (!xlog_cil_push_background(log))
+		up_read(&cil->xc_ctx_lock);
 }
 
 /*
@@ -1199,6 +1218,7 @@  xlog_cil_init(
 
 	INIT_LIST_HEAD(&ctx->committing);
 	INIT_LIST_HEAD(&ctx->busy_extents);
+	init_waitqueue_head(&ctx->push_wait);
 	ctx->sequence = 1;
 	ctx->cil = cil;
 	cil->xc_ctx = ctx;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 187a43ffeaf7..466259fd1e4a 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -247,6 +247,7 @@  struct xfs_cil_ctx {
 	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
 	struct list_head	iclog_entry;
 	struct list_head	committing;	/* ctx committing list */
+	wait_queue_head_t	push_wait;	/* background push throttle */
 	struct work_struct	discard_endio_work;
 };