
[36/45] xfs: implement percpu cil space used calculation

Message ID 20210305051143.182133-37-david@fromorbit.com (mailing list archive)
State New
Series xfs: consolidated log and optimisation changes

Commit Message

Dave Chinner March 5, 2021, 5:11 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Now that we have the CIL percpu structures in place, implement the
space used counter with a fast sum check similar to the
percpu_counter infrastructure.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log_cil.c  | 42 ++++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_log_priv.h |  2 +-
 2 files changed, 37 insertions(+), 7 deletions(-)
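
For context, the "fast sum check similar to the percpu_counter
infrastructure" mentioned above boils down to this: each CPU
accumulates its contribution in a private counter and only folds it
into a shared atomic total once the private value crosses a batch
threshold, so the fast path never touches the shared cacheline. The
stand-alone program below is a minimal userspace sketch of that
pattern using threads in place of CPUs; the names, batch size and
thread count are invented for illustration and it is not the kernel
code.

/*
 * Minimal sketch of a percpu_counter-style "fast sum" counter:
 * each thread batches updates locally and only adds to the shared
 * atomic total once the local batch exceeds a threshold.  Names
 * and the batch size are invented for this example.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS	4
#define NUPDATES	100000
#define BATCH		64	/* fold into the global sum past this */

static atomic_long total;	/* the "approximate" global counter */

static void *worker(void *arg)
{
	long local = 0;		/* stands in for the per-cpu counter */
	int i;

	(void)arg;
	for (i = 0; i < NUPDATES; i++) {
		local += 1;	/* cheap, uncontended fast path */
		if (local > BATCH) {
			/* rare slow path: fold the batch into the total */
			atomic_fetch_add(&total, local);
			local = 0;
		}
	}
	/* fold whatever is left so nothing is lost at "push" time */
	atomic_fetch_add(&total, local);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	printf("total = %ld (expected %d)\n",
	       atomic_load(&total), NTHREADS * NUPDATES);
	return 0;
}

The patch applies the same idea to ctx->space_used, except that the
batch threshold is not a fixed constant but is derived from how close
the CIL already is to its blocking space limit.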

Comments

Darrick J. Wong March 11, 2021, 12:20 a.m. UTC | #1
On Fri, Mar 05, 2021 at 04:11:34PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that we have the CIL percpu structures in place, implement the
> space used counter with a fast sum check similar to the
> percpu_counter infrastructure.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log_cil.c  | 42 ++++++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_log_priv.h |  2 +-
>  2 files changed, 37 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index 1bcf0d423d30..5519d112c1fd 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -433,6 +433,8 @@ xlog_cil_insert_items(
>  	struct xfs_log_item	*lip;
>  	int			len = 0;
>  	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;
> +	int			space_used;
> +	struct xlog_cil_pcp	*cilpcp;
>  
>  	ASSERT(tp);
>  
> @@ -469,8 +471,9 @@ xlog_cil_insert_items(
>  	 *
>  	 * This can steal more than we need, but that's OK.
>  	 */
> +	space_used = atomic_read(&ctx->space_used);
>  	if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
> -	    ctx->space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
> +	    space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
>  		int	split_res = log->l_iclog_hsize +
>  					sizeof(struct xlog_op_header);
>  		if (ctx_res)
> @@ -480,16 +483,34 @@ xlog_cil_insert_items(
>  		atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
>  	}
>  
> +	/*
> +	 * Update the CIL percpu pointer. This updates the global counter when
> +	 * over the percpu batch size or when the CIL is over the space limit.
> +	 * This means low lock overhead for normal updates, and when over the
> +	 * limit the space used is immediately accounted. This makes enforcing
> +	 * the hard limit much more accurate. The per cpu fold threshold is
> +	 * based on how close we are to the hard limit.
> +	 */
> +	cilpcp = get_cpu_ptr(cil->xc_pcp);
> +	cilpcp->space_used += len;
> +	if (space_used >= XLOG_CIL_SPACE_LIMIT(log) ||
> +	    cilpcp->space_used >
> +			((XLOG_CIL_BLOCKING_SPACE_LIMIT(log) - space_used) /
> +					num_online_cpus())) {

What happens if the log is very small and there are hundreds of CPUs?
Can we end up on this slow path on a regular basis even if the amount of
space used is not that large?

Granted I can't think of a good way out of that, since I suspect that if
you do that you're already going to be hurting in 5 other places anyway.
That said ... I /do/ keep getting bugs from people with tiny logs on big
iron.  Some day I'll (ha!) stomp out all the bugs that are "NO do not
let your deployment system growfs 10000x, this is not ext4"...

> +		atomic_add(cilpcp->space_used, &ctx->space_used);
> +		cilpcp->space_used = 0;
> +	}
> +	put_cpu_ptr(cilpcp);
> +
>  	spin_lock(&cil->xc_cil_lock);
> -	tp->t_ticket->t_curr_res -= ctx_res + len;
>  	ctx->ticket->t_unit_res += ctx_res;
>  	ctx->ticket->t_curr_res += ctx_res;
> -	ctx->space_used += len;
>  
>  	/*
>  	 * If we've overrun the reservation, dump the tx details before we move
>  	 * the log items. Shutdown is imminent...
>  	 */
> +	tp->t_ticket->t_curr_res -= ctx_res + len;

Is moving this really necessary?

--D

>  	if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
>  		xfs_warn(log->l_mp, "Transaction log reservation overrun:");
>  		xfs_warn(log->l_mp,
> @@ -769,12 +790,20 @@ xlog_cil_push_work(
>  	struct bio		bio;
>  	DECLARE_COMPLETION_ONSTACK(bdev_flush);
>  	bool			commit_iclog_sync = false;
> +	int			cpu;
> +	struct xlog_cil_pcp	*cilpcp;
>  
>  	new_ctx = xlog_cil_ctx_alloc();
>  	new_ctx->ticket = xlog_cil_ticket_alloc(log);
>  
>  	down_write(&cil->xc_ctx_lock);
>  
> +	/* Reset the CIL pcp counters */
> +	for_each_online_cpu(cpu) {
> +		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
> +		cilpcp->space_used = 0;
> +	}
> +
>  	spin_lock(&cil->xc_push_lock);
>  	push_seq = cil->xc_push_seq;
>  	ASSERT(push_seq <= ctx->sequence);
> @@ -1042,6 +1071,7 @@ xlog_cil_push_background(
>  	struct xlog	*log) __releases(cil->xc_ctx_lock)
>  {
>  	struct xfs_cil	*cil = log->l_cilp;
> +	int		space_used = atomic_read(&cil->xc_ctx->space_used);
>  
>  	/*
>  	 * The cil won't be empty because we are called while holding the
> @@ -1054,7 +1084,7 @@ xlog_cil_push_background(
>  	 * Don't do a background push if we haven't used up all the
>  	 * space available yet.
>  	 */
> -	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
> +	if (space_used < XLOG_CIL_SPACE_LIMIT(log)) {
>  		up_read(&cil->xc_ctx_lock);
>  		return;
>  	}
> @@ -1083,10 +1113,10 @@ xlog_cil_push_background(
>  	 * The ctx->xc_push_lock provides the serialisation necessary for safely
>  	 * using the lockless waitqueue_active() check in this context.
>  	 */
> -	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
> +	if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
>  	    waitqueue_active(&cil->xc_push_wait)) {
>  		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
> -		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
> +		ASSERT(space_used < log->l_logsize);
>  		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
>  		return;
>  	}
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 2562f29c8986..4eb373357f26 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -222,7 +222,7 @@ struct xfs_cil_ctx {
>  	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
>  	struct xlog_ticket	*ticket;	/* chkpt ticket */
>  	int			nvecs;		/* number of regions */
> -	int			space_used;	/* aggregate size of regions */
> +	atomic_t		space_used;	/* aggregate size of regions */
>  	struct list_head	busy_extents;	/* busy extents in chkpt */
>  	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
>  	struct list_head	iclog_entry;
> -- 
> 2.28.0
>
Dave Chinner March 11, 2021, 6:51 a.m. UTC | #2
On Wed, Mar 10, 2021 at 04:20:54PM -0800, Darrick J. Wong wrote:
> On Fri, Mar 05, 2021 at 04:11:34PM +1100, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that we have the CIL percpu structures in place, implement the
> > space used counter with a fast sum check similar to the
> > percpu_counter infrastructure.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/xfs_log_cil.c  | 42 ++++++++++++++++++++++++++++++++++++------
> >  fs/xfs/xfs_log_priv.h |  2 +-
> >  2 files changed, 37 insertions(+), 7 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> > index 1bcf0d423d30..5519d112c1fd 100644
> > --- a/fs/xfs/xfs_log_cil.c
> > +++ b/fs/xfs/xfs_log_cil.c
> > @@ -433,6 +433,8 @@ xlog_cil_insert_items(
> >  	struct xfs_log_item	*lip;
> >  	int			len = 0;
> >  	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;
> > +	int			space_used;
> > +	struct xlog_cil_pcp	*cilpcp;
> >  
> >  	ASSERT(tp);
> >  
> > @@ -469,8 +471,9 @@ xlog_cil_insert_items(
> >  	 *
> >  	 * This can steal more than we need, but that's OK.
> >  	 */
> > +	space_used = atomic_read(&ctx->space_used);
> >  	if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
> > -	    ctx->space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
> > +	    space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
> >  		int	split_res = log->l_iclog_hsize +
> >  					sizeof(struct xlog_op_header);
> >  		if (ctx_res)
> > @@ -480,16 +483,34 @@ xlog_cil_insert_items(
> >  		atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
> >  	}
> >  
> > +	/*
> > +	 * Update the CIL percpu pointer. This updates the global counter when
> > +	 * over the percpu batch size or when the CIL is over the space limit.
> > +	 * This means low lock overhead for normal updates, and when over the
> > +	 * limit the space used is immediately accounted. This makes enforcing
> > +	 * the hard limit much more accurate. The per cpu fold threshold is
> > +	 * based on how close we are to the hard limit.
> > +	 */
> > +	cilpcp = get_cpu_ptr(cil->xc_pcp);
> > +	cilpcp->space_used += len;
> > +	if (space_used >= XLOG_CIL_SPACE_LIMIT(log) ||
> > +	    cilpcp->space_used >
> > +			((XLOG_CIL_BLOCKING_SPACE_LIMIT(log) - space_used) /
> > +					num_online_cpus())) {
> 
> What happens if the log is very small and there are hundreds of CPUs?
> Can we end up on this slow path on a regular basis even if the amount of
> space used is not that large?

AFAICT, no big deal - the transaction reservations limit the
amount of space and concurrency that we can have here. A small log
will not allow many more than a handful of transactions through at a
time.  IOWs, we'll already be on the transaction reservation slow
path that limits concurrency via the grant head spin lock in
xlog_grant_head_check() and sleeping in xlog_grant_head_wait()...
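
To put rough numbers on the fold threshold in the hunk quoted above
(purely illustrative - the real values depend on the log geometry and
on what the XLOG_CIL_*_LIMIT macros work out to, so treat the 16MB
and 256 CPUs below as assumptions):

	per-cpu fold threshold
		= (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) - space_used)
				/ num_online_cpus()
		~ (16MB - 0) / 256
		~ 64kB

So on a box like that each CPU folds into the global atomic roughly
once per 64kB it adds to the CIL, and more often as space_used grows
and the threshold shrinks. That is noticeably more frequent than on a
large log, but as above, a tiny log throttles commit concurrency at
the grant heads long before this counter becomes the problem.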

> Granted I can't think of a good way out of that, since I suspect that if
> you do that you're already going to be hurting in 5 other places anyway.
> That said ... I /do/ keep getting bugs from people with tiny logs on big
> iron.  Some day I'll (ha!) stomp out all the bugs that are "NO do not
> let your deployment system growfs 10000x, this is not ext4"...

Yeah, that hurts long before we get to this transaction commit
path...

> 
> > +		atomic_add(cilpcp->space_used, &ctx->space_used);
> > +		cilpcp->space_used = 0;
> > +	}
> > +	put_cpu_ptr(cilpcp);
> > +
> >  	spin_lock(&cil->xc_cil_lock);
> > -	tp->t_ticket->t_curr_res -= ctx_res + len;
> >  	ctx->ticket->t_unit_res += ctx_res;
> >  	ctx->ticket->t_curr_res += ctx_res;
> > -	ctx->space_used += len;
> >  
> >  	/*
> >  	 * If we've overrun the reservation, dump the tx details before we move
> >  	 * the log items. Shutdown is imminent...
> >  	 */
> > +	tp->t_ticket->t_curr_res -= ctx_res + len;
> 
> Is moving this really necessary?

Not really, it just gets it out of the way. It doesn't need to be
inside the spinlock, and in the end it needs to sit next to the
underrun check, so I moved it here first so it didn't have to keep
moving every time I moved the spinlock or changed the order of the
code from this point onwards...

Cheers,

Dave.

Patch

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 1bcf0d423d30..5519d112c1fd 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -433,6 +433,8 @@  xlog_cil_insert_items(
 	struct xfs_log_item	*lip;
 	int			len = 0;
 	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;
+	int			space_used;
+	struct xlog_cil_pcp	*cilpcp;
 
 	ASSERT(tp);
 
@@ -469,8 +471,9 @@  xlog_cil_insert_items(
 	 *
 	 * This can steal more than we need, but that's OK.
 	 */
+	space_used = atomic_read(&ctx->space_used);
 	if (atomic_read(&cil->xc_iclog_hdrs) > 0 ||
-	    ctx->space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+	    space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
 		int	split_res = log->l_iclog_hsize +
 					sizeof(struct xlog_op_header);
 		if (ctx_res)
@@ -480,16 +483,34 @@  xlog_cil_insert_items(
 		atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs);
 	}
 
+	/*
+	 * Update the CIL percpu pointer. This updates the global counter when
+	 * over the percpu batch size or when the CIL is over the space limit.
+	 * This means low lock overhead for normal updates, and when over the
+	 * limit the space used is immediately accounted. This makes enforcing
+	 * the hard limit much more accurate. The per cpu fold threshold is
+	 * based on how close we are to the hard limit.
+	 */
+	cilpcp = get_cpu_ptr(cil->xc_pcp);
+	cilpcp->space_used += len;
+	if (space_used >= XLOG_CIL_SPACE_LIMIT(log) ||
+	    cilpcp->space_used >
+			((XLOG_CIL_BLOCKING_SPACE_LIMIT(log) - space_used) /
+					num_online_cpus())) {
+		atomic_add(cilpcp->space_used, &ctx->space_used);
+		cilpcp->space_used = 0;
+	}
+	put_cpu_ptr(cilpcp);
+
 	spin_lock(&cil->xc_cil_lock);
-	tp->t_ticket->t_curr_res -= ctx_res + len;
 	ctx->ticket->t_unit_res += ctx_res;
 	ctx->ticket->t_curr_res += ctx_res;
-	ctx->space_used += len;
 
 	/*
 	 * If we've overrun the reservation, dump the tx details before we move
 	 * the log items. Shutdown is imminent...
 	 */
+	tp->t_ticket->t_curr_res -= ctx_res + len;
 	if (WARN_ON(tp->t_ticket->t_curr_res < 0)) {
 		xfs_warn(log->l_mp, "Transaction log reservation overrun:");
 		xfs_warn(log->l_mp,
@@ -769,12 +790,20 @@  xlog_cil_push_work(
 	struct bio		bio;
 	DECLARE_COMPLETION_ONSTACK(bdev_flush);
 	bool			commit_iclog_sync = false;
+	int			cpu;
+	struct xlog_cil_pcp	*cilpcp;
 
 	new_ctx = xlog_cil_ctx_alloc();
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
 	down_write(&cil->xc_ctx_lock);
 
+	/* Reset the CIL pcp counters */
+	for_each_online_cpu(cpu) {
+		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
+		cilpcp->space_used = 0;
+	}
+
 	spin_lock(&cil->xc_push_lock);
 	push_seq = cil->xc_push_seq;
 	ASSERT(push_seq <= ctx->sequence);
@@ -1042,6 +1071,7 @@  xlog_cil_push_background(
 	struct xlog	*log) __releases(cil->xc_ctx_lock)
 {
 	struct xfs_cil	*cil = log->l_cilp;
+	int		space_used = atomic_read(&cil->xc_ctx->space_used);
 
 	/*
 	 * The cil won't be empty because we are called while holding the
@@ -1054,7 +1084,7 @@  xlog_cil_push_background(
 	 * Don't do a background push if we haven't used up all the
 	 * space available yet.
 	 */
-	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+	if (space_used < XLOG_CIL_SPACE_LIMIT(log)) {
 		up_read(&cil->xc_ctx_lock);
 		return;
 	}
@@ -1083,10 +1113,10 @@  xlog_cil_push_background(
 	 * The ctx->xc_push_lock provides the serialisation necessary for safely
 	 * using the lockless waitqueue_active() check in this context.
 	 */
-	if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
+	if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
 	    waitqueue_active(&cil->xc_push_wait)) {
 		trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
-		ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+		ASSERT(space_used < log->l_logsize);
 		xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
 		return;
 	}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2562f29c8986..4eb373357f26 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -222,7 +222,7 @@  struct xfs_cil_ctx {
 	xfs_lsn_t		commit_lsn;	/* chkpt commit record lsn */
 	struct xlog_ticket	*ticket;	/* chkpt ticket */
 	int			nvecs;		/* number of regions */
-	int			space_used;	/* aggregate size of regions */
+	atomic_t		space_used;	/* aggregate size of regions */
 	struct list_head	busy_extents;	/* busy extents in chkpt */
 	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
 	struct list_head	iclog_entry;