diff mbox

[WIP] Propagate error state from buffers to the objects attached

Message ID 20170112142209.23458-1-cmaiolino@redhat.com (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Carlos Maiolino Jan. 12, 2017, 2:22 p.m. UTC
Hi folks,

this is a new version of my possible fix to the error propagation model plus the
fix to the locked AIL items, both things are merged together since I'm using the
fix to the locked items to test the error propagation model, I'll split them
between different patches and also include dquot fixes when we agree that I'm on
the right path.

This new version adds Brian's suggestion to use a new flag for the xfs_log_item
instead of a callback. and uses the flag into xfs_inode_item_push, to resubmit
the buffer or not.

Also, the logic about when to set the flag has been moved into
xfs_buf_iodone_callback_error, so we only check that after the last buffer
resubmission.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
---
 fs/xfs/xfs_buf_item.c   |  6 ++++++
 fs/xfs/xfs_inode_item.c | 33 +++++++++++++++++++++++++++++++--
 fs/xfs/xfs_trans.h      |  4 +++-
 3 files changed, 40 insertions(+), 3 deletions(-)

Comments

Brian Foster Jan. 12, 2017, 3:38 p.m. UTC | #1
On Thu, Jan 12, 2017 at 03:22:09PM +0100, Carlos Maiolino wrote:
> Hi folks,
> 
> this is a new version of my possible fix to the error propagation model plus the
> fix to the locked AIL items, both things are merged together since I'm using the
> fix to the locked items to test the error propagation model, I'll split them
> between different patches and also include dquot fixes when we agree that I'm on
> the right path.
> 
> This new version adds Brian's suggestion to use a new flag for the xfs_log_item
> instead of a callback. and uses the flag into xfs_inode_item_push, to resubmit
> the buffer or not.
> 
> Also, the logic about when to set the flag has been moved into
> xfs_buf_iodone_callback_error, so we only check that after the last buffer
> resubmission.
> 
> Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
> ---
>  fs/xfs/xfs_buf_item.c   |  6 ++++++
>  fs/xfs/xfs_inode_item.c | 33 +++++++++++++++++++++++++++++++--
>  fs/xfs/xfs_trans.h      |  4 +++-
>  3 files changed, 40 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 2975cb2..16896d5 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -1123,6 +1123,12 @@ xfs_buf_iodone_callback_error(
>  	/* still a transient error, higher layers will retry */
>  	xfs_buf_ioerror(bp, 0);
>  	xfs_buf_relse(bp);
> +
> +	/*
> +	 * Notify log item that the buffer has been failed so it can be retried
> +	 * later if needed
> +	 */
> +	lip->li_flags |= XFS_LI_FAILED;

Looks like the right idea to me. We might want to set this before we
release the buffer though. Perhaps move it a few lines up and combine
this comment with the "transient error" comment above..?

>  	return true;
>  
>  	/*
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index d90e781..308aa27 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -517,8 +517,37 @@ xfs_inode_item_push(
>  	 * the AIL.
>  	 */
>  	if (!xfs_iflock_nowait(ip)) {
> -		rval = XFS_ITEM_FLUSHING;
> -		goto out_unlock;
> +		int error;
> +		struct xfs_dinode *dip;
> +
> +		/* Buffer carrying this item has been failed, we must resubmit
> +		 * the buffer or the item will be locked forever
> +		 */

I'd suggest to reference the flush lock specifically, in particular how
it's locked once and remains so until the flushed content makes it to
disk (across I/O failures and retries if necessary).

Note that the comment above the 'if (!xfs_iflock_nowait())' above will
probably need an update as well.

> +		if (lip->li_flags & XFS_LI_FAILED) {
> +			printk("#### ITEM BUFFER FAILED PREVIOUSLY, inode: %llu\n",
> +			       ip->i_ino);
> +			error = xfs_imap_to_bp(ip->i_mount, NULL, &ip->i_imap,
> +					       &dip, &bp, XBF_TRYLOCK, 0);
> +
> +			if (error) {
> +				rval = XFS_ITEM_FLUSHING;
> +				goto out_unlock;
> +			}
> +
> +			if (!(bp->b_flags & XBF_WRITE_FAIL)) {
> +				rval = XFS_ITEM_FLUSHING;
> +				xfs_buf_relse(bp);
> +				goto out_unlock;
> +			}
> +
> +			if (!xfs_buf_delwri_queue(bp, buffer_list)) {
> +				printk("#### QUEUEING AGAIN\n");
> +				rval = XFS_ITEM_FLUSHING;
> +			}
> +

We need to clear the LI_FAILED state once we've queued it for retry. We
may also want to do the same for all other LI_FAILED log items attached
to the buffer so we don't lock the buffer a bunch of times. For
example, add a helper to walk the log items and clear the failure flag.
(That could be a separate patch in the series though).

Those things aside this looks like it's on the right track to me. Thanks
Carlos.

Brian

> +			xfs_buf_relse(bp);
> +			goto out_unlock;
> +		}
>  	}
>  
>  	ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index 61b7fbd..d62d174 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -66,10 +66,12 @@ typedef struct xfs_log_item {
>  
>  #define	XFS_LI_IN_AIL	0x1
>  #define XFS_LI_ABORTED	0x2
> +#define	XFS_LI_FAILED	0x3
>  
>  #define XFS_LI_FLAGS \
>  	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
> -	{ XFS_LI_ABORTED,	"ABORTED" }
> +	{ XFS_LI_ABORTED,	"ABORTED"}, \
> +	{ XFS_LI_FAILED,	"FAILED" }
>  
>  struct xfs_item_ops {
>  	void (*iop_size)(xfs_log_item_t *, int *, int *);
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Carlos Maiolino Feb. 21, 2017, 9:30 a.m. UTC | #2
Hi Brian.

First, I apologize for my late reply, I was on vacations, comments inline.


> > diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> > index 2975cb2..16896d5 100644
> > --- a/fs/xfs/xfs_buf_item.c
> > +++ b/fs/xfs/xfs_buf_item.c
> > @@ -1123,6 +1123,12 @@ xfs_buf_iodone_callback_error(
> >  	/* still a transient error, higher layers will retry */
> >  	xfs_buf_ioerror(bp, 0);
> >  	xfs_buf_relse(bp);
> > +
> > +	/*
> > +	 * Notify log item that the buffer has been failed so it can be retried
> > +	 * later if needed
> > +	 */
> > +	lip->li_flags |= XFS_LI_FAILED;
> 
> Looks like the right idea to me. We might want to set this before we
> release the buffer though. Perhaps move it a few lines up and combine
> this comment with the "transient error" comment above..?
> 

I agree, will do it.

> >  	return true;
> >  
> >  	/*
> > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> > index d90e781..308aa27 100644
> > --- a/fs/xfs/xfs_inode_item.c
> > +++ b/fs/xfs/xfs_inode_item.c
> > @@ -517,8 +517,37 @@ xfs_inode_item_push(
> >  	 * the AIL.
> >  	 */
> >  	if (!xfs_iflock_nowait(ip)) {
> > -		rval = XFS_ITEM_FLUSHING;
> > -		goto out_unlock;
> > +		int error;
> > +		struct xfs_dinode *dip;
> > +
> > +		/* Buffer carrying this item has been failed, we must resubmit
> > +		 * the buffer or the item will be locked forever
> > +		 */
> 
> I'd suggest to reference the flush lock specifically, in particular how
> it's locked once and remains so until the flushed content makes it to
> disk (across I/O failures and retries if necessary).
> 
> Note that the comment above the 'if (!xfs_iflock_nowait())' above will
> probably need an update as well.
> 

Agreed

> > +		if (lip->li_flags & XFS_LI_FAILED) {
> > +			printk("#### ITEM BUFFER FAILED PREVIOUSLY, inode: %llu\n",
> > +			       ip->i_ino);
> > +			error = xfs_imap_to_bp(ip->i_mount, NULL, &ip->i_imap,
> > +					       &dip, &bp, XBF_TRYLOCK, 0);
> > +
> > +			if (error) {
> > +				rval = XFS_ITEM_FLUSHING;
> > +				goto out_unlock;
> > +			}
> > +
> > +			if (!(bp->b_flags & XBF_WRITE_FAIL)) {
> > +				rval = XFS_ITEM_FLUSHING;
> > +				xfs_buf_relse(bp);
> > +				goto out_unlock;
> > +			}
> > +
> > +			if (!xfs_buf_delwri_queue(bp, buffer_list)) {
> > +				printk("#### QUEUEING AGAIN\n");
> > +				rval = XFS_ITEM_FLUSHING;
> > +			}
> > +
> 
> We need to clear the LI_FAILED state once we've queued it for retry. 

I tried to do this to clear the flag:

		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
				lip->li_flags &= ~XFS_LI_FAILED;
				printk("#### QUEUEING AGAIN\n");
				rval = XFS_ITEM_FLUSHING;
			}

There is something wrong here that I'm trying to figure out. When I clear the
flag, my reproducer ends up in a stack overflow, see below (I removed the linked
in modules from the stack to save some space). I just started to investigate
why such overflow happened, but I thought it might be worth to post giving that
I'm still learning how buffers and the log items work together, maybe you or
somebody else might have an idea.

[   70.655063] #### ITEM BUFFER FAILED PREVIOUSLY, inode: 131 
[   70.660031] BUG: stack guard page was hit at ffffc90000fe0000 (stack is
ffffc90000fdc000..ffffc90000fdffff)
[   70.661553] kernel stack overflow (page fault): 0000 [#1] SMP 
[   70.662375] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs
fscache.....
..
..
..

[   70.673330] CPU: 0 PID: 55 Comm: kworker/0:1 Not tainted 4.9.0-rc1xfs-next+
#59
[   70.674324] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.fc25 04/01/2014
[   70.675572] Workqueue:            (null) ()
[   70.676226] task: ffff880235ea9d40 task.stack: ffffc90000fdc000
[   70.677073] RIP: 0010:[<ffffffff813b7e8d>]  [<ffffffff813b7e8d>] xfs_iflush_done+0x19d/0x210
[   70.678293] RSP: 0000:ffffc90000fdfd30  EFLAGS: 00010202
[   70.679051] RAX: ffff880230738640 RBX: ffff880230738a00 RCX: 000000000000005a
[   70.680068] RDX: 000000000000005b RSI: 0000000000000003 RDI: ffff88022fee2dc0
[   70.681077] RBP: ffffc90000fdfd78 R08: ffff880230738640 R09: 0000000100000080
[   70.682036] R10: ffffc90000fdfd70 R11: 0000000000000000 R12: ffffc90000fdfd50
[   70.682998] R13: ffff88022fee2d80 R14: ffffc90000fdfd30 R15: ffff88022e4f1a18
[   70.684015] FS:  0000000000000000(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
[   70.685163] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   70.685988] CR2: 00000000000000b8 CR3: 0000000231de9000 CR4: 00000000003406f0
[   70.687010] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   70.688024] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   70.689034] Stack:
[   70.689332]  ffff880230738a00 ffff880230738640 ffff880230738640 ffff880230738640
[   70.690463]  ffff880230738640 ffff880230738640 ffff880230738640 ffff880230738640
[   70.691542]  ffff880230738640 ffff880230738640 ffff880230738640 ffff880230738640
[   70.692570] Call Trace:
[   70.692931] Code: d8 31 d2 eb 09 48 8b 40 38 48 85 c0 74 2a 66 83 b8 92 00 00
00 00 74 ed 48 8b 88 80 00 00 00 48 39 48 $
0 75 e0 48 63 ca 83 c2 01 <49> 89 04 ce 48 8b 40 38 48 85 c0 75 d6 b9 08 00 00
00 4c 89 f6  
[   70.696678] RIP  [<ffffffff813b7e8d>] xfs_iflush_done+0x19d/0x210
[   70.697518]  RSP <ffffc90000fdfd30>
[   70.698026] ---[ end trace 87916c0b8e13041c ]---

And xfsaild hang up

  95.784135] CPU: 1 PID: 7430 Comm: xfsaild/dm-5 Tainted: G      D      L
4.9.0-rc1xfs-next+ #59
[   95.785241] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.9.3-1.fc25 04/01/2014
[   95.786297] task: ffff880234e557c0 task.stack: ffffc90001e74000
[   95.787014] RIP: 0010:[<ffffffff810d8595>]  [<ffffffff810d8595>]
queued_spin_lock_slowpath+0x25/0x1a0
[   95.788145] RSP: 0018:ffffc90001e77e00  EFLAGS: 00000202
[   95.788792] RAX: 0000000000000001 RBX: ffff880234e557c0 RCX: 0000000000000000
[   95.789668] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff88022fee2dc0
[   95.790538] RBP: ffffc90001e77e00 R08: 0000001055207808 R09: 0000000000000000
[   95.791409] R10: 0000000000000232 R11: 0000000000000400 R12: ffffffff8218eba8
[   95.792281] R13: ffff8802316f5000 R14: ffff88022fee2dc0 R15: ffff88022fee2d80
[   95.793157] FS:  0000000000000000(0000) GS:ffff88023fc80000(0000)
knlGS:0000000000000000
[   95.794116] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   95.794743] CR2: 00007ffe3543fd18 CR3: 0000000001e06000 CR4: 00000000003406e0
[   95.795511] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   95.796287] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   95.797086] Stack:
[   95.797346]  ffffc90001e77e10 ffffffff8186e740 ffffc90001e77ec0
ffffffff813c0a3c
[   95.798437]  ffff880234e557c0 ffff880234e557c0 ffff880234e557c0
0000000000000001
[   95.799466]  0000000100000000 0000000100000080 0000000100000080
ffff88022fee2d90
[   95.800453] Call Trace:
[   95.800780]  [<ffffffff8186e740>] _raw_spin_lock+0x20/0x30
[   95.801519]  [<ffffffff813c0a3c>] xfsaild+0x15c/0x740
[   95.802337]  [<ffffffff813c08e0>] ? xfs_trans_ail_cursor_first+0x90/0x90
[   95.803217]  [<ffffffff813c08e0>] ? xfs_trans_ail_cursor_first+0x90/0x90
[   95.804058]  [<ffffffff810ac999>] kthread+0xd9/0xf0
[   95.804674]  [<ffffffff810ac8c0>] ? kthread_park+0x60/0x60
[   95.805356]  [<ffffffff8186ebd5>] ret_from_fork+0x25/0x30
[   95.806042] Code: 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 0f 1f 44 00 00
ba 01 00 00 00 8b 07 85 c0 75 0a f0 0f b1 1
7 85 c0 75 f2 5d c3 f3 90 <eb> ec 81 fe 00 01 00 00 0f 84 92 00 00 00 41 b8 01
01 00 00 b9

>We
> may also want to do the same for all other LI_FAILED log items attached
> to the buffer so we don't lock the buffer a bunch of times. For
> example, add a helper to walk the log items and clear the failure flag.
> (That could be a separate patch in the series though).
> 
> Those things aside this looks like it's on the right track to me. Thanks
> Carlos.
> 
> Brian
Brian Foster Feb. 21, 2017, 1:25 p.m. UTC | #3
On Tue, Feb 21, 2017 at 10:30:57AM +0100, Carlos Maiolino wrote:
> Hi Brian.
> 
> First, I apologize for my late reply, I was on vacations, comments inline.
> 
> 
> > > diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> > > index 2975cb2..16896d5 100644
> > > --- a/fs/xfs/xfs_buf_item.c
> > > +++ b/fs/xfs/xfs_buf_item.c
> > > @@ -1123,6 +1123,12 @@ xfs_buf_iodone_callback_error(
> > >  	/* still a transient error, higher layers will retry */
> > >  	xfs_buf_ioerror(bp, 0);
> > >  	xfs_buf_relse(bp);
> > > +
> > > +	/*
> > > +	 * Notify log item that the buffer has been failed so it can be retried
> > > +	 * later if needed
> > > +	 */
> > > +	lip->li_flags |= XFS_LI_FAILED;
> > 
> > Looks like the right idea to me. We might want to set this before we
> > release the buffer though. Perhaps move it a few lines up and combine
> > this comment with the "transient error" comment above..?
> > 
> 
> I agree, will do it.
> 
> > >  	return true;
> > >  
> > >  	/*
> > > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> > > index d90e781..308aa27 100644
> > > --- a/fs/xfs/xfs_inode_item.c
> > > +++ b/fs/xfs/xfs_inode_item.c
> > > @@ -517,8 +517,37 @@ xfs_inode_item_push(
> > >  	 * the AIL.
> > >  	 */
> > >  	if (!xfs_iflock_nowait(ip)) {
> > > -		rval = XFS_ITEM_FLUSHING;
> > > -		goto out_unlock;
> > > +		int error;
> > > +		struct xfs_dinode *dip;
> > > +
> > > +		/* Buffer carrying this item has been failed, we must resubmit
> > > +		 * the buffer or the item will be locked forever
> > > +		 */
> > 
> > I'd suggest to reference the flush lock specifically, in particular how
> > it's locked once and remains so until the flushed content makes it to
> > disk (across I/O failures and retries if necessary).
> > 
> > Note that the comment above the 'if (!xfs_iflock_nowait())' above will
> > probably need an update as well.
> > 
> 
> Agreed
> 
> > > +		if (lip->li_flags & XFS_LI_FAILED) {
> > > +			printk("#### ITEM BUFFER FAILED PREVIOUSLY, inode: %llu\n",
> > > +			       ip->i_ino);
> > > +			error = xfs_imap_to_bp(ip->i_mount, NULL, &ip->i_imap,
> > > +					       &dip, &bp, XBF_TRYLOCK, 0);
> > > +
> > > +			if (error) {
> > > +				rval = XFS_ITEM_FLUSHING;
> > > +				goto out_unlock;
> > > +			}
> > > +
> > > +			if (!(bp->b_flags & XBF_WRITE_FAIL)) {
> > > +				rval = XFS_ITEM_FLUSHING;
> > > +				xfs_buf_relse(bp);
> > > +				goto out_unlock;
> > > +			}
> > > +
> > > +			if (!xfs_buf_delwri_queue(bp, buffer_list)) {
> > > +				printk("#### QUEUEING AGAIN\n");
> > > +				rval = XFS_ITEM_FLUSHING;
> > > +			}
> > > +
> > 
> > We need to clear the LI_FAILED state once we've queued it for retry. 
> 
> I tried to do this to clear the flag:
> 
> 		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
> 				lip->li_flags &= ~XFS_LI_FAILED;
> 				printk("#### QUEUEING AGAIN\n");
> 				rval = XFS_ITEM_FLUSHING;
> 			}
> 
> There is something wrong here that I'm trying to figure out. When I clear the
> flag, my reproducer ends up in a stack overflow, see below (I removed the linked
> in modules from the stack to save some space). I just started to investigate
> why such overflow happened, but I thought it might be worth to post giving that
> I'm still learning how buffers and the log items work together, maybe you or
> somebody else might have an idea.
> 

Nothing stands out to me on a quick look, but it's been a while since
I've looked at this. It does look like we should clear the failed flag
regardless of whether the queue succeeds or not, though (here you clear
it only when the queue fails).

Brian

> [   70.655063] #### ITEM BUFFER FAILED PREVIOUSLY, inode: 131 
> [   70.660031] BUG: stack guard page was hit at ffffc90000fe0000 (stack is
> ffffc90000fdc000..ffffc90000fdffff)
> [   70.661553] kernel stack overflow (page fault): 0000 [#1] SMP 
> [   70.662375] Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs
> fscache.....
> ..
> ..
> ..
> 
> [   70.673330] CPU: 0 PID: 55 Comm: kworker/0:1 Not tainted 4.9.0-rc1xfs-next+
> #59
> [   70.674324] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.fc25 04/01/2014
> [   70.675572] Workqueue:            (null) ()
> [   70.676226] task: ffff880235ea9d40 task.stack: ffffc90000fdc000
> [   70.677073] RIP: 0010:[<ffffffff813b7e8d>]  [<ffffffff813b7e8d>] xfs_iflush_done+0x19d/0x210
> [   70.678293] RSP: 0000:ffffc90000fdfd30  EFLAGS: 00010202
> [   70.679051] RAX: ffff880230738640 RBX: ffff880230738a00 RCX: 000000000000005a
> [   70.680068] RDX: 000000000000005b RSI: 0000000000000003 RDI: ffff88022fee2dc0
> [   70.681077] RBP: ffffc90000fdfd78 R08: ffff880230738640 R09: 0000000100000080
> [   70.682036] R10: ffffc90000fdfd70 R11: 0000000000000000 R12: ffffc90000fdfd50
> [   70.682998] R13: ffff88022fee2d80 R14: ffffc90000fdfd30 R15: ffff88022e4f1a18
> [   70.684015] FS:  0000000000000000(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
> [   70.685163] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   70.685988] CR2: 00000000000000b8 CR3: 0000000231de9000 CR4: 00000000003406f0
> [   70.687010] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   70.688024] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [   70.689034] Stack:
> [   70.689332]  ffff880230738a00 ffff880230738640 ffff880230738640 ffff880230738640
> [   70.690463]  ffff880230738640 ffff880230738640 ffff880230738640 ffff880230738640
> [   70.691542]  ffff880230738640 ffff880230738640 ffff880230738640 ffff880230738640
> [   70.692570] Call Trace:
> [   70.692931] Code: d8 31 d2 eb 09 48 8b 40 38 48 85 c0 74 2a 66 83 b8 92 00 00
> 00 00 74 ed 48 8b 88 80 00 00 00 48 39 48 $
> 0 75 e0 48 63 ca 83 c2 01 <49> 89 04 ce 48 8b 40 38 48 85 c0 75 d6 b9 08 00 00
> 00 4c 89 f6  
> [   70.696678] RIP  [<ffffffff813b7e8d>] xfs_iflush_done+0x19d/0x210
> [   70.697518]  RSP <ffffc90000fdfd30>
> [   70.698026] ---[ end trace 87916c0b8e13041c ]---
> 
> And xfsaild hang up
> 
>   95.784135] CPU: 1 PID: 7430 Comm: xfsaild/dm-5 Tainted: G      D      L
> 4.9.0-rc1xfs-next+ #59
> [   95.785241] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> 1.9.3-1.fc25 04/01/2014
> [   95.786297] task: ffff880234e557c0 task.stack: ffffc90001e74000
> [   95.787014] RIP: 0010:[<ffffffff810d8595>]  [<ffffffff810d8595>]
> queued_spin_lock_slowpath+0x25/0x1a0
> [   95.788145] RSP: 0018:ffffc90001e77e00  EFLAGS: 00000202
> [   95.788792] RAX: 0000000000000001 RBX: ffff880234e557c0 RCX: 0000000000000000
> [   95.789668] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff88022fee2dc0
> [   95.790538] RBP: ffffc90001e77e00 R08: 0000001055207808 R09: 0000000000000000
> [   95.791409] R10: 0000000000000232 R11: 0000000000000400 R12: ffffffff8218eba8
> [   95.792281] R13: ffff8802316f5000 R14: ffff88022fee2dc0 R15: ffff88022fee2d80
> [   95.793157] FS:  0000000000000000(0000) GS:ffff88023fc80000(0000)
> knlGS:0000000000000000
> [   95.794116] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   95.794743] CR2: 00007ffe3543fd18 CR3: 0000000001e06000 CR4: 00000000003406e0
> [   95.795511] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   95.796287] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [   95.797086] Stack:
> [   95.797346]  ffffc90001e77e10 ffffffff8186e740 ffffc90001e77ec0
> ffffffff813c0a3c
> [   95.798437]  ffff880234e557c0 ffff880234e557c0 ffff880234e557c0
> 0000000000000001
> [   95.799466]  0000000100000000 0000000100000080 0000000100000080
> ffff88022fee2d90
> [   95.800453] Call Trace:
> [   95.800780]  [<ffffffff8186e740>] _raw_spin_lock+0x20/0x30
> [   95.801519]  [<ffffffff813c0a3c>] xfsaild+0x15c/0x740
> [   95.802337]  [<ffffffff813c08e0>] ? xfs_trans_ail_cursor_first+0x90/0x90
> [   95.803217]  [<ffffffff813c08e0>] ? xfs_trans_ail_cursor_first+0x90/0x90
> [   95.804058]  [<ffffffff810ac999>] kthread+0xd9/0xf0
> [   95.804674]  [<ffffffff810ac8c0>] ? kthread_park+0x60/0x60
> [   95.805356]  [<ffffffff8186ebd5>] ret_from_fork+0x25/0x30
> [   95.806042] Code: 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 0f 1f 44 00 00
> ba 01 00 00 00 8b 07 85 c0 75 0a f0 0f b1 1
> 7 85 c0 75 f2 5d c3 f3 90 <eb> ec 81 fe 00 01 00 00 0f 84 92 00 00 00 41 b8 01
> 01 00 00 b9
> 
> >We
> > may also want to do the same for all other LI_FAILED log items attached
> > to the buffer so we don't lock the buffer a bunch of times. For
> > example, add a helper to walk the log items and clear the failure flag.
> > (That could be a separate patch in the series though).
> > 
> > Those things aside this looks like it's on the right track to me. Thanks
> > Carlos.
> > 
> > Brian
> 
> -- 
> Carlos
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner Feb. 23, 2017, 10:42 p.m. UTC | #4
On Tue, Feb 21, 2017 at 08:25:30AM -0500, Brian Foster wrote:
> On Tue, Feb 21, 2017 at 10:30:57AM +0100, Carlos Maiolino wrote:
> > > > +		if (lip->li_flags & XFS_LI_FAILED) {
> > > > +			printk("#### ITEM BUFFER FAILED PREVIOUSLY, inode: %llu\n",
> > > > +			       ip->i_ino);
> > > > +			error = xfs_imap_to_bp(ip->i_mount, NULL, &ip->i_imap,
> > > > +					       &dip, &bp, XBF_TRYLOCK, 0);
> > > > +
> > > > +			if (error) {
> > > > +				rval = XFS_ITEM_FLUSHING;
> > > > +				goto out_unlock;
> > > > +			}
> > > > +
> > > > +			if (!(bp->b_flags & XBF_WRITE_FAIL)) {
> > > > +				rval = XFS_ITEM_FLUSHING;
> > > > +				xfs_buf_relse(bp);
> > > > +				goto out_unlock;
> > > > +			}
> > > > +
> > > > +			if (!xfs_buf_delwri_queue(bp, buffer_list)) {
> > > > +				printk("#### QUEUEING AGAIN\n");
> > > > +				rval = XFS_ITEM_FLUSHING;
> > > > +			}
> > > > +
> > > 
> > > We need to clear the LI_FAILED state once we've queued it for retry. 
> > 
> > I tried to do this to clear the flag:
> > 
> > 		if (!xfs_buf_delwri_queue(bp, buffer_list)) {
> > 				lip->li_flags &= ~XFS_LI_FAILED;
> > 				printk("#### QUEUEING AGAIN\n");
> > 				rval = XFS_ITEM_FLUSHING;
> > 			}
> > 
> > There is something wrong here that I'm trying to figure out. When I clear the
> > flag, my reproducer ends up in a stack overflow, see below (I removed the linked
> > in modules from the stack to save some space). I just started to investigate
> > why such overflow happened, but I thought it might be worth to post giving that
> > I'm still learning how buffers and the log items work together, maybe you or
> > somebody else might have an idea.
> > 
> 
> Nothing stands out to me on a quick look, but it's been a while since
> I've looked at this. It does look like we should clear the failed flag
> regardless of whether the queue succeeds or not, though (here you clear
> it only when the queue fails).

FWIW, I think this is the wrong way to handle a buffer error, both
from a philosophical POV and from an XFS IO model POV. That is:
IO submission should not need to handle a previous write failure that
didn't clean up buffer and object state correctly. The IO completion handler of the
failed IO should have done all the required cleanup so the
submission path should not require any modifications at all.

Keep in mind the way inode cluster flushing works:

	1. flush first inode in cluster
	2. call xfs-iflush_cluster()
	3. flush lock and flush all inodes on cluster buffer

Now, this means the other 31 dirty inodes in the cluster that were
clustered and flush locked will now get hit your new code when the
AIL tries to push them and find the flush locked. It will do buffer
lookups for 31 inodes that it didn't need to, i.e. failing to get an
inode flush lock in AIL pushing is a /very common occurrence/ whilst
buffer writeback failure is extremely rare.

i.e. we need to unlock and mark the inodes as failed from the buffer
IO completion callbacks, not poll inode buffers for failure if we
can't get a flush lock on an inode. i.e. we need a "soft-error"
equivalent of xfs_iflush_abort() that doesn't shutdown the fs or
remove objects from the AIL unless the filesystem has already been
shut down....

Cheers,

Dave.
diff mbox

Patch

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2975cb2..16896d5 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1123,6 +1123,12 @@  xfs_buf_iodone_callback_error(
 	/* still a transient error, higher layers will retry */
 	xfs_buf_ioerror(bp, 0);
 	xfs_buf_relse(bp);
+
+	/*
+	 * Notify log item that the buffer has been failed so it can be retried
+	 * later if needed
+	 */
+	lip->li_flags |= XFS_LI_FAILED;
 	return true;
 
 	/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d90e781..308aa27 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -517,8 +517,37 @@  xfs_inode_item_push(
 	 * the AIL.
 	 */
 	if (!xfs_iflock_nowait(ip)) {
-		rval = XFS_ITEM_FLUSHING;
-		goto out_unlock;
+		int error;
+		struct xfs_dinode *dip;
+
+		/* Buffer carrying this item has been failed, we must resubmit
+		 * the buffer or the item will be locked forever
+		 */
+		if (lip->li_flags & XFS_LI_FAILED) {
+			printk("#### ITEM BUFFER FAILED PREVIOUSLY, inode: %llu\n",
+			       ip->i_ino);
+			error = xfs_imap_to_bp(ip->i_mount, NULL, &ip->i_imap,
+					       &dip, &bp, XBF_TRYLOCK, 0);
+
+			if (error) {
+				rval = XFS_ITEM_FLUSHING;
+				goto out_unlock;
+			}
+
+			if (!(bp->b_flags & XBF_WRITE_FAIL)) {
+				rval = XFS_ITEM_FLUSHING;
+				xfs_buf_relse(bp);
+				goto out_unlock;
+			}
+
+			if (!xfs_buf_delwri_queue(bp, buffer_list)) {
+				printk("#### QUEUEING AGAIN\n");
+				rval = XFS_ITEM_FLUSHING;
+			}
+
+			xfs_buf_relse(bp);
+			goto out_unlock;
+		}
 	}
 
 	ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 61b7fbd..d62d174 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -66,10 +66,12 @@  typedef struct xfs_log_item {
 
 #define	XFS_LI_IN_AIL	0x1
 #define XFS_LI_ABORTED	0x2
+#define	XFS_LI_FAILED	0x3
 
 #define XFS_LI_FLAGS \
 	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
-	{ XFS_LI_ABORTED,	"ABORTED" }
+	{ XFS_LI_ABORTED,	"ABORTED"}, \
+	{ XFS_LI_FAILED,	"FAILED" }
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);