diff mbox series

[v1,5/7] xfs: Add device retry

Message ID 1543376991-5764-6-git-send-email-allison.henderson@oracle.com (mailing list archive)
State Superseded
Headers show
Series Block/XFS: Support alternative mirror device retry | expand

Commit Message

Allison Henderson Nov. 28, 2018, 3:49 a.m. UTC
Check whether _xfs_buf_read fails.  If so, loop over the
available mirrors and retry the read.

Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
---
 fs/xfs/xfs_buf.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

Comments

Dave Chinner Nov. 28, 2018, 5:08 a.m. UTC | #1
On Tue, Nov 27, 2018 at 08:49:49PM -0700, Allison Henderson wrote:
> Check to see if the _xfs_buf_read fails.  If so loop over the
> available mirrors and retry the read
> 
> Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
> ---
>  fs/xfs/xfs_buf.c | 28 +++++++++++++++++++++++++++-
>  1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index dd8ba59..f102d01 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -21,6 +21,7 @@
>  #include <linux/migrate.h>
>  #include <linux/backing-dev.h>
>  #include <linux/freezer.h>
> +#include <linux/blkdev.h>
>  
>  #include "xfs_format.h"
>  #include "xfs_log_format.h"
> @@ -808,6 +809,8 @@ xfs_buf_read_map(
>  	const struct xfs_buf_ops *ops)
>  {
>  	struct xfs_buf		*bp;
> +	struct request_queue	*q;
> +	unsigned short		i;
>  
>  	flags |= XBF_READ;
>  
> @@ -820,7 +823,30 @@ xfs_buf_read_map(
>  	if (!(bp->b_flags & XBF_DONE)) {
>  		XFS_STATS_INC(target->bt_mount, xb_get_read);
>  		bp->b_ops = ops;
> -		_xfs_buf_read(bp, flags);
> +		q = bdev_get_queue(bp->b_target->bt_bdev);
> +
> +		/*
> +		 * Mirrors are indexed 1 - n, specified through the rw_hint.
> +		 * Setting the hint to 0 is unspecified and allows the block
> +		 * layer to decide.
> +		 */
> +		for (i = 0; i <= blk_queue_get_mirrors(q); i++) {
> +			bp->b_error = 0;
> +			bp->b_rw_hint = i;
> +			_xfs_buf_read(bp, flags);

So the first time through this loop the block layer decides what
device to read from, then we iterate devices 1..n on error.

Which means if device 0 is the only one with good information in it,
we may not ever actually read from it.

I'd suggest that a hint of "-1" (or equivalent max value) should be
used for "device selects mirror leg" rather than 0, so we can
actually read from the first device on command.

i.e.
		bp->b_error = 0;
		bp->b_rw_hint = -1;
		_xfs_buf_read(bp, flags);

		if (!bp->b_error)
			return bp;

		/* manual iteration to find a good copy */
		for (i = 0; i <= blk_queue_get_mirrors(q); i++) {
			bp->b_error = 0;
			bp->b_rw_hint = i;
			_xfs_buf_read(bp, flags);
......
> +
> +			switch (bp->b_error) {
> +			case -EIO:
> +			case -EFSCORRUPTED:
> +			case -EFSBADCRC:
> +				/* loop again */
> +				continue;
> +			default:
> +				goto retry_done;

Just return bp here, don't need a jump label for it.

Cheers,

Dave.
Darrick J. Wong Nov. 28, 2018, 5:22 a.m. UTC | #2
On Wed, Nov 28, 2018 at 04:08:50PM +1100, Dave Chinner wrote:
> On Tue, Nov 27, 2018 at 08:49:49PM -0700, Allison Henderson wrote:
> > Check to see if the _xfs_buf_read fails.  If so loop over the
> > available mirrors and retry the read
> > 
> > Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
> > ---
> >  fs/xfs/xfs_buf.c | 28 +++++++++++++++++++++++++++-
> >  1 file changed, 27 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> > index dd8ba59..f102d01 100644
> > --- a/fs/xfs/xfs_buf.c
> > +++ b/fs/xfs/xfs_buf.c
> > @@ -21,6 +21,7 @@
> >  #include <linux/migrate.h>
> >  #include <linux/backing-dev.h>
> >  #include <linux/freezer.h>
> > +#include <linux/blkdev.h>
> >  
> >  #include "xfs_format.h"
> >  #include "xfs_log_format.h"
> > @@ -808,6 +809,8 @@ xfs_buf_read_map(
> >  	const struct xfs_buf_ops *ops)
> >  {
> >  	struct xfs_buf		*bp;
> > +	struct request_queue	*q;
> > +	unsigned short		i;
> >  
> >  	flags |= XBF_READ;
> >  
> > @@ -820,7 +823,30 @@ xfs_buf_read_map(
> >  	if (!(bp->b_flags & XBF_DONE)) {
> >  		XFS_STATS_INC(target->bt_mount, xb_get_read);
> >  		bp->b_ops = ops;
> > -		_xfs_buf_read(bp, flags);
> > +		q = bdev_get_queue(bp->b_target->bt_bdev);
> > +
> > +		/*
> > +		 * Mirrors are indexed 1 - n, specified through the rw_hint.
> > +		 * Setting the hint to 0 is unspecified and allows the block
> > +		 * layer to decide.
> > +		 */
> > +		for (i = 0; i <= blk_queue_get_mirrors(q); i++) {
> > +			bp->b_error = 0;
> > +			bp->b_rw_hint = i;
> > +			_xfs_buf_read(bp, flags);
> 
> So the first time through this loop the block layer decides what
> device to read from, then we iterate devices 1..n on error.
> 
> Which means if device 0 is the only one with good information in it,
> we may not ever actually read from it.
> 
> I'd suggest that a hint of "-1" (or equivalent max value) should be
> used for "device selects mirror leg" rather than 0, so we can
> actually read from the first device on command.

"read from the first device on command" => "set bio.bi_rw_hint = 1"...

> i.e.
> 		bp->b_error = 0;
> 		bp->b_rw_hint = -1;

...which is confusing.  The intended behavior for this RFC (though not
so well documented) is that bi_rw_hint == 0 means "let the device
choose", and rw_hint >= 1 means "choose mirror (rw_hint - 1)".  That's
sort of an odd behavior because now we have:

blk_queue_get_mirrors(q) returns 5 (as in 5 mirrors) but we access the
5 mirrors as indices 1-5, not 0-4 like most programmers would probably
expect.

Also, I think it's probably necessary to create a #define to attach a
name to the "let the device choose" value...

#define BIO_RW_HINT_ANY_MIRROR	(0)

for (i = BIO_RW_HINT_ANY_MIRROR; i <= blk_queue_get_mirrors(q); i++) {
	...
	bp->b_rw_hint = i;
	...
	_xfs_buf_read(bp, flags);
	...
}

(or offset things -1 like you propose)

--D

> 		_xfs_buf_read(bp, flags);
> 
> 		if (!bp->b_error)
> 			return bp;
> 
> 		/* manual iteration to find a good copy */
> 		for (i = 0; i <= blk_queue_get_mirrors(q); i++) {
> 			bp->b_error = 0;
> 			bp->b_rw_hint = i;
> 			_xfs_buf_read(bp, flags);
> ......
> > +
> > +			switch (bp->b_error) {
> > +			case -EIO:
> > +			case -EFSCORRUPTED:
> > +			case -EFSBADCRC:
> > +				/* loop again */
> > +				continue;
> > +			default:
> > +				goto retry_done;
> 
> Just return bp here, don't need a jump label for it.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Nov. 28, 2018, 5:38 a.m. UTC | #3
On Tue, Nov 27, 2018 at 09:22:45PM -0800, Darrick J. Wong wrote:
> On Wed, Nov 28, 2018 at 04:08:50PM +1100, Dave Chinner wrote:
> > On Tue, Nov 27, 2018 at 08:49:49PM -0700, Allison Henderson wrote:
> > > Check to see if the _xfs_buf_read fails.  If so loop over the
> > > available mirrors and retry the read
> > > 
> > > Signed-off-by: Allison Henderson <allison.henderson@oracle.com>
> > > ---
> > >  fs/xfs/xfs_buf.c | 28 +++++++++++++++++++++++++++-
> > >  1 file changed, 27 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> > > index dd8ba59..f102d01 100644
> > > --- a/fs/xfs/xfs_buf.c
> > > +++ b/fs/xfs/xfs_buf.c
> > > @@ -21,6 +21,7 @@
> > >  #include <linux/migrate.h>
> > >  #include <linux/backing-dev.h>
> > >  #include <linux/freezer.h>
> > > +#include <linux/blkdev.h>
> > >  
> > >  #include "xfs_format.h"
> > >  #include "xfs_log_format.h"
> > > @@ -808,6 +809,8 @@ xfs_buf_read_map(
> > >  	const struct xfs_buf_ops *ops)
> > >  {
> > >  	struct xfs_buf		*bp;
> > > +	struct request_queue	*q;
> > > +	unsigned short		i;
> > >  
> > >  	flags |= XBF_READ;
> > >  
> > > @@ -820,7 +823,30 @@ xfs_buf_read_map(
> > >  	if (!(bp->b_flags & XBF_DONE)) {
> > >  		XFS_STATS_INC(target->bt_mount, xb_get_read);
> > >  		bp->b_ops = ops;
> > > -		_xfs_buf_read(bp, flags);
> > > +		q = bdev_get_queue(bp->b_target->bt_bdev);
> > > +
> > > +		/*
> > > +		 * Mirrors are indexed 1 - n, specified through the rw_hint.
> > > +		 * Setting the hint to 0 is unspecified and allows the block
> > > +		 * layer to decide.
> > > +		 */
> > > +		for (i = 0; i <= blk_queue_get_mirrors(q); i++) {
> > > +			bp->b_error = 0;
> > > +			bp->b_rw_hint = i;
> > > +			_xfs_buf_read(bp, flags);
> > 
> > So the first time through this loop the block layer decides what
> > device to read from, then we iterate devices 1..n on error.
> > 
> > Which means if device 0 is the only one with good information in it,
> > we may not ever actually read from it.
> > 
> > I'd suggest that a hint of "-1" (or equivalent max value) should be
> > used for "device selects mirror leg" rather than 0, so we can
> > actually read from the first device on command.
> 
> "read from the first device on command" => "set bio.bi_rw_hint = 1"...

Landmine.

> > i.e.
> > 		bp->b_error = 0;
> > 		bp->b_rw_hint = -1;
> 
> ...which is confusing.  The intended behavior for this RFC (though not
> so well documented) is that bi_rw_hint == 0 means "let the device
> choose", and rw_hint >= 1 means "choose mirror (rw_hint - 1)".  That's
> sort of an odd behavior because now we have:
> 
> blk_queue_get_mirrors(q) returns 5 (as in 5 mirrors) but we access the
> 5 mirrors as indices 1-5, not 0-4 like most programmers would probably
> expect.

Yeah, that's not nice, and will lead to bugs in future as it trips
up people who have forgotten about this quirk.

> Also, I think it's probably necessary to create a #define to attach a
> name to the "let the device choose" value...
> 
> #define BIO_RW_HINT_ANY_MIRROR	(0)
> 
> for (i = BIO_RW_HINT_ANY_MIRROR; i <= blk_queue_get_mirrors(q); i++) {
> 	...
> 	bp->b_rw_hint = i;
> 	...
> 	_xfs_buf_read(bp, flags);
> 	...
> }

The recovery algorithms are only going to get more complex as
time goes on, so I'd really like to see an explicit separation of
the simple, unchanging fast path and the fallback recovery code.

Cheers,

dave.
Christoph Hellwig Nov. 28, 2018, 7:35 a.m. UTC | #4
On Wed, Nov 28, 2018 at 04:08:50PM +1100, Dave Chinner wrote:
> So the first time through this loop the block layer decides what
> device to read from, then we iterate devices 1..n on error.
> 
> Which means if device 0 is the only one with good information in it,
> we may not ever actually read from it.
> 
> I'd suggest that a hint of "-1" (or equivalent max value) should be
> used for "device selects mirror leg" rather than 0, so we can
> actually read from the first device on command.

Yes.  For one thing I think we really need to split this retry counter
of sorts from the write hints.  I.e. make both u8 types and keep them
separate.  Then start out with (u8)-1 as initialized by the block layer
for the first attempt.  The device then fills out which leg it used
(in the completion path, so that another underlying driver doesn't
override it!), and then the file system just preserves this value on
a resubmit, leaving the driver to choose a new value when it gets a
non -1 value.
Bob Liu Nov. 28, 2018, 12:41 p.m. UTC | #5
On 11/28/18 3:35 PM, Christoph Hellwig wrote:
> On Wed, Nov 28, 2018 at 04:08:50PM +1100, Dave Chinner wrote:
>> So the first time through this loop the block layer decides what
>> device to read from, then we iterate devices 1..n on error.
>>
>> Which means if device 0 is the only one with good information in it,
>> we may not ever actually read from it.
>>
>> I'd suggest that a hint of "-1" (or equivalent max value) should be
>> used for "device selects mirror leg" rather than 0, so we can
>> actually read from the first device on command.
> 
> Yes.  For one thing I think we really need to split this retry counter
> of sorts from the write hints.  I.e. make both u8 types and keep them
> separate.  Then start out with (u8)-1 as initialized by the block layer
> for the first attempt.  The device then fills out which leg it used
> (in the completion path, so that another underlying driver doesn't
> override it!), and then the file system just preserves this value on
> a resubmit, leaving the driver to choose a new value when it gets a
> non -1 value.
> 

Will update as suggested, thank you for all your feedback :)

-Bob
Allison Henderson Nov. 28, 2018, 4:47 p.m. UTC | #6
On 11/28/18 5:41 AM, Bob Liu wrote:
> On 11/28/18 3:35 PM, Christoph Hellwig wrote:
>> On Wed, Nov 28, 2018 at 04:08:50PM +1100, Dave Chinner wrote:
>>> So the first time through this loop the block layer decides what
>>> device to read from, then we iterate devices 1..n on error.
>>>
>>> Which means if device 0 is the only one with good information in it,
>>> we may not ever actually read from it.
>>>
>>> I'd suggest that a hint of "-1" (or equivalent max value) should be
>>> used for "device selects mirror leg" rather than 0, so we can
>>> actually read from the first device on command.
>>
>> Yes.  For one thing I think we really need to split this retry counter
>> of sorts from the write hints.  I.e. make both u8 types and keep them
>> separate.  Then start out with (u8)-1 as initialized by the block layer
>> for the first attempt.  The device then fills out which leg it used
>> (in the completion path, so that another underlying driver doesn't
>> override it!), and then the file system just preserves this value on
>> a resubmit, leaving the driver to choose a new value when it gets a
>> non -1 value.
>>
> 
> Will update as suggested, thank you for all your feedback :)
> 
> -Bob
> 

Yes, thanks everyone for your feedback.  Maybe Bob and I can come up 
with some test cases that recreate the problem scenarios described here 
and see if we can work out a solution to the multi bio complexities. 
Thanks!

Allison
diff mbox series

Patch

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index dd8ba59..f102d01 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -21,6 +21,7 @@ 
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/blkdev.h>
 
 #include "xfs_format.h"
 #include "xfs_log_format.h"
@@ -808,6 +809,8 @@  xfs_buf_read_map(
 	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
+	struct request_queue	*q;
+	unsigned short		i;
 
 	flags |= XBF_READ;
 
@@ -820,7 +823,30 @@  xfs_buf_read_map(
 	if (!(bp->b_flags & XBF_DONE)) {
 		XFS_STATS_INC(target->bt_mount, xb_get_read);
 		bp->b_ops = ops;
-		_xfs_buf_read(bp, flags);
+		q = bdev_get_queue(bp->b_target->bt_bdev);
+
+		/*
+		 * Mirrors are indexed 1 - n, specified through the rw_hint.
+		 * Setting the hint to 0 is unspecified and allows the block
+		 * layer to decide.
+		 */
+		for (i = 0; i <= blk_queue_get_mirrors(q); i++) {
+			bp->b_error = 0;
+			bp->b_rw_hint = i;
+			_xfs_buf_read(bp, flags);
+
+			switch (bp->b_error) {
+			case -EIO:
+			case -EFSCORRUPTED:
+			case -EFSBADCRC:
+				/* loop again */
+				continue;
+			default:
+				goto retry_done;
+			}
+
+		}
+retry_done:
 		return bp;
 	}