
[v2,11/12] Btrfs: implement repair function when direct read fails

Message ID: 1406625850-32168-12-git-send-email-miaox@cn.fujitsu.com
State: Superseded

Commit Message

Miao Xie July 29, 2014, 9:24 a.m. UTC
This patch implements the data repair function used when a direct read fails.

The details of the implementation are:
- When we find that the data is not right, we try to read the data from
  another mirror.
- After we get the right data, we write it back to the corrupted mirror.
- If the data on the new mirror is still corrupted, we try the next mirror
  until we read the right data or all the mirrors have been traversed.
- After the above work, we set the uptodate flag according to the result.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
Changelog v1-v2:
- None
---
 fs/btrfs/btrfs_inode.h |   2 +-
 fs/btrfs/disk-io.c     |  43 ++++++--
 fs/btrfs/disk-io.h     |   1 +
 fs/btrfs/extent_io.c   |  12 ++-
 fs/btrfs/extent_io.h   |   5 +-
 fs/btrfs/inode.c       | 276 +++++++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 300 insertions(+), 39 deletions(-)

Comments

Chris Mason Aug. 29, 2014, 6:31 p.m. UTC | #1
On 07/29/2014 05:24 AM, Miao Xie wrote:
> This patch implements the data repair function used when a direct read fails.
> 
> The details of the implementation are:
> - When we find that the data is not right, we try to read the data from
>   another mirror.
> - After we get the right data, we write it back to the corrupted mirror.
> - If the data on the new mirror is still corrupted, we try the next mirror
>   until we read the right data or all the mirrors have been traversed.
> - After the above work, we set the uptodate flag according to the result.
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 08e65e9..56b1546 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -698,7 +719,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
>  
>  	fs_info = end_io_wq->info;
>  	end_io_wq->error = err;
> -	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
> +
> +	if (likely(end_io_wq->metadata != BTRFS_WQ_ENDIO_DIO_REPAIR))
> +		btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL,
> +				NULL);
> +	else
> +		INIT_WORK(&end_io_wq->work.normal_work, dio_end_workqueue_fn);

It's not clear why this one is using INIT_WORK instead of
btrfs_init_work, or why we're calling directly into queue_work instead
of btrfs_queue_work.  What am I missing?

-chris
Miao Xie Sept. 1, 2014, 6:56 a.m. UTC | #2
On Fri, 29 Aug 2014 14:31:48 -0400, Chris Mason wrote:
> On 07/29/2014 05:24 AM, Miao Xie wrote:
>> This patch implements the data repair function used when a direct read fails.
>>
>> The details of the implementation are:
>> - When we find that the data is not right, we try to read the data from
>>   another mirror.
>> - After we get the right data, we write it back to the corrupted mirror.
>> - If the data on the new mirror is still corrupted, we try the next mirror
>>   until we read the right data or all the mirrors have been traversed.
>> - After the above work, we set the uptodate flag according to the result.
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 08e65e9..56b1546 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -698,7 +719,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
>>  
>>  	fs_info = end_io_wq->info;
>>  	end_io_wq->error = err;
>> -	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
>> +
>> +	if (likely(end_io_wq->metadata != BTRFS_WQ_ENDIO_DIO_REPAIR))
>> +		btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL,
>> +				NULL);
>> +	else
>> +		INIT_WORK(&end_io_wq->work.normal_work, dio_end_workqueue_fn);
> 
> It's not clear why this one is using INIT_WORK instead of
> btrfs_init_work, or why we're calling directly into queue_work instead
> of btrfs_queue_work.  What am I missing?

I'm sorry that I forgot to write the explanation in this patch's changelog;
I wrote it in Patch 0.

"2.When the io on the mirror ends, we will insert the endio work into the
   system workqueue, not btrfs own endio workqueue, because the original
   endio work is still blocked in the btrfs endio workqueue, if we insert
   the endio work of the io on the mirror into that workqueue, deadlock
   would happen."
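
(A minimal sketch of the blocked-worker scenario -- hypothetical names, not
btrfs code. A work item running on an ordered, single-worker workqueue
sleeps until a second item queued to the same workqueue completes, but that
item can never start while the first one occupies the worker:)

	#include <linux/workqueue.h>
	#include <linux/completion.h>

	static struct workqueue_struct *endio_wq;	/* ordered: one worker */

	struct repair_work {
		struct work_struct work;
		struct completion *done;
	};

	static void repair_endio_fn(struct work_struct *work)
	{
		struct repair_work *rw =
			container_of(work, struct repair_work, work);

		complete(rw->done);		/* never reached */
	}

	static void original_endio_fn(struct work_struct *work)
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct repair_work rw = { .done = &done };

		INIT_WORK(&rw.work, repair_endio_fn);
		queue_work(endio_wq, &rw.work);	/* lands behind this item */
		wait_for_completion(&done);	/* deadlock: worker is busy here */
	}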

Could you add it to the changelog of this patch when you apply it?

Thanks
Miao
Liu Bo Sept. 2, 2014, 12:33 p.m. UTC | #3
On Mon, Sep 01, 2014 at 02:56:15PM +0800, Miao Xie wrote:
> On Fri, 29 Aug 2014 14:31:48 -0400, Chris Mason wrote:
> > On 07/29/2014 05:24 AM, Miao Xie wrote:
> >> This patch implements the data repair function used when a direct read fails.
> >>
> >> The details of the implementation are:
> >> - When we find that the data is not right, we try to read the data from
> >>   another mirror.
> >> - After we get the right data, we write it back to the corrupted mirror.
> >> - If the data on the new mirror is still corrupted, we try the next mirror
> >>   until we read the right data or all the mirrors have been traversed.
> >> - After the above work, we set the uptodate flag according to the result.
> >>
> >> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> >> index 08e65e9..56b1546 100644
> >> --- a/fs/btrfs/disk-io.c
> >> +++ b/fs/btrfs/disk-io.c
> >> @@ -698,7 +719,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
> >>  
> >>  	fs_info = end_io_wq->info;
> >>  	end_io_wq->error = err;
> >> -	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
> >> +
> >> +	if (likely(end_io_wq->metadata != BTRFS_WQ_ENDIO_DIO_REPAIR))
> >> +		btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL,
> >> +				NULL);
> >> +	else
> >> +		INIT_WORK(&end_io_wq->work.normal_work, dio_end_workqueue_fn);
> > 
> > It's not clear why this one is using INIT_WORK instead of
> > btrfs_init_work, or why we're calling directly into queue_work instead
> > of btrfs_queue_work.  What am I missing?
> 
> I'm sorry that I forgot to write the explanation in this patch's changelog;
> I wrote it in Patch 0.
> 
> "2.When the io on the mirror ends, we will insert the endio work into the
>    system workqueue, not btrfs own endio workqueue, because the original
>    endio work is still blocked in the btrfs endio workqueue, if we insert
>    the endio work of the io on the mirror into that workqueue, deadlock
>    would happen."

Can you elaborate on the deadlock?

Now that buffered read can insert a subsequent read-mirror bio into the
btrfs endio workqueue without problems, what's the difference?

thanks,
-liubo

> 
> Could you add it to the changelog of this patch when you apply it?
> 
> Thanks
> Miao
Chris Mason Sept. 2, 2014, 1:05 p.m. UTC | #4
On 09/02/2014 08:33 AM, Liu Bo wrote:
> On Mon, Sep 01, 2014 at 02:56:15PM +0800, Miao Xie wrote:
>> On Fri, 29 Aug 2014 14:31:48 -0400, Chris Mason wrote:
>>> On 07/29/2014 05:24 AM, Miao Xie wrote:
>>>> This patch implements the data repair function used when a direct read fails.
>>>>
>>>> The details of the implementation are:
>>>> - When we find that the data is not right, we try to read the data from
>>>>   another mirror.
>>>> - After we get the right data, we write it back to the corrupted mirror.
>>>> - If the data on the new mirror is still corrupted, we try the next mirror
>>>>   until we read the right data or all the mirrors have been traversed.
>>>> - After the above work, we set the uptodate flag according to the result.
>>>>
>>>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>>> index 08e65e9..56b1546 100644
>>>> --- a/fs/btrfs/disk-io.c
>>>> +++ b/fs/btrfs/disk-io.c
>>>> @@ -698,7 +719,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
>>>>  
>>>>  	fs_info = end_io_wq->info;
>>>>  	end_io_wq->error = err;
>>>> -	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
>>>> +
>>>> +	if (likely(end_io_wq->metadata != BTRFS_WQ_ENDIO_DIO_REPAIR))
>>>> +		btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL,
>>>> +				NULL);
>>>> +	else
>>>> +		INIT_WORK(&end_io_wq->work.normal_work, dio_end_workqueue_fn);
>>>
>>> It's not clear why this one is using INIT_WORK instead of
>>> btrfs_init_work, or why we're calling directly into queue_work instead
>>> of btrfs_queue_work.  What am I missing?
>>
>> I'm sorry that I forgot to write the explanation in this patch's changelog;
>> I wrote it in Patch 0.
>>
>> "2.When the io on the mirror ends, we will insert the endio work into the
>>    system workqueue, not btrfs own endio workqueue, because the original
>>    endio work is still blocked in the btrfs endio workqueue, if we insert
>>    the endio work of the io on the mirror into that workqueue, deadlock
>>    would happen."
> 
> Can you elaborate on the deadlock?
> 
> Now that buffered read can insert a subsequent read-mirror bio into the
> btrfs endio workqueue without problems, what's the difference?

We do have problems if we're inserting dependent items in the same
workqueue.

Miao, please make a repair workqueue.  I'll also have a use for it in
the raid56 parity work I think.
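
(A sketch of what the dedicated repair queue could look like -- the queue
and field names below are guesses, not the final code. Allocate it at mount
time alongside the other endio workqueues, then route
BTRFS_WQ_ENDIO_DIO_REPAIR completions through btrfs_init_work() and
btrfs_queue_work() like every other endio type:)

	/* at mount time, alongside the other endio workqueues */
	fs_info->endio_repair_workers =
		btrfs_alloc_workqueue("endio-repair", flags, 1, 0);

	/* in end_workqueue_bio(), instead of INIT_WORK + system_wq */
	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
	if (end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR)
		btrfs_queue_work(fs_info->endio_repair_workers,
				 &end_io_wq->work);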

-chris
Miao Xie Sept. 3, 2014, 9:02 a.m. UTC | #5
On Tue, 2 Sep 2014 09:05:15 -0400, Chris Mason wrote:
>>>>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>>>> index 08e65e9..56b1546 100644
>>>>> --- a/fs/btrfs/disk-io.c
>>>>> +++ b/fs/btrfs/disk-io.c
>>>>> @@ -698,7 +719,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
>>>>>  
>>>>>  	fs_info = end_io_wq->info;
>>>>>  	end_io_wq->error = err;
>>>>> -	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
>>>>> +
>>>>> +	if (likely(end_io_wq->metadata != BTRFS_WQ_ENDIO_DIO_REPAIR))
>>>>> +		btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL,
>>>>> +				NULL);
>>>>> +	else
>>>>> +		INIT_WORK(&end_io_wq->work.normal_work, dio_end_workqueue_fn);
>>>>
>>>> It's not clear why this one is using INIT_WORK instead of
>>>> btrfs_init_work, or why we're calling directly into queue_work instead
>>>> of btrfs_queue_work.  What am I missing?
>>>
>>> I'm sorry that I forgot to write the explanation in this patch's changelog;
>>> I wrote it in Patch 0.
>>>
>>> "2.When the io on the mirror ends, we will insert the endio work into the
>>>    system workqueue, not btrfs own endio workqueue, because the original
>>>    endio work is still blocked in the btrfs endio workqueue, if we insert
>>>    the endio work of the io on the mirror into that workqueue, deadlock
>>>    would happen."
>>
>> Can you elaborate on the deadlock?
>>
>> Now that buffered read can insert a subsequent read-mirror bio into the
>> btrfs endio workqueue without problems, what's the difference?
> 
> We do have problems if we're inserting dependent items in the same
> workqueue.
> 
> Miao, please make a repair workqueue.  I'll also have a use for it in
> the raid56 parity work I think.

OK, I'll update the patch soon.

Thanks
Miao

Miao Xie Sept. 12, 2014, 10:43 a.m. UTC | #6
This patchset implements the data repair function for direct read; it
is implemented like buffered read:
1. When we find that the data is not right, we try to read the data from
   another mirror.
2. When the IO on the mirror ends, we insert the endio work into a
   dedicated btrfs workqueue, not the common read endio workqueue, because
   the original endio work is still blocked in the btrfs endio workqueue;
   if we inserted the endio work of the IO on the mirror into that
   workqueue, a deadlock would happen.
3. If we get the right data, we write it back to repair the corrupted
   mirror.
4. If the data on the new mirror is still corrupted, we try the next
   mirror until we read the right data or all the mirrors have been
   traversed.
5. After the above work, we set the uptodate flag according to the result.

The difference is that a direct read may be split into several small IOs;
in order to get the number of the mirror on which the IO error happened,
we have to do the data check and repair in the end IO function of those
sub-IO requests.
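
(A minimal sketch of the per-sector retry, condensed from
__btrfs_correct_data_nocsum() in the patch below -- a sketch, not the
verbatim code. Each failed sector is re-read synchronously from the next
mirror until it verifies or dio_read_error() runs out of mirrors:)

	struct btrfs_retry_complete done;
	struct bio_vec *bvec;
	u64 start = io_bio->logical;
	int i, ret;

	done.inode = inode;
	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
	try_again:
		done.uptodate = 0;
		done.start = start;
		init_completion(&done.done);

		/* re-read this sector from the next mirror */
		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
				     start, start + bvec->bv_len - 1,
				     io_bio->mirror_num,
				     btrfs_retry_endio_nocsum, &done);
		if (ret)
			return ret;		/* no usable mirror left */

		wait_for_completion(&done.done);
		if (!done.uptodate)
			goto try_again;		/* try yet another mirror */

		start += bvec->bv_len;
	}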

Besides that, we also fixed some direct IO bugs.

Changelog v3 -> v4:
- Remove the 1st patch, which has been applied to the upstream kernel.
- Use a dedicated btrfs workqueue instead of the system workqueue to
  deal with the completed repair bio; this suggestion was from Chris.
- Rebase the patchset onto the integration branch of Chris's git tree.

Changelog v2 -> v3:
- Fix the wrong bio being returned when cloning a bio, which was reported
  by Filipe

Changelog v1 -> v2:
- Fix the warning which was triggered by __GFP_ZERO in the 2nd patch

Miao Xie (11):
  Btrfs: load checksum data once when submitting a direct read io
  Btrfs: cleanup similar code of the buffered data data check and dio
    read data check
  Btrfs: do file data check by sub-bio's self
  Btrfs: fix missing error handler if submiting re-read bio fails
  Btrfs: Cleanup unused variant and argument of IO failure handlers
  Btrfs: split bio_readpage_error into several functions
  Btrfs: modify repair_io_failure and make it suit direct io
  Btrfs: modify clean_io_failure and make it suit direct io
  Btrfs: Set real mirror number for read operation on RAID0/5/6
  Btrfs: implement repair function when direct read fails
  Btrfs: cleanup the read failure record after write or when the inode
    is freeing

 fs/btrfs/async-thread.c |   1 +
 fs/btrfs/async-thread.h |   1 +
 fs/btrfs/btrfs_inode.h  |  10 +-
 fs/btrfs/ctree.h        |   4 +-
 fs/btrfs/disk-io.c      |  11 +-
 fs/btrfs/disk-io.h      |   1 +
 fs/btrfs/extent_io.c    | 254 +++++++++++++++++----------
 fs/btrfs/extent_io.h    |  38 ++++-
 fs/btrfs/file-item.c    |  14 +-
 fs/btrfs/inode.c        | 446 +++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/scrub.c        |   4 +-
 fs/btrfs/volumes.c      |   5 +
 fs/btrfs/volumes.h      |   5 +-
 13 files changed, 601 insertions(+), 193 deletions(-)
Chris Mason Sept. 12, 2014, 2:50 p.m. UTC | #7
On 09/12/2014 06:43 AM, Miao Xie wrote:
> This patchset implements the data repair function for direct read; it
> is implemented like buffered read:
> 1. When we find that the data is not right, we try to read the data from
>    another mirror.
> 2. When the IO on the mirror ends, we insert the endio work into a
>    dedicated btrfs workqueue, not the common read endio workqueue, because
>    the original endio work is still blocked in the btrfs endio workqueue;
>    if we inserted the endio work of the IO on the mirror into that
>    workqueue, a deadlock would happen.
> 3. If we get the right data, we write it back to repair the corrupted
>    mirror.
> 4. If the data on the new mirror is still corrupted, we try the next
>    mirror until we read the right data or all the mirrors have been
>    traversed.
> 5. After the above work, we set the uptodate flag according to the result.
> 
> The difference is that a direct read may be split into several small IOs;
> in order to get the number of the mirror on which the IO error happened,
> we have to do the data check and repair in the end IO function of those
> sub-IO requests.
> 
> Besides that, we also fixed some direct IO bugs.
> 
> Changelog v3 -> v4:
> - Remove the 1st patch, which has been applied to the upstream kernel.
> - Use a dedicated btrfs workqueue instead of the system workqueue to
>   deal with the completed repair bio; this suggestion was from Chris.
> - Rebase the patchset onto the integration branch of Chris's git tree.

Perfect, thank you.

-chris

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 745fca40..20d4975 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -271,7 +271,7 @@  struct btrfs_dio_private {
 	 * The original bio may be splited to several sub-bios, this is
 	 * done during endio of sub-bios
 	 */
-	int (*subio_endio)(struct inode *, struct btrfs_io_bio *);
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 08e65e9..56b1546 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -691,6 +691,27 @@  static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	return -EIO;	/* we fixed nothing */
 }
 
+static inline void do_end_workqueue_fn(struct end_io_wq *end_io_wq)
+{
+	struct bio *bio = end_io_wq->bio;
+
+	bio->bi_private = end_io_wq->private;
+	bio->bi_end_io = end_io_wq->end_io;
+	bio_endio_nodec(bio, end_io_wq->error);
+	kfree(end_io_wq);
+}
+
+static void dio_end_workqueue_fn(struct work_struct *work)
+{
+	struct btrfs_work *bwork;
+	struct end_io_wq *end_io_wq;
+
+	bwork = container_of(work, struct btrfs_work, normal_work);
+	end_io_wq = container_of(bwork, struct end_io_wq, work);
+
+	do_end_workqueue_fn(end_io_wq);
+}
+
 static void end_workqueue_bio(struct bio *bio, int err)
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
@@ -698,7 +719,12 @@  static void end_workqueue_bio(struct bio *bio, int err)
 
 	fs_info = end_io_wq->info;
 	end_io_wq->error = err;
-	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
+
+	if (likely(end_io_wq->metadata != BTRFS_WQ_ENDIO_DIO_REPAIR))
+		btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL,
+				NULL);
+	else
+		INIT_WORK(&end_io_wq->work.normal_work, dio_end_workqueue_fn);
 
 	if (bio->bi_rw & REQ_WRITE) {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
@@ -714,7 +740,9 @@  static void end_workqueue_bio(struct bio *bio, int err)
 			btrfs_queue_work(fs_info->endio_write_workers,
 					 &end_io_wq->work);
 	} else {
-		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+		if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
+			queue_work(system_wq, &end_io_wq->work.normal_work);
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
 			btrfs_queue_work(fs_info->endio_raid56_workers,
 					 &end_io_wq->work);
 		else if (end_io_wq->metadata)
@@ -738,6 +766,7 @@  int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
 {
 	struct end_io_wq *end_io_wq;
+
 	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
 	if (!end_io_wq)
 		return -ENOMEM;
@@ -1730,18 +1759,10 @@  static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
  */
 static void end_workqueue_fn(struct btrfs_work *work)
 {
-	struct bio *bio;
 	struct end_io_wq *end_io_wq;
-	int error;
 
 	end_io_wq = container_of(work, struct end_io_wq, work);
-	bio = end_io_wq->bio;
-
-	error = end_io_wq->error;
-	bio->bi_private = end_io_wq->private;
-	bio->bi_end_io = end_io_wq->end_io;
-	kfree(end_io_wq);
-	bio_endio_nodec(bio, error);
+	do_end_workqueue_fn(end_io_wq);
 }
 
 static int cleaner_kthread(void *arg)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ce..4fde7a0 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -30,6 +30,7 @@  enum {
 	BTRFS_WQ_ENDIO_METADATA = 1,
 	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
 	BTRFS_WQ_ENDIO_RAID56 = 3,
+	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8082220..31600ef 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1959,7 +1959,7 @@  static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 		SetPageUptodate(page);
 }
 
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
 	int ret;
 	int err = 0;
@@ -2078,8 +2078,8 @@  int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-static int clean_io_failure(struct inode *inode, u64 start,
-			    struct page *page, unsigned int pg_offset)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+		     unsigned int pg_offset)
 {
 	u64 private;
 	u64 private_failure;
@@ -2288,7 +2288,7 @@  int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
 				    struct io_failure_record *failrec,
 				    struct page *page, int pg_offset, int icsum,
-				    bio_end_io_t *endio_func)
+				    bio_end_io_t *endio_func, void *data)
 {
 	struct bio *bio;
 	struct btrfs_io_bio *btrfs_failed_bio;
@@ -2302,6 +2302,7 @@  struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
 	bio->bi_iter.bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_iter.bi_size = 0;
+	bio->bi_private = data;
 
 	btrfs_failed_bio = btrfs_io_bio(failed_bio);
 	if (btrfs_failed_bio->csum) {
@@ -2359,7 +2360,8 @@  static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	phy_offset >>= inode->i_sb->s_blocksize_bits;
 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
 				      start - page_offset(page),
-				      (int)phy_offset, failed_bio->bi_end_io);
+				      (int)phy_offset, failed_bio->bi_end_io,
+				      NULL);
 	if (!bio) {
 		free_io_failure(inode, failrec);
 		return -EIO;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7662eaa..b23c7c2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -344,6 +344,8 @@  struct btrfs_fs_info;
 int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 		      struct page *page, unsigned int pg_offset,
 		      int mirror_num);
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+		     unsigned int pg_offset);
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
@@ -374,7 +376,8 @@  int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
 				    struct io_failure_record *failrec,
 				    struct page *page, int pg_offset, int icsum,
-				    bio_end_io_t *endio_func);
+				    bio_end_io_t *endio_func, void *data);
+int free_io_failure(struct inode *inode, struct io_failure_record *rec);
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 noinline u64 find_lock_delalloc_range(struct inode *inode,
 				      struct extent_io_tree *tree,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e95a2b..e087189 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7083,30 +7083,267 @@  unlock_err:
 	return ret;
 }
 
-static int btrfs_subio_endio_read(struct inode *inode,
-				  struct btrfs_io_bio *io_bio)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+					int rw, int mirror_num)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	BUG_ON(rw & REQ_WRITE);
+
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+				  BTRFS_WQ_ENDIO_DIO_REPAIR);
+	if (ret)
+		goto err;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+	bio_put(bio);
+	return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+				      struct bio *failed_bio,
+				      struct io_failure_record *failrec,
+				      int failed_mirror)
+{
+	int num_copies;
+
+	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+				      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	failrec->failed_mirror = failed_mirror;
+	failrec->this_mirror++;
+	if (failrec->this_mirror == failed_mirror)
+		failrec->this_mirror++;
+
+	if (failrec->this_mirror > num_copies) {
+		pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+			  struct page *page, u64 start, u64 end,
+			  int failed_mirror, bio_end_io_t *repair_endio,
+			  void *repair_arg)
+{
+	struct io_failure_record *failrec;
+	struct bio *bio;
+	int isector;
+	int read_mode;
+	int ret;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+	if (ret)
+		return ret;
+
+	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+					 failed_mirror);
+	if (!ret) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	if (failed_bio->bi_vcnt > 1)
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	else
+		read_mode = READ_SYNC;
+
+	isector = start - btrfs_io_bio(failed_bio)->logical;
+	isector >>= inode->i_sb->s_blocksize_bits;
+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+				      0, isector, repair_endio, repair_arg);
+	if (!bio) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	btrfs_debug(BTRFS_I(inode)->root->fs_info,
+		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+		    read_mode, failrec->this_mirror, failrec->in_validation);
+
+	ret = submit_dio_repair_bio(inode, bio, read_mode,
+				    failrec->this_mirror);
+	if (ret) {
+		free_io_failure(inode, failrec);
+		bio_put(bio);
+	}
+
+	return ret;
+}
+
+struct btrfs_retry_complete {
+	struct completion done;
+	struct inode *inode;
+	u64 start;
+	int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+	struct btrfs_retry_complete *done = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	if (err)
+		goto end;
+
+	done->uptodate = 1;
+	bio_for_each_segment_all(bvec, bio, i)
+		clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+	complete(&done->done);
+	bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+				       struct btrfs_io_bio *io_bio)
 {
 	struct bio_vec *bvec;
+	struct btrfs_retry_complete done;
 	u64 start;
 	int i;
 	int ret;
-	int err = 0;
 
-	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-		return 0;
+	start = io_bio->logical;
+	done.inode = inode;
+
+	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+		done.uptodate = 0;
+		done.start = start;
+		init_completion(&done.done);
+
+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+				     start + bvec->bv_len - 1,
+				     io_bio->mirror_num,
+				     btrfs_retry_endio_nocsum, &done);
+		if (ret)
+			return ret;
+
+		wait_for_completion(&done.done);
+
+		if (!done.uptodate) {
+			/* We might have another mirror, so try again */
+			goto try_again;
+		}
+
+		start += bvec->bv_len;
+	}
+
+	return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+	struct btrfs_retry_complete *done = bio->bi_private;
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct bio_vec *bvec;
+	int uptodate;
+	int ret;
+	int i;
+
+	if (err)
+		goto end;
+
+	uptodate = 1;
+	bio_for_each_segment_all(bvec, bio, i) {
+		ret = __readpage_endio_check(done->inode, io_bio, i,
+					     bvec->bv_page, 0,
+					     done->start, bvec->bv_len);
+		if (!ret)
+			clean_io_failure(done->inode, done->start,
+					 bvec->bv_page, 0);
+		else
+			uptodate = 0;
+	}
+
+	done->uptodate = uptodate;
+end:
+	complete(&done->done);
+	bio_put(bio);
+}
 
+static int __btrfs_subio_endio_read(struct inode *inode,
+				    struct btrfs_io_bio *io_bio, int err)
+{
+	struct bio_vec *bvec;
+	struct btrfs_retry_complete done;
+	u64 start;
+	u64 offset = 0;
+	int i;
+	int ret;
+
+	err = 0;
 	start = io_bio->logical;
+	done.inode = inode;
+
 	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
 		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
 					     0, start, bvec->bv_len);
-		if (ret)
-			err = -EIO;
+		if (likely(!ret))
+			goto next;
+try_again:
+		done.uptodate = 0;
+		done.start = start;
+		init_completion(&done.done);
+
+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+				     start + bvec->bv_len - 1,
+				     io_bio->mirror_num,
+				     btrfs_retry_endio, &done);
+		if (ret) {
+			err = ret;
+			goto next;
+		}
+
+		wait_for_completion(&done.done);
+
+		if (!done.uptodate) {
+			/* We might have another mirror, so try again */
+			goto try_again;
+		}
+next:
+		offset += bvec->bv_len;
 		start += bvec->bv_len;
 	}
 
 	return err;
 }
 
+static int btrfs_subio_endio_read(struct inode *inode,
+				  struct btrfs_io_bio *io_bio, int err)
+{
+	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	if (skip_csum) {
+		if (unlikely(err))
+			return __btrfs_correct_data_nocsum(inode, io_bio);
+		else
+			return 0;
+	} else {
+		return __btrfs_subio_endio_read(inode, io_bio, err);
+	}
+}
+
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
@@ -7114,8 +7351,8 @@  static void btrfs_endio_direct_read(struct bio *bio, int err)
 	struct bio *dio_bio;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 
-	if (!err && (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED))
-		err = btrfs_subio_endio_read(inode, io_bio);
+	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+		err = btrfs_subio_endio_read(inode, io_bio, err);
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
 		      dip->logical_offset + dip->bytes - 1);
@@ -7193,19 +7430,16 @@  static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
 static void btrfs_end_dio_bio(struct bio *bio, int err)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
-	int ret;
 
-	if (err) {
-		btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-			  "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
-		      btrfs_ino(dip->inode), bio->bi_rw,
-		      (unsigned long long)bio->bi_iter.bi_sector,
-		      bio->bi_iter.bi_size, err);
-	} else if (dip->subio_endio) {
-		ret = dip->subio_endio(dip->inode, btrfs_io_bio(bio));
-		if (ret)
-			err = ret;
-	}
+	if (err)
+		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+			   "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+			   btrfs_ino(dip->inode), bio->bi_rw,
+			   (unsigned long long)bio->bi_iter.bi_sector,
+			   bio->bi_iter.bi_size, err);
+
+	if (dip->subio_endio)
+		err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
 
 	if (err) {
 		dip->errors = 1;