diff mbox

[v4,1/7] btrfs: use blkdev_issue_flush to flush the device cache

Message ID 20170406032253.14631-2-anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Anand Jain April 6, 2017, 3:22 a.m. UTC
As of now we do alloc an empty bio and then use the flag REQ_PREFLUSH
to flush the device cache, instead we can use blkdev_issue_flush()
for this puspose.

Also now no need to check the return when write_dev_flush() is called
with wait = 0

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
v2: Title of this patch is changed from
   btrfs: communicate back ENOMEM when it occurs
  And its entirely a new patch, which now use blkdev_issue_flush()
v3: no change
v4: no change

 fs/btrfs/disk-io.c | 64 +++++++++++++++---------------------------------------
 fs/btrfs/volumes.h |  3 ++-
 2 files changed, 19 insertions(+), 48 deletions(-)

Comments

Liu Bo April 13, 2017, 6:41 p.m. UTC | #1
On Thu, Apr 06, 2017 at 11:22:47AM +0800, Anand Jain wrote:
> As of now we do alloc an empty bio and then use the flag REQ_PREFLUSH
> to flush the device cache, instead we can use blkdev_issue_flush()
> for this puspose.
> 
> Also now no need to check the return when write_dev_flush() is called
> with wait = 0
> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> ---
> v2: Title of this patch is changed from
>    btrfs: communicate back ENOMEM when it occurs
>   And its entirely a new patch, which now use blkdev_issue_flush()
> v3: no change
> v4: no change
> 
>  fs/btrfs/disk-io.c | 64 +++++++++++++++---------------------------------------
>  fs/btrfs/volumes.h |  3 ++-
>  2 files changed, 19 insertions(+), 48 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 08b74daf35d0..08cbbee228ee 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3498,70 +3498,42 @@ static int write_dev_supers(struct btrfs_device *device,
>  	return errors < i ? 0 : -1;
>  }
>  
> -/*
> - * endio for the write_dev_flush, this will wake anyone waiting
> - * for the barrier when it is done
> - */
> -static void btrfs_end_empty_barrier(struct bio *bio)
> +static void btrfs_dev_issue_flush(struct work_struct *work)
>  {
> -	if (bio->bi_private)
> -		complete(bio->bi_private);
> -	bio_put(bio);
> +	int ret;
> +	struct btrfs_device *device;
> +
> +	device = container_of(work, struct btrfs_device, flush_work);
> +
> +	/* we are in the commit thread */

What is the above comment trying to explain?

Others look good.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

Thanks,

-liubo
> +	ret = blkdev_issue_flush(device->bdev, GFP_NOFS, NULL);
> +	device->last_flush_error = ret;
> +	complete(&device->flush_wait);
>  }
>  
>  /*
>   * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
>   * sent down.  With wait == 1, it waits for the previous flush.
> - *
> - * any device where the flush fails with eopnotsupp are flagged as not-barrier
> - * capable
>   */
>  static int write_dev_flush(struct btrfs_device *device, int wait)
>  {
> -	struct bio *bio;
> -	int ret = 0;
> -
>  	if (device->nobarriers)
>  		return 0;
>  
>  	if (wait) {
> -		bio = device->flush_bio;
> -		if (!bio)
> -			return 0;
> +		int ret;
>  
>  		wait_for_completion(&device->flush_wait);
> -
> -		if (bio->bi_error) {
> -			ret = bio->bi_error;
> +		ret = device->last_flush_error;
> +		if (ret)
>  			btrfs_dev_stat_inc_and_print(device,
> -				BTRFS_DEV_STAT_FLUSH_ERRS);
> -		}
> -
> -		/* drop the reference from the wait == 0 run */
> -		bio_put(bio);
> -		device->flush_bio = NULL;
> -
> +					BTRFS_DEV_STAT_FLUSH_ERRS);
>  		return ret;
>  	}
>  
> -	/*
> -	 * one reference for us, and we leave it for the
> -	 * caller
> -	 */
> -	device->flush_bio = NULL;
> -	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
> -	if (!bio)
> -		return -ENOMEM;
> -
> -	bio->bi_end_io = btrfs_end_empty_barrier;
> -	bio->bi_bdev = device->bdev;
> -	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
>  	init_completion(&device->flush_wait);
> -	bio->bi_private = &device->flush_wait;
> -	device->flush_bio = bio;
> -
> -	bio_get(bio);
> -	btrfsic_submit_bio(bio);
> +	INIT_WORK(&device->flush_work, btrfs_dev_issue_flush);
> +	schedule_work(&device->flush_work);
>  
>  	return 0;
>  }
> @@ -3590,9 +3562,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
>  		if (!dev->in_fs_metadata || !dev->writeable)
>  			continue;
>  
> -		ret = write_dev_flush(dev, 0);
> -		if (ret)
> -			errors_send++;
> +		write_dev_flush(dev, 0);
>  	}
>  
>  	/* wait for all the barriers */
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 59be81206dd7..0df50bc65578 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -124,8 +124,9 @@ struct btrfs_device {
>  
>  	/* for sending down flush barriers */
>  	int nobarriers;
> -	struct bio *flush_bio;
>  	struct completion flush_wait;
> +	struct work_struct flush_work;
> +	int last_flush_error;
>  
>  	/* per-device scrub information */
>  	struct scrub_ctx *scrub_device;
> -- 
> 2.10.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba April 18, 2017, 1:54 p.m. UTC | #2
On Thu, Apr 06, 2017 at 11:22:47AM +0800, Anand Jain wrote:
> As of now we do alloc an empty bio and then use the flag REQ_PREFLUSH
> to flush the device cache, instead we can use blkdev_issue_flush()
> for this puspose.

This would change the scheduling characteristics. Right now, the caller
thread submits all bios from one thread, lets block layer do it's work,
and then in the same thread wait for each of the submitted bios.

In your code, the btrfs thread prepares tasks for each bio, shifts the
work to the global workqueue (schedule_work) and then it's same as
before.

I'm concerned about using the global queue. As the bio submission jobs
could get blocked by some other unrelated task queued there, the
guarantees depend on the forward progress of the tasks scheduled
(plus block layer processing).

In the current behaviour, the guarantees stand on the block layer only.

We could introduce yet another work queue and submit the bios there,
with possible fine tuning of the flags, like priority or emergency etc.
But that sounds like unnecessary work as we can simply keep the code
as-is and get the same end result.

Regarding other patches, some of them are independent so I'll see what
can be merged now regardless of the above comments.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anand Jain April 19, 2017, 4:29 a.m. UTC | #3
On 04/14/2017 02:41 AM, Liu Bo wrote:
> On Thu, Apr 06, 2017 at 11:22:47AM +0800, Anand Jain wrote:
>> As of now we do alloc an empty bio and then use the flag REQ_PREFLUSH
>> to flush the device cache, instead we can use blkdev_issue_flush()
>> for this puspose.
>>
>> Also now no need to check the return when write_dev_flush() is called
>> with wait = 0
>>
>> Signed-off-by: Anand Jain <anand.jain@oracle.com>
>> ---
>> v2: Title of this patch is changed from
>>    btrfs: communicate back ENOMEM when it occurs
>>   And its entirely a new patch, which now use blkdev_issue_flush()
>> v3: no change
>> v4: no change
>>
>>  fs/btrfs/disk-io.c | 64 +++++++++++++++---------------------------------------
>>  fs/btrfs/volumes.h |  3 ++-
>>  2 files changed, 19 insertions(+), 48 deletions(-)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 08b74daf35d0..08cbbee228ee 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -3498,70 +3498,42 @@ static int write_dev_supers(struct btrfs_device *device,
>>  	return errors < i ? 0 : -1;
>>  }
>>
>> -/*
>> - * endio for the write_dev_flush, this will wake anyone waiting
>> - * for the barrier when it is done
>> - */
>> -static void btrfs_end_empty_barrier(struct bio *bio)
>> +static void btrfs_dev_issue_flush(struct work_struct *work)
>>  {
>> -	if (bio->bi_private)
>> -		complete(bio->bi_private);
>> -	bio_put(bio);
>> +	int ret;
>> +	struct btrfs_device *device;
>> +
>> +	device = container_of(work, struct btrfs_device, flush_work);
>> +
>> +	/* we are in the commit thread */
>
> What is the above comment trying to explain?

  I had GFP_NOFS to justify when I wrote that.

Thanks, Anand


> Others look good.
>
> Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
>
> Thanks,
>
> -liubo
>> +	ret = blkdev_issue_flush(device->bdev, GFP_NOFS, NULL);
>> +	device->last_flush_error = ret;
>> +	complete(&device->flush_wait);
>>  }
>>
>>  /*
>>   * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
>>   * sent down.  With wait == 1, it waits for the previous flush.
>> - *
>> - * any device where the flush fails with eopnotsupp are flagged as not-barrier
>> - * capable
>>   */
>>  static int write_dev_flush(struct btrfs_device *device, int wait)
>>  {
>> -	struct bio *bio;
>> -	int ret = 0;
>> -
>>  	if (device->nobarriers)
>>  		return 0;
>>
>>  	if (wait) {
>> -		bio = device->flush_bio;
>> -		if (!bio)
>> -			return 0;
>> +		int ret;
>>
>>  		wait_for_completion(&device->flush_wait);
>> -
>> -		if (bio->bi_error) {
>> -			ret = bio->bi_error;
>> +		ret = device->last_flush_error;
>> +		if (ret)
>>  			btrfs_dev_stat_inc_and_print(device,
>> -				BTRFS_DEV_STAT_FLUSH_ERRS);
>> -		}
>> -
>> -		/* drop the reference from the wait == 0 run */
>> -		bio_put(bio);
>> -		device->flush_bio = NULL;
>> -
>> +					BTRFS_DEV_STAT_FLUSH_ERRS);
>>  		return ret;
>>  	}
>>
>> -	/*
>> -	 * one reference for us, and we leave it for the
>> -	 * caller
>> -	 */
>> -	device->flush_bio = NULL;
>> -	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
>> -	if (!bio)
>> -		return -ENOMEM;
>> -
>> -	bio->bi_end_io = btrfs_end_empty_barrier;
>> -	bio->bi_bdev = device->bdev;
>> -	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
>>  	init_completion(&device->flush_wait);
>> -	bio->bi_private = &device->flush_wait;
>> -	device->flush_bio = bio;
>> -
>> -	bio_get(bio);
>> -	btrfsic_submit_bio(bio);
>> +	INIT_WORK(&device->flush_work, btrfs_dev_issue_flush);
>> +	schedule_work(&device->flush_work);
>>
>>  	return 0;
>>  }
>> @@ -3590,9 +3562,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
>>  		if (!dev->in_fs_metadata || !dev->writeable)
>>  			continue;
>>
>> -		ret = write_dev_flush(dev, 0);
>> -		if (ret)
>> -			errors_send++;
>> +		write_dev_flush(dev, 0);
>>  	}
>>
>>  	/* wait for all the barriers */
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 59be81206dd7..0df50bc65578 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -124,8 +124,9 @@ struct btrfs_device {
>>
>>  	/* for sending down flush barriers */
>>  	int nobarriers;
>> -	struct bio *flush_bio;
>>  	struct completion flush_wait;
>> +	struct work_struct flush_work;
>> +	int last_flush_error;
>>
>>  	/* per-device scrub information */
>>  	struct scrub_ctx *scrub_device;
>> --
>> 2.10.0
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anand Jain April 19, 2017, 4:29 a.m. UTC | #4
On 04/18/2017 09:54 PM, David Sterba wrote:
> On Thu, Apr 06, 2017 at 11:22:47AM +0800, Anand Jain wrote:
>> As of now we do alloc an empty bio and then use the flag REQ_PREFLUSH
>> to flush the device cache, instead we can use blkdev_issue_flush()
>> for this puspose.
>
> This would change the scheduling characteristics. Right now, the caller
> thread submits all bios from one thread, lets block layer do it's work,
> and then in the same thread wait for each of the submitted bios.

> In your code, the btrfs thread prepares tasks for each bio, shifts the
> work to the global workqueue (schedule_work) and then it's same as
> before.

> I'm concerned about using the global queue. As the bio submission jobs
> could get blocked by some other unrelated task queued there, the
> guarantees depend on the forward progress of the tasks scheduled
> (plus block layer processing).

> In the current behaviour, the guarantees stand on the block layer only.
>
> We could introduce yet another work queue and submit the bios there,
> with possible fine tuning of the flags, like priority or emergency etc.
> But that sounds like unnecessary work as we can simply keep the code
> as-is and get the same end result.

  I had to schedule the flush to request flush for all the devices in
  parallel. For the obvious reason that flush may take a lot of time
  for the devices which aren't missing. blkdev_issue_flush() uses
  submit_bio_wait() which makes sense for the non-volume-based-FS.
  And there isn't something like blkdev_issue_flush_no_wait(). Also
  now I see that there isn't the btrfsic part in the patch.
  To fix this I think pre-alloc-ed bio for the flush is a good idea
  (as you suggested earlier). Further as the commit thread doesn't
  overlap, and barrier device happens only at the commit so there
  won't be concurrent demand for the pre-alloc-ed bio.
  Also pre-alloc-ed bio can be alloc-ed only when barrier is enabled
  as a mem optimization. Will send a RFC code for the comments with
  these changes.

> Regarding other patches, some of them are independent so I'll see what
> can be merged now regardless of the above comments.

Thanks, Anand


> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Anand Jain April 25, 2017, 9:25 a.m. UTC | #5
David,

  Based on the comments received, I have to pull back patch 1/7 to 3/7 
and instead have replaced them with a patch as below.

     [patch] btrfs: add framework to handle device flush error as a volume


Thanks, Anand


On 04/19/2017 12:29 PM, Anand Jain wrote:
>
>
> On 04/18/2017 09:54 PM, David Sterba wrote:
>> On Thu, Apr 06, 2017 at 11:22:47AM +0800, Anand Jain wrote:
>>> As of now we do alloc an empty bio and then use the flag REQ_PREFLUSH
>>> to flush the device cache, instead we can use blkdev_issue_flush()
>>> for this puspose.
>>
>> This would change the scheduling characteristics. Right now, the caller
>> thread submits all bios from one thread, lets block layer do it's work,
>> and then in the same thread wait for each of the submitted bios.
>
>> In your code, the btrfs thread prepares tasks for each bio, shifts the
>> work to the global workqueue (schedule_work) and then it's same as
>> before.
>
>> I'm concerned about using the global queue. As the bio submission jobs
>> could get blocked by some other unrelated task queued there, the
>> guarantees depend on the forward progress of the tasks scheduled
>> (plus block layer processing).
>
>> In the current behaviour, the guarantees stand on the block layer only.
>>
>> We could introduce yet another work queue and submit the bios there,
>> with possible fine tuning of the flags, like priority or emergency etc.
>> But that sounds like unnecessary work as we can simply keep the code
>> as-is and get the same end result.
>
>  I had to schedule the flush to request flush for all the devices in
>  parallel. For the obvious reason that flush may take a lot of time
>  for the devices which aren't missing. blkdev_issue_flush() uses
>  submit_bio_wait() which makes sense for the non-volume-based-FS.
>  And there isn't something like blkdev_issue_flush_no_wait(). Also
>  now I see that there isn't the btrfsic part in the patch.
>  To fix this I think pre-alloc-ed bio for the flush is a good idea
>  (as you suggested earlier). Further as the commit thread doesn't
>  overlap, and barrier device happens only at the commit so there
>  won't be concurrent demand for the pre-alloc-ed bio.
>  Also pre-alloc-ed bio can be alloc-ed only when barrier is enabled
>  as a mem optimization. Will send a RFC code for the comments with
>  these changes.


>> Regarding other patches, some of them are independent so I'll see what
>> can be merged now regardless of the above comments.
>
> Thanks, Anand
>
>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 08b74daf35d0..08cbbee228ee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3498,70 +3498,42 @@  static int write_dev_supers(struct btrfs_device *device,
 	return errors < i ? 0 : -1;
 }
 
-/*
- * endio for the write_dev_flush, this will wake anyone waiting
- * for the barrier when it is done
- */
-static void btrfs_end_empty_barrier(struct bio *bio)
+static void btrfs_dev_issue_flush(struct work_struct *work)
 {
-	if (bio->bi_private)
-		complete(bio->bi_private);
-	bio_put(bio);
+	int ret;
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, flush_work);
+
+	/* we are in the commit thread */
+	ret = blkdev_issue_flush(device->bdev, GFP_NOFS, NULL);
+	device->last_flush_error = ret;
+	complete(&device->flush_wait);
 }
 
 /*
  * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
  * sent down.  With wait == 1, it waits for the previous flush.
- *
- * any device where the flush fails with eopnotsupp are flagged as not-barrier
- * capable
  */
 static int write_dev_flush(struct btrfs_device *device, int wait)
 {
-	struct bio *bio;
-	int ret = 0;
-
 	if (device->nobarriers)
 		return 0;
 
 	if (wait) {
-		bio = device->flush_bio;
-		if (!bio)
-			return 0;
+		int ret;
 
 		wait_for_completion(&device->flush_wait);
-
-		if (bio->bi_error) {
-			ret = bio->bi_error;
+		ret = device->last_flush_error;
+		if (ret)
 			btrfs_dev_stat_inc_and_print(device,
-				BTRFS_DEV_STAT_FLUSH_ERRS);
-		}
-
-		/* drop the reference from the wait == 0 run */
-		bio_put(bio);
-		device->flush_bio = NULL;
-
+					BTRFS_DEV_STAT_FLUSH_ERRS);
 		return ret;
 	}
 
-	/*
-	 * one reference for us, and we leave it for the
-	 * caller
-	 */
-	device->flush_bio = NULL;
-	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
-	if (!bio)
-		return -ENOMEM;
-
-	bio->bi_end_io = btrfs_end_empty_barrier;
-	bio->bi_bdev = device->bdev;
-	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 	init_completion(&device->flush_wait);
-	bio->bi_private = &device->flush_wait;
-	device->flush_bio = bio;
-
-	bio_get(bio);
-	btrfsic_submit_bio(bio);
+	INIT_WORK(&device->flush_work, btrfs_dev_issue_flush);
+	schedule_work(&device->flush_work);
 
 	return 0;
 }
@@ -3590,9 +3562,7 @@  static int barrier_all_devices(struct btrfs_fs_info *info)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		ret = write_dev_flush(dev, 0);
-		if (ret)
-			errors_send++;
+		write_dev_flush(dev, 0);
 	}
 
 	/* wait for all the barriers */
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 59be81206dd7..0df50bc65578 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -124,8 +124,9 @@  struct btrfs_device {
 
 	/* for sending down flush barriers */
 	int nobarriers;
-	struct bio *flush_bio;
 	struct completion flush_wait;
+	struct work_struct flush_work;
+	int last_flush_error;
 
 	/* per-device scrub information */
 	struct scrub_ctx *scrub_device;