[v4,2/5] btrfs: use BIOs instead of buffer_heads from superblock writeout
diff mbox series

Message ID 20200205143831.13959-3-johannes.thumshirn@wdc.com
State New
Headers show
Series
  • btrfs: remove buffer heads form superblock handling
Related show

Commit Message

Johannes Thumshirn Feb. 5, 2020, 2:38 p.m. UTC
Similar to the superblock read path, change the write path to using BIOs
and pages instead of buffer_heads. This allows us to skip over the
buffer_head code, for writing the superblock to disk.

This is based on a patch originally authored by Nikolay Borisov.

Co-developed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>

---
Changes to v2:
- Don't use bi_set_op_attrs() (David)

Changes to v1:
- Remove left-over buffer_head.h include (David)
---
 fs/btrfs/disk-io.c | 117 +++++++++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 47 deletions(-)

Comments

hch@infradead.org Feb. 5, 2020, 6:16 p.m. UTC | #1
On Wed, Feb 05, 2020 at 11:38:28PM +0900, Johannes Thumshirn wrote:
> +static void btrfs_end_super_write(struct bio *bio)
>  {
> +	struct btrfs_device *device = bio->bi_private;
> +	struct bio_vec *bvec;
> +	struct bvec_iter_all iter_all;
> +	struct page *page;
> +
> +	bio_for_each_segment_all(bvec, bio, iter_all) {
> +		page = bvec->bv_page;
> +
> +		if (blk_status_to_errno(bio->bi_status)) {

Nit: this could simply check bio->bi_status without a conversion.

> +			btrfs_warn_rl_in_rcu(device->fs_info,
> +					     "lost page write due to IO error on %s",
> +					     rcu_str_deref(device->name));

But maybe you want to print the error here?

> +	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;

Same comment on the ask as in the previous patch.

> +		u8 *ptr;

I'd use a typed pointer here again..

> +		ptr = kmap(page);
> +		memcpy(ptr, sb, BTRFS_SUPER_INFO_SIZE);

With which you could do a struct assignment here and very slightly
improve type safety.

> @@ -3497,9 +3506,23 @@ static int write_dev_supers(struct btrfs_device *device,
>  		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
>  		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
>  			op_flags |= REQ_FUA;

Question on the existing code:  why is it safe to not use FUA for the
subsequent superblocks?

> +
> +		/*
> +		 * Directly use BIOs here instead of relying on the page-cache
> +		 * to do I/O, so we don't loose the ability to do integrity
> +		 * checking.
> +		 */
> +		bio = bio_alloc(gfp_mask, 1);
> +		bio_set_dev(bio, device->bdev);
> +		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +		bio->bi_private = device;
> +		bio->bi_end_io = btrfs_end_super_write;
> +		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
> +			     offset_in_page(bytenr));

Missing return value check.  But given that it is a single page and
can't error out please switch to __bio_add_page here.

> +		bio->bi_opf = REQ_OP_WRITE | op_flags;

You could kill the op_flags variable and just assign everything directly
to bio->bi_opf.
Johannes Thumshirn Feb. 6, 2020, 8:20 a.m. UTC | #2
On 05/02/2020 19:16, Christoph Hellwig wrote:
> On Wed, Feb 05, 2020 at 11:38:28PM +0900, Johannes Thumshirn wrote:
>> +static void btrfs_end_super_write(struct bio *bio)
>>   {
>> +	struct btrfs_device *device = bio->bi_private;
>> +	struct bio_vec *bvec;
>> +	struct bvec_iter_all iter_all;
>> +	struct page *page;
>> +
>> +	bio_for_each_segment_all(bvec, bio, iter_all) {
>> +		page = bvec->bv_page;
>> +
>> +		if (blk_status_to_errno(bio->bi_status)) {
> 
> Nit: this could simply check bio->bi_status without a conversion.
> 
>> +			btrfs_warn_rl_in_rcu(device->fs_info,
>> +					     "lost page write due to IO error on %s",
>> +					     rcu_str_deref(device->name));
> 
> But maybe you want to print the error here?
> 
>> +	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
> 
> Same comment on the ask as in the previous patch.
> 
>> +		u8 *ptr;
> 
> I'd use a typed pointer here again..
> 
>> +		ptr = kmap(page);
>> +		memcpy(ptr, sb, BTRFS_SUPER_INFO_SIZE);
> 
> With which you could do a struct assignment here and very slightly
> improve type safety.
> 
>> @@ -3497,9 +3506,23 @@ static int write_dev_supers(struct btrfs_device *device,
>>   		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
>>   		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
>>   			op_flags |= REQ_FUA;
> 
> Question on the existing code:  why is it safe to not use FUA for the
> subsequent superblocks?
> 
>> +
>> +		/*
>> +		 * Directly use BIOs here instead of relying on the page-cache
>> +		 * to do I/O, so we don't loose the ability to do integrity
>> +		 * checking.
>> +		 */
>> +		bio = bio_alloc(gfp_mask, 1);
>> +		bio_set_dev(bio, device->bdev);
>> +		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
>> +		bio->bi_private = device;
>> +		bio->bi_end_io = btrfs_end_super_write;
>> +		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
>> +			     offset_in_page(bytenr));
> 
> Missing return value check.  But given that it is a single page and
> can't error out please switch to __bio_add_page here.

Good question, I guess it's saver to always FUA the SBs

> 
>> +		bio->bi_opf = REQ_OP_WRITE | op_flags;
> 
> You could kill the op_flags variable and just assign everything directly
> to bio->bi_opf.
> 

Then I'd still have to deal with the NOBARRIER mount option here so 
op_flags is nice to have for readability.
hch@infradead.org Feb. 6, 2020, 2:59 p.m. UTC | #3
On Thu, Feb 06, 2020 at 08:20:16AM +0000, Johannes Thumshirn wrote:
> >>   		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
> >>   		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
> >>   			op_flags |= REQ_FUA;
> > 
> >> +		bio->bi_opf = REQ_OP_WRITE | op_flags;
> > 
> > You could kill the op_flags variable and just assign everything directly
> > to bio->bi_opf.
> > 
> 
> Then I'd still have to deal with the NOBARRIER mount option here so 
> op_flags is nice to have for readability.

I tend to disagree.  The simple version is:

		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
			bio->bi_opf |= REQ_FUA;
Johannes Thumshirn Feb. 6, 2020, 3:18 p.m. UTC | #4
On 06/02/2020 15:59, Christoph Hellwig wrote:
> I tend to disagree.  The simple version is:
> 
> 		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
> 		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
> 			bio->bi_opf |= REQ_FUA;
> 

Well let's use that one then.
David Sterba Feb. 7, 2020, 4:08 p.m. UTC | #5
On Thu, Feb 06, 2020 at 08:20:16AM +0000, Johannes Thumshirn wrote:
> >> @@ -3497,9 +3506,23 @@ static int write_dev_supers(struct btrfs_device *device,
> >>   		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
> >>   		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
> >>   			op_flags |= REQ_FUA;
> > 
> > Question on the existing code:  why is it safe to not use FUA for the
> > subsequent superblocks?
> > 
> >> +
> >>C +		/*
> >> +		 * Directly use BIOs here instead of relying on the page-cache
> >> +		 * to do I/O, so we don't loose the ability to do integrity
> >> +		 * checking.
> >> +		 */
> >> +		bio = bio_alloc(gfp_mask, 1);
> >> +		bio_set_dev(bio, device->bdev);
> >> +		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> >> +		bio->bi_private = device;
> >> +		bio->bi_end_io = btrfs_end_super_write;
> >> +		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
> >> +			     offset_in_page(bytenr));
> > 
> > Missing return value check.  But given that it is a single page and
> > can't error out please switch to __bio_add_page here.
> IR
> Good question, I guess it's saver to always FUA the SBs

That is a performance optimization IIRC, only the primary superblock
does FUA the backup superblocks don't as this would add 2 more flushes
that are considered expensive.

The trade-off is optimistic because the backup superblocks are almost
never necessary. For the common power-fail situation primary will be
there or not atomically, the non-FUA writes of secondary superblocks
will be perhaps delayed a bit. The scenario where the primary sb is
unexpectedly damaged would have to happen in the short window between
primary FUA and backup writes, so the current version of sb is not
available. Something like that:

  write primary sb
1 FUA

  write backup copy 1
  other writes
  write backup copy 2
  other writes
2 FUA (or equvalent flushing the copies to device)

The window is between 1 and 2, and if some divine force kills primary
sb, the backup copies are not permanently stored yet. Which makes
recovery of the last transaction tricky, but there are still the backup
superblocks with previous intact version.

With FUA after each backup, the window would be shortened, with only 2
blocks written, allowing to access the latest transaction, or possibly
the previous one too given where exactly the write sequence is
interrupted.

The above describes possible scenario but I consider it quite rare to
hit in practice, also it depends on the device that should not just skip
writes or FUAs. So the performance optimization is IMO justified.

Patch
diff mbox series

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bc14ef1aadda..f5343a35ac2f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -7,7 +7,6 @@ 
 #include <linux/blkdev.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
@@ -3341,25 +3340,33 @@  int __cold open_ctree(struct super_block *sb,
 }
 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
 
-static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+static void btrfs_end_super_write(struct bio *bio)
 {
-	if (uptodate) {
-		set_buffer_uptodate(bh);
-	} else {
-		struct btrfs_device *device = (struct btrfs_device *)
-			bh->b_private;
-
-		btrfs_warn_rl_in_rcu(device->fs_info,
-				"lost page write due to IO error on %s",
-					  rcu_str_deref(device->name));
-		/* note, we don't set_buffer_write_io_error because we have
-		 * our own ways of dealing with the IO errors
-		 */
-		clear_buffer_uptodate(bh);
-		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+	struct btrfs_device *device = bio->bi_private;
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+	struct page *page;
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		page = bvec->bv_page;
+
+		if (blk_status_to_errno(bio->bi_status)) {
+			btrfs_warn_rl_in_rcu(device->fs_info,
+					     "lost page write due to IO error on %s",
+					     rcu_str_deref(device->name));
+			ClearPageUptodate(page);
+			SetPageError(page);
+			btrfs_dev_stat_inc_and_print(device,
+						     BTRFS_DEV_STAT_WRITE_ERRS);
+		} else {
+			SetPageUptodate(page);
+		}
+
+		put_page(page);
+		unlock_page(page);
 	}
-	unlock_buffer(bh);
-	put_bh(bh);
+
+	bio_put(bio);
 }
 
 int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
@@ -3437,16 +3444,16 @@  int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
  * the expected device size at commit time. Note that max_mirrors must be
  * same for write and wait phases.
  *
- * Return number of errors when buffer head is not found or submission fails.
+ * Return number of errors when page is not found or submission fails.
  */
 static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb, int max_mirrors)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
+	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-	struct buffer_head *bh;
+	gfp_t gfp_mask;
 	int i;
-	int ret;
 	int errors = 0;
 	u64 bytenr;
 	int op_flags;
@@ -3456,7 +3463,13 @@  static int write_dev_supers(struct btrfs_device *device,
 
 	shash->tfm = fs_info->csum_shash;
 
+	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
+
 	for (i = 0; i < max_mirrors; i++) {
+		struct page *page;
+		struct bio *bio;
+		u8 *ptr;
+
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 		    device->commit_total_bytes)
@@ -3469,26 +3482,22 @@  static int write_dev_supers(struct btrfs_device *device,
 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
 		crypto_shash_final(shash, sb->csum);
 
-		/* One reference for us, and we leave it for the caller */
-		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
-			      BTRFS_SUPER_INFO_SIZE);
-		if (!bh) {
+		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
+					   gfp_mask);
+		if (!page) {
 			btrfs_err(device->fs_info,
-			    "couldn't get super buffer head for bytenr %llu",
+			    "couldn't get superblock page for bytenr %llu",
 			    bytenr);
 			errors++;
 			continue;
 		}
 
-		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+		/* Bump the refcount for wait_dev_supers() */
+		get_page(page);
 
-		/* one reference for submit_bh */
-		get_bh(bh);
-
-		set_buffer_uptodate(bh);
-		lock_buffer(bh);
-		bh->b_end_io = btrfs_end_buffer_write_sync;
-		bh->b_private = device;
+		ptr = kmap(page);
+		memcpy(ptr, sb, BTRFS_SUPER_INFO_SIZE);
+		kunmap(page);
 
 		/*
 		 * we fua the first super.  The others we allow
@@ -3497,9 +3506,23 @@  static int write_dev_supers(struct btrfs_device *device,
 		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
 		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
 			op_flags |= REQ_FUA;
-		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
-		if (ret)
-			errors++;
+
+		/*
+		 * Directly use BIOs here instead of relying on the page-cache
+		 * to do I/O, so we don't loose the ability to do integrity
+		 * checking.
+		 */
+		bio = bio_alloc(gfp_mask, 1);
+		bio_set_dev(bio, device->bdev);
+		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+		bio->bi_private = device;
+		bio->bi_end_io = btrfs_end_super_write;
+		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
+			     offset_in_page(bytenr));
+
+		bio->bi_opf = REQ_OP_WRITE | op_flags;
+
+		btrfsic_submit_bio(bio);
 	}
 	return errors < i ? 0 : -1;
 }
@@ -3508,12 +3531,11 @@  static int write_dev_supers(struct btrfs_device *device,
  * Wait for write completion of superblocks done by write_dev_supers,
  * @max_mirrors same for write and wait phases.
  *
- * Return number of errors when buffer head is not found or not marked up to
+ * Return number of errors when page is not found or not marked up to
  * date.
  */
 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 {
-	struct buffer_head *bh;
 	int i;
 	int errors = 0;
 	bool primary_failed = false;
@@ -3523,32 +3545,33 @@  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
 	for (i = 0; i < max_mirrors; i++) {
+		struct page *page;
+
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 		    device->commit_total_bytes)
 			break;
 
-		bh = __find_get_block(device->bdev,
-				      bytenr / BTRFS_BDEV_BLOCKSIZE,
-				      BTRFS_SUPER_INFO_SIZE);
-		if (!bh) {
+		page = find_get_page(device->bdev->bd_inode->i_mapping,
+				     bytenr >> PAGE_SHIFT);
+		if (!page) {
 			errors++;
 			if (i == 0)
 				primary_failed = true;
 			continue;
 		}
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
+		wait_on_page_locked(page);
+		if (PageError(page)) {
 			errors++;
 			if (i == 0)
 				primary_failed = true;
 		}
 
 		/* drop our reference */
-		brelse(bh);
+		put_page(page);
 
 		/* drop the reference from the writing run */
-		brelse(bh);
+		put_page(page);
 	}
 
 	/* log error, force error return */