[2/5] btrfs: remove use of buffer_heads from superblock writeout
diff mbox series

Message ID 20200117125105.20989-3-johannes.thumshirn@wdc.com
State New
Headers show
Series
  • btrfs: remove buffer heads from superblock handling
Related show

Commit Message

Johannes Thumshirn Jan. 17, 2020, 12:51 p.m. UTC
Similar to the superblock read path, change the write path to using BIOs
and pages instead of buffer_heads.

This is based on a patch originally authored by Nikolay Borisov.

Co-developed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
 fs/btrfs/disk-io.c | 107 ++++++++++++++++++++++++++-------------------
 1 file changed, 61 insertions(+), 46 deletions(-)

Comments

David Sterba Jan. 17, 2020, 1:38 p.m. UTC | #1
On Fri, Jan 17, 2020 at 09:51:02PM +0900, Johannes Thumshirn wrote:
> Similar to the superblock read path, change the write path to using BIOs
> and pages instead of buffer_heads.
> 
> This is based on a patch originally authored by Nikolay Borisov.
> 
> Co-developed-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/disk-io.c | 107 ++++++++++++++++++++++++++-------------------
>  1 file changed, 61 insertions(+), 46 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 50c93ffe8d03..51e7b832c8fd 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3353,25 +3353,33 @@ int __cold open_ctree(struct super_block *sb,
>  }
>  ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
>  
> -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
> +static void btrfs_end_super_write(struct bio *bio)
>  {
> -	if (uptodate) {
> -		set_buffer_uptodate(bh);
> -	} else {
> -		struct btrfs_device *device = (struct btrfs_device *)
> -			bh->b_private;
> -
> -		btrfs_warn_rl_in_rcu(device->fs_info,
> -				"lost page write due to IO error on %s",
> -					  rcu_str_deref(device->name));
> -		/* note, we don't set_buffer_write_io_error because we have
> -		 * our own ways of dealing with the IO errors
> -		 */
> -		clear_buffer_uptodate(bh);
> -		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
> +	struct btrfs_device *device = bio->bi_private;
> +	struct bio_vec *bvec;
> +	struct bvec_iter_all iter_all;
> +	struct page *page;
> +
> +	bio_for_each_segment_all(bvec, bio, iter_all) {
> +		page = bvec->bv_page;
> +
> +		if (blk_status_to_errno(bio->bi_status)) {
> +			btrfs_warn_rl_in_rcu(device->fs_info,
> +					     "lost page write due to IO error on %s",
> +					     rcu_str_deref(device->name));
> +			ClearPageUptodate(page);
> +			SetPageError(page);
> +			btrfs_dev_stat_inc_and_print(device,
> +						     BTRFS_DEV_STAT_WRITE_ERRS);
> +		} else {
> +			SetPageUptodate(page);
> +		}
> +
> +		put_page(page);
> +		unlock_page(page);
>  	}
> -	unlock_buffer(bh);
> -	put_bh(bh);
> +
> +	bio_put(bio);
>  }
>  
>  int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> @@ -3462,16 +3470,15 @@ int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
>   * the expected device size at commit time. Note that max_mirrors must be
>   * same for write and wait phases.
>   *
> - * Return number of errors when buffer head is not found or submission fails.
> + * Return number of errors when page is not found or submission fails.
>   */
>  static int write_dev_supers(struct btrfs_device *device,
>  			    struct btrfs_super_block *sb, int max_mirrors)
>  {
>  	struct btrfs_fs_info *fs_info = device->fs_info;
>  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
> -	struct buffer_head *bh;
> +	gfp_t gfp_mask;
>  	int i;
> -	int ret;
>  	int errors = 0;
>  	u64 bytenr;
>  	int op_flags;
> @@ -3481,7 +3488,13 @@ static int write_dev_supers(struct btrfs_device *device,
>  
>  	shash->tfm = fs_info->csum_shash;
>  
> +	gfp_mask = mapping_gfp_constraint(device->bdev->bd_inode->i_mapping,
> +					  ~__GFP_FS) | __GFP_NOFAIL;
> +
>  	for (i = 0; i < max_mirrors; i++) {
> +		struct page *page;
> +		struct bio *bio;
> +
>  		bytenr = btrfs_sb_offset(i);
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
> @@ -3494,26 +3507,20 @@ static int write_dev_supers(struct btrfs_device *device,
>  				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
>  		crypto_shash_final(shash, sb->csum);
>  
> -		/* One reference for us, and we leave it for the caller */
> -		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
> -			      BTRFS_SUPER_INFO_SIZE);
> -		if (!bh) {
> +		page = find_or_create_page(device->bdev->bd_inode->i_mapping,
> +					   bytenr >> PAGE_SHIFT, gfp_mask);

This has NOFAIL again, but now we're in write_dev_supers, so it has
some implications regarding potentially unbounded waiting.

> +		if (!page) {
>  			btrfs_err(device->fs_info,
> -			    "couldn't get super buffer head for bytenr %llu",
> +			    "couldn't get superblock page for bytenr %llu",
>  			    bytenr);
>  			errors++;
>  			continue;
>  		}
>  
> -		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
> +		/* Bump the refcount for wait_dev_supers() */
> +		get_page(page);
>  
> -		/* one reference for submit_bh */
> -		get_bh(bh);
> -
> -		set_buffer_uptodate(bh);
> -		lock_buffer(bh);
> -		bh->b_end_io = btrfs_end_buffer_write_sync;
> -		bh->b_private = device;
> +		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
>  
>  		/*
>  		 * we fua the first super.  The others we allow
> @@ -3522,9 +3529,17 @@ static int write_dev_supers(struct btrfs_device *device,
>  		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
>  		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
>  			op_flags |= REQ_FUA;
> -		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
> -		if (ret)
> -			errors++;
> +
> +		bio = bio_alloc(gfp_mask, 1);

And allocating a new bio when we have to write the superblock is also
not very nice. This should do something like the device::flush_bio that
could be reused.

> +		bio_set_dev(bio, device->bdev);
> +		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +		bio->bi_private = device;
> +		bio->bi_end_io = btrfs_end_super_write;
> +		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
> +			     offset_in_page(bytenr));
> +
> +		bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
> +		btrfsic_submit_bio(bio);
>  	}
>  	return errors < i ? 0 : -1;
>  }
> @@ -3533,12 +3548,11 @@ static int write_dev_supers(struct btrfs_device *device,
>   * Wait for write completion of superblocks done by write_dev_supers,
>   * @max_mirrors same for write and wait phases.
>   *
> - * Return number of errors when buffer head is not found or not marked up to
> + * Return number of errors when page is not found or not marked up to
>   * date.
>   */
>  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  {
> -	struct buffer_head *bh;
>  	int i;
>  	int errors = 0;
>  	bool primary_failed = false;
> @@ -3548,32 +3562,33 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
>  
>  	for (i = 0; i < max_mirrors; i++) {
> +		struct page *page;
> +
>  		bytenr = btrfs_sb_offset(i);
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
>  			break;
>  
> -		bh = __find_get_block(device->bdev,
> -				      bytenr / BTRFS_BDEV_BLOCKSIZE,
> -				      BTRFS_SUPER_INFO_SIZE);
> -		if (!bh) {
> +		page = find_get_page(device->bdev->bd_inode->i_mapping,
> +				     bytenr >> PAGE_SHIFT);
> +		if (!page) {
>  			errors++;
>  			if (i == 0)
>  				primary_failed = true;
>  			continue;
>  		}
> -		wait_on_buffer(bh);
> -		if (!buffer_uptodate(bh)) {
> +		wait_on_page_locked(page);

What locks the page?

> +		if (PageError(page)) {
>  			errors++;
>  			if (i == 0)
>  				primary_failed = true;
>  		}
>  
>  		/* drop our reference */
> -		brelse(bh);
> +		put_page(page);
>  
>  		/* drop the reference from the writing run */
> -		brelse(bh);
> +		put_page(page);
>  	}
>  
>  	/* log error, force error return */
> -- 
> 2.24.1
David Sterba Jan. 17, 2020, 2:51 p.m. UTC | #2
On Fri, Jan 17, 2020 at 09:51:02PM +0900, Johannes Thumshirn wrote:
> Similar to the superblock read path, change the write path to using BIOs
> and pages instead of buffer_heads.
> 
> This is based on a patch originally authored by Nikolay Borisov.
> 
> Co-developed-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/disk-io.c | 107 ++++++++++++++++++++++++++-------------------
>  1 file changed, 61 insertions(+), 46 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 50c93ffe8d03..51e7b832c8fd 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c

Grep pointed out that the #include <buffer_head> is still in disk-io.c
(check-integrity.c got it removed).

> @@ -3353,25 +3353,33 @@ int __cold open_ctree(struct super_block *sb,
>  }
>  ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
>  
> -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
> +static void btrfs_end_super_write(struct bio *bio)
>  {
> -	if (uptodate) {
> -		set_buffer_uptodate(bh);
> -	} else {
> -		struct btrfs_device *device = (struct btrfs_device *)
> -			bh->b_private;
> -
> -		btrfs_warn_rl_in_rcu(device->fs_info,
> -				"lost page write due to IO error on %s",
> -					  rcu_str_deref(device->name));
> -		/* note, we don't set_buffer_write_io_error because we have
> -		 * our own ways of dealing with the IO errors
> -		 */
> -		clear_buffer_uptodate(bh);
> -		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
> +	struct btrfs_device *device = bio->bi_private;
> +	struct bio_vec *bvec;
> +	struct bvec_iter_all iter_all;
> +	struct page *page;
> +
> +	bio_for_each_segment_all(bvec, bio, iter_all) {
> +		page = bvec->bv_page;
> +
> +		if (blk_status_to_errno(bio->bi_status)) {
> +			btrfs_warn_rl_in_rcu(device->fs_info,
> +					     "lost page write due to IO error on %s",
> +					     rcu_str_deref(device->name));
> +			ClearPageUptodate(page);
> +			SetPageError(page);
> +			btrfs_dev_stat_inc_and_print(device,
> +						     BTRFS_DEV_STAT_WRITE_ERRS);
> +		} else {
> +			SetPageUptodate(page);
> +		}
> +
> +		put_page(page);
> +		unlock_page(page);
>  	}
> -	unlock_buffer(bh);
> -	put_bh(bh);
> +
> +	bio_put(bio);
>  }
>  
>  int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> @@ -3462,16 +3470,15 @@ int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
>   * the expected device size at commit time. Note that max_mirrors must be
>   * same for write and wait phases.
>   *
> - * Return number of errors when buffer head is not found or submission fails.
> + * Return number of errors when page is not found or submission fails.
>   */
>  static int write_dev_supers(struct btrfs_device *device,
>  			    struct btrfs_super_block *sb, int max_mirrors)
>  {
>  	struct btrfs_fs_info *fs_info = device->fs_info;
>  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
> -	struct buffer_head *bh;
> +	gfp_t gfp_mask;
>  	int i;
> -	int ret;
>  	int errors = 0;
>  	u64 bytenr;
>  	int op_flags;
> @@ -3481,7 +3488,13 @@ static int write_dev_supers(struct btrfs_device *device,
>  
>  	shash->tfm = fs_info->csum_shash;
>  
> +	gfp_mask = mapping_gfp_constraint(device->bdev->bd_inode->i_mapping,
> +					  ~__GFP_FS) | __GFP_NOFAIL;
> +
>  	for (i = 0; i < max_mirrors; i++) {
> +		struct page *page;
> +		struct bio *bio;
> +
>  		bytenr = btrfs_sb_offset(i);
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
> @@ -3494,26 +3507,20 @@ static int write_dev_supers(struct btrfs_device *device,
>  				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
>  		crypto_shash_final(shash, sb->csum);
>  
> -		/* One reference for us, and we leave it for the caller */
> -		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
> -			      BTRFS_SUPER_INFO_SIZE);
> -		if (!bh) {
> +		page = find_or_create_page(device->bdev->bd_inode->i_mapping,
> +					   bytenr >> PAGE_SHIFT, gfp_mask);
> +		if (!page) {
>  			btrfs_err(device->fs_info,
> -			    "couldn't get super buffer head for bytenr %llu",
> +			    "couldn't get superblock page for bytenr %llu",
>  			    bytenr);
>  			errors++;
>  			continue;
>  		}
>  
> -		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
> +		/* Bump the refcount for wait_dev_supers() */
> +		get_page(page);
>  
> -		/* one reference for submit_bh */
> -		get_bh(bh);
> -
> -		set_buffer_uptodate(bh);
> -		lock_buffer(bh);
> -		bh->b_end_io = btrfs_end_buffer_write_sync;
> -		bh->b_private = device;
> +		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
>  
>  		/*
>  		 * we fua the first super.  The others we allow
> @@ -3522,9 +3529,17 @@ static int write_dev_supers(struct btrfs_device *device,
>  		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
>  		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
>  			op_flags |= REQ_FUA;
> -		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
> -		if (ret)
> -			errors++;
> +
> +		bio = bio_alloc(gfp_mask, 1);
> +		bio_set_dev(bio, device->bdev);
> +		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +		bio->bi_private = device;
> +		bio->bi_end_io = btrfs_end_super_write;
> +		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
> +			     offset_in_page(bytenr));
> +
> +		bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
> +		btrfsic_submit_bio(bio);
>  	}
>  	return errors < i ? 0 : -1;
>  }
> @@ -3533,12 +3548,11 @@ static int write_dev_supers(struct btrfs_device *device,
>   * Wait for write completion of superblocks done by write_dev_supers,
>   * @max_mirrors same for write and wait phases.
>   *
> - * Return number of errors when buffer head is not found or not marked up to
> + * Return number of errors when page is not found or not marked up to
>   * date.
>   */
>  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  {
> -	struct buffer_head *bh;
>  	int i;
>  	int errors = 0;
>  	bool primary_failed = false;
> @@ -3548,32 +3562,33 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
>  
>  	for (i = 0; i < max_mirrors; i++) {
> +		struct page *page;
> +
>  		bytenr = btrfs_sb_offset(i);
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
>  			break;
>  
> -		bh = __find_get_block(device->bdev,
> -				      bytenr / BTRFS_BDEV_BLOCKSIZE,
> -				      BTRFS_SUPER_INFO_SIZE);
> -		if (!bh) {
> +		page = find_get_page(device->bdev->bd_inode->i_mapping,
> +				     bytenr >> PAGE_SHIFT);
> +		if (!page) {
>  			errors++;
>  			if (i == 0)
>  				primary_failed = true;
>  			continue;
>  		}
> -		wait_on_buffer(bh);
> -		if (!buffer_uptodate(bh)) {
> +		wait_on_page_locked(page);
> +		if (PageError(page)) {
>  			errors++;
>  			if (i == 0)
>  				primary_failed = true;
>  		}
>  
>  		/* drop our reference */
> -		brelse(bh);
> +		put_page(page);
>  
>  		/* drop the reference from the writing run */
> -		brelse(bh);
> +		put_page(page);
>  	}
>  
>  	/* log error, force error return */
> -- 
> 2.24.1
Nikolay Borisov Jan. 17, 2020, 3:01 p.m. UTC | #3
On 17.01.20 г. 14:51 ч., Johannes Thumshirn wrote:
> Similar to the superblock read path, change the write path to using BIOs
> and pages instead of buffer_heads.
> 
> This is based on a patch originally authored by Nikolay Borisov.
> 
> Co-developed-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
>  fs/btrfs/disk-io.c | 107 ++++++++++++++++++++++++++-------------------
>  1 file changed, 61 insertions(+), 46 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 50c93ffe8d03..51e7b832c8fd 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3353,25 +3353,33 @@ int __cold open_ctree(struct super_block *sb,
>  }
>  ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
>  
> -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
> +static void btrfs_end_super_write(struct bio *bio)
>  {
> -	if (uptodate) {
> -		set_buffer_uptodate(bh);
> -	} else {
> -		struct btrfs_device *device = (struct btrfs_device *)
> -			bh->b_private;
> -
> -		btrfs_warn_rl_in_rcu(device->fs_info,
> -				"lost page write due to IO error on %s",
> -					  rcu_str_deref(device->name));
> -		/* note, we don't set_buffer_write_io_error because we have
> -		 * our own ways of dealing with the IO errors
> -		 */
> -		clear_buffer_uptodate(bh);
> -		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
> +	struct btrfs_device *device = bio->bi_private;
> +	struct bio_vec *bvec;
> +	struct bvec_iter_all iter_all;
> +	struct page *page;
> +
> +	bio_for_each_segment_all(bvec, bio, iter_all) {
> +		page = bvec->bv_page;
> +
> +		if (blk_status_to_errno(bio->bi_status)) {
> +			btrfs_warn_rl_in_rcu(device->fs_info,
> +					     "lost page write due to IO error on %s",
> +					     rcu_str_deref(device->name));
> +			ClearPageUptodate(page);
> +			SetPageError(page);
> +			btrfs_dev_stat_inc_and_print(device,
> +						     BTRFS_DEV_STAT_WRITE_ERRS);
> +		} else {
> +			SetPageUptodate(page);
> +		}
> +
> +		put_page(page);

Isn't this an extra put_page()? Perhaps it's not, because that would be the
reference from find_or_create_page in write_dev_supers. In any case I'd
rather have it in that function.

> +		unlock_page(page);
>  	}
> -	unlock_buffer(bh);
> -	put_bh(bh);
> +
> +	bio_put(bio);
>  }
>  
>  int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> @@ -3462,16 +3470,15 @@ int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
>   * the expected device size at commit time. Note that max_mirrors must be
>   * same for write and wait phases.
>   *
> - * Return number of errors when buffer head is not found or submission fails.
> + * Return number of errors when page is not found or submission fails.
>   */
>  static int write_dev_supers(struct btrfs_device *device,
>  			    struct btrfs_super_block *sb, int max_mirrors)
>  {
>  	struct btrfs_fs_info *fs_info = device->fs_info;
>  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
> -	struct buffer_head *bh;
> +	gfp_t gfp_mask;
>  	int i;
> -	int ret;
>  	int errors = 0;
>  	u64 bytenr;
>  	int op_flags;
> @@ -3481,7 +3488,13 @@ static int write_dev_supers(struct btrfs_device *device,
>  
>  	shash->tfm = fs_info->csum_shash;
>  
> +	gfp_mask = mapping_gfp_constraint(device->bdev->bd_inode->i_mapping,
> +					  ~__GFP_FS) | __GFP_NOFAIL;
> +
>  	for (i = 0; i < max_mirrors; i++) {
> +		struct page *page;
> +		struct bio *bio;
> +
>  		bytenr = btrfs_sb_offset(i);
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
> @@ -3494,26 +3507,20 @@ static int write_dev_supers(struct btrfs_device *device,
>  				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
>  		crypto_shash_final(shash, sb->csum);
>  
> -		/* One reference for us, and we leave it for the caller */
> -		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
> -			      BTRFS_SUPER_INFO_SIZE);
> -		if (!bh) {
> +		page = find_or_create_page(device->bdev->bd_inode->i_mapping,
> +					   bytenr >> PAGE_SHIFT, gfp_mask);

You introduce an asymmetry here, because the write path now utilizes the
page cache whereas the read path uses a plain page alloc. I'm not sure, but
could this lead to reading garbage from the super block, because now you
don't have synchronization between the read and write path? This reminds
me why I was using the page cache and not a plain page. Also, by
utilising the page cache you will potentially be reducing IO to disk,
since you can be fetching the sb data directly from cache.

> +		if (!page) {
>  			btrfs_err(device->fs_info,
> -			    "couldn't get super buffer head for bytenr %llu",
> +			    "couldn't get superblock page for bytenr %llu",
>  			    bytenr);
>  			errors++;
>  			continue;
>  		}
>  
> -		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
> +		/* Bump the refcount for wait_dev_supers() */
> +		get_page(page);
>  
> -		/* one reference for submit_bh */
> -		get_bh(bh);
> -
> -		set_buffer_uptodate(bh);
> -		lock_buffer(bh);
> -		bh->b_end_io = btrfs_end_buffer_write_sync;
> -		bh->b_private = device;
> +		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
>  
>  		/*
>  		 * we fua the first super.  The others we allow
> @@ -3522,9 +3529,17 @@ static int write_dev_supers(struct btrfs_device *device,
>  		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
>  		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
>  			op_flags |= REQ_FUA;
> -		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
> -		if (ret)
> -			errors++;
> +
> +		bio = bio_alloc(gfp_mask, 1);
> +		bio_set_dev(bio, device->bdev);
> +		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +		bio->bi_private = device;
> +		bio->bi_end_io = btrfs_end_super_write;
> +		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
> +			     offset_in_page(bytenr));
> +
> +		bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
> +		btrfsic_submit_bio(bio);
>  	}
>  	return errors < i ? 0 : -1;
>  }
> @@ -3533,12 +3548,11 @@ static int write_dev_supers(struct btrfs_device *device,
>   * Wait for write completion of superblocks done by write_dev_supers,
>   * @max_mirrors same for write and wait phases.
>   *
> - * Return number of errors when buffer head is not found or not marked up to
> + * Return number of errors when page is not found or not marked up to
>   * date.
>   */
>  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  {
> -	struct buffer_head *bh;
>  	int i;
>  	int errors = 0;
>  	bool primary_failed = false;
> @@ -3548,32 +3562,33 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
>  
>  	for (i = 0; i < max_mirrors; i++) {
> +		struct page *page;
> +
>  		bytenr = btrfs_sb_offset(i);
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
>  			break;
>  
> -		bh = __find_get_block(device->bdev,
> -				      bytenr / BTRFS_BDEV_BLOCKSIZE,
> -				      BTRFS_SUPER_INFO_SIZE);
> -		if (!bh) {
> +		page = find_get_page(device->bdev->bd_inode->i_mapping,
> +				     bytenr >> PAGE_SHIFT);
> +		if (!page) {
>  			errors++;
>  			if (i == 0)
>  				primary_failed = true;
>  			continue;
>  		}
> -		wait_on_buffer(bh);
> -		if (!buffer_uptodate(bh)) {
> +		wait_on_page_locked(page);
> +		if (PageError(page)) {
>  			errors++;
>  			if (i == 0)
>  				primary_failed = true;
>  		}
>  
>  		/* drop our reference */
> -		brelse(bh);
> +		put_page(page);
>  
>  		/* drop the reference from the writing run */
> -		brelse(bh);
> +		put_page(page);
>  	}
>  
>  	/* log error, force error return */
>
David Sterba Jan. 17, 2020, 3:11 p.m. UTC | #4
On Fri, Jan 17, 2020 at 05:01:35PM +0200, Nikolay Borisov wrote:
> > @@ -3494,26 +3507,20 @@ static int write_dev_supers(struct btrfs_device *device,
> >  				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
> >  		crypto_shash_final(shash, sb->csum);
> >  
> > -		/* One reference for us, and we leave it for the caller */
> > -		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
> > -			      BTRFS_SUPER_INFO_SIZE);
> > -		if (!bh) {
> > +		page = find_or_create_page(device->bdev->bd_inode->i_mapping,
> > +					   bytenr >> PAGE_SHIFT, gfp_mask);
> 
> You introduce asymmetry here. Because the write path now utilizes the
> page cache whereas the read path uses plain page alloc. I'm not sure but
> could this lead to reading garbage from the super block because now you
> don't have synchronization between the read and write path. This reminds
> me why I was using the page cache and not a plain page. Also by
> utilising the page cache you will potentially be reducing IO to disk
> since you can be fetching the sb data directly from cache.

There won't be any data in the cache anyway, because
btrfs_get_bdev_and_sb calls invalidate_bdev just before reading the
superblock. But otherwise I agree that the cache must be used,
effectively the same way as __bread does with BH.
Johannes Thumshirn Jan. 22, 2020, 3:48 p.m. UTC | #5
On 17/01/2020, 16:01, "linux-btrfs-owner@vger.kernel.org on behalf of Nikolay Borisov" <linux-btrfs-owner@vger.kernel.org on behalf of nborisov@suse.com> wrote: 
    
    On 17.01.20 г. 14:51 ч., Johannes Thumshirn wrote:
    > Similar to the superblock read path, change the write path to using BIOs
    > and pages instead of buffer_heads.
    >
    > This is based on a patch originally authored by Nikolay Borisov.
    >
    > Co-developed-by: Nikolay Borisov <nborisov@suse.com>
    > Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
    > ---
    >  fs/btrfs/disk-io.c | 107 ++++++++++++++++++++++++++-------------------
    >  1 file changed, 61 insertions(+), 46 deletions(-)
    >
    > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
    > index 50c93ffe8d03..51e7b832c8fd 100644
    > --- a/fs/btrfs/disk-io.c
    > +++ b/fs/btrfs/disk-io.c
    > @@ -3353,25 +3353,33 @@ int __cold open_ctree(struct super_block *sb,
    >  }
    >  ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
    >
    > -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
    > +static void btrfs_end_super_write(struct bio *bio)
    >  {
    > -     if (uptodate) {
    > -             set_buffer_uptodate(bh);
    > -     } else {
    > -             struct btrfs_device *device = (struct btrfs_device *)
    > -                     bh->b_private;
    > -
    > -             btrfs_warn_rl_in_rcu(device->fs_info,
    > -                             "lost page write due to IO error on %s",
    > -                                       rcu_str_deref(device->name));
    > -             /* note, we don't set_buffer_write_io_error because we have
    > -              * our own ways of dealing with the IO errors
    > -              */
    > -             clear_buffer_uptodate(bh);
    > -             btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
    > +     struct btrfs_device *device = bio->bi_private;
    > +     struct bio_vec *bvec;
    > +     struct bvec_iter_all iter_all;
    > +     struct page *page;
    > +
    > +     bio_for_each_segment_all(bvec, bio, iter_all) {
    > +             page = bvec->bv_page;
    > +
    > +             if (blk_status_to_errno(bio->bi_status)) {
    > +                     btrfs_warn_rl_in_rcu(device->fs_info,
    > +                                          "lost page write due to IO error on %s",
    > +                                          rcu_str_deref(device->name));
    > +                     ClearPageUptodate(page);
    > +                     SetPageError(page);
    > +                     btrfs_dev_stat_inc_and_print(device,
    > +                                                  BTRFS_DEV_STAT_WRITE_ERRS);
    > +             } else {
    > +                     SetPageUptodate(page);
    > +             }
    > +
    > +             put_page(page);
    
    Isn't this extra put page? Perhahps it's not because that would be the
    reference from find_or_create_page in write_dev_supers. In any case I'd
    rather have it in that function.

No, we can't do the put_page() in write_dev_supers(), as btrfs_end_super_write()
is the bio endio hook and it's called upon completion of the bio. This happens
asynchronously to write_dev_supers().

Patch
diff mbox series

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 50c93ffe8d03..51e7b832c8fd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3353,25 +3353,33 @@  int __cold open_ctree(struct super_block *sb,
 }
 ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
 
-static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+static void btrfs_end_super_write(struct bio *bio)
 {
-	if (uptodate) {
-		set_buffer_uptodate(bh);
-	} else {
-		struct btrfs_device *device = (struct btrfs_device *)
-			bh->b_private;
-
-		btrfs_warn_rl_in_rcu(device->fs_info,
-				"lost page write due to IO error on %s",
-					  rcu_str_deref(device->name));
-		/* note, we don't set_buffer_write_io_error because we have
-		 * our own ways of dealing with the IO errors
-		 */
-		clear_buffer_uptodate(bh);
-		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
+	struct btrfs_device *device = bio->bi_private;
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+	struct page *page;
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		page = bvec->bv_page;
+
+		if (blk_status_to_errno(bio->bi_status)) {
+			btrfs_warn_rl_in_rcu(device->fs_info,
+					     "lost page write due to IO error on %s",
+					     rcu_str_deref(device->name));
+			ClearPageUptodate(page);
+			SetPageError(page);
+			btrfs_dev_stat_inc_and_print(device,
+						     BTRFS_DEV_STAT_WRITE_ERRS);
+		} else {
+			SetPageUptodate(page);
+		}
+
+		put_page(page);
+		unlock_page(page);
 	}
-	unlock_buffer(bh);
-	put_bh(bh);
+
+	bio_put(bio);
 }
 
 int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
@@ -3462,16 +3470,15 @@  int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
  * the expected device size at commit time. Note that max_mirrors must be
  * same for write and wait phases.
  *
- * Return number of errors when buffer head is not found or submission fails.
+ * Return number of errors when page is not found or submission fails.
  */
 static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb, int max_mirrors)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-	struct buffer_head *bh;
+	gfp_t gfp_mask;
 	int i;
-	int ret;
 	int errors = 0;
 	u64 bytenr;
 	int op_flags;
@@ -3481,7 +3488,13 @@  static int write_dev_supers(struct btrfs_device *device,
 
 	shash->tfm = fs_info->csum_shash;
 
+	gfp_mask = mapping_gfp_constraint(device->bdev->bd_inode->i_mapping,
+					  ~__GFP_FS) | __GFP_NOFAIL;
+
 	for (i = 0; i < max_mirrors; i++) {
+		struct page *page;
+		struct bio *bio;
+
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 		    device->commit_total_bytes)
@@ -3494,26 +3507,20 @@  static int write_dev_supers(struct btrfs_device *device,
 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
 		crypto_shash_final(shash, sb->csum);
 
-		/* One reference for us, and we leave it for the caller */
-		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
-			      BTRFS_SUPER_INFO_SIZE);
-		if (!bh) {
+		page = find_or_create_page(device->bdev->bd_inode->i_mapping,
+					   bytenr >> PAGE_SHIFT, gfp_mask);
+		if (!page) {
 			btrfs_err(device->fs_info,
-			    "couldn't get super buffer head for bytenr %llu",
+			    "couldn't get superblock page for bytenr %llu",
 			    bytenr);
 			errors++;
 			continue;
 		}
 
-		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+		/* Bump the refcount for wait_dev_supers() */
+		get_page(page);
 
-		/* one reference for submit_bh */
-		get_bh(bh);
-
-		set_buffer_uptodate(bh);
-		lock_buffer(bh);
-		bh->b_end_io = btrfs_end_buffer_write_sync;
-		bh->b_private = device;
+		memcpy(page_address(page), sb, BTRFS_SUPER_INFO_SIZE);
 
 		/*
 		 * we fua the first super.  The others we allow
@@ -3522,9 +3529,17 @@  static int write_dev_supers(struct btrfs_device *device,
 		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
 		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
 			op_flags |= REQ_FUA;
-		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
-		if (ret)
-			errors++;
+
+		bio = bio_alloc(gfp_mask, 1);
+		bio_set_dev(bio, device->bdev);
+		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+		bio->bi_private = device;
+		bio->bi_end_io = btrfs_end_super_write;
+		bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
+			     offset_in_page(bytenr));
+
+		bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
+		btrfsic_submit_bio(bio);
 	}
 	return errors < i ? 0 : -1;
 }
@@ -3533,12 +3548,11 @@  static int write_dev_supers(struct btrfs_device *device,
  * Wait for write completion of superblocks done by write_dev_supers,
  * @max_mirrors same for write and wait phases.
  *
- * Return number of errors when buffer head is not found or not marked up to
+ * Return number of errors when page is not found or not marked up to
  * date.
  */
 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 {
-	struct buffer_head *bh;
 	int i;
 	int errors = 0;
 	bool primary_failed = false;
@@ -3548,32 +3562,33 @@  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
 
 	for (i = 0; i < max_mirrors; i++) {
+		struct page *page;
+
 		bytenr = btrfs_sb_offset(i);
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 		    device->commit_total_bytes)
 			break;
 
-		bh = __find_get_block(device->bdev,
-				      bytenr / BTRFS_BDEV_BLOCKSIZE,
-				      BTRFS_SUPER_INFO_SIZE);
-		if (!bh) {
+		page = find_get_page(device->bdev->bd_inode->i_mapping,
+				     bytenr >> PAGE_SHIFT);
+		if (!page) {
 			errors++;
 			if (i == 0)
 				primary_failed = true;
 			continue;
 		}
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
+		wait_on_page_locked(page);
+		if (PageError(page)) {
 			errors++;
 			if (i == 0)
 				primary_failed = true;
 		}
 
 		/* drop our reference */
-		brelse(bh);
+		put_page(page);
 
 		/* drop the reference from the writing run */
-		brelse(bh);
+		put_page(page);
 	}
 
 	/* log error, force error return */