[v2,2/6] btrfs: remove buffer heads from super block reading
diff mbox series

Message ID 20200123081849.23397-3-johannes.thumshirn@wdc.com
State New
Headers show
Series
  • btrfs: remove buffer heads form superblock handling
Related show

Commit Message

Johannes Thumshirn Jan. 23, 2020, 8:18 a.m. UTC
Super-block reading in BTRFS is done using buffer_heads. Buffer_heads have
some drawbacks, like not being able to propagate errors from the lower
layers.

Change the buffer_heads to BIOs and utilize the page cache for the page
allocation. Compared to buffer_heads using BIOs are more lightweight and
we skip several layers of buffer_head code until we either reach the page
cache or build a BIO and submit it to read the blocks from disk.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

---
Changes to v1:
- move 'super_page' into for-loop in btrfs_scratch_superblocks() (Nikolay)
- switch to using pagecahce instead of alloc_pages() (Nikolay, David)
---
 fs/btrfs/disk-io.c | 83 ++++++++++++++++++++++++++++++----------------
 fs/btrfs/disk-io.h |  4 +--
 fs/btrfs/volumes.c | 62 ++++++++++++++++++++--------------
 fs/btrfs/volumes.h |  2 --
 4 files changed, 94 insertions(+), 57 deletions(-)

Comments

Josef Bacik Jan. 23, 2020, 1:51 p.m. UTC | #1
On 1/23/20 3:18 AM, Johannes Thumshirn wrote:
> Super-block reading in BTRFS is done using buffer_heads. Buffer_heads have
> some drawbacks, like not being able to propagate errors from the lower
> layers.
> 
> Change the buffer_heads to BIOs and utilize the page cache for the page
> allocation. Compared to buffer_heads using BIOs are more lightweight and
> we skip several layers of buffer_head code until we either reach the page
> cache or build a BIO and submit it to read the blocks from disk.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> ---
> Changes to v1:
> - move 'super_page' into for-loop in btrfs_scratch_superblocks() (Nikolay)
> - switch to using pagecahce instead of alloc_pages() (Nikolay, David)
> ---
>   fs/btrfs/disk-io.c | 83 ++++++++++++++++++++++++++++++----------------
>   fs/btrfs/disk-io.h |  4 +--
>   fs/btrfs/volumes.c | 62 ++++++++++++++++++++--------------
>   fs/btrfs/volumes.h |  2 --
>   4 files changed, 94 insertions(+), 57 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index aea48d6ddc0c..b111f32108cc 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2635,11 +2635,12 @@ int __cold open_ctree(struct super_block *sb,
>   	u64 features;
>   	u16 csum_type;
>   	struct btrfs_key location;
> -	struct buffer_head *bh;
>   	struct btrfs_super_block *disk_super;
>   	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>   	struct btrfs_root *tree_root;
>   	struct btrfs_root *chunk_root;
> +	struct page *super_page;
> +	u8 *superblock;
>   	int ret;
>   	int err = -EINVAL;
>   	int clear_free_space_tree = 0;
> @@ -2832,28 +2833,31 @@ int __cold open_ctree(struct super_block *sb,
>   	/*
>   	 * Read super block and check the signature bytes only
>   	 */
> -	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
> -	if (IS_ERR(bh)) {
> -		err = PTR_ERR(bh);
> +	ret = btrfs_read_dev_super(fs_devices->latest_bdev, &super_page);
> +	if (ret) {
> +		err = ret;
>   		goto fail_alloc;
>   	}
>   
> +	superblock = kmap(super_page);
>   	/*
>   	 * Verify the type first, if that or the the checksum value are
>   	 * corrupted, we'll find out
>   	 */
> -	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
> +	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)
> +					  superblock);
>   	if (!btrfs_supported_super_csum(csum_type)) {
>   		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
>   			  csum_type);
>   		err = -EINVAL;
> -		brelse(bh);
> +		btrfs_release_disk_super(super_page);
>   		goto fail_alloc;
>   	}
>   
>   	ret = btrfs_init_csum_hash(fs_info, csum_type);
>   	if (ret) {
>   		err = ret;
> +		btrfs_release_disk_super(super_page);
>   		goto fail_alloc;
>   	}
>   
> @@ -2861,10 +2865,10 @@ int __cold open_ctree(struct super_block *sb,
>   	 * We want to check superblock checksum, the type is stored inside.
>   	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
>   	 */
> -	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
> +	if (btrfs_check_super_csum(fs_info, superblock)) {
>   		btrfs_err(fs_info, "superblock checksum mismatch");
>   		err = -EINVAL;
> -		brelse(bh);
> +		btrfs_release_disk_super(super_page);
>   		goto fail_csum;
>   	}
>   
> @@ -2873,8 +2877,8 @@ int __cold open_ctree(struct super_block *sb,
>   	 * following bytes up to INFO_SIZE, the checksum is calculated from
>   	 * the whole block of INFO_SIZE
>   	 */
> -	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
> -	brelse(bh);
> +	memcpy(fs_info->super_copy, superblock, sizeof(*fs_info->super_copy));
> +	btrfs_release_disk_super(super_page);
>   
>   	disk_super = fs_info->super_copy;
>   
> @@ -3374,40 +3378,60 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
>   }
>   
>   int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> -			struct buffer_head **bh_ret)
> +			struct page **super_page)
>   {
> -	struct buffer_head *bh;
>   	struct btrfs_super_block *super;
> +	struct bio_vec bio_vec;
> +	struct bio bio;
> +	struct page *page;
>   	u64 bytenr;
> +	struct address_space *mapping = bdev->bd_inode->i_mapping;
> +	gfp_t gfp_mask;
> +	int ret;
>   
>   	bytenr = btrfs_sb_offset(copy_num);
>   	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
>   		return -EINVAL;
>   
> -	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
> +	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
> +	page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, gfp_mask);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	bio_init(&bio, &bio_vec, 1);
> +	bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +	bio_set_dev(&bio, bdev);
> +	bio_set_op_attrs(&bio, REQ_OP_READ, 0);
> +	bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
> +		     offset_in_page(bytenr));
> +
> +	ret = submit_bio_wait(&bio);
> +	unlock_page(page);
>   	/*
>   	 * If we fail to read from the underlying devices, as of now
>   	 * the best option we have is to mark it EIO.
>   	 */
> -	if (!bh)
> +	if (ret) {
> +		put_page(page);
>   		return -EIO;
> +	}
>   
> -	super = (struct btrfs_super_block *)bh->b_data;
> +	super = kmap(page);
>   	if (btrfs_super_bytenr(super) != bytenr ||
>   		    btrfs_super_magic(super) != BTRFS_MAGIC) {
> -		brelse(bh);
> +		btrfs_release_disk_super(page);
>   		return -EINVAL;
>   	}
> +	kunmap(page);
>   
> -	*bh_ret = bh;
> +	*super_page = page;
>   	return 0;
>   }
>   
>   
> -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
> +int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
>   {
> -	struct buffer_head *bh;
> -	struct buffer_head *latest = NULL;
> +	struct page *latest = NULL;
>   	struct btrfs_super_block *super;
>   	int i;
>   	u64 transid = 0;
> @@ -3419,25 +3443,28 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
>   	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>   	 */
>   	for (i = 0; i < 1; i++) {
> -		ret = btrfs_read_dev_one_super(bdev, i, &bh);
> +		ret = btrfs_read_dev_one_super(bdev, i, page);
>   		if (ret)
>   			continue;
>   
> -		super = (struct btrfs_super_block *)bh->b_data;
> +		super = kmap(*page);
>   
>   		if (!latest || btrfs_super_generation(super) > transid) {
> -			brelse(latest);
> -			latest = bh;
> +			if (latest)
> +				btrfs_release_disk_super(latest);
> +			latest = *page;
>   			transid = btrfs_super_generation(super);
>   		} else {
> -			brelse(bh);
> +			btrfs_release_disk_super(*page);
>   		}
> +
> +		kunmap(*page);
>   	}
>   
> -	if (!latest)
> -		return ERR_PTR(ret);
> +	if (ret)
> +		return ret;
>   
> -	return latest;
> +	return 0;
>   }
>   
>   /*
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index 8c2d6cf1ce59..e04b233c436a 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -54,9 +54,9 @@ int __cold open_ctree(struct super_block *sb,
>   	       char *options);
>   void __cold close_ctree(struct btrfs_fs_info *fs_info);
>   int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
> -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
> +int btrfs_read_dev_super(struct block_device *bdev, struct page **super_page);
>   int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> -			struct buffer_head **bh_ret);
> +			     struct page **super_page);
>   int btrfs_commit_super(struct btrfs_fs_info *fs_info);
>   struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
>   				      struct btrfs_key *location);
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 7a480a2bdf51..f4a6ee518f0c 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -6,7 +6,6 @@
>   #include <linux/sched.h>
>   #include <linux/bio.h>
>   #include <linux/slab.h>
> -#include <linux/buffer_head.h>
>   #include <linux/blkdev.h>
>   #include <linux/ratelimit.h>
>   #include <linux/kthread.h>
> @@ -500,7 +499,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
>   static int
>   btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
>   		      int flush, struct block_device **bdev,
> -		      struct buffer_head **bh)
> +		      struct page **super_page)
>   {
>   	int ret;
>   
> @@ -519,9 +518,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
>   		goto error;
>   	}
>   	invalidate_bdev(*bdev);
> -	*bh = btrfs_read_dev_super(*bdev);
> -	if (IS_ERR(*bh)) {
> -		ret = PTR_ERR(*bh);
> +	ret = btrfs_read_dev_super(*bdev, super_page);
> +	if (ret) {
>   		blkdev_put(*bdev, flags);
>   		goto error;
>   	}
> @@ -530,7 +528,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
>   
>   error:
>   	*bdev = NULL;
> -	*bh = NULL;
>   	return ret;
>   }
>   
> @@ -611,7 +608,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>   {
>   	struct request_queue *q;
>   	struct block_device *bdev;
> -	struct buffer_head *bh;
> +	struct page *super_page;
>   	struct btrfs_super_block *disk_super;
>   	u64 devid;
>   	int ret;
> @@ -622,17 +619,17 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>   		return -EINVAL;
>   
>   	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
> -				    &bdev, &bh);
> +				    &bdev, &super_page);
>   	if (ret)
>   		return ret;
>   
> -	disk_super = (struct btrfs_super_block *)bh->b_data;
> +	disk_super = kmap(super_page);
>   	devid = btrfs_stack_device_id(&disk_super->dev_item);
>   	if (devid != device->devid)
> -		goto error_brelse;
> +		goto error_free_page;
>   
>   	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
> -		goto error_brelse;
> +		goto error_free_page;
>   
>   	device->generation = btrfs_super_generation(disk_super);
>   
> @@ -641,7 +638,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>   		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
>   			pr_err(
>   		"BTRFS: Invalid seeding and uuid-changed device detected\n");
> -			goto error_brelse;
> +			goto error_free_page;
>   		}
>   
>   		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
> @@ -667,12 +664,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>   		fs_devices->rw_devices++;
>   		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
>   	}
> -	brelse(bh);
> +	btrfs_release_disk_super(super_page);
>   
>   	return 0;
>   
> -error_brelse:
> -	brelse(bh);
> +error_free_page:
> +	btrfs_release_disk_super(super_page);
>   	blkdev_put(bdev, flags);
>   
>   	return -EINVAL;
> @@ -2209,14 +2206,15 @@ static struct btrfs_device *btrfs_find_device_by_path(
>   	u64 devid;
>   	u8 *dev_uuid;
>   	struct block_device *bdev;
> -	struct buffer_head *bh;
> +	struct page *super_page;
>   	struct btrfs_device *device;
>   
>   	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
> -				    fs_info->bdev_holder, 0, &bdev, &bh);
> +				    fs_info->bdev_holder, 0, &bdev,
> +				    &super_page);
>   	if (ret)
>   		return ERR_PTR(ret);
> -	disk_super = (struct btrfs_super_block *)bh->b_data;
> +	disk_super = kmap(super_page);
>   	devid = btrfs_stack_device_id(&disk_super->dev_item);
>   	dev_uuid = disk_super->dev_item.uuid;
>   	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
> @@ -2226,7 +2224,7 @@ static struct btrfs_device *btrfs_find_device_by_path(
>   		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
>   					   disk_super->fsid, true);
>   
> -	brelse(bh);
> +	btrfs_release_disk_super(super_page);
>   	if (!device)
>   		device = ERR_PTR(-ENOENT);
>   	blkdev_put(bdev, FMODE_READ);
> @@ -7319,25 +7317,39 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
>   
>   void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
>   {
> -	struct buffer_head *bh;
> +	struct bio_vec bio_vec;
> +	struct bio bio;
>   	struct btrfs_super_block *disk_super;
>   	int copy_num;
>   
>   	if (!bdev)
>   		return;
>   
> +	bio_init(&bio, &bio_vec, 1);
>   	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
>   		copy_num++) {
> +		u64 bytenr = btrfs_sb_offset(copy_num);
> +		struct page *page;
>   
> -		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
> +		if (btrfs_read_dev_one_super(bdev, copy_num, &page))
>   			continue;
>   
> -		disk_super = (struct btrfs_super_block *)bh->b_data;
> +		disk_super = kmap(page) + offset_in_page(bytenr);
>   
>   		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
> -		set_buffer_dirty(bh);
> -		sync_dirty_buffer(bh);
> -		brelse(bh);
> +
> +		bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +		bio_set_dev(&bio, bdev);
> +		bio_set_op_attrs(&bio, REQ_OP_WRITE, 0);
nit: We're losing REQ_SYNC here, not that it matters but if you have to re-roll 
it may be nice to have.  Otherwise

Reviewed-by: Josef Bacik <josef@toxicpanda.com>

Thanks,

Josef
Johannes Thumshirn Jan. 23, 2020, 1:53 p.m. UTC | #2
On 23/01/2020, 14:51, "Josef Bacik" <josef@toxicpanda.com> wrote:
[...]
    > -		brelse(bh);
    > +
    > +		bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
    > +		bio_set_dev(&bio, bdev);
    > +		bio_set_op_attrs(&bio, REQ_OP_WRITE, 0);
    nit: We're losing REQ_SYNC here, not that it matters but if you have to re-roll 
    it may be nice to have.  Otherwise
    
No actually we don't. submnit_bio_wait() implies REQ_SYNC, so we're all good.

    Reviewed-by: Josef Bacik <josef@toxicpanda.com>

Thanks,
	Johannes
Nikolay Borisov Jan. 23, 2020, 2:13 p.m. UTC | #3
On 23.01.20 г. 10:18 ч., Johannes Thumshirn wrote:

<snip>

> @@ -3374,40 +3378,60 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
>  }
>  
>  int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> -			struct buffer_head **bh_ret)
> +			struct page **super_page)
>  {
> -	struct buffer_head *bh;
>  	struct btrfs_super_block *super;
> +	struct bio_vec bio_vec;
> +	struct bio bio;
> +	struct page *page;
>  	u64 bytenr;
> +	struct address_space *mapping = bdev->bd_inode->i_mapping;
> +	gfp_t gfp_mask;
> +	int ret;
>  
>  	bytenr = btrfs_sb_offset(copy_num);
>  	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
>  		return -EINVAL;
>  
> -	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
> +	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
> +	page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, gfp_mask);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	bio_init(&bio, &bio_vec, 1);
> +	bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +	bio_set_dev(&bio, bdev);
> +	bio_set_op_attrs(&bio, REQ_OP_READ, 0);
> +	bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
> +		     offset_in_page(bytenr));
> +
> +	ret = submit_bio_wait(&bio);
> +	unlock_page(page);

So this is based on my code where the page is unlocked as soon as it's
read. However, as per out off-list discussion, I think this page needs
to be unlocked once the caller of
btrfs_read_dev_one_super/btrfs_read_dev_super is finished with it.

>  	/*
>  	 * If we fail to read from the underlying devices, as of now
>  	 * the best option we have is to mark it EIO.
>  	 */
> -	if (!bh)
> +	if (ret) {
> +		put_page(page);
>  		return -EIO;
> +	}
>  
> -	super = (struct btrfs_super_block *)bh->b_data;
> +	super = kmap(page);
>  	if (btrfs_super_bytenr(super) != bytenr ||
>  		    btrfs_super_magic(super) != BTRFS_MAGIC) {
> -		brelse(bh);
> +		btrfs_release_disk_super(page);
>  		return -EINVAL;
>  	}
> +	kunmap(page);
>  
> -	*bh_ret = bh;
> +	*super_page = page;
>  	return 0;
>  }
>  
>  
> -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
> +int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
>  {
> -	struct buffer_head *bh;
> -	struct buffer_head *latest = NULL;
> +	struct page *latest = NULL;
>  	struct btrfs_super_block *super;
>  	int i;
>  	u64 transid = 0;

<snip>

>  
>  void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
>  {
> -	struct buffer_head *bh;
> +	struct bio_vec bio_vec;
> +	struct bio bio;
>  	struct btrfs_super_block *disk_super;
>  	int copy_num;
>  
>  	if (!bdev)
>  		return;
>  
> +	bio_init(&bio, &bio_vec, 1);
>  	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
>  		copy_num++) {
> +		u64 bytenr = btrfs_sb_offset(copy_num);
> +		struct page *page;
>  
> -		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
> +		if (btrfs_read_dev_one_super(bdev, copy_num, &page))
>  			continue;
>  
> -		disk_super = (struct btrfs_super_block *)bh->b_data;
> +		disk_super = kmap(page) + offset_in_page(bytenr);
>  
>  		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
> -		set_buffer_dirty(bh);
> -		sync_dirty_buffer(bh);
> -		brelse(bh);
> +
> +		bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +		bio_set_dev(&bio, bdev);
> +		bio_set_op_attrs(&bio, REQ_OP_WRITE, 0);
> +		bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
> +			     offset_in_page(bytenr));
> +
> +		lock_page(page);
> +		submit_bio_wait(&bio);
> +		unlock_page(page);
> +		btrfs_release_disk_super(page);
> +		bio_reset(&bio);

For example in this code if btrfs_read_dev_one_super held the page
locked you would have unlocked after submit_bio_wait is has returned.
But in this case what you do is take the page unlocked and now you have
a race window between btrfs_read_dev_one_super and lock_page(page) in
this function? btrfs_scratch_superblocks is used in replace context so
what if it races with transaction commit? If my worries are moot and I
have missed anything then at the very least safety should be documented
in the changelog of the patch.

> +
>  	}
>  
>  	/* Notify udev that device has changed */
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index b7f2edbc6581..1e4ebe6d6368 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -17,8 +17,6 @@ extern struct mutex uuid_mutex;
>  
>  #define BTRFS_STRIPE_LEN	SZ_64K
>  
> -struct buffer_head;
> -
>  struct btrfs_io_geometry {
>  	/* remaining bytes before crossing a stripe */
>  	u64 len;
>
Johannes Thumshirn Jan. 23, 2020, 4:45 p.m. UTC | #4
On 23/01/2020 15:14, Nikolay Borisov wrote:
> 
> 
> On 23.01.20 г. 10:18 ч., Johannes Thumshirn wrote:
> 
> <snip>
> 
>> @@ -3374,40 +3378,60 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
>>   }
>>   
>>   int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
>> -			struct buffer_head **bh_ret)
>> +			struct page **super_page)
>>   {
>> -	struct buffer_head *bh;
>>   	struct btrfs_super_block *super;
>> +	struct bio_vec bio_vec;
>> +	struct bio bio;
>> +	struct page *page;
>>   	u64 bytenr;
>> +	struct address_space *mapping = bdev->bd_inode->i_mapping;
>> +	gfp_t gfp_mask;
>> +	int ret;
>>   
>>   	bytenr = btrfs_sb_offset(copy_num);
>>   	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
>>   		return -EINVAL;
>>   
>> -	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
>> +	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
>> +	page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, gfp_mask);
>> +	if (!page)
>> +		return -ENOMEM;
>> +
>> +	bio_init(&bio, &bio_vec, 1);
>> +	bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
>> +	bio_set_dev(&bio, bdev);
>> +	bio_set_op_attrs(&bio, REQ_OP_READ, 0);
>> +	bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
>> +		     offset_in_page(bytenr));
>> +
>> +	ret = submit_bio_wait(&bio);
>> +	unlock_page(page);
> 
> So this is based on my code where the page is unlocked as soon as it's
> read. However, as per out off-list discussion, I think this page needs
> to be unlocked once the caller of
> btrfs_read_dev_one_super/btrfs_read_dev_super is finished with it.

As I don't have an answer to that yet, I just gave it a try (see 
attached diff) and it instantly deadlocks on mount. So either I did 
something wrong or holding the lock until the callers have finished 
processing it is a bad idea.

Thanks,
	Johannes
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ac1755793596..7afac96be17f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2856,6 +2856,7 @@ int __cold open_ctree(struct super_block *sb,
 	ret = btrfs_init_csum_hash(fs_info, csum_type);
 	if (ret) {
 		err = ret;
+		unlock_page(super_page);
 		btrfs_release_disk_super(super_page);
 		goto fail_alloc;
 	}
@@ -2867,6 +2868,7 @@ int __cold open_ctree(struct super_block *sb,
 	if (btrfs_check_super_csum(fs_info, superblock)) {
 		btrfs_err(fs_info, "superblock checksum mismatch");
 		err = -EINVAL;
+		unlock_page(super_page);
 		btrfs_release_disk_super(super_page);
 		goto fail_csum;
 	}
@@ -2877,6 +2879,7 @@ int __cold open_ctree(struct super_block *sb,
 	 * the whole block of INFO_SIZE
 	 */
 	memcpy(fs_info->super_copy, superblock, sizeof(*fs_info->super_copy));
+	unlock_page(super_page);
 	btrfs_release_disk_super(super_page);
 
 	disk_super = fs_info->super_copy;
@@ -3413,7 +3416,6 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
 		     offset_in_page(bytenr));
 
 	ret = submit_bio_wait(&bio);
-	unlock_page(page);
 	/*
 	 * If we fail to read from the underlying devices, as of now
 	 * the best option we have is to mark it EIO.
@@ -3426,6 +3428,7 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
 	super = kmap(page);
 	if (btrfs_super_bytenr(super) != bytenr ||
 		    btrfs_super_magic(super) != BTRFS_MAGIC) {
+		unlock_page(page);
 		btrfs_release_disk_super(page);
 		return -EINVAL;
 	}
@@ -3457,11 +3460,14 @@ int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
 		super = kmap(*page);
 
 		if (!latest || btrfs_super_generation(super) > transid) {
-			if (latest)
+			if (latest) {
+				unlock_page(latest);
 				btrfs_release_disk_super(latest);
+			}
 			latest = *page;
 			transid = btrfs_super_generation(super);
 		} else {
+			unlock_page(*page);
 			btrfs_release_disk_super(*page);
 		}
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4a6ee518f0c..8bb2f9dead63 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7344,7 +7344,6 @@ void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_pat
 		bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
 			     offset_in_page(bytenr));
 
-		lock_page(page);
 		submit_bio_wait(&bio);
 		unlock_page(page);
 		btrfs_release_disk_super(page);
David Sterba Jan. 23, 2020, 10:25 p.m. UTC | #5
On Thu, Jan 23, 2020 at 05:18:45PM +0900, Johannes Thumshirn wrote:
> Super-block reading in BTRFS is done using buffer_heads. Buffer_heads have
> some drawbacks, like not being able to propagate errors from the lower
> layers.
> 
> Change the buffer_heads to BIOs and utilize the page cache for the page
> allocation. Compared to buffer_heads using BIOs are more lightweight and
> we skip several layers of buffer_head code until we either reach the page
> cache or build a BIO and submit it to read the blocks from disk.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> ---
> Changes to v1:
> - move 'super_page' into for-loop in btrfs_scratch_superblocks() (Nikolay)
> - switch to using pagecahce instead of alloc_pages() (Nikolay, David)
> ---
>  fs/btrfs/disk-io.c | 83 ++++++++++++++++++++++++++++++----------------
>  fs/btrfs/disk-io.h |  4 +--
>  fs/btrfs/volumes.c | 62 ++++++++++++++++++++--------------
>  fs/btrfs/volumes.h |  2 --
>  4 files changed, 94 insertions(+), 57 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index aea48d6ddc0c..b111f32108cc 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2635,11 +2635,12 @@ int __cold open_ctree(struct super_block *sb,
>  	u64 features;
>  	u16 csum_type;
>  	struct btrfs_key location;
> -	struct buffer_head *bh;
>  	struct btrfs_super_block *disk_super;
>  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>  	struct btrfs_root *tree_root;
>  	struct btrfs_root *chunk_root;
> +	struct page *super_page;
> +	u8 *superblock;
>  	int ret;
>  	int err = -EINVAL;
>  	int clear_free_space_tree = 0;
> @@ -2832,28 +2833,31 @@ int __cold open_ctree(struct super_block *sb,
>  	/*
>  	 * Read super block and check the signature bytes only
>  	 */
> -	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
> -	if (IS_ERR(bh)) {
> -		err = PTR_ERR(bh);
> +	ret = btrfs_read_dev_super(fs_devices->latest_bdev, &super_page);
> +	if (ret) {
> +		err = ret;
>  		goto fail_alloc;
>  	}
>  
> +	superblock = kmap(super_page);

Page mapped here

>  	/*
>  	 * Verify the type first, if that or the the checksum value are
>  	 * corrupted, we'll find out
>  	 */
> -	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
> +	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)
> +					  superblock);
>  	if (!btrfs_supported_super_csum(csum_type)) {
>  		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
>  			  csum_type);
>  		err = -EINVAL;
> -		brelse(bh);
> +		btrfs_release_disk_super(super_page);

and the pairing kunmap is hidden in this helper with mismatching name.
It's correct but the common cleanup code at the fail_alloc label can be
used to avoid calling btrfs_release_disk_super at each error branch.

The helper is trival, kunmap(), put_page(). I'd rather keep it opencoded
in case the kmap happens in the function so it can be easily seen what's
the span of the mapping.

>  		goto fail_alloc;
>  	}
>  
>  	ret = btrfs_init_csum_hash(fs_info, csum_type);
>  	if (ret) {
>  		err = ret;
> +		btrfs_release_disk_super(super_page);
>  		goto fail_alloc;
>  	}
>  
> @@ -2861,10 +2865,10 @@ int __cold open_ctree(struct super_block *sb,
>  	 * We want to check superblock checksum, the type is stored inside.
>  	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
>  	 */
> -	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
> +	if (btrfs_check_super_csum(fs_info, superblock)) {
>  		btrfs_err(fs_info, "superblock checksum mismatch");
>  		err = -EINVAL;
> -		brelse(bh);
> +		btrfs_release_disk_super(super_page);
>  		goto fail_csum;
>  	}
>  
> @@ -2873,8 +2877,8 @@ int __cold open_ctree(struct super_block *sb,
>  	 * following bytes up to INFO_SIZE, the checksum is calculated from
>  	 * the whole block of INFO_SIZE
>  	 */
> -	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
> -	brelse(bh);
> +	memcpy(fs_info->super_copy, superblock, sizeof(*fs_info->super_copy));
> +	btrfs_release_disk_super(super_page);
>  
>  	disk_super = fs_info->super_copy;
>  
> @@ -3374,40 +3378,60 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
>  }
>  
>  int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> -			struct buffer_head **bh_ret)
> +			struct page **super_page)
>  {
> -	struct buffer_head *bh;
>  	struct btrfs_super_block *super;
> +	struct bio_vec bio_vec;
> +	struct bio bio;
> +	struct page *page;
>  	u64 bytenr;
> +	struct address_space *mapping = bdev->bd_inode->i_mapping;
> +	gfp_t gfp_mask;
> +	int ret;
>  
>  	bytenr = btrfs_sb_offset(copy_num);
>  	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
>  		return -EINVAL;
>  
> -	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
> +	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
> +	page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, gfp_mask);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	bio_init(&bio, &bio_vec, 1);
> +	bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +	bio_set_dev(&bio, bdev);
> +	bio_set_op_attrs(&bio, REQ_OP_READ, 0);
> +	bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
> +		     offset_in_page(bytenr));
> +
> +	ret = submit_bio_wait(&bio);
> +	unlock_page(page);
>  	/*
>  	 * If we fail to read from the underlying devices, as of now
>  	 * the best option we have is to mark it EIO.
>  	 */
> -	if (!bh)
> +	if (ret) {
> +		put_page(page);
>  		return -EIO;
> +	}
>  
> -	super = (struct btrfs_super_block *)bh->b_data;
> +	super = kmap(page);
>  	if (btrfs_super_bytenr(super) != bytenr ||
>  		    btrfs_super_magic(super) != BTRFS_MAGIC) {
> -		brelse(bh);
> +		btrfs_release_disk_super(page);
>  		return -EINVAL;
>  	}
> +	kunmap(page);

Like here

>  
> -	*bh_ret = bh;
> +	*super_page = page;
>  	return 0;
>  }
>  
>  
> -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
> +int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
>  {
> -	struct buffer_head *bh;
> -	struct buffer_head *latest = NULL;
> +	struct page *latest = NULL;
>  	struct btrfs_super_block *super;
>  	int i;
>  	u64 transid = 0;
> @@ -3419,25 +3443,28 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
>  	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>  	 */
>  	for (i = 0; i < 1; i++) {
> -		ret = btrfs_read_dev_one_super(bdev, i, &bh);
> +		ret = btrfs_read_dev_one_super(bdev, i, page);
>  		if (ret)
>  			continue;
>  
> -		super = (struct btrfs_super_block *)bh->b_data;
> +		super = kmap(*page);
>  
>  		if (!latest || btrfs_super_generation(super) > transid) {
> -			brelse(latest);
> -			latest = bh;
> +			if (latest)
> +				btrfs_release_disk_super(latest);
> +			latest = *page;
>  			transid = btrfs_super_generation(super);
>  		} else {
> -			brelse(bh);
> +			btrfs_release_disk_super(*page);
>  		}
> +
> +		kunmap(*page);

And here.

>  	}
>  
> -	if (!latest)
> -		return ERR_PTR(ret);
> +	if (ret)
> +		return ret;
>  
> -	return latest;
> +	return 0;
>  }
>  
>  /*
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index 8c2d6cf1ce59..e04b233c436a 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -54,9 +54,9 @@ int __cold open_ctree(struct super_block *sb,
>  	       char *options);
>  void __cold close_ctree(struct btrfs_fs_info *fs_info);
>  int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
> -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
> +int btrfs_read_dev_super(struct block_device *bdev, struct page **super_page);
>  int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> -			struct buffer_head **bh_ret);
> +			     struct page **super_page);
>  int btrfs_commit_super(struct btrfs_fs_info *fs_info);
>  struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
>  				      struct btrfs_key *location);
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 7a480a2bdf51..f4a6ee518f0c 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -6,7 +6,6 @@
>  #include <linux/sched.h>
>  #include <linux/bio.h>
>  #include <linux/slab.h>
> -#include <linux/buffer_head.h>
>  #include <linux/blkdev.h>
>  #include <linux/ratelimit.h>
>  #include <linux/kthread.h>
> @@ -500,7 +499,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
>  static int
>  btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
>  		      int flush, struct block_device **bdev,
> -		      struct buffer_head **bh)
> +		      struct page **super_page)
>  {
>  	int ret;
>  
> @@ -519,9 +518,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
>  		goto error;
>  	}
>  	invalidate_bdev(*bdev);
> -	*bh = btrfs_read_dev_super(*bdev);
> -	if (IS_ERR(*bh)) {
> -		ret = PTR_ERR(*bh);
> +	ret = btrfs_read_dev_super(*bdev, super_page);
> +	if (ret) {
>  		blkdev_put(*bdev, flags);
>  		goto error;
>  	}
> @@ -530,7 +528,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
>  
>  error:
>  	*bdev = NULL;
> -	*bh = NULL;
>  	return ret;
>  }
>  
> @@ -611,7 +608,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>  {
>  	struct request_queue *q;
>  	struct block_device *bdev;
> -	struct buffer_head *bh;
> +	struct page *super_page;
>  	struct btrfs_super_block *disk_super;
>  	u64 devid;
>  	int ret;
> @@ -622,17 +619,17 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>  		return -EINVAL;
>  
>  	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
> -				    &bdev, &bh);
> +				    &bdev, &super_page);
>  	if (ret)
>  		return ret;
>  
> -	disk_super = (struct btrfs_super_block *)bh->b_data;
> +	disk_super = kmap(super_page);
>  	devid = btrfs_stack_device_id(&disk_super->dev_item);
>  	if (devid != device->devid)
> -		goto error_brelse;
> +		goto error_free_page;
>  
>  	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
> -		goto error_brelse;
> +		goto error_free_page;
>  
>  	device->generation = btrfs_super_generation(disk_super);
>  
> @@ -641,7 +638,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>  		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
>  			pr_err(
>  		"BTRFS: Invalid seeding and uuid-changed device detected\n");
> -			goto error_brelse;
> +			goto error_free_page;
>  		}
>  
>  		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
> @@ -667,12 +664,12 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
>  		fs_devices->rw_devices++;
>  		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
>  	}
> -	brelse(bh);
> +	btrfs_release_disk_super(super_page);

kummap here

>  
>  	return 0;
>  
> -error_brelse:
> -	brelse(bh);
> +error_free_page:
> +	btrfs_release_disk_super(super_page);
>  	blkdev_put(bdev, flags);
>  
>  	return -EINVAL;
> @@ -2209,14 +2206,15 @@ static struct btrfs_device *btrfs_find_device_by_path(
>  	u64 devid;
>  	u8 *dev_uuid;
>  	struct block_device *bdev;
> -	struct buffer_head *bh;
> +	struct page *super_page;
>  	struct btrfs_device *device;
>  
>  	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
> -				    fs_info->bdev_holder, 0, &bdev, &bh);
> +				    fs_info->bdev_holder, 0, &bdev,
> +				    &super_page);
>  	if (ret)
>  		return ERR_PTR(ret);
> -	disk_super = (struct btrfs_super_block *)bh->b_data;
> +	disk_super = kmap(super_page);
>  	devid = btrfs_stack_device_id(&disk_super->dev_item);
>  	dev_uuid = disk_super->dev_item.uuid;
>  	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
> @@ -2226,7 +2224,7 @@ static struct btrfs_device *btrfs_find_device_by_path(
>  		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
>  					   disk_super->fsid, true);
>  
> -	brelse(bh);
> +	btrfs_release_disk_super(super_page);

and here

>  	if (!device)
>  		device = ERR_PTR(-ENOENT);
>  	blkdev_put(bdev, FMODE_READ);
> @@ -7319,25 +7317,39 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
>  
>  void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
>  {
> -	struct buffer_head *bh;
> +	struct bio_vec bio_vec;
> +	struct bio bio;
>  	struct btrfs_super_block *disk_super;
>  	int copy_num;
>  
>  	if (!bdev)
>  		return;
>  
> +	bio_init(&bio, &bio_vec, 1);
>  	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
>  		copy_num++) {
> +		u64 bytenr = btrfs_sb_offset(copy_num);
> +		struct page *page;
>  
> -		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
> +		if (btrfs_read_dev_one_super(bdev, copy_num, &page))
>  			continue;
>  
> -		disk_super = (struct btrfs_super_block *)bh->b_data;
> +		disk_super = kmap(page) + offset_in_page(bytenr);
>  
>  		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
> -		set_buffer_dirty(bh);
> -		sync_dirty_buffer(bh);
> -		brelse(bh);
> +
> +		bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +		bio_set_dev(&bio, bdev);
> +		bio_set_op_attrs(&bio, REQ_OP_WRITE, 0);
> +		bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
> +			     offset_in_page(bytenr));
> +
> +		lock_page(page);
> +		submit_bio_wait(&bio);
> +		unlock_page(page);
> +		btrfs_release_disk_super(page);

And the last one

> +		bio_reset(&bio);
> +
>  	}
>  
>  	/* Notify udev that device has changed */
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index b7f2edbc6581..1e4ebe6d6368 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -17,8 +17,6 @@ extern struct mutex uuid_mutex;
>  
>  #define BTRFS_STRIPE_LEN	SZ_64K
>  
> -struct buffer_head;
> -
>  struct btrfs_io_geometry {
>  	/* remaining bytes before crossing a stripe */
>  	u64 len;
> -- 
> 2.24.1
David Sterba Jan. 24, 2020, 1:54 p.m. UTC | #6
On Thu, Jan 23, 2020 at 05:18:45PM +0900, Johannes Thumshirn wrote:
> -	if (!latest)
> -		return ERR_PTR(ret);
> +	if (ret)
> +		return ret;
>  
> -	return latest;
> +	return 0;

This could be simpliried to 'return ret', no?
David Sterba Jan. 24, 2020, 2:03 p.m. UTC | #7
On Thu, Jan 23, 2020 at 05:18:45PM +0900, Johannes Thumshirn wrote:
> Super-block reading in BTRFS is done using buffer_heads. Buffer_heads have
> some drawbacks, like not being able to propagate errors from the lower
> layers.
> 
> Change the buffer_heads to BIOs and utilize the page cache for the page
> allocation. Compared to buffer_heads using BIOs are more lightweight and
> we skip several layers of buffer_head code until we either reach the page
> cache or build a BIO and submit it to read the blocks from disk.
> 
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> 
> ---
> Changes to v1:
> - move 'super_page' into for-loop in btrfs_scratch_superblocks() (Nikolay)
> - switch to using pagecahce instead of alloc_pages() (Nikolay, David)
> ---
>  fs/btrfs/disk-io.c | 83 ++++++++++++++++++++++++++++++----------------
>  fs/btrfs/disk-io.h |  4 +--
>  fs/btrfs/volumes.c | 62 ++++++++++++++++++++--------------
>  fs/btrfs/volumes.h |  2 --
>  4 files changed, 94 insertions(+), 57 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index aea48d6ddc0c..b111f32108cc 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2635,11 +2635,12 @@ int __cold open_ctree(struct super_block *sb,
>  	u64 features;
>  	u16 csum_type;
>  	struct btrfs_key location;
> -	struct buffer_head *bh;
>  	struct btrfs_super_block *disk_super;
>  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>  	struct btrfs_root *tree_root;
>  	struct btrfs_root *chunk_root;
> +	struct page *super_page;
> +	u8 *superblock;
>  	int ret;
>  	int err = -EINVAL;
>  	int clear_free_space_tree = 0;
> @@ -2832,28 +2833,31 @@ int __cold open_ctree(struct super_block *sb,
>  	/*
>  	 * Read super block and check the signature bytes only
>  	 */
> -	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
> -	if (IS_ERR(bh)) {
> -		err = PTR_ERR(bh);
> +	ret = btrfs_read_dev_super(fs_devices->latest_bdev, &super_page);
> +	if (ret) {
> +		err = ret;
>  		goto fail_alloc;
>  	}
>  
> +	superblock = kmap(super_page);
>  	/*
>  	 * Verify the type first, if that or the the checksum value are
>  	 * corrupted, we'll find out
>  	 */
> -	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
> +	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)
> +					  superblock);
>  	if (!btrfs_supported_super_csum(csum_type)) {
>  		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
>  			  csum_type);
>  		err = -EINVAL;
> -		brelse(bh);
> +		btrfs_release_disk_super(super_page);
>  		goto fail_alloc;
>  	}
>  
>  	ret = btrfs_init_csum_hash(fs_info, csum_type);
>  	if (ret) {
>  		err = ret;
> +		btrfs_release_disk_super(super_page);
>  		goto fail_alloc;
>  	}
>  
> @@ -2861,10 +2865,10 @@ int __cold open_ctree(struct super_block *sb,
>  	 * We want to check superblock checksum, the type is stored inside.
>  	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
>  	 */
> -	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
> +	if (btrfs_check_super_csum(fs_info, superblock)) {
>  		btrfs_err(fs_info, "superblock checksum mismatch");
>  		err = -EINVAL;
> -		brelse(bh);
> +		btrfs_release_disk_super(super_page);
>  		goto fail_csum;
>  	}
>  
> @@ -2873,8 +2877,8 @@ int __cold open_ctree(struct super_block *sb,
>  	 * following bytes up to INFO_SIZE, the checksum is calculated from
>  	 * the whole block of INFO_SIZE
>  	 */
> -	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
> -	brelse(bh);
> +	memcpy(fs_info->super_copy, superblock, sizeof(*fs_info->super_copy));
> +	btrfs_release_disk_super(super_page);
>  
>  	disk_super = fs_info->super_copy;
>  
> @@ -3374,40 +3378,60 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
>  }
>  
>  int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
> -			struct buffer_head **bh_ret)
> +			struct page **super_page)
>  {
> -	struct buffer_head *bh;
>  	struct btrfs_super_block *super;
> +	struct bio_vec bio_vec;
> +	struct bio bio;
> +	struct page *page;
>  	u64 bytenr;
> +	struct address_space *mapping = bdev->bd_inode->i_mapping;
> +	gfp_t gfp_mask;
> +	int ret;
>  
>  	bytenr = btrfs_sb_offset(copy_num);
>  	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
>  		return -EINVAL;
>  
> -	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
> +	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
> +	page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, gfp_mask);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	bio_init(&bio, &bio_vec, 1);
> +	bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
> +	bio_set_dev(&bio, bdev);
> +	bio_set_op_attrs(&bio, REQ_OP_READ, 0);

The comment in blk_types.h says:

390 /* obsolete, don't use in new code */
391 static inline void bio_set_op_attrs(struct bio *bio, unsigned op,
392                 unsigned op_flags)

please open code it. Other uses of bio_set_op_attrs have been cleaned up
already.

> +	bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
> +		     offset_in_page(bytenr));

For a fresh bio this never fails right?

> +
> +	ret = submit_bio_wait(&bio);
> +	unlock_page(page);
>  	/*
>  	 * If we fail to read from the underlying devices, as of now
>  	 * the best option we have is to mark it EIO.
>  	 */
> -	if (!bh)
> +	if (ret) {
> +		put_page(page);
>  		return -EIO;
> +	}
>  
> -	super = (struct btrfs_super_block *)bh->b_data;
> +	super = kmap(page);
>  	if (btrfs_super_bytenr(super) != bytenr ||
>  		    btrfs_super_magic(super) != BTRFS_MAGIC) {
> -		brelse(bh);
> +		btrfs_release_disk_super(page);
>  		return -EINVAL;
>  	}
> +	kunmap(page);
>  
> -	*bh_ret = bh;
> +	*super_page = page;
>  	return 0;
>  }
>  
>  
> -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
> +int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
>  {
> -	struct buffer_head *bh;
> -	struct buffer_head *latest = NULL;
> +	struct page *latest = NULL;
>  	struct btrfs_super_block *super;
>  	int i;
>  	u64 transid = 0;
> @@ -3419,25 +3443,28 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
>  	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>  	 */
>  	for (i = 0; i < 1; i++) {
> -		ret = btrfs_read_dev_one_super(bdev, i, &bh);
> +		ret = btrfs_read_dev_one_super(bdev, i, page);
>  		if (ret)
>  			continue;
>  
> -		super = (struct btrfs_super_block *)bh->b_data;
> +		super = kmap(*page);
>  
>  		if (!latest || btrfs_super_generation(super) > transid) {
> -			brelse(latest);
> -			latest = bh;
> +			if (latest)
> +				btrfs_release_disk_super(latest);
> +			latest = *page;
>  			transid = btrfs_super_generation(super);
>  		} else {
> -			brelse(bh);
> +			btrfs_release_disk_super(*page);
>  		}
> +
> +		kunmap(*page);

This looks like double unmap, once in btrfs_release_disk_super and then
unconditinally as kunmap.

The first part of the if/else block also calls kunmap, but depending on
latest.

So another point for opencoding btrfs_release_disk_super, this kind of
mistakes is too easy.

>  	}
>  
> -	if (!latest)
> -		return ERR_PTR(ret);
> +	if (ret)
> +		return ret;
>  
> -	return latest;
> +	return 0;
>  }
>  
>  /*

Patch
diff mbox series

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aea48d6ddc0c..b111f32108cc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2635,11 +2635,12 @@  int __cold open_ctree(struct super_block *sb,
 	u64 features;
 	u16 csum_type;
 	struct btrfs_key location;
-	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
+	struct page *super_page;
+	u8 *superblock;
 	int ret;
 	int err = -EINVAL;
 	int clear_free_space_tree = 0;
@@ -2832,28 +2833,31 @@  int __cold open_ctree(struct super_block *sb,
 	/*
 	 * Read super block and check the signature bytes only
 	 */
-	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-	if (IS_ERR(bh)) {
-		err = PTR_ERR(bh);
+	ret = btrfs_read_dev_super(fs_devices->latest_bdev, &super_page);
+	if (ret) {
+		err = ret;
 		goto fail_alloc;
 	}
 
+	superblock = kmap(super_page);
 	/*
 	 * Verify the type first, if that or the the checksum value are
 	 * corrupted, we'll find out
 	 */
-	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
+	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)
+					  superblock);
 	if (!btrfs_supported_super_csum(csum_type)) {
 		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
 			  csum_type);
 		err = -EINVAL;
-		brelse(bh);
+		btrfs_release_disk_super(super_page);
 		goto fail_alloc;
 	}
 
 	ret = btrfs_init_csum_hash(fs_info, csum_type);
 	if (ret) {
 		err = ret;
+		btrfs_release_disk_super(super_page);
 		goto fail_alloc;
 	}
 
@@ -2861,10 +2865,10 @@  int __cold open_ctree(struct super_block *sb,
 	 * We want to check superblock checksum, the type is stored inside.
 	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
 	 */
-	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
+	if (btrfs_check_super_csum(fs_info, superblock)) {
 		btrfs_err(fs_info, "superblock checksum mismatch");
 		err = -EINVAL;
-		brelse(bh);
+		btrfs_release_disk_super(super_page);
 		goto fail_csum;
 	}
 
@@ -2873,8 +2877,8 @@  int __cold open_ctree(struct super_block *sb,
 	 * following bytes up to INFO_SIZE, the checksum is calculated from
 	 * the whole block of INFO_SIZE
 	 */
-	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
-	brelse(bh);
+	memcpy(fs_info->super_copy, superblock, sizeof(*fs_info->super_copy));
+	btrfs_release_disk_super(super_page);
 
 	disk_super = fs_info->super_copy;
 
@@ -3374,40 +3378,60 @@  static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 }
 
 int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
-			struct buffer_head **bh_ret)
+			struct page **super_page)
 {
-	struct buffer_head *bh;
 	struct btrfs_super_block *super;
+	struct bio_vec bio_vec;
+	struct bio bio;
+	struct page *page;
 	u64 bytenr;
+	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	gfp_t gfp_mask;
+	int ret;
 
 	bytenr = btrfs_sb_offset(copy_num);
 	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
-	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
+	gfp_mask = mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL;
+	page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, gfp_mask);
+	if (!page)
+		return -ENOMEM;
+
+	bio_init(&bio, &bio_vec, 1);
+	bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+	bio_set_dev(&bio, bdev);
+	bio_set_op_attrs(&bio, REQ_OP_READ, 0);
+	bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
+		     offset_in_page(bytenr));
+
+	ret = submit_bio_wait(&bio);
+	unlock_page(page);
 	/*
 	 * If we fail to read from the underlying devices, as of now
 	 * the best option we have is to mark it EIO.
 	 */
-	if (!bh)
+	if (ret) {
+		put_page(page);
 		return -EIO;
+	}
 
-	super = (struct btrfs_super_block *)bh->b_data;
+	super = kmap(page);
 	if (btrfs_super_bytenr(super) != bytenr ||
 		    btrfs_super_magic(super) != BTRFS_MAGIC) {
-		brelse(bh);
+		btrfs_release_disk_super(page);
 		return -EINVAL;
 	}
+	kunmap(page);
 
-	*bh_ret = bh;
+	*super_page = page;
 	return 0;
 }
 
 
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+int btrfs_read_dev_super(struct block_device *bdev, struct page **page)
 {
-	struct buffer_head *bh;
-	struct buffer_head *latest = NULL;
+	struct page *latest = NULL;
 	struct btrfs_super_block *super;
 	int i;
 	u64 transid = 0;
@@ -3419,25 +3443,28 @@  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 	 */
 	for (i = 0; i < 1; i++) {
-		ret = btrfs_read_dev_one_super(bdev, i, &bh);
+		ret = btrfs_read_dev_one_super(bdev, i, page);
 		if (ret)
 			continue;
 
-		super = (struct btrfs_super_block *)bh->b_data;
+		super = kmap(*page);
 
 		if (!latest || btrfs_super_generation(super) > transid) {
-			brelse(latest);
-			latest = bh;
+			if (latest)
+				btrfs_release_disk_super(latest);
+			latest = *page;
 			transid = btrfs_super_generation(super);
 		} else {
-			brelse(bh);
+			btrfs_release_disk_super(*page);
 		}
+
+		kunmap(*page);
 	}
 
-	if (!latest)
-		return ERR_PTR(ret);
+	if (ret)
+		return ret;
 
-	return latest;
+	return 0;
 }
 
 /*
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8c2d6cf1ce59..e04b233c436a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -54,9 +54,9 @@  int __cold open_ctree(struct super_block *sb,
 	       char *options);
 void __cold close_ctree(struct btrfs_fs_info *fs_info);
 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
-struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_read_dev_super(struct block_device *bdev, struct page **super_page);
 int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
-			struct buffer_head **bh_ret);
+			     struct page **super_page);
 int btrfs_commit_super(struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 				      struct btrfs_key *location);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7a480a2bdf51..f4a6ee518f0c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6,7 +6,6 @@ 
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/slab.h>
-#include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
@@ -500,7 +499,7 @@  static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
 static int
 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 		      int flush, struct block_device **bdev,
-		      struct buffer_head **bh)
+		      struct page **super_page)
 {
 	int ret;
 
@@ -519,9 +518,8 @@  btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 		goto error;
 	}
 	invalidate_bdev(*bdev);
-	*bh = btrfs_read_dev_super(*bdev);
-	if (IS_ERR(*bh)) {
-		ret = PTR_ERR(*bh);
+	ret = btrfs_read_dev_super(*bdev, super_page);
+	if (ret) {
 		blkdev_put(*bdev, flags);
 		goto error;
 	}
@@ -530,7 +528,6 @@  btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 
 error:
 	*bdev = NULL;
-	*bh = NULL;
 	return ret;
 }
 
@@ -611,7 +608,7 @@  static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 {
 	struct request_queue *q;
 	struct block_device *bdev;
-	struct buffer_head *bh;
+	struct page *super_page;
 	struct btrfs_super_block *disk_super;
 	u64 devid;
 	int ret;
@@ -622,17 +619,17 @@  static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		return -EINVAL;
 
 	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
-				    &bdev, &bh);
+				    &bdev, &super_page);
 	if (ret)
 		return ret;
 
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+	disk_super = kmap(super_page);
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	if (devid != device->devid)
-		goto error_brelse;
+		goto error_free_page;
 
 	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
-		goto error_brelse;
+		goto error_free_page;
 
 	device->generation = btrfs_super_generation(disk_super);
 
@@ -641,7 +638,7 @@  static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 			pr_err(
 		"BTRFS: Invalid seeding and uuid-changed device detected\n");
-			goto error_brelse;
+			goto error_free_page;
 		}
 
 		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -667,12 +664,12 @@  static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		fs_devices->rw_devices++;
 		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 	}
-	brelse(bh);
+	btrfs_release_disk_super(super_page);
 
 	return 0;
 
-error_brelse:
-	brelse(bh);
+error_free_page:
+	btrfs_release_disk_super(super_page);
 	blkdev_put(bdev, flags);
 
 	return -EINVAL;
@@ -2209,14 +2206,15 @@  static struct btrfs_device *btrfs_find_device_by_path(
 	u64 devid;
 	u8 *dev_uuid;
 	struct block_device *bdev;
-	struct buffer_head *bh;
+	struct page *super_page;
 	struct btrfs_device *device;
 
 	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
-				    fs_info->bdev_holder, 0, &bdev, &bh);
+				    fs_info->bdev_holder, 0, &bdev,
+				    &super_page);
 	if (ret)
 		return ERR_PTR(ret);
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+	disk_super = kmap(super_page);
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	dev_uuid = disk_super->dev_item.uuid;
 	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
@@ -2226,7 +2224,7 @@  static struct btrfs_device *btrfs_find_device_by_path(
 		device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
 					   disk_super->fsid, true);
 
-	brelse(bh);
+	btrfs_release_disk_super(super_page);
 	if (!device)
 		device = ERR_PTR(-ENOENT);
 	blkdev_put(bdev, FMODE_READ);
@@ -7319,25 +7317,39 @@  int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
 
 void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
 {
-	struct buffer_head *bh;
+	struct bio_vec bio_vec;
+	struct bio bio;
 	struct btrfs_super_block *disk_super;
 	int copy_num;
 
 	if (!bdev)
 		return;
 
+	bio_init(&bio, &bio_vec, 1);
 	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
 		copy_num++) {
+		u64 bytenr = btrfs_sb_offset(copy_num);
+		struct page *page;
 
-		if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
+		if (btrfs_read_dev_one_super(bdev, copy_num, &page))
 			continue;
 
-		disk_super = (struct btrfs_super_block *)bh->b_data;
+		disk_super = kmap(page) + offset_in_page(bytenr);
 
 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-		set_buffer_dirty(bh);
-		sync_dirty_buffer(bh);
-		brelse(bh);
+
+		bio.bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
+		bio_set_dev(&bio, bdev);
+		bio_set_op_attrs(&bio, REQ_OP_WRITE, 0);
+		bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE,
+			     offset_in_page(bytenr));
+
+		lock_page(page);
+		submit_bio_wait(&bio);
+		unlock_page(page);
+		btrfs_release_disk_super(page);
+		bio_reset(&bio);
+
 	}
 
 	/* Notify udev that device has changed */
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b7f2edbc6581..1e4ebe6d6368 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -17,8 +17,6 @@  extern struct mutex uuid_mutex;
 
 #define BTRFS_STRIPE_LEN	SZ_64K
 
-struct buffer_head;
-
 struct btrfs_io_geometry {
 	/* remaining bytes before crossing a stripe */
 	u64 len;