diff mbox series

[v9,11/41] btrfs: implement log-structured superblock for ZONED mode

Message ID eca26372a84d8b8ec2b59d3390f172810ed6f3e4.1604065695.git.naohiro.aota@wdc.com (mailing list archive)
State New, archived
Headers show
Series btrfs: zoned block device support | expand

Commit Message

Naohiro Aota Oct. 30, 2020, 1:51 p.m. UTC
The superblock (and its copies) is the only data structure in btrfs which has a
fixed location on a device. Since we cannot overwrite data in a sequential write
required zone, we cannot place the superblock in such a zone. One easy solution
is to limit the superblock and its copies to conventional zones only.
However, this method has two downsides. The first is a reduced number of
superblock copies: the second copy of the superblock is located at 256GB, which
is in a sequential write required zone on typical devices in the market today,
so the number of superblocks (primary plus copies) is limited to two. The second
downside is that we cannot support devices which have no conventional zones
at all.

To solve these two problems, we employ superblock log writing. It uses two
zones as a circular buffer to write updated superblocks. Once the first
zone is filled up, we start writing into the second zone. Then, when both
zones are filled up and before we start writing to the first zone again,
the first zone is reset.

We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.

The following zones are reserved as the circular buffer on ZONED btrfs.

- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zone 1024 or the zone at 256GB, whichever is smaller, and
  the zone next to it

If these reserved zones are conventional, the superblock is written at a fixed
location at the start of the zone, without logging.

Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
---
 fs/btrfs/block-group.c |   9 ++
 fs/btrfs/disk-io.c     |  41 +++++-
 fs/btrfs/scrub.c       |   3 +
 fs/btrfs/volumes.c     |  21 ++-
 fs/btrfs/zoned.c       | 311 +++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/zoned.h       |  40 ++++++
 6 files changed, 413 insertions(+), 12 deletions(-)

Comments

Josef Bacik Nov. 2, 2020, 6:22 p.m. UTC | #1
On 10/30/20 9:51 AM, Naohiro Aota wrote:
> Superblock (and its copies) is the only data structure in btrfs which has a
> fixed location on a device. Since we cannot overwrite in a sequential write
> required zone, we cannot place superblock in the zone. One easy solution is
> limiting superblock and copies to be placed only in conventional zones.
> However, this method has two downsides: one is reduced number of superblock
> copies. The location of the second copy of superblock is 256GB, which is in
> a sequential write required zone on typical devices in the market today.
> So, the number of superblock and copies is limited to be two.  Second
> downside is that we cannot support devices which have no conventional zones
> at all.
> 
> To solve these two problems, we employ superblock log writing. It uses two
> zones as a circular buffer to write updated superblocks. Once the first
> zone is filled up, start writing into the second buffer. Then, when the
> both zones are filled up and before start writing to the first zone again,
> it reset the first zone.
> 
> We can determine the position of the latest superblock by reading write
> pointer information from a device. One corner case is when the both zones
> are full. For this situation, we read out the last superblock of each
> zone, and compare them to determine which zone is older.
> 
> The following zones are reserved as the circular buffer on ZONED btrfs.
> 
> - The primary superblock: zones 0 and 1
> - The first copy: zones 16 and 17
> - The second copy: zones 1024 or zone at 256GB which is minimum, and next
>    to it
> 
> If these reserved zones are conventional, superblock is written fixed at
> the start of the zone without logging.
> 
> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
> ---
>   fs/btrfs/block-group.c |   9 ++
>   fs/btrfs/disk-io.c     |  41 +++++-
>   fs/btrfs/scrub.c       |   3 +
>   fs/btrfs/volumes.c     |  21 ++-
>   fs/btrfs/zoned.c       | 311 +++++++++++++++++++++++++++++++++++++++++
>   fs/btrfs/zoned.h       |  40 ++++++
>   6 files changed, 413 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index c0f1d6818df7..e989c66aa764 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1723,6 +1723,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
>   static int exclude_super_stripes(struct btrfs_block_group *cache)
>   {
>   	struct btrfs_fs_info *fs_info = cache->fs_info;
> +	bool zoned = btrfs_is_zoned(fs_info);
>   	u64 bytenr;
>   	u64 *logical;
>   	int stripe_len;
> @@ -1744,6 +1745,14 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
>   		if (ret)
>   			return ret;
>   
> +		/* shouldn't have super stripes in sequential zones */
> +		if (zoned && nr) {
> +			btrfs_err(fs_info,
> +				  "Zoned btrfs's block group %llu should not have super blocks",
> +				  cache->start);
> +			return -EUCLEAN;
> +		}
> +

I'm very confused about this check, namely how you've been able to test without 
it blowing up, which makes me feel like I'm missing something.

We _always_ call exclude_super_stripes(), and we're simply looking up the bytenr
for that block, which appears to not do anything special for zoned.  This should
be looking up and failing whenever it looks for super stripes far enough out.
How are you not failing here every time you mount the fs?  Thanks,

Josef
Johannes Thumshirn Nov. 2, 2020, 6:53 p.m. UTC | #2
On 02/11/2020 19:23, Josef Bacik wrote:
>> +		/* shouldn't have super stripes in sequential zones */
>> +		if (zoned && nr) {
>> +			btrfs_err(fs_info,
>> +				  "Zoned btrfs's block group %llu should not have super blocks",
>> +				  cache->start);
>> +			return -EUCLEAN;
>> +		}
>> +
> I'm very confused about this check, namely how you've been able to test without 
> it blowing up, which makes me feel like I'm missing something.
> 
> We _always_ call exclude_super_stripes(), and we're simply looking up the bytenr 
> for that block, which appears to not do anything special for zoned.  This should 
> be looking up and failing whenever it looks for super stripes far enough out. 
> How are you not failing here everytime you mount the fs?  Thanks,

Naohiro (or Josef and everyone else as well of course), please correct me if I'm
wrong, but on zoned btrfs we're not supporting any RAID type. So the call to
btrfs_rmap_block() above will return 'nr = 0' (as we're always having
map->num_stripes = 1) so this won't evaluate to true.

Byte,
	Johannes
Josef Bacik Nov. 2, 2020, 6:54 p.m. UTC | #3
On 10/30/20 9:51 AM, Naohiro Aota wrote:
> Superblock (and its copies) is the only data structure in btrfs which has a
> fixed location on a device. Since we cannot overwrite in a sequential write
> required zone, we cannot place superblock in the zone. One easy solution is
> limiting superblock and copies to be placed only in conventional zones.
> However, this method has two downsides: one is reduced number of superblock
> copies. The location of the second copy of superblock is 256GB, which is in
> a sequential write required zone on typical devices in the market today.
> So, the number of superblock and copies is limited to be two.  Second
> downside is that we cannot support devices which have no conventional zones
> at all.
> 
> To solve these two problems, we employ superblock log writing. It uses two
> zones as a circular buffer to write updated superblocks. Once the first
> zone is filled up, start writing into the second buffer. Then, when the
> both zones are filled up and before start writing to the first zone again,
> it reset the first zone.
> 
> We can determine the position of the latest superblock by reading write
> pointer information from a device. One corner case is when the both zones
> are full. For this situation, we read out the last superblock of each
> zone, and compare them to determine which zone is older.
> 
> The following zones are reserved as the circular buffer on ZONED btrfs.
> 
> - The primary superblock: zones 0 and 1
> - The first copy: zones 16 and 17
> - The second copy: zones 1024 or zone at 256GB which is minimum, and next
>    to it
> 
> If these reserved zones are conventional, superblock is written fixed at
> the start of the zone without logging.
> 

<snip>

>   
>   /*
>    * This is only the first step towards a full-features scrub. It reads all
> @@ -3704,6 +3705,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
>   		if (bytenr + BTRFS_SUPER_INFO_SIZE >
>   		    scrub_dev->commit_total_bytes)
>   			break;
> +		if (!btrfs_check_super_location(scrub_dev, bytenr))
> +			continue;

Any reason in particular we're skipping scrubbing supers here?  Can't we just 
lookup the bytenr and do the right thing here?

>   
>   		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
>   				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 10827892c086..db884b96a5ea 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1282,7 +1282,8 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
>   }
>   
>   static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
> -						       u64 bytenr)
> +						       u64 bytenr,
> +						       u64 bytenr_orig)
>   {
>   	struct btrfs_super_block *disk_super;
>   	struct page *page;
> @@ -1313,7 +1314,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
>   	/* align our pointer to the offset of the super block */
>   	disk_super = p + offset_in_page(bytenr);
>   
> -	if (btrfs_super_bytenr(disk_super) != bytenr ||
> +	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
>   	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
>   		btrfs_release_disk_super(p);
>   		return ERR_PTR(-EINVAL);
> @@ -1348,7 +1349,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>   	bool new_device_added = false;
>   	struct btrfs_device *device = NULL;
>   	struct block_device *bdev;
> -	u64 bytenr;
> +	u64 bytenr, bytenr_orig;
> +	int ret;
>   
>   	lockdep_assert_held(&uuid_mutex);
>   
> @@ -1358,14 +1360,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>   	 * So, we need to add a special mount option to scan for
>   	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>   	 */
> -	bytenr = btrfs_sb_offset(0);
>   	flags |= FMODE_EXCL;
>   
>   	bdev = blkdev_get_by_path(path, flags, holder);
>   	if (IS_ERR(bdev))
>   		return ERR_CAST(bdev);
>   
> -	disk_super = btrfs_read_disk_super(bdev, bytenr);
> +	bytenr_orig = btrfs_sb_offset(0);
> +	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
>   	if (IS_ERR(disk_super)) {
>   		device = ERR_CAST(disk_super);
>   		goto error_bdev_put;
> @@ -2029,6 +2035,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
>   		if (IS_ERR(disk_super))
>   			continue;
>   
> +		if (bdev_is_zoned(bdev)) {
> +			btrfs_reset_sb_log_zones(bdev, copy_num);
> +			continue;
> +		}
> +
>   		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
>   
>   		page = virt_to_page(disk_super);
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index ae509699da14..d5487cba203b 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -20,6 +20,25 @@ static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx,
>   	return 0;
>   }
>   
> +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zone,
> +			    u64 *wp_ret);
> +
> +static inline u32 sb_zone_number(u8 shift, int mirror)
> +{
> +	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
> +
> +	switch (mirror) {
> +	case 0:
> +		return 0;
> +	case 1:
> +		return 16;
> +	case 2:
> +		return min(btrfs_sb_offset(mirror) >> shift, 1024ULL);
> +	}
> +

Can we get a comment here explaining the zone numbers?

> +	return 0;
> +}
> +
>   static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
>   			       struct blk_zone *zones, unsigned int *nr_zones)
>   {
> @@ -123,6 +142,49 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
>   		goto out;
>   	}
>   
> +	/* validate superblock log */
> +	nr_zones = 2;
> +	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +		u32 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
> +		u64 sb_wp;
> +

I'd rather see

#define BTRFS_NR_ZONED_SB_ZONES 2

or something equally poorly named and use that instead of our magic 2 everywhere.

Then you can just do

int index = i * BTRFS_NR_ZONED_SB_ZONES;
&zone_info->sb_zones[index];

<snip>

> +static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
> +			   int rw, u64 *bytenr_ret)
> +{
> +	u64 wp;
> +	int ret;
> +
> +	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> +		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
> +		return 0;
> +	}
> +
> +	ret = sb_write_pointer(bdev, zones, &wp);
> +	if (ret != -ENOENT && ret < 0)
> +		return ret;
> +
> +	if (rw == WRITE) {
> +		struct blk_zone *reset = NULL;
> +
> +		if (wp == zones[0].start << SECTOR_SHIFT)
> +			reset = &zones[0];
> +		else if (wp == zones[1].start << SECTOR_SHIFT)
> +			reset = &zones[1];
> +
> +		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
> +			ASSERT(reset->cond == BLK_ZONE_COND_FULL);
> +
> +			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
> +					       reset->start, reset->len,
> +					       GFP_NOFS);

What happens if we crash right after this?  Is the WP set to the start of the 
zone here?  Does this mean we'll simply miss the super block?  I understand 
we're resetting one zone here, but we're doing this in order, so we'll reset one 
and write one, then reset the other and write the next.  We don't wait until 
we've issued the writes for everything, so it appears to me that there's a gap 
where we could have the WP pointed at the start of the zone, which we view as an 
invalid state and thus won't be able to mount the file system.  Or am I missing 
something?  Thanks,

Josef
Josef Bacik Nov. 2, 2020, 7:01 p.m. UTC | #4
On 11/2/20 1:53 PM, Johannes Thumshirn wrote:
> On 02/11/2020 19:23, Josef Bacik wrote:
>>> +		/* shouldn't have super stripes in sequential zones */
>>> +		if (zoned && nr) {
>>> +			btrfs_err(fs_info,
>>> +				  "Zoned btrfs's block group %llu should not have super blocks",
>>> +				  cache->start);
>>> +			return -EUCLEAN;
>>> +		}
>>> +
>> I'm very confused about this check, namely how you've been able to test without
>> it blowing up, which makes me feel like I'm missing something.
>>
>> We _always_ call exclude_super_stripes(), and we're simply looking up the bytenr
>> for that block, which appears to not do anything special for zoned.  This should
>> be looking up and failing whenever it looks for super stripes far enough out.
>> How are you not failing here everytime you mount the fs?  Thanks,
> 
> Naohiro (or Josef and everyone else as well of cause), please correct me if I'm
> wrong, but on zoned btrfs we're not supporting any RAID type. So the call to
> btrfs_rmap_block() above will return 'nr = 0' (as we're always having
> map->num_stripes = 1) so this won't evaluate to true.
> 

No it should return nr == 1 in the single case.  This maps physical address to a 
logical address in the block group, so it could be multiple, but if that bytenr 
falls inside the block group it'll return with something set.  Hence my 
confusion.  Thanks,

Josef
Johannes Thumshirn Nov. 2, 2020, 7:31 p.m. UTC | #5
On 02/11/2020 20:02, Josef Bacik wrote:
> No it should return nr == 1 in the single case.  This maps physical address to a 
> logical address in the block group, so it could be multiple, but if that bytenr 
> falls inside the block group it'll return with something set.  Hence my 
> confusion.

OK so from my debugging [1] it looks like we're hitting the !in_range() continue
case in __btrfs_rmap_block()'s loop.

But I'll need to defer to Naohiro to answer this question.

[1]:
mount -t btrfs /dev/nullb0 /mnt/test                                                                                                                                                
[    2.189080] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 65536                                                                       
[    2.191168] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(65536, 536870912, 268435456)
[    2.193068] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0                                                                                                     
[    2.194603] BTRFS error (device nullb0): exclude_super_stripes: nr: 0                                                                                                              
[    2.195973] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 67108864                                                                    
[    2.197378] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(67108864, 536870912, 268435456)
[    2.198382] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.199160] BTRFS error (device nullb0): exclude_super_stripes: nr: 0                                                                                                              
[    2.199871] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 274877906944                                                                
[    2.201030] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(274877906944, 536870912, 268435456)
[    2.202088] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.202864] BTRFS error (device nullb0): exclude_super_stripes: nr: 0                                                                                                              
[    2.203549] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 65536                                                                       
[    2.204621] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(65536, 805306368, 268435456)
[    2.205590] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.206394] BTRFS error (device nullb0): exclude_super_stripes: nr: 0
[    2.207078] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 67108864
[    2.208131] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(67108864, 805306368, 268435456)
[    2.209111] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.209885] BTRFS error (device nullb0): exclude_super_stripes: nr: 0
[    2.210540] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 274877906944
[    2.211595] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(274877906944, 805306368, 268435456)
[    2.212620] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.213388] BTRFS error (device nullb0): exclude_super_stripes: nr: 0
[    2.214076] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 65536
[    2.215079] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(65536, 1073741824, 268435456)
[    2.216039] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.216801] BTRFS error (device nullb0): exclude_super_stripes: nr: 0
[    2.217491] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 67108864
[    2.218548] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(67108864, 1073741824, 268435456)
[    2.219537] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.220322] BTRFS error (device nullb0): exclude_super_stripes: nr: 0
[    2.221020] BTRFS error (device nullb0): exclude_super_stripes: calling btrfs_rmap_block() for bytenr: 274877906944
[    2.222150] BTRFS error (device nullb0): __btrfs_rmap_block: !in_range(274877906944, 1073741824, 268435456)
[    2.223165] BTRFS error (device nullb0): __btrfs_rmap_block: nr: 0, *naddrs: 0
[    2.223945] BTRFS error (device nullb0): exclude_super_stripes: nr: 0
Naohiro Aota Nov. 3, 2020, 3:31 a.m. UTC | #6
On Mon, Nov 02, 2020 at 01:54:14PM -0500, Josef Bacik wrote:
>On 10/30/20 9:51 AM, Naohiro Aota wrote:
>>Superblock (and its copies) is the only data structure in btrfs which has a
>>fixed location on a device. Since we cannot overwrite in a sequential write
>>required zone, we cannot place superblock in the zone. One easy solution is
>>limiting superblock and copies to be placed only in conventional zones.
>>However, this method has two downsides: one is reduced number of superblock
>>copies. The location of the second copy of superblock is 256GB, which is in
>>a sequential write required zone on typical devices in the market today.
>>So, the number of superblock and copies is limited to be two.  Second
>>downside is that we cannot support devices which have no conventional zones
>>at all.
>>
>>To solve these two problems, we employ superblock log writing. It uses two
>>zones as a circular buffer to write updated superblocks. Once the first
>>zone is filled up, start writing into the second buffer. Then, when the
>>both zones are filled up and before start writing to the first zone again,
>>it reset the first zone.
>>
>>We can determine the position of the latest superblock by reading write
>>pointer information from a device. One corner case is when the both zones
>>are full. For this situation, we read out the last superblock of each
>>zone, and compare them to determine which zone is older.
>>
>>The following zones are reserved as the circular buffer on ZONED btrfs.
>>
>>- The primary superblock: zones 0 and 1
>>- The first copy: zones 16 and 17
>>- The second copy: zones 1024 or zone at 256GB which is minimum, and next
>>   to it
>>
>>If these reserved zones are conventional, superblock is written fixed at
>>the start of the zone without logging.
>>
>
><snip>
>
>>  /*
>>   * This is only the first step towards a full-features scrub. It reads all
>>@@ -3704,6 +3705,8 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
>>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >
>>  		    scrub_dev->commit_total_bytes)
>>  			break;
>>+		if (!btrfs_check_super_location(scrub_dev, bytenr))
>>+			continue;
>
>Any reason in particular we're skipping scrubbing supers here?  Can't 
>we just lookup the bytenr and do the right thing here?

Hmm, technically, we can do something here, but I'm not sure it's useful to
scrub superblocks for zoned devices where superblocks are log-structured.
We can read and check if the latest superblock in the log is valid. But,
when we find it's not correct, we cannot overwrite it anyway. Instead, we
can append a new superblock to the log. But this is no different than
normal sync... Furthermore, the scrub-checked superblock might already be
out-dated at the time of reading.

We might want to read and check each entry of the log. And warn the user
when a superblock is corrupted. It's totally different from current
scrub_supers(), so we will need another helper function for it.

>
>>  		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
>>  				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
>>diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>index 10827892c086..db884b96a5ea 100644
>>--- a/fs/btrfs/volumes.c
>>+++ b/fs/btrfs/volumes.c
>>@@ -1282,7 +1282,8 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
>>  }
>>  static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
>>-						       u64 bytenr)
>>+						       u64 bytenr,
>>+						       u64 bytenr_orig)
>>  {
>>  	struct btrfs_super_block *disk_super;
>>  	struct page *page;
>>@@ -1313,7 +1314,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
>>  	/* align our pointer to the offset of the super block */
>>  	disk_super = p + offset_in_page(bytenr);
>>-	if (btrfs_super_bytenr(disk_super) != bytenr ||
>>+	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
>>  	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
>>  		btrfs_release_disk_super(p);
>>  		return ERR_PTR(-EINVAL);
>>@@ -1348,7 +1349,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>>  	bool new_device_added = false;
>>  	struct btrfs_device *device = NULL;
>>  	struct block_device *bdev;
>>-	u64 bytenr;
>>+	u64 bytenr, bytenr_orig;
>>+	int ret;
>>  	lockdep_assert_held(&uuid_mutex);
>>@@ -1358,14 +1360,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>>  	 * So, we need to add a special mount option to scan for
>>  	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>>  	 */
>>-	bytenr = btrfs_sb_offset(0);
>>  	flags |= FMODE_EXCL;
>>  	bdev = blkdev_get_by_path(path, flags, holder);
>>  	if (IS_ERR(bdev))
>>  		return ERR_CAST(bdev);
>>-	disk_super = btrfs_read_disk_super(bdev, bytenr);
>>+	bytenr_orig = btrfs_sb_offset(0);
>>+	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
>>+	if (ret)
>>+		return ERR_PTR(ret);
>>+
>>+	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
>>  	if (IS_ERR(disk_super)) {
>>  		device = ERR_CAST(disk_super);
>>  		goto error_bdev_put;
>>@@ -2029,6 +2035,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
>>  		if (IS_ERR(disk_super))
>>  			continue;
>>+		if (bdev_is_zoned(bdev)) {
>>+			btrfs_reset_sb_log_zones(bdev, copy_num);
>>+			continue;
>>+		}
>>+
>>  		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
>>  		page = virt_to_page(disk_super);
>>diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>>index ae509699da14..d5487cba203b 100644
>>--- a/fs/btrfs/zoned.c
>>+++ b/fs/btrfs/zoned.c
>>@@ -20,6 +20,25 @@ static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx,
>>  	return 0;
>>  }
>>+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zone,
>>+			    u64 *wp_ret);
>>+
>>+static inline u32 sb_zone_number(u8 shift, int mirror)
>>+{
>>+	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
>>+
>>+	switch (mirror) {
>>+	case 0:
>>+		return 0;
>>+	case 1:
>>+		return 16;
>>+	case 2:
>>+		return min(btrfs_sb_offset(mirror) >> shift, 1024ULL);
>>+	}
>>+
>
>Can we get a comment here explaining the zone numbers?

Sure. I'll add one.

>
>>+	return 0;
>>+}
>>+
>>  static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
>>  			       struct blk_zone *zones, unsigned int *nr_zones)
>>  {
>>@@ -123,6 +142,49 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
>>  		goto out;
>>  	}
>>+	/* validate superblock log */
>>+	nr_zones = 2;
>>+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
>>+		u32 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
>>+		u64 sb_wp;
>>+
>
>I'd rather see
>
>#define BTRFS_NR_ZONED_SB_ZONES 2
>
>or something equally poorly named and use that instead of our magic 2 everywhere.
>
>Then you can just do
>
>int index = i * BTRFS_NR_ZONED_SB_ZONES;
>&zone_info->sb_zones[index];

I'll do so. BTRFS_NR_ZONED_SB_ZONES is duplicating "ZONE", so how about
BTRFS_NR_SB_LOG_ZONES ?


>
><snip>
>
>>+static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
>>+			   int rw, u64 *bytenr_ret)
>>+{
>>+	u64 wp;
>>+	int ret;
>>+
>>+	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>>+		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
>>+		return 0;
>>+	}
>>+
>>+	ret = sb_write_pointer(bdev, zones, &wp);
>>+	if (ret != -ENOENT && ret < 0)
>>+		return ret;
>>+
>>+	if (rw == WRITE) {
>>+		struct blk_zone *reset = NULL;
>>+
>>+		if (wp == zones[0].start << SECTOR_SHIFT)
>>+			reset = &zones[0];
>>+		else if (wp == zones[1].start << SECTOR_SHIFT)
>>+			reset = &zones[1];
>>+
>>+		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
>>+			ASSERT(reset->cond == BLK_ZONE_COND_FULL);
>>+
>>+			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
>>+					       reset->start, reset->len,
>>+					       GFP_NOFS);
>
>What happens if we crash right after this?  Is the WP set to the start 
>of the zone here?  Does this mean we'll simply miss the super block?  
>I understand we're resetting one zone here, but we're doing this in 
>order, so we'll reset one and write one, then reset the other and 
>write the next.  We don't wait until we've issued the writes for 
>everything, so it appears to me that there's a gap where we could have 
>the WP pointed at the start of the zone, which we view as an invalid 
>state and thus won't be able to mount the file system.  Or am I 
>missing something?  Thanks,

Here, we reset the zone we are about to write to, which contains the older
superblocks. In this case, the other zone should already be fully written.
So, even after a reset and a crash, we still have the latest superblock in
the other zone.

>
>Josef
Naohiro Aota Nov. 3, 2020, 8:21 a.m. UTC | #7
On Mon, Nov 02, 2020 at 01:22:56PM -0500, Josef Bacik wrote:
>On 10/30/20 9:51 AM, Naohiro Aota wrote:
>>Superblock (and its copies) is the only data structure in btrfs which has a
>>fixed location on a device. Since we cannot overwrite in a sequential write
>>required zone, we cannot place superblock in the zone. One easy solution is
>>limiting superblock and copies to be placed only in conventional zones.
>>However, this method has two downsides: one is reduced number of superblock
>>copies. The location of the second copy of superblock is 256GB, which is in
>>a sequential write required zone on typical devices in the market today.
>>So, the number of superblock and copies is limited to be two.  Second
>>downside is that we cannot support devices which have no conventional zones
>>at all.
>>
>>To solve these two problems, we employ superblock log writing. It uses two
>>zones as a circular buffer to write updated superblocks. Once the first
>>zone is filled up, start writing into the second buffer. Then, when the
>>both zones are filled up and before start writing to the first zone again,
>>it reset the first zone.
>>
>>We can determine the position of the latest superblock by reading write
>>pointer information from a device. One corner case is when the both zones
>>are full. For this situation, we read out the last superblock of each
>>zone, and compare them to determine which zone is older.
>>
>>The following zones are reserved as the circular buffer on ZONED btrfs.
>>
>>- The primary superblock: zones 0 and 1
>>- The first copy: zones 16 and 17
>>- The second copy: zones 1024 or zone at 256GB which is minimum, and next
>>   to it
>>
>>If these reserved zones are conventional, superblock is written fixed at
>>the start of the zone without logging.
>>
>>Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
>>---
>>  fs/btrfs/block-group.c |   9 ++
>>  fs/btrfs/disk-io.c     |  41 +++++-
>>  fs/btrfs/scrub.c       |   3 +
>>  fs/btrfs/volumes.c     |  21 ++-
>>  fs/btrfs/zoned.c       | 311 +++++++++++++++++++++++++++++++++++++++++
>>  fs/btrfs/zoned.h       |  40 ++++++
>>  6 files changed, 413 insertions(+), 12 deletions(-)
>>
>>diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>>index c0f1d6818df7..e989c66aa764 100644
>>--- a/fs/btrfs/block-group.c
>>+++ b/fs/btrfs/block-group.c
>>@@ -1723,6 +1723,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
>>  static int exclude_super_stripes(struct btrfs_block_group *cache)
>>  {
>>  	struct btrfs_fs_info *fs_info = cache->fs_info;
>>+	bool zoned = btrfs_is_zoned(fs_info);
>>  	u64 bytenr;
>>  	u64 *logical;
>>  	int stripe_len;
>>@@ -1744,6 +1745,14 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
>>  		if (ret)
>>  			return ret;
>>+		/* shouldn't have super stripes in sequential zones */
>>+		if (zoned && nr) {
>>+			btrfs_err(fs_info,
>>+				  "Zoned btrfs's block group %llu should not have super blocks",
>>+				  cache->start);
>>+			return -EUCLEAN;
>>+		}
>>+
>
>I'm very confused about this check, namely how you've been able to 
>test without it blowing up, which makes me feel like I'm missing 
>something.
>
>We _always_ call exclude_super_stripes(), and we're simply looking up 
>the bytenr for that block, which appears to not do anything special 
>for zoned.  This should be looking up and failing whenever it looks 
>for super stripes far enough out. How are you not failing here 
>everytime you mount the fs?  Thanks,
>
>Josef

As per the previous discussion with David, we decided to exclude the
superblock positions of regular btrfs from being allocated to zoned block
groups, because the superblock location is part of the on-disk
specification. (Sorry, I could not find a pointer to the discussion.) We
also need to ensure that user data which looks like a superblock won't
corrupt the FS.

btrfs_find_allocatable_zones() is doing that exclusion ... this was my
understanding. But, to be precise, the function is just excluding
superblock log zones, not regular superblock positions.

However, it happens to exclude the superblock positions of a regular device
as well. We have superblocks at 16KB, 64MB and 256GB, and we exclude zones 0,
1, 16, 17 and min(zone at 256GB, 1024). With a typical 256MB zone size setup,
16KB and 64MB fall in zone 0, and 256GB in zone 1024. So, I was not hitting
this "if" on my test runs.

I'll fix btrfs_find_allocatable_zones() so that it also excludes regular
superblock positions.

Thanks,
David Sterba Nov. 3, 2020, 1:15 p.m. UTC | #8
On Fri, Oct 30, 2020 at 10:51:18PM +0900, Naohiro Aota wrote:
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1723,6 +1723,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
>  static int exclude_super_stripes(struct btrfs_block_group *cache)
>  {
>  	struct btrfs_fs_info *fs_info = cache->fs_info;
> +	bool zoned = btrfs_is_zoned(fs_info);

	const bool

>  	u64 bytenr;
>  	u64 *logical;
>  	int stripe_len;
> @@ -1744,6 +1745,14 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
>  		if (ret)
>  			return ret;
>  
> +		/* shouldn't have super stripes in sequential zones */

		/* Shouldn't ... */

> +		if (zoned && nr) {
> +			btrfs_err(fs_info,
> +				  "Zoned btrfs's block group %llu should not have super blocks",

			"zoned: block group %llu must not contain super block"

> +				  cache->start);
> +			return -EUCLEAN;
> +		}
> +
>  		while (nr--) {
>  			u64 len = min_t(u64, stripe_len,
>  				cache->start + cache->length - logical[nr]);
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 9bc51cff48b8..fd8b970ee92c 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3423,10 +3423,17 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
>  {
>  	struct btrfs_super_block *super;
>  	struct page *page;
> -	u64 bytenr;
> +	u64 bytenr, bytenr_orig;
>  	struct address_space *mapping = bdev->bd_inode->i_mapping;
> +	int ret;
> +
> +	bytenr_orig = btrfs_sb_offset(copy_num);
> +	ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
> +	if (ret == -ENOENT)
> +		return ERR_PTR(-EINVAL);
> +	else if (ret)
> +		return ERR_PTR(ret);
>  
> -	bytenr = btrfs_sb_offset(copy_num);
>  	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
>  		return ERR_PTR(-EINVAL);
>  
> @@ -3440,7 +3447,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
>  		return ERR_PTR(-ENODATA);
>  	}
>  
> -	if (btrfs_super_bytenr(super) != bytenr) {
> +	if (btrfs_super_bytenr(super) != bytenr_orig) {
>  		btrfs_release_disk_super(super);
>  		return ERR_PTR(-EINVAL);
>  	}
> @@ -3495,7 +3502,8 @@ static int write_dev_supers(struct btrfs_device *device,
>  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>  	int i;
>  	int errors = 0;
> -	u64 bytenr;
> +	int ret;
> +	u64 bytenr, bytenr_orig;
>  
>  	if (max_mirrors == 0)
>  		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
> @@ -3507,12 +3515,21 @@ static int write_dev_supers(struct btrfs_device *device,
>  		struct bio *bio;
>  		struct btrfs_super_block *disk_super;
>  
> -		bytenr = btrfs_sb_offset(i);
> +		bytenr_orig = btrfs_sb_offset(i);
> +		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
> +		if (ret == -ENOENT)
> +			continue;
> +		else if (ret < 0) {
> +			btrfs_err(device->fs_info, "couldn't get super block location for mirror %d",
> +				  i);
> +			errors++;
> +			continue;
> +		}
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
>  			break;
>  
> -		btrfs_set_super_bytenr(sb, bytenr);
> +		btrfs_set_super_bytenr(sb, bytenr_orig);
>  
>  		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
>  				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
> @@ -3557,6 +3574,7 @@ static int write_dev_supers(struct btrfs_device *device,
>  			bio->bi_opf |= REQ_FUA;
>  
>  		btrfsic_submit_bio(bio);
> +		btrfs_advance_sb_log(device, i);
>  	}
>  	return errors < i ? 0 : -1;
>  }
> @@ -3573,6 +3591,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  	int i;
>  	int errors = 0;
>  	bool primary_failed = false;
> +	int ret;
>  	u64 bytenr;
>  
>  	if (max_mirrors == 0)
> @@ -3581,7 +3600,15 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
>  	for (i = 0; i < max_mirrors; i++) {
>  		struct page *page;
>  
> -		bytenr = btrfs_sb_offset(i);
> +		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
> +		if (ret == -ENOENT)

		if (...) {

> +			break;

		} else if (...) {

> +		else if (ret < 0) {
> +			errors++;
> +			if (i == 0)
> +				primary_failed = true;
> +			continue;
> +		}
>  		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
>  		    device->commit_total_bytes)
>  			break;
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1282,7 +1282,8 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
>  }
>  
>  static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
> -						       u64 bytenr)
> +						       u64 bytenr,
> +						       u64 bytenr_orig)
>  {
>  	struct btrfs_super_block *disk_super;
>  	struct page *page;
> @@ -1313,7 +1314,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
>  	/* align our pointer to the offset of the super block */
>  	disk_super = p + offset_in_page(bytenr);
>  
> -	if (btrfs_super_bytenr(disk_super) != bytenr ||
> +	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
>  	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
>  		btrfs_release_disk_super(p);
>  		return ERR_PTR(-EINVAL);
> @@ -1348,7 +1349,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>  	bool new_device_added = false;
>  	struct btrfs_device *device = NULL;
>  	struct block_device *bdev;
> -	u64 bytenr;
> +	u64 bytenr, bytenr_orig;
> +	int ret;
>  
>  	lockdep_assert_held(&uuid_mutex);
>  
> @@ -1358,14 +1360,18 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>  	 * So, we need to add a special mount option to scan for
>  	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>  	 */
> -	bytenr = btrfs_sb_offset(0);
>  	flags |= FMODE_EXCL;
>  
>  	bdev = blkdev_get_by_path(path, flags, holder);
>  	if (IS_ERR(bdev))
>  		return ERR_CAST(bdev);
>  
> -	disk_super = btrfs_read_disk_super(bdev, bytenr);
> +	bytenr_orig = btrfs_sb_offset(0);
> +	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
>  	if (IS_ERR(disk_super)) {
>  		device = ERR_CAST(disk_super);
>  		goto error_bdev_put;
> @@ -2029,6 +2035,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
>  		if (IS_ERR(disk_super))
>  			continue;
>  
> +		if (bdev_is_zoned(bdev)) {
> +			btrfs_reset_sb_log_zones(bdev, copy_num);
> +			continue;
> +		}
> +
>  		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
>  
>  		page = virt_to_page(disk_super);
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index ae509699da14..d5487cba203b 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -20,6 +20,25 @@ static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx,
>  	return 0;
>  }
>  
> +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zone,
> +			    u64 *wp_ret);

Please define sb_write_pointer here instead of the prototype for a
static function.

> +
> +static inline u32 sb_zone_number(u8 shift, int mirror)
> +{
> +	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
> +
> +	switch (mirror) {
> +	case 0:
> +		return 0;
> +	case 1:
> +		return 16;
> +	case 2:
> +		return min(btrfs_sb_offset(mirror) >> shift, 1024ULL);
> +	}
> +
> +	return 0;
> +}
> +
>  static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
>  			       struct blk_zone *zones, unsigned int *nr_zones)
>  {
> @@ -123,6 +142,49 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
>  		goto out;
>  	}
>  
> +	/* validate superblock log */

	/* Validate ... */

> +	nr_zones = 2;
> +	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> +		u32 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
> +		u64 sb_wp;
> +
> +		if (sb_zone + 1 >= zone_info->nr_zones)
> +			continue;
> +
> +		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
> +		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
> +					  &zone_info->sb_zones[2 * i],
> +					  &nr_zones);
> +		if (ret)
> +			goto out;
> +		if (nr_zones != 2) {
> +			btrfs_err_in_rcu(device->fs_info,
> +			"failed to read SB log zone info at device %s zone %u",
> +					 rcu_str_deref(device->name), sb_zone);

			"zoned: failed to read super block log zone info at devid %llu zone %u"

> +			ret = -EIO;

What are the possible reasons here? EIO would fit reading error but if
the zone is missing it's more like EUCLEAN.

> +			goto out;
> +		}
> +
> +		/*
> +		 * If zones[0] is conventional, always use the beggining of
> +		 * the zone to record superblock. No need to validate in
> +		 * that case.
> +		 */
> +		if (zone_info->sb_zones[2 * i].type == BLK_ZONE_TYPE_CONVENTIONAL)
> +			continue;
> +
> +		ret = sb_write_pointer(device->bdev,
> +				       &zone_info->sb_zones[2 * i], &sb_wp);
> +		if (ret != -ENOENT && ret) {
> +			btrfs_err_in_rcu(device->fs_info,
> +				"SB log zone corrupted: device %s zone %u",
> +					 rcu_str_deref(device->name), sb_zone);

			"zoned: super block log zone corrupted devid %llu zone %u"

The device path would be also good in all the messages, this could be
tweaked later.

> +			ret = -EUCLEAN;
> +			goto out;
> +		}
> +	}
> +
> +
>  	kfree(zones);
>  
>  	device->zone_info = zone_info;
> @@ -296,3 +358,252 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
>  
>  	return 0;
>  }
> +
> +static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
> +			    u64 *wp_ret)
> +{
> +	bool empty[2];
> +	bool full[2];
> +	sector_t sector;
> +
> +	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
> +	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
> +
> +	empty[0] = zones[0].cond == BLK_ZONE_COND_EMPTY;

	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);

> +	empty[1] = zones[1].cond == BLK_ZONE_COND_EMPTY;
> +	full[0] = zones[0].cond == BLK_ZONE_COND_FULL;
> +	full[1] = zones[1].cond == BLK_ZONE_COND_FULL;

Same

> +
> +	/*
> +	 * Possible state of log buffer zones
> +	 *
> +	 *   E I F
> +	 * E * x 0
> +	 * I 0 x 0
> +	 * F 1 1 C
> +	 *
> +	 * Row: zones[0]
> +	 * Col: zones[1]
> +	 * State:
> +	 *   E: Empty, I: In-Use, F: Full
> +	 * Log position:
> +	 *   *: Special case, no superblock is written
> +	 *   0: Use write pointer of zones[0]
> +	 *   1: Use write pointer of zones[1]
> +	 *   C: Compare SBs from zones[0] and zones[1], use the newer one
> +	 *   x: Invalid state
> +	 */
> +
> +	if (empty[0] && empty[1]) {
> +		/* special case to distinguish no superblock to read */

		/* Special ... */

> +		*wp_ret = zones[0].start << SECTOR_SHIFT;
> +		return -ENOENT;
> +	} else if (full[0] && full[1]) {
> +		/* Compare two super blocks */
> +		struct address_space *mapping = bdev->bd_inode->i_mapping;
> +		struct page *page[2];
> +		struct btrfs_super_block *super[2];
> +		int i;
> +
> +		for (i = 0; i < 2; i++) {
> +			u64 bytenr = ((zones[i].start + zones[i].len) << SECTOR_SHIFT) -
> +				BTRFS_SUPER_INFO_SIZE;
> +
> +			page[i] = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
> +			if (IS_ERR(page[i])) {
> +				if (i == 1)
> +					btrfs_release_disk_super(super[0]);
> +				return PTR_ERR(page[i]);
> +			}
> +			super[i] = page_address(page[i]);
> +		}
> +
> +		if (super[0]->generation > super[1]->generation)
> +			sector = zones[1].start;
> +		else
> +			sector = zones[0].start;
> +
> +		for (i = 0; i < 2; i++)
> +			btrfs_release_disk_super(super[i]);
> +	} else if (!full[0] && (empty[1] || full[1])) {
> +		sector = zones[0].wp;
> +	} else if (full[0]) {
> +		sector = zones[1].wp;
> +	} else {
> +		return -EUCLEAN;
> +	}
> +	*wp_ret = sector << SECTOR_SHIFT;
> +	return 0;
> +}
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -4,6 +4,8 @@
>  #define BTRFS_ZONED_H
>  
>  #include <linux/blkdev.h>
> +#include "volumes.h"
> +#include "disk-io.h"
>  
>  struct btrfs_zoned_device_info {
>  	/*
> @@ -16,6 +18,7 @@ struct btrfs_zoned_device_info {
>  	u32 nr_zones;
>  	unsigned long *seq_zones;
>  	unsigned long *empty_zones;
> +	struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
>  };
>  
>  #ifdef CONFIG_BLK_DEV_ZONED
> @@ -25,6 +28,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device);
>  void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
>  int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
>  int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
> +int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
> +			       u64 *bytenr_ret);
> +int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
> +			  u64 *bytenr_ret);
> +void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
> +int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
>  #else /* CONFIG_BLK_DEV_ZONED */
>  static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
>  				     struct blk_zone *zone)
> @@ -48,6 +57,26 @@ static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
>  {
>  	return 0;
>  }

newline

> +static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
> +					     int mirror, int rw,
> +					     u64 *bytenr_ret)
> +{
> +	*bytenr_ret = btrfs_sb_offset(mirror);
> +	return 0;
> +}

newline

> +static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
> +					int rw, u64 *bytenr_ret)
> +{
> +	*bytenr_ret = btrfs_sb_offset(mirror);
> +	return 0;
> +}

newline

> +static inline void btrfs_advance_sb_log(struct btrfs_device *device,
> +					int mirror) { }

newline

> +static inline int btrfs_reset_sb_log_zones(struct block_device *bdev,
> +					   int mirror)
> +{
> +	return 0;
> +}

newline

>  #endif
>  
>  static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
> @@ -115,4 +144,15 @@ static inline bool btrfs_check_device_zone_type(struct btrfs_fs_info *fs_info,
>  	return bdev_zoned_model(bdev) != BLK_ZONED_HM;
>  }
>  
> +static inline bool btrfs_check_super_location(struct btrfs_device *device,
> +					      u64 pos)
> +{
> +	/*
> +	 * On a non-zoned device, any address is OK. On a zoned device,
> +	 * non-SEQUENTIAL WRITE REQUIRED zones are capable.
> +	 */
> +	return device->zone_info == NULL ||
> +	       !btrfs_dev_is_sequential(device, pos);
> +}
> +
>  #endif
> -- 
> 2.27.0
David Sterba Nov. 3, 2020, 2:10 p.m. UTC | #9
On Fri, Oct 30, 2020 at 10:51:18PM +0900, Naohiro Aota wrote:
> Superblock (and its copies) is the only data structure in btrfs which has a
> fixed location on a device. Since we cannot overwrite in a sequential write
> required zone, we cannot place superblock in the zone. One easy solution is
> limiting superblock and copies to be placed only in conventional zones.
> However, this method has two downsides: one is reduced number of superblock
> copies. The location of the second copy of superblock is 256GB, which is in
> a sequential write required zone on typical devices in the market today.
> So, the number of superblock and copies is limited to be two.  Second
> downside is that we cannot support devices which have no conventional zones
> at all.
> 
> To solve these two problems, we employ superblock log writing. It uses two
> zones as a circular buffer to write updated superblocks. Once the first
> zone is filled up, start writing into the second buffer. Then, when the
> both zones are filled up and before start writing to the first zone again,
> it reset the first zone.
> 
> We can determine the position of the latest superblock by reading write
> pointer information from a device. One corner case is when the both zones
> are full. For this situation, we read out the last superblock of each
> zone, and compare them to determine which zone is older.
> 
> The following zones are reserved as the circular buffer on ZONED btrfs.
> 
> - The primary superblock: zones 0 and 1
> - The first copy: zones 16 and 17
> - The second copy: zones 1024 or zone at 256GB which is minimum, and next
>   to it
> 
> If these reserved zones are conventional, superblock is written fixed at
> the start of the zone without logging.

I don't have a clear picture here.

In case there's a conventional zone covering 0 and 1st copy (64K and
64M) it'll be overwritten. What happens for 2nd copy that's at 256G?
sb-log?

For all-sequential drive, the 0 and 1st copy are in the first zone.
You say 0 and 1, but how come if the minimum zone size we ever expect is
256M?

The circular buffer comprises zones covering all superblock copies? I
mean one buffer for 2 or more sb copies? The problem is that we'll have
just one copy of the current superblock. Or I misunderstood.

My idea is that we have a primary zone, unfortunately covering 2
superblocks, but let it be. The second zone contains the 2nd superblock copy
(256G); we can assume that devices will be bigger than that.

Then the circular buffers happen in each zone, so first one will go from
offset 64K up to the zone size (256M or 1G).  Second zone rotates from
offset 0 to end of the zone.

The positive outcome of that is that both zones contain the latest
superblock after a successful write and their write pointers are slightly out
of sync, so they never have to be reset at the same time.

In numbers:
- first zone 64K .. 256M, 65520 superblocks
- second zone 256G .. 256G+256M, 65536 superblocks

The difference is 16 superblock updates, which should be enough to let
the zone resets happen far apart.
Naohiro Aota Nov. 6, 2020, 10:37 a.m. UTC | #10
On Tue, Nov 03, 2020 at 03:10:35PM +0100, David Sterba wrote:
>On Fri, Oct 30, 2020 at 10:51:18PM +0900, Naohiro Aota wrote:
>> Superblock (and its copies) is the only data structure in btrfs which has a
>> fixed location on a device. Since we cannot overwrite in a sequential write
>> required zone, we cannot place superblock in the zone. One easy solution is
>> limiting superblock and copies to be placed only in conventional zones.
>> However, this method has two downsides: one is reduced number of superblock
>> copies. The location of the second copy of superblock is 256GB, which is in
>> a sequential write required zone on typical devices in the market today.
>> So, the number of superblock and copies is limited to be two.  Second
>> downside is that we cannot support devices which have no conventional zones
>> at all.
>>
>> To solve these two problems, we employ superblock log writing. It uses two
>> zones as a circular buffer to write updated superblocks. Once the first
>> zone is filled up, start writing into the second buffer. Then, when the
>> both zones are filled up and before start writing to the first zone again,
>> it reset the first zone.
>>
>> We can determine the position of the latest superblock by reading write
>> pointer information from a device. One corner case is when the both zones
>> are full. For this situation, we read out the last superblock of each
>> zone, and compare them to determine which zone is older.
>>
>> The following zones are reserved as the circular buffer on ZONED btrfs.
>>
>> - The primary superblock: zones 0 and 1
>> - The first copy: zones 16 and 17
>> - The second copy: zones 1024 or zone at 256GB which is minimum, and next
>>   to it
>>
>> If these reserved zones are conventional, superblock is written fixed at
>> the start of the zone without logging.
>
>I don't have a clear picture here.
>
>In case there's a conventional zone covering 0 and 1st copy (64K and
>64M) it'll be overwritten. What happens for 2nd copy that's at 256G?
>sb-log?
>
>For all-sequential drive, the 0 and 1st copy are in the first zone.
>You say 0 and 1, but how come if the minimum zone size we ever expect is
>256M?

On a zoned device, we always reserve the above zones (0, 1, 16, 17, 1024,
1025 (or the zones at 256G)) regardless of whether they are sequential or
conventional. And, if a reserved zone is conventional, we always write the
superblock at the beginning of that zone. So, if a drive has 32
conventional zones, superblocks are placed at the beginning of zone 0 and
zone 16, and zones 1024 and 1025 are written with sb-log.

>
>The circular buffer comprises zones covering all superblock copies? I
>mean one buffer for 2 or more sb copies? The problem is that we'll have
>just one copy of the current superblock. Or I misunderstood.

A circular buffer consists of a pair of zones, so we'll have three
sb-logs, one on each of the zone pairs 0 & 1, 16 & 17, and 1024 & 1025.

>
>My idea is that we have primary zone, unfortunatelly covering 2
>superblocks but let it be. Second zone contains 2nd superblock copy
>(256G), we can assume that devices will be bigger than that.
>
>Then the circular buffers happen in each zone, so first one will go from
>offset 64K up to the zone size (256M or 1G).  Second zone rotates from
>offset 0 to end of the zone.
>
>The positive outcome of that is that both zones contain the latest
>superblock after succesful write and their write pointer is slightly out
>of sync, so they never have to be reset at the same time.
>
>In numbers:
>- first zone 64K .. 256M, 65520 superblocks
>- second zone 256G .. 245G+256M, 65536 superblocks
>
>The difference is 16 superblock updates, which should be enough to let
>the zone resets happen far apart.

Hmm, this raises the minimal FS size requirement to 256 GB in order to
survive a crash after resetting the first zone... So, that's why we have two
zones as a circular buffer.
diff mbox series

Patch

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c0f1d6818df7..e989c66aa764 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1723,6 +1723,7 @@  int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 static int exclude_super_stripes(struct btrfs_block_group *cache)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
+	bool zoned = btrfs_is_zoned(fs_info);
 	u64 bytenr;
 	u64 *logical;
 	int stripe_len;
@@ -1744,6 +1745,14 @@  static int exclude_super_stripes(struct btrfs_block_group *cache)
 		if (ret)
 			return ret;
 
+		/* shouldn't have super stripes in sequential zones */
+		if (zoned && nr) {
+			btrfs_err(fs_info,
+				  "Zoned btrfs's block group %llu should not have super blocks",
+				  cache->start);
+			return -EUCLEAN;
+		}
+
 		while (nr--) {
 			u64 len = min_t(u64, stripe_len,
 				cache->start + cache->length - logical[nr]);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9bc51cff48b8..fd8b970ee92c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3423,10 +3423,17 @@  struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
 {
 	struct btrfs_super_block *super;
 	struct page *page;
-	u64 bytenr;
+	u64 bytenr, bytenr_orig;
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	int ret;
+
+	bytenr_orig = btrfs_sb_offset(copy_num);
+	ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+	if (ret == -ENOENT)
+		return ERR_PTR(-EINVAL);
+	else if (ret)
+		return ERR_PTR(ret);
 
-	bytenr = btrfs_sb_offset(copy_num);
 	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
 		return ERR_PTR(-EINVAL);
 
@@ -3440,7 +3447,7 @@  struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
 		return ERR_PTR(-ENODATA);
 	}
 
-	if (btrfs_super_bytenr(super) != bytenr) {
+	if (btrfs_super_bytenr(super) != bytenr_orig) {
 		btrfs_release_disk_super(super);
 		return ERR_PTR(-EINVAL);
 	}
@@ -3495,7 +3502,8 @@  static int write_dev_supers(struct btrfs_device *device,
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	int i;
 	int errors = 0;
-	u64 bytenr;
+	int ret;
+	u64 bytenr, bytenr_orig;
 
 	if (max_mirrors == 0)
 		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
@@ -3507,12 +3515,21 @@  static int write_dev_supers(struct btrfs_device *device,
 		struct bio *bio;
 		struct btrfs_super_block *disk_super;
 
-		bytenr = btrfs_sb_offset(i);
+		bytenr_orig = btrfs_sb_offset(i);
+		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
+		if (ret == -ENOENT)
+			continue;
+		else if (ret < 0) {
+			btrfs_err(device->fs_info, "couldn't get super block location for mirror %d",
+				  i);
+			errors++;
+			continue;
+		}
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 		    device->commit_total_bytes)
 			break;
 
-		btrfs_set_super_bytenr(sb, bytenr);
+		btrfs_set_super_bytenr(sb, bytenr_orig);
 
 		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
@@ -3557,6 +3574,7 @@  static int write_dev_supers(struct btrfs_device *device,
 			bio->bi_opf |= REQ_FUA;
 
 		btrfsic_submit_bio(bio);
+		btrfs_advance_sb_log(device, i);
 	}
 	return errors < i ? 0 : -1;
 }
@@ -3573,6 +3591,7 @@  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 	int i;
 	int errors = 0;
 	bool primary_failed = false;
+	int ret;
 	u64 bytenr;
 
 	if (max_mirrors == 0)
@@ -3581,7 +3600,15 @@  static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 	for (i = 0; i < max_mirrors; i++) {
 		struct page *page;
 
-		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
+		if (ret == -ENOENT)
+			break;
+		else if (ret < 0) {
+			errors++;
+			if (i == 0)
+				primary_failed = true;
+			continue;
+		}
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
 		    device->commit_total_bytes)
 			break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index cf63f1e27a27..aa1b36cf5c88 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -20,6 +20,7 @@ 
 #include "rcu-string.h"
 #include "raid56.h"
 #include "block-group.h"
+#include "zoned.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -3704,6 +3705,8 @@  static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 		if (bytenr + BTRFS_SUPER_INFO_SIZE >
 		    scrub_dev->commit_total_bytes)
 			break;
+		if (!btrfs_check_super_location(scrub_dev, bytenr))
+			continue;
 
 		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
 				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 10827892c086..db884b96a5ea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1282,7 +1282,8 @@  void btrfs_release_disk_super(struct btrfs_super_block *super)
 }
 
 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
-						       u64 bytenr)
+						       u64 bytenr,
+						       u64 bytenr_orig)
 {
 	struct btrfs_super_block *disk_super;
 	struct page *page;
@@ -1313,7 +1314,7 @@  static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
 	/* align our pointer to the offset of the super block */
 	disk_super = p + offset_in_page(bytenr);
 
-	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
 	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
 		btrfs_release_disk_super(p);
 		return ERR_PTR(-EINVAL);
@@ -1348,7 +1349,8 @@  struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
 	bool new_device_added = false;
 	struct btrfs_device *device = NULL;
 	struct block_device *bdev;
-	u64 bytenr;
+	u64 bytenr, bytenr_orig;
+	int ret;
 
 	lockdep_assert_held(&uuid_mutex);
 
@@ -1358,14 +1360,18 @@  struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
 	 * So, we need to add a special mount option to scan for
 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 	 */
-	bytenr = btrfs_sb_offset(0);
 	flags |= FMODE_EXCL;
 
 	bdev = blkdev_get_by_path(path, flags, holder);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
-	disk_super = btrfs_read_disk_super(bdev, bytenr);
+	bytenr_orig = btrfs_sb_offset(0);
+	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
+	if (ret)
+		return ERR_PTR(ret);
+
+	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
 	if (IS_ERR(disk_super)) {
 		device = ERR_CAST(disk_super);
 		goto error_bdev_put;
@@ -2029,6 +2035,11 @@  void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
 		if (IS_ERR(disk_super))
 			continue;
 
+		if (bdev_is_zoned(bdev)) {
+			btrfs_reset_sb_log_zones(bdev, copy_num);
+			continue;
+		}
+
 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
 
 		page = virt_to_page(disk_super);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index ae509699da14..d5487cba203b 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -20,6 +20,25 @@  static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx,
 	return 0;
 }
 
+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zone,
+			    u64 *wp_ret);
+
+static inline u32 sb_zone_number(u8 shift, int mirror)
+{
+	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+
+	switch (mirror) {
+	case 0:
+		return 0;
+	case 1:
+		return 16;
+	case 2:
+		return min(btrfs_sb_offset(mirror) >> shift, 1024ULL);
+	}
+
+	return 0;
+}
+
 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 			       struct blk_zone *zones, unsigned int *nr_zones)
 {
@@ -123,6 +142,49 @@  int btrfs_get_dev_zone_info(struct btrfs_device *device)
 		goto out;
 	}
 
+	/* validate superblock log */
+	nr_zones = 2;
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		u32 sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
+		u64 sb_wp;
+
+		if (sb_zone + 1 >= zone_info->nr_zones)
+			continue;
+
+		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
+		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
+					  &zone_info->sb_zones[2 * i],
+					  &nr_zones);
+		if (ret)
+			goto out;
+		if (nr_zones != 2) {
+			btrfs_err_in_rcu(device->fs_info,
+			"failed to read SB log zone info at device %s zone %u",
+					 rcu_str_deref(device->name), sb_zone);
+			ret = -EIO;
+			goto out;
+		}
+
+		/*
+		 * If zones[0] is conventional, always use the beggining of
+		 * the zone to record superblock. No need to validate in
+		 * that case.
+		 */
+		if (zone_info->sb_zones[2 * i].type == BLK_ZONE_TYPE_CONVENTIONAL)
+			continue;
+
+		ret = sb_write_pointer(device->bdev,
+				       &zone_info->sb_zones[2 * i], &sb_wp);
+		if (ret != -ENOENT && ret) {
+			btrfs_err_in_rcu(device->fs_info,
+				"SB log zone corrupted: device %s zone %u",
+					 rcu_str_deref(device->name), sb_zone);
+			ret = -EUCLEAN;
+			goto out;
+		}
+	}
+
+
 	kfree(zones);
 
 	device->zone_info = zone_info;
@@ -296,3 +358,252 @@  int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 
 	return 0;
 }
+
+static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
+			    u64 *wp_ret)
+{
+	bool empty[2];
+	bool full[2];
+	sector_t sector;
+
+	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
+	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
+
+	empty[0] = zones[0].cond == BLK_ZONE_COND_EMPTY;
+	empty[1] = zones[1].cond == BLK_ZONE_COND_EMPTY;
+	full[0] = zones[0].cond == BLK_ZONE_COND_FULL;
+	full[1] = zones[1].cond == BLK_ZONE_COND_FULL;
+
+	/*
+	 * Possible state of log buffer zones
+	 *
+	 *   E I F
+	 * E * x 0
+	 * I 0 x 0
+	 * F 1 1 C
+	 *
+	 * Row: zones[0]
+	 * Col: zones[1]
+	 * State:
+	 *   E: Empty, I: In-Use, F: Full
+	 * Log position:
+	 *   *: Special case, no superblock is written
+	 *   0: Use write pointer of zones[0]
+	 *   1: Use write pointer of zones[1]
+	 *   C: Compare SBs from zones[0] and zones[1], use the newer one
+	 *   x: Invalid state
+	 */
+
+	if (empty[0] && empty[1]) {
+		/* special case to distinguish no superblock to read */
+		*wp_ret = zones[0].start << SECTOR_SHIFT;
+		return -ENOENT;
+	} else if (full[0] && full[1]) {
+		/* Compare two super blocks */
+		struct address_space *mapping = bdev->bd_inode->i_mapping;
+		struct page *page[2];
+		struct btrfs_super_block *super[2];
+		int i;
+
+		for (i = 0; i < 2; i++) {
+			u64 bytenr = ((zones[i].start + zones[i].len) << SECTOR_SHIFT) -
+				BTRFS_SUPER_INFO_SIZE;
+
+			page[i] = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
+			if (IS_ERR(page[i])) {
+				if (i == 1)
+					btrfs_release_disk_super(super[0]);
+				return PTR_ERR(page[i]);
+			}
+			super[i] = page_address(page[i]);
+		}
+
+		if (super[0]->generation > super[1]->generation)
+			sector = zones[1].start;
+		else
+			sector = zones[0].start;
+
+		for (i = 0; i < 2; i++)
+			btrfs_release_disk_super(super[i]);
+	} else if (!full[0] && (empty[1] || full[1])) {
+		sector = zones[0].wp;
+	} else if (full[0]) {
+		sector = zones[1].wp;
+	} else {
+		return -EUCLEAN;
+	}
+	*wp_ret = sector << SECTOR_SHIFT;
+	return 0;
+}
+
+static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
+			   int rw, u64 *bytenr_ret)
+{
+	u64 wp;
+	int ret;
+
+	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
+		return 0;
+	}
+
+	ret = sb_write_pointer(bdev, zones, &wp);
+	if (ret != -ENOENT && ret < 0)
+		return ret;
+
+	if (rw == WRITE) {
+		struct blk_zone *reset = NULL;
+
+		if (wp == zones[0].start << SECTOR_SHIFT)
+			reset = &zones[0];
+		else if (wp == zones[1].start << SECTOR_SHIFT)
+			reset = &zones[1];
+
+		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+			ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+
+			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+					       reset->start, reset->len,
+					       GFP_NOFS);
+			if (ret)
+				return ret;
+
+			reset->cond = BLK_ZONE_COND_EMPTY;
+			reset->wp = reset->start;
+		}
+	} else if (ret != -ENOENT) {
+		/* For READ, we want the previous one */
+		if (wp == zones[0].start << SECTOR_SHIFT)
+			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+		wp -= BTRFS_SUPER_INFO_SIZE;
+	}
+
+	*bytenr_ret = wp;
+	return 0;
+
+}
+
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+			       u64 *bytenr_ret)
+{
+	struct blk_zone zones[2];
+	unsigned int zone_sectors;
+	u32 sb_zone;
+	int ret;
+	u64 zone_size;
+	u8 zone_sectors_shift;
+	sector_t nr_sectors = bdev->bd_part->nr_sects;
+	u32 nr_zones;
+
+	if (!bdev_is_zoned(bdev)) {
+		*bytenr_ret = btrfs_sb_offset(mirror);
+		return 0;
+	}
+
+	ASSERT(rw == READ || rw == WRITE);
+
+	zone_sectors = bdev_zone_sectors(bdev);
+	if (!is_power_of_2(zone_sectors))
+		return -EINVAL;
+	zone_size = zone_sectors << SECTOR_SHIFT;
+	zone_sectors_shift = ilog2(zone_sectors);
+	nr_zones = nr_sectors >> zone_sectors_shift;
+
+	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+	if (sb_zone + 1 >= nr_zones)
+		return -ENOENT;
+
+	ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift, 2,
+				  copy_zone_info_cb, zones);
+	if (ret < 0)
+		return ret;
+	if (ret != 2)
+		return -EIO;
+
+	return sb_log_location(bdev, zones, rw, bytenr_ret);
+}
+
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+			  u64 *bytenr_ret)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	u32 zone_num;
+
+	if (!zinfo) {
+		*bytenr_ret = btrfs_sb_offset(mirror);
+		return 0;
+	}
+
+	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+	if (zone_num + 1 >= zinfo->nr_zones)
+		return -ENOENT;
+
+	return sb_log_location(device->bdev, &zinfo->sb_zones[2 * mirror], rw,
+			       bytenr_ret);
+}
+
+static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
+				  int mirror)
+{
+	u32 zone_num;
+
+	if (!zinfo)
+		return false;
+
+	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
+	if (zone_num + 1 >= zinfo->nr_zones)
+		return false;
+
+	if (!test_bit(zone_num, zinfo->seq_zones))
+		return false;
+
+	return true;
+}
+
+void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	struct blk_zone *zone;
+
+	if (!is_sb_log_zone(zinfo, mirror))
+		return;
+
+	zone = &zinfo->sb_zones[2 * mirror];
+	if (zone->cond != BLK_ZONE_COND_FULL) {
+		if (zone->cond == BLK_ZONE_COND_EMPTY)
+			zone->cond = BLK_ZONE_COND_IMP_OPEN;
+		zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+		if (zone->wp == zone->start + zone->len)
+			zone->cond = BLK_ZONE_COND_FULL;
+		return;
+	}
+
+	zone++;
+	ASSERT(zone->cond != BLK_ZONE_COND_FULL);
+	if (zone->cond == BLK_ZONE_COND_EMPTY)
+		zone->cond = BLK_ZONE_COND_IMP_OPEN;
+	zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+	if (zone->wp == zone->start + zone->len)
+		zone->cond = BLK_ZONE_COND_FULL;
+}
+
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
+{
+	sector_t zone_sectors;
+	sector_t nr_sectors = bdev->bd_part->nr_sects;
+	u8 zone_sectors_shift;
+	u32 sb_zone;
+	u32 nr_zones;
+
+	zone_sectors = bdev_zone_sectors(bdev);
+	zone_sectors_shift = ilog2(zone_sectors);
+	nr_zones = nr_sectors >> zone_sectors_shift;
+
+	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
+	if (sb_zone + 1 >= nr_zones)
+		return -ENOENT;
+
+	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+				sb_zone << zone_sectors_shift, zone_sectors * 2,
+				GFP_NOFS);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 0b7756a7104d..447c4e5ffcbb 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -4,6 +4,8 @@ 
 #define BTRFS_ZONED_H
 
 #include <linux/blkdev.h>
+#include "volumes.h"
+#include "disk-io.h"
 
 struct btrfs_zoned_device_info {
 	/*
@@ -16,6 +18,7 @@  struct btrfs_zoned_device_info {
 	u32 nr_zones;
 	unsigned long *seq_zones;
 	unsigned long *empty_zones;
+	struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
 };
 
 #ifdef CONFIG_BLK_DEV_ZONED
@@ -25,6 +28,12 @@  int btrfs_get_dev_zone_info(struct btrfs_device *device);
 void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
+			       u64 *bytenr_ret);
+int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
+			  u64 *bytenr_ret);
+void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 				     struct blk_zone *zone)
@@ -48,6 +57,26 @@  static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 {
 	return 0;
 }
+static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
+					     int mirror, int rw,
+					     u64 *bytenr_ret)
+{
+	*bytenr_ret = btrfs_sb_offset(mirror);
+	return 0;
+}
+static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
+					int rw, u64 *bytenr_ret)
+{
+	*bytenr_ret = btrfs_sb_offset(mirror);
+	return 0;
+}
+static inline void btrfs_advance_sb_log(struct btrfs_device *device,
+					int mirror) { }
+static inline int btrfs_reset_sb_log_zones(struct block_device *bdev,
+					   int mirror)
+{
+	return 0;
+}
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -115,4 +144,15 @@  static inline bool btrfs_check_device_zone_type(struct btrfs_fs_info *fs_info,
 	return bdev_zoned_model(bdev) != BLK_ZONED_HM;
 }
 
+static inline bool btrfs_check_super_location(struct btrfs_device *device,
+					      u64 pos)
+{
+	/*
+	 * On a non-zoned device, any address is OK. On a zoned device,
+	 * only non-SEQUENTIAL WRITE REQUIRED zones are acceptable.
+	 */
+	return device->zone_info == NULL ||
+	       !btrfs_dev_is_sequential(device, pos);
+}
+
 #endif