diff mbox series

[v2] btrfs: check the superblock to ensure the fs is not modified at thaw time

Message ID 8032f0bba42927fb4d87909060e03a647bb60c32.1660200417.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series [v2] btrfs: check the superblock to ensure the fs is not modified at thaw time | expand

Commit Message

Qu Wenruo Aug. 11, 2022, 6:47 a.m. UTC
[BACKGROUND]
There is an incident report that one user hibernated the system with
one btrfs on a removable device still mounted.

Then, by some accident, the btrfs got mounted and modified by another
system/OS, and was then brought back to the hibernated system.

After resuming from the hibernation, new writes happened to the victim btrfs.

Now the fs is completely broken, since the underlying btrfs is no longer
the same one before the hibernation, and the user lost their data due to
various transid mismatch.

[REPRODUCER]
We can emulate the situation using the following small script:

 truncate -s 1G $dev
 mkfs.btrfs -f $dev
 mount $dev $mnt
 fsstress -w -d $mnt -n 500
 sync
 xfs_freeze -f $mnt
 cp $dev $dev.backup

 # There is no way to mount the same cloned fs on the same system,
 # as the conflicting fsid will be rejected by btrfs.
 # Thus here we have to wipe the fs using a different btrfs.
 mkfs.btrfs -f $dev.backup

 dd if=$dev.backup of=$dev bs=1M
 xfs_freeze -u $mnt
 fsstress -w -d $mnt -n 20
 umount $mnt
 btrfs check $dev

The final fsck will fail because some tree blocks have an incorrect fsid.

This is enough to emulate the problem hit by the unfortunate user.

[ENHANCEMENT]
Although such case should not be that common, it can still happen from
time to time.

From the view of btrfs, we can detect any unexpected super block change,
and if there is any unexpected change, we just mark the fs RO, and thaw
the fs.

This way we can limit the damage to a minimum, and I hope no one will lose
their data to this anymore.

Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
Link: https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
Changelog:
v2:
- Remove one unrelated debug pr_info()
- Slightly re-word some comments
- Add suggested-by tag
---
 fs/btrfs/disk-io.c |  9 +++++--
 fs/btrfs/disk-io.h |  2 +-
 fs/btrfs/super.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.c |  2 +-
 4 files changed, 67 insertions(+), 4 deletions(-)

Comments

David Sterba Aug. 23, 2022, 5:10 p.m. UTC | #1
On Thu, Aug 11, 2022 at 02:47:08PM +0800, Qu Wenruo wrote:
> [BACKGROUND]
> There is an incident report that, one user hibernated the system, with
> one btrfs on removable device still mounted.
> 
> Then by some incident, the btrfs got mounted and modified by another
> system/OS, then back to the hibernated system.
> 
> After resuming from the hibernation, new write happened into the victim btrfs.
> 
> Now the fs is completely broken, since the underlying btrfs is no longer
> the same one before the hibernation, and the user lost their data due to
> various transid mismatch.
> 
> [REPRODUCER]
> We can emulate the situation using the following small script:
> 
>  truncate -s 1G $dev
>  mkfs.btrfs -f $dev
>  mount $dev $mnt
>  fsstress -w -d $mnt -n 500
>  sync
>  xfs_freeze -f $mnt
>  cp $dev $dev.backup
> 
>  # There is no way to mount the same cloned fs on the same system,
>  # as the conflicting fsid will be rejected by btrfs.
>  # Thus here we have to wipe the fs using a different btrfs.
>  mkfs.btrfs -f $dev.backup
> 
>  dd if=$dev.backup of=$dev bs=1M
>  xfs_freeze -u $mnt
>  fsstress -w -d $mnt -n 20
>  umount $mnt
>  btrfs check $dev
> 
> The final fsck will fail due to some tree blocks has incorrect fsid.
> 
> This is enough to emulate the problem hit by the unfortunate user.
> 
> [ENHANCEMENT]
> Although such case should not be that common, it can still happen from
> time to time.
> 
> From the view of btrfs, we can detect any unexpected super block change,
> and if there is any unexpected change, we just mark the fs RO, and thaw
> the fs.
> 
> By this we can limit the damage to minimal, and I hope no one would lose
> their data by this anymore.
> 
> Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
> Link: https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Changelog:
> v2:
> - Remove one unrelated debug pr_info()
> - Slightly re-word some comments
> - Add suggested-by tag
> ---
>  fs/btrfs/disk-io.c |  9 +++++--
>  fs/btrfs/disk-io.h |  2 +-
>  fs/btrfs/super.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.c |  2 +-
>  4 files changed, 67 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 6268dafeeb2d..7d99c42bdc51 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3849,7 +3849,7 @@ static void btrfs_end_super_write(struct bio *bio)
>  }
>  
>  struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
> -						   int copy_num)
> +						   int copy_num, bool drop_cache)
>  {
>  	struct btrfs_super_block *super;
>  	struct page *page;
> @@ -3867,6 +3867,11 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
>  	if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
>  		return ERR_PTR(-EINVAL);
>  
> +	if (drop_cache)
> +		truncate_inode_pages_range(bdev->bd_inode->i_mapping,

This will delete the range and replace it with implicit zeros, but don't we
want to force reading the superblock from disk instead? Otherwise the
zeros can be read by something in userspace until the next superblock
write.

> +				round_down(bytenr, PAGE_SIZE),
> +				round_up(bytenr + BTRFS_SUPER_INFO_SIZE,
> +					 PAGE_SIZE) - 1);
>  	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
>  	if (IS_ERR(page))
>  		return ERR_CAST(page);
> @@ -3898,7 +3903,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
>  	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>  	 */
>  	for (i = 0; i < 1; i++) {
> -		super = btrfs_read_dev_one_super(bdev, i);
> +		super = btrfs_read_dev_one_super(bdev, i, false);
>  		if (IS_ERR(super))
>  			continue;
>  
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index 47ad8e0a2d33..d0946f502f62 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -49,7 +49,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info);
>  int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
>  struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
>  struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
> -						   int copy_num);
> +						   int copy_num, bool drop_cache);
>  int btrfs_commit_super(struct btrfs_fs_info *fs_info);
>  struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
>  					struct btrfs_key *key);
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 4c7089b1681b..913b951981a9 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -2548,11 +2548,69 @@ static int btrfs_freeze(struct super_block *sb)
>  	return btrfs_commit_transaction(trans);
>  }
>  
> +static int check_dev_super(struct btrfs_device *dev)
> +{
> +	struct btrfs_fs_info *fs_info = dev->fs_info;
> +	struct btrfs_super_block *sb;
> +	int ret = 0;
> +
> +	/* This should be called with fs still frozen. */
> +	ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
> +
> +	/* Missing dev,  no need to check. */
> +	if (!dev->bdev)
> +		return 0;
> +
> +	/* Only need to check the primary super block. */
> +	sb = btrfs_read_dev_one_super(dev->bdev, 0, true);

Inside that, the magic number and offset are verified, as that is the
minimum that must be correct.

> +	if (IS_ERR(sb))
> +		return PTR_ERR(sb);
> +
> +	if (memcmp(sb->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE)) {
> +		btrfs_err(fs_info, "fsid doesn't match, has %pU expect %pU",
> +			  sb->fsid, dev->fs_devices->fsid);
> +		ret = -EUCLEAN;
> +		goto out;
> +	}
> +
> +	if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {

That's last generation and fsid but how about everything else? Also the
checksum should be validated. We have extensive check in
super.c:validate_super, why haven't you used that?

> +		btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
> +			btrfs_super_generation(sb),
> +			fs_info->last_trans_committed);
> +		ret = -EUCLEAN;
> +		goto out;
> +	}
> +out:
> +	btrfs_release_disk_super(sb);
> +	return ret;
> +}
> +
>  static int btrfs_unfreeze(struct super_block *sb)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	struct btrfs_device *device;
> +	int ret = 0;
>  
> +	/*
> +	 * Make sure the fs is not changed by accident (like hibernation then
> +	 * modified by other OS).
> +	 * If we found anything wrong, we mark the fs error immediately.
> +	 */
> +	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
> +		ret = check_dev_super(device);
> +		if (ret < 0) {
> +			btrfs_handle_fs_error(fs_info, ret,
> +				"filesystem got modified unexpectedly");

This should say something about thaw and on which device it got
detected.

> +			break;
> +		}
> +	}
>  	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
> +
> +	/*
> +	 * We still return 0, to allow VFS layer to unfreeze the fs even above
> +	 * checks failed. Since the fs is either fine or RO, we're safe to
> +	 * continue, without causing further damage.
> +	 */
>  	return 0;
>  }
>  
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 8c64dda69404..a02066ae5812 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2017,7 +2017,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
>  		struct page *page;
>  		int ret;
>  
> -		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
> +		disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
>  		if (IS_ERR(disk_super))
>  			continue;
>  
> -- 
> 2.37.1
Qu Wenruo Aug. 23, 2022, 11:11 p.m. UTC | #2
On 2022/8/24 01:10, David Sterba wrote:
> On Thu, Aug 11, 2022 at 02:47:08PM +0800, Qu Wenruo wrote:
>> [BACKGROUND]
>> There is an incident report that, one user hibernated the system, with
>> one btrfs on removable device still mounted.
>>
>> Then by some incident, the btrfs got mounted and modified by another
>> system/OS, then back to the hibernated system.
>>
>> After resuming from the hibernation, new write happened into the victim btrfs.
>>
>> Now the fs is completely broken, since the underlying btrfs is no longer
>> the same one before the hibernation, and the user lost their data due to
>> various transid mismatch.
>>
>> [REPRODUCER]
>> We can emulate the situation using the following small script:
>>
>>   truncate -s 1G $dev
>>   mkfs.btrfs -f $dev
>>   mount $dev $mnt
>>   fsstress -w -d $mnt -n 500
>>   sync
>>   xfs_freeze -f $mnt
>>   cp $dev $dev.backup
>>
>>   # There is no way to mount the same cloned fs on the same system,
>>   # as the conflicting fsid will be rejected by btrfs.
>>   # Thus here we have to wipe the fs using a different btrfs.
>>   mkfs.btrfs -f $dev.backup
>>
>>   dd if=$dev.backup of=$dev bs=1M
>>   xfs_freeze -u $mnt
>>   fsstress -w -d $mnt -n 20
>>   umount $mnt
>>   btrfs check $dev
>>
>> The final fsck will fail due to some tree blocks has incorrect fsid.
>>
>> This is enough to emulate the problem hit by the unfortunate user.
>>
>> [ENHANCEMENT]
>> Although such case should not be that common, it can still happen from
>> time to time.
>>
>> >From the view of btrfs, we can detect any unexpected super block change,
>> and if there is any unexpected change, we just mark the fs RO, and thaw
>> the fs.
>>
>> By this we can limit the damage to minimal, and I hope no one would lose
>> their data by this anymore.
>>
>> Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
>> Link: https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>> Changelog:
>> v2:
>> - Remove one unrelated debug pr_info()
>> - Slightly re-word some comments
>> - Add suggested-by tag
>> ---
>>   fs/btrfs/disk-io.c |  9 +++++--
>>   fs/btrfs/disk-io.h |  2 +-
>>   fs/btrfs/super.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/volumes.c |  2 +-
>>   4 files changed, 67 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 6268dafeeb2d..7d99c42bdc51 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -3849,7 +3849,7 @@ static void btrfs_end_super_write(struct bio *bio)
>>   }
>>
>>   struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
>> -						   int copy_num)
>> +						   int copy_num, bool drop_cache)
>>   {
>>   	struct btrfs_super_block *super;
>>   	struct page *page;
>> @@ -3867,6 +3867,11 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
>>   	if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
>>   		return ERR_PTR(-EINVAL);
>>
>> +	if (drop_cache)
>> +		truncate_inode_pages_range(bdev->bd_inode->i_mapping,
>
> This will delete the range and replace by implicit zeros, but don't we
> want to force reading the superblock form disk instead? Otherwise the
> zeros can be read by something in userspace until the next superblock
> write.
>
>> +				round_down(bytenr, PAGE_SIZE),
>> +				round_up(bytenr + BTRFS_SUPER_INFO_SIZE,
>> +					 PAGE_SIZE) - 1);
>>   	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
>>   	if (IS_ERR(page))
>>   		return ERR_CAST(page);
>> @@ -3898,7 +3903,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
>>   	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>>   	 */
>>   	for (i = 0; i < 1; i++) {
>> -		super = btrfs_read_dev_one_super(bdev, i);
>> +		super = btrfs_read_dev_one_super(bdev, i, false);
>>   		if (IS_ERR(super))
>>   			continue;
>>
>> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>> index 47ad8e0a2d33..d0946f502f62 100644
>> --- a/fs/btrfs/disk-io.h
>> +++ b/fs/btrfs/disk-io.h
>> @@ -49,7 +49,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info);
>>   int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
>>   struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
>>   struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
>> -						   int copy_num);
>> +						   int copy_num, bool drop_cache);
>>   int btrfs_commit_super(struct btrfs_fs_info *fs_info);
>>   struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
>>   					struct btrfs_key *key);
>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>> index 4c7089b1681b..913b951981a9 100644
>> --- a/fs/btrfs/super.c
>> +++ b/fs/btrfs/super.c
>> @@ -2548,11 +2548,69 @@ static int btrfs_freeze(struct super_block *sb)
>>   	return btrfs_commit_transaction(trans);
>>   }
>>
>> +static int check_dev_super(struct btrfs_device *dev)
>> +{
>> +	struct btrfs_fs_info *fs_info = dev->fs_info;
>> +	struct btrfs_super_block *sb;
>> +	int ret = 0;
>> +
>> +	/* This should be called with fs still frozen. */
>> +	ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
>> +
>> +	/* Missing dev,  no need to check. */
>> +	if (!dev->bdev)
>> +		return 0;
>> +
>> +	/* Only need to check the primary super block. */
>> +	sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
>
> Inside that there magic number and offset is verified as it's the
> minimal what must be correct.
>
>> +	if (IS_ERR(sb))
>> +		return PTR_ERR(sb);
>> +
>> +	if (memcmp(sb->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE)) {
>> +		btrfs_err(fs_info, "fsid doesn't match, has %pU expect %pU",
>> +			  sb->fsid, dev->fs_devices->fsid);
>> +		ret = -EUCLEAN;
>> +		goto out;
>> +	}
>> +
>> +	if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
>
> That's last generation and fsid but how about everything else? Also the
> checksum should be validated. We have extensive check in
> super.c:validate_super, why haven't you used that?

Because I don't think that's needed at thaw time.

Last transid and fsid are enough to ensure:

1) It's still the same btrfs

2) It's not modified halfway

Unless there is some way to modify the fs without COWing the metadata,
the fsid + transid check should be enough.

Thanks,
Qu

>
>> +		btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
>> +			btrfs_super_generation(sb),
>> +			fs_info->last_trans_committed);
>> +		ret = -EUCLEAN;
>> +		goto out;
>> +	}
>> +out:
>> +	btrfs_release_disk_super(sb);
>> +	return ret;
>> +}
>> +
>>   static int btrfs_unfreeze(struct super_block *sb)
>>   {
>>   	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>> +	struct btrfs_device *device;
>> +	int ret = 0;
>>
>> +	/*
>> +	 * Make sure the fs is not changed by accident (like hibernation then
>> +	 * modified by other OS).
>> +	 * If we found anything wrong, we mark the fs error immediately.
>> +	 */
>> +	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
>> +		ret = check_dev_super(device);
>> +		if (ret < 0) {
>> +			btrfs_handle_fs_error(fs_info, ret,
>> +				"filesystem got modified unexpectedly");
>
> This should say something about thaw and on which device it got
> detected.
>
>> +			break;
>> +		}
>> +	}
>>   	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
>> +
>> +	/*
>> +	 * We still return 0, to allow VFS layer to unfreeze the fs even above
>> +	 * checks failed. Since the fs is either fine or RO, we're safe to
>> +	 * continue, without causing further damage.
>> +	 */
>>   	return 0;
>>   }
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 8c64dda69404..a02066ae5812 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -2017,7 +2017,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
>>   		struct page *page;
>>   		int ret;
>>
>> -		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
>> +		disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
>>   		if (IS_ERR(disk_super))
>>   			continue;
>>
>> --
>> 2.37.1
Anand Jain Aug. 24, 2022, 10:13 a.m. UTC | #3
On 11/08/2022 14:47, Qu Wenruo wrote:
> [BACKGROUND]
> There is an incident report that, one user hibernated the system, with
> one btrfs on removable device still mounted.
> 
> Then by some incident, the btrfs got mounted and modified by another
> system/OS, then back to the hibernated system.
> 
> After resuming from the hibernation, new write happened into the victim btrfs.
> 
> Now the fs is completely broken, since the underlying btrfs is no longer
> the same one before the hibernation, and the user lost their data due to
> various transid mismatch.
> 
> [REPRODUCER]
> We can emulate the situation using the following small script:
> 
>   truncate -s 1G $dev
>   mkfs.btrfs -f $dev
>   mount $dev $mnt
>   fsstress -w -d $mnt -n 500
>   sync
>   xfs_freeze -f $mnt
>   cp $dev $dev.backup
> 
>   # There is no way to mount the same cloned fs on the same system,
>   # as the conflicting fsid will be rejected by btrfs.
>   # Thus here we have to wipe the fs using a different btrfs.
>   mkfs.btrfs -f $dev.backup
> 
>   dd if=$dev.backup of=$dev bs=1M
>   xfs_freeze -u $mnt
>   fsstress -w -d $mnt -n 20
>   umount $mnt
>   btrfs check $dev
> 
> The final fsck will fail due to some tree blocks has incorrect fsid.
> 
> This is enough to emulate the problem hit by the unfortunate user.
> 
> [ENHANCEMENT]
> Although such case should not be that common, it can still happen from
> time to time.
> 
>  From the view of btrfs, we can detect any unexpected super block change,
> and if there is any unexpected change, we just mark the fs RO, and thaw
> the fs.
> 
> By this we can limit the damage to minimal, and I hope no one would lose
> their data by this anymore.
> 
> Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
> Link: https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Changelog:
> v2:
> - Remove one unrelated debug pr_info()
> - Slightly re-word some comments
> - Add suggested-by tag
> ---
>   fs/btrfs/disk-io.c |  9 +++++--
>   fs/btrfs/disk-io.h |  2 +-
>   fs/btrfs/super.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++
>   fs/btrfs/volumes.c |  2 +-
>   4 files changed, 67 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 6268dafeeb2d..7d99c42bdc51 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3849,7 +3849,7 @@ static void btrfs_end_super_write(struct bio *bio)
>   }
>   
>   struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
> -						   int copy_num)
> +						   int copy_num, bool drop_cache)
>   {
>   	struct btrfs_super_block *super;
>   	struct page *page;
> @@ -3867,6 +3867,11 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
>   	if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
>   		return ERR_PTR(-EINVAL);
>   
> +	if (drop_cache)


> +		truncate_inode_pages_range(bdev->bd_inode->i_mapping,
> +				round_down(bytenr, PAGE_SIZE),
> +				round_up(bytenr + BTRFS_SUPER_INFO_SIZE,
> +					 PAGE_SIZE) - 1);

The 3rd argument is the offset to which to truncate (inclusive), and it
looks correct.


>   	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
>   	if (IS_ERR(page))
>   		return ERR_CAST(page);
> @@ -3898,7 +3903,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
>   	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>   	 */
>   	for (i = 0; i < 1; i++) {
> -		super = btrfs_read_dev_one_super(bdev, i);
> +		super = btrfs_read_dev_one_super(bdev, i, false);
>   		if (IS_ERR(super))
>   			continue;
>   
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index 47ad8e0a2d33..d0946f502f62 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -49,7 +49,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info);
>   int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
>   struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
>   struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
> -						   int copy_num);
> +						   int copy_num, bool drop_cache);
>   int btrfs_commit_super(struct btrfs_fs_info *fs_info);
>   struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
>   					struct btrfs_key *key);
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 4c7089b1681b..913b951981a9 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -2548,11 +2548,69 @@ static int btrfs_freeze(struct super_block *sb)
>   	return btrfs_commit_transaction(trans);
>   }
>   
> +static int check_dev_super(struct btrfs_device *dev)
> +{
> +	struct btrfs_fs_info *fs_info = dev->fs_info;
> +	struct btrfs_super_block *sb;
> +	int ret = 0;
> +
> +	/* This should be called with fs still frozen. */
> +	ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
> +
> +	/* Missing dev,  no need to check. */
> +	if (!dev->bdev)
> +		return 0;
> +
> +	/* Only need to check the primary super block. */
> +	sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
> +	if (IS_ERR(sb))
> +		return PTR_ERR(sb);
> +

> +	if (memcmp(sb->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE)) {
> +		btrfs_err(fs_info, "fsid doesn't match, has %pU expect %pU",
> +			  sb->fsid, dev->fs_devices->fsid);
> +		ret = -EUCLEAN;
> +		goto out;
> +	}

  Just a fallthrough is fine.

> +	if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
> +		btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
> +			btrfs_super_generation(sb),
> +			fs_info->last_trans_committed);
> +		ret = -EUCLEAN;
> +		goto out;
> +	}

  Here also.

> +out:

  And the out label can be removed.

> +	btrfs_release_disk_super(sb);
> +	return ret;
> +}
> +
>   static int btrfs_unfreeze(struct super_block *sb)
>   {
>   	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	struct btrfs_device *device;
> +	int ret = 0;
>   


> +	/*
> +	 * Make sure the fs is not changed by accident (like hibernation then
> +	 * modified by other OS).
> +	 * If we found anything wrong, we mark the fs error immediately.
> +	 */
> +	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
> +		ret = check_dev_super(device);
> +		if (ret < 0) {
> +			btrfs_handle_fs_error(fs_info, ret,
> +				"filesystem got modified unexpectedly");


  btrfs_read_dev_one_super() may return -EINVAL and the error log will
  be misleading.

> +			break;
> +		}
> +	}

  I checked if device_list_mutex is required, but as we are in a frozen
  state, you are correct no device_list_mutex is required here.


Thanks, Anand

>   	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
> +
> +	/*
> +	 * We still return 0, to allow VFS layer to unfreeze the fs even above
> +	 * checks failed. Since the fs is either fine or RO, we're safe to
> +	 * continue, without causing further damage.
> +	 */
>   	return 0;
>   }
>   
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 8c64dda69404..a02066ae5812 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2017,7 +2017,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
>   		struct page *page;
>   		int ret;
>   
> -		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
> +		disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
>   		if (IS_ERR(disk_super))
>   			continue;
>
Qu Wenruo Aug. 24, 2022, 10:57 a.m. UTC | #4
On 2022/8/24 18:13, Anand Jain wrote:
> On 11/08/2022 14:47, Qu Wenruo wrote:
>> [BACKGROUND]
>> There is an incident report that, one user hibernated the system, with
>> one btrfs on removable device still mounted.
>>
>> Then by some incident, the btrfs got mounted and modified by another
>> system/OS, then back to the hibernated system.
>>
>> After resuming from the hibernation, new write happened into the 
>> victim btrfs.
>>
>> Now the fs is completely broken, since the underlying btrfs is no longer
>> the same one before the hibernation, and the user lost their data due to
>> various transid mismatch.
>>
>> [REPRODUCER]
>> We can emulate the situation using the following small script:
>>
>>   truncate -s 1G $dev
>>   mkfs.btrfs -f $dev
>>   mount $dev $mnt
>>   fsstress -w -d $mnt -n 500
>>   sync
>>   xfs_freeze -f $mnt
>>   cp $dev $dev.backup
>>
>>   # There is no way to mount the same cloned fs on the same system,
>>   # as the conflicting fsid will be rejected by btrfs.
>>   # Thus here we have to wipe the fs using a different btrfs.
>>   mkfs.btrfs -f $dev.backup
>>
>>   dd if=$dev.backup of=$dev bs=1M
>>   xfs_freeze -u $mnt
>>   fsstress -w -d $mnt -n 20
>>   umount $mnt
>>   btrfs check $dev
>>
>> The final fsck will fail due to some tree blocks has incorrect fsid.
>>
>> This is enough to emulate the problem hit by the unfortunate user.
>>
>> [ENHANCEMENT]
>> Although such case should not be that common, it can still happen from
>> time to time.
>>
>>  From the view of btrfs, we can detect any unexpected super block change,
>> and if there is any unexpected change, we just mark the fs RO, and thaw
>> the fs.
>>
>> By this we can limit the damage to minimal, and I hope no one would lose
>> their data by this anymore.
>>
>> Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
>> Link: 
>> https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/ 
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>> Changelog:
>> v2:
>> - Remove one unrelated debug pr_info()
>> - Slightly re-word some comments
>> - Add suggested-by tag
>> ---
>>   fs/btrfs/disk-io.c |  9 +++++--
>>   fs/btrfs/disk-io.h |  2 +-
>>   fs/btrfs/super.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/volumes.c |  2 +-
>>   4 files changed, 67 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 6268dafeeb2d..7d99c42bdc51 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -3849,7 +3849,7 @@ static void btrfs_end_super_write(struct bio *bio)
>>   }
>>   struct btrfs_super_block *btrfs_read_dev_one_super(struct 
>> block_device *bdev,
>> -                           int copy_num)
>> +                           int copy_num, bool drop_cache)
>>   {
>>       struct btrfs_super_block *super;
>>       struct page *page;
>> @@ -3867,6 +3867,11 @@ struct btrfs_super_block 
>> *btrfs_read_dev_one_super(struct block_device *bdev,
>>       if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
>>           return ERR_PTR(-EINVAL);
>> +    if (drop_cache)
> 
> 
>> +        truncate_inode_pages_range(bdev->bd_inode->i_mapping,
>> +                round_down(bytenr, PAGE_SIZE),
>> +                round_up(bytenr + BTRFS_SUPER_INFO_SIZE,
>> +                     PAGE_SIZE) - 1);
> 
> The 3rd argument is the offset to which to truncate (inclusive), and it
> looks correct.
> 
> 
>>       page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, 
>> GFP_NOFS);
>>       if (IS_ERR(page))
>>           return ERR_CAST(page);
>> @@ -3898,7 +3903,7 @@ struct btrfs_super_block 
>> *btrfs_read_dev_super(struct block_device *bdev)
>>        * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>>        */
>>       for (i = 0; i < 1; i++) {
>> -        super = btrfs_read_dev_one_super(bdev, i);
>> +        super = btrfs_read_dev_one_super(bdev, i, false);
>>           if (IS_ERR(super))
>>               continue;
>> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>> index 47ad8e0a2d33..d0946f502f62 100644
>> --- a/fs/btrfs/disk-io.h
>> +++ b/fs/btrfs/disk-io.h
>> @@ -49,7 +49,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info);
>>   int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
>>   struct btrfs_super_block *btrfs_read_dev_super(struct block_device 
>> *bdev);
>>   struct btrfs_super_block *btrfs_read_dev_one_super(struct 
>> block_device *bdev,
>> -                           int copy_num);
>> +                           int copy_num, bool drop_cache);
>>   int btrfs_commit_super(struct btrfs_fs_info *fs_info);
>>   struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
>>                       struct btrfs_key *key);
>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>> index 4c7089b1681b..913b951981a9 100644
>> --- a/fs/btrfs/super.c
>> +++ b/fs/btrfs/super.c
>> @@ -2548,11 +2548,69 @@ static int btrfs_freeze(struct super_block *sb)
>>       return btrfs_commit_transaction(trans);
>>   }
>> +static int check_dev_super(struct btrfs_device *dev)
>> +{
>> +    struct btrfs_fs_info *fs_info = dev->fs_info;
>> +    struct btrfs_super_block *sb;
>> +    int ret = 0;
>> +
>> +    /* This should be called with fs still frozen. */
>> +    ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
>> +
>> +    /* Missing dev,  no need to check. */
>> +    if (!dev->bdev)
>> +        return 0;
>> +
>> +    /* Only need to check the primary super block. */
>> +    sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
>> +    if (IS_ERR(sb))
>> +        return PTR_ERR(sb);
>> +
> 
>> +    if (memcmp(sb->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE)) {
>> +        btrfs_err(fs_info, "fsid doesn't match, has %pU expect %pU",
>> +              sb->fsid, dev->fs_devices->fsid);
>> +        ret = -EUCLEAN;
>> +        goto out;
>> +    }
> 
>   Just a fallthrough is fine.

If the fsid is changed, the generation check is almost guaranteed to fail.

Thus I don't think we should even try to continue checking.

> 
>> +    if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
>> +        btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
>> +            btrfs_super_generation(sb),
>> +            fs_info->last_trans_committed);
>> +        ret = -EUCLEAN;
>> +        goto out;
>> +    }
> 
>   Here also.
>
>> +out:
> 
>   And the out label can be removed.

As David mentioned, we may want to do a full super block check,
and as I mentioned above, I don't think the verification should continue
after any failed check, thus I'm afraid I would keep the label.

> 
>> +    btrfs_release_disk_super(sb);
>> +    return ret;
>> +}
>> +
>>   static int btrfs_unfreeze(struct super_block *sb)
>>   {
>>       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>> +    struct btrfs_device *device;
>> +    int ret = 0;
> 
> 
>> +    /*
>> +     * Make sure the fs is not changed by accident (like hibernation 
>> then
>> +     * modified by other OS).
>> +     * If we found anything wrong, we mark the fs error immediately.
>> +     */
>> +    list_for_each_entry(device, &fs_info->fs_devices->devices, 
>> dev_list) {
>> +        ret = check_dev_super(device);
>> +        if (ret < 0) {
>> +            btrfs_handle_fs_error(fs_info, ret,
>> +                "filesystem got modified unexpectedly");
> 
> 
>   btrfs_read_dev_one_super() may return -EINVAL and the error log will
>   be misleading.

In that case, it still means the fs is incorrect.

Unless we have some unexposed bugs, shouldn't every super block which
we committed to disk be valid?

> 
>> +            break;
>> +        }
>> +    }
> 
>   I checked if device_list_mutex is required, but as we are in a frozen
>   state, you are correct no device_list_mutex is required here.

I can definitely add a comment for this.

Thanks,
Qu

> 
> 
> Thanks, Anand
> 
>>       clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
>> +
>> +    /*
>> +     * We still return 0, to allow VFS layer to unfreeze the fs even 
>> above
>> +     * checks failed. Since the fs is either fine or RO, we're safe to
>> +     * continue, without causing further damage.
>> +     */
>>       return 0;
>>   }
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 8c64dda69404..a02066ae5812 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -2017,7 +2017,7 @@ void btrfs_scratch_superblocks(struct 
>> btrfs_fs_info *fs_info,
>>           struct page *page;
>>           int ret;
>> -        disk_super = btrfs_read_dev_one_super(bdev, copy_num);
>> +        disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
>>           if (IS_ERR(disk_super))
>>               continue;
>
Qu Wenruo Aug. 24, 2022, 11:09 a.m. UTC | #5
On 2022/8/24 07:11, Qu Wenruo wrote:
>
>
> On 2022/8/24 01:10, David Sterba wrote:
>> On Thu, Aug 11, 2022 at 02:47:08PM +0800, Qu Wenruo wrote:
>>> [BACKGROUND]
>>> There is an incident report that, one user hibernated the system, with
>>> one btrfs on removable device still mounted.
>>>
>>> Then by some incident, the btrfs got mounted and modified by another
>>> system/OS, then back to the hibernated system.
>>>
>>> After resuming from the hibernation, new write happened into the
>>> victim btrfs.
>>>
>>> Now the fs is completely broken, since the underlying btrfs is no longer
>>> the same one before the hibernation, and the user lost their data due to
>>> various transid mismatch.
>>>
>>> [REPRODUCER]
>>> We can emulate the situation using the following small script:
>>>
>>>   truncate -s 1G $dev
>>>   mkfs.btrfs -f $dev
>>>   mount $dev $mnt
>>>   fsstress -w -d $mnt -n 500
>>>   sync
>>>   xfs_freeze -f $mnt
>>>   cp $dev $dev.backup
>>>
>>>   # There is no way to mount the same cloned fs on the same system,
>>>   # as the conflicting fsid will be rejected by btrfs.
>>>   # Thus here we have to wipe the fs using a different btrfs.
>>>   mkfs.btrfs -f $dev.backup
>>>
>>>   dd if=$dev.backup of=$dev bs=1M
>>>   xfs_freeze -u $mnt
>>>   fsstress -w -d $mnt -n 20
>>>   umount $mnt
>>>   btrfs check $dev
>>>
>>> The final fsck will fail due to some tree blocks has incorrect fsid.
>>>
>>> This is enough to emulate the problem hit by the unfortunate user.
>>>
>>> [ENHANCEMENT]
>>> Although such case should not be that common, it can still happen from
>>> time to time.
>>>
>>> From the view of btrfs, we can detect any unexpected super block
>>> change,
>>> and if there is any unexpected change, we just mark the fs RO, and thaw
>>> the fs.
>>>
>>> By this we can limit the damage to minimal, and I hope no one would lose
>>> their data by this anymore.
>>>
>>> Suggested-by: Goffredo Baroncelli <kreijack@libero.it>
>>> Link:
>>> https://lore.kernel.org/linux-btrfs/83bf3b4b-7f4c-387a-b286-9251e3991e34@bluemole.com/
>>>
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>> Changelog:
>>> v2:
>>> - Remove one unrelated debug pr_info()
>>> - Slightly re-word some comments
>>> - Add suggested-by tag
>>> ---
>>>   fs/btrfs/disk-io.c |  9 +++++--
>>>   fs/btrfs/disk-io.h |  2 +-
>>>   fs/btrfs/super.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++++
>>>   fs/btrfs/volumes.c |  2 +-
>>>   4 files changed, 67 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>> index 6268dafeeb2d..7d99c42bdc51 100644
>>> --- a/fs/btrfs/disk-io.c
>>> +++ b/fs/btrfs/disk-io.c
>>> @@ -3849,7 +3849,7 @@ static void btrfs_end_super_write(struct bio *bio)
>>>   }
>>>
>>>   struct btrfs_super_block *btrfs_read_dev_one_super(struct
>>> block_device *bdev,
>>> -                           int copy_num)
>>> +                           int copy_num, bool drop_cache)
>>>   {
>>>       struct btrfs_super_block *super;
>>>       struct page *page;
>>> @@ -3867,6 +3867,11 @@ struct btrfs_super_block
>>> *btrfs_read_dev_one_super(struct block_device *bdev,
>>>       if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
>>>           return ERR_PTR(-EINVAL);
>>>
>>> +    if (drop_cache)
>>> +        truncate_inode_pages_range(bdev->bd_inode->i_mapping,
>>
>> This will delete the range and replace by implicit zeros,

Nope, the implicit zeros are there if we're truncating sub-sector.

I'm not sure what's the sector size used by block layer (depending on
device? Thus 4K or 512B?).
But here I intentionally use range [round_down(bytenr, PAGE_SIZE),
round_up(bytenr + 4K, PAGE_SIZE)) to drop the page cache.

Unless we have some special block layer code using a unit larger than
PAGE_SIZE, we're guaranteed to drop the full page range, leaving no
page filled with zeros.

This way, user space can never read zeros from the page cache of the
device; it will always read from disk.

Sure, this may need extra comments, and if you know a more correct call
that drops the page cache of only a certain range, then I'm pretty happy
to follow.

But I searched `mm.h` for quite some time, and didn't find a better one
than truncate_inode_pages_range().
The other alternative is truncate_inode_pages(), which drops all page
cache, and thus needs no page alignment handling.
I can go that way if you prefer.


>> but don't we
>> want to force reading the superblock from disk instead? Otherwise the
>> zeros can be read by something in userspace until the next superblock
>> write.
>>
>>> +                round_down(bytenr, PAGE_SIZE),
>>> +                round_up(bytenr + BTRFS_SUPER_INFO_SIZE,
>>> +                     PAGE_SIZE) - 1);
>>>       page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT,
>>> GFP_NOFS);
>>>       if (IS_ERR(page))
>>>           return ERR_CAST(page);
>>> @@ -3898,7 +3903,7 @@ struct btrfs_super_block
>>> *btrfs_read_dev_super(struct block_device *bdev)
>>>        * later supers, using BTRFS_SUPER_MIRROR_MAX instead
>>>        */
>>>       for (i = 0; i < 1; i++) {
>>> -        super = btrfs_read_dev_one_super(bdev, i);
>>> +        super = btrfs_read_dev_one_super(bdev, i, false);
>>>           if (IS_ERR(super))
>>>               continue;
>>>
>>> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>>> index 47ad8e0a2d33..d0946f502f62 100644
>>> --- a/fs/btrfs/disk-io.h
>>> +++ b/fs/btrfs/disk-io.h
>>> @@ -49,7 +49,7 @@ void __cold close_ctree(struct btrfs_fs_info
>>> *fs_info);
>>>   int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
>>>   struct btrfs_super_block *btrfs_read_dev_super(struct block_device
>>> *bdev);
>>>   struct btrfs_super_block *btrfs_read_dev_one_super(struct
>>> block_device *bdev,
>>> -                           int copy_num);
>>> +                           int copy_num, bool drop_cache);
>>>   int btrfs_commit_super(struct btrfs_fs_info *fs_info);
>>>   struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
>>>                       struct btrfs_key *key);
>>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>>> index 4c7089b1681b..913b951981a9 100644
>>> --- a/fs/btrfs/super.c
>>> +++ b/fs/btrfs/super.c
>>> @@ -2548,11 +2548,69 @@ static int btrfs_freeze(struct super_block *sb)
>>>       return btrfs_commit_transaction(trans);
>>>   }
>>>
>>> +static int check_dev_super(struct btrfs_device *dev)
>>> +{
>>> +    struct btrfs_fs_info *fs_info = dev->fs_info;
>>> +    struct btrfs_super_block *sb;
>>> +    int ret = 0;
>>> +
>>> +    /* This should be called with fs still frozen. */
>>> +    ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
>>> +
>>> +    /* Missing dev,  no need to check. */
>>> +    if (!dev->bdev)
>>> +        return 0;
>>> +
>>> +    /* Only need to check the primary super block. */
>>> +    sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
>>
>> Inside that, the magic number and offset are verified, as that is the
>> minimum that must be correct.
>>
>>> +    if (IS_ERR(sb))
>>> +        return PTR_ERR(sb);
>>> +
>>> +    if (memcmp(sb->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE)) {
>>> +        btrfs_err(fs_info, "fsid doesn't match, has %pU expect %pU",
>>> +              sb->fsid, dev->fs_devices->fsid);
>>> +        ret = -EUCLEAN;
>>> +        goto out;
>>> +    }
>>> +
>>> +    if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
>>
>> That's last generation and fsid but how about everything else? Also the
>> checksum should be validated. We have extensive check in
>> super.c:validate_super, why haven't you used that?
>
> Because I don't think that's needed at thaw time.
>
> Last transid and fsid is enough to ensure:
>
> 1) It's still the same btrfs
>
> 2) It's not modified halfway
>
> Unless there is some way to modify the fs without COWing the metadata,
> the fsid + transid check should be enough.

After more consideration, the fs thaw path is far from hot, and my
usual tendency is to check every byte of the on-disk data (which is not
followed here).

So I agree we should do full check.

Will address them all in v3 update.

Thanks,
Qu

>
> Thanks,
> Qu
>
>>
>>> +        btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
>>> +            btrfs_super_generation(sb),
>>> +            fs_info->last_trans_committed);
>>> +        ret = -EUCLEAN;
>>> +        goto out;
>>> +    }
>>> +out:
>>> +    btrfs_release_disk_super(sb);
>>> +    return ret;
>>> +}
>>> +
>>>   static int btrfs_unfreeze(struct super_block *sb)
>>>   {
>>>       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
>>> +    struct btrfs_device *device;
>>> +    int ret = 0;
>>>
>>> +    /*
>>> +     * Make sure the fs is not changed by accident (like hibernation
>>> then
>>> +     * modified by other OS).
>>> +     * If we found anything wrong, we mark the fs error immediately.
>>> +     */
>>> +    list_for_each_entry(device, &fs_info->fs_devices->devices,
>>> dev_list) {
>>> +        ret = check_dev_super(device);
>>> +        if (ret < 0) {
>>> +            btrfs_handle_fs_error(fs_info, ret,
>>> +                "filesystem got modified unexpectedly");
>>
>> This should say something about thaw and on which device it got
>> detected.
>>
>>> +            break;
>>> +        }
>>> +    }
>>>       clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
>>> +
>>> +    /*
>>> +     * We still return 0, to allow VFS layer to unfreeze the fs even
>>> above
>>> +     * checks failed. Since the fs is either fine or RO, we're safe to
>>> +     * continue, without causing further damage.
>>> +     */
>>>       return 0;
>>>   }
>>>
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index 8c64dda69404..a02066ae5812 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -2017,7 +2017,7 @@ void btrfs_scratch_superblocks(struct
>>> btrfs_fs_info *fs_info,
>>>           struct page *page;
>>>           int ret;
>>>
>>> -        disk_super = btrfs_read_dev_one_super(bdev, copy_num);
>>> +        disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
>>>           if (IS_ERR(disk_super))
>>>               continue;
>>>
>>> --
>>> 2.37.1
diff mbox series

Patch

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6268dafeeb2d..7d99c42bdc51 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3849,7 +3849,7 @@  static void btrfs_end_super_write(struct bio *bio)
 }
 
 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
-						   int copy_num)
+						   int copy_num, bool drop_cache)
 {
 	struct btrfs_super_block *super;
 	struct page *page;
@@ -3867,6 +3867,11 @@  struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
 	if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
 		return ERR_PTR(-EINVAL);
 
+	if (drop_cache)
+		truncate_inode_pages_range(bdev->bd_inode->i_mapping,
+				round_down(bytenr, PAGE_SIZE),
+				round_up(bytenr + BTRFS_SUPER_INFO_SIZE,
+					 PAGE_SIZE) - 1);
 	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
 	if (IS_ERR(page))
 		return ERR_CAST(page);
@@ -3898,7 +3903,7 @@  struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
 	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
 	 */
 	for (i = 0; i < 1; i++) {
-		super = btrfs_read_dev_one_super(bdev, i);
+		super = btrfs_read_dev_one_super(bdev, i, false);
 		if (IS_ERR(super))
 			continue;
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 47ad8e0a2d33..d0946f502f62 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -49,7 +49,7 @@  void __cold close_ctree(struct btrfs_fs_info *fs_info);
 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
-						   int copy_num);
+						   int copy_num, bool drop_cache);
 int btrfs_commit_super(struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 					struct btrfs_key *key);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4c7089b1681b..913b951981a9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2548,11 +2548,69 @@  static int btrfs_freeze(struct super_block *sb)
 	return btrfs_commit_transaction(trans);
 }
 
+static int check_dev_super(struct btrfs_device *dev)
+{
+	struct btrfs_fs_info *fs_info = dev->fs_info;
+	struct btrfs_super_block *sb;
+	int ret = 0;
+
+	/* This should be called with fs still frozen. */
+	ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
+
+	/* Missing dev,  no need to check. */
+	if (!dev->bdev)
+		return 0;
+
+	/* Only need to check the primary super block. */
+	sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (memcmp(sb->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE)) {
+		btrfs_err(fs_info, "fsid doesn't match, has %pU expect %pU",
+			  sb->fsid, dev->fs_devices->fsid);
+		ret = -EUCLEAN;
+		goto out;
+	}
+
+	if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+		btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
+			btrfs_super_generation(sb),
+			fs_info->last_trans_committed);
+		ret = -EUCLEAN;
+		goto out;
+	}
+out:
+	btrfs_release_disk_super(sb);
+	return ret;
+}
+
 static int btrfs_unfreeze(struct super_block *sb)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_device *device;
+	int ret = 0;
 
+	/*
+	 * Make sure the fs is not changed by accident (like hibernation then
+	 * modified by other OS).
+	 * If we found anything wrong, we mark the fs error immediately.
+	 */
+	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+		ret = check_dev_super(device);
+		if (ret < 0) {
+			btrfs_handle_fs_error(fs_info, ret,
+				"filesystem got modified unexpectedly");
+			break;
+		}
+	}
 	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+
+	/*
+	 * We still return 0, to allow VFS layer to unfreeze the fs even above
+	 * checks failed. Since the fs is either fine or RO, we're safe to
+	 * continue, without causing further damage.
+	 */
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8c64dda69404..a02066ae5812 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2017,7 +2017,7 @@  void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
 		struct page *page;
 		int ret;
 
-		disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+		disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
 		if (IS_ERR(disk_super))
 			continue;