[1/6] btrfs: metadata_uuid: fix failed assertion due to unsuccessful device scan
diff mbox series

Message ID 20191212110132.11063-2-Damenly_Su@gmx.com
State New
Headers show
Series
  • btrfs: metadata uuid fixes and enhancements
Related show

Commit Message

Damenly Su Dec. 12, 2019, 11:01 a.m. UTC
From: Su Yue <Damenly_Su@gmx.com>

While running misc-tests/034 of btrfs-progs, easy to hit:
======================================================================
[ 1318.749685] BTRFS: device fsid 55abf31c-6b3a-47ad-8711-cfd5a249dd4c devid 2 transid 8 /dev/loop0 scanned by systemd-udevd (3530)
[ 1318.846791] BTRFS: device fsid 55abf31c-6b3a-47ad-8711-cfd5a249dd4c devid 2 transid 8 /dev/loop0 scanned by systemd-udevd (3530)
[ 1318.847812] BTRFS: device fsid 55abf31c-6b3a-47ad-8711-cfd5a249dd4c devid 2 transid 8 /dev/loop0 scanned by mount (3540)
[ 1318.901278] assertion failed: !memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE), in fs/btrfs/disk-io.c:2874
[ 1318.901499] ------------[ cut here ]------------
[ 1318.901503] kernel BUG at fs/btrfs/ctree.h:3118!
[ 1318.901582] invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
[ 1318.901644] CPU: 4 PID: 3540 Comm: mount Tainted: G           O      5.5.0-rc1-custom+ #41
[ 1318.901720] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[ 1318.901836] RIP: 0010:assfail.constprop.0+0x1c/0x1e [btrfs]
[ 1318.901894] Code: 00 00 44 89 c0 5b 41 5c 41 5d 41 5e 5d c3 55 89 f1 48 c7 c2 40 13 0f c1 48 89 fe 48 c7 c7 00 1b 0f c1 48 89 e5 e8 3e 12 0f dc <0f> 0b e8 93 01 39 dc be 56 03 00 00 48 c7 c7 a0 1b 0f c1 e8 cc ff
[ 1318.902069] RSP: 0018:ffff88814fea76e0 EFLAGS: 00010282
[ 1318.902124] RAX: 000000000000007c RBX: ffff88814d862400 RCX: 0000000000000000
[ 1318.902191] RDX: 0000000000000001 RSI: 0000000000000008 RDI: ffffed1029fd4ecf
[ 1318.902259] RBP: ffff88814fea76e0 R08: 000000000000007c R09: ffffed102b3fe719
[ 1318.902326] R10: ffffed102b3fe718 R11: ffff888159ff38c7 R12: ffff8881398ac000
[ 1318.902394] R13: ffff888145742930 R14: ffff88812f56e000 R15: ffff88812f56e000
[ 1318.902464] FS:  00007f0753032500(0000) GS:ffff888159e00000(0000) knlGS:0000000000000000
[ 1318.902539] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1318.902595] CR2: 0000560357a9c018 CR3: 00000001502d4000 CR4: 0000000000340ee0
[ 1318.903852] Call Trace:
[ 1318.904958]  open_ctree+0x166c/0x373e [btrfs]
[ 1318.906029]  ? congestion_wait+0x2d0/0x2d0
[ 1318.907500]  ? wb_init+0x31e/0x400
[ 1318.908699]  ? close_ctree+0x52b/0x52b [btrfs]
[ 1318.909734]  btrfs_mount_root.cold+0xe/0x118 [btrfs]
[ 1318.910764]  ? btrfs_decode_error+0x40/0x40 [btrfs]
[ 1318.911784]  ? vfs_parse_fs_string+0xdc/0x130
[ 1318.912798]  ? rcu_read_lock_sched_held+0xa1/0xd0
[ 1318.913835]  ? rcu_read_lock_bh_held+0xb0/0xb0
[ 1318.914878]  ? legacy_parse_param+0x75/0x340
[ 1318.915946]  ? __lookup_constant+0x54/0x90
[ 1318.917005]  ? debug_lockdep_rcu_enabled.part.0+0x1a/0x30
[ 1318.918099]  ? kfree+0x2fd/0x360
[ 1318.919233]  ? btrfs_decode_error+0x40/0x40 [btrfs]
[ 1318.920361]  legacy_get_tree+0x89/0xd0
[ 1318.921480]  vfs_get_tree+0x52/0x140
[ 1318.922625]  fc_mount+0x14/0x70
[ 1318.923713]  vfs_kern_mount.part.0+0x78/0x90
[ 1318.924789]  vfs_kern_mount+0x13/0x20
[ 1318.925885]  btrfs_mount+0x1f3/0xb30 [btrfs]
[ 1318.926922]  ? sched_clock_cpu+0x1b/0x130
[ 1318.927936]  ? find_held_lock+0x95/0xb0
[ 1318.928964]  ? btrfs_remount+0x7e0/0x7e0 [btrfs]
[ 1318.929939]  ? vfs_parse_fs_string+0xdc/0x130
[ 1318.930875]  ? vfs_parse_fs_string+0xdc/0x130
[ 1318.931784]  ? rcu_read_lock_sched_held+0xa1/0xd0
[ 1318.932651]  ? rcu_read_lock_bh_held+0xb0/0xb0
[ 1318.933497]  ? legacy_parse_param+0x75/0x340
[ 1318.934327]  ? __lookup_constant+0x54/0x90
[ 1318.935173]  ? debug_lockdep_rcu_enabled.part.0+0x1a/0x30
[ 1318.936023]  ? kfree+0x2fd/0x360
[ 1318.937372]  ? cap_capable+0xb3/0xf0
[ 1318.938323]  ? btrfs_remount+0x7e0/0x7e0 [btrfs]
[ 1318.939126]  legacy_get_tree+0x89/0xd0
[ 1318.939921]  ? legacy_get_tree+0x89/0xd0
[ 1318.940687]  vfs_get_tree+0x52/0x140
[ 1318.941431]  do_mount+0xe01/0x1220
[ 1318.942189]  ? rcu_read_lock_bh_held+0xb0/0xb0
[ 1318.942935]  ? copy_mount_string+0x20/0x20
[ 1318.943690]  ? __kasan_check_write+0x14/0x20
[ 1318.944438]  ? memdup_user+0x52/0x90
[ 1318.945190]  ksys_mount+0x82/0xd0
[ 1318.945921]  __x64_sys_mount+0x67/0x80
[ 1318.946640]  do_syscall_64+0x79/0xe0
[ 1318.947364]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1318.948109] RIP: 0033:0x7f07531b5e4e
[ 1318.948851] Code: 48 8b 0d 35 00 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 02 00 0c 00 f7 d8 64 89 01 48
[ 1318.951249] RSP: 002b:00007ffeffa9c9f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
[ 1318.952119] RAX: ffffffffffffffda RBX: 00007f07532dc204 RCX: 00007f07531b5e4e
[ 1318.952988] RDX: 0000560357a97430 RSI: 0000560357a96230 RDI: 0000560357a93500
[ 1318.953867] RBP: 0000560357a932f0 R08: 0000000000000000 R09: 0000560357a990e0
[ 1318.954757] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[ 1318.955638] R13: 0000560357a93500 R14: 0000560357a97430 R15: 0000560357a932f0
[ 1318.956540] Modules linked in: btrfs(O) xor zstd_decompress zstd_compress raid6_pq loop mousedev nls_iso8859_1 nls_cp437 vfat fat iTCO_wdt crct10dif_pclmul iTCO_vendor_support snd_hda_codec_generic crc32_pclmul crc32c_intel ghash_clmulni_intel snd_hda_intel snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core snd_pcm input_leds snd_timer led_class psmouse pcspkr aesni_intel i2c_i801 snd intel_agp glue_helper crypto_simd cryptd soundcore lpc_ich intel_gtt rtc_cmos qemu_fw_cfg agpgart evdev mac_hid ip_tables x_tables xfs sr_mod cdrom sd_mod dm_mod virtio_scsi virtio_balloon virtio_rng virtio_blk virtio_console rng_core virtio_net net_failover failover ahci serio_raw libahci atkbd libps2 libata scsi_mod virtio_pci virtio_ring virtio i8042 serio [last unloaded: btrfs]
[ 1318.965122] ---[ end trace 51f0adac8fc1fe76 ]---
======================================================================

Acutally, there are two devices in the fs. Device 2 with
FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
fs_devices but failed to be added into since fs_devices->opened (
the thread is doing mount device 1). But device 1's fsid was copied
to fs_devices->fsid then the assertion failed.

The solution is that only if a new device was added into a existing
fs_device, then the fs_devices->fsid is allowed to be rewritten.

Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario during fsid change")
Signed-off-by: Su Yue <Damenly_Su@gmx.com>
---
 fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

Comments

Nikolay Borisov Dec. 12, 2019, 2:15 p.m. UTC | #1
On 12.12.19 г. 13:01 ч., damenly.su@gmail.com wrote:

<snip>

> Acutally, there are two devices in the fs. Device 2 with
> FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
> fs_devices but failed to be added into since fs_devices->opened (

It's not clear why device 1 wasn't able to be added to the fs_devices
allocated by dev 2. Please elaborate?


> the thread is doing mount device 1). But device 1's fsid was copied
> to fs_devices->fsid then the assertion failed.


dev 1 fsid should be copied iff its transid is newer.

> 
> The solution is that only if a new device was added into a existing
> fs_device, then the fs_devices->fsid is allowed to be rewritten.

fs_devices->fsid must be re-written by any device which is _newer_ w.r.t
to the transid.

> 
> Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario during fsid change")
> Signed-off-by: Su Yue <Damenly_Su@gmx.com>
> ---
>  fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
>  1 file changed, 21 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index d8e5560db285..9efa4123c335 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -732,6 +732,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>  		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
>  	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
>  					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
> +	bool fs_devices_found = false;
> +
> +	*new_device_added = false;
>  
>  	if (fsid_change_in_progress) {
>  		if (!has_metadata_uuid) {
> @@ -772,24 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>  
>  		device = NULL;
>  	} else {
> +		fs_devices_found = true;
> +
>  		mutex_lock(&fs_devices->device_list_mutex);
>  		device = btrfs_find_device(fs_devices, devid,
>  				disk_super->dev_item.uuid, NULL, false);
> -
> -		/*
> -		 * If this disk has been pulled into an fs devices created by
> -		 * a device which had the CHANGING_FSID_V2 flag then replace the
> -		 * metadata_uuid/fsid values of the fs_devices.
> -		 */
> -		if (has_metadata_uuid && fs_devices->fsid_change &&
> -		    found_transid > fs_devices->latest_generation) {
> -			memcpy(fs_devices->fsid, disk_super->fsid,
> -					BTRFS_FSID_SIZE);
> -			memcpy(fs_devices->metadata_uuid,
> -					disk_super->metadata_uuid, BTRFS_FSID_SIZE);
> -
> -			fs_devices->fsid_change = false;
> -		}
>  	}
>  
>  	if (!device) {
> @@ -912,6 +902,22 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>  		}
>  	}
>  
> +	/*
> +	 * If the new added disk has been pulled into an fs devices created by
> +	 * a device which had the CHANGING_FSID_V2 flag then replace the
> +	 * metadata_uuid/fsid values of the fs_devices.
> +	 */
> +	if (*new_device_added && fs_devices_found &&
> +	    has_metadata_uuid && fs_devices->fsid_change &&
> +	    found_transid > fs_devices->latest_generation) {
> +		memcpy(fs_devices->fsid, disk_super->fsid,
> +		       BTRFS_FSID_SIZE);
> +		memcpy(fs_devices->metadata_uuid,
> +		       disk_super->metadata_uuid, BTRFS_FSID_SIZE);
> +
> +		fs_devices->fsid_change = false;
> +	}
> +
>  	/*
>  	 * Unmount does not free the btrfs_device struct but would zero
>  	 * generation along with most of the other members. So just update
>
Su Yue Dec. 13, 2019, 2:30 a.m. UTC | #2
On 2019/12/12 10:15 PM, Nikolay Borisov wrote:
>
>
> On 12.12.19 г. 13:01 ч., damenly.su@gmail.com wrote:
>
> <snip>
>
>> Acutally, there are two devices in the fs. Device 2 with
>> FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
>> fs_devices but failed to be added into since fs_devices->opened (
>  > It's not clear why device 1 wasn't able to be added to the fs_devices
> allocated by dev 2. Please elaborate?
>
Because fs_devices is opened.
For example.

$cat test.sh
====================================================================
img1="/tmp/test1.img"
img2="/tmp/test2.img"

[ -f "$img1" ] || fallocate -l 300M "$img1"
[ -f "$img2" ] || fallocate -l 300M "$img2"

mkfs.btrfs -f $img1 $img2 2>&1 >/dev/null|| exit 1
losetup -D

dmesg -C
rmmod btrfs
modprobe btrfs

loop1=$(losetup --find --show "$img1")
loop2=$(losetup --find --show "$img2")

mount $loop1 /mnt || exit 1
umount /mnt
====================================================================

$dmesg
====================================================================
[  395.205221] BTRFS: device fsid 5090db22-5e48-4767-8fb7-d037c619c1ee
devid 1 transid 5 /dev/loop0 scanned by systemd-udevd (13620)
[  395.210773] !!!!!!!!fs_device opened
[  395.213875] BTRFS info (device loop0): disk space caching is enabled
[  395.214994] BTRFS info (device loop0): has skinny extents
[  395.215891] BTRFS info (device loop0): flagging fs with big metadata
feature
[  395.222639] BTRFS error (device loop0): devid 2 uuid
adcc8454-695f-4e1d-bde8-94041b7bf761 is missing
[  395.224147] BTRFS error (device loop0): failed to read the system
array: -2
[  395.246163] !!!!!!!!fs_device opened
[  395.338219] BTRFS error (device loop0): open_ctree failed
=====================================================================

The line "!!!!!!!!fs_device opened" is handy added by me in debug purpose.

=====================================================================
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -794,6 +794,7 @@ static noinline struct btrfs_device
*device_list_add(const char *path,

         if (!device) {
                 if (fs_devices->opened) {
+                       pr_info("!!!!!!!!fs_device opened\n");
                         mutex_unlock(&fs_devices->device_list_mutex);
                         return ERR_PTR(-EBUSY);
                 }
=====================================================================

To make it more clear. The following is in metadata_uuid situation.
Device 1 is without FSID_CHANGING_V2 but has IMCOMPAT_METADATA_UUID.
(newer transid).

Device 2 is with FSID_CHANGING_V2 and IMCOMPAT_METADATA_UUID.(Older
transid).

The workflow in misc-tests/034 is

loop1=$(losetup --find --show "$device2")
loop2=$(losetup --find --show "$device1")

mount $loop1 /mnt ---> fails here

Assume the fs_devices was allocated by systemd-udevd through
btrfs_control_ioctl() path after finish of scanning of device2.

Then:

Thread *mounting device2*               Thread *scanning device1*


btrfs_mount_root			btrfs_control_ioctl

   mutex_lock(&uuid_mutex);

     btrfs_read_disk_super
     btrfs_scan_one_device
     --> there is only device2
	in the fs_devices

     btrfs_open_devices
       fs_devices->opened = 1
       fs_devices->latest_bdev = device2

   mutex_unlock(&uuid_mutex);
   -->Here, fs_devices->fsid is same
      as device2's fsid.
					mutex_lock(&uuid_mutex);
					btrfs_scan_one_device

					  btrfs_read_disk_super
					  device_list_add
					    found fs_devices
					      device = btrfs_find_device

					      rewrite fs_deivces->fsid

                                               if scanned device is newer
                                               --> Change fs_devices->fsi
                                                   d to device1->fsid

					    if (!device)
						if fs_devices->opened
						--> the device1 adding
						    aborts since
						    fs_devices
						    was opened
					mutex_unlock(&uuid_mutex);
   btrfs_fill_super
     open_ctree
        btrfs_read_dev_super(
        fs_devices->latest_bdev)
        --> the latest_bdev is device2

        assert fs_devices->fsid equals device2's fsid.
        --> fs_device->fsid was rewritten by the scanning thread

>
>> the thread is doing mount device 1). But device 1's fsid was copied
>> to fs_devices->fsid then the assertion failed.
>
>
> dev 1 fsid should be copied iff its transid is newer.
>
Even it was failed to be added into the fs_devices?
>>
>> The solution is that only if a new device was added into a existing
>> fs_device, then the fs_devices->fsid is allowed to be rewritten.
>
> fs_devices->fsid must be re-written by any device which is _newer_ w.r.t
> to the transid.

Then the assertion failed in above scenario. Just do not update the
fs_devices->fsid, let later btrfs_read_sys_array() report the device
missing then reject to mount.

Thanks
>
>>
>> Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario during fsid change")
>> Signed-off-by: Su Yue <Damenly_Su@gmx.com>
>> ---
>>   fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
>>   1 file changed, 21 insertions(+), 15 deletions(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index d8e5560db285..9efa4123c335 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -732,6 +732,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>>   		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
>>   	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
>>   					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
>> +	bool fs_devices_found = false;
>> +
>> +	*new_device_added = false;
>>
>>   	if (fsid_change_in_progress) {
>>   		if (!has_metadata_uuid) {
>> @@ -772,24 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>>
>>   		device = NULL;
>>   	} else {
>> +		fs_devices_found = true;
>> +
>>   		mutex_lock(&fs_devices->device_list_mutex);
>>   		device = btrfs_find_device(fs_devices, devid,
>>   				disk_super->dev_item.uuid, NULL, false);
>> -
>> -		/*
>> -		 * If this disk has been pulled into an fs devices created by
>> -		 * a device which had the CHANGING_FSID_V2 flag then replace the
>> -		 * metadata_uuid/fsid values of the fs_devices.
>> -		 */
>> -		if (has_metadata_uuid && fs_devices->fsid_change &&
>> -		    found_transid > fs_devices->latest_generation) {
>> -			memcpy(fs_devices->fsid, disk_super->fsid,
>> -					BTRFS_FSID_SIZE);
>> -			memcpy(fs_devices->metadata_uuid,
>> -					disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>> -
>> -			fs_devices->fsid_change = false;
>> -		}
>>   	}
>>
>>   	if (!device) {
>> @@ -912,6 +902,22 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>>   		}
>>   	}
>>
>> +	/*
>> +	 * If the new added disk has been pulled into an fs devices created by
>> +	 * a device which had the CHANGING_FSID_V2 flag then replace the
>> +	 * metadata_uuid/fsid values of the fs_devices.
>> +	 */
>> +	if (*new_device_added && fs_devices_found &&
>> +	    has_metadata_uuid && fs_devices->fsid_change &&
>> +	    found_transid > fs_devices->latest_generation) {
>> +		memcpy(fs_devices->fsid, disk_super->fsid,
>> +		       BTRFS_FSID_SIZE);
>> +		memcpy(fs_devices->metadata_uuid,
>> +		       disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>> +
>> +		fs_devices->fsid_change = false;
>> +	}
>> +
>>   	/*
>>   	 * Unmount does not free the btrfs_device struct but would zero
>>   	 * generation along with most of the other members. So just update
>>
Su Yue Dec. 13, 2019, 2:46 a.m. UTC | #3
On 2019/12/12 10:15 PM, Nikolay Borisov wrote:
>
>
> On 12.12.19 г. 13:01 ч., damenly.su@gmail.com wrote:
>
> <snip>
>
>> Acutally, there are two devices in the fs. Device 2 with
>> FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
>> fs_devices but failed to be added into since fs_devices->opened (
>
> It's not clear why device 1 wasn't able to be added to the fs_devices
> allocated by dev 2. Please elaborate?
>
>
Sure, of course.

For example.

$cat test.sh
====================================================================
img1="/tmp/test1.img"
img2="/tmp/test2.img"

[ -f "$img1" ] || fallocate -l 300M "$img1"
[ -f "$img2" ] || fallocate -l 300M "$img2"

mkfs.btrfs -f $img1 $img2 2>&1 >/dev/null|| exit 1
losetup -D

dmesg -C
rmmod btrfs
modprobe btrfs

loop1=$(losetup --find --show "$img1")
loop2=$(losetup --find --show "$img2")

mount $loop1 /mnt || exit 1
umount /mnt
====================================================================

$dmesg
====================================================================
[  395.205221] BTRFS: device fsid 5090db22-5e48-4767-8fb7-d037c619c1ee
devid 1 transid 5 /dev/loop0 scanned by systemd-udevd (13620)
[  395.210773] !!!!!!!!fs_device opened
[  395.213875] BTRFS info (device loop0): disk space caching is enabled
[  395.214994] BTRFS info (device loop0): has skinny extents
[  395.215891] BTRFS info (device loop0): flagging fs with big metadata
feature
[  395.222639] BTRFS error (device loop0): devid 2 uuid
adcc8454-695f-4e1d-bde8-94041b7bf761 is missing
[  395.224147] BTRFS error (device loop0): failed to read the system
array: -2
[  395.246163] !!!!!!!!fs_device opened
[  395.338219] BTRFS error (device loop0): open_ctree failed
=====================================================================

The line "!!!!!!!!fs_device opened" is handy added by me in debug purpose.

=====================================================================
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -794,6 +794,7 @@ static noinline struct btrfs_device
*device_list_add(const char *path,

         if (!device) {
                 if (fs_devices->opened) {
+                       pr_info("!!!!!!!!fs_device opened\n");
                         mutex_unlock(&fs_devices->device_list_mutex);
                         return ERR_PTR(-EBUSY);
                 }
=====================================================================

To make it more clear. The following is in metadata_uuid situation.
Device 1 is without FSID_CHANGING_V2 but has IMCOMPAT_METADATA_UUID.
(newer transid).

Device 2 is with FSID_CHANGING_V2 and IMCOMPAT_METADATA_UUID.(Older
transid).

The workflow in misc-tests/034 is

loop1=$(losetup --find --show "$device2")
loop2=$(losetup --find --show "$device1")

mount $loop1 /mnt ---> fails here

Assume the fs_devices was allocated by systemd-udevd through
btrfs_control_ioctl() path after finish of scanning of device2.

Then:

Thread *mounting device2*            Thread *scanning device1*


btrfs_mount_root                     btrfs_control_ioctl

   mutex_lock(&uuid_mutex);

     btrfs_read_disk_super
     btrfs_scan_one_device
     --> there is only device2
     in the fs_devices

     btrfs_open_devices
       fs_devices->opened = 1
       fs_devices->latest_bdev = device2

     mutex_unlock(&uuid_mutex);

                                       mutex_lock(&uuid_mutex);
                                       btrfs_scan_one_device
                                         btrfs_read_disk_super

                                         device_list_add
                                           found fs_devices
                                             device = btrfs_find_device

                                             rewrite fs_deivces->fsid if
                                             scanned device1 is newer
                                              --> Change fs_devices->fsi
                                                   d to device1->fsid

                                           if (!device)
                                              if(fs_devices->opened)
						 return -EBUSY
                                              --> the device1 adding
                                                  aborts since
                                                  fs_devices was opened
                                       mutex_unlock(&uuid_mutex);
   btrfs_fill_super
     open_ctree
        btrfs_read_dev_super(
        fs_devices->latest_bdev)
        --> the latest_bdev is device2

        assert fs_devices->fsid equals
        device2's fsid.
        --> fs_device->fsid was rewritten by
            the scanning thread

The result is fs_device->fsid is from device1 but super->fsid is from
the lastest device2.

>> the thread is doing mount device 1). But device 1's fsid was copied
>> to fs_devices->fsid then the assertion failed.
>
>
> dev 1 fsid should be copied iff its transid is newer.
>

Even it was failed to be added into the fs_devices?

>>
>> The solution is that only if a new device was added into a existing
>> fs_device, then the fs_devices->fsid is allowed to be rewritten.
>
> fs_devices->fsid must be re-written by any device which is _newer_ w.r.t
> to the transid.
>

Then the assertion failed in above scenario. Just do not update the
fs_devices->fsid, let later btrfs_read_sys_array() report the device
missing then reject to mount.

Thanks

>>
>> Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario during fsid change")
>> Signed-off-by: Su Yue <Damenly_Su@gmx.com>
>> ---
>>   fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
>>   1 file changed, 21 insertions(+), 15 deletions(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index d8e5560db285..9efa4123c335 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -732,6 +732,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>>   		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
>>   	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
>>   					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
>> +	bool fs_devices_found = false;
>> +
>> +	*new_device_added = false;
>>
>>   	if (fsid_change_in_progress) {
>>   		if (!has_metadata_uuid) {
>> @@ -772,24 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>>
>>   		device = NULL;
>>   	} else {
>> +		fs_devices_found = true;
>> +
>>   		mutex_lock(&fs_devices->device_list_mutex);
>>   		device = btrfs_find_device(fs_devices, devid,
>>   				disk_super->dev_item.uuid, NULL, false);
>> -
>> -		/*
>> -		 * If this disk has been pulled into an fs devices created by
>> -		 * a device which had the CHANGING_FSID_V2 flag then replace the
>> -		 * metadata_uuid/fsid values of the fs_devices.
>> -		 */
>> -		if (has_metadata_uuid && fs_devices->fsid_change &&
>> -		    found_transid > fs_devices->latest_generation) {
>> -			memcpy(fs_devices->fsid, disk_super->fsid,
>> -					BTRFS_FSID_SIZE);
>> -			memcpy(fs_devices->metadata_uuid,
>> -					disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>> -
>> -			fs_devices->fsid_change = false;
>> -		}
>>   	}
>>
>>   	if (!device) {
>> @@ -912,6 +902,22 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>>   		}
>>   	}
>>
>> +	/*
>> +	 * If the new added disk has been pulled into an fs devices created by
>> +	 * a device which had the CHANGING_FSID_V2 flag then replace the
>> +	 * metadata_uuid/fsid values of the fs_devices.
>> +	 */
>> +	if (*new_device_added && fs_devices_found &&
>> +	    has_metadata_uuid && fs_devices->fsid_change &&
>> +	    found_transid > fs_devices->latest_generation) {
>> +		memcpy(fs_devices->fsid, disk_super->fsid,
>> +		       BTRFS_FSID_SIZE);
>> +		memcpy(fs_devices->metadata_uuid,
>> +		       disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>> +
>> +		fs_devices->fsid_change = false;
>> +	}
>> +
>>   	/*
>>   	 * Unmount does not free the btrfs_device struct but would zero
>>   	 * generation along with most of the other members. So just update
>>
Anand Jain Dec. 13, 2019, 5:36 a.m. UTC | #4
metadata_uuid code is too confusing, a lot of if and if-nots
  it should be have been better.

  more below.

On 13/12/19 10:46 AM, Su Yue wrote:
> On 2019/12/12 10:15 PM, Nikolay Borisov wrote:
>>
>>
>> On 12.12.19 г. 13:01 ч., damenly.su@gmail.com wrote:
>>
>> <snip>
>>
>>> Acutally, there are two devices in the fs. Device 2 with
>>> FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
>>> fs_devices but failed to be added into since fs_devices->opened (
>>
>> It's not clear why device 1 wasn't able to be added to the fs_devices
>> allocated by dev 2. Please elaborate?
>>
>>
> Sure, of course.
> 
> For example.
> 
> $cat test.sh
> ====================================================================
> img1="/tmp/test1.img"
> img2="/tmp/test2.img"
> 
> [ -f "$img1" ] || fallocate -l 300M "$img1"
> [ -f "$img2" ] || fallocate -l 300M "$img2"
> 
> mkfs.btrfs -f $img1 $img2 2>&1 >/dev/null|| exit 1
> losetup -D
> 
> dmesg -C
> rmmod btrfs
> modprobe btrfs
> 
> loop1=$(losetup --find --show "$img1")
> loop2=$(losetup --find --show "$img2")

  Can you explicitly show what devices should be scanned to make the
  device mount (below) successful. Fist you can cleanup the
  device list using

    btrfs device --forget

> mount $loop1 /mnt || exit 1
> umount /mnt
> ====================================================================
> 
> $dmesg
> ====================================================================
> [  395.205221] BTRFS: device fsid 5090db22-5e48-4767-8fb7-d037c619c1ee
> devid 1 transid 5 /dev/loop0 scanned by systemd-udevd (13620)
> [  395.210773] !!!!!!!!fs_device opened
> [  395.213875] BTRFS info (device loop0): disk space caching is enabled
> [  395.214994] BTRFS info (device loop0): has skinny extents
> [  395.215891] BTRFS info (device loop0): flagging fs with big metadata
> feature
> [  395.222639] BTRFS error (device loop0): devid 2 uuid
> adcc8454-695f-4e1d-bde8-94041b7bf761 is missing
> [  395.224147] BTRFS error (device loop0): failed to read the system
> array: -2
> [  395.246163] !!!!!!!!fs_device opened
> [  395.338219] BTRFS error (device loop0): open_ctree failed
> =====================================================================
> 
> The line "!!!!!!!!fs_device opened" is handy added by me in debug purpose.
> 
> =====================================================================
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -794,6 +794,7 @@ static noinline struct btrfs_device
> *device_list_add(const char *path,
> 
>          if (!device) {
>                  if (fs_devices->opened) {
> +                       pr_info("!!!!!!!!fs_device opened\n");
>                          mutex_unlock(&fs_devices->device_list_mutex);
>                          return ERR_PTR(-EBUSY);
>                  }
> =====================================================================
> 
> To make it more clear. The following is in metadata_uuid situation.
> Device 1 is without FSID_CHANGING_V2 but has IMCOMPAT_METADATA_UUID.
> (newer transid).
> 
> Device 2 is with FSID_CHANGING_V2 and IMCOMPAT_METADATA_UUID.(Older
> transid).

How were you able to set both BTRFS_SUPER_FLAG_CHANGING_FSID_V2
and BTRFS_FEATURE_INCOMPAT_METADATA_UUID on only devid 2 ?


> The workflow in misc-tests/034 is
> 
> loop1=$(losetup --find --show "$device2")
> loop2=$(losetup --find --show "$device1")
> 
> mount $loop1 /mnt ---> fails here
> 
> Assume the fs_devices was allocated by systemd-udevd through
> btrfs_control_ioctl() path after finish of scanning of device2.
> 
> Then:
> 

In the two threads which are in race (below), the mount thread can't be 
successful unless -o degraded is used, if it does it means the devid 1 
is already scanned and for that btrfs_device to be in the
btrfs_fs_devices list the fsid has to match (does not matter metadata_uuid).

> Thread *mounting device2*            Thread *scanning device1*
> 
> 
> btrfs_mount_root                     btrfs_control_ioctl
> 
>    mutex_lock(&uuid_mutex);
> 
>      btrfs_read_disk_super
>      btrfs_scan_one_device
>      --> there is only device2
>      in the fs_devices
> 
>      btrfs_open_devices
>        fs_devices->opened = 1
>        fs_devices->latest_bdev = device2
> 
>      mutex_unlock(&uuid_mutex);
> 
>                                        mutex_lock(&uuid_mutex);
>                                        btrfs_scan_one_device
>                                          btrfs_read_disk_super
> 
>                                          device_list_add
>                                            found fs_devices
>                                              device = btrfs_find_device
> 
>                                              rewrite fs_deivces->fsid if
>                                              scanned device1 is newer
>                                               --> Change fs_devices->fsi
>                                                    d to device1->fsid
> 
>                                            if (!device)
>                                               if(fs_devices->opened)
>                           return -EBUSY
>                                               --> the device1 adding
>                                                   aborts since
>                                                   fs_devices was opened
>                                        mutex_unlock(&uuid_mutex);
>    btrfs_fill_super
>      open_ctree
>         btrfs_read_dev_super(
>         fs_devices->latest_bdev)
>         --> the latest_bdev is device2
> 
>         assert fs_devices->fsid equals
>         device2's fsid.
>         --> fs_device->fsid was rewritten by
>             the scanning thread
> 
> The result is fs_device->fsid is from device1 but super->fsid is from
> the lastest device2.
> 

  Oops that's not good. However still not able to image various devices
  and its fsid to achieve that condition. Is it possible to write a test
  case? It would help.

Thanks, Anand

>>> the thread is doing mount device 1). But device 1's fsid was copied
>>> to fs_devices->fsid then the assertion failed.
>>
>>
>> dev 1 fsid should be copied iff its transid is newer.
>>
> 
> Even it was failed to be added into the fs_devices?
> 
>>>
>>> The solution is that only if a new device was added into a existing
>>> fs_device, then the fs_devices->fsid is allowed to be rewritten.
>>
>> fs_devices->fsid must be re-written by any device which is _newer_ w.r.t
>> to the transid.
>>
> 
> Then the assertion failed in above scenario. Just do not update the
> fs_devices->fsid, let later btrfs_read_sys_array() report the device
> missing then reject to mount.
> 
> Thanks
> 
>>>
>>> Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario 
>>> during fsid change")
>>> Signed-off-by: Su Yue <Damenly_Su@gmx.com>
>>> ---
>>>   fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
>>>   1 file changed, 21 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index d8e5560db285..9efa4123c335 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -732,6 +732,9 @@ static noinline struct btrfs_device 
>>> *device_list_add(const char *path,
>>>           BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
>>>       bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
>>>                       BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
>>> +    bool fs_devices_found = false;
>>> +
>>> +    *new_device_added = false;
>>>
>>>       if (fsid_change_in_progress) {
>>>           if (!has_metadata_uuid) {
>>> @@ -772,24 +775,11 @@ static noinline struct btrfs_device 
>>> *device_list_add(const char *path,
>>>
>>>           device = NULL;
>>>       } else {
>>> +        fs_devices_found = true;
>>> +
>>>           mutex_lock(&fs_devices->device_list_mutex);
>>>           device = btrfs_find_device(fs_devices, devid,
>>>                   disk_super->dev_item.uuid, NULL, false);
>>> -
>>> -        /*
>>> -         * If this disk has been pulled into an fs devices created by
>>> -         * a device which had the CHANGING_FSID_V2 flag then replace 
>>> the
>>> -         * metadata_uuid/fsid values of the fs_devices.
>>> -         */
>>> -        if (has_metadata_uuid && fs_devices->fsid_change &&
>>> -            found_transid > fs_devices->latest_generation) {
>>> -            memcpy(fs_devices->fsid, disk_super->fsid,
>>> -                    BTRFS_FSID_SIZE);
>>> -            memcpy(fs_devices->metadata_uuid,
>>> -                    disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>> -
>>> -            fs_devices->fsid_change = false;
>>> -        }
>>>       }
>>>
>>>       if (!device) {
>>> @@ -912,6 +902,22 @@ static noinline struct btrfs_device 
>>> *device_list_add(const char *path,
>>>           }
>>>       }
>>>
>>> +    /*
>>> +     * If the new added disk has been pulled into an fs devices 
>>> created by
>>> +     * a device which had the CHANGING_FSID_V2 flag then replace the
>>> +     * metadata_uuid/fsid values of the fs_devices.
>>> +     */
>>> +    if (*new_device_added && fs_devices_found &&
>>> +        has_metadata_uuid && fs_devices->fsid_change &&
>>> +        found_transid > fs_devices->latest_generation) {
>>> +        memcpy(fs_devices->fsid, disk_super->fsid,
>>> +               BTRFS_FSID_SIZE);
>>> +        memcpy(fs_devices->metadata_uuid,
>>> +               disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>> +
>>> +        fs_devices->fsid_change = false;
>>> +    }
>>> +
>>>       /*
>>>        * Unmount does not free the btrfs_device struct but would zero
>>>        * generation along with most of the other members. So just update
>>>
>
Su Yue Dec. 13, 2019, 7:15 a.m. UTC | #5
On 2019/12/13 1:36 PM, Anand Jain wrote:
> 
> 
>   metadata_uuid code is too confusing, a lot of if and if-nots
>   it should be have been better.
> 

Agreed. It costed much brain power of mine.

>   more below.
> 

Willing to answer from my understanding.
If something wrong, please point at.

> On 13/12/19 10:46 AM, Su Yue wrote:
>> On 2019/12/12 10:15 PM, Nikolay Borisov wrote:
>>>
>>>
>>> On 12.12.19 г. 13:01 ч., damenly.su@gmail.com wrote:
>>>
>>> <snip>
>>>
>>>> Acutally, there are two devices in the fs. Device 2 with
>>>> FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
>>>> fs_devices but failed to be added into since fs_devices->opened (
>>>
>>> It's not clear why device 1 wasn't able to be added to the fs_devices
>>> allocated by dev 2. Please elaborate?
>>>
>>>
>> Sure, of course.
>>
>> For example.
>>
>> $cat test.sh
>> ====================================================================
>> img1="/tmp/test1.img"
>> img2="/tmp/test2.img"
>>
>> [ -f "$img1" ] || fallocate -l 300M "$img1"
>> [ -f "$img2" ] || fallocate -l 300M "$img2"
>>
>> mkfs.btrfs -f $img1 $img2 2>&1 >/dev/null|| exit 1
>> losetup -D
>>
>> dmesg -C
>> rmmod btrfs
>> modprobe btrfs
>>
>> loop1=$(losetup --find --show "$img1")
>> loop2=$(losetup --find --show "$img2")
> 
>   Can you explicitly show what devices should be scanned to make the
>   device mount (below) successful. Fist you can cleanup the
>   device list using
> 
>     btrfs device --forget
> 

Thanks for the tip.
The purpose of simple script is to show that there
may be uncompleted/unsuccessful device(s) scanning due to
fs_devices->opened. Is the issue already known?

>> mount $loop1 /mnt || exit 1
>> umount /mnt
>> ====================================================================
>>
>> $dmesg
>> ====================================================================
>> [  395.205221] BTRFS: device fsid 5090db22-5e48-4767-8fb7-d037c619c1ee
>> devid 1 transid 5 /dev/loop0 scanned by systemd-udevd (13620)
>> [  395.210773] !!!!!!!!fs_device opened
>> [  395.213875] BTRFS info (device loop0): disk space caching is enabled
>> [  395.214994] BTRFS info (device loop0): has skinny extents
>> [  395.215891] BTRFS info (device loop0): flagging fs with big metadata
>> feature
>> [  395.222639] BTRFS error (device loop0): devid 2 uuid
>> adcc8454-695f-4e1d-bde8-94041b7bf761 is missing
>> [  395.224147] BTRFS error (device loop0): failed to read the system
>> array: -2
>> [  395.246163] !!!!!!!!fs_device opened
>> [  395.338219] BTRFS error (device loop0): open_ctree failed
>> =====================================================================
>>
>> The line "!!!!!!!!fs_device opened" is handy added by me in debug 
>> purpose.
>>
>> =====================================================================
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -794,6 +794,7 @@ static noinline struct btrfs_device
>> *device_list_add(const char *path,
>>
>>          if (!device) {
>>                  if (fs_devices->opened) {
>> +                       pr_info("!!!!!!!!fs_device opened\n");
>>                          mutex_unlock(&fs_devices->device_list_mutex);
>>                          return ERR_PTR(-EBUSY);
>>                  }
>> =====================================================================
>>
>> To make it more clear. The following is in metadata_uuid situation.
>> Device 1 is without FSID_CHANGING_V2 but has IMCOMPAT_METADATA_UUID.
>> (newer transid).
>>
>> Device 2 is with FSID_CHANGING_V2 and IMCOMPAT_METADATA_UUID.(Older
>> transid).
> 
> How were you able to set FSID_CHANGING_V2
> and BTRFS_FEATURE_INCOMPAT_METADATA_UUID on only devid 2 ?
> 
> 
The device2 is simulated to be the device failed to sync due
to some expected reason (power loss).

mkfs on two devices, use v5.4 progs/btrfstune -m $device.
Then both two devices both have the BTRFS_FEATURE_INCOMPAT_METADATA_UUID.
Play some tricks in btrfstune.c to avoid final super block
write on one deivce. Like the ugly code to delete the device:
========================================================================
diff --git a/btrfstune.c b/btrfstune.c
index afa3aae3..f678b978 100644
--- a/btrfstune.c
+++ b/btrfstune.c
@@ -101,12 +101,14 @@ static int set_metadata_uuid(struct btrfs_root 
*root, const char *uuid_string)
         struct btrfs_super_block *disk_super;
         uuid_t new_fsid, unused1, unused2;
         struct btrfs_trans_handle *trans;
+       struct btrfs_device *dev, *next;
         bool new_uuid = true;
         u64 incompat_flags;
         bool uuid_changed;
         u64 super_flags;
         int ret;

         disk_super = root->fs_info->super_copy;
         super_flags = btrfs_super_flags(disk_super);
         incompat_flags = btrfs_super_incompat_flags(disk_super);
@@ -170,6 +172,14 @@ static int set_metadata_uuid(struct btrfs_root 
*root, const char *uuid_string)
                 return 0;
         }

+       list_for_each_entry_safe(dev, next, 
&root->fs_info->fs_devices->devices,
+                                dev_list) {
+               if (dev->devid == 2) {
+                       fsync(dev->fd);
+                       list_del_init(&dev->dev_list);
+               }
+       }
+==================================================================


Compile again. call btrfstune -m again.
Then we get a device with
BTRFS_FEATURE_INCOMPAT_METADATA_UUID and FSID_CHANGING_V2.

>> The workflow in misc-tests/034 is
>>
>> loop1=$(losetup --find --show "$device2")
>> loop2=$(losetup --find --show "$device1")
>>
>> mount $loop1 /mnt ---> fails here
>>
>> Assume the fs_devices was allocated by systemd-udevd through
>> btrfs_control_ioctl() path after finish of scanning of device2.
>>
>> Then:
>>
> 
> In the two threads which are in race (below), the mount thread can't be 
> successful unless -o degraded is used, if it does it means the devid 1 

Right.. The dmesg reports the device 1 is missing.

> is already scanned and for that btrfs_device to be in the
> btrfs_fs_devices list the fsid has to match (does not matter 
> metadata_uuid).

Sorry, I doesn't make much clear what you mean. In similar but no 
metadata_uuid situation, mount will fail too but the assertion won't
fail of course. The device1 was scanned but not added into the
fs_devices(already found) since the fs_devices was opened by the
mounting thread.

> 
>> Thread *mounting device2*            Thread *scanning device1*
>>
>>
>> btrfs_mount_root                     btrfs_control_ioctl
>>
>>    mutex_lock(&uuid_mutex);
>>
>>      btrfs_read_disk_super
>>      btrfs_scan_one_device
>>      --> there is only device2
>>      in the fs_devices
>>
>>      btrfs_open_devices
>>        fs_devices->opened = 1
>>        fs_devices->latest_bdev = device2
>>
>>      mutex_unlock(&uuid_mutex);
>>
>>                                        mutex_lock(&uuid_mutex);
>>                                        btrfs_scan_one_device
>>                                          btrfs_read_disk_super
>>
>>                                          device_list_add
>>                                            found fs_devices
>>                                              device = btrfs_find_device
>>
>>                                              rewrite fs_deivces->fsid if
>>                                              scanned device1 is newer
>>                                               --> Change fs_devices->fsi
>>                                                    d to device1->fsid
>>
>>                                            if (!device)
>>                                               if(fs_devices->opened)
>>                           return -EBUSY
>>                                               --> the device1 adding
>>                                                   aborts since
>>                                                   fs_devices was opened
>>                                        mutex_unlock(&uuid_mutex);
>>    btrfs_fill_super
>>      open_ctree
>>         btrfs_read_dev_super(
>>         fs_devices->latest_bdev)
>>         --> the latest_bdev is device2
>>
>>         assert fs_devices->fsid equals
>>         device2's fsid.
>>         --> fs_device->fsid was rewritten by
>>             the scanning thread
>>
>> The result is fs_device->fsid is from device1 but super->fsid is from
>> the lastest device2.
>>
> 
>   Oops that's not good. However still not able to image various devices
>   and its fsid to achieve that condition. Is it possible to write a test
>   case? It would help.
> 
It did happened in my test environment.. You can try misc-tests/034 
about 20 times on v5.4 progs and v5.5-rc1 kernel. As for the test case,
will give a try.

Thanks
> Thanks, Anand
> 
>>>> the thread is doing mount device 1). But device 1's fsid was copied
>>>> to fs_devices->fsid then the assertion failed.
>>>
>>>
>>> dev 1 fsid should be copied iff its transid is newer.
>>>
>>
>> Even it was failed to be added into the fs_devices?
>>
>>>>
>>>> The solution is that only if a new device was added into a existing
>>>> fs_device, then the fs_devices->fsid is allowed to be rewritten.
>>>
>>> fs_devices->fsid must be re-written by any device which is _newer_ w.r.t
>>> to the transid.
>>>
>>
>> Then the assertion failed in above scenario. Just do not update the
>> fs_devices->fsid, let later btrfs_read_sys_array() report the device
>> missing then reject to mount.
>>
>> Thanks
>>
>>>>
>>>> Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario 
>>>> during fsid change")
>>>> Signed-off-by: Su Yue <Damenly_Su@gmx.com>
>>>> ---
>>>>   fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
>>>>   1 file changed, 21 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>>> index d8e5560db285..9efa4123c335 100644
>>>> --- a/fs/btrfs/volumes.c
>>>> +++ b/fs/btrfs/volumes.c
>>>> @@ -732,6 +732,9 @@ static noinline struct btrfs_device 
>>>> *device_list_add(const char *path,
>>>>           BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
>>>>       bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
>>>>                       BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
>>>> +    bool fs_devices_found = false;
>>>> +
>>>> +    *new_device_added = false;
>>>>
>>>>       if (fsid_change_in_progress) {
>>>>           if (!has_metadata_uuid) {
>>>> @@ -772,24 +775,11 @@ static noinline struct btrfs_device 
>>>> *device_list_add(const char *path,
>>>>
>>>>           device = NULL;
>>>>       } else {
>>>> +        fs_devices_found = true;
>>>> +
>>>>           mutex_lock(&fs_devices->device_list_mutex);
>>>>           device = btrfs_find_device(fs_devices, devid,
>>>>                   disk_super->dev_item.uuid, NULL, false);
>>>> -
>>>> -        /*
>>>> -         * If this disk has been pulled into an fs devices created by
>>>> -         * a device which had the CHANGING_FSID_V2 flag then 
>>>> replace the
>>>> -         * metadata_uuid/fsid values of the fs_devices.
>>>> -         */
>>>> -        if (has_metadata_uuid && fs_devices->fsid_change &&
>>>> -            found_transid > fs_devices->latest_generation) {
>>>> -            memcpy(fs_devices->fsid, disk_super->fsid,
>>>> -                    BTRFS_FSID_SIZE);
>>>> -            memcpy(fs_devices->metadata_uuid,
>>>> -                    disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>>> -
>>>> -            fs_devices->fsid_change = false;
>>>> -        }
>>>>       }
>>>>
>>>>       if (!device) {
>>>> @@ -912,6 +902,22 @@ static noinline struct btrfs_device 
>>>> *device_list_add(const char *path,
>>>>           }
>>>>       }
>>>>
>>>> +    /*
>>>> +     * If the new added disk has been pulled into an fs devices 
>>>> created by
>>>> +     * a device which had the CHANGING_FSID_V2 flag then replace the
>>>> +     * metadata_uuid/fsid values of the fs_devices.
>>>> +     */
>>>> +    if (*new_device_added && fs_devices_found &&
>>>> +        has_metadata_uuid && fs_devices->fsid_change &&
>>>> +        found_transid > fs_devices->latest_generation) {
>>>> +        memcpy(fs_devices->fsid, disk_super->fsid,
>>>> +               BTRFS_FSID_SIZE);
>>>> +        memcpy(fs_devices->metadata_uuid,
>>>> +               disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>>> +
>>>> +        fs_devices->fsid_change = false;
>>>> +    }
>>>> +
>>>>       /*
>>>>        * Unmount does not free the btrfs_device struct but would zero
>>>>        * generation along with most of the other members. So just 
>>>> update
>>>>
>>
>
Anand Jain Dec. 13, 2019, 8:51 a.m. UTC | #6
On 12/13/19 3:15 PM, Su Yue wrote:
> On 2019/12/13 1:36 PM, Anand Jain wrote:
>>
>>
>>   metadata_uuid code is too confusing, a lot of if and if-nots
>>   it should be have been better.
>>
> 
> Agreed. It costed much brain power of mine.
> 
>>   more below.
>>
> 
> Willing to answer from my understanding.
> If something wrong, please point at.
> 
>> On 13/12/19 10:46 AM, Su Yue wrote:
>>> On 2019/12/12 10:15 PM, Nikolay Borisov wrote:
>>>>
>>>>
>>>> On 12.12.19 г. 13:01 ч., damenly.su@gmail.com wrote:
>>>>
>>>> <snip>
>>>>
>>>>> Acutally, there are two devices in the fs. Device 2 with
>>>>> FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
>>>>> fs_devices but failed to be added into since fs_devices->opened (
>>>>
>>>> It's not clear why device 1 wasn't able to be added to the fs_devices
>>>> allocated by dev 2. Please elaborate?
>>>>
>>>>
>>> Sure, of course.
>>>
>>> For example.
>>>
>>> $cat test.sh
>>> ====================================================================
>>> img1="/tmp/test1.img"
>>> img2="/tmp/test2.img"
>>>
>>> [ -f "$img1" ] || fallocate -l 300M "$img1"
>>> [ -f "$img2" ] || fallocate -l 300M "$img2"
>>>
>>> mkfs.btrfs -f $img1 $img2 2>&1 >/dev/null|| exit 1
>>> losetup -D
>>>
>>> dmesg -C
>>> rmmod btrfs
>>> modprobe btrfs
>>>
>>> loop1=$(losetup --find --show "$img1")
>>> loop2=$(losetup --find --show "$img2")
>>
>>   Can you explicitly show what devices should be scanned to make the
>>   device mount (below) successful. Fist you can cleanup the
>>   device list using
>>
>>     btrfs device --forget
>>
> 
> Thanks for the tip.
> The purpose of simple script is to show that there
> may be uncompleted/unsuccessful device(s) scanning due to
> fs_devices->opened. Is the issue already known?


Do you mean at line 803.

-----------
  729 static noinline struct btrfs_device *device_list_add(const char *path,
  730                            struct btrfs_super_block *disk_super,
  731                            bool *new_device_added)
  732 {

::

  802         if (!device) {
  803                 if (fs_devices->opened) {
  804                       mutex_unlock(&fs_devices->device_list_mutex);
  805                       return ERR_PTR(-EBUSY);
  806                 }
------------

fs_devices->opened indicates mounted state of the device.

If there is a missing device, the %device will still be there,
we create a dummy %device with the dev_state set to
BTRFS_DEV_STATE_MISSING.

So its wrong if we encounter device == NULL for a given fsid
which is mounted. So by error and return we keep the mounted
fs safe and fail the hijacking attack.


>>> mount $loop1 /mnt || exit 1
>>> umount /mnt
>>> ====================================================================
>>>
>>> $dmesg
>>> ====================================================================
>>> [  395.205221] BTRFS: device fsid 5090db22-5e48-4767-8fb7-d037c619c1ee
>>> devid 1 transid 5 /dev/loop0 scanned by systemd-udevd (13620)
>>> [  395.210773] !!!!!!!!fs_device opened
>>> [  395.213875] BTRFS info (device loop0): disk space caching is enabled
>>> [  395.214994] BTRFS info (device loop0): has skinny extents
>>> [  395.215891] BTRFS info (device loop0): flagging fs with big metadata
>>> feature
>>> [  395.222639] BTRFS error (device loop0): devid 2 uuid
>>> adcc8454-695f-4e1d-bde8-94041b7bf761 is missing
>>> [  395.224147] BTRFS error (device loop0): failed to read the system
>>> array: -2
>>> [  395.246163] !!!!!!!!fs_device opened
>>> [  395.338219] BTRFS error (device loop0): open_ctree failed
>>> =====================================================================
>>>
>>> The line "!!!!!!!!fs_device opened" is handy added by me in debug 
>>> purpose.
>>>
>>> =====================================================================
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -794,6 +794,7 @@ static noinline struct btrfs_device
>>> *device_list_add(const char *path,
>>>
>>>          if (!device) {
>>>                  if (fs_devices->opened) {
>>> +                       pr_info("!!!!!!!!fs_device opened\n");
>>>                          mutex_unlock(&fs_devices->device_list_mutex);
>>>                          return ERR_PTR(-EBUSY);
>>>                  }
>>> =====================================================================
>>>
>>> To make it more clear. The following is in metadata_uuid situation.
>>> Device 1 is without FSID_CHANGING_V2 but has IMCOMPAT_METADATA_UUID.
>>> (newer transid).
>>>
>>> Device 2 is with FSID_CHANGING_V2 and IMCOMPAT_METADATA_UUID.(Older
>>> transid).
>>
>> How were you able to set FSID_CHANGING_V2

  Sorry typo, it should be BTRFS_SUPER_FLAG_CHANGING_FSID_V2.

>> and BTRFS_FEATURE_INCOMPAT_METADATA_UUID on only devid 2 ?
>>
>>
> The device2 is simulated to be the device failed to sync due
> to some expected reason (power loss).

  Ah. power loss before FSID_CHANGING_V2 is cleared. ok.

> mkfs on two devices, use v5.4 progs/btrfstune -m $device.
> Then both two devices both have the BTRFS_FEATURE_INCOMPAT_METADATA_UUID.
> Play some tricks in btrfstune.c to avoid final super block
> write on one deivce. Like the ugly code to delete the device:
> ========================================================================
> diff --git a/btrfstune.c b/btrfstune.c
> index afa3aae3..f678b978 100644
> --- a/btrfstune.c
> +++ b/btrfstune.c
> @@ -101,12 +101,14 @@ static int set_metadata_uuid(struct btrfs_root 
> *root, const char *uuid_string)
>          struct btrfs_super_block *disk_super;
>          uuid_t new_fsid, unused1, unused2;
>          struct btrfs_trans_handle *trans;
> +       struct btrfs_device *dev, *next;
>          bool new_uuid = true;
>          u64 incompat_flags;
>          bool uuid_changed;
>          u64 super_flags;
>          int ret;
> 
>          disk_super = root->fs_info->super_copy;
>          super_flags = btrfs_super_flags(disk_super);
>          incompat_flags = btrfs_super_incompat_flags(disk_super);
> @@ -170,6 +172,14 @@ static int set_metadata_uuid(struct btrfs_root 
> *root, const char *uuid_string)
>                  return 0;
>          }
> 
> +       list_for_each_entry_safe(dev, next, 
> &root->fs_info->fs_devices->devices,
> +                                dev_list) {
> +               if (dev->devid == 2) {
> +                       fsync(dev->fd);
> +                       list_del_init(&dev->dev_list);
> +               }
> +       }
> +==================================================================
> 

  Not like this. If you want to simulate failed write to the
  disk its better to do it with IO failing tools / mechanism outside
  of the btrfs-progs which probably can be a real fstests test case
  as well.

> Compile again. call btrfstune -m again.
> Then we get a device with
> BTRFS_FEATURE_INCOMPAT_METADATA_UUID and FSID_CHANGING_V2.
> 
>>> The workflow in misc-tests/034 is
>>>
>>> loop1=$(losetup --find --show "$device2")
>>> loop2=$(losetup --find --show "$device1")
>>>
>>> mount $loop1 /mnt ---> fails here
>>>
>>> Assume the fs_devices was allocated by systemd-udevd through
>>> btrfs_control_ioctl() path after finish of scanning of device2.
>>>
>>> Then:
>>>
>>
>> In the two threads which are in race (below), the mount thread can't 
>> be successful unless -o degraded is used, if it does it means the devid 1 
> 
> Right.. The dmesg reports the device 1 is missing.

  But then if you aren't using -o degraded then the mount shouldn't
  be successful. Are you using -o degraded?

> 
>> is already scanned and for that btrfs_device to be in the
>> btrfs_fs_devices list the fsid has to match (does not matter 
>> metadata_uuid).
> 
> Sorry, I doesn't make much clear what you mean. In similar but no 
> metadata_uuid situation, mount will fail too but the assertion won't
> fail of course. The device1 was scanned but not added into the
> fs_devices(already found) since the fs_devices was opened by the
> mounting thread.

  That's correct. But your test case with -o degraded and metadata_uuid
  is not clearly understood, is it possible to write a real fstests
  test case? you can use dmerror you will have control when to fail
  the IO.


>>
>>> Thread *mounting device2*            Thread *scanning device1*
>>>
>>>
>>> btrfs_mount_root                     btrfs_control_ioctl
>>>
>>>    mutex_lock(&uuid_mutex);
>>>
>>>      btrfs_read_disk_super
>>>      btrfs_scan_one_device
>>>      --> there is only device2
>>>      in the fs_devices
>>>
>>>      btrfs_open_devices
>>>        fs_devices->opened = 1
>>>        fs_devices->latest_bdev = device2
>>>
>>>      mutex_unlock(&uuid_mutex);
>>>
>>>                                        mutex_lock(&uuid_mutex);
>>>                                        btrfs_scan_one_device
>>>                                          btrfs_read_disk_super
>>>
>>>                                          device_list_add
>>>                                            found fs_devices
>>>                                              device = btrfs_find_device
>>>
>>>                                              rewrite fs_deivces->fsid if
>>>                                              scanned device1 is newer
>>>                                               --> Change fs_devices->fsi
>>>                                                    d to device1->fsid
>>>
>>>                                            if (!device)
>>>                                               if(fs_devices->opened)
>>>                           return -EBUSY
>>>                                               --> the device1 adding
>>>                                                   aborts since
>>>                                                   fs_devices was opened
>>>                                        mutex_unlock(&uuid_mutex);
>>>    btrfs_fill_super
>>>      open_ctree
>>>         btrfs_read_dev_super(
>>>         fs_devices->latest_bdev)
>>>         --> the latest_bdev is device2
>>>
>>>         assert fs_devices->fsid equals
>>>         device2's fsid.
>>>         --> fs_device->fsid was rewritten by
>>>             the scanning thread
>>>
>>> The result is fs_device->fsid is from device1 but super->fsid is from
>>> the lastest device2.
>>>
>>
>>   Oops that's not good. However still not able to image various devices
>>   and its fsid to achieve that condition. Is it possible to write a test
>>   case? It would help.
>>
> It did happened in my test environment.. You can try misc-tests/034 
> about 20 times on v5.4 progs and v5.5-rc1 kernel. As for the test case,
> will give a try.

Let me try.

Thanks, Anand


> Thanks
>> Thanks, Anand
>>
>>>>> the thread is doing mount device 1). But device 1's fsid was copied
>>>>> to fs_devices->fsid then the assertion failed.
>>>>
>>>>
>>>> dev 1 fsid should be copied iff its transid is newer.
>>>>
>>>
>>> Even it was failed to be added into the fs_devices?
>>>
>>>>>
>>>>> The solution is that only if a new device was added into a existing
>>>>> fs_device, then the fs_devices->fsid is allowed to be rewritten.
>>>>
>>>> fs_devices->fsid must be re-written by any device which is _newer_ 
>>>> w.r.t
>>>> to the transid.
>>>>
>>>
>>> Then the assertion failed in above scenario. Just do not update the
>>> fs_devices->fsid, let later btrfs_read_sys_array() report the device
>>> missing then reject to mount.
>>>
>>> Thanks
>>>
>>>>>
>>>>> Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario 
>>>>> during fsid change")
>>>>> Signed-off-by: Su Yue <Damenly_Su@gmx.com>
>>>>> ---
>>>>>   fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
>>>>>   1 file changed, 21 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>>>> index d8e5560db285..9efa4123c335 100644
>>>>> --- a/fs/btrfs/volumes.c
>>>>> +++ b/fs/btrfs/volumes.c
>>>>> @@ -732,6 +732,9 @@ static noinline struct btrfs_device 
>>>>> *device_list_add(const char *path,
>>>>>           BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
>>>>>       bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
>>>>>                       BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
>>>>> +    bool fs_devices_found = false;
>>>>> +
>>>>> +    *new_device_added = false;
>>>>>
>>>>>       if (fsid_change_in_progress) {
>>>>>           if (!has_metadata_uuid) {
>>>>> @@ -772,24 +775,11 @@ static noinline struct btrfs_device 
>>>>> *device_list_add(const char *path,
>>>>>
>>>>>           device = NULL;
>>>>>       } else {
>>>>> +        fs_devices_found = true;
>>>>> +
>>>>>           mutex_lock(&fs_devices->device_list_mutex);
>>>>>           device = btrfs_find_device(fs_devices, devid,
>>>>>                   disk_super->dev_item.uuid, NULL, false);
>>>>> -
>>>>> -        /*
>>>>> -         * If this disk has been pulled into an fs devices created by
>>>>> -         * a device which had the CHANGING_FSID_V2 flag then 
>>>>> replace the
>>>>> -         * metadata_uuid/fsid values of the fs_devices.
>>>>> -         */
>>>>> -        if (has_metadata_uuid && fs_devices->fsid_change &&
>>>>> -            found_transid > fs_devices->latest_generation) {
>>>>> -            memcpy(fs_devices->fsid, disk_super->fsid,
>>>>> -                    BTRFS_FSID_SIZE);
>>>>> -            memcpy(fs_devices->metadata_uuid,
>>>>> -                    disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>>>> -
>>>>> -            fs_devices->fsid_change = false;
>>>>> -        }
>>>>>       }
>>>>>
>>>>>       if (!device) {
>>>>> @@ -912,6 +902,22 @@ static noinline struct btrfs_device 
>>>>> *device_list_add(const char *path,
>>>>>           }
>>>>>       }
>>>>>
>>>>> +    /*
>>>>> +     * If the new added disk has been pulled into an fs devices 
>>>>> created by
>>>>> +     * a device which had the CHANGING_FSID_V2 flag then replace the
>>>>> +     * metadata_uuid/fsid values of the fs_devices.
>>>>> +     */
>>>>> +    if (*new_device_added && fs_devices_found &&
>>>>> +        has_metadata_uuid && fs_devices->fsid_change &&
>>>>> +        found_transid > fs_devices->latest_generation) {
>>>>> +        memcpy(fs_devices->fsid, disk_super->fsid,
>>>>> +               BTRFS_FSID_SIZE);
>>>>> +        memcpy(fs_devices->metadata_uuid,
>>>>> +               disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>>>> +
>>>>> +        fs_devices->fsid_change = false;
>>>>> +    }
>>>>> +
>>>>>       /*
>>>>>        * Unmount does not free the btrfs_device struct but would zero
>>>>>        * generation along with most of the other members. So just 
>>>>> update
>>>>>
>>>
>>
>
Su Yue Dec. 13, 2019, 10:10 a.m. UTC | #7
On 2019/12/13 4:51 PM, Anand Jain wrote:
> 
> 
> On 12/13/19 3:15 PM, Su Yue wrote:
>> On 2019/12/13 1:36 PM, Anand Jain wrote:
>>>
>>>
>>>   metadata_uuid code is too confusing, a lot of if and if-nots
>>>   it should be have been better.
>>>
>>
>> Agreed. It costed much brain power of mine.
>>
>>>   more below.
>>>
>>
>> Willing to answer from my understanding.
>> If something wrong, please point at.
>>
>>> On 13/12/19 10:46 AM, Su Yue wrote:
>>>> On 2019/12/12 10:15 PM, Nikolay Borisov wrote:
>>>>>
>>>>>
>>>>> On 12.12.19 г. 13:01 ч., damenly.su@gmail.com wrote:
>>>>>
>>>>> <snip>
>>>>>
>>>>>> Acutally, there are two devices in the fs. Device 2 with
>>>>>> FSID_CHANGING_V2 allocated a fs_devices. But, device 1 found the
>>>>>> fs_devices but failed to be added into since fs_devices->opened (
>>>>>
>>>>> It's not clear why device 1 wasn't able to be added to the fs_devices
>>>>> allocated by dev 2. Please elaborate?
>>>>>
>>>>>
>>>> Sure, of course.
>>>>
>>>> For example.
>>>>
>>>> $cat test.sh
>>>> ====================================================================
>>>> img1="/tmp/test1.img"
>>>> img2="/tmp/test2.img"
>>>>
>>>> [ -f "$img1" ] || fallocate -l 300M "$img1"
>>>> [ -f "$img2" ] || fallocate -l 300M "$img2"
>>>>
>>>> mkfs.btrfs -f $img1 $img2 2>&1 >/dev/null|| exit 1
>>>> losetup -D
>>>>
>>>> dmesg -C
>>>> rmmod btrfs
>>>> modprobe btrfs
>>>>
>>>> loop1=$(losetup --find --show "$img1")
>>>> loop2=$(losetup --find --show "$img2")
>>>
>>>   Can you explicitly show what devices should be scanned to make the
>>>   device mount (below) successful. Fist you can cleanup the
>>>   device list using
>>>
>>>     btrfs device --forget
>>>
>>
>> Thanks for the tip.
>> The purpose of simple script is to show that there
>> may be uncompleted/unsuccessful device(s) scanning due to
>> fs_devices->opened. Is the issue already known?
> 
> 
> Do you mean at line 803.
> 
> -----------
>   729 static noinline struct btrfs_device *device_list_add(const char 
> *path,
>   730                            struct btrfs_super_block *disk_super,
>   731                            bool *new_device_added)
>   732 {
> 
> ::
> 
>   802         if (!device) {
>   803                 if (fs_devices->opened) {
>   804                       mutex_unlock(&fs_devices->device_list_mutex);
>   805                       return ERR_PTR(-EBUSY);
>   806                 }
> ------------
> 
> fs_devices->opened indicates mounted state of the device.
> 
> If there is a missing device, the %device will still be there,
> we create a dummy %device with the dev_state set to
> BTRFS_DEV_STATE_MISSING.
> 
> So its wrong if we encounter device == NULL for a given fsid
> which is mounted. So by error and return we keep the mounted
> fs safe and fail the hijacking attack.
> 
> 
Thanks for your patient explantion. If I understand the
BTRFS_DEV_STATE_MISSING correctly, dummmy device with the state
is adding only if "-o degraded" in read_one_chunk()?

>>>> mount $loop1 /mnt || exit 1
>>>> umount /mnt
>>>> ====================================================================
>>>>
>>>> $dmesg
>>>> ====================================================================
>>>> [  395.205221] BTRFS: device fsid 5090db22-5e48-4767-8fb7-d037c619c1ee
>>>> devid 1 transid 5 /dev/loop0 scanned by systemd-udevd (13620)
>>>> [  395.210773] !!!!!!!!fs_device opened
>>>> [  395.213875] BTRFS info (device loop0): disk space caching is enabled
>>>> [  395.214994] BTRFS info (device loop0): has skinny extents
>>>> [  395.215891] BTRFS info (device loop0): flagging fs with big metadata
>>>> feature
>>>> [  395.222639] BTRFS error (device loop0): devid 2 uuid
>>>> adcc8454-695f-4e1d-bde8-94041b7bf761 is missing
>>>> [  395.224147] BTRFS error (device loop0): failed to read the system
>>>> array: -2
>>>> [  395.246163] !!!!!!!!fs_device opened
>>>> [  395.338219] BTRFS error (device loop0): open_ctree failed
>>>> =====================================================================
>>>>
>>>> The line "!!!!!!!!fs_device opened" is handy added by me in debug 
>>>> purpose.
>>>>
>>>> =====================================================================
>>>> --- a/fs/btrfs/volumes.c
>>>> +++ b/fs/btrfs/volumes.c
>>>> @@ -794,6 +794,7 @@ static noinline struct btrfs_device
>>>> *device_list_add(const char *path,
>>>>
>>>>          if (!device) {
>>>>                  if (fs_devices->opened) {
>>>> +                       pr_info("!!!!!!!!fs_device opened\n");
>>>>                          mutex_unlock(&fs_devices->device_list_mutex);
>>>>                          return ERR_PTR(-EBUSY);
>>>>                  }
>>>> =====================================================================
>>>>
>>>> To make it more clear. The following is in metadata_uuid situation.
>>>> Device 1 is without FSID_CHANGING_V2 but has IMCOMPAT_METADATA_UUID.
>>>> (newer transid).
>>>>
>>>> Device 2 is with FSID_CHANGING_V2 and IMCOMPAT_METADATA_UUID.(Older
>>>> transid).
>>>
>>> How were you able to set FSID_CHANGING_V2
> 
>   Sorry typo, it should be BTRFS_SUPER_FLAG_CHANGING_FSID_V2.
> 
>>> and BTRFS_FEATURE_INCOMPAT_METADATA_UUID on only devid 2 ?
>>>
>>>
>> The device2 is simulated to be the device failed to sync due
>> to some expected reason (power loss).
> 
>   Ah. power loss before FSID_CHANGING_V2 is cleared. ok.
> 
>> mkfs on two devices, use v5.4 progs/btrfstune -m $device.
>> Then both two devices both have the BTRFS_FEATURE_INCOMPAT_METADATA_UUID.
>> Play some tricks in btrfstune.c to avoid final super block
>> write on one deivce. Like the ugly code to delete the device:
>> ========================================================================
>> diff --git a/btrfstune.c b/btrfstune.c
>> index afa3aae3..f678b978 100644
>> --- a/btrfstune.c
>> +++ b/btrfstune.c
>> @@ -101,12 +101,14 @@ static int set_metadata_uuid(struct btrfs_root 
>> *root, const char *uuid_string)
>>          struct btrfs_super_block *disk_super;
>>          uuid_t new_fsid, unused1, unused2;
>>          struct btrfs_trans_handle *trans;
>> +       struct btrfs_device *dev, *next;
>>          bool new_uuid = true;
>>          u64 incompat_flags;
>>          bool uuid_changed;
>>          u64 super_flags;
>>          int ret;
>>
>>          disk_super = root->fs_info->super_copy;
>>          super_flags = btrfs_super_flags(disk_super);
>>          incompat_flags = btrfs_super_incompat_flags(disk_super);
>> @@ -170,6 +172,14 @@ static int set_metadata_uuid(struct btrfs_root 
>> *root, const char *uuid_string)
>>                  return 0;
>>          }
>>
>> +       list_for_each_entry_safe(dev, next, 
>> &root->fs_info->fs_devices->devices,
>> +                                dev_list) {
>> +               if (dev->devid == 2) {
>> +                       fsync(dev->fd);
>> +                       list_del_init(&dev->dev_list);
>> +               }
>> +       }
>> +==================================================================
>>
> 
>   Not like this. If you want to simulate failed write to the
>   disk its better to do it with IO failing tools / mechanism outside
>   of the btrfs-progs which probably can be a real fstests test case
>   as well.
> 
>> Compile again. call btrfstune -m again.
>> Then we get a device with
>> BTRFS_FEATURE_INCOMPAT_METADATA_UUID and FSID_CHANGING_V2.
>>
>>>> The workflow in misc-tests/034 is
>>>>
>>>> loop1=$(losetup --find --show "$device2")
>>>> loop2=$(losetup --find --show "$device1")
>>>>
>>>> mount $loop1 /mnt ---> fails here
>>>>
>>>> Assume the fs_devices was allocated by systemd-udevd through
>>>> btrfs_control_ioctl() path after finish of scanning of device2.
>>>>
>>>> Then:
>>>>
>>>
>>> In the two threads which are in race (below), the mount thread can't 
>>> be successful unless -o degraded is used, if it does it means the 
>>> devid 1 
>>
>> Right.. The dmesg reports the device 1 is missing.
> 
>   But then if you aren't using -o degraded then the mount shouldn't
>   be successful. Are you using -o degraded?
> 
No.. I know what you mean, if the race happened the mount operation 
should be expected to fail. And degraded mount solves the problem.

It makes me quite confused whether the workflow is correct.
Should I alwats add the "-o degraded" after the two 'losteup'?

>>
>>> is already scanned and for that btrfs_device to be in the
>>> btrfs_fs_devices list the fsid has to match (does not matter 
>>> metadata_uuid).
>>
>> Sorry, I doesn't make much clear what you mean. In similar but no 
>> metadata_uuid situation, mount will fail too but the assertion won't
>> fail of course. The device1 was scanned but not added into the
>> fs_devices(already found) since the fs_devices was opened by the
>> mounting thread.
> 
>   That's correct. But your test case with -o degraded and metadata_uuid
>   is not clearly understood, is it possible to write a real fstests
>   test case? you can use dmerror you will have control when to fail
>   the IO.
> 
Of course, I will do.

Thanks

> 
>>>
>>>> Thread *mounting device2*            Thread *scanning device1*
>>>>
>>>>
>>>> btrfs_mount_root                     btrfs_control_ioctl
>>>>
>>>>    mutex_lock(&uuid_mutex);
>>>>
>>>>      btrfs_read_disk_super
>>>>      btrfs_scan_one_device
>>>>      --> there is only device2
>>>>      in the fs_devices
>>>>
>>>>      btrfs_open_devices
>>>>        fs_devices->opened = 1
>>>>        fs_devices->latest_bdev = device2
>>>>
>>>>      mutex_unlock(&uuid_mutex);
>>>>
>>>>                                        mutex_lock(&uuid_mutex);
>>>>                                        btrfs_scan_one_device
>>>>                                          btrfs_read_disk_super
>>>>
>>>>                                          device_list_add
>>>>                                            found fs_devices
>>>>                                              device = btrfs_find_device
>>>>
>>>>                                              rewrite 
>>>> fs_deivces->fsid if
>>>>                                              scanned device1 is newer
>>>>                                               --> Change 
>>>> fs_devices->fsi
>>>>                                                    d to device1->fsid
>>>>
>>>>                                            if (!device)
>>>>                                               if(fs_devices->opened)
>>>>                           return -EBUSY
>>>>                                               --> the device1 adding
>>>>                                                   aborts since
>>>>                                                   fs_devices was opened
>>>>                                        mutex_unlock(&uuid_mutex);
>>>>    btrfs_fill_super
>>>>      open_ctree
>>>>         btrfs_read_dev_super(
>>>>         fs_devices->latest_bdev)
>>>>         --> the latest_bdev is device2
>>>>
>>>>         assert fs_devices->fsid equals
>>>>         device2's fsid.
>>>>         --> fs_device->fsid was rewritten by
>>>>             the scanning thread
>>>>
>>>> The result is fs_device->fsid is from device1 but super->fsid is from
>>>> the lastest device2.
>>>>
>>>
>>>   Oops that's not good. However still not able to image various devices
>>>   and its fsid to achieve that condition. Is it possible to write a test
>>>   case? It would help.
>>>
>> It did happened in my test environment.. You can try misc-tests/034 
>> about 20 times on v5.4 progs and v5.5-rc1 kernel. As for the test case,
>> will give a try.
> 
> Let me try.
> 
> Thanks, Anand
> 
> 
>> Thanks
>>> Thanks, Anand
>>>
>>>>>> the thread is doing mount device 1). But device 1's fsid was copied
>>>>>> to fs_devices->fsid then the assertion failed.
>>>>>
>>>>>
>>>>> dev 1 fsid should be copied iff its transid is newer.
>>>>>
>>>>
>>>> Even it was failed to be added into the fs_devices?
>>>>
>>>>>>
>>>>>> The solution is that only if a new device was added into a existing
>>>>>> fs_device, then the fs_devices->fsid is allowed to be rewritten.
>>>>>
>>>>> fs_devices->fsid must be re-written by any device which is _newer_ 
>>>>> w.r.t
>>>>> to the transid.
>>>>>
>>>>
>>>> Then the assertion failed in above scenario. Just do not update the
>>>> fs_devices->fsid, let later btrfs_read_sys_array() report the device
>>>> missing then reject to mount.
>>>>
>>>> Thanks
>>>>
>>>>>>
>>>>>> Fixes: 7a62d0f07377 ("btrfs: Handle one more split-brain scenario 
>>>>>> during fsid change")
>>>>>> Signed-off-by: Su Yue <Damenly_Su@gmx.com>
>>>>>> ---
>>>>>>   fs/btrfs/volumes.c | 36 +++++++++++++++++++++---------------
>>>>>>   1 file changed, 21 insertions(+), 15 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>>>>> index d8e5560db285..9efa4123c335 100644
>>>>>> --- a/fs/btrfs/volumes.c
>>>>>> +++ b/fs/btrfs/volumes.c
>>>>>> @@ -732,6 +732,9 @@ static noinline struct btrfs_device 
>>>>>> *device_list_add(const char *path,
>>>>>>           BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
>>>>>>       bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
>>>>>>                       BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
>>>>>> +    bool fs_devices_found = false;
>>>>>> +
>>>>>> +    *new_device_added = false;
>>>>>>
>>>>>>       if (fsid_change_in_progress) {
>>>>>>           if (!has_metadata_uuid) {
>>>>>> @@ -772,24 +775,11 @@ static noinline struct btrfs_device 
>>>>>> *device_list_add(const char *path,
>>>>>>
>>>>>>           device = NULL;
>>>>>>       } else {
>>>>>> +        fs_devices_found = true;
>>>>>> +
>>>>>>           mutex_lock(&fs_devices->device_list_mutex);
>>>>>>           device = btrfs_find_device(fs_devices, devid,
>>>>>>                   disk_super->dev_item.uuid, NULL, false);
>>>>>> -
>>>>>> -        /*
>>>>>> -         * If this disk has been pulled into an fs devices 
>>>>>> created by
>>>>>> -         * a device which had the CHANGING_FSID_V2 flag then 
>>>>>> replace the
>>>>>> -         * metadata_uuid/fsid values of the fs_devices.
>>>>>> -         */
>>>>>> -        if (has_metadata_uuid && fs_devices->fsid_change &&
>>>>>> -            found_transid > fs_devices->latest_generation) {
>>>>>> -            memcpy(fs_devices->fsid, disk_super->fsid,
>>>>>> -                    BTRFS_FSID_SIZE);
>>>>>> -            memcpy(fs_devices->metadata_uuid,
>>>>>> -                    disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>>>>> -
>>>>>> -            fs_devices->fsid_change = false;
>>>>>> -        }
>>>>>>       }
>>>>>>
>>>>>>       if (!device) {
>>>>>> @@ -912,6 +902,22 @@ static noinline struct btrfs_device 
>>>>>> *device_list_add(const char *path,
>>>>>>           }
>>>>>>       }
>>>>>>
>>>>>> +    /*
>>>>>> +     * If the new added disk has been pulled into an fs devices 
>>>>>> created by
>>>>>> +     * a device which had the CHANGING_FSID_V2 flag then replace the
>>>>>> +     * metadata_uuid/fsid values of the fs_devices.
>>>>>> +     */
>>>>>> +    if (*new_device_added && fs_devices_found &&
>>>>>> +        has_metadata_uuid && fs_devices->fsid_change &&
>>>>>> +        found_transid > fs_devices->latest_generation) {
>>>>>> +        memcpy(fs_devices->fsid, disk_super->fsid,
>>>>>> +               BTRFS_FSID_SIZE);
>>>>>> +        memcpy(fs_devices->metadata_uuid,
>>>>>> +               disk_super->metadata_uuid, BTRFS_FSID_SIZE);
>>>>>> +
>>>>>> +        fs_devices->fsid_change = false;
>>>>>> +    }
>>>>>> +
>>>>>>       /*
>>>>>>        * Unmount does not free the btrfs_device struct but would zero
>>>>>>        * generation along with most of the other members. So just 
>>>>>> update
>>>>>>
>>>>
>>>
>>

Patch
diff mbox series

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d8e5560db285..9efa4123c335 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -732,6 +732,9 @@  static noinline struct btrfs_device *device_list_add(const char *path,
 		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
 					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+	bool fs_devices_found = false;
+
+	*new_device_added = false;
 
 	if (fsid_change_in_progress) {
 		if (!has_metadata_uuid) {
@@ -772,24 +775,11 @@  static noinline struct btrfs_device *device_list_add(const char *path,
 
 		device = NULL;
 	} else {
+		fs_devices_found = true;
+
 		mutex_lock(&fs_devices->device_list_mutex);
 		device = btrfs_find_device(fs_devices, devid,
 				disk_super->dev_item.uuid, NULL, false);
-
-		/*
-		 * If this disk has been pulled into an fs devices created by
-		 * a device which had the CHANGING_FSID_V2 flag then replace the
-		 * metadata_uuid/fsid values of the fs_devices.
-		 */
-		if (has_metadata_uuid && fs_devices->fsid_change &&
-		    found_transid > fs_devices->latest_generation) {
-			memcpy(fs_devices->fsid, disk_super->fsid,
-					BTRFS_FSID_SIZE);
-			memcpy(fs_devices->metadata_uuid,
-					disk_super->metadata_uuid, BTRFS_FSID_SIZE);
-
-			fs_devices->fsid_change = false;
-		}
 	}
 
 	if (!device) {
@@ -912,6 +902,22 @@  static noinline struct btrfs_device *device_list_add(const char *path,
 		}
 	}
 
+	/*
+	 * If the new added disk has been pulled into an fs devices created by
+	 * a device which had the CHANGING_FSID_V2 flag then replace the
+	 * metadata_uuid/fsid values of the fs_devices.
+	 */
+	if (*new_device_added && fs_devices_found &&
+	    has_metadata_uuid && fs_devices->fsid_change &&
+	    found_transid > fs_devices->latest_generation) {
+		memcpy(fs_devices->fsid, disk_super->fsid,
+		       BTRFS_FSID_SIZE);
+		memcpy(fs_devices->metadata_uuid,
+		       disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+		fs_devices->fsid_change = false;
+	}
+
 	/*
 	 * Unmount does not free the btrfs_device struct but would zero
 	 * generation along with most of the other members. So just update