diff mbox series

[v2] btrfs-progs: mkfs: use flock() to properly prevent race with udev

Message ID 49bbb80e37990b0614f0929ac302560b27d2d933.1706594470.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series [v2] btrfs-progs: mkfs: use flock() to properly prevent race with udev | expand

Commit Message

Qu Wenruo Jan. 30, 2024, 6:01 a.m. UTC
[BUG]
Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
descriptors open during whole time"), there is still a bug report about
blkid failed to grab the FSID:

 device=/dev/loop0
 fstype=btrfs

 wipefs -a "$device"*

 parted -s "$device" \
     mklabel gpt \
     mkpart '"EFI system partition"' fat32 1MiB 513MiB \
     set 1 esp on \
     mkpart '"root partition"' "$fstype" 513MiB 100%

 udevadm settle
 partitions=($(lsblk -n -o path "$device"))

 mkfs.fat -F 32 ${partitions[1]}
 mkfs."$fstype" ${partitions[2]}
 udevadm settle

The above script can sometimes result empty fsid:

 loop0
 |-loop0p1 BDF3-552B
 `-loop0p2

[CAUSE]
Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
open during whole time") changed the lifespan of the fds, it doesn't
properly inform udev about our change to certain partition.

Thus for a multi-partition case, udev can start scanning the whole disk,
meanwhile our mkfs is still happening halfway.

If the scan is done before our new super blocks written, and our close()
calls happens later just before the current scan is finished, udev can
got the temporary super blocks (which is not a valid one).

And since our close() calls happens during the scan, there would be no
new scan, thus leading to the bad result.

[FIX]
The proper way to avoid race with udev is to flock() the whole disk
(aka, the parent block device, not the partition disk).

Thus this patch would introduce such mechanism by:

- btrfs_flock_one_device()
  This would resolve the path to a whole disk path.
  Then make sure the whole disk is not already locked (this can happen
  for cases like "mkfs.btrfs -f /dev/sda[123]").

  If the device is not already locked, then flock() the device, and
  insert a new entry into the list.

- btrfs_unlock_all_devices()
  Would go unlock all devices recorded in locked_devices list, and free
  the memory.

And mkfs.btrfs would be the first one to utilize the new mechanism, to
prevent such race with udev.

Issue: #734
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
Changelog:
v2:
- Fix the patch prefix
  From "btrfs" to "btrfs-progs"
---
 common/device-utils.c | 114 ++++++++++++++++++++++++++++++++++++++++++
 common/device-utils.h |   3 ++
 mkfs/main.c           |  11 ++++
 3 files changed, 128 insertions(+)

--
2.43.0

Comments

David Sterba Jan. 31, 2024, 7:13 a.m. UTC | #1
On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
> [BUG]
> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
> descriptors open during whole time"), there is still a bug report about
> blkid failed to grab the FSID:
> 
>  device=/dev/loop0
>  fstype=btrfs
> 
>  wipefs -a "$device"*
> 
>  parted -s "$device" \
>      mklabel gpt \
>      mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>      set 1 esp on \
>      mkpart '"root partition"' "$fstype" 513MiB 100%
> 
>  udevadm settle
>  partitions=($(lsblk -n -o path "$device"))
> 
>  mkfs.fat -F 32 ${partitions[1]}
>  mkfs."$fstype" ${partitions[2]}
>  udevadm settle

As mentioned in the issue 'udevadm lock <command>' can be used as a
workaround until mkfs does the locking but we can also use that for
testing, ie lock a device and then try mkfs.

> 
> The above script can sometimes result empty fsid:
> 
>  loop0
>  |-loop0p1 BDF3-552B
>  `-loop0p2
> 
> [CAUSE]
> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
> open during whole time") changed the lifespan of the fds, it doesn't
> properly inform udev about our change to certain partition.
> 
> Thus for a multi-partition case, udev can start scanning the whole disk,
> meanwhile our mkfs is still happening halfway.
> 
> If the scan is done before our new super blocks written, and our close()
> calls happens later just before the current scan is finished, udev can
> got the temporary super blocks (which is not a valid one).
> 
> And since our close() calls happens during the scan, there would be no
> new scan, thus leading to the bad result.
> 
> [FIX]
> The proper way to avoid race with udev is to flock() the whole disk
> (aka, the parent block device, not the partition disk).
> 
> Thus this patch would introduce such mechanism by:
> 
> - btrfs_flock_one_device()
>   This would resolve the path to a whole disk path.
>   Then make sure the whole disk is not already locked (this can happen
>   for cases like "mkfs.btrfs -f /dev/sda[123]").
> 
>   If the device is not already locked, then flock() the device, and
>   insert a new entry into the list.
> 
> - btrfs_unlock_all_devices()
>   Would go unlock all devices recorded in locked_devices list, and free
>   the memory.
> 
> And mkfs.btrfs would be the first one to utilize the new mechanism, to
> prevent such race with udev.
> 
> Issue: #734
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Changelog:
> v2:
> - Fix the patch prefix
>   From "btrfs" to "btrfs-progs"
> ---
>  common/device-utils.c | 114 ++++++++++++++++++++++++++++++++++++++++++
>  common/device-utils.h |   3 ++
>  mkfs/main.c           |  11 ++++
>  3 files changed, 128 insertions(+)
> 
> diff --git a/common/device-utils.c b/common/device-utils.c
> index f86120afa00c..88c21c66382d 100644
> --- a/common/device-utils.c
> +++ b/common/device-utils.c
> @@ -17,11 +17,13 @@
>  #include <sys/ioctl.h>
>  #include <sys/stat.h>
>  #include <sys/types.h>
> +#include <sys/file.h>
>  #include <linux/limits.h>
>  #ifdef BTRFS_ZONED
>  #include <linux/blkzoned.h>
>  #endif
>  #include <linux/fs.h>
> +#include <linux/kdev_t.h>
>  #include <limits.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -48,6 +50,24 @@
>  #define BLKDISCARD	_IO(0x12,119)
>  #endif
> 
> +static LIST_HEAD(locked_devices);
> +
> +/*
> + * This is to record flock()ed devices.
> + * For flock() to prevent udev races, we must lock the parent block device,
> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that case
> + * we should only lock "/dev/sda" once.
> + *
> + * This structure would be used to record any flocked block device (not
> + * the partition one), and avoid double locking.
> + */
> +struct btrfs_locked_wholedisk {

Please pick a different name, we've been calling it devices, although
you can find 'disk' references but mainly for historical reasons (eg.
when it's in a structure). In this case it's a block device.
Qu Wenruo Jan. 31, 2024, 8:07 a.m. UTC | #2
On 2024/1/31 17:43, David Sterba wrote:
> On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
>> [BUG]
>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
>> descriptors open during whole time"), there is still a bug report about
>> blkid failed to grab the FSID:
>>
>>   device=/dev/loop0
>>   fstype=btrfs
>>
>>   wipefs -a "$device"*
>>
>>   parted -s "$device" \
>>       mklabel gpt \
>>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>>       set 1 esp on \
>>       mkpart '"root partition"' "$fstype" 513MiB 100%
>>
>>   udevadm settle
>>   partitions=($(lsblk -n -o path "$device"))
>>
>>   mkfs.fat -F 32 ${partitions[1]}
>>   mkfs."$fstype" ${partitions[2]}
>>   udevadm settle
>
> As mentioned in the issue 'udevadm lock <command>' can be used as a
> workaround until mkfs does the locking but we can also use that for
> testing, ie lock a device and then try mkfs.

Indeed, that may be a cleaner way to solve the problem.

Let me check if there is any API provided to do that, but still we will
need the same de-duplication check to prevent locking the same device
twice if we're making fs on all partition from the same disk.

>
>>
>> The above script can sometimes result empty fsid:
>>
>>   loop0
>>   |-loop0p1 BDF3-552B
>>   `-loop0p2
>>
>> [CAUSE]
>> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
>> open during whole time") changed the lifespan of the fds, it doesn't
>> properly inform udev about our change to certain partition.
>>
>> Thus for a multi-partition case, udev can start scanning the whole disk,
>> meanwhile our mkfs is still happening halfway.
>>
>> If the scan is done before our new super blocks written, and our close()
>> calls happens later just before the current scan is finished, udev can
>> got the temporary super blocks (which is not a valid one).
>>
>> And since our close() calls happens during the scan, there would be no
>> new scan, thus leading to the bad result.
>>
>> [FIX]
>> The proper way to avoid race with udev is to flock() the whole disk
>> (aka, the parent block device, not the partition disk).
>>
>> Thus this patch would introduce such mechanism by:
>>
>> - btrfs_flock_one_device()
>>    This would resolve the path to a whole disk path.
>>    Then make sure the whole disk is not already locked (this can happen
>>    for cases like "mkfs.btrfs -f /dev/sda[123]").
>>
>>    If the device is not already locked, then flock() the device, and
>>    insert a new entry into the list.
>>
>> - btrfs_unlock_all_devices()
>>    Would go unlock all devices recorded in locked_devices list, and free
>>    the memory.
>>
>> And mkfs.btrfs would be the first one to utilize the new mechanism, to
>> prevent such race with udev.
>>
>> Issue: #734
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>> Changelog:
>> v2:
>> - Fix the patch prefix
>>    From "btrfs" to "btrfs-progs"
>> ---
>>   common/device-utils.c | 114 ++++++++++++++++++++++++++++++++++++++++++
>>   common/device-utils.h |   3 ++
>>   mkfs/main.c           |  11 ++++
>>   3 files changed, 128 insertions(+)
>>
>> diff --git a/common/device-utils.c b/common/device-utils.c
>> index f86120afa00c..88c21c66382d 100644
>> --- a/common/device-utils.c
>> +++ b/common/device-utils.c
>> @@ -17,11 +17,13 @@
>>   #include <sys/ioctl.h>
>>   #include <sys/stat.h>
>>   #include <sys/types.h>
>> +#include <sys/file.h>
>>   #include <linux/limits.h>
>>   #ifdef BTRFS_ZONED
>>   #include <linux/blkzoned.h>
>>   #endif
>>   #include <linux/fs.h>
>> +#include <linux/kdev_t.h>
>>   #include <limits.h>
>>   #include <stdio.h>
>>   #include <stdlib.h>
>> @@ -48,6 +50,24 @@
>>   #define BLKDISCARD	_IO(0x12,119)
>>   #endif
>>
>> +static LIST_HEAD(locked_devices);
>> +
>> +/*
>> + * This is to record flock()ed devices.
>> + * For flock() to prevent udev races, we must lock the parent block device,
>> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that case
>> + * we should only lock "/dev/sda" once.
>> + *
>> + * This structure would be used to record any flocked block device (not
>> + * the partition one), and avoid double locking.
>> + */
>> +struct btrfs_locked_wholedisk {
>
> Please pick a different name, we've been calling it devices, although
> you can find 'disk' references but mainly for historical reasons (eg.
> when it's in a structure). In this case it's a block device.
>

Well, the "wholedisk" name is from the libblk, and I thought it may be
good enough, but it's not the case.

Maybe "parent_block_dev" would be better?

Thanks,
Qu
Qu Wenruo Jan. 31, 2024, 8:34 a.m. UTC | #3
On 2024/1/31 18:37, Qu Wenruo wrote:
>
>
> On 2024/1/31 17:43, David Sterba wrote:
>> On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
>>> [BUG]
>>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
>>> descriptors open during whole time"), there is still a bug report about
>>> blkid failed to grab the FSID:
>>>
>>>   device=/dev/loop0
>>>   fstype=btrfs
>>>
>>>   wipefs -a "$device"*
>>>
>>>   parted -s "$device" \
>>>       mklabel gpt \
>>>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>>>       set 1 esp on \
>>>       mkpart '"root partition"' "$fstype" 513MiB 100%
>>>
>>>   udevadm settle
>>>   partitions=($(lsblk -n -o path "$device"))
>>>
>>>   mkfs.fat -F 32 ${partitions[1]}
>>>   mkfs."$fstype" ${partitions[2]}
>>>   udevadm settle
>>
>> As mentioned in the issue 'udevadm lock <command>' can be used as a
>> workaround until mkfs does the locking but we can also use that for
>> testing, ie lock a device and then try mkfs.
>
> Indeed, that may be a cleaner way to solve the problem.
>
> Let me check if there is any API provided to do that, but still we will
> need the same de-duplication check to prevent locking the same device
> twice if we're making fs on all partition from the same disk.

Nope, the "udevadm lock" is just doing flock() on the target device,
thus it's no difference than my implementation.

The call chain looks like this:

udevadm-lock.c:
lock_device()
|- lock_generic(LOCK_BSD, LOCK_EX|LOCK_NB);
    |- flock(fd, operation);

And the fd passed in is also the "wholedisk" fd, not the partition disk fd:

lock_main()
|- find_devno()
    |- path_get_whole_disk()

So in short, "udevadm lock" command is just doing:

- Convert the partition path to whole disk path
- Make sure the whole disk is not yet locked
- Then call flock() on it
   There are some extra work like deadline, but it's not really
   something we need to bother AFAIK.

And it also has the same de-duplication ability of my patch.
Although udev is using binary search instead of my simpler but slower
list based search.

Whether to utilize external "udevadm lock" command, or doing the same
thing inside "mkfs.btrfs" is up to you to determine.

But at least, I'm doing the same thing as "udevadm lock".
(Although this version has other problems, like using st_dev not
st_rdev, and not properly making the parameter for
btrfs_flock_one_device() const etc)

Thanks,
Qu
>
>>
>>>
>>> The above script can sometimes result empty fsid:
>>>
>>>   loop0
>>>   |-loop0p1 BDF3-552B
>>>   `-loop0p2
>>>
>>> [CAUSE]
>>> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
>>> open during whole time") changed the lifespan of the fds, it doesn't
>>> properly inform udev about our change to certain partition.
>>>
>>> Thus for a multi-partition case, udev can start scanning the whole disk,
>>> meanwhile our mkfs is still happening halfway.
>>>
>>> If the scan is done before our new super blocks written, and our close()
>>> calls happens later just before the current scan is finished, udev can
>>> got the temporary super blocks (which is not a valid one).
>>>
>>> And since our close() calls happens during the scan, there would be no
>>> new scan, thus leading to the bad result.
>>>
>>> [FIX]
>>> The proper way to avoid race with udev is to flock() the whole disk
>>> (aka, the parent block device, not the partition disk).
>>>
>>> Thus this patch would introduce such mechanism by:
>>>
>>> - btrfs_flock_one_device()
>>>    This would resolve the path to a whole disk path.
>>>    Then make sure the whole disk is not already locked (this can happen
>>>    for cases like "mkfs.btrfs -f /dev/sda[123]").
>>>
>>>    If the device is not already locked, then flock() the device, and
>>>    insert a new entry into the list.
>>>
>>> - btrfs_unlock_all_devices()
>>>    Would go unlock all devices recorded in locked_devices list, and free
>>>    the memory.
>>>
>>> And mkfs.btrfs would be the first one to utilize the new mechanism, to
>>> prevent such race with udev.
>>>
>>> Issue: #734
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>> Changelog:
>>> v2:
>>> - Fix the patch prefix
>>>    From "btrfs" to "btrfs-progs"
>>> ---
>>>   common/device-utils.c | 114 ++++++++++++++++++++++++++++++++++++++++++
>>>   common/device-utils.h |   3 ++
>>>   mkfs/main.c           |  11 ++++
>>>   3 files changed, 128 insertions(+)
>>>
>>> diff --git a/common/device-utils.c b/common/device-utils.c
>>> index f86120afa00c..88c21c66382d 100644
>>> --- a/common/device-utils.c
>>> +++ b/common/device-utils.c
>>> @@ -17,11 +17,13 @@
>>>   #include <sys/ioctl.h>
>>>   #include <sys/stat.h>
>>>   #include <sys/types.h>
>>> +#include <sys/file.h>
>>>   #include <linux/limits.h>
>>>   #ifdef BTRFS_ZONED
>>>   #include <linux/blkzoned.h>
>>>   #endif
>>>   #include <linux/fs.h>
>>> +#include <linux/kdev_t.h>
>>>   #include <limits.h>
>>>   #include <stdio.h>
>>>   #include <stdlib.h>
>>> @@ -48,6 +50,24 @@
>>>   #define BLKDISCARD    _IO(0x12,119)
>>>   #endif
>>>
>>> +static LIST_HEAD(locked_devices);
>>> +
>>> +/*
>>> + * This is to record flock()ed devices.
>>> + * For flock() to prevent udev races, we must lock the parent block
>>> device,
>>> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that
>>> case
>>> + * we should only lock "/dev/sda" once.
>>> + *
>>> + * This structure would be used to record any flocked block device (not
>>> + * the partition one), and avoid double locking.
>>> + */
>>> +struct btrfs_locked_wholedisk {
>>
>> Please pick a different name, we've been calling it devices, although
>> you can find 'disk' references but mainly for historical reasons (eg.
>> when it's in a structure). In this case it's a block device.
>>
>
> Well, the "wholedisk" name is from the libblk, and I thought it may be
> good enough, but it's not the case.
>
> Maybe "parent_block_dev" would be better?
>
> Thanks,
> Qu
>
Anand Jain Jan. 31, 2024, 3:53 p.m. UTC | #4
On 1/30/24 11:31, Qu Wenruo wrote:
> [BUG]
> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
> descriptors open during whole time"), there is still a bug report about
> blkid failed to grab the FSID:
> 
>   device=/dev/loop0
>   fstype=btrfs
> 
>   wipefs -a "$device"*
> 
>   parted -s "$device" \
>       mklabel gpt \
>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>       set 1 esp on \
>       mkpart '"root partition"' "$fstype" 513MiB 100%
> 
>   udevadm settle
>   partitions=($(lsblk -n -o path "$device"))
> 
>   mkfs.fat -F 32 ${partitions[1]}
>   mkfs."$fstype" ${partitions[2]}
>   udevadm settle
> 
> The above script can sometimes result empty fsid:
> 
>   loop0
>   |-loop0p1 BDF3-552B
>   `-loop0p2
> 



> [CAUSE]
> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
> open during whole time") changed the lifespan of the fds, it doesn't
> properly inform udev about our change to certain partition.
> 
> Thus for a multi-partition case, udev can start scanning the whole disk,
> meanwhile our mkfs is still happening halfway.
> 
> If the scan is done before our new super blocks written, and our close()
> calls happens later just before the current scan is finished, udev can
> got the temporary super blocks (which is not a valid one).
> 
> And since our close() calls happens during the scan, there would be no
> new scan, thus leading to the bad result.
> 



I am able to reproduce the missing udev events without the device 
partitions on the entire device also, so its not about the flock.
Also, per the udevadm monitor I see no new fsid being reported for the 
btrfs. Please find the test case in the RFC patch below.

The problem appears to be a convoluted nested file descriptor of the 
primary device (which obtains the temp-super-block).

The RFC patch below optimizes the file descriptors and I find it to fix 
the issue. Now, both yours and my test cases pass.

[PATCH RFC] btrfs-progs: mkfs: optimize file descriptor usage in
  mkfs.btrfs

Thanks, Anand




> [FIX]
> The proper way to avoid race with udev is to flock() the whole disk
> (aka, the parent block device, not the partition disk).
> 
> Thus this patch would introduce such mechanism by:
> 
> - btrfs_flock_one_device()
>    This would resolve the path to a whole disk path.
>    Then make sure the whole disk is not already locked (this can happen
>    for cases like "mkfs.btrfs -f /dev/sda[123]").
> 
>    If the device is not already locked, then flock() the device, and
>    insert a new entry into the list.
> 
> - btrfs_unlock_all_devices()
>    Would go unlock all devices recorded in locked_devices list, and free
>    the memory.
> 
> And mkfs.btrfs would be the first one to utilize the new mechanism, to
> prevent such race with udev.
> 
> Issue: #734
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Changelog:
> v2:
> - Fix the patch prefix
>    From "btrfs" to "btrfs-progs"
> ---
>   common/device-utils.c | 114 ++++++++++++++++++++++++++++++++++++++++++
>   common/device-utils.h |   3 ++
>   mkfs/main.c           |  11 ++++
>   3 files changed, 128 insertions(+)
> 
> diff --git a/common/device-utils.c b/common/device-utils.c
> index f86120afa00c..88c21c66382d 100644
> --- a/common/device-utils.c
> +++ b/common/device-utils.c
> @@ -17,11 +17,13 @@
>   #include <sys/ioctl.h>
>   #include <sys/stat.h>
>   #include <sys/types.h>
> +#include <sys/file.h>
>   #include <linux/limits.h>
>   #ifdef BTRFS_ZONED
>   #include <linux/blkzoned.h>
>   #endif
>   #include <linux/fs.h>
> +#include <linux/kdev_t.h>
>   #include <limits.h>
>   #include <stdio.h>
>   #include <stdlib.h>
> @@ -48,6 +50,24 @@
>   #define BLKDISCARD	_IO(0x12,119)
>   #endif
> 
> +static LIST_HEAD(locked_devices);
> +
> +/*
> + * This is to record flock()ed devices.
> + * For flock() to prevent udev races, we must lock the parent block device,
> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that case
> + * we should only lock "/dev/sda" once.
> + *
> + * This structure would be used to record any flocked block device (not
> + * the partition one), and avoid double locking.
> + */
> +struct btrfs_locked_wholedisk {
> +	char *full_path;
> +	dev_t devno;
> +	int fd;
> +	struct list_head list;
> +};
> +
>   /*
>    * Discard the given range in one go
>    */
> @@ -633,3 +653,97 @@ ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, off_t offset)
>   	free(bounce_buf);
>   	return ret;
>   }
> +
> +int btrfs_flock_one_device(char *path)
> +{
> +	struct btrfs_locked_wholedisk *entry;
> +	struct stat st = { 0 };
> +	char *wholedisk_path;
> +	dev_t wholedisk_devno;
> +	int ret;
> +
> +	ret = stat(path, &st);
> +	if (ret < 0) {
> +		error("failed to stat %s: %m", path);
> +		return -errno;
> +	}
> +	/* Non-block device, skipping the locking. */
> +	if (!S_ISBLK(st.st_mode))
> +		return 0;
> +
> +	ret = blkid_devno_to_wholedisk(st.st_dev, path, strlen(path),
> +				       &wholedisk_devno);
> +	if (ret < 0) {
> +		error("failed to get the whole disk devno for %s: %m", path);
> +		return -errno;
> +	}
> +	wholedisk_path = blkid_devno_to_devname(wholedisk_devno);
> +	if (!wholedisk_path) {
> +		error("failed to get the devname of dev %ld:%ld",
> +			MAJOR(wholedisk_devno), MINOR(wholedisk_devno));
> +	}
> +
> +	/* Check if we already have the whole disk in the list. */
> +	list_for_each_entry(entry, &locked_devices, list) {
> +		/* The wholedisk is already locked, need to do nothing. */
> +		if (entry->devno == wholedisk_devno ||
> +		    entry->full_path == wholedisk_path) {
> +			free(wholedisk_path);
> +			return 0;
> +		}
> +	}
> +
> +	/* Allocate new entry. */
> +	entry = malloc(sizeof(*entry));
> +	if (!entry) {
> +		errno = ENOMEM;
> +		error("unable to allocate new memory for %s: %m",
> +		      wholedisk_path);
> +		free(wholedisk_path);
> +		return -errno;
> +	}
> +	entry->devno = wholedisk_devno;
> +	entry->full_path = wholedisk_path;
> +
> +	/* Lock the whole disk. */
> +	entry->fd = open(wholedisk_path, O_RDONLY);
> +	if (entry->fd < 0) {
> +		error("failed to open device %s: %m",
> +		      wholedisk_path);
> +		free(wholedisk_path);
> +		free(entry);
> +		return -errno;
> +	}
> +	ret = flock(entry->fd, LOCK_EX);
> +	if (ret < 0) {
> +		error("failed to hold an exclusive lock on %s: %m",
> +		      wholedisk_path);
> +		free(wholedisk_path);
> +		free(entry);
> +		return -errno;
> +	}
> +
> +	/* Insert it into the list. */
> +	list_add_tail(&entry->list, &locked_devices);
> +	return 0;
> +}
> +
> +void btrfs_unlock_all_devicecs(void)
> +{
> +	while (!list_empty(&locked_devices)) {
> +		struct btrfs_locked_wholedisk *entry;
> +		int ret;
> +
> +		entry = list_entry(locked_devices.next,
> +				   struct btrfs_locked_wholedisk, list);
> +
> +		list_del_init(&entry->list);
> +		ret = flock(entry->fd, LOCK_UN);
> +		if (ret < 0)
> +			warning("failed to unlock %s (fd %d dev %ld:%ld), skipping it",
> +				entry->full_path, entry->fd, MAJOR(entry->devno),
> +				MINOR(entry->devno));
> +		free(entry->full_path);
> +		free(entry);
> +	}
> +}
> diff --git a/common/device-utils.h b/common/device-utils.h
> index 853d17b5ab98..3a04348a0867 100644
> --- a/common/device-utils.h
> +++ b/common/device-utils.h
> @@ -57,6 +57,9 @@ int btrfs_prepare_device(int fd, const char *file, u64 *block_count_ret,
>   ssize_t btrfs_direct_pread(int fd, void *buf, size_t count, off_t offset);
>   ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, off_t offset);
> 
> +int btrfs_flock_one_device(char *path);
> +void btrfs_unlock_all_devicecs(void);
> +
>   #ifdef BTRFS_ZONED
>   static inline ssize_t btrfs_pwrite(int fd, const void *buf, size_t count,
>   				   off_t offset, bool direct)
> diff --git a/mkfs/main.c b/mkfs/main.c
> index b9882208dbd5..6e6cb81a4165 100644
> --- a/mkfs/main.c
> +++ b/mkfs/main.c
> @@ -1723,6 +1723,15 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>   		}
>   	}
> 
> +	/* Lock all devices to prevent race with udev probing. */
> +	for (i = 0; i < device_count; i++) {
> +		char *path = argv[optind + i - 1];
> +
> +		ret = btrfs_flock_one_device(path);
> +		if (ret < 0)
> +			warning("failed to flock %s, skipping it", path);
> +	}
> +
>   	/* Start threads */
>   	for (i = 0; i < device_count; i++) {
>   		prepare_ctx[i].file = argv[optind + i - 1];
> @@ -2079,6 +2088,7 @@ out:
>   	free(label);
>   	free(source_dir);
> 
> +	btrfs_unlock_all_devicecs();
>   	return !!ret;
> 
>   error:
> @@ -2090,6 +2100,7 @@ error:
>   	free(prepare_ctx);
>   	free(label);
>   	free(source_dir);
> +	btrfs_unlock_all_devicecs();
>   	exit(1);
>   success:
>   	exit(0);
> --
> 2.43.0
>
David Sterba Jan. 31, 2024, 6:11 p.m. UTC | #5
On Wed, Jan 31, 2024 at 07:04:19PM +1030, Qu Wenruo wrote:
> 
> 
> On 2024/1/31 18:37, Qu Wenruo wrote:
> >
> >
> > On 2024/1/31 17:43, David Sterba wrote:
> >> On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
> >>> [BUG]
> >>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
> >>> descriptors open during whole time"), there is still a bug report about
> >>> blkid failed to grab the FSID:
> >>>
> >>>   device=/dev/loop0
> >>>   fstype=btrfs
> >>>
> >>>   wipefs -a "$device"*
> >>>
> >>>   parted -s "$device" \
> >>>       mklabel gpt \
> >>>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
> >>>       set 1 esp on \
> >>>       mkpart '"root partition"' "$fstype" 513MiB 100%
> >>>
> >>>   udevadm settle
> >>>   partitions=($(lsblk -n -o path "$device"))
> >>>
> >>>   mkfs.fat -F 32 ${partitions[1]}
> >>>   mkfs."$fstype" ${partitions[2]}
> >>>   udevadm settle
> >>
> >> As mentioned in the issue 'udevadm lock <command>' can be used as a
> >> workaround until mkfs does the locking but we can also use that for
> >> testing, ie lock a device and then try mkfs.
> >
> > Indeed, that may be a cleaner way to solve the problem.
> >
> > Let me check if there is any API provided to do that, but still we will
> > need the same de-duplication check to prevent locking the same device
> > twice if we're making fs on all partition from the same disk.
> 
> Nope, the "udevadm lock" is just doing flock() on the target device,
> thus it's no difference than my implementation.

That was a misunderstanding, it's right to do just the flock() as it's
effectively what udevadm lock does and we don't need to add another
dependency.

What I meant:

udevadm lock --name=/dev/sdx sleep 20 &
mkfs.btrfs /dev/sdx

and expect mkfs to fail if it detects a locked device.

> The call chain looks like this:
> 
> udevadm-lock.c:
> lock_device()
> |- lock_generic(LOCK_BSD, LOCK_EX|LOCK_NB);
>     |- flock(fd, operation);
> 
> And the fd passed in is also the "wholedisk" fd, not the partition disk fd:
> 
> lock_main()
> |- find_devno()
>     |- path_get_whole_disk()
> 
> So in short, "udevadm lock" command is just doing:
> 
> - Convert the partition path to whole disk path
> - Make sure the whole disk is not yet locked
> - Then call flock() on it
>    There are some extra work like deadline, but it's not really
>    something we need to bother AFAIK.
> 
> And it also has the same de-duplication ability of my patch.
> Although udev is using binary search instead of my simpler but slower
> list based search.

That's all fine, thanks for double checking.

> Whether to utilize external "udevadm lock" command, or doing the same
> thing inside "mkfs.btrfs" is up to you to determine.
> 
> But at least, I'm doing the same thing as "udevadm lock".
> (Although this version has other problems, like using st_dev not
> st_rdev, and not properly making the parameter for
> btrfs_flock_one_device() const etc)
David Sterba Jan. 31, 2024, 6:14 p.m. UTC | #6
On Wed, Jan 31, 2024 at 06:37:26PM +1030, Qu Wenruo wrote:
> >> +static LIST_HEAD(locked_devices);
> >> +
> >> +/*
> >> + * This is to record flock()ed devices.
> >> + * For flock() to prevent udev races, we must lock the parent block device,
> >> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that case
> >> + * we should only lock "/dev/sda" once.
> >> + *
> >> + * This structure would be used to record any flocked block device (not
> >> + * the partition one), and avoid double locking.
> >> + */
> >> +struct btrfs_locked_wholedisk {
> >
> > Please pick a different name, we've been calling it devices, although
> > you can find 'disk' references but mainly for historical reasons (eg.
> > when it's in a structure). In this case it's a block device.
> >
> 
> Well, the "wholedisk" name is from the libblk, and I thought it may be
> good enough, but it's not the case.

I see, in that case we can keep it that way.
David Sterba Jan. 31, 2024, 6:24 p.m. UTC | #7
On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
> [BUG]
> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
> descriptors open during whole time"), there is still a bug report about
> blkid failed to grab the FSID:
> 
>  device=/dev/loop0
>  fstype=btrfs
> 
>  wipefs -a "$device"*
> 
>  parted -s "$device" \
>      mklabel gpt \
>      mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>      set 1 esp on \
>      mkpart '"root partition"' "$fstype" 513MiB 100%
> 
>  udevadm settle
>  partitions=($(lsblk -n -o path "$device"))
> 
>  mkfs.fat -F 32 ${partitions[1]}
>  mkfs."$fstype" ${partitions[2]}
>  udevadm settle
> 
> The above script can sometimes result empty fsid:
> 
>  loop0
>  |-loop0p1 BDF3-552B
>  `-loop0p2
> 
> [CAUSE]
> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
> open during whole time") changed the lifespan of the fds, it doesn't
> properly inform udev about our change to certain partition.
> 
> Thus for a multi-partition case, udev can start scanning the whole disk,
> meanwhile our mkfs is still happening halfway.
> 
> If the scan is done before our new super blocks written, and our close()
> calls happens later just before the current scan is finished, udev can
> got the temporary super blocks (which is not a valid one).
> 
> And since our close() calls happens during the scan, there would be no
> new scan, thus leading to the bad result.
> 
> [FIX]
> The proper way to avoid race with udev is to flock() the whole disk
> (aka, the parent block device, not the partition disk).
> 
> Thus this patch would introduce such mechanism by:
> 
> - btrfs_flock_one_device()
>   This would resolve the path to a whole disk path.
>   Then make sure the whole disk is not already locked (this can happen
>   for cases like "mkfs.btrfs -f /dev/sda[123]").
> 
>   If the device is not already locked, then flock() the device, and
>   insert a new entry into the list.
> 
> - btrfs_unlock_all_devices()
>   Would go unlock all devices recorded in locked_devices list, and free
>   the memory.
> 
> And mkfs.btrfs would be the first one to utilize the new mechanism, to
> prevent such race with udev.

The other possible user could be btrfs-convert as it also writes data
and changes the UUID.

> Issue: #734
> Signed-off-by: Qu Wenruo <wqu@suse.com>

Added to devel, thanks for fixing it.
Qu Wenruo Jan. 31, 2024, 10:36 p.m. UTC | #8
On 2024/2/1 02:23, Anand Jain wrote:
> On 1/30/24 11:31, Qu Wenruo wrote:
>> [BUG]
>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
>> descriptors open during whole time"), there is still a bug report about
>> blkid failed to grab the FSID:
>>
>>   device=/dev/loop0
>>   fstype=btrfs
>>
>>   wipefs -a "$device"*
>>
>>   parted -s "$device" \
>>       mklabel gpt \
>>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>>       set 1 esp on \
>>       mkpart '"root partition"' "$fstype" 513MiB 100%
>>
>>   udevadm settle
>>   partitions=($(lsblk -n -o path "$device"))
>>
>>   mkfs.fat -F 32 ${partitions[1]}
>>   mkfs."$fstype" ${partitions[2]}
>>   udevadm settle
>>
>> The above script can sometimes result empty fsid:
>>
>>   loop0
>>   |-loop0p1 BDF3-552B
>>   `-loop0p2
>>
> 
> 
> 
>> [CAUSE]
>> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
>> open during whole time") changed the lifespan of the fds, it doesn't
>> properly inform udev about our change to certain partition.
>>
>> Thus for a multi-partition case, udev can start scanning the whole disk,
>> meanwhile our mkfs is still happening halfway.
>>
>> If the scan is done before our new super blocks written, and our close()
>> calls happens later just before the current scan is finished, udev can
>> got the temporary super blocks (which is not a valid one).
>>
>> And since our close() calls happens during the scan, there would be no
>> new scan, thus leading to the bad result.
>>
> 
> 
> 
> I am able to reproduce the missing udev events without the device 
> partitions on the entire device also, so its not about the flock.
> Also, per the udevadm monitor I see no new fsid being reported for the 
> btrfs. Please find the test case in the RFC patch below.

Please go check the issue of btrfs-progs.

Firstly, it is about flock().
In fact the problem would be gone if using "udevadm lock" command for 
the mkfs.btrfs.

And as I already explained, "udevadm lock" is just flock() for the 
parent block device, with some extra fancy work like deadline and 
deduplication.

> 
> The problem appears to be a convoluted nested file descriptor of the 
> primary device (which obtains the temp-super-block).

Nope.

Thanks,
Qu

> 
> The RFC patch below optimizes the file descriptors and I find it to fix 
> the issue. Now, both yours and my test cases pass.
> 
> [PATCH RFC] btrfs-progs: mkfs: optimize file descriptor usage in
>   mkfs.btrfs
> 
> Thanks, Anand
> 
> 
> 
> 
>> [FIX]
>> The proper way to avoid race with udev is to flock() the whole disk
>> (aka, the parent block device, not the partition disk).
>>
>> Thus this patch would introduce such mechanism by:
>>
>> - btrfs_flock_one_device()
>>    This would resolve the path to a whole disk path.
>>    Then make sure the whole disk is not already locked (this can happen
>>    for cases like "mkfs.btrfs -f /dev/sda[123]").
>>
>>    If the device is not already locked, then flock() the device, and
>>    insert a new entry into the list.
>>
>> - btrfs_unlock_all_devices()
>>    Would go unlock all devices recorded in locked_devices list, and free
>>    the memory.
>>
>> And mkfs.btrfs would be the first one to utilize the new mechanism, to
>> prevent such race with udev.
>>
>> Issue: #734
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>> Changelog:
>> v2:
>> - Fix the patch prefix
>>    From "btrfs" to "btrfs-progs"
>> ---
>>   common/device-utils.c | 114 ++++++++++++++++++++++++++++++++++++++++++
>>   common/device-utils.h |   3 ++
>>   mkfs/main.c           |  11 ++++
>>   3 files changed, 128 insertions(+)
>>
>> diff --git a/common/device-utils.c b/common/device-utils.c
>> index f86120afa00c..88c21c66382d 100644
>> --- a/common/device-utils.c
>> +++ b/common/device-utils.c
>> @@ -17,11 +17,13 @@
>>   #include <sys/ioctl.h>
>>   #include <sys/stat.h>
>>   #include <sys/types.h>
>> +#include <sys/file.h>
>>   #include <linux/limits.h>
>>   #ifdef BTRFS_ZONED
>>   #include <linux/blkzoned.h>
>>   #endif
>>   #include <linux/fs.h>
>> +#include <linux/kdev_t.h>
>>   #include <limits.h>
>>   #include <stdio.h>
>>   #include <stdlib.h>
>> @@ -48,6 +50,24 @@
>>   #define BLKDISCARD    _IO(0x12,119)
>>   #endif
>>
>> +static LIST_HEAD(locked_devices);
>> +
>> +/*
>> + * This is to record flock()ed devices.
>> + * For flock() to prevent udev races, we must lock the parent block 
>> device,
>> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that case
>> + * we should only lock "/dev/sda" once.
>> + *
>> + * This structure would be used to record any flocked block device (not
>> + * the partition one), and avoid double locking.
>> + */
>> +struct btrfs_locked_wholedisk {
>> +    char *full_path;
>> +    dev_t devno;
>> +    int fd;
>> +    struct list_head list;
>> +};
>> +
>>   /*
>>    * Discard the given range in one go
>>    */
>> @@ -633,3 +653,97 @@ ssize_t btrfs_direct_pwrite(int fd, const void 
>> *buf, size_t count, off_t offset)
>>       free(bounce_buf);
>>       return ret;
>>   }
>> +
>> +int btrfs_flock_one_device(char *path)
>> +{
>> +    struct btrfs_locked_wholedisk *entry;
>> +    struct stat st = { 0 };
>> +    char *wholedisk_path;
>> +    dev_t wholedisk_devno;
>> +    int ret;
>> +
>> +    ret = stat(path, &st);
>> +    if (ret < 0) {
>> +        error("failed to stat %s: %m", path);
>> +        return -errno;
>> +    }
>> +    /* Non-block device, skipping the locking. */
>> +    if (!S_ISBLK(st.st_mode))
>> +        return 0;
>> +
>> +    ret = blkid_devno_to_wholedisk(st.st_dev, path, strlen(path),
>> +                       &wholedisk_devno);
>> +    if (ret < 0) {
>> +        error("failed to get the whole disk devno for %s: %m", path);
>> +        return -errno;
>> +    }
>> +    wholedisk_path = blkid_devno_to_devname(wholedisk_devno);
>> +    if (!wholedisk_path) {
>> +        error("failed to get the devname of dev %ld:%ld",
>> +            MAJOR(wholedisk_devno), MINOR(wholedisk_devno));
>> +    }
>> +
>> +    /* Check if we already have the whole disk in the list. */
>> +    list_for_each_entry(entry, &locked_devices, list) {
>> +        /* The wholedisk is already locked, need to do nothing. */
>> +        if (entry->devno == wholedisk_devno ||
>> +            entry->full_path == wholedisk_path) {
>> +            free(wholedisk_path);
>> +            return 0;
>> +        }
>> +    }
>> +
>> +    /* Allocate new entry. */
>> +    entry = malloc(sizeof(*entry));
>> +    if (!entry) {
>> +        errno = ENOMEM;
>> +        error("unable to allocate new memory for %s: %m",
>> +              wholedisk_path);
>> +        free(wholedisk_path);
>> +        return -errno;
>> +    }
>> +    entry->devno = wholedisk_devno;
>> +    entry->full_path = wholedisk_path;
>> +
>> +    /* Lock the whole disk. */
>> +    entry->fd = open(wholedisk_path, O_RDONLY);
>> +    if (entry->fd < 0) {
>> +        error("failed to open device %s: %m",
>> +              wholedisk_path);
>> +        free(wholedisk_path);
>> +        free(entry);
>> +        return -errno;
>> +    }
>> +    ret = flock(entry->fd, LOCK_EX);
>> +    if (ret < 0) {
>> +        error("failed to hold an exclusive lock on %s: %m",
>> +              wholedisk_path);
>> +        free(wholedisk_path);
>> +        free(entry);
>> +        return -errno;
>> +    }
>> +
>> +    /* Insert it into the list. */
>> +    list_add_tail(&entry->list, &locked_devices);
>> +    return 0;
>> +}
>> +
>> +void btrfs_unlock_all_devicecs(void)
>> +{
>> +    while (!list_empty(&locked_devices)) {
>> +        struct btrfs_locked_wholedisk *entry;
>> +        int ret;
>> +
>> +        entry = list_entry(locked_devices.next,
>> +                   struct btrfs_locked_wholedisk, list);
>> +
>> +        list_del_init(&entry->list);
>> +        ret = flock(entry->fd, LOCK_UN);
>> +        if (ret < 0)
>> +            warning("failed to unlock %s (fd %d dev %ld:%ld), 
>> skipping it",
>> +                entry->full_path, entry->fd, MAJOR(entry->devno),
>> +                MINOR(entry->devno));
>> +        free(entry->full_path);
>> +        free(entry);
>> +    }
>> +}
>> diff --git a/common/device-utils.h b/common/device-utils.h
>> index 853d17b5ab98..3a04348a0867 100644
>> --- a/common/device-utils.h
>> +++ b/common/device-utils.h
>> @@ -57,6 +57,9 @@ int btrfs_prepare_device(int fd, const char *file, 
>> u64 *block_count_ret,
>>   ssize_t btrfs_direct_pread(int fd, void *buf, size_t count, off_t 
>> offset);
>>   ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, 
>> off_t offset);
>>
>> +int btrfs_flock_one_device(char *path);
>> +void btrfs_unlock_all_devicecs(void);
>> +
>>   #ifdef BTRFS_ZONED
>>   static inline ssize_t btrfs_pwrite(int fd, const void *buf, size_t 
>> count,
>>                      off_t offset, bool direct)
>> diff --git a/mkfs/main.c b/mkfs/main.c
>> index b9882208dbd5..6e6cb81a4165 100644
>> --- a/mkfs/main.c
>> +++ b/mkfs/main.c
>> @@ -1723,6 +1723,15 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>>           }
>>       }
>>
>> +    /* Lock all devices to prevent race with udev probing. */
>> +    for (i = 0; i < device_count; i++) {
>> +        char *path = argv[optind + i - 1];
>> +
>> +        ret = btrfs_flock_one_device(path);
>> +        if (ret < 0)
>> +            warning("failed to flock %s, skipping it", path);
>> +    }
>> +
>>       /* Start threads */
>>       for (i = 0; i < device_count; i++) {
>>           prepare_ctx[i].file = argv[optind + i - 1];
>> @@ -2079,6 +2088,7 @@ out:
>>       free(label);
>>       free(source_dir);
>>
>> +    btrfs_unlock_all_devicecs();
>>       return !!ret;
>>
>>   error:
>> @@ -2090,6 +2100,7 @@ error:
>>       free(prepare_ctx);
>>       free(label);
>>       free(source_dir);
>> +    btrfs_unlock_all_devicecs();
>>       exit(1);
>>   success:
>>       exit(0);
>> -- 
>> 2.43.0
>>
>
Qu Wenruo Jan. 31, 2024, 10:38 p.m. UTC | #9
On 2024/2/1 04:54, David Sterba wrote:
> On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
>> [BUG]
>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
>> descriptors open during whole time"), there is still a bug report about
>> blkid failed to grab the FSID:
>>
>>   device=/dev/loop0
>>   fstype=btrfs
>>
>>   wipefs -a "$device"*
>>
>>   parted -s "$device" \
>>       mklabel gpt \
>>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>>       set 1 esp on \
>>       mkpart '"root partition"' "$fstype" 513MiB 100%
>>
>>   udevadm settle
>>   partitions=($(lsblk -n -o path "$device"))
>>
>>   mkfs.fat -F 32 ${partitions[1]}
>>   mkfs."$fstype" ${partitions[2]}
>>   udevadm settle
>>
>> The above script can sometimes result empty fsid:
>>
>>   loop0
>>   |-loop0p1 BDF3-552B
>>   `-loop0p2
>>
>> [CAUSE]
>> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
>> open during whole time") changed the lifespan of the fds, it doesn't
>> properly inform udev about our change to certain partition.
>>
>> Thus for a multi-partition case, udev can start scanning the whole disk,
>> meanwhile our mkfs is still happening halfway.
>>
>> If the scan is done before our new super blocks written, and our close()
>> calls happens later just before the current scan is finished, udev can
>> got the temporary super blocks (which is not a valid one).
>>
>> And since our close() calls happens during the scan, there would be no
>> new scan, thus leading to the bad result.
>>
>> [FIX]
>> The proper way to avoid race with udev is to flock() the whole disk
>> (aka, the parent block device, not the partition disk).
>>
>> Thus this patch would introduce such mechanism by:
>>
>> - btrfs_flock_one_device()
>>    This would resolve the path to a whole disk path.
>>    Then make sure the whole disk is not already locked (this can happen
>>    for cases like "mkfs.btrfs -f /dev/sda[123]").
>>
>>    If the device is not already locked, then flock() the device, and
>>    insert a new entry into the list.
>>
>> - btrfs_unlock_all_devices()
>>    Would go unlock all devices recorded in locked_devices list, and free
>>    the memory.
>>
>> And mkfs.btrfs would be the first one to utilize the new mechanism, to
>> prevent such race with udev.
> 
> The other possible user could be btrfs-convert as it also writes data
> and changes the UUID.
> 
>> Issue: #734
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
> 
> Added to devel, thanks for fixing it.

Sorry, this v2 is still incorrect, as it doesn't use the correct device 
number, the proper fix is in my flock branch.

But I'll send out a v3 just in case.

Thanks,
Qu
David Sterba Jan. 31, 2024, 11:04 p.m. UTC | #10
On Thu, Feb 01, 2024 at 09:08:31AM +1030, Qu Wenruo wrote:
> 
> 
> On 2024/2/1 04:54, David Sterba wrote:
> > On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
> >> [BUG]
> >> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
> >> descriptors open during whole time"), there is still a bug report about
> >> blkid failed to grab the FSID:
> >>
> >>   device=/dev/loop0
> >>   fstype=btrfs
> >>
> >>   wipefs -a "$device"*
> >>
> >>   parted -s "$device" \
> >>       mklabel gpt \
> >>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
> >>       set 1 esp on \
> >>       mkpart '"root partition"' "$fstype" 513MiB 100%
> >>
> >>   udevadm settle
> >>   partitions=($(lsblk -n -o path "$device"))
> >>
> >>   mkfs.fat -F 32 ${partitions[1]}
> >>   mkfs."$fstype" ${partitions[2]}
> >>   udevadm settle
> >>
> >> The above script can sometimes result empty fsid:
> >>
> >>   loop0
> >>   |-loop0p1 BDF3-552B
> >>   `-loop0p2
> >>
> >> [CAUSE]
> >> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
> >> open during whole time") changed the lifespan of the fds, it doesn't
> >> properly inform udev about our change to certain partition.
> >>
> >> Thus for a multi-partition case, udev can start scanning the whole disk,
> >> meanwhile our mkfs is still happening halfway.
> >>
> >> If the scan is done before our new super blocks written, and our close()
> >> calls happens later just before the current scan is finished, udev can
> >> got the temporary super blocks (which is not a valid one).
> >>
> >> And since our close() calls happens during the scan, there would be no
> >> new scan, thus leading to the bad result.
> >>
> >> [FIX]
> >> The proper way to avoid race with udev is to flock() the whole disk
> >> (aka, the parent block device, not the partition disk).
> >>
> >> Thus this patch would introduce such mechanism by:
> >>
> >> - btrfs_flock_one_device()
> >>    This would resolve the path to a whole disk path.
> >>    Then make sure the whole disk is not already locked (this can happen
> >>    for cases like "mkfs.btrfs -f /dev/sda[123]").
> >>
> >>    If the device is not already locked, then flock() the device, and
> >>    insert a new entry into the list.
> >>
> >> - btrfs_unlock_all_devices()
> >>    Would go unlock all devices recorded in locked_devices list, and free
> >>    the memory.
> >>
> >> And mkfs.btrfs would be the first one to utilize the new mechanism, to
> >> prevent such race with udev.
> > 
> > The other possible user could be btrfs-convert as it also writes data
> > and changes the UUID.
> > 
> >> Issue: #734
> >> Signed-off-by: Qu Wenruo <wqu@suse.com>
> > 
> > Added to devel, thanks for fixing it.
> 
> Sorry, this v2 is still incorrect, as it doesn't use the correct device 
> number, the proper fix is in my flock branch.
> 
> But I'll send out a v3 just in case.

I noticed the update after I replied, the commit in devel is from your
repository. There were added hunks in btrfs_flock_one_device() adding
path_dump, and some typos fixed.
Qu Wenruo Jan. 31, 2024, 11:48 p.m. UTC | #11
On 2024/2/1 09:34, David Sterba wrote:
> On Thu, Feb 01, 2024 at 09:08:31AM +1030, Qu Wenruo wrote:
>>
>>
>> On 2024/2/1 04:54, David Sterba wrote:
>>> On Tue, Jan 30, 2024 at 04:31:17PM +1030, Qu Wenruo wrote:
>>>> [BUG]
>>>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
>>>> descriptors open during whole time"), there is still a bug report about
>>>> blkid failed to grab the FSID:
>>>>
>>>>    device=/dev/loop0
>>>>    fstype=btrfs
>>>>
>>>>    wipefs -a "$device"*
>>>>
>>>>    parted -s "$device" \
>>>>        mklabel gpt \
>>>>        mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>>>>        set 1 esp on \
>>>>        mkpart '"root partition"' "$fstype" 513MiB 100%
>>>>
>>>>    udevadm settle
>>>>    partitions=($(lsblk -n -o path "$device"))
>>>>
>>>>    mkfs.fat -F 32 ${partitions[1]}
>>>>    mkfs."$fstype" ${partitions[2]}
>>>>    udevadm settle
>>>>
>>>> The above script can sometimes result empty fsid:
>>>>
>>>>    loop0
>>>>    |-loop0p1 BDF3-552B
>>>>    `-loop0p2
>>>>
>>>> [CAUSE]
>>>> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
>>>> open during whole time") changed the lifespan of the fds, it doesn't
>>>> properly inform udev about our change to certain partition.
>>>>
>>>> Thus for a multi-partition case, udev can start scanning the whole disk,
>>>> meanwhile our mkfs is still happening halfway.
>>>>
>>>> If the scan is done before our new super blocks written, and our close()
>>>> calls happens later just before the current scan is finished, udev can
>>>> got the temporary super blocks (which is not a valid one).
>>>>
>>>> And since our close() calls happens during the scan, there would be no
>>>> new scan, thus leading to the bad result.
>>>>
>>>> [FIX]
>>>> The proper way to avoid race with udev is to flock() the whole disk
>>>> (aka, the parent block device, not the partition disk).
>>>>
>>>> Thus this patch would introduce such mechanism by:
>>>>
>>>> - btrfs_flock_one_device()
>>>>     This would resolve the path to a whole disk path.
>>>>     Then make sure the whole disk is not already locked (this can happen
>>>>     for cases like "mkfs.btrfs -f /dev/sda[123]").
>>>>
>>>>     If the device is not already locked, then flock() the device, and
>>>>     insert a new entry into the list.
>>>>
>>>> - btrfs_unlock_all_devices()
>>>>     Would go unlock all devices recorded in locked_devices list, and free
>>>>     the memory.
>>>>
>>>> And mkfs.btrfs would be the first one to utilize the new mechanism, to
>>>> prevent such race with udev.
>>>
>>> The other possible user could be btrfs-convert as it also writes data
>>> and changes the UUID.
>>>
>>>> Issue: #734
>>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>>
>>> Added to devel, thanks for fixing it.
>>
>> Sorry, this v2 is still incorrect, as it doesn't use the correct device
>> number, the proper fix is in my flock branch.
>>
>> But I'll send out a v3 just in case.
>
> I noticed the update after I replied, the commit in devel is from your
> repository. There were added hunks in btrfs_flock_one_device() adding
> path_dump, and some typos fixed.
>
As long as you're using the commit from my repo, it's totally fine.

Thanks,
Qu
Anand Jain Feb. 1, 2024, 10:32 a.m. UTC | #12
On 2/1/24 04:06, Qu Wenruo wrote:
> 
> 
> On 2024/2/1 02:23, Anand Jain wrote:
>> On 1/30/24 11:31, Qu Wenruo wrote:
>>> [BUG]
>>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
>>> descriptors open during whole time"), there is still a bug report about
>>> blkid failed to grab the FSID:
>>>
>>>   device=/dev/loop0
>>>   fstype=btrfs
>>>
>>>   wipefs -a "$device"*
>>>
>>>   parted -s "$device" \
>>>       mklabel gpt \
>>>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>>>       set 1 esp on \
>>>       mkpart '"root partition"' "$fstype" 513MiB 100%
>>>
>>>   udevadm settle
>>>   partitions=($(lsblk -n -o path "$device"))
>>>
>>>   mkfs.fat -F 32 ${partitions[1]}
>>>   mkfs."$fstype" ${partitions[2]}
>>>   udevadm settle
>>>
>>> The above script can sometimes result empty fsid:
>>>
>>>   loop0
>>>   |-loop0p1 BDF3-552B
>>>   `-loop0p2
>>>
>>
>>
>>
>>> [CAUSE]
>>> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
>>> open during whole time") changed the lifespan of the fds, it doesn't
>>> properly inform udev about our change to certain partition.
>>>


>>> Thus for a multi-partition case, udev can start scanning the whole disk,
>>> meanwhile our mkfs is still happening halfway.
>>>
>>> If the scan is done before our new super blocks written, and our close()
>>> calls happens later just before the current scan is finished, udev can
>>> got the temporary super blocks (which is not a valid one).
>>>
>>> And since our close() calls happens during the scan, there would be no
>>> new scan, thus leading to the bad result.
>>>
>>
>>
>>
>> I am able to reproduce the missing udev events without the device 
>> partitions on the entire device also, so its not about the flock.
>> Also, per the udevadm monitor I see no new fsid being reported for the 
>> btrfs. Please find the test case in the RFC patch below.
> 
> Please go check the issue of btrfs-progs.
> 
> Firstly, it is about flock().
> In fact the problem would be gone if using "udevadm lock" command for 
> the mkfs.btrfs.
> 
> And as I already explained, "udevadm lock" is just flock() for the 
> parent block device, with some extra fancy work like deadline and 
> deduplication.
> 

I am able to reproduce the issue on the entire device, which conflicts
with the cause you have mentioned. Ref to the test case in the RFC.

>> The problem appears to be a convoluted nested file descriptor of the 
>> primary device (which obtains the temp-super-block).
> 
> Nope.

I don't see how your fix will work when two filesystems' mkfs use
flock() simultaneously on different partitions of the same device.
IMO, this approach is incorrect.

Were you able reproduce the issue with mkfs.xfs? I couldn't; xfs
doesn't use flock() either, as I glanced.

Cleaning up the mkfs.btrfs file descriptors has been pending for a
long time, which my RFC patch did as a first step. It makes it
similar to what other filesystem mkfs do, with no nesting of open
and close per device. The udev monitors CLOSE-WRITE event, and it
works well with this RFC.

Thanks, Anand

> 
> Thanks,
> Qu
> 
>>
>> The RFC patch below optimizes the file descriptors and I find it to 
>> fix the issue. Now, both yours and my test cases pass.
>>
>> [PATCH RFC] btrfs-progs: mkfs: optimize file descriptor usage in
>>   mkfs.btrfs
>>
>> Thanks, Anand
>>
>>
>>
>>
>>> [FIX]
>>> The proper way to avoid race with udev is to flock() the whole disk
>>> (aka, the parent block device, not the partition disk).
>>>
>>> Thus this patch would introduce such mechanism by:
>>>
>>> - btrfs_flock_one_device()
>>>    This would resolve the path to a whole disk path.
>>>    Then make sure the whole disk is not already locked (this can happen
>>>    for cases like "mkfs.btrfs -f /dev/sda[123]").
>>>
>>>    If the device is not already locked, then flock() the device, and
>>>    insert a new entry into the list.
>>>
>>> - btrfs_unlock_all_devices()
>>>    Would go unlock all devices recorded in locked_devices list, and free
>>>    the memory.
>>>
>>> And mkfs.btrfs would be the first one to utilize the new mechanism, to
>>> prevent such race with udev.
>>>
>>> Issue: #734
>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>> ---
>>> Changelog:
>>> v2:
>>> - Fix the patch prefix
>>>    From "btrfs" to "btrfs-progs"
>>> ---
>>>   common/device-utils.c | 114 ++++++++++++++++++++++++++++++++++++++++++
>>>   common/device-utils.h |   3 ++
>>>   mkfs/main.c           |  11 ++++
>>>   3 files changed, 128 insertions(+)
>>>
>>> diff --git a/common/device-utils.c b/common/device-utils.c
>>> index f86120afa00c..88c21c66382d 100644
>>> --- a/common/device-utils.c
>>> +++ b/common/device-utils.c
>>> @@ -17,11 +17,13 @@
>>>   #include <sys/ioctl.h>
>>>   #include <sys/stat.h>
>>>   #include <sys/types.h>
>>> +#include <sys/file.h>
>>>   #include <linux/limits.h>
>>>   #ifdef BTRFS_ZONED
>>>   #include <linux/blkzoned.h>
>>>   #endif
>>>   #include <linux/fs.h>
>>> +#include <linux/kdev_t.h>
>>>   #include <limits.h>
>>>   #include <stdio.h>
>>>   #include <stdlib.h>
>>> @@ -48,6 +50,24 @@
>>>   #define BLKDISCARD    _IO(0x12,119)
>>>   #endif
>>>
>>> +static LIST_HEAD(locked_devices);
>>> +
>>> +/*
>>> + * This is to record flock()ed devices.
>>> + * For flock() to prevent udev races, we must lock the parent block 
>>> device,
>>> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that 
>>> case
>>> + * we should only lock "/dev/sda" once.
>>> + *
>>> + * This structure would be used to record any flocked block device (not
>>> + * the partition one), and avoid double locking.
>>> + */
>>> +struct btrfs_locked_wholedisk {
>>> +    char *full_path;
>>> +    dev_t devno;
>>> +    int fd;
>>> +    struct list_head list;
>>> +};
>>> +
>>>   /*
>>>    * Discard the given range in one go
>>>    */
>>> @@ -633,3 +653,97 @@ ssize_t btrfs_direct_pwrite(int fd, const void 
>>> *buf, size_t count, off_t offset)
>>>       free(bounce_buf);
>>>       return ret;
>>>   }
>>> +
>>> +int btrfs_flock_one_device(char *path)
>>> +{
>>> +    struct btrfs_locked_wholedisk *entry;
>>> +    struct stat st = { 0 };
>>> +    char *wholedisk_path;
>>> +    dev_t wholedisk_devno;
>>> +    int ret;
>>> +
>>> +    ret = stat(path, &st);
>>> +    if (ret < 0) {
>>> +        error("failed to stat %s: %m", path);
>>> +        return -errno;
>>> +    }
>>> +    /* Non-block device, skipping the locking. */
>>> +    if (!S_ISBLK(st.st_mode))
>>> +        return 0;
>>> +
>>> +    ret = blkid_devno_to_wholedisk(st.st_dev, path, strlen(path),
>>> +                       &wholedisk_devno);
>>> +    if (ret < 0) {
>>> +        error("failed to get the whole disk devno for %s: %m", path);
>>> +        return -errno;
>>> +    }
>>> +    wholedisk_path = blkid_devno_to_devname(wholedisk_devno);
>>> +    if (!wholedisk_path) {
>>> +        error("failed to get the devname of dev %ld:%ld",
>>> +            MAJOR(wholedisk_devno), MINOR(wholedisk_devno));
>>> +    }
>>> +
>>> +    /* Check if we already have the whole disk in the list. */
>>> +    list_for_each_entry(entry, &locked_devices, list) {
>>> +        /* The wholedisk is already locked, need to do nothing. */
>>> +        if (entry->devno == wholedisk_devno ||
>>> +            entry->full_path == wholedisk_path) {
>>> +            free(wholedisk_path);
>>> +            return 0;
>>> +        }
>>> +    }
>>> +
>>> +    /* Allocate new entry. */
>>> +    entry = malloc(sizeof(*entry));
>>> +    if (!entry) {
>>> +        errno = ENOMEM;
>>> +        error("unable to allocate new memory for %s: %m",
>>> +              wholedisk_path);
>>> +        free(wholedisk_path);
>>> +        return -errno;
>>> +    }
>>> +    entry->devno = wholedisk_devno;
>>> +    entry->full_path = wholedisk_path;
>>> +
>>> +    /* Lock the whole disk. */
>>> +    entry->fd = open(wholedisk_path, O_RDONLY);
>>> +    if (entry->fd < 0) {
>>> +        error("failed to open device %s: %m",
>>> +              wholedisk_path);
>>> +        free(wholedisk_path);
>>> +        free(entry);
>>> +        return -errno;
>>> +    }
>>> +    ret = flock(entry->fd, LOCK_EX);
>>> +    if (ret < 0) {
>>> +        error("failed to hold an exclusive lock on %s: %m",
>>> +              wholedisk_path);
>>> +        free(wholedisk_path);
>>> +        free(entry);
>>> +        return -errno;
>>> +    }
>>> +
>>> +    /* Insert it into the list. */
>>> +    list_add_tail(&entry->list, &locked_devices);
>>> +    return 0;
>>> +}
>>> +
>>> +void btrfs_unlock_all_devicecs(void)
>>> +{
>>> +    while (!list_empty(&locked_devices)) {
>>> +        struct btrfs_locked_wholedisk *entry;
>>> +        int ret;
>>> +
>>> +        entry = list_entry(locked_devices.next,
>>> +                   struct btrfs_locked_wholedisk, list);
>>> +
>>> +        list_del_init(&entry->list);
>>> +        ret = flock(entry->fd, LOCK_UN);
>>> +        if (ret < 0)
>>> +            warning("failed to unlock %s (fd %d dev %ld:%ld), 
>>> skipping it",
>>> +                entry->full_path, entry->fd, MAJOR(entry->devno),
>>> +                MINOR(entry->devno));
>>> +        free(entry->full_path);
>>> +        free(entry);
>>> +    }
>>> +}
>>> diff --git a/common/device-utils.h b/common/device-utils.h
>>> index 853d17b5ab98..3a04348a0867 100644
>>> --- a/common/device-utils.h
>>> +++ b/common/device-utils.h
>>> @@ -57,6 +57,9 @@ int btrfs_prepare_device(int fd, const char *file, 
>>> u64 *block_count_ret,
>>>   ssize_t btrfs_direct_pread(int fd, void *buf, size_t count, off_t 
>>> offset);
>>>   ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, 
>>> off_t offset);
>>>
>>> +int btrfs_flock_one_device(char *path);
>>> +void btrfs_unlock_all_devicecs(void);
>>> +
>>>   #ifdef BTRFS_ZONED
>>>   static inline ssize_t btrfs_pwrite(int fd, const void *buf, size_t 
>>> count,
>>>                      off_t offset, bool direct)
>>> diff --git a/mkfs/main.c b/mkfs/main.c
>>> index b9882208dbd5..6e6cb81a4165 100644
>>> --- a/mkfs/main.c
>>> +++ b/mkfs/main.c
>>> @@ -1723,6 +1723,15 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>>>           }
>>>       }
>>>
>>> +    /* Lock all devices to prevent race with udev probing. */
>>> +    for (i = 0; i < device_count; i++) {
>>> +        char *path = argv[optind + i - 1];
>>> +
>>> +        ret = btrfs_flock_one_device(path);
>>> +        if (ret < 0)
>>> +            warning("failed to flock %s, skipping it", path);
>>> +    }
>>> +
>>>       /* Start threads */
>>>       for (i = 0; i < device_count; i++) {
>>>           prepare_ctx[i].file = argv[optind + i - 1];
>>> @@ -2079,6 +2088,7 @@ out:
>>>       free(label);
>>>       free(source_dir);
>>>
>>> +    btrfs_unlock_all_devicecs();
>>>       return !!ret;
>>>
>>>   error:
>>> @@ -2090,6 +2100,7 @@ error:
>>>       free(prepare_ctx);
>>>       free(label);
>>>       free(source_dir);
>>> +    btrfs_unlock_all_devicecs();
>>>       exit(1);
>>>   success:
>>>       exit(0);
>>> -- 
>>> 2.43.0
>>>
>>
Qu Wenruo Feb. 1, 2024, 9:15 p.m. UTC | #13
On 2024/2/1 21:02, Anand Jain wrote:
>
>
> On 2/1/24 04:06, Qu Wenruo wrote:
>>
>>
>> On 2024/2/1 02:23, Anand Jain wrote:
>>> On 1/30/24 11:31, Qu Wenruo wrote:
>>>> [BUG]
>>>> Even after commit b2a1be83b85f ("btrfs-progs: mkfs: keep file
>>>> descriptors open during whole time"), there is still a bug report about
>>>> blkid failed to grab the FSID:
>>>>
>>>>   device=/dev/loop0
>>>>   fstype=btrfs
>>>>
>>>>   wipefs -a "$device"*
>>>>
>>>>   parted -s "$device" \
>>>>       mklabel gpt \
>>>>       mkpart '"EFI system partition"' fat32 1MiB 513MiB \
>>>>       set 1 esp on \
>>>>       mkpart '"root partition"' "$fstype" 513MiB 100%
>>>>
>>>>   udevadm settle
>>>>   partitions=($(lsblk -n -o path "$device"))
>>>>
>>>>   mkfs.fat -F 32 ${partitions[1]}
>>>>   mkfs."$fstype" ${partitions[2]}
>>>>   udevadm settle
>>>>
>>>> The above script can sometimes result empty fsid:
>>>>
>>>>   loop0
>>>>   |-loop0p1 BDF3-552B
>>>>   `-loop0p2
>>>>
>>>
>>>
>>>
>>>> [CAUSE]
>>>> Although commit b2a1be83b85f ("btrfs-progs: mkfs: keep file descriptors
>>>> open during whole time") changed the lifespan of the fds, it doesn't
>>>> properly inform udev about our change to certain partition.
>>>>
>
>
>>>> Thus for a multi-partition case, udev can start scanning the whole
>>>> disk,
>>>> meanwhile our mkfs is still happening halfway.
>>>>
>>>> If the scan is done before our new super blocks written, and our
>>>> close()
>>>> calls happens later just before the current scan is finished, udev can
>>>> got the temporary super blocks (which is not a valid one).
>>>>
>>>> And since our close() calls happens during the scan, there would be no
>>>> new scan, thus leading to the bad result.
>>>>
>>>
>>>
>>>
>>> I am able to reproduce the missing udev events without the device
>>> partitions on the entire device also, so its not about the flock.
>>> Also, per the udevadm monitor I see no new fsid being reported for
>>> the btrfs. Please find the test case in the RFC patch below.
>>
>> Please go check the issue of btrfs-progs.
>>
>> Firstly, it is about flock().
>> In fact the problem would be gone if using "udevadm lock" command for
>> the mkfs.btrfs.
>>
>> And as I already explained, "udevadm lock" is just flock() for the
>> parent block device, with some extra fancy work like deadline and
>> deduplication.
>>
>
> I am able to reproduce the issue on the entire device, which conflicts
> with the cause you have mentioned. Ref to the test case in the RFC.
>
>>> The problem appears to be a convoluted nested file descriptor of the
>>> primary device (which obtains the temp-super-block).
>>
>> Nope.
>
> I don't see how your fix will work when two filesystems' mkfs use
> flock() simultaneously on different partitions of the same device.
> IMO, this approach is incorrect.

Why not?

The first would lock the parent block device, does it work, and unlock.
The other one who loses the race would just wait until the first one
finishes its work.

And I'd say you're incorrect, or argue with udev guys:

https://systemd.io/BLOCK_DEVICE_LOCKING/

>
> Were you able reproduce the issue with mkfs.xfs? I couldn't; xfs
> doesn't use flock() either, as I glanced.

Yes, mkfs.xfs is not doing the flock, but have you tried it with an
external log device on the same disk?
I'm pretty sure it would cause the same problem.

Any fs with multi-device support (no matter if it's external log or true
multi-device support) would have the same problem.

For pure single device usecase, changing the fd lifespan may be fine,
but see the next section, as I don't think the existing nested behavior
is the cause.

>
> Cleaning up the mkfs.btrfs file descriptors has been pending for a
> long time, which my RFC patch did as a first step. It makes it
> similar to what other filesystem mkfs do, with no nesting of open
> and close per device. The udev monitors CLOSE-WRITE event, and it
> works well with this RFC.

Then explain why the original nested workflow doesn't work in the first
place.

Let me be clear, we keep the first writeable fd open (for the
preparation part), then we open the devices for fs_info open.

At the time of close of the fs_info devices, the fs is already properly
written.
Later closes of the preparation fds would only:

- Trigger a new scan (and found nothing changed)
- Do nothing if there is already a running scan

Either way, as long as the scan is triggered by the close of writeable
fds, the scan should got a correct btrfs super already.

Thus the fd lifespan change makes no difference.

I would even suggest there may be some stray writeable fd we didn't notice.
But even that's the case, the proper flock() documented by the udev guys
can handle it correctly no matter what the lifespan of writeable fds is.

Thanks,
Qu
>
> Thanks, Anand
>
>>
>> Thanks,
>> Qu
>>
>>>
>>> The RFC patch below optimizes the file descriptors and I find it to
>>> fix the issue. Now, both yours and my test cases pass.
>>>
>>> [PATCH RFC] btrfs-progs: mkfs: optimize file descriptor usage in
>>>   mkfs.btrfs
>>>
>>> Thanks, Anand
>>>
>>>
>>>
>>>
>>>> [FIX]
>>>> The proper way to avoid race with udev is to flock() the whole disk
>>>> (aka, the parent block device, not the partition disk).
>>>>
>>>> Thus this patch would introduce such mechanism by:
>>>>
>>>> - btrfs_flock_one_device()
>>>>    This would resolve the path to a whole disk path.
>>>>    Then make sure the whole disk is not already locked (this can happen
>>>>    for cases like "mkfs.btrfs -f /dev/sda[123]").
>>>>
>>>>    If the device is not already locked, then flock() the device, and
>>>>    insert a new entry into the list.
>>>>
>>>> - btrfs_unlock_all_devices()
>>>>    Would go unlock all devices recorded in locked_devices list, and
>>>> free
>>>>    the memory.
>>>>
>>>> And mkfs.btrfs would be the first one to utilize the new mechanism, to
>>>> prevent such race with udev.
>>>>
>>>> Issue: #734
>>>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>>>> ---
>>>> Changelog:
>>>> v2:
>>>> - Fix the patch prefix
>>>>    From "btrfs" to "btrfs-progs"
>>>> ---
>>>>   common/device-utils.c | 114
>>>> ++++++++++++++++++++++++++++++++++++++++++
>>>>   common/device-utils.h |   3 ++
>>>>   mkfs/main.c           |  11 ++++
>>>>   3 files changed, 128 insertions(+)
>>>>
>>>> diff --git a/common/device-utils.c b/common/device-utils.c
>>>> index f86120afa00c..88c21c66382d 100644
>>>> --- a/common/device-utils.c
>>>> +++ b/common/device-utils.c
>>>> @@ -17,11 +17,13 @@
>>>>   #include <sys/ioctl.h>
>>>>   #include <sys/stat.h>
>>>>   #include <sys/types.h>
>>>> +#include <sys/file.h>
>>>>   #include <linux/limits.h>
>>>>   #ifdef BTRFS_ZONED
>>>>   #include <linux/blkzoned.h>
>>>>   #endif
>>>>   #include <linux/fs.h>
>>>> +#include <linux/kdev_t.h>
>>>>   #include <limits.h>
>>>>   #include <stdio.h>
>>>>   #include <stdlib.h>
>>>> @@ -48,6 +50,24 @@
>>>>   #define BLKDISCARD    _IO(0x12,119)
>>>>   #endif
>>>>
>>>> +static LIST_HEAD(locked_devices);
>>>> +
>>>> +/*
>>>> + * This is to record flock()ed devices.
>>>> + * For flock() to prevent udev races, we must lock the parent block
>>>> device,
>>>> + * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that
>>>> case
>>>> + * we should only lock "/dev/sda" once.
>>>> + *
>>>> + * This structure would be used to record any flocked block device
>>>> (not
>>>> + * the partition one), and avoid double locking.
>>>> + */
>>>> +struct btrfs_locked_wholedisk {
>>>> +    char *full_path;
>>>> +    dev_t devno;
>>>> +    int fd;
>>>> +    struct list_head list;
>>>> +};
>>>> +
>>>>   /*
>>>>    * Discard the given range in one go
>>>>    */
>>>> @@ -633,3 +653,97 @@ ssize_t btrfs_direct_pwrite(int fd, const void
>>>> *buf, size_t count, off_t offset)
>>>>       free(bounce_buf);
>>>>       return ret;
>>>>   }
>>>> +
>>>> +int btrfs_flock_one_device(char *path)
>>>> +{
>>>> +    struct btrfs_locked_wholedisk *entry;
>>>> +    struct stat st = { 0 };
>>>> +    char *wholedisk_path;
>>>> +    dev_t wholedisk_devno;
>>>> +    int ret;
>>>> +
>>>> +    ret = stat(path, &st);
>>>> +    if (ret < 0) {
>>>> +        error("failed to stat %s: %m", path);
>>>> +        return -errno;
>>>> +    }
>>>> +    /* Non-block device, skipping the locking. */
>>>> +    if (!S_ISBLK(st.st_mode))
>>>> +        return 0;
>>>> +
>>>> +    ret = blkid_devno_to_wholedisk(st.st_dev, path, strlen(path),
>>>> +                       &wholedisk_devno);
>>>> +    if (ret < 0) {
>>>> +        error("failed to get the whole disk devno for %s: %m", path);
>>>> +        return -errno;
>>>> +    }
>>>> +    wholedisk_path = blkid_devno_to_devname(wholedisk_devno);
>>>> +    if (!wholedisk_path) {
>>>> +        error("failed to get the devname of dev %ld:%ld",
>>>> +            MAJOR(wholedisk_devno), MINOR(wholedisk_devno));
>>>> +    }
>>>> +
>>>> +    /* Check if we already have the whole disk in the list. */
>>>> +    list_for_each_entry(entry, &locked_devices, list) {
>>>> +        /* The wholedisk is already locked, need to do nothing. */
>>>> +        if (entry->devno == wholedisk_devno ||
>>>> +            entry->full_path == wholedisk_path) {
>>>> +            free(wholedisk_path);
>>>> +            return 0;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Allocate new entry. */
>>>> +    entry = malloc(sizeof(*entry));
>>>> +    if (!entry) {
>>>> +        errno = ENOMEM;
>>>> +        error("unable to allocate new memory for %s: %m",
>>>> +              wholedisk_path);
>>>> +        free(wholedisk_path);
>>>> +        return -errno;
>>>> +    }
>>>> +    entry->devno = wholedisk_devno;
>>>> +    entry->full_path = wholedisk_path;
>>>> +
>>>> +    /* Lock the whole disk. */
>>>> +    entry->fd = open(wholedisk_path, O_RDONLY);
>>>> +    if (entry->fd < 0) {
>>>> +        error("failed to open device %s: %m",
>>>> +              wholedisk_path);
>>>> +        free(wholedisk_path);
>>>> +        free(entry);
>>>> +        return -errno;
>>>> +    }
>>>> +    ret = flock(entry->fd, LOCK_EX);
>>>> +    if (ret < 0) {
>>>> +        error("failed to hold an exclusive lock on %s: %m",
>>>> +              wholedisk_path);
>>>> +        free(wholedisk_path);
>>>> +        free(entry);
>>>> +        return -errno;
>>>> +    }
>>>> +
>>>> +    /* Insert it into the list. */
>>>> +    list_add_tail(&entry->list, &locked_devices);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +void btrfs_unlock_all_devicecs(void)
>>>> +{
>>>> +    while (!list_empty(&locked_devices)) {
>>>> +        struct btrfs_locked_wholedisk *entry;
>>>> +        int ret;
>>>> +
>>>> +        entry = list_entry(locked_devices.next,
>>>> +                   struct btrfs_locked_wholedisk, list);
>>>> +
>>>> +        list_del_init(&entry->list);
>>>> +        ret = flock(entry->fd, LOCK_UN);
>>>> +        if (ret < 0)
>>>> +            warning("failed to unlock %s (fd %d dev %ld:%ld),
>>>> skipping it",
>>>> +                entry->full_path, entry->fd, MAJOR(entry->devno),
>>>> +                MINOR(entry->devno));
>>>> +        free(entry->full_path);
>>>> +        free(entry);
>>>> +    }
>>>> +}
>>>> diff --git a/common/device-utils.h b/common/device-utils.h
>>>> index 853d17b5ab98..3a04348a0867 100644
>>>> --- a/common/device-utils.h
>>>> +++ b/common/device-utils.h
>>>> @@ -57,6 +57,9 @@ int btrfs_prepare_device(int fd, const char *file,
>>>> u64 *block_count_ret,
>>>>   ssize_t btrfs_direct_pread(int fd, void *buf, size_t count, off_t
>>>> offset);
>>>>   ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count,
>>>> off_t offset);
>>>>
>>>> +int btrfs_flock_one_device(char *path);
>>>> +void btrfs_unlock_all_devicecs(void);
>>>> +
>>>>   #ifdef BTRFS_ZONED
>>>>   static inline ssize_t btrfs_pwrite(int fd, const void *buf, size_t
>>>> count,
>>>>                      off_t offset, bool direct)
>>>> diff --git a/mkfs/main.c b/mkfs/main.c
>>>> index b9882208dbd5..6e6cb81a4165 100644
>>>> --- a/mkfs/main.c
>>>> +++ b/mkfs/main.c
>>>> @@ -1723,6 +1723,15 @@ int BOX_MAIN(mkfs)(int argc, char **argv)
>>>>           }
>>>>       }
>>>>
>>>> +    /* Lock all devices to prevent race with udev probing. */
>>>> +    for (i = 0; i < device_count; i++) {
>>>> +        char *path = argv[optind + i - 1];
>>>> +
>>>> +        ret = btrfs_flock_one_device(path);
>>>> +        if (ret < 0)
>>>> +            warning("failed to flock %s, skipping it", path);
>>>> +    }
>>>> +
>>>>       /* Start threads */
>>>>       for (i = 0; i < device_count; i++) {
>>>>           prepare_ctx[i].file = argv[optind + i - 1];
>>>> @@ -2079,6 +2088,7 @@ out:
>>>>       free(label);
>>>>       free(source_dir);
>>>>
>>>> +    btrfs_unlock_all_devicecs();
>>>>       return !!ret;
>>>>
>>>>   error:
>>>> @@ -2090,6 +2100,7 @@ error:
>>>>       free(prepare_ctx);
>>>>       free(label);
>>>>       free(source_dir);
>>>> +    btrfs_unlock_all_devicecs();
>>>>       exit(1);
>>>>   success:
>>>>       exit(0);
>>>> --
>>>> 2.43.0
>>>>
>>>
>
diff mbox series

Patch

diff --git a/common/device-utils.c b/common/device-utils.c
index f86120afa00c..88c21c66382d 100644
--- a/common/device-utils.c
+++ b/common/device-utils.c
@@ -17,11 +17,13 @@ 
 #include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <sys/file.h>
 #include <linux/limits.h>
 #ifdef BTRFS_ZONED
 #include <linux/blkzoned.h>
 #endif
 #include <linux/fs.h>
+#include <linux/kdev_t.h>
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -48,6 +50,24 @@ 
 #define BLKDISCARD	_IO(0x12,119)
 #endif

+static LIST_HEAD(locked_devices);
+
+/*
+ * This is to record flock()ed devices.
+ * For flock() to prevent udev races, we must lock the parent block device,
+ * but we may hit cases like "mkfs.btrfs -f /dev/sda[123]", in that case
+ * we should only lock "/dev/sda" once.
+ *
+ * This structure would be used to record any flocked block device (not
+ * the partition one), and avoid double locking.
+ */
+struct btrfs_locked_wholedisk {
+	char *full_path;
+	dev_t devno;
+	int fd;
+	struct list_head list;
+};
+
 /*
  * Discard the given range in one go
  */
@@ -633,3 +653,97 @@  ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, off_t offset)
 	free(bounce_buf);
 	return ret;
 }
+
+int btrfs_flock_one_device(char *path)
+{
+	struct btrfs_locked_wholedisk *entry;
+	struct stat st = { 0 };
+	char *wholedisk_path;
+	dev_t wholedisk_devno;
+	int ret;
+
+	ret = stat(path, &st);
+	if (ret < 0) {
+		error("failed to stat %s: %m", path);
+		return -errno;
+	}
+	/* Non-block device, skipping the locking. */
+	if (!S_ISBLK(st.st_mode))
+		return 0;
+
+	ret = blkid_devno_to_wholedisk(st.st_dev, path, strlen(path),
+				       &wholedisk_devno);
+	if (ret < 0) {
+		error("failed to get the whole disk devno for %s: %m", path);
+		return -errno;
+	}
+	wholedisk_path = blkid_devno_to_devname(wholedisk_devno);
+	if (!wholedisk_path) {
+		error("failed to get the devname of dev %ld:%ld",
+			MAJOR(wholedisk_devno), MINOR(wholedisk_devno));
+	}
+
+	/* Check if we already have the whole disk in the list. */
+	list_for_each_entry(entry, &locked_devices, list) {
+		/* The wholedisk is already locked, need to do nothing. */
+		if (entry->devno == wholedisk_devno ||
+		    entry->full_path == wholedisk_path) {
+			free(wholedisk_path);
+			return 0;
+		}
+	}
+
+	/* Allocate new entry. */
+	entry = malloc(sizeof(*entry));
+	if (!entry) {
+		errno = ENOMEM;
+		error("unable to allocate new memory for %s: %m",
+		      wholedisk_path);
+		free(wholedisk_path);
+		return -errno;
+	}
+	entry->devno = wholedisk_devno;
+	entry->full_path = wholedisk_path;
+
+	/* Lock the whole disk. */
+	entry->fd = open(wholedisk_path, O_RDONLY);
+	if (entry->fd < 0) {
+		error("failed to open device %s: %m",
+		      wholedisk_path);
+		free(wholedisk_path);
+		free(entry);
+		return -errno;
+	}
+	ret = flock(entry->fd, LOCK_EX);
+	if (ret < 0) {
+		error("failed to hold an exclusive lock on %s: %m",
+		      wholedisk_path);
+		free(wholedisk_path);
+		free(entry);
+		return -errno;
+	}
+
+	/* Insert it into the list. */
+	list_add_tail(&entry->list, &locked_devices);
+	return 0;
+}
+
+void btrfs_unlock_all_devicecs(void)
+{
+	while (!list_empty(&locked_devices)) {
+		struct btrfs_locked_wholedisk *entry;
+		int ret;
+
+		entry = list_entry(locked_devices.next,
+				   struct btrfs_locked_wholedisk, list);
+
+		list_del_init(&entry->list);
+		ret = flock(entry->fd, LOCK_UN);
+		if (ret < 0)
+			warning("failed to unlock %s (fd %d dev %ld:%ld), skipping it",
+				entry->full_path, entry->fd, MAJOR(entry->devno),
+				MINOR(entry->devno));
+		free(entry->full_path);
+		free(entry);
+	}
+}
diff --git a/common/device-utils.h b/common/device-utils.h
index 853d17b5ab98..3a04348a0867 100644
--- a/common/device-utils.h
+++ b/common/device-utils.h
@@ -57,6 +57,9 @@  int btrfs_prepare_device(int fd, const char *file, u64 *block_count_ret,
 ssize_t btrfs_direct_pread(int fd, void *buf, size_t count, off_t offset);
 ssize_t btrfs_direct_pwrite(int fd, const void *buf, size_t count, off_t offset);

+int btrfs_flock_one_device(char *path);
+void btrfs_unlock_all_devicecs(void);
+
 #ifdef BTRFS_ZONED
 static inline ssize_t btrfs_pwrite(int fd, const void *buf, size_t count,
 				   off_t offset, bool direct)
diff --git a/mkfs/main.c b/mkfs/main.c
index b9882208dbd5..6e6cb81a4165 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1723,6 +1723,15 @@  int BOX_MAIN(mkfs)(int argc, char **argv)
 		}
 	}

+	/* Lock all devices to prevent race with udev probing. */
+	for (i = 0; i < device_count; i++) {
+		char *path = argv[optind + i - 1];
+
+		ret = btrfs_flock_one_device(path);
+		if (ret < 0)
+			warning("failed to flock %s, skipping it", path);
+	}
+
 	/* Start threads */
 	for (i = 0; i < device_count; i++) {
 		prepare_ctx[i].file = argv[optind + i - 1];
@@ -2079,6 +2088,7 @@  out:
 	free(label);
 	free(source_dir);

+	btrfs_unlock_all_devicecs();
 	return !!ret;

 error:
@@ -2090,6 +2100,7 @@  error:
 	free(prepare_ctx);
 	free(label);
 	free(source_dir);
+	btrfs_unlock_all_devicecs();
 	exit(1);
 success:
 	exit(0);