[2/3] btrfs: volumes: Add btrfs_fs_devices::missing_list to collect missing devices
diff mbox series

Message ID 20191107062710.67964-3-wqu@suse.com
State New
Headers show
Series
  • btrfs: More intelligent degraded chunk allocator
Related show

Commit Message

Qu Wenruo Nov. 7, 2019, 6:27 a.m. UTC
This enables btrfs to iterate missing devices separately, without
iterating all fs_devices.

This provides the basis for later degraded chunk enhancement.

The change includes:
- Add missing devices to btrfs_fs_devices::missing_list
  This happens in add_missing_dev() and at every other location where
  missing_devices is incremented.

- Remove missing devices from btrfs_fs_devices::missing_list
  This needs to cover all locations where missing_devices is decremented.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/volumes.c | 27 +++++++++++++++++++++------
 fs/btrfs/volumes.h |  6 ++++++
 2 files changed, 27 insertions(+), 6 deletions(-)

Comments

Johannes Thumshirn Nov. 7, 2019, 9:31 a.m. UTC | #1
On 07/11/2019 07:27, Qu Wenruo wrote:
> +	/*
> +	 * Devices which can't be found. Projected by chunk_mutex.
> +	 * This acts as a fallback allocation list for certain degraded mount.
> +	 */
> +	struct list_head missing_list;

From a quick glance, s/Projected/Protected/
Anand Jain Nov. 19, 2019, 10:03 a.m. UTC | #2
On 11/7/19 2:27 PM, Qu Wenruo wrote:
> This enables btrfs to iterate missing devices separately, without
> iterating all fs_devices.

  IMO.
  We don't need another list to maintain the missing devices. We already
  have good enough device lists.
  The way it's been implemented is:
  alloc_list is the only list from which we shall allocate chunks.
  Missing is a state of the device.
  A device in the alloc_list can be in the Missing state.

  If there is a missing_list, that means a device in the missing list
  is also a possible candidate for allocation, which is messy.
  Also, it's not typical to have a large enough number of missing devices
  to constitute a list of their own.

Thanks, Anand


> This provides the basis for later degraded chunk enhancement.
> 
> The change includes:
> - Add missing devices to btrfs_fs_devices::missing_list
>    This happens at add_missing_dev() and other locations where
>    missing_devices get increased.
> 
> - Remove missing devices from btrfs_fs_devices::missing_list
>    This needs to cover all locations where missing_devices get decreased.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/volumes.c | 27 +++++++++++++++++++++------
>   fs/btrfs/volumes.h |  6 ++++++
>   2 files changed, 27 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index eee5fc1d11f0..a462d8de5d2a 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -324,6 +324,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
>   
>   	INIT_LIST_HEAD(&fs_devs->devices);
>   	INIT_LIST_HEAD(&fs_devs->alloc_list);
> +	INIT_LIST_HEAD(&fs_devs->missing_list);
>   	INIT_LIST_HEAD(&fs_devs->fs_list);
>   	if (fsid)
>   		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
> @@ -1089,6 +1090,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
>   		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>   			fs_devices->missing_devices--;
>   			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +			list_del_init(&device->dev_alloc_list);
>   		}
>   	}
>   
> @@ -1250,11 +1252,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
>   	if (device->bdev)
>   		fs_devices->open_devices--;
>   
> +	list_del_init(&device->dev_alloc_list);
>   	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
> -	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
> -		list_del_init(&device->dev_alloc_list);
> +	    device->devid != BTRFS_DEV_REPLACE_DEVID)
>   		fs_devices->rw_devices--;
> -	}
>   
>   	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>   		fs_devices->missing_devices--;
> @@ -2140,6 +2141,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
>   		device->fs_devices->rw_devices--;
>   		mutex_unlock(&fs_info->chunk_mutex);
>   	}
> +	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
> +		mutex_lock(&fs_info->chunk_mutex);
> +		list_del_init(&device->dev_alloc_list);
> +		device->fs_devices->missing_devices--;
> +		mutex_unlock(&fs_info->chunk_mutex);
> +	}
>   
>   	mutex_unlock(&uuid_mutex);
>   	ret = btrfs_shrink_device(device, 0);
> @@ -2184,9 +2191,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
>   	if (cur_devices != fs_devices)
>   		fs_devices->total_devices--;
>   
> -	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
> -		cur_devices->missing_devices--;
> -
>   	btrfs_assign_next_active_device(device, NULL);
>   
>   	if (device->bdev) {
> @@ -2236,6 +2240,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
>   		device->fs_devices->rw_devices++;
>   		mutex_unlock(&fs_info->chunk_mutex);
>   	}
> +	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
> +		mutex_lock(&fs_info->chunk_mutex);
> +		list_add(&device->dev_alloc_list,
> +			 &fs_devices->missing_list);
> +		device->fs_devices->missing_devices++;
> +		mutex_unlock(&fs_info->chunk_mutex);
> +	}
>   	goto out;
>   }
>   
> @@ -2438,6 +2449,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
>   	seed_devices->opened = 1;
>   	INIT_LIST_HEAD(&seed_devices->devices);
>   	INIT_LIST_HEAD(&seed_devices->alloc_list);
> +	INIT_LIST_HEAD(&seed_devices->missing_list);
>   	mutex_init(&seed_devices->device_list_mutex);
>   
>   	mutex_lock(&fs_devices->device_list_mutex);
> @@ -6640,6 +6652,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
>   	fs_devices->num_devices++;
>   
>   	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +	list_add(&device->dev_alloc_list, &fs_devices->missing_list);
>   	fs_devices->missing_devices++;
>   
>   	return device;
> @@ -6979,6 +6992,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>   			 */
>   			device->fs_devices->missing_devices++;
>   			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
> +			list_add(&device->dev_alloc_list, &fs_devices->missing_list);
>   		}
>   
>   		/* Move the device to its own fs_devices */
> @@ -6992,6 +7006,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>   
>   			device->fs_devices->missing_devices--;
>   			fs_devices->missing_devices++;
> +			list_move(&device->dev_alloc_list, &fs_devices->missing_list);
>   
>   			device->fs_devices = fs_devices;
>   		}
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index a7da1f3e3627..9cef4dc4b5be 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -253,6 +253,12 @@ struct btrfs_fs_devices {
>   	 */
>   	struct list_head alloc_list;
>   
> +	/*
> +	 * Devices which can't be found. Projected by chunk_mutex.
> +	 * This acts as a fallback allocation list for certain degraded mount.
> +	 */
> +	struct list_head missing_list;
> +
>   	struct btrfs_fs_devices *seed;
>   	int seeding;
>   
>
Qu Wenruo Nov. 19, 2019, 10:29 a.m. UTC | #3
On 2019/11/19 下午6:03, Anand Jain wrote:
> On 11/7/19 2:27 PM, Qu Wenruo wrote:
>> This enables btrfs to iterate missing devices separately, without
>> iterating all fs_devices.
> 
>  IMO.
>  We don't need another list to maintain the missing device. We already
>  have good enough device lists.
>  The way its been implemented is
>  Allo_list is the only list from which we shall alloc the chunks.
>  Missing is a state of the device.
>  A device in the alloc list can be in the Missing state.

That would cause a problem, especially when you only want to use missing
devices as a last resort.

IIRC it was you who mentioned this was a problem in my original design (which
put all missing devices into alloc_list). Or was it David?

> 
>  If there is missing_list that means the device in the missing list
>  is also possible candidate for the alloc that's messy.

But when you want to avoid missing devices, having both alloc_list and
missing_list makes sense.

E.g. 6 devices RAID5, with one missing device, we should *avoid* using
missing devices as we have enough (5) devices to allocate from.

>  Also its not typical to have a larger number of missing devices
>  to constitute its own list.

That's just for now.

If we're going to allow RAID10 to lose half of its devices, then it
would be a problem.

Thanks,
Qu

> 
> Thanks, Anand
> 
> 
>> This provides the basis for later degraded chunk enhancement.
>>
>> The change includes:
>> - Add missing devices to btrfs_fs_devices::missing_list
>>    This happens at add_missing_dev() and other locations where
>>    missing_devices get increased.
>>
>> - Remove missing devices from btrfs_fs_devices::missing_list
>>    This needs to cover all locations where missing_devices get decreased.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/volumes.c | 27 +++++++++++++++++++++------
>>   fs/btrfs/volumes.h |  6 ++++++
>>   2 files changed, 27 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index eee5fc1d11f0..a462d8de5d2a 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -324,6 +324,7 @@ static struct btrfs_fs_devices
>> *alloc_fs_devices(const u8 *fsid,
>>         INIT_LIST_HEAD(&fs_devs->devices);
>>       INIT_LIST_HEAD(&fs_devs->alloc_list);
>> +    INIT_LIST_HEAD(&fs_devs->missing_list);
>>       INIT_LIST_HEAD(&fs_devs->fs_list);
>>       if (fsid)
>>           memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
>> @@ -1089,6 +1090,7 @@ static noinline struct btrfs_device
>> *device_list_add(const char *path,
>>           if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>>               fs_devices->missing_devices--;
>>               clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
>> +            list_del_init(&device->dev_alloc_list);
>>           }
>>       }
>>   @@ -1250,11 +1252,10 @@ static void btrfs_close_one_device(struct
>> btrfs_device *device)
>>       if (device->bdev)
>>           fs_devices->open_devices--;
>>   +    list_del_init(&device->dev_alloc_list);
>>       if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
>> -        device->devid != BTRFS_DEV_REPLACE_DEVID) {
>> -        list_del_init(&device->dev_alloc_list);
>> +        device->devid != BTRFS_DEV_REPLACE_DEVID)
>>           fs_devices->rw_devices--;
>> -    }
>>         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>>           fs_devices->missing_devices--;
>> @@ -2140,6 +2141,12 @@ int btrfs_rm_device(struct btrfs_fs_info
>> *fs_info, const char *device_path,
>>           device->fs_devices->rw_devices--;
>>           mutex_unlock(&fs_info->chunk_mutex);
>>       }
>> +    if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>> +        mutex_lock(&fs_info->chunk_mutex);
>> +        list_del_init(&device->dev_alloc_list);
>> +        device->fs_devices->missing_devices--;
>> +        mutex_unlock(&fs_info->chunk_mutex);
>> +    }
>>         mutex_unlock(&uuid_mutex);
>>       ret = btrfs_shrink_device(device, 0);
>> @@ -2184,9 +2191,6 @@ int btrfs_rm_device(struct btrfs_fs_info
>> *fs_info, const char *device_path,
>>       if (cur_devices != fs_devices)
>>           fs_devices->total_devices--;
>>   -    if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>> -        cur_devices->missing_devices--;
>> -
>>       btrfs_assign_next_active_device(device, NULL);
>>         if (device->bdev) {
>> @@ -2236,6 +2240,13 @@ int btrfs_rm_device(struct btrfs_fs_info
>> *fs_info, const char *device_path,
>>           device->fs_devices->rw_devices++;
>>           mutex_unlock(&fs_info->chunk_mutex);
>>       }
>> +    if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
>> +        mutex_lock(&fs_info->chunk_mutex);
>> +        list_add(&device->dev_alloc_list,
>> +             &fs_devices->missing_list);
>> +        device->fs_devices->missing_devices++;
>> +        mutex_unlock(&fs_info->chunk_mutex);
>> +    }
>>       goto out;
>>   }
>>   @@ -2438,6 +2449,7 @@ static int btrfs_prepare_sprout(struct
>> btrfs_fs_info *fs_info)
>>       seed_devices->opened = 1;
>>       INIT_LIST_HEAD(&seed_devices->devices);
>>       INIT_LIST_HEAD(&seed_devices->alloc_list);
>> +    INIT_LIST_HEAD(&seed_devices->missing_list);
>>       mutex_init(&seed_devices->device_list_mutex);
>>         mutex_lock(&fs_devices->device_list_mutex);
>> @@ -6640,6 +6652,7 @@ static struct btrfs_device
>> *add_missing_dev(struct btrfs_fs_devices *fs_devices,
>>       fs_devices->num_devices++;
>>         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
>> +    list_add(&device->dev_alloc_list, &fs_devices->missing_list);
>>       fs_devices->missing_devices++;
>>         return device;
>> @@ -6979,6 +6992,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>>                */
>>               device->fs_devices->missing_devices++;
>>               set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
>> +            list_add(&device->dev_alloc_list,
>> &fs_devices->missing_list);
>>           }
>>             /* Move the device to its own fs_devices */
>> @@ -6992,6 +7006,7 @@ static int read_one_dev(struct extent_buffer *leaf,
>>                 device->fs_devices->missing_devices--;
>>               fs_devices->missing_devices++;
>> +            list_move(&device->dev_alloc_list,
>> &fs_devices->missing_list);
>>                 device->fs_devices = fs_devices;
>>           }
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index a7da1f3e3627..9cef4dc4b5be 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -253,6 +253,12 @@ struct btrfs_fs_devices {
>>        */
>>       struct list_head alloc_list;
>>   +    /*
>> +     * Devices which can't be found. Projected by chunk_mutex.
>> +     * This acts as a fallback allocation list for certain degraded
>> mount.
>> +     */
>> +    struct list_head missing_list;
>> +
>>       struct btrfs_fs_devices *seed;
>>       int seeding;
>>  
>
David Sterba Nov. 27, 2019, 7:36 p.m. UTC | #4
On Tue, Nov 19, 2019 at 06:29:46PM +0800, Qu Wenruo wrote:
> On 2019/11/19 下午6:03, Anand Jain wrote:
> > On 11/7/19 2:27 PM, Qu Wenruo wrote:
> >> This enables btrfs to iterate missing devices separately, without
> >> iterating all fs_devices.
> > 
> >  IMO.
> >  We don't need another list to maintain the missing device. We already
> >  have good enough device lists.
> >  The way its been implemented is
> >  Allo_list is the only list from which we shall alloc the chunks.
> >  Missing is a state of the device.
> >  A device in the alloc list can be in the Missing state.
> 
> That would cause problem, especially when you only want to use missing
> device as last resort method.
> 
> IIRC it's you mentioned this is a problem in my original design (which
> put all missing deviecs into alloc_list). Or it's David?
> 
> > 
> >  If there is missing_list that means the device in the missing list
> >  is also possible candidate for the alloc that's messy.
> 
> But when you want to avoid missing device, alloc_list and missing_list
> makes sense.
> 
> E.g. 6 devices RAID5, with one missing device, we should *avoid* using
> missing devices as we have enough (5) devices to allocate from.

I tend to agree that adding more lists would make things messy. The
missing state bit and the device's presence in the list would have to be
kept in sync, and there's also the counter of missing devices. That there
are typically only very few missing devices is also something to consider.

The device selection in __btrfs_alloc_chunk can avoid that. There's an
array allocated, with some size related data then it's passed to qsort
so the first N drives will be used for the chunk.

In case the degraded allocation is allowed (as mentioned in the other
mail, only for the mirrored profiles)

* add the missing device to the array
* update the comparison function btrfs_cmp_device_info to order missing
  devices to the end

Then the same logic "first N" would work here.

Patch
diff mbox series

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eee5fc1d11f0..a462d8de5d2a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -324,6 +324,7 @@  static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 
 	INIT_LIST_HEAD(&fs_devs->devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
+	INIT_LIST_HEAD(&fs_devs->missing_list);
 	INIT_LIST_HEAD(&fs_devs->fs_list);
 	if (fsid)
 		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -1089,6 +1090,7 @@  static noinline struct btrfs_device *device_list_add(const char *path,
 		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 			fs_devices->missing_devices--;
 			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+			list_del_init(&device->dev_alloc_list);
 		}
 	}
 
@@ -1250,11 +1252,10 @@  static void btrfs_close_one_device(struct btrfs_device *device)
 	if (device->bdev)
 		fs_devices->open_devices--;
 
+	list_del_init(&device->dev_alloc_list);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
-	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
-		list_del_init(&device->dev_alloc_list);
+	    device->devid != BTRFS_DEV_REPLACE_DEVID)
 		fs_devices->rw_devices--;
-	}
 
 	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 		fs_devices->missing_devices--;
@@ -2140,6 +2141,12 @@  int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		device->fs_devices->rw_devices--;
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+		mutex_lock(&fs_info->chunk_mutex);
+		list_del_init(&device->dev_alloc_list);
+		device->fs_devices->missing_devices--;
+		mutex_unlock(&fs_info->chunk_mutex);
+	}
 
 	mutex_unlock(&uuid_mutex);
 	ret = btrfs_shrink_device(device, 0);
@@ -2184,9 +2191,6 @@  int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	if (cur_devices != fs_devices)
 		fs_devices->total_devices--;
 
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
-		cur_devices->missing_devices--;
-
 	btrfs_assign_next_active_device(device, NULL);
 
 	if (device->bdev) {
@@ -2236,6 +2240,13 @@  int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 		device->fs_devices->rw_devices++;
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+		mutex_lock(&fs_info->chunk_mutex);
+		list_add(&device->dev_alloc_list,
+			 &fs_devices->missing_list);
+		device->fs_devices->missing_devices++;
+		mutex_unlock(&fs_info->chunk_mutex);
+	}
 	goto out;
 }
 
@@ -2438,6 +2449,7 @@  static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
 	seed_devices->opened = 1;
 	INIT_LIST_HEAD(&seed_devices->devices);
 	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	INIT_LIST_HEAD(&seed_devices->missing_list);
 	mutex_init(&seed_devices->device_list_mutex);
 
 	mutex_lock(&fs_devices->device_list_mutex);
@@ -6640,6 +6652,7 @@  static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
 	fs_devices->num_devices++;
 
 	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+	list_add(&device->dev_alloc_list, &fs_devices->missing_list);
 	fs_devices->missing_devices++;
 
 	return device;
@@ -6979,6 +6992,7 @@  static int read_one_dev(struct extent_buffer *leaf,
 			 */
 			device->fs_devices->missing_devices++;
 			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+			list_add(&device->dev_alloc_list, &fs_devices->missing_list);
 		}
 
 		/* Move the device to its own fs_devices */
@@ -6992,6 +7006,7 @@  static int read_one_dev(struct extent_buffer *leaf,
 
 			device->fs_devices->missing_devices--;
 			fs_devices->missing_devices++;
+			list_move(&device->dev_alloc_list, &fs_devices->missing_list);
 
 			device->fs_devices = fs_devices;
 		}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a7da1f3e3627..9cef4dc4b5be 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,12 @@  struct btrfs_fs_devices {
 	 */
 	struct list_head alloc_list;
 
+	/*
+	 * Devices which can't be found. Protected by chunk_mutex.
+	 * This acts as a fallback allocation list for certain degraded mounts.
+	 */
+	struct list_head missing_list;
+
 	struct btrfs_fs_devices *seed;
 	int seeding;