diff mbox

[v2,2/3] btrfs: add read_mirror_policy parameter devid

Message ID 20180516100359.7752-3-anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Anand Jain May 16, 2018, 10:03 a.m. UTC
Adds the mount option:
  mount -o read_mirror_policy=<devid>

To set the devid of the device which should be used for read. That
means all the normal reads will go to that particular device only.

This also helps testing and gives a better control for the test
scripts including mount context reads.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/super.c   | 21 +++++++++++++++++++++
 fs/btrfs/volumes.c | 10 ++++++++++
 fs/btrfs/volumes.h |  2 ++
 3 files changed, 33 insertions(+)

Comments

Steven Davies Jan. 21, 2019, 11:56 a.m. UTC | #1
On 2018-05-16 11:03, Anand Jain wrote:

Going back to an old patchset I was testing this weekend:

> Adds the mount option:
>   mount -o read_mirror_policy=<devid>
> 
> To set the devid of the device which should be used for read. That
> means all the normal reads will go to that particular device only.
> 
> This also helps testing and gives a better control for the test
> scripts including mount context reads.
> 
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
Tested-By: Steven Davies <btrfs-list@steev.me.uk>

> ---
>  fs/btrfs/super.c   | 21 +++++++++++++++++++++
>  fs/btrfs/volumes.c | 10 ++++++++++
>  fs/btrfs/volumes.h |  2 ++
>  3 files changed, 33 insertions(+)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 21eff0ac1e4f..7ddecf4178a6 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -852,6 +852,27 @@ int btrfs_parse_options(struct btrfs_fs_info
> *info, char *options,
>  					BTRFS_READ_MIRROR_BY_PID;
>  				break;
>  			}
> +
> +			intarg = 0;
> +			if (match_int(&args[0], &intarg) == 0) {
> +				struct btrfs_device *device;
> +
> +				device = btrfs_find_device(info, intarg,
> +							   NULL, NULL);
> +				if (!device) {
> +					btrfs_err(info,
> +					  "read_mirror_policy: invalid devid %d",
> +					  intarg);
> +					ret = -EINVAL;
> +					goto out;
> +				}
> +				info->read_mirror_policy =
> +						BTRFS_READ_MIRROR_BY_DEV;
> +				set_bit(BTRFS_DEV_STATE_READ_MIRROR,
> +					&device->dev_state);

Not an expert, but might this be overwritten with another state? e.g. 
BTRFS_DEV_STATE_REPLACE_TGT. The device would then lose its READ_MIRROR 
flag and the fs would always use the first device for reading.

> +				break;
> +			}

I noticed that it's possible to pass this option multiple times at 
mount, which sets multiple devices as read mirrors. While that doesn't 
do anything harmful, the code below will only use the first device. It 
may be worth at least mentioning this in documentation.

> +
>  			ret = -EINVAL;
>  			goto out;
>  		case Opt_err:
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 64dba5c4cf33..3a9c3804ff17 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5220,6 +5220,16 @@ static int find_live_mirror(struct
> btrfs_fs_info *fs_info,
>  		num_stripes = map->num_stripes;
> 
>  	switch(fs_info->read_mirror_policy) {
> +	case BTRFS_READ_MIRROR_BY_DEV:
> +		preferred_mirror = first;
> +		if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
> +			     &map->stripes[preferred_mirror].dev->dev_state))
> +			break;
> +		if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
> +			     &map->stripes[++preferred_mirror].dev->dev_state))
> +			break;
> +		preferred_mirror = first;

Why set preferred_mirror again? The only effect of re-setting it will be 
to use the lowest devid (1) rather than the highest (2). Is there any 
difference? Either way it should never happen, because the above code 
traps it (except when the BTRFS_DEV_STATE_READ_MIRROR flag has been 
changed as mentioned above).

 From Goffredo's comment[1] on Timofey's similar effect patch, if it 
becomes possible to have more mirrors in a RAID1/RAID10 fs then this 
code will need to be updated to test dev_state for more than two 
devices. Would it be sensible to implement this as a for loop straight 
away?

> +		break;
>  	case BTRFS_READ_MIRROR_DEFAULT:
>  	case BTRFS_READ_MIRROR_BY_PID:
>  	default:
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 953df9925832..55b2eebbf183 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -37,6 +37,7 @@ struct btrfs_pending_bios {
>  enum btrfs_read_mirror_type {
>  	BTRFS_READ_MIRROR_DEFAULT,
>  	BTRFS_READ_MIRROR_BY_PID,
> +	BTRFS_READ_MIRROR_BY_DEV,
>  };
> 
>  #define BTRFS_DEV_STATE_WRITEABLE	(0)
> @@ -44,6 +45,7 @@ enum btrfs_read_mirror_type {
>  #define BTRFS_DEV_STATE_MISSING		(2)
>  #define BTRFS_DEV_STATE_REPLACE_TGT	(3)
>  #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
> +#define BTRFS_DEV_STATE_READ_MIRROR	(5)
> 
>  struct btrfs_device {
>  	struct list_head dev_list;

Happy to add Tested-By on patch 1/3 too, but I haven't looked at 3/3 
yet.

[1]: https://patchwork.kernel.org/patch/10678531/#22315779
Anand Jain Jan. 22, 2019, 1:43 p.m. UTC | #2
On 01/21/2019 07:56 PM, Steven Davies wrote:
> On 2018-05-16 11:03, Anand Jain wrote:
> 
> Going back to an old patchset I was testing this weekend:
> 
>> Adds the mount option:
>>   mount -o read_mirror_policy=<devid>
>>
>> To set the devid of the device which should be used for read. That
>> means all the normal reads will go to that particular device only.
>>
>> This also helps testing and gives a better control for the test
>> scripts including mount context reads.
>>
>> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> Tested-By: Steven Davies <btrfs-list@steev.me.uk>

Thanks for testing.

> 
>> ---
>>  fs/btrfs/super.c   | 21 +++++++++++++++++++++
>>  fs/btrfs/volumes.c | 10 ++++++++++
>>  fs/btrfs/volumes.h |  2 ++
>>  3 files changed, 33 insertions(+)
>>
>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>> index 21eff0ac1e4f..7ddecf4178a6 100644
>> --- a/fs/btrfs/super.c
>> +++ b/fs/btrfs/super.c
>> @@ -852,6 +852,27 @@ int btrfs_parse_options(struct btrfs_fs_info
>> *info, char *options,
>>                      BTRFS_READ_MIRROR_BY_PID;
>>                  break;
>>              }
>> +
>> +            intarg = 0;
>> +            if (match_int(&args[0], &intarg) == 0) {
>> +                struct btrfs_device *device;
>> +
>> +                device = btrfs_find_device(info, intarg,
>> +                               NULL, NULL);
>> +                if (!device) {
>> +                    btrfs_err(info,
>> +                      "read_mirror_policy: invalid devid %d",
>> +                      intarg);
>> +                    ret = -EINVAL;
>> +                    goto out;
>> +                }
>> +                info->read_mirror_policy =
>> +                        BTRFS_READ_MIRROR_BY_DEV;
>> +                set_bit(BTRFS_DEV_STATE_READ_MIRROR,
>> +                    &device->dev_state);
> 
> Not an expert, but might this be overwritten with another state? e.g. 
> BTRFS_DEV_STATE_REPLACE_TGT. The device would then lose its READ_MIRROR 
> flag and the fs would always use the first device for reading.

  It won't it is defined as bitmap.

>> +                break;
>> +            }
> 
> I noticed that it's possible to pass this option multiple times at 
> mount, which sets multiple devices as read mirrors. While that doesn't 
> do anything harmful, the code below will only use the first device. It 
> may be worth at least mentioning this in documentation.

  There were few feedback if read_mirror_policy should be a mount
  option or a sysfs or a property. IMO property is better as it would
  be persistent. In sysfs and mount-option, the user or a config file
  has to remember. Will fix.

>> +
>>              ret = -EINVAL;
>>              goto out;
>>          case Opt_err:
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 64dba5c4cf33..3a9c3804ff17 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -5220,6 +5220,16 @@ static int find_live_mirror(struct
>> btrfs_fs_info *fs_info,
>>          num_stripes = map->num_stripes;
>>
>>      switch(fs_info->read_mirror_policy) {
>> +    case BTRFS_READ_MIRROR_BY_DEV:
>> +        preferred_mirror = first;
>> +        if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
>> +                 &map->stripes[preferred_mirror].dev->dev_state))
>> +            break;
>> +        if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
>> +                 &map->stripes[++preferred_mirror].dev->dev_state)) <--[*]
>> +            break;
>> +        preferred_mirror = first;
> 
> Why set preferred_mirror again? The only effect of re-setting it will be 
> to use the lowest devid (1) rather than the highest (2). Is there any 
> difference? Either way it should never happen, because the above code 
> traps it (except when the BTRFS_DEV_STATE_READ_MIRROR flag has been 
> changed as mentioned above).

  Code at [*] above does ++preferred_mirror. So the following
    preferred_mirror = first;
  is not redundant.

>  From Goffredo's comment[1] on Timofey's similar effect patch, if it 
> becomes possible to have more mirrors in a RAID1/RAID10 fs then this 
> code will need to be updated to test dev_state for more than two 
> devices. Would it be sensible to implement this as a for loop straight 
> away?

  Right. A loop is better, will add.

>> +        break;
>>      case BTRFS_READ_MIRROR_DEFAULT:
>>      case BTRFS_READ_MIRROR_BY_PID:
>>      default:
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 953df9925832..55b2eebbf183 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -37,6 +37,7 @@ struct btrfs_pending_bios {
>>  enum btrfs_read_mirror_type {
>>      BTRFS_READ_MIRROR_DEFAULT,
>>      BTRFS_READ_MIRROR_BY_PID,
>> +    BTRFS_READ_MIRROR_BY_DEV,
>>  };
>>
>>  #define BTRFS_DEV_STATE_WRITEABLE    (0)
>> @@ -44,6 +45,7 @@ enum btrfs_read_mirror_type {
>>  #define BTRFS_DEV_STATE_MISSING        (2)
>>  #define BTRFS_DEV_STATE_REPLACE_TGT    (3)
>>  #define BTRFS_DEV_STATE_FLUSH_SENT    (4)
>> +#define BTRFS_DEV_STATE_READ_MIRROR    (5)
>>
>>  struct btrfs_device {
>>      struct list_head dev_list;
> 
> Happy to add Tested-By on patch 1/3 too, but I haven't looked at 3/3 yet.
> 
> [1]: https://patchwork.kernel.org/patch/10678531/#22315779


  Thanks,
Anand.
Steven Davies Jan. 22, 2019, 2:28 p.m. UTC | #3
On 2019-01-22 13:43, Anand Jain wrote:
> On 01/21/2019 07:56 PM, Steven Davies wrote:
>> On 2018-05-16 11:03, Anand Jain wrote:

>>> +                break;
>>> +            }
>> 
>> I noticed that it's possible to pass this option multiple times at 
>> mount, which sets multiple devices as read mirrors. While that doesn't 
>> do anything harmful, the code below will only use the first device. It 
>> may be worth at least mentioning this in documentation.
> 
>  There were few feedback if read_mirror_policy should be a mount
>  option or a sysfs or a property. IMO property is better as it would
>  be persistent. In sysfs and mount-option, the user or a config file
>  has to remember. Will fix.

I agree. Would/could this be set at mkfs time? It would then be similar 
to how mdadm's --write-mostly is set up.

>>> +
>>>              ret = -EINVAL;
>>>              goto out;
>>>          case Opt_err:
>>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>>> index 64dba5c4cf33..3a9c3804ff17 100644
>>> --- a/fs/btrfs/volumes.c
>>> +++ b/fs/btrfs/volumes.c
>>> @@ -5220,6 +5220,16 @@ static int find_live_mirror(struct
>>> btrfs_fs_info *fs_info,
>>>          num_stripes = map->num_stripes;
>>> 
>>>      switch(fs_info->read_mirror_policy) {
>>> +    case BTRFS_READ_MIRROR_BY_DEV:
>>> +        preferred_mirror = first;
>>> +        if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
>>> +                 &map->stripes[preferred_mirror].dev->dev_state))
>>> +            break;
>>> +        if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
>>> +                 &map->stripes[++preferred_mirror].dev->dev_state)) 
>>> <--[*]
>>> +            break;
>>> +        preferred_mirror = first;
>> 
>> Why set preferred_mirror again? The only effect of re-setting it will 
>> be to use the lowest devid (1) rather than the highest (2). Is there 
>> any difference? Either way it should never happen, because the above 
>> code traps it (except when the BTRFS_DEV_STATE_READ_MIRROR flag has 
>> been changed as mentioned above).
> 
>  Code at [*] above does ++preferred_mirror. So the following
>    preferred_mirror = first;
>  is not redundant.

Yes, but preferred_mirror will be first + num_stripes; meaning that 
without that line the last stripe is preferred and with it the first 
stripe is preferred. It only changes the fallback case from reading from 
devid 2 to devid 1, which barely matters. Perhaps it would be even 
better to fall back to pid % num_stripes in this case anyway?

>>  From Goffredo's comment[1] on Timofey's similar effect patch, if it 
>> becomes possible to have more mirrors in a RAID1/RAID10 fs then this 
>> code will need to be updated to test dev_state for more than two 
>> devices. Would it be sensible to implement this as a for loop straight 
>> away?
> 
>  Right. A loop is better, will add.

Thinking more about this, if it becomes possible to have more than two 
devices in a RAID1 or part-RAID10 then do we also need to consider that 
there may be more than one READ_MIRROR drive? e.g. slow+fast+fast drives 
in a RAID1 with this approach would result in all chunks being read from 
the first drive with READ_MIRROR set if both chunks are on the fast 
drives. This is where Timofey's queue length patch in [1] would become 
useful - in any case, that is an existing problem and probably deserving 
of an entirely new patch.

>> [1]: https://patchwork.kernel.org/patch/10678531/#22315779
> 
> 
>  Thanks,
> Anand.

Steve
diff mbox

Patch

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 21eff0ac1e4f..7ddecf4178a6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -852,6 +852,27 @@  int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 					BTRFS_READ_MIRROR_BY_PID;
 				break;
 			}
+
+			intarg = 0;
+			if (match_int(&args[0], &intarg) == 0) {
+				struct btrfs_device *device;
+
+				device = btrfs_find_device(info, intarg,
+							   NULL, NULL);
+				if (!device) {
+					btrfs_err(info,
+					  "read_mirror_policy: invalid devid %d",
+					  intarg);
+					ret = -EINVAL;
+					goto out;
+				}
+				info->read_mirror_policy =
+						BTRFS_READ_MIRROR_BY_DEV;
+				set_bit(BTRFS_DEV_STATE_READ_MIRROR,
+					&device->dev_state);
+				break;
+			}
+
 			ret = -EINVAL;
 			goto out;
 		case Opt_err:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 64dba5c4cf33..3a9c3804ff17 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5220,6 +5220,16 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 		num_stripes = map->num_stripes;
 
 	switch(fs_info->read_mirror_policy) {
+	case BTRFS_READ_MIRROR_BY_DEV:
+		preferred_mirror = first;
+		if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+			     &map->stripes[preferred_mirror].dev->dev_state))
+			break;
+		if (test_bit(BTRFS_DEV_STATE_READ_MIRROR,
+			     &map->stripes[++preferred_mirror].dev->dev_state))
+			break;
+		preferred_mirror = first;
+		break;
 	case BTRFS_READ_MIRROR_DEFAULT:
 	case BTRFS_READ_MIRROR_BY_PID:
 	default:
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 953df9925832..55b2eebbf183 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -37,6 +37,7 @@  struct btrfs_pending_bios {
 enum btrfs_read_mirror_type {
 	BTRFS_READ_MIRROR_DEFAULT,
 	BTRFS_READ_MIRROR_BY_PID,
+	BTRFS_READ_MIRROR_BY_DEV,
 };
 
 #define BTRFS_DEV_STATE_WRITEABLE	(0)
@@ -44,6 +45,7 @@  enum btrfs_read_mirror_type {
 #define BTRFS_DEV_STATE_MISSING		(2)
 #define BTRFS_DEV_STATE_REPLACE_TGT	(3)
 #define BTRFS_DEV_STATE_FLUSH_SENT	(4)
+#define BTRFS_DEV_STATE_READ_MIRROR	(5)
 
 struct btrfs_device {
 	struct list_head dev_list;