diff mbox series

[for-5.0,v2,10/23] quorum: Implement .bdrv_recurse_can_replace()

Message ID 20191111160216.197086-11-mreitz@redhat.com (mailing list archive)
State New, archived
Headers show
Series block: Fix check_to_replace_node() | expand

Commit Message

Max Reitz Nov. 11, 2019, 4:02 p.m. UTC
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

Comments

Vladimir Sementsov-Ogievskiy Nov. 29, 2019, 10:18 a.m. UTC | #1
11.11.2019 19:02, Max Reitz wrote:
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>   block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 62 insertions(+)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index 3a824e77e3..8ee03e9baf 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
>       return false;
>   }
>   
> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
> +                                       BlockDriverState *to_replace)
> +{
> +    BDRVQuorumState *s = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < s->num_children; i++) {
> +        /*
> +         * We have no idea whether our children show the same data as
> +         * this node (@bs).  It is actually highly likely that
> +         * @to_replace does not, because replacing a broken child is
> +         * one of the main use cases here.
> +         *
> +         * We do know that the new BDS will match @bs, so replacing
> +         * any of our children by it will be safe.  It cannot change
> +         * the data this quorum node presents to its parents.
> +         *
> +         * However, replacing @to_replace by @bs in any of our
> +         * children's chains may change visible data somewhere in
> +         * there.  We therefore cannot recurse down those chains with
> +         * bdrv_recurse_can_replace().
> +         * (More formally, bdrv_recurse_can_replace() requires that
> +         * @to_replace will be replaced by something matching the @bs
> +         * passed to it.  We cannot guarantee that.)
> +         *
> +         * Thus, we can only check whether any of our immediate
> +         * children matches @to_replace.
> +         *
> +         * (In the future, we might add a function to recurse down a
> +         * chain that checks that nothing there cares about a change
> +         * in data from the respective child in question.  For
> +         * example, most filters do not care when their child's data
> +         * suddenly changes, as long as their parents do not care.)
> +         */
> +        if (s->children[i].child->bs == to_replace) {
> +            Error *local_err = NULL;

bdrv_child_refresh_perms returns int, so I suggest instead:


bool ok;

> +
> +            /*
> +             * We now have to ensure that there is no other parent
> +             * that cares about replacing this child by a node with
> +             * potentially different data.
> +             */
> +            s->children[i].to_be_replaced = true;
> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);

ok = !bdrv_child_refresh_perms(bs, s->children[i].child, NULL);

> +
> +            /* Revert permissions */
> +            s->children[i].to_be_replaced = false;
> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);

return ok;

Or, similarly, declare "int ret;", assign "ret = bdrv_child_refresh_perms(...);", and "return !ret;".

> +
> +            if (local_err) {
> +                error_free(local_err);
> +                return false;
> +            }
> +
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
>   static int quorum_valid_threshold(int threshold, int num_children, Error **errp)
>   {
>   
> @@ -1195,6 +1256,7 @@ static BlockDriver bdrv_quorum = {
>   
>       .is_filter                          = true,
>       .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
> +    .bdrv_recurse_can_replace           = quorum_recurse_can_replace,
>   
>       .strong_runtime_opts                = quorum_strong_runtime_opts,
>   };
> 

with or without my suggestion:
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Max Reitz Nov. 29, 2019, 12:50 p.m. UTC | #2
On 29.11.19 11:18, Vladimir Sementsov-Ogievskiy wrote:
> 11.11.2019 19:02, Max Reitz wrote:
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>   block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 62 insertions(+)
>>
>> diff --git a/block/quorum.c b/block/quorum.c
>> index 3a824e77e3..8ee03e9baf 100644
>> --- a/block/quorum.c
>> +++ b/block/quorum.c
>> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
>>       return false;
>>   }
>>   
>> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
>> +                                       BlockDriverState *to_replace)
>> +{
>> +    BDRVQuorumState *s = bs->opaque;
>> +    int i;
>> +
>> +    for (i = 0; i < s->num_children; i++) {
>> +        /*
>> +         * We have no idea whether our children show the same data as
>> +         * this node (@bs).  It is actually highly likely that
>> +         * @to_replace does not, because replacing a broken child is
>> +         * one of the main use cases here.
>> +         *
>> +         * We do know that the new BDS will match @bs, so replacing
>> +         * any of our children by it will be safe.  It cannot change
>> +         * the data this quorum node presents to its parents.
>> +         *
>> +         * However, replacing @to_replace by @bs in any of our
>> +         * children's chains may change visible data somewhere in
>> +         * there.  We therefore cannot recurse down those chains with
>> +         * bdrv_recurse_can_replace().
>> +         * (More formally, bdrv_recurse_can_replace() requires that
>> +         * @to_replace will be replaced by something matching the @bs
>> +         * passed to it.  We cannot guarantee that.)
>> +         *
>> +         * Thus, we can only check whether any of our immediate
>> +         * children matches @to_replace.
>> +         *
>> +         * (In the future, we might add a function to recurse down a
>> +         * chain that checks that nothing there cares about a change
>> +         * in data from the respective child in question.  For
>> +         * example, most filters do not care when their child's data
>> +         * suddenly changes, as long as their parents do not care.)
>> +         */
>> +        if (s->children[i].child->bs == to_replace) {
>> +            Error *local_err = NULL;
> 
> bdrv_child_refresh_perms returns int, so I suggest instead:

Good to know. :-)

> bool ok;
> 
>> +
>> +            /*
>> +             * We now have to ensure that there is no other parent
>> +             * that cares about replacing this child by a node with
>> +             * potentially different data.
>> +             */
>> +            s->children[i].to_be_replaced = true;
>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
> 
> ok = !bdrv_child_refresh_perms(bs, s->children[i].child, NULL);
> 
>> +
>> +            /* Revert permissions */
>> +            s->children[i].to_be_replaced = false;
>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
> 
> return ok;
> 
> Or similar with // int ret; // ret = // return !ret; //

Sounds good.

Max
Kevin Wolf Feb. 5, 2020, 3:55 p.m. UTC | #3
Am 11.11.2019 um 17:02 hat Max Reitz geschrieben:
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 62 insertions(+)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index 3a824e77e3..8ee03e9baf 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
>      return false;
>  }
>  
> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
> +                                       BlockDriverState *to_replace)
> +{
> +    BDRVQuorumState *s = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < s->num_children; i++) {
> +        /*
> +         * We have no idea whether our children show the same data as
> +         * this node (@bs).  It is actually highly likely that
> +         * @to_replace does not, because replacing a broken child is
> +         * one of the main use cases here.
> +         *
> +         * We do know that the new BDS will match @bs, so replacing
> +         * any of our children by it will be safe.  It cannot change
> +         * the data this quorum node presents to its parents.
> +         *
> +         * However, replacing @to_replace by @bs in any of our
> +         * children's chains may change visible data somewhere in
> +         * there.  We therefore cannot recurse down those chains with
> +         * bdrv_recurse_can_replace().
> +         * (More formally, bdrv_recurse_can_replace() requires that
> +         * @to_replace will be replaced by something matching the @bs
> +         * passed to it.  We cannot guarantee that.)
> +         *
> +         * Thus, we can only check whether any of our immediate
> +         * children matches @to_replace.
> +         *
> +         * (In the future, we might add a function to recurse down a
> +         * chain that checks that nothing there cares about a change
> +         * in data from the respective child in question.  For
> +         * example, most filters do not care when their child's data
> +         * suddenly changes, as long as their parents do not care.)
> +         */
> +        if (s->children[i].child->bs == to_replace) {
> +            Error *local_err = NULL;
> +
> +            /*
> +             * We now have to ensure that there is no other parent
> +             * that cares about replacing this child by a node with
> +             * potentially different data.
> +             */
> +            s->children[i].to_be_replaced = true;
> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
> +
> +            /* Revert permissions */
> +            s->children[i].to_be_replaced = false;
> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);

Quite a hack. The two obvious problems are:

1. We can't guarantee that we can actually revert the permissions. I
   think we ignore failure to loosen permissions meanwhile so that at
   least the &error_abort doesn't trigger, but bs could still be in the
   wrong state afterwards.

   It would be cleaner to use check+abort instead of actually setting
   the new permission.

2. As aborting the permission change makes more obvious, we're checking
   something that might not be true any more when we actually make the
   change.

Pragmatically, a hack might be good enough here, but it should be
documented as such (with a short explanation of its shortcomings) at
least.

Kevin
Kevin Wolf Feb. 5, 2020, 4:03 p.m. UTC | #4
Am 05.02.2020 um 16:55 hat Kevin Wolf geschrieben:
> Am 11.11.2019 um 17:02 hat Max Reitz geschrieben:
> > Signed-off-by: Max Reitz <mreitz@redhat.com>
> > ---
> >  block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 62 insertions(+)
> > 
> > diff --git a/block/quorum.c b/block/quorum.c
> > index 3a824e77e3..8ee03e9baf 100644
> > --- a/block/quorum.c
> > +++ b/block/quorum.c
> > @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
> >      return false;
> >  }
> >  
> > +static bool quorum_recurse_can_replace(BlockDriverState *bs,
> > +                                       BlockDriverState *to_replace)
> > +{
> > +    BDRVQuorumState *s = bs->opaque;
> > +    int i;
> > +
> > +    for (i = 0; i < s->num_children; i++) {
> > +        /*
> > +         * We have no idea whether our children show the same data as
> > +         * this node (@bs).  It is actually highly likely that
> > +         * @to_replace does not, because replacing a broken child is
> > +         * one of the main use cases here.
> > +         *
> > +         * We do know that the new BDS will match @bs, so replacing
> > +         * any of our children by it will be safe.  It cannot change
> > +         * the data this quorum node presents to its parents.
> > +         *
> > +         * However, replacing @to_replace by @bs in any of our
> > +         * children's chains may change visible data somewhere in
> > +         * there.  We therefore cannot recurse down those chains with
> > +         * bdrv_recurse_can_replace().
> > +         * (More formally, bdrv_recurse_can_replace() requires that
> > +         * @to_replace will be replaced by something matching the @bs
> > +         * passed to it.  We cannot guarantee that.)
> > +         *
> > +         * Thus, we can only check whether any of our immediate
> > +         * children matches @to_replace.
> > +         *
> > +         * (In the future, we might add a function to recurse down a
> > +         * chain that checks that nothing there cares about a change
> > +         * in data from the respective child in question.  For
> > +         * example, most filters do not care when their child's data
> > +         * suddenly changes, as long as their parents do not care.)
> > +         */
> > +        if (s->children[i].child->bs == to_replace) {
> > +            Error *local_err = NULL;
> > +
> > +            /*
> > +             * We now have to ensure that there is no other parent
> > +             * that cares about replacing this child by a node with
> > +             * potentially different data.
> > +             */
> > +            s->children[i].to_be_replaced = true;
> > +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
> > +
> > +            /* Revert permissions */
> > +            s->children[i].to_be_replaced = false;
> > +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
> 
> Quite a hack. The two obvious problems are:
> 
> 1. We can't guarantee that we can actually revert the permissions. I
>    think we ignore failure to loosen permissions meanwhile so that at
>    least the &error_abort doesn't trigger, but bs could still be in the
>    wrong state afterwards.
> 
>    It would be cleaner to use check+abort instead of actually setting
>    the new permission.
> 
> 2. As aborting the permission change makes more obvious, we're checking
>    something that might not be true any more when we actually make the
>    change.
> 
> Pragmatically, a hack might be good enough here, but it should be
> documented as such (with a short explanation of its shortcomings) at
> least.

Oops, meant to send this as a comment for v3 (which I did apply locally
for review).

Kevin
Max Reitz Feb. 6, 2020, 10:21 a.m. UTC | #5
On 05.02.20 16:55, Kevin Wolf wrote:
> Am 11.11.2019 um 17:02 hat Max Reitz geschrieben:
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>  block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 62 insertions(+)
>>
>> diff --git a/block/quorum.c b/block/quorum.c
>> index 3a824e77e3..8ee03e9baf 100644
>> --- a/block/quorum.c
>> +++ b/block/quorum.c
>> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
>>      return false;
>>  }
>>  
>> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
>> +                                       BlockDriverState *to_replace)
>> +{
>> +    BDRVQuorumState *s = bs->opaque;
>> +    int i;
>> +
>> +    for (i = 0; i < s->num_children; i++) {
>> +        /*
>> +         * We have no idea whether our children show the same data as
>> +         * this node (@bs).  It is actually highly likely that
>> +         * @to_replace does not, because replacing a broken child is
>> +         * one of the main use cases here.
>> +         *
>> +         * We do know that the new BDS will match @bs, so replacing
>> +         * any of our children by it will be safe.  It cannot change
>> +         * the data this quorum node presents to its parents.
>> +         *
>> +         * However, replacing @to_replace by @bs in any of our
>> +         * children's chains may change visible data somewhere in
>> +         * there.  We therefore cannot recurse down those chains with
>> +         * bdrv_recurse_can_replace().
>> +         * (More formally, bdrv_recurse_can_replace() requires that
>> +         * @to_replace will be replaced by something matching the @bs
>> +         * passed to it.  We cannot guarantee that.)
>> +         *
>> +         * Thus, we can only check whether any of our immediate
>> +         * children matches @to_replace.
>> +         *
>> +         * (In the future, we might add a function to recurse down a
>> +         * chain that checks that nothing there cares about a change
>> +         * in data from the respective child in question.  For
>> +         * example, most filters do not care when their child's data
>> +         * suddenly changes, as long as their parents do not care.)
>> +         */
>> +        if (s->children[i].child->bs == to_replace) {
>> +            Error *local_err = NULL;
>> +
>> +            /*
>> +             * We now have to ensure that there is no other parent
>> +             * that cares about replacing this child by a node with
>> +             * potentially different data.
>> +             */
>> +            s->children[i].to_be_replaced = true;
>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
>> +
>> +            /* Revert permissions */
>> +            s->children[i].to_be_replaced = false;
>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
> 
> Quite a hack. The two obvious problems are:
> 
> 1. We can't guarantee that we can actually revert the permissions. I
>    think we ignore failure to loosen permissions meanwhile so that at
>    least the &error_abort doesn't trigger, but bs could still be in the
>    wrong state afterwards.

I thought we guaranteed that loosening permissions never fails.

(Well, you know.  It may “leak” permissions, but we’d never get an error
here so there’s nothing to handle anyway.)

>    It would be cleaner to use check+abort instead of actually setting
>    the new permission.

Oh.  Yes.  Maybe.  It does require more code, though, because I’d rather
not use bdrv_check_update_perm() from here as-is.

> 2. As aborting the permission change makes more obvious, we're checking
>    something that might not be true any more when we actually make the
>    change.

True.  I tried to do it right by having a post-replace cleanup function,
but after a while that was just going nowhere, really.  So I just went
with what’s patch 13 here.

But isn’t 13 enough, actually?  It checks can_replace right before
replacing in a drained section.  I can’t imagine the permissions
changing there.

Max

> Pragmatically, a hack might be good enough here, but it should be
> documented as such (with a short explanation of its shortcomings) at
> least.
Kevin Wolf Feb. 6, 2020, 2:42 p.m. UTC | #6
Am 06.02.2020 um 11:21 hat Max Reitz geschrieben:
> On 05.02.20 16:55, Kevin Wolf wrote:
> > Am 11.11.2019 um 17:02 hat Max Reitz geschrieben:
> >> Signed-off-by: Max Reitz <mreitz@redhat.com>
> >> ---
> >>  block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 62 insertions(+)
> >>
> >> diff --git a/block/quorum.c b/block/quorum.c
> >> index 3a824e77e3..8ee03e9baf 100644
> >> --- a/block/quorum.c
> >> +++ b/block/quorum.c
> >> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
> >>      return false;
> >>  }
> >>  
> >> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
> >> +                                       BlockDriverState *to_replace)
> >> +{
> >> +    BDRVQuorumState *s = bs->opaque;
> >> +    int i;
> >> +
> >> +    for (i = 0; i < s->num_children; i++) {
> >> +        /*
> >> +         * We have no idea whether our children show the same data as
> >> +         * this node (@bs).  It is actually highly likely that
> >> +         * @to_replace does not, because replacing a broken child is
> >> +         * one of the main use cases here.
> >> +         *
> >> +         * We do know that the new BDS will match @bs, so replacing
> >> +         * any of our children by it will be safe.  It cannot change
> >> +         * the data this quorum node presents to its parents.
> >> +         *
> >> +         * However, replacing @to_replace by @bs in any of our
> >> +         * children's chains may change visible data somewhere in
> >> +         * there.  We therefore cannot recurse down those chains with
> >> +         * bdrv_recurse_can_replace().
> >> +         * (More formally, bdrv_recurse_can_replace() requires that
> >> +         * @to_replace will be replaced by something matching the @bs
> >> +         * passed to it.  We cannot guarantee that.)
> >> +         *
> >> +         * Thus, we can only check whether any of our immediate
> >> +         * children matches @to_replace.
> >> +         *
> >> +         * (In the future, we might add a function to recurse down a
> >> +         * chain that checks that nothing there cares about a change
> >> +         * in data from the respective child in question.  For
> >> +         * example, most filters do not care when their child's data
> >> +         * suddenly changes, as long as their parents do not care.)
> >> +         */
> >> +        if (s->children[i].child->bs == to_replace) {
> >> +            Error *local_err = NULL;
> >> +
> >> +            /*
> >> +             * We now have to ensure that there is no other parent
> >> +             * that cares about replacing this child by a node with
> >> +             * potentially different data.
> >> +             */
> >> +            s->children[i].to_be_replaced = true;
> >> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
> >> +
> >> +            /* Revert permissions */
> >> +            s->children[i].to_be_replaced = false;
> >> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
> > 
> > Quite a hack. The two obvious problems are:
> > 
> > 1. We can't guarantee that we can actually revert the permissions. I
> >    think we ignore failure to loosen permissions meanwhile so that at
> >    least the &error_abort doesn't trigger, but bs could still be in the
> >    wrong state afterwards.
> 
> I thought we guaranteed that loosening permissions never fails.
> 
> (Well, you know.  It may “leak” permissions, but we’d never get an error
> here so there’s nothing to handle anyway.)

This is what I meant. We ignore the failure (i.e. don't return an error),
but the result still isn't completely correct ("leaked" permissions).

> >    It would be cleaner to use check+abort instead of actually setting
> >    the new permission.
> 
> Oh.  Yes.  Maybe.  It does require more code, though, because I’d rather
> not use bdrv_check_update_perm() from here as-is.

I'm not saying you need to do it, just that it would be cleaner. :-)

> > 2. As aborting the permission change makes more obvious, we're checking
> >    something that might not be true any more when we actually make the
> >    change.
> 
> True.  I tried to do it right by having a post-replace cleanup function,
> but after a while that was just going nowhere, really.  So I just went
> with what’s patch 13 here.
> 
> But isn’t 13 enough, actually?  It check can_replace right before
> replacing in a drained section.  I can’t imagine the permissions to
> change there.

Permissions are tied to file locks, so an external process can just grab
the locks in between. But if I understand correctly, all we try here is
to have an additional safeguard to prevent the user from doing stupid
things. So I guess not being 100% is fine as long as it's documented in
the code.

Kevin
Max Reitz Feb. 6, 2020, 3:19 p.m. UTC | #7
On 06.02.20 15:42, Kevin Wolf wrote:
> Am 06.02.2020 um 11:21 hat Max Reitz geschrieben:
>> On 05.02.20 16:55, Kevin Wolf wrote:
>>> Am 11.11.2019 um 17:02 hat Max Reitz geschrieben:
>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>  block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 62 insertions(+)
>>>>
>>>> diff --git a/block/quorum.c b/block/quorum.c
>>>> index 3a824e77e3..8ee03e9baf 100644
>>>> --- a/block/quorum.c
>>>> +++ b/block/quorum.c
>>>> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
>>>>      return false;
>>>>  }
>>>>  
>>>> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
>>>> +                                       BlockDriverState *to_replace)
>>>> +{
>>>> +    BDRVQuorumState *s = bs->opaque;
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i < s->num_children; i++) {
>>>> +        /*
>>>> +         * We have no idea whether our children show the same data as
>>>> +         * this node (@bs).  It is actually highly likely that
>>>> +         * @to_replace does not, because replacing a broken child is
>>>> +         * one of the main use cases here.
>>>> +         *
>>>> +         * We do know that the new BDS will match @bs, so replacing
>>>> +         * any of our children by it will be safe.  It cannot change
>>>> +         * the data this quorum node presents to its parents.
>>>> +         *
>>>> +         * However, replacing @to_replace by @bs in any of our
>>>> +         * children's chains may change visible data somewhere in
>>>> +         * there.  We therefore cannot recurse down those chains with
>>>> +         * bdrv_recurse_can_replace().
>>>> +         * (More formally, bdrv_recurse_can_replace() requires that
>>>> +         * @to_replace will be replaced by something matching the @bs
>>>> +         * passed to it.  We cannot guarantee that.)
>>>> +         *
>>>> +         * Thus, we can only check whether any of our immediate
>>>> +         * children matches @to_replace.
>>>> +         *
>>>> +         * (In the future, we might add a function to recurse down a
>>>> +         * chain that checks that nothing there cares about a change
>>>> +         * in data from the respective child in question.  For
>>>> +         * example, most filters do not care when their child's data
>>>> +         * suddenly changes, as long as their parents do not care.)
>>>> +         */
>>>> +        if (s->children[i].child->bs == to_replace) {
>>>> +            Error *local_err = NULL;
>>>> +
>>>> +            /*
>>>> +             * We now have to ensure that there is no other parent
>>>> +             * that cares about replacing this child by a node with
>>>> +             * potentially different data.
>>>> +             */
>>>> +            s->children[i].to_be_replaced = true;
>>>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
>>>> +
>>>> +            /* Revert permissions */
>>>> +            s->children[i].to_be_replaced = false;
>>>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
>>>
>>> Quite a hack. The two obvious problems are:
>>>
>>> 1. We can't guarantee that we can actually revert the permissions. I
>>>    think we ignore failure to loosen permissions meanwhile so that at
>>>    least the &error_abort doesn't trigger, but bs could still be in the
>>>    wrong state afterwards.
>>
>> I thought we guaranteed that loosening permissions never fails.
>>
>> (Well, you know.  It may “leak” permissions, but we’d never get an error
>> here so there’s nothing to handle anyway.)
> 
> This is what I meant. We ignore the failure (i.e. don't return an error),
> but the result still isn't completely correct ("leaked" permissions).
> 
>>>    It would be cleaner to use check+abort instead of actually setting
>>>    the new permission.
>>
>> Oh.  Yes.  Maybe.  It does require more code, though, because I’d rather
>> not use bdrv_check_update_perm() from here as-is.
> 
> I'm not saying you need to do it, just that it would be cleaner. :-)

It would.  Thanks for the suggestion, I obviously didn’t think of it.
(Or there’d be a comment on how this is not the best way in theory, but
in practice it’s good enough.)  I suppose I’ll see what I can do.

>>> 2. As aborting the permission change makes more obvious, we're checking
>>>    something that might not be true any more when we actually make the
>>>    change.
>>
>> True.  I tried to do it right by having a post-replace cleanup function,
>> but after a while that was just going nowhere, really.  So I just went
>> with what’s patch 13 here.
>>
>> But isn’t 13 enough, actually?  It check can_replace right before
>> replacing in a drained section.  I can’t imagine the permissions to
>> change there.
> 
> Permissions are tied to file locks, so an external process can just grab
> the locks in between.

Ah, right, I didn’t think of that.

> But if I understand correctly, all we try here is
> to have an additional safeguard to prevent the user from doing stupid
> things. So I guess not being 100% is fine as long as it's documented in
> the code.

Yes.  I just think it actually would be 100 % in practice, so I wondered
whether it would need to be documented.

You’re right, though, it isn’t 100 %, so it should definitely be
documented.  Maybe something like

In theory, we would have to keep the permissions tightened until the
node is replaced.  In practice, that would require post-replacement
cleanup infrastructure, which we do not have, and which would be
unreasonably complex to implement.  Therefore, all we can do is require
anyone who wants to replace one node by some potentially unrelated other
node (i.e., the mirror job on completion) to invoke
bdrv_recurse_can_replace() immediately before and thus minimize the time
during which some condition may arise that might forbid the swap.

?

Max
Kevin Wolf Feb. 6, 2020, 3:42 p.m. UTC | #8
Am 06.02.2020 um 16:19 hat Max Reitz geschrieben:
> On 06.02.20 15:42, Kevin Wolf wrote:
> > Am 06.02.2020 um 11:21 hat Max Reitz geschrieben:
> >> On 05.02.20 16:55, Kevin Wolf wrote:
> >>> Am 11.11.2019 um 17:02 hat Max Reitz geschrieben:
> >>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
> >>>> ---
> >>>>  block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  1 file changed, 62 insertions(+)
> >>>>
> >>>> diff --git a/block/quorum.c b/block/quorum.c
> >>>> index 3a824e77e3..8ee03e9baf 100644
> >>>> --- a/block/quorum.c
> >>>> +++ b/block/quorum.c
> >>>> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
> >>>>      return false;
> >>>>  }
> >>>>  
> >>>> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
> >>>> +                                       BlockDriverState *to_replace)
> >>>> +{
> >>>> +    BDRVQuorumState *s = bs->opaque;
> >>>> +    int i;
> >>>> +
> >>>> +    for (i = 0; i < s->num_children; i++) {
> >>>> +        /*
> >>>> +         * We have no idea whether our children show the same data as
> >>>> +         * this node (@bs).  It is actually highly likely that
> >>>> +         * @to_replace does not, because replacing a broken child is
> >>>> +         * one of the main use cases here.
> >>>> +         *
> >>>> +         * We do know that the new BDS will match @bs, so replacing
> >>>> +         * any of our children by it will be safe.  It cannot change
> >>>> +         * the data this quorum node presents to its parents.
> >>>> +         *
> >>>> +         * However, replacing @to_replace by @bs in any of our
> >>>> +         * children's chains may change visible data somewhere in
> >>>> +         * there.  We therefore cannot recurse down those chains with
> >>>> +         * bdrv_recurse_can_replace().
> >>>> +         * (More formally, bdrv_recurse_can_replace() requires that
> >>>> +         * @to_replace will be replaced by something matching the @bs
> >>>> +         * passed to it.  We cannot guarantee that.)
> >>>> +         *
> >>>> +         * Thus, we can only check whether any of our immediate
> >>>> +         * children matches @to_replace.
> >>>> +         *
> >>>> +         * (In the future, we might add a function to recurse down a
> >>>> +         * chain that checks that nothing there cares about a change
> >>>> +         * in data from the respective child in question.  For
> >>>> +         * example, most filters do not care when their child's data
> >>>> +         * suddenly changes, as long as their parents do not care.)
> >>>> +         */
> >>>> +        if (s->children[i].child->bs == to_replace) {
> >>>> +            Error *local_err = NULL;
> >>>> +
> >>>> +            /*
> >>>> +             * We now have to ensure that there is no other parent
> >>>> +             * that cares about replacing this child by a node with
> >>>> +             * potentially different data.
> >>>> +             */
> >>>> +            s->children[i].to_be_replaced = true;
> >>>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
> >>>> +
> >>>> +            /* Revert permissions */
> >>>> +            s->children[i].to_be_replaced = false;
> >>>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
> >>>
> >>> Quite a hack. The two obvious problems are:
> >>>
> >>> 1. We can't guarantee that we can actually revert the permissions. I
> >>>    think we ignore failure to loosen permissions meanwhile so that at
> >>>    least the &error_abort doesn't trigger, but bs could still be in the
> >>>    wrong state afterwards.
> >>
> >> I thought we guaranteed that loosening permissions never fails.
> >>
> >> (Well, you know.  It may “leak” permissions, but we’d never get an error
> >> here so there’s nothing to handle anyway.)
> > 
> > This is what I meant. We ignore the failure (i.e. don't return an error),
> > but the result still isn't completely correct ("leaked" permissions).
> > 
> >>>    It would be cleaner to use check+abort instead of actually setting
> >>>    the new permission.
> >>
> >> Oh.  Yes.  Maybe.  It does require more code, though, because I’d rather
> >> not use bdrv_check_update_perm() from here as-is.
> > 
> > I'm not saying you need to do it, just that it would be cleaner. :-)
> 
> It would.  Thanks for the suggestion, I obviously didn’t think of it.
> (Or there’d be a comment on how this is not the best way in theory, but
> in practice it’s good enough.)  I suppose I’ll see what I can do.
> 
> >>> 2. As aborting the permission change makes more obvious, we're checking
> >>>    something that might not be true any more when we actually make the
> >>>    change.
> >>
> >> True.  I tried to do it right by having a post-replace cleanup function,
> >> but after a while that was just going nowhere, really.  So I just went
> >> with what’s patch 13 here.
> >>
> >> But isn’t 13 enough, actually?  It checks can_replace right before
> >> replacing in a drained section.  I can’t imagine the permissions to
> >> change there.
> > 
> > Permissions are tied to file locks, so an external process can just grab
> > the locks in between.
> 
> Ah, right, I didn’t think of that.
> 
> > But if I understand correctly, all we try here is
> > to have an additional safeguard to prevent the user from doing stupid
> > things. So I guess not being 100% is fine as long as it's documented in
> > the code.
> 
> Yes.  I just think it actually would be 100 % in practice, so I wondered
> whether it would need to be documented.
> 
> You’re right, though, it isn’t 100 %, so it should definitely be
> documented.  Maybe something like
> 
> In theory, we would have to keep the permissions tightened until the
> node is replaced.  In practice, that would require post-replacement
> cleanup infrastructure, which we do not have, and which would be
> unreasonably complex to implement.

Sounds good until here.

> Therefore, all we can do is require
> anyone who wants to replace one node by some potentially unrelated other
> node (i.e., the mirror job on completion) to invoke
> bdrv_recurse_can_replace() immediately before and thus minimize the time
> during which some condition may arise that might forbid the swap.
> 
> ?

This second part of your suggested comment could be dropped, as far as
I'm concerned. If anything, it's part of the contract and would belong
in the bdrv_recurse_can_replace() documentation.

However, I think I would mention why not being 100% is okay: The part
with "additional safeguard to prevent the user from doing stupid
things", and that it doesn't make a difference if the user runs the
correct command.

Kevin
Max Reitz Feb. 6, 2020, 4:44 p.m. UTC | #9
On 06.02.20 16:42, Kevin Wolf wrote:
> Am 06.02.2020 um 16:19 hat Max Reitz geschrieben:
>> On 06.02.20 15:42, Kevin Wolf wrote:
>>> Am 06.02.2020 um 11:21 hat Max Reitz geschrieben:
>>>> On 05.02.20 16:55, Kevin Wolf wrote:
>>>>> Am 11.11.2019 um 17:02 hat Max Reitz geschrieben:
>>>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>>>> ---
>>>>>>  block/quorum.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  1 file changed, 62 insertions(+)
>>>>>>
>>>>>> diff --git a/block/quorum.c b/block/quorum.c
>>>>>> index 3a824e77e3..8ee03e9baf 100644
>>>>>> --- a/block/quorum.c
>>>>>> +++ b/block/quorum.c
>>>>>> @@ -825,6 +825,67 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
>>>>>>      return false;
>>>>>>  }
>>>>>>  
>>>>>> +static bool quorum_recurse_can_replace(BlockDriverState *bs,
>>>>>> +                                       BlockDriverState *to_replace)
>>>>>> +{
>>>>>> +    BDRVQuorumState *s = bs->opaque;
>>>>>> +    int i;
>>>>>> +
>>>>>> +    for (i = 0; i < s->num_children; i++) {
>>>>>> +        /*
>>>>>> +         * We have no idea whether our children show the same data as
>>>>>> +         * this node (@bs).  It is actually highly likely that
>>>>>> +         * @to_replace does not, because replacing a broken child is
>>>>>> +         * one of the main use cases here.
>>>>>> +         *
>>>>>> +         * We do know that the new BDS will match @bs, so replacing
>>>>>> +         * any of our children by it will be safe.  It cannot change
>>>>>> +         * the data this quorum node presents to its parents.
>>>>>> +         *
>>>>>> +         * However, replacing @to_replace by @bs in any of our
>>>>>> +         * children's chains may change visible data somewhere in
>>>>>> +         * there.  We therefore cannot recurse down those chains with
>>>>>> +         * bdrv_recurse_can_replace().
>>>>>> +         * (More formally, bdrv_recurse_can_replace() requires that
>>>>>> +         * @to_replace will be replaced by something matching the @bs
>>>>>> +         * passed to it.  We cannot guarantee that.)
>>>>>> +         *
>>>>>> +         * Thus, we can only check whether any of our immediate
>>>>>> +         * children matches @to_replace.
>>>>>> +         *
>>>>>> +         * (In the future, we might add a function to recurse down a
>>>>>> +         * chain that checks that nothing there cares about a change
>>>>>> +         * in data from the respective child in question.  For
>>>>>> +         * example, most filters do not care when their child's data
>>>>>> +         * suddenly changes, as long as their parents do not care.)
>>>>>> +         */
>>>>>> +        if (s->children[i].child->bs == to_replace) {
>>>>>> +            Error *local_err = NULL;
>>>>>> +
>>>>>> +            /*
>>>>>> +             * We now have to ensure that there is no other parent
>>>>>> +             * that cares about replacing this child by a node with
>>>>>> +             * potentially different data.
>>>>>> +             */
>>>>>> +            s->children[i].to_be_replaced = true;
>>>>>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
>>>>>> +
>>>>>> +            /* Revert permissions */
>>>>>> +            s->children[i].to_be_replaced = false;
>>>>>> +            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
>>>>>
>>>>> Quite a hack. The two obvious problems are:
>>>>>
>>>>> 1. We can't guarantee that we can actually revert the permissions. I
>>>>>    think we ignore failure to loosen permissions meanwhile so that at
>>>>>    least the &error_abort doesn't trigger, but bs could still be in the
>>>>>    wrong state afterwards.
>>>>
>>>> I thought we guaranteed that loosening permissions never fails.
>>>>
>>>> (Well, you know.  It may “leak” permissions, but we’d never get an error
>>>> here so there’s nothing to handle anyway.)
>>>
>>> This is what I meant. We ignore the failure (i.e. don't return an error),
>>> but the result still isn't completely correct ("leaked" permissions).
>>>
>>>>>    It would be cleaner to use check+abort instead of actually setting
>>>>>    the new permission.
>>>>
>>>> Oh.  Yes.  Maybe.  It does require more code, though, because I’d rather
>>>> not use bdrv_check_update_perm() from here as-is.
>>>
>>> I'm not saying you need to do it, just that it would be cleaner. :-)
>>
>> It would.  Thanks for the suggestion, I obviously didn’t think of it.
>> (Or there’d be a comment on how this is not the best way in theory, but
>> in practice it’s good enough.)  I suppose I’ll see what I can do.
>>
>>>>> 2. As aborting the permission change makes more obvious, we're checking
>>>>>    something that might not be true any more when we actually make the
>>>>>    change.
>>>>
>>>> True.  I tried to do it right by having a post-replace cleanup function,
>>>> but after a while that was just going nowhere, really.  So I just went
>>>> with what’s patch 13 here.
>>>>
>>>> But isn’t 13 enough, actually?  It checks can_replace right before
>>>> replacing in a drained section.  I can’t imagine the permissions to
>>>> change there.
>>>
>>> Permissions are tied to file locks, so an external process can just grab
>>> the locks in between.
>>
>> Ah, right, I didn’t think of that.
>>
>>> But if I understand correctly, all we try here is
>>> to have an additional safeguard to prevent the user from doing stupid
>>> things. So I guess not being 100% is fine as long as it's documented in
>>> the code.
>>
>> Yes.  I just think it actually would be 100 % in practice, so I wondered
>> whether it would need to be documented.
>>
>> You’re right, though, it isn’t 100 %, so it should definitely be
>> documented.  Maybe something like
>>
>> In theory, we would have to keep the permissions tightened until the
>> node is replaced.  In practice, that would require post-replacement
>> cleanup infrastructure, which we do not have, and which would be
>> unreasonably complex to implement.
> 
> Sounds good until here.
> 
>> Therefore, all we can do is require
>> anyone who wants to replace one node by some potentially unrelated other
>> node (i.e., the mirror job on completion) to invoke
>> bdrv_recurse_can_replace() immediately before and thus minimize the time
>> during which some condition may arise that might forbid the swap.
>>
>> ?
> 
> This second part of your suggested comment could be dropped, as far as
> I'm concerned. If anything, it's part of the contract and would belong
> in the bdrv_recurse_can_replace() documentation.
> 
> However, I think I would mention why not being 100% is okay: The part
> with "additional safeguard to prevent the user from doing stupid
> things", and that it doesn't make a difference if the user runs the
> correct command.

OK.

Max
diff mbox series

Patch

diff --git a/block/quorum.c b/block/quorum.c
index 3a824e77e3..8ee03e9baf 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -825,6 +825,67 @@  static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs,
     return false;
 }
 
+static bool quorum_recurse_can_replace(BlockDriverState *bs,
+                                       BlockDriverState *to_replace)
+{
+    BDRVQuorumState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_children; i++) {
+        /*
+         * We have no idea whether our children show the same data as
+         * this node (@bs).  It is actually highly likely that
+         * @to_replace does not, because replacing a broken child is
+         * one of the main use cases here.
+         *
+         * We do know that the new BDS will match @bs, so replacing
+         * any of our children by it will be safe.  It cannot change
+         * the data this quorum node presents to its parents.
+         *
+         * However, replacing @to_replace by @bs in any of our
+         * children's chains may change visible data somewhere in
+         * there.  We therefore cannot recurse down those chains with
+         * bdrv_recurse_can_replace().
+         * (More formally, bdrv_recurse_can_replace() requires that
+         * @to_replace will be replaced by something matching the @bs
+         * passed to it.  We cannot guarantee that.)
+         *
+         * Thus, we can only check whether any of our immediate
+         * children matches @to_replace.
+         *
+         * (In the future, we might add a function to recurse down a
+         * chain that checks that nothing there cares about a change
+         * in data from the respective child in question.  For
+         * example, most filters do not care when their child's data
+         * suddenly changes, as long as their parents do not care.)
+         */
+        if (s->children[i].child->bs == to_replace) {
+            Error *local_err = NULL;
+
+            /*
+             * We now have to ensure that there is no other parent
+             * that cares about replacing this child by a node with
+             * potentially different data.
+             */
+            s->children[i].to_be_replaced = true;
+            bdrv_child_refresh_perms(bs, s->children[i].child, &local_err);
+
+            /* Revert permissions */
+            s->children[i].to_be_replaced = false;
+            bdrv_child_refresh_perms(bs, s->children[i].child, &error_abort);
+
+            if (local_err) {
+                error_free(local_err);
+                return false;
+            }
+
+            return true;
+        }
+    }
+
+    return false;
+}
+
 static int quorum_valid_threshold(int threshold, int num_children, Error **errp)
 {
 
@@ -1195,6 +1256,7 @@  static BlockDriver bdrv_quorum = {
 
     .is_filter                          = true,
     .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
+    .bdrv_recurse_can_replace           = quorum_recurse_can_replace,
 
     .strong_runtime_opts                = quorum_strong_runtime_opts,
 };