diff mbox

[for-2.7,v2,04/17] block: Introduce image file locking

Message ID 1460690887-32751-5-git-send-email-famz@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Fam Zheng April 15, 2016, 3:27 a.m. UTC
Block drivers can implement this new operation .bdrv_lockf to actually lock the
image in the protocol specific way.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/block/block_int.h | 12 ++++++++++++
 2 files changed, 54 insertions(+)

Comments

Denis V. Lunev April 16, 2016, 1:22 p.m. UTC | #1
On 04/15/2016 06:27 AM, Fam Zheng wrote:
> Block drivers can implement this new operation .bdrv_lockf to actually lock the
> image in the protocol specific way.
>
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>   block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
>   include/block/block_int.h | 12 ++++++++++++
>   2 files changed, 54 insertions(+)
>
> diff --git a/block.c b/block.c
> index 1c575e4..7971a25 100644
> --- a/block.c
> +++ b/block.c
> @@ -846,6 +846,34 @@ out:
>       g_free(gen_node_name);
>   }
>   
> +static int bdrv_lock_unlock_image_do(BlockDriverState *bs, bool lock_image)
> +{
> +    int cmd = BDRV_LOCKF_UNLOCK;
> +
> +    if (bs->image_locked == lock_image) {
> +        return 0;
> +    } else if (!bs->drv) {
> +        return -ENOMEDIUM;
> +    } else if (!bs->drv->bdrv_lockf) {
> +        return 0;
> +    }
> +    if (lock_image) {
> +        cmd = bs->open_flags & BDRV_O_RDWR ? BDRV_LOCKF_RWLOCK :
> +                                             BDRV_LOCKF_ROLOCK;
> +    }
> +    return bs->drv->bdrv_lockf(bs, cmd);
should we handle ENOTSUP specially?
f.e. this would fire with raw-posix.c on a filesystem which does not 
support locking.

from my POV this situation is equivalent to the absence of
bs->drv->bdrv_lockf

> +}
> +
> +static int bdrv_lock_image(BlockDriverState *bs)
> +{
> +    return bdrv_lock_unlock_image_do(bs, true);
> +}
> +
> +static int bdrv_unlock_image(BlockDriverState *bs)
> +{
> +    return bdrv_lock_unlock_image_do(bs, false);
> +}
> +
>   static QemuOptsList bdrv_runtime_opts = {
>       .name = "bdrv_common",
>       .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
> @@ -995,6 +1023,14 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
>           goto free_and_fail;
>       }
>   
> +    if (!(open_flags & (BDRV_O_NO_LOCK | BDRV_O_INACTIVE))) {
> +        ret = bdrv_lock_image(bs);
> +        if (ret) {
> +            error_setg(errp, "Failed to lock image");
> +            goto free_and_fail;
> +        }
> +    }
> +
>       ret = refresh_total_sectors(bs, bs->total_sectors);
>       if (ret < 0) {
>           error_setg_errno(errp, -ret, "Could not refresh total sector count");
> @@ -2144,6 +2180,7 @@ static void bdrv_close(BlockDriverState *bs)
>       if (bs->drv) {
>           BdrvChild *child, *next;
>   
> +        bdrv_unlock_image(bs);
>           bs->drv->bdrv_close(bs);
>           bs->drv = NULL;
>   
> @@ -3230,6 +3267,9 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
>           error_setg_errno(errp, -ret, "Could not refresh total sector count");
>           return;
>       }
> +    if (!(bs->open_flags & BDRV_O_NO_LOCK)) {
> +        bdrv_lock_image(bs);
> +    }
>   }
>   
>   void bdrv_invalidate_cache_all(Error **errp)
> @@ -3262,6 +3302,7 @@ static int bdrv_inactivate(BlockDriverState *bs)
>       }
>   
>       bs->open_flags |= BDRV_O_INACTIVE;
> +    ret = bdrv_unlock_image(bs);
I'd better move unlock a line above.
Though this is personal. This could be useful
for debugging purposes.


>       return 0;
>   }
>   
> @@ -3981,3 +4022,4 @@ void bdrv_refresh_filename(BlockDriverState *bs)
>           QDECREF(json);
>       }
>   }
> +
this hunk is extra

> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 10d8759..ffa30b0 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -85,6 +85,12 @@ typedef struct BdrvTrackedRequest {
>       struct BdrvTrackedRequest *waiting_for;
>   } BdrvTrackedRequest;
>   
> +typedef enum {
> +    BDRV_LOCKF_RWLOCK,
> +    BDRV_LOCKF_ROLOCK,
> +    BDRV_LOCKF_UNLOCK,
> +} BdrvLockfCmd;
> +
>   struct BlockDriver {
>       const char *format_name;
>       int instance_size;
> @@ -317,6 +323,11 @@ struct BlockDriver {
>        */
>       void (*bdrv_drain)(BlockDriverState *bs);
>   
> +    /**
> +     * Lock/unlock the image.
> +     */
> +    int (*bdrv_lockf)(BlockDriverState *bs, BdrvLockfCmd cmd);
> +
>       QLIST_ENTRY(BlockDriver) list;
>   };
>   
> @@ -485,6 +496,7 @@ struct BlockDriverState {
>       NotifierWithReturn write_threshold_notifier;
>   
>       int quiesce_counter;
> +    bool image_locked;
>   };
>   
>   struct BlockBackendRootState {
Max Reitz April 16, 2016, 11:29 p.m. UTC | #2
On 15.04.2016 05:27, Fam Zheng wrote:
> Block drivers can implement this new operation .bdrv_lockf to actually lock the
> image in the protocol specific way.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
>  include/block/block_int.h | 12 ++++++++++++
>  2 files changed, 54 insertions(+)

I'm prepared for everyone hating this idea, but I'm just bringing it up
so I can always say I did bring it up.

Heads up: This will be about qcow2 locking again.

Relax, though, it won't be about how much better qcow2 locking is
than protocol locking.

Now that you know this feel free to drop out.

This patch implements locking by just trying to lock every single BDS
that is being opened. While it may fulfill its purpose, I don't think
that is what we actually want.

What we want is the following: qemu has a BDS graph. It is basically a
forest of trees. It may be a bit more complicated (DAGs instead of
trees), but let's just assume it is.

What we want to protect are leaves in this tree. Every leaf basically
corresponds to a physical resource such as a file or an NBD connection.
Every leaf is driven by a protocol block driver. We want to protect
these physical resources from concurrent access.

Ideally, we can just protect the physical resource itself. This works
for raw-posix, this works for gluster, this works for raw-win32, and
probably some other protocols, too. But I guess it won't work for all
protocols, and even if it does, it would need to be implemented.

But we can protect leaves in the BDS forest by locking non-leaves also:
If you lock a qcow2 node, all of its "file" subtree will be protected;
normally, that's just a single leaf.

Therefore, I think the ideal approach would be for each BDS tree that is
to be created we try to lock all of its leaves, and if that does not
work for some, we walk up the tree and try to lock inner nodes (e.g.
format BDSs which then use format locking) so that the leaves are still
protected even if their protocol does not support that.

This could be implemented like this: Whenever a leaf BDS is created, try
to lock it. If we can't, leave some information to the parent node that
its child could not be locked. Then, the parent will evaluate this
information and try to act upon it. This then recurses up the tree. Or,
well, down the tree, considering that in most natural trees the root is
at the bottom.


We could just implement qcow2 locking on top of this series as it is,
but this would result in qcow2 files being locked even if their files'
protocol nodes have been successfully locked. That would be superfluous
and we'd have all the issues with force-unlocking qcow2 files we have
discussed before.


So what am I saying? I think that it makes sense to consider format
locking as a backup alternative to protocol locking in case the latter
is not possible. I think it is possible to implement both using the same
framework.

I don't think we need to worry about the actual implementation of format
locking now. But I do think having a framework which supports both
format and protocol locking is possible and would be nice to have.

Such a framework would require more effort, however, than the basically
brute-force "just lock everything" method presented in this patch. Don't
get me wrong, this method here works for what it's supposed to do (I
haven't reviewed it yet, though), and it's very reasonable if protocol
locking is all we intend to have. I'm just suggesting that maybe we do
want to have more than that.


All in all, I won't object if the locking framework introduced by this
series is not supposed to and does not work with format locking. It can
always be added later if I really like it so much, and I can definitely
understand if it appears to be too much effort for basically no gain
right now.

As I said above, I just brought this up so I brought it up. :-)

Max
Fam Zheng April 18, 2016, 1:33 a.m. UTC | #3
On Sun, 04/17 01:29, Max Reitz wrote:
> On 15.04.2016 05:27, Fam Zheng wrote:
> > Block drivers can implement this new operation .bdrv_lockf to actually lock the
> > image in the protocol specific way.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> > ---
> >  block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
> >  include/block/block_int.h | 12 ++++++++++++
> >  2 files changed, 54 insertions(+)
> 
> I'm prepared for everyone hating this idea, but I'm just bringing it up
> so I can always say I did bring it up.
> 
> Heads up: This will be about qcow2 locking again.
> 
> Relax, though, it won't be about how much better qcow2 locking is better
> than protocol locking.
> 
> Now that you know this feel free to drop out.
> 
> This patch implements locking by just trying to lock every single BDS
> that is being opened. While it may fulfill its purpose, I don't think
> that is what we actually want.
> 
> What we want is the following: qemu has a BDS graph. It is basically a
> forest of trees. It may be a bit more complicated (DAGs instead of
> trees), but let's just assume it is.
> 
> What we want to protect are leaves in this tree. Every leaf basically
> corresponds to a physical resource such as a file or an NBD connection.
> Every leaf is driven by a protocol block driver. We want to protect
> these physical resources from concurrent access.
> 
> Ideally, we can just protect the physical resource itself. This works
> for raw-posix, this works for gluster, this works for raw-win32, and
> probably some other protocols, too. But I guess it won't work for all
> protocols, and even if it does, it would need to be implemented.
> 
> But we can protect leaves in the BDS forest by locking non-leaves also:
> If you lock a qcow2 node, all of its "file" subtree will be protected;
> normally, that's just a single leaf.
> 
> Therefore, I think the ideal approach would be for each BDS tree that is
> to be created we try to lock all of its leaves, and if that does not
> work for some, we walk up the tree and try to lock inner nodes (e.g.
> format BDSs which then use format locking) so that the leaves are still
> protected even if their protocol does not support that.
> 
> This could be implemented like this: Whenever a leaf BDS is created, try
> to lock it. If we can't, leave some information to the parent node that
> its child could not be locked. Then, the parent will evaluate this
> information and try to act upon it. This then recurses up the tree. Or,
> well, down the tree, considering that in most natural trees the root is
> at the bottom.
> 
> 
> We could just implement qcow2 locking on top of this series as it is,
> but this would result in qcow2 files being locked even if their files'
> protocol nodes have been successfully locked. That would be superfluous
> and we'd have all the issues with force-unlocking qcow2 files we have
> discussed before.
> 
> 
> So what am I saying? I think that it makes sense to consider format
> locking as a backup alternative to protocol locking in case the latter
> is not possible. I think it is possible to implement both using the same
> framework.
> 
> I don't think we need to worry about the actual implementation of format
> locking now. But I do think having a framework which supports both
> format and protocol locking is possible and would be nice to have.
> 
> Such a framework would require more effort, however, than the basically
> brute-force "just lock everything" method presented in this patch. Don't
> get me wrong, this method here works for what it's supposed to do (I
> haven't reviewed it yet, though), and it's very reasonable if protocol
> locking is all we intend to have. I'm just suggesting that maybe we do
> want to have more than that.
> 
> 
> All in all, I won't object if the locking framework introduced by this
> series is not supposed to and does not work with format locking. It can
> always be added later if I really like it so much, and I can definitely
> understand if it appears to be too much effort for basically no gain
> right now.
> 
> As I said above, I just brought this up so I brought it up. :-)

I don't hate this idea, but it is not necessarily much more effort.  We can
always check the underlying file in qcow2's locking implementation, can't we?

    int qcow2_lockf(BlockDriverState *bs, int cmd)
    {
        if ((cmd != BDRV_LOCKF_UNLOCK) && !bdrv_is_locked(bs->file)) {
            return 0;
        }
        ...
    }

The problem with doing this generically in the block layer is the chicken-and-egg
problem: it's not safe to just have format probing code or the qcow2 driver
read the image or even writing to the header field for opening it, another
process could be writing to the image already. A challenge with format
locking is the lack of file level atomic operations (cmpxchg on the image
header).

Fam
Fam Zheng April 18, 2016, 1:43 a.m. UTC | #4
On Sat, 04/16 16:22, Denis V. Lunev wrote:
> On 04/15/2016 06:27 AM, Fam Zheng wrote:
> >Block drivers can implement this new operation .bdrv_lockf to actually lock the
> >image in the protocol specific way.
> >
> >Signed-off-by: Fam Zheng <famz@redhat.com>
> >---
> >  block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
> >  include/block/block_int.h | 12 ++++++++++++
> >  2 files changed, 54 insertions(+)
> >
> >diff --git a/block.c b/block.c
> >index 1c575e4..7971a25 100644
> >--- a/block.c
> >+++ b/block.c
> >@@ -846,6 +846,34 @@ out:
> >      g_free(gen_node_name);
> >  }
> >+static int bdrv_lock_unlock_image_do(BlockDriverState *bs, bool lock_image)
> >+{
> >+    int cmd = BDRV_LOCKF_UNLOCK;
> >+
> >+    if (bs->image_locked == lock_image) {
> >+        return 0;
> >+    } else if (!bs->drv) {
> >+        return -ENOMEDIUM;
> >+    } else if (!bs->drv->bdrv_lockf) {
> >+        return 0;
> >+    }
> >+    if (lock_image) {
> >+        cmd = bs->open_flags & BDRV_O_RDWR ? BDRV_LOCKF_RWLOCK :
> >+                                             BDRV_LOCKF_ROLOCK;
> >+    }
> >+    return bs->drv->bdrv_lockf(bs, cmd);
> should we handle ENOTSUP specially?
> f.e. this would fire with raw-posix.c on a filesystem which does not support
> locking.
> 
> from my POW this situations is equivalent to  the absence of
> bs->drv->bdrv_lockf

Yes, that's right. Will fix.

> 
> >+}
> >+
> >+static int bdrv_lock_image(BlockDriverState *bs)
> >+{
> >+    return bdrv_lock_unlock_image_do(bs, true);
> >+}
> >+
> >+static int bdrv_unlock_image(BlockDriverState *bs)
> >+{
> >+    return bdrv_lock_unlock_image_do(bs, false);
> >+}
> >+
> >  static QemuOptsList bdrv_runtime_opts = {
> >      .name = "bdrv_common",
> >      .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
> >@@ -995,6 +1023,14 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
> >          goto free_and_fail;
> >      }
> >+    if (!(open_flags & (BDRV_O_NO_LOCK | BDRV_O_INACTIVE))) {
> >+        ret = bdrv_lock_image(bs);
> >+        if (ret) {
> >+            error_setg(errp, "Failed to lock image");
> >+            goto free_and_fail;
> >+        }
> >+    }
> >+
> >      ret = refresh_total_sectors(bs, bs->total_sectors);
> >      if (ret < 0) {
> >          error_setg_errno(errp, -ret, "Could not refresh total sector count");
> >@@ -2144,6 +2180,7 @@ static void bdrv_close(BlockDriverState *bs)
> >      if (bs->drv) {
> >          BdrvChild *child, *next;
> >+        bdrv_unlock_image(bs);
> >          bs->drv->bdrv_close(bs);
> >          bs->drv = NULL;
> >@@ -3230,6 +3267,9 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
> >          error_setg_errno(errp, -ret, "Could not refresh total sector count");
> >          return;
> >      }
> >+    if (!(bs->open_flags & BDRV_O_NO_LOCK)) {
> >+        bdrv_lock_image(bs);
> >+    }
> >  }
> >  void bdrv_invalidate_cache_all(Error **errp)
> >@@ -3262,6 +3302,7 @@ static int bdrv_inactivate(BlockDriverState *bs)
> >      }
> >      bs->open_flags |= BDRV_O_INACTIVE;
> >+    ret = bdrv_unlock_image(bs);
> I'd better move unlock a line above.
> Though this is personal. This could be useful
> for debugging purposes.

OK, I can move it.

> 
> 
> >      return 0;
> >  }
> >@@ -3981,3 +4022,4 @@ void bdrv_refresh_filename(BlockDriverState *bs)
> >          QDECREF(json);
> >      }
> >  }
> >+
> this hunk is extra

Will remove.

Thanks,
Fam
Denis V. Lunev April 18, 2016, 5:34 a.m. UTC | #5
On 04/18/2016 04:33 AM, Fam Zheng wrote:
> On Sun, 04/17 01:29, Max Reitz wrote:
>> On 15.04.2016 05:27, Fam Zheng wrote:
>>> Block drivers can implement this new operation .bdrv_lockf to actually lock the
>>> image in the protocol specific way.
>>>
>>> Signed-off-by: Fam Zheng <famz@redhat.com>
>>> ---
>>>   block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
>>>   include/block/block_int.h | 12 ++++++++++++
>>>   2 files changed, 54 insertions(+)
>> I'm prepared for everyone hating this idea, but I'm just bringing it up
>> so I can always say I did bring it up.
>>
>> Heads up: This will be about qcow2 locking again.
>>
>> Relax, though, it won't be about how much better qcow2 locking is better
>> than protocol locking.
>>
>> Now that you know this feel free to drop out.
>>
>> This patch implements locking by just trying to lock every single BDS
>> that is being opened. While it may fulfill its purpose, I don't think
>> that is what we actually want.
>>
>> What we want is the following: qemu has a BDS graph. It is basically a
>> forest of trees. It may be a bit more complicated (DAGs instead of
>> trees), but let's just assume it is.
>>
>> What we want to protect are leaves in this tree. Every leaf basically
>> corresponds to a physical resource such as a file or an NBD connection.
>> Every leaf is driven by a protocol block driver. We want to protect
>> these physical resources from concurrent access.
>>
>> Ideally, we can just protect the physical resource itself. This works
>> for raw-posix, this works for gluster, this works for raw-win32, and
>> probably some other protocols, too. But I guess it won't work for all
>> protocols, and even if it does, it would need to be implemented.
>>
>> But we can protect leaves in the BDS forest by locking non-leaves also:
>> If you lock a qcow2 node, all of its "file" subtree will be protected;
>> normally, that's just a single leaf.
>>
>> Therefore, I think the ideal approach would be for each BDS tree that is
>> to be created we try to lock all of its leaves, and if that does not
>> work for some, we walk up the tree and try to lock inner nodes (e.g.
>> format BDSs which then use format locking) so that the leaves are still
>> protected even if their protocol does not support that.
>>
>> This could be implemented like this: Whenever a leaf BDS is created, try
>> to lock it. If we can't, leave some information to the parent node that
>> its child could not be locked. Then, the parent will evaluate this
>> information and try to act upon it. This then recurses up the tree. Or,
>> well, down the tree, considering that in most natural trees the root is
>> at the bottom.
>>
>>
>> We could just implement qcow2 locking on top of this series as it is,
>> but this would result in qcow2 files being locked even if their files'
>> protocol nodes have been successfully locked. That would be superfluous
>> and we'd have all the issues with force-unlocking qcow2 files we have
>> discussed before.
>>
>>
>> So what am I saying? I think that it makes sense to consider format
>> locking as a backup alternative to protocol locking in case the latter
>> is not possible. I think it is possible to implement both using the same
>> framework.
>>
>> I don't think we need to worry about the actual implementation of format
>> locking now. But I do think having a framework which supports both
>> format and protocol locking is possible and would be nice to have.
>>
>> Such a framework would require more effort, however, than the basically
>> brute-force "just lock everything" method presented in this patch. Don't
>> get me wrong, this method here works for what it's supposed to do (I
>> haven't reviewed it yet, though), and it's very reasonable if protocol
>> locking is all we intend to have. I'm just suggesting that maybe we do
>> want to have more than that.
>>
>>
>> All in all, I won't object if the locking framework introduced by this
>> series is not supposed to and does not work with format locking. It can
>> always be added later if I really like it so much, and I can definitely
>> understand if it appears to be too much effort for basically no gain
>> right now.
>>
>> As I said above, I just brought this up so I brought it up. :-)
> I don't hate this idea, but it is not necessarily much more effort.  We can
> always check the underlying file in qcow2's locking implementation, can't we?
>
>      int qcow2_lockf(BlockDriverState *bs, int cmd)
>      {
>          if ((cmd != BDRV_LOCKF_UNLOCK) && !bdrv_is_locked(bs->file)) {
>              return 0;
>          }
>          ...
>      }
>
> The problem with doing this generically in block layer is the chicken-and-egg
> problem: it's not safe to just have format probling code or qcow2 driver to
> read the image or even writing to the header field for opening it, another
> process could be writing to the image already. A challenge with format
> locking is the lack of file level atomic operations (cmpxchg on the image
> header).
>
> Fam
We should not touch images!

If QEMU will die, f.e. on assert or by power off, we will suffer
a REAL pain to guess whether the lock is taken or not.
flock() and friends is a perfect mechanics for the purpose
as the kernel will drop corresponding locks by magic.

Header changes are good for detecting of unclean stops and
running consistency checks after that, f.e. if we have lazy
ref-counters on.

Pls do not do this inside the image. We have eaten that
stuff in our older products and this is really BAD way to
follow.

Den
Max Reitz April 19, 2016, 7:13 p.m. UTC | #6
On 18.04.2016 03:33, Fam Zheng wrote:
> On Sun, 04/17 01:29, Max Reitz wrote:
>> On 15.04.2016 05:27, Fam Zheng wrote:
>>> Block drivers can implement this new operation .bdrv_lockf to actually lock the
>>> image in the protocol specific way.
>>>
>>> Signed-off-by: Fam Zheng <famz@redhat.com>
>>> ---
>>>  block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
>>>  include/block/block_int.h | 12 ++++++++++++
>>>  2 files changed, 54 insertions(+)
>>
>> I'm prepared for everyone hating this idea, but I'm just bringing it up
>> so I can always say I did bring it up.
>>
>> Heads up: This will be about qcow2 locking again.
>>
>> Relax, though, it won't be about how much better qcow2 locking is better
>> than protocol locking.
>>
>> Now that you know this feel free to drop out.
>>
>> This patch implements locking by just trying to lock every single BDS
>> that is being opened. While it may fulfill its purpose, I don't think
>> that is what we actually want.
>>
>> What we want is the following: qemu has a BDS graph. It is basically a
>> forest of trees. It may be a bit more complicated (DAGs instead of
>> trees), but let's just assume it is.
>>
>> What we want to protect are leaves in this tree. Every leaf basically
>> corresponds to a physical resource such as a file or an NBD connection.
>> Every leaf is driven by a protocol block driver. We want to protect
>> these physical resources from concurrent access.
>>
>> Ideally, we can just protect the physical resource itself. This works
>> for raw-posix, this works for gluster, this works for raw-win32, and
>> probably some other protocols, too. But I guess it won't work for all
>> protocols, and even if it does, it would need to be implemented.
>>
>> But we can protect leaves in the BDS forest by locking non-leaves also:
>> If you lock a qcow2 node, all of its "file" subtree will be protected;
>> normally, that's just a single leaf.
>>
>> Therefore, I think the ideal approach would be for each BDS tree that is
>> to be created we try to lock all of its leaves, and if that does not
>> work for some, we walk up the tree and try to lock inner nodes (e.g.
>> format BDSs which then use format locking) so that the leaves are still
>> protected even if their protocol does not support that.
>>
>> This could be implemented like this: Whenever a leaf BDS is created, try
>> to lock it. If we can't, leave some information to the parent node that
>> its child could not be locked. Then, the parent will evaluate this
>> information and try to act upon it. This then recurses up the tree. Or,
>> well, down the tree, considering that in most natural trees the root is
>> at the bottom.
>>
>>
>> We could just implement qcow2 locking on top of this series as it is,
>> but this would result in qcow2 files being locked even if their files'
>> protocol nodes have been successfully locked. That would be superfluous
>> and we'd have all the issues with force-unlocking qcow2 files we have
>> discussed before.
>>
>>
>> So what am I saying? I think that it makes sense to consider format
>> locking as a backup alternative to protocol locking in case the latter
>> is not possible. I think it is possible to implement both using the same
>> framework.
>>
>> I don't think we need to worry about the actual implementation of format
>> locking now. But I do think having a framework which supports both
>> format and protocol locking is possible and would be nice to have.
>>
>> Such a framework would require more effort, however, than the basically
>> brute-force "just lock everything" method presented in this patch. Don't
>> get me wrong, this method here works for what it's supposed to do (I
>> haven't reviewed it yet, though), and it's very reasonable if protocol
>> locking is all we intend to have. I'm just suggesting that maybe we do
>> want to have more than that.
>>
>>
>> All in all, I won't object if the locking framework introduced by this
>> series is not supposed to and does not work with format locking. It can
>> always be added later if I really like it so much, and I can definitely
>> understand if it appears to be too much effort for basically no gain
>> right now.
>>
>> As I said above, I just brought this up so I brought it up. :-)
> 
> I don't hate this idea, but it is not necessarily much more effort.  We can
> always check the underlying file in qcow2's locking implementation, can't we?
> 
>     int qcow2_lockf(BlockDriverState *bs, int cmd)
>     {
>         if ((cmd != BDRV_LOCKF_UNLOCK) && !bdrv_is_locked(bs->file)) {
>             return 0;
>         }
>         ...
>     }

Good point. I like that.

> The problem with doing this generically in block layer is the chicken-and-egg
> problem: it's not safe to just have format probling code or qcow2 driver to
> read the image or even writing to the header field for opening it, another
> process could be writing to the image already. A challenge with format
> locking is the lack of file level atomic operations (cmpxchg on the image
> header).

Well, I'm not sure whether we'd need to format-lock an image for
probing, but with the above, we'd circumvent the whole issue anyway.

Max
Max Reitz April 19, 2016, 7:14 p.m. UTC | #7
On 18.04.2016 07:34, Denis V. Lunev wrote:
> On 04/18/2016 04:33 AM, Fam Zheng wrote:
>> On Sun, 04/17 01:29, Max Reitz wrote:
>>> On 15.04.2016 05:27, Fam Zheng wrote:
>>>> Block drivers can implement this new operation .bdrv_lockf to
>>>> actually lock the
>>>> image in the protocol specific way.
>>>>
>>>> Signed-off-by: Fam Zheng <famz@redhat.com>
>>>> ---
>>>>   block.c                   | 42
>>>> ++++++++++++++++++++++++++++++++++++++++++
>>>>   include/block/block_int.h | 12 ++++++++++++
>>>>   2 files changed, 54 insertions(+)
>>> I'm prepared for everyone hating this idea, but I'm just bringing it up
>>> so I can always say I did bring it up.
>>>
>>> Heads up: This will be about qcow2 locking again.
>>>
>>> Relax, though, it won't be about how much better qcow2 locking is better
>>> than protocol locking.
>>>
>>> Now that you know this feel free to drop out.
>>>
>>> This patch implements locking by just trying to lock every single BDS
>>> that is being opened. While it may fulfill its purpose, I don't think
>>> that is what we actually want.
>>>
>>> What we want is the following: qemu has a BDS graph. It is basically a
>>> forest of trees. It may be a bit more complicated (DAGs instead of
>>> trees), but let's just assume it is.
>>>
>>> What we want to protect are leaves in this tree. Every leaf basically
>>> corresponds to a physical resource such as a file or an NBD connection.
>>> Every leaf is driven by a protocol block driver. We want to protect
>>> these physical resources from concurrent access.
>>>
>>> Ideally, we can just protect the physical resource itself. This works
>>> for raw-posix, this works for gluster, this works for raw-win32, and
>>> probably some other protocols, too. But I guess it won't work for all
>>> protocols, and even if it does, it would need to be implemented.
>>>
>>> But we can protect leaves in the BDS forest by locking non-leaves also:
>>> If you lock a qcow2 node, all of its "file" subtree will be protected;
>>> normally, that's just a single leaf.
>>>
>>> Therefore, I think the ideal approach would be for each BDS tree that is
>>> to be created we try to lock all of its leaves, and if that does not
>>> work for some, we walk up the tree and try to lock inner nodes (e.g.
>>> format BDSs which then use format locking) so that the leaves are still
>>> protected even if their protocol does not support that.
>>>
>>> This could be implemented like this: Whenever a leaf BDS is created, try
>>> to lock it. If we can't, leave some information to the parent node that
>>> its child could not be locked. Then, the parent will evaluate this
>>> information and try to act upon it. This then recurses up the tree. Or,
>>> well, down the tree, considering that in most natural trees the root is
>>> at the bottom.
>>>
>>>
>>> We could just implement qcow2 locking on top of this series as it is,
>>> but this would result in qcow2 files being locked even if their files'
>>> protocol nodes have been successfully locked. That would be superfluous
>>> and we'd have all the issues with force-unlocking qcow2 files we have
>>> discussed before.
>>>
>>>
>>> So what am I saying? I think that it makes sense to consider format
>>> locking as a backup alternative to protocol locking in case the latter
>>> is not possible. I think it is possible to implement both using the same
>>> framework.
>>>
>>> I don't think we need to worry about the actual implementation of format
>>> locking now. But I do think having a framework which supports both
>>> format and protocol locking is possible and would be nice to have.
>>>
>>> Such a framework would require more effort, however, than the basically
>>> brute-force "just lock everything" method presented in this patch. Don't
>>> get me wrong, this method here works for what it's supposed to do (I
>>> haven't reviewed it yet, though), and it's very reasonable if protocol
>>> locking is all we intend to have. I'm just suggesting that maybe we do
>>> want to have more than that.
>>>
>>>
>>> All in all, I won't object if the locking framework introduced by this
>>> series is not supposed to and does not work with format locking. It can
>>> always be added later if I really like it so much, and I can definitely
>>> understand if it appears to be too much effort for basically no gain
>>> right now.
>>>
>>> As I said above, I just brought this up so I brought it up. :-)
>> I don't hate this idea, but it is not necessarily much more effort. 
>> We can
>> always check the underlying file in qcow2's locking implementation,
>> can't we?
>>
>>      int qcow2_lockf(BlockDriverState *bs, int cmd)
>>      {
>>          if ((cmd != BDRV_LOCKF_UNLOCK) && !bdrv_is_locked(bs->file)) {
>>              return 0;
>>          }
>>          ...
>>      }
>>
>> The problem with doing this generically in block layer is the
>> chicken-and-egg
>> problem: it's not safe to just have format probing code or qcow2
>> driver to
>> read the image or even writing to the header field for opening it,
>> another
>> process could be writing to the image already. A challenge with format
>> locking is the lack of file level atomic operations (cmpxchg on the image
>> header).
>>
>> Fam
> We should not touch images!
> 
> If QEMU will die, f.e. on assert or by power off, we will suffer
> a REAL pain to guess whether the lock is taken or not.
> flock() and friends are a perfect mechanism for the purpose
> as the kernel will drop corresponding locks by magic.
> 
> Header changes are good for detecting of unclean stops and
> running consistency checks after that, f.e. if we have lazy
> ref-counters on.
> 
> Pls do not do this inside the image. We have eaten that
> stuff in our older products and this is really BAD way to
> follow.

My suggestion was to use format locking only when protocol locking is
not available. And you can always switch it off manually.

Also, the bulk of my argument was not to implement format locking right
now, but to keep in mind that maybe we do want to implement it later.

Max
Denis V. Lunev April 20, 2016, 8:46 a.m. UTC | #8
On 04/19/2016 10:14 PM, Max Reitz wrote:
> On 18.04.2016 07:34, Denis V. Lunev wrote:
>> On 04/18/2016 04:33 AM, Fam Zheng wrote:
>>> On Sun, 04/17 01:29, Max Reitz wrote:
>>>> On 15.04.2016 05:27, Fam Zheng wrote:
>>>>> Block drivers can implement this new operation .bdrv_lockf to
>>>>> actually lock the
>>>>> image in the protocol specific way.
>>>>>
>>>>> Signed-off-by: Fam Zheng <famz@redhat.com>
>>>>> ---
>>>>>    block.c                   | 42
>>>>> ++++++++++++++++++++++++++++++++++++++++++
>>>>>    include/block/block_int.h | 12 ++++++++++++
>>>>>    2 files changed, 54 insertions(+)
>>>> I'm prepared for everyone hating this idea, but I'm just bringing it up
>>>> so I can always say I did bring it up.
>>>>
>>>> Heads up: This will be about qcow2 locking again.
>>>>
>>>> Relax, though, it won't be about how much better qcow2 locking is
>>>> than protocol locking.
>>>>
>>>> Now that you know this feel free to drop out.
>>>>
>>>> This patch implements locking by just trying to lock every single BDS
>>>> that is being opened. While it may fulfill its purpose, I don't think
>>>> that is what we actually want.
>>>>
>>>> What we want is the following: qemu has a BDS graph. It is basically a
>>>> forest of trees. It may be a bit more complicated (DAGs instead of
>>>> trees), but let's just assume it is.
>>>>
>>>> What we want to protect are leaves in this tree. Every leaf basically
>>>> corresponds to a physical resource such as a file or an NBD connection.
>>>> Every leaf is driven by a protocol block driver. We want to protect
>>>> these physical resources from concurrent access.
>>>>
>>>> Ideally, we can just protect the physical resource itself. This works
>>>> for raw-posix, this works for gluster, this works for raw-win32, and
>>>> probably some other protocols, too. But I guess it won't work for all
>>>> protocols, and even if it does, it would need to be implemented.
>>>>
>>>> But we can protect leaves in the BDS forest by locking non-leaves also:
>>>> If you lock a qcow2 node, all of its "file" subtree will be protected;
>>>> normally, that's just a single leaf.
>>>>
>>>> Therefore, I think the ideal approach would be for each BDS tree that is
>>>> to be created we try to lock all of its leaves, and if that does not
>>>> work for some, we walk up the tree and try to lock inner nodes (e.g.
>>>> format BDSs which then use format locking) so that the leaves are still
>>>> protected even if their protocol does not support that.
>>>>
>>>> This could be implemented like this: Whenever a leaf BDS is created, try
>>>> to lock it. If we can't, leave some information to the parent node that
>>>> its child could not be locked. Then, the parent will evaluate this
>>>> information and try to act upon it. This then recurses up the tree. Or,
>>>> well, down the tree, considering that in most natural trees the root is
>>>> at the bottom.
>>>>
>>>>
>>>> We could just implement qcow2 locking on top of this series as it is,
>>>> but this would result in qcow2 files being locked even if their files'
>>>> protocol nodes have been successfully locked. That would be superfluous
>>>> and we'd have all the issues with force-unlocking qcow2 files we have
>>>> discussed before.
>>>>
>>>>
>>>> So what am I saying? I think that it makes sense to consider format
>>>> locking as a backup alternative to protocol locking in case the latter
>>>> is not possible. I think it is possible to implement both using the same
>>>> framework.
>>>>
>>>> I don't think we need to worry about the actual implementation of format
>>>> locking now. But I do think having a framework which supports both
>>>> format and protocol locking is possible and would be nice to have.
>>>>
>>>> Such a framework would require more effort, however, than the basically
>>>> brute-force "just lock everything" method presented in this patch. Don't
>>>> get me wrong, this method here works for what it's supposed to do (I
>>>> haven't reviewed it yet, though), and it's very reasonable if protocol
>>>> locking is all we intend to have. I'm just suggesting that maybe we do
>>>> want to have more than that.
>>>>
>>>>
>>>> All in all, I won't object if the locking framework introduced by this
>>>> series is not supposed to and does not work with format locking. It can
>>>> always be added later if I really like it so much, and I can definitely
>>>> understand if it appears to be too much effort for basically no gain
>>>> right now.
>>>>
>>>> As I said above, I just brought this up so I brought it up. :-)
>>> I don't hate this idea, but it is not necessarily much more effort.
>>> We can
>>> always check the underlying file in qcow2's locking implementation,
>>> can't we?
>>>
>>>       int qcow2_lockf(BlockDriverState *bs, int cmd)
>>>       {
>>>           if ((cmd != BDRV_LOCKF_UNLOCK) && !bdrv_is_locked(bs->file)) {
>>>               return 0;
>>>           }
>>>           ...
>>>       }
>>>
>>> The problem with doing this generically in block layer is the
>>> chicken-and-egg
>>> problem: it's not safe to just have format probing code or qcow2
>>> driver to
>>> read the image or even writing to the header field for opening it,
>>> another
>>> process could be writing to the image already. A challenge with format
>>> locking is the lack of file level atomic operations (cmpxchg on the image
>>> header).
>>>
>>> Fam
>> We should not touch images!
>>
>> If QEMU will die, f.e. on assert or by power off, we will suffer
>> a REAL pain to guess whether the lock is taken or not.
>> flock() and friends are a perfect mechanism for the purpose
>> as the kernel will drop corresponding locks by magic.
>>
>> Header changes are good for detecting of unclean stops and
>> running consistency checks after that, f.e. if we have lazy
>> ref-counters on.
>>
>> Pls do not do this inside the image. We have eaten that
>> stuff in our older products and this is really BAD way to
>> follow.
> My suggestion was to use format locking only when protocol locking is
> not available. And you can always switch it off manually.
>
> Also, the bulk of my argument was not to implement format locking right
> now, but to keep in mind that maybe we do want to implement it later.
>
> Max
>
OK. this sounds sane and fair if we could start with the
current approach and extend it later on with format
locking.

Den
Laszlo Ersek April 25, 2016, 11:55 p.m. UTC | #9
On 04/15/16 05:27, Fam Zheng wrote:
> Block drivers can implement this new operation .bdrv_lockf to actually lock the
> image in the protocol specific way.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
>  include/block/block_int.h | 12 ++++++++++++
>  2 files changed, 54 insertions(+)
> 
> diff --git a/block.c b/block.c
> index 1c575e4..7971a25 100644
> --- a/block.c
> +++ b/block.c
> @@ -846,6 +846,34 @@ out:
>      g_free(gen_node_name);
>  }
>  
> +static int bdrv_lock_unlock_image_do(BlockDriverState *bs, bool lock_image)
> +{
> +    int cmd = BDRV_LOCKF_UNLOCK;
> +
> +    if (bs->image_locked == lock_image) {
> +        return 0;
> +    } else if (!bs->drv) {
> +        return -ENOMEDIUM;
> +    } else if (!bs->drv->bdrv_lockf) {
> +        return 0;
> +    }
> +    if (lock_image) {
> +        cmd = bs->open_flags & BDRV_O_RDWR ? BDRV_LOCKF_RWLOCK :
> +                                             BDRV_LOCKF_ROLOCK;
> +    }
> +    return bs->drv->bdrv_lockf(bs, cmd);
> +}
> +
> +static int bdrv_lock_image(BlockDriverState *bs)
> +{
> +    return bdrv_lock_unlock_image_do(bs, true);
> +}
> +
> +static int bdrv_unlock_image(BlockDriverState *bs)
> +{
> +    return bdrv_lock_unlock_image_do(bs, false);
> +}
> +
>  static QemuOptsList bdrv_runtime_opts = {
>      .name = "bdrv_common",
>      .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
> @@ -995,6 +1023,14 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
>          goto free_and_fail;
>      }
>  
> +    if (!(open_flags & (BDRV_O_NO_LOCK | BDRV_O_INACTIVE))) {
> +        ret = bdrv_lock_image(bs);
> +        if (ret) {
> +            error_setg(errp, "Failed to lock image");
> +            goto free_and_fail;
> +        }
> +    }
> +
>      ret = refresh_total_sectors(bs, bs->total_sectors);
>      if (ret < 0) {
>          error_setg_errno(errp, -ret, "Could not refresh total sector count");
> @@ -2144,6 +2180,7 @@ static void bdrv_close(BlockDriverState *bs)
>      if (bs->drv) {
>          BdrvChild *child, *next;
>  
> +        bdrv_unlock_image(bs);
>          bs->drv->bdrv_close(bs);
>          bs->drv = NULL;
>  
> @@ -3230,6 +3267,9 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
>          error_setg_errno(errp, -ret, "Could not refresh total sector count");
>          return;
>      }
> +    if (!(bs->open_flags & BDRV_O_NO_LOCK)) {
> +        bdrv_lock_image(bs);
> +    }
>  }
>  
>  void bdrv_invalidate_cache_all(Error **errp)
> @@ -3262,6 +3302,7 @@ static int bdrv_inactivate(BlockDriverState *bs)
>      }
>  
>      bs->open_flags |= BDRV_O_INACTIVE;
> +    ret = bdrv_unlock_image(bs);
>      return 0;
>  }
>  
> @@ -3981,3 +4022,4 @@ void bdrv_refresh_filename(BlockDriverState *bs)
>          QDECREF(json);
>      }
>  }
> +
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 10d8759..ffa30b0 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -85,6 +85,12 @@ typedef struct BdrvTrackedRequest {
>      struct BdrvTrackedRequest *waiting_for;
>  } BdrvTrackedRequest;
>  
> +typedef enum {
> +    BDRV_LOCKF_RWLOCK,
> +    BDRV_LOCKF_ROLOCK,
> +    BDRV_LOCKF_UNLOCK,
> +} BdrvLockfCmd;
> +
>  struct BlockDriver {
>      const char *format_name;
>      int instance_size;
> @@ -317,6 +323,11 @@ struct BlockDriver {
>       */
>      void (*bdrv_drain)(BlockDriverState *bs);
>  
> +    /**
> +     * Lock/unlock the image.
> +     */
> +    int (*bdrv_lockf)(BlockDriverState *bs, BdrvLockfCmd cmd);
> +
>      QLIST_ENTRY(BlockDriver) list;
>  };
>  
> @@ -485,6 +496,7 @@ struct BlockDriverState {
>      NotifierWithReturn write_threshold_notifier;
>  
>      int quiesce_counter;
> +    bool image_locked;
>  };
>  
>  struct BlockBackendRootState {
> 

I'd like to raise one point which I think may not have been, yet (after
briefly skimming the v1 / v2 comments). Sorry if this has been discussed
already.

IIUC, the idea is that "protocols" (in the block layer sense) implement
the lockf method, and then bdrv_open_common() automatically locks image
files, if the lockf method is available, and if various settings
(cmdline options etc) don't request otherwise.

I tried to see if this series modifies -- for example --
raw_reopen_commit() and raw_reopen_abort(), in "block/raw-posix.c". Or,
if it modifies bdrv_reopen_multiple(), in "block.c". It doesn't seem to.

Those functions are relevant for the following reason. Given the
following chain of references:

  file descriptor --> file description --> file

an fcntl() lock is associated with the file. However, the fcntl() lock
held by the process on the file is dropped if the process closes *any*
file descriptor that points (through the same or another file
description) to the file. From
<http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html>:

    All locks associated with a file for a given process shall be
    removed when a file descriptor for that file is closed by that
    process [...]

From <http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html>:

    All outstanding record locks owned by the process on the file
    associated with the file descriptor shall be removed (that is,
    unlocked).

From <http://man7.org/linux/man-pages/man2/fcntl.2.html>:

    If a process closes any file descriptor referring to a file, then
    all of the process's locks on that file are released, regardless of
    the file descriptor(s) on which the locks were obtained.

The bdrv_reopen_multiple() function reopens a bunch of image files.
Backed by the raw-posix protocol driver, this seems to boil down to a
series of (i) fcntl(F_DUPFD_CLOEXEC), and/or (ii) dup(), and/or (iii)
qemu_open() calls, in raw_reopen_prepare(). The result is stored in
"raw_s->fd" every time.

(In the first two cases, the file description will be shared, in the
third case, the file will be shared, between "s->fd" and "raw_s->fd".)

Assume that one of the raw_reopen_prepare() calls fails. Then
bdrv_reopen_multiple() will roll back the work done thus far, calling
raw_reopen_abort() on the initial subset of image files. This results in
"raw_s->fd" being passed to close(), which is when the lock
(conceptually held for "s->fd") is dropped for good.

If all of the raw_reopen_prepare() calls succeed, then a series of
raw_reopen_commit() calls will occur. That has the same effect: "s->fd"
is passed to close(), which drops the lock for "raw_s->fd" too (which is
supposed to be used for accessing the file, going forward).

Sorry if this is already handled in the series, I couldn't find it.

Thanks
Laszlo
Fam Zheng April 26, 2016, 12:47 a.m. UTC | #10
On Tue, 04/26 01:55, Laszlo Ersek wrote:
> On 04/15/16 05:27, Fam Zheng wrote:
> > Block drivers can implement this new operation .bdrv_lockf to actually lock the
> > image in the protocol specific way.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> > ---
> >  block.c                   | 42 ++++++++++++++++++++++++++++++++++++++++++
> >  include/block/block_int.h | 12 ++++++++++++
> >  2 files changed, 54 insertions(+)
> > 
> > diff --git a/block.c b/block.c
> > index 1c575e4..7971a25 100644
> > --- a/block.c
> > +++ b/block.c
> > @@ -846,6 +846,34 @@ out:
> >      g_free(gen_node_name);
> >  }
> >  
> > +static int bdrv_lock_unlock_image_do(BlockDriverState *bs, bool lock_image)
> > +{
> > +    int cmd = BDRV_LOCKF_UNLOCK;
> > +
> > +    if (bs->image_locked == lock_image) {
> > +        return 0;
> > +    } else if (!bs->drv) {
> > +        return -ENOMEDIUM;
> > +    } else if (!bs->drv->bdrv_lockf) {
> > +        return 0;
> > +    }
> > +    if (lock_image) {
> > +        cmd = bs->open_flags & BDRV_O_RDWR ? BDRV_LOCKF_RWLOCK :
> > +                                             BDRV_LOCKF_ROLOCK;
> > +    }
> > +    return bs->drv->bdrv_lockf(bs, cmd);
> > +}
> > +
> > +static int bdrv_lock_image(BlockDriverState *bs)
> > +{
> > +    return bdrv_lock_unlock_image_do(bs, true);
> > +}
> > +
> > +static int bdrv_unlock_image(BlockDriverState *bs)
> > +{
> > +    return bdrv_lock_unlock_image_do(bs, false);
> > +}
> > +
> >  static QemuOptsList bdrv_runtime_opts = {
> >      .name = "bdrv_common",
> >      .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
> > @@ -995,6 +1023,14 @@ static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
> >          goto free_and_fail;
> >      }
> >  
> > +    if (!(open_flags & (BDRV_O_NO_LOCK | BDRV_O_INACTIVE))) {
> > +        ret = bdrv_lock_image(bs);
> > +        if (ret) {
> > +            error_setg(errp, "Failed to lock image");
> > +            goto free_and_fail;
> > +        }
> > +    }
> > +
> >      ret = refresh_total_sectors(bs, bs->total_sectors);
> >      if (ret < 0) {
> >          error_setg_errno(errp, -ret, "Could not refresh total sector count");
> > @@ -2144,6 +2180,7 @@ static void bdrv_close(BlockDriverState *bs)
> >      if (bs->drv) {
> >          BdrvChild *child, *next;
> >  
> > +        bdrv_unlock_image(bs);
> >          bs->drv->bdrv_close(bs);
> >          bs->drv = NULL;
> >  
> > @@ -3230,6 +3267,9 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
> >          error_setg_errno(errp, -ret, "Could not refresh total sector count");
> >          return;
> >      }
> > +    if (!(bs->open_flags & BDRV_O_NO_LOCK)) {
> > +        bdrv_lock_image(bs);
> > +    }
> >  }
> >  
> >  void bdrv_invalidate_cache_all(Error **errp)
> > @@ -3262,6 +3302,7 @@ static int bdrv_inactivate(BlockDriverState *bs)
> >      }
> >  
> >      bs->open_flags |= BDRV_O_INACTIVE;
> > +    ret = bdrv_unlock_image(bs);
> >      return 0;
> >  }
> >  
> > @@ -3981,3 +4022,4 @@ void bdrv_refresh_filename(BlockDriverState *bs)
> >          QDECREF(json);
> >      }
> >  }
> > +
> > diff --git a/include/block/block_int.h b/include/block/block_int.h
> > index 10d8759..ffa30b0 100644
> > --- a/include/block/block_int.h
> > +++ b/include/block/block_int.h
> > @@ -85,6 +85,12 @@ typedef struct BdrvTrackedRequest {
> >      struct BdrvTrackedRequest *waiting_for;
> >  } BdrvTrackedRequest;
> >  
> > +typedef enum {
> > +    BDRV_LOCKF_RWLOCK,
> > +    BDRV_LOCKF_ROLOCK,
> > +    BDRV_LOCKF_UNLOCK,
> > +} BdrvLockfCmd;
> > +
> >  struct BlockDriver {
> >      const char *format_name;
> >      int instance_size;
> > @@ -317,6 +323,11 @@ struct BlockDriver {
> >       */
> >      void (*bdrv_drain)(BlockDriverState *bs);
> >  
> > +    /**
> > +     * Lock/unlock the image.
> > +     */
> > +    int (*bdrv_lockf)(BlockDriverState *bs, BdrvLockfCmd cmd);
> > +
> >      QLIST_ENTRY(BlockDriver) list;
> >  };
> >  
> > @@ -485,6 +496,7 @@ struct BlockDriverState {
> >      NotifierWithReturn write_threshold_notifier;
> >  
> >      int quiesce_counter;
> > +    bool image_locked;
> >  };
> >  
> >  struct BlockBackendRootState {
> > 
> 
> I'd like to raise one point which I think may not have been, yet (after
> briefly skimming the v1 / v2 comments). Sorry if this has been discussed
> already.
> 
> IIUC, the idea is that "protocols" (in the block layer sense) implement
> the lockf method, and then bdrv_open_common() automatically locks image
> files, if the lockf method is available, and if various settings
> (cmdline options etc) don't request otherwise.
> 
> I tried to see if this series modifies -- for example --
> raw_reopen_commit() and raw_reopen_abort(), in "block/raw-posix.c". Or,
> if it modifies bdrv_reopen_multiple(), in "block.c". It doesn't seem to.
> 
> Those functions are relevant for the following reason. Given the
> following chain of references:
> 
>   file descriptor --> file description --> file
> 
> an fcntl() lock is associated with the file. However, the fcntl() lock
> held by the process on the file is dropped if the process closes *any*
> file descriptor that points (through the same or another file
> description) to the file. From
> <http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html>:
> 
>     All locks associated with a file for a given process shall be
>     removed when a file descriptor for that file is closed by that
>     process [...]
> 
> From <http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html>:
> 
>     All outstanding record locks owned by the process on the file
>     associated with the file descriptor shall be removed (that is,
>     unlocked).
> 
> From <http://man7.org/linux/man-pages/man2/fcntl.2.html>:
> 
>     If a process closes any file descriptor referring to a file, then
>     all of the process's locks on that file are released, regardless of
>     the file descriptor(s) on which the locks were obtained.
> 
> The bdrv_reopen_multiple() function reopens a bunch of image files.
> Backed by the raw-posix protocol driver, this seems to boil down to a
> series of (i) fcntl(F_DUPFD_CLOEXEC), and/or (ii) dup(), and/or (iii)
> qemu_open() calls, in raw_reopen_prepare(). The result is stored in
> "raw_s->fd" every time.
> 
> (In the first two cases, the file description will be shared, in the
> third case, the file will be shared, between "s->fd" and "raw_s->fd".)
> 
> Assume that one of the raw_reopen_prepare() calls fails. Then
> bdrv_reopen_multiple() will roll back the work done thus far, calling
> raw_reopen_abort() on the initial subset of image files. This results in
> "raw_s->fd" being passed to close(), which is when the lock
> (conceptually held for "s->fd") is dropped for good.
> 
> If all of the raw_reopen_prepare() calls succeed, then a series of
> raw_reopen_commit() calls will occur. That has the same effect: "s->fd"
> is passed to close(), which drops the lock for "raw_s->fd" too (which is
> supposed to be used for accessing the file, going forward).
> 

Yes, this is a good catch! I'll take care of it in next version, and add some
tests.

Thanks!

Fam
diff mbox

Patch

diff --git a/block.c b/block.c
index 1c575e4..7971a25 100644
--- a/block.c
+++ b/block.c
@@ -846,6 +846,34 @@  out:
     g_free(gen_node_name);
 }
 
+static int bdrv_lock_unlock_image_do(BlockDriverState *bs, bool lock_image)
+{
+    int cmd = BDRV_LOCKF_UNLOCK;
+
+    if (bs->image_locked == lock_image) {
+        return 0;
+    } else if (!bs->drv) {
+        return -ENOMEDIUM;
+    } else if (!bs->drv->bdrv_lockf) {
+        return 0;
+    }
+    if (lock_image) {
+        cmd = bs->open_flags & BDRV_O_RDWR ? BDRV_LOCKF_RWLOCK :
+                                             BDRV_LOCKF_ROLOCK;
+    }
+    return bs->drv->bdrv_lockf(bs, cmd);
+}
+
+static int bdrv_lock_image(BlockDriverState *bs)
+{
+    return bdrv_lock_unlock_image_do(bs, true);
+}
+
+static int bdrv_unlock_image(BlockDriverState *bs)
+{
+    return bdrv_lock_unlock_image_do(bs, false);
+}
+
 static QemuOptsList bdrv_runtime_opts = {
     .name = "bdrv_common",
     .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
@@ -995,6 +1023,14 @@  static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
         goto free_and_fail;
     }
 
+    if (!(open_flags & (BDRV_O_NO_LOCK | BDRV_O_INACTIVE))) {
+        ret = bdrv_lock_image(bs);
+        if (ret) {
+            error_setg(errp, "Failed to lock image");
+            goto free_and_fail;
+        }
+    }
+
     ret = refresh_total_sectors(bs, bs->total_sectors);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "Could not refresh total sector count");
@@ -2144,6 +2180,7 @@  static void bdrv_close(BlockDriverState *bs)
     if (bs->drv) {
         BdrvChild *child, *next;
 
+        bdrv_unlock_image(bs);
         bs->drv->bdrv_close(bs);
         bs->drv = NULL;
 
@@ -3230,6 +3267,9 @@  void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
         error_setg_errno(errp, -ret, "Could not refresh total sector count");
         return;
     }
+    if (!(bs->open_flags & BDRV_O_NO_LOCK)) {
+        bdrv_lock_image(bs);
+    }
 }
 
 void bdrv_invalidate_cache_all(Error **errp)
@@ -3262,6 +3302,7 @@  static int bdrv_inactivate(BlockDriverState *bs)
     }
 
     bs->open_flags |= BDRV_O_INACTIVE;
+    ret = bdrv_unlock_image(bs);
     return 0;
 }
 
@@ -3981,3 +4022,4 @@  void bdrv_refresh_filename(BlockDriverState *bs)
         QDECREF(json);
     }
 }
+
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 10d8759..ffa30b0 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -85,6 +85,12 @@  typedef struct BdrvTrackedRequest {
     struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;
 
+typedef enum {
+    BDRV_LOCKF_RWLOCK,
+    BDRV_LOCKF_ROLOCK,
+    BDRV_LOCKF_UNLOCK,
+} BdrvLockfCmd;
+
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -317,6 +323,11 @@  struct BlockDriver {
      */
     void (*bdrv_drain)(BlockDriverState *bs);
 
+    /**
+     * Lock/unlock the image.
+     */
+    int (*bdrv_lockf)(BlockDriverState *bs, BdrvLockfCmd cmd);
+
     QLIST_ENTRY(BlockDriver) list;
 };
 
@@ -485,6 +496,7 @@  struct BlockDriverState {
     NotifierWithReturn write_threshold_notifier;
 
     int quiesce_counter;
+    bool image_locked;
 };
 
 struct BlockBackendRootState {