
[v6,08/22] raw-posix: Add image locking support

Message ID 1464943756-14143-9-git-send-email-famz@redhat.com (mailing list archive)
State New, archived

Commit Message

Fam Zheng June 3, 2016, 8:49 a.m. UTC
virtlockd in libvirt locks the first byte, so we lock byte 1 to avoid
interfering with it.

Both file and host device protocols are covered.

The complication is with reopen. We have three different locking states,
namely "unlocked", "shared locked" and "exclusively locked".

When we reopen, the new fd may need a new locking mode. Moving to or from
exclusive is a bit tricky because we cannot do it atomically. This patch
solves it by dup()ing s->fd to s->lock_fd and avoiding close(), so that there
isn't a racy window where we drop the lock on one fd before acquiring the
exclusive lock on the other.

To make the logic easier to manage, and to allow better reuse, the code is
internally organized as a state transition table (old_lock -> new_lock).

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block/raw-posix.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 285 insertions(+)
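
For reference, the byte-range locking underneath is small: qemu_lock_fd() and
qemu_unlock_fd(), added earlier in this series, wrap fcntl() record locks over
a given (start, length) range. Below is a minimal stand-alone sketch of that
idea, assuming Linux open file description (OFD) locks; the helper name and
error handling are illustrative, not the series' actual implementation.

#define _GNU_SOURCE             /* for F_OFD_SETLK on glibc */
#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical stand-in for qemu_lock_fd(): lock one byte at 'start'.
 * virtlockd locks byte 0 of the image, QEMU locks byte 1, so the two
 * never conflict even though they target the same file. */
static int image_lock_byte(int fd, off_t start, bool exclusive)
{
    struct flock fl;

    memset(&fl, 0, sizeof(fl));
    fl.l_whence = SEEK_SET;
    fl.l_start = start;                 /* byte 1 for the QEMU image lock */
    fl.l_len = 1;                       /* lock exactly one byte */
    fl.l_type = exclusive ? F_WRLCK : F_RDLCK;

    /* F_OFD_SETLK ties the lock to the open file description rather than
     * the process, so it stays held as long as any dup()ed fd is open. */
    if (fcntl(fd, F_OFD_SETLK, &fl) == -1) {
        return -errno;
    }
    return 0;
}

Unlocking works the same way with l_type = F_UNLCK. The OFD semantics are what
the rest of the patch relies on: a lock taken through s->lock_fd stays valid
when s->fd is closed, because the dup()ed descriptor keeps the open file
description alive.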

Comments

Fam Zheng June 3, 2016, 11:53 p.m. UTC | #1
On Fri, 06/03 16:49, Fam Zheng wrote:
> +static
> +int raw_reopen_downgrade(BDRVReopenState *state,
> +                         RawReopenOperation op,
> +                         BdrvLockfCmd old_lock,
> +                         BdrvLockfCmd new_lock,
> +                         Error **errp)
> +{
> +    BDRVRawReopenState *raw_s = state->opaque;
> +    BDRVRawState *s = state->bs->opaque;
> +    int ret;

This should be initialized to 0 for the nop branches.

Fam

> +
> +    assert(old_lock == BDRV_LOCKF_EXCLUSIVE);
> +    assert(new_lock == BDRV_LOCKF_SHARED);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to downgrade old lock");
> +            break;
> +        }
> +        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to lock new fd");
> +            break;
> +        }
> +        break;
> +    case RAW_REOPEN_ABORT:
> +        break;
> +    }
> +
> +    return ret;
> +}
Kevin Wolf June 17, 2016, 1:07 p.m. UTC | #2
On 03.06.2016 at 10:49, Fam Zheng wrote:
> virtlockd in libvirt locks the first byte, we lock byte 1 to avoid
> the intervene.
> 
> Both file and host device protocols are covered.
> 
> The complication is with reopen. We have three different locking states,
> namely "unlocked", "shared locked" and "exclusively locked".
> 
> There have three different states, "unlocked", "shared locked" and "exclusively
> locked".

This seems to be a corrupted copy of the previous sentence. :-)

> When we reopen, the new fd may need a new locking mode. Moving away to
> or from exclusive is a bit tricky because we cannot do it atomically. This
> patch solves it by dup() s->fd to s->lock_fd and avoid close(), so that there
> isn't a racy window where we drop the lock on one fd before acquiring the
> exclusive lock on the other.
> 
> To make the logic easier to manage, and allow better reuse, the code is
> internally organized by state transition table (old_lock -> new_lock).
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>

I must admit that I don't fully understand yet why we can't change the
lock atomically and how s->lock_fd helps. In any case, I think it
deserves comments in the code and not only in the commit message.

So I'm not giving a full review here, but I think I have one important
point to make at least.

>  block/raw-posix.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 285 insertions(+)
> 
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index bb8669f..6347350 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -133,6 +133,7 @@ do { \
>  
>  typedef struct BDRVRawState {
>      int fd;
> +    int lock_fd;
>      int type;
>      int open_flags;
>      size_t buf_align;
> @@ -153,6 +154,7 @@ typedef struct BDRVRawState {
>  
>  typedef struct BDRVRawReopenState {
>      int fd;
> +    int lock_fd;
>      int open_flags;
>  #ifdef CONFIG_LINUX_AIO
>      int use_aio;
> @@ -397,6 +399,37 @@ static void raw_attach_aio_context(BlockDriverState *bs,
>  #endif
>  }
>  
> +static int raw_lockf_fd(int fd, BdrvLockfCmd cmd)
> +{
> +    assert(fd >= 0);
> +    /* Locking byte 1 avoids interfering with virtlockd. */
> +    switch (cmd) {
> +    case BDRV_LOCKF_EXCLUSIVE:
> +        return qemu_lock_fd(fd, 1, 1, true);
> +    case BDRV_LOCKF_SHARED:
> +        return qemu_lock_fd(fd, 1, 1, false);
> +    case BDRV_LOCKF_UNLOCK:
> +        return qemu_unlock_fd(fd, 1, 1);
> +    default:
> +        abort();
> +    }
> +}
> +
> +static int raw_lockf(BlockDriverState *bs, BdrvLockfCmd cmd)
> +{
> +
> +    BDRVRawState *s = bs->opaque;
> +
> +    if (s->lock_fd < 0) {
> +        s->lock_fd = qemu_dup(s->fd);
> +        if (s->lock_fd < 0) {
> +            return s->lock_fd;
> +        }
> +    }
> +
> +    return raw_lockf_fd(s->lock_fd, cmd);
> +}
> +
>  #ifdef CONFIG_LINUX_AIO
>  static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags)
>  {
> @@ -483,6 +516,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>      raw_parse_flags(bdrv_flags, &s->open_flags);
>  
>      s->fd = -1;
> +    s->lock_fd = -1;
>      fd = qemu_open(filename, s->open_flags, 0644);
>      if (fd < 0) {
>          ret = -errno;
> @@ -593,6 +627,241 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
>      return ret;
>  }
>  
> +typedef enum {
> +    RAW_REOPEN_PREPARE,
> +    RAW_REOPEN_COMMIT,
> +    RAW_REOPEN_ABORT
> +} RawReopenOperation;
> +
> +typedef int (*RawReopenFunc)(BDRVReopenState *state,
> +                             RawReopenOperation op,
> +                             BdrvLockfCmd old_lock,
> +                             BdrvLockfCmd new_lock,
> +                             Error **errp);
> +
> +static
> +int raw_reopen_identical(BDRVReopenState *state,

This is unusual formatting. I'm used to having everything on a single
line or "static int" on its own line, but breaking between "static" and
"int" feels odd.

> +                         RawReopenOperation op,
> +                         BdrvLockfCmd old_lock,
> +                         BdrvLockfCmd new_lock,
> +                         Error **errp)
> +{
> +    assert(old_lock == new_lock);
> +    return 0;
> +}
> +
> +static
> +int raw_reopen_from_unlock(BDRVReopenState *state,
> +                           RawReopenOperation op,
> +                           BdrvLockfCmd old_lock,
> +                           BdrvLockfCmd new_lock,
> +                           Error **errp)
> +{
> +    BDRVRawReopenState *raw_s = state->opaque;
> +    int ret = 0;
> +
> +    assert(old_lock != new_lock);
> +    assert(old_lock == BDRV_LOCKF_UNLOCK);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        ret = raw_lockf_fd(raw_s->lock_fd, new_lock);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to lock new fd");
> +        }
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +    case RAW_REOPEN_ABORT:
> +        break;
> +    }
> +
> +    return ret;
> +}
> +
> +static
> +int raw_reopen_to_unlock(BDRVReopenState *state,
> +                         RawReopenOperation op,
> +                         BdrvLockfCmd old_lock,
> +                         BdrvLockfCmd new_lock,
> +                         Error **errp)
> +{
> +    BDRVRawState *s = state->bs->opaque;
> +    int ret = 0;
> +
> +    assert(old_lock != new_lock);
> +    assert(new_lock == BDRV_LOCKF_UNLOCK);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +        if (s->lock_fd >= 0) {
> +            qemu_close(s->lock_fd);
> +            s->lock_fd = -1;
> +        }
> +        break;
> +    case RAW_REOPEN_ABORT:
> +        break;
> +    }
> +
> +    return ret;
> +}
> +
> +static
> +int raw_reopen_upgrade(BDRVReopenState *state,
> +                       RawReopenOperation op,
> +                       BdrvLockfCmd old_lock,
> +                       BdrvLockfCmd new_lock,
> +                       Error **errp)
> +{
> +    BDRVRawReopenState *raw_s = state->opaque;
> +    BDRVRawState *s = state->bs->opaque;
> +    int ret = 0, ret2;
> +
> +    assert(old_lock == BDRV_LOCKF_SHARED);
> +    assert(new_lock == BDRV_LOCKF_EXCLUSIVE);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to lock new fd (shared)");
> +            break;
> +        }
> +        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_UNLOCK);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to unlock old fd");
> +            goto restore;
> +        }
> +        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_EXCLUSIVE);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "Failed to lock new fd (exclusive)");
> +            goto restore;
> +        }
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +        break;
> +    case RAW_REOPEN_ABORT:
> +        raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
> +        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to restore lock on old fd");
> +        }
> +        break;
> +    }
> +
> +    return ret;
> +restore:
> +    ret2 = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
> +    if (ret2) {
> +        error_report("Failed to restore old lock");
> +    }
> +    return ret;
> +
> +}

That final empty line doesn't look intentional.

> +
> +static
> +int raw_reopen_downgrade(BDRVReopenState *state,
> +                         RawReopenOperation op,
> +                         BdrvLockfCmd old_lock,
> +                         BdrvLockfCmd new_lock,
> +                         Error **errp)
> +{
> +    BDRVRawReopenState *raw_s = state->opaque;
> +    BDRVRawState *s = state->bs->opaque;
> +    int ret;
> +
> +    assert(old_lock == BDRV_LOCKF_EXCLUSIVE);
> +    assert(new_lock == BDRV_LOCKF_SHARED);
> +    switch (op) {
> +    case RAW_REOPEN_PREPARE:
> +        break;
> +    case RAW_REOPEN_COMMIT:
> +        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to downgrade old lock");
> +            break;
> +        }
> +        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
> +        if (ret) {
> +            error_report("Failed to lock new fd");
> +            break;
> +        }
> +        break;
> +    case RAW_REOPEN_ABORT:
> +        break;
> +    }
> +
> +    return ret;
> +}
> +
> +static const struct RawReopenFuncRecord {
> +    BdrvLockfCmd old_lock;
> +    BdrvLockfCmd new_lock;
> +    RawReopenFunc func;
> +    bool need_lock_fd;
> +} reopen_functions[] = {
> +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false},
> +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true},
> +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true},
> +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
> +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false},
> +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true},
> +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
> +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true},
> +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, false},
> +};
> +
> +static int raw_reopen_handle_lock(BDRVReopenState *state,
> +                                  RawReopenOperation op,
> +                                  Error **errp)

I think we have one big problem here: We don't know whether raw_s->fd is
already locked or not. If dup() and setting the new flags with fcntl()
succeeded, it is, but if we had to fall back on qemu_open(), it isn't.

This means that doing nothing in the raw_reopen_identical case isn't
right because reopening without intending to change anything about the
locking could end up unlocking the image.

Kevin
Fam Zheng June 22, 2016, 8:27 a.m. UTC | #3
On Fri, 06/17 15:07, Kevin Wolf wrote:
> On 03.06.2016 at 10:49, Fam Zheng wrote:
> > virtlockd in libvirt locks the first byte, we lock byte 1 to avoid
> > the intervene.
> > 
> > Both file and host device protocols are covered.
> > 
> > The complication is with reopen. We have three different locking states,
> > namely "unlocked", "shared locked" and "exclusively locked".
> > 
> > There have three different states, "unlocked", "shared locked" and "exclusively
> > locked".
> 
> This seems to be a corrupted copy of the previous sentence. :-)

Right, fixing.

> 
> > When we reopen, the new fd may need a new locking mode. Moving away to
> > or from exclusive is a bit tricky because we cannot do it atomically. This
> > patch solves it by dup() s->fd to s->lock_fd and avoid close(), so that there
> > isn't a racy window where we drop the lock on one fd before acquiring the
> > exclusive lock on the other.
> > 
> > To make the logic easier to manage, and allow better reuse, the code is
> > internally organized by state transition table (old_lock -> new_lock).
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> 
> I must admit that I don't fully understand yet why we can't change the
> > lock atomically and how s->lock_fd helps. In any case, I think it
> deserves comments in the code and not only in the commit message.

I'll add comments in the code too.

> > +static const struct RawReopenFuncRecord {
> > +    BdrvLockfCmd old_lock;
> > +    BdrvLockfCmd new_lock;
> > +    RawReopenFunc func;
> > +    bool need_lock_fd;
> > +} reopen_functions[] = {
> > +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false},
> > +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true},
> > +    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true},
> > +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
> > +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false},
> > +    {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true},
> > +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
> > +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true},
> > +    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, false},
> > +};
> > +
> > +static int raw_reopen_handle_lock(BDRVReopenState *state,
> > +                                  RawReopenOperation op,
> > +                                  Error **errp)
> 
> I think we have one big problem here: We don't know whether raw_s->fd is
> already locked or not. If dup() and setting the new flags with fcntl()
> succeeded, it is, but if we had to fall back on qemu_open(), it isn't.
> 
> This means that doing nothing in the raw_reopen_identical case isn't
> right because reopening without intending to change anything about the
> locking could end up unlocking the image.
> 

Unless I'm missing something, we don't rely on that, because raw_s->fd is never
locked. Instead, raw_s->lock_fd, as a dup() of raw_s->fd, is what we actually
handle in raw_reopen_identical(), and it always has the correct state when
raw_reopen_handle_lock() is called.  (That is also an advantage of introducing
raw_s->lock_fd.)

Fam
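
To restate the answer above in stand-alone form: the image lock never lives on
the data fd itself but on a dup()ed descriptor, so replacing the data fd during
reopen cannot drop it by accident. A toy illustration under the same OFD-lock
assumption as the sketch near the top; the file name and helper are invented
for the example.

#define _GNU_SOURCE             /* for F_OFD_SETLK on glibc */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Take (F_RDLCK) or release (F_UNLCK) a lock on byte 1, as in the
 * earlier sketch. */
static int set_byte1_lock(int fd, short type)
{
    struct flock fl = { .l_type = type, .l_whence = SEEK_SET,
                        .l_start = 1, .l_len = 1 };
    return fcntl(fd, F_OFD_SETLK, &fl) == -1 ? -1 : 0;
}

int main(void)
{
    int data_fd = open("disk.img", O_RDWR);  /* plays the role of s->fd */
    int lock_fd = dup(data_fd);              /* plays the role of s->lock_fd */

    if (data_fd < 0 || lock_fd < 0 || set_byte1_lock(lock_fd, F_RDLCK) < 0) {
        perror("setup");
        return 1;
    }

    /* "Reopen": the data fd is closed and replaced ... */
    close(data_fd);
    data_fd = open("disk.img", O_RDWR);

    /* ... but the shared lock is still held, because it lives on lock_fd,
     * which shares the original open file description. */
    printf("data fd replaced (%d), lock still held via fd %d\n",
           data_fd, lock_fd);

    set_byte1_lock(lock_fd, F_UNLCK);
    close(lock_fd);
    close(data_fd);
    return 0;
}

This is also why raw_reopen_identical() can simply return 0: when the locking
mode does not change, the existing s->lock_fd keeps holding the old lock and
only the data fd is swapped.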

Patch

diff --git a/block/raw-posix.c b/block/raw-posix.c
index bb8669f..6347350 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -133,6 +133,7 @@  do { \
 
 typedef struct BDRVRawState {
     int fd;
+    int lock_fd;
     int type;
     int open_flags;
     size_t buf_align;
@@ -153,6 +154,7 @@  typedef struct BDRVRawState {
 
 typedef struct BDRVRawReopenState {
     int fd;
+    int lock_fd;
     int open_flags;
 #ifdef CONFIG_LINUX_AIO
     int use_aio;
@@ -397,6 +399,37 @@  static void raw_attach_aio_context(BlockDriverState *bs,
 #endif
 }
 
+static int raw_lockf_fd(int fd, BdrvLockfCmd cmd)
+{
+    assert(fd >= 0);
+    /* Locking byte 1 avoids interfering with virtlockd. */
+    switch (cmd) {
+    case BDRV_LOCKF_EXCLUSIVE:
+        return qemu_lock_fd(fd, 1, 1, true);
+    case BDRV_LOCKF_SHARED:
+        return qemu_lock_fd(fd, 1, 1, false);
+    case BDRV_LOCKF_UNLOCK:
+        return qemu_unlock_fd(fd, 1, 1);
+    default:
+        abort();
+    }
+}
+
+static int raw_lockf(BlockDriverState *bs, BdrvLockfCmd cmd)
+{
+
+    BDRVRawState *s = bs->opaque;
+
+    if (s->lock_fd < 0) {
+        s->lock_fd = qemu_dup(s->fd);
+        if (s->lock_fd < 0) {
+            return s->lock_fd;
+        }
+    }
+
+    return raw_lockf_fd(s->lock_fd, cmd);
+}
+
 #ifdef CONFIG_LINUX_AIO
 static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags)
 {
@@ -483,6 +516,7 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     raw_parse_flags(bdrv_flags, &s->open_flags);
 
     s->fd = -1;
+    s->lock_fd = -1;
     fd = qemu_open(filename, s->open_flags, 0644);
     if (fd < 0) {
         ret = -errno;
@@ -593,6 +627,241 @@  static int raw_open(BlockDriverState *bs, QDict *options, int flags,
     return ret;
 }
 
+typedef enum {
+    RAW_REOPEN_PREPARE,
+    RAW_REOPEN_COMMIT,
+    RAW_REOPEN_ABORT
+} RawReopenOperation;
+
+typedef int (*RawReopenFunc)(BDRVReopenState *state,
+                             RawReopenOperation op,
+                             BdrvLockfCmd old_lock,
+                             BdrvLockfCmd new_lock,
+                             Error **errp);
+
+static
+int raw_reopen_identical(BDRVReopenState *state,
+                         RawReopenOperation op,
+                         BdrvLockfCmd old_lock,
+                         BdrvLockfCmd new_lock,
+                         Error **errp)
+{
+    assert(old_lock == new_lock);
+    return 0;
+}
+
+static
+int raw_reopen_from_unlock(BDRVReopenState *state,
+                           RawReopenOperation op,
+                           BdrvLockfCmd old_lock,
+                           BdrvLockfCmd new_lock,
+                           Error **errp)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+    int ret = 0;
+
+    assert(old_lock != new_lock);
+    assert(old_lock == BDRV_LOCKF_UNLOCK);
+    switch (op) {
+    case RAW_REOPEN_PREPARE:
+        ret = raw_lockf_fd(raw_s->lock_fd, new_lock);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd");
+        }
+        break;
+    case RAW_REOPEN_COMMIT:
+    case RAW_REOPEN_ABORT:
+        break;
+    }
+
+    return ret;
+}
+
+static
+int raw_reopen_to_unlock(BDRVReopenState *state,
+                         RawReopenOperation op,
+                         BdrvLockfCmd old_lock,
+                         BdrvLockfCmd new_lock,
+                         Error **errp)
+{
+    BDRVRawState *s = state->bs->opaque;
+    int ret = 0;
+
+    assert(old_lock != new_lock);
+    assert(new_lock == BDRV_LOCKF_UNLOCK);
+    switch (op) {
+    case RAW_REOPEN_PREPARE:
+        break;
+    case RAW_REOPEN_COMMIT:
+        if (s->lock_fd >= 0) {
+            qemu_close(s->lock_fd);
+            s->lock_fd = -1;
+        }
+        break;
+    case RAW_REOPEN_ABORT:
+        break;
+    }
+
+    return ret;
+}
+
+static
+int raw_reopen_upgrade(BDRVReopenState *state,
+                       RawReopenOperation op,
+                       BdrvLockfCmd old_lock,
+                       BdrvLockfCmd new_lock,
+                       Error **errp)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+    int ret = 0, ret2;
+
+    assert(old_lock == BDRV_LOCKF_SHARED);
+    assert(new_lock == BDRV_LOCKF_EXCLUSIVE);
+    switch (op) {
+    case RAW_REOPEN_PREPARE:
+        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (shared)");
+            break;
+        }
+        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_UNLOCK);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to unlock old fd");
+            goto restore;
+        }
+        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_EXCLUSIVE);
+        if (ret) {
+            error_setg_errno(errp, -ret, "Failed to lock new fd (exclusive)");
+            goto restore;
+        }
+        break;
+    case RAW_REOPEN_COMMIT:
+        break;
+    case RAW_REOPEN_ABORT:
+        raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
+        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
+        if (ret) {
+            error_report("Failed to restore lock on old fd");
+        }
+        break;
+    }
+
+    return ret;
+restore:
+    ret2 = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
+    if (ret2) {
+        error_report("Failed to restore old lock");
+    }
+    return ret;
+
+}
+
+static
+int raw_reopen_downgrade(BDRVReopenState *state,
+                         RawReopenOperation op,
+                         BdrvLockfCmd old_lock,
+                         BdrvLockfCmd new_lock,
+                         Error **errp)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+    int ret;
+
+    assert(old_lock == BDRV_LOCKF_EXCLUSIVE);
+    assert(new_lock == BDRV_LOCKF_SHARED);
+    switch (op) {
+    case RAW_REOPEN_PREPARE:
+        break;
+    case RAW_REOPEN_COMMIT:
+        ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED);
+        if (ret) {
+            error_report("Failed to downgrade old lock");
+            break;
+        }
+        ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED);
+        if (ret) {
+            error_report("Failed to lock new fd");
+            break;
+        }
+        break;
+    case RAW_REOPEN_ABORT:
+        break;
+    }
+
+    return ret;
+}
+
+static const struct RawReopenFuncRecord {
+    BdrvLockfCmd old_lock;
+    BdrvLockfCmd new_lock;
+    RawReopenFunc func;
+    bool need_lock_fd;
+} reopen_functions[] = {
+    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false},
+    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true},
+    {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true},
+    {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
+    {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false},
+    {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true},
+    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false},
+    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true},
+    {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, false},
+};
+
+static int raw_reopen_handle_lock(BDRVReopenState *state,
+                                  RawReopenOperation op,
+                                  Error **errp)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+    BdrvLockfCmd old_lock, new_lock;
+    const struct RawReopenFuncRecord *rec;
+    int ret;
+
+    old_lock = bdrv_get_locking_cmd(bdrv_get_flags(state->bs));
+    new_lock = bdrv_get_locking_cmd(state->flags);
+
+    for (rec = &reopen_functions[0];
+         rec < &reopen_functions[ARRAY_SIZE(reopen_functions)];
+         rec++) {
+        if (rec->old_lock == old_lock && rec->new_lock == new_lock) {
+            break;
+        }
+    }
+    assert(rec != &reopen_functions[ARRAY_SIZE(reopen_functions)]);
+
+    switch (op) {
+    case RAW_REOPEN_PREPARE:
+        if (rec->need_lock_fd) {
+            ret = qemu_dup(raw_s->fd);
+            if (ret < 0) {
+                error_setg_errno(errp, -ret, "Failed to dup new fd");
+                return ret;
+            }
+            raw_s->lock_fd = ret;
+        }
+        return rec->func(state, op, old_lock, new_lock, errp);
+    case RAW_REOPEN_COMMIT:
+        rec->func(state, op, old_lock, new_lock, errp);
+        if (rec->need_lock_fd) {
+            if (s->lock_fd >= 0) {
+                qemu_close(s->lock_fd);
+            }
+            s->lock_fd = raw_s->lock_fd;
+        }
+        break;
+    case RAW_REOPEN_ABORT:
+        rec->func(state, op, old_lock, new_lock, errp);
+        if (rec->need_lock_fd && raw_s->lock_fd >= 0) {
+            qemu_close(raw_s->lock_fd);
+            raw_s->lock_fd = -1;
+        }
+        break;
+    }
+    return 0;
+}
+
 static int raw_reopen_prepare(BDRVReopenState *state,
                               BlockReopenQueue *queue, Error **errp)
 {
@@ -683,6 +952,10 @@  static int raw_reopen_prepare(BDRVReopenState *state,
         }
     }
 
+    if (!ret) {
+        ret = raw_reopen_handle_lock(state, RAW_REOPEN_PREPARE, errp);
+    }
+
     return ret;
 }
 
@@ -693,6 +966,8 @@  static void raw_reopen_commit(BDRVReopenState *state)
 
     s->open_flags = raw_s->open_flags;
 
+    raw_reopen_handle_lock(state, RAW_REOPEN_COMMIT, NULL);
+
     qemu_close(s->fd);
     s->fd = raw_s->fd;
 #ifdef CONFIG_LINUX_AIO
@@ -713,6 +988,8 @@  static void raw_reopen_abort(BDRVReopenState *state)
         return;
     }
 
+    raw_reopen_handle_lock(state, RAW_REOPEN_ABORT, NULL);
+
     if (raw_s->fd >= 0) {
         qemu_close(raw_s->fd);
         raw_s->fd = -1;
@@ -1385,6 +1662,10 @@  static void raw_close(BlockDriverState *bs)
         qemu_close(s->fd);
         s->fd = -1;
     }
+    if (s->lock_fd >= 0) {
+        qemu_close(s->lock_fd);
+        s->lock_fd = -1;
+    }
 }
 
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
@@ -1942,6 +2223,8 @@  BlockDriver bdrv_file = {
     .bdrv_detach_aio_context = raw_detach_aio_context,
     .bdrv_attach_aio_context = raw_attach_aio_context,
 
+    .bdrv_lockf = raw_lockf,
+
     .create_opts = &raw_create_opts,
 };
 
@@ -2396,6 +2679,8 @@  static BlockDriver bdrv_host_device = {
 #ifdef __linux__
     .bdrv_aio_ioctl     = hdev_aio_ioctl,
 #endif
+
+    .bdrv_lockf = raw_lockf,
 };
 
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
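
Finally, .bdrv_lockf is the only interface the patch exposes to the generic
block layer; both bdrv_file and bdrv_host_device point it at raw_lockf(). The
caller side is presumably expected to map the open flags to a BdrvLockfCmd and
invoke the hook around open/reopen/close. A rough sketch of such a caller in
QEMU context; the function name and the BDRV_O_NO_LOCK opt-out flag are
assumptions made for illustration, only the BdrvLockfCmd values and the
.bdrv_lockf field come from this series.

/* Hypothetical caller in block.c; not part of this patch.  Maps open
 * flags to a BdrvLockfCmd and forwards it to the protocol driver. */
static int bdrv_lock_image_sketch(BlockDriverState *bs, int flags)
{
    BdrvLockfCmd cmd;

    if (!bs->drv || !bs->drv->bdrv_lockf) {
        return 0;                    /* driver has no locking support */
    }

    if (flags & BDRV_O_NO_LOCK) {    /* assumed opt-out flag */
        cmd = BDRV_LOCKF_UNLOCK;
    } else if (flags & BDRV_O_RDWR) {
        cmd = BDRV_LOCKF_EXCLUSIVE;  /* writers take the exclusive byte-1 lock */
    } else {
        cmd = BDRV_LOCKF_SHARED;     /* read-only users share it */
    }

    return bs->drv->bdrv_lockf(bs, cmd);
}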