Message ID | 1464943756-14143-9-git-send-email-famz@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, 06/03 16:49, Fam Zheng wrote: > +static > +int raw_reopen_downgrade(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawReopenState *raw_s = state->opaque; > + BDRVRawState *s = state->bs->opaque; > + int ret; This should be initialized to 0 for the nop branches. Fam > + > + assert(old_lock == BDRV_LOCKF_EXCLUSIVE); > + assert(new_lock == BDRV_LOCKF_SHARED); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + break; > + case RAW_REOPEN_COMMIT: > + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to downgrade old lock"); > + break; > + } > + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to lock new fd"); > + break; > + } > + break; > + case RAW_REOPEN_ABORT: > + break; > + } > + > + return ret; > +}
Am 03.06.2016 um 10:49 hat Fam Zheng geschrieben: > virtlockd in libvirt locks the first byte, we lock byte 1 to avoid > the intervene. > > Both file and host device protocols are covered. > > The complication is with reopen. We have three different locking states, > namely "unlocked", "shared locked" and "exclusively locked". > > There have three different states, "unlocked", "shared locked" and "exclusively > locked". This seems to be a corrupted copy of the previous sentence. :-) > When we reopen, the new fd may need a new locking mode. Moving away to > or from exclusive is a bit tricky because we cannot do it atomically. This > patch solves it by dup() s->fd to s->lock_fd and avoid close(), so that there > isn't a racy window where we drop the lock on one fd before acquiring the > exclusive lock on the other. > > To make the logic easier to manage, and allow better reuse, the code is > internally organized by state transition table (old_lock -> new_lock). > > Signed-off-by: Fam Zheng <famz@redhat.com> I must admit that I don't fully understand yet why we can't change the lock atomically and how s->lock_fd helps. In any case, I think it deserves comments in the code and not only in the commit message. So I'm not giving a full review here, but I think I have one important point to make at least. 
> block/raw-posix.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 285 insertions(+) > > diff --git a/block/raw-posix.c b/block/raw-posix.c > index bb8669f..6347350 100644 > --- a/block/raw-posix.c > +++ b/block/raw-posix.c > @@ -133,6 +133,7 @@ do { \ > > typedef struct BDRVRawState { > int fd; > + int lock_fd; > int type; > int open_flags; > size_t buf_align; > @@ -153,6 +154,7 @@ typedef struct BDRVRawState { > > typedef struct BDRVRawReopenState { > int fd; > + int lock_fd; > int open_flags; > #ifdef CONFIG_LINUX_AIO > int use_aio; > @@ -397,6 +399,37 @@ static void raw_attach_aio_context(BlockDriverState *bs, > #endif > } > > +static int raw_lockf_fd(int fd, BdrvLockfCmd cmd) > +{ > + assert(fd >= 0); > + /* Locking byte 1 avoids interfereing with virtlockd. */ > + switch (cmd) { > + case BDRV_LOCKF_EXCLUSIVE: > + return qemu_lock_fd(fd, 1, 1, true); > + case BDRV_LOCKF_SHARED: > + return qemu_lock_fd(fd, 1, 1, false); > + case BDRV_LOCKF_UNLOCK: > + return qemu_unlock_fd(fd, 1, 1); > + default: > + abort(); > + } > +} > + > +static int raw_lockf(BlockDriverState *bs, BdrvLockfCmd cmd) > +{ > + > + BDRVRawState *s = bs->opaque; > + > + if (s->lock_fd < 0) { > + s->lock_fd = qemu_dup(s->fd); > + if (s->lock_fd < 0) { > + return s->lock_fd; > + } > + } > + > + return raw_lockf_fd(s->lock_fd, cmd); > +} > + > #ifdef CONFIG_LINUX_AIO > static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags) > { > @@ -483,6 +516,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, > raw_parse_flags(bdrv_flags, &s->open_flags); > > s->fd = -1; > + s->lock_fd = -1; > fd = qemu_open(filename, s->open_flags, 0644); > if (fd < 0) { > ret = -errno; > @@ -593,6 +627,241 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, > return ret; > } > > +typedef enum { > + RAW_REOPEN_PREPARE, > + RAW_REOPEN_COMMIT, > + RAW_REOPEN_ABORT > +} RawReopenOperation; > + > +typedef int 
(*RawReopenFunc)(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp); > + > +static > +int raw_reopen_identical(BDRVReopenState *state, This is unusual formatting. I'm used to having everything on a single line or "static int" on its own line, but breaking between "static" and "int" feels odd. > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + assert(old_lock == new_lock); > + return 0; > +} > + > +static > +int raw_reopen_from_unlock(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawReopenState *raw_s = state->opaque; > + int ret = 0; > + > + assert(old_lock != new_lock); > + assert(old_lock == BDRV_LOCKF_UNLOCK); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + ret = raw_lockf_fd(raw_s->lock_fd, new_lock); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to lock new fd"); > + } > + break; > + case RAW_REOPEN_COMMIT: > + case RAW_REOPEN_ABORT: > + break; > + } > + > + return ret; > +} > + > +static > +int raw_reopen_to_unlock(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawState *s = state->bs->opaque; > + int ret = 0; > + > + assert(old_lock != new_lock); > + assert(old_lock == BDRV_LOCKF_UNLOCK); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + break; > + case RAW_REOPEN_COMMIT: > + if (s->lock_fd >= 0) { > + qemu_close(s->lock_fd); > + s->lock_fd = -1; > + } > + break; > + case RAW_REOPEN_ABORT: > + break; > + } > + > + return ret; > +} > + > +static > +int raw_reopen_upgrade(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawReopenState *raw_s = state->opaque; > + BDRVRawState *s = state->bs->opaque; > + int ret = 0, ret2; > + > + assert(old_lock == 
BDRV_LOCKF_SHARED); > + assert(new_lock == BDRV_LOCKF_EXCLUSIVE); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to lock new fd (shared)"); > + break; > + } > + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_UNLOCK); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to unlock old fd"); > + goto restore; > + } > + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_EXCLUSIVE); > + if (ret) { > + error_setg_errno(errp, -ret, "Failed to lock new fd (exclusive)"); > + goto restore; > + } > + break; > + case RAW_REOPEN_COMMIT: > + break; > + case RAW_REOPEN_ABORT: > + raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); > + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to restore lock on old fd"); > + } > + break; > + } > + > + return ret; > +restore: > + ret2 = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret2) { > + error_report("Failed to restore old lock"); > + } > + return ret; > + > +} That final empty line doesn't look intentional. 
> + > +static > +int raw_reopen_downgrade(BDRVReopenState *state, > + RawReopenOperation op, > + BdrvLockfCmd old_lock, > + BdrvLockfCmd new_lock, > + Error **errp) > +{ > + BDRVRawReopenState *raw_s = state->opaque; > + BDRVRawState *s = state->bs->opaque; > + int ret; > + > + assert(old_lock == BDRV_LOCKF_EXCLUSIVE); > + assert(new_lock == BDRV_LOCKF_SHARED); > + switch (op) { > + case RAW_REOPEN_PREPARE: > + break; > + case RAW_REOPEN_COMMIT: > + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to downgrade old lock"); > + break; > + } > + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); > + if (ret) { > + error_report("Failed to lock new fd"); > + break; > + } > + break; > + case RAW_REOPEN_ABORT: > + break; > + } > + > + return ret; > +} > + > +static const struct RawReopenFuncRecord { > + BdrvLockfCmd old_lock; > + BdrvLockfCmd new_lock; > + RawReopenFunc func; > + bool need_lock_fd; > +} reopen_functions[] = { > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false}, > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true}, > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true}, > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false}, > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true}, > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true}, > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, false}, > +}; > + > +static int raw_reopen_handle_lock(BDRVReopenState *state, > + RawReopenOperation op, > + Error **errp) I think we have one big problem here: We don't know whether raw_s->fd is already locked or not. If dup() and setting the new flags with fcntl() succeeded, it is, but if we had to fall back on qemu_open(), it isn't. 
This means that doing nothing in the raw_reopen_identical case isn't right because reopening without intending to change anything about the locking could end up unlocking the image. Kevin
On Fri, 06/17 15:07, Kevin Wolf wrote: > Am 03.06.2016 um 10:49 hat Fam Zheng geschrieben: > > virtlockd in libvirt locks the first byte, we lock byte 1 to avoid > > the intervene. > > > > Both file and host device protocols are covered. > > > > The complication is with reopen. We have three different locking states, > > namely "unlocked", "shared locked" and "exclusively locked". > > > > There have three different states, "unlocked", "shared locked" and "exclusively > > locked". > > This seems to be a corrupted copy of the previous sentence. :-) Right, fixing. > > > When we reopen, the new fd may need a new locking mode. Moving away to > > or from exclusive is a bit tricky because we cannot do it atomically. This > > patch solves it by dup() s->fd to s->lock_fd and avoid close(), so that there > > isn't a racy window where we drop the lock on one fd before acquiring the > > exclusive lock on the other. > > > > To make the logic easier to manage, and allow better reuse, the code is > > internally organized by state transition table (old_lock -> new_lock). > > > > Signed-off-by: Fam Zheng <famz@redhat.com> > > I must admit that I don't fully understand yet why we can't change the > lock atomincally and how s->lock_fd helps. In any case, I think it > deserves comments in the code and not only in the commit message. I'll add comments in the code too. 
> > +static const struct RawReopenFuncRecord { > > + BdrvLockfCmd old_lock; > > + BdrvLockfCmd new_lock; > > + RawReopenFunc func; > > + bool need_lock_fd; > > +} reopen_functions[] = { > > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false}, > > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true}, > > + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true}, > > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, > > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false}, > > + {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true}, > > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, > > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true}, > > + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, false}, > > +}; > > + > > +static int raw_reopen_handle_lock(BDRVReopenState *state, > > + RawReopenOperation op, > > + Error **errp) > > I think we have one big problem here: We don't know whether raw_s->fd is > already locked or not. If dup() and setting the new flags with fcntl() > succeeded, it is, but if we had to fall back on qemu_open(), it isn't. > > This means that doing nothing in the raw_reopen_identical case isn't > right because reopening without intending to change anything about the > locking could end up unlocking the image. > Unless I'm missing something, we don't rely on that, because raw_s->fd is never locked. Instead, raw_s->lock_fd, as a dup() of raw_s->fd, is what we actually handle in raw_reopen_identical(), and it always has the correct state when raw_reopen_handle_lock() is called. (That is also an advantage of introducing raw_s->lock_fd.) Fam
diff --git a/block/raw-posix.c b/block/raw-posix.c index bb8669f..6347350 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -133,6 +133,7 @@ do { \ typedef struct BDRVRawState { int fd; + int lock_fd; int type; int open_flags; size_t buf_align; @@ -153,6 +154,7 @@ typedef struct BDRVRawState { typedef struct BDRVRawReopenState { int fd; + int lock_fd; int open_flags; #ifdef CONFIG_LINUX_AIO int use_aio; @@ -397,6 +399,37 @@ static void raw_attach_aio_context(BlockDriverState *bs, #endif } +static int raw_lockf_fd(int fd, BdrvLockfCmd cmd) +{ + assert(fd >= 0); + /* Locking byte 1 avoids interfereing with virtlockd. */ + switch (cmd) { + case BDRV_LOCKF_EXCLUSIVE: + return qemu_lock_fd(fd, 1, 1, true); + case BDRV_LOCKF_SHARED: + return qemu_lock_fd(fd, 1, 1, false); + case BDRV_LOCKF_UNLOCK: + return qemu_unlock_fd(fd, 1, 1); + default: + abort(); + } +} + +static int raw_lockf(BlockDriverState *bs, BdrvLockfCmd cmd) +{ + + BDRVRawState *s = bs->opaque; + + if (s->lock_fd < 0) { + s->lock_fd = qemu_dup(s->fd); + if (s->lock_fd < 0) { + return s->lock_fd; + } + } + + return raw_lockf_fd(s->lock_fd, cmd); +} + #ifdef CONFIG_LINUX_AIO static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags) { @@ -483,6 +516,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, raw_parse_flags(bdrv_flags, &s->open_flags); s->fd = -1; + s->lock_fd = -1; fd = qemu_open(filename, s->open_flags, 0644); if (fd < 0) { ret = -errno; @@ -593,6 +627,241 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, return ret; } +typedef enum { + RAW_REOPEN_PREPARE, + RAW_REOPEN_COMMIT, + RAW_REOPEN_ABORT +} RawReopenOperation; + +typedef int (*RawReopenFunc)(BDRVReopenState *state, + RawReopenOperation op, + BdrvLockfCmd old_lock, + BdrvLockfCmd new_lock, + Error **errp); + +static +int raw_reopen_identical(BDRVReopenState *state, + RawReopenOperation op, + BdrvLockfCmd old_lock, + BdrvLockfCmd new_lock, + Error **errp) +{ + assert(old_lock 
== new_lock); + return 0; +} + +static +int raw_reopen_from_unlock(BDRVReopenState *state, + RawReopenOperation op, + BdrvLockfCmd old_lock, + BdrvLockfCmd new_lock, + Error **errp) +{ + BDRVRawReopenState *raw_s = state->opaque; + int ret = 0; + + assert(old_lock != new_lock); + assert(old_lock == BDRV_LOCKF_UNLOCK); + switch (op) { + case RAW_REOPEN_PREPARE: + ret = raw_lockf_fd(raw_s->lock_fd, new_lock); + if (ret) { + error_setg_errno(errp, -ret, "Failed to lock new fd"); + } + break; + case RAW_REOPEN_COMMIT: + case RAW_REOPEN_ABORT: + break; + } + + return ret; +} + +static +int raw_reopen_to_unlock(BDRVReopenState *state, + RawReopenOperation op, + BdrvLockfCmd old_lock, + BdrvLockfCmd new_lock, + Error **errp) +{ + BDRVRawState *s = state->bs->opaque; + int ret = 0; + + assert(old_lock != new_lock); + assert(old_lock == BDRV_LOCKF_UNLOCK); + switch (op) { + case RAW_REOPEN_PREPARE: + break; + case RAW_REOPEN_COMMIT: + if (s->lock_fd >= 0) { + qemu_close(s->lock_fd); + s->lock_fd = -1; + } + break; + case RAW_REOPEN_ABORT: + break; + } + + return ret; +} + +static +int raw_reopen_upgrade(BDRVReopenState *state, + RawReopenOperation op, + BdrvLockfCmd old_lock, + BdrvLockfCmd new_lock, + Error **errp) +{ + BDRVRawReopenState *raw_s = state->opaque; + BDRVRawState *s = state->bs->opaque; + int ret = 0, ret2; + + assert(old_lock == BDRV_LOCKF_SHARED); + assert(new_lock == BDRV_LOCKF_EXCLUSIVE); + switch (op) { + case RAW_REOPEN_PREPARE: + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); + if (ret) { + error_setg_errno(errp, -ret, "Failed to lock new fd (shared)"); + break; + } + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_UNLOCK); + if (ret) { + error_setg_errno(errp, -ret, "Failed to unlock old fd"); + goto restore; + } + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_EXCLUSIVE); + if (ret) { + error_setg_errno(errp, -ret, "Failed to lock new fd (exclusive)"); + goto restore; + } + break; + case RAW_REOPEN_COMMIT: + break; + case RAW_REOPEN_ABORT: + 
raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); + if (ret) { + error_report("Failed to restore lock on old fd"); + } + break; + } + + return ret; +restore: + ret2 = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); + if (ret2) { + error_report("Failed to restore old lock"); + } + return ret; + +} + +static +int raw_reopen_downgrade(BDRVReopenState *state, + RawReopenOperation op, + BdrvLockfCmd old_lock, + BdrvLockfCmd new_lock, + Error **errp) +{ + BDRVRawReopenState *raw_s = state->opaque; + BDRVRawState *s = state->bs->opaque; + int ret; + + assert(old_lock == BDRV_LOCKF_EXCLUSIVE); + assert(new_lock == BDRV_LOCKF_SHARED); + switch (op) { + case RAW_REOPEN_PREPARE: + break; + case RAW_REOPEN_COMMIT: + ret = raw_lockf_fd(s->lock_fd, BDRV_LOCKF_SHARED); + if (ret) { + error_report("Failed to downgrade old lock"); + break; + } + ret = raw_lockf_fd(raw_s->lock_fd, BDRV_LOCKF_SHARED); + if (ret) { + error_report("Failed to lock new fd"); + break; + } + break; + case RAW_REOPEN_ABORT: + break; + } + + return ret; +} + +static const struct RawReopenFuncRecord { + BdrvLockfCmd old_lock; + BdrvLockfCmd new_lock; + RawReopenFunc func; + bool need_lock_fd; +} reopen_functions[] = { + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_UNLOCK, raw_reopen_identical, false}, + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_SHARED, raw_reopen_from_unlock, true}, + {BDRV_LOCKF_UNLOCK, BDRV_LOCKF_EXCLUSIVE, raw_reopen_from_unlock, true}, + {BDRV_LOCKF_SHARED, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, + {BDRV_LOCKF_SHARED, BDRV_LOCKF_SHARED, raw_reopen_identical, false}, + {BDRV_LOCKF_SHARED, BDRV_LOCKF_EXCLUSIVE, raw_reopen_upgrade, true}, + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_UNLOCK, raw_reopen_to_unlock, false}, + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_SHARED, raw_reopen_downgrade, true}, + {BDRV_LOCKF_EXCLUSIVE, BDRV_LOCKF_EXCLUSIVE, raw_reopen_identical, false}, +}; + +static int raw_reopen_handle_lock(BDRVReopenState *state, + RawReopenOperation op, + Error 
**errp) +{ + BDRVRawReopenState *raw_s = state->opaque; + BDRVRawState *s = state->bs->opaque; + BdrvLockfCmd old_lock, new_lock; + const struct RawReopenFuncRecord *rec; + int ret; + + old_lock = bdrv_get_locking_cmd(bdrv_get_flags(state->bs)); + new_lock = bdrv_get_locking_cmd(state->flags); + + for (rec = &reopen_functions[0]; + rec < &reopen_functions[ARRAY_SIZE(reopen_functions)]; + rec++) { + if (rec->old_lock == old_lock && rec->new_lock == new_lock) { + break; + } + } + assert(rec != &reopen_functions[ARRAY_SIZE(reopen_functions)]); + + switch (op) { + case RAW_REOPEN_PREPARE: + if (rec->need_lock_fd) { + ret = qemu_dup(raw_s->fd); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to dup new fd"); + return ret; + } + raw_s->lock_fd = ret; + } + return rec->func(state, op, old_lock, new_lock, errp); + case RAW_REOPEN_COMMIT: + rec->func(state, op, old_lock, new_lock, errp); + if (rec->need_lock_fd) { + if (s->lock_fd >= 0) { + qemu_close(s->lock_fd); + } + s->lock_fd = raw_s->lock_fd; + } + break; + case RAW_REOPEN_ABORT: + rec->func(state, op, old_lock, new_lock, errp); + if (rec->need_lock_fd && raw_s->lock_fd >= 0) { + qemu_close(raw_s->lock_fd); + raw_s->lock_fd = -1; + } + break; + } + return 0; +} + static int raw_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { @@ -683,6 +952,10 @@ static int raw_reopen_prepare(BDRVReopenState *state, } } + if (!ret) { + ret = raw_reopen_handle_lock(state, RAW_REOPEN_PREPARE, errp); + } + return ret; } @@ -693,6 +966,8 @@ static void raw_reopen_commit(BDRVReopenState *state) s->open_flags = raw_s->open_flags; + raw_reopen_handle_lock(state, RAW_REOPEN_COMMIT, NULL); + qemu_close(s->fd); s->fd = raw_s->fd; #ifdef CONFIG_LINUX_AIO @@ -713,6 +988,8 @@ static void raw_reopen_abort(BDRVReopenState *state) return; } + raw_reopen_handle_lock(state, RAW_REOPEN_ABORT, NULL); + if (raw_s->fd >= 0) { qemu_close(raw_s->fd); raw_s->fd = -1; @@ -1385,6 +1662,10 @@ static void 
raw_close(BlockDriverState *bs) qemu_close(s->fd); s->fd = -1; } + if (s->lock_fd >= 0) { + qemu_close(s->lock_fd); + s->lock_fd = -1; + } } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -1942,6 +2223,8 @@ BlockDriver bdrv_file = { .bdrv_detach_aio_context = raw_detach_aio_context, .bdrv_attach_aio_context = raw_attach_aio_context, + .bdrv_lockf = raw_lockf, + .create_opts = &raw_create_opts, }; @@ -2396,6 +2679,8 @@ static BlockDriver bdrv_host_device = { #ifdef __linux__ .bdrv_aio_ioctl = hdev_aio_ioctl, #endif + + .bdrv_lockf = raw_lockf, }; #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
virtlockd in libvirt locks the first byte, we lock byte 1 to avoid the intervene. Both file and host device protocols are covered. The complication is with reopen. We have three different locking states, namely "unlocked", "shared locked" and "exclusively locked". There have three different states, "unlocked", "shared locked" and "exclusively locked". When we reopen, the new fd may need a new locking mode. Moving away to or from exclusive is a bit tricky because we cannot do it atomically. This patch solves it by dup() s->fd to s->lock_fd and avoid close(), so that there isn't a racy window where we drop the lock on one fd before acquiring the exclusive lock on the other. To make the logic easier to manage, and allow better reuse, the code is internally organized by state transition table (old_lock -> new_lock). Signed-off-by: Fam Zheng <famz@redhat.com> --- block/raw-posix.c | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 285 insertions(+)