diff mbox series

[v5,01/10] block: add persistent reservation in/out api

Message ID 20240606122444.2914576-2-luchangqi.123@bytedance.com (mailing list archive)
State New, archived
Headers show
Series Support persistent reservation operations | expand

Commit Message

Changqi Lu June 6, 2024, 12:24 p.m. UTC
Add persistent reservation in/out operations
at the block level. The following operations
are included:

- read_keys:        retrieves the list of registered keys.
- read_reservation: retrieves the current reservation status.
- register:         registers a new reservation key.
- reserve:          initiates a reservation for a specific key.
- release:          releases a reservation for a specific key.
- clear:            clears all existing reservations.
- preempt:          preempts a reservation held by another key.

Signed-off-by: Changqi Lu <luchangqi.123@bytedance.com>
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
 block/block-backend.c             | 397 ++++++++++++++++++++++++++++++
 block/io.c                        | 163 ++++++++++++
 include/block/block-common.h      |  40 +++
 include/block/block-io.h          |  20 ++
 include/block/block_int-common.h  |  84 +++++++
 include/sysemu/block-backend-io.h |  24 ++
 6 files changed, 728 insertions(+)

Comments

Stefan Hajnoczi June 10, 2024, 5:26 p.m. UTC | #1
On Thu, Jun 06, 2024 at 08:24:35PM +0800, Changqi Lu wrote:
> Add persistent reservation in/out operations
> at the block level. The following operations
> are included:
> 
> - read_keys:        retrieves the list of registered keys.
> - read_reservation: retrieves the current reservation status.
> - register:         registers a new reservation key.
> - reserve:          initiates a reservation for a specific key.
> - release:          releases a reservation for a specific key.
> - clear:            clears all existing reservations.
> - preempt:          preempts a reservation held by another key.
> 
> Signed-off-by: Changqi Lu <luchangqi.123@bytedance.com>
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>  block/block-backend.c             | 397 ++++++++++++++++++++++++++++++
>  block/io.c                        | 163 ++++++++++++
>  include/block/block-common.h      |  40 +++
>  include/block/block-io.h          |  20 ++
>  include/block/block_int-common.h  |  84 +++++++
>  include/sysemu/block-backend-io.h |  24 ++
>  6 files changed, 728 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index db6f9b92a3..6707d94df7 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1770,6 +1770,403 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>      return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
>  }
>  
> +typedef struct BlkPrInCo {
> +    BlockBackend *blk;
> +    uint32_t *generation;
> +    uint32_t num_keys;
> +    BlockPrType *type;
> +    uint64_t *keys;
> +    int ret;
> +} BlkPrInCo;
> +
> +typedef struct BlkPrInCB {
> +    BlockAIOCB common;
> +    BlkPrInCo prco;
> +    bool has_returned;
> +} BlkPrInCB;
> +
> +static const AIOCBInfo blk_pr_in_aiocb_info = {
> +    .aiocb_size         = sizeof(BlkPrInCB),
> +};
> +
> +static void blk_pr_in_complete(BlkPrInCB *acb)
> +{
> +    if (acb->has_returned) {
> +        acb->common.cb(acb->common.opaque, acb->prco.ret);
> +        blk_dec_in_flight(acb->prco.blk);

Did you receive my replies to v1 of this patch series?

Please take a look at them and respond:
https://lore.kernel.org/qemu-devel/20240508093629.441057-1-luchangqi.123@bytedance.com/

Thanks,
Stefan

> +        qemu_aio_unref(acb);
> +    }
> +}
> +
> +static void blk_pr_in_complete_bh(void *opaque)
> +{
> +    BlkPrInCB *acb = opaque;
> +    assert(acb->has_returned);
> +    blk_pr_in_complete(acb);
> +}
> +
> +static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation,
> +                                 uint32_t num_keys, BlockPrType *type,
> +                                 uint64_t *keys, CoroutineEntry co_entry,
> +                                 BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkPrInCB *acb;
> +    Coroutine *co;
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque);
> +    acb->prco = (BlkPrInCo) {
> +        .blk        = blk,
> +        .generation = generation,
> +        .num_keys   = num_keys,
> +        .type       = type,
> +        .ret        = NOT_DONE,
> +        .keys       = keys,
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(co_entry, acb);
> +    aio_co_enter(qemu_get_current_aio_context(), co);
> +
> +    acb->has_returned = true;
> +    if (acb->prco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
> +                                         blk_pr_in_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> +static int coroutine_fn
> +blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation,
> +                        uint32_t num_keys, uint64_t *keys)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys);
> +}
> +
> +static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque)
> +{
> +    BlkPrInCB *acb = opaque;
> +    BlkPrInCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation,
> +                                        prco->num_keys, prco->keys);
> +    blk_pr_in_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
> +                                 uint32_t num_keys, uint64_t *keys,
> +                                 BlockCompletionFunc *cb, void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_in(blk, generation, num_keys, NULL, keys,
> +                         blk_aio_pr_read_keys_entry, cb, opaque);
> +}
> +
> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> +static int coroutine_fn
> +blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation,
> +                               uint64_t *key, BlockPrType *type)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type);
> +}
> +
> +static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque)
> +{
> +    BlkPrInCB *acb = opaque;
> +    BlkPrInCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation,
> +                                               prco->keys, prco->type);
> +    blk_pr_in_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
> +                                        uint64_t *key, BlockPrType *type,
> +                                        BlockCompletionFunc *cb, void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_in(blk, generation, 0, type, key,
> +                         blk_aio_pr_read_reservation_entry, cb, opaque);
> +}
> +
> +typedef struct BlkPrOutCo {
> +    BlockBackend *blk;
> +    uint64_t old_key;
> +    uint64_t new_key;
> +    bool ptpl;
> +    BlockPrType type;
> +    bool ignore_key;
> +    bool abort;
> +    int ret;
> +} BlkPrOutCo;
> +
> +typedef struct BlkPrOutCB {
> +    BlockAIOCB common;
> +    BlkPrOutCo prco;
> +    bool has_returned;
> +} BlkPrOutCB;
> +
> +static const AIOCBInfo blk_pr_out_aiocb_info = {
> +    .aiocb_size         = sizeof(BlkPrOutCB),
> +};
> +
> +static void blk_pr_out_complete(BlkPrOutCB *acb)
> +{
> +    if (acb->has_returned) {
> +        acb->common.cb(acb->common.opaque, acb->prco.ret);
> +        blk_dec_in_flight(acb->prco.blk);
> +        qemu_aio_unref(acb);
> +    }
> +}
> +
> +static void blk_pr_out_complete_bh(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    assert(acb->has_returned);
> +    blk_pr_out_complete(acb);
> +}
> +
> +static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key,
> +                                  uint64_t new_key, bool ptpl,
> +                                  BlockPrType type, bool ignore_key,
> +                                  bool abort, CoroutineEntry co_entry,
> +                                  BlockCompletionFunc *cb, void *opaque)
> +{
> +    BlkPrOutCB *acb;
> +    Coroutine *co;
> +
> +    blk_inc_in_flight(blk);
> +    acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque);
> +    acb->prco = (BlkPrOutCo) {
> +        .blk        = blk,
> +        .old_key    = old_key,
> +        .new_key    = new_key,
> +        .ptpl       = ptpl,
> +        .type       = type,
> +        .ignore_key = ignore_key,
> +        .abort      = abort,
> +        .ret        = NOT_DONE,
> +    };
> +    acb->has_returned = false;
> +
> +    co = qemu_coroutine_create(co_entry, acb);
> +    aio_co_enter(qemu_get_current_aio_context(), co);
> +
> +    acb->has_returned = true;
> +    if (acb->prco.ret != NOT_DONE) {
> +        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
> +                                         blk_pr_out_complete_bh, acb);
> +    }
> +
> +    return &acb->common;
> +}
> +
> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> +static int coroutine_fn
> +blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key,
> +                       uint64_t new_key, BlockPrType type,
> +                       bool ptpl, bool ignore_key)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type,
> +                               ptpl, ignore_key);
> +}
> +
> +static void coroutine_fn blk_aio_pr_register_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_register(prco->blk, prco->old_key, prco->new_key,
> +                                       prco->type, prco->ptpl,
> +                                       prco->ignore_key);
> +    blk_pr_out_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
> +                                uint64_t new_key, BlockPrType type,
> +                                bool ptpl, bool ignore_key,
> +                                BlockCompletionFunc *cb,
> +                                void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, old_key, new_key, ptpl, type, ignore_key, false,
> +                          blk_aio_pr_register_entry, cb, opaque);
> +}
> +
> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> +static int coroutine_fn
> +blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_reserve(blk_bs(blk), key, type);
> +}
> +
> +static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_reserve(prco->blk, prco->old_key,
> +                                      prco->type);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk,  uint64_t key,
> +                               BlockPrType type,
> +                               BlockCompletionFunc *cb,
> +                               void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, key, 0, false, type, false, false,
> +                          blk_aio_pr_reserve_entry, cb, opaque);
> +}
> +
> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> +static int coroutine_fn
> +blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_release(blk_bs(blk), key, type);
> +}
> +
> +static void coroutine_fn blk_aio_pr_release_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_release(prco->blk, prco->old_key, prco->type);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
> +                               BlockPrType type, BlockCompletionFunc *cb,
> +                               void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, key, 0, false, type, false, false,
> +                          blk_aio_pr_release_entry, cb, opaque);
> +}
> +
> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> +static int coroutine_fn
> +blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_clear(blk_bs(blk), key);
> +}
> +
> +static void coroutine_fn blk_aio_pr_clear_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_clear(prco->blk, prco->old_key);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
> +                             BlockCompletionFunc *cb, void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, key, 0, false, 0, false, false,
> +                          blk_aio_pr_clear_entry, cb, opaque);
> +}
> +
> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> +static int coroutine_fn
> +blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t cr_key,
> +                      uint64_t pr_key, BlockPrType type, bool abort)
> +{
> +    IO_CODE();
> +
> +    blk_wait_while_drained(blk);
> +    GRAPH_RDLOCK_GUARD();
> +
> +    if (!blk_co_is_available(blk)) {
> +        return -ENOMEDIUM;
> +    }
> +
> +    return bdrv_co_pr_preempt(blk_bs(blk), cr_key, pr_key, type, abort);
> +}
> +
> +static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque)
> +{
> +    BlkPrOutCB *acb = opaque;
> +    BlkPrOutCo *prco = &acb->prco;
> +
> +    prco->ret = blk_aio_pr_do_preempt(prco->blk, prco->old_key,
> +                                      prco->new_key, prco->type,
> +                                      prco->abort);
> +    blk_pr_out_complete(acb);
> +}
> +
> +
> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key,
> +                               uint64_t pr_key, BlockPrType type,
> +                               bool abort, BlockCompletionFunc *cb,
> +                               void *opaque)
> +{
> +    IO_CODE();
> +    return blk_aio_pr_out(blk, cr_key, pr_key, false, type, false, abort,
> +                          blk_aio_pr_preempt_entry, cb, opaque);
> +}
> +
>  /* To be called between exactly one pair of blk_inc/dec_in_flight() */
>  static int coroutine_fn
>  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
> diff --git a/block/io.c b/block/io.c
> index 7217cf811b..87a363c94f 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3220,6 +3220,169 @@ out:
>      return co.ret;
>  }
>  
> +int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs,
> +                 uint32_t *generation, uint32_t num_keys,
> +                 uint64_t *keys)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_read_keys) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs,
> +                 uint32_t *generation, uint64_t *key, BlockPrType *type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_read_reservation) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
> +                 uint64_t new_key, BlockPrType type, bool ptpl,
> +                 bool ignore_key)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_register) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type,
> +    ptpl, ignore_key);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key,
> +                                    BlockPrType type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_reserve) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_reserve(bs, key, type);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key,
> +                                    BlockPrType type)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_release) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_release(bs, key, type);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_clear) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_clear(bs, key);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
> +int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key,
> +                 uint64_t pr_key, BlockPrType type, bool abort)
> +{
> +    BlockDriver *drv = bs->drv;
> +    CoroutineIOCompletion co = {
> +        .coroutine = qemu_coroutine_self(),
> +    };
> +
> +    IO_CODE();
> +    assert_bdrv_graph_readable();
> +
> +    bdrv_inc_in_flight(bs);
> +    if (!drv || !drv->bdrv_co_pr_preempt) {
> +        co.ret = -ENOTSUP;
> +        goto out;
> +    }
> +
> +    co.ret = drv->bdrv_co_pr_preempt(bs, cr_key, pr_key, type, abort);
> +out:
> +    bdrv_dec_in_flight(bs);
> +    return co.ret;
> +}
> +
>  int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>                          unsigned int *nr_zones,
>                          BlockZoneDescriptor *zones)
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index a846023a09..7ca4e2328f 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -524,6 +524,46 @@ typedef enum {
>      BDRV_FIX_ERRORS   = 2,
>  } BdrvCheckMode;
>  
> +/**
> + *  According to the SCSI protocol (chapter 5.9 of SCSI Primary Commands - 4)
> + *  and the NVMe protocol (chapter 7.2 of NVMe Base Specification 2.0),
> + *  the persistent reservation types and persistent reservation
> + *  capabilities are abstracted here for the common block layer.
> + */
> +typedef enum {
> +    BLK_PR_WRITE_EXCLUSIVE              = 0x1,
> +    BLK_PR_EXCLUSIVE_ACCESS             = 0x2,
> +    BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY    = 0x3,
> +    BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY   = 0x4,
> +    BLK_PR_WRITE_EXCLUSIVE_ALL_REGS     = 0x5,
> +    BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS    = 0x6,
> +} BlockPrType;
> +
> +typedef enum BLKPrCap {
> +    /* Persist Through Power Loss */
> +    BLK_PR_CAP_PTPL = 1 << 0,
> +    /* Write Exclusive reservation type */
> +    BLK_PR_CAP_WR_EX = 1 << 1,
> +    /* Exclusive Access reservation type */
> +    BLK_PR_CAP_EX_AC = 1 << 2,
> +    /* Write Exclusive Registrants Only reservation type */
> +    BLK_PR_CAP_WR_EX_RO = 1 << 3,
> +    /* Exclusive Access Registrants Only reservation type */
> +    BLK_PR_CAP_EX_AC_RO = 1 << 4,
> +    /* Write Exclusive All Registrants reservation type */
> +    BLK_PR_CAP_WR_EX_AR = 1 << 5,
> +    /* Exclusive Access All Registrants reservation type */
> +    BLK_PR_CAP_EX_AC_AR = 1 << 6,
> +
> +    BLK_PR_CAP_ALL = (BLK_PR_CAP_PTPL |
> +                     BLK_PR_CAP_WR_EX |
> +                     BLK_PR_CAP_EX_AC |
> +                     BLK_PR_CAP_WR_EX_RO |
> +                     BLK_PR_CAP_EX_AC_RO |
> +                     BLK_PR_CAP_WR_EX_AR |
> +                     BLK_PR_CAP_EX_AC_AR),
> +} BLKPrCap;
> +
>  typedef struct BlockSizes {
>      uint32_t phys;
>      uint32_t log;
> diff --git a/include/block/block-io.h b/include/block/block-io.h
> index b49e0537dd..908361862b 100644
> --- a/include/block/block-io.h
> +++ b/include/block/block-io.h
> @@ -106,6 +106,26 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb);
>  int coroutine_fn GRAPH_RDLOCK
>  bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
>  
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation,
> +                     uint32_t num_keys, uint64_t *keys);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation,
> +                            uint64_t *key, BlockPrType *type);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
> +                    uint64_t new_key, BlockPrType type,
> +                    bool ptpl, bool ignore_key);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key);
> +int coroutine_fn GRAPH_RDLOCK
> +bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, uint64_t pr_key,
> +                   BlockPrType type, bool abort);
> +
>  /* Ensure contents are flushed to disk.  */
>  int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
>  
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 761276127e..6e628069e9 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -766,6 +766,87 @@ struct BlockDriver {
>      int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)(
>          BlockDriverState *bs, unsigned long int req, void *buf);
>  
> +    /*
> +     * Persistent reservation series api.
> +     * Please refer to chapter 5.9 of SCSI Primary Commands - 4 or
> +     * chapter 7 of NVMe Base Specification 2.0.
> +     *
> +     * The block layer driver should implement all the following APIs
> +     * or none at all, including: bdrv_co_pr_read_keys,
> +     * bdrv_co_pr_read_reservation, bdrv_co_pr_register,
> +     * bdrv_co_pr_reserve, bdrv_co_pr_release,
> +     * bdrv_co_pr_clear and bdrv_co_pr_preempt.
> +     *
> +     * Read the registered keys and return them in the @keys.
> +     * @generation: The generation of the reservation key.
> +     * @num_keys:   The maximum number of keys that can be transmitted.
> +     * @keys:       Registered keys array.
> +     *
> +     * On success, store generation in @generation and store keys in @keys
> +     * and return the number of keys.
> +     * On failure return -errno.
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)(
> +        BlockDriverState *bs, uint32_t *generation,
> +        uint32_t num_keys, uint64_t *keys);
> +    /*
> +     * Read the reservation key and store it in the @key.
> +     * @generation: The generation of the reservation key.
> +     * @key:        The reservation key.
> +     * @type:       Type of the reservation key.
> +     *
> +     * On success, store generation in @generation, store the
> +     * reservation key in @key and return the number of keys stored,
> +     * which is used to determine whether the reservation key exists.
> +     * On failure return -errno.
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)(
> +        BlockDriverState *bs, uint32_t *generation,
> +        uint64_t *key, BlockPrType *type);
> +    /*
> +     * Register, unregister, or replace a reservation key.
> +     * @old_key:    The current reservation key associated with the host.
> +     * @new_key:    The new reservation Key.
> +     * @type:       Type of the reservation key.
> +     * @ignore_key: Ignore or not @old_key.
> +     * @ptpl:       Whether to support Persist Through Power Loss(PTPL).
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)(
> +        BlockDriverState *bs, uint64_t old_key,
> +        uint64_t new_key, BlockPrType type,
> +        bool ptpl, bool ignore_key);
> +    /*
> +     * Acquire a reservation on a host.
> +     * @key:  The current reservation key associated with the host.
> +     * @type: Type of the reservation key.
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)(
> +        BlockDriverState *bs, uint64_t key, BlockPrType type);
> +    /*
> +     * Release a reservation on a host.
> +     * @key:  The current reservation key associated with the host.
> +     * @type: Type of the reservation key.
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)(
> +        BlockDriverState *bs, uint64_t key, BlockPrType type);
> +    /**
> +     * Clear reservations on a host.
> +     * @key: The current reservation key associated with the host.
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)(
> +        BlockDriverState *bs, uint64_t key);
> +    /*
> +     * Preempt a reservation held on a host.
> +     * @cr_key: The current reservation key associated with the host.
> +     * @pr_key: The preempted reservation key, which is to be
> +     *          unregistered from the namespace.
> +     * @type:   Type of the reservation key.
> +     * @abort:  Whether to abort a reservation held on a host.
> +     */
> +    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)(
> +        BlockDriverState *bs, uint64_t cr_key,
> +        uint64_t pr_key, BlockPrType type, bool abort);
> +
>      /*
>       * Returns 0 for completed check, -errno for internal errors.
>       * The check results are stored in result.
> @@ -899,6 +980,9 @@ typedef struct BlockLimits {
>      uint32_t max_active_zones;
>  
>      uint32_t write_granularity;
> +
> +    /* Persistent reservation capabilities. */
> +    uint8_t pr_cap;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index d174275a5c..b3d49a3c6f 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -62,6 +62,30 @@ void blk_aio_cancel_async(BlockAIOCB *acb);
>  BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>                            BlockCompletionFunc *cb, void *opaque);
>  
> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
> +                                 uint32_t num_keys, uint64_t *keys,
> +                                 BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
> +                                        uint64_t *key, BlockPrType *type,
> +                                        BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
> +                                uint64_t new_key, BlockPrType type,
> +                                bool ptpl, bool ignore_key,
> +                                BlockCompletionFunc *cb,
> +                                void *opaque);
> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk,  uint64_t key,
> +                               BlockPrType type,
> +                               BlockCompletionFunc *cb,
> +                               void *opaque);
> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
> +                               BlockPrType type, BlockCompletionFunc *cb,
> +                               void *opaque);
> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
> +                             BlockCompletionFunc *cb, void *opaque);
> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key,
> +                               uint64_t pr_key, BlockPrType type, bool abort,
> +                               BlockCompletionFunc *cb, void *opaque);
> +
>  void blk_inc_in_flight(BlockBackend *blk);
>  void blk_dec_in_flight(BlockBackend *blk);
>  
> -- 
> 2.20.1
>
Changqi Lu June 11, 2024, 2:45 a.m. UTC | #2
Hi,

Thanks for your advice! I will add it.

On 2024/6/11 01:26, Stefan Hajnoczi wrote:
> On Thu, Jun 06, 2024 at 08:24:35PM +0800, Changqi Lu wrote:
>> Add persistent reservation in/out operations
>> at the block level. The following operations
>> are included:
>>
>> - read_keys: retrieves the list of registered keys.
>> - read_reservation: retrieves the current reservation status.
>> - register: registers a new reservation key.
>> - reserve: initiates a reservation for a specific key.
>> - release: releases a reservation for a specific key.
>> - clear: clears all existing reservations.
>> - preempt: preempts a reservation held by another key.
>>
>> Signed-off-by: Changqi Lu
>> Signed-off-by: zhenwei pi
>> ---
>> block/block-backend.c | 397 ++++++++++++++++++++++++++++++
>> block/io.c | 163 ++++++++++++
>> include/block/block-common.h | 40 +++
>> include/block/block-io.h | 20 ++
>> include/block/block_int-common.h | 84 +++++++
>> include/sysemu/block-backend-io.h | 24 ++
>> 6 files changed, 728 insertions(+)
>>
>> diff --git a/block/block-backend.c b/block/block-backend.c
>> index db6f9b92a3..6707d94df7 100644
>> --- a/block/block-backend.c
>> +++ b/block/block-backend.c
>> @@ -1770,6 +1770,403 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>> return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
>> }
>>
>> +typedef struct BlkPrInCo {
>> + BlockBackend *blk;
>> + uint32_t *generation;
>> + uint32_t num_keys;
>> + BlockPrType *type;
>> + uint64_t *keys;
>> + int ret;
>> +} BlkPrInCo;
>> +
>> +typedef struct BlkPrInCB {
>> + BlockAIOCB common;
>> + BlkPrInCo prco;
>> + bool has_returned;
>> +} BlkPrInCB;
>> +
>> +static const AIOCBInfo blk_pr_in_aiocb_info = {
>> + .aiocb_size = sizeof(BlkPrInCB),
>> +};
>> +
>> +static void blk_pr_in_complete(BlkPrInCB *acb)
>> +{
>> + if (acb->has_returned) {
>> + acb->common.cb(acb->common.opaque, acb->prco.ret);
>> + blk_dec_in_flight(acb->prco.blk);
>
> Did you receive my replies to v1 of this patch series?
>
> Please take a look at them and respond:
>
> https://lore.kernel.org/qemu-devel/20240508093629.441057-1-luchangqi.123@bytedance.com/
>
> Thanks,
> Stefan
>
>> + qemu_aio_unref(acb);
>> + }
>> +}
>> +
>> +static void blk_pr_in_complete_bh(void *opaque)
>> +{
>> + BlkPrInCB *acb = opaque;
>> + assert(acb->has_returned);
>> + blk_pr_in_complete(acb);
>> +}
>> +
>> +static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation,
>> + uint32_t num_keys, BlockPrType *type,
>> + uint64_t *keys, CoroutineEntry co_entry,
>> + BlockCompletionFunc *cb, void *opaque)
>> +{
>> + BlkPrInCB *acb;
>> + Coroutine *co;
>> +
>> + blk_inc_in_flight(blk);
>> + acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque);
>> + acb->prco = (BlkPrInCo) {
>> + .blk = blk,
>> + .generation = generation,
>> + .num_keys = num_keys,
>> + .type = type,
>> + .ret = NOT_DONE,
>> + .keys = keys,
>> + };
>> + acb->has_returned = false;
>> +
>> + co = qemu_coroutine_create(co_entry, acb);
>> + aio_co_enter(qemu_get_current_aio_context(), co);
>> +
>> + acb->has_returned = true;
>> + if (acb->prco.ret != NOT_DONE) {
>> + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
>> + blk_pr_in_complete_bh, acb);
>> + }
>> +
>> + return &acb->common;
>> +}
>> +
>> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> +static int coroutine_fn
>> +blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation,
>> + uint32_t num_keys, uint64_t *keys)
>> +{
>> + IO_CODE();
>> +
>> + blk_wait_while_drained(blk);
>> + GRAPH_RDLOCK_GUARD();
>> +
>> + if (!blk_co_is_available(blk)) {
>> + return -ENOMEDIUM;
>> + }
>> +
>> + return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys);
>> +}
>> +
>> +static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque)
>> +{
>> + BlkPrInCB *acb = opaque;
>> + BlkPrInCo *prco = &acb->prco;
>> +
>> + prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation,
>> + prco->num_keys, prco->keys);
>> + blk_pr_in_complete(acb);
>> +}
>> +
>> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
>> + uint32_t num_keys, uint64_t *keys,
>> + BlockCompletionFunc *cb, void *opaque)
>> +{
>> + IO_CODE();
>> + return blk_aio_pr_in(blk, generation, num_keys, NULL, keys,
>> + blk_aio_pr_read_keys_entry, cb, opaque);
>> +}
>> +
>> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> +static int coroutine_fn
>> +blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation,
>> + uint64_t *key, BlockPrType *type)
>> +{
>> + IO_CODE();
>> +
>> + blk_wait_while_drained(blk);
>> + GRAPH_RDLOCK_GUARD();
>> +
>> + if (!blk_co_is_available(blk)) {
>> + return -ENOMEDIUM;
>> + }
>> +
>> + return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type);
>> +}
>> +
>> +static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque)
>> +{
>> + BlkPrInCB *acb = opaque;
>> + BlkPrInCo *prco = &acb->prco;
>> +
>> + prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation,
>> + prco->keys, prco->type);
>> + blk_pr_in_complete(acb);
>> +}
>> +
>> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
>> + uint64_t *key, BlockPrType *type,
>> + BlockCompletionFunc *cb, void *opaque)
>> +{
>> + IO_CODE();
>> + return blk_aio_pr_in(blk, generation, 0, type, key,
>> + blk_aio_pr_read_reservation_entry, cb, opaque);
>> +}
>> +
>> +typedef struct BlkPrOutCo {
>> + BlockBackend *blk;
>> + uint64_t old_key;
>> + uint64_t new_key;
>> + bool ptpl;
>> + BlockPrType type;
>> + bool ignore_key;
>> + bool abort;
>> + int ret;
>> +} BlkPrOutCo;
>> +
>> +typedef struct BlkPrOutCB {
>> + BlockAIOCB common;
>> + BlkPrOutCo prco;
>> + bool has_returned;
>> +} BlkPrOutCB;
>> +
>> +static const AIOCBInfo blk_pr_out_aiocb_info = {
>> + .aiocb_size = sizeof(BlkPrOutCB),
>> +};
>> +
>> +static void blk_pr_out_complete(BlkPrOutCB *acb)
>> +{
>> + if (acb->has_returned) {
>> + acb->common.cb(acb->common.opaque, acb->prco.ret);
>> + blk_dec_in_flight(acb->prco.blk);
>> + qemu_aio_unref(acb);
>> + }
>> +}
>> +
>> +static void blk_pr_out_complete_bh(void *opaque)
>> +{
>> + BlkPrOutCB *acb = opaque;
>> + assert(acb->has_returned);
>> + blk_pr_out_complete(acb);
>> +}
>> +
>> +static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key,
>> + uint64_t new_key, bool ptpl,
>> + BlockPrType type, bool ignore_key,
>> + bool abort, CoroutineEntry co_entry,
>> + BlockCompletionFunc *cb, void *opaque)
>> +{
>> + BlkPrOutCB *acb;
>> + Coroutine *co;
>> +
>> + blk_inc_in_flight(blk);
>> + acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque);
>> + acb->prco = (BlkPrOutCo) {
>> + .blk = blk,
>> + .old_key = old_key,
>> + .new_key = new_key,
>> + .ptpl = ptpl,
>> + .type = type,
>> + .ignore_key = ignore_key,
>> + .abort = abort,
>> + .ret = NOT_DONE,
>> + };
>> + acb->has_returned = false;
>> +
>> + co = qemu_coroutine_create(co_entry, acb);
>> + aio_co_enter(qemu_get_current_aio_context(), co);
>> +
>> + acb->has_returned = true;
>> + if (acb->prco.ret != NOT_DONE) {
>> + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
>> + blk_pr_out_complete_bh, acb);
>> + }
>> +
>> + return &acb->common;
>> +}
>> +
>> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> +static int coroutine_fn
>> +blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key,
>> + uint64_t new_key, BlockPrType type,
>> + bool ptpl, bool ignore_key)
>> +{
>> + IO_CODE();
>> +
>> + blk_wait_while_drained(blk);
>> + GRAPH_RDLOCK_GUARD();
>> +
>> + if (!blk_co_is_available(blk)) {
>> + return -ENOMEDIUM;
>> + }
>> +
>> + return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type,
>> + ptpl, ignore_key);
>> +}
>> +
>> +static void coroutine_fn blk_aio_pr_register_entry(void *opaque)
>> +{
>> + BlkPrOutCB *acb = opaque;
>> + BlkPrOutCo *prco = &acb->prco;
>> +
>> + prco->ret = blk_aio_pr_do_register(prco->blk, prco->old_key, prco->new_key,
>> + prco->type, prco->ptpl,
>> + prco->ignore_key);
>> + blk_pr_out_complete(acb);
>> +}
>> +
>> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
>> + uint64_t new_key, BlockPrType type,
>> + bool ptpl, bool ignore_key,
>> + BlockCompletionFunc *cb,
>> + void *opaque)
>> +{
>> + IO_CODE();
>> + return blk_aio_pr_out(blk, old_key, new_key, ptpl, type, ignore_key, false,
>> + blk_aio_pr_register_entry, cb, opaque);
>> +}
>> +
>> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> +static int coroutine_fn
>> +blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type)
>> +{
>> + IO_CODE();
>> +
>> + blk_wait_while_drained(blk);
>> + GRAPH_RDLOCK_GUARD();
>> +
>> + if (!blk_co_is_available(blk)) {
>> + return -ENOMEDIUM;
>> + }
>> +
>> + return bdrv_co_pr_reserve(blk_bs(blk), key, type);
>> +}
>> +
>> +static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque)
>> +{
>> + BlkPrOutCB *acb = opaque;
>> + BlkPrOutCo *prco = &acb->prco;
>> +
>> + prco->ret = blk_aio_pr_do_reserve(prco->blk, prco->old_key,
>> + prco->type);
>> + blk_pr_out_complete(acb);
>> +}
>> +
>> +
>> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key,
>> + BlockPrType type,
>> + BlockCompletionFunc *cb,
>> + void *opaque)
>> +{
>> + IO_CODE();
>> + return blk_aio_pr_out(blk, key, 0, false, type, false, false,
>> + blk_aio_pr_reserve_entry, cb, opaque);
>> +}
>> +
>> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> +static int coroutine_fn
>> +blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type)
>> +{
>> + IO_CODE();
>> +
>> + blk_wait_while_drained(blk);
>> + GRAPH_RDLOCK_GUARD();
>> +
>> + if (!blk_co_is_available(blk)) {
>> + return -ENOMEDIUM;
>> + }
>> +
>> + return bdrv_co_pr_release(blk_bs(blk), key, type);
>> +}
>> +
>> +static void coroutine_fn blk_aio_pr_release_entry(void *opaque)
>> +{
>> + BlkPrOutCB *acb = opaque;
>> + BlkPrOutCo *prco = &acb->prco;
>> +
>> + prco->ret = blk_aio_pr_do_release(prco->blk, prco->old_key, prco->type);
>> + blk_pr_out_complete(acb);
>> +}
>> +
>> +
>> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
>> + BlockPrType type, BlockCompletionFunc *cb,
>> + void *opaque)
>> +{
>> + IO_CODE();
>> + return blk_aio_pr_out(blk, key, 0, false, type, false, false,
>> + blk_aio_pr_release_entry, cb, opaque);
>> +}
>> +
>> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> +static int coroutine_fn
>> +blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key)
>> +{
>> + IO_CODE();
>> +
>> + blk_wait_while_drained(blk);
>> + GRAPH_RDLOCK_GUARD();
>> +
>> + if (!blk_co_is_available(blk)) {
>> + return -ENOMEDIUM;
>> + }
>> +
>> + return bdrv_co_pr_clear(blk_bs(blk), key);
>> +}
>> +
>> +static void coroutine_fn blk_aio_pr_clear_entry(void *opaque)
>> +{
>> + BlkPrOutCB *acb = opaque;
>> + BlkPrOutCo *prco = &acb->prco;
>> +
>> + prco->ret = blk_aio_pr_do_clear(prco->blk, prco->old_key);
>> + blk_pr_out_complete(acb);
>> +}
>> +
>> +
>> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
>> + BlockCompletionFunc *cb, void *opaque)
>> +{
>> + IO_CODE();
>> + return blk_aio_pr_out(blk, key, 0, false, 0, false, false,
>> + blk_aio_pr_clear_entry, cb, opaque);
>> +}
>> +
>> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> +static int coroutine_fn
>> +blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t cr_key,
>> + uint64_t pr_key, BlockPrType type, bool abort)
>> +{
>> + IO_CODE();
>> +
>> + blk_wait_while_drained(blk);
>> + GRAPH_RDLOCK_GUARD();
>> +
>> + if (!blk_co_is_available(blk)) {
>> + return -ENOMEDIUM;
>> + }
>> +
>> + return bdrv_co_pr_preempt(blk_bs(blk), cr_key, pr_key, type, abort);
>> +}
>> +
>> +static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque)
>> +{
>> + BlkPrOutCB *acb = opaque;
>> + BlkPrOutCo *prco = &acb->prco;
>> +
>> + prco->ret = blk_aio_pr_do_preempt(prco->blk, prco->old_key,
>> + prco->new_key, prco->type,
>> + prco->abort);
>> + blk_pr_out_complete(acb);
>> +}
>> +
>> +
>> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key,
>> + uint64_t pr_key, BlockPrType type,
>> + bool abort, BlockCompletionFunc *cb,
>> + void *opaque)
>> +{
>> + IO_CODE();
>> + return blk_aio_pr_out(blk, cr_key, pr_key, false, type, false, abort,
>> + blk_aio_pr_preempt_entry, cb, opaque);
>> +}
>> +
>> /* To be called between exactly one pair of blk_inc/dec_in_flight() */
>> static int coroutine_fn
>> blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
>> diff --git a/block/io.c b/block/io.c
>> index 7217cf811b..87a363c94f 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -3220,6 +3220,169 @@ out:
>> return co.ret;
>> }
>>
>> +int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs,
>> + uint32_t *generation, uint32_t num_keys,
>> + uint64_t *keys)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> +
>> + IO_CODE();
>> + assert_bdrv_graph_readable();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_pr_read_keys) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> +
>> + co.ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> +int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs,
>> + uint32_t *generation, uint64_t *key, BlockPrType *type)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> +
>> + IO_CODE();
>> + assert_bdrv_graph_readable();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_pr_read_reservation) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> +
>> + co.ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> +int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
>> + uint64_t new_key, BlockPrType type, bool ptpl,
>> + bool ignore_key)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> +
>> + IO_CODE();
>> + assert_bdrv_graph_readable();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_pr_register) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> +
>> + co.ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type,
>> + ptpl, ignore_key);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> +int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key,
>> + BlockPrType type)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> +
>> + IO_CODE();
>> + assert_bdrv_graph_readable();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_pr_reserve) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> +
>> + co.ret = drv->bdrv_co_pr_reserve(bs, key, type);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> +int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key,
>> + BlockPrType type)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> +
>> + IO_CODE();
>> + assert_bdrv_graph_readable();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_pr_release) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> +
>> + co.ret = drv->bdrv_co_pr_release(bs, key, type);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> +int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> +
>> + IO_CODE();
>> + assert_bdrv_graph_readable();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_pr_clear) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> +
>> + co.ret = drv->bdrv_co_pr_clear(bs, key);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> +int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key,
>> + uint64_t pr_key, BlockPrType type, bool abort)
>> +{
>> + BlockDriver *drv = bs->drv;
>> + CoroutineIOCompletion co = {
>> + .coroutine = qemu_coroutine_self(),
>> + };
>> +
>> + IO_CODE();
>> + assert_bdrv_graph_readable();
>> +
>> + bdrv_inc_in_flight(bs);
>> + if (!drv || !drv->bdrv_co_pr_preempt) {
>> + co.ret = -ENOTSUP;
>> + goto out;
>> + }
>> +
>> + co.ret = drv->bdrv_co_pr_preempt(bs, cr_key, pr_key, type, abort);
>> +out:
>> + bdrv_dec_in_flight(bs);
>> + return co.ret;
>> +}
>> +
>> int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>> unsigned int *nr_zones,
>> BlockZoneDescriptor *zones)
>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>> index a846023a09..7ca4e2328f 100644
>> --- a/include/block/block-common.h
>> +++ b/include/block/block-common.h
>> @@ -524,6 +524,46 @@ typedef enum {
>> BDRV_FIX_ERRORS = 2,
>> } BdrvCheckMode;
>>
>> +/**
>> + * According SCSI protocol(chapter 5.9 of SCSI Primary Commands - 4)
>> + * and NVMe protocol(chapter 7.2 of NVMe Base Specification 2.0),
>> + * the persistent reservation types and persistent capabilities of
>> + * the public layer block are abstracted.
>> + */
>> +typedef enum {
>> + BLK_PR_WRITE_EXCLUSIVE = 0x1,
>> + BLK_PR_EXCLUSIVE_ACCESS = 0x2,
>> + BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY = 0x3,
>> + BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY = 0x4,
>> + BLK_PR_WRITE_EXCLUSIVE_ALL_REGS = 0x5,
>> + BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS = 0x6,
>> +} BlockPrType;
>> +
>> +typedef enum BLKPrCap {
>> + /* Persist Through Power Loss */
>> + BLK_PR_CAP_PTPL = 1 << 0,
>> + /* Write Exclusive reservation type */
>> + BLK_PR_CAP_WR_EX = 1 << 1,
>> + /* Exclusive Access reservation type */
>> + BLK_PR_CAP_EX_AC = 1 << 2,
>> + /* Write Exclusive Registrants Only reservation type */
>> + BLK_PR_CAP_WR_EX_RO = 1 << 3,
>> + /* Exclusive Access Registrants Only reservation type */
>> + BLK_PR_CAP_EX_AC_RO = 1 << 4,
>> + /* Write Exclusive All Registrants reservation type */
>> + BLK_PR_CAP_WR_EX_AR = 1 << 5,
>> + /* Exclusive Access All Registrants reservation type */
>> + BLK_PR_CAP_EX_AC_AR = 1 << 6,
>> +
>> + BLK_PR_CAP_ALL = (BLK_PR_CAP_PTPL |
>> + BLK_PR_CAP_WR_EX |
>> + BLK_PR_CAP_EX_AC |
>> + BLK_PR_CAP_WR_EX_RO |
>> + BLK_PR_CAP_EX_AC_RO |
>> + BLK_PR_CAP_WR_EX_AR |
>> + BLK_PR_CAP_EX_AC_AR),
>> +} BLKPrCap;
>> +
>> typedef struct BlockSizes {
>> uint32_t phys;
>> uint32_t log;
>> diff --git a/include/block/block-io.h b/include/block/block-io.h
>> index b49e0537dd..908361862b 100644
>> --- a/include/block/block-io.h
>> +++ b/include/block/block-io.h
>> @@ -106,6 +106,26 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb);
>> int coroutine_fn GRAPH_RDLOCK
>> bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
>>
>> +int coroutine_fn GRAPH_RDLOCK
>> +bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation,
>> + uint32_t num_keys, uint64_t *keys);
>> +int coroutine_fn GRAPH_RDLOCK
>> +bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation,
>> + uint64_t *key, BlockPrType *type);
>> +int coroutine_fn GRAPH_RDLOCK
>> +bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
>> + uint64_t new_key, BlockPrType type,
>> + bool ptpl, bool ignore_key);
>> +int coroutine_fn GRAPH_RDLOCK
>> +bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type);
>> +int coroutine_fn GRAPH_RDLOCK
>> +bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type);
>> +int coroutine_fn GRAPH_RDLOCK
>> +bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key);
>> +int coroutine_fn GRAPH_RDLOCK
>> +bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, uint64_t pr_key,
>> + BlockPrType type, bool abort);
>> +
>> /* Ensure contents are flushed to disk. */
>> int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
>>
>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>> index 761276127e..6e628069e9 100644
>> --- a/include/block/block_int-common.h
>> +++ b/include/block/block_int-common.h
>> @@ -766,6 +766,87 @@ struct BlockDriver {
>> int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)(
>> BlockDriverState *bs, unsigned long int req, void *buf);
>>
>> + /*
>> + * Persistent reservation series api.
>> + * Please refer to chapter 5.9 of SCSI Primary Commands - 4 or
>> + * chapter 7 of NVMe Base Specification 2.0.
>> + *
>> + * The block layer driver should implement all the following APIs
>> + * or none at all, including: bdrv_co_pr_read_keys,
>> + * bdrv_co_pr_read_reservation, bdrv_co_pr_register,
>> + * bdrv_co_pr_reserve, bdrv_co_pr_release,
>> + * bdrv_co_pr_clear and bdrv_co_pr_preempt.
>> + *
>> + * Read the registered keys and return them in the @keys.
>> + * @generation: The generation of the reservation key.
>> + * @num_keys: The maximum number of keys that can be transmitted.
>> + * @keys: Registered keys array.
>> + *
>> + * On success, store generation in @generation and store keys @keys
>> + * and return the number of @keys.
>> + * On failure return -errno.
>> + */
>> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)(
>> + BlockDriverState *bs, uint32_t *generation,
>> + uint32_t num_keys, uint64_t *keys);
>> + /*
>> + * Read the reservation key and store it in the @key.
>> + * @generation: The generation of the reservation key.
>> + * @key: The reservation key.
>> + * @type: Type of the reservation key.
>> + *
>> + * On success, store generation in @generation, store the
>> + * reservation key in @key and return the number of @key
>> + * which used to determine whether the reservation key exists.
>> + * On failure return -errno.
>> + */
>> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)(
>> + BlockDriverState *bs, uint32_t *generation,
>> + uint64_t *key, BlockPrType *type);
>> + /*
>> + * Register, unregister, or replace a reservation key.
>> + * @old_key: The current reservation key associated with the host.
>> + * @new_key: The new reservation Key.
>> + * @type: Type of the reservation key.
>> + * @ignore_key: Ignore or not @old_key.
>> + * @ptpl: Whether to support Persist Through Power Loss(PTPL).
>> + */
>> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)(
>> + BlockDriverState *bs, uint64_t old_key,
>> + uint64_t new_key, BlockPrType type,
>> + bool ptpl, bool ignore_key);
>> + /*
>> + * Acquire a reservation on a host.
>> + * @key: The current reservation key associated with the host.
>> + * @type: Type of the reservation key.
>> + */
>> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)(
>> + BlockDriverState *bs, uint64_t key, BlockPrType type);
>> + /*
>> + * Release a reservation on a host.
>> + * @key: The current reservation key associated with the host.
>> + * @type: Type of the reservation key.
>> + */
>> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)(
>> + BlockDriverState *bs, uint64_t key, BlockPrType type);
>> + /**
>> + * Clear reservations on a host.
>> + * @key: The current reservation key associated with the host.
>> + */
>> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)(
>> + BlockDriverState *bs, uint64_t key);
>> + /*
>> + * Preempt a reservation held on a host.
>> + * @cr_key: The current reservation key associated with the host.
>> + * @pr_key: The preempt reservation Key which to be
>> + * unregistered from the namespace.
>> + * @type: Type of the reservation key.
>> + * @abort: Whether to abort a reservation held on a host.
>> + */
>> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)(
>> + BlockDriverState *bs, uint64_t cr_key,
>> + uint64_t pr_key, BlockPrType type, bool abort);
>> +
>> /*
>> * Returns 0 for completed check, -errno for internal errors.
>> * The check results are stored in result.
>> @@ -899,6 +980,9 @@ typedef struct BlockLimits {
>> uint32_t max_active_zones;
>>
>> uint32_t write_granularity;
>> +
>> + /* Persistent reservation capacities. */
>> + uint8_t pr_cap;
>> } BlockLimits;
>>
>> typedef struct BdrvOpBlocker BdrvOpBlocker;
>> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
>> index d174275a5c..b3d49a3c6f 100644
>> --- a/include/sysemu/block-backend-io.h
>> +++ b/include/sysemu/block-backend-io.h
>> @@ -62,6 +62,30 @@ void blk_aio_cancel_async(BlockAIOCB *acb);
>> BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>> BlockCompletionFunc *cb, void *opaque);
>>
>> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
>> + uint32_t num_keys, uint64_t *keys,
>> + BlockCompletionFunc *cb, void *opaque);
>> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
>> + uint64_t *key, BlockPrType *type,
>> + BlockCompletionFunc *cb, void *opaque);
>> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
>> + uint64_t new_key, BlockPrType type,
>> + bool ptpl, bool ignore_key,
>> + BlockCompletionFunc *cb,
>> + void *opaque);
>> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key,
>> + BlockPrType type,
>> + BlockCompletionFunc *cb,
>> + void *opaque);
>> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
>> + BlockPrType type, BlockCompletionFunc *cb,
>> + void *opaque);
>> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
>> + BlockCompletionFunc *cb, void *opaque);
>> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key,
>> + uint64_t pr_key, BlockPrType type, bool abort,
>> + BlockCompletionFunc *cb, void *opaque);
>> +
>> void blk_inc_in_flight(BlockBackend *blk);
>> void blk_dec_in_flight(BlockBackend *blk);
>>
>> --
>> 2.20.1
>>
diff mbox series

Patch

diff --git a/block/block-backend.c b/block/block-backend.c
index db6f9b92a3..6707d94df7 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1770,6 +1770,403 @@  BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
     return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
 }
 
+typedef struct BlkPrInCo {
+    BlockBackend *blk;
+    uint32_t *generation;
+    uint32_t num_keys;
+    BlockPrType *type;
+    uint64_t *keys;
+    int ret;
+} BlkPrInCo;
+
+typedef struct BlkPrInCB {
+    BlockAIOCB common;
+    BlkPrInCo prco;
+    bool has_returned;
+} BlkPrInCB;
+
+static const AIOCBInfo blk_pr_in_aiocb_info = {
+    .aiocb_size         = sizeof(BlkPrInCB),
+};
+
+static void blk_pr_in_complete(BlkPrInCB *acb)
+{
+    if (acb->has_returned) {
+        acb->common.cb(acb->common.opaque, acb->prco.ret);
+        blk_dec_in_flight(acb->prco.blk);
+        qemu_aio_unref(acb);
+    }
+}
+
+static void blk_pr_in_complete_bh(void *opaque)
+{
+    BlkPrInCB *acb = opaque;
+    assert(acb->has_returned);
+    blk_pr_in_complete(acb);
+}
+
+static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation,
+                                 uint32_t num_keys, BlockPrType *type,
+                                 uint64_t *keys, CoroutineEntry co_entry,
+                                 BlockCompletionFunc *cb, void *opaque)
+{
+    BlkPrInCB *acb;
+    Coroutine *co;
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque);
+    acb->prco = (BlkPrInCo) {
+        .blk        = blk,
+        .generation = generation,
+        .num_keys   = num_keys,
+        .type       = type,
+        .ret        = NOT_DONE,
+        .keys       = keys,
+    };
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(co_entry, acb);
+    aio_co_enter(qemu_get_current_aio_context(), co);
+
+    acb->has_returned = true;
+    if (acb->prco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
+                                         blk_pr_in_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+/* To be called between exactly one pair of blk_inc/dec_in_flight() */
+static int coroutine_fn
+blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation,
+                        uint32_t num_keys, uint64_t *keys)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys);
+}
+
+static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque)
+{
+    BlkPrInCB *acb = opaque;
+    BlkPrInCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation,
+                                        prco->num_keys, prco->keys);
+    blk_pr_in_complete(acb);
+}
+
+BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
+                                 uint32_t num_keys, uint64_t *keys,
+                                 BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_in(blk, generation, num_keys, NULL, keys,
+                         blk_aio_pr_read_keys_entry, cb, opaque);
+}
+
+/* To be called between exactly one pair of blk_inc/dec_in_flight() */
+static int coroutine_fn
+blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation,
+                               uint64_t *key, BlockPrType *type)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type);
+}
+
+static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque)
+{
+    BlkPrInCB *acb = opaque;
+    BlkPrInCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation,
+                                               prco->keys, prco->type);
+    blk_pr_in_complete(acb);
+}
+
+BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
+                                        uint64_t *key, BlockPrType *type,
+                                        BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_in(blk, generation, 0, type, key,
+                         blk_aio_pr_read_reservation_entry, cb, opaque);
+}
+
+typedef struct BlkPrOutCo {
+    BlockBackend *blk;
+    uint64_t old_key;
+    uint64_t new_key;
+    bool ptpl;
+    BlockPrType type;
+    bool ignore_key;
+    bool abort;
+    int ret;
+} BlkPrOutCo;
+
+typedef struct BlkPrOutCB {
+    BlockAIOCB common;
+    BlkPrOutCo prco;
+    bool has_returned;
+} BlkPrOutCB;
+
+static const AIOCBInfo blk_pr_out_aiocb_info = {
+    .aiocb_size         = sizeof(BlkPrOutCB),
+};
+
+static void blk_pr_out_complete(BlkPrOutCB *acb)
+{
+    if (acb->has_returned) {
+        acb->common.cb(acb->common.opaque, acb->prco.ret);
+        blk_dec_in_flight(acb->prco.blk);
+        qemu_aio_unref(acb);
+    }
+}
+
+static void blk_pr_out_complete_bh(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    assert(acb->has_returned);
+    blk_pr_out_complete(acb);
+}
+
+static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key,
+                                  uint64_t new_key, bool ptpl,
+                                  BlockPrType type, bool ignore_key,
+                                  bool abort, CoroutineEntry co_entry,
+                                  BlockCompletionFunc *cb, void *opaque)
+{
+    BlkPrOutCB *acb;
+    Coroutine *co;
+
+    blk_inc_in_flight(blk);
+    acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque);
+    acb->prco = (BlkPrOutCo) {
+        .blk        = blk,
+        .old_key    = old_key,
+        .new_key    = new_key,
+        .ptpl       = ptpl,
+        .type       = type,
+        .ignore_key = ignore_key,
+        .abort      = abort,
+        .ret        = NOT_DONE,
+    };
+    acb->has_returned = false;
+
+    co = qemu_coroutine_create(co_entry, acb);
+    aio_co_enter(qemu_get_current_aio_context(), co);
+
+    acb->has_returned = true;
+    if (acb->prco.ret != NOT_DONE) {
+        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
+                                         blk_pr_out_complete_bh, acb);
+    }
+
+    return &acb->common;
+}
+
+/* To be called between exactly one pair of blk_inc/dec_in_flight() */
+static int coroutine_fn
+blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key,
+                       uint64_t new_key, BlockPrType type,
+                       bool ptpl, bool ignore_key)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (!blk_co_is_available(blk)) {
+        return -ENOMEDIUM;
+    }
+
+    return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type,
+                               ptpl, ignore_key);
+}
+
+static void coroutine_fn blk_aio_pr_register_entry(void *opaque)
+{
+    BlkPrOutCB *acb = opaque;
+    BlkPrOutCo *prco = &acb->prco;
+
+    prco->ret = blk_aio_pr_do_register(prco->blk, prco->old_key, prco->new_key,
+                                       prco->type, prco->ptpl,
+                                       prco->ignore_key);
+    blk_pr_out_complete(acb);
+}
+
+/* Start an asynchronous PR REGISTER request; abort is passed as false. */
+BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
+                                uint64_t new_key, BlockPrType type,
+                                bool ptpl, bool ignore_key,
+                                BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, old_key, new_key, ptpl, type, ignore_key, false,
+                          blk_aio_pr_register_entry, cb, opaque);
+}
+
+/* Call between exactly one blk_inc_in_flight()/blk_dec_in_flight() pair */
+static int coroutine_fn
+blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (blk_co_is_available(blk)) {
+        return bdrv_co_pr_reserve(blk_bs(blk), key, type);
+    }
+
+    return -ENOMEDIUM;
+}
+
+/* Coroutine entry point for RESERVE; the key is read from prco.old_key. */
+static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque)
+{
+    BlkPrOutCB *op = opaque;
+    BlkPrOutCo *pr = &op->prco;
+
+    pr->ret = blk_aio_pr_do_reserve(pr->blk, pr->old_key, pr->type);
+    blk_pr_out_complete(op);
+}
+
+
+/* Start an asynchronous PR RESERVE request; @key is carried in old_key. */
+BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key,
+                               BlockPrType type,
+                               BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, key, 0, false, type, false, false,
+                          blk_aio_pr_reserve_entry, cb, opaque);
+}
+
+/* Call between exactly one blk_inc_in_flight()/blk_dec_in_flight() pair */
+static int coroutine_fn
+blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (blk_co_is_available(blk)) {
+        return bdrv_co_pr_release(blk_bs(blk), key, type);
+    }
+
+    return -ENOMEDIUM;
+}
+
+static void coroutine_fn blk_aio_pr_release_entry(void *opaque)
+{
+    BlkPrOutCB *op = opaque;
+    BlkPrOutCo *pr = &op->prco;     /* the key is kept in old_key */
+
+    pr->ret = blk_aio_pr_do_release(pr->blk, pr->old_key, pr->type);
+    blk_pr_out_complete(op);
+}
+
+/* Start an asynchronous PR RELEASE request; @key is carried in old_key. */
+BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
+                               BlockPrType type,
+                               BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, key, 0, false, type, false, false,
+                          blk_aio_pr_release_entry, cb, opaque);
+}
+
+/* Call between exactly one blk_inc_in_flight()/blk_dec_in_flight() pair */
+static int coroutine_fn
+blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (blk_co_is_available(blk)) {
+        return bdrv_co_pr_clear(blk_bs(blk), key);
+    }
+
+    return -ENOMEDIUM;
+}
+
+static void coroutine_fn blk_aio_pr_clear_entry(void *opaque)
+{
+    BlkPrOutCB *op = opaque;
+    BlkPrOutCo *pr = &op->prco;     /* the key is kept in old_key */
+
+    pr->ret = blk_aio_pr_do_clear(pr->blk, pr->old_key);
+    blk_pr_out_complete(op);
+}
+
+/* Start an asynchronous PR CLEAR request; only @key is meaningful here. */
+BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
+                             BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, key, 0, false, 0, false, false,
+                          blk_aio_pr_clear_entry, cb, opaque);
+}
+
+/* Call between exactly one blk_inc_in_flight()/blk_dec_in_flight() pair */
+static int coroutine_fn
+blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t cr_key,
+                      uint64_t pr_key, BlockPrType type, bool abort)
+{
+    IO_CODE();
+
+    blk_wait_while_drained(blk);
+    GRAPH_RDLOCK_GUARD();
+
+    if (blk_co_is_available(blk)) {
+        return bdrv_co_pr_preempt(blk_bs(blk), cr_key, pr_key, type, abort);
+    }
+
+    return -ENOMEDIUM;
+}
+
+/* Coroutine entry point for PREEMPT: old_key is cr_key, new_key is pr_key. */
+static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque)
+{
+    BlkPrOutCB *op = opaque;
+    BlkPrOutCo *pr = &op->prco;
+
+    pr->ret = blk_aio_pr_do_preempt(pr->blk, pr->old_key, pr->new_key,
+                                    pr->type, pr->abort);
+    blk_pr_out_complete(op);
+}
+
+
+/* Start an asynchronous PR PREEMPT request; ptpl and ignore_key are unused. */
+BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key,
+                               uint64_t pr_key, BlockPrType type, bool abort,
+                               BlockCompletionFunc *cb, void *opaque)
+{
+    IO_CODE();
+    return blk_aio_pr_out(blk, cr_key, pr_key, false, type, false, abort,
+                          blk_aio_pr_preempt_entry, cb, opaque);
+}
+
 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
 static int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
diff --git a/block/io.c b/block/io.c
index 7217cf811b..87a363c94f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3220,6 +3220,169 @@  out:
     return co.ret;
 }
 
+/*
+ * Read the persistent reservation keys registered on @bs via its driver.
+ * Returns the driver's result, or -ENOTSUP if @bs has no driver or the
+ * driver does not provide bdrv_co_pr_read_keys.
+ */
+int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs,
+                                      uint32_t *generation, uint32_t num_keys,
+                                      uint64_t *keys)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_pr_read_keys) {
+        ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys);
+    }
+    bdrv_dec_in_flight(bs);
+
+    return ret;
+}
+
+/*
+ * Read the current reservation of @bs via its driver.  Returns the driver's
+ * result, or -ENOTSUP without a driver or bdrv_co_pr_read_reservation hook.
+ */
+int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs,
+                                             uint32_t *generation,
+                                             uint64_t *key, BlockPrType *type)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_pr_read_reservation) {
+        ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type);
+    }
+    bdrv_dec_in_flight(bs);
+
+    return ret;
+}
+
+/*
+ * Forward a persistent reservation REGISTER request to the driver of @bs.
+ * Returns the driver's result, or -ENOTSUP if @bs has no driver or the
+ * driver does not provide bdrv_co_pr_register.
+ */
+int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
+                                     uint64_t new_key, BlockPrType type,
+                                     bool ptpl, bool ignore_key)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_pr_register) {
+        ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type,
+                                       ptpl, ignore_key);
+    }
+    bdrv_dec_in_flight(bs);
+
+    return ret;
+}
+
+/*
+ * Forward a persistent reservation RESERVE request to the driver of @bs.
+ * Returns the driver's result, or -ENOTSUP if @bs has no driver or the
+ * driver does not provide bdrv_co_pr_reserve.
+ */
+int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key,
+                                    BlockPrType type)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_pr_reserve) {
+        ret = drv->bdrv_co_pr_reserve(bs, key, type);
+    }
+    bdrv_dec_in_flight(bs);
+
+    return ret;
+}
+
+/*
+ * Forward a persistent reservation RELEASE request to the driver of @bs.
+ * Returns the driver's result, or -ENOTSUP if @bs has no driver or the
+ * driver does not provide bdrv_co_pr_release.
+ */
+int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key,
+                                    BlockPrType type)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_pr_release) {
+        ret = drv->bdrv_co_pr_release(bs, key, type);
+    }
+    bdrv_dec_in_flight(bs);
+
+    return ret;
+}
+
+/*
+ * Forward a persistent reservation CLEAR request to the driver of @bs.
+ * Returns the driver's result, or -ENOTSUP if @bs has no driver or the
+ * driver does not provide bdrv_co_pr_clear.
+ */
+int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_pr_clear) {
+        ret = drv->bdrv_co_pr_clear(bs, key);
+    }
+    bdrv_dec_in_flight(bs);
+
+    return ret;
+}
+
+/*
+ * Forward a persistent reservation PREEMPT request to the driver of @bs.
+ * Returns the driver's result, or -ENOTSUP without a driver or preempt hook.
+ */
+int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key,
+                                    uint64_t pr_key, BlockPrType type,
+                                    bool abort)
+{
+    BlockDriver *drv = bs->drv;
+    int ret = -ENOTSUP;
+
+    IO_CODE();
+    assert_bdrv_graph_readable();
+
+    bdrv_inc_in_flight(bs);
+    if (drv && drv->bdrv_co_pr_preempt) {
+        ret = drv->bdrv_co_pr_preempt(bs, cr_key, pr_key, type, abort);
+    }
+    bdrv_dec_in_flight(bs);
+
+    return ret;
+}
+
 int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
                         unsigned int *nr_zones,
                         BlockZoneDescriptor *zones)
diff --git a/include/block/block-common.h b/include/block/block-common.h
index a846023a09..7ca4e2328f 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -524,6 +524,46 @@  typedef enum {
     BDRV_FIX_ERRORS   = 2,
 } BdrvCheckMode;
 
+/**
+ *  Persistent reservation types and capabilities exposed by the generic
+ *  block layer, abstracted from the SCSI protocol (chapter 5.9 of SCSI
+ *  Primary Commands - 4) and the NVMe protocol (chapter 7.2 of NVMe Base
+ *  Specification 2.0).
+ */
+typedef enum {
+    BLK_PR_WRITE_EXCLUSIVE              = 0x1,
+    BLK_PR_EXCLUSIVE_ACCESS             = 0x2,
+    BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY    = 0x3,
+    BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY   = 0x4,
+    BLK_PR_WRITE_EXCLUSIVE_ALL_REGS     = 0x5,
+    BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS    = 0x6,
+} BlockPrType;
+
+typedef enum BLKPrCap {
+    /* Persist Through Power Loss */
+    BLK_PR_CAP_PTPL = 1 << 0,
+    /* Write Exclusive reservation type */
+    BLK_PR_CAP_WR_EX = 1 << 1,
+    /* Exclusive Access reservation type */
+    BLK_PR_CAP_EX_AC = 1 << 2,
+    /* Write Exclusive Registrants Only reservation type */
+    BLK_PR_CAP_WR_EX_RO = 1 << 3,
+    /* Exclusive Access Registrants Only reservation type */
+    BLK_PR_CAP_EX_AC_RO = 1 << 4,
+    /* Write Exclusive All Registrants reservation type */
+    BLK_PR_CAP_WR_EX_AR = 1 << 5,
+    /* Exclusive Access All Registrants reservation type */
+    BLK_PR_CAP_EX_AC_AR = 1 << 6,
+    /* All of the capability bits above OR-ed together */
+    BLK_PR_CAP_ALL = (BLK_PR_CAP_PTPL |
+                     BLK_PR_CAP_WR_EX |
+                     BLK_PR_CAP_EX_AC |
+                     BLK_PR_CAP_WR_EX_RO |
+                     BLK_PR_CAP_EX_AC_RO |
+                     BLK_PR_CAP_WR_EX_AR |
+                     BLK_PR_CAP_EX_AC_AR),
+} BLKPrCap;
+
 typedef struct BlockSizes {
     uint32_t phys;
     uint32_t log;
diff --git a/include/block/block-io.h b/include/block/block-io.h
index b49e0537dd..908361862b 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -106,6 +106,26 @@  void bdrv_aio_cancel_async(BlockAIOCB *acb);
 int coroutine_fn GRAPH_RDLOCK
 bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
 
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation,
+                     uint32_t num_keys, uint64_t *keys);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation,
+                            uint64_t *key, BlockPrType *type);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key,
+                    uint64_t new_key, BlockPrType type,
+                    bool ptpl, bool ignore_key);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, uint64_t pr_key,
+                   BlockPrType type, bool abort);
+
 /* Ensure contents are flushed to disk.  */
 int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
 
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 761276127e..6e628069e9 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -766,6 +766,87 @@  struct BlockDriver {
     int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)(
         BlockDriverState *bs, unsigned long int req, void *buf);
 
+    /*
+     * Persistent reservation (PR) API series.
+     * Please refer to chapter 5.9 of SCSI Primary Commands - 4 or
+     * chapter 7 of NVMe Base Specification 2.0.
+     *
+     * A block driver should implement either all of the following
+     * callbacks or none of them: bdrv_co_pr_read_keys,
+     * bdrv_co_pr_read_reservation, bdrv_co_pr_register,
+     * bdrv_co_pr_reserve, bdrv_co_pr_release,
+     * bdrv_co_pr_clear and bdrv_co_pr_preempt.
+     *
+     * Read the registered keys and return them in @keys.
+     * @generation: The generation of the reservation key.
+     * @num_keys:   The maximum number of keys that can be transmitted.
+     * @keys:       Array that receives the registered keys.
+     *
+     * On success, store the generation in @generation, store the keys
+     * in @keys and return the number of keys.
+     * On failure, return -errno.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)(
+        BlockDriverState *bs, uint32_t *generation,
+        uint32_t num_keys, uint64_t *keys);
+    /*
+     * Read the reservation key and store it in @key.
+     * @generation: The generation of the reservation key.
+     * @key:        The reservation key.
+     * @type:       Type of the reservation.
+     *
+     * On success, store the generation in @generation, store the
+     * reservation key in @key and return the number of keys stored in
+     * @key, which indicates whether a reservation key exists.
+     * On failure, return -errno.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)(
+        BlockDriverState *bs, uint32_t *generation,
+        uint64_t *key, BlockPrType *type);
+    /*
+     * Register, unregister, or replace a reservation key.
+     * @old_key:    The current reservation key associated with the host.
+     * @new_key:    The new reservation key.
+     * @type:       Type of the reservation.
+     * @ptpl:       Whether to support Persist Through Power Loss (PTPL).
+     * @ignore_key: Whether to ignore @old_key.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)(
+        BlockDriverState *bs, uint64_t old_key,
+        uint64_t new_key, BlockPrType type,
+        bool ptpl, bool ignore_key);
+    /*
+     * Acquire a reservation on a host.
+     * @key:  The current reservation key associated with the host.
+     * @type: Type of the reservation to acquire.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)(
+        BlockDriverState *bs, uint64_t key, BlockPrType type);
+    /*
+     * Release a reservation on a host.
+     * @key:  The current reservation key associated with the host.
+     * @type: Type of the reservation to release.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)(
+        BlockDriverState *bs, uint64_t key, BlockPrType type);
+    /*
+     * Clear reservations on a host.
+     * @key: The current reservation key associated with the host.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)(
+        BlockDriverState *bs, uint64_t key);
+    /*
+     * Preempt a reservation held on a host.
+     * @cr_key: The current reservation key associated with the host.
+     * @pr_key: The reservation key to be preempted, i.e. the key that
+     *          is to be unregistered from the namespace.
+     * @type:   Type of the reservation.
+     * @abort:  Whether to abort a reservation held on a host.
+     */
+    int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)(
+        BlockDriverState *bs, uint64_t cr_key,
+        uint64_t pr_key, BlockPrType type, bool abort);
+
     /*
      * Returns 0 for completed check, -errno for internal errors.
      * The check results are stored in result.
@@ -899,6 +980,9 @@  typedef struct BlockLimits {
     uint32_t max_active_zones;
 
     uint32_t write_granularity;
+
+    /* Persistent reservation capabilities (presumably BLK_PR_CAP_* bits). */
+    uint8_t pr_cap;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index d174275a5c..b3d49a3c6f 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -62,6 +62,30 @@  void blk_aio_cancel_async(BlockAIOCB *acb);
 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                           BlockCompletionFunc *cb, void *opaque);
 
+BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation,
+                                 uint32_t num_keys, uint64_t *keys,
+                                 BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation,
+                                        uint64_t *key, BlockPrType *type,
+                                        BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key,
+                                uint64_t new_key, BlockPrType type,
+                                bool ptpl, bool ignore_key,
+                                BlockCompletionFunc *cb,
+                                void *opaque);
+BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk,  uint64_t key,
+                               BlockPrType type,
+                               BlockCompletionFunc *cb,
+                               void *opaque);
+BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key,
+                               BlockPrType type, BlockCompletionFunc *cb,
+                               void *opaque);
+BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key,
+                             BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key,
+                               uint64_t pr_key, BlockPrType type, bool abort,
+                               BlockCompletionFunc *cb, void *opaque);
+
 void blk_inc_in_flight(BlockBackend *blk);
 void blk_dec_in_flight(BlockBackend *blk);