diff mbox series

[v3,1/3] file-posix:add the tracking of the zones write pointers

Message ID 20221010023306.43610-2-faithilikerun@gmail.com (mailing list archive)
State New, archived
Headers show
Series Add zone append write for zoned device | expand

Commit Message

Sam Li Oct. 10, 2022, 2:33 a.m. UTC
Since Linux doesn't have a user API to issue zone append operations to
zoned devices from user space, the file-posix driver is modified to add
zone append emulation using regular writes. To do this, the file-posix
driver tracks the wp location of all zones of the device. It uses an
array of uint64_t. The most significant bit of each wp location indicates
whether the zone is a conventional zone.

The zones wp can be changed due to the following operations issued:
- zone reset: change the wp to the start offset of that zone
- zone finish: change to the end location of that zone
- write to a zone
- zone append

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
 include/block/block-common.h     |  14 +++
 include/block/block_int-common.h |   5 +
 3 files changed, 177 insertions(+)

Comments

Damien Le Moal Oct. 13, 2022, 5:13 a.m. UTC | #1
On 10/10/22 11:33, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is conventional zones.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
>  include/block/block-common.h     |  14 +++
>  include/block/block_int-common.h |   5 +
>  3 files changed, 177 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index a9d347292e..17c0b58158 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
>          struct {
>              struct iovec *iov;
>              int niov;
> +            int64_t *append_sector;

This should be added as part of patch 2. You do not need this to track
the wp of zones in this patch.

>          } io;
>          struct {
>              uint64_t cmd;
> @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
>          struct {
>              unsigned long zone_op;
>              const char *zone_op_name;
> +            bool all;
>          } zone_mgmt;
>      };
>  } RawPosixAIOData;
> @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,

Nit: It would seem more natural to have the fd argument first...

> +                        unsigned int nrz) {
> +    struct blk_zone *blkz;
> +    int64_t rep_size;
> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> +    int ret, n = 0, i = 0;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +
> +    rep = g_malloc(rep_size);
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = sector;
> +        rep->nr_zones = nrz - n;
> +
> +        do {
> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> +        } while (ret != 0 && errno == EINTR);
> +        if (ret != 0) {
> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +                    fd, offset, errno);
> +            return -errno;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            /*
> +             * The wp tracking cares only about sequential writes required and
> +             * sequential write preferred zones so that the wp can advance to
> +             * the right location.
> +             * Use the most significant bit of the wp location to indicate the
> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> +             */
> +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {

Double negation... This can simply be:

if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {

> +                wps->wp[i] += 1ULL << 63;

No need for the += here. This can be "=".

> +            } else {
> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +            }
> +        }
> +        sector = blkz[i-1].start + blkz[i-1].len;

spaces missing around the "-" in the "i-1" expressions.

> +    }
> +
> +    return 0;
> +}
> +
> +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,

Same nit as above: fd being the first argument would be a little more
natural in my opinion.

> +                            unsigned int nrz) {
> +    qemu_mutex_lock(&wps->lock);
> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> +        error_report("report zone wp failed");
> +        return;

You are leaking the lock here. Remove the return. Also, given that
get_zones_wp() already prints a message if report fails, I do not think
the message here is useful.

Also, why is this function void typed ? How can the caller know if the
update succeeded or not ?

> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>              return;
>          }
> +
> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> +        if (ret >= 0) {
> +            bs->bl.write_granularity = ret;
> +        }

Why is this change here ? Shouldn't this be part of the previous series
"Add support for zoned device" ?

> +
> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
> +            error_report("report wps failed");
> +            g_free(bs->bl.wps);
> +            return;
> +        }
> +        qemu_mutex_init(&bs->bl.wps->lock);
>      }
>  }
>  
> @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
>      ssize_t nbytes;
>      char *buf;
>  
> +    /*
> +     * The offset of regular writes, append writes is the wp location
> +     * of that zone.
> +     */
> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +        if (aiocb->bs->bl.zone_size > 0) {
> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> +            qemu_mutex_lock(&wps->lock);
> +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
> +                                        aiocb->bs->bl.zone_size];
> +            qemu_mutex_unlock(&wps->lock);
> +        }

I do not understand this hunk at all. What is this trying to do ? zone
append support goes into patch 2. You are overwriting the user
specified aio offset using the tracked wp value. That could result in a
successful write even if the user sent an unaligned write command. That
is bad.

Here you should only be tracking the write pointer, so increment
wps->wp[index], which you do below.

> +    }
> +
>      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
>          /*
>           * If there is just a single buffer, and it is properly aligned
> @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
>  
>  out:
>      if (nbytes == aiocb->aio_nbytes) {
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> +            if (wps) {
> +                qemu_mutex_lock(&wps->lock);
> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +                    uint64_t wend_offset =
> +                            aiocb->aio_offset + aiocb->aio_nbytes;
> +                    /* Advance the wp if needed */
> +                    if (wend_offset > wps->wp[index]){
> +                        wps->wp[index] = wend_offset;
> +                    }
> +                }
> +                qemu_mutex_unlock(&wps->lock);
> +            }
> +        }
> +#endif
>          return 0;
>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> @@ -1736,6 +1844,12 @@ out:
>          }
>      } else {
>          assert(nbytes < 0);
> +#if defined(CONFIG_BLKZONED)
> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> +                            aiocb->bs->bl.nr_zones);

You only need to update the target zone of the aio, not all zones.
Updating all zones is actually a bug as wp[] entries for other zones may
be larger than the device reported wp if there are other write aios in
flight. So the last argument must be "1" here.

> +        }
> +#endif
>          return nbytes;
>      }
>  }
> @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
>  #if defined(CONFIG_BLKZONED)
>  static int handle_aiocb_zone_mgmt(void *opaque) {
>      RawPosixAIOData *aiocb = opaque;
> +    BlockDriverState *bs = aiocb->bs;
>      int fd = aiocb->aio_fildes;
>      int64_t sector = aiocb->aio_offset / 512;
>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> +    uint64_t wend_offset;
>      struct blk_zone_range range;
>      int ret;
>  

Why the blank line here ?

> +    BlockZoneWps *wps = bs->bl.wps;
> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> +
>      /* Execute the operation */
>      range.sector = sector;
>      range.nr_sectors = nr_sectors;
> @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>      } while (ret != 0 && errno == EINTR);
>      if (ret != 0) {
> +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> +                        aiocb->bs->bl.nr_zones);

You need only to update the range of zones that was specified for the
management operation, not all zones. So you must specify the zone
management aio offset and size/zone_size here.

>          ret = -errno;
>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>                       ret);
>          return ret;
>      }
> +
> +    qemu_mutex_lock(&wps->lock);
> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> +         /*
> +         * The zoned device allows the last zone smaller that the zone size.
> +         */

comment indentation is off.

> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> +        } else {
> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> +        }
> +
> +        if (aiocb->aio_offset != wps->wp[index] &&
> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {

I do not understand the condition here. Why do you have
"aiocb->aio_offset != wps->wp[index]" ?

> +            if (aiocb->zone_mgmt.all) {

This is the only place where you use this all boolean field. For
simplicity, I would drop this field completely and test that
aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
a reset all zones operation.

> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> +                    wps->wp[i] = i * bs->bl.zone_size;

You are not handling conventional zones here. For conventional zones,
you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
always return false after this.

> +                }
> +            } else {
> +                wps->wp[index] = aiocb->aio_offset;
> +            }
> +        } else if (aiocb->aio_offset != wps->wp[index] &&
> +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {

Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?

> +            wps->wp[index] = wend_offset;
> +        }
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> +
>      return ret;
>  }
>  #endif
> @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
>      BDRVRawState *s = bs->opaque;
>  
>      if (s->fd >= 0) {
> +#if defined(CONFIG_BLKZONED)
> +        if (bs->bl.wps) {
> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> +            g_free(bs->bl.wps);
> +        }
> +#endif
>          qemu_close(s->fd);
>          s->fd = -1;
>      }
> @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      int64_t zone_size, zone_size_mask;
>      const char *zone_op_name;
>      unsigned long zone_op;
> +    bool is_all = false;
>  
>      zone_size = bs->bl.zone_size;
>      zone_size_mask = zone_size - 1;
> @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>      case BLK_ZO_RESET_ALL:
>          zone_op_name = "BLKRESETZONE";
>          zone_op = BLKRESETZONE;
> +        is_all = true;
>          break;
>      default:
>          g_assert_not_reached();
> @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>          .zone_mgmt  = {
>              .zone_op = zone_op,
>              .zone_op_name = zone_op_name,
> +            .all = is_all,
>          },
>      };
>  
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index 882de6825e..b8b2dba64a 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
>      BlockZoneCondition cond;
>  } BlockZoneDescriptor;
>  
> +/*
> + * Track write pointers of a zone in bytes.
> + */
> +typedef struct BlockZoneWps {
> +    QemuMutex lock;
> +    uint64_t wp[];
> +} BlockZoneWps;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> @@ -206,6 +214,12 @@ typedef enum {
>  #define BDRV_SECTOR_BITS   9
>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>  
> +/*
> + * Get the first most significant bit of wp. If it is zero, then
> + * the zone type is SWR.
> + */
> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> +
>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>                                             INT_MAX >> BDRV_SECTOR_BITS)
>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 37dddc603c..59c2d1316d 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>  
>      /* device capacity expressed in bytes */
>      int64_t capacity;
> +
> +    /* array of write pointers' location of each zone in the zoned device. */
> +    BlockZoneWps *wps;
> +
> +    int64_t write_granularity;
>  } BlockLimits;
>  
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
Sam Li Oct. 13, 2022, 7:08 a.m. UTC | #2
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:13写道:
>
> On 10/10/22 11:33, Sam Li wrote:
> > Since Linux doesn't have a user API to issue zone append operations to
> > zoned devices from user space, the file-posix driver is modified to add
> > zone append emulation using regular writes. To do this, the file-posix
> > driver tracks the wp location of all zones of the device. It uses an
> > array of uint64_t. The most significant bit of each wp location indicates
> > if the zone type is conventional zones.
> >
> > The zones wp can be changed due to the following operations issued:
> > - zone reset: change the wp to the start offset of that zone
> > - zone finish: change to the end location of that zone
> > - write to a zone
> > - zone append
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
> >  include/block/block-common.h     |  14 +++
> >  include/block/block_int-common.h |   5 +
> >  3 files changed, 177 insertions(+)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index a9d347292e..17c0b58158 100755
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
> >          struct {
> >              struct iovec *iov;
> >              int niov;
> > +            int64_t *append_sector;
>
> This should be added as part of patch 2. You do not need this to track
> the wp of zones in this patch.
>
> >          } io;
> >          struct {
> >              uint64_t cmd;
> > @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
> >          struct {
> >              unsigned long zone_op;
> >              const char *zone_op_name;
> > +            bool all;
> >          } zone_mgmt;
> >      };
> >  } RawPosixAIOData;
> > @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
> >  #endif
> >  }
> >
> > +#if defined(CONFIG_BLKZONED)
> > +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>
> Nit: It would seem more natural to have the fd argument first...
>
> > +                        unsigned int nrz) {
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size;
> > +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> > +    int ret, n = 0, i = 0;
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = NULL;
> > +
> > +    rep = g_malloc(rep_size);
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = sector;
> > +        rep->nr_zones = nrz - n;
> > +
> > +        do {
> > +            ret = ioctl(fd, BLKREPORTZONE, rep);
> > +        } while (ret != 0 && errno == EINTR);
> > +        if (ret != 0) {
> > +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> > +                    fd, offset, errno);
> > +            return -errno;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++, n++) {
> > +            /*
> > +             * The wp tracking cares only about sequential writes required and
> > +             * sequential write preferred zones so that the wp can advance to
> > +             * the right location.
> > +             * Use the most significant bit of the wp location to indicate the
> > +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> > +             */
> > +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
>
> Double negation... This can simply be:
>
> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>
> > +                wps->wp[i] += 1ULL << 63;
>
> No need for the += here. This can be "=".
>
> > +            } else {
> > +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> > +            }
> > +        }
> > +        sector = blkz[i-1].start + blkz[i-1].len;
>
> spaces missing around the "-" in the "i-1" expressions.
>
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>
> Same nit as above: fd being the first argument would be a little more
> natural in my opinion.
>
> > +                            unsigned int nrz) {
> > +    qemu_mutex_lock(&wps->lock);
> > +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> > +        error_report("report zone wp failed");
> > +        return;
>
> > You are leaking the lock here. Remove the return. Also, given that
> get_zones_wp() already prints a message if report fails, I do not think
> the message here is useful.
>
> Also, why is this function void typed ? How can the caller know if the
> update succeeded or not ?

An update failure means get_zones_wp() failed, and that is already reported
by error_report. The error message indicates that the update failed, not that
the report failed. Maybe modifying the message is sufficient for error checking?

+    qemu_mutex_lock(&wps->lock);
+    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
+        error_report("update zone wp failed");
+    }
+    qemu_mutex_unlock(&wps->lock);


>
> > +    }
> > +    qemu_mutex_unlock(&wps->lock);
> > +}
> > +#endif
> > +
> >  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >  {
> >      BDRVRawState *s = bs->opaque;
> > @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
> >              return;
> >          }
> > +
> > +        ret = get_sysfs_long_val(&st, "physical_block_size");
> > +        if (ret >= 0) {
> > +            bs->bl.write_granularity = ret;
> > +        }
>
> Why is this change here ? Shouldn't this be part of the previous series
> "Add support for zoned device" ?

Because only zone append uses write_granularity to check the iovector
size alignment. The previous series doesn't use this field.

>
> > +
> > +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> > +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
> > +            error_report("report wps failed");
> > +            g_free(bs->bl.wps);
> > +            return;
> > +        }
> > +        qemu_mutex_init(&bs->bl.wps->lock);
> >      }
> >  }
> >
> > @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
> >      ssize_t nbytes;
> >      char *buf;
> >
> > +    /*
> > +     * The offset of regular writes, append writes is the wp location
> > +     * of that zone.
> > +     */
> > +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +        if (aiocb->bs->bl.zone_size > 0) {
> > +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> > +            qemu_mutex_lock(&wps->lock);
> > +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
> > +                                        aiocb->bs->bl.zone_size];
> > +            qemu_mutex_unlock(&wps->lock);
> > +        }
>
> I do not understand this hunk at all. What is this trying to do ? zone
> > append support goes into patch 2. You are overwriting the user
> > specified aio offset using the tracked wp value. That could result in a
> > successful write even if the user sent an unaligned write command. That
> is bad.

Ok, regular writes and append writes got mixed up when I changed the
offset to the wp of that zone.

>
> Here you should only be tracking the write pointer, so increment
> wps->wp[index], which you do below.

Understood. Will move it to the next patch.

>
> > +    }
> > +
> >      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
> >          /*
> >           * If there is just a single buffer, and it is properly aligned
> > @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
> >
> >  out:
> >      if (nbytes == aiocb->aio_nbytes) {
> > +#if defined(CONFIG_BLKZONED)
> > +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> > +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> > +            if (wps) {
> > +                qemu_mutex_lock(&wps->lock);
> > +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> > +                    uint64_t wend_offset =
> > +                            aiocb->aio_offset + aiocb->aio_nbytes;
> > +                    /* Advance the wp if needed */
> > +                    if (wend_offset > wps->wp[index]){
> > +                        wps->wp[index] = wend_offset;
> > +                    }
> > +                }
> > +                qemu_mutex_unlock(&wps->lock);
> > +            }
> > +        }
> > +#endif
> >          return 0;
> >      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
> >          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > @@ -1736,6 +1844,12 @@ out:
> >          }
> >      } else {
> >          assert(nbytes < 0);
> > +#if defined(CONFIG_BLKZONED)
> > +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> > +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> > +                            aiocb->bs->bl.nr_zones);
>
> You only need to update the target zone of the aio, not all zones.
> Updating all zones is actually a bug as wp[] entries for other zones may
> be larger than the device reported wp if there are other write aios in
> flight. So the last argument must be "1" here.

Ok, I understood now.

>
> > +        }
> > +#endif
> >          return nbytes;
> >      }
> >  }
> > @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
> >  #if defined(CONFIG_BLKZONED)
> >  static int handle_aiocb_zone_mgmt(void *opaque) {
> >      RawPosixAIOData *aiocb = opaque;
> > +    BlockDriverState *bs = aiocb->bs;
> >      int fd = aiocb->aio_fildes;
> >      int64_t sector = aiocb->aio_offset / 512;
> >      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> > +    uint64_t wend_offset;
> >      struct blk_zone_range range;
> >      int ret;
> >
>
> Why the blank line here ?

For readability, separate it from the execution part.

>
> > +    BlockZoneWps *wps = bs->bl.wps;
> > +    int index = aiocb->aio_offset / bs->bl.zone_size;
> > +
> >      /* Execute the operation */
> >      range.sector = sector;
> >      range.nr_sectors = nr_sectors;
> > @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
> >      } while (ret != 0 && errno == EINTR);
> >      if (ret != 0) {
> > +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> > +                        aiocb->bs->bl.nr_zones);
>
> You need only to update the range of zones that was specified for the
> management option, not all zones. So you must specify the zone
> management aio offset and size/zone_size here.
>
> >          ret = -errno;
> >          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
> >                       ret);
> >          return ret;
> >      }
> > +
> > +    qemu_mutex_lock(&wps->lock);
> > +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> > +         /*
> > +         * The zoned device allows the last zone smaller that the zone size.
> > +         */
>
> comment indentation is off.
>
> > +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> > +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> > +        } else {
> > +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> > +        }
> > +
> > +        if (aiocb->aio_offset != wps->wp[index] &&
> > +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
>
> I do not understand the condition here. Why do you have
> "aiocb->aio_offset != wps->wp[index]" ?

It is intended for zone state checks. aio_offset (= start byte of that
zone) = wp means this zone is empty. Only non-empty zones will be
reset.

>
> > +            if (aiocb->zone_mgmt.all) {
>
> This is the only place where you use this all boolean field. For
> simplicity, I would drop this field completely and test that
> aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
> a reset all zones operation.

Right, the capacity field makes it possible. I'll drop it.

>
> > +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> > +                    wps->wp[i] = i * bs->bl.zone_size;
>
> You are not handling conventional zones here. For conventional zones,
> you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
> always return false after this.

Right, will add a condition line here:
+ if (! BDRV_ZT_IS_CONV(wps->wp[i]))

>
> > +                }
> > +            } else {
> > +                wps->wp[index] = aiocb->aio_offset;
> > +            }
> > +        } else if (aiocb->aio_offset != wps->wp[index] &&
> > +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
>
> Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?

This should be wend_offset != wps->wp[index]. It means if this zone is
full, no need to finish it.

>
> > +            wps->wp[index] = wend_offset;
> > +        }
> > +    }
> > +    qemu_mutex_unlock(&wps->lock);
> > +
> >      return ret;
> >  }
> >  #endif
> > @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
> >      BDRVRawState *s = bs->opaque;
> >
> >      if (s->fd >= 0) {
> > +#if defined(CONFIG_BLKZONED)
> > +        if (bs->bl.wps) {
> > +            qemu_mutex_destroy(&bs->bl.wps->lock);
> > +            g_free(bs->bl.wps);
> > +        }
> > +#endif
> >          qemu_close(s->fd);
> >          s->fd = -1;
> >      }
> > @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >      int64_t zone_size, zone_size_mask;
> >      const char *zone_op_name;
> >      unsigned long zone_op;
> > +    bool is_all = false;
> >
> >      zone_size = bs->bl.zone_size;
> >      zone_size_mask = zone_size - 1;
> > @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >      case BLK_ZO_RESET_ALL:
> >          zone_op_name = "BLKRESETZONE";
> >          zone_op = BLKRESETZONE;
> > +        is_all = true;
> >          break;
> >      default:
> >          g_assert_not_reached();
> > @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >          .zone_mgmt  = {
> >              .zone_op = zone_op,
> >              .zone_op_name = zone_op_name,
> > +            .all = is_all,
> >          },
> >      };
> >
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index 882de6825e..b8b2dba64a 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
> >      BlockZoneCondition cond;
> >  } BlockZoneDescriptor;
> >
> > +/*
> > + * Track write pointers of a zone in bytes.
> > + */
> > +typedef struct BlockZoneWps {
> > +    QemuMutex lock;
> > +    uint64_t wp[];
> > +} BlockZoneWps;
> > +
> >  typedef struct BlockDriverInfo {
> >      /* in bytes, 0 if irrelevant */
> >      int cluster_size;
> > @@ -206,6 +214,12 @@ typedef enum {
> >  #define BDRV_SECTOR_BITS   9
> >  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
> >
> > +/*
> > + * Get the first most significant bit of wp. If it is zero, then
> > + * the zone type is SWR.
> > + */
> > +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> > +
> >  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
> >                                             INT_MAX >> BDRV_SECTOR_BITS)
> >  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 37dddc603c..59c2d1316d 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -857,6 +857,11 @@ typedef struct BlockLimits {
> >
> >      /* device capacity expressed in bytes */
> >      int64_t capacity;
> > +
> > +    /* array of write pointers' location of each zone in the zoned device. */
> > +    BlockZoneWps *wps;
> > +
> > +    int64_t write_granularity;
> >  } BlockLimits;
> >
> >  typedef struct BdrvOpBlocker BdrvOpBlocker;
>
> --
> Damien Le Moal
> Western Digital Research
>
Damien Le Moal Oct. 13, 2022, 7:30 a.m. UTC | #3
On 2022/10/13 16:08, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:13写道:
>>
>> On 10/10/22 11:33, Sam Li wrote:
>>> Since Linux doesn't have a user API to issue zone append operations to
>>> zoned devices from user space, the file-posix driver is modified to add
>>> zone append emulation using regular writes. To do this, the file-posix
>>> driver tracks the wp location of all zones of the device. It uses an
>>> array of uint64_t. The most significant bit of each wp location indicates
>>> if the zone type is conventional zones.
>>>
>>> The zones wp can be changed due to the following operations issued:
>>> - zone reset: change the wp to the start offset of that zone
>>> - zone finish: change to the end location of that zone
>>> - write to a zone
>>> - zone append
>>>
>>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>>> ---
>>>  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
>>>  include/block/block-common.h     |  14 +++
>>>  include/block/block_int-common.h |   5 +
>>>  3 files changed, 177 insertions(+)
>>>
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index a9d347292e..17c0b58158 100755
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
>>>          struct {
>>>              struct iovec *iov;
>>>              int niov;
>>> +            int64_t *append_sector;
>>
>> This should be added as part of patch 2. You do not need this to track
>> the wp of zones in this patch.
>>
>>>          } io;
>>>          struct {
>>>              uint64_t cmd;
>>> @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
>>>          struct {
>>>              unsigned long zone_op;
>>>              const char *zone_op_name;
>>> +            bool all;
>>>          } zone_mgmt;
>>>      };
>>>  } RawPosixAIOData;
>>> @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
>>>  #endif
>>>  }
>>>
>>> +#if defined(CONFIG_BLKZONED)
>>> +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>>
>> Nit: It would seem more natural to have the fd argument first...
>>
>>> +                        unsigned int nrz) {
>>> +    struct blk_zone *blkz;
>>> +    int64_t rep_size;
>>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
>>> +    int ret, n = 0, i = 0;
>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>> +    g_autofree struct blk_zone_report *rep = NULL;
>>> +
>>> +    rep = g_malloc(rep_size);
>>> +    blkz = (struct blk_zone *)(rep + 1);
>>> +    while (n < nrz) {
>>> +        memset(rep, 0, rep_size);
>>> +        rep->sector = sector;
>>> +        rep->nr_zones = nrz - n;
>>> +
>>> +        do {
>>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
>>> +        } while (ret != 0 && errno == EINTR);
>>> +        if (ret != 0) {
>>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
>>> +                    fd, offset, errno);
>>> +            return -errno;
>>> +        }
>>> +
>>> +        if (!rep->nr_zones) {
>>> +            break;
>>> +        }
>>> +
>>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
>>> +            /*
>>> +             * The wp tracking cares only about sequential writes required and
>>> +             * sequential write preferred zones so that the wp can advance to
>>> +             * the right location.
>>> +             * Use the most significant bit of the wp location to indicate the
>>> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
>>> +             */
>>> +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
>>
>> Double negation... This can simply be:
>>
>> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
>>
>>> +                wps->wp[i] += 1ULL << 63;
>>
>> No need for the += here. This can be "=".
>>
>>> +            } else {
>>> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
>>> +            }
>>> +        }
>>> +        sector = blkz[i-1].start + blkz[i-1].len;
>>
>> spaces missing around the "-" in the "i-1" expressions.
>>
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
>>
>> Same nit as above: fd being the first argument would be a little more
>> natural in my opinion.
>>
>>> +                            unsigned int nrz) {
>>> +    qemu_mutex_lock(&wps->lock);
>>> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
>>> +        error_report("report zone wp failed");
>>> +        return;
>>
>> You are leacking the lock here. Remove the return. Also, given that
>> get_zones_wp() already prints a message if report fails, I do not think
>> the message here is useful.
>>
>> Also, why is this function void typed ? How can the caller know if the
>> update succeeded or not ?
> 
> Update failures mean get_zones_wp() fails and that will be reported by
> error_report. The error message indicates updates fail not reports
> fail. Maybe modifying the message suffices error checking?
> 
> +    qemu_mutex_lock(&wps->lock);
> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> +        error_report("update zone wp failed");
> +    }
> +    qemu_mutex_unlock(&wps->lock);
> 
> 
>>
>>> +    }
>>> +    qemu_mutex_unlock(&wps->lock);
>>> +}
>>> +#endif
>>> +
>>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>>  {
>>>      BDRVRawState *s = bs->opaque;
>>> @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
>>>              return;
>>>          }
>>> +
>>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
>>> +        if (ret >= 0) {
>>> +            bs->bl.write_granularity = ret;
>>> +        }
>>
>> Why is this change here ? Shouldn't this be part of the previous series
>> "Add support for zoned device" ?
> 
> Because only zone append uses write_granularity to check the iovector
> size alignment. The previous series doesn't use this field.

Then move this to patch 2. This should not be in this patch since you are not
dealing with zone append yet.

> 
>>
>>> +
>>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
>>> +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
>>> +            error_report("report wps failed");
>>> +            g_free(bs->bl.wps);
>>> +            return;
>>> +        }
>>> +        qemu_mutex_init(&bs->bl.wps->lock);
>>>      }
>>>  }
>>>
>>> @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
>>>      ssize_t nbytes;
>>>      char *buf;
>>>
>>> +    /*
>>> +     * The offset of regular writes, append writes is the wp location
>>> +     * of that zone.
>>> +     */
>>> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +        if (aiocb->bs->bl.zone_size > 0) {
>>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
>>> +            qemu_mutex_lock(&wps->lock);
>>> +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
>>> +                                        aiocb->bs->bl.zone_size];
>>> +            qemu_mutex_unlock(&wps->lock);
>>> +        }
>>
>> I do not understand this hunk at all. What is this trying to do ? zone
>> append support goes into patch 2. You are overwritting the user
>> specified aio offset using the tracked wp value. That could result in a
>> successfull write even if the user sent an unaligned write command. That
>> is bad.
> 
> Ok, regular writes and append writes got mixed up when I changed the
> offset to the wp of that zone.
> 
>>
>> Here you should only be tracking the write pointer, so increment
>> wps->wp[index], which you do below.
> 
> Understood. Will move it to the next patch.

No ! You should not change the aio offset for regular writes. Otherwise you may
hide errors for bad commands from the guest by having them succeed :)
aio offset change should be done ONLY for zone append, not for regular writes.

> 
>>
>>> +    }
>>> +
>>>      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
>>>          /*
>>>           * If there is just a single buffer, and it is properly aligned
>>> @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
>>>
>>>  out:
>>>      if (nbytes == aiocb->aio_nbytes) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
>>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
>>> +            if (wps) {
>>> +                qemu_mutex_lock(&wps->lock);
>>> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
>>> +                    uint64_t wend_offset =
>>> +                            aiocb->aio_offset + aiocb->aio_nbytes;
>>> +                    /* Advance the wp if needed */
>>> +                    if (wend_offset > wps->wp[index]){
>>> +                        wps->wp[index] = wend_offset;
>>> +                    }
>>> +                }
>>> +                qemu_mutex_unlock(&wps->lock);
>>> +            }
>>> +        }
>>> +#endif
>>>          return 0;
>>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
>>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> @@ -1736,6 +1844,12 @@ out:
>>>          }
>>>      } else {
>>>          assert(nbytes < 0);
>>> +#if defined(CONFIG_BLKZONED)
>>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
>>> +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>>> +                            aiocb->bs->bl.nr_zones);
>>
>> You only need to update the target zone of the aio, not all zones.
>> Updating all zones is actually a bug as wp[] entries for other zones may
>> be larger than the device reported wp if there are other write aios in
>> flight. So the last argument must be "1" here.
> 
> Ok, I understood now.
> 
>>
>>> +        }
>>> +#endif
>>>          return nbytes;
>>>      }
>>>  }
>>> @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
>>>  #if defined(CONFIG_BLKZONED)
>>>  static int handle_aiocb_zone_mgmt(void *opaque) {
>>>      RawPosixAIOData *aiocb = opaque;
>>> +    BlockDriverState *bs = aiocb->bs;
>>>      int fd = aiocb->aio_fildes;
>>>      int64_t sector = aiocb->aio_offset / 512;
>>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
>>> +    uint64_t wend_offset;
>>>      struct blk_zone_range range;
>>>      int ret;
>>>
>>
>> Why the blank line here ?
> 
> For readability, separate it from the execution part.

But the following lines are variable declarations. I personally prefer
declarations to stay together before the code :)

> 
>>
>>> +    BlockZoneWps *wps = bs->bl.wps;
>>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
>>> +
>>>      /* Execute the operation */
>>>      range.sector = sector;
>>>      range.nr_sectors = nr_sectors;
>>> @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
>>>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
>>>      } while (ret != 0 && errno == EINTR);
>>>      if (ret != 0) {
>>> +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
>>> +                        aiocb->bs->bl.nr_zones);
>>
>> You need only to update the range of zones that was specified for the
>> management option, not all zones. So you must specify the zone
>> management aio offset and size/zone_size here.
>>
>>>          ret = -errno;
>>>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
>>>                       ret);
>>>          return ret;
>>>      }
>>> +
>>> +    qemu_mutex_lock(&wps->lock);
>>> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
>>> +         /*
>>> +         * The zoned device allows the last zone smaller that the zone size.
>>> +         */
>>
>> comment indentation is off.
>>
>>> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
>>> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
>>> +        } else {
>>> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
>>> +        }
>>> +
>>> +        if (aiocb->aio_offset != wps->wp[index] &&
>>> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
>>
>> I do not understand the condition here. Why do you have
>> "aiocb->aio_offset != wps->wp[index]" ?
> 
> It is intended for zone state checks. aio_offset (= start byte of that
> zone) = wp means this zone is empty. Only non-empty zones will be
> reset.

It is not very natural to use an input from the user (the guest) to check the
state of something that you (qemu) maintain internally and hide from the user.
You should do such a test with a small helper like this:

static bool zone_is_empty(bl, index)
{
	return bl->wps->wp[index] == index * bl->zone_size;
}

And note that this will ALWAYS return false for conventional zones.
You are not checking for conventional zones either. Any zone management function
should be immediately failed if addressed to a conventional zone. That is
missing. You need a:

if (BDRV_ZT_IS_CONV(wps->wp[index]) && "this is not a zone reset all op")
	return -EIO; /* or similar... */

at the beginning of handle_aiocb_zone_mgmt().

> 
>>
>>> +            if (aiocb->zone_mgmt.all) {
>>
>> This is the only place where you use this all boolean field. For
>> simplicity, I would drop this field completely and test that
>> aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
>> a reset all zones operation.
> 
> Right, the capacity field makes it possible. I'll drop it.
> 
>>
>>> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
>>> +                    wps->wp[i] = i * bs->bl.zone_size;
>>
>> You are not handling conventional zones here. For conventional zones,
>> you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
>> always return false after this.
> 
> Right, will add a condition line here:
> + if (! BDRV_ZT_IS_CONV(wps->wp[i]))

You need:

if (BDRV_ZT_IS_CONV(wps->wp[i]))
    continue;

as the first lines inside the for loop.


> 
>>
>>> +                }
>>> +            } else {
>>> +                wps->wp[index] = aiocb->aio_offset;
>>> +            }
>>> +        } else if (aiocb->aio_offset != wps->wp[index] &&
>>> +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
>>
>> Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?
> 
> This should be wend_offset != wps->wp[index]. It means if this zone is
> full, no need to finish it.

Nope, this does not mean the zone is full. Full condition would be:

wps->wp[index] >= index * bl->zone_size + zone_cap

But you do not have zone cap per zone (remember that zone capacity is per zone
and may differ between zones)... You could add it to the wp array, but that will
make it larger for not much benefit. Since finishing a zone that is already
full is a very rare case, optimizing for it is not valuable. So simply issue the
zone finish operation. It will be a no-op on the host device if the zone is
already full. No big deal !

> 
>>
>>> +            wps->wp[index] = wend_offset;
>>> +        }
>>> +    }
>>> +    qemu_mutex_unlock(&wps->lock);
>>> +
>>>      return ret;
>>>  }
>>>  #endif
>>> @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
>>>      BDRVRawState *s = bs->opaque;
>>>
>>>      if (s->fd >= 0) {
>>> +#if defined(CONFIG_BLKZONED)
>>> +        if (bs->bl.wps) {
>>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
>>> +            g_free(bs->bl.wps);
>>> +        }
>>> +#endif
>>>          qemu_close(s->fd);
>>>          s->fd = -1;
>>>      }
>>> @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>      int64_t zone_size, zone_size_mask;
>>>      const char *zone_op_name;
>>>      unsigned long zone_op;
>>> +    bool is_all = false;
>>>
>>>      zone_size = bs->bl.zone_size;
>>>      zone_size_mask = zone_size - 1;
>>> @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>      case BLK_ZO_RESET_ALL:
>>>          zone_op_name = "BLKRESETZONE";
>>>          zone_op = BLKRESETZONE;
>>> +        is_all = true;
>>>          break;
>>>      default:
>>>          g_assert_not_reached();
>>> @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>>>          .zone_mgmt  = {
>>>              .zone_op = zone_op,
>>>              .zone_op_name = zone_op_name,
>>> +            .all = is_all,
>>>          },
>>>      };
>>>
>>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>>> index 882de6825e..b8b2dba64a 100644
>>> --- a/include/block/block-common.h
>>> +++ b/include/block/block-common.h
>>> @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
>>>      BlockZoneCondition cond;
>>>  } BlockZoneDescriptor;
>>>
>>> +/*
>>> + * Track write pointers of a zone in bytes.
>>> + */
>>> +typedef struct BlockZoneWps {
>>> +    QemuMutex lock;
>>> +    uint64_t wp[];
>>> +} BlockZoneWps;
>>> +
>>>  typedef struct BlockDriverInfo {
>>>      /* in bytes, 0 if irrelevant */
>>>      int cluster_size;
>>> @@ -206,6 +214,12 @@ typedef enum {
>>>  #define BDRV_SECTOR_BITS   9
>>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
>>>
>>> +/*
>>> + * Get the first most significant bit of wp. If it is zero, then
>>> + * the zone type is SWR.
>>> + */
>>> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
>>> +
>>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
>>>                                             INT_MAX >> BDRV_SECTOR_BITS)
>>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
>>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>>> index 37dddc603c..59c2d1316d 100644
>>> --- a/include/block/block_int-common.h
>>> +++ b/include/block/block_int-common.h
>>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
>>>
>>>      /* device capacity expressed in bytes */
>>>      int64_t capacity;
>>> +
>>> +    /* array of write pointers' location of each zone in the zoned device. */
>>> +    BlockZoneWps *wps;
>>> +
>>> +    int64_t write_granularity;
>>>  } BlockLimits;
>>>
>>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
>>
>> --
>> Damien Le Moal
>> Western Digital Research
>>
Sam Li Oct. 13, 2022, 7:46 a.m. UTC | #4
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 15:30写道:
>
> On 2022/10/13 16:08, Sam Li wrote:
> > Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年10月13日周四 13:13写道:
> >>
> >> On 10/10/22 11:33, Sam Li wrote:
> >>> Since Linux doesn't have a user API to issue zone append operations to
> >>> zoned devices from user space, the file-posix driver is modified to add
> >>> zone append emulation using regular writes. To do this, the file-posix
> >>> driver tracks the wp location of all zones of the device. It uses an
> >>> array of uint64_t. The most significant bit of each wp location indicates
> >>> if the zone type is conventional zones.
> >>>
> >>> The zones wp can be changed due to the following operations issued:
> >>> - zone reset: change the wp to the start offset of that zone
> >>> - zone finish: change to the end location of that zone
> >>> - write to a zone
> >>> - zone append
> >>>
> >>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> >>> ---
> >>>  block/file-posix.c               | 158 +++++++++++++++++++++++++++++++
> >>>  include/block/block-common.h     |  14 +++
> >>>  include/block/block_int-common.h |   5 +
> >>>  3 files changed, 177 insertions(+)
> >>>
> >>> diff --git a/block/file-posix.c b/block/file-posix.c
> >>> index a9d347292e..17c0b58158 100755
> >>> --- a/block/file-posix.c
> >>> +++ b/block/file-posix.c
> >>> @@ -206,6 +206,7 @@ typedef struct RawPosixAIOData {
> >>>          struct {
> >>>              struct iovec *iov;
> >>>              int niov;
> >>> +            int64_t *append_sector;
> >>
> >> This should be added as part of patch 2. You do not need this to track
> >> the wp of zones in this patch.
> >>
> >>>          } io;
> >>>          struct {
> >>>              uint64_t cmd;
> >>> @@ -226,6 +227,7 @@ typedef struct RawPosixAIOData {
> >>>          struct {
> >>>              unsigned long zone_op;
> >>>              const char *zone_op_name;
> >>> +            bool all;
> >>>          } zone_mgmt;
> >>>      };
> >>>  } RawPosixAIOData;
> >>> @@ -1331,6 +1333,67 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
> >>>  #endif
> >>>  }
> >>>
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
> >>
> >> Nit: It would seem more natural to have the fd argument first...
> >>
> >>> +                        unsigned int nrz) {
> >>> +    struct blk_zone *blkz;
> >>> +    int64_t rep_size;
> >>> +    int64_t sector = offset >> BDRV_SECTOR_BITS;
> >>> +    int ret, n = 0, i = 0;
> >>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>> +    g_autofree struct blk_zone_report *rep = NULL;
> >>> +
> >>> +    rep = g_malloc(rep_size);
> >>> +    blkz = (struct blk_zone *)(rep + 1);
> >>> +    while (n < nrz) {
> >>> +        memset(rep, 0, rep_size);
> >>> +        rep->sector = sector;
> >>> +        rep->nr_zones = nrz - n;
> >>> +
> >>> +        do {
> >>> +            ret = ioctl(fd, BLKREPORTZONE, rep);
> >>> +        } while (ret != 0 && errno == EINTR);
> >>> +        if (ret != 0) {
> >>> +            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> >>> +                    fd, offset, errno);
> >>> +            return -errno;
> >>> +        }
> >>> +
> >>> +        if (!rep->nr_zones) {
> >>> +            break;
> >>> +        }
> >>> +
> >>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> >>> +            /*
> >>> +             * The wp tracking cares only about sequential writes required and
> >>> +             * sequential write preferred zones so that the wp can advance to
> >>> +             * the right location.
> >>> +             * Use the most significant bit of the wp location to indicate the
> >>> +             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
> >>> +             */
> >>> +            if (!(blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL)) {
> >>
> >> Double negation... This can simply be:
> >>
> >> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> >>
> >>> +                wps->wp[i] += 1ULL << 63;
> >>
> >> No need for the += here. This can be "=".
> >>
> >>> +            } else {
> >>> +                wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> >>> +            }
> >>> +        }
> >>> +        sector = blkz[i-1].start + blkz[i-1].len;
> >>
> >> spaces missing around the "-" in the "i-1" expressions.
> >>
> >>> +    }
> >>> +
> >>> +    return 0;
> >>> +}
> >>> +
> >>> +static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
> >>
> >> Same nit as above: fd being the first argument would be a little more
> >> natural in my opinion.
> >>
> >>> +                            unsigned int nrz) {
> >>> +    qemu_mutex_lock(&wps->lock);
> >>> +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> >>> +        error_report("report zone wp failed");
> >>> +        return;
> >>
> >> You are leacking the lock here. Remove the return. Also, given that
> >> get_zones_wp() already prints a message if report fails, I do not think
> >> the message here is useful.
> >>
> >> Also, why is this function void typed ? How can the caller know if the
> >> update succeeded or not ?
> >
> > Update failures mean get_zones_wp() fails and that will be reported by
> > error_report. The error message indicates updates fail not reports
> > fail. Maybe modifying the message suffices error checking?
> >
> > +    qemu_mutex_lock(&wps->lock);
> > +    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
> > +        error_report("update zone wp failed");
> > +    }
> > +    qemu_mutex_unlock(&wps->lock);
> >
> >
> >>
> >>> +    }
> >>> +    qemu_mutex_unlock(&wps->lock);
> >>> +}
> >>> +#endif
> >>> +
> >>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>  {
> >>>      BDRVRawState *s = bs->opaque;
> >>> @@ -1414,6 +1477,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>              error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
> >>>              return;
> >>>          }
> >>> +
> >>> +        ret = get_sysfs_long_val(&st, "physical_block_size");
> >>> +        if (ret >= 0) {
> >>> +            bs->bl.write_granularity = ret;
> >>> +        }
> >>
> >> Why is this change here ? Shouldn't this be part of the previous series
> >> "Add support for zoned device" ?
> >
> > Because only zone append uses write_granularity to check the iovector
> > size alignment. The previous series doesn't use this field.
>
> Then move this to patch 2. This should not be in this patch since you are not
> dealing with zone append yet.
>
> >
> >>
> >>> +
> >>> +        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
> >>> +        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
> >>> +            error_report("report wps failed");
> >>> +            g_free(bs->bl.wps);
> >>> +            return;
> >>> +        }
> >>> +        qemu_mutex_init(&bs->bl.wps->lock);
> >>>      }
> >>>  }
> >>>
> >>> @@ -1651,6 +1727,20 @@ static int handle_aiocb_rw(void *opaque)
> >>>      ssize_t nbytes;
> >>>      char *buf;
> >>>
> >>> +    /*
> >>> +     * The offset of regular writes, append writes is the wp location
> >>> +     * of that zone.
> >>> +     */
> >>> +    if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +        if (aiocb->bs->bl.zone_size > 0) {
> >>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> >>> +            qemu_mutex_lock(&wps->lock);
> >>> +            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
> >>> +                                        aiocb->bs->bl.zone_size];
> >>> +            qemu_mutex_unlock(&wps->lock);
> >>> +        }
> >>
> >> I do not understand this hunk at all. What is this trying to do ? zone
> >> append support goes into patch 2. You are overwritting the user
> >> specified aio offset using the tracked wp value. That could result in a
> >> successfull write even if the user sent an unaligned write command. That
> >> is bad.
> >
> > Ok, regular writes and append writes got mixed up when I changed the
> > offset to the wp of that zone.
> >
> >>
> >> Here you should only be tracking the write pointer, so increment
> >> wps->wp[index], which you do below.
> >
> > Understood. Will move it to the next patch.
>
> No ! You should not change the aio offset for regular writes. Otherwise you may
> hide errors for bad commands from the guest by having them succeed :)
> aio offset change should be done ONLY for zone append, not for regular writes.
>
> >
> >>
> >>> +    }
> >>> +
> >>>      if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
> >>>          /*
> >>>           * If there is just a single buffer, and it is properly aligned
> >>> @@ -1725,6 +1815,24 @@ static int handle_aiocb_rw(void *opaque)
> >>>
> >>>  out:
> >>>      if (nbytes == aiocb->aio_nbytes) {
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +            BlockZoneWps *wps = aiocb->bs->bl.wps;
> >>> +            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
> >>> +            if (wps) {
> >>> +                qemu_mutex_lock(&wps->lock);
> >>> +                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> >>> +                    uint64_t wend_offset =
> >>> +                            aiocb->aio_offset + aiocb->aio_nbytes;
> >>> +                    /* Advance the wp if needed */
> >>> +                    if (wend_offset > wps->wp[index]){
> >>> +                        wps->wp[index] = wend_offset;
> >>> +                    }
> >>> +                }
> >>> +                qemu_mutex_unlock(&wps->lock);
> >>> +            }
> >>> +        }
> >>> +#endif
> >>>          return 0;
> >>>      } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
> >>>          if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> @@ -1736,6 +1844,12 @@ out:
> >>>          }
> >>>      } else {
> >>>          assert(nbytes < 0);
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (aiocb->aio_type & QEMU_AIO_WRITE) {
> >>> +            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >>> +                            aiocb->bs->bl.nr_zones);
> >>
> >> You only need to update the target zone of the aio, not all zones.
> >> Updating all zones is actually a bug as wp[] entries for other zones may
> >> be larger than the device reported wp if there are other write aios in
> >> flight. So the last argument must be "1" here.
> >
> > Ok, I understood now.
> >
> >>
> >>> +        }
> >>> +#endif
> >>>          return nbytes;
> >>>      }
> >>>  }
> >>> @@ -2022,12 +2136,17 @@ static int handle_aiocb_zone_report(void *opaque) {
> >>>  #if defined(CONFIG_BLKZONED)
> >>>  static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>      RawPosixAIOData *aiocb = opaque;
> >>> +    BlockDriverState *bs = aiocb->bs;
> >>>      int fd = aiocb->aio_fildes;
> >>>      int64_t sector = aiocb->aio_offset / 512;
> >>>      int64_t nr_sectors = aiocb->aio_nbytes / 512;
> >>> +    uint64_t wend_offset;
> >>>      struct blk_zone_range range;
> >>>      int ret;
> >>>
> >>
> >> Why the blank line here ?
> >
> > For readability, separate it from the execution part.
>
> But the following lines are variable declarations. I personally prefer
> declarations to stay together before the code :)

Ok, will change it.

>
> >
> >>
> >>> +    BlockZoneWps *wps = bs->bl.wps;
> >>> +    int index = aiocb->aio_offset / bs->bl.zone_size;
> >>> +
> >>>      /* Execute the operation */
> >>>      range.sector = sector;
> >>>      range.nr_sectors = nr_sectors;
> >>> @@ -2035,11 +2154,41 @@ static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>          ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
> >>>      } while (ret != 0 && errno == EINTR);
> >>>      if (ret != 0) {
> >>> +        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
> >>> +                        aiocb->bs->bl.nr_zones);
> >>
> >> You need only to update the range of zones that was specified for the
> >> management option, not all zones. So you must specify the zone
> >> management aio offset and size/zone_size here.
> >>
> >>>          ret = -errno;
> >>>          error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
> >>>                       ret);
> >>>          return ret;
> >>>      }
> >>> +
> >>> +    qemu_mutex_lock(&wps->lock);
> >>> +    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
> >>> +         /*
> >>> +         * The zoned device allows the last zone smaller that the zone size.
> >>> +         */
> >>
> >> comment indentation is off.
> >>
> >>> +        if (aiocb->aio_nbytes < bs->bl.zone_size) {
> >>> +            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
> >>> +        } else {
> >>> +            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
> >>> +        }
> >>> +
> >>> +        if (aiocb->aio_offset != wps->wp[index] &&
> >>> +            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
> >>
> >> I do not understand the condition here. Why do you have
> >> "aiocb->aio_offset != wps->wp[index]" ?
> >
> > It is intended for zone state checks. aio_offset (= start byte of that
> > zone) = wp means this zone is empty. Only non-empty zones will be
> > reset.
>
> That is not very natural to use an input from the user (the guest) to check the
> state of something that you (qemu) maintains internally and hides to the user.
> You should do such test with a small helpers like this:
>
> static bool zone_is_empty(bl, index)
> {
>         return bl->wps.wp[index} == index * bl->zone_size;
> }

Ok.

>
> And note that this will ALWAYS return false for conventional zones.
> You are not checking for conventional zones either. Any zone management function
> should be immediately failed if addressed to a conventional zone. That is
> missing. You need a:
>
> if (BDRV_ZT_IS_CONV(wps->wp[index] && "this is not a zone reset all op")
>         return -EIO; /* or similar... */
>
> at the beginning of handle_aiocb_zone_mgmt().

(Just adding a note:)
Although this check belongs in the block layer API patches, it only becomes
possible once wps[] is introduced.

>
> >
> >>
> >>> +            if (aiocb->zone_mgmt.all) {
> >>
> >> This is the only place where you use this all boolean field. For
> >> simplicity, I would drop this field completely and test that
> >> aiocb->aio_offset == 0 && aiocb->aio_nbytes == bs->bl.capacity to detect
> >> a reset all zones operation.
> >
> > Right, the capacity field makes it possible. I'll drop it.
> >
> >>
> >>> +                for (int i = 0; i < bs->bl.nr_zones; ++i) {
> >>> +                    wps->wp[i] = i * bs->bl.zone_size;
> >>
> >> You are not handling conventional zones here. For conventional zones,
> >> you should not change the value. Otherwise, BDRV_ZT_IS_CONV() will
> >> always return false after this.
> >
> > Right, will add a condition line here:
> > + if (! BDRV_ZT_IS_CONV(wps->wp[i]))
>
> You need:
>
> if (BDRV_ZT_IS_CONV(wps->wp[i]))
>     continue;
>
> as the first lines inside the for loop.

Trivial: these two forms look like they have the same behavior:
 if (! BDRV_ZT_IS_CONV(wps->wp[i])) {
    /* change pointers */
}

if (BDRV_ZT_IS_CONV(wps->wp[i]))
    continue;
/* change pointers */

>
>
> >
> >>
> >>> +                }
> >>> +            } else {
> >>> +                wps->wp[index] = aiocb->aio_offset;
> >>> +            }
> >>> +        } else if (aiocb->aio_offset != wps->wp[index] &&
> >>> +            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
> >>
> >> Same comment here. Why do you have "aiocb->aio_offset != wps->wp[index]" ?
> >
> > This should be wend_offset != wps->wp[index]. It means that if the zone is
> > already full, there is no need to finish it.
>
> Nope, this does not mean the zone is full. Full condition would be:
>
> wps->wp[index] >= index * bl->zone_size + zone_cap
>
> But you do not have a zone capacity per zone (remember that zone capacity is
> per zone and may differ between zones)... You could add it to the wp array, but
> that would make it larger for not much benefit. Since finishing a zone that is
> already full is a very rare case, optimizing for it is not valuable. So simply
> issue the zone finish operation. It will be a no-op on the host device if the
> zone is already full. No big deal!

I see.

Thanks!

>
> >
> >>
> >>> +            wps->wp[index] = wend_offset;
> >>> +        }
> >>> +    }
> >>> +    qemu_mutex_unlock(&wps->lock);
> >>> +
> >>>      return ret;
> >>>  }
> >>>  #endif
> >>> @@ -2480,6 +2629,12 @@ static void raw_close(BlockDriverState *bs)
> >>>      BDRVRawState *s = bs->opaque;
> >>>
> >>>      if (s->fd >= 0) {
> >>> +#if defined(CONFIG_BLKZONED)
> >>> +        if (bs->bl.wps) {
> >>> +            qemu_mutex_destroy(&bs->bl.wps->lock);
> >>> +            g_free(bs->bl.wps);
> >>> +        }
> >>> +#endif
> >>>          qemu_close(s->fd);
> >>>          s->fd = -1;
> >>>      }
> >>> @@ -3278,6 +3433,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>      int64_t zone_size, zone_size_mask;
> >>>      const char *zone_op_name;
> >>>      unsigned long zone_op;
> >>> +    bool is_all = false;
> >>>
> >>>      zone_size = bs->bl.zone_size;
> >>>      zone_size_mask = zone_size - 1;
> >>> @@ -3314,6 +3470,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>      case BLK_ZO_RESET_ALL:
> >>>          zone_op_name = "BLKRESETZONE";
> >>>          zone_op = BLKRESETZONE;
> >>> +        is_all = true;
> >>>          break;
> >>>      default:
> >>>          g_assert_not_reached();
> >>> @@ -3328,6 +3485,7 @@ static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> >>>          .zone_mgmt  = {
> >>>              .zone_op = zone_op,
> >>>              .zone_op_name = zone_op_name,
> >>> +            .all = is_all,
> >>>          },
> >>>      };
> >>>
> >>> diff --git a/include/block/block-common.h b/include/block/block-common.h
> >>> index 882de6825e..b8b2dba64a 100644
> >>> --- a/include/block/block-common.h
> >>> +++ b/include/block/block-common.h
> >>> @@ -93,6 +93,14 @@ typedef struct BlockZoneDescriptor {
> >>>      BlockZoneCondition cond;
> >>>  } BlockZoneDescriptor;
> >>>
> >>> +/*
> >>> + * Track write pointers of a zone in bytes.
> >>> + */
> >>> +typedef struct BlockZoneWps {
> >>> +    QemuMutex lock;
> >>> +    uint64_t wp[];
> >>> +} BlockZoneWps;
> >>> +
> >>>  typedef struct BlockDriverInfo {
> >>>      /* in bytes, 0 if irrelevant */
> >>>      int cluster_size;
> >>> @@ -206,6 +214,12 @@ typedef enum {
> >>>  #define BDRV_SECTOR_BITS   9
> >>>  #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
> >>>
> >>> +/*
> >>> + * Get the first most significant bit of wp. If it is zero, then
> >>> + * the zone type is SWR.
> >>> + */
> >>> +#define BDRV_ZT_IS_CONV(wp)    (wp & (1ULL << 63))
> >>> +
> >>>  #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
> >>>                                             INT_MAX >> BDRV_SECTOR_BITS)
> >>>  #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
> >>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> >>> index 37dddc603c..59c2d1316d 100644
> >>> --- a/include/block/block_int-common.h
> >>> +++ b/include/block/block_int-common.h
> >>> @@ -857,6 +857,11 @@ typedef struct BlockLimits {
> >>>
> >>>      /* device capacity expressed in bytes */
> >>>      int64_t capacity;
> >>> +
> >>> +    /* array of write pointers' location of each zone in the zoned device. */
> >>> +    BlockZoneWps *wps;
> >>> +
> >>> +    int64_t write_granularity;
> >>>  } BlockLimits;
> >>>
> >>>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> >>
> >> --
> >> Damien Le Moal
> >> Western Digital Research
> >>
>
> --
> Damien Le Moal
> Western Digital Research
>
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index a9d347292e..17c0b58158 100755
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -206,6 +206,7 @@  typedef struct RawPosixAIOData {
         struct {
             struct iovec *iov;
             int niov;
+            int64_t *append_sector;
         } io;
         struct {
             uint64_t cmd;
@@ -226,6 +227,7 @@  typedef struct RawPosixAIOData {
         struct {
             unsigned long zone_op;
             const char *zone_op_name;
+            bool all;
         } zone_mgmt;
     };
 } RawPosixAIOData;
@@ -1331,6 +1333,67 @@  static int hdev_get_max_segments(int fd, struct stat *st) {
 #endif
 }
 
+#if defined(CONFIG_BLKZONED)
+/*
+ * Load the write pointer (in bytes) of up to @nrz zones, starting with the
+ * zone at byte @offset, into wps->wp[0..]. Conventional zones only get the
+ * most significant bit set, which is what BDRV_ZT_IS_CONV() tests.
+ * Returns 0 on success or -errno if BLKREPORTZONE fails.
+ */
+static int get_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
+                        unsigned int nrz) {
+    struct blk_zone *blkz;
+    int64_t rep_size;
+    int64_t sector = offset >> BDRV_SECTOR_BITS;
+    int ret, n = 0, i = 0;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+
+    rep = g_malloc(rep_size);
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = sector;
+        rep->nr_zones = nrz - n;
+
+        do {
+            ret = ioctl(fd, BLKREPORTZONE, rep);
+        } while (ret != 0 && errno == EINTR);
+        if (ret != 0) {
+            ret = -errno; /* save it: error_report() may clobber errno */
+            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
+                    fd, offset, -ret);
+            return ret;
+        }
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        /* n indexes wps->wp[] across reports, i only the current report. */
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+                /* Assign, do not add: a refresh must not flip the flag. */
+                wps->wp[n] = 1ULL << 63;
+            } else {
+                wps->wp[n] = blkz[i].wp << BDRV_SECTOR_BITS;
+            }
+        }
+        sector = blkz[i-1].start + blkz[i-1].len;
+    }
+    return 0;
+}
+
+/* Re-read the zone write pointers from the device while holding wps->lock. */
+static void update_zones_wp(int64_t offset, int fd, BlockZoneWps *wps,
+                            unsigned int nrz) {
+    qemu_mutex_lock(&wps->lock);
+    if (get_zones_wp(offset, fd, wps, nrz) < 0) {
+        error_report("report zone wp failed"); /* fall through to unlock */
+    }
+    qemu_mutex_unlock(&wps->lock);
+}
+#endif
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
@@ -1414,6 +1477,19 @@  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
             error_report("Invalid device capacity %" PRId64 " bytes ", bs->bl.capacity);
             return;
         }
+
+        ret = get_sysfs_long_val(&st, "physical_block_size");
+        if (ret >= 0) {
+            bs->bl.write_granularity = ret;
+        }
+
+        bs->bl.wps = g_malloc(sizeof(BlockZoneWps) + sizeof(int64_t) * ret);
+        if (get_zones_wp(0, s->fd, bs->bl.wps, ret) < 0){
+            error_report("report wps failed");
+            g_free(bs->bl.wps);
+            return;
+        }
+        qemu_mutex_init(&bs->bl.wps->lock);
     }
 }
 
@@ -1651,6 +1727,20 @@  static int handle_aiocb_rw(void *opaque)
     ssize_t nbytes;
     char *buf;
 
+    /*
+     * The offset of regular writes, append writes is the wp location
+     * of that zone.
+     */
+    if (aiocb->aio_type & QEMU_AIO_WRITE) {
+        if (aiocb->bs->bl.zone_size > 0) {
+            BlockZoneWps *wps = aiocb->bs->bl.wps;
+            qemu_mutex_lock(&wps->lock);
+            aiocb->aio_offset = wps->wp[aiocb->aio_offset /
+                                        aiocb->bs->bl.zone_size];
+            qemu_mutex_unlock(&wps->lock);
+        }
+    }
+
     if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
         /*
          * If there is just a single buffer, and it is properly aligned
@@ -1725,6 +1815,24 @@  static int handle_aiocb_rw(void *opaque)
 
 out:
     if (nbytes == aiocb->aio_nbytes) {
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            BlockZoneWps *wps = aiocb->bs->bl.wps;
+            int index = aiocb->aio_offset / aiocb->bs->bl.zone_size;
+            if (wps) {
+                qemu_mutex_lock(&wps->lock);
+                if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+                    uint64_t wend_offset =
+                            aiocb->aio_offset + aiocb->aio_nbytes;
+                    /* Advance the wp if needed */
+                    if (wend_offset > wps->wp[index]){
+                        wps->wp[index] = wend_offset;
+                    }
+                }
+                qemu_mutex_unlock(&wps->lock);
+            }
+        }
+#endif
         return 0;
     } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
         if (aiocb->aio_type & QEMU_AIO_WRITE) {
@@ -1736,6 +1844,12 @@  out:
         }
     } else {
         assert(nbytes < 0);
+#if defined(CONFIG_BLKZONED)
+        if (aiocb->aio_type & QEMU_AIO_WRITE) {
+            update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
+                            aiocb->bs->bl.nr_zones);
+        }
+#endif
         return nbytes;
     }
 }
@@ -2022,12 +2136,17 @@  static int handle_aiocb_zone_report(void *opaque) {
 #if defined(CONFIG_BLKZONED)
 static int handle_aiocb_zone_mgmt(void *opaque) {
     RawPosixAIOData *aiocb = opaque;
+    BlockDriverState *bs = aiocb->bs;
     int fd = aiocb->aio_fildes;
     int64_t sector = aiocb->aio_offset / 512;
     int64_t nr_sectors = aiocb->aio_nbytes / 512;
+    uint64_t wend_offset;
     struct blk_zone_range range;
     int ret;
 
+    BlockZoneWps *wps = bs->bl.wps;
+    int index = aiocb->aio_offset / bs->bl.zone_size;
+
     /* Execute the operation */
     range.sector = sector;
     range.nr_sectors = nr_sectors;
@@ -2035,11 +2154,41 @@  static int handle_aiocb_zone_mgmt(void *opaque) {
         ret = ioctl(fd, aiocb->zone_mgmt.zone_op, &range);
     } while (ret != 0 && errno == EINTR);
     if (ret != 0) {
+        update_zones_wp(0, aiocb->aio_fildes, aiocb->bs->bl.wps,
+                        aiocb->bs->bl.nr_zones);
         ret = -errno;
         error_report("ioctl %s failed %d", aiocb->zone_mgmt.zone_op_name,
                      ret);
         return ret;
     }
+
+    qemu_mutex_lock(&wps->lock);
+    if (!BDRV_ZT_IS_CONV(wps->wp[index])) {
+         /*
+         * The zoned device allows the last zone smaller that the zone size.
+         */
+        if (aiocb->aio_nbytes < bs->bl.zone_size) {
+            wend_offset = aiocb->aio_offset + aiocb->aio_nbytes;
+        } else {
+            wend_offset = aiocb->aio_offset + bs->bl.zone_size;
+        }
+
+        if (aiocb->aio_offset != wps->wp[index] &&
+            aiocb->zone_mgmt.zone_op == BLKRESETZONE) {
+            if (aiocb->zone_mgmt.all) {
+                for (int i = 0; i < bs->bl.nr_zones; ++i) {
+                    wps->wp[i] = i * bs->bl.zone_size;
+                }
+            } else {
+                wps->wp[index] = aiocb->aio_offset;
+            }
+        } else if (aiocb->aio_offset != wps->wp[index] &&
+            aiocb->zone_mgmt.zone_op == BLKFINISHZONE) {
+            wps->wp[index] = wend_offset;
+        }
+    }
+    qemu_mutex_unlock(&wps->lock);
+
     return ret;
 }
 #endif
@@ -2480,6 +2629,12 @@  static void raw_close(BlockDriverState *bs)
     BDRVRawState *s = bs->opaque;
 
     if (s->fd >= 0) {
+#if defined(CONFIG_BLKZONED)
+        if (bs->bl.wps) {
+            qemu_mutex_destroy(&bs->bl.wps->lock);
+            g_free(bs->bl.wps);
+        }
+#endif
         qemu_close(s->fd);
         s->fd = -1;
     }
@@ -3278,6 +3433,7 @@  static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     int64_t zone_size, zone_size_mask;
     const char *zone_op_name;
     unsigned long zone_op;
+    bool is_all = false;
 
     zone_size = bs->bl.zone_size;
     zone_size_mask = zone_size - 1;
@@ -3314,6 +3470,7 @@  static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
     case BLK_ZO_RESET_ALL:
         zone_op_name = "BLKRESETZONE";
         zone_op = BLKRESETZONE;
+        is_all = true;
         break;
     default:
         g_assert_not_reached();
@@ -3328,6 +3485,7 @@  static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
         .zone_mgmt  = {
             .zone_op = zone_op,
             .zone_op_name = zone_op_name,
+            .all = is_all,
         },
     };
 
diff --git a/include/block/block-common.h b/include/block/block-common.h
index 882de6825e..b8b2dba64a 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -93,6 +93,14 @@  typedef struct BlockZoneDescriptor {
     BlockZoneCondition cond;
 } BlockZoneDescriptor;
 
+/*
+ * Write pointers of the zones of a zoned device, in bytes, together with
+ * the mutex that file-posix takes around its accesses to wp[].
+ */
+typedef struct BlockZoneWps {
+    QemuMutex lock;
+    uint64_t wp[]; /* one entry per zone; MSB set marks a conventional zone */
+} BlockZoneWps;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
@@ -206,6 +214,12 @@  typedef enum {
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 
+/*
+ * Test the most significant bit of a wp entry: non-zero means the zone is
+ * conventional, zero means a sequential (SWR/SWP) zone with a real wp.
+ */
+#define BDRV_ZT_IS_CONV(wp)    ((wp) & (1ULL << 63))
+
 #define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                            INT_MAX >> BDRV_SECTOR_BITS)
 #define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 37dddc603c..59c2d1316d 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -857,6 +857,11 @@  typedef struct BlockLimits {
 
     /* device capacity expressed in bytes */
     int64_t capacity;
+
+    /* array of write pointers' location of each zone in the zoned device. */
+    BlockZoneWps *wps;
+
+    int64_t write_granularity;
 } BlockLimits;
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;