diff mbox

[2/2] Migrate zone cache from RB-Tree to arrays of descriptors

Message ID 20160822043402.8855-3-shaun@tancheff.com
State New, archived
Headers show

Commit Message

Shaun Tancheff Aug. 22, 2016, 4:34 a.m. UTC
Currently the RB-Tree zone cache is fast and flexible. It does
use a rather largish amount of RAM. This model reduces the RAM
required from 120 bytes per zone to 16 bytes per zone with a
moderate transformation of the blk_zone_lookup() api.

This model is predicated on the belief that most variations
on zoned media will follow a pattern of using collections of same
sized zones on a single device. Similar to the pattern of erase
blocks on flash devices being progressively larger: 16K, 64K, ...

The goal is to be able to build a descriptor which is memory
efficient, performant, and flexible.

Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
---
 block/blk-core.c       |    2 +-
 block/blk-sysfs.c      |   31 +-
 block/blk-zoned.c      |  103 +++--
 drivers/scsi/sd.c      |    5 +-
 drivers/scsi/sd.h      |    4 +-
 drivers/scsi/sd_zbc.c  | 1025 +++++++++++++++++++++++++++---------------------
 include/linux/blkdev.h |   82 +++-
 7 files changed, 716 insertions(+), 536 deletions(-)

Comments

Shaun Tancheff Aug. 22, 2016, 5:25 a.m. UTC | #1
On Sun, Aug 21, 2016 at 11:34 PM, Shaun Tancheff <shaun@tancheff.com> wrote:
> Currently the RB-Tree zone cache is fast and flexible. It does
> use a rather largish amount of ram. This model reduces the ram
> required from 120 bytes per zone to 16 bytes per zone with a
> moderate transformation of the blk_zone_lookup() api.
>
> This model is predicated on the belief that most variations
> on zoned media will follow a pattern of using collections of same
> sized zones on a single device. Similar to the pattern of erase
> blocks on flash devices being progressivly larger 16K, 64K, ...
>
> The goal is to be able to build a descriptor which is both memory
> efficient, performant, and flexible.
>
> Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
> ---
>  block/blk-core.c       |    2 +-
>  block/blk-sysfs.c      |   31 +-
>  block/blk-zoned.c      |  103 +++--
>  drivers/scsi/sd.c      |    5 +-
>  drivers/scsi/sd.h      |    4 +-
>  drivers/scsi/sd_zbc.c  | 1025 +++++++++++++++++++++++++++---------------------
>  include/linux/blkdev.h |   82 +++-
>  7 files changed, 716 insertions(+), 536 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 3a9caf7..3b084a8 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -727,7 +727,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
>         INIT_LIST_HEAD(&q->blkg_list);
>  #endif
>  #ifdef CONFIG_BLK_DEV_ZONED
> -       q->zones = RB_ROOT;
> +       q->zones = NULL;
>  #endif
>         INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
>
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 43f441f..ecbd434 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -232,36 +232,7 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
>  #ifdef CONFIG_BLK_DEV_ZONED
>  static ssize_t queue_zoned_show(struct request_queue *q, char *page)
>  {
> -       struct rb_node *node;
> -       struct blk_zone *zone;
> -       ssize_t offset = 0, end = 0;
> -       size_t size = 0, num = 0;
> -       enum blk_zone_type type = BLK_ZONE_TYPE_UNKNOWN;
> -
> -       for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> -               zone = rb_entry(node, struct blk_zone, node);
> -               if (zone->type != type ||
> -                   zone->len != size ||
> -                   end != zone->start) {
> -                       if (size != 0)
> -                               offset += sprintf(page + offset, "%zu\n", num);
> -                       /* We can only store one page ... */
> -                       if (offset + 42 > PAGE_SIZE) {
> -                               offset += sprintf(page + offset, "...\n");
> -                               return offset;
> -                       }
> -                       size = zone->len;
> -                       type = zone->type;
> -                       offset += sprintf(page + offset, "%zu %zu %d ",
> -                                         zone->start, size, type);
> -                       num = 0;
> -                       end = zone->start + size;
> -               } else
> -                       end += zone->len;
> -               num++;
> -       }
> -       offset += sprintf(page + offset, "%zu\n", num);
> -       return offset;
> +       return sprintf(page, "%u\n", q->zones ? 1 : 0);
>  }
>  #endif
>
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index 975e863..338a1af 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -8,63 +8,84 @@
>  #include <linux/kernel.h>
>  #include <linux/module.h>
>  #include <linux/blkdev.h>
> -#include <linux/rbtree.h>
> +#include <linux/vmalloc.h>
>
> -struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t lba)
> +/**
> + * blk_lookup_zone() - Lookup zones
> + * @q: Request Queue
> + * @sector: Location to lookup
> + * @start: Pointer to starting location zone (OUT)
> + * @len: Pointer to length of zone (OUT)
> + * @lock: Pointer to spinlock of zones in owning descriptor (OUT)
> + */
> +struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t sector,
> +                                sector_t *start, sector_t *len,
> +                                spinlock_t **lock)
>  {
> -       struct rb_root *root = &q->zones;
> -       struct rb_node *node = root->rb_node;
> +       int iter;
> +       struct blk_zone *bzone = NULL;
> +       struct zone_wps *zi = q->zones;
> +
> +       *start = 0;
> +       *len = 0;
> +       *lock = NULL;
> +
> +       if (!q->zones)
> +               goto out;
>
> -       while (node) {
> -               struct blk_zone *zone = container_of(node, struct blk_zone,
> -                                                    node);
> +       for (iter = 0; iter < zi->wps_count; iter++) {
> +               if (sector >= zi->wps[iter]->start_lba &&
> +                   sector <  zi->wps[iter]->last_lba) {
> +                       struct contiguous_wps *wp = zi->wps[iter];
> +                       u64 index = (sector - wp->start_lba) / wp->zone_size;
>
> -               if (lba < zone->start)
> -                       node = node->rb_left;
> -               else if (lba >= zone->start + zone->len)
> -                       node = node->rb_right;
> -               else
> -                       return zone;
> +                       if (index >= wp->zone_count) {
> +                               WARN(1, "Impossible index for zone\n");
> +                               goto out;
> +                       }
> +
> +                       bzone = &wp->zones[index];
> +                       *len = wp->zone_size;
> +                       *start = wp->start_lba + (index * wp->zone_size);
> +                       *lock = &wp->lock;
> +               }
>         }
> -       return NULL;
> +
> +out:
> +       return bzone;
>  }
>  EXPORT_SYMBOL_GPL(blk_lookup_zone);
>
> -struct blk_zone *blk_insert_zone(struct request_queue *q, struct blk_zone *data)
> +/**
> + * free_zone_wps() - Free up memory in use by wps
> + * @zi: zone wps array(s).
> + */
> +static void free_zone_wps(struct zone_wps *zi)
>  {
> -       struct rb_root *root = &q->zones;
> -       struct rb_node **new = &(root->rb_node), *parent = NULL;
> +       /* on error free the arrays */
> +       if (zi && zi->wps) {
> +               int ca;
>
> -       /* Figure out where to put new node */
> -       while (*new) {
> -               struct blk_zone *this = container_of(*new, struct blk_zone,
> -                                                    node);
> -               parent = *new;
> -               if (data->start + data->len <= this->start)
> -                       new = &((*new)->rb_left);
> -               else if (data->start >= this->start + this->len)
> -                       new = &((*new)->rb_right);
> -               else {
> -                       /* Return existing zone */
> -                       return this;
> +               for (ca = 0; ca < zi->wps_count; ca++) {
> +                       if (zi->wps[ca]) {
> +                               vfree(zi->wps[ca]);
> +                               zi->wps[ca] = NULL;
> +                       }
>                 }
> +               kfree(zi->wps);
>         }
> -       /* Add new node and rebalance tree. */
> -       rb_link_node(&data->node, parent, new);
> -       rb_insert_color(&data->node, root);
> -
> -       return NULL;
>  }
> -EXPORT_SYMBOL_GPL(blk_insert_zone);
>
> +/**
> + * blk_drop_zones() - Free zones
> + * @q: Request Queue
> + */
>  void blk_drop_zones(struct request_queue *q)
>  {
> -       struct rb_root *root = &q->zones;
> -       struct blk_zone *zone, *next;
> -
> -       rbtree_postorder_for_each_entry_safe(zone, next, root, node) {
> -               kfree(zone);
> +       if (q->zones) {
> +               free_zone_wps(q->zones);
> +               kfree(q->zones);
> +               q->zones = NULL;
>         }
> -       q->zones = RB_ROOT;
>  }
>  EXPORT_SYMBOL_GPL(blk_drop_zones);
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index f144df4..0f749f5 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -2549,8 +2549,9 @@ got_data:
>                                       sdkp->physical_block_size);
>         sdkp->device->sector_size = sector_size;
>
> -       if (sd_zbc_config(sdkp, buffer, SD_BUF_SIZE))
> -               sd_config_discard(sdkp, SD_ZBC_RESET_WP);
> +       if (sdkp->first_scan)
> +               if (sd_zbc_config(sdkp, GFP_KERNEL))
> +                       sd_config_discard(sdkp, SD_ZBC_RESET_WP);
>
>         {
>                 char cap_str_2[10], cap_str_10[10];
> diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
> index fc766db..c9c79e9 100644
> --- a/drivers/scsi/sd.h
> +++ b/drivers/scsi/sd.h
> @@ -299,13 +299,13 @@ extern void sd_zbc_uninit_command(struct scsi_cmnd *cmd);
>  extern void sd_zbc_remove(struct scsi_disk *);
>  extern void sd_zbc_reset_zones(struct scsi_disk *);
>  extern void sd_zbc_update_zones(struct scsi_disk *, sector_t, int, int reason);
> -extern bool sd_zbc_config(struct scsi_disk *, void *, size_t);
> +extern bool sd_zbc_config(struct scsi_disk *, gfp_t);
>
>  extern unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp);
>
>  #else /* CONFIG_SCSI_ZBC */
>
> -static inline bool sd_zbc_config(struct scsi_disk *sdkp, void *b, size_t sz)
> +static inline bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp)
>  {
>         return false;
>  }
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 960af93..c087035 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -22,6 +22,7 @@
>
>  #include <linux/blkdev.h>
>  #include <linux/rbtree.h>
> +#include <linux/vmalloc.h>
>
>  #include <asm/unaligned.h>
>
> @@ -51,11 +52,11 @@
>         } while( 0 )
>
>  struct zbc_update_work {
> -       struct work_struct zone_work;
> -       struct scsi_disk *sdkp;
> -       sector_t        zone_sector;
> -       int             zone_buflen;
> -       char            zone_buf[0];
> +       struct work_struct      zone_work;
> +       struct scsi_disk        *sdkp;
> +       sector_t                zone_sector;
> +       int                     zone_buflen;
> +       struct bdev_zone_report zone_buf[0];
>  };
>
>  /**
> @@ -95,102 +96,19 @@ static inline sector_t get_start_from_desc(struct scsi_disk *sdkp,
>         return logical_to_sectors(sdkp->device, be64_to_cpu(bzde->lba_start));
>  }
>
> -static
> -struct blk_zone *zbc_desc_to_zone(struct scsi_disk *sdkp, unsigned char *rec)
> +static void _fill_zone(struct blk_zone *zone, struct scsi_disk *sdkp,
> +                      struct bdev_zone_descriptor *bzde)
>  {
> -       struct blk_zone *zone;
> -       sector_t wp = (sector_t)-1;
> -
> -       zone = kzalloc(sizeof(struct blk_zone), GFP_KERNEL);
> -       if (!zone)
> -               return NULL;
> -
> -       spin_lock_init(&zone->lock);
> -       zone->type = rec[0] & 0xf;
> -       zone->state = (rec[1] >> 4) & 0xf;
> -       zone->len = logical_to_sectors(sdkp->device,
> -                                      get_unaligned_be64(&rec[8]));
> -       zone->start = logical_to_sectors(sdkp->device,
> -                                        get_unaligned_be64(&rec[16]));
> -
> -       if (blk_zone_is_smr(zone))
> -               wp = logical_to_sectors(sdkp->device,
> -                                       get_unaligned_be64(&rec[24]));
> -       zone->wp = wp;
> -       /*
> -        * Fixup block zone state
> -        */
> -       if (zone->state == BLK_ZONE_EMPTY &&
> -           zone->wp != zone->start) {
> -               sd_zbc_debug(sdkp,
> -                            "zone %zu state EMPTY wp %zu: adjust wp\n",
> -                            zone->start, zone->wp);
> -               zone->wp = zone->start;
> -       }
> -       if (zone->state == BLK_ZONE_FULL &&
> -           zone->wp != zone->start + zone->len) {
> -               sd_zbc_debug(sdkp,
> -                            "zone %zu state FULL wp %zu: adjust wp\n",
> -                            zone->start, zone->wp);
> -               zone->wp = zone->start + zone->len;
> -       }
> -
> -       return zone;
> +       zone->type = bzde->type & 0x0f;
> +       zone->state = (bzde->flags >> 4) & 0x0f;
> +       zone->wp = get_wp_from_desc(sdkp, bzde);
>  }
>
> -static
> -sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf,
> -                        unsigned int buf_len)
> -{
> -       struct request_queue *q = sdkp->disk->queue;
> -       unsigned char *rec = buf;
> -       int rec_no = 0;
> -       unsigned int list_length;
> -       sector_t next_sector = -1;
> -       u8 same;
> -
> -       /* Parse REPORT ZONES header */
> -       list_length = get_unaligned_be32(&buf[0]);
> -       same = buf[4] & 0xf;
> -       rec = buf + 64;
> -       list_length += 64;
> -
> -       if (list_length < buf_len)
> -               buf_len = list_length;
> -
> -       while (rec < buf + buf_len) {
> -               struct blk_zone *this, *old;
> -               unsigned long flags;
>
> -               this = zbc_desc_to_zone(sdkp, rec);
> -               if (!this)
> -                       break;
> -
> -               if (same == 0 && this->len != zlen) {
> -                       next_sector = this->start + this->len;
> -                       break;
> -               }
> -
> -               next_sector = this->start + this->len;
> -               old = blk_insert_zone(q, this);
> -               if (old) {
> -                       spin_lock_irqsave(&old->lock, flags);
> -                       if (blk_zone_is_smr(old)) {
> -                               old->wp = this->wp;
> -                               old->state = this->state;
> -                       }
> -                       spin_unlock_irqrestore(&old->lock, flags);
> -                       kfree(this);
> -               }
> -               rec += 64;
> -               rec_no++;
> -       }
> -
> -       sd_zbc_debug(sdkp,
> -                    "Inserted %d zones, next sector %zu len %d\n",
> -                    rec_no, next_sector, list_length);
> -
> -       return next_sector;
> +static void fill_zone(struct contiguous_wps *cwps, int z_count,
> +                     struct scsi_disk *sdkp, struct bdev_zone_descriptor *bzde)
> +{
> +       _fill_zone(&cwps->zones[z_count], sdkp, bzde);
>  }
>
>  /**
> @@ -200,12 +118,10 @@ sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf,
>   * @bufflen: length of @buffer
>   * @start_sector: logical sector for the zone information should be reported
>   * @option: reporting option to be used
> - * @partial: flag to set the 'partial' bit for report zones command
>   */
> -static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer,
> -                              int bufflen, sector_t start_sector,
> -                              enum zbc_zone_reporting_options option,
> -                              bool partial)
> +static int sd_zbc_report_zones(struct scsi_disk *sdkp,
> +                              struct bdev_zone_report *buffer,
> +                              int bufflen, sector_t start_sector, u8 option)
>  {
>         struct scsi_device *sdp = sdkp->device;
>         const int timeout = sdp->request_queue->rq_timeout
> @@ -225,7 +141,7 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer,
>         cmd[1] = ZI_REPORT_ZONES;
>         put_unaligned_be64(start_lba, &cmd[2]);
>         put_unaligned_be32(bufflen, &cmd[10]);
> -       cmd[14] = (partial ? ZBC_REPORT_ZONE_PARTIAL : 0) | option;
> +       cmd[14] = option;
>         memset(buffer, 0, bufflen);
>
>         result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
> @@ -248,49 +164,38 @@ static void sd_zbc_refresh_zone_work(struct work_struct *work)
>                 container_of(work, struct zbc_update_work, zone_work);
>         struct scsi_disk *sdkp = zbc_work->sdkp;
>         struct request_queue *q = sdkp->disk->queue;
> -       unsigned char *zone_buf = zbc_work->zone_buf;
> +       struct bdev_zone_report *rpt = zbc_work->zone_buf;
>         unsigned int zone_buflen = zbc_work->zone_buflen;
> +       struct bdev_zone_descriptor *bzde;
> +       int iter;
> +       int offmax;
> +       sector_t z_at, z_start, z_len;
> +       spinlock_t *lock;
> +       struct blk_zone *zone;
>         int ret;
> -       u8 same;
> -       u64 zlen = 0;
>         sector_t last_sector;
>         sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
>
> -       ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen,
> +       ret = sd_zbc_report_zones(sdkp, rpt, zone_buflen,
>                                   zbc_work->zone_sector,
> -                                 ZBC_ZONE_REPORTING_OPTION_ALL, true);
> +                                 ZBC_ZONE_REPORTING_OPTION_ALL);
>         if (ret)
>                 goto done_free;
>
> -       /* this whole path is unlikely so extra reports shouldn't be a
> -        * large impact */
> -       same = zone_buf[4] & 0xf;
> -       if (same == 0) {
> -               unsigned char *desc = &zone_buf[64];
> -               unsigned int blen = zone_buflen;
> -
> -               /* just pull the first zone */
> -               if (blen > 512)
> -                       blen = 512;
> -               ret = sd_zbc_report_zones(sdkp, zone_buf, blen, 0,
> -                                         ZBC_ZONE_REPORTING_OPTION_ALL, true);
> -               if (ret)
> -                       goto done_free;
> -
> -               /* Read the zone length from the first zone descriptor */
> -               zlen = logical_to_sectors(sdkp->device,
> -                                         get_unaligned_be64(&desc[8]));
> -
> -               ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen,
> -                                         zbc_work->zone_sector,
> -                                         ZBC_ZONE_REPORTING_OPTION_ALL, true);
> -               if (ret)
> -                       goto done_free;
> +       offmax = max_report_entries(zone_buflen);
> +       for (iter = 0; iter < offmax; iter++) {
> +               bzde = &rpt->descriptors[iter];
> +               z_at = get_start_from_desc(sdkp, bzde);
> +               if (!z_at)
> +                       break;
> +               zone = blk_lookup_zone(q, z_at, &z_start, &z_len, &lock);
> +               if (zone) {
> +                       _fill_zone(zone, sdkp, bzde);
> +                       last_sector = z_start + z_len;
> +               }
>         }
>
> -       last_sector = zbc_parse_zones(sdkp, zlen, zone_buf, zone_buflen);
> -       capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
> -       if (last_sector != -1 && last_sector < capacity) {
> +       if (sdkp->zone_work_q && last_sector != -1 && last_sector < capacity) {
>                 if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
>                         sd_zbc_debug(sdkp,
>                                      "zones in reset, canceling refresh\n");
> @@ -333,10 +238,7 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize,
>  {
>         struct request_queue *q = sdkp->disk->queue;
>         struct zbc_update_work *zbc_work;
> -       struct blk_zone *zone;
> -       struct rb_node *node;
> -       int zone_num = 0, zone_busy = 0, num_rec;
> -       sector_t next_sector = sector;
> +       int num_rec;
>
>         if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
>                 sd_zbc_debug(sdkp,
> @@ -346,18 +248,23 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize,
>
>         if (reason != SD_ZBC_INIT) {
>                 /* lookup sector, is zone pref? then ignore */
> -               struct blk_zone *zone = blk_lookup_zone(q, sector);
> -
> +               sector_t z_start, z_len;
> +               spinlock_t *lck;
> +               struct blk_zone *zone = blk_lookup_zone(q, sector, &z_start,
> +                                                       &z_len, &lck);
> +               /* zone actions on conventional zones are invalid */
> +               if (zone && reason == SD_ZBC_RESET_WP && blk_zone_is_cmr(zone))
> +                       return;
>                 if (reason == SD_ZBC_RESET_WP)
>                         sd_zbc_debug(sdkp, "RESET WP failed %lx\n", sector);
> -
> -               if (zone && blk_zone_is_seq_pref(zone))
> -                       return;
>         }
>
> +       if (!sdkp->zone_work_q)
> +               return;
> +
>  retry:
>         zbc_work = kzalloc(sizeof(struct zbc_update_work) + bufsize,
> -                          reason != SD_ZBC_INIT ? GFP_NOWAIT : GFP_KERNEL);
> +                          reason != SD_ZBC_INIT ? GFP_ATOMIC : GFP_KERNEL);
>         if (!zbc_work) {
>                 if (bufsize > 512) {
>                         sd_zbc_debug(sdkp,
> @@ -381,30 +288,40 @@ retry:
>          * Mark zones under update as BUSY
>          */
>         if (reason != SD_ZBC_INIT) {
> -               for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> -                       unsigned long flags;
> -
> -                       zone = rb_entry(node, struct blk_zone, node);
> -                       if (num_rec == 0)
> +               unsigned long flags;
> +               int iter;
> +               struct zone_wps *zi = q->zones;
> +               struct contiguous_wps *wp = NULL;
> +               u64 index = -1;
> +               int zone_busy = 0;
> +               int z_flgd = 0;
> +
> +               for (iter = 0; iter < zi->wps_count; iter++) {
> +                       if (sector >= zi->wps[iter]->start_lba &&
> +                           sector <  zi->wps[iter]->last_lba) {
> +                               wp = zi->wps[iter];
>                                 break;
> -                       if (zone->start != next_sector)
> -                               continue;
> -                       next_sector += zone->len;
> -                       num_rec--;
> -
> -                       spin_lock_irqsave(&zone->lock, flags);
> -                       if (blk_zone_is_smr(zone)) {
> -                               if (zone->state == BLK_ZONE_BUSY) {
> +                       }
> +               }
> +               if (wp) {
> +                       spin_lock_irqsave(&wp->lock, flags);
> +                       index = (sector - wp->start_lba) / wp->zone_size;
> +                       while (index < wp->zone_count && z_flgd < num_rec) {
> +                               struct blk_zone *bzone = &wp->zones[index];
> +
> +                               index++;
> +                               z_flgd++;
> +                               if (!blk_zone_is_smr(bzone))
> +                                       continue;
> +
> +                               if (bzone->state == BLK_ZONE_BUSY)
>                                         zone_busy++;
> -                               } else {
> -                                       zone->state = BLK_ZONE_BUSY;
> -                                       zone->wp = zone->start;
> -                               }
> -                               zone_num++;
> +                               else
> +                                       bzone->state = BLK_ZONE_BUSY;
>                         }
> -                       spin_unlock_irqrestore(&zone->lock, flags);
> +                       spin_unlock_irqrestore(&wp->lock, flags);
>                 }
> -               if (zone_num && (zone_num == zone_busy)) {
> +               if (z_flgd && (z_flgd == zone_busy)) {
>                         sd_zbc_debug(sdkp,
>                                      "zone update for %zu in progress\n",
>                                      sector);
> @@ -476,43 +393,26 @@ static void discard_or_write_same(struct scsi_cmnd *cmd, sector_t sector,
>  int sd_zbc_setup_discard(struct scsi_cmnd *cmd)
>  {
>         struct request *rq = cmd->request;
> -       struct scsi_device *sdp = cmd->device;
>         struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
>         sector_t sector = blk_rq_pos(rq);
>         unsigned int nr_sectors = blk_rq_sectors(rq);
>         int ret = BLKPREP_OK;
>         struct blk_zone *zone;
>         unsigned long flags;
> -       u32 wp_offset;
>         bool use_write_same = false;
> +       sector_t z_start, z_len;
> +       spinlock_t *lck;
>
> -       zone = blk_lookup_zone(rq->q, sector);
> -       if (!zone) {
> -               /* Test for a runt zone before giving up */
> -               if (sdp->type != TYPE_ZBC) {
> -                       struct request_queue *q = rq->q;
> -                       struct rb_node *node;
> -
> -                       node = rb_last(&q->zones);
> -                       if (node)
> -                               zone = rb_entry(node, struct blk_zone, node);
> -                       if (zone) {
> -                               spin_lock_irqsave(&zone->lock, flags);
> -                               if ((zone->start + zone->len) <= sector)
> -                                       goto out;
> -                               spin_unlock_irqrestore(&zone->lock, flags);
> -                               zone = NULL;
> -                       }
> -               }
> +       zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck);
> +       if (!zone)
>                 return BLKPREP_KILL;
> -       }
>
> -       spin_lock_irqsave(&zone->lock, flags);
> +       spin_lock_irqsave(lck, flags);
>         if (zone->state == BLK_ZONE_UNKNOWN ||
>             zone->state == BLK_ZONE_BUSY) {
>                 sd_zbc_debug_ratelimit(sdkp,
>                                        "Discarding zone %zx state %x, deferring\n",
> -                                      zone->start, zone->state);
> +                                      z_start, zone->state);
>                 ret = BLKPREP_DEFER;
>                 goto out;
>         }
> @@ -520,39 +420,37 @@ int sd_zbc_setup_discard(struct scsi_cmnd *cmd)
>                 /* let the drive fail the command */
>                 sd_zbc_debug_ratelimit(sdkp,
>                                        "Discarding offline zone %zx\n",
> -                                      zone->start);
> +                                      z_start);
>                 goto out;
>         }
>         if (blk_zone_is_cmr(zone)) {
>                 use_write_same = true;
>                 sd_zbc_debug_ratelimit(sdkp,
> -                                      "Discarding CMR zone %zx\n",
> -                                      zone->start);
> +                                      "Discarding CMR zone %zx\n", z_start);
>                 goto out;
>         }
> -       if (zone->start != sector || zone->len < nr_sectors) {
> +       if (z_start != sector || z_len < nr_sectors) {
>                 sd_printk(KERN_ERR, sdkp,
>                           "Misaligned RESET WP %zx/%x on zone %zx/%zx\n",
> -                         sector, nr_sectors, zone->start, zone->len);
> +                         sector, nr_sectors, z_start, z_len);
>                 ret = BLKPREP_KILL;
>                 goto out;
>         }
>         /* Protect against Reset WP when more data had been written to the
>          * zone than is being discarded.
>          */
> -       wp_offset = zone->wp - zone->start;
> -       if (wp_offset > nr_sectors) {
> +       if (zone->wp > nr_sectors) {
>                 sd_printk(KERN_ERR, sdkp,
> -                         "Will Corrupt RESET WP %zx/%x/%x on zone %zx/%zx/%zx\n",
> -                         sector, wp_offset, nr_sectors,
> -                         zone->start, zone->wp, zone->len);
> +                         "Will Corrupt RESET WP %zx/%zx/%x on zone %zx/%zx/%zx\n",
> +                         sector, (sector_t)zone->wp, nr_sectors,
> +                         z_start, z_start + zone->wp, z_len);
>                 ret = BLKPREP_KILL;
>                 goto out;
>         }
>         if (blk_zone_is_empty(zone)) {
>                 sd_zbc_debug_ratelimit(sdkp,
>                                        "Discarding empty zone %zx [WP: %zx]\n",
> -                                      zone->start, zone->wp);
> +                                      z_start, (sector_t)zone->wp);
>                 ret = BLKPREP_DONE;
>                 goto out;
>         }
> @@ -563,8 +461,8 @@ out:
>          * zone update if RESET WRITE POINTER fails.
>          */
>         if (ret == BLKPREP_OK && !use_write_same)
> -               zone->wp = zone->start;
> -       spin_unlock_irqrestore(&zone->lock, flags);
> +               zone->wp = 0;
> +       spin_unlock_irqrestore(lck, flags);
>
>         if (ret == BLKPREP_OK)
>                 discard_or_write_same(cmd, sector, nr_sectors, use_write_same);
> @@ -573,13 +471,14 @@ out:
>  }
>
>
> -static void __set_zone_state(struct blk_zone *zone, int op)
> +static void __set_zone_state(struct blk_zone *zone, sector_t z_len,
> +                            spinlock_t *lck, int op)
>  {
>         unsigned long flags;
>
> -       spin_lock_irqsave(&zone->lock, flags);
> -       if (blk_zone_is_cmr(zone))
> -               goto out_unlock;
> +       spin_lock_irqsave(lck, flags);
> +       if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> +               goto out;
>
>         switch (op) {
>         case REQ_OP_ZONE_OPEN:
> @@ -587,38 +486,45 @@ static void __set_zone_state(struct blk_zone *zone, int op)
>                 break;
>         case REQ_OP_ZONE_FINISH:
>                 zone->state = BLK_ZONE_FULL;
> -               zone->wp = zone->start + zone->len;
> +               zone->wp = z_len;
>                 break;
>         case REQ_OP_ZONE_CLOSE:
>                 zone->state = BLK_ZONE_CLOSED;
>                 break;
>         case REQ_OP_ZONE_RESET:
> -               zone->wp = zone->start;
> +               zone->wp = 0;
>                 break;
>         default:
>                 WARN_ONCE(1, "%s: invalid op code: %u\n", __func__, op);
>         }
> -out_unlock:
> -       spin_unlock_irqrestore(&zone->lock, flags);
> +out:
> +       spin_unlock_irqrestore(lck, flags);
>  }
>
>  static void update_zone_state(struct request *rq, sector_t lba, unsigned int op)
>  {
> -       struct request_queue *q = rq->q;
> -       struct blk_zone *zone = NULL;
> +       struct blk_zone *zone;
>
>         if (lba == ~0ul) {
> -               struct rb_node *node;
> -
> -               for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> -                       zone = rb_entry(node, struct blk_zone, node);
> -                       __set_zone_state(zone, op);
> +               struct zone_wps *zi = rq->q->zones;
> +               struct contiguous_wps *wp;
> +               u32 iter, entry;
> +
> +               for (iter = 0; iter < zi->wps_count; iter++) {
> +                       wp = zi->wps[iter];
> +                       for (entry = 0; entry < wp->zone_count; entry++) {
> +                               zone = &wp->zones[entry];
> +                               __set_zone_state(zone, wp->zone_size, &wp->lock,
> +                                                op);
> +                       }
>                 }
> -               return;
>         } else {
> -               zone = blk_lookup_zone(q, lba);
> +               sector_t z_start, z_len;
> +               spinlock_t *lck;
> +
> +               zone = blk_lookup_zone(rq->q, lba, &z_start, &z_len, &lck);
>                 if (zone)
> -                       __set_zone_state(zone, op);
> +                       __set_zone_state(zone, z_len, lck, op);
>         }
>  }
>
> @@ -641,6 +547,8 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd)
>         struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
>         sector_t sector = blk_rq_pos(rq);
>         struct blk_zone *zone;
> +       spinlock_t *lck;
> +       sector_t z_start, z_len;
>         unsigned long flags;
>         unsigned int nr_sectors;
>         int ret = BLKPREP_DONE;
> @@ -651,17 +559,17 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd)
>         if (is_fua || op != REQ_OP_ZONE_RESET)
>                 goto out;
>
> -       zone = blk_lookup_zone(rq->q, sector);
> +       zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck);
>         if (!zone || sdkp->provisioning_mode != SD_ZBC_RESET_WP)
>                 goto out;
>
>         /* Map a Reset WP w/o FUA to a discard request */
> -       spin_lock_irqsave(&zone->lock, flags);
> -       sector = zone->start;
> -       nr_sectors = zone->len;
> +       spin_lock_irqsave(lck, flags);
> +       sector = z_start;
> +       nr_sectors = z_len;
>         if (blk_zone_is_cmr(zone))
>                 use_write_same = true;
> -       spin_unlock_irqrestore(&zone->lock, flags);
> +       spin_unlock_irqrestore(lck, flags);
>
>         rq->completion_data = NULL;
>         if (use_write_same) {
> @@ -712,137 +620,157 @@ static sector_t bzrpt_fill(struct request *rq,
>                            struct bdev_zone_descriptor *bzd,
>                            size_t sz, sector_t lba, u8 opt)
>  {
> -       struct request_queue *q = rq->q;
>         struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
> +       struct scsi_device *sdp = sdkp->device;
> +       struct zone_wps *zi = rq->q->zones;
> +       struct contiguous_wps *wpdscr;
>         struct blk_zone *zone = NULL;
> -       struct rb_node *node = NULL;
>         sector_t progress = lba;
>         sector_t clen = ~0ul;
> +       sector_t z_start, z_len, z_wp_abs;
>         unsigned long flags;
>         u32 max_entries = bzrpt ? max_report_entries(sz) : sz / sizeof(*bzd);
>         u32 entry = 0;
> +       u32 iter, idscr;
>         int len_diffs = 0;
>         int type_diffs = 0;
>         u8 ctype;
>         u8 same = 0;
>
> -       zone = blk_lookup_zone(q, lba);
> -       if (zone)
> -               node = &zone->node;
> -
> -       for (entry = 0; entry < max_entries && node; node = rb_next(node)) {
> -               u64 z_len, z_start, z_wp_abs;
> -               u8 cond = 0;
> -               u8 flgs = 0;
> -
> -               spin_lock_irqsave(&zone->lock, flags);
> -               z_len = zone->len;
> -               z_start = zone->start;
> -               z_wp_abs = zone->wp;
> -               progress = z_start + z_len;
> -               cond = zone->state;
> -               if (blk_zone_is_cmr(zone))
> -                       flgs |= 0x02;
> -               else if (zone->wp != zone->start)
> -                       flgs |= 0x01; /* flag as RWP recommended? */
> -               spin_unlock_irqrestore(&zone->lock, flags);
> -
> -               switch (opt & ZBC_REPORT_OPTION_MASK) {
> -               case ZBC_ZONE_REPORTING_OPTION_EMPTY:
> -                       if (z_wp_abs != z_start)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN:
> -                       if (cond != BLK_ZONE_OPEN)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN:
> -                       if (cond != BLK_ZONE_OPEN_EXPLICIT)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_CLOSED:
> -                       if (cond != BLK_ZONE_CLOSED)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_FULL:
> -                       if (cond != BLK_ZONE_FULL)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_READONLY:
> -                       if (cond == BLK_ZONE_READONLY)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_OFFLINE:
> -                       if (cond == BLK_ZONE_OFFLINE)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP:
> -                       if (z_wp_abs == z_start)
> -                               continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_NON_WP:
> -                       if (cond == BLK_ZONE_NO_WP)
> +       for (iter = 0; entry < max_entries && iter < zi->wps_count; iter++) {
> +               wpdscr = zi->wps[iter];
> +               if (lba > wpdscr->last_lba)
> +                       continue;
> +
> +               spin_lock_irqsave(&wpdscr->lock, flags);
> +               for (idscr = 0;
> +                    entry < max_entries && idscr < wpdscr->zone_count;
> +                    idscr++) {
> +                       struct bdev_zone_descriptor *dscr;
> +                       u64 zoff = idscr * wpdscr->zone_size;
> +                       u8 cond, flgs = 0;
> +
> +                       z_len = wpdscr->zone_size;
> +                       zoff = idscr * z_len;
> +                       z_start = wpdscr->start_lba + zoff;
> +                       if (lba >= z_start + z_len)
>                                 continue;
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE:
> -                       /* this can only be reported by the HW */
> -                       break;
> -               case ZBC_ZONE_REPORTING_OPTION_ALL:
> -               default:
> -                       break;
> -               }
>
> -               /* if same code only applies to returned zones */
> -               if (opt & ZBC_REPORT_ZONE_PARTIAL) {
> -                       if (clen != ~0ul) {
> -                               clen = z_len;
> +                       zone = &wpdscr->zones[idscr];
> +                       if (blk_zone_is_cmr(zone))
> +                               z_wp_abs = z_start + wpdscr->zone_size;
> +                       else
> +                               z_wp_abs = z_start + zone->wp;
> +
> +                       switch (opt & ZBC_REPORT_OPTION_MASK) {
> +                       case ZBC_ZONE_REPORTING_OPTION_EMPTY:
> +                               if (z_wp_abs != z_start)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN:
> +                               if (zone->state != BLK_ZONE_OPEN)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN:
> +                               if (zone->state != BLK_ZONE_OPEN_EXPLICIT)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_CLOSED:
> +                               if (zone->state != BLK_ZONE_CLOSED)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_FULL:
> +                               if (zone->state != BLK_ZONE_FULL)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_READONLY:
> +                               if (zone->state == BLK_ZONE_READONLY)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_OFFLINE:
> +                               if (zone->state == BLK_ZONE_OFFLINE)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP:
> +                               if (z_wp_abs == z_start)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_NON_WP:
> +                               if (zone->state == BLK_ZONE_NO_WP)
> +                                       continue;
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE:
> +                               /* this can only be reported by the HW */
> +                               break;
> +                       case ZBC_ZONE_REPORTING_OPTION_ALL:
> +                       default:
> +                               break;
> +                       }
> +
> +                       /* if same code only applies to returned zones */
> +                       if (opt & ZBC_REPORT_ZONE_PARTIAL) {
> +                               if (clen != ~0ul) {
> +                                       clen = z_len;
> +                                       ctype = zone->type;
> +                               }
> +                               if (z_len != clen)
> +                                       len_diffs++;
> +                               if (zone->type != ctype)
> +                                       type_diffs++;
>                                 ctype = zone->type;
>                         }
> -                       if (z_len != clen)
> -                               len_diffs++;
> -                       if (zone->type != ctype)
> -                               type_diffs++;
> -                       ctype = zone->type;
> -               }
> +                       progress = z_start + z_len;
>
> -               /* shift to device units */
> -               z_start >>= ilog2(sdkp->device->sector_size) - 9;
> -               z_len >>= ilog2(sdkp->device->sector_size) - 9;
> -               z_wp_abs >>= ilog2(sdkp->device->sector_size) - 9;
> +                       if (!bzd) {
> +                               if (bzrpt)
> +                                       bzrpt->descriptor_count =
> +                                               cpu_to_be32(++entry);
> +                               continue;
> +                       }
>
> -               if (!bzd) {
> +                       /* shift to device units */
> +                       z_start >>= ilog2(sdp->sector_size) - 9;
> +                       z_len >>= ilog2(sdp->sector_size) - 9;
> +                       z_wp_abs >>= ilog2(sdp->sector_size) - 9;
> +
> +                       cond = zone->state;
> +                       if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> +                               flgs |= 0x02;
> +                       else if (zone->wp)
> +                               flgs |= 0x01; /* flag as RWP recommended? */
> +
> +                       dscr = &bzd[entry];
> +                       dscr->lba_start = cpu_to_be64(z_start);
> +                       dscr->length = cpu_to_be64(z_len);
> +                       dscr->lba_wptr = cpu_to_be64(z_wp_abs);
> +                       dscr->type = zone->type;
> +                       dscr->flags = cond << 4 | flgs;
> +                       entry++;
>                         if (bzrpt)
> -                               bzrpt->descriptor_count =
> -                                       cpu_to_be32(++entry);
> -                       continue;
> +                               bzrpt->descriptor_count = cpu_to_be32(entry);
>                 }
> -
> -               bzd[entry].lba_start = cpu_to_be64(z_start);
> -               bzd[entry].length = cpu_to_be64(z_len);
> -               bzd[entry].lba_wptr = cpu_to_be64(z_wp_abs);
> -               bzd[entry].type = zone->type;
> -               bzd[entry].flags = cond << 4 | flgs;
> -               entry++;
> -               if (bzrpt)
> -                       bzrpt->descriptor_count = cpu_to_be32(entry);
> +               spin_unlock_irqrestore(&wpdscr->lock, flags);
>         }
>
>         /* if same code applies to all zones */
>         if (bzrpt && !(opt & ZBC_REPORT_ZONE_PARTIAL)) {
> -               for (node = rb_first(&q->zones); node; node = rb_next(node)) {
> -                       zone = rb_entry(node, struct blk_zone, node);
> -
> -                       spin_lock_irqsave(&zone->lock, flags);
> -                       if (clen != ~0ul) {
> -                               clen = zone->len;
> +               for (iter = 0; iter < zi->wps_count; iter++) {
> +                       wpdscr = zi->wps[iter];
> +                       spin_lock_irqsave(&wpdscr->lock, flags);
> +                       for (idscr = 0; idscr < wpdscr->zone_count; idscr++) {
> +                               z_len = wpdscr->zone_size;
> +                               zone = &wpdscr->zones[idscr];
> +                               if (clen != ~0ul) {
> +                                       clen = z_len;
> +                                       ctype = zone->type;
> +                               }
> +                               if (z_len != clen)
> +                                       len_diffs++;
> +                               if (zone->type != ctype)
> +                                       type_diffs++;
>                                 ctype = zone->type;
>                         }
> -                       if (zone->len != clen)
> -                               len_diffs++;
> -                       if (zone->type != ctype)
> -                               type_diffs++;
> -                       ctype = zone->type;
> -                       spin_unlock_irqrestore(&zone->lock, flags);
> +                       spin_unlock_irqrestore(&wpdscr->lock, flags);
>                 }
>         }
>
> @@ -985,12 +913,15 @@ out:
>  int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
>                             sector_t sector, unsigned int *num_sectors)
>  {
> +       struct request_queue *q = sdkp->disk->queue;
>         struct blk_zone *zone;
> +       sector_t z_start, z_len;
> +       spinlock_t *lck;
>         unsigned int sectors = *num_sectors;
>         int ret = BLKPREP_OK;
>         unsigned long flags;
>
> -       zone = blk_lookup_zone(sdkp->disk->queue, sector);
> +       zone = blk_lookup_zone(q, sector, &z_start, &z_len, &lck);
>         if (!zone) {
>                 /* Might happen during zone initialization */
>                 sd_zbc_debug_ratelimit(sdkp,
> @@ -999,7 +930,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
>                 return BLKPREP_OK;
>         }
>
> -       spin_lock_irqsave(&zone->lock, flags);
> +       spin_lock_irqsave(lck, flags);
>
>         if (blk_zone_is_cmr(zone))
>                 goto out;
> @@ -1008,7 +939,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
>             zone->state == BLK_ZONE_BUSY) {
>                 sd_zbc_debug_ratelimit(sdkp,
>                                        "zone %zu state %x, deferring\n",
> -                                      zone->start, zone->state);
> +                                      z_start, zone->state);
>                 ret = BLKPREP_DEFER;
>                 goto out;
>         }
> @@ -1017,25 +948,22 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
>                 if (op_is_write(req_op(rq))) {
>                         u64 nwp = sector + sectors;
>
> -                       while (nwp > (zone->start + zone->len)) {
> -                               struct rb_node *node = rb_next(&zone->node);
> +                       while (nwp > (z_start + z_len)) {
> +                               zone->wp = z_len;
> +                               sector = z_start + z_len;
> +                               sectors = nwp - sector;
> +                               spin_unlock_irqrestore(lck, flags);
>
> -                               zone->wp = zone->start + zone->len;
> -                               sector = zone->wp;
> -                               sectors = nwp - zone->wp;
> -                               spin_unlock_irqrestore(&zone->lock, flags);
> -
> -                               if (!node)
> -                                       return BLKPREP_OK;
> -                               zone = rb_entry(node, struct blk_zone, node);
> +                               zone = blk_lookup_zone(q, sector,
> +                                                      &z_start, &z_len, &lck);
>                                 if (!zone)
>                                         return BLKPREP_OK;
>
> -                               spin_lock_irqsave(&zone->lock, flags);
> +                               spin_lock_irqsave(lck, flags);
>                                 nwp = sector + sectors;
>                         }
> -                       if (nwp > zone->wp)
> -                               zone->wp = nwp;
> +                       if (nwp > z_start + zone->wp)
> +                               zone->wp = nwp - z_start;
>                 }
>                 goto out;
>         }
> @@ -1044,37 +972,37 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
>                 /* let the drive fail the command */
>                 sd_zbc_debug_ratelimit(sdkp,
>                                        "zone %zu offline\n",
> -                                      zone->start);
> +                                      z_start);
>                 goto out;
>         }
>
>         if (op_is_write(req_op(rq))) {
>                 if (zone->state == BLK_ZONE_READONLY)
>                         goto out;
> -               if (blk_zone_is_full(zone)) {
> +               if (zone->wp == z_len) {
>                         sd_zbc_debug(sdkp,
> -                                    "Write to full zone %zu/%zu\n",
> -                                    sector, zone->wp);
> +                                    "Write to full zone %zu/%zu/%zu\n",
> +                                    sector, (sector_t)zone->wp, z_len);
>                         ret = BLKPREP_KILL;
>                         goto out;
>                 }
> -               if (zone->wp != sector) {
> +               if (sector != (z_start + zone->wp)) {
>                         sd_zbc_debug(sdkp,
>                                      "Misaligned write %zu/%zu\n",
> -                                    sector, zone->wp);
> +                                    sector, z_start + zone->wp);
>                         ret = BLKPREP_KILL;
>                         goto out;
>                 }
>                 zone->wp += sectors;
> -       } else if (zone->wp <= sector + sectors) {
> -               if (zone->wp <= sector) {
> +       } else if (z_start + zone->wp <= sector + sectors) {
> +               if (z_start + zone->wp <= sector) {
>                         /* Read beyond WP: clear request buffer */
>                         struct req_iterator iter;
>                         struct bio_vec bvec;
>                         void *buf;
>                         sd_zbc_debug(sdkp,
>                                      "Read beyond wp %zu+%u/%zu\n",
> -                                    sector, sectors, zone->wp);
> +                                    sector, sectors, z_start + zone->wp);
>                         rq_for_each_segment(bvec, rq, iter) {
>                                 buf = bvec_kmap_irq(&bvec, &flags);
>                                 memset(buf, 0, bvec.bv_len);
> @@ -1085,15 +1013,15 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
>                         goto out;
>                 }
>                 /* Read straddle WP position: limit request size */
> -               *num_sectors = zone->wp - sector;
> +               *num_sectors = z_start + zone->wp - sector;
>                 sd_zbc_debug(sdkp,
>                              "Read straddle wp %zu+%u/%zu => %zu+%u\n",
> -                            sector, sectors, zone->wp,
> +                            sector, sectors, z_start + zone->wp,
>                              sector, *num_sectors);
>         }
>
>  out:
> -       spin_unlock_irqrestore(&zone->lock, flags);
> +       spin_unlock_irqrestore(lck, flags);
>
>         return ret;
>  }
> @@ -1145,21 +1073,22 @@ static void update_zones_from_report(struct scsi_cmnd *cmd, u32 nr_bytes)
>                         struct bdev_zone_descriptor *entry = &bzde[iter];
>                         sector_t s = get_start_from_desc(sdkp, entry);
>                         sector_t z_len = get_len_from_desc(sdkp, entry);
> +                       sector_t z_strt;
> +                       spinlock_t *lck;
>                         unsigned long flags;
>
>                         if (!z_len)
>                                 goto done;
>
> -                       zone = blk_lookup_zone(rq->q, s);
> +                       zone = blk_lookup_zone(rq->q, s, &z_strt, &z_len, &lck);
>                         if (!zone)
>                                 goto done;
>
> -                       spin_lock_irqsave(&zone->lock, flags);
> +                       spin_lock_irqsave(lck, flags);
>                         zone->type = entry->type & 0xF;
>                         zone->state = (entry->flags >> 4) & 0xF;
>                         zone->wp = get_wp_from_desc(sdkp, entry);
> -                       zone->len = z_len;
> -                       spin_unlock_irqrestore(&zone->lock, flags);
> +                       spin_unlock_irqrestore(lck, flags);
>                 }
>                 nread += len;
>                 if (!dmax)
> @@ -1233,113 +1162,314 @@ void sd_zbc_uninit_command(struct scsi_cmnd *cmd)
>  }
>
>  /**
> - * sd_zbc_init - Load zones of matching zlen size into rb tree.
> + * alloc_cpws() - Allocate space for a contiguous set of write pointers
> + * @items: Number of wps needed.
> + * @lba: lba of the start of the next zone.
> + * @z_start: Starting lba of this contiguous set.
> + * @z_size: Size of each zone this contiguous set.
>   *
> + * Return: Allocated wps or NULL on error.
>   */
> -static int sd_zbc_init(struct scsi_disk *sdkp, u64 zlen, char *buf, int buf_len)
> +static struct contiguous_wps *alloc_cpws(int items, u64 lba, u64 z_start,
> +                                        u64 z_size)
>  {
> -       sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
> -       sector_t last_sector;
> +       struct contiguous_wps *cwps = NULL;
> +       size_t sz;
>
> -       if (test_and_set_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags)) {
> -               sdev_printk(KERN_WARNING, sdkp->device,
> -                           "zone initialization already running\n");
> -               return 0;
> +       sz = sizeof(struct contiguous_wps) + (items * sizeof(struct blk_zone));
> +       if (items) {
> +               cwps = vzalloc(sz);
> +               if (!cwps)
> +                       goto out;
> +               spin_lock_init(&cwps->lock);
> +               cwps->start_lba = z_start;
> +               cwps->last_lba = lba - 1;
> +               cwps->zone_size = z_size;
> +               cwps->is_zoned = items > 1 ? 1 : 0;
> +               cwps->zone_count = items;
>         }
>
> -       if (!sdkp->zone_work_q) {
> -               char wq_name[32];
> +out:
> +       return cwps;
> +}
>
> -               sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name);
> -               sdkp->zone_work_q = create_singlethread_workqueue(wq_name);
> -               if (!sdkp->zone_work_q) {
> -                       sdev_printk(KERN_WARNING, sdkp->device,
> -                                   "create zoned disk workqueue failed\n");
> -                       return -ENOMEM;
> +/**
> + * free_zone_wps() - Free up memory in use by wps
> + * @zi: zone wps array(s).
> + */
> +static void free_zone_wps(struct zone_wps *zi)
> +{
> +       /* on error free the arrays */
> +       if (zi && zi->wps) {
> +               int ca;
> +
> +               for (ca = 0; ca < zi->wps_count; ca++) {
> +                       if (zi->wps[ca]) {
> +                               vfree(zi->wps[ca]);
> +                               zi->wps[ca] = NULL;
> +                       }
>                 }
> -       } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
> -               drain_workqueue(sdkp->zone_work_q);
> -               clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags);
> +               kfree(zi->wps);
>         }
> +}
>
> -       last_sector = zbc_parse_zones(sdkp, zlen, buf, buf_len);
> -       capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
> -       if (last_sector != -1 && last_sector < capacity) {
> -               sd_zbc_update_zones(sdkp, last_sector,
> -                                   SD_ZBC_BUF_SIZE, SD_ZBC_INIT);
> -       } else
> -               clear_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags);
> +static int wps_realloc(struct zone_wps *zi, gfp_t gfp_mask)
> +{
> +       int rcode = 0;
> +       struct contiguous_wps **old;
> +       struct contiguous_wps **tmp;
> +       int n = zi->wps_count * 2;
> +
> +       old = zi->wps;
> +       tmp = kzalloc(n, sizeof(*zi->wps), gfp_mask);

Apologies, should be kcalloc() here.

> +       if (!tmp) {
> +               rcode = -ENOMEM;
> +               goto out;
> +       }
> +       memcpy(tmp, zi->wps, zi->wps_count * sizeof(*zi->wps));
> +       zi->wps = tmp;
> +       kfree(old);
>
> -       return 0;
> +out:
> +       return rcode;
>  }
>
> +#define FMT_CHANGING_CAPACITY "Changing capacity from %zu to Max LBA+1 %zu"
> +
>  /**
> - * sd_zbc_config() - Configure a ZBC device (on attach)
> - * @sdkp: SCSI disk being attached.
> - * @buffer: Buffer to working data.
> - * @buf_sz: Size of buffer to use for working data
> + * zbc_init_zones() - Re-Sync expected WP location with drive
> + * @sdkp: scsi_disk
> + * @gfp_mask: Allocation mask.
>   *
> - * Return: true of SD_ZBC_RESET_WP provisioning is supported
> + * Return: 0 on success, otherwise error.
>   */
> -bool sd_zbc_config(struct scsi_disk *sdkp, void *buffer, size_t buf_sz)
> +int zbc_init_zones(struct scsi_disk *sdkp, gfp_t gfp_mask)
>  {
> -       struct bdev_zone_report *bzrpt = buffer;
> -       u64 zone_len, lba;
> -       int retval;
> -       u32 rep_len;
> -       u8 same;
> +       struct request_queue *q = sdkp->disk->queue;
> +       int rcode = 0;
> +       int entry = 0;
> +       int offset;
> +       int offmax;
> +       u64 iter;
> +       u64 z_start = 0ul;
> +       u64 z_size = 0; /* size of zone */
> +       int z_count = 0; /* number of zones of z_size */
> +       int do_fill = 0;
> +       int array_count = 0;
> +       int one_time_setup = 0;
> +       u8 opt = ZBC_ZONE_REPORTING_OPTION_ALL;
> +       size_t bufsz = SD_ZBC_BUF_SIZE;
> +       struct bdev_zone_report *rpt = NULL;
> +       struct zone_wps *zi = NULL;
> +       struct contiguous_wps *cwps = NULL;
> +
> +       if (q->zones)
> +               goto out;
>
> -       if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC)
> -               /*
> -                * Device managed or normal SCSI disk,
> -                * no special handling required
> -                */
> -               return false;
> -
> -       retval = sd_zbc_report_zones(sdkp, bzrpt, buf_sz,
> -                                    0, ZBC_ZONE_REPORTING_OPTION_ALL, false);
> -       if (retval < 0)
> -               return false;
> -
> -       rep_len = be32_to_cpu(bzrpt->descriptor_count);
> -       if (rep_len < 7) {
> -               sd_printk(KERN_WARNING, sdkp,
> -                         "REPORT ZONES report invalid length %u\n",
> -                         rep_len);
> -               return false;
> +       zi = kzalloc(sizeof(*zi), gfp_mask);
> +       if (!zi) {
> +               rcode = -ENOMEM;
> +               goto out;
>         }
>
> -       if (sdkp->rc_basis == 0) {
> -               /* The max_lba field is the capacity of a zoned device */
> -               lba = be64_to_cpu(bzrpt->maximum_lba);
> -               if (lba + 1 > sdkp->capacity) {
> -                       if (sdkp->first_scan)
> -                               sd_printk(KERN_WARNING, sdkp,
> -                                         "Changing capacity from %zu to Max LBA+1 %zu\n",
> -                                         sdkp->capacity, (sector_t) lba + 1);
> -                       sdkp->capacity = lba + 1;
> +       if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC) {
> +               struct gendisk *disk = sdkp->disk;
> +
> +               zi->wps = kzalloc(sizeof(*zi->wps), gfp_mask);
> +               zi->wps[0] = alloc_cpws(1, disk->part0.nr_sects, z_start, 1);
> +               if (!zi->wps[0]) {
> +                       rcode = -ENOMEM;
> +                       goto out;
>                 }
> +               zi->wps_count = 1;
> +               goto out;
> +       }
> +
> +       rpt = kmalloc(bufsz, gfp_mask);
> +       if (!rpt) {
> +               rcode = -ENOMEM;
> +               goto out;
>         }
>
>         /*
> -        * Adjust 'chunk_sectors' to the zone length if the device
> -        * supports equal zone sizes.
> +        * Start by handling up to 32 different zone sizes. 2 will work
> +        * for all the current drives, but maybe something exotic will
> +        * surface.
>          */
> -       same = bzrpt->same_field & 0x0f;
> -       if (same > 3) {
> -               sd_printk(KERN_WARNING, sdkp,
> -                         "REPORT ZONES SAME type %d not supported\n", same);
> -               return false;
> +       zi->wps = kcalloc(32, sizeof(*zi->wps), gfp_mask);
> +       zi->wps_count = 32;
> +       if (!zi->wps) {
> +               rcode = -ENOMEM;
> +               goto out;
>         }
> -       /* Read the zone length from the first zone descriptor */
> -       zone_len = be64_to_cpu(bzrpt->descriptors[0].length);
> -       sdkp->unmap_alignment = zone_len;
> -       sdkp->unmap_granularity = zone_len;
> -       blk_queue_chunk_sectors(sdkp->disk->queue,
> -                               logical_to_sectors(sdkp->device, zone_len));
> -
> -       sd_zbc_init(sdkp, zone_len, buffer, buf_sz);
> -       return true;
> +
> +fill:
> +       offset = 0;
> +       offmax = 0;
> +       for (entry = 0, iter = 0; iter < sdkp->capacity; entry++) {
> +               struct bdev_zone_descriptor *bzde;
> +               int stop_end = 0;
> +               int stop_size = 0;
> +
> +               if (offset == 0) {
> +                       int err;
> +
> +                       err = sd_zbc_report_zones(sdkp, rpt, bufsz, iter, opt);
> +                       if (err) {
> +                               pr_err("report zones-> %d\n", err);
> +                               if (err != -ENOTSUPP)
> +                                       rcode = err;
> +                               goto out;
> +                       }
> +                       if (sdkp->rc_basis == 0) {
> +                               sector_t lba = be64_to_cpu(rpt->maximum_lba);
> +
> +                               if (lba + 1 > sdkp->capacity) {
> +                                       sd_printk(KERN_WARNING, sdkp,
> +                                                 FMT_CHANGING_CAPACITY "\n",
> +                                                 sdkp->capacity, lba + 1);
> +                                       sdkp->capacity = lba + 1;
> +                               }
> +                       }
> +                       offmax = max_report_entries(bufsz);
> +               }
> +               bzde = &rpt->descriptors[offset];
> +               if (z_size == 0)
> +                       z_size = get_len_from_desc(sdkp, bzde);
> +               if (z_size != get_len_from_desc(sdkp, bzde))
> +                       stop_size = 1;
> +               if ((iter + z_size) >= sdkp->capacity)
> +                       stop_end = 1;
> +
> +               if (!one_time_setup) {
> +                       u8 type = bzde->type & 0x0F;
> +
> +                       if (type != BLK_ZONE_TYPE_CONVENTIONAL) {
> +                               one_time_setup = 1;
> +                               blk_queue_chunk_sectors(sdkp->disk->queue,
> +                                                       z_size);
> +                       }
> +               }
> +
> +               if (do_fill == 0) {
> +                       if (stop_end || stop_size) {
> +                               /* include the next/last zone? */
> +                               if (!stop_size) {
> +                                       z_count++;
> +                                       iter += z_size;
> +                               }
> +                               cwps = alloc_cpws(z_count, iter,
> +                                                 z_start, z_size);
> +                               if (!cwps) {
> +                                       rcode = -ENOMEM;
> +                                       goto out;
> +                               }
> +                               if (array_count > 0)
> +                                       cwps->is_zoned = 1;
> +
> +                               zi->wps[array_count] = cwps;
> +                               z_start = iter;
> +                               z_size = 0;
> +                               z_count = 0;
> +                               array_count++;
> +                               if (array_count >= zi->wps_count) {
> +                                       rcode = wps_realloc(zi, gfp_mask);
> +                                       if (rcode)
> +                                               goto out;
> +                               }
> +                               /* add the runt zone */
> +                               if (stop_end && stop_size) {
> +                                       z_count++;
> +                                       z_size = get_len_from_desc(sdkp, bzde);
> +                                       cwps = alloc_cpws(z_count,
> +                                                         iter + z_size,
> +                                                         z_start, z_size);
> +                                       if (!cwps) {
> +                                               rcode = -ENOMEM;
> +                                               goto out;
> +                                       }
> +                                       if (array_count > 0)
> +                                               cwps->is_zoned = 1;
> +                                       zi->wps[array_count] = cwps;
> +                                       array_count++;
> +                               }
> +                               if (stop_end) {
> +                                       do_fill = 1;
> +                                       array_count = 0;
> +                                       z_count = 0;
> +                                       z_size = 0;
> +                                       goto fill;
> +                               }
> +                       }
> +                       z_size = get_len_from_desc(sdkp, bzde);
> +                       iter += z_size;
> +                       z_count++;
> +               } else {
> +                       fill_zone(zi->wps[array_count], z_count, sdkp, bzde);
> +                       z_count++;
> +                       iter += z_size;
> +                       if (zi->wps[array_count]->zone_count == z_count) {
> +                               z_count = 0;
> +                               array_count++;
> +                               zi->wps_count = array_count;
> +                       }
> +               }
> +               offset++;
> +               if (offset >= offmax)
> +                       offset = 0;
> +       }
> +out:
> +       kfree(rpt);
> +
> +       if (rcode) {
> +               if (zi) {
> +                       free_zone_wps(zi);
> +                       kfree(zi);
> +               }
> +       } else {
> +               q->zones = zi;
> +       }
> +
> +       return rcode;
> +}
> +
> +/**
> + * sd_zbc_config() - Configure a ZBC device (on attach)
> + * @sdkp: SCSI disk being attached.
> + * @gfp_mask: Memory allocation strategy
> + *
> + * Return: true of SD_ZBC_RESET_WP provisioning is supported
> + */
> +bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp_mask)
> +{
> +       bool can_reset_wp = false;
> +
> +       if (zbc_init_zones(sdkp, gfp_mask)) {
> +               sdev_printk(KERN_WARNING, sdkp->device,
> +                           "Initialize zone cache failed\n");
> +               goto out;
> +       }
> +
> +       if (sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC)
> +               can_reset_wp = true;
> +
> +       if (!sdkp->zone_work_q) {
> +               char wq_name[32];
> +
> +               sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name);
> +               sdkp->zone_work_q = create_singlethread_workqueue(wq_name);
> +               if (!sdkp->zone_work_q) {
> +                       sdev_printk(KERN_WARNING, sdkp->device,
> +                                   "create zoned disk workqueue failed\n");
> +                       goto out;
> +               }
> +       } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
> +               drain_workqueue(sdkp->zone_work_q);
> +               clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags);
> +       }
> +
> +out:
> +       return can_reset_wp;
>  }
>
>  /**
> @@ -1365,15 +1495,16 @@ void sd_zbc_remove(struct scsi_disk *sdkp)
>   */
>  unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp)
>  {
> -       unsigned int bytes = 1;
>         struct request_queue *q = sdkp->disk->queue;
> -       struct rb_node *node = rb_first(&q->zones);
> +       struct zone_wps *zi = q->zones;
> +       unsigned int bytes = 1;
>
> -       if (node) {
> -               struct blk_zone *zone = rb_entry(node, struct blk_zone, node);
> +       if (zi && zi->wps_count > 0) {
> +               struct contiguous_wps *wp = zi->wps[0];
>
> -               bytes = zone->len;
> +               bytes = wp->zone_size;
>         }
> +
>         bytes <<= ilog2(sdkp->device->sector_size);
>         return bytes;
>  }
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index d5cdb5d..113c5a8 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -264,27 +264,83 @@ struct blk_queue_tag {
>
>  #ifdef CONFIG_BLK_DEV_ZONED
>
> +/**
> + * struct blk_zone - A single zone type/stats and WP offset.
> + *
> + * @wp:    Holds the wp offset from the start of the zone.
> + * @type:  Holds the zone type nibble.
> + * @state: Holds the zone state nibble + kernel (zone busy)
> + * @private_data: Used to hold whatever the implicit domain owner
> + *                of the zone needs to track.
> + *
> + * Type is left at 4 bits (only 2 are needed currently) to match
> + * the current ZBC/ZAC standards.
> + *
> + * State is using 5 bits to accommodate the ZONE_BUSY. The first 4 bits
> + * match the current ZBC/ZAC spec.
> + * ZONE_BUSY could be mapped to one of the reserved bits. Using it as
> + * mask bit or independent flag my be useful for decoding the zone
> + * state before it transitioned to BUSY.
> + *
> + * A zone sized at order (39+9) is very unlikely (current zones are 16+9)
> + * Even at lba48 equivalent number of sectors we have a large amount
> + * of padding to fill out 8 bytes.
> + *
> + * Getting this to fit in 4 bytes would limit the maximum size of a zone
> + * to 4G [order 23 of 512 byte sectors + 9 bits for flags] which is probably
> + * okay for embedded or 32-bit systems where the private_data pointer
> + * would also shrink to 32 bits. There are also WP tracking schemes
> + * that don't make use of the private_data helper so perhaps that
> + * could be factored out as well.
> + */
>  struct blk_zone {
> -       struct rb_node node;
> -       spinlock_t lock;
> -       sector_t start;
> -       size_t len;
> -       sector_t wp;
> -       enum blk_zone_type type;
> -       enum blk_zone_state state;
> +       unsigned long long wp:39;
> +       unsigned long long type:4;
> +       unsigned long long state:5;
> +       unsigned long long padding:15;
>         void *private_data;
>  };
>
> +/**
> + * struct contiguous_wps - A descriptor of zones of the same size
> + *
> + * @start_lba:  LBA of first zone covered by the descriptor.
> + * @last_lba:   LBA of last zone.
> + * @zone_size:  Size of zones as a number of 512 byte sectors.
> + * @zone_count: Number of zones (last-start/size) for convenience.
> + * @lock:       A spinlock protecting these zones.
> + * @is_zoned:   0 when all zones are conventional no WP zones.
> + * zones:       Array of blk_zone entries.
> + */
> +struct contiguous_wps {
> +       u64 start_lba;
> +       u64 last_lba;
> +       u64 zone_size;
> +       u32 zone_count;
> +       spinlock_t lock;
> +       unsigned is_zoned:1;
> +       struct blk_zone zones[0];
> +};
> +
> +/**
> + * struct zone_wps - A collection of zone descriptors to describe zoned media.
> + *
> + * @wps_count:  Number of descriptors.
> + * @wps:        Array of zone descriptors.
> + */
> +struct zone_wps {
> +       u32 wps_count;
> +       struct contiguous_wps **wps;
> +};
> +
>  #define blk_zone_is_seq_req(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
>  #define blk_zone_is_seq_pref(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_PREF)
>  #define blk_zone_is_smr(z) (blk_zone_is_seq_req(z) || blk_zone_is_seq_pref(z))
>  #define blk_zone_is_cmr(z) ((z)->type == BLK_ZONE_TYPE_CONVENTIONAL)
> -#define blk_zone_is_full(z) ((z)->wp == (z)->start + (z)->len)
> -#define blk_zone_is_empty(z) ((z)->wp == (z)->start)
> +#define blk_zone_is_empty(z) ((z)->wp == 0)
>
> -extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t);
> -extern struct blk_zone *blk_insert_zone(struct request_queue *,
> -                                       struct blk_zone *);
> +extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t,
> +                                       sector_t *, sector_t *, spinlock_t **);
>  extern void blk_drop_zones(struct request_queue *);
>  #else
>  static inline void blk_drop_zones(struct request_queue *q) { };
> @@ -463,7 +519,7 @@ struct request_queue {
>         struct queue_limits     limits;
>
>  #ifdef CONFIG_BLK_DEV_ZONED
> -       struct rb_root          zones;
> +       struct zone_wps         *zones;
>  #endif
>         /*
>          * sg stuff
> --
> 2.9.3
>
Hannes Reinecke Aug. 22, 2016, 7:11 a.m. UTC | #2
On 08/22/2016 06:34 AM, Shaun Tancheff wrote:
> Currently the RB-Tree zone cache is fast and flexible. It does
> use a rather largish amount of ram. This model reduces the ram
> required from 120 bytes per zone to 16 bytes per zone with a
> moderate transformation of the blk_zone_lookup() api.
> 
> This model is predicated on the belief that most variations
> on zoned media will follow a pattern of using collections of same
> sized zones on a single device. Similar to the pattern of erase
> blocks on flash devices being progressivly larger 16K, 64K, ...
> 
> The goal is to be able to build a descriptor which is both memory
> efficient, performant, and flexible.
> 
> Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
> ---
>  block/blk-core.c       |    2 +-
>  block/blk-sysfs.c      |   31 +-
>  block/blk-zoned.c      |  103 +++--
>  drivers/scsi/sd.c      |    5 +-
>  drivers/scsi/sd.h      |    4 +-
>  drivers/scsi/sd_zbc.c  | 1025 +++++++++++++++++++++++++++---------------------
>  include/linux/blkdev.h |   82 +++-
>  7 files changed, 716 insertions(+), 536 deletions(-)
> 
Have you measured the performance impact here?
The main idea behind using an RB-tree is that each single element will
fit in the CPU cache; using an array will prevent that.
So we will increase the number of cache flushes, and most likely a
performance penalty, too.
Hence I'd rather like to see a performance measurement here before going
down that road.

Cheers,

Hannes
Shaun Tancheff Aug. 22, 2016, 3:43 p.m. UTC | #3
On Mon, Aug 22, 2016 at 2:11 AM, Hannes Reinecke <hare@suse.de> wrote:
> On 08/22/2016 06:34 AM, Shaun Tancheff wrote:
>> Currently the RB-Tree zone cache is fast and flexible. It does
>> use a rather largish amount of ram. This model reduces the ram
>> required from 120 bytes per zone to 16 bytes per zone with a
>> moderate transformation of the blk_zone_lookup() api.
>>
>> This model is predicated on the belief that most variations
>> on zoned media will follow a pattern of using collections of same
>> sized zones on a single device. Similar to the pattern of erase
>> blocks on flash devices being progressivly larger 16K, 64K, ...
>>
>> The goal is to be able to build a descriptor which is both memory
>> efficient, performant, and flexible.
>>
>> Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
>> ---
>>  block/blk-core.c       |    2 +-
>>  block/blk-sysfs.c      |   31 +-
>>  block/blk-zoned.c      |  103 +++--
>>  drivers/scsi/sd.c      |    5 +-
>>  drivers/scsi/sd.h      |    4 +-
>>  drivers/scsi/sd_zbc.c  | 1025 +++++++++++++++++++++++++++---------------------
>>  include/linux/blkdev.h |   82 +++-
>>  7 files changed, 716 insertions(+), 536 deletions(-)

> Have you measure the performance impact here?

As far as actual hardware (HostAware) goes, I am seeing the same
I/O performance. I suspect it's just that below 100k IOPS the
zone cache just isn't a bottleneck.

> The main idea behind using an RB-tree is that each single element will
> fit in the CPU cache; using an array will prevent that.
> So we will increase the number of cache flushes, and most likely a
> performance penalty, too.
> Hence I'd rather like to see a performance measurement here before going
> down that road.

I think it will have to be a simulated benchmark, if that's okay.

Of course I'm open to suggestions if there is something you have in mind.
diff mbox

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index 3a9caf7..3b084a8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -727,7 +727,7 @@  struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	INIT_LIST_HEAD(&q->blkg_list);
 #endif
 #ifdef CONFIG_BLK_DEV_ZONED
-	q->zones = RB_ROOT;
+	q->zones = NULL;
 #endif
 	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 43f441f..ecbd434 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -232,36 +232,7 @@  static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
 #ifdef CONFIG_BLK_DEV_ZONED
 static ssize_t queue_zoned_show(struct request_queue *q, char *page)
 {
-	struct rb_node *node;
-	struct blk_zone *zone;
-	ssize_t offset = 0, end = 0;
-	size_t size = 0, num = 0;
-	enum blk_zone_type type = BLK_ZONE_TYPE_UNKNOWN;
-
-	for (node = rb_first(&q->zones); node; node = rb_next(node)) {
-		zone = rb_entry(node, struct blk_zone, node);
-		if (zone->type != type ||
-		    zone->len != size ||
-		    end != zone->start) {
-			if (size != 0)
-				offset += sprintf(page + offset, "%zu\n", num);
-			/* We can only store one page ... */
-			if (offset + 42 > PAGE_SIZE) {
-				offset += sprintf(page + offset, "...\n");
-				return offset;
-			}
-			size = zone->len;
-			type = zone->type;
-			offset += sprintf(page + offset, "%zu %zu %d ",
-					  zone->start, size, type);
-			num = 0;
-			end = zone->start + size;
-		} else
-			end += zone->len;
-		num++;
-	}
-	offset += sprintf(page + offset, "%zu\n", num);
-	return offset;
+	return sprintf(page, "%u\n", q->zones ? 1 : 0);
 }
 #endif
 
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 975e863..338a1af 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -8,63 +8,84 @@ 
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
-#include <linux/rbtree.h>
+#include <linux/vmalloc.h>
 
-struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t lba)
+/**
+ * blk_lookup_zone() - Lookup zones
+ * @q: Request Queue
+ * @sector: Location to lookup
+ * @start: Pointer to starting location zone (OUT)
+ * @len: Pointer to length of zone (OUT)
+ * @lock: Pointer to spinlock of zones in owning descriptor (OUT)
+ */
+struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t sector,
+				 sector_t *start, sector_t *len,
+				 spinlock_t **lock)
 {
-	struct rb_root *root = &q->zones;
-	struct rb_node *node = root->rb_node;
+	int iter;
+	struct blk_zone *bzone = NULL;
+	struct zone_wps *zi = q->zones;
+
+	*start = 0;
+	*len = 0;
+	*lock = NULL;
+
+	if (!q->zones)
+		goto out;
 
-	while (node) {
-		struct blk_zone *zone = container_of(node, struct blk_zone,
-						     node);
+	for (iter = 0; iter < zi->wps_count; iter++) {
+		if (sector >= zi->wps[iter]->start_lba &&
+		    sector <  zi->wps[iter]->last_lba) {
+			struct contiguous_wps *wp = zi->wps[iter];
+			u64 index = (sector - wp->start_lba) / wp->zone_size;
 
-		if (lba < zone->start)
-			node = node->rb_left;
-		else if (lba >= zone->start + zone->len)
-			node = node->rb_right;
-		else
-			return zone;
+			if (index >= wp->zone_count) {
+				WARN(1, "Impossible index for zone\n");
+				goto out;
+			}
+
+			bzone = &wp->zones[index];
+			*len = wp->zone_size;
+			*start = wp->start_lba + (index * wp->zone_size);
+			*lock = &wp->lock;
+		}
 	}
-	return NULL;
+
+out:
+	return bzone;
 }
 EXPORT_SYMBOL_GPL(blk_lookup_zone);
 
-struct blk_zone *blk_insert_zone(struct request_queue *q, struct blk_zone *data)
+/**
+ * free_zone_wps() - Free up memory in use by wps
+ * @zi: zone wps array(s).
+ */
+static void free_zone_wps(struct zone_wps *zi)
 {
-	struct rb_root *root = &q->zones;
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	/* on error free the arrays */
+	if (zi && zi->wps) {
+		int ca;
 
-	/* Figure out where to put new node */
-	while (*new) {
-		struct blk_zone *this = container_of(*new, struct blk_zone,
-						     node);
-		parent = *new;
-		if (data->start + data->len <= this->start)
-			new = &((*new)->rb_left);
-		else if (data->start >= this->start + this->len)
-			new = &((*new)->rb_right);
-		else {
-			/* Return existing zone */
-			return this;
+		for (ca = 0; ca < zi->wps_count; ca++) {
+			if (zi->wps[ca]) {
+				vfree(zi->wps[ca]);
+				zi->wps[ca] = NULL;
+			}
 		}
+		kfree(zi->wps);
 	}
-	/* Add new node and rebalance tree. */
-	rb_link_node(&data->node, parent, new);
-	rb_insert_color(&data->node, root);
-
-	return NULL;
 }
-EXPORT_SYMBOL_GPL(blk_insert_zone);
 
+/**
+ * blk_drop_zones() - Free zones
+ * @q: Request Queue
+ */
 void blk_drop_zones(struct request_queue *q)
 {
-	struct rb_root *root = &q->zones;
-	struct blk_zone *zone, *next;
-
-	rbtree_postorder_for_each_entry_safe(zone, next, root, node) {
-		kfree(zone);
+	if (q->zones) {
+		free_zone_wps(q->zones);
+		kfree(q->zones);
+		q->zones = NULL;
 	}
-	q->zones = RB_ROOT;
 }
 EXPORT_SYMBOL_GPL(blk_drop_zones);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index f144df4..0f749f5 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2549,8 +2549,9 @@  got_data:
 				      sdkp->physical_block_size);
 	sdkp->device->sector_size = sector_size;
 
-	if (sd_zbc_config(sdkp, buffer, SD_BUF_SIZE))
-		sd_config_discard(sdkp, SD_ZBC_RESET_WP);
+	if (sdkp->first_scan)
+		if (sd_zbc_config(sdkp, GFP_KERNEL))
+			sd_config_discard(sdkp, SD_ZBC_RESET_WP);
 
 	{
 		char cap_str_2[10], cap_str_10[10];
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index fc766db..c9c79e9 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -299,13 +299,13 @@  extern void sd_zbc_uninit_command(struct scsi_cmnd *cmd);
 extern void sd_zbc_remove(struct scsi_disk *);
 extern void sd_zbc_reset_zones(struct scsi_disk *);
 extern void sd_zbc_update_zones(struct scsi_disk *, sector_t, int, int reason);
-extern bool sd_zbc_config(struct scsi_disk *, void *, size_t);
+extern bool sd_zbc_config(struct scsi_disk *, gfp_t);
 
 extern unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp);
 
 #else /* CONFIG_SCSI_ZBC */
 
-static inline bool sd_zbc_config(struct scsi_disk *sdkp, void *b, size_t sz)
+static inline bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp)
 {
 	return false;
 }
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 960af93..c087035 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -22,6 +22,7 @@ 
 
 #include <linux/blkdev.h>
 #include <linux/rbtree.h>
+#include <linux/vmalloc.h>
 
 #include <asm/unaligned.h>
 
@@ -51,11 +52,11 @@ 
 	} while( 0 )
 
 struct zbc_update_work {
-	struct work_struct zone_work;
-	struct scsi_disk *sdkp;
-	sector_t	zone_sector;
-	int		zone_buflen;
-	char		zone_buf[0];
+	struct work_struct	zone_work;
+	struct scsi_disk	*sdkp;
+	sector_t		zone_sector;
+	int			zone_buflen;
+	struct bdev_zone_report zone_buf[0];
 };
 
 /**
@@ -95,102 +96,19 @@  static inline sector_t get_start_from_desc(struct scsi_disk *sdkp,
 	return logical_to_sectors(sdkp->device, be64_to_cpu(bzde->lba_start));
 }
 
-static
-struct blk_zone *zbc_desc_to_zone(struct scsi_disk *sdkp, unsigned char *rec)
+static void _fill_zone(struct blk_zone *zone, struct scsi_disk *sdkp,
+		       struct bdev_zone_descriptor *bzde)
 {
-	struct blk_zone *zone;
-	sector_t wp = (sector_t)-1;
-
-	zone = kzalloc(sizeof(struct blk_zone), GFP_KERNEL);
-	if (!zone)
-		return NULL;
-
-	spin_lock_init(&zone->lock);
-	zone->type = rec[0] & 0xf;
-	zone->state = (rec[1] >> 4) & 0xf;
-	zone->len = logical_to_sectors(sdkp->device,
-				       get_unaligned_be64(&rec[8]));
-	zone->start = logical_to_sectors(sdkp->device,
-					 get_unaligned_be64(&rec[16]));
-
-	if (blk_zone_is_smr(zone))
-		wp = logical_to_sectors(sdkp->device,
-					get_unaligned_be64(&rec[24]));
-	zone->wp = wp;
-	/*
-	 * Fixup block zone state
-	 */
-	if (zone->state == BLK_ZONE_EMPTY &&
-	    zone->wp != zone->start) {
-		sd_zbc_debug(sdkp,
-			     "zone %zu state EMPTY wp %zu: adjust wp\n",
-			     zone->start, zone->wp);
-		zone->wp = zone->start;
-	}
-	if (zone->state == BLK_ZONE_FULL &&
-	    zone->wp != zone->start + zone->len) {
-		sd_zbc_debug(sdkp,
-			     "zone %zu state FULL wp %zu: adjust wp\n",
-			     zone->start, zone->wp);
-		zone->wp = zone->start + zone->len;
-	}
-
-	return zone;
+	zone->type = bzde->type & 0x0f;
+	zone->state = (bzde->flags >> 4) & 0x0f;
+	zone->wp = get_wp_from_desc(sdkp, bzde);
 }
 
-static
-sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf,
-			 unsigned int buf_len)
-{
-	struct request_queue *q = sdkp->disk->queue;
-	unsigned char *rec = buf;
-	int rec_no = 0;
-	unsigned int list_length;
-	sector_t next_sector = -1;
-	u8 same;
-
-	/* Parse REPORT ZONES header */
-	list_length = get_unaligned_be32(&buf[0]);
-	same = buf[4] & 0xf;
-	rec = buf + 64;
-	list_length += 64;
-
-	if (list_length < buf_len)
-		buf_len = list_length;
-
-	while (rec < buf + buf_len) {
-		struct blk_zone *this, *old;
-		unsigned long flags;
 
-		this = zbc_desc_to_zone(sdkp, rec);
-		if (!this)
-			break;
-
-		if (same == 0 && this->len != zlen) {
-			next_sector = this->start + this->len;
-			break;
-		}
-
-		next_sector = this->start + this->len;
-		old = blk_insert_zone(q, this);
-		if (old) {
-			spin_lock_irqsave(&old->lock, flags);
-			if (blk_zone_is_smr(old)) {
-				old->wp = this->wp;
-				old->state = this->state;
-			}
-			spin_unlock_irqrestore(&old->lock, flags);
-			kfree(this);
-		}
-		rec += 64;
-		rec_no++;
-	}
-
-	sd_zbc_debug(sdkp,
-		     "Inserted %d zones, next sector %zu len %d\n",
-		     rec_no, next_sector, list_length);
-
-	return next_sector;
+static void fill_zone(struct contiguous_wps *cwps, int z_count,
+		      struct scsi_disk *sdkp, struct bdev_zone_descriptor *bzde)
+{
+	_fill_zone(&cwps->zones[z_count], sdkp, bzde);
 }
 
 /**
@@ -200,12 +118,10 @@  sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf,
  * @bufflen: length of @buffer
  * @start_sector: logical sector for the zone information should be reported
  * @option: reporting option to be used
- * @partial: flag to set the 'partial' bit for report zones command
  */
-static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer,
-			       int bufflen, sector_t start_sector,
-			       enum zbc_zone_reporting_options option,
-			       bool partial)
+static int sd_zbc_report_zones(struct scsi_disk *sdkp,
+			       struct bdev_zone_report *buffer,
+			       int bufflen, sector_t start_sector, u8 option)
 {
 	struct scsi_device *sdp = sdkp->device;
 	const int timeout = sdp->request_queue->rq_timeout
@@ -225,7 +141,7 @@  static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer,
 	cmd[1] = ZI_REPORT_ZONES;
 	put_unaligned_be64(start_lba, &cmd[2]);
 	put_unaligned_be32(bufflen, &cmd[10]);
-	cmd[14] = (partial ? ZBC_REPORT_ZONE_PARTIAL : 0) | option;
+	cmd[14] = option;
 	memset(buffer, 0, bufflen);
 
 	result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
@@ -248,49 +164,38 @@  static void sd_zbc_refresh_zone_work(struct work_struct *work)
 		container_of(work, struct zbc_update_work, zone_work);
 	struct scsi_disk *sdkp = zbc_work->sdkp;
 	struct request_queue *q = sdkp->disk->queue;
-	unsigned char *zone_buf = zbc_work->zone_buf;
+	struct bdev_zone_report *rpt = zbc_work->zone_buf;
 	unsigned int zone_buflen = zbc_work->zone_buflen;
+	struct bdev_zone_descriptor *bzde;
+	int iter;
+	int offmax;
+	sector_t z_at, z_start, z_len;
+	spinlock_t *lock;
+	struct blk_zone *zone;
 	int ret;
-	u8 same;
-	u64 zlen = 0;
 	sector_t last_sector;
 	sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
 
-	ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen,
+	ret = sd_zbc_report_zones(sdkp, rpt, zone_buflen,
 				  zbc_work->zone_sector,
-				  ZBC_ZONE_REPORTING_OPTION_ALL, true);
+				  ZBC_ZONE_REPORTING_OPTION_ALL);
 	if (ret)
 		goto done_free;
 
-	/* this whole path is unlikely so extra reports shouldn't be a
-	 * large impact */
-	same = zone_buf[4] & 0xf;
-	if (same == 0) {
-		unsigned char *desc = &zone_buf[64];
-		unsigned int blen = zone_buflen;
-
-		/* just pull the first zone */
-		if (blen > 512)
-			blen = 512;
-		ret = sd_zbc_report_zones(sdkp, zone_buf, blen, 0,
-					  ZBC_ZONE_REPORTING_OPTION_ALL, true);
-		if (ret)
-			goto done_free;
-
-		/* Read the zone length from the first zone descriptor */
-		zlen = logical_to_sectors(sdkp->device,
-					  get_unaligned_be64(&desc[8]));
-
-		ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen,
-					  zbc_work->zone_sector,
-					  ZBC_ZONE_REPORTING_OPTION_ALL, true);
-		if (ret)
-			goto done_free;
+	offmax = max_report_entries(zone_buflen);
+	for (iter = 0; iter < offmax; iter++) {
+		bzde = &rpt->descriptors[iter];
+		z_at = get_start_from_desc(sdkp, bzde);
+		if (!z_at)
+			break;
+		zone = blk_lookup_zone(q, z_at, &z_start, &z_len, &lock);
+		if (zone) {
+			_fill_zone(zone, sdkp, bzde);
+			last_sector = z_start + z_len;
+		}
 	}
 
-	last_sector = zbc_parse_zones(sdkp, zlen, zone_buf, zone_buflen);
-	capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
-	if (last_sector != -1 && last_sector < capacity) {
+	if (sdkp->zone_work_q && last_sector != -1 && last_sector < capacity) {
 		if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
 			sd_zbc_debug(sdkp,
 				     "zones in reset, canceling refresh\n");
@@ -333,10 +238,7 @@  void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize,
 {
 	struct request_queue *q = sdkp->disk->queue;
 	struct zbc_update_work *zbc_work;
-	struct blk_zone *zone;
-	struct rb_node *node;
-	int zone_num = 0, zone_busy = 0, num_rec;
-	sector_t next_sector = sector;
+	int num_rec;
 
 	if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
 		sd_zbc_debug(sdkp,
@@ -346,18 +248,23 @@  void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize,
 
 	if (reason != SD_ZBC_INIT) {
 		/* lookup sector, is zone pref? then ignore */
-		struct blk_zone *zone = blk_lookup_zone(q, sector);
-
+		sector_t z_start, z_len;
+		spinlock_t *lck;
+		struct blk_zone *zone = blk_lookup_zone(q, sector, &z_start,
+							&z_len, &lck);
+		/* zone actions on conventional zones are invalid */
+		if (zone && reason == SD_ZBC_RESET_WP && blk_zone_is_cmr(zone))
+			return;
 		if (reason == SD_ZBC_RESET_WP)
 			sd_zbc_debug(sdkp, "RESET WP failed %lx\n", sector);
-
-		if (zone && blk_zone_is_seq_pref(zone))
-			return;
 	}
 
+	if (!sdkp->zone_work_q)
+		return;
+
 retry:
 	zbc_work = kzalloc(sizeof(struct zbc_update_work) + bufsize,
-			   reason != SD_ZBC_INIT ? GFP_NOWAIT : GFP_KERNEL);
+			   reason != SD_ZBC_INIT ? GFP_ATOMIC : GFP_KERNEL);
 	if (!zbc_work) {
 		if (bufsize > 512) {
 			sd_zbc_debug(sdkp,
@@ -381,30 +288,40 @@  retry:
 	 * Mark zones under update as BUSY
 	 */
 	if (reason != SD_ZBC_INIT) {
-		for (node = rb_first(&q->zones); node; node = rb_next(node)) {
-			unsigned long flags;
-
-			zone = rb_entry(node, struct blk_zone, node);
-			if (num_rec == 0)
+		unsigned long flags;
+		int iter;
+		struct zone_wps *zi = q->zones;
+		struct contiguous_wps *wp = NULL;
+		u64 index = -1;
+		int zone_busy = 0;
+		int z_flgd = 0;
+
+		for (iter = 0; iter < zi->wps_count; iter++) {
+			if (sector >= zi->wps[iter]->start_lba &&
+			    sector <  zi->wps[iter]->last_lba) {
+				wp = zi->wps[iter];
 				break;
-			if (zone->start != next_sector)
-				continue;
-			next_sector += zone->len;
-			num_rec--;
-
-			spin_lock_irqsave(&zone->lock, flags);
-			if (blk_zone_is_smr(zone)) {
-				if (zone->state == BLK_ZONE_BUSY) {
+			}
+		}
+		if (wp) {
+			spin_lock_irqsave(&wp->lock, flags);
+			index = (sector - wp->start_lba) / wp->zone_size;
+			while (index < wp->zone_count && z_flgd < num_rec) {
+				struct blk_zone *bzone = &wp->zones[index];
+
+				index++;
+				z_flgd++;
+				if (!blk_zone_is_smr(bzone))
+					continue;
+
+				if (bzone->state == BLK_ZONE_BUSY)
 					zone_busy++;
-				} else {
-					zone->state = BLK_ZONE_BUSY;
-					zone->wp = zone->start;
-				}
-				zone_num++;
+				else
+					bzone->state = BLK_ZONE_BUSY;
 			}
-			spin_unlock_irqrestore(&zone->lock, flags);
+			spin_unlock_irqrestore(&wp->lock, flags);
 		}
-		if (zone_num && (zone_num == zone_busy)) {
+		if (z_flgd && (z_flgd == zone_busy)) {
 			sd_zbc_debug(sdkp,
 				     "zone update for %zu in progress\n",
 				     sector);
@@ -476,43 +393,26 @@  static void discard_or_write_same(struct scsi_cmnd *cmd, sector_t sector,
 int sd_zbc_setup_discard(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
-	struct scsi_device *sdp = cmd->device;
 	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
 	sector_t sector = blk_rq_pos(rq);
 	unsigned int nr_sectors = blk_rq_sectors(rq);
 	int ret = BLKPREP_OK;
 	struct blk_zone *zone;
 	unsigned long flags;
-	u32 wp_offset;
 	bool use_write_same = false;
+	sector_t z_start, z_len;
+	spinlock_t *lck;
 
-	zone = blk_lookup_zone(rq->q, sector);
-	if (!zone) {
-		/* Test for a runt zone before giving up */
-		if (sdp->type != TYPE_ZBC) {
-			struct request_queue *q = rq->q;
-			struct rb_node *node;
-
-			node = rb_last(&q->zones);
-			if (node)
-				zone = rb_entry(node, struct blk_zone, node);
-			if (zone) {
-				spin_lock_irqsave(&zone->lock, flags);
-				if ((zone->start + zone->len) <= sector)
-					goto out;
-				spin_unlock_irqrestore(&zone->lock, flags);
-				zone = NULL;
-			}
-		}
+	zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck);
+	if (!zone)
 		return BLKPREP_KILL;
-	}
 
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock_irqsave(lck, flags);
 	if (zone->state == BLK_ZONE_UNKNOWN ||
 	    zone->state == BLK_ZONE_BUSY) {
 		sd_zbc_debug_ratelimit(sdkp,
 				       "Discarding zone %zx state %x, deferring\n",
-				       zone->start, zone->state);
+				       z_start, zone->state);
 		ret = BLKPREP_DEFER;
 		goto out;
 	}
@@ -520,39 +420,37 @@  int sd_zbc_setup_discard(struct scsi_cmnd *cmd)
 		/* let the drive fail the command */
 		sd_zbc_debug_ratelimit(sdkp,
 				       "Discarding offline zone %zx\n",
-				       zone->start);
+				       z_start);
 		goto out;
 	}
 	if (blk_zone_is_cmr(zone)) {
 		use_write_same = true;
 		sd_zbc_debug_ratelimit(sdkp,
-				       "Discarding CMR zone %zx\n",
-				       zone->start);
+				       "Discarding CMR zone %zx\n", z_start);
 		goto out;
 	}
-	if (zone->start != sector || zone->len < nr_sectors) {
+	if (z_start != sector || z_len < nr_sectors) {
 		sd_printk(KERN_ERR, sdkp,
 			  "Misaligned RESET WP %zx/%x on zone %zx/%zx\n",
-			  sector, nr_sectors, zone->start, zone->len);
+			  sector, nr_sectors, z_start, z_len);
 		ret = BLKPREP_KILL;
 		goto out;
 	}
 	/* Protect against Reset WP when more data had been written to the
 	 * zone than is being discarded.
 	 */
-	wp_offset = zone->wp - zone->start;
-	if (wp_offset > nr_sectors) {
+	if (zone->wp > nr_sectors) {
 		sd_printk(KERN_ERR, sdkp,
-			  "Will Corrupt RESET WP %zx/%x/%x on zone %zx/%zx/%zx\n",
-			  sector, wp_offset, nr_sectors,
-			  zone->start, zone->wp, zone->len);
+			  "Will Corrupt RESET WP %zx/%zx/%x on zone %zx/%zx/%zx\n",
+			  sector, (sector_t)zone->wp, nr_sectors,
+			  z_start, z_start + zone->wp, z_len);
 		ret = BLKPREP_KILL;
 		goto out;
 	}
 	if (blk_zone_is_empty(zone)) {
 		sd_zbc_debug_ratelimit(sdkp,
 				       "Discarding empty zone %zx [WP: %zx]\n",
-				       zone->start, zone->wp);
+				       z_start, (sector_t)zone->wp);
 		ret = BLKPREP_DONE;
 		goto out;
 	}
@@ -563,8 +461,8 @@  out:
 	 * zone update if RESET WRITE POINTER fails.
 	 */
 	if (ret == BLKPREP_OK && !use_write_same)
-		zone->wp = zone->start;
-	spin_unlock_irqrestore(&zone->lock, flags);
+		zone->wp = 0;
+	spin_unlock_irqrestore(lck, flags);
 
 	if (ret == BLKPREP_OK)
 		discard_or_write_same(cmd, sector, nr_sectors, use_write_same);
@@ -573,13 +471,14 @@  out:
 }
 
 
-static void __set_zone_state(struct blk_zone *zone, int op)
+static void __set_zone_state(struct blk_zone *zone, sector_t z_len,
+			     spinlock_t *lck, int op)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&zone->lock, flags);
-	if (blk_zone_is_cmr(zone))
-		goto out_unlock;
+	spin_lock_irqsave(lck, flags);
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		goto out;
 
 	switch (op) {
 	case REQ_OP_ZONE_OPEN:
@@ -587,38 +486,45 @@  static void __set_zone_state(struct blk_zone *zone, int op)
 		break;
 	case REQ_OP_ZONE_FINISH:
 		zone->state = BLK_ZONE_FULL;
-		zone->wp = zone->start + zone->len;
+		zone->wp = z_len;
 		break;
 	case REQ_OP_ZONE_CLOSE:
 		zone->state = BLK_ZONE_CLOSED;
 		break;
 	case REQ_OP_ZONE_RESET:
-		zone->wp = zone->start;
+		zone->wp = 0;
 		break;
 	default:
 		WARN_ONCE(1, "%s: invalid op code: %u\n", __func__, op);
 	}
-out_unlock:
-	spin_unlock_irqrestore(&zone->lock, flags);
+out:
+	spin_unlock_irqrestore(lck, flags);
 }
 
 static void update_zone_state(struct request *rq, sector_t lba, unsigned int op)
 {
-	struct request_queue *q = rq->q;
-	struct blk_zone *zone = NULL;
+	struct blk_zone *zone;
 
 	if (lba == ~0ul) {
-		struct rb_node *node;
-
-		for (node = rb_first(&q->zones); node; node = rb_next(node)) {
-			zone = rb_entry(node, struct blk_zone, node);
-			__set_zone_state(zone, op);
+		struct zone_wps *zi = rq->q->zones;
+		struct contiguous_wps *wp;
+		u32 iter, entry;
+
+		for (iter = 0; iter < zi->wps_count; iter++) {
+			wp = zi->wps[iter];
+			for (entry = 0; entry < wp->zone_count; entry++) {
+				zone = &wp->zones[entry];
+				__set_zone_state(zone, wp->zone_size, &wp->lock,
+						 op);
+			}
 		}
-		return;
 	} else {
-		zone = blk_lookup_zone(q, lba);
+		sector_t z_start, z_len;
+		spinlock_t *lck;
+
+		zone = blk_lookup_zone(rq->q, lba, &z_start, &z_len, &lck);
 		if (zone)
-			__set_zone_state(zone, op);
+			__set_zone_state(zone, z_len, lck, op);
 	}
 }
 
@@ -641,6 +547,8 @@  int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd)
 	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
 	sector_t sector = blk_rq_pos(rq);
 	struct blk_zone *zone;
+	spinlock_t *lck;
+	sector_t z_start, z_len;
 	unsigned long flags;
 	unsigned int nr_sectors;
 	int ret = BLKPREP_DONE;
@@ -651,17 +559,17 @@  int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd)
 	if (is_fua || op != REQ_OP_ZONE_RESET)
 		goto out;
 
-	zone = blk_lookup_zone(rq->q, sector);
+	zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck);
 	if (!zone || sdkp->provisioning_mode != SD_ZBC_RESET_WP)
 		goto out;
 
 	/* Map a Reset WP w/o FUA to a discard request */
-	spin_lock_irqsave(&zone->lock, flags);
-	sector = zone->start;
-	nr_sectors = zone->len;
+	spin_lock_irqsave(lck, flags);
+	sector = z_start;
+	nr_sectors = z_len;
 	if (blk_zone_is_cmr(zone))
 		use_write_same = true;
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock_irqrestore(lck, flags);
 
 	rq->completion_data = NULL;
 	if (use_write_same) {
@@ -712,137 +620,157 @@  static sector_t bzrpt_fill(struct request *rq,
 			   struct bdev_zone_descriptor *bzd,
 			   size_t sz, sector_t lba, u8 opt)
 {
-	struct request_queue *q = rq->q;
 	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+	struct scsi_device *sdp = sdkp->device;
+	struct zone_wps *zi = rq->q->zones;
+	struct contiguous_wps *wpdscr;
 	struct blk_zone *zone = NULL;
-	struct rb_node *node = NULL;
 	sector_t progress = lba;
 	sector_t clen = ~0ul;
+	sector_t z_start, z_len, z_wp_abs;
 	unsigned long flags;
 	u32 max_entries = bzrpt ? max_report_entries(sz) : sz / sizeof(*bzd);
 	u32 entry = 0;
+	u32 iter, idscr;
 	int len_diffs = 0;
 	int type_diffs = 0;
 	u8 ctype;
 	u8 same = 0;
 
-	zone = blk_lookup_zone(q, lba);
-	if (zone)
-		node = &zone->node;
-
-	for (entry = 0; entry < max_entries && node; node = rb_next(node)) {
-		u64 z_len, z_start, z_wp_abs;
-		u8 cond = 0;
-		u8 flgs = 0;
-
-		spin_lock_irqsave(&zone->lock, flags);
-		z_len = zone->len;
-		z_start = zone->start;
-		z_wp_abs = zone->wp;
-		progress = z_start + z_len;
-		cond = zone->state;
-		if (blk_zone_is_cmr(zone))
-			flgs |= 0x02;
-		else if (zone->wp != zone->start)
-			flgs |= 0x01; /* flag as RWP recommended? */
-		spin_unlock_irqrestore(&zone->lock, flags);
-
-		switch (opt & ZBC_REPORT_OPTION_MASK) {
-		case ZBC_ZONE_REPORTING_OPTION_EMPTY:
-			if (z_wp_abs != z_start)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN:
-			if (cond != BLK_ZONE_OPEN)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN:
-			if (cond != BLK_ZONE_OPEN_EXPLICIT)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_CLOSED:
-			if (cond != BLK_ZONE_CLOSED)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_FULL:
-			if (cond != BLK_ZONE_FULL)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_READONLY:
-			if (cond == BLK_ZONE_READONLY)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_OFFLINE:
-			if (cond == BLK_ZONE_OFFLINE)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP:
-			if (z_wp_abs == z_start)
-				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_NON_WP:
-			if (cond == BLK_ZONE_NO_WP)
+	for (iter = 0; entry < max_entries && iter < zi->wps_count; iter++) {
+		wpdscr = zi->wps[iter];
+		if (lba > wpdscr->last_lba)
+			continue;
+
+		spin_lock_irqsave(&wpdscr->lock, flags);
+		for (idscr = 0;
+		     entry < max_entries && idscr < wpdscr->zone_count;
+		     idscr++) {
+			struct bdev_zone_descriptor *dscr;
+			u64 zoff = idscr * wpdscr->zone_size;
+			u8 cond, flgs = 0;
+
+			z_len = wpdscr->zone_size;
+			zoff = idscr * z_len;
+			z_start = wpdscr->start_lba + zoff;
+			if (lba >= z_start + z_len)
 				continue;
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE:
-			/* this can only be reported by the HW */
-			break;
-		case ZBC_ZONE_REPORTING_OPTION_ALL:
-		default:
-			break;
-		}
 
-		/* if same code only applies to returned zones */
-		if (opt & ZBC_REPORT_ZONE_PARTIAL) {
-			if (clen != ~0ul) {
-				clen = z_len;
+			zone = &wpdscr->zones[idscr];
+			if (blk_zone_is_cmr(zone))
+				z_wp_abs = z_start + wpdscr->zone_size;
+			else
+				z_wp_abs = z_start + zone->wp;
+
+			switch (opt & ZBC_REPORT_OPTION_MASK) {
+			case ZBC_ZONE_REPORTING_OPTION_EMPTY:
+				if (z_wp_abs != z_start)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN:
+				if (zone->state != BLK_ZONE_OPEN)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN:
+				if (zone->state != BLK_ZONE_OPEN_EXPLICIT)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_CLOSED:
+				if (zone->state != BLK_ZONE_CLOSED)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_FULL:
+				if (zone->state != BLK_ZONE_FULL)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_READONLY:
+				if (zone->state == BLK_ZONE_READONLY)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_OFFLINE:
+				if (zone->state == BLK_ZONE_OFFLINE)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP:
+				if (z_wp_abs == z_start)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_NON_WP:
+				if (zone->state == BLK_ZONE_NO_WP)
+					continue;
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE:
+				/* this can only be reported by the HW */
+				break;
+			case ZBC_ZONE_REPORTING_OPTION_ALL:
+			default:
+				break;
+			}
+
+			/* if same code only applies to returned zones */
+			if (opt & ZBC_REPORT_ZONE_PARTIAL) {
+				if (clen == ~0ul) {
+					clen = z_len;
+					ctype = zone->type;
+				}
+				if (z_len != clen)
+					len_diffs++;
+				if (zone->type != ctype)
+					type_diffs++;
 				ctype = zone->type;
 			}
-			if (z_len != clen)
-				len_diffs++;
-			if (zone->type != ctype)
-				type_diffs++;
-			ctype = zone->type;
-		}
+			progress = z_start + z_len;
 
-		/* shift to device units */
-		z_start >>= ilog2(sdkp->device->sector_size) - 9;
-		z_len >>= ilog2(sdkp->device->sector_size) - 9;
-		z_wp_abs >>= ilog2(sdkp->device->sector_size) - 9;
+			if (!bzd) {
+				if (bzrpt)
+					bzrpt->descriptor_count =
+						cpu_to_be32(++entry);
+				continue;
+			}
 
-		if (!bzd) {
+			/* shift to device units */
+			z_start >>= ilog2(sdp->sector_size) - 9;
+			z_len >>= ilog2(sdp->sector_size) - 9;
+			z_wp_abs >>= ilog2(sdp->sector_size) - 9;
+
+			cond = zone->state;
+			if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+				flgs |= 0x02;
+			else if (zone->wp)
+				flgs |= 0x01; /* flag as RWP recommended? */
+
+			dscr = &bzd[entry];
+			dscr->lba_start = cpu_to_be64(z_start);
+			dscr->length = cpu_to_be64(z_len);
+			dscr->lba_wptr = cpu_to_be64(z_wp_abs);
+			dscr->type = zone->type;
+			dscr->flags = cond << 4 | flgs;
+			entry++;
 			if (bzrpt)
-				bzrpt->descriptor_count =
-					cpu_to_be32(++entry);
-			continue;
+				bzrpt->descriptor_count = cpu_to_be32(entry);
 		}
-
-		bzd[entry].lba_start = cpu_to_be64(z_start);
-		bzd[entry].length = cpu_to_be64(z_len);
-		bzd[entry].lba_wptr = cpu_to_be64(z_wp_abs);
-		bzd[entry].type = zone->type;
-		bzd[entry].flags = cond << 4 | flgs;
-		entry++;
-		if (bzrpt)
-			bzrpt->descriptor_count = cpu_to_be32(entry);
+		spin_unlock_irqrestore(&wpdscr->lock, flags);
 	}
 
 	/* if same code applies to all zones */
 	if (bzrpt && !(opt & ZBC_REPORT_ZONE_PARTIAL)) {
-		for (node = rb_first(&q->zones); node; node = rb_next(node)) {
-			zone = rb_entry(node, struct blk_zone, node);
-
-			spin_lock_irqsave(&zone->lock, flags);
-			if (clen != ~0ul) {
-				clen = zone->len;
+		for (iter = 0; iter < zi->wps_count; iter++) {
+			wpdscr = zi->wps[iter];
+			spin_lock_irqsave(&wpdscr->lock, flags);
+			for (idscr = 0; idscr < wpdscr->zone_count; idscr++) {
+				z_len = wpdscr->zone_size;
+				zone = &wpdscr->zones[idscr];
+				if (clen == ~0ul) {
+					clen = z_len;
+					ctype = zone->type;
+				}
+				if (z_len != clen)
+					len_diffs++;
+				if (zone->type != ctype)
+					type_diffs++;
 				ctype = zone->type;
 			}
-			if (zone->len != clen)
-				len_diffs++;
-			if (zone->type != ctype)
-				type_diffs++;
-			ctype = zone->type;
-			spin_unlock_irqrestore(&zone->lock, flags);
+			spin_unlock_irqrestore(&wpdscr->lock, flags);
 		}
 	}
 
@@ -985,12 +913,15 @@  out:
 int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
 			    sector_t sector, unsigned int *num_sectors)
 {
+	struct request_queue *q = sdkp->disk->queue;
 	struct blk_zone *zone;
+	sector_t z_start, z_len;
+	spinlock_t *lck;
 	unsigned int sectors = *num_sectors;
 	int ret = BLKPREP_OK;
 	unsigned long flags;
 
-	zone = blk_lookup_zone(sdkp->disk->queue, sector);
+	zone = blk_lookup_zone(q, sector, &z_start, &z_len, &lck);
 	if (!zone) {
 		/* Might happen during zone initialization */
 		sd_zbc_debug_ratelimit(sdkp,
@@ -999,7 +930,7 @@  int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
 		return BLKPREP_OK;
 	}
 
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock_irqsave(lck, flags);
 
 	if (blk_zone_is_cmr(zone))
 		goto out;
@@ -1008,7 +939,7 @@  int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
 	    zone->state == BLK_ZONE_BUSY) {
 		sd_zbc_debug_ratelimit(sdkp,
 				       "zone %zu state %x, deferring\n",
-				       zone->start, zone->state);
+				       z_start, zone->state);
 		ret = BLKPREP_DEFER;
 		goto out;
 	}
@@ -1017,25 +948,22 @@  int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
 		if (op_is_write(req_op(rq))) {
 			u64 nwp = sector + sectors;
 
-			while (nwp > (zone->start + zone->len)) {
-				struct rb_node *node = rb_next(&zone->node);
+			while (nwp > (z_start + z_len)) {
+				zone->wp = z_len;
+				sector = z_start + z_len;
+				sectors = nwp - sector;
+				spin_unlock_irqrestore(lck, flags);
 
-				zone->wp = zone->start + zone->len;
-				sector = zone->wp;
-				sectors = nwp - zone->wp;
-				spin_unlock_irqrestore(&zone->lock, flags);
-
-				if (!node)
-					return BLKPREP_OK;
-				zone = rb_entry(node, struct blk_zone, node);
+				zone = blk_lookup_zone(q, sector,
+						       &z_start, &z_len, &lck);
 				if (!zone)
 					return BLKPREP_OK;
 
-				spin_lock_irqsave(&zone->lock, flags);
+				spin_lock_irqsave(lck, flags);
 				nwp = sector + sectors;
 			}
-			if (nwp > zone->wp)
-				zone->wp = nwp;
+			if (nwp > z_start + zone->wp)
+				zone->wp = nwp - z_start;
 		}
 		goto out;
 	}
@@ -1044,37 +972,37 @@  int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
 		/* let the drive fail the command */
 		sd_zbc_debug_ratelimit(sdkp,
 				       "zone %zu offline\n",
-				       zone->start);
+				       z_start);
 		goto out;
 	}
 
 	if (op_is_write(req_op(rq))) {
 		if (zone->state == BLK_ZONE_READONLY)
 			goto out;
-		if (blk_zone_is_full(zone)) {
+		if (zone->wp == z_len) {
 			sd_zbc_debug(sdkp,
-				     "Write to full zone %zu/%zu\n",
-				     sector, zone->wp);
+				     "Write to full zone %zu/%zu/%zu\n",
+				     sector, (sector_t)zone->wp, z_len);
 			ret = BLKPREP_KILL;
 			goto out;
 		}
-		if (zone->wp != sector) {
+		if (sector != (z_start + zone->wp)) {
 			sd_zbc_debug(sdkp,
 				     "Misaligned write %zu/%zu\n",
-				     sector, zone->wp);
+				     sector, z_start + zone->wp);
 			ret = BLKPREP_KILL;
 			goto out;
 		}
 		zone->wp += sectors;
-	} else if (zone->wp <= sector + sectors) {
-		if (zone->wp <= sector) {
+	} else if (z_start + zone->wp <= sector + sectors) {
+		if (z_start + zone->wp <= sector) {
 			/* Read beyond WP: clear request buffer */
 			struct req_iterator iter;
 			struct bio_vec bvec;
 			void *buf;
 			sd_zbc_debug(sdkp,
 				     "Read beyond wp %zu+%u/%zu\n",
-				     sector, sectors, zone->wp);
+				     sector, sectors, z_start + zone->wp);
 			rq_for_each_segment(bvec, rq, iter) {
 				buf = bvec_kmap_irq(&bvec, &flags);
 				memset(buf, 0, bvec.bv_len);
@@ -1085,15 +1013,15 @@  int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq,
 			goto out;
 		}
 		/* Read straddle WP position: limit request size */
-		*num_sectors = zone->wp - sector;
+		*num_sectors = z_start + zone->wp - sector;
 		sd_zbc_debug(sdkp,
 			     "Read straddle wp %zu+%u/%zu => %zu+%u\n",
-			     sector, sectors, zone->wp,
+			     sector, sectors, z_start + zone->wp,
 			     sector, *num_sectors);
 	}
 
 out:
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock_irqrestore(lck, flags);
 
 	return ret;
 }
@@ -1145,21 +1073,22 @@  static void update_zones_from_report(struct scsi_cmnd *cmd, u32 nr_bytes)
 			struct bdev_zone_descriptor *entry = &bzde[iter];
 			sector_t s = get_start_from_desc(sdkp, entry);
 			sector_t z_len = get_len_from_desc(sdkp, entry);
+			sector_t z_strt;
+			spinlock_t *lck;
 			unsigned long flags;
 
 			if (!z_len)
 				goto done;
 
-			zone = blk_lookup_zone(rq->q, s);
+			zone = blk_lookup_zone(rq->q, s, &z_strt, &z_len, &lck);
 			if (!zone)
 				goto done;
 
-			spin_lock_irqsave(&zone->lock, flags);
+			spin_lock_irqsave(lck, flags);
 			zone->type = entry->type & 0xF;
 			zone->state = (entry->flags >> 4) & 0xF;
 			zone->wp = get_wp_from_desc(sdkp, entry);
-			zone->len = z_len;
-			spin_unlock_irqrestore(&zone->lock, flags);
+			spin_unlock_irqrestore(lck, flags);
 		}
 		nread += len;
 		if (!dmax)
@@ -1233,113 +1162,314 @@  void sd_zbc_uninit_command(struct scsi_cmnd *cmd)
 }
 
 /**
- * sd_zbc_init - Load zones of matching zlen size into rb tree.
+ * alloc_cpws() - Allocate space for a contiguous set of write pointers
+ * @items: Number of wps needed.
+ * @lba: lba of the start of the next zone.
+ * @z_start: Starting lba of this contiguous set.
+ * @z_size: Size of each zone this contiguous set.
  *
+ * Return: Allocated wps or NULL on error.
  */
-static int sd_zbc_init(struct scsi_disk *sdkp, u64 zlen, char *buf, int buf_len)
+static struct contiguous_wps *alloc_cpws(int items, u64 lba, u64 z_start,
+					 u64 z_size)
 {
-	sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
-	sector_t last_sector;
+	struct contiguous_wps *cwps = NULL;
+	size_t sz;
 
-	if (test_and_set_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags)) {
-		sdev_printk(KERN_WARNING, sdkp->device,
-			    "zone initialization already running\n");
-		return 0;
+	sz = sizeof(struct contiguous_wps) + (items * sizeof(struct blk_zone));
+	if (items) {
+		cwps = vzalloc(sz);
+		if (!cwps)
+			goto out;
+		spin_lock_init(&cwps->lock);
+		cwps->start_lba = z_start;
+		cwps->last_lba = lba - 1;
+		cwps->zone_size = z_size;
+		cwps->is_zoned = items > 1 ? 1 : 0;
+		cwps->zone_count = items;
 	}
 
-	if (!sdkp->zone_work_q) {
-		char wq_name[32];
+out:
+	return cwps;
+}
 
-		sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name);
-		sdkp->zone_work_q = create_singlethread_workqueue(wq_name);
-		if (!sdkp->zone_work_q) {
-			sdev_printk(KERN_WARNING, sdkp->device,
-				    "create zoned disk workqueue failed\n");
-			return -ENOMEM;
+/**
+ * free_zone_wps() - Free up memory in use by wps
+ * @zi: zone wps array(s).
+ */
+static void free_zone_wps(struct zone_wps *zi)
+{
+	/* on error free the arrays */
+	if (zi && zi->wps) {
+		int ca;
+
+		for (ca = 0; ca < zi->wps_count; ca++) {
+			if (zi->wps[ca]) {
+				vfree(zi->wps[ca]);
+				zi->wps[ca] = NULL;
+			}
 		}
-	} else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
-		drain_workqueue(sdkp->zone_work_q);
-		clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags);
+		kfree(zi->wps);
 	}
+}
 
-	last_sector = zbc_parse_zones(sdkp, zlen, buf, buf_len);
-	capacity = logical_to_sectors(sdkp->device, sdkp->capacity);
-	if (last_sector != -1 && last_sector < capacity) {
-		sd_zbc_update_zones(sdkp, last_sector,
-				    SD_ZBC_BUF_SIZE, SD_ZBC_INIT);
-	} else
-		clear_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags);
+static int wps_realloc(struct zone_wps *zi, gfp_t gfp_mask)
+{
+	int rcode = 0;
+	struct contiguous_wps **old;
+	struct contiguous_wps **tmp;
+	int n = zi->wps_count * 2;
+
+	old = zi->wps;
+	tmp = kcalloc(n, sizeof(*zi->wps), gfp_mask);
+	if (!tmp) {
+		rcode = -ENOMEM;
+		goto out;
+	}
+	memcpy(tmp, zi->wps, zi->wps_count * sizeof(*zi->wps));
+	zi->wps = tmp;
+	kfree(old);
 
-	return 0;
+out:
+	return rcode;
 }
 
+#define FMT_CHANGING_CAPACITY "Changing capacity from %zu to Max LBA+1 %zu"
+
 /**
- * sd_zbc_config() - Configure a ZBC device (on attach)
- * @sdkp: SCSI disk being attached.
- * @buffer: Buffer to working data.
- * @buf_sz: Size of buffer to use for working data
+ * zbc_init_zones() - Re-Sync expected WP location with drive
+ * @sdkp: scsi_disk
+ * @gfp_mask: Allocation mask.
  *
- * Return: true of SD_ZBC_RESET_WP provisioning is supported
+ * Return: 0 on success, otherwise error.
  */
-bool sd_zbc_config(struct scsi_disk *sdkp, void *buffer, size_t buf_sz)
+int zbc_init_zones(struct scsi_disk *sdkp, gfp_t gfp_mask)
 {
-	struct bdev_zone_report *bzrpt = buffer;
-	u64 zone_len, lba;
-	int retval;
-	u32 rep_len;
-	u8 same;
+	struct request_queue *q = sdkp->disk->queue;
+	int rcode = 0;
+	int entry = 0;
+	int offset;
+	int offmax;
+	u64 iter;
+	u64 z_start = 0ul;
+	u64 z_size = 0; /* size of zone */
+	int z_count = 0; /* number of zones of z_size */
+	int do_fill = 0;
+	int array_count = 0;
+	int one_time_setup = 0;
+	u8 opt = ZBC_ZONE_REPORTING_OPTION_ALL;
+	size_t bufsz = SD_ZBC_BUF_SIZE;
+	struct bdev_zone_report *rpt = NULL;
+	struct zone_wps *zi = NULL;
+	struct contiguous_wps *cwps = NULL;
+
+	if (q->zones)
+		goto out;
 
-	if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC)
-		/*
-		 * Device managed or normal SCSI disk,
-		 * no special handling required
-		 */
-		return false;
-
-	retval = sd_zbc_report_zones(sdkp, bzrpt, buf_sz,
-				     0, ZBC_ZONE_REPORTING_OPTION_ALL, false);
-	if (retval < 0)
-		return false;
-
-	rep_len = be32_to_cpu(bzrpt->descriptor_count);
-	if (rep_len < 7) {
-		sd_printk(KERN_WARNING, sdkp,
-			  "REPORT ZONES report invalid length %u\n",
-			  rep_len);
-		return false;
+	zi = kzalloc(sizeof(*zi), gfp_mask);
+	if (!zi) {
+		rcode = -ENOMEM;
+		goto out;
 	}
 
-	if (sdkp->rc_basis == 0) {
-		/* The max_lba field is the capacity of a zoned device */
-		lba = be64_to_cpu(bzrpt->maximum_lba);
-		if (lba + 1 > sdkp->capacity) {
-			if (sdkp->first_scan)
-				sd_printk(KERN_WARNING, sdkp,
-					  "Changing capacity from %zu to Max LBA+1 %zu\n",
-					  sdkp->capacity, (sector_t) lba + 1);
-			sdkp->capacity = lba + 1;
+	if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC) {
+		struct gendisk *disk = sdkp->disk;
+
+		zi->wps = kzalloc(sizeof(*zi->wps), gfp_mask);
+		if (!zi->wps || !(zi->wps[0] =
+		      alloc_cpws(1, disk->part0.nr_sects, z_start, 1))) {
+			rcode = -ENOMEM;
+			goto out;
 		}
+		zi->wps_count = 1;
+		goto out;
+	}
+
+	rpt = kmalloc(bufsz, gfp_mask);
+	if (!rpt) {
+		rcode = -ENOMEM;
+		goto out;
 	}
 
 	/*
-	 * Adjust 'chunk_sectors' to the zone length if the device
-	 * supports equal zone sizes.
+	 * Start by handling up to 32 different zone sizes. 2 will work
+	 * for all the current drives, but maybe something exotic will
+	 * surface.
 	 */
-	same = bzrpt->same_field & 0x0f;
-	if (same > 3) {
-		sd_printk(KERN_WARNING, sdkp,
-			  "REPORT ZONES SAME type %d not supported\n", same);
-		return false;
+	zi->wps = kcalloc(32, sizeof(*zi->wps), gfp_mask);
+	zi->wps_count = 32;
+	if (!zi->wps) {
+		rcode = -ENOMEM;
+		goto out;
 	}
-	/* Read the zone length from the first zone descriptor */
-	zone_len = be64_to_cpu(bzrpt->descriptors[0].length);
-	sdkp->unmap_alignment = zone_len;
-	sdkp->unmap_granularity = zone_len;
-	blk_queue_chunk_sectors(sdkp->disk->queue,
-				logical_to_sectors(sdkp->device, zone_len));
-
-	sd_zbc_init(sdkp, zone_len, buffer, buf_sz);
-	return true;
+
+fill:
+	offset = 0;
+	offmax = 0;
+	for (entry = 0, iter = 0; iter < sdkp->capacity; entry++) {
+		struct bdev_zone_descriptor *bzde;
+		int stop_end = 0;
+		int stop_size = 0;
+
+		if (offset == 0) {
+			int err;
+
+			err = sd_zbc_report_zones(sdkp, rpt, bufsz, iter, opt);
+			if (err) {
+				pr_err("report zones-> %d\n", err);
+				if (err != -ENOTSUPP)
+					rcode = err;
+				goto out;
+			}
+			if (sdkp->rc_basis == 0) {
+				sector_t lba = be64_to_cpu(rpt->maximum_lba);
+
+				if (lba + 1 > sdkp->capacity) {
+					sd_printk(KERN_WARNING, sdkp,
+						  FMT_CHANGING_CAPACITY "\n",
+						  sdkp->capacity, lba + 1);
+					sdkp->capacity = lba + 1;
+				}
+			}
+			offmax = max_report_entries(bufsz);
+		}
+		bzde = &rpt->descriptors[offset];
+		if (z_size == 0)
+			z_size = get_len_from_desc(sdkp, bzde);
+		if (z_size != get_len_from_desc(sdkp, bzde))
+			stop_size = 1;
+		if ((iter + z_size) >= sdkp->capacity)
+			stop_end = 1;
+
+		if (!one_time_setup) {
+			u8 type = bzde->type & 0x0F;
+
+			if (type != BLK_ZONE_TYPE_CONVENTIONAL) {
+				one_time_setup = 1;
+				blk_queue_chunk_sectors(sdkp->disk->queue,
+							z_size);
+			}
+		}
+
+		if (do_fill == 0) {
+			if (stop_end || stop_size) {
+				/* include the next/last zone? */
+				if (!stop_size) {
+					z_count++;
+					iter += z_size;
+				}
+				cwps = alloc_cpws(z_count, iter,
+						  z_start, z_size);
+				if (!cwps) {
+					rcode = -ENOMEM;
+					goto out;
+				}
+				if (array_count > 0)
+					cwps->is_zoned = 1;
+
+				zi->wps[array_count] = cwps;
+				z_start = iter;
+				z_size = 0;
+				z_count = 0;
+				array_count++;
+				if (array_count >= zi->wps_count) {
+					rcode = wps_realloc(zi, gfp_mask);
+					if (rcode)
+						goto out;
+				}
+				/* add the runt zone */
+				if (stop_end && stop_size) {
+					z_count++;
+					z_size = get_len_from_desc(sdkp, bzde);
+					cwps = alloc_cpws(z_count,
+							  iter + z_size,
+							  z_start, z_size);
+					if (!cwps) {
+						rcode = -ENOMEM;
+						goto out;
+					}
+					if (array_count > 0)
+						cwps->is_zoned = 1;
+					zi->wps[array_count] = cwps;
+					array_count++;
+				}
+				if (stop_end) {
+					do_fill = 1;
+					array_count = 0;
+					z_count = 0;
+					z_size = 0;
+					goto fill;
+				}
+			}
+			z_size = get_len_from_desc(sdkp, bzde);
+			iter += z_size;
+			z_count++;
+		} else {
+			fill_zone(zi->wps[array_count], z_count, sdkp, bzde);
+			z_count++;
+			iter += z_size;
+			if (zi->wps[array_count]->zone_count == z_count) {
+				z_count = 0;
+				array_count++;
+				zi->wps_count = array_count;
+			}
+		}
+		offset++;
+		if (offset >= offmax)
+			offset = 0;
+	}
+out:
+	kfree(rpt);
+
+	if (rcode) {
+		if (zi) {
+			free_zone_wps(zi);
+			kfree(zi);
+		}
+	} else {
+		q->zones = zi;
+	}
+
+	return rcode;
+}
+
+/**
+ * sd_zbc_config() - Configure a ZBC device (on attach)
+ * @sdkp: SCSI disk being attached.
+ * @gfp_mask: Memory allocation strategy
+ *
+ * Return: true if SD_ZBC_RESET_WP provisioning is supported
+ */
+bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp_mask)
+{
+	bool can_reset_wp = false;
+
+	if (zbc_init_zones(sdkp, gfp_mask)) {
+		sdev_printk(KERN_WARNING, sdkp->device,
+			    "Initialize zone cache failed\n");
+		goto out;
+	}
+
+	if (sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC)
+		can_reset_wp = true;
+
+	if (!sdkp->zone_work_q) {
+		char wq_name[32];
+
+		sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name);
+		sdkp->zone_work_q = create_singlethread_workqueue(wq_name);
+		if (!sdkp->zone_work_q) {
+			sdev_printk(KERN_WARNING, sdkp->device,
+				    "create zoned disk workqueue failed\n");
+			goto out;
+		}
+	} else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) {
+		drain_workqueue(sdkp->zone_work_q);
+		clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags);
+	}
+
+out:
+	return can_reset_wp;
 }
 
 /**
@@ -1365,15 +1495,16 @@  void sd_zbc_remove(struct scsi_disk *sdkp)
  */
 unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp)
 {
-	unsigned int bytes = 1;
 	struct request_queue *q = sdkp->disk->queue;
-	struct rb_node *node = rb_first(&q->zones);
+	struct zone_wps *zi = q->zones;
+	unsigned int bytes = 1;
 
-	if (node) {
-		struct blk_zone *zone = rb_entry(node, struct blk_zone, node);
+	if (zi && zi->wps_count > 0) {
+		struct contiguous_wps *wp = zi->wps[0];
 
-		bytes = zone->len;
+		bytes = wp->zone_size;
 	}
+
 	bytes <<= ilog2(sdkp->device->sector_size);
 	return bytes;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d5cdb5d..113c5a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -264,27 +264,83 @@  struct blk_queue_tag {
 
 #ifdef CONFIG_BLK_DEV_ZONED
 
+/**
+ * struct blk_zone - A single zone type/stats and WP offset.
+ *
+ * @wp:    Holds the wp offset from the start of the zone.
+ * @type:  Holds the zone type nibble.
+ * @state: Holds the zone state nibble + kernel (zone busy)
+ * @private_data: Used to hold whatever the implicit domain owner
+ *                of the zone needs to track.
+ *
+ * Type is left at 4 bits (only 2 are needed currently) to match
+ * the current ZBC/ZAC standards.
+ *
+ * State is using 5 bits to accommodate the ZONE_BUSY. The first 4 bits
+ * match the current ZBC/ZAC spec.
+ * ZONE_BUSY could be mapped to one of the reserved bits. Using it as
+ * mask bit or independent flag my be useful for decoding the zone
+ * state before it transitioned to BUSY.
+ *
+ * A zone sized at order (39+9) is very unlikely (current zones are 16+9)
+ * Even at lba48 equivalent number of sectors we have a large amount
+ * of padding to fill out 8 bytes.
+ *
+ * Getting this to fit in 4 bytes would limit the maximum size of a zone
+ * to 4G [order 23 of 512 byte sectors + 9 bits for flags] which is probably
+ * okay for embedded or 32-bit systems where the private_data pointer
+ * would also shrink to 32 bits. There are also WP tracking schemes
+ * that don't make use of the private_data helper so perhaps that
+ * could be factored out as well.
+ */
 struct blk_zone {
-	struct rb_node node;
-	spinlock_t lock;
-	sector_t start;
-	size_t len;
-	sector_t wp;
-	enum blk_zone_type type;
-	enum blk_zone_state state;
+	unsigned long long wp:39;
+	unsigned long long type:4;
+	unsigned long long state:5;
+	unsigned long long padding:15;
 	void *private_data;
 };
 
+/**
+ * struct contiguous_wps - A descriptor of zones of the same size
+ *
+ * @start_lba:  LBA of first zone covered by the descriptor.
+ * @last_lba:   LBA of last zone.
+ * @zone_size:  Size of zones as a number of 512 byte sectors.
+ * @zone_count: Number of zones (last-start/size) for convenience.
+ * @lock:       A spinlock protecting these zones.
+ * @is_zoned:   0 when all zones are conventional (no WP) zones.
+ * @zones:      Array of blk_zone entries.
+ */
+struct contiguous_wps {
+	u64 start_lba;
+	u64 last_lba;
+	u64 zone_size;
+	u32 zone_count;
+	spinlock_t lock;
+	unsigned is_zoned:1;
+	struct blk_zone zones[0];
+};
+
+/**
+ * struct zone_wps - A collection of zone descriptors to describe zoned media.
+ *
+ * @wps_count:  Number of descriptors.
+ * @wps:        Array of zone descriptors.
+ */
+struct zone_wps {
+	u32 wps_count;
+	struct contiguous_wps **wps;
+};
+
 #define blk_zone_is_seq_req(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 #define blk_zone_is_seq_pref(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_PREF)
 #define blk_zone_is_smr(z) (blk_zone_is_seq_req(z) || blk_zone_is_seq_pref(z))
 #define blk_zone_is_cmr(z) ((z)->type == BLK_ZONE_TYPE_CONVENTIONAL)
-#define blk_zone_is_full(z) ((z)->wp == (z)->start + (z)->len)
-#define blk_zone_is_empty(z) ((z)->wp == (z)->start)
+#define blk_zone_is_empty(z) ((z)->wp == 0)
 
-extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t);
-extern struct blk_zone *blk_insert_zone(struct request_queue *,
-					struct blk_zone *);
+extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t,
+					sector_t *, sector_t *, spinlock_t **);
 extern void blk_drop_zones(struct request_queue *);
 #else
 static inline void blk_drop_zones(struct request_queue *q) { };
@@ -463,7 +519,7 @@  struct request_queue {
 	struct queue_limits	limits;
 
 #ifdef CONFIG_BLK_DEV_ZONED
-	struct rb_root		zones;
+	struct zone_wps		*zones;
 #endif
 	/*
 	 * sg stuff