| Message ID | 20160822043402.8855-3-shaun@tancheff.com (mailing list archive) |
| --- | --- |
| State | Changes Requested, archived |
On Sun, Aug 21, 2016 at 11:34 PM, Shaun Tancheff <shaun@tancheff.com> wrote: > Currently the RB-Tree zone cache is fast and flexible. It does > use a rather largish amount of ram. This model reduces the ram > required from 120 bytes per zone to 16 bytes per zone with a > moderate transformation of the blk_zone_lookup() api. > > This model is predicated on the belief that most variations > on zoned media will follow a pattern of using collections of same > sized zones on a single device. Similar to the pattern of erase > blocks on flash devices being progressivly larger 16K, 64K, ... > > The goal is to be able to build a descriptor which is both memory > efficient, performant, and flexible. > > Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com> > --- > block/blk-core.c | 2 +- > block/blk-sysfs.c | 31 +- > block/blk-zoned.c | 103 +++-- > drivers/scsi/sd.c | 5 +- > drivers/scsi/sd.h | 4 +- > drivers/scsi/sd_zbc.c | 1025 +++++++++++++++++++++++++++--------------------- > include/linux/blkdev.h | 82 +++- > 7 files changed, 716 insertions(+), 536 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 3a9caf7..3b084a8 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -727,7 +727,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) > INIT_LIST_HEAD(&q->blkg_list); > #endif > #ifdef CONFIG_BLK_DEV_ZONED > - q->zones = RB_ROOT; > + q->zones = NULL; > #endif > INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); > > diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c > index 43f441f..ecbd434 100644 > --- a/block/blk-sysfs.c > +++ b/block/blk-sysfs.c > @@ -232,36 +232,7 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) > #ifdef CONFIG_BLK_DEV_ZONED > static ssize_t queue_zoned_show(struct request_queue *q, char *page) > { > - struct rb_node *node; > - struct blk_zone *zone; > - ssize_t offset = 0, end = 0; > - size_t size = 0, num = 0; > - enum blk_zone_type type = BLK_ZONE_TYPE_UNKNOWN; > - > - for (node = rb_first(&q->zones); node; node = rb_next(node)) { > - zone = rb_entry(node, struct blk_zone, node); > - if (zone->type != type || > - zone->len != size || > - end != zone->start) { > - if (size != 0) > - offset += sprintf(page + offset, "%zu\n", num); > - /* We can only store one page ... */ > - if (offset + 42 > PAGE_SIZE) { > - offset += sprintf(page + offset, "...\n"); > - return offset; > - } > - size = zone->len; > - type = zone->type; > - offset += sprintf(page + offset, "%zu %zu %d ", > - zone->start, size, type); > - num = 0; > - end = zone->start + size; > - } else > - end += zone->len; > - num++; > - } > - offset += sprintf(page + offset, "%zu\n", num); > - return offset; > + return sprintf(page, "%u\n", q->zones ? 
1 : 0); > } > #endif > > diff --git a/block/blk-zoned.c b/block/blk-zoned.c > index 975e863..338a1af 100644 > --- a/block/blk-zoned.c > +++ b/block/blk-zoned.c > @@ -8,63 +8,84 @@ > #include <linux/kernel.h> > #include <linux/module.h> > #include <linux/blkdev.h> > -#include <linux/rbtree.h> > +#include <linux/vmalloc.h> > > -struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t lba) > +/** > + * blk_lookup_zone() - Lookup zones > + * @q: Request Queue > + * @sector: Location to lookup > + * @start: Pointer to starting location zone (OUT) > + * @len: Pointer to length of zone (OUT) > + * @lock: Pointer to spinlock of zones in owning descriptor (OUT) > + */ > +struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t sector, > + sector_t *start, sector_t *len, > + spinlock_t **lock) > { > - struct rb_root *root = &q->zones; > - struct rb_node *node = root->rb_node; > + int iter; > + struct blk_zone *bzone = NULL; > + struct zone_wps *zi = q->zones; > + > + *start = 0; > + *len = 0; > + *lock = NULL; > + > + if (!q->zones) > + goto out; > > - while (node) { > - struct blk_zone *zone = container_of(node, struct blk_zone, > - node); > + for (iter = 0; iter < zi->wps_count; iter++) { > + if (sector >= zi->wps[iter]->start_lba && > + sector < zi->wps[iter]->last_lba) { > + struct contiguous_wps *wp = zi->wps[iter]; > + u64 index = (sector - wp->start_lba) / wp->zone_size; > > - if (lba < zone->start) > - node = node->rb_left; > - else if (lba >= zone->start + zone->len) > - node = node->rb_right; > - else > - return zone; > + if (index >= wp->zone_count) { > + WARN(1, "Impossible index for zone\n"); > + goto out; > + } > + > + bzone = &wp->zones[index]; > + *len = wp->zone_size; > + *start = wp->start_lba + (index * wp->zone_size); > + *lock = &wp->lock; > + } > } > - return NULL; > + > +out: > + return bzone; > } > EXPORT_SYMBOL_GPL(blk_lookup_zone); > > -struct blk_zone *blk_insert_zone(struct request_queue *q, struct blk_zone *data) > +/** > + * free_zone_wps() - Free up memory in use by wps > + * @zi: zone wps array(s). > + */ > +static void free_zone_wps(struct zone_wps *zi) > { > - struct rb_root *root = &q->zones; > - struct rb_node **new = &(root->rb_node), *parent = NULL; > + /* on error free the arrays */ > + if (zi && zi->wps) { > + int ca; > > - /* Figure out where to put new node */ > - while (*new) { > - struct blk_zone *this = container_of(*new, struct blk_zone, > - node); > - parent = *new; > - if (data->start + data->len <= this->start) > - new = &((*new)->rb_left); > - else if (data->start >= this->start + this->len) > - new = &((*new)->rb_right); > - else { > - /* Return existing zone */ > - return this; > + for (ca = 0; ca < zi->wps_count; ca++) { > + if (zi->wps[ca]) { > + vfree(zi->wps[ca]); > + zi->wps[ca] = NULL; > + } > } > + kfree(zi->wps); > } > - /* Add new node and rebalance tree. 
*/ > - rb_link_node(&data->node, parent, new); > - rb_insert_color(&data->node, root); > - > - return NULL; > } > -EXPORT_SYMBOL_GPL(blk_insert_zone); > > +/** > + * blk_drop_zones() - Free zones > + * @q: Request Queue > + */ > void blk_drop_zones(struct request_queue *q) > { > - struct rb_root *root = &q->zones; > - struct blk_zone *zone, *next; > - > - rbtree_postorder_for_each_entry_safe(zone, next, root, node) { > - kfree(zone); > + if (q->zones) { > + free_zone_wps(q->zones); > + kfree(q->zones); > + q->zones = NULL; > } > - q->zones = RB_ROOT; > } > EXPORT_SYMBOL_GPL(blk_drop_zones); > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c > index f144df4..0f749f5 100644 > --- a/drivers/scsi/sd.c > +++ b/drivers/scsi/sd.c > @@ -2549,8 +2549,9 @@ got_data: > sdkp->physical_block_size); > sdkp->device->sector_size = sector_size; > > - if (sd_zbc_config(sdkp, buffer, SD_BUF_SIZE)) > - sd_config_discard(sdkp, SD_ZBC_RESET_WP); > + if (sdkp->first_scan) > + if (sd_zbc_config(sdkp, GFP_KERNEL)) > + sd_config_discard(sdkp, SD_ZBC_RESET_WP); > > { > char cap_str_2[10], cap_str_10[10]; > diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h > index fc766db..c9c79e9 100644 > --- a/drivers/scsi/sd.h > +++ b/drivers/scsi/sd.h > @@ -299,13 +299,13 @@ extern void sd_zbc_uninit_command(struct scsi_cmnd *cmd); > extern void sd_zbc_remove(struct scsi_disk *); > extern void sd_zbc_reset_zones(struct scsi_disk *); > extern void sd_zbc_update_zones(struct scsi_disk *, sector_t, int, int reason); > -extern bool sd_zbc_config(struct scsi_disk *, void *, size_t); > +extern bool sd_zbc_config(struct scsi_disk *, gfp_t); > > extern unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp); > > #else /* CONFIG_SCSI_ZBC */ > > -static inline bool sd_zbc_config(struct scsi_disk *sdkp, void *b, size_t sz) > +static inline bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp) > { > return false; > } > diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c > index 960af93..c087035 100644 > --- a/drivers/scsi/sd_zbc.c > +++ b/drivers/scsi/sd_zbc.c > @@ -22,6 +22,7 @@ > > #include <linux/blkdev.h> > #include <linux/rbtree.h> > +#include <linux/vmalloc.h> > > #include <asm/unaligned.h> > > @@ -51,11 +52,11 @@ > } while( 0 ) > > struct zbc_update_work { > - struct work_struct zone_work; > - struct scsi_disk *sdkp; > - sector_t zone_sector; > - int zone_buflen; > - char zone_buf[0]; > + struct work_struct zone_work; > + struct scsi_disk *sdkp; > + sector_t zone_sector; > + int zone_buflen; > + struct bdev_zone_report zone_buf[0]; > }; > > /** > @@ -95,102 +96,19 @@ static inline sector_t get_start_from_desc(struct scsi_disk *sdkp, > return logical_to_sectors(sdkp->device, be64_to_cpu(bzde->lba_start)); > } > > -static > -struct blk_zone *zbc_desc_to_zone(struct scsi_disk *sdkp, unsigned char *rec) > +static void _fill_zone(struct blk_zone *zone, struct scsi_disk *sdkp, > + struct bdev_zone_descriptor *bzde) > { > - struct blk_zone *zone; > - sector_t wp = (sector_t)-1; > - > - zone = kzalloc(sizeof(struct blk_zone), GFP_KERNEL); > - if (!zone) > - return NULL; > - > - spin_lock_init(&zone->lock); > - zone->type = rec[0] & 0xf; > - zone->state = (rec[1] >> 4) & 0xf; > - zone->len = logical_to_sectors(sdkp->device, > - get_unaligned_be64(&rec[8])); > - zone->start = logical_to_sectors(sdkp->device, > - get_unaligned_be64(&rec[16])); > - > - if (blk_zone_is_smr(zone)) > - wp = logical_to_sectors(sdkp->device, > - get_unaligned_be64(&rec[24])); > - zone->wp = wp; > - /* > - * Fixup block zone state > - */ > - if 
(zone->state == BLK_ZONE_EMPTY && > - zone->wp != zone->start) { > - sd_zbc_debug(sdkp, > - "zone %zu state EMPTY wp %zu: adjust wp\n", > - zone->start, zone->wp); > - zone->wp = zone->start; > - } > - if (zone->state == BLK_ZONE_FULL && > - zone->wp != zone->start + zone->len) { > - sd_zbc_debug(sdkp, > - "zone %zu state FULL wp %zu: adjust wp\n", > - zone->start, zone->wp); > - zone->wp = zone->start + zone->len; > - } > - > - return zone; > + zone->type = bzde->type & 0x0f; > + zone->state = (bzde->flags >> 4) & 0x0f; > + zone->wp = get_wp_from_desc(sdkp, bzde); > } > > -static > -sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf, > - unsigned int buf_len) > -{ > - struct request_queue *q = sdkp->disk->queue; > - unsigned char *rec = buf; > - int rec_no = 0; > - unsigned int list_length; > - sector_t next_sector = -1; > - u8 same; > - > - /* Parse REPORT ZONES header */ > - list_length = get_unaligned_be32(&buf[0]); > - same = buf[4] & 0xf; > - rec = buf + 64; > - list_length += 64; > - > - if (list_length < buf_len) > - buf_len = list_length; > - > - while (rec < buf + buf_len) { > - struct blk_zone *this, *old; > - unsigned long flags; > > - this = zbc_desc_to_zone(sdkp, rec); > - if (!this) > - break; > - > - if (same == 0 && this->len != zlen) { > - next_sector = this->start + this->len; > - break; > - } > - > - next_sector = this->start + this->len; > - old = blk_insert_zone(q, this); > - if (old) { > - spin_lock_irqsave(&old->lock, flags); > - if (blk_zone_is_smr(old)) { > - old->wp = this->wp; > - old->state = this->state; > - } > - spin_unlock_irqrestore(&old->lock, flags); > - kfree(this); > - } > - rec += 64; > - rec_no++; > - } > - > - sd_zbc_debug(sdkp, > - "Inserted %d zones, next sector %zu len %d\n", > - rec_no, next_sector, list_length); > - > - return next_sector; > +static void fill_zone(struct contiguous_wps *cwps, int z_count, > + struct scsi_disk *sdkp, struct bdev_zone_descriptor *bzde) > +{ > + _fill_zone(&cwps->zones[z_count], sdkp, bzde); > } > > /** > @@ -200,12 +118,10 @@ sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf, > * @bufflen: length of @buffer > * @start_sector: logical sector for the zone information should be reported > * @option: reporting option to be used > - * @partial: flag to set the 'partial' bit for report zones command > */ > -static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer, > - int bufflen, sector_t start_sector, > - enum zbc_zone_reporting_options option, > - bool partial) > +static int sd_zbc_report_zones(struct scsi_disk *sdkp, > + struct bdev_zone_report *buffer, > + int bufflen, sector_t start_sector, u8 option) > { > struct scsi_device *sdp = sdkp->device; > const int timeout = sdp->request_queue->rq_timeout > @@ -225,7 +141,7 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer, > cmd[1] = ZI_REPORT_ZONES; > put_unaligned_be64(start_lba, &cmd[2]); > put_unaligned_be32(bufflen, &cmd[10]); > - cmd[14] = (partial ? 
ZBC_REPORT_ZONE_PARTIAL : 0) | option; > + cmd[14] = option; > memset(buffer, 0, bufflen); > > result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, > @@ -248,49 +164,38 @@ static void sd_zbc_refresh_zone_work(struct work_struct *work) > container_of(work, struct zbc_update_work, zone_work); > struct scsi_disk *sdkp = zbc_work->sdkp; > struct request_queue *q = sdkp->disk->queue; > - unsigned char *zone_buf = zbc_work->zone_buf; > + struct bdev_zone_report *rpt = zbc_work->zone_buf; > unsigned int zone_buflen = zbc_work->zone_buflen; > + struct bdev_zone_descriptor *bzde; > + int iter; > + int offmax; > + sector_t z_at, z_start, z_len; > + spinlock_t *lock; > + struct blk_zone *zone; > int ret; > - u8 same; > - u64 zlen = 0; > sector_t last_sector; > sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity); > > - ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen, > + ret = sd_zbc_report_zones(sdkp, rpt, zone_buflen, > zbc_work->zone_sector, > - ZBC_ZONE_REPORTING_OPTION_ALL, true); > + ZBC_ZONE_REPORTING_OPTION_ALL); > if (ret) > goto done_free; > > - /* this whole path is unlikely so extra reports shouldn't be a > - * large impact */ > - same = zone_buf[4] & 0xf; > - if (same == 0) { > - unsigned char *desc = &zone_buf[64]; > - unsigned int blen = zone_buflen; > - > - /* just pull the first zone */ > - if (blen > 512) > - blen = 512; > - ret = sd_zbc_report_zones(sdkp, zone_buf, blen, 0, > - ZBC_ZONE_REPORTING_OPTION_ALL, true); > - if (ret) > - goto done_free; > - > - /* Read the zone length from the first zone descriptor */ > - zlen = logical_to_sectors(sdkp->device, > - get_unaligned_be64(&desc[8])); > - > - ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen, > - zbc_work->zone_sector, > - ZBC_ZONE_REPORTING_OPTION_ALL, true); > - if (ret) > - goto done_free; > + offmax = max_report_entries(zone_buflen); > + for (iter = 0; iter < offmax; iter++) { > + bzde = &rpt->descriptors[iter]; > + z_at = get_start_from_desc(sdkp, bzde); > + if (!z_at) > + break; > + zone = blk_lookup_zone(q, z_at, &z_start, &z_len, &lock); > + if (zone) { > + _fill_zone(zone, sdkp, bzde); > + last_sector = z_start + z_len; > + } > } > > - last_sector = zbc_parse_zones(sdkp, zlen, zone_buf, zone_buflen); > - capacity = logical_to_sectors(sdkp->device, sdkp->capacity); > - if (last_sector != -1 && last_sector < capacity) { > + if (sdkp->zone_work_q && last_sector != -1 && last_sector < capacity) { > if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { > sd_zbc_debug(sdkp, > "zones in reset, canceling refresh\n"); > @@ -333,10 +238,7 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize, > { > struct request_queue *q = sdkp->disk->queue; > struct zbc_update_work *zbc_work; > - struct blk_zone *zone; > - struct rb_node *node; > - int zone_num = 0, zone_busy = 0, num_rec; > - sector_t next_sector = sector; > + int num_rec; > > if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { > sd_zbc_debug(sdkp, > @@ -346,18 +248,23 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize, > > if (reason != SD_ZBC_INIT) { > /* lookup sector, is zone pref? 
then ignore */ > - struct blk_zone *zone = blk_lookup_zone(q, sector); > - > + sector_t z_start, z_len; > + spinlock_t *lck; > + struct blk_zone *zone = blk_lookup_zone(q, sector, &z_start, > + &z_len, &lck); > + /* zone actions on conventional zones are invalid */ > + if (zone && reason == SD_ZBC_RESET_WP && blk_zone_is_cmr(zone)) > + return; > if (reason == SD_ZBC_RESET_WP) > sd_zbc_debug(sdkp, "RESET WP failed %lx\n", sector); > - > - if (zone && blk_zone_is_seq_pref(zone)) > - return; > } > > + if (!sdkp->zone_work_q) > + return; > + > retry: > zbc_work = kzalloc(sizeof(struct zbc_update_work) + bufsize, > - reason != SD_ZBC_INIT ? GFP_NOWAIT : GFP_KERNEL); > + reason != SD_ZBC_INIT ? GFP_ATOMIC : GFP_KERNEL); > if (!zbc_work) { > if (bufsize > 512) { > sd_zbc_debug(sdkp, > @@ -381,30 +288,40 @@ retry: > * Mark zones under update as BUSY > */ > if (reason != SD_ZBC_INIT) { > - for (node = rb_first(&q->zones); node; node = rb_next(node)) { > - unsigned long flags; > - > - zone = rb_entry(node, struct blk_zone, node); > - if (num_rec == 0) > + unsigned long flags; > + int iter; > + struct zone_wps *zi = q->zones; > + struct contiguous_wps *wp = NULL; > + u64 index = -1; > + int zone_busy = 0; > + int z_flgd = 0; > + > + for (iter = 0; iter < zi->wps_count; iter++) { > + if (sector >= zi->wps[iter]->start_lba && > + sector < zi->wps[iter]->last_lba) { > + wp = zi->wps[iter]; > break; > - if (zone->start != next_sector) > - continue; > - next_sector += zone->len; > - num_rec--; > - > - spin_lock_irqsave(&zone->lock, flags); > - if (blk_zone_is_smr(zone)) { > - if (zone->state == BLK_ZONE_BUSY) { > + } > + } > + if (wp) { > + spin_lock_irqsave(&wp->lock, flags); > + index = (sector - wp->start_lba) / wp->zone_size; > + while (index < wp->zone_count && z_flgd < num_rec) { > + struct blk_zone *bzone = &wp->zones[index]; > + > + index++; > + z_flgd++; > + if (!blk_zone_is_smr(bzone)) > + continue; > + > + if (bzone->state == BLK_ZONE_BUSY) > zone_busy++; > - } else { > - zone->state = BLK_ZONE_BUSY; > - zone->wp = zone->start; > - } > - zone_num++; > + else > + bzone->state = BLK_ZONE_BUSY; > } > - spin_unlock_irqrestore(&zone->lock, flags); > + spin_unlock_irqrestore(&wp->lock, flags); > } > - if (zone_num && (zone_num == zone_busy)) { > + if (z_flgd && (z_flgd == zone_busy)) { > sd_zbc_debug(sdkp, > "zone update for %zu in progress\n", > sector); > @@ -476,43 +393,26 @@ static void discard_or_write_same(struct scsi_cmnd *cmd, sector_t sector, > int sd_zbc_setup_discard(struct scsi_cmnd *cmd) > { > struct request *rq = cmd->request; > - struct scsi_device *sdp = cmd->device; > struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); > sector_t sector = blk_rq_pos(rq); > unsigned int nr_sectors = blk_rq_sectors(rq); > int ret = BLKPREP_OK; > struct blk_zone *zone; > unsigned long flags; > - u32 wp_offset; > bool use_write_same = false; > + sector_t z_start, z_len; > + spinlock_t *lck; > > - zone = blk_lookup_zone(rq->q, sector); > - if (!zone) { > - /* Test for a runt zone before giving up */ > - if (sdp->type != TYPE_ZBC) { > - struct request_queue *q = rq->q; > - struct rb_node *node; > - > - node = rb_last(&q->zones); > - if (node) > - zone = rb_entry(node, struct blk_zone, node); > - if (zone) { > - spin_lock_irqsave(&zone->lock, flags); > - if ((zone->start + zone->len) <= sector) > - goto out; > - spin_unlock_irqrestore(&zone->lock, flags); > - zone = NULL; > - } > - } > + zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck); > + if (!zone) > return BLKPREP_KILL; > - } > > - 
spin_lock_irqsave(&zone->lock, flags); > + spin_lock_irqsave(lck, flags); > if (zone->state == BLK_ZONE_UNKNOWN || > zone->state == BLK_ZONE_BUSY) { > sd_zbc_debug_ratelimit(sdkp, > "Discarding zone %zx state %x, deferring\n", > - zone->start, zone->state); > + z_start, zone->state); > ret = BLKPREP_DEFER; > goto out; > } > @@ -520,39 +420,37 @@ int sd_zbc_setup_discard(struct scsi_cmnd *cmd) > /* let the drive fail the command */ > sd_zbc_debug_ratelimit(sdkp, > "Discarding offline zone %zx\n", > - zone->start); > + z_start); > goto out; > } > if (blk_zone_is_cmr(zone)) { > use_write_same = true; > sd_zbc_debug_ratelimit(sdkp, > - "Discarding CMR zone %zx\n", > - zone->start); > + "Discarding CMR zone %zx\n", z_start); > goto out; > } > - if (zone->start != sector || zone->len < nr_sectors) { > + if (z_start != sector || z_len < nr_sectors) { > sd_printk(KERN_ERR, sdkp, > "Misaligned RESET WP %zx/%x on zone %zx/%zx\n", > - sector, nr_sectors, zone->start, zone->len); > + sector, nr_sectors, z_start, z_len); > ret = BLKPREP_KILL; > goto out; > } > /* Protect against Reset WP when more data had been written to the > * zone than is being discarded. > */ > - wp_offset = zone->wp - zone->start; > - if (wp_offset > nr_sectors) { > + if (zone->wp > nr_sectors) { > sd_printk(KERN_ERR, sdkp, > - "Will Corrupt RESET WP %zx/%x/%x on zone %zx/%zx/%zx\n", > - sector, wp_offset, nr_sectors, > - zone->start, zone->wp, zone->len); > + "Will Corrupt RESET WP %zx/%zx/%x on zone %zx/%zx/%zx\n", > + sector, (sector_t)zone->wp, nr_sectors, > + z_start, z_start + zone->wp, z_len); > ret = BLKPREP_KILL; > goto out; > } > if (blk_zone_is_empty(zone)) { > sd_zbc_debug_ratelimit(sdkp, > "Discarding empty zone %zx [WP: %zx]\n", > - zone->start, zone->wp); > + z_start, (sector_t)zone->wp); > ret = BLKPREP_DONE; > goto out; > } > @@ -563,8 +461,8 @@ out: > * zone update if RESET WRITE POINTER fails. 
> */ > if (ret == BLKPREP_OK && !use_write_same) > - zone->wp = zone->start; > - spin_unlock_irqrestore(&zone->lock, flags); > + zone->wp = 0; > + spin_unlock_irqrestore(lck, flags); > > if (ret == BLKPREP_OK) > discard_or_write_same(cmd, sector, nr_sectors, use_write_same); > @@ -573,13 +471,14 @@ out: > } > > > -static void __set_zone_state(struct blk_zone *zone, int op) > +static void __set_zone_state(struct blk_zone *zone, sector_t z_len, > + spinlock_t *lck, int op) > { > unsigned long flags; > > - spin_lock_irqsave(&zone->lock, flags); > - if (blk_zone_is_cmr(zone)) > - goto out_unlock; > + spin_lock_irqsave(lck, flags); > + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) > + goto out; > > switch (op) { > case REQ_OP_ZONE_OPEN: > @@ -587,38 +486,45 @@ static void __set_zone_state(struct blk_zone *zone, int op) > break; > case REQ_OP_ZONE_FINISH: > zone->state = BLK_ZONE_FULL; > - zone->wp = zone->start + zone->len; > + zone->wp = z_len; > break; > case REQ_OP_ZONE_CLOSE: > zone->state = BLK_ZONE_CLOSED; > break; > case REQ_OP_ZONE_RESET: > - zone->wp = zone->start; > + zone->wp = 0; > break; > default: > WARN_ONCE(1, "%s: invalid op code: %u\n", __func__, op); > } > -out_unlock: > - spin_unlock_irqrestore(&zone->lock, flags); > +out: > + spin_unlock_irqrestore(lck, flags); > } > > static void update_zone_state(struct request *rq, sector_t lba, unsigned int op) > { > - struct request_queue *q = rq->q; > - struct blk_zone *zone = NULL; > + struct blk_zone *zone; > > if (lba == ~0ul) { > - struct rb_node *node; > - > - for (node = rb_first(&q->zones); node; node = rb_next(node)) { > - zone = rb_entry(node, struct blk_zone, node); > - __set_zone_state(zone, op); > + struct zone_wps *zi = rq->q->zones; > + struct contiguous_wps *wp; > + u32 iter, entry; > + > + for (iter = 0; iter < zi->wps_count; iter++) { > + wp = zi->wps[iter]; > + for (entry = 0; entry < wp->zone_count; entry++) { > + zone = &wp->zones[entry]; > + __set_zone_state(zone, wp->zone_size, &wp->lock, > + op); > + } > } > - return; > } else { > - zone = blk_lookup_zone(q, lba); > + sector_t z_start, z_len; > + spinlock_t *lck; > + > + zone = blk_lookup_zone(rq->q, lba, &z_start, &z_len, &lck); > if (zone) > - __set_zone_state(zone, op); > + __set_zone_state(zone, z_len, lck, op); > } > } > > @@ -641,6 +547,8 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd) > struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); > sector_t sector = blk_rq_pos(rq); > struct blk_zone *zone; > + spinlock_t *lck; > + sector_t z_start, z_len; > unsigned long flags; > unsigned int nr_sectors; > int ret = BLKPREP_DONE; > @@ -651,17 +559,17 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd) > if (is_fua || op != REQ_OP_ZONE_RESET) > goto out; > > - zone = blk_lookup_zone(rq->q, sector); > + zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck); > if (!zone || sdkp->provisioning_mode != SD_ZBC_RESET_WP) > goto out; > > /* Map a Reset WP w/o FUA to a discard request */ > - spin_lock_irqsave(&zone->lock, flags); > - sector = zone->start; > - nr_sectors = zone->len; > + spin_lock_irqsave(lck, flags); > + sector = z_start; > + nr_sectors = z_len; > if (blk_zone_is_cmr(zone)) > use_write_same = true; > - spin_unlock_irqrestore(&zone->lock, flags); > + spin_unlock_irqrestore(lck, flags); > > rq->completion_data = NULL; > if (use_write_same) { > @@ -712,137 +620,157 @@ static sector_t bzrpt_fill(struct request *rq, > struct bdev_zone_descriptor *bzd, > size_t sz, sector_t lba, u8 opt) > { > - struct request_queue *q = rq->q; > struct scsi_disk 
*sdkp = scsi_disk(rq->rq_disk); > + struct scsi_device *sdp = sdkp->device; > + struct zone_wps *zi = rq->q->zones; > + struct contiguous_wps *wpdscr; > struct blk_zone *zone = NULL; > - struct rb_node *node = NULL; > sector_t progress = lba; > sector_t clen = ~0ul; > + sector_t z_start, z_len, z_wp_abs; > unsigned long flags; > u32 max_entries = bzrpt ? max_report_entries(sz) : sz / sizeof(*bzd); > u32 entry = 0; > + u32 iter, idscr; > int len_diffs = 0; > int type_diffs = 0; > u8 ctype; > u8 same = 0; > > - zone = blk_lookup_zone(q, lba); > - if (zone) > - node = &zone->node; > - > - for (entry = 0; entry < max_entries && node; node = rb_next(node)) { > - u64 z_len, z_start, z_wp_abs; > - u8 cond = 0; > - u8 flgs = 0; > - > - spin_lock_irqsave(&zone->lock, flags); > - z_len = zone->len; > - z_start = zone->start; > - z_wp_abs = zone->wp; > - progress = z_start + z_len; > - cond = zone->state; > - if (blk_zone_is_cmr(zone)) > - flgs |= 0x02; > - else if (zone->wp != zone->start) > - flgs |= 0x01; /* flag as RWP recommended? */ > - spin_unlock_irqrestore(&zone->lock, flags); > - > - switch (opt & ZBC_REPORT_OPTION_MASK) { > - case ZBC_ZONE_REPORTING_OPTION_EMPTY: > - if (z_wp_abs != z_start) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN: > - if (cond != BLK_ZONE_OPEN) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN: > - if (cond != BLK_ZONE_OPEN_EXPLICIT) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_CLOSED: > - if (cond != BLK_ZONE_CLOSED) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_FULL: > - if (cond != BLK_ZONE_FULL) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_READONLY: > - if (cond == BLK_ZONE_READONLY) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_OFFLINE: > - if (cond == BLK_ZONE_OFFLINE) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP: > - if (z_wp_abs == z_start) > - continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_NON_WP: > - if (cond == BLK_ZONE_NO_WP) > + for (iter = 0; entry < max_entries && iter < zi->wps_count; iter++) { > + wpdscr = zi->wps[iter]; > + if (lba > wpdscr->last_lba) > + continue; > + > + spin_lock_irqsave(&wpdscr->lock, flags); > + for (idscr = 0; > + entry < max_entries && idscr < wpdscr->zone_count; > + idscr++) { > + struct bdev_zone_descriptor *dscr; > + u64 zoff = idscr * wpdscr->zone_size; > + u8 cond, flgs = 0; > + > + z_len = wpdscr->zone_size; > + zoff = idscr * z_len; > + z_start = wpdscr->start_lba + zoff; > + if (lba >= z_start + z_len) > continue; > - break; > - case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE: > - /* this can only be reported by the HW */ > - break; > - case ZBC_ZONE_REPORTING_OPTION_ALL: > - default: > - break; > - } > > - /* if same code only applies to returned zones */ > - if (opt & ZBC_REPORT_ZONE_PARTIAL) { > - if (clen != ~0ul) { > - clen = z_len; > + zone = &wpdscr->zones[idscr]; > + if (blk_zone_is_cmr(zone)) > + z_wp_abs = z_start + wpdscr->zone_size; > + else > + z_wp_abs = z_start + zone->wp; > + > + switch (opt & ZBC_REPORT_OPTION_MASK) { > + case ZBC_ZONE_REPORTING_OPTION_EMPTY: > + if (z_wp_abs != z_start) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN: > + if (zone->state != BLK_ZONE_OPEN) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN: > + if (zone->state != BLK_ZONE_OPEN_EXPLICIT) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_CLOSED: > + if (zone->state != BLK_ZONE_CLOSED) > + continue; > + break; > + case 
ZBC_ZONE_REPORTING_OPTION_FULL: > + if (zone->state != BLK_ZONE_FULL) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_READONLY: > + if (zone->state == BLK_ZONE_READONLY) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_OFFLINE: > + if (zone->state == BLK_ZONE_OFFLINE) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP: > + if (z_wp_abs == z_start) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_NON_WP: > + if (zone->state == BLK_ZONE_NO_WP) > + continue; > + break; > + case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE: > + /* this can only be reported by the HW */ > + break; > + case ZBC_ZONE_REPORTING_OPTION_ALL: > + default: > + break; > + } > + > + /* if same code only applies to returned zones */ > + if (opt & ZBC_REPORT_ZONE_PARTIAL) { > + if (clen != ~0ul) { > + clen = z_len; > + ctype = zone->type; > + } > + if (z_len != clen) > + len_diffs++; > + if (zone->type != ctype) > + type_diffs++; > ctype = zone->type; > } > - if (z_len != clen) > - len_diffs++; > - if (zone->type != ctype) > - type_diffs++; > - ctype = zone->type; > - } > + progress = z_start + z_len; > > - /* shift to device units */ > - z_start >>= ilog2(sdkp->device->sector_size) - 9; > - z_len >>= ilog2(sdkp->device->sector_size) - 9; > - z_wp_abs >>= ilog2(sdkp->device->sector_size) - 9; > + if (!bzd) { > + if (bzrpt) > + bzrpt->descriptor_count = > + cpu_to_be32(++entry); > + continue; > + } > > - if (!bzd) { > + /* shift to device units */ > + z_start >>= ilog2(sdp->sector_size) - 9; > + z_len >>= ilog2(sdp->sector_size) - 9; > + z_wp_abs >>= ilog2(sdp->sector_size) - 9; > + > + cond = zone->state; > + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) > + flgs |= 0x02; > + else if (zone->wp) > + flgs |= 0x01; /* flag as RWP recommended? 
*/ > + > + dscr = &bzd[entry]; > + dscr->lba_start = cpu_to_be64(z_start); > + dscr->length = cpu_to_be64(z_len); > + dscr->lba_wptr = cpu_to_be64(z_wp_abs); > + dscr->type = zone->type; > + dscr->flags = cond << 4 | flgs; > + entry++; > if (bzrpt) > - bzrpt->descriptor_count = > - cpu_to_be32(++entry); > - continue; > + bzrpt->descriptor_count = cpu_to_be32(entry); > } > - > - bzd[entry].lba_start = cpu_to_be64(z_start); > - bzd[entry].length = cpu_to_be64(z_len); > - bzd[entry].lba_wptr = cpu_to_be64(z_wp_abs); > - bzd[entry].type = zone->type; > - bzd[entry].flags = cond << 4 | flgs; > - entry++; > - if (bzrpt) > - bzrpt->descriptor_count = cpu_to_be32(entry); > + spin_unlock_irqrestore(&wpdscr->lock, flags); > } > > /* if same code applies to all zones */ > if (bzrpt && !(opt & ZBC_REPORT_ZONE_PARTIAL)) { > - for (node = rb_first(&q->zones); node; node = rb_next(node)) { > - zone = rb_entry(node, struct blk_zone, node); > - > - spin_lock_irqsave(&zone->lock, flags); > - if (clen != ~0ul) { > - clen = zone->len; > + for (iter = 0; iter < zi->wps_count; iter++) { > + wpdscr = zi->wps[iter]; > + spin_lock_irqsave(&wpdscr->lock, flags); > + for (idscr = 0; idscr < wpdscr->zone_count; idscr++) { > + z_len = wpdscr->zone_size; > + zone = &wpdscr->zones[idscr]; > + if (clen != ~0ul) { > + clen = z_len; > + ctype = zone->type; > + } > + if (z_len != clen) > + len_diffs++; > + if (zone->type != ctype) > + type_diffs++; > ctype = zone->type; > } > - if (zone->len != clen) > - len_diffs++; > - if (zone->type != ctype) > - type_diffs++; > - ctype = zone->type; > - spin_unlock_irqrestore(&zone->lock, flags); > + spin_unlock_irqrestore(&wpdscr->lock, flags); > } > } > > @@ -985,12 +913,15 @@ out: > int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, > sector_t sector, unsigned int *num_sectors) > { > + struct request_queue *q = sdkp->disk->queue; > struct blk_zone *zone; > + sector_t z_start, z_len; > + spinlock_t *lck; > unsigned int sectors = *num_sectors; > int ret = BLKPREP_OK; > unsigned long flags; > > - zone = blk_lookup_zone(sdkp->disk->queue, sector); > + zone = blk_lookup_zone(q, sector, &z_start, &z_len, &lck); > if (!zone) { > /* Might happen during zone initialization */ > sd_zbc_debug_ratelimit(sdkp, > @@ -999,7 +930,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, > return BLKPREP_OK; > } > > - spin_lock_irqsave(&zone->lock, flags); > + spin_lock_irqsave(lck, flags); > > if (blk_zone_is_cmr(zone)) > goto out; > @@ -1008,7 +939,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, > zone->state == BLK_ZONE_BUSY) { > sd_zbc_debug_ratelimit(sdkp, > "zone %zu state %x, deferring\n", > - zone->start, zone->state); > + z_start, zone->state); > ret = BLKPREP_DEFER; > goto out; > } > @@ -1017,25 +948,22 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, > if (op_is_write(req_op(rq))) { > u64 nwp = sector + sectors; > > - while (nwp > (zone->start + zone->len)) { > - struct rb_node *node = rb_next(&zone->node); > + while (nwp > (z_start + z_len)) { > + zone->wp = z_len; > + sector = z_start + z_len; > + sectors = nwp - sector; > + spin_unlock_irqrestore(lck, flags); > > - zone->wp = zone->start + zone->len; > - sector = zone->wp; > - sectors = nwp - zone->wp; > - spin_unlock_irqrestore(&zone->lock, flags); > - > - if (!node) > - return BLKPREP_OK; > - zone = rb_entry(node, struct blk_zone, node); > + zone = blk_lookup_zone(q, sector, > + &z_start, &z_len, &lck); > if (!zone) > return BLKPREP_OK; > > 
- spin_lock_irqsave(&zone->lock, flags); > + spin_lock_irqsave(lck, flags); > nwp = sector + sectors; > } > - if (nwp > zone->wp) > - zone->wp = nwp; > + if (nwp > z_start + zone->wp) > + zone->wp = nwp - z_start; > } > goto out; > } > @@ -1044,37 +972,37 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, > /* let the drive fail the command */ > sd_zbc_debug_ratelimit(sdkp, > "zone %zu offline\n", > - zone->start); > + z_start); > goto out; > } > > if (op_is_write(req_op(rq))) { > if (zone->state == BLK_ZONE_READONLY) > goto out; > - if (blk_zone_is_full(zone)) { > + if (zone->wp == z_len) { > sd_zbc_debug(sdkp, > - "Write to full zone %zu/%zu\n", > - sector, zone->wp); > + "Write to full zone %zu/%zu/%zu\n", > + sector, (sector_t)zone->wp, z_len); > ret = BLKPREP_KILL; > goto out; > } > - if (zone->wp != sector) { > + if (sector != (z_start + zone->wp)) { > sd_zbc_debug(sdkp, > "Misaligned write %zu/%zu\n", > - sector, zone->wp); > + sector, z_start + zone->wp); > ret = BLKPREP_KILL; > goto out; > } > zone->wp += sectors; > - } else if (zone->wp <= sector + sectors) { > - if (zone->wp <= sector) { > + } else if (z_start + zone->wp <= sector + sectors) { > + if (z_start + zone->wp <= sector) { > /* Read beyond WP: clear request buffer */ > struct req_iterator iter; > struct bio_vec bvec; > void *buf; > sd_zbc_debug(sdkp, > "Read beyond wp %zu+%u/%zu\n", > - sector, sectors, zone->wp); > + sector, sectors, z_start + zone->wp); > rq_for_each_segment(bvec, rq, iter) { > buf = bvec_kmap_irq(&bvec, &flags); > memset(buf, 0, bvec.bv_len); > @@ -1085,15 +1013,15 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, > goto out; > } > /* Read straddle WP position: limit request size */ > - *num_sectors = zone->wp - sector; > + *num_sectors = z_start + zone->wp - sector; > sd_zbc_debug(sdkp, > "Read straddle wp %zu+%u/%zu => %zu+%u\n", > - sector, sectors, zone->wp, > + sector, sectors, z_start + zone->wp, > sector, *num_sectors); > } > > out: > - spin_unlock_irqrestore(&zone->lock, flags); > + spin_unlock_irqrestore(lck, flags); > > return ret; > } > @@ -1145,21 +1073,22 @@ static void update_zones_from_report(struct scsi_cmnd *cmd, u32 nr_bytes) > struct bdev_zone_descriptor *entry = &bzde[iter]; > sector_t s = get_start_from_desc(sdkp, entry); > sector_t z_len = get_len_from_desc(sdkp, entry); > + sector_t z_strt; > + spinlock_t *lck; > unsigned long flags; > > if (!z_len) > goto done; > > - zone = blk_lookup_zone(rq->q, s); > + zone = blk_lookup_zone(rq->q, s, &z_strt, &z_len, &lck); > if (!zone) > goto done; > > - spin_lock_irqsave(&zone->lock, flags); > + spin_lock_irqsave(lck, flags); > zone->type = entry->type & 0xF; > zone->state = (entry->flags >> 4) & 0xF; > zone->wp = get_wp_from_desc(sdkp, entry); > - zone->len = z_len; > - spin_unlock_irqrestore(&zone->lock, flags); > + spin_unlock_irqrestore(lck, flags); > } > nread += len; > if (!dmax) > @@ -1233,113 +1162,314 @@ void sd_zbc_uninit_command(struct scsi_cmnd *cmd) > } > > /** > - * sd_zbc_init - Load zones of matching zlen size into rb tree. > + * alloc_cpws() - Allocate space for a contiguous set of write pointers > + * @items: Number of wps needed. > + * @lba: lba of the start of the next zone. > + * @z_start: Starting lba of this contiguous set. > + * @z_size: Size of each zone this contiguous set. > * > + * Return: Allocated wps or NULL on error. 
> */ > -static int sd_zbc_init(struct scsi_disk *sdkp, u64 zlen, char *buf, int buf_len) > +static struct contiguous_wps *alloc_cpws(int items, u64 lba, u64 z_start, > + u64 z_size) > { > - sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity); > - sector_t last_sector; > + struct contiguous_wps *cwps = NULL; > + size_t sz; > > - if (test_and_set_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags)) { > - sdev_printk(KERN_WARNING, sdkp->device, > - "zone initialization already running\n"); > - return 0; > + sz = sizeof(struct contiguous_wps) + (items * sizeof(struct blk_zone)); > + if (items) { > + cwps = vzalloc(sz); > + if (!cwps) > + goto out; > + spin_lock_init(&cwps->lock); > + cwps->start_lba = z_start; > + cwps->last_lba = lba - 1; > + cwps->zone_size = z_size; > + cwps->is_zoned = items > 1 ? 1 : 0; > + cwps->zone_count = items; > } > > - if (!sdkp->zone_work_q) { > - char wq_name[32]; > +out: > + return cwps; > +} > > - sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name); > - sdkp->zone_work_q = create_singlethread_workqueue(wq_name); > - if (!sdkp->zone_work_q) { > - sdev_printk(KERN_WARNING, sdkp->device, > - "create zoned disk workqueue failed\n"); > - return -ENOMEM; > +/** > + * free_zone_wps() - Free up memory in use by wps > + * @zi: zone wps array(s). > + */ > +static void free_zone_wps(struct zone_wps *zi) > +{ > + /* on error free the arrays */ > + if (zi && zi->wps) { > + int ca; > + > + for (ca = 0; ca < zi->wps_count; ca++) { > + if (zi->wps[ca]) { > + vfree(zi->wps[ca]); > + zi->wps[ca] = NULL; > + } > } > - } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { > - drain_workqueue(sdkp->zone_work_q); > - clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags); > + kfree(zi->wps); > } > +} > > - last_sector = zbc_parse_zones(sdkp, zlen, buf, buf_len); > - capacity = logical_to_sectors(sdkp->device, sdkp->capacity); > - if (last_sector != -1 && last_sector < capacity) { > - sd_zbc_update_zones(sdkp, last_sector, > - SD_ZBC_BUF_SIZE, SD_ZBC_INIT); > - } else > - clear_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags); > +static int wps_realloc(struct zone_wps *zi, gfp_t gfp_mask) > +{ > + int rcode = 0; > + struct contiguous_wps **old; > + struct contiguous_wps **tmp; > + int n = zi->wps_count * 2; > + > + old = zi->wps; > + tmp = kzalloc(n, sizeof(*zi->wps), gfp_mask); Apologies, should be kcalloc() here. > + if (!tmp) { > + rcode = -ENOMEM; > + goto out; > + } > + memcpy(tmp, zi->wps, zi->wps_count * sizeof(*zi->wps)); > + zi->wps = tmp; > + kfree(old); > > - return 0; > +out: > + return rcode; > } > > +#define FMT_CHANGING_CAPACITY "Changing capacity from %zu to Max LBA+1 %zu" > + > /** > - * sd_zbc_config() - Configure a ZBC device (on attach) > - * @sdkp: SCSI disk being attached. > - * @buffer: Buffer to working data. > - * @buf_sz: Size of buffer to use for working data > + * zbc_init_zones() - Re-Sync expected WP location with drive > + * @sdkp: scsi_disk > + * @gfp_mask: Allocation mask. > * > - * Return: true of SD_ZBC_RESET_WP provisioning is supported > + * Return: 0 on success, otherwise error. 
> */ > -bool sd_zbc_config(struct scsi_disk *sdkp, void *buffer, size_t buf_sz) > +int zbc_init_zones(struct scsi_disk *sdkp, gfp_t gfp_mask) > { > - struct bdev_zone_report *bzrpt = buffer; > - u64 zone_len, lba; > - int retval; > - u32 rep_len; > - u8 same; > + struct request_queue *q = sdkp->disk->queue; > + int rcode = 0; > + int entry = 0; > + int offset; > + int offmax; > + u64 iter; > + u64 z_start = 0ul; > + u64 z_size = 0; /* size of zone */ > + int z_count = 0; /* number of zones of z_size */ > + int do_fill = 0; > + int array_count = 0; > + int one_time_setup = 0; > + u8 opt = ZBC_ZONE_REPORTING_OPTION_ALL; > + size_t bufsz = SD_ZBC_BUF_SIZE; > + struct bdev_zone_report *rpt = NULL; > + struct zone_wps *zi = NULL; > + struct contiguous_wps *cwps = NULL; > + > + if (q->zones) > + goto out; > > - if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC) > - /* > - * Device managed or normal SCSI disk, > - * no special handling required > - */ > - return false; > - > - retval = sd_zbc_report_zones(sdkp, bzrpt, buf_sz, > - 0, ZBC_ZONE_REPORTING_OPTION_ALL, false); > - if (retval < 0) > - return false; > - > - rep_len = be32_to_cpu(bzrpt->descriptor_count); > - if (rep_len < 7) { > - sd_printk(KERN_WARNING, sdkp, > - "REPORT ZONES report invalid length %u\n", > - rep_len); > - return false; > + zi = kzalloc(sizeof(*zi), gfp_mask); > + if (!zi) { > + rcode = -ENOMEM; > + goto out; > } > > - if (sdkp->rc_basis == 0) { > - /* The max_lba field is the capacity of a zoned device */ > - lba = be64_to_cpu(bzrpt->maximum_lba); > - if (lba + 1 > sdkp->capacity) { > - if (sdkp->first_scan) > - sd_printk(KERN_WARNING, sdkp, > - "Changing capacity from %zu to Max LBA+1 %zu\n", > - sdkp->capacity, (sector_t) lba + 1); > - sdkp->capacity = lba + 1; > + if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC) { > + struct gendisk *disk = sdkp->disk; > + > + zi->wps = kzalloc(sizeof(*zi->wps), gfp_mask); > + zi->wps[0] = alloc_cpws(1, disk->part0.nr_sects, z_start, 1); > + if (!zi->wps[0]) { > + rcode = -ENOMEM; > + goto out; > } > + zi->wps_count = 1; > + goto out; > + } > + > + rpt = kmalloc(bufsz, gfp_mask); > + if (!rpt) { > + rcode = -ENOMEM; > + goto out; > } > > /* > - * Adjust 'chunk_sectors' to the zone length if the device > - * supports equal zone sizes. > + * Start by handling upto 32 different zone sizes. 2 will work > + * for all the current drives, but maybe something exotic will > + * surface. 
> */ > - same = bzrpt->same_field & 0x0f; > - if (same > 3) { > - sd_printk(KERN_WARNING, sdkp, > - "REPORT ZONES SAME type %d not supported\n", same); > - return false; > + zi->wps = kcalloc(32, sizeof(*zi->wps), gfp_mask); > + zi->wps_count = 32; > + if (!zi->wps) { > + rcode = -ENOMEM; > + goto out; > } > - /* Read the zone length from the first zone descriptor */ > - zone_len = be64_to_cpu(bzrpt->descriptors[0].length); > - sdkp->unmap_alignment = zone_len; > - sdkp->unmap_granularity = zone_len; > - blk_queue_chunk_sectors(sdkp->disk->queue, > - logical_to_sectors(sdkp->device, zone_len)); > - > - sd_zbc_init(sdkp, zone_len, buffer, buf_sz); > - return true; > + > +fill: > + offset = 0; > + offmax = 0; > + for (entry = 0, iter = 0; iter < sdkp->capacity; entry++) { > + struct bdev_zone_descriptor *bzde; > + int stop_end = 0; > + int stop_size = 0; > + > + if (offset == 0) { > + int err; > + > + err = sd_zbc_report_zones(sdkp, rpt, bufsz, iter, opt); > + if (err) { > + pr_err("report zones-> %d\n", err); > + if (err != -ENOTSUPP) > + rcode = err; > + goto out; > + } > + if (sdkp->rc_basis == 0) { > + sector_t lba = be64_to_cpu(rpt->maximum_lba); > + > + if (lba + 1 > sdkp->capacity) { > + sd_printk(KERN_WARNING, sdkp, > + FMT_CHANGING_CAPACITY "\n", > + sdkp->capacity, lba + 1); > + sdkp->capacity = lba + 1; > + } > + } > + offmax = max_report_entries(bufsz); > + } > + bzde = &rpt->descriptors[offset]; > + if (z_size == 0) > + z_size = get_len_from_desc(sdkp, bzde); > + if (z_size != get_len_from_desc(sdkp, bzde)) > + stop_size = 1; > + if ((iter + z_size) >= sdkp->capacity) > + stop_end = 1; > + > + if (!one_time_setup) { > + u8 type = bzde->type & 0x0F; > + > + if (type != BLK_ZONE_TYPE_CONVENTIONAL) { > + one_time_setup = 1; > + blk_queue_chunk_sectors(sdkp->disk->queue, > + z_size); > + } > + } > + > + if (do_fill == 0) { > + if (stop_end || stop_size) { > + /* include the next/last zone? */ > + if (!stop_size) { > + z_count++; > + iter += z_size; > + } > + cwps = alloc_cpws(z_count, iter, > + z_start, z_size); > + if (!cwps) { > + rcode = -ENOMEM; > + goto out; > + } > + if (array_count > 0) > + cwps->is_zoned = 1; > + > + zi->wps[array_count] = cwps; > + z_start = iter; > + z_size = 0; > + z_count = 0; > + array_count++; > + if (array_count >= zi->wps_count) { > + rcode = wps_realloc(zi, gfp_mask); > + if (rcode) > + goto out; > + } > + /* add the runt zone */ > + if (stop_end && stop_size) { > + z_count++; > + z_size = get_len_from_desc(sdkp, bzde); > + cwps = alloc_cpws(z_count, > + iter + z_size, > + z_start, z_size); > + if (!cwps) { > + rcode = -ENOMEM; > + goto out; > + } > + if (array_count > 0) > + cwps->is_zoned = 1; > + zi->wps[array_count] = cwps; > + array_count++; > + } > + if (stop_end) { > + do_fill = 1; > + array_count = 0; > + z_count = 0; > + z_size = 0; > + goto fill; > + } > + } > + z_size = get_len_from_desc(sdkp, bzde); > + iter += z_size; > + z_count++; > + } else { > + fill_zone(zi->wps[array_count], z_count, sdkp, bzde); > + z_count++; > + iter += z_size; > + if (zi->wps[array_count]->zone_count == z_count) { > + z_count = 0; > + array_count++; > + zi->wps_count = array_count; > + } > + } > + offset++; > + if (offset >= offmax) > + offset = 0; > + } > +out: > + kfree(rpt); > + > + if (rcode) { > + if (zi) { > + free_zone_wps(zi); > + kfree(zi); > + } > + } else { > + q->zones = zi; > + } > + > + return rcode; > +} > + > +/** > + * sd_zbc_config() - Configure a ZBC device (on attach) > + * @sdkp: SCSI disk being attached. 
> + * @gfp_mask: Memory allocation strategy > + * > + * Return: true of SD_ZBC_RESET_WP provisioning is supported > + */ > +bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp_mask) > +{ > + bool can_reset_wp = false; > + > + if (zbc_init_zones(sdkp, gfp_mask)) { > + sdev_printk(KERN_WARNING, sdkp->device, > + "Initialize zone cache failed\n"); > + goto out; > + } > + > + if (sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC) > + can_reset_wp = true; > + > + if (!sdkp->zone_work_q) { > + char wq_name[32]; > + > + sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name); > + sdkp->zone_work_q = create_singlethread_workqueue(wq_name); > + if (!sdkp->zone_work_q) { > + sdev_printk(KERN_WARNING, sdkp->device, > + "create zoned disk workqueue failed\n"); > + goto out; > + } > + } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { > + drain_workqueue(sdkp->zone_work_q); > + clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags); > + } > + > +out: > + return can_reset_wp; > } > > /** > @@ -1365,15 +1495,16 @@ void sd_zbc_remove(struct scsi_disk *sdkp) > */ > unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp) > { > - unsigned int bytes = 1; > struct request_queue *q = sdkp->disk->queue; > - struct rb_node *node = rb_first(&q->zones); > + struct zone_wps *zi = q->zones; > + unsigned int bytes = 1; > > - if (node) { > - struct blk_zone *zone = rb_entry(node, struct blk_zone, node); > + if (zi && zi->wps_count > 0) { > + struct contiguous_wps *wp = zi->wps[0]; > > - bytes = zone->len; > + bytes = wp->zone_size; > } > + > bytes <<= ilog2(sdkp->device->sector_size); > return bytes; > } > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index d5cdb5d..113c5a8 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -264,27 +264,83 @@ struct blk_queue_tag { > > #ifdef CONFIG_BLK_DEV_ZONED > > +/** > + * struct blk_zone - A single zone type/stats and WP offset. > + * > + * @wp: Holds the wp offset from the start of the zone. > + * @type: Holds the zone type nibble. > + * @state: Holds the zone state nibble + kernel (zone busy) > + * @private_data: Used to hold whatever the implicit domain owner > + * of the zone needs to track. > + * > + * Type is left at 4 bits (only 2 are needed currently) to match > + * the current ZBC/ZAC standards. > + * > + * State is using 5 bits to accommodate the ZONE_BUSY. The first 4 bits > + * match the current ZBC/ZAC spec. > + * ZONE_BUSY could be mapped to one of the reserved bits. Using it as > + * mask bit or independent flag my be useful for decoding the zone > + * state before it transitioned to BUSY. > + * > + * A zone sized at order (39+9) is very unlikely (current zones are 16+9) > + * Even at lba48 equivalent number of sectors we have a large amount > + * of padding to fill out 8 bytes. > + * > + * Getting this to fit in 4 bytes would limit the maximum size of a zone > + * to 4G [order 23 of 512 byte sectors + 9 bits for flags] which is probably > + * okay for embedded or 32-bit systems where the private_data pointer > + * would also shrink to 32 bits. There are also WP tracking schemes > + * that don't make use of the private_data helper so perhaps that > + * could be factored out as well. 
> + */ > struct blk_zone { > - struct rb_node node; > - spinlock_t lock; > - sector_t start; > - size_t len; > - sector_t wp; > - enum blk_zone_type type; > - enum blk_zone_state state; > + unsigned long long wp:39; > + unsigned long long type:4; > + unsigned long long state:5; > + unsigned long long padding:15; > void *private_data; > }; > > +/** > + * struct contiguous_wps - A descriptor of zones of the same size > + * > + * @start_lba: LBA of first zone covered by the descriptor. > + * @last_lba: LBA of last zone. > + * @zone_size: Size of zones as a number of 512 byte sectors. > + * @zone_count: Number of zones (last-start/size) for convenience. > + * @lock: A spinlock protecting these zones. > + * @is_zoned: 0 when all zones are conventional no WP zones. > + * zones: Array of blk_zone entries. > + */ > +struct contiguous_wps { > + u64 start_lba; > + u64 last_lba; > + u64 zone_size; > + u32 zone_count; > + spinlock_t lock; > + unsigned is_zoned:1; > + struct blk_zone zones[0]; > +}; > + > +/** > + * struct zone_wps - A collection of zone descriptors to describe zoned media. > + * > + * @wps_count: Number of descriptors. > + * @wps: Array of zone descriptors. > + */ > +struct zone_wps { > + u32 wps_count; > + struct contiguous_wps **wps; > +}; > + > #define blk_zone_is_seq_req(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_REQ) > #define blk_zone_is_seq_pref(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_PREF) > #define blk_zone_is_smr(z) (blk_zone_is_seq_req(z) || blk_zone_is_seq_pref(z)) > #define blk_zone_is_cmr(z) ((z)->type == BLK_ZONE_TYPE_CONVENTIONAL) > -#define blk_zone_is_full(z) ((z)->wp == (z)->start + (z)->len) > -#define blk_zone_is_empty(z) ((z)->wp == (z)->start) > +#define blk_zone_is_empty(z) ((z)->wp == 0) > > -extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t); > -extern struct blk_zone *blk_insert_zone(struct request_queue *, > - struct blk_zone *); > +extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t, > + sector_t *, sector_t *, spinlock_t **); > extern void blk_drop_zones(struct request_queue *); > #else > static inline void blk_drop_zones(struct request_queue *q) { }; > @@ -463,7 +519,7 @@ struct request_queue { > struct queue_limits limits; > > #ifdef CONFIG_BLK_DEV_ZONED > - struct rb_root zones; > + struct zone_wps *zones; > #endif > /* > * sg stuff > -- > 2.9.3 >
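The 16-bytes-per-zone figure in the changelog follows directly from the packed struct blk_zone introduced in blkdev.h above: one 64-bit word of bitfields (wp:39, type:4, state:5, padding:15) plus the private_data pointer. A minimal userspace sketch, assuming a 64-bit build (the struct layout is copied from the patch; the test harness around it is not part of the patch):

#include <assert.h>
#include <stdio.h>

/* Packed zone record as proposed in the patch: 8 bytes of bitfields + an 8-byte pointer. */
struct blk_zone {
	unsigned long long wp:39;	/* write pointer offset from the zone start */
	unsigned long long type:4;	/* ZBC/ZAC zone type nibble */
	unsigned long long state:5;	/* ZBC/ZAC zone state + kernel BUSY */
	unsigned long long padding:15;
	void *private_data;
};

int main(void)
{
	printf("bytes per zone: %zu\n", sizeof(struct blk_zone));
	assert(sizeof(struct blk_zone) == 16);	/* vs. ~120 bytes for the RB-tree node version */
	return 0;
}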
On 08/22/2016 06:34 AM, Shaun Tancheff wrote: > Currently the RB-Tree zone cache is fast and flexible. It does > use a rather largish amount of ram. This model reduces the ram > required from 120 bytes per zone to 16 bytes per zone with a > moderate transformation of the blk_zone_lookup() api. > > This model is predicated on the belief that most variations > on zoned media will follow a pattern of using collections of same > sized zones on a single device. Similar to the pattern of erase > blocks on flash devices being progressivly larger 16K, 64K, ... > > The goal is to be able to build a descriptor which is both memory > efficient, performant, and flexible. > > Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com> > --- > block/blk-core.c | 2 +- > block/blk-sysfs.c | 31 +- > block/blk-zoned.c | 103 +++-- > drivers/scsi/sd.c | 5 +- > drivers/scsi/sd.h | 4 +- > drivers/scsi/sd_zbc.c | 1025 +++++++++++++++++++++++++++--------------------- > include/linux/blkdev.h | 82 +++- > 7 files changed, 716 insertions(+), 536 deletions(-) > Have you measured the performance impact here? The main idea behind using an RB-tree is that each single element will fit in the CPU cache; using an array will prevent that. So we will increase the number of cache flushes, and most likely a performance penalty, too. Hence I'd rather like to see a performance measurement here before going down that road. Cheers, Hannes
On Mon, Aug 22, 2016 at 2:11 AM, Hannes Reinecke <hare@suse.de> wrote: > On 08/22/2016 06:34 AM, Shaun Tancheff wrote: >> Currently the RB-Tree zone cache is fast and flexible. It does >> use a rather largish amount of ram. This model reduces the ram >> required from 120 bytes per zone to 16 bytes per zone with a >> moderate transformation of the blk_zone_lookup() api. >> >> This model is predicated on the belief that most variations >> on zoned media will follow a pattern of using collections of same >> sized zones on a single device. Similar to the pattern of erase >> blocks on flash devices being progressivly larger 16K, 64K, ... >> >> The goal is to be able to build a descriptor which is both memory >> efficient, performant, and flexible. >> >> Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com> >> --- >> block/blk-core.c | 2 +- >> block/blk-sysfs.c | 31 +- >> block/blk-zoned.c | 103 +++-- >> drivers/scsi/sd.c | 5 +- >> drivers/scsi/sd.h | 4 +- >> drivers/scsi/sd_zbc.c | 1025 +++++++++++++++++++++++++++--------------------- >> include/linux/blkdev.h | 82 +++- >> 7 files changed, 716 insertions(+), 536 deletions(-) > Have you measured the performance impact here? As far as actual hardware (HostAware) I am seeing the same I/O performance. I suspect it's just that below 100k iops the zone cache just isn't a bottleneck. > The main idea behind using an RB-tree is that each single element will fit in the CPU cache; using an array will prevent that. > So we will increase the number of cache flushes, and most likely a performance penalty, too. > Hence I'd rather like to see a performance measurement here before going down that road. I think it will have to be a simulated benchmark, if that's okay. Of course I'm open to suggestions if there is something you have in mind.
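Such a simulated benchmark only has to exercise the lookup path: with the proposed layout, blk_lookup_zone() reduces to a short linear scan over the few same-size zone groups followed by one division, versus a pointer-chasing descent of the RB-tree. A rough userspace sketch of the lookup arithmetic such a benchmark would time (the group layout and names below are made up for illustration, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for struct contiguous_wps: one run of equal-size zones. */
struct wp_group {
	uint64_t start_lba;	/* first sector covered by this group */
	uint64_t last_lba;	/* first sector past the group */
	uint64_t zone_size;	/* zone size in 512-byte sectors */
};

/* Mirrors the approach in blk_lookup_zone(): scan the (few) groups, then index by division. */
static int64_t zone_index(const struct wp_group *g, int count, uint64_t sector)
{
	int i;

	for (i = 0; i < count; i++)
		if (sector >= g[i].start_lba && sector < g[i].last_lba)
			return (sector - g[i].start_lba) / g[i].zone_size;
	return -1;	/* sector beyond the end of the device */
}

int main(void)
{
	/* Hypothetical host-aware layout: a small conventional group, then 256 MiB zones. */
	struct wp_group groups[] = {
		{ .start_lba = 0,          .last_lba = 1ULL << 19, .zone_size = 1ULL << 16 },
		{ .start_lba = 1ULL << 19, .last_lba = 1ULL << 34, .zone_size = 1ULL << 19 },
	};

	printf("sector 0x1234567 falls in zone %lld of its group\n",
	       (long long)zone_index(groups, 2, 0x1234567));
	return 0;
}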
diff --git a/block/blk-core.c b/block/blk-core.c index 3a9caf7..3b084a8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -727,7 +727,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) INIT_LIST_HEAD(&q->blkg_list); #endif #ifdef CONFIG_BLK_DEV_ZONED - q->zones = RB_ROOT; + q->zones = NULL; #endif INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 43f441f..ecbd434 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -232,36 +232,7 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) #ifdef CONFIG_BLK_DEV_ZONED static ssize_t queue_zoned_show(struct request_queue *q, char *page) { - struct rb_node *node; - struct blk_zone *zone; - ssize_t offset = 0, end = 0; - size_t size = 0, num = 0; - enum blk_zone_type type = BLK_ZONE_TYPE_UNKNOWN; - - for (node = rb_first(&q->zones); node; node = rb_next(node)) { - zone = rb_entry(node, struct blk_zone, node); - if (zone->type != type || - zone->len != size || - end != zone->start) { - if (size != 0) - offset += sprintf(page + offset, "%zu\n", num); - /* We can only store one page ... */ - if (offset + 42 > PAGE_SIZE) { - offset += sprintf(page + offset, "...\n"); - return offset; - } - size = zone->len; - type = zone->type; - offset += sprintf(page + offset, "%zu %zu %d ", - zone->start, size, type); - num = 0; - end = zone->start + size; - } else - end += zone->len; - num++; - } - offset += sprintf(page + offset, "%zu\n", num); - return offset; + return sprintf(page, "%u\n", q->zones ? 1 : 0); } #endif diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 975e863..338a1af 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -8,63 +8,84 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blkdev.h> -#include <linux/rbtree.h> +#include <linux/vmalloc.h> -struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t lba) +/** + * blk_lookup_zone() - Lookup zones + * @q: Request Queue + * @sector: Location to lookup + * @start: Pointer to starting location zone (OUT) + * @len: Pointer to length of zone (OUT) + * @lock: Pointer to spinlock of zones in owning descriptor (OUT) + */ +struct blk_zone *blk_lookup_zone(struct request_queue *q, sector_t sector, + sector_t *start, sector_t *len, + spinlock_t **lock) { - struct rb_root *root = &q->zones; - struct rb_node *node = root->rb_node; + int iter; + struct blk_zone *bzone = NULL; + struct zone_wps *zi = q->zones; + + *start = 0; + *len = 0; + *lock = NULL; + + if (!q->zones) + goto out; - while (node) { - struct blk_zone *zone = container_of(node, struct blk_zone, - node); + for (iter = 0; iter < zi->wps_count; iter++) { + if (sector >= zi->wps[iter]->start_lba && + sector < zi->wps[iter]->last_lba) { + struct contiguous_wps *wp = zi->wps[iter]; + u64 index = (sector - wp->start_lba) / wp->zone_size; - if (lba < zone->start) - node = node->rb_left; - else if (lba >= zone->start + zone->len) - node = node->rb_right; - else - return zone; + if (index >= wp->zone_count) { + WARN(1, "Impossible index for zone\n"); + goto out; + } + + bzone = &wp->zones[index]; + *len = wp->zone_size; + *start = wp->start_lba + (index * wp->zone_size); + *lock = &wp->lock; + } } - return NULL; + +out: + return bzone; } EXPORT_SYMBOL_GPL(blk_lookup_zone); -struct blk_zone *blk_insert_zone(struct request_queue *q, struct blk_zone *data) +/** + * free_zone_wps() - Free up memory in use by wps + * @zi: zone wps array(s). 
+ */ +static void free_zone_wps(struct zone_wps *zi) { - struct rb_root *root = &q->zones; - struct rb_node **new = &(root->rb_node), *parent = NULL; + /* on error free the arrays */ + if (zi && zi->wps) { + int ca; - /* Figure out where to put new node */ - while (*new) { - struct blk_zone *this = container_of(*new, struct blk_zone, - node); - parent = *new; - if (data->start + data->len <= this->start) - new = &((*new)->rb_left); - else if (data->start >= this->start + this->len) - new = &((*new)->rb_right); - else { - /* Return existing zone */ - return this; + for (ca = 0; ca < zi->wps_count; ca++) { + if (zi->wps[ca]) { + vfree(zi->wps[ca]); + zi->wps[ca] = NULL; + } } + kfree(zi->wps); } - /* Add new node and rebalance tree. */ - rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); - - return NULL; } -EXPORT_SYMBOL_GPL(blk_insert_zone); +/** + * blk_drop_zones() - Free zones + * @q: Request Queue + */ void blk_drop_zones(struct request_queue *q) { - struct rb_root *root = &q->zones; - struct blk_zone *zone, *next; - - rbtree_postorder_for_each_entry_safe(zone, next, root, node) { - kfree(zone); + if (q->zones) { + free_zone_wps(q->zones); + kfree(q->zones); + q->zones = NULL; } - q->zones = RB_ROOT; } EXPORT_SYMBOL_GPL(blk_drop_zones); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index f144df4..0f749f5 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2549,8 +2549,9 @@ got_data: sdkp->physical_block_size); sdkp->device->sector_size = sector_size; - if (sd_zbc_config(sdkp, buffer, SD_BUF_SIZE)) - sd_config_discard(sdkp, SD_ZBC_RESET_WP); + if (sdkp->first_scan) + if (sd_zbc_config(sdkp, GFP_KERNEL)) + sd_config_discard(sdkp, SD_ZBC_RESET_WP); { char cap_str_2[10], cap_str_10[10]; diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index fc766db..c9c79e9 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -299,13 +299,13 @@ extern void sd_zbc_uninit_command(struct scsi_cmnd *cmd); extern void sd_zbc_remove(struct scsi_disk *); extern void sd_zbc_reset_zones(struct scsi_disk *); extern void sd_zbc_update_zones(struct scsi_disk *, sector_t, int, int reason); -extern bool sd_zbc_config(struct scsi_disk *, void *, size_t); +extern bool sd_zbc_config(struct scsi_disk *, gfp_t); extern unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp); #else /* CONFIG_SCSI_ZBC */ -static inline bool sd_zbc_config(struct scsi_disk *sdkp, void *b, size_t sz) +static inline bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp) { return false; } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 960af93..c087035 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/rbtree.h> +#include <linux/vmalloc.h> #include <asm/unaligned.h> @@ -51,11 +52,11 @@ } while( 0 ) struct zbc_update_work { - struct work_struct zone_work; - struct scsi_disk *sdkp; - sector_t zone_sector; - int zone_buflen; - char zone_buf[0]; + struct work_struct zone_work; + struct scsi_disk *sdkp; + sector_t zone_sector; + int zone_buflen; + struct bdev_zone_report zone_buf[0]; }; /** @@ -95,102 +96,19 @@ static inline sector_t get_start_from_desc(struct scsi_disk *sdkp, return logical_to_sectors(sdkp->device, be64_to_cpu(bzde->lba_start)); } -static -struct blk_zone *zbc_desc_to_zone(struct scsi_disk *sdkp, unsigned char *rec) +static void _fill_zone(struct blk_zone *zone, struct scsi_disk *sdkp, + struct bdev_zone_descriptor *bzde) { - struct blk_zone *zone; - sector_t wp = (sector_t)-1; - - zone = 
kzalloc(sizeof(struct blk_zone), GFP_KERNEL); - if (!zone) - return NULL; - - spin_lock_init(&zone->lock); - zone->type = rec[0] & 0xf; - zone->state = (rec[1] >> 4) & 0xf; - zone->len = logical_to_sectors(sdkp->device, - get_unaligned_be64(&rec[8])); - zone->start = logical_to_sectors(sdkp->device, - get_unaligned_be64(&rec[16])); - - if (blk_zone_is_smr(zone)) - wp = logical_to_sectors(sdkp->device, - get_unaligned_be64(&rec[24])); - zone->wp = wp; - /* - * Fixup block zone state - */ - if (zone->state == BLK_ZONE_EMPTY && - zone->wp != zone->start) { - sd_zbc_debug(sdkp, - "zone %zu state EMPTY wp %zu: adjust wp\n", - zone->start, zone->wp); - zone->wp = zone->start; - } - if (zone->state == BLK_ZONE_FULL && - zone->wp != zone->start + zone->len) { - sd_zbc_debug(sdkp, - "zone %zu state FULL wp %zu: adjust wp\n", - zone->start, zone->wp); - zone->wp = zone->start + zone->len; - } - - return zone; + zone->type = bzde->type & 0x0f; + zone->state = (bzde->flags >> 4) & 0x0f; + zone->wp = get_wp_from_desc(sdkp, bzde); } -static -sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf, - unsigned int buf_len) -{ - struct request_queue *q = sdkp->disk->queue; - unsigned char *rec = buf; - int rec_no = 0; - unsigned int list_length; - sector_t next_sector = -1; - u8 same; - - /* Parse REPORT ZONES header */ - list_length = get_unaligned_be32(&buf[0]); - same = buf[4] & 0xf; - rec = buf + 64; - list_length += 64; - - if (list_length < buf_len) - buf_len = list_length; - - while (rec < buf + buf_len) { - struct blk_zone *this, *old; - unsigned long flags; - this = zbc_desc_to_zone(sdkp, rec); - if (!this) - break; - - if (same == 0 && this->len != zlen) { - next_sector = this->start + this->len; - break; - } - - next_sector = this->start + this->len; - old = blk_insert_zone(q, this); - if (old) { - spin_lock_irqsave(&old->lock, flags); - if (blk_zone_is_smr(old)) { - old->wp = this->wp; - old->state = this->state; - } - spin_unlock_irqrestore(&old->lock, flags); - kfree(this); - } - rec += 64; - rec_no++; - } - - sd_zbc_debug(sdkp, - "Inserted %d zones, next sector %zu len %d\n", - rec_no, next_sector, list_length); - - return next_sector; +static void fill_zone(struct contiguous_wps *cwps, int z_count, + struct scsi_disk *sdkp, struct bdev_zone_descriptor *bzde) +{ + _fill_zone(&cwps->zones[z_count], sdkp, bzde); } /** @@ -200,12 +118,10 @@ sector_t zbc_parse_zones(struct scsi_disk *sdkp, u64 zlen, unsigned char *buf, * @bufflen: length of @buffer * @start_sector: logical sector for the zone information should be reported * @option: reporting option to be used - * @partial: flag to set the 'partial' bit for report zones command */ -static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer, - int bufflen, sector_t start_sector, - enum zbc_zone_reporting_options option, - bool partial) +static int sd_zbc_report_zones(struct scsi_disk *sdkp, + struct bdev_zone_report *buffer, + int bufflen, sector_t start_sector, u8 option) { struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout @@ -225,7 +141,7 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, void *buffer, cmd[1] = ZI_REPORT_ZONES; put_unaligned_be64(start_lba, &cmd[2]); put_unaligned_be32(bufflen, &cmd[10]); - cmd[14] = (partial ? 
ZBC_REPORT_ZONE_PARTIAL : 0) | option; + cmd[14] = option; memset(buffer, 0, bufflen); result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, @@ -248,49 +164,38 @@ static void sd_zbc_refresh_zone_work(struct work_struct *work) container_of(work, struct zbc_update_work, zone_work); struct scsi_disk *sdkp = zbc_work->sdkp; struct request_queue *q = sdkp->disk->queue; - unsigned char *zone_buf = zbc_work->zone_buf; + struct bdev_zone_report *rpt = zbc_work->zone_buf; unsigned int zone_buflen = zbc_work->zone_buflen; + struct bdev_zone_descriptor *bzde; + int iter; + int offmax; + sector_t z_at, z_start, z_len; + spinlock_t *lock; + struct blk_zone *zone; int ret; - u8 same; - u64 zlen = 0; sector_t last_sector; sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity); - ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen, + ret = sd_zbc_report_zones(sdkp, rpt, zone_buflen, zbc_work->zone_sector, - ZBC_ZONE_REPORTING_OPTION_ALL, true); + ZBC_ZONE_REPORTING_OPTION_ALL); if (ret) goto done_free; - /* this whole path is unlikely so extra reports shouldn't be a - * large impact */ - same = zone_buf[4] & 0xf; - if (same == 0) { - unsigned char *desc = &zone_buf[64]; - unsigned int blen = zone_buflen; - - /* just pull the first zone */ - if (blen > 512) - blen = 512; - ret = sd_zbc_report_zones(sdkp, zone_buf, blen, 0, - ZBC_ZONE_REPORTING_OPTION_ALL, true); - if (ret) - goto done_free; - - /* Read the zone length from the first zone descriptor */ - zlen = logical_to_sectors(sdkp->device, - get_unaligned_be64(&desc[8])); - - ret = sd_zbc_report_zones(sdkp, zone_buf, zone_buflen, - zbc_work->zone_sector, - ZBC_ZONE_REPORTING_OPTION_ALL, true); - if (ret) - goto done_free; + offmax = max_report_entries(zone_buflen); + for (iter = 0; iter < offmax; iter++) { + bzde = &rpt->descriptors[iter]; + z_at = get_start_from_desc(sdkp, bzde); + if (!z_at) + break; + zone = blk_lookup_zone(q, z_at, &z_start, &z_len, &lock); + if (zone) { + _fill_zone(zone, sdkp, bzde); + last_sector = z_start + z_len; + } } - last_sector = zbc_parse_zones(sdkp, zlen, zone_buf, zone_buflen); - capacity = logical_to_sectors(sdkp->device, sdkp->capacity); - if (last_sector != -1 && last_sector < capacity) { + if (sdkp->zone_work_q && last_sector != -1 && last_sector < capacity) { if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { sd_zbc_debug(sdkp, "zones in reset, canceling refresh\n"); @@ -333,10 +238,7 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize, { struct request_queue *q = sdkp->disk->queue; struct zbc_update_work *zbc_work; - struct blk_zone *zone; - struct rb_node *node; - int zone_num = 0, zone_busy = 0, num_rec; - sector_t next_sector = sector; + int num_rec; if (test_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { sd_zbc_debug(sdkp, @@ -346,18 +248,23 @@ void sd_zbc_update_zones(struct scsi_disk *sdkp, sector_t sector, int bufsize, if (reason != SD_ZBC_INIT) { /* lookup sector, is zone pref? 
then ignore */ - struct blk_zone *zone = blk_lookup_zone(q, sector); - + sector_t z_start, z_len; + spinlock_t *lck; + struct blk_zone *zone = blk_lookup_zone(q, sector, &z_start, + &z_len, &lck); + /* zone actions on conventional zones are invalid */ + if (zone && reason == SD_ZBC_RESET_WP && blk_zone_is_cmr(zone)) + return; if (reason == SD_ZBC_RESET_WP) sd_zbc_debug(sdkp, "RESET WP failed %lx\n", sector); - - if (zone && blk_zone_is_seq_pref(zone)) - return; } + if (!sdkp->zone_work_q) + return; + retry: zbc_work = kzalloc(sizeof(struct zbc_update_work) + bufsize, - reason != SD_ZBC_INIT ? GFP_NOWAIT : GFP_KERNEL); + reason != SD_ZBC_INIT ? GFP_ATOMIC : GFP_KERNEL); if (!zbc_work) { if (bufsize > 512) { sd_zbc_debug(sdkp, @@ -381,30 +288,40 @@ retry: * Mark zones under update as BUSY */ if (reason != SD_ZBC_INIT) { - for (node = rb_first(&q->zones); node; node = rb_next(node)) { - unsigned long flags; - - zone = rb_entry(node, struct blk_zone, node); - if (num_rec == 0) + unsigned long flags; + int iter; + struct zone_wps *zi = q->zones; + struct contiguous_wps *wp = NULL; + u64 index = -1; + int zone_busy = 0; + int z_flgd = 0; + + for (iter = 0; iter < zi->wps_count; iter++) { + if (sector >= zi->wps[iter]->start_lba && + sector < zi->wps[iter]->last_lba) { + wp = zi->wps[iter]; break; - if (zone->start != next_sector) - continue; - next_sector += zone->len; - num_rec--; - - spin_lock_irqsave(&zone->lock, flags); - if (blk_zone_is_smr(zone)) { - if (zone->state == BLK_ZONE_BUSY) { + } + } + if (wp) { + spin_lock_irqsave(&wp->lock, flags); + index = (sector - wp->start_lba) / wp->zone_size; + while (index < wp->zone_count && z_flgd < num_rec) { + struct blk_zone *bzone = &wp->zones[index]; + + index++; + z_flgd++; + if (!blk_zone_is_smr(bzone)) + continue; + + if (bzone->state == BLK_ZONE_BUSY) zone_busy++; - } else { - zone->state = BLK_ZONE_BUSY; - zone->wp = zone->start; - } - zone_num++; + else + bzone->state = BLK_ZONE_BUSY; } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(&wp->lock, flags); } - if (zone_num && (zone_num == zone_busy)) { + if (z_flgd && (z_flgd == zone_busy)) { sd_zbc_debug(sdkp, "zone update for %zu in progress\n", sector); @@ -476,43 +393,26 @@ static void discard_or_write_same(struct scsi_cmnd *cmd, sector_t sector, int sd_zbc_setup_discard(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; - struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); sector_t sector = blk_rq_pos(rq); unsigned int nr_sectors = blk_rq_sectors(rq); int ret = BLKPREP_OK; struct blk_zone *zone; unsigned long flags; - u32 wp_offset; bool use_write_same = false; + sector_t z_start, z_len; + spinlock_t *lck; - zone = blk_lookup_zone(rq->q, sector); - if (!zone) { - /* Test for a runt zone before giving up */ - if (sdp->type != TYPE_ZBC) { - struct request_queue *q = rq->q; - struct rb_node *node; - - node = rb_last(&q->zones); - if (node) - zone = rb_entry(node, struct blk_zone, node); - if (zone) { - spin_lock_irqsave(&zone->lock, flags); - if ((zone->start + zone->len) <= sector) - goto out; - spin_unlock_irqrestore(&zone->lock, flags); - zone = NULL; - } - } + zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck); + if (!zone) return BLKPREP_KILL; - } - spin_lock_irqsave(&zone->lock, flags); + spin_lock_irqsave(lck, flags); if (zone->state == BLK_ZONE_UNKNOWN || zone->state == BLK_ZONE_BUSY) { sd_zbc_debug_ratelimit(sdkp, "Discarding zone %zx state %x, deferring\n", - zone->start, zone->state); + z_start, 
zone->state); ret = BLKPREP_DEFER; goto out; } @@ -520,39 +420,37 @@ int sd_zbc_setup_discard(struct scsi_cmnd *cmd) /* let the drive fail the command */ sd_zbc_debug_ratelimit(sdkp, "Discarding offline zone %zx\n", - zone->start); + z_start); goto out; } if (blk_zone_is_cmr(zone)) { use_write_same = true; sd_zbc_debug_ratelimit(sdkp, - "Discarding CMR zone %zx\n", - zone->start); + "Discarding CMR zone %zx\n", z_start); goto out; } - if (zone->start != sector || zone->len < nr_sectors) { + if (z_start != sector || z_len < nr_sectors) { sd_printk(KERN_ERR, sdkp, "Misaligned RESET WP %zx/%x on zone %zx/%zx\n", - sector, nr_sectors, zone->start, zone->len); + sector, nr_sectors, z_start, z_len); ret = BLKPREP_KILL; goto out; } /* Protect against Reset WP when more data had been written to the * zone than is being discarded. */ - wp_offset = zone->wp - zone->start; - if (wp_offset > nr_sectors) { + if (zone->wp > nr_sectors) { sd_printk(KERN_ERR, sdkp, - "Will Corrupt RESET WP %zx/%x/%x on zone %zx/%zx/%zx\n", - sector, wp_offset, nr_sectors, - zone->start, zone->wp, zone->len); + "Will Corrupt RESET WP %zx/%zx/%x on zone %zx/%zx/%zx\n", + sector, (sector_t)zone->wp, nr_sectors, + z_start, z_start + zone->wp, z_len); ret = BLKPREP_KILL; goto out; } if (blk_zone_is_empty(zone)) { sd_zbc_debug_ratelimit(sdkp, "Discarding empty zone %zx [WP: %zx]\n", - zone->start, zone->wp); + z_start, (sector_t)zone->wp); ret = BLKPREP_DONE; goto out; } @@ -563,8 +461,8 @@ out: * zone update if RESET WRITE POINTER fails. */ if (ret == BLKPREP_OK && !use_write_same) - zone->wp = zone->start; - spin_unlock_irqrestore(&zone->lock, flags); + zone->wp = 0; + spin_unlock_irqrestore(lck, flags); if (ret == BLKPREP_OK) discard_or_write_same(cmd, sector, nr_sectors, use_write_same); @@ -573,13 +471,14 @@ out: } -static void __set_zone_state(struct blk_zone *zone, int op) +static void __set_zone_state(struct blk_zone *zone, sector_t z_len, + spinlock_t *lck, int op) { unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); - if (blk_zone_is_cmr(zone)) - goto out_unlock; + spin_lock_irqsave(lck, flags); + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + goto out; switch (op) { case REQ_OP_ZONE_OPEN: @@ -587,38 +486,45 @@ static void __set_zone_state(struct blk_zone *zone, int op) break; case REQ_OP_ZONE_FINISH: zone->state = BLK_ZONE_FULL; - zone->wp = zone->start + zone->len; + zone->wp = z_len; break; case REQ_OP_ZONE_CLOSE: zone->state = BLK_ZONE_CLOSED; break; case REQ_OP_ZONE_RESET: - zone->wp = zone->start; + zone->wp = 0; break; default: WARN_ONCE(1, "%s: invalid op code: %u\n", __func__, op); } -out_unlock: - spin_unlock_irqrestore(&zone->lock, flags); +out: + spin_unlock_irqrestore(lck, flags); } static void update_zone_state(struct request *rq, sector_t lba, unsigned int op) { - struct request_queue *q = rq->q; - struct blk_zone *zone = NULL; + struct blk_zone *zone; if (lba == ~0ul) { - struct rb_node *node; - - for (node = rb_first(&q->zones); node; node = rb_next(node)) { - zone = rb_entry(node, struct blk_zone, node); - __set_zone_state(zone, op); + struct zone_wps *zi = rq->q->zones; + struct contiguous_wps *wp; + u32 iter, entry; + + for (iter = 0; iter < zi->wps_count; iter++) { + wp = zi->wps[iter]; + for (entry = 0; entry < wp->zone_count; entry++) { + zone = &wp->zones[entry]; + __set_zone_state(zone, wp->zone_size, &wp->lock, + op); + } } - return; } else { - zone = blk_lookup_zone(q, lba); + sector_t z_start, z_len; + spinlock_t *lck; + + zone = blk_lookup_zone(rq->q, lba, &z_start, &z_len, 
&lck); if (zone) - __set_zone_state(zone, op); + __set_zone_state(zone, z_len, lck, op); } } @@ -641,6 +547,8 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd) struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); sector_t sector = blk_rq_pos(rq); struct blk_zone *zone; + spinlock_t *lck; + sector_t z_start, z_len; unsigned long flags; unsigned int nr_sectors; int ret = BLKPREP_DONE; @@ -651,17 +559,17 @@ int sd_zbc_setup_zone_action(struct scsi_cmnd *cmd) if (is_fua || op != REQ_OP_ZONE_RESET) goto out; - zone = blk_lookup_zone(rq->q, sector); + zone = blk_lookup_zone(rq->q, sector, &z_start, &z_len, &lck); if (!zone || sdkp->provisioning_mode != SD_ZBC_RESET_WP) goto out; /* Map a Reset WP w/o FUA to a discard request */ - spin_lock_irqsave(&zone->lock, flags); - sector = zone->start; - nr_sectors = zone->len; + spin_lock_irqsave(lck, flags); + sector = z_start; + nr_sectors = z_len; if (blk_zone_is_cmr(zone)) use_write_same = true; - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(lck, flags); rq->completion_data = NULL; if (use_write_same) { @@ -712,137 +620,157 @@ static sector_t bzrpt_fill(struct request *rq, struct bdev_zone_descriptor *bzd, size_t sz, sector_t lba, u8 opt) { - struct request_queue *q = rq->q; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_device *sdp = sdkp->device; + struct zone_wps *zi = rq->q->zones; + struct contiguous_wps *wpdscr; struct blk_zone *zone = NULL; - struct rb_node *node = NULL; sector_t progress = lba; sector_t clen = ~0ul; + sector_t z_start, z_len, z_wp_abs; unsigned long flags; u32 max_entries = bzrpt ? max_report_entries(sz) : sz / sizeof(*bzd); u32 entry = 0; + u32 iter, idscr; int len_diffs = 0; int type_diffs = 0; u8 ctype; u8 same = 0; - zone = blk_lookup_zone(q, lba); - if (zone) - node = &zone->node; - - for (entry = 0; entry < max_entries && node; node = rb_next(node)) { - u64 z_len, z_start, z_wp_abs; - u8 cond = 0; - u8 flgs = 0; - - spin_lock_irqsave(&zone->lock, flags); - z_len = zone->len; - z_start = zone->start; - z_wp_abs = zone->wp; - progress = z_start + z_len; - cond = zone->state; - if (blk_zone_is_cmr(zone)) - flgs |= 0x02; - else if (zone->wp != zone->start) - flgs |= 0x01; /* flag as RWP recommended? 
*/ - spin_unlock_irqrestore(&zone->lock, flags); - - switch (opt & ZBC_REPORT_OPTION_MASK) { - case ZBC_ZONE_REPORTING_OPTION_EMPTY: - if (z_wp_abs != z_start) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN: - if (cond != BLK_ZONE_OPEN) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN: - if (cond != BLK_ZONE_OPEN_EXPLICIT) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_CLOSED: - if (cond != BLK_ZONE_CLOSED) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_FULL: - if (cond != BLK_ZONE_FULL) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_READONLY: - if (cond == BLK_ZONE_READONLY) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_OFFLINE: - if (cond == BLK_ZONE_OFFLINE) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP: - if (z_wp_abs == z_start) - continue; - break; - case ZBC_ZONE_REPORTING_OPTION_NON_WP: - if (cond == BLK_ZONE_NO_WP) + for (iter = 0; entry < max_entries && iter < zi->wps_count; iter++) { + wpdscr = zi->wps[iter]; + if (lba > wpdscr->last_lba) + continue; + + spin_lock_irqsave(&wpdscr->lock, flags); + for (idscr = 0; + entry < max_entries && idscr < wpdscr->zone_count; + idscr++) { + struct bdev_zone_descriptor *dscr; + u64 zoff = idscr * wpdscr->zone_size; + u8 cond, flgs = 0; + + z_len = wpdscr->zone_size; + zoff = idscr * z_len; + z_start = wpdscr->start_lba + zoff; + if (lba >= z_start + z_len) continue; - break; - case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE: - /* this can only be reported by the HW */ - break; - case ZBC_ZONE_REPORTING_OPTION_ALL: - default: - break; - } - /* if same code only applies to returned zones */ - if (opt & ZBC_REPORT_ZONE_PARTIAL) { - if (clen != ~0ul) { - clen = z_len; + zone = &wpdscr->zones[idscr]; + if (blk_zone_is_cmr(zone)) + z_wp_abs = z_start + wpdscr->zone_size; + else + z_wp_abs = z_start + zone->wp; + + switch (opt & ZBC_REPORT_OPTION_MASK) { + case ZBC_ZONE_REPORTING_OPTION_EMPTY: + if (z_wp_abs != z_start) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN: + if (zone->state != BLK_ZONE_OPEN) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN: + if (zone->state != BLK_ZONE_OPEN_EXPLICIT) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_CLOSED: + if (zone->state != BLK_ZONE_CLOSED) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_FULL: + if (zone->state != BLK_ZONE_FULL) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_READONLY: + if (zone->state == BLK_ZONE_READONLY) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_OFFLINE: + if (zone->state == BLK_ZONE_OFFLINE) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP: + if (z_wp_abs == z_start) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_NON_WP: + if (zone->state == BLK_ZONE_NO_WP) + continue; + break; + case ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE: + /* this can only be reported by the HW */ + break; + case ZBC_ZONE_REPORTING_OPTION_ALL: + default: + break; + } + + /* if same code only applies to returned zones */ + if (opt & ZBC_REPORT_ZONE_PARTIAL) { + if (clen != ~0ul) { + clen = z_len; + ctype = zone->type; + } + if (z_len != clen) + len_diffs++; + if (zone->type != ctype) + type_diffs++; ctype = zone->type; } - if (z_len != clen) - len_diffs++; - if (zone->type != ctype) - type_diffs++; - ctype = zone->type; - } + progress = z_start + z_len; - /* shift to device units */ - z_start >>= ilog2(sdkp->device->sector_size) - 9; - z_len >>= ilog2(sdkp->device->sector_size) - 9; - z_wp_abs >>= ilog2(sdkp->device->sector_size) - 
9; + if (!bzd) { + if (bzrpt) + bzrpt->descriptor_count = + cpu_to_be32(++entry); + continue; + } - if (!bzd) { + /* shift to device units */ + z_start >>= ilog2(sdp->sector_size) - 9; + z_len >>= ilog2(sdp->sector_size) - 9; + z_wp_abs >>= ilog2(sdp->sector_size) - 9; + + cond = zone->state; + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + flgs |= 0x02; + else if (zone->wp) + flgs |= 0x01; /* flag as RWP recommended? */ + + dscr = &bzd[entry]; + dscr->lba_start = cpu_to_be64(z_start); + dscr->length = cpu_to_be64(z_len); + dscr->lba_wptr = cpu_to_be64(z_wp_abs); + dscr->type = zone->type; + dscr->flags = cond << 4 | flgs; + entry++; if (bzrpt) - bzrpt->descriptor_count = - cpu_to_be32(++entry); - continue; + bzrpt->descriptor_count = cpu_to_be32(entry); } - - bzd[entry].lba_start = cpu_to_be64(z_start); - bzd[entry].length = cpu_to_be64(z_len); - bzd[entry].lba_wptr = cpu_to_be64(z_wp_abs); - bzd[entry].type = zone->type; - bzd[entry].flags = cond << 4 | flgs; - entry++; - if (bzrpt) - bzrpt->descriptor_count = cpu_to_be32(entry); + spin_unlock_irqrestore(&wpdscr->lock, flags); } /* if same code applies to all zones */ if (bzrpt && !(opt & ZBC_REPORT_ZONE_PARTIAL)) { - for (node = rb_first(&q->zones); node; node = rb_next(node)) { - zone = rb_entry(node, struct blk_zone, node); - - spin_lock_irqsave(&zone->lock, flags); - if (clen != ~0ul) { - clen = zone->len; + for (iter = 0; iter < zi->wps_count; iter++) { + wpdscr = zi->wps[iter]; + spin_lock_irqsave(&wpdscr->lock, flags); + for (idscr = 0; idscr < wpdscr->zone_count; idscr++) { + z_len = wpdscr->zone_size; + zone = &wpdscr->zones[idscr]; + if (clen != ~0ul) { + clen = z_len; + ctype = zone->type; + } + if (z_len != clen) + len_diffs++; + if (zone->type != ctype) + type_diffs++; ctype = zone->type; } - if (zone->len != clen) - len_diffs++; - if (zone->type != ctype) - type_diffs++; - ctype = zone->type; - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(&wpdscr->lock, flags); } } @@ -985,12 +913,15 @@ out: int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, sector_t sector, unsigned int *num_sectors) { + struct request_queue *q = sdkp->disk->queue; struct blk_zone *zone; + sector_t z_start, z_len; + spinlock_t *lck; unsigned int sectors = *num_sectors; int ret = BLKPREP_OK; unsigned long flags; - zone = blk_lookup_zone(sdkp->disk->queue, sector); + zone = blk_lookup_zone(q, sector, &z_start, &z_len, &lck); if (!zone) { /* Might happen during zone initialization */ sd_zbc_debug_ratelimit(sdkp, @@ -999,7 +930,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, return BLKPREP_OK; } - spin_lock_irqsave(&zone->lock, flags); + spin_lock_irqsave(lck, flags); if (blk_zone_is_cmr(zone)) goto out; @@ -1008,7 +939,7 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, zone->state == BLK_ZONE_BUSY) { sd_zbc_debug_ratelimit(sdkp, "zone %zu state %x, deferring\n", - zone->start, zone->state); + z_start, zone->state); ret = BLKPREP_DEFER; goto out; } @@ -1017,25 +948,22 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, if (op_is_write(req_op(rq))) { u64 nwp = sector + sectors; - while (nwp > (zone->start + zone->len)) { - struct rb_node *node = rb_next(&zone->node); + while (nwp > (z_start + z_len)) { + zone->wp = z_len; + sector = z_start + z_len; + sectors = nwp - sector; + spin_unlock_irqrestore(lck, flags); - zone->wp = zone->start + zone->len; - sector = zone->wp; - sectors = nwp - zone->wp; - spin_unlock_irqrestore(&zone->lock, 
flags); - - if (!node) - return BLKPREP_OK; - zone = rb_entry(node, struct blk_zone, node); + zone = blk_lookup_zone(q, sector, + &z_start, &z_len, &lck); if (!zone) return BLKPREP_OK; - spin_lock_irqsave(&zone->lock, flags); + spin_lock_irqsave(lck, flags); nwp = sector + sectors; } - if (nwp > zone->wp) - zone->wp = nwp; + if (nwp > z_start + zone->wp) + zone->wp = nwp - z_start; } goto out; } @@ -1044,37 +972,37 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, /* let the drive fail the command */ sd_zbc_debug_ratelimit(sdkp, "zone %zu offline\n", - zone->start); + z_start); goto out; } if (op_is_write(req_op(rq))) { if (zone->state == BLK_ZONE_READONLY) goto out; - if (blk_zone_is_full(zone)) { + if (zone->wp == z_len) { sd_zbc_debug(sdkp, - "Write to full zone %zu/%zu\n", - sector, zone->wp); + "Write to full zone %zu/%zu/%zu\n", + sector, (sector_t)zone->wp, z_len); ret = BLKPREP_KILL; goto out; } - if (zone->wp != sector) { + if (sector != (z_start + zone->wp)) { sd_zbc_debug(sdkp, "Misaligned write %zu/%zu\n", - sector, zone->wp); + sector, z_start + zone->wp); ret = BLKPREP_KILL; goto out; } zone->wp += sectors; - } else if (zone->wp <= sector + sectors) { - if (zone->wp <= sector) { + } else if (z_start + zone->wp <= sector + sectors) { + if (z_start + zone->wp <= sector) { /* Read beyond WP: clear request buffer */ struct req_iterator iter; struct bio_vec bvec; void *buf; sd_zbc_debug(sdkp, "Read beyond wp %zu+%u/%zu\n", - sector, sectors, zone->wp); + sector, sectors, z_start + zone->wp); rq_for_each_segment(bvec, rq, iter) { buf = bvec_kmap_irq(&bvec, &flags); memset(buf, 0, bvec.bv_len); @@ -1085,15 +1013,15 @@ int sd_zbc_setup_read_write(struct scsi_disk *sdkp, struct request *rq, goto out; } /* Read straddle WP position: limit request size */ - *num_sectors = zone->wp - sector; + *num_sectors = z_start + zone->wp - sector; sd_zbc_debug(sdkp, "Read straddle wp %zu+%u/%zu => %zu+%u\n", - sector, sectors, zone->wp, + sector, sectors, z_start + zone->wp, sector, *num_sectors); } out: - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(lck, flags); return ret; } @@ -1145,21 +1073,22 @@ static void update_zones_from_report(struct scsi_cmnd *cmd, u32 nr_bytes) struct bdev_zone_descriptor *entry = &bzde[iter]; sector_t s = get_start_from_desc(sdkp, entry); sector_t z_len = get_len_from_desc(sdkp, entry); + sector_t z_strt; + spinlock_t *lck; unsigned long flags; if (!z_len) goto done; - zone = blk_lookup_zone(rq->q, s); + zone = blk_lookup_zone(rq->q, s, &z_strt, &z_len, &lck); if (!zone) goto done; - spin_lock_irqsave(&zone->lock, flags); + spin_lock_irqsave(lck, flags); zone->type = entry->type & 0xF; zone->state = (entry->flags >> 4) & 0xF; zone->wp = get_wp_from_desc(sdkp, entry); - zone->len = z_len; - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock_irqrestore(lck, flags); } nread += len; if (!dmax) @@ -1233,113 +1162,314 @@ void sd_zbc_uninit_command(struct scsi_cmnd *cmd) } /** - * sd_zbc_init - Load zones of matching zlen size into rb tree. + * alloc_cpws() - Allocate space for a contiguous set of write pointers + * @items: Number of wps needed. + * @lba: lba of the start of the next zone. + * @z_start: Starting lba of this contiguous set. + * @z_size: Size of each zone this contiguous set. * + * Return: Allocated wps or NULL on error. 
*/ -static int sd_zbc_init(struct scsi_disk *sdkp, u64 zlen, char *buf, int buf_len) +static struct contiguous_wps *alloc_cpws(int items, u64 lba, u64 z_start, + u64 z_size) { - sector_t capacity = logical_to_sectors(sdkp->device, sdkp->capacity); - sector_t last_sector; + struct contiguous_wps *cwps = NULL; + size_t sz; - if (test_and_set_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags)) { - sdev_printk(KERN_WARNING, sdkp->device, - "zone initialization already running\n"); - return 0; + sz = sizeof(struct contiguous_wps) + (items * sizeof(struct blk_zone)); + if (items) { + cwps = vzalloc(sz); + if (!cwps) + goto out; + spin_lock_init(&cwps->lock); + cwps->start_lba = z_start; + cwps->last_lba = lba - 1; + cwps->zone_size = z_size; + cwps->is_zoned = items > 1 ? 1 : 0; + cwps->zone_count = items; } - if (!sdkp->zone_work_q) { - char wq_name[32]; +out: + return cwps; +} - sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name); - sdkp->zone_work_q = create_singlethread_workqueue(wq_name); - if (!sdkp->zone_work_q) { - sdev_printk(KERN_WARNING, sdkp->device, - "create zoned disk workqueue failed\n"); - return -ENOMEM; +/** + * free_zone_wps() - Free up memory in use by wps + * @zi: zone wps array(s). + */ +static void free_zone_wps(struct zone_wps *zi) +{ + /* on error free the arrays */ + if (zi && zi->wps) { + int ca; + + for (ca = 0; ca < zi->wps_count; ca++) { + if (zi->wps[ca]) { + vfree(zi->wps[ca]); + zi->wps[ca] = NULL; + } } - } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { - drain_workqueue(sdkp->zone_work_q); - clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags); + kfree(zi->wps); } +} - last_sector = zbc_parse_zones(sdkp, zlen, buf, buf_len); - capacity = logical_to_sectors(sdkp->device, sdkp->capacity); - if (last_sector != -1 && last_sector < capacity) { - sd_zbc_update_zones(sdkp, last_sector, - SD_ZBC_BUF_SIZE, SD_ZBC_INIT); - } else - clear_bit(SD_ZBC_ZONE_INIT, &sdkp->zone_flags); +static int wps_realloc(struct zone_wps *zi, gfp_t gfp_mask) +{ + int rcode = 0; + struct contiguous_wps **old; + struct contiguous_wps **tmp; + int n = zi->wps_count * 2; + + old = zi->wps; + tmp = kzalloc(n, sizeof(*zi->wps), gfp_mask); + if (!tmp) { + rcode = -ENOMEM; + goto out; + } + memcpy(tmp, zi->wps, zi->wps_count * sizeof(*zi->wps)); + zi->wps = tmp; + kfree(old); - return 0; +out: + return rcode; } +#define FMT_CHANGING_CAPACITY "Changing capacity from %zu to Max LBA+1 %zu" + /** - * sd_zbc_config() - Configure a ZBC device (on attach) - * @sdkp: SCSI disk being attached. - * @buffer: Buffer to working data. - * @buf_sz: Size of buffer to use for working data + * zbc_init_zones() - Re-Sync expected WP location with drive + * @sdkp: scsi_disk + * @gfp_mask: Allocation mask. * - * Return: true of SD_ZBC_RESET_WP provisioning is supported + * Return: 0 on success, otherwise error. 
*/ -bool sd_zbc_config(struct scsi_disk *sdkp, void *buffer, size_t buf_sz) +int zbc_init_zones(struct scsi_disk *sdkp, gfp_t gfp_mask) { - struct bdev_zone_report *bzrpt = buffer; - u64 zone_len, lba; - int retval; - u32 rep_len; - u8 same; + struct request_queue *q = sdkp->disk->queue; + int rcode = 0; + int entry = 0; + int offset; + int offmax; + u64 iter; + u64 z_start = 0ul; + u64 z_size = 0; /* size of zone */ + int z_count = 0; /* number of zones of z_size */ + int do_fill = 0; + int array_count = 0; + int one_time_setup = 0; + u8 opt = ZBC_ZONE_REPORTING_OPTION_ALL; + size_t bufsz = SD_ZBC_BUF_SIZE; + struct bdev_zone_report *rpt = NULL; + struct zone_wps *zi = NULL; + struct contiguous_wps *cwps = NULL; + + if (q->zones) + goto out; - if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC) - /* - * Device managed or normal SCSI disk, - * no special handling required - */ - return false; - - retval = sd_zbc_report_zones(sdkp, bzrpt, buf_sz, - 0, ZBC_ZONE_REPORTING_OPTION_ALL, false); - if (retval < 0) - return false; - - rep_len = be32_to_cpu(bzrpt->descriptor_count); - if (rep_len < 7) { - sd_printk(KERN_WARNING, sdkp, - "REPORT ZONES report invalid length %u\n", - rep_len); - return false; + zi = kzalloc(sizeof(*zi), gfp_mask); + if (!zi) { + rcode = -ENOMEM; + goto out; } - if (sdkp->rc_basis == 0) { - /* The max_lba field is the capacity of a zoned device */ - lba = be64_to_cpu(bzrpt->maximum_lba); - if (lba + 1 > sdkp->capacity) { - if (sdkp->first_scan) - sd_printk(KERN_WARNING, sdkp, - "Changing capacity from %zu to Max LBA+1 %zu\n", - sdkp->capacity, (sector_t) lba + 1); - sdkp->capacity = lba + 1; + if (sdkp->zoned != 1 && sdkp->device->type != TYPE_ZBC) { + struct gendisk *disk = sdkp->disk; + + zi->wps = kzalloc(sizeof(*zi->wps), gfp_mask); + zi->wps[0] = alloc_cpws(1, disk->part0.nr_sects, z_start, 1); + if (!zi->wps[0]) { + rcode = -ENOMEM; + goto out; } + zi->wps_count = 1; + goto out; + } + + rpt = kmalloc(bufsz, gfp_mask); + if (!rpt) { + rcode = -ENOMEM; + goto out; } /* - * Adjust 'chunk_sectors' to the zone length if the device - * supports equal zone sizes. + * Start by handling upto 32 different zone sizes. 2 will work + * for all the current drives, but maybe something exotic will + * surface. 
*/ - same = bzrpt->same_field & 0x0f; - if (same > 3) { - sd_printk(KERN_WARNING, sdkp, - "REPORT ZONES SAME type %d not supported\n", same); - return false; + zi->wps = kcalloc(32, sizeof(*zi->wps), gfp_mask); + zi->wps_count = 32; + if (!zi->wps) { + rcode = -ENOMEM; + goto out; } - /* Read the zone length from the first zone descriptor */ - zone_len = be64_to_cpu(bzrpt->descriptors[0].length); - sdkp->unmap_alignment = zone_len; - sdkp->unmap_granularity = zone_len; - blk_queue_chunk_sectors(sdkp->disk->queue, - logical_to_sectors(sdkp->device, zone_len)); - - sd_zbc_init(sdkp, zone_len, buffer, buf_sz); - return true; + +fill: + offset = 0; + offmax = 0; + for (entry = 0, iter = 0; iter < sdkp->capacity; entry++) { + struct bdev_zone_descriptor *bzde; + int stop_end = 0; + int stop_size = 0; + + if (offset == 0) { + int err; + + err = sd_zbc_report_zones(sdkp, rpt, bufsz, iter, opt); + if (err) { + pr_err("report zones-> %d\n", err); + if (err != -ENOTSUPP) + rcode = err; + goto out; + } + if (sdkp->rc_basis == 0) { + sector_t lba = be64_to_cpu(rpt->maximum_lba); + + if (lba + 1 > sdkp->capacity) { + sd_printk(KERN_WARNING, sdkp, + FMT_CHANGING_CAPACITY "\n", + sdkp->capacity, lba + 1); + sdkp->capacity = lba + 1; + } + } + offmax = max_report_entries(bufsz); + } + bzde = &rpt->descriptors[offset]; + if (z_size == 0) + z_size = get_len_from_desc(sdkp, bzde); + if (z_size != get_len_from_desc(sdkp, bzde)) + stop_size = 1; + if ((iter + z_size) >= sdkp->capacity) + stop_end = 1; + + if (!one_time_setup) { + u8 type = bzde->type & 0x0F; + + if (type != BLK_ZONE_TYPE_CONVENTIONAL) { + one_time_setup = 1; + blk_queue_chunk_sectors(sdkp->disk->queue, + z_size); + } + } + + if (do_fill == 0) { + if (stop_end || stop_size) { + /* include the next/last zone? */ + if (!stop_size) { + z_count++; + iter += z_size; + } + cwps = alloc_cpws(z_count, iter, + z_start, z_size); + if (!cwps) { + rcode = -ENOMEM; + goto out; + } + if (array_count > 0) + cwps->is_zoned = 1; + + zi->wps[array_count] = cwps; + z_start = iter; + z_size = 0; + z_count = 0; + array_count++; + if (array_count >= zi->wps_count) { + rcode = wps_realloc(zi, gfp_mask); + if (rcode) + goto out; + } + /* add the runt zone */ + if (stop_end && stop_size) { + z_count++; + z_size = get_len_from_desc(sdkp, bzde); + cwps = alloc_cpws(z_count, + iter + z_size, + z_start, z_size); + if (!cwps) { + rcode = -ENOMEM; + goto out; + } + if (array_count > 0) + cwps->is_zoned = 1; + zi->wps[array_count] = cwps; + array_count++; + } + if (stop_end) { + do_fill = 1; + array_count = 0; + z_count = 0; + z_size = 0; + goto fill; + } + } + z_size = get_len_from_desc(sdkp, bzde); + iter += z_size; + z_count++; + } else { + fill_zone(zi->wps[array_count], z_count, sdkp, bzde); + z_count++; + iter += z_size; + if (zi->wps[array_count]->zone_count == z_count) { + z_count = 0; + array_count++; + zi->wps_count = array_count; + } + } + offset++; + if (offset >= offmax) + offset = 0; + } +out: + kfree(rpt); + + if (rcode) { + if (zi) { + free_zone_wps(zi); + kfree(zi); + } + } else { + q->zones = zi; + } + + return rcode; +} + +/** + * sd_zbc_config() - Configure a ZBC device (on attach) + * @sdkp: SCSI disk being attached. 
+ * @gfp_mask: Memory allocation strategy + * + * Return: true of SD_ZBC_RESET_WP provisioning is supported + */ +bool sd_zbc_config(struct scsi_disk *sdkp, gfp_t gfp_mask) +{ + bool can_reset_wp = false; + + if (zbc_init_zones(sdkp, gfp_mask)) { + sdev_printk(KERN_WARNING, sdkp->device, + "Initialize zone cache failed\n"); + goto out; + } + + if (sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC) + can_reset_wp = true; + + if (!sdkp->zone_work_q) { + char wq_name[32]; + + sprintf(wq_name, "zbc_wq_%s", sdkp->disk->disk_name); + sdkp->zone_work_q = create_singlethread_workqueue(wq_name); + if (!sdkp->zone_work_q) { + sdev_printk(KERN_WARNING, sdkp->device, + "create zoned disk workqueue failed\n"); + goto out; + } + } else if (!test_and_set_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags)) { + drain_workqueue(sdkp->zone_work_q); + clear_bit(SD_ZBC_ZONE_RESET, &sdkp->zone_flags); + } + +out: + return can_reset_wp; } /** @@ -1365,15 +1495,16 @@ void sd_zbc_remove(struct scsi_disk *sdkp) */ unsigned int sd_zbc_discard_granularity(struct scsi_disk *sdkp) { - unsigned int bytes = 1; struct request_queue *q = sdkp->disk->queue; - struct rb_node *node = rb_first(&q->zones); + struct zone_wps *zi = q->zones; + unsigned int bytes = 1; - if (node) { - struct blk_zone *zone = rb_entry(node, struct blk_zone, node); + if (zi && zi->wps_count > 0) { + struct contiguous_wps *wp = zi->wps[0]; - bytes = zone->len; + bytes = wp->zone_size; } + bytes <<= ilog2(sdkp->device->sector_size); return bytes; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5cdb5d..113c5a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -264,27 +264,83 @@ struct blk_queue_tag { #ifdef CONFIG_BLK_DEV_ZONED +/** + * struct blk_zone - A single zone type/stats and WP offset. + * + * @wp: Holds the wp offset from the start of the zone. + * @type: Holds the zone type nibble. + * @state: Holds the zone state nibble + kernel (zone busy) + * @private_data: Used to hold whatever the implicit domain owner + * of the zone needs to track. + * + * Type is left at 4 bits (only 2 are needed currently) to match + * the current ZBC/ZAC standards. + * + * State is using 5 bits to accommodate the ZONE_BUSY. The first 4 bits + * match the current ZBC/ZAC spec. + * ZONE_BUSY could be mapped to one of the reserved bits. Using it as + * mask bit or independent flag my be useful for decoding the zone + * state before it transitioned to BUSY. + * + * A zone sized at order (39+9) is very unlikely (current zones are 16+9) + * Even at lba48 equivalent number of sectors we have a large amount + * of padding to fill out 8 bytes. + * + * Getting this to fit in 4 bytes would limit the maximum size of a zone + * to 4G [order 23 of 512 byte sectors + 9 bits for flags] which is probably + * okay for embedded or 32-bit systems where the private_data pointer + * would also shrink to 32 bits. There are also WP tracking schemes + * that don't make use of the private_data helper so perhaps that + * could be factored out as well. + */ struct blk_zone { - struct rb_node node; - spinlock_t lock; - sector_t start; - size_t len; - sector_t wp; - enum blk_zone_type type; - enum blk_zone_state state; + unsigned long long wp:39; + unsigned long long type:4; + unsigned long long state:5; + unsigned long long padding:15; void *private_data; }; +/** + * struct contiguous_wps - A descriptor of zones of the same size + * + * @start_lba: LBA of first zone covered by the descriptor. + * @last_lba: LBA of last zone. 
+ * @zone_size: Size of zones as a number of 512 byte sectors. + * @zone_count: Number of zones (last-start/size) for convenience. + * @lock: A spinlock protecting these zones. + * @is_zoned: 0 when all zones are conventional no WP zones. + * zones: Array of blk_zone entries. + */ +struct contiguous_wps { + u64 start_lba; + u64 last_lba; + u64 zone_size; + u32 zone_count; + spinlock_t lock; + unsigned is_zoned:1; + struct blk_zone zones[0]; +}; + +/** + * struct zone_wps - A collection of zone descriptors to describe zoned media. + * + * @wps_count: Number of descriptors. + * @wps: Array of zone descriptors. + */ +struct zone_wps { + u32 wps_count; + struct contiguous_wps **wps; +}; + #define blk_zone_is_seq_req(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_REQ) #define blk_zone_is_seq_pref(z) ((z)->type == BLK_ZONE_TYPE_SEQWRITE_PREF) #define blk_zone_is_smr(z) (blk_zone_is_seq_req(z) || blk_zone_is_seq_pref(z)) #define blk_zone_is_cmr(z) ((z)->type == BLK_ZONE_TYPE_CONVENTIONAL) -#define blk_zone_is_full(z) ((z)->wp == (z)->start + (z)->len) -#define blk_zone_is_empty(z) ((z)->wp == (z)->start) +#define blk_zone_is_empty(z) ((z)->wp == 0) -extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t); -extern struct blk_zone *blk_insert_zone(struct request_queue *, - struct blk_zone *); +extern struct blk_zone *blk_lookup_zone(struct request_queue *, sector_t, + sector_t *, sector_t *, spinlock_t **); extern void blk_drop_zones(struct request_queue *); #else static inline void blk_drop_zones(struct request_queue *q) { }; @@ -463,7 +519,7 @@ struct request_queue { struct queue_limits limits; #ifdef CONFIG_BLK_DEV_ZONED - struct rb_root zones; + struct zone_wps *zones; #endif /* * sg stuff
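As a quick sanity check of the 16-bytes-per-zone figure in the changelog,
here is a small user-space sketch (not kernel code) that mirrors the
bit-field layout of the new struct blk_zone and the
(sector - start_lba) / zone_size indexing used by blk_lookup_zone(). The
group start and zone size below are illustrative assumptions; the exact
size of the mirrored struct depends on the ABI, but on common 64-bit ABIs
it packs to 16 bytes, versus roughly 120 bytes for the old per-zone node
(rb_node + spinlock + start/len/wp plus type, state, and private_data).

#include <stdio.h>
#include <stdint.h>

/* mirror of the proposed struct blk_zone: 39-bit wp offset, 4-bit type,
 * 5-bit state, padding, plus the private_data pointer */
struct zone_mirror {
	unsigned long long wp:39;
	unsigned long long type:4;
	unsigned long long state:5;
	unsigned long long padding:15;
	void *private_data;
};

int main(void)
{
	uint64_t start_lba = 1 << 22;	/* illustrative group start */
	uint64_t zone_size = 1 << 19;	/* 256 MiB zones in 512B sectors */
	uint64_t sector = start_lba + 3 * zone_size + 42;
	/* the same arithmetic blk_lookup_zone() uses within one group */
	uint64_t index = (sector - start_lba) / zone_size;
	uint64_t zone_start = start_lba + index * zone_size;

	printf("sizeof(struct zone_mirror) = %zu bytes\n",
	       sizeof(struct zone_mirror));
	printf("sector %llu -> zone index %llu, zone start %llu\n",
	       (unsigned long long)sector, (unsigned long long)index,
	       (unsigned long long)zone_start);
	return 0;
}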