new file mode 100644
@@ -0,0 +1,75 @@
+Overview of Host Aware ZBC/ZAC Device Mapper
+ - Zone size (256MiB)
+ - Reset WP, Open Zone, Close Zone, Get Zone Info ...
+
+The Zoned DM treats a zoned device as a collection of 256GiB groups
+of 1024 zones, referred to internally as 'megazones'. The last
+megazone may contain fewer than 1024 zones. If the final zone on the
+device is of a different size it is ignored.
+
+This means that drives which report SAME=0 are not supported and are
+unlikely to be supported within this architecture. However, drives
+which report a SAME code of 'all same', 'last differs' or 'same
+length, different types' would all be supported by this architecture.
+
+The initial implementation focuses on drives with same sized zones of
+256MiB, which is 65536 4k blocks per zone. In future the 256MiB zone
+size will be relaxed to allow any zone size, as long as all zones are
+the same size.
+
+Internally all addressing is on 4k boundaries. Currently a 4k PAGE_SIZE is
+assumed. Architectures with 8k (or other) PAGE_SIZE have not been tested
+and are likely broken at the moment.
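+
+For illustration, the geometry above works out as in the sketch below
+(the names are illustrative only and are not the constants used by the
+driver):
+
+    /* Illustrative geometry: 256MiB zones, 4k blocks, 1024-zone megazones. */
+    #define ZDM_BLK_SZ        4096ul    /* internal block size            */
+    #define ZDM_ZONE_BLOCKS   65536ul   /* 256MiB / 4k = blocks per zone  */
+    #define ZDM_MZ_ZONES      1024ul    /* zones per megazone (256GiB)    */
+
+    /* Map a 4k block address to its zone and megazone index. */
+    static inline unsigned long zdm_zone_of(unsigned long blk)
+    {
+            return blk / ZDM_ZONE_BLOCKS;
+    }
+
+    static inline unsigned long zdm_megazone_of(unsigned long blk)
+    {
+            return zdm_zone_of(blk) / ZDM_MZ_ZONES;
+    }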
+
+Host Managed drives should work if the zone type at the start of the partition
+is Conventional, or Preferred.
+
+
+Megazones:
+    Each megazone is managed independently and is partitioned into
+    meta data and data. The Device Mapper meta data is logically
+    located in the 2nd and 3rd zones of the megazone. The 1st and
+    2nd zones are reserved for the megazone's minimally relocatable
+    super block, which must be the first block of the first or
+    second zone. The most recent super block is determined by the
+    generation number embedded in it. The meta data is sized for
+    two zones and is logically located in sectors that are not
+    addressable by the upper device. The actual storage of the meta
+    data is pooled with the data using the same mapping scheme.
+    Internally the device mapper is a COW device with a 4k per-block
+    addressing scheme. There are some fix-ups to handle non-4k aligned
+    requests from applications which read and write in 512 byte
+    blocks; it is still desirable to submit patches to those
+    subsystems, assuming the respective maintainers are willing to
+    accept 4k alignment patches.
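+
+    As a sketch of the super block selection described above (the
+    structure and helper below are illustrative only, not the actual
+    on-disk format):
+
+        struct zdm_sb_example {
+                u64 generation;
+                /* ... remaining super block fields ... */
+        };
+
+        /* Pick the most recent of the two candidate copies. */
+        static struct zdm_sb_example *
+        pick_latest_sb(struct zdm_sb_example *a, struct zdm_sb_example *b)
+        {
+                if (!a)
+                        return b;
+                if (!b)
+                        return a;
+                return (a->generation >= b->generation) ? a : b;
+        }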
+
+Address space:
+    The zoned device mapper presents a smaller block device than
+    the space available on the physical media. The extra space is
+    used to hold the meta data needed for managing the stored data
+    and performing COW block [re]mapping.
+    The 'shrink' is done by appropriately sizing the device via
+    dmsetup. The zdmadm utility will detect and size the device
+    appropriately.
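+
+    Roughly, the size presented to upper layers is the raw capacity
+    minus the per-megazone meta data reservation (a sketch only; the
+    in-tree calculation is get_dev_size() in dm-zoned.c, and the names
+    below are illustrative):
+
+        static u64 zdm_presented_bytes(u64 raw_bytes, u64 megazones,
+                                       u64 resv_zones_per_mz, u64 zone_bytes)
+        {
+                return raw_bytes - (megazones * resv_zones_per_mz * zone_bytes);
+        }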
+
+Map Strategy:
+ Map incoming sector onto device mapper sector space.
+
+Read Strategy:
+ Check each block requested in the bio to determine if the data
+ blocks are consecutively stored on disk. Pass as much per-bio
+ as possible through to the backing block device.
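+
+    In outline, the read path extends a run while the mapped backing
+    blocks remain consecutive (a sketch only; the in-tree version is
+    zm_read_bios() in dm-zoned.c, and the names below are illustrative):
+
+        typedef u64 (*zdm_lookup_fn)(void *ctx, u64 tlba);
+
+        /* Count logical blocks whose mappings are consecutive on disk. */
+        static u32 zdm_run_length(void *ctx, zdm_lookup_fn lookup,
+                                  u64 tlba, u32 blks)
+        {
+                u64 first = lookup(ctx, tlba);
+                u32 count;
+
+                for (count = 1; count < blks; count++)
+                        if (lookup(ctx, tlba + count) != first + count)
+                                break;
+                return count;
+        }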
+
+Write Strategy:
+    Allocate space for the entire bio on the backing block device,
+    redirecting all incoming write requests to the most recently
+    written zone until that zone is filled or the bio is too large
+    to fit, at which point a new zone is opened. Note that if a
+    zone is left unfilled it will likely be used for meta data
+    writes, which are typically single blocks.
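+
+    A minimal sketch of the forward allocation within the current zone
+    (illustrative only; the in-tree allocator is z_acquire(), which
+    also tracks per-zone write pointers and stream ids):
+
+        struct zdm_zone_example {
+                u32 wp;         /* 4k blocks already written  */
+                u32 size;       /* 4k blocks per zone (65536) */
+        };
+
+        /* Grant as many blocks as fit; 0 means open a new zone and retry. */
+        static u32 zdm_alloc_blocks(struct zdm_zone_example *z, u32 want)
+        {
+                u32 room = z->size - z->wp;
+                u32 got = (want < room) ? want : room;
+
+                z->wp += got;
+                return got;
+        }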
+
+Sync Strategy:
+    On SYNC bios all of the meta data needed to restore the zoned
+    device mapper from disk is written to one of the well known
+    zones at the beginning of the megazone. Data is only
+    'guaranteed' to be on-disk and consistent following sync
+    events [same as ext4].
@@ -12182,6 +12182,13 @@ L: zd1211-devs@lists.sourceforge.net (subscribers-only)
S: Maintained
F: drivers/net/wireless/zydas/zd1211rw/
+ZDM ZONED DEVICE MAPPER TARGET
+M: Shaun Tancheff <shaun.tancheff@seagate.com>
+L: dm-devel@redhat.com
+S: Maintained
+F: drivers/md/dm-zoned.*
+F: drivers/md/libzoned.c
+
ZPOOL COMPRESSED PAGE STORAGE API
M: Dan Streetman <ddstreet@ieee.org>
L: linux-mm@kvack.org
@@ -345,6 +345,17 @@ config DM_ERA
over time. Useful for maintaining cache coherency when using
vendor snapshots.
+config DM_ZONED
+ tristate "ZDM: Zoned based device target (EXPERIMENTAL)"
+ depends on BLK_DEV_DM
+ default n
+ select LIBCRC32C
+ ---help---
+	  dm-zoned provides a random access block device on top of a
+	  ZBC/ZAC block device, writing forward within zones and
+	  garbage collecting within zones.
+ Use zdmadm to create, repair and/or restore ZDM instances.
+
config DM_MIRROR
tristate "Mirror target"
depends on BLK_DEV_DM
@@ -60,6 +60,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
+obj-$(CONFIG_DM_ZONED) += dm-zoned.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
new file mode 100644
@@ -0,0 +1,2535 @@
+/*
+ * Kernel Device Mapper for abstracting ZAC/ZBC devices as normal
+ * block devices for linux file systems.
+ *
+ * Copyright (C) 2015 Seagate Technology PLC
+ *
+ * Written by:
+ * Shaun Tancheff <shaun.tancheff@seagate.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include "dm.h"
+#include <linux/dm-io.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/crc32c.h>
+#include <linux/crc16.h>
+#include <linux/sort.h>
+#include <linux/ctype.h>
+#include <linux/types.h>
+#include <linux/blkzoned_api.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kfifo.h>
+#include <linux/bsearch.h>
+#include "dm-zoned.h"
+
+#define PRIu64 "llu"
+#define PRIx64 "llx"
+#define PRId32 "d"
+#define PRIx32 "x"
+#define PRIu32 "u"
+
+#define BIOSET_RESV 4
+
+/**
+ * _zdisk() - Return a pretty ZDM name.
+ * @znd: ZDM Instance
+ *
+ * Return: ZDM/backing device pretty name.
+ */
+static inline char *_zdisk(struct zoned *znd)
+{
+ return znd->bdev_name;
+}
+
+#define Z_ERR(znd, fmt, arg...) \
+ pr_err("dm-zoned(%s): " fmt "\n", _zdisk(znd), ## arg)
+
+#define Z_INFO(znd, fmt, arg...) \
+ pr_info("dm-zoned(%s): " fmt "\n", _zdisk(znd), ## arg)
+
+#define Z_DBG(znd, fmt, arg...) \
+ pr_debug("dm-zoned(%s): " fmt "\n", _zdisk(znd), ## arg)
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+static void do_io_work(struct work_struct *work);
+static int block_io(struct zoned *, enum dm_io_mem_type, void *, sector_t,
+ unsigned int, int, int);
+static int znd_async_io(struct zoned *znd,
+ enum dm_io_mem_type dtype,
+ void *data,
+ sector_t block, unsigned int nDMsect, int rw, int queue,
+ io_notify_fn callback, void *context);
+static int zoned_bio(struct zoned *znd, struct bio *bio);
+static int zoned_map_write(struct zoned *znd, struct bio*, u64 s_zdm);
+static sector_t get_dev_size(struct dm_target *ti);
+static int dmz_reset_wp(struct zoned *znd, u64 z_id);
+static int dmz_open_zone(struct zoned *znd, u64 z_id);
+static int dmz_close_zone(struct zoned *znd, u64 z_id);
+static u32 dmz_report_count(struct zoned *znd, void *report, size_t bufsz);
+static int dmz_report_zones(struct zoned *znd, u64 z_id,
+ struct page *pgs, size_t bufsz);
+static void activity_timeout(unsigned long data);
+static void zoned_destroy(struct zoned *);
+static int gc_can_cherrypick(struct zoned *znd, u32 sid, int delay, gfp_t gfp);
+static void bg_work_task(struct work_struct *work);
+static void on_timeout_activity(struct zoned *znd, int delay);
+static int zdm_create_proc_entries(struct zoned *znd);
+static void zdm_remove_proc_entries(struct zoned *znd);
+
+/**
+ * get_bdev_bd_inode() - Get primary backing device inode
+ * @znd: ZDM Instance
+ *
+ * Return: backing device inode
+ */
+static inline struct inode *get_bdev_bd_inode(struct zoned *znd)
+{
+ return znd->dev->bdev->bd_inode;
+}
+
+#include "libzoned.c"
+
+#define BIO_CACHE_SECTORS (IO_VCACHE_PAGES * Z_BLOCKS_PER_DM_SECTOR)
+
+/**
+ * bio_stream() - Decode stream id from BIO.
+ * @bio: BIO to decode the stream id from.
+ *
+ * Return: stream_id
+ */
+static inline u32 bio_stream(struct bio *bio)
+{
+ u32 stream_id = 0x40;
+
+ /*
+ * Since adding stream id to a BIO is not yet in mainline we just
+ * use this heuristic to try to skip unnecessary co-mingling of data.
+ */
+
+ if (bio->bi_rw & REQ_META)
+ stream_id = 0xff;
+
+ return stream_id;
+}
+
+/**
+ * zoned_map_discard() - Handle a discard (trim) request.
+ * @znd: ZDM Instance
+ * @bio: struct bio holding the discard information
+ * @s_zdm: tlba being discarded.
+ *
+ * Return: 0 on success, otherwise error code.
+ */
+static int zoned_map_discard(struct zoned *znd, struct bio *bio, u64 s_zdm)
+{
+ int rcode = DM_MAPIO_SUBMITTED;
+ u64 blks = bio->bi_iter.bi_size / Z_C4K;
+ int err;
+
+ if (znd->is_empty)
+ goto cleanup_out;
+
+ err = z_mapped_discard(znd, s_zdm, blks, CRIT);
+ if (err < 0) {
+ rcode = err;
+ goto out;
+ }
+
+cleanup_out:
+ bio->bi_iter.bi_sector = 8;
+ bio_endio(bio);
+out:
+ return rcode;
+}
+
+/**
+ * is_non_wp_zone() - Test zone # to see if it flagged as conventional.
+ * @znd: ZDM Instance
+ * @z_id: Zone #
+ *
+ * Return: 1 if conventional zone. 0 if sequential write zone.
+ */
+static int is_non_wp_zone(struct zoned *znd, u64 z_id)
+{
+ u32 gzoff = z_id % 1024;
+ struct meta_pg *wpg = &znd->wp[z_id >> 10];
+ u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+
+ return (wp & Z_WP_NON_SEQ) ? 1 : 0;
+}
+
+/**
+ * struct zone_action - Context for issuing a zone action via a worker.
+ * @work: Work item used to queue the action.
+ * @znd: ZDM Instance.
+ * @s_addr: Device LBA of the start of the zone.
+ * @bi_rw: One of REQ_OPEN_ZONE, REQ_CLOSE_ZONE, or REQ_RESET_ZONE.
+ * @wp_err: Result of the zone action.
+ */
+struct zone_action {
+ struct work_struct work;
+ struct zoned *znd;
+ u64 s_addr;
+ unsigned long bi_rw;
+ int wp_err;
+};
+
+/**
+ * do_zone_action_work() - Issue a 'zone action' to the backing device.
+ * @work: Work to do.
+ */
+static void do_zone_action_work(struct work_struct *work)
+{
+ struct zone_action *za = container_of(work, struct zone_action, work);
+ struct zoned *znd = za->znd;
+ struct block_device *bdev = znd->dev->bdev;
+ const gfp_t gfp = GFP_KERNEL;
+
+ /* Explicitly work on device lbas not partition offsets. */
+ if (bdev != bdev->bd_contains)
+ bdev = bdev->bd_contains;
+
+ za->wp_err = blkdev_issue_zone_action(bdev, za->bi_rw, za->s_addr, gfp);
+}
+
+/**
+ * dmz_zone_action() - Issue a 'zone action' to the backing device (via worker).
+ * @znd: ZDM Instance
+ * @z_id: Zone # to act on.
+ * @rw: One of REQ_OPEN_ZONE, REQ_CLOSE_ZONE, or REQ_RESET_ZONE.
+ *
+ * Return: 0 on success, otherwise error.
+ */
+static int dmz_zone_action(struct zoned *znd, u64 z_id, unsigned long rw)
+{
+ int wp_err = 0;
+
+ if (is_non_wp_zone(znd, z_id))
+ return wp_err;
+
+ if (znd->bdev_is_zoned) {
+ u64 z_offset = zone_to_sector(z_id + znd->zdstart);
+ struct zone_action za = {
+ .znd = znd,
+ .bi_rw = rw,
+ .s_addr = z_offset,
+ .wp_err = 0,
+ };
+ if (znd->ata_passthrough)
+ za.bi_rw |= REQ_META;
+ /*
+ * Issue the synchronous I/O from a different thread
+ * to avoid generic_make_request recursion.
+ */
+ INIT_WORK_ONSTACK(&za.work, do_zone_action_work);
+ queue_work(znd->zone_action_wq, &za.work);
+ flush_workqueue(znd->zone_action_wq);
+ destroy_work_on_stack(&za.work);
+ wp_err = za.wp_err;
+
+ if (wp_err) {
+ Z_ERR(znd, "Zone Cmd: LBA: %" PRIx64
+ " [Z:%" PRIu64 "] -> %d failed.",
+ za.s_addr, z_id, wp_err);
+ Z_ERR(znd, "ZAC/ZBC support disabled.");
+ znd->bdev_is_zoned = 0;
+ wp_err = -ENOTSUPP;
+ }
+ }
+ return wp_err;
+}
+
+/**
+ * dmz_reset_wp() - Reset write pointer for zone z_id.
+ * @znd: ZDM Instance
+ * @z_id: Zone # to reset.
+ *
+ * Return: 0 on success, otherwise error.
+ */
+static int dmz_reset_wp(struct zoned *znd, u64 z_id)
+{
+ return dmz_zone_action(znd, z_id, REQ_RESET_ZONE);
+}
+
+/**
+ * dmz_open_zone() - Open zone for writing.
+ * @znd: ZDM Instance
+ * @z_id: Zone # to open.
+ *
+ * Return: 0 on success, otherwise error.
+ */
+static int dmz_open_zone(struct zoned *znd, u64 z_id)
+{
+ if (!znd->issue_open_zone)
+ return 0;
+ return dmz_zone_action(znd, z_id, REQ_OPEN_ZONE);
+
+}
+
+/**
+ * dmz_close_zone() - Close zone to writing.
+ * @znd: ZDM Instance
+ * @z_id: Zone # to close.
+ *
+ * Return: 0 on success, otherwise error.
+ */
+static int dmz_close_zone(struct zoned *znd, u64 z_id)
+{
+ if (!znd->issue_close_zone)
+ return 0;
+ return dmz_zone_action(znd, z_id, REQ_CLOSE_ZONE);
+}
+
+/**
+ * dmz_report_count() - Read number of zones returned.
+ * @znd: ZDM Instance
+ * @rpt_in: Report data.
+ * @bufsz: size of space allocated for report
+ *
+ * Return: Number of zones in report.
+ */
+static u32 dmz_report_count(struct zoned *znd, void *rpt_in, size_t bufsz)
+{
+ u32 count;
+ u32 max_count = (bufsz - sizeof(struct bdev_zone_report))
+ / sizeof(struct bdev_zone_descriptor);
+
+ if (znd->ata_passthrough) {
+ struct bdev_zone_report_le *report = rpt_in;
+
+ /* ZAC: ata results are little endian */
+ if (max_count > le32_to_cpu(report->descriptor_count))
+ report->descriptor_count = cpu_to_le32(max_count);
+ count = le32_to_cpu(report->descriptor_count);
+ } else {
+ struct bdev_zone_report *report = rpt_in;
+
+ /* ZBC: scsi results are big endian */
+ if (max_count > be32_to_cpu(report->descriptor_count))
+ report->descriptor_count = cpu_to_be32(max_count);
+ count = be32_to_cpu(report->descriptor_count);
+ }
+ return count;
+}
+
+/**
+ * dmz_report_zones() - issue report zones from z_id zones after zdstart
+ * @znd: ZDM Instance
+ * @z_id: Zone past zdstart
+ * @pgs: Page(s) to hold the zone report
+ * @bufsz: Size of the space reserved for the report
+ *
+ * Return: -ENOTSUPP or 0 on success
+ */
+static int dmz_report_zones(struct zoned *znd, u64 z_id,
+ struct page *pgs, size_t bufsz)
+{
+ int wp_err = -ENOTSUPP;
+
+ if (znd->bdev_is_zoned) {
+ u8 opt = ZOPT_NON_SEQ_AND_RESET;
+ unsigned long bi_rw = 0;
+ struct block_device *bdev = znd->dev->bdev;
+ u64 s_addr = zone_to_sector(z_id + znd->zdstart);
+
+ if (bdev != bdev->bd_contains)
+ s_addr -= znd->start_sect << Z_SHFT4K;
+
+ if (znd->ata_passthrough)
+ bi_rw = REQ_META;
+
+ wp_err = blkdev_issue_zone_report(bdev, bi_rw, s_addr, opt,
+ pgs, bufsz, GFP_KERNEL);
+ if (wp_err) {
+ Z_ERR(znd, "Report Zones: LBA: %" PRIx64
+ " [Z:%" PRIu64 " -> %d failed.",
+ s_addr, z_id + znd->zdstart, wp_err);
+ Z_ERR(znd, "ZAC/ZBC support disabled.");
+ znd->bdev_is_zoned = 0;
+ wp_err = -ENOTSUPP;
+ }
+ }
+ return wp_err;
+}
+
+/**
+ * is_conventional() - Determine if zone is conventional.
+ * @dentry: Zone descriptor entry.
+ *
+ * Return: 1 if zone type is conventional.
+ */
+static inline int is_conventional(struct bdev_zone_descriptor *dentry)
+{
+ return (ZTYP_CONVENTIONAL == (dentry->type & 0x0F)) ? 1 : 0;
+}
+
+/**
+ * is_zone_reset() - Determine if zone is reset / ready for writing.
+ * @dentry: Zone descriptor entry.
+ *
+ * Return: 1 if zone condition is empty or zone type is conventional.
+ */
+static inline int is_zone_reset(struct bdev_zone_descriptor *dentry)
+{
+ u8 type = dentry->type & 0x0F;
+ u8 cond = (dentry->flags & 0xF0) >> 4;
+
+ return (ZCOND_ZC1_EMPTY == cond || ZTYP_CONVENTIONAL == type) ? 1 : 0;
+}
+
+/**
+ * get_wp_from_descriptor() - Decode write pointer as # of blocks from start
+ * @znd: ZDM Instance
+ * @dentry_in: Zone descriptor entry.
+ *
+ * Return: Write Pointer as number of blocks from start of zone.
+ */
+static inline u32 get_wp_from_descriptor(struct zoned *znd, void *dentry_in)
+{
+ u32 wp = 0;
+
+ /*
+ * If ATA passthrough was used then ZAC results are little endian.
+ * otherwise ZBC results are big endian.
+ */
+
+ if (znd->ata_passthrough) {
+ struct bdev_zone_descriptor_le *lil = dentry_in;
+
+ wp = le64_to_cpu(lil->lba_wptr) - le64_to_cpu(lil->lba_start);
+ } else {
+ struct bdev_zone_descriptor *big = dentry_in;
+
+ wp = be64_to_cpu(big->lba_wptr) - be64_to_cpu(big->lba_start);
+ }
+ return wp;
+}
+
+/**
+ * _dec_wp_avail_by_lost() - Update free count due to lost/unusable blocks.
+ * @wpg: Write pointer metadata page.
+ * @gzoff: Zone entry in page.
+ * @lost: Number of blocks 'lost'.
+ */
+static inline
+void _dec_wp_avail_by_lost(struct meta_pg *wpg, u32 gzoff, u32 lost)
+{
+ wpg->zf_est[gzoff] = cpu_to_le32(le32_to_cpu(wpg->zf_est[gzoff])+lost);
+}
+
+/**
+ * zoned_wp_sync() - Re-Sync expected WP location with drive
+ * @znd: ZDM Instance
+ * @reset_non_empty: Reset the non-empty zones.
+ *
+ * Return: 0 on success, otherwise error.
+ */
+static int zoned_wp_sync(struct zoned *znd, int reset_non_empty)
+{
+ int rcode = 0;
+ u32 rcount = 0;
+ u32 iter;
+ struct bdev_zone_report *report = NULL;
+ int order = REPORT_ORDER;
+ size_t bufsz = REPORT_FILL_PGS * Z_C4K;
+ struct page *pgs = alloc_pages(GFP_KERNEL, order);
+
+ if (pgs)
+ report = page_address(pgs);
+
+ if (!report) {
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ for (iter = 0; iter < znd->data_zones; iter++) {
+ u32 entry = iter % 4096;
+ u32 gzno = iter >> 10;
+ u32 gzoff = iter & ((1 << 10) - 1);
+ struct meta_pg *wpg = &znd->wp[gzno];
+ struct bdev_zone_descriptor *dentry;
+ u32 wp_flgs;
+ u32 wp_at;
+ u32 wp;
+
+ if (entry == 0) {
+ int err = dmz_report_zones(znd, iter, pgs, bufsz);
+
+ if (err) {
+ Z_ERR(znd, "report zones-> %d", err);
+ if (err != -ENOTSUPP)
+ rcode = err;
+ goto out;
+ }
+ rcount = dmz_report_count(znd, report, bufsz);
+ }
+
+ dentry = &report->descriptors[entry];
+ if (reset_non_empty && !is_conventional(dentry)) {
+ int err = 0;
+
+ if (!is_zone_reset(dentry))
+ err = dmz_reset_wp(znd, iter);
+
+ if (err) {
+ Z_ERR(znd, "reset wp-> %d", err);
+ if (err != -ENOTSUPP)
+ rcode = err;
+ goto out;
+ }
+ wp = wp_at = 0;
+ wpg->wp_alloc[gzoff] = cpu_to_le32(0);
+ wpg->zf_est[gzoff] = cpu_to_le32(Z_BLKSZ);
+ continue;
+ }
+
+ wp = get_wp_from_descriptor(znd, dentry);
+ wp >>= Z_SHFT4K; /* 512 sectors to 4k sectors */
+ wp_at = le32_to_cpu(wpg->wp_alloc[gzoff]) & Z_WP_VALUE_MASK;
+ wp_flgs = le32_to_cpu(wpg->wp_alloc[gzoff]) & Z_WP_FLAGS_MASK;
+
+ if (is_conventional(dentry)) {
+ wp = wp_at;
+ wp_flgs |= Z_WP_NON_SEQ;
+ } else {
+ wp_flgs &= ~Z_WP_NON_SEQ;
+ }
+
+ if (wp > wp_at) {
+ u32 lost = wp - wp_at;
+
+ wp_at = wp;
+ _dec_wp_avail_by_lost(wpg, gzoff, lost);
+
+ Z_ERR(znd, "Z#%u z:%x [wp:%x rz:%x] lost %u blocks.",
+ iter, gzoff, wp_at, wp, lost);
+ }
+ wpg->wp_alloc[gzoff] = cpu_to_le32(wp_at|wp_flgs);
+ }
+
+out:
+ if (pgs)
+ __free_pages(pgs, order);
+
+ return rcode;
+}
+
+#if USE_KTHREAD
+
+static inline int stop_or_data(struct zoned *znd)
+{
+ if (kthread_should_stop())
+ return 1;
+ return kfifo_is_empty(&znd->bio_fifo) ? 0 : 1;
+}
+
+static int znd_bio_kthread(void *arg)
+{
+ int err = 0;
+ struct zoned *znd = (struct zoned *)arg;
+
+ Z_ERR(znd, "znd_bio_kthread [started]");
+
+ while (!kthread_should_stop()) {
+ struct bio *bio = NULL;
+ int rcode;
+
+ if (kfifo_is_empty(&znd->bio_fifo)) {
+ wake_up(&znd->wait_fifo);
+ wait_event_freezable(znd->wait_bio, stop_or_data(znd));
+ continue;
+ }
+ if (!kfifo_get(&znd->bio_fifo, &bio)) {
+ wake_up(&znd->wait_fifo);
+ continue;
+ }
+ if (kfifo_avail(&znd->bio_fifo) > 5)
+ wake_up(&znd->wait_fifo);
+
+ rcode = zoned_bio(znd, bio);
+ if (rcode == DM_MAPIO_REMAPPED)
+ rcode = DM_MAPIO_SUBMITTED;
+
+ if (rcode < 0) {
+ znd->meta_result = err = rcode;
+ goto out;
+ }
+ }
+ err = 0;
+ Z_ERR(znd, "znd_bio_kthread [stopped]");
+
+out:
+ return err;
+}
+
+/**
+ * zoned_map() - Handle an incoming BIO
+ * @ti: Device Mapper Target Instance
+ * @bio: The BIO to disposition.
+ *
+ * Return: DM_MAPIO_SUBMITTED.
+ */
+static int zoned_map(struct dm_target *ti, struct bio *bio)
+{
+ struct zoned *znd = ti->private;
+ int rcode = DM_MAPIO_REQUEUE;
+
+ while (kfifo_put(&znd->bio_fifo, bio) == 0) {
+ wake_up_process(znd->bio_kthread);
+ wake_up(&znd->wait_bio);
+ wait_event_freezable(znd->wait_fifo,
+ kfifo_avail(&znd->bio_fifo) > 0);
+ }
+ rcode = DM_MAPIO_SUBMITTED;
+ wake_up_process(znd->bio_kthread);
+ wake_up(&znd->wait_bio);
+
+ return rcode;
+}
+
+#else
+
+/**
+ * zoned_map() - Handle an incoming BIO
+ * @ti: Device Mapper Target Instance
+ * @bio: The BIO to disposition.
+ *
+ * Return: DM_MAPIO_SUBMITTED or DM_MAPIO_REMAPPED.
+ */
+static int zoned_map(struct dm_target *ti, struct bio *bio)
+{
+ struct zoned *znd = ti->private;
+ int err = zoned_bio(znd, bio);
+
+ if (err < 0) {
+ znd->meta_result = err;
+ err = 0;
+ }
+
+ return err;
+}
+
+#endif
+
+/**
+ * zoned_actual_size() - Set number of 4k blocks available on block device.
+ * @ti: Device Mapper Target Instance
+ * @znd: ZDM Instance
+ */
+static void zoned_actual_size(struct dm_target *ti, struct zoned *znd)
+{
+ znd->nr_blocks = i_size_read(get_bdev_bd_inode(znd)) / Z_C4K;
+}
+
+/**
+ * zoned_ctr() - Create a ZDM Instance from DM Target Instance and args.
+ * @ti: Device Mapper Target Instance
+ * @argc: Number of args to handle.
+ * @argv: args to handle.
+ *
+ * Return: 0 on success, otherwise error.
+ */
+static int zoned_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ const int reset_non_empty = 0;
+ int create = 0;
+ int force = 0;
+ int zbc_probe = 1;
+ int zac_probe = 1;
+ int r;
+ struct zoned *znd;
+ long long first_data_zone = 0;
+ long long mz_md_provision = MZ_METADATA_ZONES;
+
+ BUILD_BUG_ON(Z_C4K != (sizeof(struct map_sect_to_lba) * Z_UNSORTED));
+ BUILD_BUG_ON(Z_C4K != (sizeof(struct io_4k_block)));
+ BUILD_BUG_ON(Z_C4K != (sizeof(struct mz_superkey)));
+
+	if (argc < 1) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	znd = ZDM_ALLOC(NULL, sizeof(*znd), KM_00, NORMAL);
+	if (!znd) {
+		ti->error = "Error allocating zoned structure";
+		return -ENOMEM;
+	}
+
+	znd->trim = 1;
+
+ for (r = 1; r < argc; r++) {
+ if (isdigit(*argv[r])) {
+ int krc = kstrtoll(argv[r], 0, &first_data_zone);
+
+ if (krc != 0) {
+ DMERR("Failed to parse %s: %d", argv[r], krc);
+ first_data_zone = 0;
+ }
+ }
+ if (!strcasecmp("create", argv[r]))
+ create = 1;
+ if (!strcasecmp("load", argv[r]))
+ create = 0;
+ if (!strcasecmp("force", argv[r]))
+ force = 1;
+ if (!strcasecmp("nozbc", argv[r]))
+ zbc_probe = 0;
+ if (!strcasecmp("nozac", argv[r]))
+ zac_probe = 0;
+ if (!strcasecmp("discard", argv[r]))
+ znd->trim = 1;
+ if (!strcasecmp("nodiscard", argv[r]))
+ znd->trim = 0;
+
+ if (!strncasecmp("reserve=", argv[r], 8)) {
+ long long mz_resv;
+ int krc = kstrtoll(argv[r] + 8, 0, &mz_resv);
+
+ if (krc == 0) {
+ if (mz_resv > mz_md_provision)
+ mz_md_provision = mz_resv;
+ } else {
+ DMERR("Reserved 'FAILED TO PARSE.' %s: %d",
+ argv[r]+8, krc);
+ mz_resv = 0;
+ }
+ }
+ }
+
+ znd->ti = ti;
+ ti->private = znd;
+ znd->zdstart = first_data_zone;
+ znd->mz_provision = mz_md_provision;
+
+ r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &znd->dev);
+ if (r) {
+ ti->error = "Error opening backing device";
+ zoned_destroy(znd);
+ return -EINVAL;
+ }
+
+ if (znd->dev->bdev) {
+ bdevname(znd->dev->bdev, znd->bdev_name);
+ znd->start_sect = get_start_sect(znd->dev->bdev) >> 3;
+ }
+
+ /*
+ * Set if this target needs to receive flushes regardless of
+ * whether or not its underlying devices have support.
+ */
+ ti->flush_supported = true;
+
+ /*
+ * Set if this target needs to receive discards regardless of
+ * whether or not its underlying devices have support.
+ */
+ ti->discards_supported = true;
+
+ /*
+	 * Set if the target requires discard bios to be split
+ * on max_io_len boundary.
+ */
+ ti->split_discard_bios = false;
+
+ /*
+ * Set if this target does not return zeroes on discarded blocks.
+ */
+ ti->discard_zeroes_data_unsupported = false;
+
+ /*
+ * Set if this target wants discard bios to be sent.
+ */
+ ti->num_discard_bios = 1;
+
+ if (!znd->trim) {
+ ti->discards_supported = false;
+ ti->num_discard_bios = 0;
+ }
+
+ zoned_actual_size(ti, znd);
+
+ r = do_init_zoned(ti, znd);
+ if (r) {
+ ti->error = "Error in zoned init";
+ zoned_destroy(znd);
+ return -EINVAL;
+ }
+
+ znd->issue_open_zone = 1;
+ znd->issue_close_zone = 1;
+ znd->filled_zone = NOZONE;
+
+ if (zac_probe)
+ znd->bdev_is_zoned = znd->ata_passthrough = 1;
+ if (zbc_probe)
+ znd->bdev_is_zoned = 1;
+
+ r = zoned_init_disk(ti, znd, create, force);
+ if (r) {
+ ti->error = "Error in zoned init from disk";
+ zoned_destroy(znd);
+ return -EINVAL;
+ }
+ r = zoned_wp_sync(znd, reset_non_empty);
+ if (r) {
+ ti->error = "Error in zoned re-sync WP";
+ zoned_destroy(znd);
+ return -EINVAL;
+ }
+
+ znd->bio_set = bioset_create(BIOSET_RESV, 0);
+ if (!znd->bio_set)
+ return -ENOMEM;
+
+#if USE_KTHREAD
+ r = kfifo_alloc(&znd->bio_fifo, KFIFO_SIZE, GFP_KERNEL);
+ if (r)
+ return r;
+
+ znd->bio_kthread = kthread_run(znd_bio_kthread, znd, "zdm-io-%s",
+ znd->bdev_name);
+ if (IS_ERR(znd->bio_kthread)) {
+ r = PTR_ERR(znd->bio_kthread);
+ ti->error = "Couldn't alloc kthread";
+ zoned_destroy(znd);
+ return r;
+ }
+#endif
+
+ r = zdm_create_proc_entries(znd);
+ if (r) {
+ ti->error = "Failed to create /proc entries";
+ zoned_destroy(znd);
+ return -EINVAL;
+ }
+
+ mod_timer(&znd->timer, jiffies + msecs_to_jiffies(5000));
+
+ return 0;
+}
+
+/**
+ * zoned_dtr() - Deconstruct a ZDM Instance from DM Target Instance.
+ * @ti: Device Mapper Target Instance
+ */
+static void zoned_dtr(struct dm_target *ti)
+{
+ struct zoned *znd = ti->private;
+
+ if (znd->z_sballoc) {
+ struct mz_superkey *key_blk = znd->z_sballoc;
+ struct zdm_superblock *sblock = &key_blk->sblock;
+
+ sblock->flags = cpu_to_le32(0);
+ sblock->csum = sb_crc32(sblock);
+ }
+
+#if USE_KTHREAD
+ wake_up_process(znd->bio_kthread);
+ wait_event(znd->wait_bio, kfifo_is_empty(&znd->bio_fifo));
+ kthread_stop(znd->bio_kthread);
+ kfifo_free(&znd->bio_fifo);
+#endif
+ if (znd->bio_set)
+ bioset_free(znd->bio_set);
+ zdm_remove_proc_entries(znd);
+ zoned_destroy(znd);
+}
+
+
+/**
+ * do_io_work() - Read or write data from/to a block device.
+ * @work: Work to be done.
+ */
+static void do_io_work(struct work_struct *work)
+{
+ struct z_io_req_t *req = container_of(work, struct z_io_req_t, work);
+ struct dm_io_request *io_req = req->io_req;
+ unsigned long error_bits = 0;
+
+ req->result = dm_io(io_req, 1, req->where, &error_bits);
+ if (error_bits)
+ DMERR("ERROR: dm_io error: %lx", error_bits);
+}
+
+/**
+ * znd_async_io() - Issue I/O via dm_io async or sync (using worker thread).
+ * @znd: ZDM Instance
+ * @dtype: Type of memory in data
+ * @data: Data for I/O
+ * @block: bLBA for I/O
+ * @nDMsect: Number of 512 byte blocks to read/write.
+ * @rw: READ or WRITE
+ * @queue: if true then use worker thread for I/O and wait.
+ * @callback: callback to use on I/O complete.
+ * @context: context to be passed to callback.
+ *
+ * Return 0 on success, otherwise error.
+ */
+static int znd_async_io(struct zoned *znd,
+ enum dm_io_mem_type dtype,
+ void *data,
+ sector_t block, unsigned int nDMsect, int rw, int queue,
+ io_notify_fn callback, void *context)
+{
+ unsigned long error_bits = 0;
+ int rcode;
+ struct dm_io_region where = {
+ .bdev = znd->dev->bdev,
+ .sector = block,
+ .count = nDMsect,
+ };
+ struct dm_io_request io_req = {
+ .bi_rw = rw,
+ .mem.type = dtype,
+ .mem.offset = 0,
+ .mem.ptr.vma = data,
+ .client = znd->io_client,
+ .notify.fn = callback,
+ .notify.context = context,
+ };
+
+ switch (dtype) {
+ case DM_IO_KMEM:
+ io_req.mem.ptr.addr = data;
+ break;
+ case DM_IO_BIO:
+ io_req.mem.ptr.bio = data;
+ where.count = nDMsect;
+ break;
+ case DM_IO_VMA:
+ io_req.mem.ptr.vma = data;
+ break;
+ default:
+ Z_ERR(znd, "page list not handled here .. see dm-io.");
+ break;
+ }
+
+ if (queue) {
+ struct z_io_req_t req;
+
+ /*
+ * Issue the synchronous I/O from a different thread
+ * to avoid generic_make_request recursion.
+ */
+ INIT_WORK_ONSTACK(&req.work, do_io_work);
+ req.where = &where;
+ req.io_req = &io_req;
+ queue_work(znd->io_wq, &req.work);
+ flush_workqueue(znd->io_wq);
+ destroy_work_on_stack(&req.work);
+
+ rcode = req.result;
+ if (rcode < 0)
+ Z_ERR(znd, "ERROR: dm_io error: %d", rcode);
+ goto done;
+ }
+ rcode = dm_io(&io_req, 1, &where, &error_bits);
+ if (error_bits || rcode < 0)
+ Z_ERR(znd, "ERROR: dm_io error: %d -- %lx", rcode, error_bits);
+
+done:
+ return rcode;
+}
+
+/**
+ * block_io() - Issue sync I/O maybe using a worker thread.
+ * @znd: ZDM Instance
+ * @dtype: Type of memory in data
+ * @data: Data for I/O
+ * @s: bLBA for I/O [512 byte resolution]
+ * @n: Number of 512 byte blocks to read/write.
+ * @rw: READ or WRITE
+ * @queue: if true then use worker thread for I/O and wait.
+ *
+ * Return 0 on success, otherwise error.
+ */
+static int block_io(struct zoned *znd,
+ enum dm_io_mem_type dtype,
+ void *data, sector_t s, unsigned int n, int rw, int queue)
+{
+ return znd_async_io(znd, dtype, data, s, n, rw, queue, NULL, NULL);
+}
+
+/**
+ * read_block() - Issue sync read maybe using a worker thread.
+ * @ti: Device Mapper Target Instance
+ * @dtype: Type of memory in data
+ * @data: Data for I/O
+ * @lba: bLBA for I/O [4k resolution]
+ * @count: Number of 4k blocks to read/write.
+ * @queue: if true then use worker thread for I/O and wait.
+ *
+ * Return 0 on success, otherwise error.
+ */
+static int read_block(struct dm_target *ti, enum dm_io_mem_type dtype,
+ void *data, u64 lba, unsigned int count, int queue)
+{
+ struct zoned *znd = ti->private;
+ sector_t block = lba << Z_SHFT4K;
+ unsigned int nDMsect = count << Z_SHFT4K;
+ int rc;
+
+ if (lba >= znd->nr_blocks) {
+ Z_ERR(znd, "Error reading past end of media: %llx.", lba);
+ rc = -EIO;
+ return rc;
+ }
+
+ rc = block_io(znd, dtype, data, block, nDMsect, READ, queue);
+ if (rc) {
+ Z_ERR(znd, "read error: %d -- R: %llx [%u dm sect] (Q:%d)",
+ rc, lba, nDMsect, queue);
+ dump_stack();
+ }
+
+ return rc;
+}
+
+/**
+ * writef_block() - Issue sync write maybe using a worker thread.
+ * @ti: Device Mapper Target Instance
+ * @rw: WRITE, optionally with additional request flags.
+ * @dtype: Type of memory in data
+ * @data: Data for I/O
+ * @lba: bLBA for I/O [4k resolution]
+ * @count: Number of 4k blocks to read/write.
+ * @queue: if true then use worker thread for I/O and wait.
+ *
+ * Return 0 on success, otherwise error.
+ */
+static int writef_block(struct dm_target *ti, int rw, enum dm_io_mem_type dtype,
+ void *data, u64 lba, unsigned int count, int queue)
+{
+ struct zoned *znd = ti->private;
+ sector_t block = lba << Z_SHFT4K;
+ unsigned int nDMsect = count << Z_SHFT4K;
+ int rc;
+
+	rc = block_io(znd, dtype, data, block, nDMsect, rw, queue);
+ if (rc) {
+ Z_ERR(znd, "write error: %d W: %llx [%u dm sect] (Q:%d)",
+ rc, lba, nDMsect, queue);
+ dump_stack();
+ }
+
+ return rc;
+}
+
+/**
+ * write_block() - Issue sync write maybe using a worker thread.
+ * @ti: Device Mapper Target Instance
+ * @dtype: Type of memory in data
+ * @data: Data for I/O
+ * @lba: bLBA for I/O [4k resolution]
+ * @count: Number of 4k blocks to read/write.
+ * @queue: if true then use worker thread for I/O and wait.
+ *
+ * Return 0 on success, otherwise error.
+ */
+static int write_block(struct dm_target *ti, enum dm_io_mem_type dtype,
+ void *data, u64 lba, unsigned int count, int queue)
+{
+ return writef_block(ti, WRITE, dtype, data, lba, count, queue);
+}
+
+/**
+ * struct zsplit_hook - Extra data attached to a hooked bio
+ * @znd: ZDM Instance to update on BIO completion.
+ * @endio: BIO's original bi_end_io handler
+ * @private: BIO's original bi_private data.
+ */
+struct zsplit_hook {
+ struct zoned *znd;
+ bio_end_io_t *endio;
+ void *private;
+};
+
+/**
+ * hook_bio() - Wrapper for hooking bio's endio function.
+ * @znd: ZDM Instance
+ * @split: Bio to hook
+ * @endiofn: End IO Function to hook with.
+ */
+static int hook_bio(struct zoned *znd, struct bio *split, bio_end_io_t *endiofn)
+{
+ struct zsplit_hook *hook = kmalloc(sizeof(*hook), GFP_NOIO);
+
+ if (!hook)
+ return -ENOMEM;
+
+ /*
+	 * On endio report back to the ZDM Instance and restore
+	 * the original bi_private and bi_end_io.
+ * Since all of our splits are also chain'd we also
+ * 'know' that bi_private will be the bio we sharded
+ * and that bi_end_io is the bio_chain_endio helper.
+ */
+ hook->znd = znd;
+ hook->private = split->bi_private; /* = bio */
+ hook->endio = split->bi_end_io; /* = bio_chain_endio */
+
+ /*
+ * Now on complete the bio will call endiofn which is 'zsplit_endio'
+	 * and we can record the updated WP location and restore the
+ * original bi_private and bi_end_io
+ */
+ split->bi_private = hook;
+ split->bi_end_io = endiofn;
+
+ return 0;
+}
+
+
+/**
+ * TODO: On write error such as this we can incr wp-used but we need
+ * to re-queue/re-map the write to a new location on disk?
+ *
+ * sd 0:0:1:0: [sdb] tag#1 FAILED Result: hostbyte=DID_SOFT_ERROR
+ * driverbyte=DRIVER_OK
+ * sd 0:0:1:0: [sdb] tag#1
+ * CDB: Write(16) 8a 00 00 00 00 00 06 d4 92 a0 00 00 00 08 00 00
+ * blk_update_request: I/O error, dev sdb, sector 114594464
+ * exec scsi cmd failed,opcode:133
+ * sdb: command 1 failed
+ * sd 0:0:1:0: [sdb] tag#1
+ * CDB: Write(16) 8a 00 00 00 00 00 00 01 0a 70 00 00 00 18 00 00
+ * mpt3sas_cm0: sas_address(0x4433221105000000), phy(5)
+ * mpt3sas_cm0: enclosure_logical_id(0x500605b0074854d0),slot(6)
+ * mpt3sas_cm0: enclosure level(0x0000), connector name( )
+ * mpt3sas_cm0: handle(0x000a), ioc_status(success)(0x0000), smid(17)
+ * mpt3sas_cm0: request_len(12288), underflow(12288), resid(-1036288)
+ * mpt3sas_cm0: tag(65535), transfer_count(1048576), sc->result(0x00000000)
+ * mpt3sas_cm0: scsi_status(check condition)(0x02),
+ * scsi_state(autosense valid )(0x01)
+ * mpt3sas_cm0: [sense_key,asc,ascq]: [0x06,0x29,0x00], count(18)
+ * Aborting journal on device dm-0-8.
+ * EXT4-fs error (device dm-0):
+ * ext4_journal_check_start:56: Detected aborted journal
+ * EXT4-fs (dm-0): Remounting filesystem read-only
+ */
+
+/**
+ * _common_endio() - Bio endio tracking for updating the internal WP.
+ * @znd: ZDM Instance
+ * @bio: Bio being completed.
+ *
+ * Bios that are split for writing are usually split to land on a zone
+ * boundary. Forward the bio along the endio path and update the WP.
+ */
+static void _common_endio(struct zoned *znd, struct bio *bio)
+{
+ u64 lba = bio->bi_iter.bi_sector >> Z_SHFT4K;
+ u32 blks = bio->bi_iter.bi_size / Z_C4K;
+
+ if (bio_data_dir(bio) == WRITE && lba > znd->start_sect) {
+ lba -= znd->start_sect;
+ if (lba > 0)
+ increment_used_blks(znd, lba - 1, blks + 1);
+ }
+}
+
+/**
+ * zoned_endio() - DM bio completion notification.
+ * @ti: DM Target instance.
+ * @bio: Bio being completed.
+ * @err: Error associated with bio.
+ *
+ * Non-split and non-dm_io bios end notification is here.
+ * Update the WP location for WRITE bios.
+ */
+static int zoned_endio(struct dm_target *ti, struct bio *bio, int err)
+{
+ struct zoned *znd = ti->private;
+
+ _common_endio(znd, bio);
+ return 0;
+}
+
+/**
+ * zsplit_endio() - Bio endio tracking for updating the internal WP.
+ * @bio: Bio being completed.
+ *
+ * Bios that are split for writing are usually split to land on a zone
+ * boundary. Forward the bio along the endio path and update the WP.
+ */
+static void zsplit_endio(struct bio *bio)
+{
+ struct zsplit_hook *hook = bio->bi_private;
+ struct bio *parent = hook->private;
+ struct zoned *znd = hook->znd;
+
+ _common_endio(znd, bio);
+
+ bio->bi_private = hook->private;
+ bio->bi_end_io = hook->endio;
+
+ /* On split bio's we are responsible for de-ref'ing and freeing */
+ bio_put(bio);
+ if (parent)
+ bio_endio(parent);
+
+ /* release our temporary private data */
+ kfree(hook);
+}
+
+/**
+ * zsplit_bio() - Split and chain a bio.
+ * @znd: ZDM Instance
+ * @bio: Bio to split
+ * @sectors: Number of sectors.
+ *
+ * Return: split bio.
+ */
+static struct bio *zsplit_bio(struct zoned *znd, struct bio *bio, int sectors)
+{
+ struct bio *split = bio;
+
+ if (bio_sectors(bio) > sectors) {
+ split = bio_split(bio, sectors, GFP_NOIO, znd->bio_set);
+ if (!split)
+ goto out;
+ bio_chain(split, bio);
+ if (bio_data_dir(bio) == WRITE)
+ hook_bio(znd, split, zsplit_endio);
+ }
+out:
+ return split;
+}
+
+/**
+ * zm_cow() - Read Modify Write to write less than 4k size blocks.
+ * @znd: ZDM Instance
+ * @bio: Bio to write
+ * @s_zdm: tLBA
+ * @blks: number of blocks to RMW (should be 1).
+ * @origin: Current bLBA
+ *
+ * Return: 0 on success, otherwise error.
+ */
+static int zm_cow(struct zoned *znd, struct bio *bio, u64 s_zdm, u32 blks,
+ u64 origin)
+{
+ struct dm_target *ti = znd->ti;
+ int count = 1;
+ int use_wq = 1;
+ unsigned int bytes = bio_cur_bytes(bio);
+ u8 *data = bio_data(bio);
+ u8 *io = NULL;
+ u16 ua_off = bio->bi_iter.bi_sector & 0x0007;
+ u16 ua_size = bio->bi_iter.bi_size & 0x0FFF; /* in bytes */
+ u32 mapped = 0;
+ u64 disk_lba = 0;
+
+ if (!znd->cow_block)
+ znd->cow_block = ZDM_ALLOC(znd, Z_C4K, PG_02, CRIT);
+
+ io = znd->cow_block;
+ if (!io)
+ return -EIO;
+
+ disk_lba = z_acquire(znd, Z_AQ_STREAM_ID, blks, &mapped);
+ if (!disk_lba || !mapped)
+ return -ENOSPC;
+
+ while (bytes) {
+ int ioer;
+ unsigned int iobytes = Z_C4K;
+
+ /* ---------------------------------------------------------- */
+ if (origin) {
+ if (s_zdm != znd->cow_addr) {
+ Z_ERR(znd, "Copy block from %llx <= %llx",
+ origin, s_zdm);
+ ioer = read_block(ti, DM_IO_KMEM, io, origin,
+ count, use_wq);
+ if (ioer)
+ return -EIO;
+
+ znd->cow_addr = s_zdm;
+ } else {
+ Z_ERR(znd, "Cached block from %llx <= %llx",
+ origin, s_zdm);
+ }
+ } else {
+ memset(io, 0, Z_C4K);
+ }
+
+ if (ua_off)
+ iobytes -= ua_off * 512;
+
+ if (bytes < iobytes)
+ iobytes = bytes;
+
+ Z_ERR(znd, "Moving %u bytes from origin [offset:%u]",
+ iobytes, ua_off * 512);
+
+ memcpy(io + (ua_off * 512), data, iobytes);
+
+ /* ---------------------------------------------------------- */
+
+ ioer = write_block(ti, DM_IO_KMEM, io, disk_lba, count, use_wq);
+ if (ioer)
+ return -EIO;
+
+ MutexLock(&znd->mz_io_mutex);
+ ioer = z_mapped_addmany(znd, s_zdm, disk_lba, mapped, CRIT);
+ mutex_unlock(&znd->mz_io_mutex);
+ if (ioer) {
+ Z_ERR(znd, "%s: Journal MANY failed.", __func__);
+ return -EIO;
+ }
+ increment_used_blks(znd, disk_lba, mapped);
+
+ data += iobytes;
+ bytes -= iobytes;
+ ua_size -= (ua_size > iobytes) ? iobytes : ua_size;
+ ua_off = 0;
+ disk_lba++;
+
+ if (bytes && (ua_size || ua_off)) {
+ s_zdm++;
+ origin = current_mapping(znd, s_zdm, CRIT);
+ }
+ }
+ bio_endio(bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+/**
+ * Write 4k blocks from cache to lba.
+ * Move any remaining 512 byte blocks to the start of cache and update
+ * the @_blen count is updated
+ */
+static int zm_write_cache(struct zoned *znd, struct io_dm_block *dm_vbuf,
+ u64 lba, u32 *_blen)
+{
+ int use_wq = 1;
+ int cached = *_blen;
+ int blks = cached >> 3;
+ int sectors = blks << 3;
+ int remainder = cached - sectors;
+ int err;
+
+ err = write_block(znd->ti, DM_IO_VMA, dm_vbuf, lba, blks, use_wq);
+ if (!err) {
+ if (remainder)
+ memcpy(dm_vbuf[0].data,
+ dm_vbuf[sectors].data, remainder * 512);
+ *_blen = remainder;
+ }
+ return err;
+}
+
+/**
+ * zm_write_bios() - Map and write bios.
+ * @znd: ZDM Instance
+ * @bio: Bio to be written.
+ * @s_zdm: tLBA for mapping.
+ *
+ * Return: DM_MAPIO_SUBMITTED or negative on error.
+ */
+static int zm_write_bios(struct zoned *znd, struct bio *bio, u64 s_zdm)
+{
+ struct bio *split = NULL;
+ u32 acqflgs = Z_AQ_STREAM_ID | bio_stream(bio);
+ u64 lba = 0;
+ u32 mapped = 0;
+ int err = -EIO;
+ int done = 0;
+ int sectors;
+ u32 blks;
+
+ do {
+ blks = dm_div_up(bio->bi_iter.bi_size, Z_C4K);
+ lba = z_acquire(znd, acqflgs, blks, &mapped);
+ if (!lba && mapped)
+ lba = z_acquire(znd, acqflgs, mapped, &mapped);
+
+ if (!lba) {
+ if (atomic_read(&znd->gc_throttle) == 0) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ Z_ERR(znd, "Throttle input ... Mandatory GC.");
+ if (delayed_work_pending(&znd->gc_work)) {
+ mod_delayed_work(znd->gc_wq, &znd->gc_work, 0);
+ flush_delayed_work(&znd->gc_work);
+ }
+ continue;
+ }
+
+ sectors = mapped << Z_SHFT4K;
+ split = zsplit_bio(znd, bio, sectors);
+ if (split == bio)
+ done = 1;
+
+ if (!split) {
+ err = -ENOMEM;
+ goto out;
+ }
+ split->bi_iter.bi_sector = lba << Z_SHFT4K;
+ generic_make_request(split);
+ MutexLock(&znd->mz_io_mutex);
+ err = z_mapped_addmany(znd, s_zdm, lba, mapped, CRIT);
+ mutex_unlock(&znd->mz_io_mutex);
+ if (err) {
+ Z_ERR(znd, "%s: Journal MANY failed.", __func__);
+ err = DM_MAPIO_REQUEUE;
+ goto out;
+ }
+ s_zdm += mapped;
+ } while (!done);
+ err = DM_MAPIO_SUBMITTED;
+
+out:
+ return err;
+}
+
+/**
+ * zm_write_pages() - Copy bio pages to 4k aligned buffer. Write and map buffer.
+ * @znd: ZDM Instance
+ * @bio: Bio to be written.
+ * @s_zdm: tLBA for mapping.
+ *
+ * Return: DM_MAPIO_SUBMITTED or negative on error.
+ */
+static int zm_write_pages(struct zoned *znd, struct bio *bio, u64 s_zdm)
+{
+ u32 blks = dm_div_up(bio->bi_iter.bi_size, Z_C4K);
+ u64 lba = 0;
+ u32 blen = 0; /* total: IO_VCACHE_PAGES * 8 */
+ u32 written = 0;
+ int avail = 0;
+ u32 acqflgs = Z_AQ_STREAM_ID | bio_stream(bio);
+ int err;
+ struct bvec_iter start;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ struct io_4k_block *io_vcache;
+ struct io_dm_block *dm_vbuf = NULL;
+
+ MutexLock(&znd->vcio_lock);
+ io_vcache = get_io_vcache(znd, CRIT);
+ if (!io_vcache) {
+ Z_ERR(znd, "%s: FAILED to get SYNC CACHE.", __func__);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dm_vbuf = (struct io_dm_block *)io_vcache;
+
+ /* USE: dm_vbuf for dumping bio pages to disk ... */
+ start = bio->bi_iter; /* struct implicit copy */
+ do {
+ u64 alloc_ori = 0;
+ u32 mcount = 0;
+ u32 mapped = 0;
+
+reacquire:
+ /*
+		 * When lba is zero no blocks were allocated.
+		 * Retry with a smaller request.
+ */
+ lba = z_acquire(znd, acqflgs, blks - written, &mapped);
+ if (!lba && mapped)
+ lba = z_acquire(znd, acqflgs, mapped, &mapped);
+
+ if (!lba) {
+ if (atomic_read(&znd->gc_throttle) == 0) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ Z_ERR(znd, "Throttle input ... Mandatory GC.");
+ if (delayed_work_pending(&znd->gc_work)) {
+ mod_delayed_work(znd->gc_wq, &znd->gc_work, 0);
+ flush_delayed_work(&znd->gc_work);
+ }
+ goto reacquire;
+ }
+
+ /* this may be redundant .. if we have lba we have mapped > 0 */
+ if (lba && mapped)
+ avail += mapped * 8; /* claimed pages in dm blocks */
+
+ alloc_ori = lba;
+
+		/* copy [up to mapped] pages to buffer */
+ __bio_for_each_segment(bv, bio, iter, start) {
+ int issue_write = 0;
+ unsigned int boff;
+ void *src;
+
+ if (avail <= 0) {
+ Z_ERR(znd, "%s: TBD: Close Z# %llu",
+ __func__, alloc_ori >> 16);
+ start = iter;
+ break;
+ }
+
+ src = kmap_atomic(bv.bv_page);
+ boff = bv.bv_offset;
+ memcpy(dm_vbuf[blen].data, src + boff, bv.bv_len);
+ kunmap_atomic(src);
+ blen += bv.bv_len / 512;
+ avail -= bv.bv_len / 512;
+
+ if ((blen >= (mapped * 8)) ||
+ (blen >= (BIO_CACHE_SECTORS - 8)))
+ issue_write = 1;
+
+ /*
+			 * If there is less than one 4k block of space left
+			 * in our cache, send the cached blocks to disk.
+ */
+ if (issue_write) {
+ int blks = blen / 8;
+
+ err = zm_write_cache(znd, dm_vbuf, lba, &blen);
+ if (err) {
+ Z_ERR(znd, "%s: bio-> %" PRIx64
+ " [%d of %d blks] -> %d",
+ __func__, lba, blen, blks, err);
+ bio->bi_error = err;
+ bio_endio(bio);
+ goto out;
+ }
+ lba += blks;
+ written += blks;
+ mcount += blks;
+ mapped -= blks;
+
+ if (mapped == 0) {
+ bio_advance_iter(bio, &iter, bv.bv_len);
+ start = iter;
+ break;
+ }
+ if (mapped < 0) {
+ Z_ERR(znd, "ERROR: Bad write %"
+ PRId32 " beyond alloc'd space",
+ mapped);
+ }
+ }
+ } /* end: __bio_for_each_segment */
+ if ((mapped > 0) && ((blen / 8) > 0)) {
+ int blks = blen / 8;
+
+ err = zm_write_cache(znd, dm_vbuf, lba, &blen);
+ if (err) {
+ Z_ERR(znd, "%s: bio-> %" PRIx64
+ " [%d of %d blks] -> %d",
+ __func__, lba, blen, blks, err);
+ bio->bi_error = err;
+ bio_endio(bio);
+ goto out;
+ }
+ lba += blks;
+ written += blks;
+ mcount += blks;
+ mapped -= blks;
+
+ if (mapped < 0) {
+ Z_ERR(znd, "ERROR: [2] Bad write %"
+ PRId32 " beyond alloc'd space",
+ mapped);
+ }
+ }
+ MutexLock(&znd->mz_io_mutex);
+ err = z_mapped_addmany(znd, s_zdm, alloc_ori, mcount, CRIT);
+ mutex_unlock(&znd->mz_io_mutex);
+ if (err) {
+ Z_ERR(znd, "%s: Journal MANY failed.", __func__);
+ err = DM_MAPIO_REQUEUE;
+ /*
+ * FIXME:
+ * Ending the BIO here is causing a GFP:
+ - DEBUG_PAGEALLOC
+ - in Workqueue:
+ - writeback bdi_writeback_workfn (flush-252:0)
+ - backtrace:
+ - __map_bio+0x7a/0x280
+ - __split_and_process_bio+0x2e3/0x4e0
+ - ? __split_and_process_bio+0x22/0x4e0
+ - ? generic_start_io_acct+0x5/0x210
+ - dm_make_request+0x6b/0x100
+ - generic_make_request+0xc0/0x110
+ - ....
+ -
+ - bio->bi_error = err;
+ - bio_endio(bio);
+ */
+ goto out;
+ }
+ increment_used_blks(znd, alloc_ori, mcount);
+
+ if (written < blks)
+ s_zdm += written;
+
+ if (written == blks && blen > 0)
+ Z_ERR(znd, "%s: blen: %d un-written blocks!!",
+ __func__, blen);
+ } while (written < blks);
+ bio_endio(bio);
+ err = DM_MAPIO_SUBMITTED;
+
+out:
+ put_io_vcache(znd, io_vcache);
+ mutex_unlock(&znd->vcio_lock);
+
+ return err;
+}
+
+/**
+ * is_empty_page() - Scan memory range for any set bits.
+ * @pg: The start of memory to be scanned.
+ * @len: Number of bytes to check (should be long aligned)
+ * Return: 0 if any bits are set, 1 if all bits are 0.
+ */
+static int is_empty_page(void *pg, size_t len)
+{
+ unsigned long *chk = pg;
+ size_t count = len / sizeof(*chk);
+ size_t entry;
+
+ for (entry = 0; entry < count; entry++) {
+ if (chk[entry])
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * is_zero_bio() - Scan bio to see if all bytes are 0.
+ * @bio: The bio to be scanned.
+ * Return: 1 if all bits are 0. 0 if any bits in bio are set.
+ */
+static int is_zero_bio(struct bio *bio)
+{
+ int is_empty = 0;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ /* Scan bio to determine if it is zero'd */
+ bio_for_each_segment(bv, bio, iter) {
+ unsigned int boff;
+ void *src;
+
+ src = kmap_atomic(bv.bv_page);
+ boff = bv.bv_offset;
+ is_empty = is_empty_page(src + boff, bv.bv_len);
+ kunmap_atomic(src);
+
+ if (!is_empty)
+ break;
+ } /* end: __bio_for_each_segment */
+
+ return is_empty;
+}
+
+/**
+ * is_bio_aligned() - Test bio and bio_vec for 4k aligned pages.
+ * @bio: Bio to be tested.
+ * Return: 1 if bio is 4k aligned, 0 if not.
+ */
+static int is_bio_aligned(struct bio *bio)
+{
+ int aligned = 1;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_segment(bv, bio, iter) {
+ if ((bv.bv_offset & 0x0FFF) || (bv.bv_len & 0x0FFF)) {
+ aligned = 0;
+ break;
+ }
+ }
+ return aligned;
+}
+
+/**
+ * zoned_map_write() - Write a bio by the fastest safe method.
+ * @znd: ZDM Instance
+ * @bio: Bio to be written
+ * @s_zdm: tLBA for mapping.
+ *
+ * Bios that are less than 4k need RMW.
+ * Bios that are single pages are deduped and written or discarded.
+ * Bios that are multiple pages with 4k aligned bvecs are written as bio(s).
+ * Bios that are multiple pages and mis-aligned are copied to an aligned
+ * buffer and submitted as new I/O.
+ */
+static int zoned_map_write(struct zoned *znd, struct bio *bio, u64 s_zdm)
+{
+ u32 blks = dm_div_up(bio->bi_iter.bi_size, Z_C4K);
+ u16 ua_off = bio->bi_iter.bi_sector & 0x0007;
+ u16 ua_size = bio->bi_iter.bi_size & 0x0FFF; /* in bytes */
+ int rcode = -EIO;
+
+ if (ua_size || ua_off) {
+ u64 origin;
+
+ origin = current_mapping(znd, s_zdm, CRIT);
+ if (origin) {
+ znd->is_empty = 0;
+ rcode = zm_cow(znd, bio, s_zdm, blks, origin);
+ }
+ return rcode;
+ }
+
+ /* on RAID 4/5/6 all writes are 4k */
+ if (blks == 1) {
+ if (is_zero_bio(bio)) {
+ rcode = zoned_map_discard(znd, bio, s_zdm);
+ } else {
+ znd->is_empty = 0;
+ rcode = zm_write_bios(znd, bio, s_zdm);
+ }
+ return rcode;
+ }
+
+ /*
+ * For larger bios test for 4k alignment.
+	 * When bios are mis-aligned we must copy the
+	 * mis-aligned pages into a new bio and submit it.
+ * [The 4k alignment requests on our queue may be ignored
+ * by mis-behaving layers that are not 4k safe].
+ */
+ znd->is_empty = 0;
+ if (is_bio_aligned(bio))
+ rcode = zm_write_bios(znd, bio, s_zdm);
+ else
+ rcode = zm_write_pages(znd, bio, s_zdm);
+
+ return rcode;
+}
+
+/**
+ * zm_read_bios() - Read bios from device
+ * @znd: ZDM Instance
+ * @bio: Bio to read
+ * @s_zdm: tLBA to read from.
+ *
+ * Return DM_MAPIO_SUBMITTED or negative on error.
+ */
+static int zm_read_bios(struct zoned *znd, struct bio *bio, u64 s_zdm)
+{
+ struct bio *split = NULL;
+ int rcode = DM_MAPIO_SUBMITTED;
+ u64 lba;
+ u64 blba;
+ u32 blks;
+ int sectors;
+ int count;
+ u16 ua_off;
+ u16 ua_size;
+
+ do {
+ blks = dm_div_up(bio->bi_iter.bi_size, Z_C4K);
+ ua_off = bio->bi_iter.bi_sector & 0x0007;
+ ua_size = bio->bi_iter.bi_size & 0x0FFF; /* in bytes */
+
+ lba = blba = current_mapping(znd, s_zdm, CRIT);
+ if (blba) {
+ for (count = 1; count < blks; count++) {
+ lba = current_mapping(znd, s_zdm + count, CRIT);
+ if (lba != (blba + count))
+ break;
+ }
+ } else {
+ for (count = 1; count < blks; count++) {
+ lba = current_mapping(znd, s_zdm + count, CRIT);
+ if (lba != 0ul)
+ break;
+ }
+ }
+ s_zdm += count;
+ sectors = (count << Z_SHFT4K);
+ if (ua_size)
+ sectors += (ua_size >> SECTOR_SHIFT) - 8;
+
+ split = zsplit_bio(znd, bio, sectors);
+ if (!split) {
+ rcode = -ENOMEM;
+ goto out;
+ }
+ if (blba) {
+ split->bi_iter.bi_sector = (blba << Z_SHFT4K) + ua_off;
+ generic_make_request(split);
+ } else {
+ zero_fill_bio(split);
+ bio_endio(split);
+ }
+ } while (split != bio);
+
+out:
+ return rcode;
+}
+
+/**
+ * zoned_bio() - Handle an incoming BIO.
+ * @znd: ZDM Instance
+ * @bio: The BIO to process.
+ */
+static int zoned_bio(struct zoned *znd, struct bio *bio)
+{
+ bool is_write = (bio_data_dir(bio) == WRITE);
+ u64 s_zdm = (bio->bi_iter.bi_sector >> Z_SHFT4K) + znd->md_end;
+ int rcode = DM_MAPIO_SUBMITTED;
+ struct request_queue *q;
+ int force_sync_now = 0;
+
+ /* map to backing device ... NOT dm-zoned device */
+ bio->bi_bdev = znd->dev->bdev;
+
+ q = bdev_get_queue(bio->bi_bdev);
+	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, q);
+
+ if (is_write && znd->meta_result) {
+ if (!(bio->bi_rw & REQ_DISCARD)) {
+ rcode = znd->meta_result;
+ Z_ERR(znd, "MAP ERR (meta): %d", rcode);
+ goto out;
+ }
+ }
+
+ /* check for REQ_FLUSH flag */
+ if (bio->bi_rw & REQ_FLUSH) {
+ bio->bi_rw &= ~(REQ_FLUSH);
+ set_bit(DO_FLUSH, &znd->flags);
+ force_sync_now = 1;
+ }
+ if (bio->bi_rw & REQ_SYNC) {
+ set_bit(DO_SYNC, &znd->flags);
+ force_sync_now = 1;
+ }
+
+ Z_DBG(znd, "%s: s:%"PRIx64" sz:%u -> %s", __func__, s_zdm,
+ bio->bi_iter.bi_size, is_write ? "W" : "R");
+
+ if (bio->bi_iter.bi_size) {
+ if (bio->bi_rw & REQ_DISCARD) {
+ rcode = zoned_map_discard(znd, bio, s_zdm);
+ } else if (is_write) {
+ const int wait = 1;
+ u32 blks = bio->bi_iter.bi_size >> 12;
+
+ rcode = zoned_map_write(znd, bio, s_zdm);
+ if (rcode == DM_MAPIO_SUBMITTED) {
+ z_discard_partial(znd, blks, CRIT);
+ if (znd->z_gc_free < 5)
+ gc_immediate(znd, !wait, CRIT);
+ }
+ } else {
+ rcode = zm_read_bios(znd, bio, s_zdm);
+ }
+ znd->age = jiffies;
+ }
+
+ if (test_bit(DO_FLUSH, &znd->flags) ||
+ test_bit(DO_SYNC, &znd->flags) ||
+ test_bit(DO_JOURNAL_MOVE, &znd->flags) ||
+ test_bit(DO_MEMPOOL, &znd->flags)) {
+ if (!test_bit(DO_METAWORK_QD, &znd->flags) &&
+ !work_pending(&znd->meta_work)) {
+ set_bit(DO_METAWORK_QD, &znd->flags);
+ queue_work(znd->meta_wq, &znd->meta_work);
+ }
+ }
+
+ if (force_sync_now && work_pending(&znd->meta_work))
+ flush_workqueue(znd->meta_wq);
+
+out:
+ if (rcode == DM_MAPIO_REMAPPED || rcode == DM_MAPIO_SUBMITTED)
+ goto done;
+
+ if (rcode < 0 || rcode == DM_MAPIO_REQUEUE) {
+ Z_ERR(znd, "MAP ERR: %d", rcode);
+ Z_ERR(znd, "%s: s:%"PRIx64" sz:%u -> %s", __func__, s_zdm,
+ bio->bi_iter.bi_size, is_write ? "W" : "R");
+ dump_stack();
+ } else {
+ Z_ERR(znd, "MAP ERR: %d", rcode);
+ Z_ERR(znd, "%s: s:%"PRIx64" sz:%u -> %s", __func__, s_zdm,
+ bio->bi_iter.bi_size, is_write ? "W" : "R");
+ dump_stack();
+ rcode = -EIO;
+ }
+done:
+ return rcode;
+}
+
+/**
+ * _do_mem_purge() - conditionally trigger a reduction of cache memory
+ * @znd: ZDM Instance
+ */
+static inline int _do_mem_purge(struct zoned *znd)
+{
+ int do_work = 0;
+
+ if (atomic_read(&znd->incore) > 3) {
+ set_bit(DO_MEMPOOL, &znd->flags);
+ if (!work_pending(&znd->meta_work))
+ do_work = 1;
+ }
+ return do_work;
+}
+
+/**
+ * on_timeout_activity() - Periodic background task execution.
+ * @znd: ZDM Instance
+ * @delay: Delay metric for periodic GC
+ *
+ * NOTE: Executed as a worker task queued from a timer.
+ */
+static void on_timeout_activity(struct zoned *znd, int delay)
+{
+ int max_tries = 1;
+
+ if (test_bit(ZF_FREEZE, &znd->flags))
+ return;
+
+ if (is_expired_msecs(znd->age, DISCARD_IDLE_MSECS))
+ max_tries = 20;
+
+ do {
+ int count;
+
+ count = z_discard_partial(znd, Z_BLKSZ, NORMAL);
+ if (count != 1 || --max_tries < 0)
+ break;
+
+ if (test_bit(ZF_FREEZE, &znd->flags))
+ return;
+
+ } while (is_expired_msecs(znd->age, DISCARD_IDLE_MSECS));
+
+ gc_queue_with_delay(znd, delay, NORMAL);
+
+ if (!test_bit(DO_GC_NO_PURGE, &znd->flags) && _do_mem_purge(znd))
+ queue_work(znd->meta_wq, &znd->meta_work);
+}
+
+/**
+ * bg_work_task() - periodic background worker
+ * @work: context for worker thread
+ */
+static void bg_work_task(struct work_struct *work)
+{
+ struct zoned *znd;
+ const int delay = 1;
+
+ if (!work)
+ return;
+
+ znd = container_of(work, struct zoned, bg_work);
+ on_timeout_activity(znd, delay);
+}
+
+/**
+ * activity_timeout() - Handler for timer used to trigger background worker.
+ * @data: context for timer.
+ */
+static void activity_timeout(unsigned long data)
+{
+ struct zoned *znd = (struct zoned *) data;
+
+ if (!work_pending(&znd->bg_work))
+ queue_work(znd->bg_wq, &znd->bg_work);
+
+ if (!test_bit(ZF_FREEZE, &znd->flags))
+ mod_timer(&znd->timer, jiffies + msecs_to_jiffies(2500));
+}
+
+/**
+ * get_dev_size() - Report accessible size of device to upper layer.
+ * @ti: DM Target
+ *
+ * Return: Size in 512 byte sectors
+ */
+static sector_t get_dev_size(struct dm_target *ti)
+{
+ struct zoned *znd = ti->private;
+ u64 sz = i_size_read(get_bdev_bd_inode(znd)); /* size in bytes. */
+ u64 lut_resv = znd->gz_count * znd->mz_provision;
+
+ /*
+ * NOTE: `sz` should match `ti->len` when the dm_table
+ * is setup correctly
+ */
+ sz -= (lut_resv * Z_SMR_SZ_BYTES);
+
+ return to_sector(sz);
+}
+
+/**
+ * zoned_iterate_devices() - Iterate over devices, calling fn() for each.
+ * @ti: DM Target
+ * @fn: Function for each callout
+ * @data: Context for fn().
+ */
+static int zoned_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct zoned *znd = ti->private;
+ int rc;
+
+ rc = fn(ti, znd->dev, 0, get_dev_size(ti), data);
+ return rc;
+}
+
+/**
+ * zoned_io_hints() - The place to tweak queue limits for DM targets
+ * @ti: DM Target
+ * @limits: queue_limits for this DM target
+ */
+static void zoned_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct zoned *znd = ti->private;
+ u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
+
+ /*
+ * If the system-determined stacked limits are compatible with the
+ * zoned device's blocksize (io_opt is a factor) do not override them.
+ */
+ if (io_opt_sectors < 8 || do_div(io_opt_sectors, 8)) {
+ blk_limits_io_min(limits, 0);
+ blk_limits_io_opt(limits, 8 << SECTOR_SHIFT);
+ }
+
+ limits->logical_block_size =
+ limits->physical_block_size =
+ limits->io_min = Z_C4K;
+ if (znd->trim) {
+ limits->discard_alignment = Z_C4K;
+ limits->discard_granularity = Z_C4K;
+ limits->max_discard_sectors = 1 << 30;
+ limits->max_hw_discard_sectors = 1 << 30;
+ limits->discard_zeroes_data = 1;
+ }
+}
+
+/**
+ * zoned_status() - Report status of DM Target
+ * @ti: DM Target
+ * @type: Type of status to report.
+ * @status_flags: Flags
+ * @result: Fill in with status.
+ * @maxlen: Maximum number of bytes for result.
+ */
+static void zoned_status(struct dm_target *ti, status_type_t type,
+ unsigned status_flags, char *result, unsigned maxlen)
+{
+ struct zoned *znd = (struct zoned *) ti->private;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ result[0] = '\0';
+ break;
+
+ case STATUSTYPE_TABLE:
+ scnprintf(result, maxlen, "%s Z#%u", znd->dev->name,
+ znd->zdstart);
+ break;
+ }
+}
+
+/* -------------------------------------------------------------------------- */
+/* --ProcFS Support Routines------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+#if defined(CONFIG_PROC_FS)
+
+/**
+ * struct zone_info_entry - Proc zone entry.
+ * @zone: Zone Index
+ * @info: Info (WP/Used).
+ */
+struct zone_info_entry {
+ u32 zone;
+ u32 info;
+};
+
+/**
+ * Startup writing to our proc entry
+ */
+static void *proc_wp_start(struct seq_file *seqf, loff_t *pos)
+{
+ struct zoned *znd = seqf->private;
+
+ if (*pos == 0)
+ znd->wp_proc_at = *pos;
+ return &znd->wp_proc_at;
+}
+
+/**
+ * Increment to our next 'grand zone' 4k page.
+ */
+static void *proc_wp_next(struct seq_file *seqf, void *v, loff_t *pos)
+{
+ struct zoned *znd = seqf->private;
+ u32 zone = ++znd->wp_proc_at;
+
+ return zone < znd->data_zones ? &znd->wp_proc_at : NULL;
+}
+
+/**
+ * proc_wp_stop() - Release resources at the end of a read [no-op].
+ */
+static void proc_wp_stop(struct seq_file *seqf, void *v)
+{
+}
+
+/**
+ * proc_wp_show() - Write out as many write-pointer entries as will fit.
+ */
+static int proc_wp_show(struct seq_file *seqf, void *v)
+{
+ int err = 0;
+ struct zoned *znd = seqf->private;
+ u32 zone = znd->wp_proc_at;
+ u32 out = 0;
+
+ while (zone < znd->data_zones) {
+ u32 gzno = zone >> GZ_BITS;
+ u32 gzoff = zone & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ struct zone_info_entry entry;
+
+ entry.zone = zone;
+ entry.info = le32_to_cpu(wpg->wp_alloc[gzoff]);
+
+ err = seq_write(seqf, &entry, sizeof(entry));
+ if (err) {
+ /*
+ * write failure is temporary ..
+ * just return and try again
+ */
+ err = 0;
+ goto out;
+ }
+ out++;
+ zone = ++znd->wp_proc_at;
+ }
+
+out:
+ if (err)
+		Z_ERR(znd, "%s: %lld -> %d", __func__, znd->wp_proc_at, err);
+
+ return err;
+}
+
+/**
+ * zdm_wp_ops() - Seq_file operations for retrieving WP via proc fs
+ */
+static const struct seq_operations zdm_wp_ops = {
+ .start = proc_wp_start,
+ .next = proc_wp_next,
+ .stop = proc_wp_stop,
+ .show = proc_wp_show
+};
+
+/**
+ * zdm_wp_open() - Attach our private data to the seq_file
+ */
+static int zdm_wp_open(struct inode *inode, struct file *file)
+{
+ /* seq_open will populate file->private_data with a seq_file */
+ int err = seq_open(file, &zdm_wp_ops);
+
+ if (!err) {
+ struct zoned *znd = PDE_DATA(inode);
+ struct seq_file *seqf = file->private_data;
+
+ seqf->private = znd;
+ }
+ return err;
+}
+
+/**
+ * zdm_wp_fops() - File operations for retrieving WP via proc fs
+ */
+static const struct file_operations zdm_wp_fops = {
+ .open = zdm_wp_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+/**
+ * proc_used_start() - Start a read of the free.bin proc entry.
+ */
+static void *proc_used_start(struct seq_file *seqf, loff_t *pos)
+{
+ struct zoned *znd = seqf->private;
+
+	if (*pos == 0)
+		znd->used_proc_at = *pos;
+	return &znd->used_proc_at;
+}
+
+/**
+ * proc_used_next() - Advance to the next zone entry.
+ */
+static void *proc_used_next(struct seq_file *seqf, void *v, loff_t *pos)
+{
+ struct zoned *znd = seqf->private;
+	u32 zone = ++znd->used_proc_at;
+
+	return zone < znd->data_zones ? &znd->used_proc_at : NULL;
+}
+
+/**
+ * proc_used_stop() - Release resources at the end of a read [no-op].
+ */
+static void proc_used_stop(struct seq_file *seqf, void *v)
+{
+}
+
+/**
+ * proc_used_show() - Write out as many 'used' entries as will fit.
+ * @seqf: seq_file I/O handler
+ * @v: An unused parameter.
+ */
+static int proc_used_show(struct seq_file *seqf, void *v)
+{
+ int err = 0;
+ struct zoned *znd = seqf->private;
+	u32 zone = znd->used_proc_at;
+ u32 out = 0;
+
+ while (zone < znd->data_zones) {
+ u32 gzno = zone >> GZ_BITS;
+ u32 gzoff = zone & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ struct zone_info_entry entry;
+
+ entry.zone = zone;
+ entry.info = le32_to_cpu(wpg->zf_est[gzoff]);
+
+ err = seq_write(seqf, &entry, sizeof(entry));
+ if (err) {
+ /*
+ * write failure is temporary ..
+ * just return and try again
+ */
+ err = 0;
+ goto out;
+ }
+ out++;
+		zone = ++znd->used_proc_at;
+ }
+
+out:
+ if (err)
+		Z_ERR(znd, "%s: %lld -> %d", __func__, znd->used_proc_at, err);
+
+ return err;
+}
+
+/**
+ * zdm_used_ops() - Seq_file Ops for retrieving 'used' state via proc fs
+ */
+static const struct seq_operations zdm_used_ops = {
+ .start = proc_used_start,
+ .next = proc_used_next,
+ .stop = proc_used_stop,
+ .show = proc_used_show
+};
+
+/**
+ * zdm_used_open() - Attach our private data to the seq_file
+ */
+static int zdm_used_open(struct inode *inode, struct file *file)
+{
+ /* seq_open will populate file->private_data with a seq_file */
+ int err = seq_open(file, &zdm_used_ops);
+
+ if (!err) {
+ struct zoned *znd = PDE_DATA(inode);
+ struct seq_file *seqf = file->private_data;
+
+ seqf->private = znd;
+ }
+ return err;
+}
+
+/**
+ * zdm_used_fops() - File operations for retrieving 'used' state via proc fs
+ */
+static const struct file_operations zdm_used_fops = {
+ .open = zdm_used_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/**
+ * zdm_status_show() - Dump the status structure via proc fs
+ */
+static int zdm_status_show(struct seq_file *seqf, void *unused)
+{
+ struct zoned *znd = seqf->private;
+ struct zdm_ioc_status status;
+ u32 zone;
+
+ memset(&status, 0, sizeof(status));
+
+ for (zone = 0; zone < znd->data_zones; zone++) {
+ u32 gzno = zone >> GZ_BITS;
+ u32 gzoff = zone & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp_at = le32_to_cpu(wpg->wp_alloc[gzoff]) & Z_WP_VALUE_MASK;
+
+ status.b_used += wp_at;
+ status.b_available += Z_BLKSZ - wp_at;
+ }
+ status.mc_entries = znd->mc_entries;
+ status.dc_entries = znd->dc_entries;
+ status.b_discard = znd->discard_count;
+
+ /* fixed array of ->fwd_tm and ->rev_tm */
+ status.m_zones = znd->data_zones;
+
+ status.memstat = znd->memstat;
+ memcpy(status.bins, znd->bins, sizeof(status.bins));
+ status.mlut_blocks = atomic_read(&znd->incore);
+
+ return seq_write(seqf, &status, sizeof(status));
+}
+
+/**
+ * zdm_status_open() - Open seq_file from file.
+ * @inode: Our private data is stashed here; retrieve it.
+ * @file: file object used by seq_file.
+ */
+static int zdm_status_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zdm_status_show, PDE_DATA(inode));
+}
+
+/**
+ * zdm_status_fops() - File operations to chain to zdm_status_open.
+ */
+static const struct file_operations zdm_status_fops = {
+ .open = zdm_status_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * zdm_info_show() - Report some information as text.
+ * @seqf: Sequence file for writing
+ * @unused: Not used.
+ */
+static int zdm_info_show(struct seq_file *seqf, void *unused)
+{
+ struct zoned *znd = seqf->private;
+
+ seq_printf(seqf, "On device: %s\n", _zdisk(znd));
+ seq_printf(seqf, "Data Zones: %u\n", znd->data_zones);
+ seq_printf(seqf, "Empty Zones: %u\n", znd->z_gc_free);
+ seq_printf(seqf, "Cached Pages: %u\n", znd->mc_entries);
+ seq_printf(seqf, "Discard Pages: %u\n", znd->dc_entries);
+ seq_printf(seqf, "ZTL Pages: %d\n", atomic_read(&znd->incore));
+ seq_printf(seqf, " in ZTL: %d\n", znd->in_zlt);
+ seq_printf(seqf, " in LZY: %d\n", znd->in_lzy);
+ seq_printf(seqf, "RAM in Use: %lu\n", znd->memstat);
+ seq_printf(seqf, "Zones GC'd: %u\n", znd->gc_events);
+ seq_printf(seqf, "GC Throttle: %d\n", atomic_read(&znd->gc_throttle));
+#if ALLOC_DEBUG
+ seq_printf(seqf, "Max Allocs: %u\n", znd->hw_allocs);
+#endif
+
+ return 0;
+}
+
+/**
+ * zdm_info_open() - Open seq_file from file.
+ * @inode: Our private data is stashed here; retrieve it.
+ * @file: file object used by seq_file.
+ */
+static int zdm_info_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zdm_info_show, PDE_DATA(inode));
+}
+
+/**
+ * zdm_info_fops() - File operations to chain to zdm_info_open.
+ */
+static const struct file_operations zdm_info_fops = {
+ .open = zdm_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * zdm_create_proc_entries() - Create proc entries for ZDM utilities
+ * @znd: ZDM Instance
+ */
+static int zdm_create_proc_entries(struct zoned *znd)
+{
+ snprintf(znd->proc_name, sizeof(znd->proc_name), "zdm_%s", _zdisk(znd));
+
+ znd->proc_fs = proc_mkdir(znd->proc_name, NULL);
+ if (!znd->proc_fs)
+ return -ENOMEM;
+
+ proc_create_data(PROC_WP, 0, znd->proc_fs, &zdm_wp_fops, znd);
+ proc_create_data(PROC_FREE, 0, znd->proc_fs, &zdm_used_fops, znd);
+ proc_create_data(PROC_DATA, 0, znd->proc_fs, &zdm_status_fops, znd);
+ proc_create_data(PROC_STATUS, 0, znd->proc_fs, &zdm_info_fops, znd);
+
+ return 0;
+}
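+
+/*
+ * For a backing device reported as "sdX" by _zdisk() (name used here only
+ * as an example), the entries created above are:
+ *
+ *	/proc/zdm_sdX/wp.bin   - per zone write pointer state (binary)
+ *	/proc/zdm_sdX/free.bin - per zone free block estimate (binary)
+ *	/proc/zdm_sdX/data.bin - struct zdm_ioc_status (binary)
+ *	/proc/zdm_sdX/status   - human readable summary
+ */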
+
+/**
+ * zdm_remove_proc_entries() - Remove proc entries
+ * @znd: ZDM Instance
+ */
+static void zdm_remove_proc_entries(struct zoned *znd)
+{
+ remove_proc_subtree(znd->proc_name, NULL);
+}
+
+#else /* !CONFIG_PROC_FS */
+
+static int zdm_create_proc_entries(struct zoned *znd)
+{
+ (void)znd;
+ return 0;
+}
+static void zdm_remove_proc_entries(struct zoned *znd)
+{
+ (void)znd;
+}
+#endif /* CONFIG_PROC_FS */
+
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static void start_worker(struct zoned *znd)
+{
+ clear_bit(ZF_FREEZE, &znd->flags);
+ atomic_set(&znd->suspended, 0);
+ mod_timer(&znd->timer, jiffies + msecs_to_jiffies(5000));
+}
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static void stop_worker(struct zoned *znd)
+{
+ set_bit(ZF_FREEZE, &znd->flags);
+ atomic_set(&znd->suspended, 1);
+ zoned_io_flush(znd);
+}
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static void zoned_postsuspend(struct dm_target *ti)
+{
+ struct zoned *znd = ti->private;
+
+ stop_worker(znd);
+}
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static void zoned_resume(struct dm_target *ti)
+{
+ /* TODO */
+}
+
+/**
+ * zoned_message() - Handle a dmsetup message (no messages currently handled).
+ * @ti: DM Target
+ * @argc: Number of message arguments
+ * @argv: Message arguments
+ */
+static int zoned_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct zoned *znd = ti->private;
+ int iter;
+
+ for (iter = 0; iter < argc; iter++)
+		Z_ERR(znd, "Message: %s not handled.", argv[iter]);
+
+ return 0;
+}
+
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static int zoned_preresume(struct dm_target *ti)
+{
+ struct zoned *znd = ti->private;
+
+ start_worker(znd);
+ return 0;
+}
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static struct target_type zoned_target = {
+ .name = "zoned",
+ .module = THIS_MODULE,
+ .version = {1, 0, 0},
+ .ctr = zoned_ctr,
+ .dtr = zoned_dtr,
+ .map = zoned_map,
+ .end_io = zoned_endio,
+ .postsuspend = zoned_postsuspend,
+ .preresume = zoned_preresume,
+ .resume = zoned_resume,
+ .status = zoned_status,
+ .message = zoned_message,
+ .iterate_devices = zoned_iterate_devices,
+ .io_hints = zoned_io_hints
+};
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static int __init dm_zoned_init(void)
+{
+ int rcode = dm_register_target(&zoned_target);
+
+ if (rcode)
+ DMERR("zoned target registration failed: %d", rcode);
+
+ return rcode;
+}
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+static void __exit dm_zoned_exit(void)
+{
+ dm_unregister_target(&zoned_target);
+}
+
+module_init(dm_zoned_init);
+module_exit(dm_zoned_exit);
+
+MODULE_DESCRIPTION(DM_NAME " zoned target for Host Aware/Managed drives.");
+MODULE_AUTHOR("Shaun Tancheff <shaun.tancheff@seagate.com>");
+MODULE_LICENSE("GPL");
new file mode 100644
@@ -0,0 +1,714 @@
+/*
+ * Kernel Device Mapper for abstracting ZAC/ZBC devices as normal
+ * block devices for linux file systems.
+ *
+ * Copyright (C) 2015 Seagate Technology PLC
+ *
+ * Written by:
+ * Shaun Tancheff <shaun.tancheff@seagate.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _DM_ZONED_H
+#define _DM_ZONED_H
+
+#define ALLOC_DEBUG 0
+#define USE_KTHREAD 0
+#define KFIFO_SIZE (1 << 14)
+
+#define NORMAL 0
+#define CRIT GFP_NOIO
+
+#define DM_MSG_PREFIX "zoned"
+
+#define PROC_WP "wp.bin"
+#define PROC_FREE "free.bin"
+#define PROC_DATA "data.bin"
+#define PROC_STATUS "status"
+
+#define ZDM_RESERVED_ZNR 0
+#define ZDM_CRC_STASH_ZNR 1 /* first 64 blocks */
+#define ZDM_RMAP_ZONE 2
+#define ZDM_SECTOR_MAP_ZNR 3
+#define ZDM_DATA_START_ZNR 4
+
+#define Z_WP_GC_FULL (1u << 31)
+#define Z_WP_GC_ACTIVE (1u << 30)
+#define Z_WP_GC_TARGET (1u << 29)
+#define Z_WP_GC_READY (1u << 28)
+#define Z_WP_GC_BITS (0xFu << 28)
+
+#define Z_WP_GC_PENDING (Z_WP_GC_FULL|Z_WP_GC_ACTIVE)
+#define Z_WP_NON_SEQ (1u << 27)
+#define Z_WP_RRECALC (1u << 26)
+#define Z_WP_RESV_02 (1u << 25)
+#define Z_WP_RESV_03 (1u << 24)
+
+#define Z_WP_VALUE_MASK (~0u >> 8)
+#define Z_WP_FLAGS_MASK (~0u << 24)
+#define Z_WP_STREAM_MASK Z_WP_FLAGS_MASK
+
+#define Z_AQ_GC (1u << 31)
+#define Z_AQ_META (1u << 30)
+#define Z_AQ_NORMAL (1u << 29)
+#define Z_AQ_STREAM_ID (1u << 28)
+#define Z_AQ_STREAM_MASK (0xFF)
+#define Z_AQ_META_STREAM (Z_AQ_META | Z_AQ_STREAM_ID | 0xFE)
+
+#define Z_C4K (4096ul)
+#define Z_UNSORTED (Z_C4K / sizeof(struct map_sect_to_lba))
+#define Z_BLOCKS_PER_DM_SECTOR (Z_C4K/512)
+#define MZ_METADATA_ZONES (8ul)
+#define Z_SHFT4K (3)
+
+
+#define LBA_SB_START 1
+
+#define SUPERBLOCK_LOCATION 0
+#define SUPERBLOCK_MAGIC 0x5a6f4e65ul /* ZoNe */
+#define SUPERBLOCK_CSUM_XOR 146538381
+#define MIN_ZONED_VERSION 1
+#define Z_VERSION 1
+#define MAX_ZONED_VERSION 1
+#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION
+
+#define UUID_LEN 16
+
+#define Z_TYPE_SMR 2
+#define Z_TYPE_SMR_HA 1
+#define Z_VPD_INFO_BYTE 8
+
+#define MAX_CACHE_INCR 320ul
+#define CACHE_COPIES 3
+#define MAX_MZ_SUPP 64
+#define FWD_TM_KEY_BASE 4096ul
+
+#define IO_VCACHE_ORDER 8
+#define IO_VCACHE_PAGES (1 << IO_VCACHE_ORDER) /* 256 pages => 1MiB */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+enum superblock_flags_t {
+ SB_DIRTY = 1,
+};
+
+struct z_io_req_t {
+ struct dm_io_region *where;
+ struct dm_io_request *io_req;
+ struct work_struct work;
+ int result;
+};
+
+#define Z_LOWER48 (~0ul >> 16)
+#define Z_UPPER16 (~Z_LOWER48)
+
+#define STREAM_SIZE 256
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+/**
+ * enum pg_flag_enum - Map Pg flags
+ * @IS_DIRTY: Block is modified from on-disk copy.
+ * @IS_STALE: ??
+ *
+ * @IS_FWD: Is part of Forward ZLT or CRC table.
+ * @IS_REV: Is part of Reverse ZLT or CRC table.
+ * @IS_CRC: Is part of CRC table.
+ * @IS_LUT: Is part of ZTL table.
+ *
+ * @R_IN_FLIGHT: Async read in progress.
+ * @W_IN_FLIGHT: Async write in progress.
+ * @DELAY_ADD: Spinlock was busy for zltlst, added to lazy for transit.
+ * @STICKY: ???
+ * @IS_READA: Block will be pulled for Read Ahead. Cleared when used.
+ * @IS_DROPPED: Has been cleaned/expired from zlt_lst and is awaiting free().
+ *
+ * @IS_LAZY: Has been added to the 'lazy' lzy_lst.
+ * @IN_ZLT: Has been added to the 'inpool' zlt_lst.
+ *
+ */
+enum pg_flag_enum {
+ IS_DIRTY,
+ IS_STALE,
+ IS_FLUSH,
+
+ IS_FWD,
+ IS_REV,
+ IS_CRC,
+ IS_LUT,
+
+ R_IN_FLIGHT,
+ W_IN_FLIGHT,
+ DELAY_ADD,
+ STICKY,
+ IS_READA,
+ IS_DROPPED,
+ IS_LAZY,
+ IN_ZLT,
+};
+
+/**
+ * enum gc_flags_enum - Garbage Collection [GC] states.
+ */
+enum gc_flags_enum {
+ DO_GC_NEW,
+ DO_GC_PREPARE, /* -> READ or COMPLETE state */
+ DO_GC_WRITE,
+ DO_GC_META, /* -> PREPARE state */
+ DO_GC_COMPLETE,
+};
+
+/**
+ * enum znd_flags_enum - zoned state/feature/action flags
+ */
+enum znd_flags_enum {
+ ZF_FREEZE,
+ ZF_POOL_FWD,
+ ZF_POOL_REV,
+ ZF_POOL_CRCS,
+
+ ZF_RESV_1,
+ ZF_RESV_2,
+ ZF_RESV_3,
+ ZF_RESV_4,
+
+ DO_JOURNAL_MOVE,
+ DO_MEMPOOL,
+ DO_SYNC,
+ DO_FLUSH,
+ DO_JOURNAL_LOAD,
+ DO_GC_NO_PURGE,
+ DO_METAWORK_QD,
+};
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+struct zoned;
+
+/**
+ * struct gc_state - Garbage collection state for a zone being compacted.
+ * @znd: ZDM Instance
+ * @gc_flags: See gc_flags_enum
+ * @r_ptr: Next read in zone.
+ * @w_ptr: Next write in target zone.
+ * @nblks: Number of blocks in I/O
+ * @z_gc: Zone undergoing compaction
+ * @result: GC operation result.
+ */
+struct gc_state {
+ struct zoned *znd;
+ unsigned long gc_flags;
+
+ u32 r_ptr;
+ u32 w_ptr;
+
+ u32 nblks; /* 1-65536 */
+ u32 z_gc;
+
+ int result;
+};
+
+/**
+ * struct map_addr - A map address decomposed for table lookup.
+ * @dm_s: full map address on the dm layer
+ * @lut_s: (forward) sector lookup table lba
+ * @lut_r: reverse lookup table lba
+ * @zone_id: z_id, matches zone_list_t.z_id
+ * @pg_idx: entry in lut (0-1023)
+ */
+struct map_addr {
+ u64 dm_s;
+ u64 lut_s;
+ u64 lut_r;
+
+ u32 zone_id;
+ u32 pg_idx;
+};
+
+/**
+ * struct map_sect_to_lba - Sector to LBA mapping entry.
+ * @tlba: record type [16 bits] + target logical sector
+ * @bval: csum [16 bits] + 'physical' block lba (or number of blocks)
+ */
+struct map_sect_to_lba {
+ __le64 tlba; /* record type [16 bits] + logical sector # */
+ __le64 bval; /* csum 16 [16 bits] + 'physical' block lba */
+} __packed;
+
+/**
+ * enum map_type_enum - Type of content held in a map_cache page.
+ */
+enum map_type_enum {
+ IS_MAP,
+ IS_DISCARD,
+ MAP_COUNT,
+};
+
+/**
+ * struct map_cache - A cached page of map (or discard) entries.
+ * @mclist: list node for the per-type cache list
+ * @jdata: 4k page of map_sect_to_lba entries
+ * @refcount: reference count
+ * @cached_lock: mutex protecting @jdata
+ * @busy_locked: busy counter
+ * @jcount: number of entries in use
+ * @jsorted: number of entries currently sorted
+ * @jsize: number of entries per page
+ * @map_content: See map_type_enum (IS_MAP or IS_DISCARD)
+ */
+struct map_cache {
+ struct list_head mclist;
+ struct map_sect_to_lba *jdata;
+ atomic_t refcount;
+ struct mutex cached_lock;
+ atomic_t busy_locked;
+ u32 jcount;
+ u32 jsorted;
+ u32 jsize;
+	u32 map_content; /* IS_MAP or IS_DISCARD */
+};
+
+
+union map_pg_data {
+ __le32 *addr;
+ __le16 *crc;
+};
+
+
+/**
+ * struct map_pg - A page of map table
+ *
+ * @data: 4k page of table entries (see union map_pg_data)
+ * @refcount: reference count
+ * @age: most recent access in jiffies
+ * @lba: logical position (use lookups to find actual)
+ * @last_write: last known position on disk
+ * @md_lock: lock for data i/o
+ * @flags: is dirty flag, is fwd, is rev, is crc, is lut
+ * @zltlst: list node for the zlt pool
+ * @lazy: list node for the lazy pool
+ */
+struct map_pg {
+ union map_pg_data data;
+ atomic_t refcount;
+ u64 age;
+ u64 lba;
+ u64 last_write;
+ struct mutex md_lock;
+ unsigned long flags;
+ struct list_head zltlst;
+
+ /* in flight tracking */
+ struct list_head lazy;
+ struct zoned *znd;
+ struct map_pg *crc_pg;
+ int hotness;
+ int index;
+ __le16 md_crc;
+};
+
+/**
+ * struct map_crc - Map to backing crc16.
+ *
+ * @table: CRC page table (NULL when entries come from zoned::md_crcs)
+ * @pg_no: Page [index] of table entry if applicable.
+ * @pg_idx: Offset within page (from zoned::md_crcs when table is NULL)
+ */
+struct map_crc {
+ struct map_pg **table;
+ int pg_no;
+ int pg_idx;
+};
+
+/**
+ * struct mpinfo - Map to backing lookup table.
+ *
+ * @table: backing table
+ * @crc: backing crc16 detail.
+ * @index: index [page] of table entry. Use map_addr::pg_idx for offset.
+ * @bit_type: IS_LUT or IS_CRC
+ * @bit_dir: IS_FWD or IS_REV
+ */
+struct mpinfo {
+ struct map_pg **table;
+ struct map_crc crc;
+ int index;
+ int bit_type;
+ int bit_dir;
+};
+
+
+/**
+ * struct meta_pg - A page of zone WP mapping.
+ *
+ * @wp_alloc: Bits 23-0: wp alloc location. Bits 31-24: GC Flags, Type Flags
+ * @zf_est: Bits 23-0: free block count. Bits 31-24: Stream Id
+ * @wp_used: Bits 23-0: wp written location. Bits 31-24: Ratio ReCalc flag.
+ * @lba: pinned LBA in conventional/preferred zone.
+ * @wplck: spinlock held during data updates.
+ * @flags: IS_DIRTY flag
+ *
+ * One page is used per 1024 zones on media.
+ * For an 8TB drive this uses 30 entries or about 360k RAM.
+ */
+struct meta_pg {
+ __le32 *wp_alloc;
+ __le32 *zf_est;
+ __le32 *wp_used;
+ u64 lba;
+ spinlock_t wplck;
+ unsigned long flags;
+};
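+
+/*
+ * Decoding sketch: the low 24 bits of each entry hold the value and the
+ * high 8 bits hold flags (or the stream id for zf_est), e.g.
+ *
+ *	u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ *	u32 blocks = wp & Z_WP_VALUE_MASK;	(bits 23..0)
+ *	u32 flags  = wp & Z_WP_FLAGS_MASK;	(bits 31..24)
+ */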
+
+/**
+ * struct zdm_superblock - On-media superblock.
+ * @uuid:
+ * @nr_zones:
+ * @magic:
+ * @zdstart:
+ * @version:
+ * @packed_meta:
+ * @flags:
+ * @csum:
+ */
+struct zdm_superblock {
+ u8 uuid[UUID_LEN]; /* 16 */
+ __le64 nr_zones; /* 8 */
+ __le64 magic; /* 8 */
+ __le32 resvd; /* 4 */
+ __le32 zdstart; /* 4 */
+ __le32 version; /* 4 */
+ __le32 packed_meta; /* 4 */
+ __le32 flags; /* 4 */
+ __le32 csum; /* 4 */
+} __packed; /* 56 */
+
+#define MAX_CACHE_SYNC 400
+
+/**
+ * struct mz_superkey - Superblock plus key metadata (one 4k block)
+ * @sig0: 8 - Native endian
+ * @sig1: 8 - Little endian
+ * @sblock: 56 -
+ * @stream: 1024 -
+ * @gc_resv: 4 -
+ * @meta_resv: 4 -
+ * @n_crcs: 2 -
+ * @crcs: 800 -
+ * @md_crc: 2 -
+ * @wp_crc: 128 -
+ * @zf_crc: 128 -
+ * @discards: 2 -
+ * @maps: 2 -
+ * @reserved: 1908 - Padding out to a full 4k block.
+ * @crc32: 4 -
+ * @generation: 8 -
+ * @magic: 8 -
+ */
+struct mz_superkey {
+ u64 sig0;
+ __le64 sig1;
+ struct zdm_superblock sblock;
+ __le32 stream[STREAM_SIZE];
+ __le32 gc_resv;
+ __le32 meta_resv;
+ __le16 n_crcs;
+ __le16 crcs[MAX_CACHE_SYNC];
+ __le16 md_crc;
+ __le16 wp_crc[64];
+ __le16 zf_crc[64];
+ __le16 discards;
+ __le16 maps;
+ u8 reserved[1908];
+ __le32 crc32;
+ __le64 generation;
+ __le64 magic;
+} __packed;
+
+/**
+ * struct io_4k_block - A 4k block of I/O data.
+ * @data: A 4096 byte block
+ */
+struct io_4k_block {
+ u8 data[Z_C4K];
+};
+
+/**
+ * struct io_dm_block - A 512 byte block of I/O data.
+ * @data: A 512 byte block
+ */
+struct io_dm_block {
+ u8 data[512];
+};
+
+struct stale_tracking {
+ u32 binsz;
+ u32 count;
+ int bins[STREAM_SIZE];
+};
+
+
+/*
+ *
+ * Partition -----------------------------------------------------------------+
+ * Table ---+ |
+ * | |
+ * SMR Drive |^-------------------------------------------------------------^|
+ * CMR Zones ^^^^^^^^^
+ * meta data |||||||||
+ *
+ * Remaining partition is filesystem data
+ *
+ */
+
+/**
+ * struct zoned - ZDM instance state (one per DM target).
+ * @ti: dm_target entry
+ * @dev: dm_dev entry
+ * @mclist: list of pages of in-memory LBA mappings.
+ * @mclck: in memory map-cache lock (spinlock)
+ *
+ * @zltpool: pages of lookup table entries
+ * @zlt_lck: zltpool: memory pool lock
+ * @lzy_pool:
+ * @lzy_lock:
+ * @fwd_tm:
+ * @rev_tm:
+ *
+ * @bg_work: background worker (periodic GC/Mem purge)
+ * @bg_wq: background work queue
+ * @stats_lock:
+ * @gc_active: Current GC state
+ * @gc_lock: GC Lock
+ * @gc_work: GC Worker
+ * @gc_wq: GC Work Queue
+ * @data_zones: # of data zones on device
+ * @gz_count: # of 256G mega-zones
+ * @nr_blocks: 4k blocks on backing device
+ * @md_start: LBA at start of metadata pool
+ * @data_lba: LBA at start of data pool
+ * @zdstart: ZONE # at start of data pool (first Possible WP ZONE)
+ * @start_sect: where ZDM partition starts (RAW LBA)
+ * @flags: See: enum znd_flags_enum
+ * @gc_backlog:
+ * @gc_io_buf:
+ * @io_vcache[32]:
+ * @io_vcache_flags:
+ * @z_sballoc:
+ * @super_block:
+ * @meta_wq:
+ * @gc_postmap:
+ * @io_client:
+ * @io_wq:
+ * @zone_action_wq:
+ * @timer:
+ * @bins: Memory usage accounting/reporting.
+ * @bdev_name:
+ * @memstat:
+ * @suspended:
+ * @mz_provision: Number of zones per 1024 of over-provisioning.
+ * @ata_passthrough:
+ * @is_empty: For fast discards on initial format
+ */
+struct zoned {
+ struct dm_target *ti;
+ struct dm_dev *dev;
+
+ struct list_head zltpool;
+ struct list_head lzy_pool;
+ struct list_head mclist[MAP_COUNT];
+ spinlock_t mclck[MAP_COUNT];
+
+ struct work_struct bg_work;
+ struct workqueue_struct *bg_wq;
+
+ spinlock_t zlt_lck;
+ spinlock_t lzy_lck;
+ spinlock_t stats_lock;
+ spinlock_t mapkey_lock; /* access LUT and CRC array of pointers */
+ spinlock_t ct_lock; /* access LUT and CRC array of pointers */
+
+ struct mutex gc_wait;
+ struct mutex pool_mtx;
+ struct mutex mz_io_mutex;
+ struct mutex vcio_lock;
+
+#if USE_KTHREAD /* experimental++ */
+ struct task_struct *bio_kthread;
+ DECLARE_KFIFO_PTR(bio_fifo, struct bio *);
+ wait_queue_head_t wait_bio;
+ wait_queue_head_t wait_fifo;
+#endif
+ struct bio_set *bio_set;
+
+ struct gc_state *gc_active;
+ spinlock_t gc_lock;
+ struct delayed_work gc_work;
+ struct workqueue_struct *gc_wq;
+
+ u64 nr_blocks;
+ u64 start_sect;
+
+ u64 md_start;
+ u64 md_end;
+ u64 data_lba;
+
+ unsigned long flags;
+
+ u64 r_base;
+ u64 s_base;
+ u64 c_base;
+ u64 c_mid;
+ u64 c_end;
+
+ u64 sk_low; /* unused ? */
+ u64 sk_high; /* unused ? */
+
+ struct meta_pg *wp;
+
+ struct map_pg **fwd_tm; /* nr_blocks / 1024 */
+ struct map_pg **rev_tm; /* nr_blocks / 1024 */
+ struct map_pg **fwd_crc; /* (nr_blocks / 1024) / 2048 */
+ struct map_pg **rev_crc; /* (nr_blocks / 1024) / 2048 */
+ __le16 *md_crcs; /* one of crc16's for fwd, 1 for rev */
+ u32 crc_count;
+ u32 map_count;
+
+ void *z_sballoc;
+ struct mz_superkey *bmkeys;
+ struct zdm_superblock *super_block;
+
+ struct work_struct meta_work;
+ sector_t last_w;
+ u8 *cow_block;
+ u64 cow_addr;
+ u32 data_zones;
+ u32 gz_count;
+ u32 zdstart;
+ u32 z_gc_free;
+ atomic_t incore;
+ u32 discard_count;
+ u32 z_current;
+ u32 z_meta_resv;
+ u32 z_gc_resv;
+ u32 gc_events;
+ int mc_entries;
+ int dc_entries;
+ int in_zlt;
+ int in_lzy;
+ int meta_result;
+ struct stale_tracking stale;
+
+ int gc_backlog;
+ void *gc_io_buf;
+ struct mutex gc_vcio_lock;
+ struct io_4k_block *io_vcache[32];
+ unsigned long io_vcache_flags;
+ u64 age;
+ struct workqueue_struct *meta_wq;
+ struct map_cache gc_postmap;
+ struct dm_io_client *io_client;
+ struct workqueue_struct *io_wq;
+ struct workqueue_struct *zone_action_wq;
+ struct timer_list timer;
+
+ u32 bins[40];
+ char bdev_name[BDEVNAME_SIZE];
+ char proc_name[BDEVNAME_SIZE+4];
+ struct proc_dir_entry *proc_fs;
+ loff_t wp_proc_at;
+ loff_t used_proc_at;
+
+ size_t memstat;
+ atomic_t suspended;
+ atomic_t gc_throttle;
+
+#if ALLOC_DEBUG
+ atomic_t allocs;
+ int hw_allocs;
+ void **alloc_trace;
+#endif
+ u32 filled_zone;
+ u16 mz_provision;
+ unsigned bdev_is_zoned:1;
+ unsigned ata_passthrough:1;
+ unsigned issue_open_zone:1;
+ unsigned issue_close_zone:1;
+ unsigned is_empty:1;
+ unsigned trim:1;
+
+};
+
+/**
+ * struct zdm_ioc_request - Status request from user space tools (zdm-tools).
+ * @result_size:
+ * @megazone_nr:
+ */
+struct zdm_ioc_request {
+ u32 result_size;
+ u32 megazone_nr;
+};
+
+/**
+ * struct zdm_ioc_status - Sector to LBA mapping.
+ * @b_used: Number of blocks used
+ * @b_available: Number of blocks free
+ * @b_discard: Number of blocks stale
+ * @m_zones: Number of zones.
+ * @mc_entries: Mem cache blocks in use
+ * @dc_entries: Discard cache blocks in use.
+ * @mlut_blocks:
+ * @crc_blocks:
+ * @memstat: Total memory in use by ZDM via *alloc()
+ * @bins: Allocation by subsystem.
+ *
+ * This status structure is used to pass run-time information to
+ * user space tools (zdm-tools) for diagnostics and tuning.
+ */
+struct zdm_ioc_status {
+ u64 b_used;
+ u64 b_available;
+ u64 b_discard;
+ u64 m_zones;
+ u32 mc_entries;
+ u32 dc_entries;
+ u64 mlut_blocks;
+ u64 crc_blocks;
+ u64 memstat;
+ u32 bins[40];
+};
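+
+/*
+ * This structure is written verbatim to the data.bin proc entry; a
+ * user-space consumer (sketch only) can read it back in one shot:
+ *
+ *	struct zdm_ioc_status st;
+ *
+ *	if (fread(&st, sizeof(st), 1, fp) == 1)
+ *		printf("used %llu avail %llu\n",
+ *		       (unsigned long long)st.b_used,
+ *		       (unsigned long long)st.b_available);
+ */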
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DM_ZONED_H */
new file mode 100644
@@ -0,0 +1,7149 @@
+/*
+ * Kernel Device Mapper for abstracting ZAC/ZBC devices as normal
+ * block devices for linux file systems.
+ *
+ * Copyright (C) 2015 Seagate Technology PLC
+ *
+ * Written by:
+ * Shaun Tancheff <shaun.tancheff@seagate.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#define BUILD_NO 108
+
+#define EXTRA_DEBUG 0
+#define ENABLE_PG_FREE_VIA_LAZY 1
+
+#define MZ_MEMPOOL_SZ 512
+#define READ_AHEAD 16 /* number of LUT entries to read-ahead */
+#define MEMCACHE_HWMARK 5
+#define MEM_PURGE_MSECS 15000
+#define MEM_HOT_BOOST_INC 5000
+#define DISCARD_IDLE_MSECS 2000
+#define DISCARD_MAX_INGRESS 150
+#define MAX_WSET 2048
+
+
+/* acceptable 'free' count for MZ free levels */
+#define GC_PRIO_DEFAULT 0xFF00
+#define GC_PRIO_LOW 0x7FFF
+#define GC_PRIO_HIGH 0x0400
+#define GC_PRIO_CRIT 0x0040
+
+/* When less than 20 zones are free use aggressive gc in the megazone */
+#define GC_COMPACT_AGGRESSIVE 32
+
+/*
+ * For performance tuning:
+ * Q? smaller strips give smoother performance
+ * a single drive I/O is 8 (or 32?) blocks?
+ * A? Does not seem to ...
+ */
+#define GC_MAX_STRIPE 256
+#define REPORT_ORDER 7
+#define REPORT_FILL_PGS 65 /* 65 -> min # pages for 4096 descriptors */
+#define SYNC_IO_ORDER 2
+#define SYNC_IO_SZ ((1 << SYNC_IO_ORDER) * PAGE_SIZE)
+
+#define MZTEV_UNUSED (cpu_to_le32(0xFFFFFFFFu))
+#define MZTEV_NF (cpu_to_le32(0xFFFFFFFEu))
+
+#define Z_TABLE_MAGIC 0x123456787654321Eul
+#define Z_KEY_SIG 0xFEDCBA987654321Ful
+
+#define Z_CRC_4K 4096
+#define ZONE_SECT_BITS 19
+#define Z_BLKBITS 16
+#define Z_BLKSZ (1ul << Z_BLKBITS)
+#define MAX_ZONES_PER_MZ 1024
+#define Z_SMR_SZ_BYTES (Z_C4K << Z_BLKBITS)
+
+#define GC_READ (1ul << 15)
+#define BAD_ADDR (~0ul)
+#define MC_INVALID (cpu_to_le64(BAD_ADDR))
+#define NOZONE (~0u)
+
+#define GZ_BITS 10
+#define GZ_MMSK ((1u << GZ_BITS) - 1)
+
+#define CRC_BITS 11
+#define CRC_MMSK ((1u << CRC_BITS) - 1)
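+
+/*
+ * Example: zone 1234 splits into gzno = 1234 >> GZ_BITS = 1 and
+ * gzoff = 1234 & GZ_MMSK = 210, i.e. entry 210 of the second meta_pg.
+ */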
+
+#define MD_CRC_INIT (cpu_to_le16(0x5249u))
+
+static int map_addr_calc(struct zoned *, u64 dm_s, struct map_addr *out);
+static int zoned_io_flush(struct zoned *znd);
+static int zoned_wp_sync(struct zoned *znd, int reset_non_empty);
+static void cache_if_dirty(struct zoned *znd, struct map_pg *pg, int wq);
+static int write_if_dirty(struct zoned *, struct map_pg *, int use_wq, int snc);
+static void gc_work_task(struct work_struct *work);
+static void meta_work_task(struct work_struct *work);
+static u64 mcache_greatest_gen(struct zoned *, int, u64 *, u64 *);
+static u64 mcache_find_gen(struct zoned *, u64 base, int, u64 *out);
+static int find_superblock(struct zoned *znd, int use_wq, int do_init);
+static int sync_mapped_pages(struct zoned *znd, int sync, int drop);
+static int unused_phy(struct zoned *znd, u64 lba, u64 orig_s, gfp_t gfp);
+static struct io_4k_block *get_io_vcache(struct zoned *znd, gfp_t gfp);
+static int put_io_vcache(struct zoned *znd, struct io_4k_block *cache);
+static struct map_pg *get_map_entry(struct zoned *, u64 lba, gfp_t gfp);
+static void put_map_entry(struct map_pg *);
+static int cache_pg(struct zoned *znd, struct map_pg *pg, gfp_t gfp,
+ struct mpinfo *mpi);
+static int move_to_map_tables(struct zoned *znd, struct map_cache *mcache);
+static void update_stale_ratio(struct zoned *znd, u32 zone);
+static int zoned_create_disk(struct dm_target *ti, struct zoned *znd);
+static int do_init_zoned(struct dm_target *ti, struct zoned *znd);
+static int z_discard_partial(struct zoned *znd, u32 blks, gfp_t gfp);
+static u64 z_discard_range(struct zoned *znd, u64 addr, gfp_t gfp);
+static u64 z_lookup_cache(struct zoned *znd, u64 addr, int type);
+static u64 z_lookup_table(struct zoned *znd, u64 addr, gfp_t gfp);
+static u64 current_mapping(struct zoned *znd, u64 addr, gfp_t gfp);
+static int z_mapped_add_one(struct zoned *znd, u64 dm_s, u64 lba, gfp_t gfp);
+static int z_mapped_discard(struct zoned *znd, u64 tlba, u64 count, gfp_t gfp);
+static int z_mapped_addmany(struct zoned *znd, u64 dm_s, u64 lba, u64,
+ gfp_t gfp);
+static int z_to_map_list(struct zoned *znd, u64 dm_s, u64 lba, gfp_t gfp);
+static int z_to_discard_list(struct zoned *znd, u64 dm_s, u64 blks, gfp_t gfp);
+static int discard_merge(struct zoned *znd, u64 tlba, u64 blks);
+static int z_mapped_sync(struct zoned *znd);
+static int z_mapped_init(struct zoned *znd);
+static u64 z_acquire(struct zoned *znd, u32 flags, u32 nblks, u32 *nfound);
+static __le32 sb_crc32(struct zdm_superblock *sblock);
+static int update_map_entry(struct zoned *, struct map_pg *,
+ struct map_addr *, u64, int);
+static int read_block(struct dm_target *, enum dm_io_mem_type,
+ void *, u64, unsigned int, int);
+static int write_block(struct dm_target *, enum dm_io_mem_type,
+ void *, u64, unsigned int, int);
+static int writef_block(struct dm_target *ti, int rw, enum dm_io_mem_type dtype,
+ void *data, u64 lba, unsigned int count, int queue);
+static int zoned_init_disk(struct dm_target *ti, struct zoned *znd,
+ int create, int force);
+
+#define MutexLock(m) test_and_lock((m), __LINE__)
+#define SpinLock(s) test_and_spin((s), __LINE__)
+
+static __always_inline void test_and_lock(struct mutex *m, int lineno)
+{
+ if (!mutex_trylock(m)) {
+ pr_debug("mutex stall at %d\n", lineno);
+ mutex_lock(m);
+ }
+}
+
+static __always_inline void test_and_spin(spinlock_t *lock, int lineno)
+{
+ if (!spin_trylock(lock)) {
+ pr_debug("spin stall at %d\n", lineno);
+ spin_lock(lock);
+ }
+}
+
+/**
+ * deref_pg() - Decrement refcount on page of ZLT
+ * @pg: Page of ZLT map
+ */
+static __always_inline void deref_pg(struct map_pg *pg)
+{
+ atomic_dec(&pg->refcount);
+}
+
+/**
+ * ref_pg() - Increment refcount on page of ZLT
+ * @pg: Page of ZLT map
+ */
+static __always_inline void ref_pg(struct map_pg *pg)
+{
+ atomic_inc(&pg->refcount);
+#if 0 /* ALLOC_DEBUG */
+ if (atomic_read(&pg->refcount) > 20) {
+ pr_err("Excessive %d who dunnit?\n",
+ atomic_read(&pg->refcount));
+ dump_stack();
+ }
+#endif
+}
+
+/**
+ * getref_pg() - Read the refcount
+ * @pg: Page of ZLT map
+ */
+static __always_inline int getref_pg(struct map_pg *pg)
+{
+ return atomic_read(&pg->refcount);
+}
+
+
+/**
+ * mcache_ref() - Increment the reference count of mcache()
+ * @mcache: Page of map cache
+ */
+static __always_inline void mcache_ref(struct map_cache *mcache)
+{
+ atomic_inc(&mcache->refcount);
+}
+
+/**
+ * mcache_deref() - Decrement the reference count of mcache()
+ * @mcache: Page of map cache
+ */
+static __always_inline void mcache_deref(struct map_cache *mcache)
+{
+ atomic_dec(&mcache->refcount);
+}
+
+/**
+ * mcache_getref() - Read the refcount
+ * @mcache: Page of map cache
+ */
+static __always_inline int mcache_getref(struct map_cache *mcache)
+{
+ return atomic_read(&mcache->refcount);
+}
+
+/**
+ * mcache_busy() - Increment the busy test of mcache()
+ * @mcache: Page of map cache
+ */
+static __always_inline void mcache_busy(struct map_cache *mcache)
+{
+ atomic_inc(&mcache->busy_locked);
+}
+
+/**
+ * mcache_unbusy() - Decrement the busy test count of mcache()
+ * @mcache: Page of map cache
+ */
+static __always_inline void mcache_unbusy(struct map_cache *mcache)
+{
+ atomic_dec(&mcache->busy_locked);
+}
+
+/**
+ * mcache_is_busy() - Test if mcache is busy
+ * @mcache: Page of map cache
+ * Return non-zero if busy otherwise 0.
+ */
+static __always_inline int mcache_is_busy(struct map_cache *mcache)
+{
+ return atomic_read(&mcache->busy_locked);
+}
+
+/**
+ * crc16_md() - 16 bit CRC on metadata blocks
+ * @data: Block of metadata.
+ * @len: Number of bytes in block.
+ *
+ * Return: 16 bit CRC.
+ */
+static inline u16 crc16_md(void const *data, size_t len)
+{
+ const u16 init = 0xFFFF;
+ const u8 *p = data;
+
+ return crc16(init, p, len);
+}
+
+/**
+ * crc_md_le16() - 16 bit CRC on metadata blocks in little endian
+ * @data: Block of metadata.
+ * @len: Number of bytes in block.
+ *
+ * Return: 16 bit CRC.
+ */
+static inline __le16 crc_md_le16(void const *data, size_t len)
+{
+ u16 crc = crc16_md(data, len);
+
+ return cpu_to_le16(crc);
+}
+
+/**
+ * crcpg() - 32 bit CRC [NOTE: 32c is HW assisted on Intel]
+ * @data: Block of metadata [4K bytes].
+ *
+ * Return: 32 bit CRC.
+ */
+static inline u32 crcpg(void *data)
+{
+ return crc32c(~0u, data, Z_CRC_4K) ^ SUPERBLOCK_CSUM_XOR;
+}
+
+/**
+ * le64_to_lba48() - Return the lower 48 bits of LBA
+ * @enc: 64 bit LBA + flags
+ * @flg: optional 16 bits of classification.
+ *
+ * Return: 48 bits of LBA [and flg].
+ */
+static inline u64 le64_to_lba48(__le64 enc, u16 *flg)
+{
+ const u64 lba64 = le64_to_cpu(enc);
+
+ if (flg)
+ *flg = (lba64 >> 48) & 0xFFFF;
+
+ return lba64 & Z_LOWER48;
+}
+
+/**
+ * lba48_to_le64() - Encode 48 bits of lba + 16 bits of flags.
+ * @flags: flags to encode.
+ * @lba48: LBA to encode
+ *
+ * Return: Little endian u64.
+ */
+static inline __le64 lba48_to_le64(u16 flags, u64 lba48)
+{
+ u64 high_bits = flags;
+
+ return cpu_to_le64((high_bits << 48) | (lba48 & Z_LOWER48));
+}
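+
+/*
+ * Round-trip sketch of the 48+16 bit packing used by map_sect_to_lba:
+ *
+ *	u16 flg;
+ *	__le64 enc = lba48_to_le64(0x0001, 0x123456789Aul);
+ *	u64 lba = le64_to_lba48(enc, &flg);
+ *
+ * leaves lba == 0x123456789A and flg == 0x0001.
+ */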
+
+/**
+ * sb_test_flag() - Test if flag is set in Superblock.
+ * @sb: zdm_superblock.
+ * @bit_no: superblock flag
+ *
+ * Return: non-zero if flag is set.
+ */
+static inline int sb_test_flag(struct zdm_superblock *sb, int bit_no)
+{
+ u32 flags = le32_to_cpu(sb->flags);
+
+ return (flags & (1 << bit_no)) ? 1 : 0;
+}
+
+/**
+ * sb_set_flag() - Set a flag in superblock.
+ * @sb: zdm_superblock.
+ * @bit_no: superblock flag
+ */
+static inline void sb_set_flag(struct zdm_superblock *sb, int bit_no)
+{
+ u32 flags = le32_to_cpu(sb->flags);
+
+ flags |= (1 << bit_no);
+ sb->flags = cpu_to_le32(flags);
+}
+
+/**
+ * zone_to_sector() - Calculate starting LBA of zone
+ * @zone: zone number (0 based)
+ *
+ * Return: LBA at start of zone.
+ */
+static inline u64 zone_to_sector(u64 zone)
+{
+ return zone << ZONE_SECT_BITS;
+}
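+
+/*
+ * e.g. zone_to_sector(4) == 4 << ZONE_SECT_BITS == 0x200000; at 2^19
+ * 512-byte sectors per zone this is the expected 256MiB spacing.
+ */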
+
+/**
+ * is_expired_msecs() - Determine if age + msecs is older than now.
+ * @age: jiffies at last access
+ * @msecs: msecs of extra time.
+ *
+ * Return: non-zero if block is expired.
+ */
+static inline int is_expired_msecs(u64 age, u32 msecs)
+{
+ u64 expire_at = age + msecs_to_jiffies(msecs);
+ int expired = time_after64(jiffies_64, expire_at);
+
+ return expired;
+}
+
+/**
+ * is_expired() - Determine if age is older than MEM_PURGE_MSECS.
+ * @age: jiffies at last access
+ *
+ * Return: non-zero if block is expired.
+ */
+static inline int is_expired(u64 age)
+{
+ return is_expired_msecs(age, MEM_PURGE_MSECS);
+}
+
+/**
+ * _calc_zone() - Determine zone number from addr
+ * @addr: 4k sector number
+ *
+ * Return: znum or 0xFFFFFFFF if addr is in metadata space.
+ */
+static inline u32 _calc_zone(struct zoned *znd, u64 addr)
+{
+ u32 znum = NOZONE;
+
+ if (addr < znd->data_lba)
+ return znum;
+
+ addr -= znd->data_lba;
+ znum = addr >> Z_BLKBITS;
+
+ return znum;
+}
+
+/**
+ * lazy_pool_add - Set a flag and add map page to the lazy pool
+ * @znd: ZDM Instance
+ * @expg: Map table page.
+ * @bit: Flag to set on page.
+ *
+ * Lazy pool is used for deferred adding and delayed removal.
+ */
+static __always_inline
+void lazy_pool_add(struct zoned *znd, struct map_pg *expg, int bit)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&znd->lzy_lck, flags);
+ if (!test_bit(IS_LAZY, &expg->flags)) {
+ set_bit(IS_LAZY, &expg->flags);
+ list_add(&expg->lazy, &znd->lzy_pool);
+ znd->in_lzy++;
+ }
+ set_bit(bit, &expg->flags);
+ spin_unlock_irqrestore(&znd->lzy_lck, flags);
+}
+
+/**
+ * lazy_pool_splice - Add a list of pages to the lazy pool
+ * @znd: ZDM Instance
+ * @list: List of map table page to add.
+ *
+ * Lazy pool is used for deferred adding and delayed removal.
+ */
+static __always_inline
+void lazy_pool_splice(struct zoned *znd, struct list_head *list)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&znd->lzy_lck, flags);
+ list_splice_tail(list, &znd->lzy_pool);
+ spin_unlock_irqrestore(&znd->lzy_lck, flags);
+}
+
+/**
+ * pool_add() - Add metadata block to zltlst
+ * @znd: ZDM instance
+ * @expg: current metadata block to add to zltlst list.
+ */
+static inline int pool_add(struct zoned *znd, struct map_pg *expg)
+{
+ int rcode = 0;
+
+ if (expg) {
+ unsigned long flags;
+
+ if (spin_trylock_irqsave(&znd->zlt_lck, flags)) {
+ if (test_bit(IN_ZLT, &expg->flags)) {
+ Z_ERR(znd, "Double list_add from:");
+ dump_stack();
+ } else {
+ set_bit(IN_ZLT, &expg->flags);
+ list_add(&expg->zltlst, &znd->zltpool);
+ znd->in_zlt++;
+ }
+ spin_unlock_irqrestore(&znd->zlt_lck, flags);
+ rcode = 0;
+ } else {
+ lazy_pool_add(znd, expg, DELAY_ADD);
+ }
+ }
+
+ return rcode;
+}
+
+/**
+ * to_table_entry() - Deconstruct a metadata lba into mpinfo
+ * @znd: ZDM instance
+ * @lba: Address (4k resolution)
+ * @mpi: mpinfo to fill in.
+ *
+ * Return: Index into mpi->table, or -1 if lba is out of range.
+ */
+static int to_table_entry(struct zoned *znd, u64 lba, struct mpinfo *mpi)
+{
+ int index = -1;
+
+ if (lba >= znd->s_base && lba < znd->r_base) {
+ mpi->table = znd->fwd_tm;
+ index = lba - znd->s_base;
+ mpi->bit_type = IS_LUT;
+ mpi->bit_dir = IS_FWD;
+ mpi->crc.table = znd->fwd_crc;
+ mpi->crc.pg_no = index >> CRC_BITS;
+ mpi->crc.pg_idx = index & CRC_MMSK;
+ if (index < 0 || index >= znd->map_count) {
+ Z_ERR(znd, "%s: FWD BAD IDX %"PRIx64" %d of %d",
+ __func__, lba, index, znd->map_count);
+ dump_stack();
+ }
+ } else if (lba >= znd->r_base && lba < znd->c_base) {
+ mpi->table = znd->rev_tm;
+ index = lba - znd->r_base;
+ mpi->bit_type = IS_LUT;
+ mpi->bit_dir = IS_REV;
+ mpi->crc.table = znd->rev_crc;
+ mpi->crc.pg_no = index >> CRC_BITS;
+ mpi->crc.pg_idx = index & CRC_MMSK;
+ if (index < 0 || index >= znd->map_count) {
+ Z_ERR(znd, "%s: REV BAD IDX %"PRIx64" %d of %d",
+ __func__, lba, index, znd->map_count);
+ dump_stack();
+ }
+ } else if (lba >= znd->c_base && lba < znd->c_mid) {
+ mpi->table = znd->fwd_crc;
+ index = lba - znd->c_base;
+ mpi->bit_type = IS_CRC;
+ mpi->bit_dir = IS_FWD;
+ mpi->crc.table = NULL;
+ mpi->crc.pg_no = 0;
+ mpi->crc.pg_idx = index & CRC_MMSK;
+ if (index < 0 || index >= znd->crc_count) {
+ Z_ERR(znd, "%s: CRC BAD IDX %"PRIx64" %d of %d",
+ __func__, lba, index, znd->crc_count);
+ dump_stack();
+ }
+ } else if (lba >= znd->c_mid && lba < znd->c_end) {
+ mpi->table = znd->rev_crc;
+ index = lba - znd->c_mid;
+ mpi->bit_type = IS_CRC;
+ mpi->bit_dir = IS_REV;
+ mpi->crc.table = NULL;
+ mpi->crc.pg_no = 1;
+ mpi->crc.pg_idx = (1 << CRC_BITS) + (index & CRC_MMSK);
+ if (index < 0 || index >= znd->crc_count) {
+ Z_ERR(znd, "%s: CRC BAD IDX %"PRIx64" %d of %d",
+ __func__, lba, index, znd->crc_count);
+ dump_stack();
+ }
+ } else {
+ Z_ERR(znd, "** Corrupt lba %" PRIx64 " not in range.", lba);
+ znd->meta_result = -EIO;
+ dump_stack();
+ }
+ mpi->index = index;
+ return index;
+}
+
+/**
+ * is_ready_for_gc() - Test zone flags for GC sanity and ready flag.
+ * @znd: ZDM instance
+ * @z_id: Zone number to check.
+ *
+ * Return: non-zero if zone is suitable for GC.
+ */
+static inline int is_ready_for_gc(struct zoned *znd, u32 z_id)
+{
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ u32 used = le32_to_cpu(wpg->wp_used[gzoff]) & Z_WP_VALUE_MASK;
+
+ if (((wp & Z_WP_GC_BITS) == Z_WP_GC_READY) && (used == Z_BLKSZ))
+ return 1;
+ return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+/*
+ * generic-ish n-way alloc/free
+ * Use kmalloc for small (< 4k) allocations.
+ * Use vmalloc for multi-page allocations
+ * Except:
+ *	Use multipage allocations for dm_io'd pages that are frequently hit.
+ *
+ * NOTE: ALL allocations are zero'd before returning.
+ * alloc/free count is tracked for dynamic analysis.
+ */
+
+#define GET_ZPG 0x040000
+#define GET_KM 0x080000
+#define GET_VM 0x100000
+
+#define PG_01 (GET_ZPG | 1)
+#define PG_02 (GET_ZPG | 2)
+#define PG_05 (GET_ZPG | 5)
+#define PG_06 (GET_ZPG | 6)
+#define PG_08 (GET_ZPG | 8)
+#define PG_09 (GET_ZPG | 9)
+#define PG_10 (GET_ZPG | 10)
+#define PG_11 (GET_ZPG | 11)
+#define PG_13 (GET_ZPG | 13)
+#define PG_17 (GET_ZPG | 17)
+#define PG_27 (GET_ZPG | 27)
+
+#define KM_00 (GET_KM | 0)
+#define KM_07 (GET_KM | 7)
+#define KM_14 (GET_KM | 14)
+#define KM_15 (GET_KM | 15)
+#define KM_16 (GET_KM | 16)
+#define KM_18 (GET_KM | 18)
+#define KM_19 (GET_KM | 19)
+#define KM_20 (GET_KM | 20)
+#define KM_25 (GET_KM | 25)
+#define KM_26 (GET_KM | 26)
+#define KM_28 (GET_KM | 28)
+#define KM_29 (GET_KM | 29)
+#define KM_30 (GET_KM | 30)
+
+#define VM_03 (GET_VM | 3)
+#define VM_04 (GET_VM | 4)
+#define VM_12 (GET_VM | 12)
+#define VM_21 (GET_VM | 21)
+#define VM_22 (GET_VM | 22)
+
+#define ZDM_FREE(z, _p, sz, id) \
+ do { zdm_free((z), (_p), (sz), (id)); (_p) = NULL; } while (0)
+
+#define ZDM_ALLOC(z, sz, id, gfp) zdm_alloc((z), (sz), (id), (gfp))
+#define ZDM_CALLOC(z, n, sz, id, gfp) zdm_calloc((z), (n), (sz), (id), (gfp))
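+
+/*
+ * Usage sketch: an allocation and its matching free must agree on both
+ * size and tag so the per-bin accounting in zdm_alloc/zdm_free balances:
+ *
+ *	wpg->zf_est = ZDM_ALLOC(znd, Z_C4K, PG_06, NORMAL);
+ *	...
+ *	ZDM_FREE(znd, wpg->zf_est, Z_C4K, PG_06);
+ */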
+
+/**
+ * zdm_free_debug() - Extra cleanup for memory debugging.
+ * @znd: ZDM instance
+ * @p: memory to be released.
+ * @sz: allocated size.
+ * @id: allocation bin.
+ *
+ * Additional alloc/free debugging and statistics handling.
+ */
+static inline void zdm_free_debug(struct zoned *znd, void *p, size_t sz, int id)
+{
+#if ALLOC_DEBUG
+ int iter;
+
+ if (atomic_read(&znd->allocs) > znd->hw_allocs)
+ znd->hw_allocs = atomic_read(&znd->allocs);
+ atomic_dec(&znd->allocs);
+ for (iter = 0; iter < znd->hw_allocs; iter++) {
+ if (p == znd->alloc_trace[iter]) {
+ znd->alloc_trace[iter] = NULL;
+ break;
+ }
+ }
+ if (iter == znd->hw_allocs) {
+ Z_ERR(znd, "Free'd something *NOT* allocated? %d", id);
+ dump_stack();
+ }
+#endif
+
+ SpinLock(&znd->stats_lock);
+ if (sz > znd->memstat)
+ Z_ERR(znd, "Free'd more mem than allocated? %d", id);
+
+ if (sz > znd->bins[id]) {
+ Z_ERR(znd, "Free'd more mem than allocated? %d", id);
+ dump_stack();
+ }
+ znd->memstat -= sz;
+ znd->bins[id] -= sz;
+ spin_unlock(&znd->stats_lock);
+}
+
+/**
+ * zdm_free() - Unified free by allocation 'code'
+ * @znd: ZDM instance
+ * @p: memory to be released.
+ * @sz: allocated size.
+ * @code: allocation scheme (bin id + allocator flag)
+ *
+ * This (ugly) unified scheme helps to find leaks and monitor usage
+ * via ioctl tools.
+ */
+static void zdm_free(struct zoned *znd, void *p, size_t sz, u32 code)
+{
+ int id = code & 0x00FFFF;
+ int flag = code & 0xFF0000;
+
+ if (p) {
+
+ if (znd)
+ zdm_free_debug(znd, p, sz, id);
+
+ memset(p, 0, sz); /* DEBUG */
+
+ switch (flag) {
+ case GET_ZPG:
+ free_page((unsigned long)p);
+ break;
+ case GET_KM:
+ kfree(p);
+ break;
+ case GET_VM:
+ vfree(p);
+ break;
+ default:
+ Z_ERR(znd,
+ "zdm_free %p scheme %x not mapped.", p, code);
+ break;
+ }
+ } else {
+ Z_ERR(znd, "double zdm_free %p [%d]", p, id);
+ dump_stack();
+ }
+}
+
+/**
+ * zdm_alloc_debug() - Extra tracking for memory debugging.
+ * @znd: ZDM instance
+ * @p: memory that was allocated.
+ * @sz: allocated size.
+ * @id: allocation bin.
+ *
+ * Additional alloc/free debugging and statistics handling.
+ */
+static inline
+void zdm_alloc_debug(struct zoned *znd, void *p, size_t sz, int id)
+{
+#if ALLOC_DEBUG
+ int iter;
+ int count;
+
+ atomic_inc(&znd->allocs);
+
+ count = atomic_read(&znd->allocs);
+ for (iter = 0; iter < count; iter++) {
+ if (!znd->alloc_trace[iter]) {
+ znd->alloc_trace[iter] = p;
+ break;
+ }
+ }
+#endif
+
+ SpinLock(&znd->stats_lock);
+ znd->memstat += sz;
+ znd->bins[id] += sz;
+ spin_unlock(&znd->stats_lock);
+}
+
+
+/**
+ * zdm_alloc() - Unified alloc by 'code':
+ * @znd: ZDM instance
+ * @sz: allocated size.
+ * @code: allocation scheme (bin id + allocator flag)
+ * @gfp: kernel allocation flags.
+ *
+ * There are a few things (like dm_io) that seem to need pages and not just
+ * kmalloc'd memory.
+ *
+ * This (ugly) unified scheme helps to find leaks and monitor usage
+ * via ioctl tools.
+ */
+static void *zdm_alloc(struct zoned *znd, size_t sz, int code, gfp_t gfp)
+{
+ void *pmem = NULL;
+ int id = code & 0x00FFFF;
+ int flag = code & 0xFF0000;
+ gfp_t gfp_mask;
+
+#if USE_KTHREAD
+ gfp_mask = GFP_KERNEL;
+#else
+ gfp_mask = gfp ? GFP_ATOMIC : GFP_KERNEL;
+#endif
+
+ if (flag == GET_VM)
+ might_sleep();
+
+ if (gfp_mask != GFP_ATOMIC)
+ might_sleep();
+
+ switch (flag) {
+ case GET_ZPG:
+ pmem = (void *)get_zeroed_page(gfp_mask);
+ if (!pmem && gfp_mask == GFP_ATOMIC) {
+ Z_ERR(znd, "No atomic for %d, try noio.", id);
+ pmem = (void *)get_zeroed_page(GFP_NOIO);
+ }
+ break;
+ case GET_KM:
+ pmem = kzalloc(sz, gfp_mask);
+ if (!pmem && gfp_mask == GFP_ATOMIC) {
+ Z_ERR(znd, "No atomic for %d, try noio.", id);
+ pmem = kzalloc(sz, GFP_NOIO);
+ }
+ break;
+ case GET_VM:
+ WARN_ON(gfp);
+ pmem = vzalloc(sz);
+ break;
+ default:
+ Z_ERR(znd, "zdm alloc scheme for %u unknown.", code);
+ break;
+ }
+
+ if (pmem) {
+ if (znd)
+ zdm_alloc_debug(znd, pmem, sz, id);
+ } else {
+ Z_ERR(znd, "Out of memory. %d", id);
+ dump_stack();
+ }
+
+ return pmem;
+}
+
+/**
+ * zdm_calloc() - Unified alloc by 'code':
+ * @znd: ZDM instance
+ * @n: number of elements in array.
+ * @sz: allocation size of each element.
+ * @c: allocation strategy (VM, KM, PAGE, N-PAGES).
+ * @q: kernel allocation flags.
+ *
+ * calloc is just a zeroed memory array alloc.
+ * all zdm_alloc schemes are for zeroed memory so no extra memset needed.
+ */
+static void *zdm_calloc(struct zoned *znd, size_t n, size_t sz, int c, gfp_t q)
+{
+ return zdm_alloc(znd, sz * n, c, q);
+}
+
+/**
+ * get_io_vcache() - Get a pre-allocated pool of memory for IO.
+ * @znd: ZDM instance
+ * @gfp: Allocation flags if no pre-allocated pool can be found.
+ *
+ * Return: Pointer to pool memory or NULL.
+ */
+static struct io_4k_block *get_io_vcache(struct zoned *znd, gfp_t gfp)
+{
+ struct io_4k_block *cache = NULL;
+ int avail;
+
+ might_sleep();
+
+ for (avail = 0; avail < ARRAY_SIZE(znd->io_vcache); avail++) {
+ if (!test_and_set_bit(avail, &znd->io_vcache_flags)) {
+ cache = znd->io_vcache[avail];
+ if (!cache)
+ znd->io_vcache[avail] = cache =
+ ZDM_CALLOC(znd, IO_VCACHE_PAGES,
+ sizeof(*cache), VM_12, gfp);
+ if (cache)
+ break;
+ }
+ }
+ return cache;
+}
+
+/**
+ * put_io_vcache() - Return an entry to the pre-allocated IO pool.
+ * @znd: ZDM instance
+ * @cache: Allocated cache entry.
+ *
+ * Return: 0 on success, -ENOENT if @cache was not from the pool.
+ */
+static int put_io_vcache(struct zoned *znd, struct io_4k_block *cache)
+{
+ int err = -ENOENT;
+ int avail;
+
+ if (cache) {
+ for (avail = 0; avail < ARRAY_SIZE(znd->io_vcache); avail++) {
+ if (cache == znd->io_vcache[avail]) {
+ WARN_ON(!test_and_clear_bit(avail,
+ &znd->io_vcache_flags));
+ err = 0;
+ break;
+ }
+ }
+ }
+ return err;
+}
+
+/**
+ * map_value() - translate a lookup table entry to a Sector #, or LBA.
+ * @znd: ZDM instance
+ * @delta: little endian map entry.
+ *
+ * Return: LBA or 0 if invalid.
+ */
+static inline u64 map_value(struct zoned *znd, __le32 delta)
+{
+ u64 mval = 0ul;
+
+ if ((delta != MZTEV_UNUSED) && (delta != MZTEV_NF))
+ mval = le32_to_cpu(delta);
+
+ return mval;
+}
+
+/**
+ * map_encode() - Encode a Sector # or LBA to a lookup table entry value.
+ * @znd: ZDM instance
+ * @to_addr: address to encode.
+ * @value: encoded value
+ *
+ * Return: 0.
+ */
+static int map_encode(struct zoned *znd, u64 to_addr, __le32 *value)
+{
+ int err = 0;
+
+ *value = MZTEV_UNUSED;
+ if (to_addr != BAD_ADDR)
+ *value = cpu_to_le32((u32)to_addr);
+
+ return err;
+}
+
+/**
+ * release_memcache() - Free all the memcache blocks.
+ * @znd: ZDM instance
+ *
+ * Return: 0.
+ */
+static int release_memcache(struct zoned *znd)
+{
+ int no;
+
+ for (no = 0; no < MAP_COUNT; no++) {
+ struct list_head *head = &(znd->mclist[no]);
+ struct map_cache *mcache;
+ struct map_cache *_mc;
+
+ if (list_empty(head))
+ return 0;
+
+ SpinLock(&znd->mclck[no]);
+ list_for_each_entry_safe(mcache, _mc, head, mclist) {
+ list_del(&mcache->mclist);
+ ZDM_FREE(znd, mcache->jdata, Z_C4K, PG_08);
+ ZDM_FREE(znd, mcache, sizeof(*mcache), KM_07);
+ }
+ spin_unlock(&znd->mclck[no]);
+
+ }
+ return 0;
+}
+
+/**
+ * warn_bad_lba() - Warn if a given LBA is not valid (esp. if beyond a WP)
+ * @znd: ZDM instance
+ * @lba48: 48 bit lba.
+ *
+ * Return: non-zero if lba is not valid.
+ */
+static inline int warn_bad_lba(struct zoned *znd, u64 lba48)
+{
+#define FMT_ERR "LBA %" PRIx64 " is not valid: Z# %u, off:%x wp:%x"
+ int rcode = 0;
+ u32 zone;
+
+ if (lba48 < znd->data_lba)
+ return rcode;
+
+ zone = _calc_zone(znd, lba48);
+ if (zone < znd->data_zones) { /* FIXME: MAYBE? md_end */
+ u32 gzoff = zone & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[zone >> GZ_BITS];
+ u32 wp_at = le32_to_cpu(wpg->wp_alloc[gzoff]) & Z_WP_VALUE_MASK;
+ u16 off = (lba48 - znd->data_lba) % Z_BLKSZ;
+
+ if (off >= wp_at) {
+ rcode = 1;
+ Z_ERR(znd, FMT_ERR, lba48, zone, off, wp_at);
+ dump_stack();
+ }
+ } else {
+ rcode = 1;
+ Z_ERR(znd, "LBA is not valid - Z# %u, count %u",
+ zone, znd->data_zones);
+ }
+
+ return rcode;
+}
+
+/**
+ * mapped_free() - Release a page of lookup table entries.
+ * @znd: ZDM instance
+ * @mapped: mapped page struct to free.
+ */
+static void mapped_free(struct zoned *znd, struct map_pg *mapped)
+{
+ if (mapped) {
+ MutexLock(&mapped->md_lock);
+ WARN_ON(test_bit(IS_DIRTY, &mapped->flags));
+ if (mapped->data.addr) {
+ ZDM_FREE(znd, mapped->data.addr, Z_C4K, PG_27);
+ atomic_dec(&znd->incore);
+ }
+ mutex_unlock(&mapped->md_lock);
+ ZDM_FREE(znd, mapped, sizeof(*mapped), KM_20);
+ }
+}
+
+/**
+ * flush_map() - Write dirty map entries to disk.
+ * @znd: ZDM instance
+ * @map: Array of mapped pages.
+ * @count: number of elements in range.
+ * Return: non-zero on error.
+ */
+static int flush_map(struct zoned *znd, struct map_pg **map, u32 count)
+{
+ const int use_wq = 1;
+ const int sync = 1;
+ u32 ii;
+ int err = 0;
+
+ if (!map)
+ return err;
+
+ for (ii = 0; ii < count; ii++) {
+ if (map[ii] && map[ii]->data.addr) {
+ cache_if_dirty(znd, map[ii], use_wq);
+ err |= write_if_dirty(znd, map[ii], use_wq, sync);
+ }
+ }
+
+ return err;
+}
+
+/**
+ * zoned_io_flush() - flush all pending IO.
+ * @znd: ZDM instance
+ */
+static int zoned_io_flush(struct zoned *znd)
+{
+ int err = 0;
+
+ set_bit(ZF_FREEZE, &znd->flags);
+ atomic_inc(&znd->gc_throttle);
+
+ mod_delayed_work(znd->gc_wq, &znd->gc_work, 0);
+ flush_delayed_work(&znd->gc_work);
+
+ clear_bit(DO_GC_NO_PURGE, &znd->flags);
+ set_bit(DO_JOURNAL_MOVE, &znd->flags);
+ set_bit(DO_MEMPOOL, &znd->flags);
+ set_bit(DO_SYNC, &znd->flags);
+ set_bit(DO_FLUSH, &znd->flags);
+ queue_work(znd->meta_wq, &znd->meta_work);
+ flush_workqueue(znd->meta_wq);
+ flush_workqueue(znd->bg_wq);
+
+ mod_delayed_work(znd->gc_wq, &znd->gc_work, 0);
+ flush_delayed_work(&znd->gc_work);
+ atomic_dec(&znd->gc_throttle);
+
+ spin_lock(&znd->lzy_lck);
+ INIT_LIST_HEAD(&znd->lzy_pool);
+ spin_unlock(&znd->lzy_lck);
+
+ spin_lock(&znd->zlt_lck);
+ INIT_LIST_HEAD(&znd->zltpool);
+ spin_unlock(&znd->zlt_lck);
+
+ err = flush_map(znd, znd->fwd_tm, znd->map_count);
+ if (err)
+ goto out;
+
+ err = flush_map(znd, znd->rev_tm, znd->map_count);
+ if (err)
+ goto out;
+
+ err = flush_map(znd, znd->fwd_crc, znd->crc_count);
+ if (err)
+ goto out;
+
+ err = flush_map(znd, znd->rev_crc, znd->crc_count);
+ if (err)
+ goto out;
+
+ set_bit(DO_SYNC, &znd->flags);
+ set_bit(DO_FLUSH, &znd->flags);
+ queue_work(znd->meta_wq, &znd->meta_work);
+ flush_workqueue(znd->meta_wq);
+
+out:
+ return err;
+}
+
+/**
+ * release_table_pages() - flush and free all table map entries.
+ * @znd: ZDM instance
+ */
+static void release_table_pages(struct zoned *znd)
+{
+ int entry;
+
+ if (znd->fwd_tm) {
+ for (entry = 0; entry < znd->map_count; entry++) {
+ mapped_free(znd, znd->fwd_tm[entry]);
+ znd->fwd_tm[entry] = NULL;
+ }
+ }
+ if (znd->rev_tm) {
+ for (entry = 0; entry < znd->map_count; entry++) {
+ mapped_free(znd, znd->rev_tm[entry]);
+ znd->rev_tm[entry] = NULL;
+ }
+ }
+ if (znd->fwd_crc) {
+ for (entry = 0; entry < znd->crc_count; entry++) {
+ mapped_free(znd, znd->fwd_crc[entry]);
+ znd->fwd_crc[entry] = NULL;
+ }
+ }
+ if (znd->rev_crc) {
+ for (entry = 0; entry < znd->crc_count; entry++) {
+ mapped_free(znd, znd->rev_crc[entry]);
+ znd->rev_crc[entry] = NULL;
+ }
+ }
+}
+
+/**
+ * _release_wp() - free all WP alloc/usage/used data.
+ * @znd: ZDM instance
+ * @wp: Object to free.
+ */
+static void _release_wp(struct zoned *znd, struct meta_pg *wp)
+{
+ u32 gzno;
+
+ for (gzno = 0; gzno < znd->gz_count; gzno++) {
+ struct meta_pg *wpg = &wp[gzno];
+
+ if (wpg->wp_alloc)
+ ZDM_FREE(znd, wpg->wp_alloc, Z_C4K, PG_06);
+ if (wpg->zf_est)
+ ZDM_FREE(znd, wpg->zf_est, Z_C4K, PG_06);
+ if (wpg->wp_used)
+ ZDM_FREE(znd, wpg->wp_used, Z_C4K, PG_06);
+ }
+ ZDM_FREE(znd, wp, znd->gz_count * sizeof(*wp), VM_21);
+ znd->wp = NULL;
+}
+
+/**
+ * zoned_destroy() - Teardown a zoned device mapper instance.
+ * @znd: ZDM instance
+ */
+static void zoned_destroy(struct zoned *znd)
+{
+ int purge;
+ size_t ppgsz = sizeof(struct map_pg *);
+ size_t mapsz = ppgsz * znd->map_count;
+ size_t crcsz = ppgsz * znd->crc_count;
+
+ del_timer_sync(&znd->timer);
+
+ if (zoned_io_flush(znd))
+ Z_ERR(znd, "sync/flush failure");
+
+ release_table_pages(znd);
+ release_memcache(znd);
+
+ if (znd->dev) {
+ dm_put_device(znd->ti, znd->dev);
+ znd->dev = NULL;
+ }
+ if (znd->io_wq) {
+ destroy_workqueue(znd->io_wq);
+ znd->io_wq = NULL;
+ }
+ if (znd->zone_action_wq) {
+ destroy_workqueue(znd->zone_action_wq);
+ znd->zone_action_wq = NULL;
+ }
+ if (znd->bg_wq) {
+ destroy_workqueue(znd->bg_wq);
+ znd->bg_wq = NULL;
+ }
+ if (znd->gc_wq) {
+ destroy_workqueue(znd->gc_wq);
+ znd->gc_wq = NULL;
+ }
+ if (znd->meta_wq) {
+ destroy_workqueue(znd->meta_wq);
+ znd->meta_wq = NULL;
+ }
+ if (znd->io_client)
+ dm_io_client_destroy(znd->io_client);
+ if (znd->wp)
+ _release_wp(znd, znd->wp);
+ if (znd->md_crcs)
+ ZDM_FREE(znd, znd->md_crcs, Z_C4K * 2, VM_22);
+ if (znd->gc_io_buf)
+ ZDM_FREE(znd, znd->gc_io_buf, Z_C4K * GC_MAX_STRIPE, VM_04);
+ if (znd->gc_postmap.jdata) {
+ size_t sz = Z_BLKSZ * sizeof(*znd->gc_postmap.jdata);
+
+ ZDM_FREE(znd, znd->gc_postmap.jdata, sz, VM_03);
+ }
+ if (znd->fwd_tm)
+ ZDM_FREE(znd, znd->fwd_tm, mapsz, VM_21);
+ if (znd->rev_tm)
+ ZDM_FREE(znd, znd->rev_tm, mapsz, VM_22);
+ if (znd->fwd_crc)
+ ZDM_FREE(znd, znd->fwd_crc, crcsz, VM_21);
+ if (znd->rev_crc)
+ ZDM_FREE(znd, znd->rev_crc, crcsz, VM_22);
+
+ for (purge = 0; purge < ARRAY_SIZE(znd->io_vcache); purge++) {
+ size_t vcsz = IO_VCACHE_PAGES * sizeof(struct io_4k_block *);
+
+ if (znd->io_vcache[purge]) {
+ if (test_and_clear_bit(purge, &znd->io_vcache_flags))
+ Z_ERR(znd, "sync cache entry %d still in use!",
+ purge);
+ ZDM_FREE(znd, znd->io_vcache[purge], vcsz, VM_12);
+ }
+ }
+ if (znd->z_sballoc)
+ ZDM_FREE(znd, znd->z_sballoc, Z_C4K, PG_05);
+
+ ZDM_FREE(NULL, znd, sizeof(*znd), KM_00);
+}
+
+
+/**
+ * _init_streams() - Setup initial conditions for streams and reserved zones.
+ * @znd: ZDM instance
+ */
+static void _init_streams(struct zoned *znd)
+{
+ u64 stream;
+
+ for (stream = 0; stream < ARRAY_SIZE(znd->bmkeys->stream); stream++)
+ znd->bmkeys->stream[stream] = cpu_to_le32(NOZONE);
+
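+	/* reserve the last two data zones for metadata and GC use */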
+ znd->z_meta_resv = cpu_to_le32(znd->data_zones - 2);
+ znd->z_gc_resv = cpu_to_le32(znd->data_zones - 1);
+ znd->z_gc_free = znd->data_zones - 2;
+}
+
+/**
+ * _init_mdcrcs() - Setup initial values for empty CRC blocks.
+ * @znd: ZDM instance
+ */
+static void _init_mdcrcs(struct zoned *znd)
+{
+ int idx;
+
+ for (idx = 0; idx < Z_C4K; idx++)
+ znd->md_crcs[idx] = MD_CRC_INIT;
+}
+
+/**
+ * _init_wp() - Setup initial usage for empty data zones.
+ * @znd: ZDM instance
+ */
+static void _init_wp(struct zoned *znd, u32 gzno, struct meta_pg *wpg)
+{
+ u32 gzcount = 1 << GZ_BITS;
+ u32 iter;
+
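+	/* the last megazone may cover fewer than 1 << GZ_BITS zones */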
+ if (znd->data_zones < ((gzno+1) << GZ_BITS))
+ gzcount = znd->data_zones & GZ_MMSK;
+
+ /* mark as empty */
+ for (iter = 0; iter < gzcount; iter++)
+ wpg->zf_est[iter] = cpu_to_le32(Z_BLKSZ);
+
+ /* mark as n/a -- full */
+ gzcount = 1 << GZ_BITS;
+ for (; iter < gzcount; iter++) {
+ wpg->wp_alloc[iter] = cpu_to_le32(~0u);
+ wpg->wp_used[iter] = cpu_to_le32(~0u);
+ }
+}
+
+/**
+ * _alloc_wp() - Allocate needed WP (meta_pg) objects
+ * @znd: ZDM instance
+ */
+struct meta_pg *_alloc_wp(struct zoned *znd)
+{
+ struct meta_pg *wp;
+ u32 gzno;
+
+ wp = ZDM_CALLOC(znd, znd->gz_count, sizeof(*znd->wp), VM_21, NORMAL);
+ if (!wp)
+ goto out;
+ for (gzno = 0; gzno < znd->gz_count; gzno++) {
+ struct meta_pg *wpg = &wp[gzno];
+
+ spin_lock_init(&wpg->wplck);
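+		/* on-disk location of this megazone's WP metadata: block 2048 + 2 * gzno */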
+ wpg->lba = 2048ul + (gzno * 2);
+ wpg->wp_alloc = ZDM_ALLOC(znd, Z_C4K, PG_06, NORMAL);
+ wpg->zf_est = ZDM_ALLOC(znd, Z_C4K, PG_06, NORMAL);
+ wpg->wp_used = ZDM_ALLOC(znd, Z_C4K, PG_06, NORMAL);
+ if (!wpg->wp_alloc || !wpg->zf_est || !wpg->wp_used) {
+ _release_wp(znd, wp);
+ wp = NULL;
+ goto out;
+ }
+ _init_wp(znd, gzno, wpg);
+ set_bit(IS_DIRTY, &wpg->flags);
+ clear_bit(IS_FLUSH, &wpg->flags);
+ }
+
+out:
+ return wp;
+}
+
+/**
+ * alloc_map_tables() - Allocate map table entries.
+ * @znd: ZDM Target
+ * @mapct: Number of map entries needed.
+ * @crcct: Number of CRC entries needed.
+ */
+static int alloc_map_tables(struct zoned *znd, u64 mapct, u64 crcct)
+{
+ const size_t ptrsz = sizeof(struct map_pg *);
+ int rcode = 0;
+
+ znd->fwd_tm = ZDM_CALLOC(znd, mapct, ptrsz, VM_21, NORMAL);
+ znd->rev_tm = ZDM_CALLOC(znd, mapct, ptrsz, VM_22, NORMAL);
+ znd->fwd_crc = ZDM_CALLOC(znd, crcct, ptrsz, VM_21, NORMAL);
+ znd->rev_crc = ZDM_CALLOC(znd, crcct, ptrsz, VM_22, NORMAL);
+ if (!znd->fwd_tm || !znd->rev_tm || !znd->fwd_crc || !znd->rev_crc)
+ rcode = -ENOMEM;
+
+ return rcode;
+}
+
+/**
+ * do_init_zoned() - Initialize a zoned device mapper instance
+ * @ti: DM Target Info
+ * @znd: ZDM Target
+ *
+ * Return: 0 on success.
+ *
+ * Setup the zone pointer table and do a one time calculation of some
+ * basic limits.
+ *
+ * While the start of the partition may not be zone aligned,
+ * md_start, data_lba and md_end are all zone aligned.
+ * From the start of the partition [or device] to md_start some conv/pref
+ * space is required for superblocks, memcache, zone pointers, crcs
+ * and optionally pinned forward lookup blocks.
+ *
+ *    0 < znd->md_start <= znd->data_lba <= znd->md_end
+ *
+ * Incoming [FS sectors] are linearly mapped after md_end.
+ * Blocks following data_lba are serialized into zones either with
+ * explicit stream id support from BIOs [FUTURE], or implicitly by LBA
+ * or type of data.
+ */
+static int do_init_zoned(struct dm_target *ti, struct zoned *znd)
+{
+ u64 size = i_size_read(get_bdev_bd_inode(znd));
+ u64 bdev_nr_sect4k = size / Z_C4K;
+ u64 data_zones = (bdev_nr_sect4k >> Z_BLKBITS) - znd->zdstart;
+ u64 mzcount = dm_div_up(data_zones, MAX_ZONES_PER_MZ);
+ u64 cache = MAX_CACHE_INCR * CACHE_COPIES * MAX_MZ_SUPP;
+ u64 z0_lba = dm_round_up(znd->start_sect, Z_BLKSZ);
+ u64 mapct = dm_div_up(data_zones << Z_BLKBITS, 1024);
+ u64 crcct = dm_div_up(mapct, 2048);
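+	/*
+	 * one 4k lookup page covers 1024 block mappings;
+	 * one 4k CRC page covers 2048 lookup pages (16-bit CRCs)
+	 */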
+ int type;
+ u32 mz_min = 0; /* cache */
+ int rcode = 0;
+
+ INIT_LIST_HEAD(&znd->zltpool);
+ INIT_LIST_HEAD(&znd->lzy_pool);
+
+ for (type = 0; type < MAP_COUNT; type++) {
+ INIT_LIST_HEAD(&znd->mclist[type]);
+ spin_lock_init(&znd->mclck[type]);
+ }
+ spin_lock_init(&znd->zlt_lck);
+ spin_lock_init(&znd->lzy_lck);
+ spin_lock_init(&znd->gc_lock);
+ spin_lock_init(&znd->stats_lock);
+ spin_lock_init(&znd->mapkey_lock);
+ spin_lock_init(&znd->ct_lock);
+
+ mutex_init(&znd->pool_mtx);
+ mutex_init(&znd->gc_wait);
+ mutex_init(&znd->gc_postmap.cached_lock);
+ mutex_init(&znd->gc_vcio_lock);
+ mutex_init(&znd->vcio_lock);
+ mutex_init(&znd->mz_io_mutex);
+
+ znd->data_zones = data_zones;
+ znd->gz_count = mzcount;
+ znd->crc_count = crcct;
+ znd->map_count = mapct;
+
+ /* z0_lba - lba of first full zone in disk addr space */
+
+ znd->md_start = z0_lba - znd->start_sect;
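+	/* if the space before the first full zone is too small for the cache area, skip ahead one zone */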
+ if (znd->md_start < cache) {
+ znd->md_start += Z_BLKSZ;
+ mz_min++;
+ }
+
+ /* md_start - lba of first full zone in partition addr space */
+ znd->s_base = znd->md_start;
+ mz_min += mzcount;
+ if (mz_min < znd->zdstart)
+ set_bit(ZF_POOL_FWD, &znd->flags);
+
+ znd->r_base = znd->s_base + (mzcount << Z_BLKBITS);
+ mz_min += mzcount;
+ if (mz_min < znd->zdstart)
+ set_bit(ZF_POOL_REV, &znd->flags);
+
+ znd->c_base = znd->r_base + (mzcount << Z_BLKBITS);
+ znd->c_mid = znd->c_base + (mzcount * 0x20);
+ znd->c_end = znd->c_mid + (mzcount * 0x20);
+ mz_min++;
+
+ if (mz_min < znd->zdstart) {
+		Z_ERR(znd, "Conv space for CRCs: Setting ZF_POOL_CRCS");
+ set_bit(ZF_POOL_CRCS, &znd->flags);
+ }
+
+ if (test_bit(ZF_POOL_FWD, &znd->flags)) {
+ znd->sk_low = znd->sk_high = 0;
+ } else {
+ znd->sk_low = znd->s_base;
+ znd->sk_high = znd->sk_low + (mzcount * 0x40);
+ }
+
+ /* logical *ending* lba for meta [bumped up to next zone alignment] */
+ znd->md_end = znd->c_base + (1 << Z_BLKBITS);
+
+ /* actual starting lba for data pool */
+ znd->data_lba = znd->md_end;
+ if (!test_bit(ZF_POOL_CRCS, &znd->flags))
+ znd->data_lba = znd->c_base;
+ if (!test_bit(ZF_POOL_REV, &znd->flags))
+ znd->data_lba = znd->r_base;
+ if (!test_bit(ZF_POOL_FWD, &znd->flags))
+ znd->data_lba = znd->s_base;
+
+ /* NOTE: md_end == data_lba => all meta is in conventional zones. */
+ Z_INFO(znd, "ZDM #%u", BUILD_NO);
+ Z_INFO(znd, "Start Sect: %" PRIx64 " ZDStart# %u md- start %" PRIx64
+ " end %" PRIx64 " Data LBA %" PRIx64,
+ znd->start_sect, znd->zdstart, znd->md_start,
+ znd->md_end, znd->data_lba);
+
+ Z_INFO(znd, "%s: size:%" PRIu64 " zones: %u, gzs %u, resvd %d",
+ __func__, size, znd->data_zones, znd->gz_count,
+ znd->gz_count * znd->mz_provision);
+
+#if ALLOC_DEBUG
+ znd->alloc_trace = vzalloc(sizeof(*znd->alloc_trace) * 65536);
+ if (!znd->alloc_trace) {
+ ti->error = "couldn't allocate in-memory mem debug trace";
+ rcode = -ENOMEM;
+ goto out;
+ }
+#endif
+
+ znd->z_sballoc = ZDM_ALLOC(znd, Z_C4K, PG_05, NORMAL);
+ if (!znd->z_sballoc) {
+ ti->error = "couldn't allocate in-memory superblock";
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ rcode = alloc_map_tables(znd, mapct, crcct);
+ if (rcode)
+ goto out;
+
+ znd->gc_postmap.jdata = ZDM_CALLOC(znd, Z_BLKSZ,
+ sizeof(*znd->gc_postmap.jdata), VM_03, NORMAL);
+ znd->md_crcs = ZDM_ALLOC(znd, Z_C4K * 2, VM_22, NORMAL);
+ znd->gc_io_buf = ZDM_CALLOC(znd, GC_MAX_STRIPE, Z_C4K, VM_04, NORMAL);
+ znd->wp = _alloc_wp(znd);
+ znd->io_vcache[0] = ZDM_CALLOC(znd, IO_VCACHE_PAGES,
+ sizeof(struct io_4k_block), VM_12, NORMAL);
+ znd->io_vcache[1] = ZDM_CALLOC(znd, IO_VCACHE_PAGES,
+ sizeof(struct io_4k_block), VM_12, NORMAL);
+
+ if (!znd->gc_postmap.jdata || !znd->md_crcs || !znd->gc_io_buf ||
+ !znd->wp || !znd->io_vcache[0] || !znd->io_vcache[1]) {
+ rcode = -ENOMEM;
+ goto out;
+ }
+ znd->gc_postmap.jsize = Z_BLKSZ;
+ _init_mdcrcs(znd);
+
+ znd->io_client = dm_io_client_create();
+ if (!znd->io_client) {
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ znd->meta_wq = create_singlethread_workqueue("znd_meta_wq");
+ if (!znd->meta_wq) {
+ ti->error = "couldn't start metadata worker thread";
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ znd->gc_wq = create_singlethread_workqueue("znd_gc_wq");
+ if (!znd->gc_wq) {
+ ti->error = "couldn't start GC workqueue.";
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ znd->bg_wq = create_singlethread_workqueue("znd_bg_wq");
+ if (!znd->bg_wq) {
+ ti->error = "couldn't start background workqueue.";
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ znd->io_wq = create_singlethread_workqueue("kzoned_dm_io_wq");
+ if (!znd->io_wq) {
+ ti->error = "couldn't start DM I/O thread";
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ znd->zone_action_wq = create_singlethread_workqueue("zone_act_wq");
+ if (!znd->zone_action_wq) {
+ ti->error = "couldn't start zone action worker";
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+#if USE_KTHREAD
+ init_waitqueue_head(&znd->wait_bio);
+ init_waitqueue_head(&znd->wait_fifo);
+#endif
+
+ INIT_WORK(&znd->meta_work, meta_work_task);
+ INIT_WORK(&znd->bg_work, bg_work_task);
+ INIT_DELAYED_WORK(&znd->gc_work, gc_work_task);
+ setup_timer(&znd->timer, activity_timeout, (unsigned long)znd);
+ znd->last_w = BAD_ADDR;
+ set_bit(DO_SYNC, &znd->flags);
+
+out:
+ return rcode;
+}
+
+/**
+ * check_metadata_version() - Test ZDM version for compatibility.
+ * @sblock: Super block
+ *
+ * Return: 0 if valid, or -EINVAL if the version is not recognized.
+ */
+static int check_metadata_version(struct zdm_superblock *sblock)
+{
+ u32 metadata_version = le32_to_cpu(sblock->version);
+
+ if (metadata_version < MIN_ZONED_VERSION
+ || metadata_version > MAX_ZONED_VERSION) {
+ DMERR("Unsupported metadata version %u found.",
+ metadata_version);
+ DMERR("Only versions between %u and %u supported.",
+ MIN_ZONED_VERSION, MAX_ZONED_VERSION);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * sb_crc32() - CRC check for superblock.
+ * @sblock: Superblock to check.
+ */
+static __le32 sb_crc32(struct zdm_superblock *sblock)
+{
+ const __le32 was = sblock->csum;
+ u32 crc;
+
+ sblock->csum = 0;
+ crc = crc32c(~(u32) 0u, sblock, sizeof(*sblock)) ^ SUPERBLOCK_CSUM_XOR;
+
+ sblock->csum = was;
+ return cpu_to_le32(crc);
+}
+
+/**
+ * sb_check() - Check the superblock to see if it is valid and not corrupt.
+ * @sblock: Superblock to check.
+ */
+static int sb_check(struct zdm_superblock *sblock)
+{
+ __le32 csum_le;
+
+ if (le64_to_cpu(sblock->magic) != SUPERBLOCK_MAGIC) {
+ DMERR("sb_check failed: magic %" PRIx64 ": wanted %lx",
+ le64_to_cpu(sblock->magic), SUPERBLOCK_MAGIC);
+ return -EILSEQ;
+ }
+
+ csum_le = sb_crc32(sblock);
+ if (csum_le != sblock->csum) {
+ DMERR("sb_check failed: csum %u: wanted %u",
+ csum_le, sblock->csum);
+ return -EILSEQ;
+ }
+
+ return check_metadata_version(sblock);
+}
+
+/**
+ * zoned_create_disk() - Initialize the on-disk format of a zoned device mapper.
+ * @ti: DM Target Instance
+ * @znd: ZDM Instance
+ */
+static int zoned_create_disk(struct dm_target *ti, struct zoned *znd)
+{
+ const int reset_non_empty = 1;
+ struct zdm_superblock *sblock = znd->super_block;
+ int err;
+
+ memset(sblock, 0, sizeof(*sblock));
+ generate_random_uuid(sblock->uuid);
+ sblock->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
+ sblock->version = cpu_to_le32(Z_VERSION);
+ sblock->zdstart = cpu_to_le32(znd->zdstart);
+
+ err = zoned_wp_sync(znd, reset_non_empty);
+
+ return err;
+}
+
+
+/**
+ * zoned_repair() - Attempt easy on-line fixes.
+ * @znd: ZDM Instance
+ *
+ * Repair an otherwise good device mapper instance that was not cleanly removed.
+ */
+static int zoned_repair(struct zoned *znd)
+{
+ Z_INFO(znd, "Is Dirty .. zoned_repair consistency fixer TODO!!!.");
+ return -ENOMEM;
+}
+
+
+/**
+ * zoned_init_disk() - Init from existing or re-create DM Target (ZDM)
+ * @ti: DM Target Instance
+ * @znd: ZDM Instance
+ * @create: Create if not found.
+ * @force: Force create even if it looks like a ZDM was here.
+ *
+ * Locate the existing SB on disk and re-load or create the device-mapper
+ * instance based on the existing disk state.
+ */
+static int zoned_init_disk(struct dm_target *ti, struct zoned *znd,
+ int create, int force)
+{
+ struct mz_superkey *key_blk = znd->z_sballoc;
+
+ int jinit = 1;
+ int n4kblks = 1;
+ int use_wq = 1;
+ int rc = 0;
+ u32 zdstart = znd->zdstart;
+
+ memset(key_blk, 0, sizeof(*key_blk));
+
+ znd->super_block = &key_blk->sblock;
+
+ znd->bmkeys = key_blk;
+ znd->bmkeys->sig0 = Z_KEY_SIG;
+ znd->bmkeys->sig1 = cpu_to_le64(Z_KEY_SIG);
+ znd->bmkeys->magic = cpu_to_le64(Z_TABLE_MAGIC);
+
+ _init_streams(znd);
+
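+	/* size the stale-accounting bins: roughly STREAM_SIZE bins spanning all data zones */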
+ znd->stale.binsz = STREAM_SIZE;
+ if (znd->data_zones < STREAM_SIZE)
+ znd->stale.binsz = znd->data_zones;
+ else if ((znd->data_zones / STREAM_SIZE) > STREAM_SIZE)
+ znd->stale.binsz = dm_div_up(znd->data_zones, STREAM_SIZE);
+ znd->stale.count = dm_div_up(znd->data_zones, znd->stale.binsz);
+
+ Z_ERR(znd, "Bin: Sz %u, count %u", znd->stale.binsz, znd->stale.count);
+
+ if (create && force) {
+ Z_ERR(znd, "Force Creating a clean instance.");
+ } else if (find_superblock(znd, use_wq, 1)) {
+ u64 sb_lba = 0;
+ u64 generation;
+
+/* FIXME: Logic [zdstart] not well tested or ... logical */
+
+ Z_INFO(znd, "Found existing superblock");
+ if (zdstart != znd->zdstart) {
+ if (force) {
+ Z_ERR(znd, " (force) zdstart: %u <- %u",
+ zdstart, znd->zdstart);
+ } else {
+ znd->zdstart = zdstart;
+ jinit = 0;
+ }
+ }
+
+ generation = mcache_greatest_gen(znd, use_wq, &sb_lba, NULL);
+ Z_DBG(znd, "Generation: %" PRIu64 " @ %" PRIx64,
+ generation, sb_lba);
+
+ rc = read_block(ti, DM_IO_KMEM, key_blk, sb_lba,
+ n4kblks, use_wq);
+ if (rc) {
+ ti->error = "Superblock read error.";
+ return rc;
+ }
+ }
+
+ rc = sb_check(znd->super_block);
+ if (rc) {
+ jinit = 0;
+ if (create) {
+ DMWARN("Check failed .. creating superblock.");
+ zoned_create_disk(ti, znd);
+ znd->super_block->nr_zones =
+ cpu_to_le64(znd->data_zones);
+ DMWARN("in-memory superblock created.");
+ znd->is_empty = 1;
+ } else {
+ ti->error = "Superblock check failed.";
+ return rc;
+ }
+ }
+
+ if (sb_test_flag(znd->super_block, SB_DIRTY)) {
+ int repair_check = zoned_repair(znd);
+
+ if (!force) {
+ /* if repair failed -- don't load from disk */
+ if (repair_check)
+ jinit = 0;
+ } else if (repair_check && jinit) {
+ Z_ERR(znd, "repair failed, force enabled loading ...");
+ }
+ }
+
+ if (jinit) {
+ Z_ERR(znd, "INIT: Reloading DM Zoned metadata from DISK");
+ znd->zdstart = le32_to_cpu(znd->super_block->zdstart);
+ set_bit(DO_JOURNAL_LOAD, &znd->flags);
+ queue_work(znd->meta_wq, &znd->meta_work);
+ Z_ERR(znd, "Waiting for load to complete.");
+ flush_workqueue(znd->meta_wq);
+ }
+
+ Z_ERR(znd, "ZONED: Build No %d marking superblock dirty.", BUILD_NO);
+
+ /* write the 'dirty' flag back to disk. */
+ sb_set_flag(znd->super_block, SB_DIRTY);
+ znd->super_block->csum = sb_crc32(znd->super_block);
+
+ return 0;
+}
+
+/**
+ * compare_tlba() - Compare on tlba48 ignoring high 16 bits.
+ * @x1: Page of map cache
+ * @x2: Page of map cache
+ *
+ * Return: -1 less than, 1 greater than, 0 if equal.
+ */
+static int compare_tlba(const void *x1, const void *x2)
+{
+ const struct map_sect_to_lba *r1 = x1;
+ const struct map_sect_to_lba *r2 = x2;
+ const u64 v1 = le64_to_lba48(r1->tlba, NULL);
+ const u64 v2 = le64_to_lba48(r2->tlba, NULL);
+
+ return (v1 < v2) ? -1 : ((v1 > v2) ? 1 : 0);
+}
+
+/**
+ * _lsearch_tlba() - Do a linear search over a page of map_cache entries.
+ * @mcache: Page of map cache entries to search.
+ * @dm_s: tlba being sought.
+ *
+ * Return: 0 to jcount - 1 if found. -1 if not found
+ */
+static int _lsearch_tlba(struct map_cache *mcache, u64 dm_s)
+{
+ int at = -1;
+ int jentry;
+
+ for (jentry = mcache->jcount; jentry > 0; jentry--) {
+ u64 logi = le64_to_lba48(mcache->jdata[jentry].tlba, NULL);
+
+ if (logi == dm_s) {
+ at = jentry - 1;
+ goto out;
+ }
+ }
+
+out:
+ return at;
+}
+
+/**
+ * _lsearch_extent() - Do a linear search over a page of discard entries.
+ * @mcache: Page of map cache entries to search.
+ * @dm_s: tlba being sought.
+ *
+ * Return: 0 to jcount - 1 if found. -1 if not found
+ *
+ * NOTE: In this case the match is if the tlba is included in the extent.
+ */
+static int _lsearch_extent(struct map_cache *mcache, u64 dm_s)
+{
+ int at = -1;
+ int jentry;
+
+ for (jentry = mcache->jcount; jentry > 0; jentry--) {
+ u64 addr = le64_to_lba48(mcache->jdata[jentry].tlba, NULL);
+ u64 blocks = le64_to_lba48(mcache->jdata[jentry].bval, NULL);
+
+ if ((dm_s >= addr) && dm_s < (addr+blocks)) {
+ at = jentry - 1;
+ goto out;
+ }
+ }
+
+out:
+ return at;
+}
+
+/**
+ * _bsrch_tlba() - Do a binary search over a page of map_cache entries.
+ * @mcache: Page of map cache entries to search.
+ * @tlba: tlba being sought.
+ *
+ * Return: 0 to jcount - 1 if found. -1 if not found
+ */
+static int _bsrch_tlba(struct map_cache *mcache, u64 tlba)
+{
+ int at = -1;
+ void *base = &mcache->jdata[1];
+ void *map;
+ struct map_sect_to_lba find;
+
+ find.tlba = lba48_to_le64(0, tlba);
+
+ if (mcache->jcount < 0 || mcache->jcount > mcache->jsize)
+ return at;
+
+ map = bsearch(&find, base, mcache->jcount, sizeof(find), compare_tlba);
+ if (map)
+ at = (map - base) / sizeof(find);
+
+ return at;
+}
+
+/**
+ * compare_ext() - Compare a tlba48 key against an extent, ignoring high 16 bits.
+ * @x1: Entry holding the key (tlba) being sought.
+ * @x2: Extent entry (tlba and block count) being compared against.
+ *
+ * Return: -1 if below the extent, 1 if above, 0 if the key falls within it.
+ */
+static int compare_ext(const void *x1, const void *x2)
+{
+ const struct map_sect_to_lba *r1 = x1;
+ const struct map_sect_to_lba *r2 = x2;
+ const u64 key = le64_to_lba48(r1->tlba, NULL);
+ const u64 val = le64_to_lba48(r2->tlba, NULL);
+ const u64 ext = le64_to_lba48(r2->bval, NULL);
+
+ if (key < val)
+ return -1;
+ if (key < (val+ext))
+ return 0;
+ return 1;
+}
+
+
+/**
+ * _bsrch_extent() - Do a binary search over a page of map_cache extents.
+ * @mcache: Page of map cache extent entries to search.
+ * @tlba: tlba being sought.
+ *
+ * Return: 0 to jcount - 1 if found. -1 if not found
+ */
+static int _bsrch_extent(struct map_cache *mcache, u64 tlba)
+{
+ int at = -1;
+ void *base = &mcache->jdata[1];
+ void *map;
+ struct map_sect_to_lba find;
+
+ find.tlba = lba48_to_le64(0, tlba);
+ map = bsearch(&find, base, mcache->jcount, sizeof(find), compare_ext);
+ if (map)
+ at = (map - base) / sizeof(find);
+
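+	/* sanity cross-check: the binary search result must agree with a linear search */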
+#if 1
+ {
+ int cmp = _lsearch_extent(mcache, tlba);
+
+ if (cmp != at) {
+ pr_err("_bsrch_extent is failing got %d need %d.\n",
+ at, cmp);
+ at = cmp;
+ }
+ }
+#endif
+
+ return at;
+}
+
+/**
+ * _bsrch_tlba_lck() - Mutex lock and binary search.
+ * @mcache: Page of map cache entries to search.
+ * @tlba: tlba being sought.
+ *
+ * Return: 0 to jcount - 1 if found. -1 if not found
+ */
+static int _bsrch_tlba_lck(struct map_cache *mcache, u64 tlba)
+{
+ int at = -1;
+
+ MutexLock(&mcache->cached_lock);
+ at = _bsrch_tlba(mcache, tlba);
+ mutex_unlock(&mcache->cached_lock);
+
+ return at;
+}
+
+/**
+ * _bsrch_extent_lck() - Mutex Lock and binary search.
+ * @mcache: Page of map cache extent entries to search.
+ * @tlba: tlba being sought.
+ *
+ * Return: 0 to jcount - 1 if found. -1 if not found
+ */
+static int _bsrch_extent_lck(struct map_cache *mcache, u64 tlba)
+{
+ int at = -1;
+
+ MutexLock(&mcache->cached_lock);
+ at = _bsrch_extent(mcache, tlba);
+ mutex_unlock(&mcache->cached_lock);
+
+ return at;
+}
+
+/**
+ * mcache_bsearch() - Do a binary search over a page of map_cache entries.
+ * @mcache: Page of map cache entries to search.
+ * @tlba: tlba being sought.
+ */
+static inline int mcache_bsearch(struct map_cache *mcache, u64 tlba)
+{
+ int at = (mcache->map_content == IS_MAP)
+ ? _bsrch_tlba(mcache, tlba)
+ : _bsrch_extent(mcache, tlba);
+
+ return at;
+}
+
+
+/**
+ * mcache_lsearch() - Do a linear search over a page of map/discard entries.
+ * @mcache: Page of map cache entries to search.
+ * @tlba: tlba being sought.
+ */
+static inline int mcache_lsearch(struct map_cache *mcache, u64 tlba)
+{
+ int at = (mcache->map_content == IS_MAP)
+ ? _lsearch_tlba(mcache, tlba)
+ : _lsearch_extent(mcache, tlba);
+
+ return at;
+}
+
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+static u64 z_lookup_key_range(struct zoned *znd, u64 addr)
+{
+ u64 found = 0ul;
+
+ if (test_bit(ZF_POOL_FWD, &znd->flags))
+ return found;
+
+ if ((znd->sk_low <= addr) && (addr < znd->sk_high))
+ found = FWD_TM_KEY_BASE + (addr - znd->sk_low);
+
+ return found;
+}
+
+/**
+ * increment_used_blks() - Update the 'used' WP when data hits disk.
+ * @znd: ZDM Instance
+ * @lba: bLBA of the completed bio.
+ * @blks: Number of blocks completed in the bio.
+ *
+ * Called from a BIO end_io function so it must not sleep or deadlock.
+ * The 'critical' piece here is ensuring that the wp is advanced to 0x10000.
+ * Secondary is triggering filled_zone, which ultimately sets
+ * Z_WP_GC_READY in wp_alloc. While important, this flag could also be set
+ * during other non-critical passes over wp_alloc and wp_used, such
+ * as during update_stale_ratio().
+ */
+static void increment_used_blks(struct zoned *znd, u64 lba, u32 blks)
+{
+ u32 zone = _calc_zone(znd, lba);
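+	/* wwp: number of blocks used within the zone once this bio has completed */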
+ u32 wwp = ((lba - znd->data_lba) - (zone << Z_BLKBITS)) + blks;
+
+ if (zone < znd->data_zones) {
+ u32 gzno = zone >> GZ_BITS;
+ u32 gzoff = zone & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 used, uflags;
+
+ used = le32_to_cpu(wpg->wp_used[gzoff]);
+ uflags = used & Z_WP_FLAGS_MASK;
+ used &= Z_WP_VALUE_MASK;
+
+ if (wwp > used) {
+ wpg->wp_used[gzoff] = cpu_to_le32(wwp | uflags);
+ /* signal zone closure */
+ if (wwp == Z_BLKSZ)
+ znd->filled_zone = zone;
+ }
+ }
+}
+
+/**
+ * _current_mapping() - Lookup a logical sector address to find the disk LBA
+ * @znd: ZDM instance
+ * @nodisc: Non-zero to ignore the discard cache
+ * @addr: Logical LBA of page.
+ * @gfp: Memory allocation rule
+ *
+ * Return: Disk LBA or 0 if not found.
+ */
+static u64 _current_mapping(struct zoned *znd, int nodisc, u64 addr, gfp_t gfp)
+{
+ u64 found = 0ul;
+
+ if (addr < znd->data_lba)
+ found = addr;
+ if (!found)
+ found = z_lookup_key_range(znd, addr);
+ if (!found && !nodisc) {
+ found = z_lookup_cache(znd, addr, IS_DISCARD);
+ if (found) {
+ found = 0;
+ goto out;
+ }
+ }
+ if (!found)
+ found = z_lookup_cache(znd, addr, IS_MAP);
+ if (!found)
+ found = z_lookup_table(znd, addr, gfp);
+
+out:
+ return found;
+}
+
+/**
+ * current_mapping() - Lookup a logical sector address to find the disk LBA
+ * @znd: ZDM instance
+ * @addr: Logical LBA of page.
+ * @gfp: Memory allocation rule
+ *
+ * NOTE: Discard cache is checked.
+ *
+ * Return: Disk LBA or 0 if not found.
+ */
+static u64 current_mapping(struct zoned *znd, u64 addr, gfp_t gfp)
+{
+ const int nodisc = 0;
+
+ return _current_mapping(znd, nodisc, addr, gfp);
+}
+
+/**
+ * z_mapped_add_one() - Add an entry to the map cache block mapping.
+ * @znd: ZDM Instance
+ * @dm_s: tLBA being mapped.
+ * @lba: bLBA that @dm_s maps to.
+ * @gfp: Current memory allocation scheme.
+ *
+ * If the new mapping overwrites a discard extent, the extent is split
+ * so the hole is not exposed.
+ */
+static int z_mapped_add_one(struct zoned *znd, u64 dm_s, u64 lba, gfp_t gfp)
+{
+ int err = 0;
+
+ if (dm_s < znd->data_lba)
+ return err;
+
+#if 0 /* FIXME: Do the RIGHT thing ... whatever that is. */
+
+ /*
+ * location of the SLT key sectors need to be
+ * stashed into the sector lookup table block map
+ * Does dm_s point in the sector lookup table block map
+ */
+ if (!test_bit(ZF_POOL_FWD, &znd->flags)) {
+ if ((znd->sk_low <= dm_s) && (dm_s < znd->sk_high))
+ lba = FWD_TM_KEY_BASE + (dm_s - znd->sk_low);
+ }
+#endif
+
+ /*
+ * If this mapping over-writes a discard entry then we need to punch a
+ * hole in the discard entry.
+ * - Find the discard entry.
+ * - Translate a chunk of the discard into map entries.
+ * - Break [or trim] the discard entry.
+ */
+ if (lba) {
+ int is_locked = mutex_is_locked(&znd->mz_io_mutex);
+
+ if (is_locked)
+ mutex_unlock(&znd->mz_io_mutex);
+ z_discard_range(znd, dm_s, gfp);
+ if (is_locked)
+ MutexLock(&znd->mz_io_mutex);
+ }
+
+ do {
+ err = z_to_map_list(znd, dm_s, lba, gfp);
+ } while (-EBUSY == err);
+
+ return err;
+}
+
+/**
+ * z_discard_small() - Translate discard extents to map cache block mapping.
+ * @znd: ZDM Instance
+ * @tlba: Address being discarded.
+ * @count: Number of blocks being discarded.
+ * @gfp: Current memory allocation scheme.
+ *
+ * Push each currently mapped block in the range into the map cache as
+ * an unmapped (lba 0) entry.
+ */
+static int z_discard_small(struct zoned *znd, u64 tlba, u64 count, gfp_t gfp)
+{
+ const int nodisc = 1;
+ int err = 0;
+ u64 ii;
+
+ for (ii = 0; ii < count; ii++) {
+ u64 lba = _current_mapping(znd, nodisc, tlba+ii, gfp);
+
+ if (lba)
+ do {
+ err = z_to_map_list(znd, tlba+ii, 0ul, gfp);
+ } while (-EBUSY == err);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+/**
+ * z_discard_large() - Add a discard extent to the mapping cache
+ * @znd: ZDM Instance
+ * @tlba: Address being discarded.
+ * @blks: Number of blocks being discarded.
+ * @gfp: Current memory allocation scheme.
+ *
+ * Add a new extent entry.
+ */
+static int z_discard_large(struct zoned *znd, u64 tlba, u64 blks, gfp_t gfp)
+{
+ int rcode = 0;
+
+ /* for a large discard ... add it to the discard queue */
+ do {
+ rcode = z_to_discard_list(znd, tlba, blks, gfp);
+ } while (-EBUSY == rcode);
+
+ return rcode;
+}
+
+/**
+ * mapped_discard() - Add a discard extent to the mapping cache
+ * @znd: ZDM Instance
+ * @tlba: Address being discarded.
+ * @blks: Number of blocks being discarded.
+ * @merge: Attempt to merge with an existing extent.
+ * @gfp: Current memory allocation scheme.
+ *
+ * First attempt to merge with an existing entry.
+ * Otherwise attempt to push small entries into the map_cache 1:1 mapping.
+ * Otherwise add a new extent entry.
+ */
+static
+int mapped_discard(struct zoned *znd, u64 tlba, u64 blks, int merge, gfp_t gfp)
+{
+ int err = 0;
+
+ if (merge && discard_merge(znd, tlba, blks))
+ goto done;
+
+ if (blks < DISCARD_MAX_INGRESS || znd->dc_entries > 39) {
+		/* hold io_mutex to avoid adding map cache entries during SYNC */
+ MutexLock(&znd->mz_io_mutex);
+ err = z_discard_small(znd, tlba, blks, gfp);
+ mutex_unlock(&znd->mz_io_mutex);
+ goto done;
+ }
+
+ err = z_discard_large(znd, tlba, blks, gfp);
+
+done:
+ return err;
+}
+
+/**
+ * z_mapped_discard() - Add a discard extent to the mapping cache
+ * @znd: ZDM Instance
+ * @tlba: Address being discarded.
+ * @blks: Number of blocks being discarded.
+ * @gfp: Current memory allocation scheme.
+ */
+static int z_mapped_discard(struct zoned *znd, u64 tlba, u64 blks, gfp_t gfp)
+{
+ const int merge = 1;
+
+ return mapped_discard(znd, tlba, blks, merge, gfp);
+}
+
+/**
+ * mcache_alloc() - Allocate a new chunk of map cache
+ * @znd: ZDM Instance
+ * @gfp: Current memory allocation scheme.
+ */
+static struct map_cache *mcache_alloc(struct zoned *znd, gfp_t gfp)
+{
+ struct map_cache *mcache;
+
+ mcache = ZDM_ALLOC(znd, sizeof(*mcache), KM_07, gfp);
+ if (mcache) {
+ mutex_init(&mcache->cached_lock);
+ mcache->jcount = 0;
+ mcache->jsorted = 0;
+ mcache->jdata = ZDM_CALLOC(znd, Z_UNSORTED,
+ sizeof(*mcache->jdata), PG_08, gfp);
+
+ if (mcache->jdata) {
+ u64 logical = Z_LOWER48;
+ u64 physical = Z_LOWER48;
+
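+			/* entry 0 is reserved (key/header); cached mappings use slots 1..jsize */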
+ mcache->jdata[0].tlba = cpu_to_le64(logical);
+ mcache->jdata[0].bval = cpu_to_le64(physical);
+ mcache->jsize = Z_UNSORTED - 1;
+
+ } else {
+ Z_ERR(znd, "%s: in memory journal is out of space.",
+ __func__);
+ ZDM_FREE(znd, mcache, sizeof(*mcache), KM_07);
+ mcache = NULL;
+ }
+ }
+ return mcache;
+}
+
+/**
+ * mcache_first_get() - Get the first chunk from the list.
+ * @znd: ZDM Instance
+ * @type: Type flag of mcache entry (MAP or DISCARD extent).
+ *
+ * The returned mcache chunk has its refcount elevated.
+ * The caller should use mcache_put_get_next() to navigate through the list
+ * or mcache_put() when exiting from the list walk early.
+ */
+static struct map_cache *mcache_first_get(struct zoned *znd, int type)
+{
+ unsigned long flags;
+ struct map_cache *mc;
+
+ spin_lock_irqsave(&znd->mclck[type], flags);
+ mc = list_first_entry_or_null(&znd->mclist[type], typeof(*mc), mclist);
+ if (mc && (&mc->mclist != &znd->mclist[type]))
+ mcache_ref(mc);
+ else
+ mc = NULL;
+ spin_unlock_irqrestore(&znd->mclck[type], flags);
+
+ return mc;
+}
+
+/**
+ * mcache_put_get_next() - Get the next map cache page and put current.
+ * @znd: ZDM Instance
+ * @mcache: Current map cache chunk (its reference is dropped).
+ * @type: Type flag of mcache entry (MAP or DISCARD extent).
+ *
+ * The current mcache chunk must have an elevated refcount,
+ * which is dropped, while the returned chunk has its
+ * refcount elevated in the exchange.
+ */
+static inline struct map_cache *mcache_put_get_next(struct zoned *znd,
+ struct map_cache *mcache,
+ int type)
+{
+ unsigned long flags;
+ struct map_cache *next;
+
+ spin_lock_irqsave(&znd->mclck[type], flags);
+ next = list_next_entry(mcache, mclist);
+ if (next && (&next->mclist != &znd->mclist[type]))
+ mcache_ref(next);
+ else
+ next = NULL;
+ mcache_deref(mcache);
+ spin_unlock_irqrestore(&znd->mclck[type], flags);
+
+ return next;
+}
+
+/**
+ * mclist_add() - Add a new chunk to the map_cache pool.
+ * @znd: ZDM Instance
+ * @mcache: Map cache chunk to add.
+ * @type: Type flag of mcache entry (MAP or DISCARD extent).
+ */
+static inline void mclist_add(struct zoned *znd,
+ struct map_cache *mcache, int type)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&znd->mclck[type], flags);
+ list_add(&mcache->mclist, &znd->mclist[type]);
+ spin_unlock_irqrestore(&znd->mclck[type], flags);
+}
+
+/**
+ * mclist_del() - Release an (empty) chunk of map cache.
+ * @znd: ZDM Instance
+ * @mcache: Map cache chunk to remove.
+ * @type: Type flag of mcache entry (MAP or DISCARD extent).
+ */
+static inline int mclist_del(struct zoned *znd,
+ struct map_cache *mcache, int type)
+{
+ unsigned long flags;
+ int deleted = 0;
+
+ if (spin_trylock_irqsave(&znd->mclck[type], flags)) {
+ MutexLock(&mcache->cached_lock);
+ if (mcache->jcount == 0 && mcache_getref(mcache) == 1) {
+ list_del(&mcache->mclist);
+ deleted = 1;
+ }
+ mutex_unlock(&mcache->cached_lock);
+ spin_unlock_irqrestore(&znd->mclck[type], flags);
+ }
+ return deleted;
+}
+
+/**
+ * mcache_put() - Dereference an mcache page
+ * @mcache: Map cache page.
+ */
+static inline void mcache_put(struct map_cache *mcache)
+{
+ mcache_deref(mcache);
+}
+
+/**
+ * memcache_sort() - Sort an unsorted map cache page
+ * @znd: ZDM instance
+ * @mcache: Map cache page.
+ *
+ * Sort a map cache page if it is not already sorted.
+ * The caller is expected to hold the mcache cached_lock.
+ */
+static void memcache_sort(struct zoned *znd, struct map_cache *mcache)
+{
+ mcache_ref(mcache);
+ if (mcache->jsorted < mcache->jcount) {
+ sort(&mcache->jdata[1], mcache->jcount,
+ sizeof(*mcache->jdata), compare_tlba, NULL);
+ mcache->jsorted = mcache->jcount;
+ }
+ mcache_deref(mcache);
+}
+
+/**
+ * memcache_lock_and_sort() - Attempt to sort an unsorted map cache page
+ * @znd: ZDM instance
+ * @mcache: Map cache page.
+ *
+ * Sort a map cache page if it is not busy being purged to the ZLT.
+ *
+ * Return: -EBUSY if busy, or 0 if sorted.
+ */
+static int memcache_lock_and_sort(struct zoned *znd, struct map_cache *mcache)
+{
+ if (mcache_is_busy(mcache))
+ return -EBUSY;
+
+ if (mcache->jsorted == mcache->jcount)
+ return 0;
+
+ mutex_lock_nested(&mcache->cached_lock, SINGLE_DEPTH_NESTING);
+ memcache_sort(znd, mcache);
+ mutex_unlock(&mcache->cached_lock);
+ return 0;
+}
+
+/**
+ * z_lookup_cache() - Scan mcache entries for addr
+ * @znd: ZDM Instance
+ * @addr: Address [tLBA] to find.
+ * @type: mcache type (MAP or DISCARD cache).
+ */
+static u64 z_lookup_cache(struct zoned *znd, u64 addr, int type)
+{
+ struct map_cache *mcache;
+ u64 found = 0ul;
+
+ mcache = mcache_first_get(znd, type);
+ while (mcache) {
+ int at;
+ int err;
+
+ /* sort, if needed. only err is -EBUSY so do a linear find. */
+ err = memcache_lock_and_sort(znd, mcache);
+ if (!err)
+ at = mcache_bsearch(mcache, addr);
+ else
+ at = mcache_lsearch(mcache, addr);
+
+ if (at != -1) {
+ struct map_sect_to_lba *data = &mcache->jdata[at + 1];
+
+ found = le64_to_lba48(data->bval, NULL);
+ }
+ if (found) {
+ mcache_put(mcache);
+ goto out;
+ }
+
+ mcache = mcache_put_get_next(znd, mcache, type);
+ }
+out:
+ return found;
+}
+
+/**
+ * mcache_insert - Insertion sort (for mostly sorted data)
+ * @znd: ZDM Instance
+ * @mcache: Map Cache block
+ * @tlba: upper stack sector
+ * @blba: lower mapped lba
+ */
+static void mcache_insert(struct zoned *znd, struct map_cache *mcache,
+ u64 tlba, u64 blba)
+{
+ u16 fido = ++mcache->jcount;
+ u16 idx;
+
+ WARN_ON(mcache->jcount > mcache->jsize);
+
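+	/* walk back from the tail, shifting larger entries up until the slot for tlba is found */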
+ for (idx = fido; idx > 1; --idx) {
+ u16 prev = idx - 1;
+ u64 a0 = le64_to_lba48(mcache->jdata[prev].tlba, NULL);
+
+ if (a0 < tlba) {
+ mcache->jdata[idx].tlba = lba48_to_le64(0, tlba);
+ mcache->jdata[idx].bval = lba48_to_le64(0, blba);
+ if ((mcache->jsorted + 1) == mcache->jcount)
+ mcache->jsorted = mcache->jcount;
+ break;
+ }
+ /* move UP .. tlba < a0 */
+ mcache->jdata[idx].tlba = mcache->jdata[prev].tlba;
+ mcache->jdata[idx].bval = mcache->jdata[prev].bval;
+ }
+ if (idx == 1) {
+ mcache->jdata[idx].tlba = lba48_to_le64(0, tlba);
+ mcache->jdata[idx].bval = lba48_to_le64(0, blba);
+ mcache->jsorted = mcache->jcount;
+ }
+}
+
+
+/**
+ * mc_delete_entry - Remove a map_cache entry and compact the remaining entries.
+ * @mcache: Map Cache block
+ * @entry: Entry to remove. [1 .. count]
+ */
+void mc_delete_entry(struct map_cache *mcache, int entry)
+{
+ int iter;
+
+ MutexLock(&mcache->cached_lock);
+
+ mcache->jdata[entry].tlba = lba48_to_le64(0, 0ul);
+ mcache->jdata[entry].bval = lba48_to_le64(0, 0ul);
+
+ for (iter = entry; iter < mcache->jcount; iter++) {
+ int next = iter + 1;
+
+ mcache->jdata[iter].tlba = mcache->jdata[next].tlba;
+ mcache->jdata[iter].bval = mcache->jdata[next].bval;
+ }
+ if (mcache->jcount > 0) {
+ if (mcache->jsorted == mcache->jcount)
+ mcache->jsorted--;
+ mcache->jcount--;
+ } else {
+		pr_err("mcache delete ... discard beyond empty\n");
+ }
+ mutex_unlock(&mcache->cached_lock);
+}
+
+/**
+ * _discard_split_entry() - Split a discard into two parts.
+ * @znd: ZDM Instance
+ * @mcache: mcache holding the current entry
+ * @at: location of entry in mcache (0 .. count - 1)
+ * @addr: address to punch a hole at
+ * @gfp: current memory allocation rules.
+ *
+ * Discard 'extents' need to be broken up when a sub range is being allocated
+ * for writing. Ideally the punch only requires the extent to be shrunk.
+ */
+static int _discard_split_entry(struct zoned *znd, struct map_cache *mcache,
+ int at, u64 addr, gfp_t gfp)
+{
+ const int merge = 0;
+ int err = 0;
+ struct map_sect_to_lba *entry = &mcache->jdata[at + 1];
+ u64 tlba = le64_to_lba48(entry->tlba, NULL);
+ u64 blocks = le64_to_lba48(entry->bval, NULL);
+ u64 dblks = blocks;
+ const u64 chunk = addr - tlba;
+
+ /*
+ * Now chunk is a number of blocks until addr starts.
+ * If the chunk is small push the discard immediately.
+ * Otherwise punch a hole and see if we need to create
+ * a new entry with what remains.
+ */
+ if (chunk > 0 && chunk < DISCARD_MAX_INGRESS) {
+
+ err = mapped_discard(znd, tlba, chunk, merge, gfp);
+ if (err)
+ goto out;
+
+ blocks -= chunk;
+ tlba += chunk;
+
+ entry->tlba = lba48_to_le64(0, tlba);
+ entry->bval = lba48_to_le64(0, blocks);
+
+ if (blocks == 0ul) {
+ mc_delete_entry(mcache, at + 1);
+ goto out;
+ }
+ dblks = blocks;
+ } else {
+ dblks = blocks - chunk; /* nblks from addr to end */
+ }
+
+ /*
+ * When chunk >= DISCARD_MAX_INGRESS
+	 * then the current entry needs to be ended at 'chunk'.
+	 * Note that tlba != addr in this case and 'chunk' remains
+	 * valid until we get around to truncating the entry extent.
+	 * Specifically we need to ensure that the hole doesn't
+	 * expose blocks still covered by the current extent.
+ */
+ if (dblks > DISCARD_MAX_INGRESS)
+ dblks = DISCARD_MAX_INGRESS - 1;
+
+ err = mapped_discard(znd, addr, dblks, merge, gfp);
+ if (err)
+ goto out;
+
+ /*
+	 * Now that we have moved the hole into the map cache we are
+	 * free to update the entry.
+	 * If tlba == addr then we have continued to discard from the
+	 * front of this extent.
+	 *
+	 * If tlba != addr then we discarded from the middle or
+	 * the end. Either there is a hole and we need a second extent
+ * entry or we ran to the end of the extent and we can just
+ * truncate the extent.
+ */
+
+ if (tlba == addr) {
+ blocks -= dblks;
+ tlba += dblks;
+ entry->tlba = lba48_to_le64(0, tlba);
+ entry->bval = lba48_to_le64(0, blocks);
+ if (blocks == 0ul)
+ mc_delete_entry(mcache, at + 1);
+ } else {
+ u64 start = addr + dblks;
+
+ /*
+ * Because we are doing insertion sorting
+ * and tlba < addr we can safely add a new
+ * discard extent entry.
+ * As the current entry is 'stable' we don't need to
+ * re-acquire it.
+ */
+ blocks -= (chunk + dblks);
+ if (blocks > 0ul) {
+ err = mapped_discard(znd, start, blocks, merge, gfp);
+ if (err)
+ goto out;
+ }
+ entry->bval = lba48_to_le64(0, chunk);
+ }
+
+out:
+ return err;
+}
+
+
+/**
+ * z_discard_range() - Migrate a discard range to memcache/ZLT
+ * @znd: ZDM Instance
+ * @addr: tLBA target for breaking up a discard range.
+ * @gfp: Memory allocation scheme
+ *
+ * When an address is being written to, a check is made to see if the tLBA
+ * overlaps with an existing discard extent. If so the extent is
+ * split and multiple smaller discard entries are pushed into the memcache.
+ *
+ * The overall purpose of the discard pool is to reduce the amount of intake
+ * on the memcache to avoid starving the I/O requests.
+ */
+static u64 z_discard_range(struct zoned *znd, u64 addr, gfp_t gfp)
+{
+ struct map_cache *mcache;
+ u64 found = 0ul;
+ int type = IS_DISCARD;
+
+ mcache = mcache_first_get(znd, type);
+ while (mcache) {
+ int at;
+ int err;
+
+ /* sort, if needed. only err is -EBUSY so do a linear find. */
+ err = memcache_lock_and_sort(znd, mcache);
+ if (!err)
+ at = mcache_bsearch(mcache, addr);
+ else
+ at = mcache_lsearch(mcache, addr);
+
+ if (at != -1) {
+ /* break apart the entry */
+ err = _discard_split_entry(znd, mcache, at, addr, gfp);
+ if (err) {
+ mcache_put(mcache);
+ goto out;
+ }
+ found = 1;
+ }
+
+ if (found) {
+ mcache_put(mcache);
+ goto out;
+ }
+
+ mcache = mcache_put_get_next(znd, mcache, type);
+ }
+out:
+ return found;
+}
+
+/**
+ * z_discard_partial() - Migrate a discard range to memcache/ZLT
+ * @znd: ZDM Instance
+ * @minblks: Target number of blocks to cull.
+ * @gfp: Memory allocation scheme
+ *
+ * Walk the discard cache and trickle up to @minblks still-mapped blocks into
+ * the map cache as unmapped entries, trimming or deleting the discard extent
+ * as it is consumed.
+ *
+ * The overall purpose of the discard pool is to reduce the amount of intake
+ * on the memcache to avoid starving the I/O requests.
+ */
+static int z_discard_partial(struct zoned *znd, u32 minblks, gfp_t gfp)
+{
+ struct map_cache *mcache;
+ int completions = 0;
+ int type = IS_DISCARD;
+ int lumax = minblks;
+
+ mcache = mcache_first_get(znd, type);
+ while (mcache) {
+ int at = -1;
+ int err;
+
+ if (mcache->jcount > 1)
+ at = 0;
+
+ if (at != -1) {
+ const int nodisc = 1;
+ s32 e = at + 1;
+ u64 tlba = le64_to_lba48(mcache->jdata[e].tlba, NULL);
+ u64 blks = le64_to_lba48(mcache->jdata[e].bval, NULL);
+ u64 chunk = DISCARD_MAX_INGRESS - 1;
+ u32 dcount = 0;
+ u32 dstop = minblks;
+
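+			/*
+			 * Trickle still-mapped blocks from the head of the
+			 * extent into the map cache as unmapped entries.
+			 */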
+ while (blks > 0 && dcount < dstop && --lumax > 0) {
+ u64 lba = _current_mapping(znd, nodisc,
+ tlba, gfp);
+ if (lba) {
+ do {
+ err = z_to_map_list(znd, tlba,
+ 0ul, gfp);
+ } while (-EBUSY == err);
+ if (err)
+ break;
+ dcount++;
+ }
+ tlba++;
+ blks--;
+ }
+
+ if (chunk > blks)
+ chunk = blks;
+
+ if (chunk == 0) {
+ mc_delete_entry(mcache, e);
+ completions++;
+ } else if (chunk < DISCARD_MAX_INGRESS) {
+ err = mapped_discard(znd, tlba, chunk, 0, gfp);
+ if (err) {
+ mcache_put(mcache);
+ goto out;
+ }
+ blks -= chunk;
+ tlba += chunk;
+
+ mcache->jdata[e].tlba = lba48_to_le64(0, tlba);
+ mcache->jdata[e].bval = lba48_to_le64(0, blks);
+ if (blks == 0ul)
+ mc_delete_entry(mcache, e);
+ }
+ completions++;
+ }
+ if (completions) {
+ int deleted = 0;
+
+ if (completions > 1 &&
+ znd->dc_entries > MEMCACHE_HWMARK &&
+ mcache->jcount == 0 &&
+ mcache_getref(mcache) == 1)
+ deleted = mclist_del(znd, mcache, type);
+
+ mcache_put(mcache);
+
+ if (deleted) {
+ ZDM_FREE(znd, mcache->jdata, Z_C4K, PG_08);
+ ZDM_FREE(znd, mcache, sizeof(*mcache), KM_07);
+ mcache = NULL;
+ znd->dc_entries--;
+ }
+ goto out;
+ }
+ mcache = mcache_put_get_next(znd, mcache, type);
+ }
+out:
+ return completions;
+}
+
+/**
+ * lba_in_zone() - Scan a map cache entry for any targets pointing to zone.
+ * @znd: ZDM instance
+ * @mcache: Map cache page.
+ * @zone: Zone (that was gc'd).
+ *
+ * Scan the map cache page to see if any entries are pointing to a zone.
+ *
+ * Return: 0 if none are found. 1 if any are found.
+ */
+static int lba_in_zone(struct zoned *znd, struct map_cache *mcache, u32 zone)
+{
+ int jentry;
+
+ if (zone >= znd->data_zones)
+ goto out;
+
+ for (jentry = mcache->jcount; jentry > 0; jentry--) {
+ u64 lba = le64_to_lba48(mcache->jdata[jentry].bval, NULL);
+
+ if (lba && _calc_zone(znd, lba) == zone)
+ return 1;
+ }
+out:
+ return 0;
+}
+
+/**
+ * gc_verify_cache() - Test map cache to see if any entries point to zone.
+ * @znd: ZDM instance
+ * @zone: Zone (that was gc'd).
+ *
+ * Scan the map cache to see if any entries are pointing to a zone that
+ * was just GC'd.
+ * If any are found it indicates a bug.
+ *
+ * Return: 0 on success or 1 on error; on error meta_result is set to -ENOSPC
+ * to effectively freeze ZDM.
+ */
+static int gc_verify_cache(struct zoned *znd, u32 zone)
+{
+ struct map_cache *mcache = NULL;
+ int err = 0;
+ int type = IS_MAP;
+
+ mcache = mcache_first_get(znd, type);
+ while (mcache) {
+ MutexLock(&mcache->cached_lock);
+ if (lba_in_zone(znd, mcache, zone)) {
+ Z_ERR(znd, "GC: **ERR** %" PRIx32
+ " LBA in cache <= Corrupt", zone);
+ err = 1;
+ znd->meta_result = -ENOSPC;
+ }
+ mutex_unlock(&mcache->cached_lock);
+ mcache = mcache_put_get_next(znd, mcache, type);
+ }
+
+ return err;
+}
+
+
+/**
+ * __cached_to_tables() - Migrate map cache entries to ZLT
+ * @znd: ZDM instance
+ * @type: Which type (MAP) of cache entries to migrate.
+ * @zone: zone to force migration for partial memcache block
+ *
+ * Scan the memcache and move any full blocks to the lookup tables.
+ * If a partial memcache block contains lbas that map to @zone, force
+ * early migration of that memcache block to ensure it is properly accounted
+ * for and migrated during an upcoming GC pass.
+ *
+ * Return: 0 on success or -errno value
+ */
+static int __cached_to_tables(struct zoned *znd, int type, u32 zone)
+{
+ struct map_cache *mcache = NULL;
+ int err = 0;
+ int try_free = 0;
+
+ mcache = mcache_first_get(znd, type);
+ while (mcache) {
+ int deleted = 0;
+ struct map_cache *jskip;
+
+ mcache_busy(mcache);
+ MutexLock(&mcache->cached_lock);
+ if (mcache->jcount == mcache->jsize) {
+ memcache_sort(znd, mcache);
+ err = move_to_map_tables(znd, mcache);
+ if (!err && (mcache->jcount == 0))
+ try_free++;
+ } else {
+ if (lba_in_zone(znd, mcache, zone)) {
+ Z_ERR(znd,
+ "Moving %d Runts because z: %u",
+ mcache->jcount, zone);
+
+ memcache_sort(znd, mcache);
+ err = move_to_map_tables(znd, mcache);
+ }
+ }
+ mutex_unlock(&mcache->cached_lock);
+ mcache_unbusy(mcache);
+ if (err) {
+ mcache_put(mcache);
+ Z_ERR(znd, "%s: Sector map failed.", __func__);
+ goto out;
+ }
+
+ mcache_ref(mcache);
+ jskip = mcache_put_get_next(znd, mcache, type);
+
+ if (try_free > 0 && znd->mc_entries > MEMCACHE_HWMARK &&
+ mcache->jcount == 0 && mcache_getref(mcache) == 1) {
+ deleted = mclist_del(znd, mcache, type);
+ try_free--;
+ }
+
+ mcache_deref(mcache);
+
+ if (deleted) {
+ ZDM_FREE(znd, mcache->jdata, Z_C4K, PG_08);
+ ZDM_FREE(znd, mcache, sizeof(*mcache), KM_07);
+ mcache = NULL;
+ znd->mc_entries--;
+ }
+ mcache = jskip;
+ }
+out:
+
+ return err;
+}
+
+
+/**
+ * _cached_to_tables() - Migrate memcache entries to lookup tables
+ * @znd: ZDM instance
+ * @zone: zone to force migration for partial memcache block
+ *
+ * Scan the memcache and move any full blocks to the lookup tables.
+ * If a partial memcache block contains lbas that map to @zone, force
+ * early migration of that memcache block to ensure it is properly accounted
+ * for and migrated during an upcoming GC pass.
+ *
+ * Return: 0 on success or -errno value
+ */
+static int _cached_to_tables(struct zoned *znd, u32 zone)
+{
+ int err;
+
+ err = __cached_to_tables(znd, IS_MAP, zone);
+ return err;
+}
+
+
+/**
+ * z_flush_bdev() - Request backing device flushed to disk.
+ * @znd: ZDM instance
+ *
+ * Return: 0 on success or -errno value
+ */
+static int z_flush_bdev(struct zoned *znd, gfp_t gfp)
+{
+ int err;
+ sector_t bi_done;
+
+ err = blkdev_issue_flush(znd->dev->bdev, gfp, &bi_done);
+ if (err)
+ Z_ERR(znd, "%s: flush failing sector %lu!", __func__, bi_done);
+
+ return err;
+}
+
+/**
+ * pg_delete - Free a map_pg with spin locks held.
+ * @znd: ZDM Instance
+ * @expg: Page being released.
+ *
+ * Forced inline as it is 'optional' and because it is called with
+ * spin locks held and only from a single caller.
+ */
+static __always_inline int pg_delete(struct zoned *znd, struct map_pg *expg)
+{
+ struct map_pg **zlt;
+ spinlock_t *lock;
+ int req_flush = 0;
+ int dropped = 0;
+ int index = expg->index;
+ int is_lut = test_bit(IS_LUT, &expg->flags);
+ int is_fwd = test_bit(IS_FWD, &expg->flags);
+ u32 count = is_lut ? znd->map_count : znd->crc_count;
+
+ if (is_lut) {
+ zlt = is_fwd ? znd->fwd_tm : znd->rev_tm;
+ lock = &znd->mapkey_lock;
+ } else {
+ zlt = is_fwd ? znd->fwd_crc : znd->rev_crc;
+ lock = &znd->ct_lock;
+ }
+
+ if (!spin_trylock(lock))
+ return req_flush;
+
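+	/* only drop the page if it is still installed at zlt[index] and marked dropped */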
+ if (index > -1 && index < count && expg == zlt[index] &&
+ test_bit(IS_DROPPED, &expg->flags)) {
+ zlt[index] = NULL;
+ MutexLock(&expg->md_lock);
+ if (test_and_clear_bit(IS_LAZY, &expg->flags)) {
+ clear_bit(IS_DROPPED, &expg->flags);
+ clear_bit(DELAY_ADD, &expg->flags);
+ if (expg->data.addr) {
+ void *pg = expg->data.addr;
+
+ expg->data.addr = NULL;
+ ZDM_FREE(znd, pg, Z_C4K, PG_27);
+ atomic_dec(&znd->incore);
+ } else {
+ Z_ERR(znd, "** No data pg? %llx", expg->lba);
+ }
+ list_del(&expg->lazy);
+ znd->in_lzy--;
+ req_flush = !test_bit(IS_FLUSH, &expg->flags);
+ dropped = 1;
+ } else {
+ Z_ERR(znd, "Detected double list del.");
+ }
+ mutex_unlock(&expg->md_lock);
+ if (dropped)
+ ZDM_FREE(znd, expg, sizeof(*expg), KM_20);
+ }
+ spin_unlock(lock);
+ return req_flush;
+}
+
+/**
+ * manage_lazy_activity() - Migrate delayed 'add' entries to the ZLT
+ * @znd: ZDM Instance
+ *
+ * The lzy list is used to perform less critical activities that could
+ * be done via the ZLT primary list but are given a second chance:
+ *  - Adding, when the zlt spin lock would have blocked.
+ *  - Deleting, when the cache entry turns out to be 'hotter' than
+ *    the default; we can catch it and make it 'hotter' before the
+ *    hotness indicator is lost.
+ */
+static int manage_lazy_activity(struct zoned *znd)
+{
+ struct map_pg *expg;
+ struct map_pg *_tpg;
+ int want_flush = 0;
+ const u32 msecs = MEM_PURGE_MSECS;
+
+ spin_lock(&znd->lzy_lck);
+ expg = list_first_entry_or_null(&znd->lzy_pool, typeof(*expg), lazy);
+ if (!expg || (&expg->lazy == &znd->lzy_pool))
+ goto out;
+
+ _tpg = list_next_entry(expg, lazy);
+ while (&expg->lazy != &znd->lzy_pool) {
+ /*
+ * this should never happen:
+ */
+ if (test_bit(IS_DIRTY, &expg->flags))
+ set_bit(DELAY_ADD, &expg->flags);
+ /*
+		 * Migrate pg to the zltlst list
+ */
+ if (test_bit(DELAY_ADD, &expg->flags)) {
+ if (spin_trylock(&znd->zlt_lck)) {
+ if (!test_bit(IN_ZLT, &expg->flags)) {
+ list_del(&expg->lazy);
+ znd->in_lzy--;
+ clear_bit(IS_LAZY, &expg->flags);
+ clear_bit(IS_DROPPED, &expg->flags);
+ clear_bit(DELAY_ADD, &expg->flags);
+ set_bit(IN_ZLT, &expg->flags);
+ list_add(&expg->zltlst, &znd->zltpool);
+ znd->in_zlt++;
+ } else {
+ Z_ERR(znd, "** ZLT double add? %llx",
+ expg->lba);
+ }
+ spin_unlock(&znd->zlt_lck);
+ }
+#if ENABLE_PG_FREE_VIA_LAZY
+ } else {
+ /*
+ * Delete page
+ */
+ if (!test_bit(IN_ZLT, &expg->flags) &&
+ test_bit(IS_DROPPED, &expg->flags)) {
+ if (is_expired_msecs(expg->age, msecs))
+ want_flush |= pg_delete(znd, expg);
+ }
+#endif
+ }
+ expg = _tpg;
+ _tpg = list_next_entry(expg, lazy);
+ }
+
+out:
+ spin_unlock(&znd->lzy_lck);
+ return want_flush;
+}
+
+
+
+/**
+ * mark_clean_flush_zlt() - Mark all non-dirty ZLT blocks as 'FLUSH'
+ * @znd: ZDM instance
+ *
+ * After a FLUSH/FUA these blocks are on disk and redundant FLUSH
+ * can be skipped if the block is later ejected.
+ */
+static void mark_clean_flush_zlt(struct zoned *znd)
+{
+ struct map_pg *expg = NULL;
+ struct map_pg *_tpg;
+
+ spin_lock(&znd->zlt_lck);
+ if (list_empty(&znd->zltpool))
+ goto out;
+
+ expg = list_last_entry(&znd->zltpool, typeof(*expg), zltlst);
+ if (!expg || &expg->zltlst == (&znd->zltpool))
+ goto out;
+
+ _tpg = list_prev_entry(expg, zltlst);
+ while (&expg->zltlst != &znd->zltpool) {
+ ref_pg(expg);
+ if (!test_bit(IS_DIRTY, &expg->flags))
+ set_bit(IS_FLUSH, &expg->flags);
+ deref_pg(expg);
+ expg = _tpg;
+ _tpg = list_prev_entry(expg, zltlst);
+ }
+
+out:
+ spin_unlock(&znd->zlt_lck);
+}
+
+/**
+ * mark_clean_flush_lzy() - Mark all non-dirty ZLT blocks as 'FLUSH'
+ * @znd: ZDM instance
+ *
+ * After a FLUSH/FUA these blocks are on disk and redundant FLUSH
+ * can be skipped if the block is later ejected.
+ */
+static void mark_clean_flush_lzy(struct zoned *znd)
+{
+ struct map_pg *expg = NULL;
+ struct map_pg *_tpg;
+
+ spin_lock(&znd->lzy_lck);
+ expg = list_first_entry_or_null(&znd->lzy_pool, typeof(*expg), lazy);
+ if (!expg || (&expg->lazy == &znd->lzy_pool))
+ goto out;
+
+ _tpg = list_next_entry(expg, lazy);
+ while (&expg->lazy != &znd->lzy_pool) {
+ ref_pg(expg);
+ if (!test_bit(IS_DIRTY, &expg->flags))
+ set_bit(IS_FLUSH, &expg->flags);
+ deref_pg(expg);
+ expg = _tpg;
+ _tpg = list_next_entry(expg, lazy);
+ }
+
+out:
+ spin_unlock(&znd->lzy_lck);
+}
+
+/**
+ * mark_clean_flush() - Mark all non-dirty ZLT/LZY blocks as 'FLUSH'
+ * @znd: ZDM instance
+ *
+ * After a FLUSH/FUA these blocks are on disk and redundant FLUSH
+ * can be skipped if the block is later ejected.
+ */
+static void mark_clean_flush(struct zoned *znd)
+{
+ mark_clean_flush_zlt(znd);
+ mark_clean_flush_lzy(znd);
+}
+
+/**
+ * do_sync_metadata() - Write ZDM state to disk.
+ * @znd: ZDM instance
+ * @sync: Non-zero to sync dirty metadata pages to disk.
+ * @drop: Target number of in-core map pages to eject.
+ *
+ * Return: 0 on success or -errno value
+ */
+static int do_sync_metadata(struct zoned *znd, int sync, int drop)
+{
+ int err = 0;
+ int want_flush;
+
+ want_flush = manage_lazy_activity(znd);
+ if (want_flush)
+ set_bit(DO_FLUSH, &znd->flags);
+
+ /* if drop is non-zero, DO_FLUSH may be set on return */
+ err = sync_mapped_pages(znd, sync, drop);
+ if (err) {
+ Z_ERR(znd, "Uh oh: sync_mapped_pages -> %d", err);
+ goto out;
+ }
+
+ /*
+ * If we are lucky then this sync will get us to a 'clean'
+	 * state and the follow-on bdev flush is redundant and skipped.
+	 *
+	 * If not we will suffer a performance stall because we
+	 * ejected blocks.
+ *
+ * TODO: On Sync/Flush/FUA we can mark all of our clean ZLT
+ * as flushed and we can bypass elevating the drop count
+ * to trigger a flush for such already flushed blocks.
+ */
+ want_flush = test_bit(DO_FLUSH, &znd->flags);
+ err = z_mapped_sync(znd);
+ if (err) {
+ Z_ERR(znd, "Uh oh. z_mapped_sync -> %d", err);
+ goto out;
+ }
+
+ if (test_and_clear_bit(DO_FLUSH, &znd->flags)) {
+ err = z_flush_bdev(znd, GFP_KERNEL);
+ if (err) {
+ Z_ERR(znd, "Uh oh. flush_bdev failed. -> %d", err);
+ goto out;
+ }
+ want_flush = 1;
+ }
+ if (want_flush)
+ mark_clean_flush(znd);
+
+out:
+ return err;
+}
+
+/**
+ * do_init_from_journal() - Restore ZDM state from disk.
+ * @znd: ZDM instance
+ *
+ * Return: 0 on success or -errno value
+ */
+static int do_init_from_journal(struct zoned *znd)
+{
+ int err = 0;
+
+ if (test_and_clear_bit(DO_JOURNAL_LOAD, &znd->flags))
+ err = z_mapped_init(znd);
+
+ return err;
+}
+
+/**
+ * do_journal_to_table() - Migrate memcache entries to lookup tables
+ * @znd: ZDM instance
+ *
+ * Return: 0 on success or -errno value
+ */
+static int do_journal_to_table(struct zoned *znd)
+{
+ int err = 0;
+
+ if (test_and_clear_bit(DO_JOURNAL_MOVE, &znd->flags) ||
+ test_bit(DO_SYNC, &znd->flags))
+ err = _cached_to_tables(znd, znd->data_zones);
+
+ return err;
+}
+
+/**
+ * do_sync_to_disk() - Write ZDM state to disk.
+ * @znd: ZDM instance
+ *
+ * Return: 0 on success or -errno value
+ */
+static int do_sync_to_disk(struct zoned *znd)
+{
+ int err = 0;
+ int drop = 0;
+ int sync = 0;
+
+ if (test_and_clear_bit(DO_SYNC, &znd->flags) ||
+ test_bit(DO_FLUSH, &znd->flags))
+ sync = 1;
+
+ if (test_and_clear_bit(DO_MEMPOOL, &znd->flags)) {
+ int pool_size = MZ_MEMPOOL_SZ * 4;
+
+#if ENABLE_PG_FREE_VIA_LAZY
+ /**
+ * Trust our cache miss algo
+ */
+ pool_size = MZ_MEMPOOL_SZ * 3;
+#endif
+
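+		/* shrink the in-core pool target as the instance goes idle */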
+ if (is_expired_msecs(znd->age, MEM_PURGE_MSECS * 2))
+ pool_size = 3;
+ else if (is_expired_msecs(znd->age, MEM_PURGE_MSECS))
+ pool_size = MZ_MEMPOOL_SZ;
+
+ if (atomic_read(&znd->incore) > pool_size)
+ drop = atomic_read(&znd->incore) - pool_size;
+ }
+ if (sync || drop)
+ err = do_sync_metadata(znd, sync, drop);
+
+ return err;
+}
+
+/**
+ * meta_work_task() - Worker thread for metadata activity.
+ * @work: Work struct containing ZDM instance.
+ */
+static void meta_work_task(struct work_struct *work)
+{
+ int err = 0;
+ int locked = 0;
+ struct zoned *znd;
+
+ if (!work)
+ return;
+
+ znd = container_of(work, struct zoned, meta_work);
+ if (!znd)
+ return;
+
+ err = do_init_from_journal(znd);
+
+ if (test_bit(DO_JOURNAL_MOVE, &znd->flags) ||
+ test_bit(DO_FLUSH, &znd->flags) ||
+ test_bit(DO_SYNC, &znd->flags)) {
+ MutexLock(&znd->mz_io_mutex);
+ locked = 1;
+ }
+
+ /*
+ * Reduce memory pressure on journal list of arrays
+ * by pushing them into the sector map lookup tables
+ */
+ if (!err)
+ err = do_journal_to_table(znd);
+
+ /* force a consistent set of meta data out to disk */
+ if (!err)
+ err = do_sync_to_disk(znd);
+
+ if (locked)
+ mutex_unlock(&znd->mz_io_mutex);
+
+ znd->age = jiffies_64;
+ if (err < 0)
+ znd->meta_result = err;
+
+ clear_bit(DO_METAWORK_QD, &znd->flags);
+}
+
+/**
+ * _update_entry() - Update an existing map cache entry.
+ * @znd: ZDM instance
+ * @mcache: Map cache block
+ * @at: Data entry in map cache block
+ * @dm_s: tLBA mapping from
+ * @lba: New lba
+ * @gfp: Allocation flags to use.
+ *
+ * Overwriting an entry triggers an 'unused' to be pushed, recording
+ * the old block as stale.
+ */
+static int _update_entry(struct zoned *znd, struct map_cache *mcache, int at,
+ u64 dm_s, u64 lba, gfp_t gfp)
+{
+ struct map_sect_to_lba *data;
+ u64 lba_was;
+ int err = 0;
+
+ data = &mcache->jdata[at + 1];
+ lba_was = le64_to_lba48(data->bval, NULL);
+ lba &= Z_LOWER48;
+ data->bval = lba48_to_le64(0, lba);
+
+ if (lba != lba_was) {
+ Z_DBG(znd, "Remap %" PRIx64 " -> %" PRIx64
+ " (was %" PRIx64 "->%" PRIx64 ")",
+ dm_s, lba, le64_to_lba48(data->tlba, NULL), lba_was);
+ err = unused_phy(znd, lba_was, 0, gfp);
+ if (err == 1)
+ err = 0;
+ }
+
+ return err;
+}
+
+/**
+ * _update_disc() - Update an existing discard cache entry
+ * @znd: ZDM instance
+ * @mcache: Map cache block
+ * @at: Data entry in map cache block
+ * @dm_s: tLBA mapping from
+ * @blks: number of blocks
+ */
+static int _update_disc(struct zoned *znd, struct map_cache *mcache, int at,
+ u64 dm_s, u64 blks)
+{
+ struct map_sect_to_lba *data;
+ u64 oldsz;
+ int err = 0;
+
+ data = &mcache->jdata[at + 1];
+ oldsz = le64_to_lba48(data->bval, NULL);
+ data->bval = lba48_to_le64(0, blks);
+
+ return err;
+}
+
+/**
+ * mc_update() - Update an existing cache entry
+ * @znd: ZDM instance
+ * @mcache: Map cache block
+ * @at: Data entry in map cache block
+ * @dm_s: tLBA mapping from
+ * @lba: Value (lba or number of blocks)
+ * @type: List (type) to be adding to (MAP or DISCARD)
+ * @gfp: Allocation (kmalloc) flags
+ */
+static int mc_update(struct zoned *znd, struct map_cache *mcache, int at,
+ u64 dm_s, u64 lba, int type, gfp_t gfp)
+{
+ int rcode;
+
+ if (type == IS_MAP)
+ rcode = _update_entry(znd, mcache, at, dm_s, lba, gfp);
+ else
+ rcode = _update_disc(znd, mcache, at, dm_s, lba);
+
+ return rcode;
+}
+
+/**
+ * _mapped_list() - Add an entry to the discard or map cache
+ * @znd: ZDM instance
+ * @dm_s: tLBA mapping from
+ * @lba: Value (lba or number of blocks)
+ * @type: List (type) to be adding to (MAP or DISCARD)
+ * @gfp: Allocation (kmalloc) flags
+ */
+static
+int _mapped_list(struct zoned *znd, u64 dm_s, u64 lba, int type, gfp_t gfp)
+{
+ struct map_cache *mcache = NULL;
+ struct map_cache *mc_add = NULL;
+ int handled = 0;
+ int list_count = 0;
+ int err = 0;
+ int add_to_list = 0;
+
+ mcache = mcache_first_get(znd, type);
+ while (mcache) {
+ int at;
+
+ MutexLock(&mcache->cached_lock);
+ memcache_sort(znd, mcache);
+ at = _bsrch_tlba(mcache, dm_s);
+ if (at != -1) {
+ mcache_ref(mcache);
+ err = mc_update(znd, mcache, at, dm_s, lba, type, gfp);
+ mcache_deref(mcache);
+ handled = 1;
+ } else if (!mc_add) {
+ if (mcache->jcount < mcache->jsize) {
+ mc_add = mcache;
+ mcache_ref(mc_add);
+ }
+ }
+ mutex_unlock(&mcache->cached_lock);
+ if (handled) {
+ if (mc_add)
+ mcache_deref(mc_add);
+ mcache_put(mcache);
+ goto out;
+ }
+ mcache = mcache_put_get_next(znd, mcache, type);
+ list_count++;
+ }
+
+ /* ------------------------------------------------------------------ */
+ /* ------------------------------------------------------------------ */
+
+ if (!mc_add) {
+ mc_add = mcache_alloc(znd, gfp);
+ if (!mc_add) {
+ Z_ERR(znd, "%s: in memory journal is out of space.",
+ __func__);
+ err = -ENOMEM;
+ goto out;
+ }
+ mcache_ref(mc_add);
+ if (type == IS_MAP) {
+ if (list_count > MEMCACHE_HWMARK)
+ set_bit(DO_JOURNAL_MOVE, &znd->flags);
+ znd->mc_entries = list_count + 1;
+ } else {
+ znd->dc_entries = list_count + 1;
+ }
+ add_to_list = 1;
+ }
+
+ /* ------------------------------------------------------------------ */
+ /* ------------------------------------------------------------------ */
+
+ if (mc_add) {
+ MutexLock(&mc_add->cached_lock);
+
+ if (!mc_add->jdata) {
+ mc_add->jdata = ZDM_CALLOC(znd, Z_UNSORTED,
+ sizeof(*mc_add->jdata), PG_08, gfp);
+ }
+ if (!mc_add->jdata) {
+ Z_ERR(znd, "%s: in memory journal is out of space.",
+ __func__);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (mc_add->jcount < mc_add->jsize) {
+ mcache_insert(znd, mc_add, dm_s, lba);
+ if (add_to_list)
+ mclist_add(znd, mc_add, type);
+ } else {
+ err = -EBUSY;
+ }
+ mutex_unlock(&mc_add->cached_lock);
+ mcache_put(mc_add);
+ }
+out:
+ return err;
+}
+
+/**
+ * z_to_discard_list() - Add a discard extent entry to the discard cache
+ * @znd: ZDM instance
+ * @dm_s: tLBA mapping from
+ * @blks: Number of blocks in the extent.
+ * @gfp: Allocation (kmalloc) flags
+ */
+static int z_to_discard_list(struct zoned *znd, u64 dm_s, u64 blks, gfp_t gfp)
+{
+ return _mapped_list(znd, dm_s, blks, IS_DISCARD, gfp);
+}
+
+/**
+ * z_to_map_list() - Add a map cache entry to the map cache
+ * @znd: ZDM instance
+ * @dm_s: tLBA mapping from
+ * @lba: bLBA mapping to
+ * @gfp: Allocation (kmalloc) flags
+ */
+static int z_to_map_list(struct zoned *znd, u64 dm_s, u64 lba, gfp_t gfp)
+{
+ return _mapped_list(znd, dm_s, lba, IS_MAP, gfp);
+}
+
+/**
+ * discard_merge() - Merge a discard request with an existing entry.
+ * @znd: ZDM Instance
+ * @tlba: Starting address
+ * @blks: Number of blocks in discard.
+ *
+ */
+static int discard_merge(struct zoned *znd, u64 tlba, u64 blks)
+{
+ struct map_cache *mcache;
+ u64 ends = tlba - 1;
+ u64 next = tlba + blks;
+ int type = IS_DISCARD;
+ int merged = 0;
+
+ mcache = mcache_first_get(znd, type);
+ while (mcache) {
+ int at;
+ int err;
+
+ /*
+ * 1) Modify existing discard entry, if it exists
+ * 2) Otherwise: If an existing entry *starts* where this
+ * entry *ends* extend that entry.
+ * 3) Otherwise: If an existing entry ends where this
+ * entry starts, extend *that* entry.
+ */
+ err = memcache_lock_and_sort(znd, mcache);
+ if (err == -EBUSY)
+ at = _lsearch_tlba(mcache, tlba);
+ else
+ at = _bsrch_tlba_lck(mcache, tlba);
+
+ if (at != -1) {
+ mcache_ref(mcache);
+ err = _update_disc(znd, mcache, at, tlba, blks);
+ mcache_deref(mcache);
+ merged = 1;
+ at = -1;
+ }
+
+ /* Existing entry starts where this discard ends */
+ if (!merged) {
+ if (err == -EBUSY)
+ at = _lsearch_tlba(mcache, next);
+ else
+ at = _bsrch_tlba_lck(mcache, next);
+ }
+ if (at != -1) {
+ struct map_sect_to_lba *data;
+ u64 oldsz;
+
+ mcache_ref(mcache);
+ data = &mcache->jdata[at + 1];
+ data->tlba = lba48_to_le64(0, tlba);
+ oldsz = le64_to_lba48(data->bval, NULL);
+ data->bval = lba48_to_le64(0, oldsz + blks);
+ mcache_deref(mcache);
+ merged = 1;
+ at = -1;
+ }
+
+ /*
+ * Find a discard that includes 'ends', if so
+ * determine if the containing discard should be extended
+ * or if this discard is fully contained within the current
+ * discard entry.
+ */
+ if (!merged) {
+ if (err == -EBUSY)
+ at = _lsearch_extent(mcache, ends);
+ else
+ at = _bsrch_extent_lck(mcache, ends);
+ }
+ if (at != -1) {
+ struct map_sect_to_lba *data;
+ u64 oldsz;
+ u64 origin;
+ u64 extend;
+
+ mcache_ref(mcache);
+ data = &mcache->jdata[at + 1];
+ origin = le64_to_lba48(data->tlba, NULL);
+ oldsz = le64_to_lba48(data->bval, NULL);
+ extend = (tlba - origin) + blks;
+ if (extend > oldsz)
+ data->bval = lba48_to_le64(0, extend);
+ mcache_deref(mcache);
+ merged = 1;
+ at = -1;
+ }
+ if (merged) {
+ mcache_put(mcache);
+ goto done;
+ }
+ mcache = mcache_put_get_next(znd, mcache, type);
+ }
+done:
+ return merged;
+}
+
+/**
+ * next_generation() - Increment generation number for superblock.
+ * @znd: ZDM instance
+ */
+static inline u64 next_generation(struct zoned *znd)
+{
+ u64 generation = le64_to_cpu(znd->bmkeys->generation);
+
+ if (generation == 0)
+ generation = 2;
+
+ generation++;
+ if (generation == 0)
+ generation++;
+
+ return generation;
+}
+
+/**
+ * z_mapped_sync() - Write cache entries and bump superblock generation.
+ * @znd: ZDM instance
+ */
+static int z_mapped_sync(struct zoned *znd)
+{
+ struct dm_target *ti = znd->ti;
+ struct map_cache *mcache;
+ int nblks = 1;
+ int use_wq = 0;
+ int rc = 1;
+ int discards = 0;
+ int maps = 0;
+ int jwrote = 0;
+ int cached = 0;
+ int idx = 0;
+ int no;
+ int need_sync_io = 1;
+ u64 lba = LBA_SB_START;
+ u64 generation = next_generation(znd);
+ u64 modulo = CACHE_COPIES;
+ u64 incr = MAX_CACHE_INCR;
+ struct io_4k_block *io_vcache;
+
+ MutexLock(&znd->vcio_lock);
+ io_vcache = get_io_vcache(znd, NORMAL);
+ if (!io_vcache) {
+ Z_ERR(znd, "%s: FAILED to get IO CACHE.", __func__);
+ rc = -ENOMEM;
+ goto out;
+ }
+
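+ /*
+ * Write any dirty write-pointer / zone-free pages to their fixed
+ * location (two blocks per group starting at LBA 0x2048) and
+ * record their CRCs in the key block.
+ */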
+ lba = 0x2048ul;
+ for (idx = 0; idx < znd->gz_count; idx++) {
+ struct meta_pg *wpg = &znd->wp[idx];
+
+ if (test_bit(IS_DIRTY, &wpg->flags)) {
+ cached = 0;
+ memcpy(io_vcache[cached].data, wpg->wp_alloc, Z_C4K);
+ znd->bmkeys->wp_crc[idx] =
+ crc_md_le16(io_vcache[cached].data, Z_CRC_4K);
+ cached++;
+ memcpy(io_vcache[cached].data, wpg->zf_est, Z_C4K);
+ znd->bmkeys->zf_crc[idx] =
+ crc_md_le16(io_vcache[cached].data, Z_CRC_4K);
+ cached++;
+ rc = write_block(ti, DM_IO_VMA, io_vcache, lba,
+ cached, use_wq);
+ if (rc)
+ goto out;
+ clear_bit(IS_DIRTY, &wpg->flags);
+
+ Z_DBG(znd, "%d# -- WP: %04x | ZF: %04x",
+ idx, znd->bmkeys->wp_crc[idx],
+ znd->bmkeys->zf_crc[idx]);
+ }
+ lba += 2;
+ }
+
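+ /*
+ * Rotate the sync image across CACHE_COPIES fixed slots keyed by
+ * generation number (slot 0 begins at LBA 1, not 0).
+ */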
+ lba = (generation % modulo) * incr;
+ if (lba == 0)
+ lba++;
+
+ znd->bmkeys->generation = cpu_to_le64(generation);
+ znd->bmkeys->gc_resv = cpu_to_le32(znd->z_gc_resv);
+ znd->bmkeys->meta_resv = cpu_to_le32(znd->z_meta_resv);
+
+ idx = 0;
+ for (no = 0; no < MAP_COUNT; no++) {
+ mcache = mcache_first_get(znd, no);
+ while (mcache) {
+ u64 phy = le64_to_lba48(mcache->jdata[0].bval, NULL);
+ u16 jcount = mcache->jcount & 0xFFFF;
+
+ if (jcount) {
+ mcache->jdata[0].bval =
+ lba48_to_le64(jcount, phy);
+ znd->bmkeys->crcs[idx] =
+ crc_md_le16(mcache->jdata, Z_CRC_4K);
+ idx++;
+ memcpy(io_vcache[cached].data,
+ mcache->jdata, Z_C4K);
+ cached++;
+
+ if (no == IS_DISCARD)
+ discards++;
+ else
+ maps++;
+ }
+
+ if (cached == IO_VCACHE_PAGES) {
+ rc = write_block(ti, DM_IO_VMA, io_vcache, lba,
+ cached, use_wq);
+ if (rc) {
+ Z_ERR(znd, "%s: cache-> %" PRIu64
+ " [%d blks] %p -> %d",
+ __func__, lba, nblks,
+ mcache->jdata, rc);
+ mcache_put(mcache);
+ goto out;
+ }
+ lba += cached;
+ jwrote += cached;
+ cached = 0;
+ }
+ mcache = mcache_put_get_next(znd, mcache, no);
+ }
+ }
+ jwrote += cached;
+ if (discards > 40)
+ Z_ERR(znd, "**WARNING** large discard cache %d", discards);
+ if (maps > 40)
+ Z_ERR(znd, "**WARNING** large map cache %d", maps);
+
+ znd->bmkeys->md_crc = crc_md_le16(znd->md_crcs, 2 * Z_CRC_4K);
+ znd->bmkeys->n_crcs = cpu_to_le16(jwrote);
+ znd->bmkeys->discards = cpu_to_le16(discards);
+ znd->bmkeys->maps = cpu_to_le16(maps);
+ znd->bmkeys->crc32 = 0;
+ znd->bmkeys->crc32 = cpu_to_le32(crc32c(~0u, znd->bmkeys, Z_CRC_4K));
+ if (cached < (IO_VCACHE_PAGES - 3)) {
+ memcpy(io_vcache[cached].data, znd->bmkeys, Z_C4K);
+ cached++;
+ memcpy(io_vcache[cached].data, znd->md_crcs, Z_C4K * 2);
+ cached += 2;
+ need_sync_io = 0;
+ }
+
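+ /*
+ * Flush the staged cache pages. If the key block and CRC pages did
+ * not fit above, a second pass writes them; the pass carrying the
+ * key block adds REQ_FLUSH|REQ_FUA when a flush was requested.
+ */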
+ do {
+ if (cached > 0) {
+ int rw = WRITE;
+
+ if (!need_sync_io &&
+ test_and_clear_bit(DO_FLUSH, &znd->flags))
+ rw |= (REQ_FLUSH | REQ_FUA);
+ rc = writef_block(ti, rw, DM_IO_VMA, io_vcache, lba,
+ cached, use_wq);
+ if (rc) {
+ Z_ERR(znd, "%s: mcache-> %" PRIu64
+ " [%d blks] %p -> %d",
+ __func__, lba, cached, io_vcache, rc);
+ goto out;
+ }
+ lba += cached;
+ }
+
+ cached = 0;
+ if (need_sync_io) {
+ memcpy(io_vcache[cached].data, znd->bmkeys, Z_C4K);
+ cached++;
+ memcpy(io_vcache[cached].data, znd->md_crcs, Z_C4K * 2);
+ cached += 2;
+ need_sync_io = 0;
+ }
+ } while (cached > 0);
+
+out:
+ put_io_vcache(znd, io_vcache);
+ mutex_unlock(&znd->vcio_lock);
+ return rc;
+}
+
+/**
+ * is_key_page() - Probe block for magic and crc to see if it is recognized.
+ * @_data: Block of data to check.
+ */
+static inline int is_key_page(void *_data)
+{
+ int is_key = 0;
+ struct mz_superkey *data = _data;
+
+ /* Starts with Z_KEY_SIG and ends with magic */
+
+ if (le64_to_cpu(data->sig1) == Z_KEY_SIG &&
+ le64_to_cpu(data->magic) == Z_TABLE_MAGIC) {
+ __le32 orig = data->crc32;
+ __le32 crc_check;
+
+ data->crc32 = 0;
+ crc_check = cpu_to_le32(crc32c(~0u, data, Z_CRC_4K));
+ data->crc32 = orig;
+ if (crc_check == orig)
+ is_key = 1;
+ }
+ return is_key;
+}
+
+/**
+ * zoned_personality() - Update zdstart value from superblock
+ * @znd: ZDM instance
+ * @sblock: Superblock to read the zdstart value from.
+ */
+static inline
+void zoned_personality(struct zoned *znd, struct zdm_superblock *sblock)
+{
+ znd->zdstart = le32_to_cpu(sblock->zdstart);
+}
+
+/**
+ * find_superblock_at() - Find superblock following lba
+ * @znd: ZDM instance
+ * @lba: Lba to start scanning for superblock.
+ * @use_wq: If a work queue is needed for scanning.
+ * @do_init: Set zdstart from found superblock.
+ */
+static
+int find_superblock_at(struct zoned *znd, u64 lba, int use_wq, int do_init)
+{
+ struct dm_target *ti = znd->ti;
+ int found = 0;
+ int nblks = 1;
+ int rc = -ENOMEM;
+ u32 count = 0;
+ u64 *data = ZDM_ALLOC(znd, Z_C4K, PG_10, NORMAL);
+
+ if (!data) {
+ Z_ERR(znd, "No memory for finding generation ..");
+ return 0;
+ }
+ if (lba == 0)
+ lba++;
+ do {
+ rc = read_block(ti, DM_IO_KMEM, data, lba, nblks, use_wq);
+ if (rc) {
+ Z_ERR(znd, "%s: read @%" PRIu64 " [%d blks] %p -> %d",
+ __func__, lba, nblks, data, rc);
+ goto out;
+ }
+ if (is_key_page(data)) {
+ struct mz_superkey *kblk = (struct mz_superkey *) data;
+ struct zdm_superblock *sblock = &kblk->sblock;
+ int err = sb_check(sblock);
+
+ if (!err) {
+ found = 1;
+ if (do_init)
+ zoned_personality(znd, sblock);
+ }
+ goto out;
+ }
+ if (data[0] == 0 && data[1] == 0) {
+ /* No SB here. */
+ Z_ERR(znd, "FGen: Invalid block %" PRIx64 "?", lba);
+ goto out;
+ }
+ lba++;
+ count++;
+ if (count > MAX_CACHE_SYNC) {
+ Z_ERR(znd, "FSB: Too deep to be useful.");
+ goto out;
+ }
+ } while (!found);
+
+out:
+ ZDM_FREE(znd, data, Z_C4K, PG_10);
+ return found;
+}
+
+/**
+ * find_superblock() - Find (any) superblock
+ * @znd: ZDM instance
+ * @use_wq: If a work queue is needed for scanning.
+ * @do_init: Set/Retrieve zdstart from found super block.
+ */
+static int find_superblock(struct zoned *znd, int use_wq, int do_init)
+{
+ int found = 0;
+ int iter = 0;
+ u64 lba = LBA_SB_START;
+ u64 last = MAX_CACHE_INCR * CACHE_COPIES;
+
+ do {
+ found = find_superblock_at(znd, lba, use_wq, do_init);
+ if (found)
+ break;
+ iter++;
+ lba = MAX_CACHE_INCR * iter;
+ } while (lba < last);
+
+ return found;
+}
+
+/**
+ * mcache_find_gen() - Find the super block following lba and get gen#
+ * @znd: ZDM instance
+ * @lba: LBA to start scanning for the super block.
+ * @use_wq: If a work queue is needed for scanning.
+ * @sb_lba: LBA where the super block was found.
+ */
+static u64 mcache_find_gen(struct zoned *znd, u64 lba, int use_wq, u64 *sb_lba)
+{
+ struct dm_target *ti = znd->ti;
+ u64 generation = 0;
+ int nblks = 1;
+ int rc = 1;
+ int done = 0;
+ u32 count = 0;
+ u64 *data = ZDM_ALLOC(znd, Z_C4K, PG_11, NORMAL);
+
+ if (!data) {
+ Z_ERR(znd, "No memory for finding generation ..");
+ return 0;
+ }
+ do {
+ rc = read_block(ti, DM_IO_KMEM, data, lba, nblks, use_wq);
+ if (rc) {
+ Z_ERR(znd, "%s: mcache-> %" PRIu64
+ " [%d blks] %p -> %d",
+ __func__, lba, nblks, data, rc);
+ goto out;
+ }
+ if (is_key_page(data)) {
+ struct mz_superkey *kblk = (struct mz_superkey *) data;
+
+ generation = le64_to_cpu(kblk->generation);
+ done = 1;
+ if (sb_lba)
+ *sb_lba = lba;
+ goto out;
+ }
+ lba++;
+ count++;
+ if (count > MAX_CACHE_SYNC) {
+ Z_ERR(znd, "FGen: Too deep to be useful.");
+ goto out;
+ }
+ } while (!done);
+
+out:
+ ZDM_FREE(znd, data, Z_C4K, PG_11);
+ return generation;
+}
+
+/**
+ * cmp_gen() - compare two u64 numbers considering rollover
+ * @left: a u64
+ * @right: a u64
+ * Return: 0 if equal, negative if @left is the more recent generation,
+ *         positive if @right is the more recent (rollover aware).
+ */
+static inline int cmp_gen(u64 left, u64 right)
+{
+ int result = 0;
+
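+ /*
+ * Generations normally differ by a small amount; a delta wider than
+ * 32 bits implies rollover or an unwritten copy, in which case a
+ * left of BAD_ADDR always loses to right.
+ */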
+ if (left != right) {
+ u64 delta = (left > right) ? left - right : right - left;
+
+ result = (left > right) ? -1 : 1;
+ if (delta > 0xFFFFFFFF) {
+ if (left == BAD_ADDR)
+ result = 1;
+ } else {
+ if (right > left)
+ result = 1;
+ }
+ }
+
+ return result;
+}
+
+/**
+ * mcache_greatest_gen() - Pick the lba where the super block should start.
+ * @znd: ZDM instance
+ * @use_wq: If a workqueue is needed for IO.
+ * @sb: LBA of super block itself.
+ * @at_lba: LBA where sync data starts (in front of the super block).
+ */
+static
+u64 mcache_greatest_gen(struct zoned *znd, int use_wq, u64 *sb, u64 *at_lba)
+{
+ u64 lba = LBA_SB_START;
+ u64 gen_no[CACHE_COPIES] = { 0ul, 0ul, 0ul };
+ u64 gen_lba[CACHE_COPIES] = { 0ul, 0ul, 0ul };
+ u64 gen_sb[CACHE_COPIES] = { 0ul, 0ul, 0ul };
+ u64 incr = MAX_CACHE_INCR;
+ int locations = ARRAY_SIZE(gen_lba);
+ int pick = 0;
+ int idx;
+
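+ /*
+ * Probe each of the well-known sync locations, remember the
+ * generation, start LBA and key-block LBA of each copy, then keep
+ * whichever copy has the most recent generation.
+ */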
+ for (idx = 0; idx < locations; idx++) {
+ u64 *pAt = &gen_sb[idx];
+
+ gen_lba[idx] = lba;
+ gen_no[idx] = mcache_find_gen(znd, lba, use_wq, pAt);
+ if (gen_no[idx])
+ pick = idx;
+ lba = (idx + 1) * incr;
+ }
+
+ for (idx = 0; idx < locations; idx++) {
+ if (cmp_gen(gen_no[pick], gen_no[idx]) > 0)
+ pick = idx;
+ }
+
+ if (gen_no[pick]) {
+ if (at_lba)
+ *at_lba = gen_lba[pick];
+ if (sb)
+ *sb = gen_sb[pick];
+ }
+
+ return gen_no[pick];
+}
+
+/**
+ * count_stale_blocks() - Number of stale blocks covered by meta_pg.
+ * @znd: ZDM instance
+ * @gzno: Meta page # to scan.
+ * @wpg: Meta page to scan.
+ */
+static u64 count_stale_blocks(struct zoned *znd, u32 gzno, struct meta_pg *wpg)
+{
+ u32 gzcount = 1 << GZ_BITS;
+ u32 iter;
+ u64 stale = 0;
+
+ if ((gzno << GZ_BITS) > znd->data_zones)
+ gzcount = znd->data_zones & GZ_MMSK;
+
+ /* count stale blocks in each zone covered by this page */
+ for (iter = 0; iter < gzcount; iter++) {
+ u32 wp = le32_to_cpu(wpg->wp_alloc[iter]) & Z_WP_VALUE_MASK;
+ u32 nf = le32_to_cpu(wpg->zf_est[iter]) & Z_WP_VALUE_MASK;
+
+ if (wp > (Z_BLKSZ - nf))
+ stale += (wp - (Z_BLKSZ - nf));
+ }
+
+ return stale;
+}
+
+static int do_load_cache(struct zoned *znd, int type, u64 lba, int idx, int wq)
+{
+ u16 count;
+ __le16 crc;
+ int rc = 0;
+ int blks = 1;
+ struct map_cache *mcache = mcache_alloc(znd, NORMAL);
+
+ if (!mcache) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
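+ /*
+ * Read one cached map/discard page and verify it against the CRC
+ * recorded in the key block before adding it to the in-core list.
+ */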
+ rc = read_block(znd->ti, DM_IO_KMEM, mcache->jdata, lba, blks, wq);
+ if (rc) {
+ Z_ERR(znd, "%s: mcache-> %" PRIu64
+ " [%d blks] %p -> %d",
+ __func__, lba, blks, mcache->jdata, rc);
+
+ ZDM_FREE(znd, mcache->jdata, Z_C4K, PG_08);
+ ZDM_FREE(znd, mcache, sizeof(*mcache), KM_07);
+ goto out;
+ }
+ crc = crc_md_le16(mcache->jdata, Z_CRC_4K);
+ if (crc != znd->bmkeys->crcs[idx]) {
+ rc = -EIO;
+ Z_ERR(znd, "%s: bad crc %" PRIu64, __func__, lba);
+ goto out;
+ }
+ (void)le64_to_lba48(mcache->jdata[0].bval, &count);
+ mcache->jcount = count;
+ mclist_add(znd, mcache, type);
+
+out:
+ return rc;
+}
+
+static int do_load_map_cache(struct zoned *znd, u64 lba, int idx, int wq)
+{
+ return do_load_cache(znd, IS_MAP, lba, idx, wq);
+}
+
+static int do_load_discard_cache(struct zoned *znd, u64 lba, int idx, int wq)
+{
+ return do_load_cache(znd, IS_DISCARD, lba, idx, wq);
+}
+
+
+
+/**
+ * z_mapped_init() - Re-Load an existing ZDM instance from the block device.
+ * @znd: ZDM instance
+ *
+ * FIXME: Discard extent read-back does not match z_mapped_sync writing
+ */
+static int z_mapped_init(struct zoned *znd)
+{
+ struct dm_target *ti = znd->ti;
+ int nblks = 1;
+ int wq = 0;
+ int rc = 1;
+ int idx = 0;
+ int jcount = 0;
+ u64 sblba = 0;
+ u64 lba = 0;
+ u64 generation;
+ __le32 crc_chk;
+ struct io_4k_block *io_vcache;
+
+ MutexLock(&znd->vcio_lock);
+ io_vcache = get_io_vcache(znd, NORMAL);
+
+ if (!io_vcache) {
+ Z_ERR(znd, "%s: FAILED to get SYNC CACHE.", __func__);
+ rc = -ENOMEM;
+ goto out;
+ }
+
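+ /*
+ * Find the most recent sync image: sblba is the key block of that
+ * copy, lba is where its cached map/discard pages begin.
+ */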
+ generation = mcache_greatest_gen(znd, wq, &sblba, &lba);
+ if (generation == 0) {
+ rc = -ENODATA;
+ goto out;
+ }
+
+ if (lba == 0)
+ lba++;
+
+ /* read superblock */
+ rc = read_block(ti, DM_IO_VMA, io_vcache, sblba, nblks, wq);
+ if (rc)
+ goto out;
+
+ memcpy(znd->bmkeys, io_vcache, sizeof(*znd->bmkeys));
+
+ /* read in map cache */
+ for (idx = 0; idx < le16_to_cpu(znd->bmkeys->maps); idx++) {
+ rc = do_load_map_cache(znd, lba++, jcount++, wq);
+ if (rc)
+ goto out;
+ }
+
+ /* read in discard cache */
+ for (idx = 0; idx < le16_to_cpu(znd->bmkeys->discards); idx++) {
+ rc = do_load_discard_cache(znd, lba++, jcount++, wq);
+ if (rc)
+ goto out;
+ }
+
+ /* skip re-read of superblock */
+ if (lba == sblba)
+ lba++;
+
+ /* read in CRC pgs */
+ rc = read_block(ti, DM_IO_VMA, znd->md_crcs, lba, 2, wq);
+ if (rc)
+ goto out;
+
+ crc_chk = znd->bmkeys->crc32;
+ znd->bmkeys->crc32 = 0;
+ znd->bmkeys->crc32 = cpu_to_le32(crc32c(~0u, znd->bmkeys, Z_CRC_4K));
+
+ if (crc_chk != znd->bmkeys->crc32) {
+ Z_ERR(znd, "Bad Block Map KEYS!");
+ Z_ERR(znd, "Key CRC: Ex: %04x vs %04x <- calculated",
+ le32_to_cpu(crc_chk),
+ le32_to_cpu(znd->bmkeys->crc32));
+ rc = -EIO;
+ goto out;
+ }
+
+ if (jcount != le16_to_cpu(znd->bmkeys->n_crcs)) {
+ Z_ERR(znd, " ... mcache entries: found = %u, expected = %u",
+ jcount, le16_to_cpu(znd->bmkeys->n_crcs));
+ rc = -EIO;
+ goto out;
+ }
+
+ crc_chk = crc_md_le16(znd->md_crcs, Z_CRC_4K * 2);
+ if (crc_chk != znd->bmkeys->md_crc) {
+ Z_ERR(znd, "CRC of CRC PGs: Ex %04x vs %04x <- calculated",
+ le16_to_cpu(znd->bmkeys->md_crc),
+ le16_to_cpu(crc_chk));
+ rc = -EIO;
+ goto out;
+ }
+
+ /*
+ * Read write pointers / free counters.
+ */
+ lba = 0x2048ul;
+ znd->discard_count = 0;
+ for (idx = 0; idx < znd->gz_count; idx++) {
+ struct meta_pg *wpg = &znd->wp[idx];
+ __le16 crc_wp;
+ __le16 crc_zf;
+
+ rc = read_block(ti, DM_IO_KMEM, wpg->wp_alloc, lba, 1, wq);
+ if (rc)
+ goto out;
+ crc_wp = crc_md_le16(wpg->wp_alloc, Z_CRC_4K);
+ if (znd->bmkeys->wp_crc[idx] != crc_wp)
+ Z_ERR(znd, "WP @ %d does not match written.", idx);
+
+ rc = read_block(ti, DM_IO_KMEM, wpg->zf_est, lba + 1, 1, wq);
+ if (rc)
+ goto out;
+ crc_zf = crc_md_le16(wpg->zf_est, Z_CRC_4K);
+ if (znd->bmkeys->zf_crc[idx] != crc_zf)
+ Z_ERR(znd, "ZF @ %d does not match written.", idx);
+
+ Z_DBG(znd, "%d# -- WP: %04x [%04x] | ZF: %04x [%04x]",
+ idx, znd->bmkeys->wp_crc[idx], crc_wp,
+ znd->bmkeys->zf_crc[idx], crc_zf);
+
+
+ if (znd->bmkeys->wp_crc[idx] == crc_wp &&
+ znd->bmkeys->zf_crc[idx] == crc_zf)
+ znd->discard_count += count_stale_blocks(znd, idx, wpg);
+
+ lba += 2;
+ }
+ znd->z_gc_resv = le32_to_cpu(znd->bmkeys->gc_resv);
+ znd->z_meta_resv = le32_to_cpu(znd->bmkeys->meta_resv);
+
+out:
+ put_io_vcache(znd, io_vcache);
+ mutex_unlock(&znd->vcio_lock);
+ return rc;
+}
+
+/**
+ * z_mapped_addmany() - Add multiple entries into the map_cache
+ * @znd: ZDM instance
+ * @dm_s: tLBA
+ * @lba: lba on backing device.
+ * @count: Number of (contiguous) map entries to add.
+ * @gfp: Allocation flags (for _ALLOC)
+ */
+static int z_mapped_addmany(struct zoned *znd, u64 dm_s, u64 lba,
+ u64 count, gfp_t gfp)
+{
+ int rc = 0;
+ sector_t blk;
+
+ for (blk = 0; blk < count; blk++) {
+ rc = z_mapped_add_one(znd, dm_s + blk, lba + blk, gfp);
+ if (rc)
+ goto out;
+ }
+
+out:
+ return rc;
+}
+
+/**
+ * alloc_pg() - Allocate a map page
+ * @znd: ZDM instance
+ * @entry: entry (in mpi table) to update on allocation.
+ * @lba: LBA associated with the page of ZLT
+ * @mpi: Map page information (lookup table entry, bit flags, etc)
+ * @ahead: Flag to set READA flag on page
+ * @gfp: Allocation flags (for _ALLOC)
+ */
+static struct map_pg *alloc_pg(struct zoned *znd, int entry, u64 lba,
+ struct mpinfo *mpi, int ahead, gfp_t gfp)
+{
+ spinlock_t *lock = &znd->ct_lock;
+ struct map_pg *found = ZDM_ALLOC(znd, sizeof(*found), KM_20, gfp);
+
+ if (found) {
+ if (mpi->bit_type == IS_LUT)
+ lock = &znd->mapkey_lock;
+
+ found->lba = lba;
+ mutex_init(&found->md_lock);
+ set_bit(mpi->bit_dir, &found->flags);
+ set_bit(mpi->bit_type, &found->flags);
+ found->age = jiffies_64;
+ found->index = entry;
+ INIT_LIST_HEAD(&found->zltlst);
+ INIT_LIST_HEAD(&found->lazy);
+ found->znd = znd;
+ found->crc_pg = NULL;
+ ref_pg(found);
+ if (ahead)
+ set_bit(IS_READA, &found->flags);
+
+ /*
+ * Allocation done; check whether a concurrent allocation
+ * raced us and won.
+ */
+ spin_lock(lock);
+ if (mpi->table[entry] == NULL)
+ mpi->table[entry] = found;
+ else
+ ZDM_FREE(znd, found, sizeof(*found), KM_20);
+ spin_unlock(lock);
+ } else {
+ Z_ERR(znd, "NO MEM for mapped_t !!!");
+ }
+ return found;
+}
+
+/**
+ * _maybe_undrop() - If a page is on its way out of cache pull it back.
+ * @znd: ZDM instance
+ * @pg: Page to claim
+ *
+ * When a table page is being dropped from the cache it may transition
+ * through the lazy pool. If a page is caught in the lazy pool it is
+ * deemed to be 'warm': not hot enough to be frequently hit, but clearly
+ * too warm to be dropped quickly. Give it a boost to keep it in cache
+ * longer.
+ */
+static __always_inline int _maybe_undrop(struct zoned *znd, struct map_pg *pg)
+{
+ int undrop = 0;
+
+ if (test_bit(IS_DROPPED, &pg->flags)) {
+ spin_lock(&znd->lzy_lck);
+ if (test_bit(IS_DROPPED, &pg->flags) &&
+ test_bit(IS_LAZY, &pg->flags)) {
+ clear_bit(IS_DROPPED, &pg->flags);
+ set_bit(DELAY_ADD, &pg->flags);
+ }
+ spin_unlock(&znd->lzy_lck);
+ if (pg->hotness < MEM_PURGE_MSECS)
+ pg->hotness += MEM_HOT_BOOST_INC;
+ pg->age = jiffies_64 + msecs_to_jiffies(pg->hotness);
+ undrop = 1;
+ }
+ ref_pg(pg);
+ return undrop;
+}
+
+/**
+ * _load_backing_pages() - Cache a backing page
+ * @znd: ZDM instance
+ * @lba: Logical LBA of page.
+ * @gfp: Memory allocation rule
+ *
+ * When metadata is pooled with data the FWD table lookup can
+ * be recursive (the page needed to resolve the FWD entry is
+ * itself on disk). The recursion is never deep but it can
+ * be avoided or mitigated by keeping such 'key' pages in cache.
+ */
+static int _load_backing_pages(struct zoned *znd, u64 lba, gfp_t gfp)
+{
+ const int raflg = 1;
+ int rc = 0;
+ int entry;
+ struct mpinfo mpi;
+ struct map_addr maddr;
+ struct map_pg *found;
+
+ if (lba < znd->data_lba)
+ goto out;
+
+ map_addr_calc(znd, lba, &maddr);
+ entry = to_table_entry(znd, maddr.lut_s, &mpi);
+ if (entry > -1) {
+ spinlock_t *lock = &znd->ct_lock;
+
+ if (mpi.bit_type == IS_LUT)
+ lock = &znd->mapkey_lock;
+
+ spin_lock(lock);
+ found = mpi.table[entry];
+ if (found)
+ _maybe_undrop(znd, found);
+ spin_unlock(lock);
+ if (!found)
+ found = alloc_pg(znd, entry, lba, &mpi, raflg, gfp);
+ if (found) {
+ if (!found->data.addr) {
+ rc = cache_pg(znd, found, gfp, &mpi);
+ if (rc < 0 && rc != -EBUSY)
+ znd->meta_result = rc;
+ }
+ deref_pg(found);
+
+ if (getref_pg(found) != 0)
+ Z_ERR(znd, "Backing page with elevated ref: %u",
+ getref_pg(found));
+ }
+ }
+
+out:
+ return rc;
+}
+
+/**
+ * _load_crc_page() - Cache a page of CRC
+ * @znd: ZDM instance
+ * @mpi: Map page information identifying the CRC page to load.
+ * @gfp: Memory allocation rule
+ *
+ * When a table page is cached the page containing its CRC is also pulled
+ * into cache. Rather than defer it to cache_pg() it's brought into the
+ * cache here.
+ */
+static int _load_crc_page(struct zoned *znd, struct mpinfo *mpi, gfp_t gfp)
+{
+ const int raflg = 0;
+ int rc = 0;
+ int entry;
+ struct map_pg *found;
+ spinlock_t *lock = &znd->ct_lock;
+ u64 base = (mpi->bit_dir == IS_REV) ? znd->c_mid : znd->c_base;
+
+ base += mpi->crc.pg_no;
+ entry = to_table_entry(znd, base, mpi);
+
+ if (mpi->bit_type != IS_CRC)
+ return rc;
+ if (entry >= znd->crc_count)
+ return rc;
+
+ spin_lock(lock);
+ found = mpi->table[entry];
+ if (found)
+ _maybe_undrop(znd, found);
+ spin_unlock(lock);
+ if (!found)
+ found = alloc_pg(znd, entry, base, mpi, raflg, gfp);
+
+ if (found) {
+ if (!found->data.crc) {
+ rc = cache_pg(znd, found, gfp, mpi);
+ if (rc < 0 && rc != -EBUSY)
+ znd->meta_result = rc;
+ }
+ deref_pg(found);
+ }
+ return rc;
+}
+
+/**
+ * put_map_entry() - Decrement refcount of mapped page.
+ * @pg: mapped page
+ */
+static inline void put_map_entry(struct map_pg *pg)
+{
+ if (pg)
+ deref_pg(pg);
+}
+
+/**
+ * get_map_entry() - Find a page of LUT or CRC table map.
+ * @znd: ZDM instance
+ * @lba: Logical LBA of page.
+ * @gfp: Memory allocation rule
+ *
+ * Return: struct map_pg * or NULL on error.
+ *
+ * Page will be loaded from disk if it is not already in core memory.
+ */
+static struct map_pg *get_map_entry(struct zoned *znd, u64 lba, gfp_t gfp)
+{
+ struct mpinfo mpi;
+ struct map_pg *ahead[READ_AHEAD];
+ int entry;
+ int iter;
+ int range;
+ u32 count;
+ spinlock_t *lock;
+
+ memset(&ahead, 0, sizeof(ahead));
+
+ entry = to_table_entry(znd, lba, &mpi);
+ if (entry < 0)
+ return NULL;
+
+ if (mpi.bit_type == IS_LUT) {
+ lock = &znd->mapkey_lock;
+ count = znd->map_count;
+
+ if (mpi.bit_dir == IS_FWD) {
+ /*
+ * pre-load any backing pages
+ * to unwind recursive page lookups.
+ */
+ _load_backing_pages(znd, lba, gfp);
+ }
+ _load_crc_page(znd, &mpi, gfp);
+
+ range = entry + ARRAY_SIZE(ahead);
+ if (range > count)
+ range = count;
+ } else {
+ lock = &znd->ct_lock;
+ count = znd->crc_count;
+
+ /* CRC's cover 2k pages .. so only pull one extra */
+ range = entry + 1;
+ if (range > count)
+ range = count;
+ }
+
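+ /*
+ * Walk the requested entry plus its read-ahead window: boost pages
+ * already in core, allocate the missing ones, and hold a reference
+ * on each in ahead[] for the load pass below.
+ */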
+ iter = 0;
+ while (entry < range) {
+ int want_cached = 1;
+ struct map_pg *found;
+
+ entry = to_table_entry(znd, lba, &mpi);
+ if (entry < 0)
+ break;
+
+ spin_lock(lock);
+ found = mpi.table[entry];
+ if (found &&
+ _maybe_undrop(znd, found) &&
+ test_bit(IS_READA, &found->flags)
+ && iter > 0)
+ want_cached = 0;
+
+ if (found) {
+ if (want_cached)
+ found->age = jiffies_64
+ + msecs_to_jiffies(found->hotness);
+ else
+ found->age +=
+ msecs_to_jiffies(MEM_HOT_BOOST_INC);
+ }
+ spin_unlock(lock);
+
+ if (want_cached && !found)
+ found = alloc_pg(znd, entry, lba, &mpi, iter, gfp);
+
+ if (found) {
+ if (want_cached)
+ ahead[iter] = found;
+ else
+ deref_pg(found);
+ }
+ iter++;
+ entry++;
+ lba++;
+ }
+
+ /*
+ * Each entry in ahead has an elevated refcount.
+ * Only allow the target of get_map_entry() to remain elevated.
+ */
+ for (iter = 0; iter < ARRAY_SIZE(ahead); iter++) {
+ struct map_pg *pg = ahead[iter];
+
+ if (pg) {
+ if (!pg->data.addr) {
+ int rc;
+
+ to_table_entry(znd, pg->lba, &mpi);
+ rc = cache_pg(znd, pg, gfp, &mpi);
+
+ if (rc < 0 && rc != -EBUSY) {
+ znd->meta_result = rc;
+ ahead[iter] = NULL;
+ }
+ }
+ if (iter > 0)
+ deref_pg(pg);
+ }
+ }
+
+ return ahead[0];
+}
+
+/**
+ * metadata_dirty_fling() - Force a ZLT block into cache and flag it dirty.
+ * @znd: ZDM Instance
+ * @dm_s: Current lba to consider.
+ *
+ * Used when data and ZDM's metadata are co-mingled. If dm_s is a block
+ * of ZDM's metadata it needs to be relocated. Since we re-locate
+ * blocks that are dirty and in the cache ... if this block is
+ * metadata, force it into the cache and flag it as dirty.
+ */
+static int metadata_dirty_fling(struct zoned *znd, u64 dm_s)
+{
+ struct map_pg *pg = NULL;
+ int is_flung = 0;
+
+ /*
+ * When data and metadata are co-mingled, nothing in the GC
+ * reverse map should point to a block *before* the data pool.
+ */
+ if (dm_s < znd->data_lba)
+ return is_flung;
+
+ if (dm_s >= znd->r_base && dm_s < znd->c_end) {
+ pg = get_map_entry(znd, dm_s, NORMAL);
+ if (!pg)
+ Z_ERR(znd, "Failed to fling: %" PRIx64, dm_s);
+ }
+ if (pg) {
+ MutexLock(&pg->md_lock);
+ pg->age = jiffies_64 + msecs_to_jiffies(pg->hotness);
+ clear_bit(IS_READA, &pg->flags);
+ set_bit(IS_DIRTY, &pg->flags);
+ clear_bit(IS_FLUSH, &pg->flags);
+ is_flung = 1;
+ mutex_unlock(&pg->md_lock);
+
+ if (pg->lba != dm_s)
+ Z_ERR(znd, "Excess churn? lba %"PRIx64
+ " [last: %"PRIx64"]", dm_s, pg->lba);
+ put_map_entry(pg);
+ }
+ return is_flung;
+}
+
+/**
+ * z_do_copy_more() - GC transition to read more blocks.
+ * @gc_state: GC State to be updated.
+ */
+static inline void z_do_copy_more(struct gc_state *gc_entry)
+{
+ unsigned long flags;
+ struct zoned *znd = gc_entry->znd;
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ set_bit(DO_GC_PREPARE, &gc_entry->gc_flags);
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+}
+
+/**
+ * gc_post_add() - Add a tLBA and current bLBA origin.
+ * @znd: ZDM Instance
+ * @addr: tLBA
+ * @lba: bLBA
+ *
+ * Return: 1 if tLBA is added, 0 if block was stale.
+ *
+ * Stale block checks are performed before tLBA is added.
+ * Add a non-stale block to the list of blocks for moving and
+ * metadata updating.
+ */
+static int gc_post_add(struct zoned *znd, u64 addr, u64 lba)
+{
+ struct map_cache *post = &znd->gc_postmap;
+ int handled = 0;
+
+ if (z_lookup_cache(znd, addr, IS_DISCARD))
+ return handled;
+
+ if (metadata_dirty_fling(znd, addr))
+ return handled;
+
+ if (post->jcount < post->jsize) {
+ u16 idx = ++post->jcount;
+
+ WARN_ON(post->jcount > post->jsize);
+
+ post->jdata[idx].tlba = lba48_to_le64(0, addr);
+ post->jdata[idx].bval = lba48_to_le64(0, lba);
+ handled = 1;
+ } else {
+ Z_ERR(znd, "*CRIT* post overflow L:%" PRIx64 "-> S:%" PRIx64,
+ lba, addr);
+ }
+ return handled;
+}
+
+
+/**
+ * _fwd_to_cache() - Helper to pull the forward map block into cache.
+ * @znd: ZDM Instance
+ * @addr: Address (from reverse map table entry).
+ */
+static __always_inline int _fwd_to_cache(struct zoned *znd, u64 addr)
+{
+ int err = 0;
+ struct map_pg *pg;
+ struct map_addr maddr;
+
+ if (addr < znd->nr_blocks) {
+ map_addr_calc(znd, addr, &maddr);
+ pg = get_map_entry(znd, maddr.lut_s, NORMAL);
+ if (!pg)
+ err = -ENOMEM;
+ put_map_entry(pg);
+ } else {
+ Z_ERR(znd, "Invalid rmap entry: %" PRIx64, addr);
+ }
+
+ return err;
+}
+
+/**
+ * z_zone_gc_metadata_to_ram() - Load affected metadata blocks to ram.
+ * @gc_entry: Compaction event in progress
+ *
+ * Return: 0, otherwise errno.
+ *
+ * Use the reverse ZLT to find the forward ZLT entries that need to be
+ * remapped in this zone.
+ * When complete, znd->gc_postmap holds a map of all the non-stale
+ * blocks remaining in the zone.
+ */
+static int z_zone_gc_metadata_to_ram(struct gc_state *gc_entry)
+{
+ struct zoned *znd = gc_entry->znd;
+ u64 from_lba = (gc_entry->z_gc << Z_BLKBITS) + znd->md_end;
+ struct map_pg *rev_pg;
+ struct map_addr origin;
+ int count;
+ int rcode = 0;
+
+ /* pull all of the affected struct map_pg and crc pages into memory: */
+ for (count = 0; count < Z_BLKSZ; count++) {
+ __le32 ORencoded;
+ u64 blba = from_lba + count;
+
+ map_addr_calc(znd, blba, &origin);
+ rev_pg = get_map_entry(znd, origin.lut_r, NORMAL);
+ if (rev_pg && rev_pg->data.addr) {
+ ref_pg(rev_pg);
+ MutexLock(&rev_pg->md_lock);
+ ORencoded = rev_pg->data.addr[origin.pg_idx];
+ mutex_unlock(&rev_pg->md_lock);
+ if (ORencoded != MZTEV_UNUSED) {
+ u64 tlba = map_value(znd, ORencoded);
+ int err = _fwd_to_cache(znd, tlba);
+
+ if (err)
+ rcode = err;
+ gc_post_add(znd, tlba, blba);
+ }
+ deref_pg(rev_pg);
+ }
+ put_map_entry(rev_pg);
+ }
+
+ return rcode;
+}
+
+/**
+ * append_blks() - Read (more) blocks into buffer.
+ * @znd: ZDM Instance
+ * @lba: Starting blba
+ * @io_buf: Buffer to read into
+ * @count: Number of blocks to read.
+ */
+static int append_blks(struct zoned *znd, u64 lba,
+ struct io_4k_block *io_buf, int count)
+{
+ int rcode = 0;
+ int rc;
+ u32 chunk;
+ struct io_4k_block *io_vcache;
+
+ MutexLock(&znd->gc_vcio_lock);
+ io_vcache = get_io_vcache(znd, NORMAL);
+ if (!io_vcache) {
+ Z_ERR(znd, "%s: FAILED to get SYNC CACHE.", __func__);
+ rcode = -ENOMEM;
+ goto out;
+ }
+
+ for (chunk = 0; chunk < count; chunk += IO_VCACHE_PAGES) {
+ u32 nblks = count - chunk;
+
+ if (nblks > IO_VCACHE_PAGES)
+ nblks = IO_VCACHE_PAGES;
+
+ rc = read_block(znd->ti, DM_IO_VMA, io_vcache, lba, nblks, 0);
+ if (rc) {
+ Z_ERR(znd, "Reading error ... disable zone: %u",
+ (u32)(lba >> 16));
+ rcode = -EIO;
+ goto out;
+ }
+ memcpy(&io_buf[chunk], io_vcache, nblks * Z_C4K);
+ lba += nblks;
+ }
+out:
+ put_io_vcache(znd, io_vcache);
+ mutex_unlock(&znd->gc_vcio_lock);
+ return rcode;
+}
+
+/**
+ * z_zone_gc_read() - Read (up to) a buffer worth of data from zone.
+ * @gc_entry: Active GC state
+ */
+static int z_zone_gc_read(struct gc_state *gc_entry)
+{
+ struct zoned *znd = gc_entry->znd;
+ struct io_4k_block *io_buf = znd->gc_io_buf;
+ struct map_cache *post = &znd->gc_postmap;
+ unsigned long flags;
+ u64 start_lba;
+ int nblks;
+ int rcode = 0;
+ int fill = 0;
+ int jstart;
+ int jentry;
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ jstart = gc_entry->r_ptr;
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+
+ if (!jstart)
+ jstart++;
+
+ MutexLock(&post->cached_lock);
+
+ /* A discard may have punched holes in the postmap. Re-sync lba. */
+ jentry = jstart;
+ while (jentry <= post->jcount && (Z_LOWER48 ==
+ le64_to_lba48(post->jdata[jentry].bval, NULL))) {
+ jentry++;
+ }
+ /* nothing left to move */
+ if (jentry > post->jcount)
+ goto out_finished;
+
+ /* skip over any discarded blocks */
+ if (jstart != jentry)
+ jstart = jentry;
+
+ start_lba = le64_to_lba48(post->jdata[jentry].bval, NULL);
+ post->jdata[jentry].bval = lba48_to_le64(GC_READ, start_lba);
+ nblks = 1;
+ jentry++;
+
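+ /*
+ * Coalesce contiguous bLBAs into as few reads as possible while
+ * filling up to GC_MAX_STRIPE blocks of the GC buffer.
+ */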
+ while (jentry <= post->jcount && (nblks+fill) < GC_MAX_STRIPE) {
+ u64 dm_s = le64_to_lba48(post->jdata[jentry].tlba, NULL);
+ u64 lba = le64_to_lba48(post->jdata[jentry].bval, NULL);
+
+ if (Z_LOWER48 == dm_s || Z_LOWER48 == lba) {
+ jentry++;
+ continue;
+ }
+
+ post->jdata[jentry].bval = lba48_to_le64(GC_READ, lba);
+
+ /* if the block is contiguous add it to the read */
+ if (lba == (start_lba + nblks)) {
+ nblks++;
+ } else {
+ if (nblks) {
+ int err;
+
+ err = append_blks(znd, start_lba,
+ &io_buf[fill], nblks);
+ if (err) {
+ rcode = err;
+ goto out;
+ }
+ fill += nblks;
+ }
+ start_lba = lba;
+ nblks = 1;
+ }
+ jentry++;
+ }
+
+ /* Issue a copy of 'nblks' blocks */
+ if (nblks > 0) {
+ int err;
+
+ err = append_blks(znd, start_lba, &io_buf[fill], nblks);
+ if (err) {
+ rcode = err;
+ goto out;
+ }
+ fill += nblks;
+ }
+
+out_finished:
+ Z_DBG(znd, "Read %d blocks from %d", fill, gc_entry->r_ptr);
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ gc_entry->nblks = fill;
+ gc_entry->r_ptr = jentry;
+ if (fill > 0)
+ set_bit(DO_GC_WRITE, &gc_entry->gc_flags);
+ else
+ set_bit(DO_GC_COMPLETE, &gc_entry->gc_flags);
+
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+
+out:
+ mutex_unlock(&post->cached_lock);
+
+ return rcode;
+}
+
+/**
+ * z_zone_gc_write() - Write (up to) a buffer worth of data to WP.
+ * @gc_entry: Active GC state
+ * @stream_id: Stream Id to prefer for allocation.
+ */
+static int z_zone_gc_write(struct gc_state *gc_entry, u32 stream_id)
+{
+ struct zoned *znd = gc_entry->znd;
+ struct dm_target *ti = znd->ti;
+ struct io_4k_block *io_buf = znd->gc_io_buf;
+ struct map_cache *post = &znd->gc_postmap;
+ unsigned long flags;
+ u32 aq_flags = Z_AQ_GC | Z_AQ_STREAM_ID | stream_id;
+ u64 lba;
+ u32 nblks;
+ u32 out = 0;
+ int err = 0;
+ int jentry;
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ jentry = gc_entry->w_ptr;
+ nblks = gc_entry->nblks;
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+
+ if (!jentry)
+ jentry++;
+
+ MutexLock(&post->cached_lock);
+ while (nblks > 0) {
+ u32 nfound = 0;
+ u32 added = 0;
+
+ /*
+ * When lba is zero, no blocks were allocated.
+ * Retry with the smaller request.
+ */
+ lba = z_acquire(znd, aq_flags, nblks, &nfound);
+ if (!lba) {
+ if (nfound) {
+ u32 avail = nfound;
+
+ nfound = 0;
+ lba = z_acquire(znd, aq_flags, avail, &nfound);
+ }
+ }
+
+ if (!lba) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ err = write_block(ti, DM_IO_VMA, &io_buf[out], lba, nfound, 0);
+ if (err) {
+ Z_ERR(znd, "Write %d blocks to %"PRIx64". ERROR: %d",
+ nfound, lba, err);
+ goto out;
+ }
+ out += nfound;
+
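+ /*
+ * Record the new bLBA for each tLBA just written and clear its
+ * GC_READ flag; discarded postmap entries are skipped.
+ */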
+ while ((jentry <= post->jcount) && (added < nfound)) {
+ u16 rflg;
+ u64 orig = le64_to_lba48(
+ post->jdata[jentry].bval, &rflg);
+ u64 dm_s = le64_to_lba48(
+ post->jdata[jentry].tlba, NULL);
+
+ if ((Z_LOWER48 == dm_s || Z_LOWER48 == orig)) {
+ jentry++;
+
+ if (rflg & GC_READ) {
+ Z_ERR(znd, "ERROR: %" PRIx64
+ " read and not written %" PRIx64,
+ orig, dm_s);
+ lba++;
+ added++;
+ }
+ continue;
+ }
+ rflg &= ~GC_READ;
+ post->jdata[jentry].bval = lba48_to_le64(rflg, lba);
+ lba++;
+ added++;
+ jentry++;
+ }
+ nblks -= nfound;
+ }
+ Z_DBG(znd, "Write %d blocks from %d", gc_entry->nblks, gc_entry->w_ptr);
+ set_bit(DO_GC_META, &gc_entry->gc_flags);
+
+out:
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ gc_entry->nblks = 0;
+ gc_entry->w_ptr = jentry;
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+ mutex_unlock(&post->cached_lock);
+
+ return err;
+}
+
+/**
+ * gc_finalize() - Final sanity check on GC'd block map.
+ * @gc_entry: Active GC state
+ *
+ * gc_postmap is expected to be empty (all blocks originally scheduled
+ * to be moved to a new zone have been accounted for).
+ */
+static int gc_finalize(struct gc_state *gc_entry)
+{
+ int err = 0;
+ struct zoned *znd = gc_entry->znd;
+ struct map_cache *post = &znd->gc_postmap;
+ int jentry;
+
+ MutexLock(&post->cached_lock);
+ for (jentry = post->jcount; jentry > 0; jentry--) {
+ u64 dm_s = le64_to_lba48(post->jdata[jentry].tlba, NULL);
+ u64 lba = le64_to_lba48(post->jdata[jentry].bval, NULL);
+
+ if (dm_s != Z_LOWER48 || lba != Z_LOWER48) {
+ Z_ERR(znd, "GC: Failed to move %" PRIx64
+ " from %"PRIx64" [%d]",
+ dm_s, lba, jentry);
+ err = -EIO;
+ }
+ }
+ mutex_unlock(&post->cached_lock);
+ post->jcount = jentry;
+ post->jsorted = 0;
+
+ return err;
+}
+
+/**
+ * clear_gc_target_flag() - Clear any zone tagged as a GC target.
+ * @znd: ZDM Instance
+ *
+ * FIXME: Can we reduce the weight of this ?
+ * Ex. execute as zones are closed and specify the zone to clear
+ * at GC completion/cleanup.
+ */
+static void clear_gc_target_flag(struct zoned *znd)
+{
+ int z_id;
+
+ for (z_id = 0; z_id < znd->data_zones; z_id++) {
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp;
+
+ SpinLock(&wpg->wplck);
+ wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ if (wp & Z_WP_GC_TARGET) {
+ wp &= ~Z_WP_GC_TARGET;
+ wpg->wp_alloc[gzoff] = cpu_to_le32(wp);
+ }
+ set_bit(IS_DIRTY, &wpg->flags);
+ clear_bit(IS_FLUSH, &wpg->flags);
+ spin_unlock(&wpg->wplck);
+ }
+}
+
+/**
+ * z_zone_gc_metadata_update() - Update ZLT as needed.
+ * @gc_entry: Active GC state
+ *
+ * Dispose or account for all blocks originally scheduled to be
+ * moved. Update ZLT (via map cache) for all moved blocks.
+ */
+static int z_zone_gc_metadata_update(struct gc_state *gc_entry)
+{
+ struct zoned *znd = gc_entry->znd;
+ struct map_cache *post = &znd->gc_postmap;
+ u32 used = post->jcount;
+ int err = 0;
+ int jentry;
+
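+ /*
+ * For postmap entries that are themselves ZLT pages, skip any whose
+ * in-core page has already been rewritten elsewhere; otherwise record
+ * the new backing LBA, bump the used count, and finally push all
+ * surviving mappings into the ZLT.
+ */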
+ for (jentry = post->jcount; jentry > 0; jentry--) {
+ int discard = 0;
+ int mapping = 0;
+ struct map_pg *mapped = NULL;
+ u64 dm_s = le64_to_lba48(post->jdata[jentry].tlba, NULL);
+ u64 lba = le64_to_lba48(post->jdata[jentry].bval, NULL);
+
+ if ((znd->r_base <= dm_s) && dm_s < (znd->r_base + Z_BLKSZ)) {
+ u64 off = dm_s - znd->r_base;
+
+ mapped = znd->rev_tm[off];
+ mapping = 1;
+ } else if ((znd->s_base <= dm_s) &&
+ (dm_s < (znd->s_base + Z_BLKSZ))) {
+ u64 off = dm_s - znd->s_base;
+
+ mapped = znd->fwd_tm[off];
+ mapping = 1;
+ }
+
+ if (mapping && !mapped)
+ Z_ERR(znd, "MD: dm_s: %" PRIx64 " -> lba: %" PRIx64
+ " no mapping in ram.", dm_s, lba);
+
+ if (mapped) {
+ u32 in_z;
+
+ ref_pg(mapped);
+ MutexLock(&mapped->md_lock);
+ in_z = _calc_zone(znd, mapped->last_write);
+ if (in_z != gc_entry->z_gc) {
+ Z_ERR(znd, "MD: %" PRIx64
+ " Discarded - %" PRIx64
+ " already flown to: %x",
+ dm_s, mapped->last_write, in_z);
+ discard = 1;
+ } else if (mapped->data.addr &&
+ test_bit(IS_DIRTY, &mapped->flags)) {
+ Z_ERR(znd,
+ "MD: %" PRIx64 " Discarded - %"PRIx64
+ " is in-flight",
+ dm_s, mapped->last_write);
+ discard = 2;
+ }
+ if (!discard)
+ mapped->last_write = lba;
+ mutex_unlock(&mapped->md_lock);
+ deref_pg(mapped);
+ }
+
+ MutexLock(&post->cached_lock);
+ if (discard == 1) {
+ Z_ERR(znd, "Dropped: %" PRIx64 " -> %"PRIx64,
+ le64_to_cpu(post->jdata[jentry].tlba),
+ le64_to_cpu(post->jdata[jentry].bval));
+
+ post->jdata[jentry].tlba = MC_INVALID;
+ post->jdata[jentry].bval = MC_INVALID;
+ }
+ if (post->jdata[jentry].tlba == MC_INVALID &&
+ post->jdata[jentry].bval == MC_INVALID) {
+ used--;
+ } else if (lba) {
+ increment_used_blks(znd, lba, 1);
+ }
+ mutex_unlock(&post->cached_lock);
+ }
+ err = move_to_map_tables(znd, post);
+ if (err)
+ Z_ERR(znd, "Move to tables post GC failure");
+
+ clear_gc_target_flag(znd);
+
+ return err;
+}
+
+/**
+ * _blkalloc() - Attempt to reserve blocks at z_at in ZDM znd
+ * @znd: ZDM instance.
+ * @z_at: Zone to write data to
+ * @flags: Acquisition type.
+ * @nblks: Number of blocks desired.
+ * @nfound: Number of blocks allocated or available.
+ *
+ * Attempt allocation of @nblks from the current WP of @z_at.
+ * When @nblks are not available, 0 is returned and @nfound contains the
+ * number of blocks *available* but not *allocated*.
+ * When @nblks are available, the starting LBA (in 4k space) is returned,
+ * the blocks are *allocated*, and @nfound is set to @nblks.
+ *
+ * Return: Starting LBA if the request is met, otherwise 0 with @nfound
+ * holding the number of blocks still available in the zone.
+ */
+static sector_t _blkalloc(struct zoned *znd, u32 z_at, u32 flags,
+ u32 nblks, u32 *nfound)
+{
+#define ALLOC_STICKY (Z_WP_GC_TARGET|Z_WP_NON_SEQ|Z_WP_RRECALC)
+
+ sector_t found = 0;
+ u32 avail = 0;
+ int do_open_zone = 0;
+ u32 gzno = z_at >> GZ_BITS;
+ u32 gzoff = z_at & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp;
+ u32 wptr;
+ u32 gc_tflg;
+
+ if (gzno >= znd->gz_count || z_at >= znd->data_zones) {
+ Z_ERR(znd, "Invalid zone for allocation: %u", z_at);
+ dump_stack();
+ return 0ul;
+ }
+
+ SpinLock(&wpg->wplck);
+ wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ gc_tflg = wp & ALLOC_STICKY;
+ wptr = wp & ~ALLOC_STICKY;
+ if (wptr < Z_BLKSZ)
+ avail = Z_BLKSZ - wptr;
+
+#if 0 /* DEBUG START: Testing zm_write_pages() */
+ if (avail > 7)
+ avail = 7;
+#endif /* DEBUG END: Testing zm_write_pages() */
+
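+ /*
+ * If the request fits, advance the soft write pointer, charge the
+ * blocks against the zone-free estimate, and preserve the sticky
+ * flags; otherwise report how many blocks remain available.
+ */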
+ *nfound = avail;
+ if (nblks <= avail) {
+ u64 lba = ((u64)z_at << Z_BLKBITS) + znd->data_lba;
+ u32 zf_est = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_VALUE_MASK;
+
+ found = lba + wptr;
+ *nfound = nblks;
+ if (wptr == 0)
+ do_open_zone = 1;
+
+ wptr += nblks;
+ zf_est -= nblks;
+ if (wptr == Z_BLKSZ)
+ znd->discard_count += zf_est;
+
+ wptr |= gc_tflg;
+ if (flags & Z_AQ_GC)
+ wptr |= Z_WP_GC_TARGET;
+
+ if (flags & Z_AQ_STREAM_ID)
+ zf_est |= (flags & Z_AQ_STREAM_MASK) << 24;
+ else
+ zf_est |= le32_to_cpu(wpg->zf_est[gzoff])
+ & Z_WP_STREAM_MASK;
+
+ wpg->wp_alloc[gzoff] = cpu_to_le32(wptr);
+ wpg->zf_est[gzoff] = cpu_to_le32(zf_est);
+ set_bit(IS_DIRTY, &wpg->flags);
+ clear_bit(IS_FLUSH, &wpg->flags);
+ }
+ spin_unlock(&wpg->wplck);
+
+ if (do_open_zone)
+ dmz_open_zone(znd, z_at);
+
+ return found;
+}
+
+/**
+ * update_stale_ratio() - Update the stale ratio for the finished bin.
+ * @znd: ZDM instance
+ * @zone: Zone that needs update.
+ */
+static void update_stale_ratio(struct zoned *znd, u32 zone)
+{
+ u64 total_stale = 0;
+ u64 free_zones = 1;
+ u32 bin = zone / znd->stale.binsz;
+ u32 z_id = bin * znd->stale.binsz;
+ u32 s_end = z_id + znd->stale.binsz;
+
+ if (s_end > znd->data_zones)
+ s_end = znd->data_zones;
+
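+ /*
+ * Sum the stale estimate of fully written zones in this bin and
+ * normalize by the number of zones still accepting writes.
+ */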
+ for (; z_id < s_end; z_id++) {
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 stale = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_VALUE_MASK;
+ u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]) & Z_WP_VALUE_MASK;
+ u32 wflg = le32_to_cpu(wpg->wp_alloc[gzoff]);
+
+ if (wflg & Z_WP_RRECALC) {
+ SpinLock(&wpg->wplck);
+ wflg = le32_to_cpu(wpg->wp_alloc[gzoff])
+ & ~Z_WP_RRECALC;
+ wpg->wp_alloc[gzoff] = cpu_to_le32(wflg);
+ spin_unlock(&wpg->wplck);
+ }
+
+ if (wp == Z_BLKSZ)
+ total_stale += stale;
+ else
+ free_zones++;
+ }
+
+ total_stale /= free_zones;
+ znd->stale.bins[bin] = (total_stale > ~0u) ? ~0u : total_stale;
+}
+
+/**
+ * z_zone_compact_queue() - Queue zone compaction.
+ * @znd: ZDM instance
+ * @z_gc: Zone to queue.
+ * @delay: Delay queue metric
+ * @gfp: Allocation scheme.
+ *
+ * Return: 1 on success, 0 if not queued/busy, negative on error.
+ */
+static
+int z_zone_compact_queue(struct zoned *znd, u32 z_gc, int delay, gfp_t gfp)
+{
+ unsigned long flags;
+ int do_queue = 0;
+ int err = 0;
+ struct gc_state *gc_entry;
+
+ gc_entry = ZDM_ALLOC(znd, sizeof(*gc_entry), KM_16, gfp);
+ if (!gc_entry) {
+ Z_ERR(znd, "No Memory for compact!!");
+ return -ENOMEM;
+ }
+ gc_entry->znd = znd;
+ gc_entry->z_gc = z_gc;
+ set_bit(DO_GC_NEW, &gc_entry->gc_flags);
+ znd->gc_backlog++;
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ if (!znd->gc_active) {
+ znd->gc_active = gc_entry;
+ do_queue = 1;
+ } else {
+ Z_ERR(znd, "GC: Tried to queue but already active");
+ }
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+
+ if (do_queue) {
+ unsigned long tval = msecs_to_jiffies(delay);
+
+ if (queue_delayed_work(znd->gc_wq, &znd->gc_work, tval))
+ err = 1;
+ } else {
+ ZDM_FREE(znd, gc_entry, sizeof(*gc_entry), KM_16);
+ znd->gc_backlog--;
+ Z_ERR(znd, "GC: FAILED to queue work");
+ }
+
+ return err;
+}
+
+/**
+ * zone_zfest() - Return the free-block estimate for a zone.
+ * @znd: ZDM instance
+ * @z_id: Zone to query.
+ */
+static u32 zone_zfest(struct zoned *znd, u32 z_id)
+{
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+
+ return le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_VALUE_MASK;
+}
+
+/**
+ * gc_request_queued() - Called periodically to initiate GC
+ *
+ * @znd: ZDM instance
+ * @bin: Bin with stale zones to scan for GC
+ * @delay: Metric for delay queuing.
+ * @gfp: Default memory allocation scheme.
+ *
+ */
+static int gc_request_queued(struct zoned *znd, int bin, int delay, gfp_t gfp)
+{
+ unsigned long flags;
+ int queued = 0;
+ u32 top_roi = NOZONE;
+ u32 stale = 0;
+ u32 z_gc = bin * znd->stale.binsz;
+ u32 s_end = z_gc + znd->stale.binsz;
+
+ if (znd->meta_result)
+ goto out;
+
+ if (test_bit(ZF_FREEZE, &znd->flags)) {
+ Z_ERR(znd, "Is frozen -- GC paused.");
+ goto out;
+ }
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ if (znd->gc_active)
+ queued = 1;
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+ if (queued)
+ goto out;
+
+ if (s_end > znd->data_zones)
+ s_end = znd->data_zones;
+
+ /* scan for most stale zone in STREAM [top_roi] */
+ for (; z_gc < s_end; z_gc++) {
+ u32 gzno = z_gc >> GZ_BITS;
+ u32 gzoff = z_gc & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp_v = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ u32 nfree = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_VALUE_MASK;
+ u32 wp_f = wp_v & Z_WP_FLAGS_MASK;
+
+ wp_v &= Z_WP_VALUE_MASK;
+ if (wp_v == 0)
+ continue;
+ if ((wp_f & Z_WP_GC_PENDING) != 0)
+ continue;
+
+ if (wp_v == Z_BLKSZ) {
+ stale += nfree;
+ if ((wp_f & Z_WP_GC_BITS) == Z_WP_GC_READY) {
+ if (top_roi == NOZONE)
+ top_roi = z_gc;
+ else if (nfree > zone_zfest(znd, top_roi))
+ top_roi = z_gc;
+ }
+ }
+ }
+
+ if (!delay && top_roi == NOZONE)
+ Z_ERR(znd, "No GC candidate in bin: %u -> %u", z_gc, s_end);
+
+ /* determine the cut-off for GC based on MZ overall staleness */
+ if (top_roi != NOZONE) {
+ u32 state_metric = GC_PRIO_DEFAULT;
+ u32 n_empty = znd->z_gc_free;
+ int pctfree = n_empty * 100 / znd->data_zones;
+
+ /*
+ * -> at less than 5 zones free switch to critical
+ * -> at less than 5% zones free switch to HIGH
+ * -> at less than 25% free switch to LOW
+ * -> high level is 'cherry picking' near empty zones
+ */
+ if (znd->z_gc_free < 5)
+ state_metric = GC_PRIO_CRIT;
+ else if (pctfree < 5)
+ state_metric = GC_PRIO_HIGH;
+ else if (pctfree < 25)
+ state_metric = GC_PRIO_LOW;
+
+ if (zone_zfest(znd, top_roi) > state_metric) {
+ int rc;
+
+ rc = z_zone_compact_queue(znd, top_roi, delay * 5, gfp);
+ if (rc == 1)
+ queued = 1;
+ else if (rc < 0)
+ Z_ERR(znd, "GC: Z#%u !Q: ERR: %d", top_roi, rc);
+ }
+
+ if (!delay && !queued)
+ Z_ERR(znd, "GC: Z#%u !Q .. M: %u E: %u PCT: %d ZF: %u",
+ top_roi, state_metric, n_empty, pctfree,
+ zone_zfest(znd, top_roi));
+ }
+out:
+ return queued;
+}
+
+/**
+ * z_zone_gc_compact() - Primary compaction worker.
+ * @gc_entry: GC State
+ */
+static int z_zone_gc_compact(struct gc_state *gc_entry)
+{
+ unsigned long flags;
+ int err = 0;
+ struct zoned *znd = gc_entry->znd;
+ u32 z_gc = gc_entry->z_gc;
+ u32 gzno = z_gc >> GZ_BITS;
+ u32 gzoff = z_gc & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+
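+ /*
+ * GC state machine: DO_GC_NEW (flush, purge map cache, pull the
+ * zone's metadata into RAM) -> DO_GC_PREPARE (read) -> DO_GC_WRITE
+ * -> DO_GC_META (loop for more) -> DO_GC_COMPLETE (update ZLT,
+ * reset the WP and release the zone). Returning -EAGAIN re-queues
+ * this worker unless the GC throttle is held.
+ */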
+ znd->age = jiffies_64;
+
+ if (test_bit(DO_GC_NEW, &gc_entry->gc_flags)) {
+ err = z_flush_bdev(znd, GFP_KERNEL);
+ if (err) {
+ gc_entry->result = err;
+ goto out;
+ }
+ }
+
+ /* If a SYNC is in progress and we can delay, then postpone */
+ if (mutex_is_locked(&znd->mz_io_mutex) &&
+ atomic_read(&znd->gc_throttle) == 0)
+ return -EAGAIN;
+
+ if (test_and_clear_bit(DO_GC_NEW, &gc_entry->gc_flags)) {
+ u32 wp;
+
+ SpinLock(&wpg->wplck);
+ wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ wp |= Z_WP_GC_FULL;
+ wpg->wp_alloc[gzoff] = cpu_to_le32(wp);
+ set_bit(IS_DIRTY, &wpg->flags);
+ clear_bit(IS_FLUSH, &wpg->flags);
+ spin_unlock(&wpg->wplck);
+
+ err = _cached_to_tables(znd, gc_entry->z_gc);
+ if (err) {
+ Z_ERR(znd, "Failed to purge journal: %d", err);
+ gc_entry->result = err;
+ goto out;
+ }
+
+ if (znd->gc_postmap.jcount > 0) {
+ Z_ERR(znd, "*** Unexpected data in postmap!!");
+ znd->gc_postmap.jcount = 0;
+ }
+
+ err = z_zone_gc_metadata_to_ram(gc_entry);
+ if (err) {
+ Z_ERR(znd,
+ "Pre-load metadata to memory failed!! %d", err);
+ gc_entry->result = err;
+ goto out;
+ }
+
+ do {
+ err = memcache_lock_and_sort(znd, &znd->gc_postmap);
+ } while (err == -EBUSY);
+
+ set_bit(DO_GC_PREPARE, &gc_entry->gc_flags);
+
+ if (znd->gc_throttle.counter == 0)
+ return -EAGAIN;
+ }
+
+next_in_queue:
+ znd->age = jiffies_64;
+
+ if (test_and_clear_bit(DO_GC_PREPARE, &gc_entry->gc_flags)) {
+
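+ /* Pass through mz_io_mutex so any in-flight sync drains first */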
+ MutexLock(&znd->mz_io_mutex);
+ mutex_unlock(&znd->mz_io_mutex);
+
+ err = z_zone_gc_read(gc_entry);
+ if (err < 0) {
+ Z_ERR(znd, "z_zone_gc_chunk issue failure: %d", err);
+ gc_entry->result = err;
+ goto out;
+ }
+ if (znd->gc_throttle.counter == 0)
+ return -EAGAIN;
+ }
+
+ if (test_and_clear_bit(DO_GC_WRITE, &gc_entry->gc_flags)) {
+ u32 sid = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_STREAM_MASK;
+
+ MutexLock(&znd->mz_io_mutex);
+ mutex_unlock(&znd->mz_io_mutex);
+
+ err = z_zone_gc_write(gc_entry, sid >> 24);
+ if (err) {
+ Z_ERR(znd, "z_zone_gc_write issue failure: %d", err);
+ gc_entry->result = err;
+ goto out;
+ }
+ if (znd->gc_throttle.counter == 0)
+ return -EAGAIN;
+ }
+
+ if (test_and_clear_bit(DO_GC_META, &gc_entry->gc_flags)) {
+ z_do_copy_more(gc_entry);
+ goto next_in_queue;
+ }
+
+ znd->age = jiffies_64;
+ if (test_and_clear_bit(DO_GC_COMPLETE, &gc_entry->gc_flags)) {
+ u32 non_seq;
+ u32 reclaimed;
+
+ err = z_zone_gc_metadata_update(gc_entry);
+ gc_entry->result = err;
+ if (err) {
+ Z_ERR(znd, "Metadata error ... disable zone: %u",
+ gc_entry->z_gc);
+ }
+ err = gc_finalize(gc_entry);
+ if (err) {
+ Z_ERR(znd, "GC: Failed to finalize: %d", err);
+ gc_entry->result = err;
+ goto out;
+ }
+
+ gc_verify_cache(znd, gc_entry->z_gc);
+
+ err = _cached_to_tables(znd, gc_entry->z_gc);
+ if (err) {
+ Z_ERR(znd, "Failed to purge journal: %d", err);
+ gc_entry->result = err;
+ goto out;
+ }
+
+ err = z_flush_bdev(znd, GFP_KERNEL);
+ if (err) {
+ gc_entry->result = err;
+ goto out;
+ }
+
+ /* Release the zones for writing */
+ dmz_reset_wp(znd, gc_entry->z_gc);
+
+ SpinLock(&wpg->wplck);
+ non_seq = le32_to_cpu(wpg->wp_alloc[gzoff]) & Z_WP_NON_SEQ;
+ reclaimed = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_VALUE_MASK;
+ wpg->wp_alloc[gzoff] = cpu_to_le32(non_seq);
+ wpg->wp_used[gzoff] = cpu_to_le32(0u);
+ wpg->zf_est[gzoff] = cpu_to_le32(Z_BLKSZ);
+ znd->discard_count -= reclaimed;
+ znd->z_gc_free++;
+ if (znd->z_gc_resv & Z_WP_GC_ACTIVE)
+ znd->z_gc_resv = gc_entry->z_gc;
+ else if (znd->z_meta_resv & Z_WP_GC_ACTIVE)
+ znd->z_meta_resv = gc_entry->z_gc;
+ set_bit(IS_DIRTY, &wpg->flags);
+ clear_bit(IS_FLUSH, &wpg->flags);
+ spin_unlock(&wpg->wplck);
+
+ znd->gc_events++;
+ update_stale_ratio(znd, gc_entry->z_gc);
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ znd->gc_backlog--;
+ znd->gc_active = NULL;
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+
+ ZDM_FREE(znd, gc_entry, sizeof(*gc_entry), KM_16);
+
+ set_bit(DO_JOURNAL_MOVE, &znd->flags);
+ set_bit(DO_MEMPOOL, &znd->flags);
+ set_bit(DO_SYNC, &znd->flags);
+ }
+out:
+ return 0;
+}
+
+/**
+ * gc_work_task() - Worker thread for GC activity.
+ * @work: Work struct holding the ZDM instance to do work on ...
+ */
+static void gc_work_task(struct work_struct *work)
+{
+ struct gc_state *gc_entry = NULL;
+ unsigned long flags;
+ struct zoned *znd;
+ int err;
+
+ if (!work)
+ return;
+
+ znd = container_of(to_delayed_work(work), struct zoned, gc_work);
+ if (!znd)
+ return;
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ if (znd->gc_active)
+ gc_entry = znd->gc_active;
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+
+ if (!gc_entry) {
+ Z_ERR(znd, "ERROR: gc_active not set!");
+ return;
+ }
+
+ err = z_zone_gc_compact(gc_entry);
+ if (-EAGAIN == err) {
+ unsigned long tval = msecs_to_jiffies(10);
+
+ queue_delayed_work(znd->gc_wq, &znd->gc_work, tval);
+ } else {
+ const int delay = 10;
+
+ on_timeout_activity(znd, delay);
+ }
+}
+
+/**
+ * is_reserved() - Check to see if a zone is 'special'
+ * @znd: ZDM Instance
+ * @z_pref: Zone to be tested.
+ */
+static inline int is_reserved(struct zoned *znd, const u32 z_pref)
+{
+ const u32 gc = znd->z_gc_resv & Z_WP_VALUE_MASK;
+ const u32 meta = znd->z_meta_resv & Z_WP_VALUE_MASK;
+
+ return (gc == z_pref || meta == z_pref) ? 1 : 0;
+}
+
+/**
+ * gc_can_cherrypick() - Queue a GC for zone in this bin if ... it will be easy
+ * @znd: ZDM Instance
+ * @bin: The bin (0 to 255)
+ * @delay: Delay metric
+ * @gfp: Allocation flags to use.
+ */
+static int gc_can_cherrypick(struct zoned *znd, u32 bin, int delay, gfp_t gfp)
+{
+ u32 z_id = bin * znd->stale.binsz;
+ u32 s_end = z_id + znd->stale.binsz;
+
+ if (s_end > znd->data_zones)
+ s_end = znd->data_zones;
+
+ for (; z_id < s_end; z_id++) {
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ u32 nfree = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_VALUE_MASK;
+
+ if (wp & Z_WP_RRECALC)
+ update_stale_ratio(znd, z_id);
+
+ if (((wp & Z_WP_GC_BITS) == Z_WP_GC_READY) &&
+ ((wp & Z_WP_VALUE_MASK) == Z_BLKSZ) &&
+ (nfree == Z_BLKSZ)) {
+ if (z_zone_compact_queue(znd, z_id, delay, gfp))
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * gc_queue_with_delay() - Scan to see if a GC can/should be queued.
+ * @znd: ZDM Instance
+ * @delay: Delay metric
+ * @gfp: Allocation flags to use.
+ *
+ * Return 1 if gc in progress or queued. 0 otherwise.
+ */
+static int gc_queue_with_delay(struct zoned *znd, int delay, gfp_t gfp)
+{
+ int gc_idle = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&znd->gc_lock, flags);
+ gc_idle = !znd->gc_active;
+ spin_unlock_irqrestore(&znd->gc_lock, flags);
+
+ if (gc_idle) {
+ int bin = 0;
+ int ratio = 0;
+ u32 iter;
+
+ /* Find highest ratio stream */
+ for (iter = 0; iter < znd->stale.count; iter++)
+ if (znd->stale.bins[iter] > ratio)
+ ratio = znd->stale.bins[iter], bin = iter;
+
+ /* Cherrypick a zone in the stream */
+ if (gc_idle && gc_can_cherrypick(znd, bin, delay, gfp))
+ gc_idle = 0;
+
+ /* Otherwise cherrypick *something* */
+ for (iter = 0; gc_idle && (iter < znd->stale.count); iter++)
+ if (gc_idle && (bin != iter) &&
+ gc_can_cherrypick(znd, iter, delay, gfp))
+ gc_idle = 0;
+
+ /* Otherwise compact a zone in the stream */
+ if (gc_idle && gc_request_queued(znd, bin, delay, gfp))
+ gc_idle = 0;
+
+ /* Otherwise compact *something* */
+ for (iter = 0; gc_idle && (iter < znd->stale.count); iter++)
+ if (gc_idle && gc_request_queued(znd, iter, delay, gfp))
+ gc_idle = 0;
+ }
+ return !gc_idle;
+}
+
+/**
+ * gc_immediate() - Free up some space as soon as possible.
+ * @znd: ZDM Instance
+ * @wait: Non-zero to wait for an in-flight GC pass (enables retry).
+ * @gfp: Allocation flags to use.
+ */
+static int gc_immediate(struct zoned *znd, int wait, gfp_t gfp)
+{
+ const int delay = 0;
+ int can_retry = 0;
+ int lock_owner = 0;
+ int queued;
+
+ /*
+ * Lock on the first entry. On subsequent entries
+ * only lock if a 'wait' was requested.
+ *
+ * Note: We really should be using a wait queue here but this
+ * mutex hack should do as a temporary workaround.
+ */
+ atomic_inc(&znd->gc_throttle);
+ if (atomic_read(&znd->gc_throttle) == 1) {
+ mutex_lock(&znd->gc_wait);
+ lock_owner = 1;
+ } else if (atomic_read(&znd->gc_throttle) > 1) {
+ if (wait) {
+ mutex_lock(&znd->gc_wait);
+ can_retry = 1;
+ mutex_unlock(&znd->gc_wait);
+ }
+ goto out;
+ }
+ flush_delayed_work(&znd->gc_work);
+ queued = gc_queue_with_delay(znd, delay, gfp);
+ if (!queued) {
+ Z_ERR(znd, " ... GC immediate .. failed to queue GC!!.");
+ z_discard_partial(znd, DISCARD_MAX_INGRESS, GFP_KERNEL);
+ can_retry = 1;
+ goto out;
+ }
+ can_retry = flush_delayed_work(&znd->gc_work);
+
+out:
+ if (lock_owner)
+ mutex_unlock(&znd->gc_wait);
+ atomic_dec(&znd->gc_throttle);
+
+ return can_retry;
+}
+
+/**
+ * set_current() - Make zone the preferred zone for allocation.
+ * @znd: ZDM Instance
+ * @flags: Block allocation scheme (including stream id)
+ * @zone: The zone to make preferred.
+ *
+ * Once a zone is opened for allocation, future allocations will prefer
+ * the same zone, until the zone is full.
+ * Each stream id has its own preferred zone.
+ *
+ * NOTE: z_current is being deprecated in favor of assuming a default
+ * stream id when nothing is provided.
+ */
+static inline void set_current(struct zoned *znd, u32 flags, u32 zone)
+{
+ if (flags & Z_AQ_STREAM_ID) {
+ u32 stream_id = flags & Z_AQ_STREAM_MASK;
+
+ znd->bmkeys->stream[stream_id] = cpu_to_le32(zone);
+ }
+ znd->z_current = zone;
+ if (znd->z_gc_free > 0)
+ znd->z_gc_free--;
+ else
+ Z_ERR(znd, "Dec z_gc_free below 0?");
+}
+
+/**
+ * next_open_zone() - Grab the next available zone
+ * @znd: ZDM Instance
+ * @z_at: Zone to start scanning from (presumably just filled).
+ *
+ * Return: NOZONE if no zone exists with space for writing.
+ *
+ * Scan through the available zones for an empty zone.
+ * If no empty zone is available then a zone that is not full is
+ * used instead.
+ */
+static u32 next_open_zone(struct zoned *znd, u32 z_at)
+{
+ u32 zone = NOZONE;
+ u32 z_id;
+
+ if (znd->data_zones < z_at)
+ z_at = znd->data_zones;
+
+ /* scan higher lba zones */
+ for (z_id = z_at; z_id < znd->data_zones; z_id++) {
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+
+ if ((wp & Z_WP_VALUE_MASK) == 0) {
+ u32 check = gzno << GZ_BITS | gzoff;
+
+ if (!is_reserved(znd, check)) {
+ zone = check;
+ goto out;
+ }
+ }
+ }
+
+ /* scan lower lba zones */
+ for (z_id = 0; z_id < z_at; z_id++) {
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+
+ if ((wp & Z_WP_VALUE_MASK) == 0) {
+ u32 check = gzno << GZ_BITS | gzoff;
+
+ if (!is_reserved(znd, check)) {
+ zone = check;
+ goto out;
+ }
+ }
+ }
+
+ /* No empty zones .. start co-mingling streams */
+ for (z_id = 0; z_id < znd->data_zones; z_id++) {
+ u32 gzno = z_id >> GZ_BITS;
+ u32 gzoff = z_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+
+ if ((wp & Z_WP_VALUE_MASK) < Z_BLKSZ) {
+ u32 check = gzno << GZ_BITS | gzoff;
+
+ if (!is_reserved(znd, check)) {
+ zone = check;
+ goto out;
+ }
+ }
+ }
+
+out:
+ return zone;
+}
+
+/**
+ * zone_filled_cleanup() - Update the wp_alloc GC Ready flag based on wp_used.
+ * @znd: ZDM Instance
+ */
+static void zone_filled_cleanup(struct zoned *znd)
+{
+ if (znd->filled_zone != NOZONE) {
+ u32 zone = znd->filled_zone;
+ u32 gzno;
+ u32 gzoff;
+ u32 wp;
+ u32 used;
+ struct meta_pg *wpg;
+
+ znd->filled_zone = NOZONE;
+
+ gzno = zone >> GZ_BITS;
+ gzoff = zone & GZ_MMSK;
+ wpg = &znd->wp[gzno];
+
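+		/*
+		 * Only mark the zone GC-ready when both the allocation
+		 * pointer and the used counter agree that the zone is
+		 * completely written.
+		 */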
+ SpinLock(&wpg->wplck);
+ wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ used = le32_to_cpu(wpg->wp_used[gzoff]) & Z_WP_VALUE_MASK;
+ if (used == Z_BLKSZ) {
+ if (Z_BLKSZ == (wp & Z_WP_VALUE_MASK)) {
+ wpg->wp_alloc[gzoff] = cpu_to_le32(wp
+ | Z_WP_GC_READY);
+ set_bit(IS_DIRTY, &wpg->flags);
+ clear_bit(IS_FLUSH, &wpg->flags);
+ } else {
+ Z_ERR(znd, "Zone %u seems bogus.", zone);
+ }
+ }
+ spin_unlock(&wpg->wplck);
+
+ dmz_close_zone(znd, zone);
+ update_stale_ratio(znd, zone);
+ }
+}
+
+/**
+ * z_acquire() - Allocate blocks for writing
+ * @znd: ZDM Instance
+ * @flags: Alloc strategy and stream id.
+ * @nblks: Number of blocks desired.
+ * @nfound: Number of blocks available.
+ *
+ * Return: Lba for writing.
+ */
+static u64 z_acquire(struct zoned *znd, u32 flags, u32 nblks, u32 *nfound)
+{
+ sector_t found = 0;
+ u32 z_pref = znd->z_current;
+ u32 stream_id = 0;
+ u32 z_find;
+ const int wait = 1;
+ gfp_t gfp = (flags & Z_AQ_NORMAL) ? GFP_ATOMIC : GFP_KERNEL;
+
+ zone_filled_cleanup(znd);
+
+ if (flags & Z_AQ_STREAM_ID) {
+ stream_id = flags & Z_AQ_STREAM_MASK;
+ z_pref = le32_to_cpu(znd->bmkeys->stream[stream_id]);
+ }
+ if (z_pref >= znd->data_zones) {
+ z_pref = next_open_zone(znd, znd->z_current);
+ if (z_pref < znd->data_zones)
+ set_current(znd, flags, z_pref);
+ }
+
+ if (z_pref < znd->data_zones) {
+ found = _blkalloc(znd, z_pref, flags, nblks, nfound);
+ if (found || *nfound)
+ goto out;
+ }
+
+ if (znd->z_gc_free < 5) {
+ Z_DBG(znd, "... alloc - gc low on free space.");
+ gc_immediate(znd, !wait, gfp);
+ }
+
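+	/*
+	 * The preferred zone is full (or unset): scan for the next open
+	 * zone, falling back to the GC/metadata reserves below.
+	 */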
+retry:
+ z_find = next_open_zone(znd, znd->z_current);
+ if (z_find < znd->data_zones) {
+ found = _blkalloc(znd, z_find, flags, nblks, nfound);
+ if (found || *nfound) {
+ set_current(znd, flags, z_find);
+ goto out;
+ }
+ }
+
+ if (flags & Z_AQ_GC) {
+ u32 gresv = znd->z_gc_resv & Z_WP_VALUE_MASK;
+
+ Z_ERR(znd, "Using GC Reserve (%u)", gresv);
+ found = _blkalloc(znd, gresv, flags, nblks, nfound);
+ znd->z_gc_resv |= Z_WP_GC_ACTIVE;
+ }
+
+ if (flags & Z_AQ_META) {
+ int can_retry = gc_immediate(znd, wait, gfp);
+ u32 mresv = znd->z_meta_resv & Z_WP_VALUE_MASK;
+
+ Z_DBG(znd, "GC: Need META.");
+ if (can_retry)
+ goto retry;
+
+ Z_ERR(znd, "Using META Reserve (%u)", znd->z_meta_resv);
+ found = _blkalloc(znd, mresv, flags, nblks, nfound);
+ }
+
+out:
+ if (!found && (*nfound == 0)) {
+ Z_DBG(znd, "... alloc - out of space?");
+ if (gc_immediate(znd, wait, gfp))
+ goto retry;
+
+ Z_ERR(znd, "%s: -> Out of space.", __func__);
+ }
+ return found;
+}
+
+/**
+ * compare_lba() - Compare map page on lba.
+ * @x1: map page
+ * @x2: map page
+ *
+ * Return -1, 0, or 1 if x1 < x2, equal, or >, respectively.
+ */
+static int compare_lba(const void *x1, const void *x2)
+{
+ const struct map_pg *v1 = *(const struct map_pg **)x1;
+ const struct map_pg *v2 = *(const struct map_pg **)x2;
+ int cmp = (v1->lba < v2->lba) ? -1 : ((v1->lba > v2->lba) ? 1 : 0);
+
+ return cmp;
+}
+
+/**
+ * is_dirty() - Test whether a map page is of bit_type and dirty.
+ * @expg: map page
+ * @bit_type: map page flag to test for...
+ *
+ * Return 1 if the page has both the dirty and bit_type flags set.
+ *
+ * Note: bit_type is IS_CRC and IS_LUT most typically.
+ */
+static __always_inline int is_dirty(struct map_pg *expg, int bit_type)
+{
+ return (test_bit(bit_type, &expg->flags) &&
+ test_bit(IS_DIRTY, &expg->flags));
+}
+
+
+/**
+ * is_old_and_clean() - Test whether a map page has expired and can be dropped.
+ * @expg: map page
+ * @bit_type: map page flag to test for...
+ *
+ * Return 1 if page is clean and old.
+ */
+static __always_inline int is_old_and_clean(struct map_pg *expg, int bit_type)
+{
+ int is_match = 0;
+
+ if (!test_bit(IS_DIRTY, &expg->flags) && is_expired(expg->age)) {
+ if (getref_pg(expg) == 1)
+ is_match = 1;
+ else
+ pr_err("%llu: dirty, exp, and elev: %d\n",
+ expg->lba, getref_pg(expg));
+ }
+
+ (void)bit_type;
+
+ return is_match;
+}
+
+/**
+ * _pool_write() - Sort and write an array of ZLT pages.
+ * @znd: ZDM Instance
+ * @wset: Array of pages to be written.
+ * @count: Number of entries.
+ *
+ * NOTE: On entry all map_pg entries have elevated refcount from _pool_fill().
+ * write_if_dirty() will dec the refcount when the block hits disk.
+ */
+static int _pool_write(struct zoned *znd, struct map_pg **wset, int count)
+{
+ const int use_wq = 0;
+ int iter;
+ struct map_pg *expg;
+ int err = 0;
+
+ /* write dirty table pages */
+ if (count <= 0)
+ goto out;
+
+ sort(wset, count, sizeof(*wset), compare_lba, NULL);
+
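+	/*
+	 * First pass: drop duplicate entries exposed by the sort and
+	 * pre-load the covering CRC pages so that the write completion
+	 * callback never needs to sleep.
+	 */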
+ for (iter = 0; iter < count; iter++) {
+ expg = wset[iter];
+ if (expg) {
+ if (iter && expg->lba == wset[iter-1]->lba) {
+ wset[iter] = NULL;
+ Z_ERR(znd, "** DUPE ** ignored.");
+ } else
+ cache_if_dirty(znd, expg, use_wq);
+ }
+ }
+
+ for (iter = 0; iter < count; iter++) {
+ const int sync = 1;
+ /* REDUX: For Async WB: int sync = (iter == last) ? 1 : 0; */
+
+ expg = wset[iter];
+ if (expg) {
+ err = write_if_dirty(znd, expg, use_wq, sync);
+ if (err) {
+ Z_ERR(znd, "Write failed: %d", err);
+ goto out;
+ }
+ }
+ }
+ err = count;
+
+out:
+ return err;
+}
+
+/**
+ * z_metadata_lba() - Alloc a block of metadata.
+ * @znd: ZDM Instance
+ * @map: Block of metadata.
+ * @num: Number of blocks allocated.
+ *
+ * Return: lba or 0 on failure.
+ *
+ * When map->lba is less than data_lba the metadata is pinned to its
+ * logical location.
+ * When map->lba lands in data space it is dynamically allocated and
+ * intermixed within the datapool.
+ *
+ * [FUTURE: Pick a stream_id for floating metadata]
+ */
+static u64 z_metadata_lba(struct zoned *znd, struct map_pg *map, u32 *num)
+{
+ u32 nblks = 1;
+ u64 lba = map->lba;
+
+ if (lba < znd->data_lba) {
+ *num = 1;
+ return map->lba;
+ }
+ return z_acquire(znd, Z_AQ_META_STREAM, nblks, num);
+}
+
+/**
+ * pg_crc() - Record the CRC of a map page that was just written.
+ * @znd: ZDM Instance
+ * @pg: Map page that was written.
+ * @md_crc: 16 bit crc of page.
+ *
+ * Callback from dm_io notify; cannot hold a mutex here.
+ */
+static void pg_crc(struct zoned *znd, struct map_pg *pg, __le16 md_crc)
+{
+ struct mpinfo mpi;
+
+ to_table_entry(znd, pg->lba, &mpi);
+ if (pg->crc_pg) {
+ struct map_pg *crc_pg = pg->crc_pg;
+ int entry = mpi.crc.pg_idx;
+
+ if (crc_pg && crc_pg->data.crc) {
+ ref_pg(crc_pg);
+ crc_pg->data.crc[entry] = md_crc;
+ clear_bit(IS_READA, &crc_pg->flags);
+ set_bit(IS_DIRTY, &crc_pg->flags);
+ clear_bit(IS_FLUSH, &crc_pg->flags);
+ crc_pg->age = jiffies_64
+ + msecs_to_jiffies(crc_pg->hotness);
+ deref_pg(crc_pg);
+ }
+ put_map_entry(crc_pg);
+ if (getref_pg(crc_pg) == 0)
+ pg->crc_pg = NULL;
+ } else if (!test_bit(IS_LUT, &pg->flags)) {
+ znd->md_crcs[mpi.crc.pg_idx] = md_crc;
+ } else {
+ Z_ERR(znd, "unexpected state.");
+ dump_stack();
+ }
+}
+
+
+/**
+ * pg_written() - Handle accounting related to lookup table page writes.
+ * @pg: The page of lookup table [or CRC] that was written.
+ * @error: non-zero if an error occurred.
+ *
+ * callback from dm_io notify.. cannot hold mutex here, cannot sleep.
+ *
+ */
+static int pg_written(struct map_pg *pg, unsigned long error)
+{
+ int rcode = 0;
+ struct zoned *znd = pg->znd;
+ __le16 md_crc;
+
+ if (error) {
+ Z_ERR(znd, "write_page: %" PRIx64 " -> %" PRIx64
+ " ERR: %ld", pg->lba, pg->last_write, error);
+ rcode = -EIO;
+ goto out;
+ }
+
+ /*
+ * Re-calculate CRC on current memory page. If unchanged then on-disk
+ * is stable and in-memory is not dirty. Otherwise in memory changed
+ * during write back so leave the dirty flag set. For the purpose of
+ * the CRC table we assume that in-memory == on-disk although this
+ * is not strictly true as the page could have updated post disk write.
+ */
+
+ md_crc = crc_md_le16(pg->data.addr, Z_CRC_4K);
+ pg->age = jiffies_64 + msecs_to_jiffies(pg->hotness);
+ if (md_crc == pg->md_crc)
+ clear_bit(IS_DIRTY, &pg->flags);
+
+ clear_bit(W_IN_FLIGHT, &pg->flags);
+ pg_crc(znd, pg, md_crc);
+
+ if (pg->last_write < znd->data_lba)
+ goto out;
+
+/*
+ * NOTE: If we reach here it's a problem.
+ * TODO: mutex free path for adding a map entry ...
+ */
+
+ /* written lba was allocated from data-pool */
+ rcode = z_mapped_addmany(znd, pg->lba, pg->last_write, 1, CRIT);
+ if (rcode) {
+ Z_ERR(znd, "%s: Journal MANY failed.", __func__);
+ goto out;
+ }
+ increment_used_blks(znd, pg->last_write, 1);
+
+out:
+ return rcode;
+}
+
+
+/**
+ * on_pg_written() - A block of map table was written.
+ * @error: Any error code that occurred during the I/O.
+ * @context: The map_pg that was queued/written.
+ */
+static void on_pg_written(unsigned long error, void *context)
+{
+ struct map_pg *pg = context;
+ int rcode;
+
+ rcode = pg_written(pg, error);
+ deref_pg(pg);
+ if (rcode < 0)
+ pg->znd->meta_result = rcode;
+}
+
+/**
+ * queue_pg() - Queue a map table page for writeback
+ * @znd: ZDM Instance
+ * @pg: The map page to queue for writeback.
+ * @lba: The address to write the block to.
+ */
+static int queue_pg(struct zoned *znd, struct map_pg *pg, u64 lba)
+{
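+	/*
+	 * lba is a 4k block address; Z_SHFT4K converts it to the
+	 * 512-byte sector units (and sector count) expected by dm-io.
+	 */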
+ const int use_wq = 0;
+ sector_t block = lba << Z_SHFT4K;
+ unsigned int nDMsect = 1 << Z_SHFT4K;
+ int rc;
+
+ pg->znd = znd;
+ MutexLock(&pg->md_lock);
+ pg->md_crc = crc_md_le16(pg->data.addr, Z_CRC_4K);
+ pg->last_write = lba; /* presumably */
+ mutex_unlock(&pg->md_lock);
+
+ rc = znd_async_io(znd, DM_IO_KMEM, pg->data.addr, block, nDMsect, WRITE,
+ use_wq, on_pg_written, pg);
+ if (rc) {
+ Z_ERR(znd, "queue error: %d Q: %" PRIx64 " [%u dm sect] (Q:%d)",
+ rc, lba, nDMsect, use_wq);
+ dump_stack();
+ }
+
+ return rc;
+}
+
+/**
+ * cache_if_dirty() - Load a page of CRC's into memory.
+ * @znd: ZDM Instance
+ * @pg: The target page whose covering CRC block should be cached.
+ * @wq: If a work queue is needed for I/O.
+ *
+ * The purpose of loading is to ensure the CRC pages are in memory when
+ * the async_io (write) completes, so the CRC accounting doesn't sleep
+ * and violate the callback() API rules.
+ */
+static void cache_if_dirty(struct zoned *znd, struct map_pg *pg, int wq)
+{
+ if (test_bit(IS_DIRTY, &pg->flags) && test_bit(IS_LUT, &pg->flags) &&
+ pg->data.addr) {
+ u64 base = znd->c_base;
+ struct map_pg *crc_pg;
+ struct mpinfo mpi;
+
+ if (test_bit(IS_REV, &pg->flags))
+ base = znd->c_mid;
+
+ to_table_entry(znd, pg->lba, &mpi);
+ base += mpi.crc.pg_no;
+ crc_pg = get_map_entry(znd, base, NORMAL);
+ if (!crc_pg)
+ Z_ERR(znd, "Out of memory. No CRC Pg");
+ pg->crc_pg = crc_pg;
+ }
+}
+
+/**
+ * write_if_dirty() - Write dirty pages (flagged as DIRTY) of the table map.
+ * @znd: ZDM instance.
+ * @pg: A page of table map data.
+ * @wq: Use worker queue for sync writes.
+ * @snc: Perform a sync or async write.
+ *
+ * Return: 0 on success or -errno value
+ */
+static int write_if_dirty(struct zoned *znd, struct map_pg *pg, int wq, int snc)
+{
+ int rcode = 0;
+
+ if (!pg)
+ return rcode;
+
+ if (test_bit(IS_DIRTY, &pg->flags) && pg->data.addr) {
+ u64 dm_s = pg->lba;
+ u32 nf;
+ u64 lba = z_metadata_lba(znd, pg, &nf);
+
+ Z_DBG(znd, "Write if dirty: %" PRIx64" -> %" PRIx64, dm_s, lba);
+
+ if (lba && nf) {
+ int rcwrt;
+ int count = 1;
+ __le16 md_crc;
+
+ set_bit(W_IN_FLIGHT, &pg->flags);
+
+ if (!snc) {
+ rcode = queue_pg(znd, pg, lba);
+ goto out_queued;
+ }
+
+ md_crc = crc_md_le16(pg->data.addr, Z_CRC_4K);
+ rcwrt = write_block(znd->ti, DM_IO_KMEM,
+ pg->data.addr, lba, count, wq);
+ pg->age = jiffies_64 + msecs_to_jiffies(pg->hotness);
+ pg->last_write = lba;
+ pg_crc(znd, pg, md_crc);
+
+ if (rcwrt) {
+ Z_ERR(znd, "write_page: %" PRIx64 " -> %" PRIx64
+ " ERR: %d", pg->lba, lba, rcwrt);
+ rcode = rcwrt;
+ goto out;
+ }
+
+ if (crc_md_le16(pg->data.addr, Z_CRC_4K) == md_crc)
+ clear_bit(IS_DIRTY, &pg->flags);
+ clear_bit(W_IN_FLIGHT, &pg->flags);
+ if (lba < znd->data_lba)
+ goto out;
+
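+			/*
+			 * The page was placed in the data pool, so record
+			 * the mapping and account the used blocks, mirroring
+			 * the async completion path in pg_written().
+			 */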
+ rcwrt = z_mapped_addmany(znd, dm_s, lba, nf, NORMAL);
+ if (rcwrt) {
+ Z_ERR(znd, "%s: Journal MANY failed.",
+ __func__);
+ rcode = rcwrt;
+ goto out;
+ }
+ increment_used_blks(znd, lba, nf);
+ } else {
+ Z_ERR(znd, "%s: Out of space for metadata?", __func__);
+ rcode = -ENOSPC;
+ goto out;
+ }
+ }
+
+out:
+ deref_pg(pg);
+
+out_queued:
+ if (rcode < 0)
+ znd->meta_result = rcode;
+
+ return rcode;
+}
+
+/**
+ * _sync_dirty() - Write all *dirty* ZLT blocks to disk (journal->SYNC->home)
+ * @znd: ZDM instance
+ * @bit_type: MAP blocks then CRC blocks.
+ * @sync: If true write dirty blocks to disk
+ * @drop: Number of ZLT blocks to free.
+ *
+ * Return: 0 on success or -errno value
+ */
+static int _sync_dirty(struct zoned *znd, int bit_type, int sync, int drop)
+{
+ int err = 0;
+ int entries = 0;
+ int want_flush = 0;
+ struct map_pg *expg = NULL;
+ struct map_pg *_tpg;
+ struct map_pg **wset = NULL;
+ int dlstsz = 0;
+ LIST_HEAD(droplist);
+
+ wset = ZDM_CALLOC(znd, sizeof(*wset), MAX_WSET, KM_19, NORMAL);
+
+ spin_lock(&znd->zlt_lck);
+ if (list_empty(&znd->zltpool))
+ goto writeback;
+
+ expg = list_last_entry(&znd->zltpool, typeof(*expg), zltlst);
+ if (!expg || &expg->zltlst == (&znd->zltpool))
+ goto writeback;
+
+ _tpg = list_prev_entry(expg, zltlst);
+ while (&expg->zltlst != &znd->zltpool) {
+ ref_pg(expg);
+
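+		/*
+		 * Dirty pages of the requested type are collected for
+		 * writeback; clean pages that have aged out are unlinked
+		 * from the zlt pool and either parked on the lazy list or
+		 * freed outright, depending on ENABLE_PG_FREE_VIA_LAZY.
+		 */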
+ if (sync && (entries < MAX_WSET) && is_dirty(expg, bit_type)) {
+ wset[entries++] = expg;
+ } else if ((drop > 0) && is_old_and_clean(expg, bit_type)) {
+ struct map_pg **zlt;
+ spinlock_t *lock;
+ int index = expg->index;
+ int is_lut = test_bit(IS_LUT, &expg->flags);
+ int is_fwd = test_bit(IS_FWD, &expg->flags);
+ u32 count = is_lut ? znd->map_count : znd->crc_count;
+
+ if (is_lut) {
+ zlt = is_fwd ? znd->fwd_tm : znd->rev_tm;
+ lock = &znd->mapkey_lock;
+ } else {
+ zlt = is_fwd ? znd->fwd_crc : znd->rev_crc;
+ lock = &znd->ct_lock;
+ }
+ spin_lock(lock);
+ if (index > -1 && index < count &&
+ getref_pg(expg) == 1 && expg == zlt[index]) {
+ zlt[expg->index] = NULL;
+ list_del(&expg->zltlst);
+ znd->in_zlt--;
+ clear_bit(IN_ZLT, &expg->flags);
+ drop--;
+				expg->age = jiffies_64
+					  + msecs_to_jiffies(expg->hotness);
+
+#if ENABLE_PG_FREE_VIA_LAZY
+ if (test_bit(IS_LAZY, &expg->flags))
+ Z_ERR(znd, "** Pg is lazy && zlt %llx",
+ expg->lba);
+
+ if (!test_bit(IS_LAZY, &expg->flags)) {
+ list_add(&expg->lazy, &droplist);
+ znd->in_lzy++;
+ set_bit(IS_LAZY, &expg->flags);
+ set_bit(IS_DROPPED, &expg->flags);
+ dlstsz++;
+ }
+ zlt[expg->index] = expg; /* so undrop works */
+ deref_pg(expg);
+ if (getref_pg(expg) > 0)
+ Z_ERR(znd, "Moving elv ref: %u",
+ getref_pg(expg));
+#else
+ MutexLock(&expg->md_lock);
+ if (expg->data.addr) {
+ void *pg = expg->data.addr;
+
+ expg->data.addr = NULL;
+ ZDM_FREE(znd, pg, Z_C4K, PG_27);
+ atomic_dec(&znd->incore);
+ if (!test_bit(IS_FLUSH, &expg->flags))
+ want_flush++;
+ }
+ mutex_unlock(&expg->md_lock);
+ deref_pg(expg);
+ if (getref_pg(expg) > 0)
+ Z_ERR(znd, "Dropped elv ref: %u",
+ getref_pg(expg));
+ ZDM_FREE(znd, expg, sizeof(*expg), KM_20);
+#endif
+ }
+ spin_unlock(lock);
+ } else {
+ deref_pg(expg);
+ }
+ if (entries == MAX_WSET)
+ break;
+
+ expg = _tpg;
+ _tpg = list_prev_entry(expg, zltlst);
+ }
+
+writeback:
+ spin_unlock(&znd->zlt_lck);
+
+ if (entries > 0) {
+ err = _pool_write(znd, wset, entries);
+ if (err < 0)
+ goto out;
+ if (entries == MAX_WSET)
+ err = -EBUSY;
+ }
+
+out:
+ if (want_flush)
+ set_bit(DO_FLUSH, &znd->flags);
+ if (!list_empty(&droplist))
+ lazy_pool_splice(znd, &droplist);
+
+ if (wset)
+ ZDM_FREE(znd, wset, sizeof(*wset) * MAX_WSET, KM_19);
+
+ return err;
+}
+
+/**
+ * sync_dirty() - Write all *dirty* ZLT blocks to disk (journal->SYNC->home)
+ * @znd: ZDM instance
+ * @bit_type: MAP blocks then CRC blocks.
+ * @sync: Write dirty blocks
+ * @drop: IN: # of pages to free.
+ *
+ * Return: 0 on success or -errno value
+ */
+static int sync_dirty(struct zoned *znd, int bit_type, int sync, int drop)
+{
+ int err;
+
+ MutexLock(&znd->pool_mtx);
+ do {
+ err = _sync_dirty(znd, bit_type, sync, drop);
+ drop = 0;
+ } while (err == -EBUSY);
+
+ if (err > 0)
+ err = 0;
+ mutex_unlock(&znd->pool_mtx);
+
+ return err;
+}
+
+/**
+ * sync_mapped_pages() - Migrate lookup tables and crc pages to disk
+ * @znd: ZDM instance
+ * @sync: If dirty blocks need to be written.
+ * @drop: Number of blocks to drop.
+ *
+ * Return: 0 on success or -errno value
+ */
+static int sync_mapped_pages(struct zoned *znd, int sync, int drop)
+{
+ int err;
+ int remove = drop ? 1 : 0;
+
+ err = sync_dirty(znd, IS_LUT, sync, drop);
+
+ /* on error return */
+ if (err < 0)
+ return err;
+
+ /* TBD: purge CRC's on ref-count? */
+ err = sync_dirty(znd, IS_CRC, sync, remove);
+
+ return err;
+}
+
+/**
+ * map_addr_aligned() - Calculate lookup table locations for a logical sector.
+ * @znd: ZDM instance
+ * @dm_s: Logical sector (4k block) to locate.
+ * @out: Calculated zone id, lookup table lbas, and page index.
+ *
+ * dm_s is a logical sector that maps 1:1 to the whole disk in 4k blocks.
+ * Here the logical LBA and field are calculated for the lookup table
+ * where the physical LBA can be read from disk.
+ */
+static int map_addr_aligned(struct zoned *znd, u64 dm_s, struct map_addr *out)
+{
+ u64 block = dm_s >> 10;
+
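+	/*
+	 * Each 4k lookup table page holds 1024 entries and a zone spans
+	 * 64 such pages (65536 blocks), hence the >> 10 and >> 6 here.
+	 */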
+ out->zone_id = block >> 6;
+ out->lut_s = block + znd->s_base;
+ out->lut_r = block + znd->r_base;
+ out->pg_idx = dm_s & 0x3FF;
+
+ return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+/**
+ * map_addr_calc()
+ * @znd: ZDM instance
+ * @origin: address to calc
+ * @out: address, zone, crc, lut addr
+ *
+ * dm_s is a logical sector that maps 1:1 to the whole disk in 4k blocks
+ * Here the logical LBA and field are calculated for the lookup table
+ * where the physical LBA can be read from disk.
+ */
+static int map_addr_calc(struct zoned *znd, u64 origin, struct map_addr *out)
+{
+ u64 offset = ((origin < znd->md_end) ? znd->md_start : znd->md_end);
+
+ out->dm_s = origin;
+ return map_addr_aligned(znd, origin - offset, out);
+}
+
+/**
+ * read_pg() - Read a page of LUT/CRC from disk.
+ * @znd: ZDM instance
+ * @pg: Page to fill
+ * @lba48: On-disk location to read the page from.
+ * @gfp: Memory allocation rule
+ * @mpi: Backing page locations.
+ *
+ * Load a page of the sector lookup table that maps to pg->lba.
+ * If pg->lba is not on disk return 0.
+ *
+ * Return: 1 if page exists, 0 if unmodified, else -errno on error.
+ */
+static int read_pg(struct zoned *znd, struct map_pg *pg, u64 lba48, gfp_t gfp,
+ struct mpinfo *mpi)
+{
+ int rcode = 0;
+ const int count = 1;
+ const int wq = 1;
+ int rd;
+ __le16 check;
+ __le16 expect = 0;
+
+ /**
+ * This table entry may be on-disk, if so it needs to
+ * be loaded.
+ * If not it needs to be initialized to 0xFF
+ *
+ * When ! ZF_POOL_FWD and mapped block is FWD && LUT
+ * the mapped->lba may be pinned in sk_ pool
+ * if so the sk_ pool entry may need to be pulled
+	 * to resolve the current address of mapped->lba
+ */
+
+/* TODO: Async reads */
+
+ if (warn_bad_lba(znd, lba48))
+ Z_ERR(znd, "Bad PAGE %" PRIx64, lba48);
+
+ rd = read_block(znd->ti, DM_IO_KMEM, pg->data.addr, lba48, count, wq);
+ if (rd) {
+ Z_ERR(znd, "%s: read_block: ERROR: %d", __func__, rd);
+ rcode = -EIO;
+ goto out;
+ }
+
+ /*
+ * Now check block crc
+ */
+ check = crc_md_le16(pg->data.addr, Z_CRC_4K);
+ if (test_bit(IS_LUT, &pg->flags)) {
+ struct map_pg *crc_pg;
+ u64 base = znd->c_base;
+
+ if (test_bit(IS_REV, &pg->flags))
+ base = znd->c_mid;
+
+ crc_pg = get_map_entry(znd, mpi->crc.pg_no + base, gfp);
+ if (crc_pg) {
+ ref_pg(crc_pg);
+ if (crc_pg->data.crc) {
+ MutexLock(&crc_pg->md_lock);
+ expect = crc_pg->data.crc[mpi->crc.pg_idx];
+ mutex_unlock(&crc_pg->md_lock);
+ crc_pg->age = jiffies_64
+ + msecs_to_jiffies(crc_pg->hotness);
+ }
+ deref_pg(crc_pg);
+ }
+ put_map_entry(crc_pg);
+ } else {
+ expect = znd->md_crcs[mpi->crc.pg_idx];
+ }
+
+ if (check != expect) {
+ znd->meta_result = -ENOSPC;
+
+ Z_ERR(znd,
+ "Corrupt metadata: %" PRIx64 " from %" PRIx64
+ " [%04x != %04x]",
+ pg->lba, lba48,
+ le16_to_cpu(check),
+ le16_to_cpu(expect));
+ }
+ rcode = 1;
+
+out:
+ return rcode;
+}
+
+/**
+ * cache_pg() - Load a page of LUT/CRC into memory from disk, or default values.
+ * @znd: ZDM instance
+ * @pg: Page to fill
+ * @gfp: Memory allocation rule
+ * @mpi: Backing page locations.
+ *
+ * Return: 1 if page loaded from disk, 0 if empty, else -errno on error.
+ */
+static int cache_pg(struct zoned *znd, struct map_pg *pg, gfp_t gfp,
+ struct mpinfo *mpi)
+{
+ int rc = 0;
+ int empty_val = test_bit(IS_LUT, &pg->flags) ? 0xff : 0;
+ u64 lba48 = current_mapping(znd, pg->lba, gfp);
+
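+	/*
+	 * A zero mapping means this table page has never been written;
+	 * it is initialized to its empty pattern instead (0xff for LUT
+	 * entries, 0 for CRCs).
+	 */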
+ ref_pg(pg);
+ MutexLock(&pg->md_lock);
+ if (!pg->data.addr) {
+ pg->data.addr = ZDM_ALLOC(znd, Z_C4K, PG_27, gfp);
+ if (pg->data.addr) {
+ if (lba48)
+ rc = read_pg(znd, pg, lba48, gfp, mpi);
+ else
+ memset(pg->data.addr, empty_val, Z_C4K);
+
+ if (rc < 0) {
+ Z_ERR(znd, "%s: read_pg from %" PRIx64
+ " [to? %x] error: %d", __func__,
+ pg->lba, empty_val, rc);
+ ZDM_FREE(znd, pg->data.addr, Z_C4K, PG_27);
+ }
+ } else {
+ Z_ERR(znd, "%s: Out of memory.", __func__);
+ rc = -ENOMEM;
+ }
+ if (pg->data.addr) {
+ set_bit(IS_FLUSH, &pg->flags);
+ atomic_inc(&znd->incore);
+ pg->age = jiffies_64 + msecs_to_jiffies(pg->hotness);
+ rc = pool_add(znd, pg);
+ Z_DBG(znd, "Page loaded: lba: %" PRIx64, pg->lba);
+ }
+ }
+ mutex_unlock(&pg->md_lock);
+
+ deref_pg(pg);
+ return rc;
+}
+
+/**
+ * z_lookup_table() - resolve a sector mapping via ZLT mapping
+ * @znd: ZDM Instance
+ * @addr: Address to resolve (via FWD map).
+ * @gfp: Current allocation flags.
+ */
+static u64 z_lookup_table(struct zoned *znd, u64 addr, gfp_t gfp)
+{
+ struct map_addr maddr;
+ struct map_pg *pg;
+ u64 old_phy = 0;
+
+ map_addr_calc(znd, addr, &maddr);
+ pg = get_map_entry(znd, maddr.lut_s, gfp);
+ if (pg) {
+ ref_pg(pg);
+ if (pg->data.addr) {
+ __le32 delta;
+
+ MutexLock(&pg->md_lock);
+ delta = pg->data.addr[maddr.pg_idx];
+ mutex_unlock(&pg->md_lock);
+ old_phy = map_value(znd, delta);
+ pg->age = jiffies_64 + msecs_to_jiffies(pg->hotness);
+ clear_bit(IS_READA, &pg->flags);
+ }
+ deref_pg(pg);
+ put_map_entry(pg);
+ }
+ return old_phy;
+}
+
+/**
+ * update_map_entry() - Update a forward or reverse lookup table entry.
+ * @znd: ZDM instance
+ * @pg: Page of the lookup table to update.
+ * @maddr: map_addr
+ * @to_addr: LBA or sector #.
+ * @is_fwd: flag forward or reverse lookup table.
+ *
+ * when is_fwd is 1:
+ *  - maddr->dm_s is a sector -> lba.
+ *	 in this case the old lba is discarded and scheduled for cleanup
+ *	 by updating the reverse map lba tables noting that this location
+ *	 is now unused.
+ * when is_fwd is 0:
+ *  - maddr->dm_s is an lba, lba -> dm_s
+ *
+ * Return: non-zero on error.
+ */
+static int update_map_entry(struct zoned *znd, struct map_pg *pg,
+ struct map_addr *maddr, u64 to_addr, int is_fwd)
+{
+ int err = -ENOMEM;
+
+ if (pg && pg->data.addr) {
+ u64 index = maddr->pg_idx;
+ __le32 delta;
+ __le32 value;
+ int was_updated = 0;
+
+ ref_pg(pg);
+ MutexLock(&pg->md_lock);
+ delta = pg->data.addr[index];
+ err = map_encode(znd, to_addr, &value);
+ if (!err) {
+ /*
+ * if the value is modified update the table and
+ * place it on top of the active [zltlst] list
+ * this will keep the chunk of lookup table in
+ * memory.
+ */
+ if (pg->data.addr[index] != value) {
+ pg->data.addr[index] = value;
+ pg->age = jiffies_64
+ + msecs_to_jiffies(pg->hotness);
+ set_bit(IS_DIRTY, &pg->flags);
+ clear_bit(IS_FLUSH, &pg->flags);
+ clear_bit(IS_READA, &pg->flags);
+ was_updated = 1;
+ } else if (value != MZTEV_UNUSED) {
+ Z_ERR(znd, "*ERR* %" PRIx64
+ " -> data.addr[index] (%x) == (%x)",
+ maddr->dm_s, pg->data.addr[index],
+ value);
+ dump_stack();
+ }
+ } else {
+ Z_ERR(znd, "*ERR* Mapping: %" PRIx64 " to %" PRIx64,
+ to_addr, maddr->dm_s);
+ }
+ mutex_unlock(&pg->md_lock);
+
+ if (was_updated && is_fwd && (delta != MZTEV_UNUSED)) {
+ u64 old_phy = map_value(znd, delta);
+
+ /*
+ * add to discard list of the controlling mzone
+ * for the 'delta' physical block
+ */
+ Z_DBG(znd, "%s: unused_phy: %" PRIu64
+ " (new lba: %" PRIu64 ")",
+ __func__, old_phy, to_addr);
+
+ WARN_ON(old_phy >= znd->nr_blocks);
+
+ err = unused_phy(znd, old_phy, 0, NORMAL);
+ if (err)
+ err = -ENOSPC;
+ }
+ deref_pg(pg);
+ }
+ return err;
+}
+
+/**
+ * move_to_map_tables() - Migrate memcache to lookup table map entries.
+ * @znd: ZDM instance
+ * @mcache: memcache block.
+ *
+ * Return: non-zero on error.
+ */
+static int move_to_map_tables(struct zoned *znd, struct map_cache *mcache)
+{
+ struct map_pg *smtbl = NULL;
+ struct map_pg *rmtbl = NULL;
+ struct map_addr maddr = { .dm_s = 0ul };
+ struct map_addr rev = { .dm_s = 0ul };
+ u64 lut_s = BAD_ADDR;
+ u64 lut_r = BAD_ADDR;
+ int jentry;
+ int err = 0;
+ int is_fwd = 1;
+
+	/* The journal being moved must remain stable so sorting
+	 * is disabled. If a sort is desired due to an unsorted
+	 * page the search devolves to a linear lookup.
+	 */
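+	/*
+	 * Walk the cache entries from the tail, updating the forward map
+	 * and, for real block addresses, the reverse map. smtbl/rmtbl keep
+	 * the most recently used table pages referenced across iterations.
+	 */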
+ mcache_busy(mcache);
+ for (jentry = mcache->jcount; jentry > 0;) {
+ u64 dm_s = le64_to_lba48(mcache->jdata[jentry].tlba, NULL);
+ u64 lba = le64_to_lba48(mcache->jdata[jentry].bval, NULL);
+
+ if (dm_s == Z_LOWER48 || lba == Z_LOWER48) {
+ mcache->jcount = --jentry;
+ continue;
+ }
+
+ map_addr_calc(znd, dm_s, &maddr);
+ if (lut_s != maddr.lut_s) {
+ put_map_entry(smtbl);
+ if (smtbl)
+ deref_pg(smtbl);
+ smtbl = get_map_entry(znd, maddr.lut_s, NORMAL);
+ if (!smtbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+ ref_pg(smtbl);
+ lut_s = smtbl->lba;
+ }
+
+ is_fwd = 1;
+ if (lba == 0ul)
+ lba = BAD_ADDR;
+ err = update_map_entry(znd, smtbl, &maddr, lba, is_fwd);
+ if (err < 0)
+ goto out;
+
+ /*
+ * In this case the reverse was handled as part of
+ * discarding the forward map entry -- if it was in use.
+ */
+ if (lba != BAD_ADDR) {
+ map_addr_calc(znd, lba, &rev);
+ if (lut_r != rev.lut_r) {
+ put_map_entry(rmtbl);
+ if (rmtbl)
+ deref_pg(rmtbl);
+ rmtbl = get_map_entry(znd, rev.lut_r, NORMAL);
+ if (!rmtbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+ ref_pg(rmtbl);
+ lut_r = rmtbl->lba;
+ }
+ is_fwd = 0;
+ err = update_map_entry(znd, rmtbl, &rev, dm_s, is_fwd);
+ if (err == 1)
+ err = 0;
+ }
+
+ if (err < 0)
+ goto out;
+
+ mcache->jdata[jentry].tlba = MC_INVALID;
+ mcache->jdata[jentry].bval = MC_INVALID;
+ if (mcache->jsorted == mcache->jcount)
+ mcache->jsorted--;
+ mcache->jcount = --jentry;
+ }
+out:
+ if (smtbl)
+ deref_pg(smtbl);
+ if (rmtbl)
+ deref_pg(rmtbl);
+ put_map_entry(smtbl);
+ put_map_entry(rmtbl);
+ set_bit(DO_MEMPOOL, &znd->flags);
+ mcache_unbusy(mcache);
+
+ return err;
+}
+
+/**
+ * unused_phy() - Mark a block as unused.
+ * @znd: ZDM instance
+ * @lba: Logical LBA of block.
+ * @orig_s: Sector being marked.
+ * @gfp: Memory allocation rule
+ *
+ * Return: non-zero on error.
+ *
+ * Add an unused block to the list of blocks to be discarded during
+ * garbage collection.
+ */
+static int unused_phy(struct zoned *znd, u64 lba, u64 orig_s, gfp_t gfp)
+{
+ int err = 0;
+ struct map_pg *pg;
+ struct map_addr reverse;
+ int z_off;
+
+ if (lba < znd->data_lba)
+ return 0;
+
+ map_addr_calc(znd, lba, &reverse);
+ z_off = reverse.zone_id % 1024;
+ pg = get_map_entry(znd, reverse.lut_r, gfp);
+ if (!pg) {
+ err = -EIO;
+ Z_ERR(znd, "unused_phy: Reverse Map Entry not found.");
+ goto out;
+ }
+
+ if (!pg->data.addr) {
+ Z_ERR(znd, "Catastrophic missing LUT page.");
+ dump_stack();
+ err = -EIO;
+ goto out;
+ }
+ ref_pg(pg);
+
+ /*
+ * if the value is modified update the table and
+ * place it on top of the active [zltlst] list
+ */
+ if (pg->data.addr[reverse.pg_idx] != MZTEV_UNUSED) {
+ u32 gzno = reverse.zone_id >> GZ_BITS;
+ u32 gzoff = reverse.zone_id & GZ_MMSK;
+ struct meta_pg *wpg = &znd->wp[gzno];
+ u32 wp;
+ u32 zf;
+ u32 stream_id;
+
+ if (orig_s) {
+ __le32 enc = pg->data.addr[reverse.pg_idx];
+ u64 dm_s = map_value(znd, enc);
+ int drop_discard = 0;
+
+ if (dm_s < znd->data_lba) {
+ drop_discard = 1;
+ Z_ERR(znd, "Discard invalid target %"
+ PRIx64" - Is ZDM Meta %"PRIx64" vs %"
+ PRIx64, lba, orig_s, dm_s);
+ }
+ if (orig_s != dm_s) {
+ drop_discard = 1;
+ Z_ERR(znd,
+ "Discard %" PRIx64
+ " mismatched src: %"PRIx64 " vs %" PRIx64,
+ lba, orig_s, dm_s);
+ }
+ if (drop_discard)
+ goto out_unlock;
+ }
+ MutexLock(&pg->md_lock);
+ pg->data.addr[reverse.pg_idx] = MZTEV_UNUSED;
+ mutex_unlock(&pg->md_lock);
+ pg->age = jiffies_64 + msecs_to_jiffies(pg->hotness);
+ set_bit(IS_DIRTY, &pg->flags);
+ clear_bit(IS_READA, &pg->flags);
+ clear_bit(IS_FLUSH, &pg->flags);
+
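+		/*
+		 * Bump the zone's free (stale) block estimate under the
+		 * write pointer lock and flag the zone for a stale-ratio
+		 * recalculation.
+		 */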
+ SpinLock(&wpg->wplck);
+ wp = le32_to_cpu(wpg->wp_alloc[gzoff]);
+ zf = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_VALUE_MASK;
+ stream_id = le32_to_cpu(wpg->zf_est[gzoff]) & Z_WP_STREAM_MASK;
+ if (wp > 0 && zf < Z_BLKSZ) {
+ zf++;
+ wpg->zf_est[gzoff] = cpu_to_le32(zf | stream_id);
+ wpg->wp_alloc[gzoff] = cpu_to_le32(wp | Z_WP_RRECALC);
+ set_bit(IS_DIRTY, &wpg->flags);
+ clear_bit(IS_FLUSH, &wpg->flags);
+ }
+ spin_unlock(&wpg->wplck);
+ if ((wp & Z_WP_VALUE_MASK) == Z_BLKSZ)
+ znd->discard_count++;
+ } else {
+ Z_DBG(znd, "lba: %" PRIx64 " alread reported as free?", lba);
+ }
+
+out_unlock:
+ deref_pg(pg);
+
+out:
+ put_map_entry(pg);
+
+ return err;
+}
ZDM presents a traditional block device for ZBC/ZAC zoned devices.
User space utilities in zdm-tools, for creating, repairing and restoring
DM instances, are available at: https://github.com/Seagate

Signed-off-by: Shaun Tancheff <shaun.tancheff@seagate.com>
---
 Documentation/device-mapper/zoned.txt |   75 +
 MAINTAINERS                           |    7 +
 drivers/md/Kconfig                    |   11 +
 drivers/md/Makefile                   |    1 +
 drivers/md/dm-zoned.c                 | 2535 ++++++++++++
 drivers/md/dm-zoned.h                 |  714 ++++
 drivers/md/libzoned.c                 | 7149 +++++++++++++++++++++++++++++++++
 7 files changed, 10492 insertions(+)
 create mode 100644 Documentation/device-mapper/zoned.txt
 create mode 100644 drivers/md/dm-zoned.c
 create mode 100644 drivers/md/dm-zoned.h
 create mode 100644 drivers/md/libzoned.c