@@ -47,6 +47,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_REPLICATOR) += dm-replicator.o \
dm-repl-log-ringbuffer.o \
+ dm-repl-slink-blockdev.o \
dm-log.o dm-registry.o
obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o \
dm-region-hash.o
new file mode 100644
@@ -0,0 +1,3049 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen (heinzm@redhat.com)
+ *
+ * This file is released under the GPL.
+ *
+ *
+ * "blockdev" site link handler for the replicator target supporting
+ * devices on block transports with device node access.
+ *
+ * Locking Hierarchy:
+ * 1) repl_slinks->lock
+ * 2) sl->lock
+ *
+ */
+
+static const char version[] = "v0.022";
+
+#include "dm.h"
+#include "dm-repl.h"
+#include "dm-registry.h"
+#include "dm-repl-log.h"
+#include "dm-repl-slink.h"
+
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "dm-repl-slink-blockdev"
+#define DAEMON DM_MSG_PREFIX "d"
+#define COPY_PAGES BIO_MAX_PAGES
+#define RESYNC_PAGES (BIO_MAX_PAGES / 2)
+#define RESYNC_SIZE (to_sector(PAGE_SIZE) * RESYNC_PAGES)
+#define MIN_IOS 1
+
+/* Jiffies to wait before retrying a device. */
+#define SLINK_TEST_JIFFIES (15 * HZ)
+#define SLINK_TEST_SIZE 4096
+
+/*
+ * Resolve the internal slink object behind a dm_repl_slink into @sl,
+ * BUGging on invalid pointers.
+ *
+ * No semicolon after while (0): the macro must be usable as a single
+ * statement, e.g. in an unbraced if/else branch.
+ */
+#define _SET_AND_BUG_ON_SL(sl, slink) \
+	do { \
+		_BUG_ON_PTR(slink); \
+		(sl) = slink_check(slink); \
+		_BUG_ON_PTR(sl); \
+	} while (0)
+
+/* An slink device. */
+enum sdev_list_type {
+ SDEV_SLINK, /* To put on slink's device list. */
+ SDEV_RESYNC, /* To put on slink's resynchronization list. */
+ SDEV_TEST, /* To put on so_slink_test()'s test list. */
+ NR_SDEV_LISTS,
+};
+struct test_buffer;
+struct sdev {
+ struct kref ref; /* Reference counter. */
+
+ /* Lists to hang off slink, resync and flush lists */
+ struct list_head lists[NR_SDEV_LISTS];
+ struct dm_target *ti;
+ struct slink *sl; /* Backpointer for callback. */
+
+ struct {
+ struct sdev_resync {
+ sector_t region; /* Region being resynchronized. */
+ sector_t writer_region; /* region being written to. */
+ sector_t start; /* Start of actual resync copy. */
+ sector_t end; /* End of actual resync copy. */
+ unsigned len; /* Length of actual copy. */
+ /* Source device pointer for resync callback. */
+ struct sdev *from;
+ } resync;
+
+ struct {
+ unsigned long time;
+ struct test_buffer *buffer;
+ } test;
+
+
+ /* kcopyd resynchronization client. */
+ struct dm_kcopyd_client *kcopyd_client;
+
+ /* Teardown synchronization. */
+ wait_queue_head_t waiters;
+
+ sector_t split_io;
+
+ unsigned long flags;
+ } io;
+
+ /* Device properties. */
+ struct sdev_dev {
+ struct {
+ unsigned count; /* Ctr parameters count. */
+ const char *path; /* Device path/major:minor. */
+ } params;
+
+ struct dm_dev *dm_dev;
+ struct dm_dirty_log *dl;
+ unsigned number;
+ } dev;
+};
+
+/* Macros to access sdev lists. */
+#define SDEV_SLINK_LIST(sl) (sl->lists + SDEV_SLINK)
+#define SDEV_RESYNC_LIST(sl) (sl->lists + SDEV_RESYNC)
+
+/* Status flags for device. */
+enum dev_flags {
+ DEV_ERROR_READ, /* Read error on device. */
+ DEV_ERROR_WRITE, /* Write error on device. */
+ DEV_IO_QUEUED, /* Request(s) to device queued. */
+ DEV_IO_UNPLUG, /* Unplug device queue. */
+ DEV_OPEN, /* Device got opend during ctr. */
+ DEV_RESYNC, /* Device may resync. */
+ DEV_RESYNC_END, /* Flag device resynchronization end. */
+ DEV_RESYNCING, /* Device has active resync. */
+ DEV_SUSPENDED, /* Device suspended. */
+ DEV_TEARDOWN, /* Device is being deleted. */
+};
+
+/* Create slink bitfield (io.flags) access inline definitions. */
+DM_BITOPS(DevErrorRead, sdev, DEV_ERROR_READ)
+DM_BITOPS(DevErrorWrite, sdev, DEV_ERROR_WRITE)
+DM_BITOPS(DevIOQueued, sdev, DEV_IO_QUEUED)
+DM_BITOPS(DevIOUnplug, sdev, DEV_IO_UNPLUG)
+DM_BITOPS(DevOpen, sdev, DEV_OPEN)
+DM_BITOPS(DevResync, sdev, DEV_RESYNC)
+DM_BITOPS(DevResyncEnd, sdev, DEV_RESYNC_END)
+DM_BITOPS(DevResyncing, sdev, DEV_RESYNCING)
+DM_BITOPS(DevSuspended, sdev, DEV_SUSPENDED)
+DM_BITOPS(DevTeardown, sdev, DEV_TEARDOWN)
+
+/* Internal site link representation. */
+enum slink_list_type { SLINK_DEVS, SLINK_REPLOG, SLINK_RESYNC, NR_SLINK_LISTS };
+enum cache_type { COPY_CACHE, TEST_CACHE, NR_CACHES };
+struct slink {
+ struct kref ref; /* Reference count. */
+ /*
+ * Protect slink lists.
+ *
+ * Has to be spinlock, because the global replog lock
+ * needs to be one to be used from interrupt context
+ * and they are both taken in some places.
+ */
+ rwlock_t lock; /* Protect slink lists. */
+
+ /* Devices on this slink, on replog list and on resync list. */
+ struct list_head lists[NR_SLINK_LISTS];
+
+ /* List of all slinks for a replog. */
+ struct dm_repl_log_slink_list *repl_slinks;
+
+ unsigned number; /* slink number. */
+
+ struct slink_params {
+ unsigned count;
+ struct dm_repl_slink_fallbehind fallbehind;
+ enum dm_repl_slink_policy_type policy;
+ } params;
+
+ struct dm_repl_slink *slink;
+
+ struct slink_io {
+ unsigned long flags;
+ struct dm_kcopyd_client *kcopyd_client;
+ struct dm_io_client *dm_io_client;
+
+ /* Copy context and test buffer mempools. */
+ mempool_t *pool[NR_CACHES];
+
+ /* io work. */
+ struct workqueue_struct *wq;
+ struct delayed_work dws;
+
+ struct sdev *dev_test;
+ } io;
+
+ /* Callback for slink recovered. */
+ struct dm_repl_slink_notify_ctx recover;
+};
+
+/* Macros to access slink lists. */
+#define SLINK_DEVS_LIST(sl) (sl->lists + SLINK_DEVS)
+#define SLINK_REPLOG_LIST(sl) (sl->lists + SLINK_REPLOG)
+#define SLINK_RESYNC_LIST(sl) (sl->lists + SLINK_RESYNC)
+
+/* Status flags for slink. */
+enum slink_flags {
+ SLINK_ERROR_READ, /* Read error on site link. */
+ SLINK_ERROR_WRITE, /* Write error on site link. */
+ SLINK_IMMEDIATE_WORK, /* Flag immediate worker run. */
+ SLINK_RESYNC_PROCESSING,/* Resync is being processed on slink. */
+ SLINK_TEST_ACTIVE, /* Device test active on slink. */
+ SLINK_WRITER, /* Slink is being written to. */
+};
+
+/* Create slink bitfield (io.flags) access inline definitions. */
+DM_BITOPS(SlinkErrorRead, slink, SLINK_ERROR_READ)
+DM_BITOPS(SlinkErrorWrite, slink, SLINK_ERROR_WRITE)
+DM_BITOPS(SlinkImmediateWork, slink, SLINK_IMMEDIATE_WORK)
+DM_BITOPS(SlinkResyncProcessing, slink, SLINK_RESYNC_PROCESSING)
+DM_BITOPS(SlinkTestActive, slink, SLINK_TEST_ACTIVE)
+DM_BITOPS(SlinkWriter, slink, SLINK_WRITER)
+
+/* Copy context to carry from blockdev_copy() to copy_endio(). */
+struct copy_context {
+ struct sdev *dev_to; /* Device to copy to. */
+
+ /* Callback for data in RAM (noop for 'blockdev' type). */
+ struct dm_repl_slink_notify_ctx ram;
+
+ /* Callback for data on disk. */
+ struct dm_repl_slink_notify_ctx disk;
+};
+
+/*
+ * Allocate/free blockdev copy context.
+ *
+ * Backed by the per-slink COPY_CACHE mempool; GFP_KERNEL, so
+ * allocation may sleep and must not be called from interrupt context.
+ */
+static inline struct copy_context *alloc_copy_context(struct slink *sl)
+{
+	return mempool_alloc(sl->io.pool[COPY_CACHE], GFP_KERNEL);
+}
+
+static inline void free_copy_context(struct copy_context *cc, struct slink *sl)
+{
+	mempool_free(cc, sl->io.pool[COPY_CACHE]);
+}
+
+/*
+ * Allocate/free blockdev test io buffer
+ * (TEST_CACHE mempool; GFP_KERNEL, may sleep).
+ */
+static inline struct test_buffer *alloc_test_buffer(struct slink *sl)
+{
+	return mempool_alloc(sl->io.pool[TEST_CACHE], GFP_KERNEL);
+}
+
+static inline void free_test_buffer(struct test_buffer *tb, struct slink *sl)
+{
+	mempool_free(tb, sl->io.pool[TEST_CACHE]);
+}
+
+/* Descriptor type <-> name mapping. */
+static const struct dm_str_descr policies[] = {
+	{ DM_REPL_SLINK_ASYNC, "asynchronous" },
+	{ DM_REPL_SLINK_SYNC, "synchronous" },
+	{ DM_REPL_SLINK_STALL, "stall" },
+};
+
+/* Get slink policy type by name; < 0 on unknown policy name. */
+static int _slink_policy_type(char *name)
+{
+	int r = dm_descr_type(policies, ARRAY_SIZE(policies), name);
+
+	if (r < 0)
+		DMERR("Invalid site link policy %s", name);
+
+	return r;
+}
+
+/* Get slink policy name by type (callers check for NULL result). */
+static const char *
+_slink_policy_name(const int type)
+{
+	return dm_descr_name(policies, ARRAY_SIZE(policies), type);
+}
+
+#define SEPARATOR '+'
+/*
+ * Parse a compound policy string (e.g. "asynchronous+stall") into a
+ * bitmask of policy types; returns < 0 on an invalid policy name.
+ */
+static int
+get_slink_policy(char *arg)
+{
+	int r;
+	/* unsigned long: set_bit() operates on full machine words. */
+	unsigned long policy = 0;
+	char *sep;
+
+	DMDEBUG_LIMIT("%s arg=%s", __func__, arg);
+
+	/*
+	 * Check substrings of the compound policy
+	 * string separated by SEPARATOR.
+	 */
+	do {
+		sep = strchr(arg, SEPARATOR);
+		if (sep)
+			*sep = 0;
+		else
+			sep = arg;
+
+		r = _slink_policy_type(arg);
+		/* Restore the separator and advance to the next token. */
+		if (sep != arg) {
+			arg = sep + 1;
+			*sep = SEPARATOR;
+		}
+
+		if (r < 0)
+			return r;
+		else
+			set_bit(r, &policy);
+	} while (sep != arg);
+
+	return (int) policy;
+}
+
+/*
+ * String print policies into @result, SEPARATOR-joined.
+ * (sz/maxlen are consumed by the DMEMIT macro.)
+ */
+static char *
+snprint_policies(enum dm_repl_slink_policy_type policies,
+		 char *result, size_t maxlen)
+{
+	int bits = sizeof(policies) * 8, i;
+	size_t sz = 0;
+	/* Copy into an unsigned long: test_bit() reads whole words. */
+	unsigned long bitmap = policies;
+
+	*result = 0;
+	for (i = 0; i < bits; i++) {
+		if (test_bit(i, &bitmap)) {
+			const char *name = _slink_policy_name(i);
+
+			if (name) {
+				if (*result)
+					DMEMIT("%c", SEPARATOR);
+
+				DMEMIT("%s", name);
+			}
+		}
+	}
+
+	return result;
+}
+
+/* Fallbehind type <-> name mappings. */
+static const struct dm_str_descr fb_types[] = {
+	{ DM_REPL_SLINK_FB_IOS, "ios" },
+	{ DM_REPL_SLINK_FB_SIZE, "size" },
+	{ DM_REPL_SLINK_FB_TIMEOUT, "timeout" },
+};
+
+/* Return name of fallbehind parameter by type. */
+static const char *
+fb_name(enum dm_repl_slink_fallbehind_type type)
+{
+	return dm_descr_name(fb_types, ARRAY_SIZE(fb_types), type);
+}
+
+/*
+ * String print fallbehind, e.g. "size 128m".
+ *
+ * The stored value is divided by the parse-time multiplier so the
+ * user's original unit-qualified number is reported back.
+ * (sz/maxlen are consumed by the DMEMIT macro.)
+ */
+static char *
+snprint_fallbehind(struct dm_repl_slink_fallbehind *fallbehind,
+		   char *result, size_t maxlen)
+{
+	size_t sz = 0;
+	sector_t value = fallbehind->value;
+
+	sector_div(value, fallbehind->multiplier);
+	DMEMIT("%s %llu%c", fb_name(fallbehind->type),
+	       (unsigned long long) value, fallbehind->unit);
+	return result;
+}
+
+/*
+ * Check and get fallbehind value and type.
+ * Pay attention to unit qualifiers.
+ *
+ * Accepts the old syntax "ios=1000" as well as the new syntax
+ * "ios 1000"; size/timeout values may carry a one-character unit
+ * suffix (e.g. "size 128m", "timeout 30s").
+ */
+static int
+_get_slink_fallbehind(int argc, char **argv,
+		      enum dm_repl_slink_fallbehind_type fb_type,
+		      struct dm_repl_slink_fallbehind *fb)
+{
+	int arg = 0, r;
+	unsigned multi = 1;
+	long long tmp;
+	char *unit;
+	const char *name = fb_name(fb_type);
+
+	fb->unit = 0;
+
+	/* Old syntax e.g. "ios=1000". */
+	r = sscanf(argv[arg] + strlen(name), "=%lld", &tmp) != 1 || tmp < 0;
+	if (r) {
+		if (argc < 2)
+			goto bad_value;
+
+		/* New syntax e.g. "ios 1000". */
+		r = sscanf(argv[++arg], "%lld", &tmp) != 1 || tmp < 0;
+	}
+
+	/* Last character is a unit suffix unless it is a digit. */
+	unit = argv[arg] + strlen(argv[arg]) - 1;
+	unit = (*unit < '0' || *unit > '9') ? unit : NULL;
+
+	if (r)
+		goto bad_value;
+
+	if (unit) {
+		/*
+		 * Unit tables: chars holds lower/upper-case pairs,
+		 * one pair per multiplier[] entry.
+		 */
+		const struct units {
+			const char *chars;
+			const sector_t multiplier[];
+		} *u = NULL;
+		static const struct units size = {
+			"sSkKmMgGtTpPeE",
+#define TWO (sector_t) 2
+			/* sectors, KB,MB, GB, TB, PB, EB */
+			{ 1, 2, TWO<<10, TWO<<20, TWO<<30, TWO<<40, TWO<<50 },
+#undef TWO
+		}, timeout = {
+			"tTsSmMhHdD",
+			/*ms, sec, minute, hour, day */
+			{ 1, 1000, 60*1000, 60*60*1000, 24*60*60*1000 },
+		};
+		const char *c;
+
+		switch (fb_type) {
+		case DM_REPL_SLINK_FB_SIZE:
+			u = &size;
+			goto retrieve;
+		case DM_REPL_SLINK_FB_TIMEOUT:
+			u = &timeout;
+retrieve:
+			/* Skip to unit identifier character. */
+			for (c = u->chars, multi = 0;
+			     *c && *c != *unit;
+			     c++, multi++)
+				;
+
+			if (*c) {
+				fb->unit = *c;
+				/*
+				 * Two chars (lower/upper case) per
+				 * multiplier[] entry -> pair index is
+				 * multi / 2.
+				 */
+				multi = u->multiplier[multi / 2];
+			} else
+				goto bad_unit;
+
+			/* Fall through; no further conversion needed. */
+		case DM_REPL_SLINK_FB_IOS:
+			break;
+
+		default:
+			BUG();
+		}
+	}
+
+	fb->type = fb_type;
+	fb->multiplier = multi;
+	fb->value = tmp * multi;
+	return 0;
+
+bad_value:
+	DMERR("invalid fallbehind %s value", argv[0]);
+	return -EINVAL;
+
+bad_unit:
+	DMERR("invalid slink fallbehind unit");
+	return -EINVAL;
+}
+
+/*
+ * Parse a fallbehind ctr argument ("ios"/"size"/"timeout") from
+ * argv[0] and fill *fb; returns -EINVAL for an unknown type.
+ */
+static int
+get_slink_fallbehind(int argc, char **argv, struct dm_repl_slink_fallbehind *fb)
+{
+	const struct dm_str_descr *f = ARRAY_END(fb_types);
+
+	/* Walk the type table backwards, matching argv[0] by name prefix. */
+	while (f-- > fb_types) {
+		/* Check for fallbehind argument. */
+		if (!strnicmp(STR_LEN(fb_name(f->type), argv[0])))
+			return _get_slink_fallbehind(argc, argv, f->type, fb);
+	}
+
+	DMERR("invalid fallbehind type %s", argv[0]);
+	return -EINVAL;
+}
+
+/* Return region on device for given sector. */
+static sector_t
+sector_to_region(struct sdev *dev, sector_t sector)
+{
+	/* sector_div() divides in place and returns the remainder. */
+	sector_div(sector, dev->ti->split_io);
+	return sector;
+}
+
+/*
+ * Check dm_repl_slink and slink ok.
+ *
+ * Returns the internal slink context, or an ERR_PTR() on NULL/invalid
+ * input; an incoming ERR_PTR is handed straight back to the caller.
+ */
+static struct slink *
+slink_check(struct dm_repl_slink *slink)
+{
+	struct slink *sl;
+
+	if (unlikely(!slink))
+		return ERR_PTR(-EINVAL);
+
+	if (unlikely(IS_ERR(slink)))
+		return (struct slink *) slink;
+
+	sl = slink->context;
+	return sl ? sl : ERR_PTR(-EINVAL);
+}
+
+/* Slab/mempool descriptor, one entry per cache_type. */
+struct cache_defs {
+	enum cache_type type;
+	const int min;			/* Minimum mempool reserve. */
+	struct kmem_cache *cache;	/* Backing slab. */
+	const char *name;		/* Slab name. */
+	const size_t size;		/* Object size in bytes. */
+};
+
+/*
+ * Slabs for the copy context structures and for device test I/O buffers.
+ * NOTE(review): .cache is NULL here — presumably created at module init
+ * (not visible in this chunk); confirm before the mempools are built.
+ */
+static struct cache_defs cache_defs[] = {
+	{ COPY_CACHE, MIN_IOS, NULL,
+	  "dm_repl_slink_copy", sizeof(struct copy_context) },
+	{ TEST_CACHE, MIN_IOS, NULL,
+	  "dm_repl_slink_test", SLINK_TEST_SIZE },
+};
+
+/*
+ * Release resources when last reference dropped.
+ *
+ * Gets called with lock hold to atomically delete slink from list.
+ */
+static void
+slink_release(struct kref *ref)
+{
+	struct slink *sl = container_of(ref, struct slink, ref);
+
+	DMDEBUG("%s slink=%d released", __func__, sl->number);
+	kfree(sl);
+}
+
+/* Take out reference on slink. */
+static struct slink *
+slink_get(struct slink *sl)
+{
+	kref_get(&sl->ref);
+	return sl;
+}
+
+/*
+ * Drop reference on slink and destroy it on last release.
+ * Returns nonzero when the final reference was dropped.
+ */
+static int
+slink_put(struct slink *sl)
+{
+	return kref_put(&sl->ref, slink_release);
+}
+
+/*
+ * Find slink on global slink list by number.
+ *
+ * Caller holds repl_slinks->lock; returns the slink with a fresh
+ * reference taken, or ERR_PTR(-ENOENT) when not on the list.
+ */
+static struct slink *
+slink_get_by_number(struct dm_repl_log_slink_list *repl_slinks,
+		    unsigned slink_number)
+{
+	struct slink *sl;
+
+	BUG_ON(!repl_slinks);
+
+	list_for_each_entry(sl, &repl_slinks->list, lists[SLINK_REPLOG]) {
+		if (slink_number == sl->number)
+			return slink_get(sl);
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+/*
+ * Destroy slink object: unlink it from the replog list and tear down
+ * its workqueue, kcopyd/dm-io clients and mempools.
+ *
+ * NOTE(review): this releases the resources but never drops the final
+ * kref (slink_release()/kfree) — the error path of slink_create()
+ * appears to leak the struct slink allocation; confirm against the
+ * remaining (unseen) callers before changing.
+ */
+static void
+slink_destroy(struct slink *sl)
+{
+	struct slink_io *io;
+	struct cache_defs *cd;
+
+	_BUG_ON_PTR(sl);
+	_BUG_ON_PTR(sl->repl_slinks);
+	io = &sl->io;
+
+	/* Unlink atomically under the global replog slink list lock. */
+	write_lock(&sl->repl_slinks->lock);
+	if (!list_empty(SLINK_REPLOG_LIST(sl)))
+		list_del(SLINK_REPLOG_LIST(sl));
+	write_unlock(&sl->repl_slinks->lock);
+
+	/* Destroy workqueue before freeing resources. */
+	if (io->wq)
+		destroy_workqueue(io->wq);
+
+	if (io->kcopyd_client)
+		dm_kcopyd_client_destroy(io->kcopyd_client);
+
+	if (io->dm_io_client)
+		dm_io_client_destroy(io->dm_io_client);
+
+	cd = ARRAY_END(cache_defs);
+	while (cd-- > cache_defs) {
+		if (io->pool[cd->type]) {
+			mempool_destroy(io->pool[cd->type]);
+			io->pool[cd->type] = NULL;
+		}
+	}
+}
+
+/*
+ * Get slink from global slink list by number or create
+ * new one and put it on list; take out reference.
+ *
+ * With params == NULL this is a pure lookup: a missing slink's
+ * ERR_PTR(-ENOENT) is passed through. An already existing slink
+ * yields ERR_PTR(-EEXIST).
+ */
+static void do_slink(struct work_struct *ws);
+static struct slink *
+slink_create(struct dm_repl_slink *slink,
+	     struct dm_repl_log_slink_list *repl_slinks,
+	     struct slink_params *params, unsigned slink_number)
+{
+	int i, r;
+	struct slink *sl;
+
+	DMDEBUG_LIMIT("%s %u", __func__, slink_number);
+
+	/* Make sure, slink0 exists when creating slink > 0. */
+	if (slink_number) {
+		struct slink *sl0;
+
+		read_lock(&repl_slinks->lock);
+		sl0 = slink_get_by_number(repl_slinks, 0);
+		read_unlock(&repl_slinks->lock);
+
+		if (IS_ERR(sl0)) {
+			DMERR("Can't create slink=%u w/o slink0.",
+			      slink_number);
+			return ERR_PTR(-EPERM);
+		}
+
+		BUG_ON(slink_put(sl0));
+	}
+
+	read_lock(&repl_slinks->lock);
+	sl = slink_get_by_number(repl_slinks, slink_number);
+	read_unlock(&repl_slinks->lock);
+
+	if (IS_ERR(sl)) {
+		struct slink *sl_tmp;
+		struct slink_io *io;
+		struct cache_defs *cd;
+
+		/* Lookup-only mode: pass the ENOENT error through. */
+		if (!params)
+			return sl;
+
+		/* Preallocate internal slink struct. */
+		sl = kzalloc(sizeof(*sl), GFP_KERNEL);
+		if (unlikely(!sl))
+			return ERR_PTR(-ENOMEM);
+
+		rwlock_init(&sl->lock);
+		kref_init(&sl->ref);
+
+#ifdef CONFIG_LOCKDEP
+		{
+			/* Per-slink-number lock class: see locking hierarchy above. */
+			static struct lock_class_key slink_number_lock;
+
+			lockdep_set_class_and_subclass(&sl->lock,
+						       &slink_number_lock,
+						       slink_number);
+		}
+#endif
+
+		i = ARRAY_SIZE(sl->lists);
+		while (i--)
+			INIT_LIST_HEAD(sl->lists + i);
+
+		/* Copy (parsed) fallbehind arguments accross. */
+		io = &sl->io;
+		sl->params = *params;
+		sl->number = slink_number;
+		sl->repl_slinks = repl_slinks;
+		sl->slink = slink;
+		slink->context = sl;
+
+		/* Create kcopyd client for data copies to slinks. */
+		r = dm_kcopyd_client_create(COPY_PAGES, &io->kcopyd_client);
+		if (unlikely(r < 0)) {
+			io->kcopyd_client = NULL;
+			goto bad;
+		}
+
+		/* Create dm-io client context for test I/Os on slinks. */
+		io->dm_io_client = dm_io_client_create(1);
+		if (unlikely(IS_ERR(io->dm_io_client))) {
+			r = PTR_ERR(io->dm_io_client);
+			io->dm_io_client = NULL;
+			goto bad;
+		}
+
+		r = -ENOMEM;
+
+		/*
+		 * Create slab mempools for copy contexts and test buffers.
+		 * NOTE(review): relies on cache_defs[].cache having been
+		 * created beforehand (module init; not in this chunk).
+		 */
+		cd = ARRAY_END(cache_defs);
+		while (cd-- > cache_defs) {
+			io->pool[cd->type] =
+				mempool_create_slab_pool(cd->min, cd->cache);
+			if (unlikely(!io->pool[cd->type])) {
+				DMERR("Failed to create mempool %p",
+				      io->pool[cd->type]);
+				goto bad;
+			}
+		}
+
+		io->wq = create_singlethread_workqueue(DAEMON);
+		if (likely(io->wq))
+			INIT_DELAYED_WORK(&sl->io.dws, do_slink);
+		else
+			goto bad;
+
+		/* Add to replog list. */
+		write_lock(&repl_slinks->lock);
+		sl_tmp = slink_get_by_number(repl_slinks, slink_number);
+		if (likely(IS_ERR(sl_tmp))) {
+			/* We won the race -> add to list. */
+			list_add_tail(SLINK_REPLOG_LIST(sl),
+				      &repl_slinks->list);
+			write_unlock(&repl_slinks->lock);
+		} else {
+			/* We lost the race, take the winner. */
+			write_unlock(&repl_slinks->lock);
+			/* Will release sl. */
+			slink_destroy(sl);
+			sl = sl_tmp;
+		}
+
+		return sl;
+	}
+
+	slink_put(sl);
+	return ERR_PTR(-EEXIST);
+
+bad:
+	slink_destroy(sl);
+	return ERR_PTR(r);
+}
+
+/*
+ * Return slink count.
+ * NOTE(review): walks repl_slinks->list — caller should hold
+ * repl_slinks->lock.
+ */
+static unsigned
+slink_count(struct slink *sl)
+{
+	unsigned count = 0;
+	struct slink *sl_cur;
+
+	_BUG_ON_PTR(sl);
+
+	list_for_each_entry(sl_cur, &sl->repl_slinks->list, lists[SLINK_REPLOG])
+		count++;
+
+	return count;
+}
+
+/* Return number of regions for device (rounded up). */
+static inline sector_t
+region_count(struct sdev *dev)
+{
+	return dm_sector_div_up(dev->ti->len, dev->ti->split_io);
+}
+
+
+/*
+ * Site link worker.
+ */
+/* Queue (optionally delayed) io work. */
+static void
+wake_do_slink_delayed(struct slink *sl, unsigned long delay)
+{
+	struct delayed_work *dws = &sl->io.dws;
+
+	if (delay) {
+		/* Avoid delaying if immediate worker run already requested. */
+		if (SlinkImmediateWork(sl))
+			return;
+	} else
+		SetSlinkImmediateWork(sl);
+
+	/* Re-arm: a pending delayed run is superseded by this request. */
+	if (delayed_work_pending(dws))
+		cancel_delayed_work(dws);
+
+	queue_delayed_work(sl->io.wq, dws, delay);
+}
+
+/* Queue io work immediately. */
+static void
+wake_do_slink(void *context)
+{
+	wake_do_slink_delayed(context, 0);
+}
+
+/* Set/get device test timeouts. */
+/* FIXME: algorithm to have flexible test timing? */
+static inline void
+set_dev_test_time(struct sdev *dev)
+{
+	unsigned long time = jiffies + SLINK_TEST_JIFFIES;
+
+	/* Check jiffies wrap. */
+	if (unlikely(time < jiffies))
+		time = SLINK_TEST_JIFFIES;
+
+	dev->io.test.time = time;
+}
+
+/* Return the next scheduled test time for @dev (absolute jiffies). */
+static inline unsigned long
+get_dev_test_time(struct sdev *dev)
+{
+	return dev->io.test.time;
+}
+
+/*
+ * Get device object reference count.
+ *
+ * A reference count > 1 indicates IO in flight on the device.
+ *
+ */
+static int dev_io(struct sdev *dev)
+{
+	return atomic_read(&dev->ref.refcount) > 1;
+}
+
+/* Take device object reference out. */
+static struct sdev *dev_get(struct sdev *dev)
+{
+	kref_get(&dev->ref);
+	return dev;
+}
+
+/* Release sdev object (kref callback): frees path string and struct. */
+static void
+dev_release(struct kref *ref)
+{
+	struct sdev *dev = container_of(ref, struct sdev, ref);
+	struct slink *sl = dev->sl;
+
+	_BUG_ON_PTR(sl);
+
+	kfree(dev->dev.params.path);
+	DMDEBUG("%s dev=%d slink=%d released", __func__,
+		dev->dev.number, sl->number);
+	kfree(dev);
+}
+
+/*
+ * Drop device object reference.
+ * Wakes teardown waiters once no IO references remain.
+ */
+static int dev_put(struct sdev *dev)
+{
+	int r = kref_put(&dev->ref, dev_release);
+
+	if (!r) {
+		/* Object still alive: wake waiters when no IO in flight. */
+		if (!dev_io(dev))
+			wake_up(&dev->io.waiters);
+	}
+
+	return r;
+}
+
+/*
+ * Find device by device number.
+ * Caller holds sl->lock; returns device with a fresh reference
+ * taken, or ERR_PTR(-ENODEV).
+ */
+static struct sdev *dev_get_by_number(struct slink *sl, int dev_number)
+{
+	struct sdev *dev;
+
+	_BUG_ON_PTR(sl);
+
+	list_for_each_entry(dev, SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
+		if (dev_number == dev->dev.number)
+			return dev_get(dev);
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Find device by bdev, comparing against the gendisk of each
+ * device's mapped device.
+ * NOTE(review): dm_put() is issued before gd is compared — this
+ * relies on the gendisk outliving the md reference; confirm.
+ */
+static struct sdev *dev_get_by_bdev(struct slink *sl,
+				    struct block_device *bdev)
+{
+	struct sdev *dev;
+
+	_BUG_ON_PTR(sl);
+
+	list_for_each_entry(dev, SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
+		struct mapped_device *md = dm_table_get_md(dev->ti->table);
+		struct gendisk *gd = dm_disk(md);
+
+		dm_put(md);
+
+		if (bdev->bd_disk == gd)
+			return dev_get(dev);
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/* Find device by its constructor path argument. */
+static struct sdev *dev_get_by_path(struct slink *sl,
+				    const char *path)
+{
+	struct sdev *dev;
+
+	_BUG_ON_PTR(sl);
+
+	list_for_each_entry(dev, SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
+		if (!strcmp(dev->dev.params.path, path))
+			return dev_get(dev);
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Look up @dev on any slink of the replog.
+ *
+ * Open devices are matched by bdev, closed ones by path; each other
+ * slink is locked while searched (the caller already holds @sl's).
+ */
+static struct sdev *
+dev_get_on_any_slink(struct slink *sl, struct sdev *dev)
+{
+	struct slink *sl_cur = NULL;
+	struct sdev *dev_r;
+
+	list_for_each_entry(sl_cur, &sl->repl_slinks->list,
+			    lists[SLINK_REPLOG]) {
+		/* Check by path if device already present. */
+		if (sl_cur != sl)
+			read_lock(&sl_cur->lock);
+
+		/* Check by bdev/number depending on device open or not. */
+		dev_r = DevOpen(dev) ?
+			dev_get_by_bdev(sl_cur, dev->dev.dm_dev->bdev) :
+			dev_get_by_path(sl_cur, dev->dev.params.path);
+
+		if (sl_cur != sl)
+			read_unlock(&sl_cur->lock);
+
+		if (unlikely(!IS_ERR(dev_r)))
+			return dev_r;
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+/* Callback for site link accessibility tests (dm_io completion). */
+static void
+dev_test_endio(unsigned long error, void *context)
+{
+	struct sdev *dev = context;
+	struct slink *sl;
+
+	_BUG_ON_PTR(dev);
+	sl = dev->sl;
+	_BUG_ON_PTR(sl);
+
+	if (error)
+		/* Still inaccessible: schedule the next test. */
+		set_dev_test_time(dev);
+	else {
+		/* Device reachable again: clear its error state. */
+		ClearDevErrorRead(dev);
+		ClearDevErrorWrite(dev);
+	}
+
+	/* Release test io buffer. */
+	free_test_buffer(dev->io.test.buffer, sl);
+
+	ClearSlinkTestActive(sl);
+	BUG_ON(dev_put(dev)); /* Release reference. */
+	wake_do_slink(sl);
+}
+
+/*
+ * Submit a read to sector 0 of a remote device to test access to it.
+ *
+ * Asynchronous: completion is reported to dev_test_endio(), which
+ * frees the test buffer and drops the caller's device reference.
+ */
+static void
+dev_test(struct slink *sl, struct sdev *dev)
+{
+	struct dm_io_region region = {
+		.bdev = dev->dev.dm_dev->bdev,
+		.sector = 0,
+		.count = 1,
+	};
+	struct dm_io_request req = {
+		.bi_rw = READ,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.addr = alloc_test_buffer(sl),
+		.notify.fn = dev_test_endio,
+		.notify.context = dev,
+		.client = sl->io.dm_io_client,
+	};
+
+	/* FIXME: flush_workqueue should care for race. */
+	sl->io.dev_test = dev;
+	dev->io.test.buffer = req.mem.ptr.addr;
+	set_dev_test_time(dev);
+	BUG_ON(dm_io(&req, 1, &region, NULL));
+}
+
+
+/*
+ * Callback replog handler in case of a region
+ * resynchronized or a device recovered.
+ */
+static inline void
+recover_callback(struct slink *sl, int read_err, int write_err)
+{
+	struct dm_repl_slink_notify_ctx recover;
+
+	_BUG_ON_PTR(sl);
+	/* Snapshot the callback under the lock; invoke it outside. */
+	read_lock(&sl->lock);
+	recover = sl->recover;
+	read_unlock(&sl->lock);
+
+	/* Optionally call back site link recovery. */
+	if (likely(recover.fn))
+		recover.fn(read_err, write_err, recover.context);
+}
+
+/*
+ * Try to open device; no-op when already open.
+ * On failure flags a read error and schedules the next access test.
+ */
+static int
+try_dev_open(struct sdev *dev)
+{
+	int r = 0;
+
+	if (!DevOpen(dev)) {
+		/* Try getting device with limit checks. */
+		r = dm_get_device(dev->ti, dev->dev.params.path,
+				  0, dev->ti->len,
+				  dm_table_get_mode(dev->ti->table),
+				  &dev->dev.dm_dev);
+		if (r) {
+			set_dev_test_time(dev);
+			SetDevErrorRead(dev);
+		} else {
+			SetDevOpen(dev);
+			ClearDevErrorRead(dev);
+		}
+	}
+
+	return r;
+}
+
+/*
+ * Check devices for error condition and initiate test io on those.
+ *
+ * Selects at most one due device (dev_t) per run; when none is due,
+ * reschedules itself for the closest pending test time.
+ */
+static void
+do_slink_test(struct slink *sl)
+{
+	int r;
+	unsigned error_count = 0;
+	/* FIXME: jiffies may differ on MP ? */
+	unsigned long delay = ~0, j = jiffies;
+	struct sdev *dev, *dev_t = NULL;
+
+	read_lock(&sl->lock);
+	list_for_each_entry(dev, SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
+		if ((DevErrorRead(dev) || DevErrorWrite(dev))) {
+			error_count++;
+
+			if (!DevTeardown(dev) && !DevSuspended(dev)) {
+				unsigned long t = get_dev_test_time(dev);
+
+				/* Check we didn't reach the test time yet. */
+				if (time_before(j, t)) {
+					unsigned long d = t - j;
+
+					if (d < delay)
+						delay = d;
+				} else {
+					/* Test candidate; pin the slink. */
+					dev_t = dev;
+					slink_get(sl);
+					break;
+				}
+			}
+		}
+	}
+
+	read_unlock(&sl->lock);
+
+	if (!error_count) {
+		/*
+		 * If all are ok -> reset site link error state.
+		 *
+		 * We can't allow submission of writes
+		 * before all devices are accessible.
+		 */
+		/*
+		 * FIXME: I actually test the remote device only so
+		 * I shouldn't update state on the local reader side!
+		 *
+		 * Question is, where to update this or leave it
+		 * to the caller to fail fataly when it can't read data
+		 * of off the log or the replicated device any more.
+		 */
+		if (TestClearSlinkErrorRead(sl))
+			error_count++;
+
+		if (TestClearSlinkErrorWrite(sl))
+			error_count++;
+
+		if (error_count)
+			recover_callback(sl, 0, 0);
+
+		return;
+	}
+
+	/* Only act when a device is actually due for testing. */
+	if (dev_t) {
+		j = jiffies;
+
+		/* Check for jiffies overrun. */
+		if (unlikely(j < SLINK_TEST_JIFFIES))
+			set_dev_test_time(dev_t);
+
+		if (!TestSetSlinkTestActive(sl)) {
+			dev_get(dev_t); /* Take out device reference. */
+
+			/* Check device is open or open it. */
+			r = try_dev_open(dev_t);
+			if (r) {
+				ClearSlinkTestActive(sl);
+				BUG_ON(dev_put(dev_t));
+			} else
+				/* If open, do test. */
+				dev_test(sl, dev_t);
+		}
+
+		/* Pairs with the slink_get() taken when dev_t was chosen. */
+		slink_put(sl);
+	}
+
+	if (!SlinkTestActive(sl) && delay < ~0)
+		wake_do_slink_delayed(sl, delay);
+}
+
+/* Set device error state and throw a dm event. */
+static void
+resync_error_dev(struct sdev *dev)
+{
+	SetDevErrorRead(dev);
+	dm_table_event(dev->ti->table);
+}
+
+/*
+ * Resync copy callback (kcopyd completion).
+ *
+ * Flags errors on the source/destination device respectively,
+ * advances the copy window and drops the references taken by
+ * do_slink_resync() before rewaking the worker.
+ *
+ * NOTE(review): dev_to->sl is read after dev_put(dev_to) — relies on
+ * the resync list reference keeping the object alive; confirm.
+ */
+static void
+resync_endio(int read_err, unsigned long write_err, void *context)
+{
+	struct sdev *dev_from, *dev_to = context;
+	struct sdev_resync *resync;
+
+	_BUG_ON_PTR(dev_to);
+	resync = &dev_to->io.resync;
+	dev_from = resync->from;
+	_BUG_ON_PTR(dev_from);
+
+	if (unlikely(read_err))
+		resync_error_dev(dev_from);
+
+	if (unlikely(write_err))
+		resync_error_dev(dev_to);
+
+	/* Advance window: next chunk starts where this copy ended. */
+	resync->start += resync->len;
+	ClearDevResyncing(dev_to);
+	dev_put(dev_from);
+	dev_put(dev_to);
+	wake_do_slink(dev_to->sl);
+}
+
+/* Unplug a block devices queue. */
+static inline void
+dev_unplug(struct block_device *bdev)
+{
+	blk_unplug(bdev_get_queue(bdev));
+}
+
+/*
+ * Resync copy function: kcopyd-copy the window starting at
+ * resync->start from the local device to the remote one, clamped to
+ * the target length. Completion is reported to resync_endio().
+ */
+static void
+resync_copy(struct sdev *dev_to, struct sdev *dev_from,
+	    struct sdev_resync *resync)
+{
+	sector_t max_len = dev_from->ti->len;
+	struct dm_io_region src = {
+		.bdev = dev_from->dev.dm_dev->bdev,
+		.sector = resync->start,
+	}, dst = {
+		.bdev = dev_to->dev.dm_dev->bdev,
+		.sector = resync->start,
+	};
+
+	/* Clamp the copy to the device end. */
+	src.count = dst.count = unlikely(src.sector + resync->len > max_len) ?
+				max_len - src.sector : resync->len;
+	BUG_ON(!src.count);
+	BUG_ON(src.sector + src.count > max_len);
+	dev_to->io.resync.from = dev_from;
+	SetDevResyncing(dev_to);
+	BUG_ON(dm_kcopyd_copy(dev_to->io.kcopyd_client, &src, 1, &dst, 0,
+			      resync_endio, dev_to));
+	/* Kick both queues so the copy IO gets issued immediately. */
+	dev_unplug(src.bdev);
+	dev_unplug(dst.bdev);
+}
+
+/* Return length of segment to resynchronize (sectors). */
+static inline sector_t
+resync_len(struct sdev *dev)
+{
+	struct sdev_resync *resync = &dev->io.resync;
+	/* Copy at most one region... */
+	sector_t len = dev->ti->split_io;
+
+	/* ...and never more than RESYNC_SIZE in a single chunk... */
+	if (len > RESYNC_SIZE)
+		len = RESYNC_SIZE;
+
+	/* ...and clamp the final chunk to the window end. */
+	if (resync->start + len > resync->end)
+		len = resync->end - resync->start;
+
+	return len;
+}
+
+/*
+ * Initiate recovery on all site links registered for.
+ *
+ * Splices the slink's resync list onto a private list for unlocked
+ * processing, starts at most one kcopyd window copy per device and
+ * marks finished regions in sync in the dirty log.
+ */
+static void
+do_slink_resync(struct slink *sl)
+{
+	struct slink *sl0;
+	struct sdev *dev_from, *dev_n, *dev_to;
+	struct list_head resync_list;
+
+	_BUG_ON_PTR(sl);
+
+	/* slink0 is the local site; it is never a resync target. */
+	if (!sl->number)
+		return;
+
+	/*
+	 * Protect the global site link list from
+	 * changes while getting slink 0.
+	 */
+	read_lock(&sl->repl_slinks->lock);
+	sl0 = slink_get_by_number(sl->repl_slinks, 0);
+	read_unlock(&sl->repl_slinks->lock);
+
+	_BUG_ON_PTR(sl0);
+
+	/*
+	 * Quickly take out resync list for local unlocked processing
+	 * and take references per device for suspend/delete race prevention
+	 */
+	INIT_LIST_HEAD(&resync_list);
+	read_lock(&sl0->lock);
+	write_lock(&sl->lock);
+	SetSlinkResyncProcessing(sl);
+
+	list_splice(SLINK_RESYNC_LIST(sl), &resync_list);
+	INIT_LIST_HEAD(SLINK_RESYNC_LIST(sl));
+
+	list_for_each_entry(dev_to, &resync_list, lists[SDEV_RESYNC]) {
+		dev_from = dev_get_by_number(sl0, dev_to->dev.number);
+		_BUG_ON_PTR(dev_from);
+		dev_get(dev_to);
+		/* Memorize device to copy from. */
+		dev_to->io.resync.from = dev_from;
+	}
+
+	write_unlock(&sl->lock);
+	read_unlock(&sl0->lock);
+
+	/*
+	 * Process all devices needing
+	 * resynchronization on the private list.
+	 *
+	 * "dev_to" is device to copy to.
+	 */
+	list_for_each_entry(dev_to, &resync_list, lists[SDEV_RESYNC]) {
+		unsigned region_size;
+		struct sdev_resync *resync;
+		struct dm_dirty_log *dl = dev_to->dev.dl;
+
+		/* Can't resync w/o dirty log. */
+		_BUG_ON_PTR(dl);
+
+		/* Device closed/copy active/suspended/being torn down/error. */
+		if (!DevOpen(dev_to) ||
+		    DevResyncing(dev_to) ||
+		    DevSuspended(dev_to) ||
+		    DevTeardown(dev_to) ||
+		    DevErrorWrite(dev_to))
+			continue;
+
+		/* Device to copy from. */
+		resync = &dev_to->io.resync;
+		dev_from = resync->from;
+
+		/* slink0 device suspended/being torn down or I/O error. */
+		if (DevSuspended(dev_from) ||
+		    DevTeardown(dev_from) ||
+		    DevErrorRead(dev_from))
+			continue;
+
+		/* No copy active if resync->end == 0. */
+		if (!resync->end) {
+			int r;
+			sector_t region;
+
+			/* Ask dirty region log for another region to sync. */
+			r = dl->type->get_resync_work(dl, &region);
+			if (r) {
+				write_lock(&sl0->lock);
+
+				/* Region is being written to -> postpone. */
+				if (unlikely(SlinkWriter(sl0) &&
+					     resync->writer_region == region)) {
+					write_unlock(&sl0->lock);
+					continue;
+				}
+
+				/* Set up the copy window for this region. */
+				region_size = dev_to->ti->split_io;
+				resync->region = region;
+				resync->start = region * region_size;
+				resync->end = resync->start + region_size;
+				if (unlikely(resync->end > dev_to->ti->len))
+					resync->end = dev_to->ti->len;
+
+				write_unlock(&sl0->lock);
+			} else {
+				/* No more regions to recover. */
+				SetDevResyncEnd(dev_to);
+				continue;
+			}
+		}
+
+		/* More to copy for this region. */
+		if (resync->start < resync->end) {
+			resync->len = resync_len(dev_to);
+			BUG_ON(!resync->len);
+
+			/*
+			 * Take out references in order
+			 * to not race with deletion.
+			 *
+			 * resync_endio will release them.
+			 */
+			dev_get(dev_from);
+			dev_get(dev_to);
+			resync_copy(dev_to, dev_from, resync);
+
+			/*
+			 * Done with copying this region:
+			 * mark in sync and flush dirty log.
+			 */
+		} else {
+			dl->type->set_region_sync(dl, resync->region, 1);
+			dl->type->flush(dl);
+
+			/* Optionally call back site link recovery. */
+			recover_callback(sl, 0, 0);
+			resync->end = 0;
+
+			/* Another run to check for more resync work. */
+			wake_do_slink(sl);
+		}
+	}
+
+	/* Put race device references. */
+	read_lock(&sl0->lock);
+	write_lock(&sl->lock);
+	list_for_each_entry_safe(dev_to, dev_n, &resync_list,
+				 lists[SDEV_RESYNC]) {
+		if (TestClearDevResyncEnd(dev_to))
+			list_del_init(SDEV_RESYNC_LIST(dev_to));
+
+		dev_from = dev_get_by_number(sl0, dev_to->dev.number);
+		/* 1 put just taken, 1 put for the one initially taken. */
+		_BUG_ON_PTR(dev_from);
+		BUG_ON(dev_put(dev_from));
+		BUG_ON(dev_put(dev_from));
+		BUG_ON(dev_put(dev_to));
+	}
+
+	list_splice(&resync_list, SLINK_RESYNC_LIST(sl));
+	ClearSlinkResyncProcessing(sl);
+
+	write_unlock(&sl->lock);
+	read_unlock(&sl0->lock);
+
+	BUG_ON(slink_put(sl0));
+}
+
+/*
+ * Main worker thread function: run pending device accessibility
+ * tests, then process devices awaiting resynchronization.
+ */
+static void
+do_slink(struct work_struct *ws)
+{
+	int must_resync;
+	struct slink *sl = container_of(ws, struct slink, io.dws.work);
+
+	if (!SlinkTestActive(sl))
+		do_slink_test(sl);
+
+	/* NOTE(review): a read_lock would suffice for this check. */
+	write_lock(&sl->lock);
+	must_resync = !list_empty(SLINK_RESYNC_LIST(sl));
+	write_unlock(&sl->lock);
+
+	if (must_resync)
+		do_slink_resync(sl);
+
+	ClearSlinkImmediateWork(sl);
+}
+
+/*
+ * End site link worker.
+ */
+
+/* Allocate/init an sdev structure and dm_get_device(). */
+static struct sdev *
+dev_create(struct slink *sl, struct dm_target *ti,
+ char *path, unsigned dev_number)
+{
+ int i, r;
+ struct sdev *dev;
+
+ _BUG_ON_PTR(sl);
+ _BUG_ON_PTR(ti);
+ _BUG_ON_PTR(path);
+
+ /* Preallocate site link device structure. */
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (unlikely(!dev))
+ goto bad_dev_alloc;
+
+ dev->ti = ti;
+ dev->dev.number = dev_number;
+ init_waitqueue_head(&dev->io.waiters);
+ kref_init(&dev->ref);
+
+ i = ARRAY_SIZE(dev->lists);
+ while (i--)
+ INIT_LIST_HEAD(dev->lists + i);
+
+ dev->dev.params.path = kstrdup(path, GFP_KERNEL);
+ if (unlikely(!dev->dev.params.path))
+ goto bad_dev_path_alloc;
+
+ /* FIXME: closed device handling 29.09.2009
+ *
+ * Allow inaccessible devices to be created, hence
+ * opening them during transport failure discovery.
+ */
+ ClearDevOpen(dev);
+ try_dev_open(dev);
+
+ /*
+ * Create kcopyd client for resynchronization copies to slinks.
+ * Only needed for remote devices and hence slinks > 0.
+ */
+ if (sl->number) {
+ r = dm_kcopyd_client_create(RESYNC_PAGES,
+ &dev->io.kcopyd_client);
+ if (unlikely(r < 0))
+ goto bad_kcopyd_client;
+ }
+
+ dev->sl = sl;
+ SetDevSuspended(dev);
+ return dev;
+
+bad_dev_alloc:
+ DMERR("site link device allocation failed");
+ return ERR_PTR(-ENOMEM);
+
+bad_dev_path_alloc:
+ DMERR("site link device path allocation failed");
+ r = -ENOMEM;
+ goto bad;
+
+bad_kcopyd_client:
+ DMERR("site link device allocation failed");
+bad:
+ dev_release(&dev->ref);
+ return ERR_PTR(r);
+}
+
+/* Add an sdev to an slink. */
+static int
+dev_add(struct slink *sl, struct sdev *dev)
+{
+ int r;
+ struct slink *sl0;
+ struct sdev *dev_tmp;
+
+ _BUG_ON_PTR(sl);
+ _BUG_ON_PTR(dev);
+
+ /* Check by number if device got already added to this site link. */
+ dev_tmp = dev_get_by_number(sl, dev->dev.number);
+ if (unlikely(!IS_ERR(dev_tmp)))
+ goto bad_device;
+
+ /* Check by bdev/path if device got already added to any site link. */
+ dev_tmp = dev_get_on_any_slink(sl, dev);
+ if (unlikely(!IS_ERR(dev_tmp)))
+ goto bad_device;
+
+ /* Sibling device on local slink 0 registered yet ? */
+ if (sl->number) {
+ sl0 = slink_get_by_number(sl->repl_slinks, 0);
+ if (unlikely(IS_ERR(sl0)))
+ goto bad_slink0;
+
+ read_lock(&sl0->lock);
+ dev_tmp = dev_get_by_number(sl0, dev->dev.number);
+ read_unlock(&sl0->lock);
+
+ BUG_ON(slink_put(sl0));
+
+ if (unlikely(IS_ERR(dev_tmp)))
+ goto bad_sibling;
+
+ BUG_ON(dev_put(dev_tmp));
+ }
+
+ /* Add to slink's list of devices. */
+ list_add_tail(SDEV_SLINK_LIST(dev), SLINK_DEVS_LIST(sl));
+
+ /* All ok, add to list of remote devices to resync. */
+ if (sl->number) {
+ SetDevResync(dev);
+ list_add_tail(SDEV_RESYNC_LIST(dev), SLINK_RESYNC_LIST(sl));
+ }
+
+ return 0;
+
+bad_device:
+ DMERR("device already exists");
+ BUG_ON(dev_put(dev_tmp));
+ return -EBUSY;
+
+bad_slink0:
+ DMERR("SLINK0 doesn't exit!");
+ r = PTR_ERR(sl0);
+ return r;
+
+bad_sibling:
+ DMERR("Sibling device=%d on SLINK0 doesn't exist!", dev->dev.number);
+ r = PTR_ERR(dev_tmp);
+ return r;
+}
+
+/*
+ * Set up dirty log for new device.
+ *
+ * For local devices, no dirty log is allowed.
+ * For remote devices, a dirty log is mandatory.
+ *
+ * dirtylog_type = "nolog"/"core"/"disk",
+ * #dirtylog_params = 0-3 (1-2 for core dirty log type, 3 for
+ * disk dirty log only)
+ * dirtylog_params = [dirty_log_path] region_size [[no]sync])
+ */
+static int
+dirty_log_create(struct slink *sl, struct sdev *dev, unsigned argc, char **argv)
+{
+ struct dm_dirty_log *dl;
+
+ SHOW_ARGV;
+
+ _BUG_ON_PTR(sl);
+ _BUG_ON_PTR(dev);
+
+ if (unlikely(argc < 2))
+ goto bad_params;
+
+ /* Check for no dirty log with local devices. */
+ if (!strcmp(argv[0], "nolog") ||
+ !strcmp(argv[0], "-")) {
+ if (argc != 2 ||
+ strcmp(argv[1], "0"))
+ goto bad_params;
+
+ dev->dev.dl = NULL;
+ dev->io.split_io = DM_REPL_MIN_SPLIT_IO;
+
+ /* Mandatory dirty logs on SLINK > 0. */
+ if (sl->number)
+ goto bad_need_dl;
+
+ return 0;
+ }
+
+ /* No dirty logs on SLINK0. */
+ if (unlikely(!sl->number))
+ goto bad_site_link;
+
+ dl = dm_dirty_log_create(argv[0], dev->ti, argc - 2, argv + 2);
+ if (unlikely(!dl))
+ goto bad_dirty_log;
+
+ dev->dev.dl = dl;
+ dev->io.split_io = dl->type->get_region_size(dl);
+ if (dev->io.split_io < BIO_MAX_SECTORS)
+ DM_EINVAL("Invalid dirty log region size");
+
+ return 0;
+
+bad_params:
+ DMERR("invalid dirty log parameter count");
+ return -EINVAL;
+
+bad_need_dl:
+ DMERR("dirty log mandatory on SLINKs > 0");
+ return -EINVAL;
+
+bad_site_link:
+ DMERR("no dirty log allowed on SLINK0");
+ return -EINVAL;
+
+bad_dirty_log:
+ DMERR("failed to create dirty log");
+ return -ENXIO;
+}
+
+/*
+ * Check and adjust split_io on all replicated devices.
+ *
+ * Called with write repl_slinks->lock and write sl->lock hold.
+ *
+ * All remote devices must go by the same dirty log
+ * region size in order to keep the caller simple.
+ *
+ * @sl = slink > 0
+ * @dev = device to check and use to set ti->split_io
+ *
+ */
+static int
+set_split_io(struct slink *sl, struct sdev *dev)
+{
+ sector_t split_io_1st, split_io_ref = 0;
+ struct slink *sl_cur, *sl0;
+ struct sdev *dev_cur;
+
+ /* Nonsense to proceed on SLINK0. */
+ _BUG_ON_PTR(sl);
+ if (!sl->number)
+ return 0;
+
+ _BUG_ON_PTR(dev);
+
+ sl0 = slink_get_by_number(sl->repl_slinks, 0);
+ _BUG_ON_PTR(sl0);
+
+ /* Get split_io from any existing dev on this actual slink. */
+ if (list_empty(SLINK_DEVS_LIST(sl)))
+ split_io_1st = 0;
+ else {
+ dev_cur = list_first_entry(SLINK_DEVS_LIST(sl), struct sdev,
+ lists[SDEV_SLINK]);
+ split_io_1st = dev_cur->io.split_io;
+ }
+
+ /* Find any preset split_io on any (slink > 0 && slink != sl) device. */
+ list_for_each_entry(sl_cur, &sl0->repl_slinks->list,
+ lists[SLINK_REPLOG]) {
+ if (!sl_cur->number ||
+ sl_cur->number == sl->number)
+ continue;
+
+ if (!list_empty(SLINK_DEVS_LIST(sl_cur))) {
+ dev_cur = list_first_entry(SLINK_DEVS_LIST(sl_cur),
+ struct sdev,
+ lists[SDEV_SLINK]);
+ split_io_ref = dev_cur->io.split_io;
+ break;
+ }
+ }
+
+
+ /*
+ * The region size *must* be the same for all devices
+ * in order to simplify the related caller logic.
+ */
+ if ((split_io_ref && split_io_1st && split_io_ref != split_io_1st) ||
+ (split_io_1st && split_io_1st != dev->io.split_io) ||
+ (split_io_ref && split_io_ref != dev->io.split_io))
+ DM_EINVAL("region size argument must be the "
+ "same for all devices");
+
+ /* Lock sl0, because we ain't get here with sl == sl0. */
+ write_lock(&sl0->lock);
+ list_for_each_entry(dev_cur, SLINK_DEVS_LIST(sl0), lists[SDEV_SLINK])
+ dev_cur->ti->split_io = dev->io.split_io;
+
+ write_unlock(&sl0->lock);
+
+ BUG_ON(slink_put(sl0));
+ return 0;
+}
+
/*
 * Wait on device in flight device I/O before allowing device destroy.
 *
 * Flushes the site link workqueue to push out pending worker I/O, then
 * sleeps until the device's I/O reference count drops to zero
 * (dev->io.waiters is presumably woken by the I/O completion path;
 * not visible in this file section).
 */
static void
slink_wait_on_io(struct sdev *dev)
{
	while (dev_io(dev)) {
		flush_workqueue(dev->sl->io.wq);
		wait_event(dev->io.waiters, !dev_io(dev));
	}
}
+
+/* Postsuspend method. */
+static int
+blockdev_postsuspend(struct dm_repl_slink *slink, int dev_number)
+{
+ struct slink *sl;
+ struct sdev *dev;
+
+ DMDEBUG_LIMIT("%s dev_number=%d", __func__, dev_number);
+ _SET_AND_BUG_ON_SL(sl, slink);
+
+ if (dev_number < 0)
+ return -EINVAL;
+
+ write_lock(&sl->lock);
+ dev = dev_get_by_number(sl, dev_number);
+ if (unlikely(IS_ERR(dev))) {
+ write_unlock(&sl->lock);
+ return PTR_ERR(dev);
+ }
+
+ /* Set device suspended. */
+ SetDevSuspended(dev);
+ write_unlock(&sl->lock);
+
+ dev_put(dev);
+
+ /* Wait for any device io to finish. */
+ slink_wait_on_io(dev);
+ return 0;
+}
+
/*
 * Resume method.
 *
 * Clears the device's suspended state so may_io() allows I/O again
 * and wakes the site link worker to pick up postponed work.
 */
static int
blockdev_resume(struct dm_repl_slink *slink, int dev_number)
{
	struct slink *sl;
	struct sdev *dev;

	_SET_AND_BUG_ON_SL(sl, slink);
	DMDEBUG("%s sl_number=%d dev_number=%d", __func__,
		sl->number, dev_number);

	if (dev_number < 0)
		return -EINVAL;

	/* Look the device up under the lock, taking out a reference. */
	read_lock(&sl->lock);
	dev = dev_get_by_number(sl, dev_number);
	read_unlock(&sl->lock);

	if (unlikely(IS_ERR(dev)))
		return PTR_ERR(dev);

	/* Clear device suspended. */
	ClearDevSuspended(dev);
	/* Must not be the last put; the slink's device list holds a ref. */
	BUG_ON(dev_put(dev));
	wake_do_slink(sl);
	return 0;
}
+
/*
 * Destroy device resources.
 *
 * Tears down the optional dirty log (remote devices only), the kcopyd
 * client (created for slinks > 0 in dev_create()) and the underlying
 * dm device, then drops the final reference.
 */
static void
dev_destroy(struct sdev *dev)
{
	if (dev->dev.dl)
		dm_dirty_log_destroy(dev->dev.dl);

	if (dev->io.kcopyd_client)
		dm_kcopyd_client_destroy(dev->io.kcopyd_client);

	if (dev->dev.dm_dev)
		dm_put_device(dev->ti, dev->dev.dm_dev);

	/* This put must be the last one, i.e. it must free the device. */
	BUG_ON(!dev_put(dev));
}
+
+/*
+ * Method to add a device to a given site link
+ * and optionally create a dirty log for it.
+ *
+ * @dev_number = unsigned int stored in the REPLOG to associate to a dev_path
+ * @ti = dm_target ptr (needed for dm functions)
+ * @argc = 4...
+ * @argv = dev_params# dev_path dirty_log_args
+ */
+#define MIN_DEV_ARGS 4
+static int
+blockdev_dev_add(struct dm_repl_slink *slink, int dev_number,
+ struct dm_target *ti, unsigned argc, char **argv)
+{
+ int r;
+ unsigned dev_params, params, sl_count;
+ long long tmp;
+ struct slink *sl;
+ struct sdev *dev;
+
+ SHOW_ARGV;
+
+ if (dev_number < 0)
+ return -EINVAL;
+
+ /* Two more because of the following dirty log parameters. */
+ if (unlikely(argc < MIN_DEV_ARGS))
+ DM_EINVAL("invalid device parameters count");
+
+ /* Get #dev_params. */
+ if (unlikely(sscanf(argv[0], "%lld", &tmp) != 1 ||
+ tmp != 1)) {
+ DM_EINVAL("invalid device parameters argument");
+ } else
+ dev_params = tmp;
+
+ _SET_AND_BUG_ON_SL(sl, slink);
+ _BUG_ON_PTR(sl->repl_slinks);
+
+ dev = dev_create(sl, ti, argv[1], dev_number);
+ if (unlikely(IS_ERR(dev)))
+ return PTR_ERR(dev);
+
+ dev->dev.params.count = dev_params;
+
+ /* Work on dirty log paramaters. */
+ params = dev_params + 1;
+ r = dirty_log_create(sl, dev, argc - params, argv + params);
+ if (unlikely(r < 0))
+ goto bad;
+
+ /* Take out global and local lock to update the configuration. */
+ write_lock(&sl->repl_slinks->lock);
+ write_lock(&sl->lock);
+
+ /* Set split io value on all replicated devices. */
+ r = set_split_io(sl, dev);
+ if (unlikely(r < 0))
+ goto bad_unlock;
+
+ /*
+ * Now that dev is all set, add it to the slink.
+ *
+ * If callers are racing for the same device,
+ * dev_add() will catch that case too.
+ */
+ r = dev_add(sl, dev);
+ if (unlikely(r < 0))
+ goto bad_unlock;
+
+ write_unlock(&sl->lock);
+
+ sl_count = slink_count(sl);
+ write_unlock(&sl->repl_slinks->lock);
+
+ /* Ignore any resize problem and live with what we got. */
+ if (sl_count > 1)
+ dm_io_client_resize(sl_count, sl->io.dm_io_client);
+
+ DMDEBUG("%s added device=%u to slink=%u",
+ __func__, dev_number, sl->number);
+ return dev_number;
+
+bad_unlock:
+ write_unlock(&sl->lock);
+ write_unlock(&sl->repl_slinks->lock);
+bad:
+ BUG_ON(dev_io(dev));
+ dev_destroy(dev);
+ return r;
+}
+
/*
 * Method to delete a device from a given site link.
 *
 * Marks the device for teardown, drains in-flight I/O, unlinks it
 * from all lists under the lock and destroys it.
 */
static int
blockdev_dev_del(struct dm_repl_slink *slink, int dev_number)
{
	int i;
	unsigned sl_count;
	struct slink *sl;
	struct sdev *dev;

	_SET_AND_BUG_ON_SL(sl, slink);

	if (dev_number < 0)
		return -EINVAL;

	/* Check if device is active! */
	write_lock(&sl->lock);
	dev = dev_get_by_number(sl, dev_number);
	if (unlikely(IS_ERR(dev))) {
		write_unlock(&sl->lock);
		return PTR_ERR(dev);
	}

	/* From here on, may_io() refuses new I/O to the device. */
	SetDevTeardown(dev);
	write_unlock(&sl->lock);

	/* Release the new reference taken out via dev_get_by_number() .*/
	BUG_ON(dev_put(dev));

	/* Wait for any device I/O to finish. */
	slink_wait_on_io(dev);
	BUG_ON(dev_io(dev));

	/* Take device off any lists. */
	write_lock(&sl->lock);
	i = ARRAY_SIZE(dev->lists);
	while (i--) {
		if (!list_empty(dev->lists + i))
			list_del(dev->lists + i);
	}

	write_unlock(&sl->lock);

	/* Destroy device. */
	dev_destroy(dev);

	/* Ignore any resize problem. */
	sl_count = slink_count(sl);
	dm_io_client_resize(sl_count ? sl_count : 1, sl->io.dm_io_client);
	DMDEBUG("%s deleted device=%u from slink=%u",
		__func__, dev_number, sl->number);
	return 0;
}
+
+/* Check slink properties for consistency. */
+static int slink_check_properties(struct slink_params *params)
+{
+ enum dm_repl_slink_policy_type policy = params->policy;
+
+ if (slink_policy_synchronous(policy) &&
+ slink_policy_asynchronous(policy))
+ DM_EINVAL("synchronous and asynchronous slink "
+ "policies are mutually exclusive!");
+
+ if (slink_policy_synchronous(policy) &&
+ params->fallbehind.value)
+ DM_EINVAL("synchronous slink policy and fallbehind "
+ "are mutually exclusive!");
+ return 0;
+}
+
+/*
+ * Start methods of "blockdev" slink type.
+ */
+/* Method to destruct a site link context. */
+static void
+blockdev_dtr(struct dm_repl_slink *slink)
+{
+ struct slink *sl;
+
+ _BUG_ON_PTR(slink);
+ sl = slink->context;
+ _BUG_ON_PTR(sl);
+
+ slink_destroy(sl);
+ BUG_ON(!slink_put(sl));
+}
+
+/*
+ * Method to construct a site link context.
+ *
+ * #slink_params = 1-4
+ * <slink_params> = slink# [slink_policy [fall_behind value]]
+ * slink# = used to tie the host+dev_path to a particular SLINK; 0 is used
+ * for the local site link and 1-M are for remote site links.
+ * slink_policy = policy to set on the slink (eg. async/sync)
+ * fall_behind = # of ios the SLINK can fall behind before switching to
+ * synchronous mode (ios N, data N[kmgtpe], timeout N[smhd])
+ */
+static int
+blockdev_ctr(struct dm_repl_slink *slink, struct dm_repl_log *replog,
+ unsigned argc, char **argv)
+{
+ int check, r;
+ long long tmp;
+ unsigned slink_number, slink_params;
+ struct slink *sl;
+ struct slink_params params;
+ /* Makes our task to keep a unique list of slinks per replog easier. */
+ struct dm_repl_log_slink_list *repl_slinks =
+ replog->ops->slinks(replog);
+
+ SHOW_ARGV;
+ _BUG_ON_PTR(repl_slinks);
+
+ memset(¶ms, 0, sizeof(params));
+ params.policy = DM_REPL_SLINK_ASYNC;
+ params.fallbehind.type = DM_REPL_SLINK_FB_IOS;
+ params.fallbehind.multiplier = 1;
+
+ if (unlikely(argc < 2))
+ DM_EINVAL("invalid number of slink arguments");
+
+ /* Get # of slink parameters. */
+ if (unlikely(sscanf(argv[0], "%lld", &tmp) != 1 ||
+ tmp < 1 || tmp > argc))
+ DM_EINVAL("invalid slink parameter argument");
+ else
+ params.count = slink_params = tmp;
+
+
+ /* Get slink#. */
+ if (unlikely(sscanf(argv[1], "%lld", &tmp) != 1 ||
+ tmp < 0 || tmp >= replog->ops->slink_max(replog))) {
+ DM_EINVAL("invalid slink number argument");
+ } else
+ slink_number = tmp;
+
+ if (slink_params > 1) {
+ /* Handle policy argument. */
+ r = get_slink_policy(argv[2]);
+ if (unlikely(r < 0))
+ return r;
+
+ params.policy = r;
+ check = 1;
+
+ /* Handle fallbehind argument. */
+ if (slink_params > 2) {
+ r = get_slink_fallbehind(slink_params,
+ argv + 3, ¶ms.fallbehind);
+ if (unlikely(r < 0))
+ return r;
+ }
+ } else
+ check = 0;
+
+ /* Check that policies make sense vs. fallbehind. */
+ if (check) {
+ r = slink_check_properties(¶ms);
+ if (r < 0)
+ return r;
+ }
+
+ /* Get/create an slink context. */
+ sl = slink_create(slink, repl_slinks, ¶ms, slink_number);
+ return unlikely(IS_ERR(sl)) ? PTR_ERR(sl) : 0;
+}
+
+/*
+ * Initiate data copy across a site link.
+ *
+ * This function may be used to copy a buffer entry *or*
+ * for resynchronizing regions initially or when an SLINK
+ * has fallen back to dirty log (bitmap) mode.
+ */
+/* Get sdev ptr from copy address. */
+static struct sdev *
+dev_get_by_addr(struct slink *sl, struct dm_repl_slink_copy_addr *addr)
+{
+ BUG_ON(addr->type != DM_REPL_SLINK_BLOCK_DEVICE &&
+ addr->type != DM_REPL_SLINK_DEV_NUMBER);
+ return addr->type == DM_REPL_SLINK_BLOCK_DEVICE ?
+ dev_get_by_bdev(sl, addr->dev.bdev) :
+ dev_get_by_number(sl, addr->dev.number.dev);
+}
+
+/*
+ * Needs to be called with sl->lock held.
+ *
+ * Return 0 in case io is allowed to the region the sector
+ * is in and take out an I/O reference on the device.
+ *
+ * If I/O isn't allowd, no I/O reference will be taken out
+ * and the follwoing return codes apply to caller actions:
+ *
+ * o -EAGAIN in case of prohibiting I/O because of device suspension
+ * or device I/O errors (i.e. link temporarilly down) ->
+ * caller is allowed to retry the I/O later once
+ * he'll have received a callback.
+ *
+ * o -EACCES in case a region is being resynchronized and the source
+ * region is being read to copy data accross to the same region
+ * of the replica (RD) ->
+ * caller is allowed to retry the I/O later once
+ * he'll have received a callback.
+ *
+ * o -ENODEV in case a device is not configured
+ * caller must drop the I/O to the device/slink pair.
+ *
+ * o -EPERM in case a region is out of sync ->
+ * caller must drop the I/O to the device/slink pair.
+ */
+static int
+may_io(int rw, struct slink *sl, struct sdev *dev,
+ sector_t sector, const char *action)
+{
+ int r;
+ sector_t region;
+ struct slink *sl_cur;
+ struct sdev *dev_tmp;
+ struct sdev_resync *resync;
+
+ _BUG_ON_PTR(sl);
+
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ region = sector_to_region(dev, sector);
+
+ /*
+ * It's a caller error to call for multiple copies per slink.
+ */
+ if (rw == WRITE &&
+ SlinkWriter(sl)) {
+ DMERR_LIMIT("%s %s to slink%d, dev%d, region=%llu "
+ "while write pending to region=%llu!",
+ __func__, action, sl->number, dev->dev.number,
+ (unsigned long long) region,
+ (unsigned long long) dev->io.resync.writer_region);
+ return -EPERM;
+ }
+
+ /*
+ * If the device is suspended, being torn down,
+ * closed or errored, retry again later.
+ */
+ if (!DevOpen(dev) ||
+ DevSuspended(dev) ||
+ DevTeardown(dev) ||
+ DevErrorRead(dev) ||
+ DevErrorWrite(dev))
+ return -EAGAIN;
+
+ /* slink > 0 may read and write in case region is in sync. */
+ if (sl->number) {
+ struct dm_dirty_log *dl = dev->dev.dl;
+
+ /*
+ * Ask dirty log for region in sync.
+ *
+ * In sync -> allow reads and writes.
+ * Out of sync -> prohibit them.
+ */
+ _BUG_ON_PTR(dl);
+ r = dl->type->in_sync(dl, region, 0); /* Don't block. */
+ r = r ? 0 : -EPERM;
+ } else {
+ /* slink0 may always read. */
+ if (rw == READ)
+ return 0;
+
+ read_lock(&sl->repl_slinks->lock);
+
+ /*
+ * Walk all slinks and check if anyone is syncing this region,
+ * in which case no write is allowed to it on slink0.
+ */
+ list_for_each_entry(sl_cur, &sl->repl_slinks->list,
+ lists[SLINK_REPLOG]) {
+ /* Avoid local devices. */
+ if (!sl_cur->number)
+ continue;
+
+ /*
+ * If device exists and the LD is
+ * being read off for resync.
+ */
+ read_lock(&sl_cur->lock);
+ dev_tmp = dev_get_by_number(sl_cur, dev->dev.number);
+ read_unlock(&sl_cur->lock);
+
+ if (!IS_ERR(dev_tmp)) {
+ resync = &dev_tmp->io.resync;
+ if (resync->end &&
+ resync->region == region) {
+ BUG_ON(dev_put(dev_tmp));
+ r = -EACCES;
+ goto out;
+ }
+
+ BUG_ON(dev_put(dev_tmp));
+ }
+ }
+
+ /* We're allowed to write to this LD -> indicate we do. */
+ r = 0;
+out:
+ read_unlock(&sl->repl_slinks->lock);
+ }
+
+ if (!r && rw == WRITE) {
+ write_lock(&sl->lock);
+ /*
+ * Memorize region being synchronized
+ * to check in do_slink_resync().
+ */
+ dev->io.resync.writer_region = region;
+ SetSlinkWriter(sl);
+ write_unlock(&sl->lock);
+ }
+
+ return r;
+}
+
+/* Set source/destination address of the copy. */
+static int
+copy_addr_init(struct slink *sl, struct dm_io_region *io,
+ struct dm_repl_slink_copy_addr *addr, unsigned size)
+{
+
+ if (addr->type == DM_REPL_SLINK_BLOCK_DEVICE) {
+ io->bdev = addr->dev.bdev;
+ } else if (addr->type == DM_REPL_SLINK_DEV_NUMBER) {
+ struct sdev *dev;
+ struct slink *sl_tmp;
+
+ /* Check that slink number is correct. */
+ read_lock(&sl->repl_slinks->lock);
+ sl_tmp = slink_get_by_number(sl->repl_slinks,
+ addr->dev.number.slink);
+ if (unlikely(IS_ERR(sl_tmp))) {
+ int r = PTR_ERR(sl_tmp);
+
+ read_unlock(&sl->repl_slinks->lock);
+ return r;
+ }
+
+ read_unlock(&sl->repl_slinks->lock);
+
+ if (unlikely(sl != sl_tmp))
+ return -EINVAL;
+
+ read_lock(&sl_tmp->lock);
+ dev = dev_get_by_number(sl_tmp, addr->dev.number.dev);
+ read_unlock(&sl_tmp->lock);
+
+ BUG_ON(slink_put(sl_tmp));
+
+ if (unlikely(IS_ERR(dev)))
+ return PTR_ERR(dev);
+
+ io->bdev = dev->dev.dm_dev->bdev;
+ BUG_ON(dev_put(dev));
+ } else
+ BUG();
+
+ io->sector = addr->sector;
+ io->count = dm_div_up(size, to_bytes(1));
+ return 0;
+}
+
+/*
+ * Copy endio function.
+ *
+ * For the "blockdev" type, both states (data in (remote) ram and data
+ * on (remote) disk) are reported here at once. For future transports
+ * those will be reported seperately.
+ */
+static void
+copy_endio(int read_err, unsigned long write_err, void *context)
+{
+ struct copy_context *ctx = context;
+ struct slink *sl;
+ struct sdev *dev_to;
+
+ _BUG_ON_PTR(ctx);
+ dev_to = ctx->dev_to;
+ _BUG_ON_PTR(dev_to);
+ sl = dev_to->sl;
+ _BUG_ON_PTR(sl);
+
+ /* Throw a table event in case of a site link device copy error. */
+ if (unlikely(read_err || write_err)) {
+ if (read_err) {
+ if (!TestSetDevErrorRead(dev_to)) {
+ SetSlinkErrorRead(sl);
+ dm_table_event(dev_to->ti->table);
+ }
+ } else if (!TestSetDevErrorWrite(dev_to)) {
+ SetSlinkErrorWrite(sl);
+ dm_table_event(dev_to->ti->table);
+ }
+
+ set_dev_test_time(dev_to);
+ }
+
+ /* Must clear before calling back. */
+ ClearSlinkWriter(sl);
+
+ /*
+ * FIXME: check if caller has set region to NOSYNC and
+ * and if so, avoid calling callbacks completely.
+ */
+ if (ctx->ram.fn)
+ ctx->ram.fn(read_err, write_err ? -EIO : 0, ctx->ram.context);
+
+ /* Only call when no error or when no ram callback defined. */
+ if (likely((!read_err && !write_err) || !ctx->ram.fn))
+ ctx->disk.fn(read_err, write_err, ctx->disk.context);
+
+ /* Copy done slinkX device. */
+ free_copy_context(ctx, sl);
+
+ /* Release device reference. */
+ BUG_ON(dev_put(dev_to));
+
+ /* Wake slink worker to reschedule any postponed resynchronization. */
+ wake_do_slink(sl);
+}
+
+/* Site link copy method. */
+static int
+blockdev_copy(struct dm_repl_slink *slink, struct dm_repl_slink_copy *copy,
+ unsigned long long tag)
+{
+ int r;
+ struct slink *sl;
+ struct sdev *dev_to;
+ struct copy_context *ctx = NULL;
+ static struct dm_io_region src, dst;
+
+ _SET_AND_BUG_ON_SL(sl, slink);
+ if (unlikely(SlinkErrorRead(sl) || SlinkErrorWrite(sl)))
+ return -EAGAIN;
+
+ /* Get device by address taking out reference. */
+ read_lock(&sl->lock);
+ dev_to = dev_get_by_addr(sl, ©->dst);
+ read_unlock(&sl->lock);
+
+ /* Check if io is allowed or a resync is active. */
+ r = may_io(WRITE, sl, dev_to, copy->dst.sector, "copy");
+ if (r < 0)
+ goto bad;
+
+ ctx = alloc_copy_context(sl);
+ BUG_ON(!ctx);
+
+ /* Device to copy to. */
+ ctx->dev_to = dev_to;
+
+ /* Save caller context. */
+ ctx->ram = copy->ram;
+ ctx->disk = copy->disk;
+
+ /* Setup copy source. */
+ r = copy_addr_init(sl, &src, ©->src, copy->size);
+ if (unlikely(r < 0))
+ goto bad;
+
+ /* Setup copy destination. */
+ r = copy_addr_init(sl, &dst, ©->dst, copy->size);
+ if (unlikely(r < 0))
+ goto bad;
+
+ /* FIXME: can we avoid reading per copy on multiple slinks ? */
+ r = dm_kcopyd_copy(sl->io.kcopyd_client, &src, 1, &dst, 0,
+ copy_endio, ctx);
+ BUG_ON(r); /* dm_kcopyd_copy() may never fail. */
+ SetDevIOQueued(dev_to); /* dev_unplug(src.bdev); */
+ return r;
+
+bad:
+ if (!IS_ERR(dev_to))
+ BUG_ON(dev_put(dev_to));
+
+ if (ctx)
+ free_copy_context(ctx, sl);
+
+ return r;
+}
+
+/* Method to get site link policy. */
+static enum dm_repl_slink_policy_type
+blockdev_policy(struct dm_repl_slink *slink)
+{
+ struct slink *sl = slink_check(slink);
+
+ return IS_ERR(sl) ? PTR_ERR(sl) : sl->params.policy;
+}
+
+/* Method to get site link State. */
+static enum dm_repl_slink_state_type
+blockdev_state(struct dm_repl_slink *slink)
+{
+ enum dm_repl_slink_state_type state = 0;
+ struct slink *sl = slink_check(slink);
+
+ if (unlikely(IS_ERR(sl)))
+ return PTR_ERR(sl);
+
+ if (SlinkErrorRead(sl))
+ set_bit(DM_REPL_SLINK_READ_ERROR, (unsigned long *) &state);
+
+ if (SlinkErrorWrite(sl))
+ set_bit(DM_REPL_SLINK_DOWN, (unsigned long *) &state);
+
+ return state;
+}
+
+/* Method to get reference to site link fallbehind parameters. */
+static struct dm_repl_slink_fallbehind *
+blockdev_fallbehind(struct dm_repl_slink *slink)
+{
+ struct slink *sl = slink_check(slink);
+
+ return IS_ERR(sl) ? ((struct dm_repl_slink_fallbehind *) sl) :
+ &sl->params.fallbehind;
+}
+
/*
 * Return # of the device.
 *
 * Maps @bdev (a replicated device's mapped device) to the device
 * number configured on local SLINK0. Only valid on SLINK0.
 */
static int
blockdev_dev_number(struct dm_repl_slink *slink, struct block_device *bdev)
{
	struct slink *sl = slink_check(slink);
	struct sdev *dev;
	struct mapped_device *md;

	if (unlikely(IS_ERR(sl)))
		return PTR_ERR(sl);

	if (unlikely(sl->number)) {
		DMERR("Can't retrieve device number from slink > 0");
		return -EINVAL;
	}

	/* Compare @bdev against each device's mapped device gendisk. */
	read_lock(&sl->lock);
	list_for_each_entry(dev, SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
		md = dm_table_get_md(dev->ti->table);
		if (bdev->bd_disk == dm_disk(md)) {
			read_unlock(&sl->lock);
			dm_put(md);
			return dev->dev.number;
		}

		/* Balance the reference dm_table_get_md() took. */
		dm_put(md);
	}

	read_unlock(&sl->lock);

	/*
	 * The caller might have removed the device from SLINK0 but
	 * have an order to copy to the device in its metadata still,
	 * so he has to react accordingly (ie. remove device copy request).
	 */
	return -ENOENT;
}
+
/*
 * Method to remap bio to underlying device on slink0.
 *
 * Only reads on SLINK0 are supported; writes and any I/O to
 * slinks > 0 are rejected with -EPERM via DM_EPERM().
 */
static int
blockdev_io(struct dm_repl_slink *slink, struct bio *bio,
	    unsigned long long tag)
{
	int r, rw = bio_data_dir(bio);
	struct slink *sl;
	struct sdev *dev;

	_SET_AND_BUG_ON_SL(sl, slink);

	/*
	 * Prohibit slink > 0 I/O, because the resync
	 * code can't cope with it for now...
	 */
	if (sl->number)
		DM_EPERM("I/O to slink > 0 prohibited!");

	if (rw == WRITE)
		DM_EPERM("Writes to slink0 prohibited!");

	/* Look the device up by its block device, taking a reference. */
	read_lock(&sl->lock);
	dev = dev_get_by_bdev(sl, bio->bi_bdev);
	read_unlock(&sl->lock);

	/* Check if io is allowed or a resync is active. */
	r = may_io(rw, sl, dev, bio->bi_sector, "io");
	if (likely(!r)) {
		/* Remap to the underlying device and submit. */
		bio->bi_bdev = dev->dev.dm_dev->bdev;
		generic_make_request(bio);
		SetDevIOQueued(dev);
	}

	if (!IS_ERR(dev))
		BUG_ON(dev_put(dev));

	return r;
}
+
/*
 * Method to unplug all device queues on a site link.
 *
 * First pass (under read lock) marks devices with queued I/O and takes
 * out race references; second pass unplugs and drops the references.
 * NOTE(review): the second pass walks the device list without holding
 * sl->lock, apparently relying on the references taken in pass one to
 * keep entries alive — confirm list membership can't change under it.
 */
static int
blockdev_unplug(struct dm_repl_slink *slink)
{
	struct slink *sl = slink_check(slink);
	struct sdev *dev, *dev_n;

	if (unlikely(IS_ERR(sl)))
		return PTR_ERR(sl);

	/* Take out device references for all devices with IO queued. */
	read_lock(&sl->lock);
	list_for_each_entry(dev, SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
		if (TestClearDevIOQueued(dev)) {
			dev_get(dev);
			SetDevIOUnplug(dev);
		}
	}

	read_unlock(&sl->lock);

	/* Unplug marked devices and drop the race references. */
	list_for_each_entry_safe(dev, dev_n,
				 SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
		if (TestClearDevIOUnplug(dev)) {
			/* Only touch usable devices. */
			if (DevOpen(dev) &&
			    !DevSuspended(dev) &&
			    !DevTeardown(dev))
				dev_unplug(dev->dev.dm_dev->bdev);

			BUG_ON(dev_put(dev));
		}
	}

	return 0;
}
+
+/* Method to set global recovery function and context. */
+static void
+blockdev_recover_notify_fn_set(struct dm_repl_slink *slink,
+ dm_repl_notify_fn fn, void *context)
+{
+ struct slink *sl;
+
+ _SET_AND_BUG_ON_SL(sl, slink);
+
+ write_lock(&sl->lock);
+ sl->recover.fn = fn;
+ sl->recover.context = context;
+ write_unlock(&sl->lock);
+}
+
+/* Method to return # of the SLINK. */
+static int
+blockdev_slink_number(struct dm_repl_slink *slink)
+{
+ struct slink *sl = slink_check(slink);
+
+ return unlikely(IS_ERR(sl)) ? PTR_ERR(sl) : sl->number;
+}
+
/*
 * Method to return SLINK by number.
 *
 * Looks the slink up on the replog's slink list; the temporary
 * reference taken by slink_get_by_number() is dropped again, so the
 * returned dm_repl_slink (or error pointer) carries no reference.
 */
static struct dm_repl_slink *
blockdev_slink(struct dm_repl_log *replog, unsigned slink_number)
{
	struct slink *sl;
	struct dm_repl_slink *slink;
	struct dm_repl_log_slink_list *repl_slinks;

	_BUG_ON_PTR(replog);
	repl_slinks = replog->ops->slinks(replog);
	_BUG_ON_PTR(repl_slinks);

	read_lock(&repl_slinks->lock);
	sl = slink_get_by_number(repl_slinks, slink_number);
	if (IS_ERR(sl))
		slink = (struct dm_repl_slink *) sl;
	else {
		slink = sl->slink;
		BUG_ON(slink_put(sl));
	}

	read_unlock(&repl_slinks->lock);
	return slink;
}
+
/*
 * Method to set SYNC state of a region of a device.
 *
 * Translates @sector to a dirty log region and marks it (not) in
 * sync. Silently succeeds on devices without a dirty log (SLINK0).
 */
static int
blockdev_set_sync(struct dm_repl_slink *slink, int dev_number,
		  sector_t sector, int in_sync)
{
	struct sdev *dev;
	struct sdev_dev *sd;
	struct dm_dirty_log *dl;
	struct slink *sl;

	_SET_AND_BUG_ON_SL(sl, slink);

	if (dev_number < 0)
		return -EINVAL;

	/* Look the device up under the lock, taking out a reference. */
	read_lock(&sl->lock);
	dev = dev_get_by_number(sl, dev_number);
	read_unlock(&sl->lock);

	if (IS_ERR(dev))
		return PTR_ERR(dev);

	sd = &dev->dev;
	dl = sd->dl;
	if (dl)
		dl->type->set_region_sync(dl, sector_to_region(dev, sector),
					  in_sync);
	BUG_ON(dev_put(dev));
	return 0;
}
+
/*
 * Method to flush all dirty logs on slink's devices.
 *
 * Only valid on slinks > 0. Temporarily steals the resync list so
 * the (potentially blocking) dirty log flushes can run unlocked,
 * holding race references on the devices meanwhile.
 *
 * Returns 0, -EAGAIN while resync processing is active, or the
 * first dirty log flush error.
 */
static int
blockdev_flush_sync(struct dm_repl_slink *slink)
{
	int r = 0;
	struct slink *sl;
	struct sdev *dev;
	struct list_head resync_list;

	_SET_AND_BUG_ON_SL(sl, slink);
	if (!sl->number)
		return -EINVAL;

	INIT_LIST_HEAD(&resync_list);

	write_lock(&sl->lock);
	if (SlinkResyncProcessing(sl)) {
		write_unlock(&sl->lock);
		return -EAGAIN;
	}

	/* Take out resync list in order to process flushs unlocked. */
	list_splice(SLINK_RESYNC_LIST(sl), &resync_list);
	INIT_LIST_HEAD(SLINK_RESYNC_LIST(sl));

	/* Get race references on devices. */
	list_for_each_entry(dev, &resync_list, lists[SDEV_RESYNC]) {
		_BUG_ON_PTR(dev->dev.dl);
		dev_get(dev);
	}

	write_unlock(&sl->lock);

	/* Flush each usable device's dirty log; keep the first error. */
	list_for_each_entry(dev, &resync_list, lists[SDEV_RESYNC]) {
		if (DevOpen(dev) &&
		    !(DevSuspended(dev) || DevTeardown(dev))) {
			int rr;
			struct dm_dirty_log *dl = dev->dev.dl;

			rr = dl->type->flush(dl);
			if (rr && !r)
				r = rr;
		}
	}

	/* Put race device references. */
	write_lock(&sl->lock);
	list_for_each_entry(dev, &resync_list, lists[SDEV_RESYNC])
		BUG_ON(dev_put(dev));

	/* Return the stolen entries to the slink's resync list. */
	list_splice(&resync_list, SLINK_RESYNC_LIST(sl));
	write_unlock(&sl->lock);

	return r;
}
+
+/*
+ * Method to trigger/prohibit resynchronzation on all devices by
+ * adding to the slink resync list and waking up the worker.
+ *
+ * We may *not* remove from the slink resync list here,
+ * because we'd end up with partitally resynchronized
+ * regions in do_slink_resync() otherwise.
+ */
+static int
+blockdev_resync(struct dm_repl_slink *slink, int resync)
+{
+ struct slink *sl;
+ struct sdev *dev;
+
+ _SET_AND_BUG_ON_SL(sl, slink);
+
+ /* Don't proceed on site link 0. */
+ if (!sl->number)
+ return -EINVAL;
+
+ /* If resync processing, we need to postpone. */
+ write_lock(&sl->lock);
+ if (SlinkResyncProcessing(sl)) {
+ write_unlock(&sl->lock);
+ return -EAGAIN;
+ }
+
+ list_for_each_entry(dev, SLINK_DEVS_LIST(sl), lists[SDEV_SLINK]) {
+ BUG_ON(IS_ERR(dev));
+
+ if (resync) {
+ SetDevResync(dev);
+
+ /* Add to resync list if not yet on. */
+ if (list_empty(SDEV_RESYNC_LIST(dev))) {
+ list_add_tail(SDEV_RESYNC_LIST(dev),
+ SLINK_RESYNC_LIST(sl));
+ break;
+ }
+ } else {
+ ClearDevResync(dev);
+
+ /* emove from resync list if on. */
+ if (!list_empty(SDEV_RESYNC_LIST(dev)))
+ list_del_init(SDEV_RESYNC_LIST(dev));
+ }
+ }
+
+ write_unlock(&sl->lock);
+ wake_do_slink(sl);
+ return 0;
+}
+
+/*
+ * Method to check if a region is in sync
+ * by sector on all devices on all slinks.
+ */
+static int
+blockdev_in_sync(struct dm_repl_slink *slink, int dev_number, sector_t sector)
+{
+ int nosync = 0;
+ sector_t region = 0;
+ struct slink *sl = slink_check(slink);
+
+ if (IS_ERR(sl) ||
+ dev_number < 0)
+ return -EINVAL;
+
+ BUG_ON(!sl->repl_slinks);
+
+ read_lock(&sl->repl_slinks->lock);
+ list_for_each_entry(sl, &sl->repl_slinks->list, lists[SLINK_REPLOG]) {
+ int r;
+ struct dm_dirty_log *dl;
+ struct sdev *dev;
+
+ if (!sl->number)
+ continue;
+
+ read_lock(&sl->lock);
+ dev = dev_get_by_number(sl, dev_number);
+ read_unlock(&sl->lock);
+
+ if (IS_ERR(dev))
+ continue;
+
+ dl = dev->dev.dl;
+ _BUG_ON_PTR(dl);
+
+ /* Calculate region once for all devices on any slinks. */
+ if (!region)
+ region = sector_to_region(dev, sector);
+
+ r = dl->type->in_sync(dl, region, 0);
+ BUG_ON(dev_put(dev));
+ if (!r) {
+ nosync = 1;
+ break;
+ }
+ }
+
+ read_unlock(&sl->repl_slinks->lock);
+ return nosync;
+}
+
+/*
+ * Method for site link messages.
+ *
+ * fallbehind ios/size/timeout=X[unit]
+ * policy X
+ */
+static int
+blockdev_message(struct dm_repl_slink *slink, unsigned argc, char **argv)
+{
+	int r;
+	struct slink_params params;
+	struct slink *sl = slink_check(slink);
+
+	if (IS_ERR(sl))
+		return PTR_ERR(sl);
+
+	if (unlikely(argc < 2))
+		goto bad_arguments;
+
+	/* Work on a copy so a failed parse leaves sl->params intact. */
+	params = sl->params;
+
+	if (!strnicmp(STR_LEN(argv[0], "fallbehind"))) {
+		if (argc != 2)
+			DM_EINVAL("wrong fallbehind argument count");
+
+		/* Fixed: "&params" had been garbled to "¶ms". */
+		r = get_slink_fallbehind(argc - 1, argv + 1,
+					 &params.fallbehind);
+		if (r < 0)
+			return r;
+	} else if (!strnicmp(STR_LEN(argv[0], "policy"))) {
+		if (argc != 2)
+			DM_EINVAL("wrong policy argument count");
+
+		r = get_slink_policy(argv[1]);
+		if (r < 0)
+			return r;
+
+		params.policy = r;
+	} else
+		DM_EINVAL("invalid message received");
+
+	/* Check properties' consistency. */
+	r = slink_check_properties(&params);
+	if (r < 0)
+		return r;
+
+	/* Commit the validated parameters. */
+	sl->params = params;
+	return 0;
+
+bad_arguments:
+	DM_EINVAL("too few message arguments");
+}
+
+/* String print site link error state. */
+static const char *
+snprint_slink_error(struct slink *sl, char *result, size_t maxlen)
+{
+ size_t sz = 0;
+
+ *result = 0;
+ if (SlinkErrorRead(sl))
+ DMEMIT("R");
+
+ if (SlinkErrorWrite(sl))
+ DMEMIT("W");
+
+ if (!*result)
+ DMEMIT("A");
+
+ return result;
+}
+
+/*
+ * String print device status into result (up to maxlen bytes).
+ *
+ * Emits "<count> <name> " followed by the dirty log's own status,
+ * or "nolog 0" when no dirty log is attached.  Returns result.
+ */
+static const char *
+snprint_device(struct slink *sl, struct sdev *dev,
+	       status_type_t type, char *result, unsigned maxlen)
+{
+	size_t sz = 0;
+	/* Automatic, not static: a static buffer is racy when status
+	 * is requested concurrently. */
+	char buf[BDEVNAME_SIZE];
+	struct sdev_dev *sd;
+	struct dm_dirty_log *dl;
+
+	*result = 0;
+	if (IS_ERR(dev))
+		goto out;
+
+	sd = &dev->dev;
+	/* If dm_dev is set, print its dev_t; else the configured path. */
+	DMEMIT("%u %s ", sd->params.count,
+	       sd->dm_dev ?
+	       format_dev_t(buf, sd->dm_dev->bdev->bd_dev) :
+	       sd->params.path);
+	dl = sd->dl;
+	if (dl)
+		dl->type->status(dl, type, result + sz, maxlen - sz);
+	else
+		DMEMIT("nolog 0");
+
+out:
+	return result;
+}
+
+/* String print device resynchronization state. */
+static const char *
+snprint_sync_count(struct slink *sl, struct sdev *dev,
+		   char *result, unsigned maxlen)
+{
+	size_t sz = 0;
+	struct dm_dirty_log *dl = IS_ERR(dev) ? NULL : dev->dev.dl;
+
+	/* No device or no dirty log attached -> nothing to report. */
+	if (!dl) {
+		DMEMIT("-");
+		return result;
+	}
+
+	/* "<synced>[+]/<total>"; '+' appended while DevResyncing() is set. */
+	DMEMIT("%llu%s/%llu",
+	       (unsigned long long) dl->type->get_sync_count(dl),
+	       DevResyncing(dev) ? "+" : "",
+	       (unsigned long long) region_count(dev));
+
+	return result;
+}
+
+/* Method for site link status requests. */
+static struct dm_repl_slink_type blockdev_type;
+static int
+blockdev_status(struct dm_repl_slink *slink, int dev_number,
+		status_type_t type, char *result, unsigned int maxlen)
+{
+	size_t sz = 0;
+	/* Automatic, not static: a static buffer is racy when status
+	 * is requested concurrently. */
+	char buffer[256];
+	struct slink *sl_cur, *sl = slink_check(slink);
+	struct sdev *dev;
+	struct slink_params *p;
+	struct list_head *sl_list;
+
+	if (unlikely(IS_ERR(sl)))
+		return PTR_ERR(sl);
+
+	/* dev_number == -1 selects slink (not device) table output below. */
+	if (dev_number < -1)
+		return -EINVAL;
+
+	BUG_ON(!sl->repl_slinks);
+	sl_list = &sl->repl_slinks->list;
+
+	read_lock(&sl->repl_slinks->lock);
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		list_for_each_entry(sl_cur, sl_list, lists[SLINK_REPLOG]) {
+			read_lock(&sl_cur->lock);
+			dev = dev_get_by_number(sl_cur, dev_number);
+			read_unlock(&sl_cur->lock);
+
+			if (!IS_ERR(dev)) {
+				DMEMIT("%s,",
+				       snprint_slink_error(sl_cur, buffer,
+							   sizeof(buffer)));
+
+				DMEMIT("%s ",
+				       snprint_sync_count(sl_cur, dev, buffer,
+							  sizeof(buffer)));
+				BUG_ON(dev_put(dev));
+			}
+		}
+
+		break;
+
+	case STATUSTYPE_TABLE:
+		list_for_each_entry(sl_cur, sl_list, lists[SLINK_REPLOG]) {
+			read_lock(&sl_cur->lock);
+			if (dev_number < 0) {
+				/* Emit slink constructor parameters. */
+				p = &sl_cur->params;
+				DMEMIT("%s %u %u ",
+				       blockdev_type.type.name,
+				       p->count, sl_cur->number);
+
+				if (p->count > 1) {
+					snprint_policies(p->policy, buffer,
+							 sizeof(buffer));
+					DMEMIT("%s ", buffer);
+					snprint_fallbehind(&p->fallbehind,
+							   buffer,
+							   sizeof(buffer));
+					if (p->count > 2)
+						DMEMIT("%s ", buffer);
+				}
+			} else {
+				/* Emit the addressed device's table line. */
+				dev = dev_get_by_number(sl_cur, dev_number);
+				if (!IS_ERR(dev)) {
+					DMEMIT("%u %s ", sl_cur->number,
+					       snprint_device(sl_cur, dev, type,
+							      buffer,
+							      sizeof(buffer)));
+					BUG_ON(dev_put(dev));
+				}
+			}
+
+			read_unlock(&sl_cur->lock);
+		}
+	}
+
+	read_unlock(&sl->repl_slinks->lock);
+	return 0;
+}
+
+/*
+ * End methods of "blockdev" slink type.
+ */
+
+/* "blockdev" SLINK handler interface type. */
+static struct dm_repl_slink_type blockdev_type = {
+	.type.name = "blockdev",
+	.type.module = THIS_MODULE,
+
+	/* Constructor/destructor. */
+	.ctr = blockdev_ctr,
+	.dtr = blockdev_dtr,
+
+	/* Suspend/resume hooks. */
+	.postsuspend = blockdev_postsuspend,
+	.resume = blockdev_resume,
+
+	/* Device add/delete. */
+	.dev_add = blockdev_dev_add,
+	.dev_del = blockdev_dev_del,
+
+	/* I/O and (re)synchronization. */
+	.copy = blockdev_copy,
+	.io = blockdev_io,
+	.unplug = blockdev_unplug,
+	.recover_notify_fn_set = blockdev_recover_notify_fn_set,
+	.set_sync = blockdev_set_sync,
+	.flush_sync = blockdev_flush_sync,
+	.resync = blockdev_resync,
+	.in_sync = blockdev_in_sync,
+
+	/* Property accessors. */
+	.policy = blockdev_policy,
+	.state = blockdev_state,
+	.fallbehind = blockdev_fallbehind,
+	.dev_number = blockdev_dev_number,
+	.slink_number = blockdev_slink_number,
+	.slink = blockdev_slink,
+
+	/* Message and status interface. */
+	.message = blockdev_message,
+	.status = blockdev_status,
+};
+
+/* Destroy kmem caches on module unload. */
+static void
+slink_kmem_caches_exit(void)
+{
+	struct cache_defs *cd;
+
+	/* Walk the cache definition table from its end towards its start. */
+	for (cd = ARRAY_END(cache_defs); cd-- > cache_defs; ) {
+		if (!cd->cache)
+			continue;
+
+		kmem_cache_destroy(cd->cache);
+		cd->cache = NULL;
+	}
+}
+
+/* Create kmem caches on module load. */
+static int
+slink_kmem_caches_init(void)
+{
+	struct cache_defs *cd = ARRAY_END(cache_defs);
+
+	/* Same reverse walk as slink_kmem_caches_exit(). */
+	while (cd-- > cache_defs) {
+		cd->cache = kmem_cache_create(cd->name, cd->size, 0, 0, NULL);
+		if (likely(cd->cache))
+			continue;
+
+		DMERR("failed to create %s slab for site link "
+		      "handler %s %s",
+		      cd->name, blockdev_type.type.name, version);
+		/* Tear down whatever was created so far. */
+		slink_kmem_caches_exit();
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Module init: set up slabs, then register the slink handler type. */
+int __init
+dm_repl_slink_init(void)
+{
+	/* Create slabs for the copy contexts and test buffers. */
+	int r = slink_kmem_caches_init();
+
+	if (r) {
+		DMERR("failed to init %s kmem caches", blockdev_type.type.name);
+		return r;
+	}
+
+	r = dm_register_type(&blockdev_type, DM_SLINK);
+	if (unlikely(r < 0)) {
+		DMERR("failed to register replication site "
+		      "link handler %s %s [%d]",
+		      blockdev_type.type.name, version, r);
+		/* Registration failed: undo the slab setup. */
+		slink_kmem_caches_exit();
+		return r;
+	}
+
+	DMINFO("registered replication site link handler %s %s",
+	       blockdev_type.type.name, version);
+	return r;
+}
+
+/* Module exit: unregister the handler type and free the slabs. */
+void __exit
+dm_repl_slink_exit(void)
+{
+	int r = dm_unregister_type(&blockdev_type, DM_SLINK);
+
+	/* Slabs go away regardless of the unregister result. */
+	slink_kmem_caches_exit();
+
+	if (!r) {
+		DMINFO("unregistered replication site link handler %s %s",
+		       blockdev_type.type.name, version);
+		return;
+	}
+
+	DMERR("failed to unregister replication site "
+	      "link handler %s %s [%d]",
+	      blockdev_type.type.name, version, r);
+}
+
+/* Module hooks. */
+module_init(dm_repl_slink_init);
+module_exit(dm_repl_slink_exit);
+
+MODULE_DESCRIPTION(DM_NAME " remote replication target \"blockdev\" "
+ "site link (SLINK) handler");
+MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
+MODULE_LICENSE("GPL");