Message ID | 1611853955-32167-3-git-send-email-sergei.shtepa@veeam.com (mailing list archive) |
---|---|
State | Superseded, archived |
Delegated to: | Mike Snitzer |
Headers | show |
Series | block: blk_interposer v3 | expand |
On 2021/01/29 2:23, Sergei Shtepa wrote: > Implement a block interposer for device-mapper to attach > to an existing block layer stack. Using the interposer, > we can connect the dm-linear to a device with a mounted > file system. > > changes: > * the new dm_interposer structure contains blk_interposer > to intercept bio from the interposed disk and interval tree > of block devices on this disk. > * the new interval tree for device mapper. > * the dm_submit_bio_interposer_fn() function implements > the bio interception logic. > * the functions dm_interposer_attach_dev() & > dm_interposer_detach_dev() allow to attach and detach devices > to dm_interposer. > * the new parameter 'noexcl' allows to create dm-linear to device > with an already mounted file system. > * the non_exclusive parameter in dm_target structure - it`s a sign > that target device should be opened without FMODE_EXCL mode. > * the new ioctl IOCTL_DEV_REMAP allow to attach dm device to > a regular block device. Same comment about changelog as in the previous patch. > Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com> > --- > drivers/md/dm-core.h | 46 +++- > drivers/md/dm-ioctl.c | 39 ++++ > drivers/md/dm-linear.c | 17 +- > drivers/md/dm-table.c | 12 +- > drivers/md/dm.c | 383 ++++++++++++++++++++++++++++++++-- > drivers/md/dm.h | 2 +- > include/linux/device-mapper.h | 7 + > include/uapi/linux/dm-ioctl.h | 15 +- > 8 files changed, 493 insertions(+), 28 deletions(-) > > diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h > index 086d293c2b03..0f870b1d4be4 100644 > --- a/drivers/md/dm-core.h > +++ b/drivers/md/dm-core.h > @@ -13,7 +13,7 @@ > #include <linux/ktime.h> > #include <linux/genhd.h> > #include <linux/blk-mq.h> > - whiteline change. 
> +#include <linux/rbtree.h> > #include <trace/events/block.h> > > #include "dm.h" > @@ -109,6 +109,9 @@ struct mapped_device { > bool init_tio_pdu:1; > > struct srcu_struct io_barrier; > + > + /* interposer device for remap */ > + struct dm_interposed_dev *ip_dev; > }; > > void disable_discard(struct mapped_device *md); > @@ -164,6 +167,47 @@ struct dm_table { > struct dm_md_mempools *mempools; > }; > > +/* > + * Interval tree for device mapper > + */ > +struct dm_rb_range { > + struct rb_node node; > + sector_t start; /* start sector of rb node */ > + sector_t last; /* end sector of rb node */ > + sector_t _subtree_last; /* highest sector in subtree of rb node */ > +}; > + > +void dm_rb_insert(struct dm_rb_range *node, struct rb_root_cached *root); > +void dm_rb_remove(struct dm_rb_range *node, struct rb_root_cached *root); > + > +struct dm_rb_range *dm_rb_iter_first(struct rb_root_cached *root, sector_t start, sector_t last); > +struct dm_rb_range *dm_rb_iter_next(struct dm_rb_range *node, sector_t start, sector_t last); > + > +/* > + * For connecting blk_interposer and dm-targets devices. Is this comment about the callback or the structure ? I think the latter, so it is in the wrong place. Please also add a comment for the callback definition explaining what it should be doing. 
> + */ > +typedef void (*dm_interpose_bio_t) (void *context, struct dm_rb_range *node, struct bio *bio); > + > +struct dm_interposed_dev { > + struct gendisk *disk; > + struct dm_rb_range node; > + void *context; > + dm_interpose_bio_t dm_interpose_bio; > + > + atomic64_t ip_cnt; /*for debug purpose*/ > +}; > + > +struct dm_interposed_dev *dm_interposer_new_dev(struct gendisk *disk, > + sector_t ofs, sector_t len, > + void *context, > + dm_interpose_bio_t dm_interpose_bio_t); > +void dm_interposer_free_dev(struct dm_interposed_dev *ip_dev); > +int dm_interposer_attach_dev(struct dm_interposed_dev *ip_dev); > +int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev); > + > +int dm_remap_install(struct mapped_device *md, const char *donor_device_name); > +int dm_remap_uninstall(struct mapped_device *md); > + > static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) > { > return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; > diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c > index 5e306bba4375..2944d442c256 100644 > --- a/drivers/md/dm-ioctl.c > +++ b/drivers/md/dm-ioctl.c > @@ -1649,6 +1649,44 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para > return r; > } > > +static inline int dev_remap_start(struct mapped_device *md, uint8_t *params) > +{ > + char *donor_device_name = (char *)params; > + > + return dm_remap_install(md, donor_device_name); > +} > +static int dev_remap_finish(struct mapped_device *md) > +{ > + return dm_remap_uninstall(md); > +} > + > +static int dev_remap(struct file *filp, struct dm_ioctl *param, size_t param_size) > +{ > + int ret = 0; > + struct mapped_device *md; > + void *bin_data; > + struct dm_remap_param *remap_param; > + > + md = find_device(param); > + if (!md) > + return -ENXIO; > + > + bin_data = (void *)(param) + param->data_start; > + remap_param = bin_data; > + > + if (remap_param->cmd == REMAP_START_CMD) > + ret = 
dev_remap_start(md, remap_param->params); > + else if (remap_param->cmd == REMAP_FINISH_CMD) > + ret = dev_remap_finish(md); > + else { > + DMWARN("Invalid remap command, %d", remap_param->cmd); > + ret = -EINVAL; > + } > + > + dm_put(md); > + return ret; > +} > + > /* > * The ioctl parameter block consists of two parts, a dm_ioctl struct > * followed by a data buffer. This flag is set if the second part, > @@ -1691,6 +1729,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) > {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, > {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, > {DM_GET_TARGET_VERSION, 0, get_target_version}, > + {DM_DEV_REMAP_CMD, 0, dev_remap}, > }; > > if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) > diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c > index 00774b5d7668..ffb8b5ca4d10 100644 > --- a/drivers/md/dm-linear.c > +++ b/drivers/md/dm-linear.c > @@ -28,12 +28,13 @@ struct linear_c { > */ > static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) > { > + fmode_t mode; > struct linear_c *lc; > unsigned long long tmp; > char dummy; > int ret; > > - if (argc != 2) { > + if ((argc < 2) || (argc > 3)) { > ti->error = "Invalid argument count"; > return -EINVAL; > } > @@ -51,7 +52,19 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) > } > lc->start = tmp; > > - ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev); > + ti->non_exclusive = false; > + if (argc > 2) { > + if (strcmp("noexcl", argv[2]) == 0) > + ti->non_exclusive = true; > + else if (strcmp("excl", argv[2]) == 0) > + ti->non_exclusive = false; It already is false. > + else { > + ti->error = "Invalid exclusive option"; > + return -EINVAL; > + } > + } > + > + ret = dm_get_device(ti, argv[0], mode, &lc->dev); Where is mode initialized ? Why remove dm_table_get_mode(ti->table) ? 
> if (ret) { > ti->error = "Device lookup failed"; > goto bad; I would prefer to see this change to dm-linear in its own patch, following this one, with a clear explanation in the commit message how this change relates to interposer since the explanation for this "exclusive" change is nowhere to be seen. Also please check if there is a file describing dm-linear options under Documentation/ (I can't remember if there is one). If there is one, it will need to be updated too. > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c > index 4acf2342f7ad..f15bc2171f25 100644 > --- a/drivers/md/dm-table.c > +++ b/drivers/md/dm-table.c > @@ -322,7 +322,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, > * device and not to touch the existing bdev field in case > * it is accessed concurrently. > */ > -static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, > +static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, bool non_exclusive, > struct mapped_device *md) > { > int r; > @@ -330,7 +330,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, > > old_dev = dd->dm_dev; > > - r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, > + r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, non_exclusive, > dd->dm_dev->mode | new_mode, &new_dev); > if (r) > return r; > @@ -387,7 +387,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, > if (!dd) > return -ENOMEM; > > - if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { > + r = dm_get_table_device(t->md, dev, mode, ti->non_exclusive, &dd->dm_dev); > + if (r) { > kfree(dd); > return r; > } > @@ -396,8 +397,9 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, > list_add(&dd->list, &t->devices); > goto out; > > - } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { > - r = upgrade_mode(dd, mode, t->md); > + } else if ((dd->dm_dev->mode != (mode | dd->dm_dev->mode)) && > + 
(dd->dm_dev->non_exclusive != ti->non_exclusive)) { > + r = upgrade_mode(dd, mode, ti->non_exclusive, t->md); > if (r) > return r; > } > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > index 7bac564f3faa..3b871d98b7b6 100644 > --- a/drivers/md/dm.c > +++ b/drivers/md/dm.c > @@ -28,6 +28,7 @@ > #include <linux/refcount.h> > #include <linux/part_stat.h> > #include <linux/blk-crypto.h> > +#include <linux/interval_tree_generic.h> > > #define DM_MSG_PREFIX "core" > > @@ -56,6 +57,8 @@ static struct workqueue_struct *deferred_remove_workqueue; > atomic_t dm_global_event_nr = ATOMIC_INIT(0); > DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); > > +static DEFINE_MUTEX(interposer_mutex); /* synchronizing access to blk_interposer */ Why not dm_interposer_mutex as the name ? And the comment is not very useful: a mutex is always for synchronizing :) > + > void dm_issue_global_event(void) > { > atomic_inc(&dm_global_event_nr); > @@ -162,6 +165,26 @@ struct table_device { > struct dm_dev dm_dev; > }; > > +/* > + * Device mapper`s interposer. > + */ > +struct dm_interposer { > + struct blk_interposer blk_ip; > + struct mapped_device *md; > + > + struct kref kref; > + struct rw_semaphore ip_devs_lock; > + struct rb_root_cached ip_devs_root; /* dm_interposed_dev tree */ > +}; > + > +/* > + * Interval tree for device mapper > + */ > +#define START(node) ((node)->start) > +#define LAST(node) ((node)->last) > +INTERVAL_TREE_DEFINE(struct dm_rb_range, node, sector_t, _subtree_last, > + START, LAST,, dm_rb); > + > /* > * Bio-based DM's mempools' reserved IOs set by the user. 
> */ > @@ -733,28 +756,340 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) > rcu_read_unlock(); > } > > +static void dm_submit_bio_interposer_fn(struct bio *bio) > +{ > + struct dm_interposer *ip; > + unsigned int noio_flag = 0; > + sector_t start; > + sector_t last; > + struct dm_rb_range *node; > + > + ip = container_of(bio->bi_disk->interposer, struct dm_interposer, blk_ip); > + start = bio->bi_iter.bi_sector; > + last = start + dm_sector_div_up(bio->bi_iter.bi_size, SECTOR_SIZE); > + > + noio_flag = memalloc_noio_save(); > + down_read(&ip->ip_devs_lock); > + node = dm_rb_iter_first(&ip->ip_devs_root, start, last); > + while (node) { > + struct dm_interposed_dev *ip_dev = > + container_of(node, struct dm_interposed_dev, node); > + > + atomic64_inc(&ip_dev->ip_cnt); > + ip_dev->dm_interpose_bio(ip_dev->context, node, bio); > + > + node = dm_rb_iter_next(node, start, last); > + } > + up_read(&ip->ip_devs_lock); > + memalloc_noio_restore(noio_flag); > +} > + > +static void free_interposer(struct kref *kref) > +{ > + struct dm_interposer *ip = container_of(kref, struct dm_interposer, kref); > + > + blk_interposer_detach(&ip->blk_ip, dm_submit_bio_interposer_fn); No queue freeze ? > + > + kfree(ip); > +} > + > +static struct dm_interposer *new_interposer(struct gendisk *disk) > +{ > + int ret = 0; > + struct dm_interposer *ip; > + > + ip = kzalloc(sizeof(struct dm_interposer), GFP_NOIO); > + if (!ip) > + return ERR_PTR(-ENOMEM); > + > + kref_init(&ip->kref); > + init_rwsem(&ip->ip_devs_lock); > + ip->ip_devs_root = RB_ROOT_CACHED; > + > + ret = blk_interposer_attach(disk, &ip->blk_ip, dm_submit_bio_interposer_fn); No queue freeze ? 
> + if (ret) { > + DMERR("Failed to attack blk_interposer"); > + kref_put(&ip->kref, free_interposer); > + return ERR_PTR(ret); > + } > + > + return ip; > +} > + > +static struct dm_interposer *get_interposer(struct gendisk *disk) > +{ > + struct dm_interposer *ip; > + > + if (!blk_has_interposer(disk)) > + return NULL; > + > + if (disk->interposer->ip_submit_bio != dm_submit_bio_interposer_fn) { > + DMERR("Disks interposer slot already occupied."); > + return ERR_PTR(-EBUSY); This is weird... If there is an interposer, why not get a ref on that one. That is what the function name suggests at least. > + } > + > + ip = container_of(disk->interposer, struct dm_interposer, blk_ip); > + > + kref_get(&ip->kref); > + return ip; > +} > + > +struct dm_interposed_dev *dm_interposer_new_dev(struct gendisk *disk, sector_t ofs, sector_t len, > + void *context, dm_interpose_bio_t dm_interpose_bio) > +{ > + sector_t start = ofs; > + sector_t last = ofs + len - 1; > + struct dm_interposed_dev *ip_dev = NULL; > + > + /* Allocate new ip_dev */ > + ip_dev = kzalloc(sizeof(struct dm_interposed_dev), GFP_KERNEL); > + if (!ip_dev) > + return NULL; > + > + ip_dev->disk = disk; > + ip_dev->node.start = start; > + ip_dev->node.last = last; > + > + ip_dev->context = context; > + ip_dev->dm_interpose_bio = dm_interpose_bio; > + > + atomic64_set(&ip_dev->ip_cnt, 0); > + > + return ip_dev; > +} > + > +void dm_interposer_free_dev(struct dm_interposed_dev *ip_dev) > +{ > + kfree(ip_dev); > +} Make this inline may be ? > + > +static inline void dm_disk_freeze(struct gendisk *disk) > +{ > + blk_mq_freeze_queue(disk->queue); > + blk_mq_quiesce_queue(disk->queue); I think you can replace this with blk_mq_freeze_queue_wait(). 
> +} > + > +static inline void dm_disk_unfreeze(struct gendisk *disk) > +{ > + blk_mq_unquiesce_queue(disk->queue); > + blk_mq_unfreeze_queue(disk->queue); > +} > + > +int dm_interposer_attach_dev(struct dm_interposed_dev *ip_dev) > +{ > + int ret = 0; > + struct dm_interposer *ip = NULL; > + unsigned int noio_flag = 0; > + > + if (!ip_dev) > + return -EINVAL; > + > + dm_disk_freeze(ip_dev->disk); > + mutex_lock(&interposer_mutex); > + noio_flag = memalloc_noio_save(); > + > + ip = get_interposer(ip_dev->disk); > + if (ip == NULL) > + ip = new_interposer(ip_dev->disk); > + if (IS_ERR(ip)) { > + ret = PTR_ERR(ip); > + goto out; > + } > + > + /* Attach dm_interposed_dev to dm_interposer */ > + down_write(&ip->ip_devs_lock); > + do { > + struct dm_rb_range *node; > + > + /* checking that ip_dev already exists for this region */ > + node = dm_rb_iter_first(&ip->ip_devs_root, ip_dev->node.start, ip_dev->node.last); > + if (node) { > + DMERR("Disk part form [%llu] to [%llu] already have interposer", > + node->start, node->last); > + > + ret = -EBUSY; > + break; > + } > + > + /* insert ip_dev to ip tree */ > + dm_rb_insert(&ip_dev->node, &ip->ip_devs_root); > + /* increment ip reference counter */ > + kref_get(&ip->kref); > + } while (false); > + up_write(&ip->ip_devs_lock); > + > + kref_put(&ip->kref, free_interposer); > + > +out: > + memalloc_noio_restore(noio_flag); > + mutex_unlock(&interposer_mutex); > + dm_disk_unfreeze(ip_dev->disk); > + > + return ret; > +} > + > +int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev) > +{ > + int ret = 0; > + struct dm_interposer *ip = NULL; > + unsigned int noio_flag = 0; > + > + if (!ip_dev) > + return -EINVAL; > + > + dm_disk_freeze(ip_dev->disk); > + mutex_lock(&interposer_mutex); > + noio_flag = memalloc_noio_save(); > + > + ip = get_interposer(ip_dev->disk); > + if (IS_ERR(ip)) { > + ret = PTR_ERR(ip); > + DMERR("Interposer not found"); > + goto out; > + } > + if (unlikely(ip == NULL)) { > + ret = -ENXIO; > + 
DMERR("Interposer not found"); > + goto out; > + } > + > + down_write(&ip->ip_devs_lock); > + do { > + dm_rb_remove(&ip_dev->node, &ip->ip_devs_root); > + /* the reference counter here cannot be zero */ > + kref_put(&ip->kref, free_interposer); > + > + } while (false); > + up_write(&ip->ip_devs_lock); > + > + /* detach and free interposer if it`s not needed */ s/`/'/ > + kref_put(&ip->kref, free_interposer); > +out: > + memalloc_noio_restore(noio_flag); > + mutex_unlock(&interposer_mutex); > + dm_disk_unfreeze(ip_dev->disk); > + > + return ret; > +} > + > +static void dm_remap_fn(void *context, struct dm_rb_range *node, struct bio *bio) > +{ > + struct mapped_device *md = context; > + > + /* Set acceptor device. */ > + bio->bi_disk = md->disk; > + > + /* Remap disks offset */ > + bio->bi_iter.bi_sector -= node->start; > + > + /* > + * bio should be resubmitted. > + * We can just add bio to bio_list of the current process. > + * current->bio_list must be initialized when this function is called. > + * If call submit_bio_noacct(), the bio will be checked twice. 
> */ > + BUG_ON(!current->bio_list); > + bio_list_add(&current->bio_list[0], bio); > +} > + > +int dm_remap_install(struct mapped_device *md, const char *donor_device_name) > +{ > + int ret = 0; > + struct block_device *donor_bdev; > + fmode_t mode = FMODE_READ | FMODE_WRITE; > + > + DMDEBUG("Dm remap install for mapped device %s and donor device %s", > + md->name, donor_device_name); > + > + donor_bdev = blkdev_get_by_path(donor_device_name, mode, "device-mapper remap"); > + if (IS_ERR(donor_bdev)) { > + DMERR("Cannot open device [%s]", donor_device_name); > + return PTR_ERR(donor_bdev); > + } > + > + do { > + sector_t ofs = get_start_sect(donor_bdev); > + sector_t len = bdev_nr_sectors(donor_bdev); > + > + md->ip_dev = dm_interposer_new_dev(donor_bdev->bd_disk, ofs, len, md, dm_remap_fn); > + if (!md->ip_dev) { > + ret = -ENOMEM; > + break; > + } > + > + DMDEBUG("New interposed device 0x%p", md->ip_dev); > + ret = dm_interposer_attach_dev(md->ip_dev); > + if (ret) { > + dm_interposer_free_dev(md->ip_dev); > + > + md->ip_dev = NULL; > + DMERR("Failed to attach dm interposer"); > + break; > + } > + > + DMDEBUG("Attached successfully."); > + } while (false); > + > + blkdev_put(donor_bdev, mode); > + > + return ret; > +} > + > +int dm_remap_uninstall(struct mapped_device *md) > +{ > + int ret = 0; > + > + DMDEBUG("Dm remap uninstall for mapped device %s ip_dev=0x%p", md->name, md->ip_dev); > + > + if (!md->ip_dev) { > + DMERR("Cannot detach dm interposer"); > + return -EINVAL; > + } > + > + ret = dm_interposer_detach_dev(md->ip_dev); > + if (ret) { > + DMERR("Failed to detach dm interposer"); > + return ret; > + } > + > + DMDEBUG("Detached successfully. %llu bios was interposed", > + atomic64_read(&md->ip_dev->ip_cnt)); > + dm_interposer_free_dev(md->ip_dev); > + md->ip_dev = NULL; > + > + return 0; > +} > + > static char *_dm_claim_ptr = "I belong to device-mapper"; > > /* > * Open a table device so we can use it as a map destination. 
> */ > static int open_table_device(struct table_device *td, dev_t dev, > - struct mapped_device *md) > + struct mapped_device *md, bool non_exclusive) > { > struct block_device *bdev; > - > - int r; > + int ret; > > BUG_ON(td->dm_dev.bdev); > > - bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); > - if (IS_ERR(bdev)) > - return PTR_ERR(bdev); > + if (non_exclusive) > + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL); > + else > + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); > > - r = bd_link_disk_holder(bdev, dm_disk(md)); > - if (r) { > - blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); > - return r; > + if (IS_ERR(bdev)) { > + ret = PTR_ERR(bdev); > + if (ret != -EBUSY) > + return ret; > + } > + > + if (!non_exclusive) { > + ret = bd_link_disk_holder(bdev, dm_disk(md)); > + if (ret) { > + blkdev_put(bdev, td->dm_dev.mode); > + return ret; > + } > } > > td->dm_dev.bdev = bdev; > @@ -770,33 +1105,38 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > if (!td->dm_dev.bdev) > return; > > - bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); > - blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); > + if (td->dm_dev.mode & FMODE_EXCL) > + bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); > + > + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode); > + > put_dax(td->dm_dev.dax_dev); > td->dm_dev.bdev = NULL; > td->dm_dev.dax_dev = NULL; > } > > static struct table_device *find_table_device(struct list_head *l, dev_t dev, > - fmode_t mode) > + fmode_t mode, bool non_exclusive) > { > struct table_device *td; > > list_for_each_entry(td, l, list) > - if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) > + if (td->dm_dev.bdev->bd_dev == dev && > + td->dm_dev.mode == mode && > + td->dm_dev.non_exclusive == non_exclusive) > return td; > > return NULL; > } > > -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, > +int dm_get_table_device(struct 
mapped_device *md, dev_t dev, fmode_t mode, bool non_exclusive, > struct dm_dev **result) > { > int r; > struct table_device *td; > > mutex_lock(&md->table_devices_lock); > - td = find_table_device(&md->table_devices, dev, mode); > + td = find_table_device(&md->table_devices, dev, mode, non_exclusive); > if (!td) { > td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); > if (!td) { > @@ -807,7 +1147,8 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, > td->dm_dev.mode = mode; > td->dm_dev.bdev = NULL; > > - if ((r = open_table_device(td, dev, md))) { > + r = open_table_device(td, dev, md, non_exclusive); > + if (r) { > mutex_unlock(&md->table_devices_lock); > kfree(td); > return r; > @@ -2182,6 +2523,14 @@ static void __dm_destroy(struct mapped_device *md, bool wait) > > might_sleep(); > > + if (md->ip_dev) { > + if (dm_interposer_detach_dev(md->ip_dev)) > + DMERR("Failed to detach dm interposer"); > + > + dm_interposer_free_dev(md->ip_dev); > + md->ip_dev = NULL; > + } > + > spin_lock(&_minor_lock); > idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); > set_bit(DMF_FREEING, &md->flags); > diff --git a/drivers/md/dm.h b/drivers/md/dm.h > index fffe1e289c53..7bf20fb2de74 100644 > --- a/drivers/md/dm.h > +++ b/drivers/md/dm.h > @@ -179,7 +179,7 @@ int dm_open_count(struct mapped_device *md); > int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); > int dm_cancel_deferred_remove(struct mapped_device *md); > int dm_request_based(struct mapped_device *md); > -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, > +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, bool non_exclusive, > struct dm_dev **result); > void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); > > diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h > index 61a66fb8ebb3..70002363bfc0 100644 > --- 
a/include/linux/device-mapper.h > +++ b/include/linux/device-mapper.h > @@ -150,6 +150,7 @@ struct dm_dev { > struct block_device *bdev; > struct dax_device *dax_dev; > fmode_t mode; > + bool non_exclusive; > char name[16]; > }; > > @@ -325,6 +326,12 @@ struct dm_target { > * whether or not its underlying devices have support. > */ > bool discards_supported:1; > + > + /* > + * Set if this target needs to open device without FMODE_EXCL > + * mode. > + */ > + bool non_exclusive:1; > }; > > void *dm_per_bio_data(struct bio *bio, size_t data_size); > diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h > index 4933b6b67b85..08d7dbff80f4 100644 > --- a/include/uapi/linux/dm-ioctl.h > +++ b/include/uapi/linux/dm-ioctl.h > @@ -214,6 +214,15 @@ struct dm_target_msg { > char message[0]; > }; > > +enum { > + REMAP_START_CMD = 1, > + REMAP_FINISH_CMD, > +}; > + > +struct dm_remap_param { > + uint8_t cmd; > + uint8_t params[0]; > +}; > /* > * If you change this make sure you make the corresponding change > * to dm-ioctl.c:lookup_ioctl() > @@ -244,6 +253,7 @@ enum { > DM_DEV_SET_GEOMETRY_CMD, > DM_DEV_ARM_POLL_CMD, > DM_GET_TARGET_VERSION_CMD, > + DM_DEV_REMAP_CMD > }; > > #define DM_IOCTL 0xfd > @@ -259,6 +269,7 @@ enum { > #define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) > #define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) > #define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_ARM_POLL_CMD, struct dm_ioctl) > +#define DM_DEV_REMAP _IOWR(DM_IOCTL, DM_DEV_REMAP_CMD, struct dm_ioctl) > > #define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) > #define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) > @@ -272,9 +283,9 @@ enum { > #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) > > #define DM_VERSION_MAJOR 4 > -#define DM_VERSION_MINOR 43 > +#define DM_VERSION_MINOR 44 > #define DM_VERSION_PATCHLEVEL 0 > -#define DM_VERSION_EXTRA "-ioctl 
(2020-10-01)" > +#define DM_VERSION_EXTRA "-ioctl (2020-12-25)" > > /* Status bits */ > #define DM_READONLY_FLAG (1 << 0) /* In/Out */ >
The 01/29/2021 04:46, Damien Le Moal wrote: > On 2021/01/29 2:23, Sergei Shtepa wrote: > > Implement a block interposer for device-mapper to attach > > to an existing block layer stack. Using the interposer, > > we can connect the dm-linear to a device with a mounted > > file system. > > > > changes: > > * the new dm_interposer structure contains blk_interposer > > to intercept bio from the interposed disk and interval tree > > of block devices on this disk. > > * the new interval tree for device mapper. > > * the dm_submit_bio_interposer_fn() function implements > > the bio interception logic. > > * the functions dm_interposer_attach_dev() & > > dm_interposer_detach_dev() allow to attach and detach devices > > to dm_interposer. > > * the new parameter 'noexcl' allows to create dm-linear to device > > with an already mounted file system. > > * the non_exclusive parameter in dm_target structure - it`s a sign > > that target device should be opened without FMODE_EXCL mode. > > * the new ioctl IOCTL_DEV_REMAP allow to attach dm device to > > a regular block device. > > Same comment about changelog as in the previous patch. > > > Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com> > > --- > > drivers/md/dm-core.h | 46 +++- > > drivers/md/dm-ioctl.c | 39 ++++ > > drivers/md/dm-linear.c | 17 +- > > drivers/md/dm-table.c | 12 +- > > drivers/md/dm.c | 383 ++++++++++++++++++++++++++++++++-- > > drivers/md/dm.h | 2 +- > > include/linux/device-mapper.h | 7 + > > include/uapi/linux/dm-ioctl.h | 15 +- > > 8 files changed, 493 insertions(+), 28 deletions(-) > > > > diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h > > index 086d293c2b03..0f870b1d4be4 100644 > > --- a/drivers/md/dm-core.h > > +++ b/drivers/md/dm-core.h > > @@ -13,7 +13,7 @@ > > #include <linux/ktime.h> > > #include <linux/genhd.h> > > #include <linux/blk-mq.h> > > - > > whiteline change. 
> > > +#include <linux/rbtree.h> > > #include <trace/events/block.h> I don't see any problem in the fact that a new include appeared instead of whiteline. It doesn't make sense to split the include section by whiteline. > > > > #include "dm.h" > > @@ -109,6 +109,9 @@ struct mapped_device { > > bool init_tio_pdu:1; > > > > struct srcu_struct io_barrier; > > + > > + /* interposer device for remap */ > > + struct dm_interposed_dev *ip_dev; > > }; > > > > void disable_discard(struct mapped_device *md); > > @@ -164,6 +167,47 @@ struct dm_table { > > struct dm_md_mempools *mempools; > > }; > > > > +/* > > + * Interval tree for device mapper > > + */ > > +struct dm_rb_range { > > + struct rb_node node; > > + sector_t start; /* start sector of rb node */ > > + sector_t last; /* end sector of rb node */ > > + sector_t _subtree_last; /* highest sector in subtree of rb node */ > > +}; > > + > > +void dm_rb_insert(struct dm_rb_range *node, struct rb_root_cached *root); > > +void dm_rb_remove(struct dm_rb_range *node, struct rb_root_cached *root); > > + > > +struct dm_rb_range *dm_rb_iter_first(struct rb_root_cached *root, sector_t start, sector_t last); > > +struct dm_rb_range *dm_rb_iter_next(struct dm_rb_range *node, sector_t start, sector_t last); > > + > > +/* > > + * For connecting blk_interposer and dm-targets devices. > > Is this comment about the callback or the structure ? I think the latter, so it > is in the wrong place. Please also add a comment for the callback definition > explaining what it should be doing. Ok. 
> > > + */ > > +typedef void (*dm_interpose_bio_t) (void *context, struct dm_rb_range *node, struct bio *bio); > > + > > +struct dm_interposed_dev { > > + struct gendisk *disk; > > + struct dm_rb_range node; > > + void *context; > > + dm_interpose_bio_t dm_interpose_bio; > > + > > + atomic64_t ip_cnt; /*for debug purpose*/ > > +}; > > + > > +struct dm_interposed_dev *dm_interposer_new_dev(struct gendisk *disk, > > + sector_t ofs, sector_t len, > > + void *context, > > + dm_interpose_bio_t dm_interpose_bio_t); > > +void dm_interposer_free_dev(struct dm_interposed_dev *ip_dev); > > +int dm_interposer_attach_dev(struct dm_interposed_dev *ip_dev); > > +int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev); > > + > > +int dm_remap_install(struct mapped_device *md, const char *donor_device_name); > > +int dm_remap_uninstall(struct mapped_device *md); > > + > > static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) > > { > > return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; > > diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c > > index 5e306bba4375..2944d442c256 100644 > > --- a/drivers/md/dm-ioctl.c > > +++ b/drivers/md/dm-ioctl.c > > @@ -1649,6 +1649,44 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para > > return r; > > } > > > > +static inline int dev_remap_start(struct mapped_device *md, uint8_t *params) > > +{ > > + char *donor_device_name = (char *)params; > > + > > + return dm_remap_install(md, donor_device_name); > > +} > > +static int dev_remap_finish(struct mapped_device *md) > > +{ > > + return dm_remap_uninstall(md); > > +} > > + > > +static int dev_remap(struct file *filp, struct dm_ioctl *param, size_t param_size) > > +{ > > + int ret = 0; > > + struct mapped_device *md; > > + void *bin_data; > > + struct dm_remap_param *remap_param; > > + > > + md = find_device(param); > > + if (!md) > > + return -ENXIO; > > + > > + bin_data = (void *)(param) + 
param->data_start; > > + remap_param = bin_data; > > + > > + if (remap_param->cmd == REMAP_START_CMD) > > + ret = dev_remap_start(md, remap_param->params); > > + else if (remap_param->cmd == REMAP_FINISH_CMD) > > + ret = dev_remap_finish(md); > > + else { > > + DMWARN("Invalid remap command, %d", remap_param->cmd); > > + ret = -EINVAL; > > + } > > + > > + dm_put(md); > > + return ret; > > +} > > + > > /* > > * The ioctl parameter block consists of two parts, a dm_ioctl struct > > * followed by a data buffer. This flag is set if the second part, > > @@ -1691,6 +1729,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) > > {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, > > {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, > > {DM_GET_TARGET_VERSION, 0, get_target_version}, > > + {DM_DEV_REMAP_CMD, 0, dev_remap}, > > }; > > > > if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) > > diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c > > index 00774b5d7668..ffb8b5ca4d10 100644 > > --- a/drivers/md/dm-linear.c > > +++ b/drivers/md/dm-linear.c > > @@ -28,12 +28,13 @@ struct linear_c { > > */ > > static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) > > { > > + fmode_t mode; > > struct linear_c *lc; > > unsigned long long tmp; > > char dummy; > > int ret; > > > > - if (argc != 2) { > > + if ((argc < 2) || (argc > 3)) { > > ti->error = "Invalid argument count"; > > return -EINVAL; > > } > > @@ -51,7 +52,19 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) > > } > > lc->start = tmp; > > > > - ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev); > > + ti->non_exclusive = false; > > + if (argc > 2) { > > + if (strcmp("noexcl", argv[2]) == 0) > > + ti->non_exclusive = true; > > + else if (strcmp("excl", argv[2]) == 0) > > + ti->non_exclusive = false; > > It already is false. Yes, and even the value of the "excl" parameter is redundant, since it defines the default value. 
I think this code structure more clearly reflects the meaning of the parameter. > > > + else { > > + ti->error = "Invalid exclusive option"; > > + return -EINVAL; > > + } > > + } > > + > > + ret = dm_get_device(ti, argv[0], mode, &lc->dev); > > Where is mode initialized ? Why remove dm_table_get_mode(ti->table) ? Yes. It's a bug. dm_table_get_mode() should be used in this place. > > > if (ret) { > > ti->error = "Device lookup failed"; > > goto bad; > > I would prefer to see this change to dm-linear in its own patch, following this > one, with a clear explanation in the commit message how this change relates to > interposer since the explanation for this "exclusive" change is nowhere to be > seen. Also please check if there is a file describing dm-linear options under > Documentation/ (I can't remember if there is one). If there is one, it will need > to be updated too. It's a good idea. > > > diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c > > index 4acf2342f7ad..f15bc2171f25 100644 > > --- a/drivers/md/dm-table.c > > +++ b/drivers/md/dm-table.c > > @@ -322,7 +322,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, > > * device and not to touch the existing bdev field in case > > * it is accessed concurrently. 
> > */ > > -static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, > > +static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, bool non_exclusive, > > struct mapped_device *md) > > { > > int r; > > @@ -330,7 +330,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, > > > > old_dev = dd->dm_dev; > > > > - r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, > > + r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, non_exclusive, > > dd->dm_dev->mode | new_mode, &new_dev); > > if (r) > > return r; > > @@ -387,7 +387,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, > > if (!dd) > > return -ENOMEM; > > > > - if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { > > + r = dm_get_table_device(t->md, dev, mode, ti->non_exclusive, &dd->dm_dev); > > + if (r) { > > kfree(dd); > > return r; > > } > > @@ -396,8 +397,9 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, > > list_add(&dd->list, &t->devices); > > goto out; > > > > - } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { > > - r = upgrade_mode(dd, mode, t->md); > > + } else if ((dd->dm_dev->mode != (mode | dd->dm_dev->mode)) && > > + (dd->dm_dev->non_exclusive != ti->non_exclusive)) { > > + r = upgrade_mode(dd, mode, ti->non_exclusive, t->md); > > if (r) > > return r; > > } > > diff --git a/drivers/md/dm.c b/drivers/md/dm.c > > index 7bac564f3faa..3b871d98b7b6 100644 > > --- a/drivers/md/dm.c > > +++ b/drivers/md/dm.c > > @@ -28,6 +28,7 @@ > > #include <linux/refcount.h> > > #include <linux/part_stat.h> > > #include <linux/blk-crypto.h> > > +#include <linux/interval_tree_generic.h> > > > > #define DM_MSG_PREFIX "core" > > > > @@ -56,6 +57,8 @@ static struct workqueue_struct *deferred_remove_workqueue; > > atomic_t dm_global_event_nr = ATOMIC_INIT(0); > > DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); > > > > +static DEFINE_MUTEX(interposer_mutex); /* synchronizing access to blk_interposer */ > > Why 
not dm_interposer_mutex as the name ? And the comment is not very useful: a > mutex is always for synchronizing :) Right. I'll do it. > > > + > > void dm_issue_global_event(void) > > { > > atomic_inc(&dm_global_event_nr); > > @@ -162,6 +165,26 @@ struct table_device { > > struct dm_dev dm_dev; > > }; > > > > +/* > > + * Device mapper`s interposer. > > + */ > > +struct dm_interposer { > > + struct blk_interposer blk_ip; > > + struct mapped_device *md; > > + > > + struct kref kref; > > + struct rw_semaphore ip_devs_lock; > > + struct rb_root_cached ip_devs_root; /* dm_interposed_dev tree */ > > +}; > > + > > +/* > > + * Interval tree for device mapper > > + */ > > +#define START(node) ((node)->start) > > +#define LAST(node) ((node)->last) > > +INTERVAL_TREE_DEFINE(struct dm_rb_range, node, sector_t, _subtree_last, > > + START, LAST,, dm_rb); > > + > > /* > > * Bio-based DM's mempools' reserved IOs set by the user. > > */ > > @@ -733,28 +756,340 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) > > rcu_read_unlock(); > > } > > > > +static void dm_submit_bio_interposer_fn(struct bio *bio) > > +{ > > + struct dm_interposer *ip; > > + unsigned int noio_flag = 0; > > + sector_t start; > > + sector_t last; > > + struct dm_rb_range *node; > > + > > + ip = container_of(bio->bi_disk->interposer, struct dm_interposer, blk_ip); > > + start = bio->bi_iter.bi_sector; > > + last = start + dm_sector_div_up(bio->bi_iter.bi_size, SECTOR_SIZE); > > + > > + noio_flag = memalloc_noio_save(); > > + down_read(&ip->ip_devs_lock); > > + node = dm_rb_iter_first(&ip->ip_devs_root, start, last); > > + while (node) { > > + struct dm_interposed_dev *ip_dev = > > + container_of(node, struct dm_interposed_dev, node); > > + > > + atomic64_inc(&ip_dev->ip_cnt); > > + ip_dev->dm_interpose_bio(ip_dev->context, node, bio); > > + > > + node = dm_rb_iter_next(node, start, last); > > + } > > + up_read(&ip->ip_devs_lock); > > + memalloc_noio_restore(noio_flag); > > +} > > + > 
> +static void free_interposer(struct kref *kref) > > +{ > > + struct dm_interposer *ip = container_of(kref, struct dm_interposer, kref); > > + > > + blk_interposer_detach(&ip->blk_ip, dm_submit_bio_interposer_fn); > > No queue freeze ? Yes. The queue should be already freeze. > > > + > > + kfree(ip); > > +} > > + > > +static struct dm_interposer *new_interposer(struct gendisk *disk) > > +{ > > + int ret = 0; > > + struct dm_interposer *ip; > > + > > + ip = kzalloc(sizeof(struct dm_interposer), GFP_NOIO); > > + if (!ip) > > + return ERR_PTR(-ENOMEM); > > + > > + kref_init(&ip->kref); > > + init_rwsem(&ip->ip_devs_lock); > > + ip->ip_devs_root = RB_ROOT_CACHED; > > + > > + ret = blk_interposer_attach(disk, &ip->blk_ip, dm_submit_bio_interposer_fn); > > No queue freeze ? Yes, again. > > > + if (ret) { > > + DMERR("Failed to attack blk_interposer"); > > + kref_put(&ip->kref, free_interposer); > > + return ERR_PTR(ret); > > + } > > + > > + return ip; > > +} > > + > > +static struct dm_interposer *get_interposer(struct gendisk *disk) > > +{ > > + struct dm_interposer *ip; > > + > > + if (!blk_has_interposer(disk)) > > + return NULL; > > + > > + if (disk->interposer->ip_submit_bio != dm_submit_bio_interposer_fn) { > > + DMERR("Disks interposer slot already occupied."); > > + return ERR_PTR(-EBUSY); > > This is weird... If there is an interposer, why not get a ref on that one. That > is what the function name suggests at least. Getting a ref on that just below in this function. But the name "get_dm_interposer" would be better. 
> > > + } > > + > > + ip = container_of(disk->interposer, struct dm_interposer, blk_ip); > > + > > + kref_get(&ip->kref); > > + return ip; > > +} > > + > > +struct dm_interposed_dev *dm_interposer_new_dev(struct gendisk *disk, sector_t ofs, sector_t len, > > + void *context, dm_interpose_bio_t dm_interpose_bio) > > +{ > > + sector_t start = ofs; > > + sector_t last = ofs + len - 1; > > + struct dm_interposed_dev *ip_dev = NULL; > > + > > + /* Allocate new ip_dev */ > > + ip_dev = kzalloc(sizeof(struct dm_interposed_dev), GFP_KERNEL); > > + if (!ip_dev) > > + return NULL; > > + > > + ip_dev->disk = disk; > > + ip_dev->node.start = start; > > + ip_dev->node.last = last; > > + > > + ip_dev->context = context; > > + ip_dev->dm_interpose_bio = dm_interpose_bio; > > + > > + atomic64_set(&ip_dev->ip_cnt, 0); > > + > > + return ip_dev; > > +} > > + > > +void dm_interposer_free_dev(struct dm_interposed_dev *ip_dev) > > +{ > > + kfree(ip_dev); > > +} > > Make this inline may be ? Yes. Or even remove this function. > > > + > > +static inline void dm_disk_freeze(struct gendisk *disk) > > +{ > > + blk_mq_freeze_queue(disk->queue); > > + blk_mq_quiesce_queue(disk->queue); > > I think you can replace this with blk_mq_freeze_queue_wait(). I think no. blk_freeze_queue_start() also is required. 
> > > +} > > + > > +static inline void dm_disk_unfreeze(struct gendisk *disk) > > +{ > > + blk_mq_unquiesce_queue(disk->queue); > > + blk_mq_unfreeze_queue(disk->queue); > > +} > > + > > +int dm_interposer_attach_dev(struct dm_interposed_dev *ip_dev) > > +{ > > + int ret = 0; > > + struct dm_interposer *ip = NULL; > > + unsigned int noio_flag = 0; > > + > > + if (!ip_dev) > > + return -EINVAL; > > + > > + dm_disk_freeze(ip_dev->disk); > > + mutex_lock(&interposer_mutex); > > + noio_flag = memalloc_noio_save(); > > + > > + ip = get_interposer(ip_dev->disk); > > + if (ip == NULL) > > + ip = new_interposer(ip_dev->disk); > > + if (IS_ERR(ip)) { > > + ret = PTR_ERR(ip); > > + goto out; > > + } > > + > > + /* Attach dm_interposed_dev to dm_interposer */ > > + down_write(&ip->ip_devs_lock); > > + do { > > + struct dm_rb_range *node; > > + > > + /* checking that ip_dev already exists for this region */ > > + node = dm_rb_iter_first(&ip->ip_devs_root, ip_dev->node.start, ip_dev->node.last); > > + if (node) { > > + DMERR("Disk part form [%llu] to [%llu] already have interposer", > > + node->start, node->last); > > + > > + ret = -EBUSY; > > + break; > > + } > > + > > + /* insert ip_dev to ip tree */ > > + dm_rb_insert(&ip_dev->node, &ip->ip_devs_root); > > + /* increment ip reference counter */ > > + kref_get(&ip->kref); > > + } while (false); > > + up_write(&ip->ip_devs_lock); > > + > > + kref_put(&ip->kref, free_interposer); > > + > > +out: > > + memalloc_noio_restore(noio_flag); > > + mutex_unlock(&interposer_mutex); > > + dm_disk_unfreeze(ip_dev->disk); > > + > > + return ret; > > +} > > + > > +int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev) > > +{ > > + int ret = 0; > > + struct dm_interposer *ip = NULL; > > + unsigned int noio_flag = 0; > > + > > + if (!ip_dev) > > + return -EINVAL; > > + > > + dm_disk_freeze(ip_dev->disk); > > + mutex_lock(&interposer_mutex); > > + noio_flag = memalloc_noio_save(); > > + > > + ip = get_interposer(ip_dev->disk); > > + if 
(IS_ERR(ip)) { > > + ret = PTR_ERR(ip); > > + DMERR("Interposer not found"); > > + goto out; > > + } > > + if (unlikely(ip == NULL)) { > > + ret = -ENXIO; > > + DMERR("Interposer not found"); > > + goto out; > > + } > > + > > + down_write(&ip->ip_devs_lock); > > + do { > > + dm_rb_remove(&ip_dev->node, &ip->ip_devs_root); > > + /* the reference counter here cannot be zero */ > > + kref_put(&ip->kref, free_interposer); > > + > > + } while (false); > > + up_write(&ip->ip_devs_lock); > > + > > + /* detach and free interposer if it`s not needed */ > > s/`/'/ Thanks. It's my problem. > > > + kref_put(&ip->kref, free_interposer); > > +out: > > + memalloc_noio_restore(noio_flag); > > + mutex_unlock(&interposer_mutex); > > + dm_disk_unfreeze(ip_dev->disk); > > + > > + return ret; > > +} > > + > > +static void dm_remap_fn(void *context, struct dm_rb_range *node, struct bio *bio) > > +{ > > + struct mapped_device *md = context; > > + > > + /* Set acceptor device. */ > > + bio->bi_disk = md->disk; > > + > > + /* Remap disks offset */ > > + bio->bi_iter.bi_sector -= node->start; > > + > > + /* > > + * bio should be resubmitted. > > + * We can just add bio to bio_list of the current process. > > + * current->bio_list must be initialized when this function is called. > > + * If call submit_bio_noacct(), the bio will be checked twice. 
> > + */ > > + BUG_ON(!current->bio_list); > > + bio_list_add(&current->bio_list[0], bio); > > +} > > + > > +int dm_remap_install(struct mapped_device *md, const char *donor_device_name) > > +{ > > + int ret = 0; > > + struct block_device *donor_bdev; > > + fmode_t mode = FMODE_READ | FMODE_WRITE; > > + > > + DMDEBUG("Dm remap install for mapped device %s and donor device %s", > > + md->name, donor_device_name); > > + > > + donor_bdev = blkdev_get_by_path(donor_device_name, mode, "device-mapper remap"); > > + if (IS_ERR(donor_bdev)) { > > + DMERR("Cannot open device [%s]", donor_device_name); > > + return PTR_ERR(donor_bdev); > > + } > > + > > + do { > > + sector_t ofs = get_start_sect(donor_bdev); > > + sector_t len = bdev_nr_sectors(donor_bdev); > > + > > + md->ip_dev = dm_interposer_new_dev(donor_bdev->bd_disk, ofs, len, md, dm_remap_fn); > > + if (!md->ip_dev) { > > + ret = -ENOMEM; > > + break; > > + } > > + > > + DMDEBUG("New interposed device 0x%p", md->ip_dev); > > + ret = dm_interposer_attach_dev(md->ip_dev); > > + if (ret) { > > + dm_interposer_free_dev(md->ip_dev); > > + > > + md->ip_dev = NULL; > > + DMERR("Failed to attach dm interposer"); > > + break; > > + } > > + > > + DMDEBUG("Attached successfully."); > > + } while (false); > > + > > + blkdev_put(donor_bdev, mode); > > + > > + return ret; > > +} > > + > > +int dm_remap_uninstall(struct mapped_device *md) > > +{ > > + int ret = 0; > > + > > + DMDEBUG("Dm remap uninstall for mapped device %s ip_dev=0x%p", md->name, md->ip_dev); > > + > > + if (!md->ip_dev) { > > + DMERR("Cannot detach dm interposer"); > > + return -EINVAL; > > + } > > + > > + ret = dm_interposer_detach_dev(md->ip_dev); > > + if (ret) { > > + DMERR("Failed to detach dm interposer"); > > + return ret; > > + } > > + > > + DMDEBUG("Detached successfully. 
%llu bios was interposed", > > + atomic64_read(&md->ip_dev->ip_cnt)); > > + dm_interposer_free_dev(md->ip_dev); > > + md->ip_dev = NULL; > > + > > + return 0; > > +} > > + > > static char *_dm_claim_ptr = "I belong to device-mapper"; > > > > /* > > * Open a table device so we can use it as a map destination. > > */ > > static int open_table_device(struct table_device *td, dev_t dev, > > - struct mapped_device *md) > > + struct mapped_device *md, bool non_exclusive) > > { > > struct block_device *bdev; > > - > > - int r; > > + int ret; > > > > BUG_ON(td->dm_dev.bdev); > > > > - bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); > > - if (IS_ERR(bdev)) > > - return PTR_ERR(bdev); > > + if (non_exclusive) > > + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL); > > + else > > + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); > > > > - r = bd_link_disk_holder(bdev, dm_disk(md)); > > - if (r) { > > - blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); > > - return r; > > + if (IS_ERR(bdev)) { > > + ret = PTR_ERR(bdev); > > + if (ret != -EBUSY) > > + return ret; > > + } > > + > > + if (!non_exclusive) { > > + ret = bd_link_disk_holder(bdev, dm_disk(md)); > > + if (ret) { > > + blkdev_put(bdev, td->dm_dev.mode); > > + return ret; > > + } > > } > > > > td->dm_dev.bdev = bdev; > > @@ -770,33 +1105,38 @@ static void close_table_device(struct table_device *td, struct mapped_device *md > > if (!td->dm_dev.bdev) > > return; > > > > - bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); > > - blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); > > + if (td->dm_dev.mode & FMODE_EXCL) > > + bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); > > + > > + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode); > > + > > put_dax(td->dm_dev.dax_dev); > > td->dm_dev.bdev = NULL; > > td->dm_dev.dax_dev = NULL; > > } > > > > static struct table_device *find_table_device(struct list_head *l, dev_t dev, > > - fmode_t mode) > > + fmode_t 
mode, bool non_exclusive) > > { > > struct table_device *td; > > > > list_for_each_entry(td, l, list) > > - if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) > > + if (td->dm_dev.bdev->bd_dev == dev && > > + td->dm_dev.mode == mode && > > + td->dm_dev.non_exclusive == non_exclusive) > > return td; > > > > return NULL; > > } > > > > -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, > > +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, bool non_exclusive, > > struct dm_dev **result) > > { > > int r; > > struct table_device *td; > > > > mutex_lock(&md->table_devices_lock); > > - td = find_table_device(&md->table_devices, dev, mode); > > + td = find_table_device(&md->table_devices, dev, mode, non_exclusive); > > if (!td) { > > td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); > > if (!td) { > > @@ -807,7 +1147,8 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, > > td->dm_dev.mode = mode; > > td->dm_dev.bdev = NULL; > > > > - if ((r = open_table_device(td, dev, md))) { > > + r = open_table_device(td, dev, md, non_exclusive); > > + if (r) { > > mutex_unlock(&md->table_devices_lock); > > kfree(td); > > return r; > > @@ -2182,6 +2523,14 @@ static void __dm_destroy(struct mapped_device *md, bool wait) > > > > might_sleep(); > > > > + if (md->ip_dev) { > > + if (dm_interposer_detach_dev(md->ip_dev)) > > + DMERR("Failed to detach dm interposer"); > > + > > + dm_interposer_free_dev(md->ip_dev); > > + md->ip_dev = NULL; > > + } > > + > > spin_lock(&_minor_lock); > > idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); > > set_bit(DMF_FREEING, &md->flags); > > diff --git a/drivers/md/dm.h b/drivers/md/dm.h > > index fffe1e289c53..7bf20fb2de74 100644 > > --- a/drivers/md/dm.h > > +++ b/drivers/md/dm.h > > @@ -179,7 +179,7 @@ int dm_open_count(struct mapped_device *md); > > int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool 
only_deferred); > > int dm_cancel_deferred_remove(struct mapped_device *md); > > int dm_request_based(struct mapped_device *md); > > -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, > > +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, bool non_exclusive, > > struct dm_dev **result); > > void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); > > > > diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h > > index 61a66fb8ebb3..70002363bfc0 100644 > > --- a/include/linux/device-mapper.h > > +++ b/include/linux/device-mapper.h > > @@ -150,6 +150,7 @@ struct dm_dev { > > struct block_device *bdev; > > struct dax_device *dax_dev; > > fmode_t mode; > > + bool non_exclusive; > > char name[16]; > > }; > > > > @@ -325,6 +326,12 @@ struct dm_target { > > * whether or not its underlying devices have support. > > */ > > bool discards_supported:1; > > + > > + /* > > + * Set if this target needs to open device without FMODE_EXCL > > + * mode. 
> > + */ > > + bool non_exclusive:1; > > }; > > > > void *dm_per_bio_data(struct bio *bio, size_t data_size); > > diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h > > index 4933b6b67b85..08d7dbff80f4 100644 > > --- a/include/uapi/linux/dm-ioctl.h > > +++ b/include/uapi/linux/dm-ioctl.h > > @@ -214,6 +214,15 @@ struct dm_target_msg { > > char message[0]; > > }; > > > > +enum { > > + REMAP_START_CMD = 1, > > + REMAP_FINISH_CMD, > > +}; > > + > > +struct dm_remap_param { > > + uint8_t cmd; > > + uint8_t params[0]; > > +}; > > /* > > * If you change this make sure you make the corresponding change > > * to dm-ioctl.c:lookup_ioctl() > > @@ -244,6 +253,7 @@ enum { > > DM_DEV_SET_GEOMETRY_CMD, > > DM_DEV_ARM_POLL_CMD, > > DM_GET_TARGET_VERSION_CMD, > > + DM_DEV_REMAP_CMD > > }; > > > > #define DM_IOCTL 0xfd > > @@ -259,6 +269,7 @@ enum { > > #define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) > > #define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) > > #define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_ARM_POLL_CMD, struct dm_ioctl) > > +#define DM_DEV_REMAP _IOWR(DM_IOCTL, DM_DEV_REMAP_CMD, struct dm_ioctl) > > > > #define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) > > #define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) > > @@ -272,9 +283,9 @@ enum { > > #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) > > > > #define DM_VERSION_MAJOR 4 > > -#define DM_VERSION_MINOR 43 > > +#define DM_VERSION_MINOR 44 > > #define DM_VERSION_PATCHLEVEL 0 > > -#define DM_VERSION_EXTRA "-ioctl (2020-10-01)" > > +#define DM_VERSION_EXTRA "-ioctl (2020-12-25)" > > > > /* Status bits */ > > #define DM_READONLY_FLAG (1 << 0) /* In/Out */ > > > > > -- > Damien Le Moal > Western Digital Research >
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 086d293c2b03..0f870b1d4be4 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,7 +13,7 @@ #include <linux/ktime.h> #include <linux/genhd.h> #include <linux/blk-mq.h> - +#include <linux/rbtree.h> #include <trace/events/block.h> #include "dm.h" @@ -109,6 +109,9 @@ struct mapped_device { bool init_tio_pdu:1; struct srcu_struct io_barrier; + + /* interposer device for remap */ + struct dm_interposed_dev *ip_dev; }; void disable_discard(struct mapped_device *md); @@ -164,6 +167,47 @@ struct dm_table { struct dm_md_mempools *mempools; }; +/* + * Interval tree for device mapper + */ +struct dm_rb_range { + struct rb_node node; + sector_t start; /* start sector of rb node */ + sector_t last; /* end sector of rb node */ + sector_t _subtree_last; /* highest sector in subtree of rb node */ +}; + +void dm_rb_insert(struct dm_rb_range *node, struct rb_root_cached *root); +void dm_rb_remove(struct dm_rb_range *node, struct rb_root_cached *root); + +struct dm_rb_range *dm_rb_iter_first(struct rb_root_cached *root, sector_t start, sector_t last); +struct dm_rb_range *dm_rb_iter_next(struct dm_rb_range *node, sector_t start, sector_t last); + +/* + * For connecting blk_interposer and dm-targets devices. 
+ */ +typedef void (*dm_interpose_bio_t) (void *context, struct dm_rb_range *node, struct bio *bio); + +struct dm_interposed_dev { + struct gendisk *disk; + struct dm_rb_range node; + void *context; + dm_interpose_bio_t dm_interpose_bio; + + atomic64_t ip_cnt; /*for debug purpose*/ +}; + +struct dm_interposed_dev *dm_interposer_new_dev(struct gendisk *disk, + sector_t ofs, sector_t len, + void *context, + dm_interpose_bio_t dm_interpose_bio_t); +void dm_interposer_free_dev(struct dm_interposed_dev *ip_dev); +int dm_interposer_attach_dev(struct dm_interposed_dev *ip_dev); +int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev); + +int dm_remap_install(struct mapped_device *md, const char *donor_device_name); +int dm_remap_uninstall(struct mapped_device *md); + static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 5e306bba4375..2944d442c256 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1649,6 +1649,44 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para return r; } +static inline int dev_remap_start(struct mapped_device *md, uint8_t *params) +{ + char *donor_device_name = (char *)params; + + return dm_remap_install(md, donor_device_name); +} +static int dev_remap_finish(struct mapped_device *md) +{ + return dm_remap_uninstall(md); +} + +static int dev_remap(struct file *filp, struct dm_ioctl *param, size_t param_size) +{ + int ret = 0; + struct mapped_device *md; + void *bin_data; + struct dm_remap_param *remap_param; + + md = find_device(param); + if (!md) + return -ENXIO; + + bin_data = (void *)(param) + param->data_start; + remap_param = bin_data; + + if (remap_param->cmd == REMAP_START_CMD) + ret = dev_remap_start(md, remap_param->params); + else if (remap_param->cmd == REMAP_FINISH_CMD) + ret = dev_remap_finish(md); + else { + 
DMWARN("Invalid remap command, %d", remap_param->cmd); + ret = -EINVAL; + } + + dm_put(md); + return ret; +} + /* * The ioctl parameter block consists of two parts, a dm_ioctl struct * followed by a data buffer. This flag is set if the second part, @@ -1691,6 +1729,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, {DM_GET_TARGET_VERSION, 0, get_target_version}, + {DM_DEV_REMAP_CMD, 0, dev_remap}, }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 00774b5d7668..ffb8b5ca4d10 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -28,12 +28,13 @@ struct linear_c { */ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { + fmode_t mode; struct linear_c *lc; unsigned long long tmp; char dummy; int ret; - if (argc != 2) { + if ((argc < 2) || (argc > 3)) { ti->error = "Invalid argument count"; return -EINVAL; } @@ -51,7 +52,19 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) } lc->start = tmp; - ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev); + ti->non_exclusive = false; + if (argc > 2) { + if (strcmp("noexcl", argv[2]) == 0) + ti->non_exclusive = true; + else if (strcmp("excl", argv[2]) == 0) + ti->non_exclusive = false; + else { + ti->error = "Invalid exclusive option"; + return -EINVAL; + } + } + + ret = dm_get_device(ti, argv[0], mode, &lc->dev); if (ret) { ti->error = "Device lookup failed"; goto bad; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4acf2342f7ad..f15bc2171f25 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -322,7 +322,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * device and not to touch the existing bdev field in case * it is accessed concurrently. 
*/ -static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, +static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, bool non_exclusive, struct mapped_device *md) { int r; @@ -330,7 +330,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, old_dev = dd->dm_dev; - r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, + r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, non_exclusive, dd->dm_dev->mode | new_mode, &new_dev); if (r) return r; @@ -387,7 +387,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, if (!dd) return -ENOMEM; - if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { + r = dm_get_table_device(t->md, dev, mode, ti->non_exclusive, &dd->dm_dev); + if (r) { kfree(dd); return r; } @@ -396,8 +397,9 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, list_add(&dd->list, &t->devices); goto out; - } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { - r = upgrade_mode(dd, mode, t->md); + } else if ((dd->dm_dev->mode != (mode | dd->dm_dev->mode)) && + (dd->dm_dev->non_exclusive != ti->non_exclusive)) { + r = upgrade_mode(dd, mode, ti->non_exclusive, t->md); if (r) return r; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7bac564f3faa..3b871d98b7b6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -28,6 +28,7 @@ #include <linux/refcount.h> #include <linux/part_stat.h> #include <linux/blk-crypto.h> +#include <linux/interval_tree_generic.h> #define DM_MSG_PREFIX "core" @@ -56,6 +57,8 @@ static struct workqueue_struct *deferred_remove_workqueue; atomic_t dm_global_event_nr = ATOMIC_INIT(0); DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq); +static DEFINE_MUTEX(interposer_mutex); /* synchronizing access to blk_interposer */ + void dm_issue_global_event(void) { atomic_inc(&dm_global_event_nr); @@ -162,6 +165,26 @@ struct table_device { struct dm_dev dm_dev; }; +/* + * Device mapper`s interposer. 
+ */ +struct dm_interposer { + struct blk_interposer blk_ip; + struct mapped_device *md; + + struct kref kref; + struct rw_semaphore ip_devs_lock; + struct rb_root_cached ip_devs_root; /* dm_interposed_dev tree */ +}; + +/* + * Interval tree for device mapper + */ +#define START(node) ((node)->start) +#define LAST(node) ((node)->last) +INTERVAL_TREE_DEFINE(struct dm_rb_range, node, sector_t, _subtree_last, + START, LAST,, dm_rb); + /* * Bio-based DM's mempools' reserved IOs set by the user. */ @@ -733,28 +756,340 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } +static void dm_submit_bio_interposer_fn(struct bio *bio) +{ + struct dm_interposer *ip; + unsigned int noio_flag = 0; + sector_t start; + sector_t last; + struct dm_rb_range *node; + + ip = container_of(bio->bi_disk->interposer, struct dm_interposer, blk_ip); + start = bio->bi_iter.bi_sector; + last = start + dm_sector_div_up(bio->bi_iter.bi_size, SECTOR_SIZE); + + noio_flag = memalloc_noio_save(); + down_read(&ip->ip_devs_lock); + node = dm_rb_iter_first(&ip->ip_devs_root, start, last); + while (node) { + struct dm_interposed_dev *ip_dev = + container_of(node, struct dm_interposed_dev, node); + + atomic64_inc(&ip_dev->ip_cnt); + ip_dev->dm_interpose_bio(ip_dev->context, node, bio); + + node = dm_rb_iter_next(node, start, last); + } + up_read(&ip->ip_devs_lock); + memalloc_noio_restore(noio_flag); +} + +static void free_interposer(struct kref *kref) +{ + struct dm_interposer *ip = container_of(kref, struct dm_interposer, kref); + + blk_interposer_detach(&ip->blk_ip, dm_submit_bio_interposer_fn); + + kfree(ip); +} + +static struct dm_interposer *new_interposer(struct gendisk *disk) +{ + int ret = 0; + struct dm_interposer *ip; + + ip = kzalloc(sizeof(struct dm_interposer), GFP_NOIO); + if (!ip) + return ERR_PTR(-ENOMEM); + + kref_init(&ip->kref); + init_rwsem(&ip->ip_devs_lock); + ip->ip_devs_root = RB_ROOT_CACHED; + + ret = blk_interposer_attach(disk, 
&ip->blk_ip, dm_submit_bio_interposer_fn); + if (ret) { + DMERR("Failed to attack blk_interposer"); + kref_put(&ip->kref, free_interposer); + return ERR_PTR(ret); + } + + return ip; +} + +static struct dm_interposer *get_interposer(struct gendisk *disk) +{ + struct dm_interposer *ip; + + if (!blk_has_interposer(disk)) + return NULL; + + if (disk->interposer->ip_submit_bio != dm_submit_bio_interposer_fn) { + DMERR("Disks interposer slot already occupied."); + return ERR_PTR(-EBUSY); + } + + ip = container_of(disk->interposer, struct dm_interposer, blk_ip); + + kref_get(&ip->kref); + return ip; +} + +struct dm_interposed_dev *dm_interposer_new_dev(struct gendisk *disk, sector_t ofs, sector_t len, + void *context, dm_interpose_bio_t dm_interpose_bio) +{ + sector_t start = ofs; + sector_t last = ofs + len - 1; + struct dm_interposed_dev *ip_dev = NULL; + + /* Allocate new ip_dev */ + ip_dev = kzalloc(sizeof(struct dm_interposed_dev), GFP_KERNEL); + if (!ip_dev) + return NULL; + + ip_dev->disk = disk; + ip_dev->node.start = start; + ip_dev->node.last = last; + + ip_dev->context = context; + ip_dev->dm_interpose_bio = dm_interpose_bio; + + atomic64_set(&ip_dev->ip_cnt, 0); + + return ip_dev; +} + +void dm_interposer_free_dev(struct dm_interposed_dev *ip_dev) +{ + kfree(ip_dev); +} + +static inline void dm_disk_freeze(struct gendisk *disk) +{ + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); +} + +static inline void dm_disk_unfreeze(struct gendisk *disk) +{ + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); +} + +int dm_interposer_attach_dev(struct dm_interposed_dev *ip_dev) +{ + int ret = 0; + struct dm_interposer *ip = NULL; + unsigned int noio_flag = 0; + + if (!ip_dev) + return -EINVAL; + + dm_disk_freeze(ip_dev->disk); + mutex_lock(&interposer_mutex); + noio_flag = memalloc_noio_save(); + + ip = get_interposer(ip_dev->disk); + if (ip == NULL) + ip = new_interposer(ip_dev->disk); + if (IS_ERR(ip)) { + ret = PTR_ERR(ip); 
+ goto out; + } + + /* Attach dm_interposed_dev to dm_interposer */ + down_write(&ip->ip_devs_lock); + do { + struct dm_rb_range *node; + + /* checking that ip_dev already exists for this region */ + node = dm_rb_iter_first(&ip->ip_devs_root, ip_dev->node.start, ip_dev->node.last); + if (node) { + DMERR("Disk part form [%llu] to [%llu] already have interposer", + node->start, node->last); + + ret = -EBUSY; + break; + } + + /* insert ip_dev to ip tree */ + dm_rb_insert(&ip_dev->node, &ip->ip_devs_root); + /* increment ip reference counter */ + kref_get(&ip->kref); + } while (false); + up_write(&ip->ip_devs_lock); + + kref_put(&ip->kref, free_interposer); + +out: + memalloc_noio_restore(noio_flag); + mutex_unlock(&interposer_mutex); + dm_disk_unfreeze(ip_dev->disk); + + return ret; +} + +int dm_interposer_detach_dev(struct dm_interposed_dev *ip_dev) +{ + int ret = 0; + struct dm_interposer *ip = NULL; + unsigned int noio_flag = 0; + + if (!ip_dev) + return -EINVAL; + + dm_disk_freeze(ip_dev->disk); + mutex_lock(&interposer_mutex); + noio_flag = memalloc_noio_save(); + + ip = get_interposer(ip_dev->disk); + if (IS_ERR(ip)) { + ret = PTR_ERR(ip); + DMERR("Interposer not found"); + goto out; + } + if (unlikely(ip == NULL)) { + ret = -ENXIO; + DMERR("Interposer not found"); + goto out; + } + + down_write(&ip->ip_devs_lock); + do { + dm_rb_remove(&ip_dev->node, &ip->ip_devs_root); + /* the reference counter here cannot be zero */ + kref_put(&ip->kref, free_interposer); + + } while (false); + up_write(&ip->ip_devs_lock); + + /* detach and free interposer if it`s not needed */ + kref_put(&ip->kref, free_interposer); +out: + memalloc_noio_restore(noio_flag); + mutex_unlock(&interposer_mutex); + dm_disk_unfreeze(ip_dev->disk); + + return ret; +} + +static void dm_remap_fn(void *context, struct dm_rb_range *node, struct bio *bio) +{ + struct mapped_device *md = context; + + /* Set acceptor device. 
*/ + bio->bi_disk = md->disk; + + /* Remap disks offset */ + bio->bi_iter.bi_sector -= node->start; + + /* + * bio should be resubmitted. + * We can just add bio to bio_list of the current process. + * current->bio_list must be initialized when this function is called. + * If call submit_bio_noacct(), the bio will be checked twice. + */ + BUG_ON(!current->bio_list); + bio_list_add(&current->bio_list[0], bio); +} + +int dm_remap_install(struct mapped_device *md, const char *donor_device_name) +{ + int ret = 0; + struct block_device *donor_bdev; + fmode_t mode = FMODE_READ | FMODE_WRITE; + + DMDEBUG("Dm remap install for mapped device %s and donor device %s", + md->name, donor_device_name); + + donor_bdev = blkdev_get_by_path(donor_device_name, mode, "device-mapper remap"); + if (IS_ERR(donor_bdev)) { + DMERR("Cannot open device [%s]", donor_device_name); + return PTR_ERR(donor_bdev); + } + + do { + sector_t ofs = get_start_sect(donor_bdev); + sector_t len = bdev_nr_sectors(donor_bdev); + + md->ip_dev = dm_interposer_new_dev(donor_bdev->bd_disk, ofs, len, md, dm_remap_fn); + if (!md->ip_dev) { + ret = -ENOMEM; + break; + } + + DMDEBUG("New interposed device 0x%p", md->ip_dev); + ret = dm_interposer_attach_dev(md->ip_dev); + if (ret) { + dm_interposer_free_dev(md->ip_dev); + + md->ip_dev = NULL; + DMERR("Failed to attach dm interposer"); + break; + } + + DMDEBUG("Attached successfully."); + } while (false); + + blkdev_put(donor_bdev, mode); + + return ret; +} + +int dm_remap_uninstall(struct mapped_device *md) +{ + int ret = 0; + + DMDEBUG("Dm remap uninstall for mapped device %s ip_dev=0x%p", md->name, md->ip_dev); + + if (!md->ip_dev) { + DMERR("Cannot detach dm interposer"); + return -EINVAL; + } + + ret = dm_interposer_detach_dev(md->ip_dev); + if (ret) { + DMERR("Failed to detach dm interposer"); + return ret; + } + + DMDEBUG("Detached successfully.
%llu bios was interposed", + atomic64_read(&md->ip_dev->ip_cnt)); + dm_interposer_free_dev(md->ip_dev); + md->ip_dev = NULL; + + return 0; +} + static char *_dm_claim_ptr = "I belong to device-mapper"; /* * Open a table device so we can use it as a map destination. */ static int open_table_device(struct table_device *td, dev_t dev, - struct mapped_device *md) + struct mapped_device *md, bool non_exclusive) { struct block_device *bdev; - - int r; + int ret; BUG_ON(td->dm_dev.bdev); - bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + if (non_exclusive) + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode, NULL); + else + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr); - r = bd_link_disk_holder(bdev, dm_disk(md)); - if (r) { - blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); - return r; + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + if (ret != -EBUSY) + return ret; + } + + if (!non_exclusive) { + ret = bd_link_disk_holder(bdev, dm_disk(md)); + if (ret) { + blkdev_put(bdev, td->dm_dev.mode); + return ret; + } } td->dm_dev.bdev = bdev; @@ -770,33 +1105,38 @@ static void close_table_device(struct table_device *td, struct mapped_device *md if (!td->dm_dev.bdev) return; - bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); - blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + if (td->dm_dev.mode & FMODE_EXCL) + bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); + + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode); + put_dax(td->dm_dev.dax_dev); td->dm_dev.bdev = NULL; td->dm_dev.dax_dev = NULL; } static struct table_device *find_table_device(struct list_head *l, dev_t dev, - fmode_t mode) + fmode_t mode, bool non_exclusive) { struct table_device *td; list_for_each_entry(td, l, list) - if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) + if (td->dm_dev.bdev->bd_dev == dev && + td->dm_dev.mode == mode && + td->dm_dev.non_exclusive == non_exclusive) return td; 
return NULL; } -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, bool non_exclusive, struct dm_dev **result) { int r; struct table_device *td; mutex_lock(&md->table_devices_lock); - td = find_table_device(&md->table_devices, dev, mode); + td = find_table_device(&md->table_devices, dev, mode, non_exclusive); if (!td) { td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); if (!td) { @@ -807,7 +1147,8 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, td->dm_dev.mode = mode; td->dm_dev.bdev = NULL; - if ((r = open_table_device(td, dev, md))) { + r = open_table_device(td, dev, md, non_exclusive); + if (r) { mutex_unlock(&md->table_devices_lock); kfree(td); return r; @@ -2182,6 +2523,14 @@ static void __dm_destroy(struct mapped_device *md, bool wait) might_sleep(); + if (md->ip_dev) { + if (dm_interposer_detach_dev(md->ip_dev)) + DMERR("Failed to detach dm interposer"); + + dm_interposer_free_dev(md->ip_dev); + md->ip_dev = NULL; + } + spin_lock(&_minor_lock); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index fffe1e289c53..7bf20fb2de74 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -179,7 +179,7 @@ int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); int dm_cancel_deferred_remove(struct mapped_device *md); int dm_request_based(struct mapped_device *md); -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, bool non_exclusive, struct dm_dev **result); void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 61a66fb8ebb3..70002363bfc0 100644 --- 
a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -150,6 +150,7 @@ struct dm_dev { struct block_device *bdev; struct dax_device *dax_dev; fmode_t mode; + bool non_exclusive; char name[16]; }; @@ -325,6 +326,12 @@ struct dm_target { * whether or not its underlying devices have support. */ bool discards_supported:1; + + /* + * Set if this target needs to open device without FMODE_EXCL + * mode. + */ + bool non_exclusive:1; }; void *dm_per_bio_data(struct bio *bio, size_t data_size); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 4933b6b67b85..08d7dbff80f4 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -214,6 +214,15 @@ struct dm_target_msg { char message[0]; }; +enum { + REMAP_START_CMD = 1, + REMAP_FINISH_CMD, +}; + +struct dm_remap_param { + uint8_t cmd; + uint8_t params[0]; +}; /* * If you change this make sure you make the corresponding change * to dm-ioctl.c:lookup_ioctl() @@ -244,6 +253,7 @@ enum { DM_DEV_SET_GEOMETRY_CMD, DM_DEV_ARM_POLL_CMD, DM_GET_TARGET_VERSION_CMD, + DM_DEV_REMAP_CMD }; #define DM_IOCTL 0xfd @@ -259,6 +269,7 @@ enum { #define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) #define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) #define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_ARM_POLL_CMD, struct dm_ioctl) +#define DM_DEV_REMAP _IOWR(DM_IOCTL, DM_DEV_REMAP_CMD, struct dm_ioctl) #define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) #define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) @@ -272,9 +283,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 43 +#define DM_VERSION_MINOR 44 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2020-10-01)" +#define DM_VERSION_EXTRA "-ioctl (2020-12-25)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */
Implement a block interposer for device-mapper so that it can attach to an existing block layer stack. Using the interposer, we can connect dm-linear to a device with a mounted file system. Changes: * the new dm_interposer structure contains a blk_interposer, which intercepts bios from the interposed disk, and an interval tree of the block devices on this disk. * a new interval tree for device mapper. * the dm_submit_bio_interposer_fn() function implements the bio interception logic. * the functions dm_interposer_attach_dev() and dm_interposer_detach_dev() attach devices to, and detach them from, a dm_interposer. * the new parameter 'noexcl' allows creating a dm-linear target on a device with an already mounted file system. * the non_exclusive field in the dm_target structure indicates that the target device should be opened without FMODE_EXCL mode. * the new ioctl DM_DEV_REMAP allows attaching a dm device to a regular block device. Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com> --- drivers/md/dm-core.h | 46 +++- drivers/md/dm-ioctl.c | 39 ++++ drivers/md/dm-linear.c | 17 +- drivers/md/dm-table.c | 12 +- drivers/md/dm.c | 383 ++++++++++++++++++++++++++++++++-- drivers/md/dm.h | 2 +- include/linux/device-mapper.h | 7 + include/uapi/linux/dm-ioctl.h | 15 +- 8 files changed, 493 insertions(+), 28 deletions(-)