diff --git a/drivers/block/blksnap/diff_area.c b/drivers/block/blksnap/diff_area.c
new file mode 100644
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME "-diff-area: " fmt
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <uapi/linux/blksnap.h>
+#include "params.h"
+#include "chunk.h"
+#include "diff_area.h"
+#include "diff_buffer.h"
+#include "diff_storage.h"
+#include "diff_io.h"
+
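+/*
+ * Conversion between a sector offset on the original device and the number
+ * of the chunk that contains it, and back from a chunk to its first sector.
+ */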
+static inline unsigned long chunk_number(struct diff_area *diff_area,
+ sector_t sector)
+{
+ return (unsigned long)(sector >>
+ (diff_area->chunk_shift - SECTOR_SHIFT));
+}
+
+static inline sector_t chunk_sector(struct chunk *chunk)
+{
+ return (sector_t)(chunk->number)
+ << (chunk->diff_area->chunk_shift - SECTOR_SHIFT);
+}
+
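+/*
+ * The last chunk of the device may be shorter than the others. If the
+ * device capacity is not a multiple of the chunk size, the sector count of
+ * the last chunk is reduced to the remaining tail of the device.
+ */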
+static inline void recalculate_last_chunk_size(struct chunk *chunk)
+{
+ sector_t capacity;
+
+ capacity = bdev_nr_sectors(chunk->diff_area->orig_bdev);
+ if (capacity > round_down(capacity, chunk->sector_count))
+ chunk->sector_count =
+ capacity - round_down(capacity, chunk->sector_count);
+}
+
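+/*
+ * Returns the number of chunks of size (1 << shift) bytes needed to cover
+ * a device of the given capacity in sectors.
+ */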
+static inline unsigned long long count_by_shift(sector_t capacity,
+ unsigned long long shift)
+{
+ unsigned long long shift_sector = (shift - SECTOR_SHIFT);
+
+ return round_up(capacity, (1ull << shift_sector)) >> shift_sector;
+}
+
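+/*
+ * Selects the chunk size for the device: starting from the minimum allowed
+ * chunk size, it is doubled until the number of chunks does not exceed
+ * chunk_maximum_count and the chunk is not smaller than the minimal I/O
+ * block of the device.
+ */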
+static void diff_area_calculate_chunk_size(struct diff_area *diff_area)
+{
+ unsigned long long shift = chunk_minimum_shift;
+ unsigned long long count;
+ sector_t capacity;
+ sector_t min_io_sect;
+
+ min_io_sect = (sector_t)(bdev_io_min(diff_area->orig_bdev) >>
+ SECTOR_SHIFT);
+ capacity = bdev_nr_sectors(diff_area->orig_bdev);
+ pr_debug("Minimal IO block %llu sectors\n", min_io_sect);
+ pr_debug("Device capacity %llu sectors\n", capacity);
+
+ count = count_by_shift(capacity, shift);
+ pr_debug("Chunks count %llu\n", count);
+ while ((count > chunk_maximum_count) ||
+ ((1ull << (shift - SECTOR_SHIFT)) < min_io_sect)) {
+ shift = shift + 1ull;
+ count = count_by_shift(capacity, shift);
+ pr_debug("Chunks count %llu\n", count);
+ }
+
+ diff_area->chunk_shift = shift;
+ diff_area->chunk_count = count;
+
+ pr_info("The optimal chunk size was calculated as %llu bytes for device [%d:%d]\n",
+ (1ull << diff_area->chunk_shift),
+ MAJOR(diff_area->orig_bdev->bd_dev),
+ MINOR(diff_area->orig_bdev->bd_dev));
+}
+
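+/*
+ * The release callback for &struct diff_area.kref. Waits for pending I/O
+ * to complete, then releases all chunks and the original block device.
+ */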
+void diff_area_free(struct kref *kref)
+{
+ unsigned long inx = 0;
+ u64 start_waiting;
+ struct chunk *chunk;
+ struct diff_area *diff_area =
+ container_of(kref, struct diff_area, kref);
+
+ might_sleep();
+ start_waiting = jiffies_64;
+ while (atomic_read(&diff_area->pending_io_count)) {
+ schedule_timeout_interruptible(1);
+ if (jiffies_64 > (start_waiting + HZ)) {
+ start_waiting = jiffies_64;
+ inx++;
+ pr_warn("Waiting for pending I/O to complete\n");
+ if (inx > 5) {
+ pr_err("Failed to complete pending I/O\n");
+ break;
+ }
+ }
+ }
+
+ atomic_set(&diff_area->corrupt_flag, 1);
+ flush_work(&diff_area->cache_release_work);
+ xa_for_each(&diff_area->chunk_map, inx, chunk)
+ chunk_free(chunk);
+ xa_destroy(&diff_area->chunk_map);
+
+ if (diff_area->orig_bdev) {
+ blkdev_put(diff_area->orig_bdev, FMODE_READ | FMODE_WRITE);
+ diff_area->orig_bdev = NULL;
+ }
+
+ /* Clean up free_diff_buffers */
+ diff_buffer_cleanup(diff_area);
+
+ kfree(diff_area);
+}
+
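+/*
+ * Takes the first chunk from the cache queue whose semaphore can be
+ * acquired without waiting, removes it from the queue and returns it
+ * locked. Returns NULL if all cached chunks are currently in use.
+ */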
+static inline struct chunk *
+get_chunk_from_cache_and_write_lock(spinlock_t *caches_lock,
+ struct list_head *cache_queue,
+ atomic_t *cache_count)
+{
+ struct chunk *iter;
+ struct chunk *chunk = NULL;
+
+ spin_lock(caches_lock);
+ list_for_each_entry(iter, cache_queue, cache_link) {
+ if (!down_trylock(&iter->lock)) {
+ chunk = iter;
+ break;
+ }
+ /*
+ * If it is not possible to lock a chunk for writing,
+ * then it is currently in use, and we try to clean up the
+ * next chunk.
+ */
+ }
+ if (likely(chunk)) {
+ atomic_dec(cache_count);
+ list_del_init(&chunk->cache_link);
+ }
+ spin_unlock(caches_lock);
+
+ return chunk;
+}
+
+static struct chunk *
+diff_area_get_chunk_from_cache_and_write_lock(struct diff_area *diff_area)
+{
+ struct chunk *chunk;
+
+ if (atomic_read(&diff_area->read_cache_count) >
+ chunk_maximum_in_cache) {
+ chunk = get_chunk_from_cache_and_write_lock(
+ &diff_area->caches_lock, &diff_area->read_cache_queue,
+ &diff_area->read_cache_count);
+ if (chunk)
+ return chunk;
+ }
+
+ if (atomic_read(&diff_area->write_cache_count) >
+ chunk_maximum_in_cache) {
+ chunk = get_chunk_from_cache_and_write_lock(
+ &diff_area->caches_lock, &diff_area->write_cache_queue,
+ &diff_area->write_cache_count);
+ if (chunk)
+ return chunk;
+ }
+
+ return NULL;
+}
+
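+/*
+ * While the read or write cache is over its limit, releases the difference
+ * buffers of cached chunks. Dirty chunks are scheduled for storing to the
+ * difference storage instead of being released directly.
+ */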
+static void diff_area_cache_release(struct diff_area *diff_area)
+{
+ struct chunk *chunk;
+
+ while (!diff_area_is_corrupted(diff_area) &&
+ (chunk = diff_area_get_chunk_from_cache_and_write_lock(
+ diff_area))) {
+ /*
+ * There cannot be a chunk in the cache whose buffer is
+ * not ready.
+ */
+ if (WARN(!chunk_state_check(chunk, CHUNK_ST_BUFFER_READY),
+ "Cannot release empty buffer for chunk #%ld",
+ chunk->number)) {
+ up(&chunk->lock);
+ continue;
+ }
+
+ if (chunk_state_check(chunk, CHUNK_ST_DIRTY)) {
+ int ret = chunk_schedule_storing(chunk, false);
+
+ if (ret)
+ chunk_store_failed(chunk, ret);
+ } else {
+ chunk_diff_buffer_release(chunk);
+ up(&chunk->lock);
+ }
+ }
+}
+
+static void diff_area_cache_release_work(struct work_struct *work)
+{
+ struct diff_area *diff_area =
+ container_of(work, struct diff_area, cache_release_work);
+
+ diff_area_cache_release(diff_area);
+}
+
+struct diff_area *diff_area_new(dev_t dev_id, struct diff_storage *diff_storage)
+{
+ int ret = 0;
+ struct diff_area *diff_area = NULL;
+ struct block_device *bdev;
+ unsigned long number;
+ struct chunk *chunk;
+
+ pr_debug("Open device [%u:%u]\n", MAJOR(dev_id), MINOR(dev_id));
+
+ bdev = blkdev_get_by_dev(dev_id, FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(bdev)) {
+ pr_err("Failed to open device. errno=%d\n",
+ abs((int)PTR_ERR(bdev)));
+ return ERR_PTR(PTR_ERR(bdev));
+ }
+
+ diff_area = kzalloc(sizeof(struct diff_area), GFP_KERNEL);
+ if (!diff_area) {
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ diff_area->orig_bdev = bdev;
+ diff_area->diff_storage = diff_storage;
+
+ diff_area_calculate_chunk_size(diff_area);
+ pr_debug("Chunk size %llu in bytes\n", 1ull << diff_area->chunk_shift);
+ pr_debug("Chunk count %lu\n", diff_area->chunk_count);
+
+ kref_init(&diff_area->kref);
+ xa_init(&diff_area->chunk_map);
+
+ if (!diff_storage->capacity) {
+ pr_err("Difference storage is empty\n");
+ pr_err("In-memory difference storage is not supported\n");
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE);
+ kfree(diff_area);
+ return ERR_PTR(-EFAULT);
+ }
+
+ spin_lock_init(&diff_area->caches_lock);
+ INIT_LIST_HEAD(&diff_area->read_cache_queue);
+ atomic_set(&diff_area->read_cache_count, 0);
+ INIT_LIST_HEAD(&diff_area->write_cache_queue);
+ atomic_set(&diff_area->write_cache_count, 0);
+ INIT_WORK(&diff_area->cache_release_work, diff_area_cache_release_work);
+
+ spin_lock_init(&diff_area->free_diff_buffers_lock);
+ INIT_LIST_HEAD(&diff_area->free_diff_buffers);
+ atomic_set(&diff_area->free_diff_buffers_count, 0);
+
+ atomic_set(&diff_area->corrupt_flag, 0);
+ atomic_set(&diff_area->pending_io_count, 0);
+
+ /*
+ * Allocating all chunks in advance allows us to avoid doing this while
+ * processing the filtered bios.
+ * In addition, the chunk structure has a semaphore that allows locking
+ * the data of a single chunk. Different threads can read, write, or
+ * dump their data to the diff storage independently of each other,
+ * provided that different chunks are used.
+ */
+ for (number = 0; number < diff_area->chunk_count; number++) {
+ chunk = chunk_alloc(diff_area, number);
+ if (!chunk) {
+ pr_err("Failed allocate chunk\n");
+ ret = -ENOMEM;
+ break;
+ }
+ chunk->sector_count = diff_area_chunk_sectors(diff_area);
+
+ ret = xa_insert(&diff_area->chunk_map, number, chunk,
+ GFP_KERNEL);
+ if (ret) {
+ pr_err("Failed insert chunk to chunk map\n");
+ chunk_free(chunk);
+ break;
+ }
+ }
+ if (ret) {
+ diff_area_put(diff_area);
+ return ERR_PTR(ret);
+ }
+
+ recalculate_last_chunk_size(chunk);
+
+ atomic_set(&diff_area->corrupt_flag, 0);
+
+ return diff_area;
+}
+
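+/*
+ * Removes the chunk from the read or write cache queue, if it is linked
+ * there, and decrements the corresponding counter. A chunk that is not in
+ * any cache has a self-referencing cache_link (the state after
+ * list_del_init()), which is what the list_is_first() check detects.
+ */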
+static void diff_area_take_chunk_from_cache(struct diff_area *diff_area,
+ struct chunk *chunk)
+{
+ spin_lock(&diff_area->caches_lock);
+ if (!list_is_first(&chunk->cache_link, &chunk->cache_link)) {
+ list_del_init(&chunk->cache_link);
+
+ if (chunk_state_check(chunk, CHUNK_ST_DIRTY))
+ atomic_dec(&diff_area->write_cache_count);
+ else
+ atomic_dec(&diff_area->read_cache_count);
+ }
+ spin_unlock(&diff_area->caches_lock);
+}
+
+/**
+ * diff_area_copy() - Implements the copy-on-write mechanism.
+ * @diff_area:
+ * Pointer to the &struct diff_area of the original device.
+ * @sector:
+ * The first sector of the range being overwritten on the original device.
+ * @count:
+ * The number of sectors in the range.
+ * @is_nowait:
+ * If true, the function does not wait for busy chunks and returns
+ * -EAGAIN instead of blocking.
+ *
+ * For each chunk that overlaps the range, the original data is read into a
+ * difference buffer and scheduled for storing to the difference storage,
+ * unless the chunk has already been processed.
+ *
+ * Return: 0 on success, or a negative errno code on failure.
+ */
+int diff_area_copy(struct diff_area *diff_area, sector_t sector, sector_t count,
+ const bool is_nowait)
+{
+ int ret = 0;
+ sector_t offset;
+ struct chunk *chunk;
+ struct diff_buffer *diff_buffer;
+ sector_t area_sect_first;
+ sector_t chunk_sectors = diff_area_chunk_sectors(diff_area);
+
+ area_sect_first = round_down(sector, chunk_sectors);
+ for (offset = area_sect_first; offset < (sector + count);
+ offset += chunk_sectors) {
+ chunk = xa_load(&diff_area->chunk_map,
+ chunk_number(diff_area, offset));
+ if (!chunk) {
+ diff_area_set_corrupted(diff_area, -EINVAL);
+ return -EINVAL;
+ }
+ WARN_ON(chunk_number(diff_area, offset) != chunk->number);
+ if (is_nowait) {
+ if (down_trylock(&chunk->lock))
+ return -EAGAIN;
+ } else {
+ ret = down_killable(&chunk->lock);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (chunk_state_check(chunk, CHUNK_ST_FAILED | CHUNK_ST_DIRTY |
+ CHUNK_ST_STORE_READY)) {
+ /*
+ * The chunk has already been:
+ * - Failed, when the snapshot is corrupted
+ * - Overwritten in the snapshot image
+ * - Already stored in the diff storage
+ */
+ up(&chunk->lock);
+ continue;
+ }
+
+ if (unlikely(chunk_state_check(
+ chunk, CHUNK_ST_LOADING | CHUNK_ST_STORING))) {
+ pr_err("Invalid chunk state\n");
+ ret = -EFAULT;
+ goto fail_unlock_chunk;
+ }
+
+ if (chunk_state_check(chunk, CHUNK_ST_BUFFER_READY)) {
+ diff_area_take_chunk_from_cache(diff_area, chunk);
+ /*
+ * The chunk has already been read, but now it needs
+ * to be stored to the difference storage.
+ */
+ ret = chunk_schedule_storing(chunk, is_nowait);
+ if (unlikely(ret))
+ goto fail_unlock_chunk;
+ } else {
+ diff_buffer =
+ diff_buffer_take(chunk->diff_area, is_nowait);
+ if (IS_ERR(diff_buffer)) {
+ ret = PTR_ERR(diff_buffer);
+ goto fail_unlock_chunk;
+ }
+ WARN(chunk->diff_buffer, "Chunk's buffer has been lost");
+ chunk->diff_buffer = diff_buffer;
+
+ ret = chunk_async_load_orig(chunk, is_nowait);
+ if (unlikely(ret))
+ goto fail_unlock_chunk;
+ }
+ }
+
+ return ret;
+fail_unlock_chunk:
+ WARN_ON(!chunk);
+ chunk_store_failed(chunk, ret);
+ return ret;
+}
+
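+/**
+ * diff_area_wait() - Waits until the copy-on-write processing of the chunks
+ *	overlapping the given range has completed.
+ * @diff_area:
+ * Pointer to the &struct diff_area of the original device.
+ * @sector:
+ * The first sector of the range.
+ * @count:
+ * The number of sectors in the range.
+ * @is_nowait:
+ * If true, -EAGAIN is returned instead of waiting for a busy chunk.
+ *
+ * Return: 0 on success, or a negative errno code on failure.
+ */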
+int diff_area_wait(struct diff_area *diff_area, sector_t sector, sector_t count,
+ const bool is_nowait)
+{
+ int ret = 0;
+ sector_t offset;
+ struct chunk *chunk;
+ sector_t area_sect_first;
+ sector_t chunk_sectors = diff_area_chunk_sectors(diff_area);
+
+ area_sect_first = round_down(sector, chunk_sectors);
+ for (offset = area_sect_first; offset < (sector + count);
+ offset += chunk_sectors) {
+ chunk = xa_load(&diff_area->chunk_map,
+ chunk_number(diff_area, offset));
+ if (!chunk) {
+ diff_area_set_corrupted(diff_area, -EINVAL);
+ return -EINVAL;
+ }
+ WARN_ON(chunk_number(diff_area, offset) != chunk->number);
+ if (is_nowait) {
+ if (down_trylock(&chunk->lock))
+ return -EAGAIN;
+ } else {
+ ret = down_killable(&chunk->lock);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (chunk_state_check(chunk, CHUNK_ST_FAILED)) {
+ /*
+ * The chunk is in the failed state, which means that
+ * the snapshot is corrupted.
+ */
+ up(&chunk->lock);
+ ret = -EFAULT;
+ break;
+ }
+
+ if (chunk_state_check(chunk, CHUNK_ST_BUFFER_READY |
+ CHUNK_ST_DIRTY | CHUNK_ST_STORE_READY)) {
+ /*
+ * The chunk has already been:
+ * - Read
+ * - Overwritten in the snapshot image
+ * - Already stored in the diff storage
+ */
+ up(&chunk->lock);
+ continue;
+ }
+ }
+
+ return ret;
+}
+
+static inline void diff_area_image_put_chunk(struct chunk *chunk, bool is_write)
+{
+ if (is_write) {
+ /*
+ * Since the chunk was taken to perform writing,
+ * we mark it as dirty.
+ */
+ chunk_state_set(chunk, CHUNK_ST_DIRTY);
+ }
+
+ chunk_schedule_caching(chunk);
+}
+
+void diff_area_image_ctx_done(struct diff_area_image_ctx *io_ctx)
+{
+ if (!io_ctx->chunk)
+ return;
+
+ diff_area_image_put_chunk(io_ctx->chunk, io_ctx->is_write);
+}
+
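+/*
+ * Takes a free difference buffer for the chunk and fills it either from the
+ * difference storage, if the chunk has already been stored there, or from
+ * the original device otherwise.
+ */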
+static int diff_area_load_chunk_from_storage(struct diff_area *diff_area,
+ struct chunk *chunk)
+{
+ struct diff_buffer *diff_buffer;
+
+ diff_buffer = diff_buffer_take(diff_area, false);
+ if (IS_ERR(diff_buffer))
+ return PTR_ERR(diff_buffer);
+
+ WARN_ON(chunk->diff_buffer);
+ chunk->diff_buffer = diff_buffer;
+
+ if (chunk_state_check(chunk, CHUNK_ST_STORE_READY))
+ return chunk_load_diff(chunk);
+
+ return chunk_load_orig(chunk);
+}
+
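+/*
+ * Returns the locked chunk that contains the given sector of the snapshot
+ * image. The chunk held by the I/O context from the previous call is
+ * released if the sector falls into a different chunk. The chunk data is
+ * loaded into the difference buffer if it is not there yet.
+ */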
+static struct chunk *
+diff_area_image_context_get_chunk(struct diff_area_image_ctx *io_ctx,
+ sector_t sector)
+{
+ int ret;
+ struct chunk *chunk;
+ struct diff_area *diff_area = io_ctx->diff_area;
+ unsigned long new_chunk_number = chunk_number(diff_area, sector);
+
+ chunk = io_ctx->chunk;
+ if (chunk) {
+ if (chunk->number == new_chunk_number)
+ return chunk;
+
+ /*
+ * If the sector falls into a new chunk, then we release
+ * the old chunk.
+ */
+ diff_area_image_put_chunk(chunk, io_ctx->is_write);
+ io_ctx->chunk = NULL;
+ }
+
+ /* Take a next chunk. */
+ chunk = xa_load(&diff_area->chunk_map, new_chunk_number);
+ if (unlikely(!chunk))
+ return ERR_PTR(-EINVAL);
+
+ ret = down_killable(&chunk->lock);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (unlikely(chunk_state_check(chunk, CHUNK_ST_FAILED))) {
+ pr_err("Chunk #%ld corrupted\n", chunk->number);
+
+ pr_debug("new_chunk_number=%ld\n", new_chunk_number);
+ pr_debug("sector=%llu\n", sector);
+ pr_debug("Chunk size %llu in bytes\n",
+ (1ull << diff_area->chunk_shift));
+ pr_debug("Chunk count %lu\n", diff_area->chunk_count);
+
+ ret = -EIO;
+ goto fail_unlock_chunk;
+ }
+
+ /*
+ * If there is already data in the buffer, then nothing needs to be loaded.
+ * Otherwise, the chunk needs to be loaded from the original device or
+ * from the difference storage.
+ */
+ if (!chunk_state_check(chunk, CHUNK_ST_BUFFER_READY)) {
+ ret = diff_area_load_chunk_from_storage(diff_area, chunk);
+ if (unlikely(ret))
+ goto fail_unlock_chunk;
+
+ /* Set the flag that the buffer contains the required data. */
+ chunk_state_set(chunk, CHUNK_ST_BUFFER_READY);
+ } else {
+ diff_area_take_chunk_from_cache(diff_area, chunk);
+ }
+
+ io_ctx->chunk = chunk;
+ return chunk;
+
+fail_unlock_chunk:
+ pr_err("Failed to load chunk #%ld\n", chunk->number);
+ up(&chunk->lock);
+ return ERR_PTR(ret);
+}
+
+static inline sector_t diff_area_chunk_start(struct diff_area *diff_area,
+ struct chunk *chunk)
+{
+ return (sector_t)(chunk->number)
+ << (diff_area->chunk_shift - SECTOR_SHIFT);
+}
+
+/**
+ * diff_area_image_io() - Implements copying data from the chunk to bio_vec
+ * when reading or from bio_vec to the chunk when writing.
+ * @io_ctx:
+ * The context of the I/O request to the snapshot image.
+ * @bvec:
+ * The bio_vec to copy data to or from.
+ * @pos:
+ * The current position on the snapshot image, advanced by the number of
+ * sectors copied.
+ *
+ * Return: BLK_STS_OK on success, or BLK_STS_IOERR on failure.
+ */
+blk_status_t diff_area_image_io(struct diff_area_image_ctx *io_ctx,
+ const struct bio_vec *bvec, sector_t *pos)
+{
+ unsigned int bv_len = bvec->bv_len;
+ struct iov_iter iter;
+
+ iov_iter_bvec(&iter, io_ctx->is_write ? WRITE : READ, bvec, 1, bv_len);
+
+ while (bv_len) {
+ struct diff_buffer_iter diff_buffer_iter;
+ struct chunk *chunk;
+ size_t buff_offset;
+
+ chunk = diff_area_image_context_get_chunk(io_ctx, *pos);
+ if (IS_ERR(chunk))
+ return BLK_STS_IOERR;
+
+ buff_offset = (size_t)(*pos - chunk_sector(chunk))
+ << SECTOR_SHIFT;
+ while (bv_len &&
+ diff_buffer_iter_get(chunk->diff_buffer, buff_offset,
+ &diff_buffer_iter)) {
+ size_t sz;
+
+ if (io_ctx->is_write)
+ sz = copy_page_from_iter(
+ diff_buffer_iter.page,
+ diff_buffer_iter.offset,
+ diff_buffer_iter.bytes,
+ &iter);
+ else
+ sz = copy_page_to_iter(
+ diff_buffer_iter.page,
+ diff_buffer_iter.offset,
+ diff_buffer_iter.bytes,
+ &iter);
+ if (!sz)
+ return BLK_STS_IOERR;
+
+ buff_offset += sz;
+ *pos += (sz >> SECTOR_SHIFT);
+ bv_len -= sz;
+ }
+ }
+
+ return BLK_STS_OK;
+}
+
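+/*
+ * Generates an event for user space indicating that the snapshot data for
+ * the original device is corrupted.
+ */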
+static inline void diff_area_event_corrupted(struct diff_area *diff_area,
+ int err_code)
+{
+ struct blk_snap_event_corrupted data = {
+ .orig_dev_id.mj = MAJOR(diff_area->orig_bdev->bd_dev),
+ .orig_dev_id.mn = MINOR(diff_area->orig_bdev->bd_dev),
+ .err_code = abs(err_code),
+ };
+
+ event_gen(&diff_area->diff_storage->event_queue, GFP_NOIO,
+ blk_snap_event_code_corrupted, &data,
+ sizeof(struct blk_snap_event_corrupted));
+}
+
+void diff_area_set_corrupted(struct diff_area *diff_area, int err_code)
+{
+ if (atomic_inc_return(&diff_area->corrupt_flag) != 1)
+ return;
+
+ diff_area_event_corrupted(diff_area, err_code);
+
+ pr_err("Set snapshot device is corrupted for [%u:%u] with error code %d\n",
+ MAJOR(diff_area->orig_bdev->bd_dev),
+ MINOR(diff_area->orig_bdev->bd_dev), abs(err_code));
+}
+
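+/*
+ * Throttles the caller: yields the CPU while there are pending I/O
+ * operations in the difference area, but for no longer than HZ / 10
+ * jiffies (about 100 ms).
+ */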
+void diff_area_throttling_io(struct diff_area *diff_area)
+{
+ u64 start_waiting;
+
+ start_waiting = jiffies_64;
+ while (atomic_read(&diff_area->pending_io_count)) {
+ schedule_timeout_interruptible(0);
+ if (jiffies_64 > (start_waiting + HZ / 10))
+ break;
+ }
+}
diff --git a/drivers/block/blksnap/diff_area.h b/drivers/block/blksnap/diff_area.h
new file mode 100644
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BLK_SNAP_DIFF_AREA_H
+#define __BLK_SNAP_DIFF_AREA_H
+
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/xarray.h>
+#include "event_queue.h"
+
+struct diff_storage;
+struct chunk;
+
+/**
+ * struct diff_area - Describes the difference area for one original device.
+ * @kref:
+ * The reference counter. The &struct diff_area can be shared between
+ * the &struct tracker and &struct snapimage.
+ * @orig_bdev:
+ * A pointer to the structure of an opened block device.
+ * @diff_storage:
+ * Pointer to difference storage for storing difference data.
+ * @chunk_shift:
+ * Power of two used to specify the chunk size. This allows setting
+ * different chunk sizes for huge and small block devices.
+ * @chunk_count:
+ * Count of chunks. The number of chunks into which the block device
+ * is divided.
+ * @chunk_map:
+ * A map of chunks.
+ * @caches_lock:
+ * This spinlock guarantees consistency of the linked lists of chunk
+ * caches.
+ * @read_cache_queue:
+ * Queue for the read cache.
+ * @read_cache_count:
+ * The number of chunks in the read cache.
+ * @write_cache_queue:
+ * Queue for the write cache.
+ * @write_cache_count:
+ * The number of chunks in the write cache.
+ * @cache_release_work:
+ * The workqueue work item. This worker limits the number of chunks
+ * that store their data in RAM.
+ * @free_diff_buffers_lock:
+ * This spinlock guarantees consistency of the linked lists of
+ * free difference buffers.
+ * @free_diff_buffers:
+ * A linked list of free difference buffers, which reduces the number
+ * of buffer allocation and release operations.
+ * @free_diff_buffers_count:
+ * The number of free difference buffers in the linked list.
+ * @corrupt_flag:
+ * The flag is set if an error occurred in the operation of the data
+ * saving mechanism in the diff area. In this case, an error will be
+ * generated when reading from the snapshot image.
+ * @pending_io_count:
+ * Counter of incomplete I/O operations. It allows waiting for all I/O
+ * operations to complete before releasing this structure.
+ *
+ * The &struct diff_area is created for each block device in the snapshot.
+ * It is used to save the differences between the original block device and
+ * the snapshot image. That is, when writing data to the original device,
+ * the differences are copied as chunks to the difference storage.
+ * Reading and writing from the snapshot image is also performed using
+ * &struct diff_area.
+ *
+ * The xarray has a limit on its maximum size. This can be especially
+ * noticeable on 32-bit systems and limits the size of supported disks.
+ *
+ * For example, for a 256 TiB disk with a chunk size of 65536 bytes, the
+ * number of elements in the chunk map will be 2^32 (256 TiB / 64 KiB).
+ * Therefore, the number of chunks into which the block device is divided
+ * is limited.
+ *
+ * To provide high performance, a read cache and a write cache for chunks
+ * are used. The caching algorithm is simple: if the data of a chunk was
+ * read into the difference buffer, the buffer is not released immediately
+ * but is placed at the end of the queue. The worker thread checks the
+ * number of chunks in the queue and releases the difference buffer of the
+ * first chunk in the queue, but only if the chunk's semaphore is not
+ * locked. If a read thread accesses a chunk from the cache again, the chunk
+ * is moved back to the end of the queue.
+ *
+ * The linked list of free difference buffers keeps a certain number of
+ * "hot" buffers around, which reduces the number of memory allocations
+ * and releases.
+ */
+struct diff_area {
+ struct kref kref;
+
+ struct block_device *orig_bdev;
+ struct diff_storage *diff_storage;
+
+ unsigned long long chunk_shift;
+ unsigned long chunk_count;
+ struct xarray chunk_map;
+
+ spinlock_t caches_lock;
+ struct list_head read_cache_queue;
+ atomic_t read_cache_count;
+ struct list_head write_cache_queue;
+ atomic_t write_cache_count;
+ struct work_struct cache_release_work;
+
+ spinlock_t free_diff_buffers_lock;
+ struct list_head free_diff_buffers;
+ atomic_t free_diff_buffers_count;
+
+ atomic_t corrupt_flag;
+ atomic_t pending_io_count;
+};
+
+struct diff_area *diff_area_new(dev_t dev_id,
+ struct diff_storage *diff_storage);
+void diff_area_free(struct kref *kref);
+static inline void diff_area_get(struct diff_area *diff_area)
+{
+ kref_get(&diff_area->kref);
+}
+static inline void diff_area_put(struct diff_area *diff_area)
+{
+ if (likely(diff_area))
+ kref_put(&diff_area->kref, diff_area_free);
+}
+void diff_area_set_corrupted(struct diff_area *diff_area, int err_code);
+static inline bool diff_area_is_corrupted(struct diff_area *diff_area)
+{
+ return !!atomic_read(&diff_area->corrupt_flag);
+}
+static inline sector_t diff_area_chunk_sectors(struct diff_area *diff_area)
+{
+ return (sector_t)(1ull << (diff_area->chunk_shift - SECTOR_SHIFT));
+}
+int diff_area_copy(struct diff_area *diff_area, sector_t sector, sector_t count,
+ const bool is_nowait);
+
+int diff_area_wait(struct diff_area *diff_area, sector_t sector, sector_t count,
+ const bool is_nowait);
+/**
+ * struct diff_area_image_ctx - The context for processing an io request to
+ * the snapshot image.
+ * @diff_area:
+ * Pointer to &struct diff_area for the current snapshot image.
+ * @is_write:
+ * Distinguishes between the behavior of reading or writing when
+ * processing a request.
+ * @chunk:
+ * Current chunk.
+ */
+struct diff_area_image_ctx {
+ struct diff_area *diff_area;
+ bool is_write;
+ struct chunk *chunk;
+};
+
+static inline void diff_area_image_ctx_init(struct diff_area_image_ctx *io_ctx,
+ struct diff_area *diff_area,
+ bool is_write)
+{
+ io_ctx->diff_area = diff_area;
+ io_ctx->is_write = is_write;
+ io_ctx->chunk = NULL;
+}
+void diff_area_image_ctx_done(struct diff_area_image_ctx *io_ctx);
+blk_status_t diff_area_image_io(struct diff_area_image_ctx *io_ctx,
+ const struct bio_vec *bvec, sector_t *pos);
+
+void diff_area_throttling_io(struct diff_area *diff_area);
+
+#endif /* __BLK_SNAP_DIFF_AREA_H */
This is perhaps the key component of the module. It stores information
about the modified blocks of the original device and the location of the
regions where these blocks are stored in the difference storage. This
information allows us to restore the state of the block device at the time
the snapshot was taken and to represent the snapshot image as a block
device.

When reading from a snapshot, if a block on the original device has not yet
been changed since the snapshot was taken, the data is read from the
original block device. If the data on the original block device has been
overwritten, the block is read from the difference storage.

Reads and writes are performed with minimal data storage blocks
(struct chunk).

Signed-off-by: Sergei Shtepa <sergei.shtepa@veeam.com>
---
 drivers/block/blksnap/diff_area.c | 656 ++++++++++++++++++++++++++++++
 drivers/block/blksnap/diff_area.h | 177 ++++++++
 2 files changed, 833 insertions(+)
 create mode 100644 drivers/block/blksnap/diff_area.c
 create mode 100644 drivers/block/blksnap/diff_area.h