[05/19] btrfs: handle checksum generation in the storage layer

Message ID	20221120124734.18634-6-hch@lst.de (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-block-owner@kernel.org> From: Christoph Hellwig <hch@lst.de> To: Chris Mason <clm@fb.com>, Josef Bacik <josef@toxicpanda.com>, David Sterba <dsterba@suse.com> Cc: Damien Le Moal <damien.lemoal@wdc.com>, Naohiro Aota <naohiro.aota@wdc.com>, Johannes Thumshirn <johannes.thumshirn@wdc.com>, Qu Wenruo <wqu@suse.com>, Jens Axboe <axboe@kernel.dk>, "Darrick J. Wong" <djwong@kernel.org>, linux-block@vger.kernel.org, linux-btrfs@vger.kernel.org, linux-fsdevel@vger.kernel.org Subject: [PATCH 05/19] btrfs: handle checksum generation in the storage layer Date: Sun, 20 Nov 2022 13:47:20 +0100 Message-Id: <20221120124734.18634-6-hch@lst.de> In-Reply-To: <20221120124734.18634-1-hch@lst.de> References: <20221120124734.18634-1-hch@lst.de> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	[01/19] block: export bio_split_rw \| expand [01/19] block: export bio_split_rw [02/19] btrfs: handle checksum validation and repair at the storage layer [03/19] btrfs: remove the submit_bio_start helpers [04/19] btrfs: simplify the btrfs_csum_one_bio calling convention [05/19] btrfs: handle checksum generation in the storage layer [06/19] btrfs: handle recording of zoned writes in the storage layer [07/19] btrfs: support cloned bios in btree_csum_one_bio [08/19] btrfs: allow btrfs_submit_bio to split bios [09/19] btrfs: pass the iomap bio to btrfs_submit_bio [10/19] btrfs: remove stripe boundary calculation for buffered I/O [11/19] btrfs: remove stripe boundary calculation for compressed I/O [12/19] btrfs: remove stripe boundary calculation for encoded I/O [13/19] btrfs: remove struct btrfs_io_geometry [14/19] btrfs: remove submit_encoded_read_bio [15/19] btrfs: remove the fs_info argument to btrfs_submit_bio [16/19] btrfs: remove now spurious bio submission helpers [17/19] btrfs: calculate file system wide queue limit for zoned mode [18/19] btrfs: split zone append bios in btrfs_submit_bio [19/19] iomap: remove IOMAP_F_ZONE_APPEND

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index e1fbe7f67456b..67ee94fb92523 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -403,7 +403,169 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + /* Do not leak our private flag into the block layer */ + bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; + + if (!bioc) { + /* Single mirror read/write fast path */ + btrfs_bio(bio)->mirror_num = mirror_num; + bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + bio->bi_private = smap->dev; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap->dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity RAID write or read recovery */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); + else + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors */ + int total_devs = bioc->num_stripes; + int dev_nr; + + bioc->orig_bio = bio; + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } +} + +static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +{ + if (bbio->bio.bi_opf & REQ_META) + return btree_csum_one_bio(&bbio->bio); + return btrfs_csum_one_bio(bbio); +} + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. + */ +struct async_submit_bio { + struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; + struct btrfs_io_stripe smap; + int mirror_num; + struct btrfs_work work; +}; + +/* + * In order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static void run_one_async_start(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + blk_status_t ret; + + ret = btrfs_bio_csum(async->bbio); + if (ret) + async->bbio->bio.bi_status = ret; +} + +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. + */ +static void run_one_async_done(struct btrfs_work *work) +{ + struct async_submit_bio *async = + container_of(work, struct async_submit_bio, work); + struct bio *bio = &async->bbio->bio; + + /* If an error occurred we just want to clean up the bio and move on */ + if (bio->bi_status) { + btrfs_bio_end_io(async->bbio, bio->bi_status); + return; + } + + /* + * All of the bios that pass through here are from async helpers. + * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. + * This changes nothing when cgroups aren't in use. + */ + bio->bi_opf |= REQ_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + kfree(container_of(work, struct async_submit_bio, work)); +} + +static bool should_async_write(struct btrfs_bio *bbio) +{ + /* + * If the I/O is not issued by fsync and friends, (->sync_writers != 0), + * then try to defer the submission to a workqueue to parallelize the + * checksum calculation. + */ + if (atomic_read(&bbio->inode->sync_writers)) + return false; + + /* + * Submit metadata writes synchronously if the checksum implementation + * is fast, or we are on a zoned device that wants I/O to be submitted + * in order. + */ + if (bbio->bio.bi_opf & REQ_META) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) + return false; + } + + return true; +} + +/* + * Submit bio to an async queue. + * + * Returns true if the work has been succesfuly submitted, else false. + */ +static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) +{ + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return false; + + async->bbio = bbio; + async->bioc = bioc; + async->smap = *smap; + async->mirror_num = mirror_num; + + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, + run_one_async_free); + if (op_is_sync(bbio->bio.bi_opf)) + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; +} + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num) { struct btrfs_bio *bbio = btrfs_bio(bio); u64 logical = bio->bi_iter.bi_sector << 9; @@ -440,34 +602,25 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror goto fail; } - /* Do not leak our private flag into the block layer */ - bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; - - if (!bioc) { - /* Single mirror read/write fast path */ - btrfs_bio(bio)->mirror_num = mirror_num; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = smap.dev; - bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); - } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ - bio->bi_private = bioc; - bio->bi_end_io = btrfs_raid56_end_io; - if (bio_op(bio) == REQ_OP_READ) - raid56_parity_recover(bio, bioc, mirror_num); - else - raid56_parity_write(bio, bioc); - } else { - /* Write to multiple mirrors */ - int total_devs = bioc->num_stripes; - int dev_nr; - - bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) - btrfs_submit_mirrored_bio(bioc, dev_nr); + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + /* + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. + */ + if (!(bbio->inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(bbio->inode->root)) { + if (should_async_write(bbio) && + btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) + return; + + ret = btrfs_bio_csum(bbio); + if (ret) + goto fail; + } } + __btrfs_submit_bio(bio, bioc, &smap, mirror_num); return; fail: btrfs_bio_counter_dec(fs_info); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9e695048e30f5..7ca4f97f5ec92 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -355,7 +355,6 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, u64 cur_disk_bytenr = disk_start; u64 next_stripe_start; blk_status_t ret = BLK_STS_OK; - int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); const enum req_op bio_op = REQ_BTRFS_ONE_ORDERED | (use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE); @@ -437,14 +436,6 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, submit = true; if (submit) { - if (!skip_sum) { - ret = btrfs_csum_one_bio(btrfs_bio(bio)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - break; - } - } - ASSERT(bio->bi_iter.bi_size); btrfs_submit_bio(fs_info, bio, 0); bio = NULL; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ae0ca51adc6e2..2018d1b9ff3a5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -52,7 +52,6 @@ #include "relocation.h" #include "scrub.h" #include "super.h" -#include "file-item.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -79,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) crypto_free_shash(fs_info->csum_shash); } -/* - * async submit bios are used to offload expensive checksumming - * onto the worker threads. They checksum file and metadata bios - * just before they are sent down the IO stack. - */ -struct async_submit_bio { - struct btrfs_inode *inode; - struct bio *bio; - enum btrfs_wq_submit_cmd submit_cmd; - int mirror_num; - - /* Optional parameter for used by direct io */ - u64 dio_file_offset; - struct btrfs_work work; - blk_status_t status; -}; - /* * Compute the csum of a btree block and store the result to provided buffer. */ @@ -449,7 +431,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return csum_one_extent_buffer(eb); } -static blk_status_t btree_csum_one_bio(struct bio *bio) +blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; @@ -709,139 +691,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, return ret; } -static void run_one_async_start(struct btrfs_work *work) -{ - struct async_submit_bio *async; - blk_status_t ret; - - async = container_of(work, struct async_submit_bio, work); - switch (async->submit_cmd) { - case WQ_SUBMIT_METADATA: - ret = btree_csum_one_bio(async->bio); - break; - case WQ_SUBMIT_DATA: - case WQ_SUBMIT_DATA_DIO: - ret = btrfs_csum_one_bio(btrfs_bio(async->bio)); - break; - } - if (ret) - async->status = ret; -} - -/* - * In order to insert checksums into the metadata in large chunks, we wait - * until bio submission time. All the pages in the bio are checksummed and - * sums are attached onto the ordered extent record. - * - * At IO completion time the csums attached on the ordered extent record are - * inserted into the tree. - */ -static void run_one_async_done(struct btrfs_work *work) -{ - struct async_submit_bio *async = - container_of(work, struct async_submit_bio, work); - struct btrfs_inode *inode = async->inode; - struct btrfs_bio *bbio = btrfs_bio(async->bio); - - /* If an error occurred we just want to clean up the bio and move on */ - if (async->status) { - btrfs_bio_end_io(bbio, async->status); - return; - } - - /* - * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. - */ - async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - struct async_submit_bio *async; - - async = container_of(work, struct async_submit_bio, work); - kfree(async); -} - -/* - * Submit bio to an async queue. - * - * Retrun: - * - true if the work has been succesfuly submitted - * - false in case of error - */ -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_submit_bio *async; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return false; - - async->inode = inode; - async->bio = bio; - async->mirror_num = mirror_num; - async->submit_cmd = cmd; - - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); - - async->dio_file_offset = dio_file_offset; - - async->status = 0; - - if (op_is_sync(bio->bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); - return true; -} - -static bool should_async_write(struct btrfs_fs_info *fs_info, - struct btrfs_inode *bi) -{ - if (btrfs_is_zoned(fs_info)) - return false; - if (atomic_read(&bi->sync_writers)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - return true; -} - void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_bio *bbio = btrfs_bio(bio); - blk_status_t ret; - bio->bi_opf |= REQ_META; - - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - btrfs_submit_bio(fs_info, bio, mirror_num); - return; - } - - /* - * Kthread helpers are used to submit writes so that checksumming can - * happen in parallel across all CPUs. - */ - if (should_async_write(fs_info, inode) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) - return; - - ret = btree_csum_one_bio(bio); - if (ret) { - btrfs_bio_end_io(bbio, ret); - return; - } - - btrfs_submit_bio(fs_info, bio, mirror_num); + btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); } #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 07ac66d693aee..1e04ecc43a2ab 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -114,14 +114,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, struct btrfs_tree_parent_check *check); -enum btrfs_wq_submit_cmd { - WQ_SUBMIT_METADATA, - WQ_SUBMIT_DATA, - WQ_SUBMIT_DATA_DIO, -}; - -bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +blk_status_t btree_csum_one_bio(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e58cf151cb5b1..d569d9a2d0c5b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2721,27 +2721,6 @@ void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int } } - /* - * If we need to checksum, and the I/O is not issued by fsync and - * friends, that is ->sync_writers != 0, defer the submission to a - * workqueue to parallelize it. - * - * Csum items for reloc roots have already been cloned at this point, - * so they are handled as part of the no-checksum case. - */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(inode->root)) { - if (!atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) - return; - - ret = btrfs_csum_one_bio(btrfs_bio(bio)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } btrfs_submit_bio(fs_info, bio, mirror_num); } @@ -7841,36 +7820,6 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) btrfs_dio_private_put(dip); } -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - blk_status_t ret; - - if (inode->flags & BTRFS_INODE_NODATASUM) - goto map; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&inode->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, - WQ_SUBMIT_DATA_DIO)) - return; - - /* - * If we aren't doing async submit, calculate the csum of the - * bio now. - */ - ret = btrfs_csum_one_bio(btrfs_bio(bio)); - if (ret) { - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; - } - } -map: - btrfs_submit_bio(fs_info, bio, 0); -} - static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { @@ -7878,11 +7827,8 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, container_of(dio_bio, struct btrfs_dio_private, bio); struct inode *inode = iter->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - const bool raid56 = (btrfs_data_alloc_profile(fs_info) & - BTRFS_BLOCK_GROUP_RAID56_MASK); struct bio *bio; u64 start_sector; - int async_submit = 0; u64 submit_len; u64 clone_offset = 0; u64 clone_len; @@ -7949,19 +7895,10 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, * We transfer the initial reference to the last bio, so we * don't need to increment the reference count for the last one. */ - if (submit_len > 0) { + if (submit_len > 0) refcount_inc(&dip->refs); - /* - * If we are submitting more than one bio, submit them - * all asynchronously. The exception is RAID 5 or 6, as - * asynchronous checksums make it difficult to collect - * full stripe writes. - */ - if (!raid56) - async_submit = 1; - } - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); + btrfs_submit_bio(fs_info, bio, 0); dio_data->submitted += clone_len; clone_offset += clone_len;

[05/19] btrfs: handle checksum generation in the storage layer

Commit Message

Comments

Patch