Message ID: 20220901074216.1849941-17-hch@lst.de
State: New, archived
Series: [01/17] block: export bio_split_rw
On 9/1/22 16:42, Christoph Hellwig wrote:
> The current btrfs zoned device support is a little cumbersome in the data
> I/O path as it requires the callers to not support more I/O than the
> supported ZONE_APPEND size by the underlying device. This leads to a lot

Did you mean: "...as it requires the callers to not issue I/O larger than
the supported ZONE_APPEND size for the underlying device." ?
I think you do mean that :)
On Fri, Sep 02, 2022 at 10:46:13AM +0900, Damien Le Moal wrote:
> On 9/1/22 16:42, Christoph Hellwig wrote:
> > The current btrfs zoned device support is a little cumbersome in the data
> > I/O path as it requires the callers to not support more I/O than the
> > supported ZONE_APPEND size by the underlying device. This leads to a lot
>
> Did you mean: "...as it requires the callers to not issue I/O larger than
> the supported ZONE_APPEND size for the underlying device." ?
> I think you do mean that :)

Yes.
On 01.09.22 09:43, Christoph Hellwig wrote:
> -		if (btrfs_dev_is_sequential(dev, physical)) {
> -			u64 zone_start = round_down(physical,
> -						    dev->fs_info->zone_size);
> -
> -			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
> -		} else {
> -			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
> -			bio->bi_opf |= REQ_OP_WRITE;
> -		}
> +		ASSERT(btrfs_dev_is_sequential(dev, physical));
> +		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;

That ASSERT() will trigger on conventional zones, won't it?
On Mon, Sep 05, 2022 at 01:15:16PM +0000, Johannes Thumshirn wrote:
> On 01.09.22 09:43, Christoph Hellwig wrote:
> > +		ASSERT(btrfs_dev_is_sequential(dev, physical));
> > +		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
>
> That ASSERT() will trigger on conventional zones, won't it?

The assert is inside a

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {

btrfs_submit_chunk only sets the op to REQ_OP_ZONE_APPEND when
btrfs_use_zone_append returns true, which excludes conventional zones.
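To make that control flow concrete, here is a minimal self-contained
userspace sketch of the argument above. The helper names mirror
btrfs_use_zone_append(), btrfs_submit_chunk() and btrfs_submit_dev_bio()
as discussed in this thread, but the types, the choose_op() wrapper and
the rest of the structure are simplified assumptions for illustration,
not the actual btrfs code:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

enum req_op { REQ_OP_WRITE, REQ_OP_ZONE_APPEND };

struct zone { bool sequential; };

/* Models btrfs_use_zone_append(): never true for conventional zones. */
static bool use_zone_append(const struct zone *z)
{
	return z->sequential;
}

/* Models btrfs_submit_chunk(): the op is only switched to zone append
 * when use_zone_append() said yes, i.e. for sequential zones. */
static enum req_op choose_op(const struct zone *z)
{
	return use_zone_append(z) ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
}

/* Models btrfs_submit_dev_bio(): the ASSERT sits inside the op check,
 * so it is only reachable for bios that were marked zone append. */
static void submit_dev_bio(const struct zone *z, enum req_op op)
{
	if (op == REQ_OP_ZONE_APPEND)
		assert(z->sequential);
	printf("submitted %s to %s zone\n",
	       op == REQ_OP_ZONE_APPEND ? "ZONE_APPEND" : "WRITE",
	       z->sequential ? "sequential" : "conventional");
}

int main(void)
{
	struct zone seq = { .sequential = true };
	struct zone conv = { .sequential = false };

	submit_dev_bio(&seq, choose_op(&seq));
	submit_dev_bio(&conv, choose_op(&conv));	/* never hits the assert */
	return 0;
}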
On 05.09.22 16:25, Christoph Hellwig wrote:
> On Mon, Sep 05, 2022 at 01:15:16PM +0000, Johannes Thumshirn wrote:
>> On 01.09.22 09:43, Christoph Hellwig wrote:
>>> +		ASSERT(btrfs_dev_is_sequential(dev, physical));
>>> +		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
>>
>> That ASSERT() will trigger on conventional zones, won't it?
>
> The assert is inside a
>
> 	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
>
> btrfs_submit_chunk only sets the op to REQ_OP_ZONE_APPEND when
> btrfs_use_zone_append returns true, which excludes conventional zones.
>

hmm I got that one triggered with fsx:

+ /home/johannes/src/fstests/ltp/fsx -d /mnt/test/test
Seed set to 1
main: filesystem does not support fallocate mode 0, disabling!
main: filesystem does not support fallocate mode FALLOC_FL_KEEP_SIZE, disabling!
main: filesystem does not support fallocate mode FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, disabling!
main: filesystem does not support fallocate mode FALLOC_FL_ZERO_RANGE, disabling!
main: filesystem does not support fallocate mode FALLOC_FL_COLLAPSE_RANGE, disabling!
main: filesystem does not support fallocate mode FALLOC_FL_INSERT_RANGE, disabling!
1 mapwrite 0x27d31 thru 0x3171f (0x99ef bytes)
[    2.399348] assertion failed: btrfs_dev_is_sequential(dev, physical), in fs/btrfs/volumes.c:7034
[    2.400881] ------------[ cut here ]------------
[    2.401677] kernel BUG at fs/btrfs/ctree.h:3772!
[    2.402463] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[    2.402943] CPU: 0 PID: 233 Comm: fsx Not tainted 6.0.0-rc3-raid-stripe-tree-bio-split #313
[    2.402943] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014
[    2.402943] RIP: 0010:assertfail.constprop.0+0x18/0x1a [btrfs]
[    2.402943] Code: 83 ff ff 48 89 d9 48 89 ea 48 c7 c6 48 d9 1d a0 eb e5 89 f1 48 c7 c2 68 51 1d a0 48 89 fe 48 c7 c7 b0 d9 1d a0 e8 83 b0 4f e1 <0f> 0b be bf 16 00 00 48 c7 c7 d8 d9 1d a0 e8 d5 ff ff ff 49 8b 85
[    2.402943] RSP: 0018:ffffc9000015f8a8 EFLAGS: 00010286
[    2.402943] RAX: 0000000000000054 RBX: ffff888103f35428 RCX: 0000000000000000
[    2.402943] RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff
[    2.402943] RBP: ffff88811ad33148 R08: 00000000ffffefff R09: 00000000ffffefff
[    2.402943] R10: ffffffff8203cf80 R11: ffffffff8203cf80 R12: ffff88811ad330c0
[    2.402943] R13: 0000000000000002 R14: 0000000000000002 R15: ffff8881004957e8
[    2.402943] FS:  00007f87c5d23740(0000) GS:ffff888627c00000(0000) knlGS:0000000000000000
[    2.402943] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    2.402943] CR2: 00007f87c5ca0000 CR3: 0000000103fa8000 CR4: 00000000000006b0
[    2.402943] Call Trace:
[    2.402943]  <TASK>
[    2.402943]  btrfs_submit_dev_bio.cold+0x11/0x11 [btrfs]
[    2.402943]  __btrfs_submit_bio+0x8e/0x150 [btrfs]
[    2.402943]  btrfs_submit_chunk+0x12e/0x450 [btrfs]
[    2.402943]  btrfs_submit_bio+0x1e/0x30 [btrfs]
[    2.402943]  submit_one_bio+0x89/0xc0 [btrfs]
[    2.402943]  extent_write_locked_range+0x1d9/0x1f0 [btrfs]
[    2.402943]  run_delalloc_zoned+0x74/0x160 [btrfs]
[    2.402943]  btrfs_run_delalloc_range+0x16f/0x5e0 [btrfs]
[    2.402943]  ? find_lock_delalloc_range+0x27b/0x290 [btrfs]
[    2.402943]  writepage_delalloc+0xb9/0x180 [btrfs]
[    2.402943]  __extent_writepage+0x17f/0x340 [btrfs]
[    2.402943]  extent_write_cache_pages+0x193/0x410 [btrfs]
[    2.402943]  ? rt_mutex_trylock+0x2b/0x90
[    2.402943]  extent_writepages+0x60/0xe0 [btrfs]
[    2.402943]  do_writepages+0xac/0x180
[    2.402943]  ? balance_dirty_pages_ratelimited_flags+0xcd/0xb10
[    2.402943]  ? btrfs_inode_rsv_release+0x52/0xe0 [btrfs]
[    2.402943]  ? preempt_count_add+0x4e/0xb0
[    2.402943]  filemap_fdatawrite_range+0x76/0x80
[    2.402943]  start_ordered_ops.constprop.0+0x37/0x80 [btrfs]
[    2.402943]  btrfs_sync_file+0xb7/0x500 [btrfs]
[    2.402943]  __do_sys_msync+0x1dd/0x310
[    2.402943]  do_syscall_64+0x42/0x90
[    2.402943]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[    2.402943] RIP: 0033:0x7f87c5e31197
[    2.402943] Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 1a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10
[    2.402943] RSP: 002b:00007fff06ca0788 EFLAGS: 00000246 ORIG_RAX: 000000000000001a
[    2.402943] RAX: ffffffffffffffda RBX: 0000000000000d31 RCX: 00007f87c5e31197
[    2.402943] RDX: 0000000000000004 RSI: 000000000000a720 RDI: 00007f87c5c96000
[    2.402943] RBP: 0000000000027d31 R08: 0000000000000000 R09: 0000000000027000
[    2.402943] R10: 00007f87c5d33578 R11: 0000000000000246 R12: 00000000000099ef
[    2.402943] R13: 000000000000a720 R14: 00007f87c5c96000 R15: 00007f87c5f6b000
[    2.402943]  </TASK>
[    2.402943] Modules linked in: btrfs blake2b_generic xor lzo_compress zlib_deflate raid6_pq zstd_decompress zstd_compress xxhash null_blk
[    2.402943] Dumping ftrace buffer:
[    2.402943]    (ftrace buffer empty)
On Mon, Sep 05, 2022 at 02:31:53PM +0000, Johannes Thumshirn wrote:
> hmm I got that one triggered with fsx:
>
> + /home/johannes/src/fstests/ltp/fsx -d /mnt/test/test

Odd. Is this a raid stripe tree setup where one copy is using zone
append and another isn't? Because without that I can't see how
this would happen. If not, can you send me the reproducer including
the mkfs line?
On 05.09.22 16:40, Christoph Hellwig wrote:
> On Mon, Sep 05, 2022 at 02:31:53PM +0000, Johannes Thumshirn wrote:
>> hmm I got that one triggered with fsx:
>>
>> + /home/johannes/src/fstests/ltp/fsx -d /mnt/test/test
>
> Odd. Is this a raid stripe tree setup where one copy is using zone
> append and another isn't? Because without that I can't see how
> this would happen. If not, can you send me the reproducer including
> the mkfs line?

The stripe tree doesn't touch anything before endio, but to be safe I'm
retesting without my patches applied.
On 05.09.22 16:40, Christoph Hellwig wrote:
> On Mon, Sep 05, 2022 at 02:31:53PM +0000, Johannes Thumshirn wrote:
>> hmm I got that one triggered with fsx:
>>
>> + /home/johannes/src/fstests/ltp/fsx -d /mnt/test/test
>
> Odd. Is this a raid stripe tree setup where one copy is using zone
> append and another isn't? Because without that I can't see how
> this would happen. If not, can you send me the reproducer including
> the mkfs line?

OK, it seems totally unrelated to your patchset and introduced by my
patches. Odd, but not your problem. Sorry for the noise.
On Thu, Sep 01, 2022 at 10:42:15AM +0300, Christoph Hellwig wrote:
> The current btrfs zoned device support is a little cumbersome in the data
> I/O path as it requires the callers to not support more I/O than the
> supported ZONE_APPEND size by the underlying device. This leads to a lot
> of extra accounting. Instead change btrfs_submit_bio so that it can take
> write bios of arbitrary size and form from the upper layers, and just
> split them internally to the ZONE_APPEND queue limits. Then remove all
> the upper layer warts catering to limited write sized on zoned devices,
> including the extra refcount in the compressed_bio.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Josef Bacik <josef@toxicpanda.com>

tho I'd trust the zoned guys' reviews over mine here.

Thanks,

Josef
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5e8b75b030ace..f89cac08dc4a4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -255,57 +255,14 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_compressed_bio_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = bbio->private; + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - - if (refcount_dec_and_test(&cb->pending_ios)) { - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); - } bio_put(&bbio->bio); } -/* - * Allocate a compressed_bio, which will be used to read/write on-disk - * (aka, compressed) * data. - * - * @cb: The compressed_bio structure, which records all the needed - * information to bind the compressed data to the uncompressed - * page cache. - * @disk_byten: The logical bytenr where the compressed data will be read - * from or written to. - * @endio_func: The endio function to call after the IO for compressed data - * is finished. - */ -static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, - blk_opf_t opf, - btrfs_bio_end_io_t endio_func) -{ - struct bio *bio; - - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, cb->inode, endio_func, cb); - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - struct extent_map *em; - - em = btrfs_get_chunk_map(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(em)) { - bio_put(bio); - return ERR_CAST(em); - } - - bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); - free_extent_map(em); - } - refcount_inc(&cb->pending_ios); - return bio; -} - /* * worker function to build and submit bios for previously compressed pages. * The corresponding pages in the inode should be marked for writeback @@ -329,16 +286,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; blk_status_t ret = BLK_STS_OK; - const bool use_append = btrfs_use_zone_append(inode, disk_start); - const enum req_op bio_op = REQ_BTRFS_ONE_ORDERED | - (use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; @@ -349,8 +302,15 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - if (blkcg_css) + if (blkcg_css) { kthread_associate_blkcg(blkcg_css); + write_flags |= REQ_CGROUP_PUNT; + } + + write_flags |= REQ_BTRFS_ONE_ORDERED; + bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, + cb->inode, end_compressed_bio_write, cb); + bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; @@ -358,19 +318,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int real_size; unsigned int added; struct page *page = compressed_pages[index]; - bool submit = false; - - /* Allocate new bio if submitted or not yet allocated */ - if (!bio) { - bio = alloc_compressed_bio(cb, cur_disk_bytenr, - bio_op | write_flags, end_compressed_bio_write); - if (IS_ERR(bio)) { - ret = errno_to_blk_status(PTR_ERR(bio)); - break; - } - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - } + /* * We have various limits on the real read size: * - page boundary @@ -380,36 +328,21 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, real_size = min_t(u64, real_size, compressed_len - offset); ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - if (use_append) - added = bio_add_zone_append_page(bio, page, real_size, - offset_in_page(offset)); - else - added = bio_add_page(bio, page, real_size, - offset_in_page(offset)); - /* Reached zoned boundary */ - if (added == 0) - submit = true; - + added = bio_add_page(bio, page, real_size, offset_in_page(offset)); + /* + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always success. 
+ */ + ASSERT(added == real_size); cur_disk_bytenr += added; - - /* Finished the range */ - if (cur_disk_bytenr == disk_start + compressed_len) - submit = true; - - if (submit) { - ASSERT(bio->bi_iter.bi_size); - btrfs_bio(bio)->file_offset = start; - btrfs_submit_bio(fs_info, bio, 0); - bio = NULL; - } - cond_resched(); } + /* Finished the range */ + ASSERT(bio->bi_iter.bi_size); + btrfs_bio(bio)->file_offset = start; + btrfs_submit_bio(fs_info, bio, 0); if (blkcg_css) kthread_associate_blkcg(NULL); - - if (refcount_dec_and_test(&cb->pending_ios)) - finish_compressed_bio_write(cb); return ret; } @@ -613,7 +546,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto out; } - refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1aa02903de697..25876f7a26949 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -30,9 +30,6 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of outstanding bios */ - refcount_t pending_ios; - /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33e80f8dd0b1b..40dadc46e00d8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2597,7 +2597,6 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig = false; - int ret; ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ @@ -2646,12 +2645,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, if (real_size == 0) return 0; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); - else - ret = bio_add_page(bio, page, real_size, pg_offset); - - return ret; + return bio_add_page(bio, page, real_size, pg_offset); } static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, @@ -2666,7 +2660,7 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * to them. */ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && - bio_op(bio_ctrl->bio) == REQ_OP_ZONE_APPEND) { + btrfs_use_zone_append(inode, logical)) { ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (ordered) { bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, @@ -2680,17 +2674,15 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, bio_ctrl->len_to_oe_boundary = U32_MAX; } -static int alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, - blk_opf_t opf, - btrfs_bio_end_io_t end_io_func, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, blk_opf_t opf, + btrfs_bio_end_io_t end_io_func, + u64 disk_bytenr, u32 offset, u64 file_offset, + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; - int ret; bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, &inode->vfs_inode, end_io_func, NULL); @@ -2708,40 +2700,14 @@ static int alloc_new_bio(struct btrfs_inode *inode, if (wbc) { /* - * For Zone append we need the correct block_device that we are - * going to write to set in the bio to be able to respect the - * hardware limitation. 
Look it up here: + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *dev; - - dev = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto error; - } - - bio_set_dev(bio, dev->bdev); - } else { - /* - * Otherwise pick the last added device to support - * cgroup writeback. For multi-device file systems this - * means blk-cgroup policies have to always be set on the - * last added/replaced device. This is a bit odd but has - * been like that for a long time. - */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - } + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, bio); - } else { - ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } - return 0; -error: - bio_ctrl->bio = NULL; - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return ret; } /* @@ -2767,7 +2733,6 @@ static int submit_extent_page(blk_opf_t opf, enum btrfs_compression_type compress_type, bool force_bio_submit) { - int ret = 0; struct btrfs_inode *inode = BTRFS_I(page->mapping->host); unsigned int cur = pg_offset; @@ -2784,12 +2749,9 @@ static int submit_extent_page(blk_opf_t opf, /* Allocate new bio if needed */ if (!bio_ctrl->bio) { - ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, - end_io_func, disk_bytenr, offset, - page_offset(page) + cur, - compress_type); - if (ret < 0) - return ret; + alloc_new_bio(inode, bio_ctrl, wbc, opf, end_io_func, + disk_bytenr, offset, + page_offset(page) + cur, compress_type); } /* * We must go through btrfs_bio_add_page() to ensure each @@ -3354,10 +3316,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * find_next_dirty_byte() are all exclusive */ iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - - if (btrfs_use_zone_append(inode, em->block_start)) - op = REQ_OP_ZONE_APPEND; - free_extent_map(em); em = NULL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9c562d36e4570..1a0bf381f2437 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7727,10 +7727,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - - if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) - iomap->flags |= IOMAP_F_ZONE_APPEND; - free_extent_map(em); return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e497b63238189..0d828b58cc9c3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6632,13 +6632,22 @@ struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, return bio; } -static struct bio *btrfs_split_bio(struct bio *orig, u64 map_length) +static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct bio *orig, u64 map_length, + bool use_append) { struct btrfs_bio *orig_bbio = btrfs_bio(orig); struct bio *bio; - bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, - &btrfs_clone_bioset); + if (use_append) { + unsigned int nr_segs; + + bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { + bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, + &btrfs_clone_bioset); + } btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); btrfs_bio(bio)->file_offset = 
orig_bbio->file_offset; @@ -6970,16 +6979,10 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) */ if (bio_op(bio) == REQ_OP_ZONE_APPEND) { u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 zone_start = round_down(physical, dev->fs_info->zone_size); - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } + ASSERT(btrfs_dev_is_sequential(dev, physical)); + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", @@ -7179,9 +7182,11 @@ static bool btrfs_submit_chunk(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *bi = BTRFS_I(bbio->inode); u64 logical = bio->bi_iter.bi_sector << 9; u64 length = bio->bi_iter.bi_size; u64 map_length = length; + bool use_append = btrfs_use_zone_append(bi, logical); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; int ret; @@ -7193,8 +7198,11 @@ static bool btrfs_submit_chunk(struct btrfs_fs_info *fs_info, struct bio *bio, goto fail; map_length = min(map_length, length); + if (use_append) + map_length = min(map_length, fs_info->max_zone_append_size); + if (map_length < length) { - bio = btrfs_split_bio(bio, map_length); + bio = btrfs_split_bio(fs_info, bio, map_length, use_append); bbio = btrfs_bio(bio); } @@ -7210,9 +7218,9 @@ static bool btrfs_submit_chunk(struct btrfs_fs_info *fs_info, struct bio *bio, } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - struct btrfs_inode *bi = BTRFS_I(bbio->inode); - - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); if (ret) goto fail_put_bio; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 6e04fbbd76b92..988e9fc5a6b7b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1818,26 +1818,6 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; - /* We only support single profile for now */ - device = map->stripes[0].dev; - - free_extent_map(em); - - return device; -} - /** * Activate block group and underlying device zones * diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 0f22b22fe359f..74153ab52169f 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -64,8 +64,6 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); -struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); bool btrfs_zone_activate(struct btrfs_block_group *block_group); int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); @@ -209,13 +207,6 @@ static 
inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } -static inline struct btrfs_device *btrfs_zoned_get_device( - struct btrfs_fs_info *fs_info, - u64 logical, u64 length) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true;
The current btrfs zoned device support is a little cumbersome in the data
I/O path as it requires the callers to not support more I/O than the
supported ZONE_APPEND size by the underlying device. This leads to a lot
of extra accounting. Instead change btrfs_submit_bio so that it can take
write bios of arbitrary size and form from the upper layers, and just
split them internally to the ZONE_APPEND queue limits. Then remove all
the upper layer warts catering to limited write sized on zoned devices,
including the extra refcount in the compressed_bio.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/compression.c | 112 ++++++++---------------------------------
 fs/btrfs/compression.h |   3 --
 fs/btrfs/extent_io.c   |  74 ++++++---------------------
 fs/btrfs/inode.c       |   4 --
 fs/btrfs/volumes.c     |  40 +++++++++------
 fs/btrfs/zoned.c       |  20 --------
 fs/btrfs/zoned.h       |   9 ----
 7 files changed, 62 insertions(+), 200 deletions(-)
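The splitting behavior the commit message describes can be illustrated
with a small self-contained model. The clamp to max_zone_append_size and
the split-and-submit loop mirror what the patch adds around
btrfs_submit_chunk()/btrfs_split_bio(), but every type, helper name and
number below is a simplified assumption for illustration, not the kernel
API:

#include <stdio.h>

#define SZ_1M (1024ULL * 1024ULL)

struct toy_bio {
	unsigned long long start;	/* logical start */
	unsigned long long len;		/* remaining length */
};

/* Models the per-chunk mapping limit; fixed here for the example. */
static unsigned long long chunk_map_length(unsigned long long start)
{
	(void)start;
	return 64 * SZ_1M;
}

static void submit(unsigned long long start, unsigned long long len)
{
	printf("submit bio: start=%llu len=%llu\n", start, len);
}

/* Models submitting one large write: each iteration clamps the mapped
 * length to the zone append limit and submits the split piece. */
static void submit_write(struct toy_bio *bio,
			 unsigned long long max_zone_append_size)
{
	while (bio->len) {
		unsigned long long map_length = chunk_map_length(bio->start);

		if (map_length > bio->len)
			map_length = bio->len;
		/* The clamp this patch adds for zone append writes. */
		if (map_length > max_zone_append_size)
			map_length = max_zone_append_size;

		submit(bio->start, map_length);	/* models the split + submit */
		bio->start += map_length;
		bio->len -= map_length;
	}
}

int main(void)
{
	struct toy_bio bio = { .start = 0, .len = 3 * SZ_1M + 4096 };

	submit_write(&bio, SZ_1M);	/* 1 MiB zone append limit */
	return 0;
}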