Message ID | 98d6609df5dc669df4025c257c28077f44b21e04.1701464169.git.boris@bur.io (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: qgroups rsv fixes | expand |
On 2023/12/2 07:30, Boris Burkov wrote: > The reserved data counter and input parameter is a u64, but we > inadvertantly accumulate it in an int. Overflowing that int results in > freeing the wrong amount of data and breaking rsv accounting. > > Unfortunately, this overflow rot spreads from there, as the qgroup > release/free functions rely on returning an int to take advantage of > negative values for error codes. Indeed, reusing int for both released bytes and error number is the root cause of the overflow. > > Therefore, the full fix is to return the "released" or "freed" amount by > a u64* argument and to return 0 or negative error code via the return > value. > > Most of the callsites simply ignore the return value, though some > of them handle the error and count the returned bytes. Change all of > them accordingly. > > Signed-off-by: Boris Burkov <boris@bur.io> Reviewed-by: Qu Wenruo <wqu@suse.com> Thanks, Qu > --- > fs/btrfs/delalloc-space.c | 2 +- > fs/btrfs/file.c | 2 +- > fs/btrfs/inode.c | 16 ++++++++-------- > fs/btrfs/ordered-data.c | 7 ++++--- > fs/btrfs/qgroup.c | 25 +++++++++++++++---------- > fs/btrfs/qgroup.h | 4 ++-- > 6 files changed, 31 insertions(+), 25 deletions(-) > > diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c > index 51453d4928fa..2833e8ef4c09 100644 > --- a/fs/btrfs/delalloc-space.c > +++ b/fs/btrfs/delalloc-space.c > @@ -199,7 +199,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, > start = round_down(start, fs_info->sectorsize); > > btrfs_free_reserved_data_space_noquota(fs_info, len); > - btrfs_qgroup_free_data(inode, reserved, start, len); > + btrfs_qgroup_free_data(inode, reserved, start, len, NULL); > } > > /* > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index e9c4b947a5aa..7a71720aaed2 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -3192,7 +3192,7 @@ static long btrfs_fallocate(struct file *file, int mode, > qgroup_reserved -= range->len; > } else if (qgroup_reserved > 0) { > btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, > - range->start, range->len); > + range->start, range->len, NULL); > qgroup_reserved -= range->len; > } > list_del(&range->list); > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index f8647d8271b7..e79a047aa5d1 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -697,7 +697,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, > * And at reserve time, it's always aligned to page size, so > * just free one page here. > */ > - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); > + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); > btrfs_free_path(path); > btrfs_end_transaction(trans); > return ret; > @@ -5141,7 +5141,7 @@ static void evict_inode_truncate_pages(struct inode *inode) > */ > if (state_flags & EXTENT_DELALLOC) > btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, > - end - start + 1); > + end - start + 1, NULL); > > clear_extent_bit(io_tree, start, end, > EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, > @@ -8076,7 +8076,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, > * reserved data space. > * Since the IO will never happen for this page. > */ > - btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); > + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); > if (!inode_evicting) { > clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | > EXTENT_DELALLOC | EXTENT_UPTODATE | > @@ -9513,7 +9513,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( > struct btrfs_path *path; > u64 start = ins->objectid; > u64 len = ins->offset; > - int qgroup_released; > + u64 qgroup_released = 0; > int ret; > > memset(&stack_fi, 0, sizeof(stack_fi)); > @@ -9526,9 +9526,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( > btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); > /* Encryption and other encoding is reserved and all 0 */ > > - qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); > - if (qgroup_released < 0) > - return ERR_PTR(qgroup_released); > + ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); > + if (ret < 0) > + return ERR_PTR(ret); > > if (trans) { > ret = insert_reserved_file_extent(trans, inode, > @@ -10423,7 +10423,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, > btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); > out_qgroup_free_data: > if (ret < 0) > - btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); > + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); > out_free_data_space: > /* > * If btrfs_reserve_extent() succeeded, then we already decremented > diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c > index 8d4ab5ecfa5d..c68fb78b7454 100644 > --- a/fs/btrfs/ordered-data.c > +++ b/fs/btrfs/ordered-data.c > @@ -152,11 +152,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( > { > struct btrfs_ordered_extent *entry; > int ret; > + u64 qgroup_rsv = 0; > > if (flags & > ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { > /* For nocow write, we can release the qgroup rsv right now */ > - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); > + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); > if (ret < 0) > return ERR_PTR(ret); > } else { > @@ -164,7 +165,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( > * The ordered extent has reserved qgroup space, release now > * and pass the reserved number for qgroup_record to free. > */ > - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); > + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); > if (ret < 0) > return ERR_PTR(ret); > } > @@ -182,7 +183,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( > entry->inode = igrab(&inode->vfs_inode); > entry->compress_type = compress_type; > entry->truncated_len = (u64)-1; > - entry->qgroup_rsv = ret; > + entry->qgroup_rsv = qgroup_rsv; > entry->flags = flags; > refcount_set(&entry->refs, 1); > init_waitqueue_head(&entry->wait); > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c > index ce446d9d7f23..a953c16c7eb8 100644 > --- a/fs/btrfs/qgroup.c > +++ b/fs/btrfs/qgroup.c > @@ -4057,13 +4057,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, > > /* Free ranges specified by @reserved, normally in error path */ > static int qgroup_free_reserved_data(struct btrfs_inode *inode, > - struct extent_changeset *reserved, u64 start, u64 len) > + struct extent_changeset *reserved, > + u64 start, u64 len, u64 *freed_ret) > { > struct btrfs_root *root = inode->root; > struct ulist_node *unode; > struct ulist_iterator uiter; > struct extent_changeset changeset; > - int freed = 0; > + u64 freed = 0; > int ret; > > extent_changeset_init(&changeset); > @@ -4104,7 +4105,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, > } > btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, > BTRFS_QGROUP_RSV_DATA); > - ret = freed; > + if (freed_ret) > + *freed_ret = freed; > + ret = 0; > out: > extent_changeset_release(&changeset); > return ret; > @@ -4112,7 +4115,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, > > static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, > struct extent_changeset *reserved, u64 start, u64 len, > - int free) > + u64 *released, int free) > { > struct extent_changeset changeset; > int trace_op = QGROUP_RELEASE; > @@ -4128,7 +4131,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, > /* In release case, we shouldn't have @reserved */ > WARN_ON(!free && reserved); > if (free && reserved) > - return qgroup_free_reserved_data(inode, reserved, start, len); > + return qgroup_free_reserved_data(inode, reserved, start, len, released); > extent_changeset_init(&changeset); > ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, > EXTENT_QGROUP_RESERVED, &changeset); > @@ -4143,7 +4146,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, > btrfs_qgroup_free_refroot(inode->root->fs_info, > inode->root->root_key.objectid, > changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); > - ret = changeset.bytes_changed; > + if (released) > + *released = changeset.bytes_changed; > out: > extent_changeset_release(&changeset); > return ret; > @@ -4162,9 +4166,10 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, > * NOTE: This function may sleep for memory allocation. > */ > int btrfs_qgroup_free_data(struct btrfs_inode *inode, > - struct extent_changeset *reserved, u64 start, u64 len) > + struct extent_changeset *reserved, > + u64 start, u64 len, u64 *freed) > { > - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); > + return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); > } > > /* > @@ -4182,9 +4187,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, > * > * NOTE: This function may sleep for memory allocation. > */ > -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) > +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) > { > - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); > + return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); > } > > static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, > diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h > index 855a4f978761..15b485506104 100644 > --- a/fs/btrfs/qgroup.h > +++ b/fs/btrfs/qgroup.h > @@ -358,10 +358,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, > /* New io_tree based accurate qgroup reserve API */ > int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, > struct extent_changeset **reserved, u64 start, u64 len); > -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); > +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); > int btrfs_qgroup_free_data(struct btrfs_inode *inode, > struct extent_changeset *reserved, u64 start, > - u64 len); > + u64 len, u64 *freed); > int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, > enum btrfs_qgroup_rsv_type type, bool enforce); > int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 51453d4928fa..2833e8ef4c09 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -199,7 +199,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, start = round_down(start, fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(fs_info, len); - btrfs_qgroup_free_data(inode, reserved, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e9c4b947a5aa..7a71720aaed2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3192,7 +3192,7 @@ static long btrfs_fallocate(struct file *file, int mode, qgroup_reserved -= range->len; } else if (qgroup_reserved > 0) { btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, - range->start, range->len); + range->start, range->len, NULL); qgroup_reserved -= range->len; } list_del(&range->list); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f8647d8271b7..e79a047aa5d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -697,7 +697,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -5141,7 +5141,7 @@ static void evict_inode_truncate_pages(struct inode *inode) */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, - end - start + 1); + end - start + 1, NULL); clear_extent_bit(io_tree, start, end, EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, @@ -8076,7 +8076,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | @@ -9513,7 +9513,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; - int qgroup_released; + u64 qgroup_released = 0; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9526,9 +9526,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); - if (qgroup_released < 0) - return ERR_PTR(qgroup_released); + ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); + if (ret < 0) + return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, @@ -10423,7 +10423,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); out_qgroup_free_data: if (ret < 0) - btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8d4ab5ecfa5d..c68fb78b7454 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -152,11 +152,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( { struct btrfs_ordered_extent *entry; int ret; + u64 qgroup_rsv = 0; if (flags & ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } else { @@ -164,7 +165,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( * The ordered extent has reserved qgroup space, release now * and pass the reserved number for qgroup_record to free. */ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } @@ -182,7 +183,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; - entry->qgroup_rsv = ret; + entry->qgroup_rsv = qgroup_rsv; entry->flags = flags; refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ce446d9d7f23..a953c16c7eb8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4057,13 +4057,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed_ret) { struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; - int freed = 0; + u64 freed = 0; int ret; extent_changeset_init(&changeset); @@ -4104,7 +4105,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, } btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); - ret = freed; + if (freed_ret) + *freed_ret = freed; + ret = 0; out: extent_changeset_release(&changeset); return ret; @@ -4112,7 +4115,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, - int free) + u64 *released, int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; @@ -4128,7 +4131,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -4143,7 +4146,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); - ret = changeset.bytes_changed; + if (released) + *released = changeset.bytes_changed; out: extent_changeset_release(&changeset); return ret; @@ -4162,9 +4166,10 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, * NOTE: This function may sleep for memory allocation. */ int btrfs_qgroup_free_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed) { - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); } /* @@ -4182,9 +4187,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) { - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 855a4f978761..15b485506104 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -358,10 +358,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, - u64 len); + u64 len, u64 *freed); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
The reserved data counter and input parameter is a u64, but we inadvertantly accumulate it in an int. Overflowing that int results in freeing the wrong amount of data and breaking rsv accounting. Unfortunately, this overflow rot spreads from there, as the qgroup release/free functions rely on returning an int to take advantage of negative values for error codes. Therefore, the full fix is to return the "released" or "freed" amount by a u64* argument and to return 0 or negative error code via the return value. Most of the callsites simply ignore the return value, though some of them handle the error and count the returned bytes. Change all of them accordingly. Signed-off-by: Boris Burkov <boris@bur.io> --- fs/btrfs/delalloc-space.c | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 16 ++++++++-------- fs/btrfs/ordered-data.c | 7 ++++--- fs/btrfs/qgroup.c | 25 +++++++++++++++---------- fs/btrfs/qgroup.h | 4 ++-- 6 files changed, 31 insertions(+), 25 deletions(-)