@@ -6712,6 +6712,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
{
int ret;
struct extent_buffer *leaf = path->nodes[0];
+ const u32 sectorsize = leaf->fs_info->sectorsize;
char *tmp;
size_t max_size;
unsigned long inline_size;
@@ -6740,17 +6741,19 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size < PAGE_SIZE)
- folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
+ if (max_size < sectorsize)
+ folio_zero_range(folio, max_size, sectorsize - max_size);
kfree(tmp);
return ret;
}
-static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
+static int read_inline_extent(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, struct folio *folio)
{
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
+ const u32 sectorsize = fs_info->sectorsize;
if (!folio || folio_test_uptodate(folio))
return 0;
@@ -6768,8 +6771,8 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
- if (copy_size < PAGE_SIZE)
- folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
+ if (copy_size < sectorsize)
+ folio_zero_range(folio, copy_size, sectorsize - copy_size);
return 0;
}
@@ -6944,7 +6947,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
- ret = read_inline_extent(path, folio);
+ ret = read_inline_extent(fs_info, path, folio);
if (ret < 0)
goto out;
goto insert;
[BUG in DEVEL BRANCH] This bug itself can only be reproduced with the following out-of-tree patches: btrfs: allow inline data extents creation if sector size < page size btrfs: allow buffered write to skip full page if it's sector aligned With those out-of-tree patches, we can hit a data corruption: # mkfs.btrfs -f -s 4k $dev # mount $dev $mnt -o compress=zstd # xfs_io -f -c "pwrite 0 4k" $mnt/foobar # sync # echo 3 > /proc/sys/vm/drop_caches # xfs_io -f -c "pwrite 8k 4k" $mnt/foobar # md5sum $mnt/foobar 65df683add4707de8200bad14745b9ec Meanwhile such a workload should result in a md5sum of 277f3840b275c74d01e979ea9d75ac19 [CAUSE] The first buffered write into range [0, 4k) will result in a compressed inline extent (relies on the patch "btrfs: allow inline data extents creation if sector size < page size" to create such an inline extent): item 6 key (257 EXTENT_DATA 0) itemoff 15823 itemsize 40 generation 9 type 0 (inline) inline extent data size 19 ram_bytes 4096 compression 3 (zstd) Then all page cache is dropped, and we do the new write into range [8K, 12K) With the out-of-tree patch "btrfs: allow buffered write to skip full page if it's sector aligned", such an aligned write won't trigger the full folio read, so we just dirtied range [8K, 12K), and range [0, 4K) is not uptodate. Then md5sum triggered the full folio read, causing us to read the inlined data extent. Then inside the functions read_inline_extent() and uncompress_inline(), we zero out all the remaining part of the folio, including the newly dirtied range [8K, 12K), leading to the corruption. [FIX] Thankfully the bug has not yet reached any end users. For the upstream kernel, the [8K, 12K) write itself will trigger the full folio read before doing any write, thus everything is still fine. 
Furthermore, for the existing btrfs users with sector size < page size (the most common one is Asahi Linux) with inline data extents created from x86_64, they are still fine, because two factors are saving us: - Inline extents are always at offset 0 - Folio read always follows the file offset order So we always read out the inline extent, zeroing the remaining folio (which has no contents yet), then read the next sector, copying the correct content to the zeroed-out part. No end users are affected thankfully. The fix is to make both read_inline_extent() and uncompress_inline() only zero out the sector, not the whole page. Signed-off-by: Qu Wenruo <wqu@suse.com> --- fs/btrfs/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-)