diff mbox series

[v3,1/3] btrfs: add extra comments on extent_map members

Message ID 261cf7744120a2312ce2cdb22dbbfe439a11268a.1712287421.git.wqu@suse.com (mailing list archive)
State New
Headers show
Series btrfs: more explaination on extent_map members | expand

Commit Message

Qu Wenruo April 5, 2024, 3:27 a.m. UTC
The extent_map structure is very critical to btrfs, as it is involved
for both read and write paths.

Unfortunately the structure is not properly explained, making it pretty
hard to understand nor to do further improvement.

This patch adds extra comments explaining the major members based on
my code reading.
Hopefully we can find more members to cleanup in the future.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/extent_map.h | 52 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

Comments

Filipe Manana April 5, 2024, 12:24 p.m. UTC | #1
On Fri, Apr 5, 2024 at 4:28 AM Qu Wenruo <wqu@suse.com> wrote:
>
> The extent_map structure is very critical to btrfs, as it is involved
> for both read and write paths.
>
> Unfortunately the structure is not properly explained, making it pretty
> hard to understand nor to do further improvement.
>
> This patch adds extra comments explaining the major members based on
> my code reading.
> Hopefully we can find more members to cleanup in the future.
>
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  fs/btrfs/extent_map.h | 52 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
> index 10e9491865c9..0b938e12cc78 100644
> --- a/fs/btrfs/extent_map.h
> +++ b/fs/btrfs/extent_map.h
> @@ -35,19 +35,69 @@ enum {
>  };
>
>  /*
> + * This structure represents file extents and holes.

So I clearly forgot this before, but we should add the caveat that
it's guaranteed that it represents a single file extent only if the
extent is new and not persisted (in the list of modified extents and
not fsynced).
Otherwise it can represent 2 or more extents that were merged (to save
memory), which adds some caveats I mention below.

Holes can also be merged of course (e.g. read part of a hole, we
create an extent map for that part, read the remainder of the hole,
create another extent map for that remainder, which then merges with
the former).

> + *
>   * Keep this structure as compact as possible, as we can have really large
>   * amounts of allocated extent maps at any time.
>   */
>  struct extent_map {
>         struct rb_node rb_node;
>
> -       /* all of these are in bytes */
> +       /* All of these are in bytes. */
> +
> +       /* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */
>         u64 start;
> +
> +       /*
> +        * Length of the file extent.
> +        *
> +        * For non-inlined file extents it's btrfs_file_extent_item::num_bytes.
> +        * For inline extents it's sectorsize, since inline data starts at
> +        * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus
> +        * btrfs_file_extent_item::num_bytes is not valid.
> +        */
>         u64 len;
> +
> +       /*
> +        * The file offset of the original file extent before splitting.
> +        *
> +        * This is an in-memory only member, matching
> +        * extent_map::start - btrfs_file_extent_item::offset for
> +        * regular/preallocated extents. EXTENT_MAP_HOLE otherwise.
> +        */
>         u64 orig_start;
> +
> +       /*
> +        * The full on-disk extent length, matching
> +        * btrfs_file_extent_item::disk_num_bytes.
> +        */

So yes and no.
When merging extent maps, it's not updated, so it's tricky.
But that's ok because an extent map only needs to represent exactly
one file extent item if it's new and was not fsynced yet.

>         u64 orig_block_len;
> +
> +       /*
> +        * The decompressed size of the whole on-disk extent, matching
> +        * btrfs_file_extent_item::ram_bytes.
> +        */
>         u64 ram_bytes;

Same here regarding the merging.

Sorry I forgot this before.

> +
> +       /*
> +        * The on-disk logical bytenr for the file extent.
> +        *
> +        * For compressed extents it matches btrfs_file_extent_item::disk_bytenr.
> +        * For uncompressed extents it matches
> +        * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset
> +        *
> +        * For holes it is EXTENT_MAP_HOLE and for inline extents it is
> +        * EXTENT_MAP_INLINE.
> +        */
>         u64 block_start;
> +
> +       /*
> +        * The on-disk length for the file extent.
> +        *
> +        * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes.
> +        * For uncompressed extents it matches extent_map::len.
> +        * For holes and inline extents it's -1 and shouldn't be used.
> +        */
>         u64 block_len;
>
>         /*
> --
> 2.44.0
>
>
Qu Wenruo April 5, 2024, 9:36 p.m. UTC | #2
在 2024/4/5 22:54, Filipe Manana 写道:
> On Fri, Apr 5, 2024 at 4:28 AM Qu Wenruo <wqu@suse.com> wrote:
>>
>> The extent_map structure is very critical to btrfs, as it is involved
>> for both read and write paths.
>>
>> Unfortunately the structure is not properly explained, making it pretty
>> hard to understand nor to do further improvement.
>>
>> This patch adds extra comments explaining the major members based on
>> my code reading.
>> Hopefully we can find more members to cleanup in the future.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>>   fs/btrfs/extent_map.h | 52 ++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 51 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
>> index 10e9491865c9..0b938e12cc78 100644
>> --- a/fs/btrfs/extent_map.h
>> +++ b/fs/btrfs/extent_map.h
>> @@ -35,19 +35,69 @@ enum {
>>   };
>>
>>   /*
>> + * This structure represents file extents and holes.
>
> So I clearly forgot this before, but we should add the caveat that
> it's guaranteed that it represents a single file extent only if the
> extent is new and not persisted (in the list of modified extents and
> not fsynced).
> Otherwise it can represent 2 or more extents that were merged (to save
> memory), which adds some caveats I mention below.

In fact I also wanted to address this, especially if I'm going to
introduce disk_bytenr/disk_num_bytes, as they can be merged, the merged
disk_bytenr/disk_num_bytes would not exist on-disk.

But on the other hand, it's a little too obvious, thus I didn't mention
through the whole series.

>
> Holes can also be merged of course (e.g. read part of a hole, we
> create an extent map for that part, read the remainder of the hole,
> create another extent map for that remainder, which then merges with
> the former).
>
[...]
>> +
>> +       /*
>> +        * The full on-disk extent length, matching
>> +        * btrfs_file_extent_item::disk_num_bytes.
>> +        */
>
> So yes and no.
> When merging extent maps, it's not updated, so it's tricky.

And that's already found in my sanity checks.

> But that's ok because an extent map only needs to represent exactly
> one file extent item if it's new and was not fsynced yet.

I can update the comments to add extra comments on merged behavior, and
fix the unexpected handling for merging/split.

>
>>          u64 orig_block_len;
>> +
>> +       /*
>> +        * The decompressed size of the whole on-disk extent, matching
>> +        * btrfs_file_extent_item::ram_bytes.
>> +        */
>>          u64 ram_bytes;
>
> Same here regarding the merging.
>
> Sorry I forgot this before.

No big deal, as my super strict sanity checks are crashing everywhere, I
have already experienced this quirk.

So I would add extra comments on the merging behaviors, but I'm afraid
since the current code doesn't handle certain members correctly (and a
lot of callers even do not populate things like ram_bytes), I'm afraid
those extra comments would only come after I fixed all of them.

Thanks,
Qu
Filipe Manana April 8, 2024, 11:11 a.m. UTC | #3
On Fri, Apr 5, 2024 at 10:36 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
>
>
>
> 在 2024/4/5 22:54, Filipe Manana 写道:
> > On Fri, Apr 5, 2024 at 4:28 AM Qu Wenruo <wqu@suse.com> wrote:
> >>
> >> The extent_map structure is very critical to btrfs, as it is involved
> >> for both read and write paths.
> >>
> >> Unfortunately the structure is not properly explained, making it pretty
> >> hard to understand nor to do further improvement.
> >>
> >> This patch adds extra comments explaining the major members based on
> >> my code reading.
> >> Hopefully we can find more members to cleanup in the future.
> >>
> >> Signed-off-by: Qu Wenruo <wqu@suse.com>
> >> ---
> >>   fs/btrfs/extent_map.h | 52 ++++++++++++++++++++++++++++++++++++++++++-
> >>   1 file changed, 51 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
> >> index 10e9491865c9..0b938e12cc78 100644
> >> --- a/fs/btrfs/extent_map.h
> >> +++ b/fs/btrfs/extent_map.h
> >> @@ -35,19 +35,69 @@ enum {
> >>   };
> >>
> >>   /*
> >> + * This structure represents file extents and holes.
> >
> > So I clearly forgot this before, but we should add the caveat that
> > it's guaranteed that it represents a single file extent only if the
> > extent is new and not persisted (in the list of modified extents and
> > not fsynced).
> > Otherwise it can represent 2 or more extents that were merged (to save
> > memory), which adds some caveats I mention below.
>
> In fact I also wanted to address this, especially if I'm going to
> introduce disk_bytenr/disk_num_bytes, as they can be merged, the merged
> disk_bytenr/disk_num_bytes would not exist on-disk.
>
> But on the other hand, it's a little too obvious, thus I didn't mention
> through the whole series.

Well, a lot of the comments this patch is adding are too obvious as well.
So I don't see why mentioning an extent map may represent merged
extents is too obvious, if anything, it's less obvious than the
comments for some of the fields the patch is adding.

>
> >
> > Holes can also be merged of course (e.g. read part of a hole, we
> > create an extent map for that part, read the remainder of the hole,
> > create another extent map for that remainder, which then merges with
> > the former).
> >
> [...]
> >> +
> >> +       /*
> >> +        * The full on-disk extent length, matching
> >> +        * btrfs_file_extent_item::disk_num_bytes.
> >> +        */
> >
> > So yes and no.
> > When merging extent maps, it's not updated, so it's tricky.
>
> And that's already found in my sanity checks.
>
> > But that's ok because an extent map only needs to represent exactly
> > one file extent item if it's new and was not fsynced yet.
>
> I can update the comments to add extra comments on merged behavior, and
> fix the unexpected handling for merging/split.

Fix what?
It's only important to be accurate for extents that need to be logged
(in the modified list of extents), and those are never merged or
split.

Otherwise it doesn't affect use cases other than fsync.

>
> >
> >>          u64 orig_block_len;
> >> +
> >> +       /*
> >> +        * The decompressed size of the whole on-disk extent, matching
> >> +        * btrfs_file_extent_item::ram_bytes.
> >> +        */
> >>          u64 ram_bytes;
> >
> > Same here regarding the merging.
> >
> > Sorry I forgot this before.
>
> No big deal, as my super strict sanity checks are crashing everywhere, I
> have already experienced this quirk.
>
> So I would add extra comments on the merging behaviors, but I'm afraid
> since the current code doesn't handle certain members correctly (and a
> lot of callers even do not populate things like ram_bytes), I'm afraid
> those extra comments would only come after I fixed all of them.

Well, as mentioned before it's ok for some fields to not be updated
after merging, and to a lesser extent, splitted.
Having all fields correct only makes sense when the mapping to a file
extent item is exactly 1 to 1 and the extent is new and needs to be
logged.

>
> Thanks,
> Qu
Qu Wenruo April 8, 2024, 10:05 p.m. UTC | #4
在 2024/4/8 20:41, Filipe Manana 写道:
> On Fri, Apr 5, 2024 at 10:36 PM Qu Wenruo <quwenruo.btrfs@gmx.com> wrote:
[...]
>>
>> But on the other hand, it's a little too obvious, thus I didn't mention
>> through the whole series.
>
> Well, a lot of the comments this patch is adding are too obvious as well.
> So I don't see why mentioning an extent map may represent merged
> extents is too obvious, if anything, it's less obvious than the
> comments for some of the fields the patch is adding.

Well, considering how many members are not properly populated (for tests
though), and other members like "orig_start" only makes sense for
non-mergeable extents (compressed), I strongly doubt if the extent_map
itself designed is that obvious.

>
>>
>>>
>>> Holes can also be merged of course (e.g. read part of a hole, we
>>> create an extent map for that part, read the remainder of the hole,
>>> create another extent map for that remainder, which then merges with
>>> the former).
>>>
>> [...]
>>>> +
>>>> +       /*
>>>> +        * The full on-disk extent length, matching
>>>> +        * btrfs_file_extent_item::disk_num_bytes.
>>>> +        */
>>>
>>> So yes and no.
>>> When merging extent maps, it's not updated, so it's tricky.
>>
>> And that's already found in my sanity checks.
>>
>>> But that's ok because an extent map only needs to represent exactly
>>> one file extent item if it's new and was not fsynced yet.
>>
>> I can update the comments to add extra comments on merged behavior, and
>> fix the unexpected handling for merging/split.
>
> Fix what?
> It's only important to be accurate for extents that need to be logged
> (in the modified list of extents), and those are never merged or
> split.

My bad, the "merged/split handling" is for my new sanity checks on the
extent map structure, which cross checks the old members
(orig_start/block_start/block_len) against the new members
(disk_bytenr/offset).

It turns out certain members do not really much sense after merging.
E.g. orig_start.

>
> Otherwise it doesn't affect use cases other than fsync.
>
>>
>>>
>>>>           u64 orig_block_len;
>>>> +
>>>> +       /*
>>>> +        * The decompressed size of the whole on-disk extent, matching
>>>> +        * btrfs_file_extent_item::ram_bytes.
>>>> +        */
>>>>           u64 ram_bytes;
>>>
>>> Same here regarding the merging.
>>>
>>> Sorry I forgot this before.
>>
>> No big deal, as my super strict sanity checks are crashing everywhere, I
>> have already experienced this quirk.
>>
>> So I would add extra comments on the merging behaviors, but I'm afraid
>> since the current code doesn't handle certain members correctly (and a
>> lot of callers even do not populate things like ram_bytes), I'm afraid
>> those extra comments would only come after I fixed all of them.
>
> Well, as mentioned before it's ok for some fields to not be updated
> after merging, and to a lesser extent, splitted.
> Having all fields correct only makes sense when the mapping to a file
> extent item is exactly 1 to 1 and the extent is new and needs to be
> logged.

I'd say, even only for the sake of consistence, we should populate every
member correctly no matter what.

That's why I'm adding strict sanity checks for all extent maps (only for
DEBUG build).

And it already exposed several bugs:

- The @block_start mis-calculation fix you just commented on
   Thankfully it's harmless

- Weird ram_bytes generated by g/311 test case
   We can create a file extent whose ram_bytes is double its
   disk_num_bytes, and is not compressed.

I'll spend time on pinning down the ram_bytes anomaly later, even it
doesn't cause data corruption.

Thanks,
Qu

>
>>
>> Thanks,
>> Qu
diff mbox series

Patch

diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 10e9491865c9..0b938e12cc78 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -35,19 +35,69 @@  enum {
 };
 
 /*
+ * This structure represents file extents and holes.
+ *
  * Keep this structure as compact as possible, as we can have really large
  * amounts of allocated extent maps at any time.
  */
 struct extent_map {
 	struct rb_node rb_node;
 
-	/* all of these are in bytes */
+	/* All of these are in bytes. */
+
+	/* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */
 	u64 start;
+
+	/*
+	 * Length of the file extent.
+	 *
+	 * For non-inlined file extents it's btrfs_file_extent_item::num_bytes.
+	 * For inline extents it's sectorsize, since inline data starts at
+	 * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus
+	 * btrfs_file_extent_item::num_bytes is not valid.
+	 */
 	u64 len;
+
+	/*
+	 * The file offset of the original file extent before splitting.
+	 *
+	 * This is an in-memory only member, matching
+	 * extent_map::start - btrfs_file_extent_item::offset for
+	 * regular/preallocated extents. EXTENT_MAP_HOLE otherwise.
+	 */
 	u64 orig_start;
+
+	/*
+	 * The full on-disk extent length, matching
+	 * btrfs_file_extent_item::disk_num_bytes.
+	 */
 	u64 orig_block_len;
+
+	/*
+	 * The decompressed size of the whole on-disk extent, matching
+	 * btrfs_file_extent_item::ram_bytes.
+	 */
 	u64 ram_bytes;
+
+	/*
+	 * The on-disk logical bytenr for the file extent.
+	 *
+	 * For compressed extents it matches btrfs_file_extent_item::disk_bytenr.
+	 * For uncompressed extents it matches
+	 * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset
+	 *
+	 * For holes it is EXTENT_MAP_HOLE and for inline extents it is
+	 * EXTENT_MAP_INLINE.
+	 */
 	u64 block_start;
+
+	/*
+	 * The on-disk length for the file extent.
+	 *
+	 * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes.
+	 * For uncompressed extents it matches extent_map::len.
+	 * For holes and inline extents it's -1 and shouldn't be used.
+	 */
 	u64 block_len;
 
 	/*