@@ -857,13 +857,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 folio_start;
+ u64 folio_end;
struct extent_state *cached_state = NULL;
struct folio *folio;
int ret;
again:
+ /* TODO: Add fgp order flags when larger folios are fully enabled. */
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio))
@@ -871,13 +872,16 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
/*
* Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
+ *
+ * The IO for such larger folios is not fully tested, thus return
+ * an error to reject such folios unless it's an experimental build.
+ *
* Filesystem transparent huge pages are typically only used for
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
- if (folio_test_large(folio)) {
+ if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-ETXTBSY);
@@ -890,13 +894,15 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
return ERR_PTR(ret);
}
+ folio_start = folio_pos(folio);
+ folio_end = folio_pos(folio) + folio_size(folio) - 1;
/* Wait for any existing ordered extent in the range */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
+ lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, folio_start, folio_size(folio));
+ unlock_extent(&inode->io_tree, folio_start, folio_end,
&cached_state);
if (!ordered)
break;
@@ -1162,13 +1168,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
struct extent_changeset *data_reserved = NULL;
const u64 start = target->start;
const u64 len = target->len;
- unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
- unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = folios[0]->index;
int ret = 0;
- int i;
-
- ASSERT(last_index - first_index + 1 <= nr_pages);
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
if (ret < 0)
@@ -1179,10 +1179,17 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
set_extent_bit(&inode->io_tree, start, start + len - 1,
EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
- /* Update the page status */
- for (i = start_index - first_index; i <= last_index - first_index; i++) {
- folio_clear_checked(folios[i]);
- btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
+ /*
+ * Update the page status.
+ * Due to possible larger folios, we have to check all folios one by one.
+ * The btrfs_folio_clamp_*() helpers handle ranges that extend beyond the
+ * folio boundaries just fine.
+ */
+ for (int i = 0; i < nr_pages && folios[i]; i++) {
+ struct folio *folio = folios[i];
+
+ btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
@@ -1200,9 +1207,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
LIST_HEAD(target_list);
struct folio **folios;
const u32 sectorsize = inode->root->fs_info->sectorsize;
- u64 last_index = (start + len - 1) >> PAGE_SHIFT;
- u64 start_index = start >> PAGE_SHIFT;
- unsigned int nr_pages = last_index - start_index + 1;
+ u64 cur = start;
+ const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
+ (start >> PAGE_SHIFT) + 1;
int ret = 0;
int i;
@@ -1214,21 +1221,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
return -ENOMEM;
/* Prepare all pages */
- for (i = 0; i < nr_pages; i++) {
- folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+ for (i = 0; cur < start + len && i < nr_pages; i++) {
+ folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
if (IS_ERR(folios[i])) {
ret = PTR_ERR(folios[i]);
- nr_pages = i;
+ folios[i] = NULL;
goto free_folios;
}
+ cur = folio_pos(folios[i]) + folio_size(folios[i]);
}
- for (i = 0; i < nr_pages; i++)
+ for (i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_wait_writeback(folios[i]);
+ }
- /* Lock the pages range */
- lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ /* Lock the folios[] range */
+ lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1,
&cached_state);
+
/*
* Now we have a consistent view about the extent map, re-check
* which range really needs to be defragged.
@@ -1254,11 +1265,12 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
kfree(entry);
}
unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1,
&cached_state);
free_folios:
for (i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_unlock(folios[i]);
folio_put(folios[i]);
}
Currently we reject larger folios for defrag gracefully, but the
implementation itself is already mostly larger folio compatible.

There are several parts of defrag in btrfs:

- Extent map checking

  Aka, defrag_collect_targets(), which prepares a list of target ranges
  that should be defragged.

  This part is completely folio unrelated, thus it doesn't care whether
  the folio is larger or not.

- Target folio preparation

  Aka, defrag_prepare_one_folio(), which locks and reads (if needed) the
  target folio.

  Since folio read and lock already support larger folios, we can easily
  support this part.

- Redirty the target range of the folio

  This is already done in a way supporting larger folios.

So it's pretty straightforward to enable larger folios for defrag:

- Do not reject larger folios for experimental builds

  This affects the larger folio check inside defrag_prepare_one_folio().

- Wait for ordered extents of the whole folio in
  defrag_prepare_one_folio()

- Lock the whole extent range for all involved folios in
  defrag_one_range()

- Allow the folios[] array to be partially empty

  Since we can have larger folios, folios[] will not always be full.
  This affects:

  * How to allocate folios in defrag_one_range()

    Now we cannot use the page index, but use the end position of the
    folio as an iterator.

  * How to free the folios[] array

    If we hit an empty slot, it means we have larger folios and have
    already hit the end of the array.

  * How to mark the range dirty

    Instead of using the page index directly, we have to go through each
    folio.

Unfortunately the behavior can only be verified with larger data folio
support enabled; for now the change only ensures that it doesn't break
the existing defrag behavior for regular and subpage cases.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 fs/btrfs/defrag.c | 72 +++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 30 deletions(-)
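
For illustration only (not part of the patch), below is a small userspace
sketch of the folios[] fill/free pattern described above. struct fake_folio,
fake_get_folio() and the fixed 4-page folio size are made-up stand-ins for
struct folio, defrag_prepare_one_folio(), folio_pos() and folio_size(); the
point is only to show how iterating by the folio end position can leave
trailing NULL slots in the array, and how cleanup stops at the first empty
slot.

/*
 * Standalone sketch of the folio iteration pattern described above.
 * The types, helpers and sizes here are illustrative only, not kernel APIs.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct fake_folio {
	unsigned long long pos;	/* byte offset of the folio in the file */
	unsigned long size;	/* folio size, a multiple of PAGE_SIZE */
};

/* Pretend the page cache hands back a (possibly large) folio covering @index. */
static struct fake_folio *fake_get_folio(unsigned long index)
{
	struct fake_folio *folio = malloc(sizeof(*folio));

	/* Assume every folio is 4 pages for the sake of the example. */
	folio->size = 4 * PAGE_SIZE;
	folio->pos = (index << PAGE_SHIFT) / folio->size * folio->size;
	return folio;
}

int main(void)
{
	const unsigned long long start = 0;
	const unsigned long len = 16 * PAGE_SIZE;
	/* Worst case: one single-page folio per page of the range. */
	const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
				      (start >> PAGE_SHIFT) + 1;
	struct fake_folio **folios = calloc(nr_pages, sizeof(*folios));
	unsigned long long cur = start;
	unsigned int i;

	/* Advance by folio size, not page size, so the array may stay partial. */
	for (i = 0; cur < start + len && i < nr_pages; i++) {
		folios[i] = fake_get_folio(cur >> PAGE_SHIFT);
		cur = folios[i]->pos + folios[i]->size;
		printf("slot %u: folio at %llu, size %lu\n",
		       i, folios[i]->pos, folios[i]->size);
	}

	/* Cleanup stops at the first empty slot, as in free_folios. */
	for (i = 0; i < nr_pages && folios[i]; i++)
		free(folios[i]);
	free(folios);
	return 0;
}

Running it prints four 16K folios covering the 16-page range; the remaining
twelve slots stay NULL, matching the early-exit behavior of the free_folios
loop in the patch.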