[2/8] btrfs: use sector numbers as keys for the dirty extents xarray

Message ID a0b3dae9b5933cb80804f063cedcadf1ae8259f2.1727261112.git.fdmanana@suse.com (mailing list archive)
State New, archived
Series btrfs: delayed refs and qgroups, fixes, cleanups, improvements

Commit Message

Filipe Manana Sept. 25, 2024, 10:50 a.m. UTC
From: Filipe Manana <fdmanana@suse.com>

We are using the logical address ("bytenr") of an extent as the key for
qgroup records in the dirty extents xarray. This is a problem because
xarrays use "unsigned long" for keys/indices, meaning that on a 32-bit
platform any extent starting at or beyond 4G has its key truncated, which
is far too low a limit since virtually everyone uses storage with more
than 4G of space. For example a "bytenr" of 4G gets truncated to 0, and
so do 8G and 16G, resulting in incorrect qgroup accounting.

Fix this by using sector numbers as keys instead, that is, keys that
match the logical address right shifted by fs_info->sectorsize_bits,
which is what we do for the fs_info->buffer_radix that tracks extent
buffers (radix trees also use an "unsigned long" type for keys). This
also makes the index space denser, which helps optimize the xarray (as
mentioned in Documentation/core-api/xarray.rst).
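
To make the truncation concrete, here is a minimal user-space sketch (not
kernel code; the 4K sector size and the demo offsets are assumptions)
showing how a 64-bit "bytenr" collapses when stored in a 32-bit index,
while the sector-number key stays distinct:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical extent start offsets at 4G, 8G and 16G. */
	const uint64_t bytenrs[] = { 1ULL << 32, 1ULL << 33, 1ULL << 34 };
	const unsigned int sectorsize_bits = 12; /* assuming 4K sectors */

	for (int i = 0; i < 3; i++) {
		/* A 32-bit "unsigned long" key truncates all three to 0. */
		uint32_t bytenr_key = (uint32_t)bytenrs[i];
		/* Sector-number keys stay distinct and fit in 32 bits. */
		uint32_t sector_key = (uint32_t)(bytenrs[i] >> sectorsize_bits);

		printf("bytenr %llu -> bytenr key %u, sector key %u\n",
		       (unsigned long long)bytenrs[i], bytenr_key, sector_key);
	}
	return 0;
}

With 4K sectors the shifted keys stay within 32 bits for logical
addresses up to 2^44 bytes (16T), instead of overflowing at 4G.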

Fixes: 3cce39a8ca4e ("btrfs: qgroup: use xarray to track dirty extents in transaction")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/delayed-ref.c | 13 ++++++++-----
 fs/btrfs/delayed-ref.h | 10 +++++++++-
 fs/btrfs/qgroup.c      | 11 ++++++-----
 3 files changed, 23 insertions(+), 11 deletions(-)

Comments

Qu Wenruo Sept. 25, 2024, 10:25 p.m. UTC | #1
On 2024/9/25 20:20, fdmanana@kernel.org wrote:
> From: Filipe Manana <fdmanana@suse.com>
>
> We are using the logical address ("bytenr") of an extent as the key for
> qgroup records in the dirty extents xarray. This is a problem because
> xarrays use "unsigned long" for keys/indices, meaning that on a 32-bit
> platform any extent starting at or beyond 4G has its key truncated, which
> is far too low a limit since virtually everyone uses storage with more
> than 4G of space. For example a "bytenr" of 4G gets truncated to 0, and
> so do 8G and 16G, resulting in incorrect qgroup accounting.
>
> Fix this by using sector numbers as keys instead, that is, keys that
> match the logical address right shifted by fs_info->sectorsize_bits,
> which is what we do for the fs_info->buffer_radix that tracks extent
> buffers (radix trees also use an "unsigned long" type for keys). This
> also makes the index space denser, which helps optimize the xarray (as
> mentioned in Documentation/core-api/xarray.rst).
>
> Fixes: 3cce39a8ca4e ("btrfs: qgroup: use xarray to track dirty extents in transaction")
> Signed-off-by: Filipe Manana <fdmanana@suse.com>
> ---
>   fs/btrfs/delayed-ref.c | 13 ++++++++-----
>   fs/btrfs/delayed-ref.h | 10 +++++++++-
>   fs/btrfs/qgroup.c      | 11 ++++++-----
>   3 files changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 32f719b9e661..f075ac11e51c 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -849,6 +849,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>   		     struct btrfs_qgroup_extent_record *qrecord,
>   		     int action, bool *qrecord_inserted_ret)
>   {
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
>   	struct btrfs_delayed_ref_head *existing;
>   	struct btrfs_delayed_ref_root *delayed_refs;
>   	bool qrecord_inserted = false;
> @@ -859,11 +860,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>   	if (qrecord) {
>   		int ret;
>
> -		ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info,
> +		ret = btrfs_qgroup_trace_extent_nolock(fs_info,
>   						       delayed_refs, qrecord);
>   		if (ret) {
>   			/* Clean up if insertion fails or item exists. */
> -			xa_release(&delayed_refs->dirty_extents, qrecord->bytenr);
> +			xa_release(&delayed_refs->dirty_extents,
> +				   qrecord->bytenr >> fs_info->sectorsize_bits);
>   			/* Caller responsible for freeing qrecord on error. */
>   			if (ret < 0)
>   				return ERR_PTR(ret);
> @@ -873,7 +875,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>   		}
>   	}
>
> -	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
> +	trace_add_delayed_ref_head(fs_info, head_ref, action);
>
>   	existing = htree_insert(&delayed_refs->href_root,
>   				&head_ref->href_node);
> @@ -895,7 +897,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>   		if (head_ref->is_data && head_ref->ref_mod < 0) {
>   			delayed_refs->pending_csums += head_ref->num_bytes;
>   			trans->delayed_ref_csum_deletions +=
> -				btrfs_csum_bytes_to_leaves(trans->fs_info,
> +				btrfs_csum_bytes_to_leaves(fs_info,
>   							   head_ref->num_bytes);
>   		}
>   		delayed_refs->num_heads++;
> @@ -1030,7 +1032,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
>   			goto free_head_ref;
>   		}
>   		if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
> -			       generic_ref->bytenr, GFP_NOFS)) {
> +			       generic_ref->bytenr >> fs_info->sectorsize_bits,
> +			       GFP_NOFS)) {
>   			ret = -ENOMEM;
>   			goto free_record;
>   		}
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index 085f30968aba..352921e76c74 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -202,7 +202,15 @@ struct btrfs_delayed_ref_root {
>   	/* head ref rbtree */
>   	struct rb_root_cached href_root;
>
> -	/* Track dirty extent records. */
> +	/*
> +	 * Track dirty extent records.
> +	 * The keys correspond to the logical address of the extent ("bytenr")
> +	 * right shifted by fs_info->sectorsize_bits. This is both to get a more
> +	 * dense index space (optimizes xarray structure) and because indexes in
> +	 * xarrays are of "unsigned long" type, meaning they are 32 bits wide on
> +	 * 32 bits platforms, limiting the extent range to 4G which is too low
> +	 * and makes it unusable (truncated index values) on 32 bits platforms.
> +	 */
>   	struct xarray dirty_extents;
>
>   	/* this spin lock protects the rbtree and the entries inside */
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index c297909f1506..a76e4610fe80 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2005,7 +2005,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
>   				struct btrfs_qgroup_extent_record *record)
>   {
>   	struct btrfs_qgroup_extent_record *existing, *ret;
> -	unsigned long bytenr = record->bytenr;
> +	const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits);

Could we check if record->bytenr >= MAX_LFS_FILESIZE and error out directly?
(With btrfs_err_32bit_limit() called to indicate the problem).

Just like what we did in alloc_extent_buffer().
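
A minimal sketch of what such a check could look like at the top of
btrfs_qgroup_trace_extent_nolock(), mirroring the BITS_PER_LONG guard in
alloc_extent_buffer() (the exact placement and the -EOVERFLOW return
value are assumptions, not code from this series):

#if BITS_PER_LONG == 32
	/*
	 * Assumed guard: refuse to trace extents whose sector index
	 * cannot be represented in an unsigned long on 32-bit platforms.
	 */
	if (record->bytenr >= MAX_LFS_FILESIZE) {
		btrfs_err_32bit_limit(fs_info);
		return -EOVERFLOW;
	}
#endif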

Otherwise looks good.

Thanks,
Qu

>
>   	if (!btrfs_qgroup_full_accounting(fs_info))
>   		return 1;
> @@ -2014,7 +2014,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
>   	trace_btrfs_qgroup_trace_extent(fs_info, record);
>
>   	xa_lock(&delayed_refs->dirty_extents);
> -	existing = xa_load(&delayed_refs->dirty_extents, bytenr);
> +	existing = xa_load(&delayed_refs->dirty_extents, index);
>   	if (existing) {
>   		if (record->data_rsv && !existing->data_rsv) {
>   			existing->data_rsv = record->data_rsv;
> @@ -2024,7 +2024,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
>   		return 1;
>   	}
>
> -	ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC);
> +	ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
>   	xa_unlock(&delayed_refs->dirty_extents);
>   	if (xa_is_err(ret)) {
>   		qgroup_mark_inconsistent(fs_info);
> @@ -2129,6 +2129,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
>   	struct btrfs_fs_info *fs_info = trans->fs_info;
>   	struct btrfs_qgroup_extent_record *record;
>   	struct btrfs_delayed_ref_root *delayed_refs;
> +	const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
>   	int ret;
>
>   	if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
> @@ -2137,7 +2138,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
>   	if (!record)
>   		return -ENOMEM;
>
> -	if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) {
> +	if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) {
>   		kfree(record);
>   		return -ENOMEM;
>   	}
> @@ -2152,7 +2153,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
>   	spin_unlock(&delayed_refs->lock);
>   	if (ret) {
>   		/* Clean up if insertion fails or item exists. */
> -		xa_release(&delayed_refs->dirty_extents, record->bytenr);
> +		xa_release(&delayed_refs->dirty_extents, index);
>   		kfree(record);
>   		return 0;
>   	}