diff mbox series

[1/1] ocfs2: fix the issue with discontiguous allocation in the global_bitmap

Message ID 20250408152311.16196-2-heming.zhao@suse.com (mailing list archive)
State New
Headers show
Series ocfs2: fix discontig allocating issue | expand

Commit Message

Heming Zhao April 8, 2025, 3:23 p.m. UTC
The commit 4eb7b93e0310 ("ocfs2: improve write IO performance when
fragmentation is high") introduced another regression.

The following ocfs2-test case can trigger this issue:
> ${RESV_UNWRITTEN_BIN} -f ${WORK_PLACE}/large_testfile -s 0 -l \
> $((${FILE_MAJOR_SIZE_M}*1024*1024))

In my env, test disk size (by "fdisk -l <dev>"):
> 53687091200 bytes, 104857600 sectors.

Above command is:
> /usr/local/ocfs2-test/bin/resv_unwritten -f \
> /mnt/ocfs2/ocfs2-activate-discontig-bg-dir/large_testfile -s 0 -l \
> 53187969024

Error log:
> [*] Reserve 50724M space for a LARGE file, reserve 200M space for future test.
> ioctl error 28: "No space left on device"
> resv allocation failed Unknown error -1
> reserve unwritten region from 0 to 53187969024.

Call flow:
__ocfs2_change_file_space //by ioctl OCFS2_IOC_RESVSP64
 ocfs2_allocate_unwritten_extents //start:0 len:53187969024
  while()
   + ocfs2_get_clusters //cpos:0, alloc_size:1623168 (cluster number)
   + ocfs2_extend_allocation
     + ocfs2_lock_allocators
     |  + choose OCFS2_AC_USE_MAIN & ocfs2_cluster_group_search
     |
     + ocfs2_add_inode_data
        ocfs2_add_clusters_in_btree
         __ocfs2_claim_clusters
          ocfs2_claim_suballoc_bits
          + During the allocation of the final part of the large file
	    (around 47GB), no chain had the required contiguous
            bits_wanted. Consequently, the allocation failed.

How to fix:
When OCFS2 is encountering fragmented allocation, the file system should
stop attempting bits_wanted contiguous allocation and instead provide the
largest available contiguous free bits from the cluster groups.

Signed-off-by: Heming Zhao <heming.zhao@suse.com>
Fixes: 4eb7b93e0310 ("ocfs2: improve write IO performance when fragmentation is high")
---
 fs/ocfs2/suballoc.c | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

Comments

Joseph Qi April 13, 2025, 1:06 a.m. UTC | #1
On 2025/4/8 23:23, Heming Zhao wrote:
> The commit 4eb7b93e0310 ("ocfs2: improve write IO performance when
> fragmentation is high") introduced another regression.
> 
> The following ocfs2-test case can trigger this issue:
>> ${RESV_UNWRITTEN_BIN} -f ${WORK_PLACE}/large_testfile -s 0 -l \
>> $((${FILE_MAJOR_SIZE_M}*1024*1024))
> 
> In my env, test disk size (by "fdisk -l <dev>"):
>> 53687091200 bytes, 104857600 sectors.
> 
> Above command is:
>> /usr/local/ocfs2-test/bin/resv_unwritten -f \
>> /mnt/ocfs2/ocfs2-activate-discontig-bg-dir/large_testfile -s 0 -l \
>> 53187969024
> 
> Error log:
>> [*] Reserve 50724M space for a LARGE file, reserve 200M space for future test.
>> ioctl error 28: "No space left on device"
>> resv allocation failed Unknown error -1
>> reserve unwritten region from 0 to 53187969024.
> 
> Call flow:
> __ocfs2_change_file_space //by ioctl OCFS2_IOC_RESVSP64
>  ocfs2_allocate_unwritten_extents //start:0 len:53187969024
>   while()
>    + ocfs2_get_clusters //cpos:0, alloc_size:1623168 (cluster number)
>    + ocfs2_extend_allocation
>      + ocfs2_lock_allocators
>      |  + choose OCFS2_AC_USE_MAIN & ocfs2_cluster_group_search
>      |
>      + ocfs2_add_inode_data
>         ocfs2_add_clusters_in_btree
>          __ocfs2_claim_clusters
>           ocfs2_claim_suballoc_bits
>           + During the allocation of the final part of the large file
> 	    (around 47GB), no chain had the required contiguous
>             bits_wanted. Consequently, the allocation failed.
> 
> How to fix:
> When OCFS2 is encountering fragmented allocation, the file system should
> stop attempting bits_wanted contiguous allocation and instead provide the
> largest available contiguous free bits from the cluster groups.
> 
> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
> Fixes: 4eb7b93e0310 ("ocfs2: improve write IO performance when fragmentation is high")
> ---
>  fs/ocfs2/suballoc.c | 36 +++++++++++++++++++++++++++++-------
>  1 file changed, 29 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
> index fde75f2af37a..2e1689fc6cf7 100644
> --- a/fs/ocfs2/suballoc.c
> +++ b/fs/ocfs2/suballoc.c
> @@ -1796,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
>  {
>  	int status;
>  	u16 chain;
> +	u32 contig_bits;
>  	u64 next_group;
>  	struct inode *alloc_inode = ac->ac_inode;
>  	struct buffer_head *group_bh = NULL;
> @@ -1821,10 +1822,23 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
>  	status = -ENOSPC;
>  	/* for now, the chain search is a bit simplistic. We just use
>  	 * the 1st group with any empty bits. */
> -	while ((status = ac->ac_group_search(alloc_inode, group_bh,
> -					     bits_wanted, min_bits,
> -					     ac->ac_max_block,
> -					     res)) == -ENOSPC) {
> +	while (1) {
> +		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
> +			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
> +			if (!contig_bits)
> +				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
> +						le16_to_cpu(bg->bg_bits), 0);
> +			if (bits_wanted > contig_bits)
> +				bits_wanted = contig_bits;
> +			if (bits_wanted < min_bits)
> +				bits_wanted = min_bits;

This seems only valid when bits_wanted changed?

BTW, the previous fix hasn't been merged yet:
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/fs/ocfs2?h=next-20250411&id=767ba8b7cba3ca7a560d632f267f7aea35d54810

So should we drop that first and fold it into a whole one?

Thanks,
Joseph
> +		}
> +
> +		status = ac->ac_group_search(alloc_inode, group_bh,
> +				bits_wanted, min_bits,
> +				ac->ac_max_block, res);
> +		if (status != -ENOSPC)
> +			break;
>  		if (!bg->bg_next_group)
>  			break;
>  
> @@ -1984,6 +1998,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
>  	victim = ocfs2_find_victim_chain(cl);
>  	ac->ac_chain = victim;
>  
> +search:
>  	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
>  				    res, &bits_left);
>  	if (!status) {
> @@ -2024,6 +2039,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
>  		}
>  	}
>  
> +	/* Chains can't supply the bits_wanted contiguous space.
> +	 * We should switch to using every single bit when allocating
> +	 * from the global bitmap. */
> +	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
> +	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
> +		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
> +		ac->ac_chain = victim;
> +		goto search;
> +	}
> +
>  set_hint:
>  	if (status != -ENOSPC) {
>  		/* If the next search of this group is not likely to
> @@ -2432,9 +2457,6 @@ int ocfs2_claim_clusters(handle_t *handle,
>  {
>  	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
>  
> -	if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG)
> -		bits_wanted = min_clusters;
> -
>  	return __ocfs2_claim_clusters(handle, ac, min_clusters,
>  				      bits_wanted, cluster_start, num_clusters);
>  }
Heming Zhao April 14, 2025, 2:29 a.m. UTC | #2
Hi Joseph & Andrew,

On 4/13/25 09:06, Joseph Qi wrote:
> 
> 
> On 2025/4/8 23:23, Heming Zhao wrote:
>> The commit 4eb7b93e0310 ("ocfs2: improve write IO performance when
>> fragmentation is high") introduced another regression.
>>
>> The following ocfs2-test case can trigger this issue:
>>> ${RESV_UNWRITTEN_BIN} -f ${WORK_PLACE}/large_testfile -s 0 -l \
>>> $((${FILE_MAJOR_SIZE_M}*1024*1024))
>>
>> In my env, test disk size (by "fdisk -l <dev>"):
>>> 53687091200 bytes, 104857600 sectors.
>>
>> Above command is:
>>> /usr/local/ocfs2-test/bin/resv_unwritten -f \
>>> /mnt/ocfs2/ocfs2-activate-discontig-bg-dir/large_testfile -s 0 -l \
>>> 53187969024
>>
>> Error log:
>>> [*] Reserve 50724M space for a LARGE file, reserve 200M space for future test.
>>> ioctl error 28: "No space left on device"
>>> resv allocation failed Unknown error -1
>>> reserve unwritten region from 0 to 53187969024.
>>
>> Call flow:
>> __ocfs2_change_file_space //by ioctl OCFS2_IOC_RESVSP64
>>   ocfs2_allocate_unwritten_extents //start:0 len:53187969024
>>    while()
>>     + ocfs2_get_clusters //cpos:0, alloc_size:1623168 (cluster number)
>>     + ocfs2_extend_allocation
>>       + ocfs2_lock_allocators
>>       |  + choose OCFS2_AC_USE_MAIN & ocfs2_cluster_group_search
>>       |
>>       + ocfs2_add_inode_data
>>          ocfs2_add_clusters_in_btree
>>           __ocfs2_claim_clusters
>>            ocfs2_claim_suballoc_bits
>>            + During the allocation of the final part of the large file
>> 	    (around 47GB), no chain had the required contiguous
>>              bits_wanted. Consequently, the allocation failed.
>>
>> How to fix:
>> When OCFS2 is encountering fragmented allocation, the file system should
>> stop attempting bits_wanted contiguous allocation and instead provide the
>> largest available contiguous free bits from the cluster groups.
>>
>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>> Fixes: 4eb7b93e0310 ("ocfs2: improve write IO performance when fragmentation is high")
>> ---
>>   fs/ocfs2/suballoc.c | 36 +++++++++++++++++++++++++++++-------
>>   1 file changed, 29 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
>> index fde75f2af37a..2e1689fc6cf7 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1796,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
>>   {
>>   	int status;
>>   	u16 chain;
>> +	u32 contig_bits;
>>   	u64 next_group;
>>   	struct inode *alloc_inode = ac->ac_inode;
>>   	struct buffer_head *group_bh = NULL;
>> @@ -1821,10 +1822,23 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
>>   	status = -ENOSPC;
>>   	/* for now, the chain search is a bit simplistic. We just use
>>   	 * the 1st group with any empty bits. */
>> -	while ((status = ac->ac_group_search(alloc_inode, group_bh,
>> -					     bits_wanted, min_bits,
>> -					     ac->ac_max_block,
>> -					     res)) == -ENOSPC) {
>> +	while (1) {
>> +		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
>> +			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
>> +			if (!contig_bits)
>> +				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
>> +						le16_to_cpu(bg->bg_bits), 0);
>> +			if (bits_wanted > contig_bits)
>> +				bits_wanted = contig_bits;
>> +			if (bits_wanted < min_bits)
>> +				bits_wanted = min_bits;
> 
> This seems only valid when bits_wanted changed?
> 
> BTW, the previous fix hasn't been merged yet:
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/fs/ocfs2?h=next-20250411&id=767ba8b7cba3ca7a560d632f267f7aea35d54810
> 
> So should we drop that first and fold it into a whole one?

Thanks for the reminder, I rechecked the ocfs2-test cases.
In my view, in Gautham's patch, the analysis is wrong.
the call stack isn't from ocfs2_mkdir(), the correct one should be
OCFS2_IOC_RESVSP64.

Let's recall the first patch, and let me resend the v2 patch with a whole one.

- Heming
Heming Zhao April 14, 2025, 3:32 a.m. UTC | #3
Hi Joseph,

On 4/13/25 09:06, Joseph Qi wrote:
> 
> 
> On 2025/4/8 23:23, Heming Zhao wrote:
>> The commit 4eb7b93e0310 ("ocfs2: improve write IO performance when
>> fragmentation is high") introduced another regression.
>>
>> The following ocfs2-test case can trigger this issue:
>>> ${RESV_UNWRITTEN_BIN} -f ${WORK_PLACE}/large_testfile -s 0 -l \
>>> $((${FILE_MAJOR_SIZE_M}*1024*1024))
>>
>> In my env, test disk size (by "fdisk -l <dev>"):
>>> 53687091200 bytes, 104857600 sectors.
>>
>> Above command is:
>>> /usr/local/ocfs2-test/bin/resv_unwritten -f \
>>> /mnt/ocfs2/ocfs2-activate-discontig-bg-dir/large_testfile -s 0 -l \
>>> 53187969024
>>
>> Error log:
>>> [*] Reserve 50724M space for a LARGE file, reserve 200M space for future test.
>>> ioctl error 28: "No space left on device"
>>> resv allocation failed Unknown error -1
>>> reserve unwritten region from 0 to 53187969024.
>>
>> Call flow:
>> __ocfs2_change_file_space //by ioctl OCFS2_IOC_RESVSP64
>>   ocfs2_allocate_unwritten_extents //start:0 len:53187969024
>>    while()
>>     + ocfs2_get_clusters //cpos:0, alloc_size:1623168 (cluster number)
>>     + ocfs2_extend_allocation
>>       + ocfs2_lock_allocators
>>       |  + choose OCFS2_AC_USE_MAIN & ocfs2_cluster_group_search
>>       |
>>       + ocfs2_add_inode_data
>>          ocfs2_add_clusters_in_btree
>>           __ocfs2_claim_clusters
>>            ocfs2_claim_suballoc_bits
>>            + During the allocation of the final part of the large file
>> 	    (around 47GB), no chain had the required contiguous
>>              bits_wanted. Consequently, the allocation failed.
>>
>> How to fix:
>> When OCFS2 is encountering fragmented allocation, the file system should
>> stop attempting bits_wanted contiguous allocation and instead provide the
>> largest available contiguous free bits from the cluster groups.
>>
>> Signed-off-by: Heming Zhao <heming.zhao@suse.com>
>> Fixes: 4eb7b93e0310 ("ocfs2: improve write IO performance when fragmentation is high")
>> ---
>>   fs/ocfs2/suballoc.c | 36 +++++++++++++++++++++++++++++-------
>>   1 file changed, 29 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
>> index fde75f2af37a..2e1689fc6cf7 100644
>> --- a/fs/ocfs2/suballoc.c
>> +++ b/fs/ocfs2/suballoc.c
>> @@ -1796,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
>>   {
>>   	int status;
>>   	u16 chain;
>> +	u32 contig_bits;
>>   	u64 next_group;
>>   	struct inode *alloc_inode = ac->ac_inode;
>>   	struct buffer_head *group_bh = NULL;
>> @@ -1821,10 +1822,23 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
>>   	status = -ENOSPC;
>>   	/* for now, the chain search is a bit simplistic. We just use
>>   	 * the 1st group with any empty bits. */
>> -	while ((status = ac->ac_group_search(alloc_inode, group_bh,
>> -					     bits_wanted, min_bits,
>> -					     ac->ac_max_block,
>> -					     res)) == -ENOSPC) {
>> +	while (1) {
>> +		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
>> +			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
>> +			if (!contig_bits)
>> +				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
>> +						le16_to_cpu(bg->bg_bits), 0);
>> +			if (bits_wanted > contig_bits)
>> +				bits_wanted = contig_bits;
>> +			if (bits_wanted < min_bits)
>> +				bits_wanted = min_bits;
> 
> This seems only valid when bits_wanted changed?

Good catch. the correct style:

if (bits_wanted > contig_bits && contig_bits > min_bits)
         bits_wanted = contig_bits;

- Heming

> 
> BTW, the previous fix hasn't been merged yet:
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/fs/ocfs2?h=next-20250411&id=767ba8b7cba3ca7a560d632f267f7aea35d54810
> 
> So should we drop that first and fold it into a whole one?
> 
> Thanks,
> Joseph
diff mbox series

Patch

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index fde75f2af37a..2e1689fc6cf7 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1796,6 +1796,7 @@  static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 {
 	int status;
 	u16 chain;
+	u32 contig_bits;
 	u64 next_group;
 	struct inode *alloc_inode = ac->ac_inode;
 	struct buffer_head *group_bh = NULL;
@@ -1821,10 +1822,23 @@  static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
 	 * the 1st group with any empty bits. */
-	while ((status = ac->ac_group_search(alloc_inode, group_bh,
-					     bits_wanted, min_bits,
-					     ac->ac_max_block,
-					     res)) == -ENOSPC) {
+	while (1) {
+		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
+			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
+			if (!contig_bits)
+				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
+						le16_to_cpu(bg->bg_bits), 0);
+			if (bits_wanted > contig_bits)
+				bits_wanted = contig_bits;
+			if (bits_wanted < min_bits)
+				bits_wanted = min_bits;
+		}
+
+		status = ac->ac_group_search(alloc_inode, group_bh,
+				bits_wanted, min_bits,
+				ac->ac_max_block, res);
+		if (status != -ENOSPC)
+			break;
 		if (!bg->bg_next_group)
 			break;
 
@@ -1984,6 +1998,7 @@  static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 	victim = ocfs2_find_victim_chain(cl);
 	ac->ac_chain = victim;
 
+search:
 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
 				    res, &bits_left);
 	if (!status) {
@@ -2024,6 +2039,16 @@  static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 		}
 	}
 
+	/* Chains can't supply the bits_wanted contiguous space.
+	 * We should switch to using every single bit when allocating
+	 * from the global bitmap. */
+	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
+	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
+		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
+		ac->ac_chain = victim;
+		goto search;
+	}
+
 set_hint:
 	if (status != -ENOSPC) {
 		/* If the next search of this group is not likely to
@@ -2432,9 +2457,6 @@  int ocfs2_claim_clusters(handle_t *handle,
 {
 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
 
-	if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG)
-		bits_wanted = min_clusters;
-
 	return __ocfs2_claim_clusters(handle, ac, min_clusters,
 				      bits_wanted, cluster_start, num_clusters);
 }