diff mbox series

[3/9] spaceman/defrag: defrag segments

Message ID 20240709191028.2329-4-wen.gang.wang@oracle.com (mailing list archive)
State Accepted, archived
Headers show
Series introduce defrag to xfs_spaceman | expand

Commit Message

Wengang Wang July 9, 2024, 7:10 p.m. UTC
For each segment, the following steps are done trying to defrag it:

1. share the segment with a temporary file
2. unshare the segment in the target file. kernel simulates Cow on the whole
   segment complete the unshare (defrag).
3. release blocks from the tempoary file.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
---
 spaceman/defrag.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

Comments

Darrick J. Wong July 9, 2024, 9:57 p.m. UTC | #1
On Tue, Jul 09, 2024 at 12:10:22PM -0700, Wengang Wang wrote:
> For each segment, the following steps are done trying to defrag it:
> 
> 1. share the segment with a temporary file
> 2. unshare the segment in the target file. kernel simulates Cow on the whole
>    segment complete the unshare (defrag).
> 3. release blocks from the tempoary file.
> 
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
>  spaceman/defrag.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 114 insertions(+)
> 
> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
> index 175cf461..9f11e36b 100644
> --- a/spaceman/defrag.c
> +++ b/spaceman/defrag.c
> @@ -263,6 +263,40 @@ add_ext:
>  	return ret;
>  }
>  
> +/*
> + * check if the segment exceeds EoF.
> + * fix up the clone range and return true if EoF happens,
> + * return false otherwise.
> + */
> +static bool
> +defrag_clone_eof(struct file_clone_range *clone)
> +{
> +	off_t delta;
> +
> +	delta = clone->src_offset + clone->src_length - g_defrag_file_size;
> +	if (delta > 0) {
> +		clone->src_length = 0; // to the end
> +		return true;
> +	}
> +	return false;
> +}
> +
> +/*
> + * get the time delta since pre_time in ms.
> + * pre_time should contains values fetched by gettimeofday()
> + * cur_time is used to store current time by gettimeofday()
> + */
> +static long long
> +get_time_delta_us(struct timeval *pre_time, struct timeval *cur_time)
> +{
> +	long long us;
> +
> +	gettimeofday(cur_time, NULL);
> +	us = (cur_time->tv_sec - pre_time->tv_sec) * 1000000;
> +	us += (cur_time->tv_usec - pre_time->tv_usec);
> +	return us;
> +}
> +
>  /*
>   * defragment a file
>   * return 0 if successfully done, 1 otherwise
> @@ -273,6 +307,7 @@ defrag_xfs_defrag(char *file_path) {
>  	long	nr_seg_defrag = 0, nr_ext_defrag = 0;
>  	int	scratch_fd = -1, defrag_fd = -1;
>  	char	tmp_file_path[PATH_MAX+1];
> +	struct file_clone_range clone;
>  	char	*defrag_dir;
>  	struct fsxattr	fsx;
>  	int	ret = 0;

Now that I see this, you might want to straighten up the lines:

	struct fsxattr	fsx = { };
	long		nr_seg_defrag = 0, nr_ext_defrag = 0;

etc.  Note the "= { }" bit that means you don't have to memset them to
zero explicitly.

> @@ -296,6 +331,8 @@ defrag_xfs_defrag(char *file_path) {
>  		goto out;
>  	}
>  
> +	clone.src_fd = defrag_fd;
> +
>  	defrag_dir = dirname(file_path);
>  	snprintf(tmp_file_path, PATH_MAX, "%s/.xfsdefrag_%d", defrag_dir,
>  		getpid());
> @@ -309,7 +346,11 @@ defrag_xfs_defrag(char *file_path) {
>  	}
>  
>  	do {
> +		struct timeval t_clone, t_unshare, t_punch_hole;
>  		struct defrag_segment segment;
> +		long long seg_size, seg_off;
> +		int time_delta;
> +		bool stop;
>  
>  		ret = defrag_get_next_segment(defrag_fd, &segment);
>  		/* no more segments, we are done */
> @@ -322,6 +363,79 @@ defrag_xfs_defrag(char *file_path) {
>  			ret = 1;
>  			break;
>  		}
> +
> +		/* we are done if the segment contains only 1 extent */
> +		if (segment.ds_nr < 2)
> +			continue;
> +
> +		/* to bytes */
> +		seg_off = segment.ds_offset * 512;
> +		seg_size = segment.ds_length * 512;
> +
> +		clone.src_offset = seg_off;
> +		clone.src_length = seg_size;
> +		clone.dest_offset = seg_off;
> +
> +		/* checks for EoF and fix up clone */
> +		stop = defrag_clone_eof(&clone);
> +		gettimeofday(&t_clone, NULL);
> +		ret = ioctl(scratch_fd, FICLONERANGE, &clone);

Hm, should the top-level defrag_f function check in the
filetable[i].fsgeom structure that the fs supports reflink?

> +		if (ret != 0) {
> +			fprintf(stderr, "FICLONERANGE failed %s\n",
> +				strerror(errno));

Might be useful to include the file_path in the error message:

/opt/a: FICLONERANGE failed Software caused connection abort

(maybe also put a semicolon before the strerror message?)

> +			break;
> +		}
> +
> +		/* for time stats */
> +		time_delta = get_time_delta_us(&t_clone, &t_unshare);
> +		if (time_delta > max_clone_us)
> +			max_clone_us = time_delta;
> +
> +		/* for defrag stats */
> +		nr_ext_defrag += segment.ds_nr;
> +
> +		/*
> +		 * For the shared range to be unshared via a copy-on-write
> +		 * operation in the file to be defragged. This causes the
> +		 * file needing to be defragged to have new extents allocated
> +		 * and the data to be copied over and written out.
> +		 */
> +		ret = fallocate(defrag_fd, FALLOC_FL_UNSHARE_RANGE, seg_off,
> +				seg_size);
> +		if (ret != 0) {
> +			fprintf(stderr, "UNSHARE_RANGE failed %s\n",
> +				strerror(errno));
> +			break;
> +		}
> +
> +		/* for time stats */
> +		time_delta = get_time_delta_us(&t_unshare, &t_punch_hole);
> +		if (time_delta > max_unshare_us)
> +			max_unshare_us = time_delta;
> +
> +		/*
> +		 * Punch out the original extents we shared to the
> +		 * scratch file so they are returned to free space.
> +		 */
> +		ret = fallocate(scratch_fd,
> +			FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, seg_off,
> +			seg_size);

Indentation here (two tabs for a continuation).  Or just ftruncate
scratch_fd to zero bytes?  I think you have to do that for the EOF stuff
to work, right?

--D

> +		if (ret != 0) {
> +			fprintf(stderr, "PUNCH_HOLE failed %s\n",
> +				strerror(errno));
> +			break;
> +		}
> +
> +		/* for defrag stats */
> +		nr_seg_defrag += 1;
> +
> +		/* for time stats */
> +		time_delta = get_time_delta_us(&t_punch_hole, &t_clone);
> +		if (time_delta > max_punch_us)
> +			max_punch_us = time_delta;
> +
> +		if (stop)
> +			break;
>  	} while (true);
>  out:
>  	if (scratch_fd != -1) {
> -- 
> 2.39.3 (Apple Git-146)
> 
>
Wengang Wang July 11, 2024, 10:49 p.m. UTC | #2
> On Jul 9, 2024, at 2:57 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> On Tue, Jul 09, 2024 at 12:10:22PM -0700, Wengang Wang wrote:
>> For each segment, the following steps are done trying to defrag it:
>> 
>> 1. share the segment with a temporary file
>> 2. unshare the segment in the target file. kernel simulates Cow on the whole
>>   segment complete the unshare (defrag).
>> 3. release blocks from the tempoary file.
>> 
>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
>> ---
>> spaceman/defrag.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 114 insertions(+)
>> 
>> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
>> index 175cf461..9f11e36b 100644
>> --- a/spaceman/defrag.c
>> +++ b/spaceman/defrag.c
>> @@ -263,6 +263,40 @@ add_ext:
>> return ret;
>> }
>> 
>> +/*
>> + * check if the segment exceeds EoF.
>> + * fix up the clone range and return true if EoF happens,
>> + * return false otherwise.
>> + */
>> +static bool
>> +defrag_clone_eof(struct file_clone_range *clone)
>> +{
>> + off_t delta;
>> +
>> + delta = clone->src_offset + clone->src_length - g_defrag_file_size;
>> + if (delta > 0) {
>> + clone->src_length = 0; // to the end
>> + return true;
>> + }
>> + return false;
>> +}
>> +
>> +/*
>> + * get the time delta since pre_time in ms.
>> + * pre_time should contains values fetched by gettimeofday()
>> + * cur_time is used to store current time by gettimeofday()
>> + */
>> +static long long
>> +get_time_delta_us(struct timeval *pre_time, struct timeval *cur_time)
>> +{
>> + long long us;
>> +
>> + gettimeofday(cur_time, NULL);
>> + us = (cur_time->tv_sec - pre_time->tv_sec) * 1000000;
>> + us += (cur_time->tv_usec - pre_time->tv_usec);
>> + return us;
>> +}
>> +
>> /*
>>  * defragment a file
>>  * return 0 if successfully done, 1 otherwise
>> @@ -273,6 +307,7 @@ defrag_xfs_defrag(char *file_path) {
>> long nr_seg_defrag = 0, nr_ext_defrag = 0;
>> int scratch_fd = -1, defrag_fd = -1;
>> char tmp_file_path[PATH_MAX+1];
>> + struct file_clone_range clone;
>> char *defrag_dir;
>> struct fsxattr fsx;
>> int ret = 0;
> 
> Now that I see this, you might want to straighten up the lines:
> 
> struct fsxattr fsx = { };
> long nr_seg_defrag = 0, nr_ext_defrag = 0;
> 
> etc.  Note the "= { }" bit that means you don't have to memset them to
> zero explicitly.

Nice!

> 
>> @@ -296,6 +331,8 @@ defrag_xfs_defrag(char *file_path) {
>> goto out;
>> }
>> 
>> + clone.src_fd = defrag_fd;
>> +
>> defrag_dir = dirname(file_path);
>> snprintf(tmp_file_path, PATH_MAX, "%s/.xfsdefrag_%d", defrag_dir,
>> getpid());
>> @@ -309,7 +346,11 @@ defrag_xfs_defrag(char *file_path) {
>> }
>> 
>> do {
>> + struct timeval t_clone, t_unshare, t_punch_hole;
>> struct defrag_segment segment;
>> + long long seg_size, seg_off;
>> + int time_delta;
>> + bool stop;
>> 
>> ret = defrag_get_next_segment(defrag_fd, &segment);
>> /* no more segments, we are done */
>> @@ -322,6 +363,79 @@ defrag_xfs_defrag(char *file_path) {
>> ret = 1;
>> break;
>> }
>> +
>> + /* we are done if the segment contains only 1 extent */
>> + if (segment.ds_nr < 2)
>> + continue;
>> +
>> + /* to bytes */
>> + seg_off = segment.ds_offset * 512;
>> + seg_size = segment.ds_length * 512;
>> +
>> + clone.src_offset = seg_off;
>> + clone.src_length = seg_size;
>> + clone.dest_offset = seg_off;
>> +
>> + /* checks for EoF and fix up clone */
>> + stop = defrag_clone_eof(&clone);
>> + gettimeofday(&t_clone, NULL);
>> + ret = ioctl(scratch_fd, FICLONERANGE, &clone);
> 
> Hm, should the top-level defrag_f function check in the
> filetable[i].fsgeom structure that the fs supports reflink?

Yes, good to know.

> 
>> + if (ret != 0) {
>> + fprintf(stderr, "FICLONERANGE failed %s\n",
>> + strerror(errno));
> 
> Might be useful to include the file_path in the error message:
> 
> /opt/a: FICLONERANGE failed Software caused connection abort
> 
> (maybe also put a semicolon before the strerror message?)

OK.

> 
>> + break;
>> + }
>> +
>> + /* for time stats */
>> + time_delta = get_time_delta_us(&t_clone, &t_unshare);
>> + if (time_delta > max_clone_us)
>> + max_clone_us = time_delta;
>> +
>> + /* for defrag stats */
>> + nr_ext_defrag += segment.ds_nr;
>> +
>> + /*
>> +  * For the shared range to be unshared via a copy-on-write
>> +  * operation in the file to be defragged. This causes the
>> +  * file needing to be defragged to have new extents allocated
>> +  * and the data to be copied over and written out.
>> +  */
>> + ret = fallocate(defrag_fd, FALLOC_FL_UNSHARE_RANGE, seg_off,
>> + seg_size);
>> + if (ret != 0) {
>> + fprintf(stderr, "UNSHARE_RANGE failed %s\n",
>> + strerror(errno));
>> + break;
>> + }
>> +
>> + /* for time stats */
>> + time_delta = get_time_delta_us(&t_unshare, &t_punch_hole);
>> + if (time_delta > max_unshare_us)
>> + max_unshare_us = time_delta;
>> +
>> + /*
>> +  * Punch out the original extents we shared to the
>> +  * scratch file so they are returned to free space.
>> +  */
>> + ret = fallocate(scratch_fd,
>> + FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, seg_off,
>> + seg_size);
> 
> Indentation here (two tabs for a continuation).  

OK.

> Or just ftruncate
> scratch_fd to zero bytes?  I think you have to do that for the EOF stuff
> to work, right?
> 

I’d truncate the UNSHARE range only in the loop.
EOF stuff would be truncated on (O_TMPFILE) file close.
The EOF stuff would be used for another purpose, see 
[PATCH 6/9] spaceman/defrag: workaround kernel

Thanks,
Wengang

> --D
> 
>> + if (ret != 0) {
>> + fprintf(stderr, "PUNCH_HOLE failed %s\n",
>> + strerror(errno));
>> + break;
>> + }
>> +
>> + /* for defrag stats */
>> + nr_seg_defrag += 1;
>> +
>> + /* for time stats */
>> + time_delta = get_time_delta_us(&t_punch_hole, &t_clone);
>> + if (time_delta > max_punch_us)
>> + max_punch_us = time_delta;
>> +
>> + if (stop)
>> + break;
>> } while (true);
>> out:
>> if (scratch_fd != -1) {
>> -- 
>> 2.39.3 (Apple Git-146)
Wengang Wang July 12, 2024, 7:07 p.m. UTC | #3
> On Jul 11, 2024, at 3:49 PM, Wengang Wang <wen.gang.wang@oracle.com> wrote:
> 
> 
> 
>> On Jul 9, 2024, at 2:57 PM, Darrick J. Wong <djwong@kernel.org> wrote:
>> 
>> On Tue, Jul 09, 2024 at 12:10:22PM -0700, Wengang Wang wrote:
>>> For each segment, the following steps are done trying to defrag it:
>>> 
>>> 1. share the segment with a temporary file
>>> 2. unshare the segment in the target file. kernel simulates Cow on the whole
>>>  segment complete the unshare (defrag).
>>> 3. release blocks from the tempoary file.
>>> 
>>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
>>> ---
>>> spaceman/defrag.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 114 insertions(+)
>>> 
>>> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
>>> index 175cf461..9f11e36b 100644
>>> --- a/spaceman/defrag.c
>>> +++ b/spaceman/defrag.c
>>> @@ -263,6 +263,40 @@ add_ext:
>>> return ret;
>>> }
>>> 
>>> +/*
>>> + * check if the segment exceeds EoF.
>>> + * fix up the clone range and return true if EoF happens,
>>> + * return false otherwise.
>>> + */
>>> +static bool
>>> +defrag_clone_eof(struct file_clone_range *clone)
>>> +{
>>> + off_t delta;
>>> +
>>> + delta = clone->src_offset + clone->src_length - g_defrag_file_size;
>>> + if (delta > 0) {
>>> + clone->src_length = 0; // to the end
>>> + return true;
>>> + }
>>> + return false;
>>> +}
>>> +
>>> +/*
>>> + * get the time delta since pre_time in ms.
>>> + * pre_time should contains values fetched by gettimeofday()
>>> + * cur_time is used to store current time by gettimeofday()
>>> + */
>>> +static long long
>>> +get_time_delta_us(struct timeval *pre_time, struct timeval *cur_time)
>>> +{
>>> + long long us;
>>> +
>>> + gettimeofday(cur_time, NULL);
>>> + us = (cur_time->tv_sec - pre_time->tv_sec) * 1000000;
>>> + us += (cur_time->tv_usec - pre_time->tv_usec);
>>> + return us;
>>> +}
>>> +
>>> /*
>>> * defragment a file
>>> * return 0 if successfully done, 1 otherwise
>>> @@ -273,6 +307,7 @@ defrag_xfs_defrag(char *file_path) {
>>> long nr_seg_defrag = 0, nr_ext_defrag = 0;
>>> int scratch_fd = -1, defrag_fd = -1;
>>> char tmp_file_path[PATH_MAX+1];
>>> + struct file_clone_range clone;
>>> char *defrag_dir;
>>> struct fsxattr fsx;
>>> int ret = 0;
>> 
>> Now that I see this, you might want to straighten up the lines:
>> 
>> struct fsxattr fsx = { };
>> long nr_seg_defrag = 0, nr_ext_defrag = 0;
>> 
>> etc.  Note the "= { }" bit that means you don't have to memset them to
>> zero explicitly.
> 
> Nice!
> 
>> 
>>> @@ -296,6 +331,8 @@ defrag_xfs_defrag(char *file_path) {
>>> goto out;
>>> }
>>> 
>>> + clone.src_fd = defrag_fd;
>>> +
>>> defrag_dir = dirname(file_path);
>>> snprintf(tmp_file_path, PATH_MAX, "%s/.xfsdefrag_%d", defrag_dir,
>>> getpid());
>>> @@ -309,7 +346,11 @@ defrag_xfs_defrag(char *file_path) {
>>> }
>>> 
>>> do {
>>> + struct timeval t_clone, t_unshare, t_punch_hole;
>>> struct defrag_segment segment;
>>> + long long seg_size, seg_off;
>>> + int time_delta;
>>> + bool stop;
>>> 
>>> ret = defrag_get_next_segment(defrag_fd, &segment);
>>> /* no more segments, we are done */
>>> @@ -322,6 +363,79 @@ defrag_xfs_defrag(char *file_path) {
>>> ret = 1;
>>> break;
>>> }
>>> +
>>> + /* we are done if the segment contains only 1 extent */
>>> + if (segment.ds_nr < 2)
>>> + continue;
>>> +
>>> + /* to bytes */
>>> + seg_off = segment.ds_offset * 512;
>>> + seg_size = segment.ds_length * 512;
>>> +
>>> + clone.src_offset = seg_off;
>>> + clone.src_length = seg_size;
>>> + clone.dest_offset = seg_off;
>>> +
>>> + /* checks for EoF and fix up clone */
>>> + stop = defrag_clone_eof(&clone);
>>> + gettimeofday(&t_clone, NULL);
>>> + ret = ioctl(scratch_fd, FICLONERANGE, &clone);
>> 
>> Hm, should the top-level defrag_f function check in the
>> filetable[i].fsgeom structure that the fs supports reflink?
> 
> Yes, good to know.

It seems that xfs_fsop_geom doesn’t know about reflink?

Thanks,
Wengang 

> 
>> 
>>> + if (ret != 0) {
>>> + fprintf(stderr, "FICLONERANGE failed %s\n",
>>> + strerror(errno));
>> 
>> Might be useful to include the file_path in the error message:
>> 
>> /opt/a: FICLONERANGE failed Software caused connection abort
>> 
>> (maybe also put a semicolon before the strerror message?)
> 
> OK.
> 
>> 
>>> + break;
>>> + }
>>> +
>>> + /* for time stats */
>>> + time_delta = get_time_delta_us(&t_clone, &t_unshare);
>>> + if (time_delta > max_clone_us)
>>> + max_clone_us = time_delta;
>>> +
>>> + /* for defrag stats */
>>> + nr_ext_defrag += segment.ds_nr;
>>> +
>>> + /*
>>> +  * For the shared range to be unshared via a copy-on-write
>>> +  * operation in the file to be defragged. This causes the
>>> +  * file needing to be defragged to have new extents allocated
>>> +  * and the data to be copied over and written out.
>>> +  */
>>> + ret = fallocate(defrag_fd, FALLOC_FL_UNSHARE_RANGE, seg_off,
>>> + seg_size);
>>> + if (ret != 0) {
>>> + fprintf(stderr, "UNSHARE_RANGE failed %s\n",
>>> + strerror(errno));
>>> + break;
>>> + }
>>> +
>>> + /* for time stats */
>>> + time_delta = get_time_delta_us(&t_unshare, &t_punch_hole);
>>> + if (time_delta > max_unshare_us)
>>> + max_unshare_us = time_delta;
>>> +
>>> + /*
>>> +  * Punch out the original extents we shared to the
>>> +  * scratch file so they are returned to free space.
>>> +  */
>>> + ret = fallocate(scratch_fd,
>>> + FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, seg_off,
>>> + seg_size);
>> 
>> Indentation here (two tabs for a continuation).  
> 
> OK.
> 
>> Or just ftruncate
>> scratch_fd to zero bytes?  I think you have to do that for the EOF stuff
>> to work, right?
>> 
> 
> I’d truncate the UNSHARE range only in the loop.
> EOF stuff would be truncated on (O_TMPFILE) file close.
> The EOF stuff would be used for another purpose, see 
> [PATCH 6/9] spaceman/defrag: workaround kernel
> 
> Thanks,
> Wengang
> 
>> --D
>> 
>>> + if (ret != 0) {
>>> + fprintf(stderr, "PUNCH_HOLE failed %s\n",
>>> + strerror(errno));
>>> + break;
>>> + }
>>> +
>>> + /* for defrag stats */
>>> + nr_seg_defrag += 1;
>>> +
>>> + /* for time stats */
>>> + time_delta = get_time_delta_us(&t_punch_hole, &t_clone);
>>> + if (time_delta > max_punch_us)
>>> + max_punch_us = time_delta;
>>> +
>>> + if (stop)
>>> + break;
>>> } while (true);
>>> out:
>>> if (scratch_fd != -1) {
>>> -- 
>>> 2.39.3 (Apple Git-146)
Darrick J. Wong July 15, 2024, 10:42 p.m. UTC | #4
On Fri, Jul 12, 2024 at 07:07:01PM +0000, Wengang Wang wrote:
> 
> 
> > On Jul 11, 2024, at 3:49 PM, Wengang Wang <wen.gang.wang@oracle.com> wrote:
> > 
> > 
> > 
> >> On Jul 9, 2024, at 2:57 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> >> 
> >> On Tue, Jul 09, 2024 at 12:10:22PM -0700, Wengang Wang wrote:
> >>> For each segment, the following steps are done trying to defrag it:
> >>> 
> >>> 1. share the segment with a temporary file
> >>> 2. unshare the segment in the target file. kernel simulates Cow on the whole
> >>>  segment complete the unshare (defrag).
> >>> 3. release blocks from the tempoary file.
> >>> 
> >>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> >>> ---
> >>> spaceman/defrag.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++
> >>> 1 file changed, 114 insertions(+)
> >>> 
> >>> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
> >>> index 175cf461..9f11e36b 100644
> >>> --- a/spaceman/defrag.c
> >>> +++ b/spaceman/defrag.c

<snip>

> >>> @@ -322,6 +363,79 @@ defrag_xfs_defrag(char *file_path) {
> >>> ret = 1;
> >>> break;
> >>> }
> >>> +
> >>> + /* we are done if the segment contains only 1 extent */
> >>> + if (segment.ds_nr < 2)
> >>> + continue;
> >>> +
> >>> + /* to bytes */
> >>> + seg_off = segment.ds_offset * 512;
> >>> + seg_size = segment.ds_length * 512;
> >>> +
> >>> + clone.src_offset = seg_off;
> >>> + clone.src_length = seg_size;
> >>> + clone.dest_offset = seg_off;
> >>> +
> >>> + /* checks for EoF and fix up clone */
> >>> + stop = defrag_clone_eof(&clone);
> >>> + gettimeofday(&t_clone, NULL);
> >>> + ret = ioctl(scratch_fd, FICLONERANGE, &clone);
> >> 
> >> Hm, should the top-level defrag_f function check in the
> >> filetable[i].fsgeom structure that the fs supports reflink?
> > 
> > Yes, good to know.
> 
> It seems that xfs_fsop_geom doesn’t know about reflink?

XFS_FSOP_GEOM_FLAGS_REFLINK ?

--D

> Thanks,
> Wengang 
> 
> > 
> >> 
> >>> + if (ret != 0) {
> >>> + fprintf(stderr, "FICLONERANGE failed %s\n",
> >>> + strerror(errno));
> >> 
> >> Might be useful to include the file_path in the error message:
> >> 
> >> /opt/a: FICLONERANGE failed Software caused connection abort
> >> 
> >> (maybe also put a semicolon before the strerror message?)
> > 
> > OK.
> > 
> >> 
> >>> + break;
> >>> + }
> >>> +
> >>> + /* for time stats */
> >>> + time_delta = get_time_delta_us(&t_clone, &t_unshare);
> >>> + if (time_delta > max_clone_us)
> >>> + max_clone_us = time_delta;
> >>> +
> >>> + /* for defrag stats */
> >>> + nr_ext_defrag += segment.ds_nr;
> >>> +
> >>> + /*
> >>> +  * For the shared range to be unshared via a copy-on-write
> >>> +  * operation in the file to be defragged. This causes the
> >>> +  * file needing to be defragged to have new extents allocated
> >>> +  * and the data to be copied over and written out.
> >>> +  */
> >>> + ret = fallocate(defrag_fd, FALLOC_FL_UNSHARE_RANGE, seg_off,
> >>> + seg_size);
> >>> + if (ret != 0) {
> >>> + fprintf(stderr, "UNSHARE_RANGE failed %s\n",
> >>> + strerror(errno));
> >>> + break;
> >>> + }
> >>> +
> >>> + /* for time stats */
> >>> + time_delta = get_time_delta_us(&t_unshare, &t_punch_hole);
> >>> + if (time_delta > max_unshare_us)
> >>> + max_unshare_us = time_delta;
> >>> +
> >>> + /*
> >>> +  * Punch out the original extents we shared to the
> >>> +  * scratch file so they are returned to free space.
> >>> +  */
> >>> + ret = fallocate(scratch_fd,
> >>> + FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, seg_off,
> >>> + seg_size);
> >> 
> >> Indentation here (two tabs for a continuation).  
> > 
> > OK.
> > 
> >> Or just ftruncate
> >> scratch_fd to zero bytes?  I think you have to do that for the EOF stuff
> >> to work, right?
> >> 
> > 
> > I’d truncate the UNSHARE range only in the loop.
> > EOF stuff would be truncated on (O_TMPFILE) file close.
> > The EOF stuff would be used for another purpose, see 
> > [PATCH 6/9] spaceman/defrag: workaround kernel
> > 
> > Thanks,
> > Wengang
> > 
> >> --D
> >> 
> >>> + if (ret != 0) {
> >>> + fprintf(stderr, "PUNCH_HOLE failed %s\n",
> >>> + strerror(errno));
> >>> + break;
> >>> + }
> >>> +
> >>> + /* for defrag stats */
> >>> + nr_seg_defrag += 1;
> >>> +
> >>> + /* for time stats */
> >>> + time_delta = get_time_delta_us(&t_punch_hole, &t_clone);
> >>> + if (time_delta > max_punch_us)
> >>> + max_punch_us = time_delta;
> >>> +
> >>> + if (stop)
> >>> + break;
> >>> } while (true);
> >>> out:
> >>> if (scratch_fd != -1) {
> >>> -- 
> >>> 2.39.3 (Apple Git-146)
> 
>
Dave Chinner July 16, 2024, 12:08 a.m. UTC | #5
On Tue, Jul 09, 2024 at 12:10:22PM -0700, Wengang Wang wrote:
> For each segment, the following steps are done trying to defrag it:
> 
> 1. share the segment with a temporary file
> 2. unshare the segment in the target file. kernel simulates Cow on the whole
>    segment complete the unshare (defrag).
> 3. release blocks from the tempoary file.
> 
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
>  spaceman/defrag.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 114 insertions(+)
> 
> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
> index 175cf461..9f11e36b 100644
> --- a/spaceman/defrag.c
> +++ b/spaceman/defrag.c
> @@ -263,6 +263,40 @@ add_ext:
>  	return ret;
>  }
>  
> +/*
> + * check if the segment exceeds EoF.
> + * fix up the clone range and return true if EoF happens,
> + * return false otherwise.
> + */
> +static bool
> +defrag_clone_eof(struct file_clone_range *clone)
> +{
> +	off_t delta;
> +
> +	delta = clone->src_offset + clone->src_length - g_defrag_file_size;
> +	if (delta > 0) {
> +		clone->src_length = 0; // to the end
> +		return true;
> +	}
> +	return false;
> +}
> +
> +/*
> + * get the time delta since pre_time in ms.
> + * pre_time should contains values fetched by gettimeofday()
> + * cur_time is used to store current time by gettimeofday()
> + */
> +static long long
> +get_time_delta_us(struct timeval *pre_time, struct timeval *cur_time)
> +{
> +	long long us;
> +
> +	gettimeofday(cur_time, NULL);
> +	us = (cur_time->tv_sec - pre_time->tv_sec) * 1000000;
> +	us += (cur_time->tv_usec - pre_time->tv_usec);
> +	return us;
> +}
> +
>  /*
>   * defragment a file
>   * return 0 if successfully done, 1 otherwise
> @@ -273,6 +307,7 @@ defrag_xfs_defrag(char *file_path) {
>  	long	nr_seg_defrag = 0, nr_ext_defrag = 0;
>  	int	scratch_fd = -1, defrag_fd = -1;
>  	char	tmp_file_path[PATH_MAX+1];
> +	struct file_clone_range clone;
>  	char	*defrag_dir;
>  	struct fsxattr	fsx;
>  	int	ret = 0;
> @@ -296,6 +331,8 @@ defrag_xfs_defrag(char *file_path) {
>  		goto out;
>  	}
>  
> +	clone.src_fd = defrag_fd;
> +
>  	defrag_dir = dirname(file_path);

Just a note: can you please call this the "source fd", not the
"defrag_fd"? defrag_fd could mean either the source or the
temporary scratch file we use as the defrag destination.

>  	snprintf(tmp_file_path, PATH_MAX, "%s/.xfsdefrag_%d", defrag_dir,
>  		getpid());
> @@ -309,7 +346,11 @@ defrag_xfs_defrag(char *file_path) {
>  	}
>  
>  	do {
> +		struct timeval t_clone, t_unshare, t_punch_hole;
>  		struct defrag_segment segment;
> +		long long seg_size, seg_off;
> +		int time_delta;
> +		bool stop;
>  
>  		ret = defrag_get_next_segment(defrag_fd, &segment);
>  		/* no more segments, we are done */
> @@ -322,6 +363,79 @@ defrag_xfs_defrag(char *file_path) {
>  			ret = 1;
>  			break;
>  		}
> +
> +		/* we are done if the segment contains only 1 extent */
> +		if (segment.ds_nr < 2)
> +			continue;
> +
> +		/* to bytes */
> +		seg_off = segment.ds_offset * 512;
> +		seg_size = segment.ds_length * 512;

Ugh. Do this in the mapping code that gets the extent info. Have it
return bytes. Or just use FIEMAP because it uses byte ranges to
begin with.

> +
> +		clone.src_offset = seg_off;
> +		clone.src_length = seg_size;
> +		clone.dest_offset = seg_off;
> +
> +		/* checks for EoF and fix up clone */
> +		stop = defrag_clone_eof(&clone);

Ok, so we copy the segment map into clone args, and ...

> +		gettimeofday(&t_clone, NULL);
> +		ret = ioctl(scratch_fd, FICLONERANGE, &clone);
> +		if (ret != 0) {
> +			fprintf(stderr, "FICLONERANGE failed %s\n",
> +				strerror(errno));
> +			break;
> +		}

clone the source to the scratch file. This blocks writes to the
source file while it is in progress, but allows reads to pass
through the source file as data is not changing.


> +		/* for time stats */
> +		time_delta = get_time_delta_us(&t_clone, &t_unshare);
> +		if (time_delta > max_clone_us)
> +			max_clone_us = time_delta;
> +
> +		/* for defrag stats */
> +		nr_ext_defrag += segment.ds_nr;
> +
> +		/*
> +		 * For the shared range to be unshared via a copy-on-write
> +		 * operation in the file to be defragged. This causes the
> +		 * file needing to be defragged to have new extents allocated
> +		 * and the data to be copied over and written out.
> +		 */
> +		ret = fallocate(defrag_fd, FALLOC_FL_UNSHARE_RANGE, seg_off,
> +				seg_size);
> +		if (ret != 0) {
> +			fprintf(stderr, "UNSHARE_RANGE failed %s\n",
> +				strerror(errno));
> +			break;
> +		}

And now we unshare the source file. This blocks all IO to the source
file.

Ok, so this is the fundamental problem this whole "segmented
defrag" is trying to work around: FALLOC_FL_UNSHARE_RANGE blocks
all read and write IO whilst it is in progress.

We had this same problem with FICLONERANGE taking snapshots of VM
files - we changed the locking to take shared IO locks to allow
reads to run while the clone was in progress. Because the Oracle Vm
infrastructure uses a sidecar to redirect writes while a snapshot
(clone) was in progress, no VM IO got blocked while the clone was in
progress and so the applications inside the VM never even noticed a
clone was taking place.

Why isn't the same infrastructure being used here?

FALLOC_FL_UNSHARE_RANGE is not changing data, nor is it freeing any
data blocks. Yes, we are re-writing the data somewhere else, but
in that case the original data is still intact in it's original
location on disk and not being freed.

Hence if a read races with UNSHARE, it will hit a referenced extent
containing the correct data regardless of whether it is in the old
or new file. Hence we can likely use shared IO locking for UNSHARE,
just like we do for FICLONERANGE.

At this point, if the Oracle VM infrastructure uses the sidecar
write channel whilst the defrag is in progress, this whole algorithm
simply becomes "for regions with extents smaller than X, clone and
unshare the region".

The whole need for "idle time" goes away. The need for segment size
control largely goes away. The need to tune the defrag algorithm to
avoid IO latency and/or throughput issues goes away.

> +
> +		/* for time stats */
> +		time_delta = get_time_delta_us(&t_unshare, &t_punch_hole);
> +		if (time_delta > max_unshare_us)
> +			max_unshare_us = time_delta;
> +
> +		/*
> +		 * Punch out the original extents we shared to the
> +		 * scratch file so they are returned to free space.
> +		 */
> +		ret = fallocate(scratch_fd,
> +			FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, seg_off,
> +			seg_size);
> +		if (ret != 0) {
> +			fprintf(stderr, "PUNCH_HOLE failed %s\n",
> +				strerror(errno));
> +			break;
> +		}

This is unnecessary if there is lots of free space. You can leave
this to the very end of defrag so that the source file defrag
operation isn't slowed down by cleaning up all the fragmented
extents....

-Dave.
Wengang Wang July 18, 2024, 6:06 p.m. UTC | #6
> On Jul 15, 2024, at 5:08 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Tue, Jul 09, 2024 at 12:10:22PM -0700, Wengang Wang wrote:
>> For each segment, the following steps are done trying to defrag it:
>> 
>> 1. share the segment with a temporary file
>> 2. unshare the segment in the target file. kernel simulates Cow on the whole
>>   segment complete the unshare (defrag).
>> 3. release blocks from the tempoary file.
>> 
>> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
>> ---
>> spaceman/defrag.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 114 insertions(+)
>> 
>> diff --git a/spaceman/defrag.c b/spaceman/defrag.c
>> index 175cf461..9f11e36b 100644
>> --- a/spaceman/defrag.c
>> +++ b/spaceman/defrag.c
>> @@ -263,6 +263,40 @@ add_ext:
>> return ret;
>> }
>> 
>> +/*
>> + * check if the segment exceeds EoF.
>> + * fix up the clone range and return true if EoF happens,
>> + * return false otherwise.
>> + */
>> +static bool
>> +defrag_clone_eof(struct file_clone_range *clone)
>> +{
>> + off_t delta;
>> +
>> + delta = clone->src_offset + clone->src_length - g_defrag_file_size;
>> + if (delta > 0) {
>> + clone->src_length = 0; // to the end
>> + return true;
>> + }
>> + return false;
>> +}
>> +
>> +/*
>> + * get the time delta since pre_time in ms.
>> + * pre_time should contains values fetched by gettimeofday()
>> + * cur_time is used to store current time by gettimeofday()
>> + */
>> +static long long
>> +get_time_delta_us(struct timeval *pre_time, struct timeval *cur_time)
>> +{
>> + long long us;
>> +
>> + gettimeofday(cur_time, NULL);
>> + us = (cur_time->tv_sec - pre_time->tv_sec) * 1000000;
>> + us += (cur_time->tv_usec - pre_time->tv_usec);
>> + return us;
>> +}
>> +
>> /*
>>  * defragment a file
>>  * return 0 if successfully done, 1 otherwise
>> @@ -273,6 +307,7 @@ defrag_xfs_defrag(char *file_path) {
>> long nr_seg_defrag = 0, nr_ext_defrag = 0;
>> int scratch_fd = -1, defrag_fd = -1;
>> char tmp_file_path[PATH_MAX+1];
>> + struct file_clone_range clone;
>> char *defrag_dir;
>> struct fsxattr fsx;
>> int ret = 0;
>> @@ -296,6 +331,8 @@ defrag_xfs_defrag(char *file_path) {
>> goto out;
>> }
>> 
>> + clone.src_fd = defrag_fd;
>> +
>> defrag_dir = dirname(file_path);
> 
> Just a note: can you please call this the "source fd", not the
> "defrag_fd"? defrag_fd could mean either the source or the
> temporary scratch file we use as the defrag destination.

I have no problem to change that. Just I was thinking there is no data moving happening
from a file to another. The defrag_fd means the file under defrag and the temp file is with
scratch_fd.

> 
>> snprintf(tmp_file_path, PATH_MAX, "%s/.xfsdefrag_%d", defrag_dir,
>> getpid());
>> @@ -309,7 +346,11 @@ defrag_xfs_defrag(char *file_path) {
>> }
>> 
>> do {
>> + struct timeval t_clone, t_unshare, t_punch_hole;
>> struct defrag_segment segment;
>> + long long seg_size, seg_off;
>> + int time_delta;
>> + bool stop;
>> 
>> ret = defrag_get_next_segment(defrag_fd, &segment);
>> /* no more segments, we are done */
>> @@ -322,6 +363,79 @@ defrag_xfs_defrag(char *file_path) {
>> ret = 1;
>> break;
>> }
>> +
>> + /* we are done if the segment contains only 1 extent */
>> + if (segment.ds_nr < 2)
>> + continue;
>> +
>> + /* to bytes */
>> + seg_off = segment.ds_offset * 512;
>> + seg_size = segment.ds_length * 512;
> 
> Ugh. Do this in the mapping code that gets the extent info. Have it
> return bytes. Or just use FIEMAP because it uses byte ranges to
> begin with.

OK.

> 
>> +
>> + clone.src_offset = seg_off;
>> + clone.src_length = seg_size;
>> + clone.dest_offset = seg_off;
>> +
>> + /* checks for EoF and fix up clone */
>> + stop = defrag_clone_eof(&clone);
> 
> Ok, so we copy the segment map into clone args, and ...
> 
>> + gettimeofday(&t_clone, NULL);
>> + ret = ioctl(scratch_fd, FICLONERANGE, &clone);
>> + if (ret != 0) {
>> + fprintf(stderr, "FICLONERANGE failed %s\n",
>> + strerror(errno));
>> + break;
>> + }
> 
> clone the source to the scratch file. This blocks writes to the
> source file while it is in progress, but allows reads to pass
> through the source file as data is not changing.
> 
> 
>> + /* for time stats */
>> + time_delta = get_time_delta_us(&t_clone, &t_unshare);
>> + if (time_delta > max_clone_us)
>> + max_clone_us = time_delta;
>> +
>> + /* for defrag stats */
>> + nr_ext_defrag += segment.ds_nr;
>> +
>> + /*
>> +  * For the shared range to be unshared via a copy-on-write
>> +  * operation in the file to be defragged. This causes the
>> +  * file needing to be defragged to have new extents allocated
>> +  * and the data to be copied over and written out.
>> +  */
>> + ret = fallocate(defrag_fd, FALLOC_FL_UNSHARE_RANGE, seg_off,
>> + seg_size);
>> + if (ret != 0) {
>> + fprintf(stderr, "UNSHARE_RANGE failed %s\n",
>> + strerror(errno));
>> + break;
>> + }
> 
> And now we unshare the source file. This blocks all IO to the source
> file.

Yes, even fetching data from disk is done while IO is locked.
I am wondering if we can fetch data without locking IO. That’s also why
I added readahead in a later patch.

> 
> Ok, so this is the fundamental problem this whole "segmented
> defrag" is trying to work around: FALLOC_FL_UNSHARE_RANGE blocks
> all read and write IO whilst it is in progress.
> 
> We had this same problem with FICLONERANGE taking snapshots of VM
> files - we changed the locking to take shared IO locks to allow
> reads to run while the clone was in progress. Because the Oracle Vm
> infrastructure uses a sidecar to redirect writes while a snapshot
> (clone) was in progress, no VM IO got blocked while the clone was in
> progress and so the applications inside the VM never even noticed a
> clone was taking place.
> 
> Why isn't the same infrastructure being used here?
> 
> FALLOC_FL_UNSHARE_RANGE is not changing data, nor is it freeing any
> data blocks. Yes, we are re-writing the data somewhere else, but
> in that case the original data is still intact in it's original
> location on disk and not being freed.
> 
> Hence if a read races with UNSHARE, it will hit a referenced extent
> containing the correct data regardless of whether it is in the old
> or new file. Hence we can likely use shared IO locking for UNSHARE,
> just like we do for FICLONERANGE.
> 
> At this point, if the Oracle VM infrastructure uses the sidecar
> write channel whilst the defrag is in progress, this whole algorithm
> simply becomes "for regions with extents smaller than X, clone and
> unshare the region".
> 
> The whole need for "idle time" goes away. The need for segment size
> control largely goes away. The need to tune the defrag algorithm to
> avoid IO latency and/or throughput issues goes away.
> 

As we tested, the write-redirecting doesn’t work well.


>> +
>> + /* for time stats */
>> + time_delta = get_time_delta_us(&t_unshare, &t_punch_hole);
>> + if (time_delta > max_unshare_us)
>> + max_unshare_us = time_delta;
>> +
>> + /*
>> +  * Punch out the original extents we shared to the
>> +  * scratch file so they are returned to free space.
>> +  */
>> + ret = fallocate(scratch_fd,
>> + FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, seg_off,
>> + seg_size);
>> + if (ret != 0) {
>> + fprintf(stderr, "PUNCH_HOLE failed %s\n",
>> + strerror(errno));
>> + break;
>> + }
> 
> This is unnecessary if there is lots of free space. You can leave
> this to the very end of defrag so that the source file defrag
> operation isn't slowed down by cleaning up all the fragmented
> extents....

Yes. Two considerations here:
1. The space use as you mentioned, the temp file might grow huge on a low space system.
2. Punching holes on the temp file doesn’t lock the file under defrag, so with the PUNCH_HOLE,
we need a little bit less sleep time for each segment.

Thanks,
Wengang
diff mbox series

Patch

diff --git a/spaceman/defrag.c b/spaceman/defrag.c
index 175cf461..9f11e36b 100644
--- a/spaceman/defrag.c
+++ b/spaceman/defrag.c
@@ -263,6 +263,40 @@  add_ext:
 	return ret;
 }
 
+/*
+ * check if the segment exceeds EoF.
+ * fix up the clone range and return true if EoF happens,
+ * return false otherwise.
+ */
+static bool
+defrag_clone_eof(struct file_clone_range *clone)
+{
+	off_t delta;
+
+	delta = clone->src_offset + clone->src_length - g_defrag_file_size;
+	if (delta > 0) {
+		clone->src_length = 0; // to the end
+		return true;
+	}
+	return false;
+}
+
+/*
+ * get the time delta since pre_time in ms.
+ * pre_time should contains values fetched by gettimeofday()
+ * cur_time is used to store current time by gettimeofday()
+ */
+static long long
+get_time_delta_us(struct timeval *pre_time, struct timeval *cur_time)
+{
+	long long us;
+
+	gettimeofday(cur_time, NULL);
+	us = (cur_time->tv_sec - pre_time->tv_sec) * 1000000;
+	us += (cur_time->tv_usec - pre_time->tv_usec);
+	return us;
+}
+
 /*
  * defragment a file
  * return 0 if successfully done, 1 otherwise
@@ -273,6 +307,7 @@  defrag_xfs_defrag(char *file_path) {
 	long	nr_seg_defrag = 0, nr_ext_defrag = 0;
 	int	scratch_fd = -1, defrag_fd = -1;
 	char	tmp_file_path[PATH_MAX+1];
+	struct file_clone_range clone;
 	char	*defrag_dir;
 	struct fsxattr	fsx;
 	int	ret = 0;
@@ -296,6 +331,8 @@  defrag_xfs_defrag(char *file_path) {
 		goto out;
 	}
 
+	clone.src_fd = defrag_fd;
+
 	defrag_dir = dirname(file_path);
 	snprintf(tmp_file_path, PATH_MAX, "%s/.xfsdefrag_%d", defrag_dir,
 		getpid());
@@ -309,7 +346,11 @@  defrag_xfs_defrag(char *file_path) {
 	}
 
 	do {
+		struct timeval t_clone, t_unshare, t_punch_hole;
 		struct defrag_segment segment;
+		long long seg_size, seg_off;
+		int time_delta;
+		bool stop;
 
 		ret = defrag_get_next_segment(defrag_fd, &segment);
 		/* no more segments, we are done */
@@ -322,6 +363,79 @@  defrag_xfs_defrag(char *file_path) {
 			ret = 1;
 			break;
 		}
+
+		/* we are done if the segment contains only 1 extent */
+		if (segment.ds_nr < 2)
+			continue;
+
+		/* to bytes */
+		seg_off = segment.ds_offset * 512;
+		seg_size = segment.ds_length * 512;
+
+		clone.src_offset = seg_off;
+		clone.src_length = seg_size;
+		clone.dest_offset = seg_off;
+
+		/* checks for EoF and fix up clone */
+		stop = defrag_clone_eof(&clone);
+		gettimeofday(&t_clone, NULL);
+		ret = ioctl(scratch_fd, FICLONERANGE, &clone);
+		if (ret != 0) {
+			fprintf(stderr, "FICLONERANGE failed %s\n",
+				strerror(errno));
+			break;
+		}
+
+		/* for time stats */
+		time_delta = get_time_delta_us(&t_clone, &t_unshare);
+		if (time_delta > max_clone_us)
+			max_clone_us = time_delta;
+
+		/* for defrag stats */
+		nr_ext_defrag += segment.ds_nr;
+
+		/*
+		 * For the shared range to be unshared via a copy-on-write
+		 * operation in the file to be defragged. This causes the
+		 * file needing to be defragged to have new extents allocated
+		 * and the data to be copied over and written out.
+		 */
+		ret = fallocate(defrag_fd, FALLOC_FL_UNSHARE_RANGE, seg_off,
+				seg_size);
+		if (ret != 0) {
+			fprintf(stderr, "UNSHARE_RANGE failed %s\n",
+				strerror(errno));
+			break;
+		}
+
+		/* for time stats */
+		time_delta = get_time_delta_us(&t_unshare, &t_punch_hole);
+		if (time_delta > max_unshare_us)
+			max_unshare_us = time_delta;
+
+		/*
+		 * Punch out the original extents we shared to the
+		 * scratch file so they are returned to free space.
+		 */
+		ret = fallocate(scratch_fd,
+			FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, seg_off,
+			seg_size);
+		if (ret != 0) {
+			fprintf(stderr, "PUNCH_HOLE failed %s\n",
+				strerror(errno));
+			break;
+		}
+
+		/* for defrag stats */
+		nr_seg_defrag += 1;
+
+		/* for time stats */
+		time_delta = get_time_delta_us(&t_punch_hole, &t_clone);
+		if (time_delta > max_punch_us)
+			max_punch_us = time_delta;
+
+		if (stop)
+			break;
 	} while (true);
 out:
 	if (scratch_fd != -1) {