[6/9] spaceman/defrag: workaround kernel xfs_reflink_try_clear_inode_flag()

Message ID 20240709191028.2329-7-wen.gang.wang@oracle.com
State Accepted, archived
Series introduce defrag to xfs_spaceman

Commit Message

Wengang Wang July 9, 2024, 7:10 p.m. UTC
xfs_reflink_try_clear_inode_flag() takes a very long time when a file has a
huge number of extents and none of them are shared.

Workaround:
share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
quickly, saving CPU time and speeding up defrag significantly.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
---
 spaceman/defrag.c | 174 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 172 insertions(+), 2 deletions(-)
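
[For context, the core of this workaround is a single FICLONERANGE call that
shares a small range at the head of the target file with a scratch file at a
far-away offset. A minimal standalone sketch of that call follows; the paths
and constants are illustrative only, not taken from the patch, and the ranges
must be filesystem-block aligned:]

#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

int main(void)
{
	/* hypothetical paths on a reflink-enabled XFS filesystem */
	int src = open("/mnt/xfs/bigfile", O_RDONLY);
	int dst = open("/mnt/xfs/.defrag-scratch", O_RDWR | O_CREAT, 0600);
	struct file_clone_range clone = {
		.src_fd      = src,
		.src_offset  = 0,
		.src_length  = 32768,             /* SHARE_MAX_SIZE */
		.dest_offset = 0x4000000000000LL, /* OFFSET_1PB */
	};

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}
	/* clone (share) the head of bigfile into the scratch file */
	if (ioctl(dst, FICLONERANGE, &clone)) {
		fprintf(stderr, "FICLONERANGE: %s\n", strerror(errno));
		return 1;
	}
	return 0;
}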

Comments

Darrick J. Wong July 9, 2024, 8:51 p.m. UTC | #1
On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote:
> xfs_reflink_try_clear_inode_flag() takes a very long time when a file has a
> huge number of extents and none of them are shared.
> 
> Workaround:
> share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
> quickly, saving CPU time and speeding up defrag significantly.

I wonder if a better solution would be to change xfs_reflink_unshare
only to try to clear the reflink iflag if offset/len cover the entire
file?  It's a pity we can't set time budgets on fallocate requests.

--D
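
[The suggested gate reduces to a predicate like the one below. This is a
sketch of the idea only; the function name and signature are hypothetical,
not xfs_reflink_unshare()'s actual internals:]

#include <stdbool.h>
#include <sys/types.h>

/*
 * Only attempt the expensive "are any extents still shared?" scan
 * when the unshare request spanned the whole file; partial unshares
 * (like defrag's per-segment calls) would skip it entirely.
 */
static bool
worth_trying_clear_reflink_flag(off_t offset, off_t len, off_t isize)
{
	return offset == 0 && offset + len >= isize;
}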

Wengang Wang July 11, 2024, 11:11 p.m. UTC | #2
> On Jul 9, 2024, at 1:51 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote:
>> xfs_reflink_try_clear_inode_flag() takes a very long time when a file has a
>> huge number of extents and none of them are shared.
>> 
>> Workaround:
>> share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
>> quickly, saving CPU time and speeding up defrag significantly.
> 
> I wonder if a better solution would be to change xfs_reflink_unshare
> only to try to clear the reflink iflag if offset/len cover the entire
> file?  It's a pity we can't set time budgets on fallocate requests.

Yep.
Anyway, that change, if it happens, would be in the kernel.
Meanwhile, the -n option can disable this workaround in defrag.

Thanks,
Wengang

Dave Chinner July 16, 2024, 12:25 a.m. UTC | #3
On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote:
> xfs_reflink_try_clear_inode_flag() takes a very long time when a file has a
> huge number of extents and none of them are shared.

Got a kernel profile showing how bad it is?

> 
> Workaround:
> share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
> quickly, saving CPU time and speeding up defrag significantly.

That's nasty.

Let's fix the kernel code, not work around it in userspace.

I mean, it would be really easy to store if an extent is shared in
the iext btree record for the extent. If we do an unshare operation,
just do a single "find shared extents" pass on the extent tree and
mark all the extents that are shared as shared.  Then set a flag on
the data fork saying it is tracking shared extents, and so when we
share/unshare extents in that inode from then on, we set/clear that
flag in the iext record. (i.e. it's an in-memory equivalent of the
UNWRITTEN state flag).

Then after the first unshare, checking for nothing being shared is a
walk of the iext btree over the given range, not a refcountbt
walk. That should be much faster.

And we could make it even faster by adding a "shared extents"
counter to the inode fork. i.e. the first scan that sets the flags
also counts the shared extents, and we maintain that as we maintain
the in-memory extent flags....

That makes the cost of xfs_reflink_try_clear_inode_flag() basically
go to zero in these sorts of workloads. IMO, this is a much better
solution to the problem than hacking around it in userspace...
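
[A toy userspace model of that bookkeeping shows why the "anything still
shared?" check becomes O(1). The structures below are illustrative stand-ins,
not the real iext btree:]

#include <stdbool.h>

/* illustrative in-memory extent record with a shared-state flag */
struct toy_extent {
	bool			shared;
};

/* illustrative inode fork tracking shared extents and their count */
struct toy_fork {
	struct toy_extent	*extents;
	int			nr_extents;
	int			nr_shared;	/* maintained on share/unshare */
};

/* update one extent's shared state, keeping the counter in sync */
static void
toy_set_shared(struct toy_fork *fork, int idx, bool shared)
{
	if (fork->extents[idx].shared == shared)
		return;
	fork->extents[idx].shared = shared;
	fork->nr_shared += shared ? 1 : -1;
}

/* the check xfs_reflink_try_clear_inode_flag() needs: now O(1) */
static bool
toy_any_shared(const struct toy_fork *fork)
{
	return fork->nr_shared > 0;
}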

-Dave.
Wengang Wang July 18, 2024, 6:24 p.m. UTC | #4
> On Jul 15, 2024, at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote:
> 
> On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote:
>> xfs_reflink_try_clear_inode_flag() takes a very long time when a file has a
>> huge number of extents and none of them are shared.
> 
> Got a kernel profile showing how bad it is?

It was more than 1.5 seconds (based on 6.4 million extents) when I added debug code to measure it.

> 
>> 
>> Workaround:
>> share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
>> quickly, saving CPU time and speeding up defrag significantly.
> 
> That's nasty.
> 
> Let's fix the kernel code, not work around it in userspace.
> 
> I mean, it would be really easy to store if an extent is shared in
> the iext btree record for the extent. If we do an unshare operation,
> just do a single "find shared extents" pass on the extent tree and
> mark all the extents that are shared as shared.  Then set a flag on
> the data fork saying it is tracking shared extents, and so when we
> share/unshare extents in that inode from then on, we set/clear that
> flag in the iext record. (i.e. it's an in-memory equivalent of the
> UNWRITTEN state flag).
> 
> Then after the first unshare, checking for nothing being shared is a
> walk of the iext btree over the given range, not a refcountbt
> walk. That should be much faster.
> 
> And we could make it even faster by adding a "shared extents"
> counter to the inode fork. i.e. the first scan that sets the flags
> also counts the shared extents, and we maintain that as we maintain
> the in-memory extent flags....
> 
> That makes the cost of xfs_reflink_try_clear_inode_flag() basically
> go to zero in these sorts of workloads. IMO, this is a much better
> solution to the problem than hacking around it in userspace...
> 

Yes, fixing it in the kernel is the best way to go.
One consideration, though, is that customers don't run the upstream
kernel. They might run a much older version, and some customers don't
want kernel upgrades when there are no security issues.
So can we have both?
1. Try to fix the kernel, and
2. Keep the workaround in defrag userspace?

Thanks,
Wengang
Dave Chinner July 31, 2024, 10:25 p.m. UTC | #5
On Tue, Jul 09, 2024 at 12:10:25PM -0700, Wengang Wang wrote:
> xfs_reflink_try_clear_inode_flag() takes a very long time when a file has a
> huge number of extents and none of them are shared.
> 
> Workaround:
> share the first real extent so that xfs_reflink_try_clear_inode_flag() returns
> quickly, saving CPU time and speeding up defrag significantly.
> 
> Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
> ---
>  spaceman/defrag.c | 174 +++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 172 insertions(+), 2 deletions(-)

I had some insight on this late last night. The source of the issue
is that both the kernel and the defrag algorithm are walking
forwards across the file. Hence, as defrag gets to higher offsets in
the file and unshares shared ranges, we keep moving the first shared
range higher in the file.

Hence the act of unsharing the file in ascending offset order
causes the kernel's ascending-offset search for shared extents
to take progressively longer.

The solution to this is to make defrag work backwards through the
file, so it leaves the low-offset shared extents intact for the
kernel to find until the defrag process unshares them, at which
point the kernel will clear the reflink flag and the searching
stops.

IOWs, we either need to change the kernel code to search for shared
extents in reverse order, or change the defrag operation to work in
reverse sequential order; then the performance problem of unsharing
having to determine whether the file being defragged is still
shared goes away.
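
[A minimal sketch of the reverse-order userspace variant;
defrag_one_segment() below is a hypothetical stand-in for the tool's
per-segment clone/unshare/punch-hole cycle:]

#include <sys/types.h>

/* hypothetical stand-in for one clone/unshare/punch-hole pass */
static void
defrag_one_segment(off_t offset, off_t length)
{
	(void)offset;
	(void)length;
	/* clone to scratch, unshare, punch scratch blocks ... */
}

/*
 * Walk segments from EOF back towards offset 0, leaving the
 * lowest-offset shared extent in place until last so the kernel's
 * ascending search for shared extents terminates quickly.
 */
static void
defrag_backwards(off_t file_size, off_t segment_size)
{
	off_t	offset;

	if (file_size <= 0 || segment_size <= 0)
		return;
	for (offset = (file_size - 1) / segment_size * segment_size;
	     offset >= 0;
	     offset -= segment_size)
		defrag_one_segment(offset, segment_size);
}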

That said, I still think that the fact defrag completely unshares
the source file is wrong. If we leave shared extents intact as we
defrag the file, then this problem doesn't need solving at all,
because xfs_reflink_try_clear_inode_flag() will hit the same shared
extent near the front of the file every time...

-Dave.

Patch

diff --git a/spaceman/defrag.c b/spaceman/defrag.c
index f8e6713c..b5c5b187 100644
--- a/spaceman/defrag.c
+++ b/spaceman/defrag.c
@@ -327,6 +327,155 @@  defrag_fs_limit_hit(int fd)
 	return statfs_s.f_bsize * statfs_s.f_bavail < g_limit_free_bytes;
 }
 
+static bool g_enable_first_ext_share = true;
+
+static int
+defrag_get_first_real_ext(int fd, struct getbmapx *mapx)
+{
+	int			err;
+
+	while (1) {
+		err = defrag_get_next_extent(fd, mapx);
+		if (err)
+			break;
+
+		defrag_move_next_extent();
+		if (!(mapx->bmv_oflags & BMV_OF_PREALLOC))
+			break;
+	}
+	return err;
+}
+
+static __u64 g_share_offset = -1ULL;
+static __u64 g_share_len = 0ULL;
+#define SHARE_MAX_SIZE 32768  /* 32KiB */
+
+/* share the first real extent with the scratch file */
+static void
+defrag_share_first_extent(int defrag_fd, int scratch_fd)
+{
+#define OFFSET_1PB 0x4000000000000LL
+	struct file_clone_range clone;
+	struct getbmapx mapx;
+	int	err;
+
+	if (g_enable_first_ext_share == false)
+		return;
+
+	err = defrag_get_first_real_ext(defrag_fd, &mapx);
+	if (err)
+		return;
+
+	clone.src_fd = defrag_fd;
+	clone.src_offset = mapx.bmv_offset * 512;
+	clone.src_length = mapx.bmv_length * 512;
+	/* shares at most SHARE_MAX_SIZE length */
+	if (clone.src_length > SHARE_MAX_SIZE)
+		clone.src_length = SHARE_MAX_SIZE;
+	clone.dest_offset = OFFSET_1PB + clone.src_offset;
+	/* if the first extent reaches EOF, no need to share */
+	if (clone.src_offset + clone.src_length >= g_defrag_file_size)
+		return;
+	err = ioctl(scratch_fd, FICLONERANGE, &clone);
+	if (err != 0) {
+		fprintf(stderr, "cloning first extent failed: %s\n",
+			strerror(errno));
+		return;
+	}
+
+	/* save the offset and length for re-share */
+	g_share_offset = clone.src_offset;
+	g_share_len = clone.src_length;
+}
+
+/* re-share the previously shared blocks if they are no longer shared */
+static void
+defrag_reshare_blocks_in_front(int defrag_fd, int scratch_fd)
+{
+#define NR_GET_EXT 9
+	struct getbmapx mapx[NR_GET_EXT];
+	struct file_clone_range clone;
+	__u64	new_share_len;
+	int	idx, err;
+
+	if (g_enable_first_ext_share == false)
+		return;
+
+	if (g_share_len == 0ULL)
+		return;
+
+	/*
+	 * Check whether the previous sharing still exists; we are
+	 * done if it does, even partially.
+	 */
+	mapx[0].bmv_offset = g_share_offset / 512; /* getbmapx units: 512B */
+	mapx[0].bmv_length = g_share_len / 512;
+	mapx[0].bmv_count = NR_GET_EXT;
+	mapx[0].bmv_iflags = BMV_IF_NO_HOLES | BMV_IF_PREALLOC;
+	err = ioctl(defrag_fd, XFS_IOC_GETBMAPX, mapx);
+	if (err) {
+		fprintf(stderr, "XFS_IOC_GETBMAPX failed %s\n",
+			strerror(errno));
+		/* won't try share again */
+		g_share_len = 0ULL;
+		return;
+	}
+
+	if (mapx[0].bmv_entries == 0) {
+		/* shared blocks all became hole, won't try share again */
+		g_share_len = 0ULL;
+		return;
+	}
+
+	if (g_share_offset != 512 * mapx[1].bmv_offset) {
+		/* first shared block became hole, won't try share again */
+		g_share_len = 0ULL;
+		return;
+	}
+
+	/* we check only up to the first NR_GET_EXT - 1 extents */
+	for (idx = 1; idx <= mapx[0].bmv_entries; idx++) {
+		if (mapx[idx].bmv_oflags & BMV_OF_SHARED) {
+			/* some blocks still shared, done */
+			return;
+		}
+	}
+
+	/*
+	 * The previously shared blocks are no longer shared; re-share them.
+	 * Deallocate the blocks in the scratch file first.
+	 */
+	err = fallocate(scratch_fd,
+		FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
+		OFFSET_1PB + g_share_offset, g_share_len);
+	if (err != 0) {
+		fprintf(stderr, "punch hole failed %s\n",
+			strerror(errno));
+		g_share_len = 0;
+		return;
+	}
+
+	new_share_len = 512 * mapx[1].bmv_length;
+	if (new_share_len > SHARE_MAX_SIZE)
+		new_share_len = SHARE_MAX_SIZE;
+
+	clone.src_fd = defrag_fd;
+	/* keep starting offset unchanged */
+	clone.src_offset = g_share_offset;
+	clone.src_length = new_share_len;
+	clone.dest_offset = OFFSET_1PB + clone.src_offset;
+
+	err = ioctl(scratch_fd, FICLONERANGE, &clone);
+	if (err) {
+		fprintf(stderr, "FICLONERANGE failed %s\n",
+			strerror(errno));
+		g_share_len = 0;
+		return;
+	}
+
+	g_share_len = new_share_len;
+}
+
 /*
  * defragment a file
  * return 0 if successfully done, 1 otherwise
@@ -377,6 +526,12 @@  defrag_xfs_defrag(char *file_path) {
 
 	signal(SIGINT, defrag_sigint_handler);
 
+	/*
+	 * share the first extent to work around the time the kernel
+	 * consumes in xfs_reflink_try_clear_inode_flag()
+	 */
+	defrag_share_first_extent(defrag_fd, scratch_fd);
+
 	do {
 		struct timeval t_clone, t_unshare, t_punch_hole;
 		struct defrag_segment segment;
@@ -454,6 +609,15 @@  defrag_xfs_defrag(char *file_path) {
 		if (time_delta > max_unshare_us)
 			max_unshare_us = time_delta;
 
+		/*
+		 * If unshare took more than 1 second, the time was most
+		 * likely spent checking whether the file still shares any
+		 * extents. Re-share the blocks at the front of the file
+		 * to avoid that happening again.
+		 */
+		if (time_delta > 1000000)
+			defrag_reshare_blocks_in_front(defrag_fd, scratch_fd);
+
 		/*
 		 * Punch out the original extents we shared to the
 		 * scratch file so they are returned to free space.
@@ -514,6 +678,8 @@  static void defrag_help(void)
 " -f free_space      -- specify shrethod of the XFS free space in MiB, when\n"
 "                       XFS free space is lower than that, shared segments \n"
 "                       are excluded from defragmentation, 1024 by default\n"
+" -n                 -- disable the \"share first extent\" feature, it's\n"
+"                       enabled by default to speed up defrag\n"
 	));
 }
 
@@ -525,7 +691,7 @@  defrag_f(int argc, char **argv)
 	int	i;
 	int	c;
 
-	while ((c = getopt(argc, argv, "s:f:")) != EOF) {
+	while ((c = getopt(argc, argv, "s:f:n")) != EOF) {
 		switch(c) {
 		case 's':
 			g_segment_size_lmt = atoi(optarg) * 1024 * 1024 / 512;
@@ -539,6 +705,10 @@  defrag_f(int argc, char **argv)
 			g_limit_free_bytes = atol(optarg) * 1024 * 1024;
 			break;
 
+		case 'n':
+			g_enable_first_ext_share = false;
+			break;
+
 		default:
 			command_usage(&defrag_cmd);
 			return 1;
@@ -556,7 +726,7 @@  void defrag_init(void)
 	defrag_cmd.cfunc	= defrag_f;
 	defrag_cmd.argmin	= 0;
 	defrag_cmd.argmax	= 4;
-	defrag_cmd.args		= "[-s segment_size] [-f free_space]";
+	defrag_cmd.args		= "[-s segment_size] [-f free_space] [-n]";
 	defrag_cmd.flags	= CMD_FLAG_ONESHOT;
 	defrag_cmd.oneline	= _("Defragment XFS files");
 	defrag_cmd.help		= defrag_help;