diff mbox series

[1/2] btrfs-progs: receive: workaround unaligned full file clone

Message ID 16df770373ded4bf871d9d89f5af1ea7865de896.1727416314.git.wqu@suse.com (mailing list archive)
State New, archived
Headers show
Series btrfs-progs: receive: workaround full file clone quirk | expand

Commit Message

Qu Wenruo Sept. 27, 2024, 5:53 a.m. UTC
[BUG]
The following script can lead to receive failure with latest kernel:

  dev="/dev/test/scratch1"
  mnt="/mnt/btrfs"

  mkfs.btrfs -f $dev
  mount $dev $mnt
  btrfs subv create $mnt/subv1
  xfs_io -f -c "pwrite 0 4000" $mnt/subv1/source
  xfs_io -f -c "reflink $mnt/subv1/source" $mnt/subv1/dest
  btrfs subv snap -r $mnt/subv1 $mnt/ro_subv1
  btrfs subv snap $mnt/subv1 $mnt/snap1
  xfs_io -f -c "pwrite -S 0xff 0 3900" -c "truncate 3900" $mnt/snap1/source
  truncate -s 0 $mnt/snap1/dest
  xfs_io -f -c "reflink $mnt/snap1/source" $mnt/snap1/dest
  btrfs subv snap -r $mnt/snap1 $mnt/ro_snap1
  btrfs send $mnt/ro_subv1 -f /tmp/ro_subv1.stream
  btrfs send -p $mnt/ro_subv1 $mnt/ro_snap1 -f /tmp/ro_snap1.stream
  umount $mnt
  mkfs.btrfs -f $dev
  mount $dev $mnt
  btrfs receive -f /tmp/ro_subv1.stream $mnt
  btrfs receive -f /tmp/ro_snap1.stream $mnt
  At snapshot ro_snap1
  ERROR: failed to clone extents to dest: Invalid argument

[CAUSE]
Since kernel commit 46a6e10a1ab1 ("btrfs: send: allow cloning
non-aligned extent if it ends at i_size"), kernel can send out clone
stream if we're cloning a full file, even if the size of the file is not
sector aligned, like this one:

  snapshot        ./ro_snap1                      uuid=2a3e2b70-c606-d446-b60b-baab458be6da transid=9 parent_uuid=d8ff9b9e-3ffc-6343-b53e-e22f8bbb7c25 parent_transid=7
  write           ./ro_snap1/source               offset=0 len=4700
  truncate        ./ro_snap1/source               size=4700
  utimes          ./ro_snap1/source               atime=2024-09-27T13:08:54+0800 mtime=2024-09-27T13:08:54+0800 ctime=2024-09-27T13:08:54+0800
  clone           ./ro_snap1/dest                 offset=0 len=4700 from=./ro_snap1/source clone_offset=0
  truncate        ./ro_snap1/dest                 size=4700
  utimes          ./ro_snap1/dest                 atime=2024-09-27T13:08:54+0800 mtime=2024-09-27T13:08:54+0800 ctime=2024-09-27T13:08:54+0800

However for the clone command, if the file inside the source subvolume
is larger than the new size, kernel will reject the clone operation, as
the resulted layout may read beyond the EOF of the clone source.

This should be addressed by the kernel, by doing the truncation before
the clone to ensure the destination file is no larger than the source.

[FIX]
It won't hurt for "btrfs receive" command to workaround the
problem, by truncating the destination file first.

Here we choose to truncate the file size to 0, other than the
source/destination file size.
As truncating to an unaligned size can cause the fs to do extra page
dirty and zero the tailing part.

Since we know it's a full file clone, truncating the file to size 0 will
avoid the extra page dirty, and allow the later clone to be done.

Reported-by: Ben Millwood <thebenmachine@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CAJhrHS2z+WViO2h=ojYvBPDLsATwLbg+7JaNCyYomv0fUxEpQQ@mail.gmail.com/
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
 cmds/receive.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

Comments

Filipe Manana Sept. 27, 2024, 8:14 a.m. UTC | #1
On Fri, Sep 27, 2024 at 6:54 AM Qu Wenruo <wqu@suse.com> wrote:
>
> [BUG]
> The following script can lead to receive failure with latest kernel:
>
>   dev="/dev/test/scratch1"
>   mnt="/mnt/btrfs"
>
>   mkfs.btrfs -f $dev
>   mount $dev $mnt
>   btrfs subv create $mnt/subv1
>   xfs_io -f -c "pwrite 0 4000" $mnt/subv1/source
>   xfs_io -f -c "reflink $mnt/subv1/source" $mnt/subv1/dest
>   btrfs subv snap -r $mnt/subv1 $mnt/ro_subv1
>   btrfs subv snap $mnt/subv1 $mnt/snap1
>   xfs_io -f -c "pwrite -S 0xff 0 3900" -c "truncate 3900" $mnt/snap1/source
>   truncate -s 0 $mnt/snap1/dest
>   xfs_io -f -c "reflink $mnt/snap1/source" $mnt/snap1/dest
>   btrfs subv snap -r $mnt/snap1 $mnt/ro_snap1
>   btrfs send $mnt/ro_subv1 -f /tmp/ro_subv1.stream
>   btrfs send -p $mnt/ro_subv1 $mnt/ro_snap1 -f /tmp/ro_snap1.stream
>   umount $mnt
>   mkfs.btrfs -f $dev
>   mount $dev $mnt
>   btrfs receive -f /tmp/ro_subv1.stream $mnt
>   btrfs receive -f /tmp/ro_snap1.stream $mnt
>   At snapshot ro_snap1
>   ERROR: failed to clone extents to dest: Invalid argument
>
> [CAUSE]
> Since kernel commit 46a6e10a1ab1 ("btrfs: send: allow cloning
> non-aligned extent if it ends at i_size"), kernel can send out clone
> stream if we're cloning a full file, even if the size of the file is not
> sector aligned, like this one:
>
>   snapshot        ./ro_snap1                      uuid=2a3e2b70-c606-d446-b60b-baab458be6da transid=9 parent_uuid=d8ff9b9e-3ffc-6343-b53e-e22f8bbb7c25 parent_transid=7
>   write           ./ro_snap1/source               offset=0 len=4700
>   truncate        ./ro_snap1/source               size=4700
>   utimes          ./ro_snap1/source               atime=2024-09-27T13:08:54+0800 mtime=2024-09-27T13:08:54+0800 ctime=2024-09-27T13:08:54+0800
>   clone           ./ro_snap1/dest                 offset=0 len=4700 from=./ro_snap1/source clone_offset=0
>   truncate        ./ro_snap1/dest                 size=4700
>   utimes          ./ro_snap1/dest                 atime=2024-09-27T13:08:54+0800 mtime=2024-09-27T13:08:54+0800 ctime=2024-09-27T13:08:54+0800
>
> However for the clone command, if the file inside the source subvolume
> is larger than the new size, kernel will reject the clone operation, as
> the resulted layout may read beyond the EOF of the clone source.
>
> This should be addressed by the kernel, by doing the truncation before
> the clone to ensure the destination file is no larger than the source.
>
> [FIX]
> It won't hurt for "btrfs receive" command to workaround the
> problem, by truncating the destination file first.
>
> Here we choose to truncate the file size to 0, other than the
> source/destination file size.
> As truncating to an unaligned size can cause the fs to do extra page
> dirty and zero the tailing part.
>
> Since we know it's a full file clone, truncating the file to size 0 will
> avoid the extra page dirty, and allow the later clone to be done.
>
> Reported-by: Ben Millwood <thebenmachine@gmail.com>
> Link: https://lore.kernel.org/linux-btrfs/CAJhrHS2z+WViO2h=ojYvBPDLsATwLbg+7JaNCyYomv0fUxEpQQ@mail.gmail.com/
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>  cmds/receive.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/cmds/receive.c b/cmds/receive.c
> index 4cc5b9009442..9bda5485d895 100644
> --- a/cmds/receive.c
> +++ b/cmds/receive.c
> @@ -56,6 +56,8 @@
>  #include "cmds/commands.h"
>  #include "cmds/receive-dump.h"
>
> +static u32 g_sectorsize;
> +
>  struct btrfs_receive
>  {
>         int mnt_fd;
> @@ -739,6 +741,7 @@ static int process_clone(const char *path, u64 offset, u64 len,
>         struct btrfs_receive *rctx = user;
>         struct btrfs_ioctl_clone_range_args clone_args;
>         struct subvol_info *si = NULL;
> +       struct stat st = { 0 };
>         char full_path[PATH_MAX];
>         const char *subvol_path;
>         char full_clone_path[PATH_MAX];
> @@ -802,11 +805,36 @@ static int process_clone(const char *path, u64 offset, u64 len,
>                 error("cannot open %s: %m", full_clone_path);
>                 goto out;
>         }
> +       ret = fstat(clone_fd, &st);
> +       if (ret < 0) {
> +               errno = -ret;
> +               error("cannot stat %s: %m", full_clone_path);
> +               goto out;
> +       }
>
>         if (bconf.verbose >= 2)
>                 fprintf(stderr,
>                         "clone %s - source=%s source offset=%llu offset=%llu length=%llu\n",
>                         path, clone_path, clone_offset, offset, len);
> +       /*
> +        * Since kernel commit 46a6e10a1ab1 ("btrfs: send: allow cloning
> +        * non-aligned extent if it ends at i_size"), we can have send
> +        * stream cloning a full file even its size is not sector aligned.
> +        *
> +        * But if we're cloning this unaligned range into an existing file,
> +        * which has a larger i_size, the clone will fail.
> +        *
> +        * Address this quirk by introducing an extra truncation to size 0.

No, this is incorrect and will cause data loss in case the file has
multiple extents, and we're trying to clone the last one which is not
sector size aligned.
The assumption that this is always for a full file, consisting of a
single extent is wrong.

The problem is easy to fix in the kernel and it's where it should
always be fixed - any problem with the order of operations in the send
stream should be fixed in the kernel.
We've had at least one problem in the past by adding this type of
workarounds to btrfs-progs when the problem should be fixed in the
kernel (a case with special xattrs).

I'll send the kernel fix soon.

Thanks.

> +        */
> +       if (clone_offset == 0 && !IS_ALIGNED(len, g_sectorsize) &&
> +           len == st.st_size) {
> +               ret = ftruncate(rctx->write_fd, 0);
> +               if (ret < 0) {
> +                       errno = - ret;
> +                       error("failed to truncate %s: %m", path);
> +                       goto out;
> +               }
> +       }
>
>         clone_args.src_fd = clone_fd;
>         clone_args.src_offset = clone_offset;
> @@ -1468,6 +1496,18 @@ static struct btrfs_send_ops send_ops = {
>         .enable_verity = process_enable_verity,
>  };
>
> +static int get_fs_sectorsize(int fd, u32 *sectorsize_ret)
> +{
> +       struct btrfs_ioctl_fs_info_args args = { 0 };
> +       int ret;
> +
> +       ret = ioctl(fd, BTRFS_IOC_FS_INFO, &args);
> +       if (ret < 0)
> +               return -errno;
> +       *sectorsize_ret = args.sectorsize;
> +       return 0;
> +}
> +
>  static int do_receive(struct btrfs_receive *rctx, const char *tomnt,
>                       char *realmnt, int r_fd, u64 max_errors)
>  {
> @@ -1491,6 +1531,13 @@ static int do_receive(struct btrfs_receive *rctx, const char *tomnt,
>                         dest_dir_full_path);
>                 goto out;
>         }
> +       ret = get_fs_sectorsize(rctx->dest_dir_fd, &g_sectorsize);
> +       if (ret < 0) {
> +               errno = -ret;
> +               error("failed to get sectorsize of the destination directory %s: %m",
> +                       dest_dir_full_path);
> +               goto out;
> +       }
>
>         if (realmnt[0]) {
>                 rctx->root_path = realmnt;
> --
> 2.46.1
>
>
diff mbox series

Patch

diff --git a/cmds/receive.c b/cmds/receive.c
index 4cc5b9009442..9bda5485d895 100644
--- a/cmds/receive.c
+++ b/cmds/receive.c
@@ -56,6 +56,8 @@ 
 #include "cmds/commands.h"
 #include "cmds/receive-dump.h"
 
+static u32 g_sectorsize;
+
 struct btrfs_receive
 {
 	int mnt_fd;
@@ -739,6 +741,7 @@  static int process_clone(const char *path, u64 offset, u64 len,
 	struct btrfs_receive *rctx = user;
 	struct btrfs_ioctl_clone_range_args clone_args;
 	struct subvol_info *si = NULL;
+	struct stat st = { 0 };
 	char full_path[PATH_MAX];
 	const char *subvol_path;
 	char full_clone_path[PATH_MAX];
@@ -802,11 +805,36 @@  static int process_clone(const char *path, u64 offset, u64 len,
 		error("cannot open %s: %m", full_clone_path);
 		goto out;
 	}
+	ret = fstat(clone_fd, &st);
+	if (ret < 0) {
+		errno = -ret;
+		error("cannot stat %s: %m", full_clone_path);
+		goto out;
+	}
 
 	if (bconf.verbose >= 2)
 		fprintf(stderr,
 			"clone %s - source=%s source offset=%llu offset=%llu length=%llu\n",
 			path, clone_path, clone_offset, offset, len);
+	/*
+	 * Since kernel commit 46a6e10a1ab1 ("btrfs: send: allow cloning
+	 * non-aligned extent if it ends at i_size"), we can have send
+	 * stream cloning a full file even its size is not sector aligned.
+	 *
+	 * But if we're cloning this unaligned range into an existing file,
+	 * which has a larger i_size, the clone will fail.
+	 *
+	 * Address this quirk by introducing an extra truncation to size 0.
+	 */
+	if (clone_offset == 0 && !IS_ALIGNED(len, g_sectorsize) &&
+	    len == st.st_size) {
+		ret = ftruncate(rctx->write_fd, 0);
+		if (ret < 0) {
+			errno = - ret;
+			error("failed to truncate %s: %m", path);
+			goto out;
+		}
+	}
 
 	clone_args.src_fd = clone_fd;
 	clone_args.src_offset = clone_offset;
@@ -1468,6 +1496,18 @@  static struct btrfs_send_ops send_ops = {
 	.enable_verity = process_enable_verity,
 };
 
+static int get_fs_sectorsize(int fd, u32 *sectorsize_ret)
+{
+	struct btrfs_ioctl_fs_info_args args = { 0 };
+	int ret;
+
+	ret = ioctl(fd, BTRFS_IOC_FS_INFO, &args);
+	if (ret < 0)
+		return -errno;
+	*sectorsize_ret = args.sectorsize;
+	return 0;
+}
+
 static int do_receive(struct btrfs_receive *rctx, const char *tomnt,
 		      char *realmnt, int r_fd, u64 max_errors)
 {
@@ -1491,6 +1531,13 @@  static int do_receive(struct btrfs_receive *rctx, const char *tomnt,
 			dest_dir_full_path);
 		goto out;
 	}
+	ret = get_fs_sectorsize(rctx->dest_dir_fd, &g_sectorsize);
+	if (ret < 0) {
+		errno = -ret;
+		error("failed to get sectorsize of the destination directory %s: %m",
+			dest_dir_full_path);
+		goto out;
+	}
 
 	if (realmnt[0]) {
 		rctx->root_path = realmnt;