diff mbox series

[RFC] generic: test reads racing with slow reflink operations

Message ID 20231019200913.GO3195650@frogsfrogsfrogs (mailing list archive)
State Superseded, archived
Headers show
Series [RFC] generic: test reads racing with slow reflink operations | expand

Commit Message

Darrick J. Wong Oct. 19, 2023, 8:09 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

XFS has a rather slow reflink operation.  While a reflink operation is
running, other programs cannot read the contents of the source file,
which is causing latency spikes.  Catherine Hoang wrote a patch to
permit reads, since the source file contents do  not change.  This is a
functionality test for that patch.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 src/Makefile              |    2 
 src/t_reflink_read_race.c |  343 +++++++++++++++++++++++++++++++++++++++++++++
 tests/generic/1953        |   74 ++++++++++
 tests/generic/1953.out    |    6 +
 4 files changed, 424 insertions(+), 1 deletion(-)
 create mode 100644 src/t_reflink_read_race.c
 create mode 100755 tests/generic/1953
 create mode 100644 tests/generic/1953.out

Comments

Christoph Hellwig Oct. 20, 2023, 6:09 a.m. UTC | #1
On Thu, Oct 19, 2023 at 01:09:13PM -0700, Darrick J. Wong wrote:
> permit reads, since the source file contents do  not change.  This is a

duplicate white space before "not"

> +#ifndef FICLONE
> +# define FICLONE	_IOW(0x94, 9, int)
> +#endif

Can't we assume system headers with FICLONE by now?

> +# real QA test starts here
> +_require_scratch_reflink

This needs a _require_scratch before the _require_scratch_reflink.

Otherwise this looks good to me:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Catherine Hoang Oct. 20, 2023, 1:37 p.m. UTC | #2
> On Oct 19, 2023, at 1:09 PM, Darrick J. Wong <djwong@kernel.org> wrote:
> 
> From: Darrick J. Wong <djwong@kernel.org>
> 
> XFS has a rather slow reflink operation.  While a reflink operation is
> running, other programs cannot read the contents of the source file,
> which is causing latency spikes.  Catherine Hoang wrote a patch to
> permit reads, since the source file contents do  not change.  This is a
> functionality test for that patch.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>

Looks good
Reviewed-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
> src/Makefile              |    2 
> src/t_reflink_read_race.c |  343 +++++++++++++++++++++++++++++++++++++++++++++
> tests/generic/1953        |   74 ++++++++++
> tests/generic/1953.out    |    6 +
> 4 files changed, 424 insertions(+), 1 deletion(-)
> create mode 100644 src/t_reflink_read_race.c
> create mode 100755 tests/generic/1953
> create mode 100644 tests/generic/1953.out
> 
> diff --git a/src/Makefile b/src/Makefile
> index 72c8a13007..b5e2b84dae 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -33,7 +33,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
> attr_replace_test swapon mkswap t_attr_corruption t_open_tmpfiles \
> fscrypt-crypt-util bulkstat_null_ocount splice-test chprojid_fail \
> detached_mounts_propagation ext4_resize t_readdir_3 splice2pipe \
> - uuid_ioctl usemem_and_swapoff
> + uuid_ioctl usemem_and_swapoff t_reflink_read_race
> 
> EXTRA_EXECS = dmerror fill2attr fill2fs fill2fs_check scaleread.sh \
>      btrfs_crc32c_forged_name.py popdir.pl popattr.py \
> diff --git a/src/t_reflink_read_race.c b/src/t_reflink_read_race.c
> new file mode 100644
> index 0000000000..acf8f8f73c
> --- /dev/null
> +++ b/src/t_reflink_read_race.c
> @@ -0,0 +1,343 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Race reads with reflink to see if reads continue while we're cloning.
> + * Copyright 2023 Oracle.  All rights reserved.
> + */
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include <linux/fs.h>
> +#include <time.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <fcntl.h>
> +#include <pthread.h>
> +#include <stdio.h>
> +#include <errno.h>
> +#include <string.h>
> +
> +#ifndef FICLONE
> +# define FICLONE _IOW(0x94, 9, int)
> +#endif
> +
> +static pid_t mypid;
> +
> +static FILE *outcome_fp;
> +
> +/* Significant timestamps.  Initialized to zero */
> +static double program_start, clone_start, clone_end;
> +
> +/* Coordinates access to timestamps */
> +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
> +
> +struct readinfo {
> + int fd;
> + unsigned int blocksize;
> + char *descr;
> +};
> +
> +struct cloneinfo {
> + int src_fd, dst_fd;
> +};
> +
> +#define MSEC_PER_SEC 1000
> +#define NSEC_PER_MSEC 1000000
> +
> +/*
> + * Assume that it shouldn't take longer than 100ms for the FICLONE ioctl to
> + * enter the kernel and take locks on an uncontended file.  This is also the
> + * required CLOCK_MONOTONIC granularity.
> + */
> +#define EARLIEST_READ_MS (MSEC_PER_SEC / 10)
> +
> +/*
> + * We want to be reasonably sure that the FICLONE takes long enough that any
> + * read initiated after clone operation locked the source file could have
> + * completed a disk IO before the clone finishes.  Therefore, we require that
> + * the clone operation must take at least 4x the maximum setup time.
> + */
> +#define MINIMUM_CLONE_MS (EARLIEST_READ_MS * 5)
> +
> +static double timespec_to_msec(const struct timespec *ts)
> +{
> + return (ts->tv_sec * (double)MSEC_PER_SEC) +
> +       (ts->tv_nsec / (double)NSEC_PER_MSEC);
> +}
> +
> +static void sleep_ms(unsigned int len)
> +{
> + struct timespec time = {
> + .tv_nsec = len * NSEC_PER_MSEC,
> + };
> +
> + nanosleep(&time, NULL);
> +}
> +
> +static void outcome(const char *str)
> +{
> + fprintf(outcome_fp, "%s\n", str);
> + fflush(outcome_fp);
> +}
> +
> +static void *reader_fn(void *arg)
> +{
> + struct timespec now;
> + struct readinfo *ri = arg;
> + char *buf = malloc(ri->blocksize);
> + loff_t pos = 0;
> + ssize_t copied;
> + int ret;
> +
> + if (!buf) {
> + perror("alloc buffer");
> + goto kill_error;
> + }
> +
> + /* Wait for the FICLONE to start */
> + pthread_mutex_lock(&lock);
> + while (clone_start < program_start) {
> + pthread_mutex_unlock(&lock);
> +#ifdef DEBUG
> + printf("%s read waiting for clone to start; cs=%.2f ps=%.2f\n",
> + ri->descr, clone_start, program_start);
> + fflush(stdout);
> +#endif
> + sleep_ms(1);
> + pthread_mutex_lock(&lock);
> + }
> + pthread_mutex_unlock(&lock);
> + sleep_ms(EARLIEST_READ_MS);
> +
> + /* Read from the file... */
> + while ((copied = read(ri->fd, buf, ri->blocksize)) > 0) {
> + double read_completion;
> +
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime");
> + goto kill_error;
> + }
> + read_completion = timespec_to_msec(&now);
> +
> + /*
> + * If clone_end is still zero, the FICLONE is still running.
> + * If the read completion occurred a safe duration after the
> + * start of the ioctl, then report that as an early read
> + * completion.
> + */
> + pthread_mutex_lock(&lock);
> + if (clone_end < program_start &&
> +    read_completion > clone_start + EARLIEST_READ_MS) {
> + pthread_mutex_unlock(&lock);
> +
> + /* clone still running... */
> + printf("finished %s read early at %.2fms\n",
> + ri->descr,
> + read_completion - program_start);
> + fflush(stdout);
> + outcome("finished read early");
> + goto kill_done;
> + }
> + pthread_mutex_unlock(&lock);
> +
> + sleep_ms(1);
> + pos += copied;
> + }
> + if (copied < 0) {
> + perror("read");
> + goto kill_error;
> + }
> +
> + return NULL;
> +kill_error:
> + outcome("reader error");
> +kill_done:
> + kill(mypid, SIGTERM);
> + return NULL;
> +}
> +
> +static void *clone_fn(void *arg)
> +{
> + struct timespec now;
> + struct cloneinfo *ci = arg;
> + int ret;
> +
> + /* Record the approximate start time of this thread */
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime clone start");
> + goto kill_error;
> + }
> + pthread_mutex_lock(&lock);
> + clone_start = timespec_to_msec(&now);
> + pthread_mutex_unlock(&lock);
> +
> + printf("started clone at %.2fms\n", clone_start - program_start);
> + fflush(stdout);
> +
> + /* Kernel call, only killable with a fatal signal */
> + ret = ioctl(ci->dst_fd, FICLONE, ci->src_fd);
> + if (ret) {
> + perror("FICLONE");
> + goto kill_error;
> + }
> +
> + /* If the ioctl completes, note the completion time */
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime clone end");
> + goto kill_error;
> + }
> +
> + pthread_mutex_lock(&lock);
> + clone_end = timespec_to_msec(&now);
> + pthread_mutex_unlock(&lock);
> +
> + printf("finished clone at %.2fms\n",
> + clone_end - program_start);
> + fflush(stdout);
> +
> + /* Complain if we didn't take long enough to clone. */
> + if (clone_end < clone_start + MINIMUM_CLONE_MS) {
> + printf("clone did not take enough time (%.2fms)\n",
> + clone_end - clone_start);
> + fflush(stdout);
> + outcome("clone too fast");
> + goto kill_done;
> + }
> +
> + outcome("clone finished before any reads");
> + goto kill_done;
> +
> +kill_error:
> + outcome("clone error");
> +kill_done:
> + kill(mypid, SIGTERM);
> + return NULL;
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + struct cloneinfo ci;
> + struct readinfo bufio = { .descr = "buffered" };
> + struct readinfo directio = { .descr = "directio" };
> + struct timespec now;
> + pthread_t clone_thread, bufio_thread, directio_thread;
> + double clockres;
> + int ret;
> +
> + if (argc != 4) {
> + printf("Usage: %s src_file dst_file outcome_file\n", argv[0]);
> + return 1;
> + }
> +
> + mypid = getpid();
> +
> + /* Check for sufficient clock precision. */
> + ret = clock_getres(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_getres MONOTONIC");
> + return 2;
> + }
> + printf("CLOCK_MONOTONIC resolution: %llus %luns\n",
> + (unsigned long long)now.tv_sec,
> + (unsigned long)now.tv_nsec);
> + fflush(stdout);
> +
> + clockres = timespec_to_msec(&now);
> + if (clockres > EARLIEST_READ_MS) {
> + fprintf(stderr, "insufficient CLOCK_MONOTONIC resolution\n");
> + return 2;
> + }
> +
> + /*
> + * Measure program start time since the MONOTONIC time base is not
> + * all that well defined.
> + */
> + ret = clock_gettime(CLOCK_MONOTONIC, &now);
> + if (ret) {
> + perror("clock_gettime MONOTONIC");
> + return 2;
> + }
> + if (now.tv_sec == 0 && now.tv_nsec == 0) {
> + fprintf(stderr, "Uhoh, start time is zero?!\n");
> + return 2;
> + }
> + program_start = timespec_to_msec(&now);
> +
> + outcome_fp = fopen(argv[3], "w");
> + if (!outcome_fp) {
> + perror(argv[3]);
> + return 2;
> + }
> +
> + /* Open separate fds for each thread */
> + ci.src_fd = open(argv[1], O_RDONLY);
> + if (ci.src_fd < 0) {
> + perror(argv[1]);
> + return 2;
> + }
> +
> + ci.dst_fd = open(argv[2], O_RDWR | O_CREAT, 0600);
> + if (ci.dst_fd < 0) {
> + perror(argv[2]);
> + return 2;
> + }
> +
> + bufio.fd = open(argv[1], O_RDONLY);
> + if (bufio.fd < 0) {
> + perror(argv[1]);
> + return 2;
> + }
> + bufio.blocksize = 37;
> +
> + directio.fd = open(argv[1], O_RDONLY | O_DIRECT);
> + if (directio.fd < 0) {
> + perror(argv[1]);
> + return 2;
> + }
> + directio.blocksize = 512;
> +
> + /* Start threads */
> + ret = pthread_create(&clone_thread, NULL, clone_fn, &ci);
> + if (ret) {
> + fprintf(stderr, "create clone thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_create(&bufio_thread, NULL, reader_fn, &bufio);
> + if (ret) {
> + fprintf(stderr, "create bufio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_create(&directio_thread, NULL, reader_fn, &directio);
> + if (ret) {
> + fprintf(stderr, "create directio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + /* Wait for threads */
> + ret = pthread_join(clone_thread, NULL);
> + if (ret) {
> + fprintf(stderr, "join clone thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_join(bufio_thread, NULL);
> + if (ret) {
> + fprintf(stderr, "join bufio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + ret = pthread_join(directio_thread, NULL);
> + if (ret) {
> + fprintf(stderr, "join directio thread: %s\n", strerror(ret));
> + return 2;
> + }
> +
> + printf("Program should have killed itself?\n");
> + outcome("program failed to end early");
> + return 0;
> +}
> diff --git a/tests/generic/1953 b/tests/generic/1953
> new file mode 100755
> index 0000000000..058538e6fe
> --- /dev/null
> +++ b/tests/generic/1953
> @@ -0,0 +1,74 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2023, Oracle and/or its affiliates.  All Rights Reserved.
> +#
> +# FS QA Test No. 1953
> +#
> +# Race file reads with a very slow reflink operation to see if the reads
> +# actually complete while the reflink is ongoing.  This is a functionality
> +# test for XFS commit f3ba4762fa56 "xfs: allow read IO and FICLONE to run
> +# concurrently".
> +#
> +. ./common/preamble
> +_begin_fstest auto clone punch
> +
> +# Import common functions.
> +. ./common/filter
> +. ./common/attr
> +. ./common/reflink
> +
> +# real QA test starts here
> +_require_scratch_reflink
> +_require_cp_reflink
> +_require_xfs_io_command "fpunch"
> +_require_test_program "punch-alternating"
> +_require_test_program "t_reflink_read_race"
> +
> +rm -f "$seqres.full"
> +
> +echo "Format and mount"
> +_scratch_mkfs > "$seqres.full" 2>&1
> +_scratch_mount >> "$seqres.full" 2>&1
> +
> +testdir="$SCRATCH_MNT/test-$seq"
> +mkdir "$testdir"
> +
> +calc_space() {
> + blocks_needed=$(( 2 ** (fnr + 1) ))
> + space_needed=$((blocks_needed * blksz * 5 / 4))
> +}
> +
> +# Figure out the number of blocks that we need to get the reflink runtime above
> +# 1 seconds
> +echo "Create a many-block file"
> +for ((fnr = 1; fnr < 40; fnr++)); do
> + free_blocks=$(stat -f -c '%a' "$testdir")
> + blksz=$(_get_file_block_size "$testdir")
> + space_avail=$((free_blocks * blksz))
> + calc_space
> + test $space_needed -gt $space_avail && \
> + _notrun "Insufficient space for stress test; would only create $blocks_needed extents."
> +
> + off=$(( (2 ** fnr) * blksz))
> + $XFS_IO_PROG -f -c "pwrite -S 0x61 -b 4194304 $off $off" "$testdir/file1" >> "$seqres.full"
> + "$here/src/punch-alternating" "$testdir/file1" >> "$seqres.full"
> +
> + timeout 1s cp --reflink=always "$testdir/file1" "$testdir/garbage" || break
> +done
> +echo "fnr=$fnr" >> $seqres.full
> +
> +echo "Reflink the big file"
> +$here/src/t_reflink_read_race "$testdir/file1" "$testdir/file2" \
> + "$testdir/outcome" &>> $seqres.full
> +
> +if [ ! -e "$testdir/outcome" ]; then
> + echo "Could not set up program"
> +elif grep -q "finished read early" "$testdir/outcome"; then
> + echo "test completed successfully"
> +else
> + cat "$testdir/outcome"
> +fi
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/1953.out b/tests/generic/1953.out
> new file mode 100644
> index 0000000000..8eaacb7ff0
> --- /dev/null
> +++ b/tests/generic/1953.out
> @@ -0,0 +1,6 @@
> +QA output created by 1953
> +Format and mount
> +Create a many-block file
> +Reflink the big file
> +Terminated
> +test completed successfully
Darrick J. Wong Oct. 20, 2023, 3:22 p.m. UTC | #3
On Thu, Oct 19, 2023 at 11:09:49PM -0700, Christoph Hellwig wrote:
> On Thu, Oct 19, 2023 at 01:09:13PM -0700, Darrick J. Wong wrote:
> > permit reads, since the source file contents do  not change.  This is a
> 
> duplicate white space before "not"

Fixed.

> > +#ifndef FICLONE
> > +# define FICLONE	_IOW(0x94, 9, int)
> > +#endif
> 
> Can't we assume system headers with FICLONE by now?

That depends on the long tail of RHEL7.  But you're right, it has been
years now, I'll just kill this hunk and see who complains. :)

> > +# real QA test starts here
> > +_require_scratch_reflink
> 
> This needs a _require_scratch before the _require_scratch_reflink.

Ok, will do.

> Otherwise this looks good to me:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Thanks!

--D
Dave Chinner Oct. 22, 2023, 10:20 p.m. UTC | #4
On Fri, Oct 20, 2023 at 08:22:48AM -0700, Darrick J. Wong wrote:
> On Thu, Oct 19, 2023 at 11:09:49PM -0700, Christoph Hellwig wrote:
> > On Thu, Oct 19, 2023 at 01:09:13PM -0700, Darrick J. Wong wrote:
> > > permit reads, since the source file contents do  not change.  This is a
> > 
> > duplicate white space before "not"
> 
> Fixed.
> 
> > > +#ifndef FICLONE
> > > +# define FICLONE	_IOW(0x94, 9, int)
> > > +#endif
> > 
> > Can't we assume system headers with FICLONE by now?
> 
> That depends on the long tail of RHEL7.  But you're right, it has been
> years now, I'll just kill this hunk and see who complains. :)

*cough*

Just add an autoconf rule, yes?

-Dave.
diff mbox series

Patch

diff --git a/src/Makefile b/src/Makefile
index 72c8a13007..b5e2b84dae 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -33,7 +33,7 @@  LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
 	attr_replace_test swapon mkswap t_attr_corruption t_open_tmpfiles \
 	fscrypt-crypt-util bulkstat_null_ocount splice-test chprojid_fail \
 	detached_mounts_propagation ext4_resize t_readdir_3 splice2pipe \
-	uuid_ioctl usemem_and_swapoff
+	uuid_ioctl usemem_and_swapoff t_reflink_read_race
 
 EXTRA_EXECS = dmerror fill2attr fill2fs fill2fs_check scaleread.sh \
 	      btrfs_crc32c_forged_name.py popdir.pl popattr.py \
diff --git a/src/t_reflink_read_race.c b/src/t_reflink_read_race.c
new file mode 100644
index 0000000000..acf8f8f73c
--- /dev/null
+++ b/src/t_reflink_read_race.c
@@ -0,0 +1,343 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Race reads with reflink to see if reads continue while we're cloning.
+ * Copyright 2023 Oracle.  All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#ifndef FICLONE
+# define FICLONE	_IOW(0x94, 9, int)
+#endif
+
+static pid_t mypid;
+
+static FILE *outcome_fp;
+
+/* Significant timestamps.  Initialized to zero */
+static double program_start, clone_start, clone_end;
+
+/* Coordinates access to timestamps */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+struct readinfo {
+	int fd;
+	unsigned int blocksize;
+	char *descr;
+};
+
+struct cloneinfo {
+	int src_fd, dst_fd;
+};
+
+#define MSEC_PER_SEC	1000
+#define NSEC_PER_MSEC	1000000
+
+/*
+ * Assume that it shouldn't take longer than 100ms for the FICLONE ioctl to
+ * enter the kernel and take locks on an uncontended file.  This is also the
+ * required CLOCK_MONOTONIC granularity.
+ */
+#define EARLIEST_READ_MS	(MSEC_PER_SEC / 10)
+
+/*
+ * We want to be reasonably sure that the FICLONE takes long enough that any
+ * read initiated after clone operation locked the source file could have
+ * completed a disk IO before the clone finishes.  Therefore, we require that
+ * the clone operation must take at least 4x the maximum setup time.
+ */
+#define MINIMUM_CLONE_MS	(EARLIEST_READ_MS * 5)
+
+static double timespec_to_msec(const struct timespec *ts)
+{
+	return (ts->tv_sec * (double)MSEC_PER_SEC) +
+	       (ts->tv_nsec / (double)NSEC_PER_MSEC);
+}
+
+static void sleep_ms(unsigned int len)
+{
+	struct timespec time = {
+		.tv_nsec = len * NSEC_PER_MSEC,
+	};
+
+	nanosleep(&time, NULL);
+}
+
+static void outcome(const char *str)
+{
+	fprintf(outcome_fp, "%s\n", str);
+	fflush(outcome_fp);
+}
+
+static void *reader_fn(void *arg)
+{
+	struct timespec now;
+	struct readinfo *ri = arg;
+	char *buf = malloc(ri->blocksize);
+	loff_t pos = 0;
+	ssize_t copied;
+	int ret;
+
+	if (!buf) {
+		perror("alloc buffer");
+		goto kill_error;
+	}
+
+	/* Wait for the FICLONE to start */
+	pthread_mutex_lock(&lock);
+	while (clone_start < program_start) {
+		pthread_mutex_unlock(&lock);
+#ifdef DEBUG
+		printf("%s read waiting for clone to start; cs=%.2f ps=%.2f\n",
+				ri->descr, clone_start, program_start);
+		fflush(stdout);
+#endif
+		sleep_ms(1);
+		pthread_mutex_lock(&lock);
+	}
+	pthread_mutex_unlock(&lock);
+	sleep_ms(EARLIEST_READ_MS);
+
+	/* Read from the file... */
+	while ((copied = read(ri->fd, buf, ri->blocksize)) > 0) {
+		double read_completion;
+
+		ret = clock_gettime(CLOCK_MONOTONIC, &now);
+		if (ret) {
+			perror("clock_gettime");
+			goto kill_error;
+		}
+		read_completion = timespec_to_msec(&now);
+
+		/*
+		 * If clone_end is still zero, the FICLONE is still running.
+		 * If the read completion occurred a safe duration after the
+		 * start of the ioctl, then report that as an early read
+		 * completion.
+		 */
+		pthread_mutex_lock(&lock);
+		if (clone_end < program_start &&
+		    read_completion > clone_start + EARLIEST_READ_MS) {
+			pthread_mutex_unlock(&lock);
+
+			/* clone still running... */
+			printf("finished %s read early at %.2fms\n",
+					ri->descr,
+					read_completion - program_start);
+			fflush(stdout);
+			outcome("finished read early");
+			goto kill_done;
+		}
+		pthread_mutex_unlock(&lock);
+
+		sleep_ms(1);
+		pos += copied;
+	}
+	if (copied < 0) {
+		perror("read");
+		goto kill_error;
+	}
+
+	return NULL;
+kill_error:
+	outcome("reader error");
+kill_done:
+	kill(mypid, SIGTERM);
+	return NULL;
+}
+
+static void *clone_fn(void *arg)
+{
+	struct timespec now;
+	struct cloneinfo *ci = arg;
+	int ret;
+
+	/* Record the approximate start time of this thread */
+	ret = clock_gettime(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_gettime clone start");
+		goto kill_error;
+	}
+	pthread_mutex_lock(&lock);
+	clone_start = timespec_to_msec(&now);
+	pthread_mutex_unlock(&lock);
+
+	printf("started clone at %.2fms\n", clone_start - program_start);
+	fflush(stdout);
+
+	/* Kernel call, only killable with a fatal signal */
+	ret = ioctl(ci->dst_fd, FICLONE, ci->src_fd);
+	if (ret) {
+		perror("FICLONE");
+		goto kill_error;
+	}
+
+	/* If the ioctl completes, note the completion time */
+	ret = clock_gettime(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_gettime clone end");
+		goto kill_error;
+	}
+
+	pthread_mutex_lock(&lock);
+	clone_end = timespec_to_msec(&now);
+	pthread_mutex_unlock(&lock);
+
+	printf("finished clone at %.2fms\n",
+			clone_end - program_start);
+	fflush(stdout);
+
+	/* Complain if we didn't take long enough to clone. */
+	if (clone_end < clone_start + MINIMUM_CLONE_MS) {
+		printf("clone did not take enough time (%.2fms)\n",
+				clone_end - clone_start);
+		fflush(stdout);
+		outcome("clone too fast");
+		goto kill_done;
+	}
+
+	outcome("clone finished before any reads");
+	goto kill_done;
+
+kill_error:
+	outcome("clone error");
+kill_done:
+	kill(mypid, SIGTERM);
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	struct cloneinfo ci;
+	struct readinfo bufio = { .descr = "buffered" };
+	struct readinfo directio = { .descr = "directio" };
+	struct timespec now;
+	pthread_t clone_thread, bufio_thread, directio_thread;
+	double clockres;
+	int ret;
+
+	if (argc != 4) {
+		printf("Usage: %s src_file dst_file outcome_file\n", argv[0]);
+		return 1;
+	}
+
+	mypid = getpid();
+
+	/* Check for sufficient clock precision. */
+	ret = clock_getres(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_getres MONOTONIC");
+		return 2;
+	}
+	printf("CLOCK_MONOTONIC resolution: %llus %luns\n",
+			(unsigned long long)now.tv_sec,
+			(unsigned long)now.tv_nsec);
+	fflush(stdout);
+
+	clockres = timespec_to_msec(&now);
+	if (clockres > EARLIEST_READ_MS) {
+		fprintf(stderr, "insufficient CLOCK_MONOTONIC resolution\n");
+		return 2;
+	}
+
+	/*
+	 * Measure program start time since the MONOTONIC time base is not
+	 * all that well defined.
+	 */
+	ret = clock_gettime(CLOCK_MONOTONIC, &now);
+	if (ret) {
+		perror("clock_gettime MONOTONIC");
+		return 2;
+	}
+	if (now.tv_sec == 0 && now.tv_nsec == 0) {
+		fprintf(stderr, "Uhoh, start time is zero?!\n");
+		return 2;
+	}
+	program_start = timespec_to_msec(&now);
+
+	outcome_fp = fopen(argv[3], "w");
+	if (!outcome_fp) {
+		perror(argv[3]);
+		return 2;
+	}
+
+	/* Open separate fds for each thread */
+	ci.src_fd = open(argv[1], O_RDONLY);
+	if (ci.src_fd < 0) {
+		perror(argv[1]);
+		return 2;
+	}
+
+	ci.dst_fd = open(argv[2], O_RDWR | O_CREAT, 0600);
+	if (ci.dst_fd < 0) {
+		perror(argv[2]);
+		return 2;
+	}
+
+	bufio.fd = open(argv[1], O_RDONLY);
+	if (bufio.fd < 0) {
+		perror(argv[1]);
+		return 2;
+	}
+	bufio.blocksize = 37;
+
+	directio.fd = open(argv[1], O_RDONLY | O_DIRECT);
+	if (directio.fd < 0) {
+		perror(argv[1]);
+		return 2;
+	}
+	directio.blocksize = 512;
+
+	/* Start threads */
+	ret = pthread_create(&clone_thread, NULL, clone_fn, &ci);
+	if (ret) {
+		fprintf(stderr, "create clone thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_create(&bufio_thread, NULL, reader_fn, &bufio);
+	if (ret) {
+		fprintf(stderr, "create bufio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_create(&directio_thread, NULL, reader_fn, &directio);
+	if (ret) {
+		fprintf(stderr, "create directio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	/* Wait for threads */
+	ret = pthread_join(clone_thread, NULL);
+	if (ret) {
+		fprintf(stderr, "join clone thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_join(bufio_thread, NULL);
+	if (ret) {
+		fprintf(stderr, "join bufio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	ret = pthread_join(directio_thread, NULL);
+	if (ret) {
+		fprintf(stderr, "join directio thread: %s\n", strerror(ret));
+		return 2;
+	}
+
+	printf("Program should have killed itself?\n");
+	outcome("program failed to end early");
+	return 0;
+}
diff --git a/tests/generic/1953 b/tests/generic/1953
new file mode 100755
index 0000000000..058538e6fe
--- /dev/null
+++ b/tests/generic/1953
@@ -0,0 +1,74 @@ 
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023, Oracle and/or its affiliates.  All Rights Reserved.
+#
+# FS QA Test No. 1953
+#
+# Race file reads with a very slow reflink operation to see if the reads
+# actually complete while the reflink is ongoing.  This is a functionality
+# test for XFS commit f3ba4762fa56 "xfs: allow read IO and FICLONE to run
+# concurrently".
+#
+. ./common/preamble
+_begin_fstest auto clone punch
+
+# Import common functions.
+. ./common/filter
+. ./common/attr
+. ./common/reflink
+
+# real QA test starts here
+_require_scratch_reflink
+_require_cp_reflink
+_require_xfs_io_command "fpunch"
+_require_test_program "punch-alternating"
+_require_test_program "t_reflink_read_race"
+
+rm -f "$seqres.full"
+
+echo "Format and mount"
+_scratch_mkfs > "$seqres.full" 2>&1
+_scratch_mount >> "$seqres.full" 2>&1
+
+testdir="$SCRATCH_MNT/test-$seq"
+mkdir "$testdir"
+
+calc_space() {
+	blocks_needed=$(( 2 ** (fnr + 1) ))
+	space_needed=$((blocks_needed * blksz * 5 / 4))
+}
+
+# Figure out the number of blocks that we need to get the reflink runtime above
+# 1 seconds
+echo "Create a many-block file"
+for ((fnr = 1; fnr < 40; fnr++)); do
+	free_blocks=$(stat -f -c '%a' "$testdir")
+	blksz=$(_get_file_block_size "$testdir")
+	space_avail=$((free_blocks * blksz))
+	calc_space
+	test $space_needed -gt $space_avail && \
+		_notrun "Insufficient space for stress test; would only create $blocks_needed extents."
+
+	off=$(( (2 ** fnr) * blksz))
+	$XFS_IO_PROG -f -c "pwrite -S 0x61 -b 4194304 $off $off" "$testdir/file1" >> "$seqres.full"
+	"$here/src/punch-alternating" "$testdir/file1" >> "$seqres.full"
+
+	timeout 1s cp --reflink=always "$testdir/file1" "$testdir/garbage" || break
+done
+echo "fnr=$fnr" >> $seqres.full
+
+echo "Reflink the big file"
+$here/src/t_reflink_read_race "$testdir/file1" "$testdir/file2" \
+	"$testdir/outcome" &>> $seqres.full
+
+if [ ! -e "$testdir/outcome" ]; then
+	echo "Could not set up program"
+elif grep -q "finished read early" "$testdir/outcome"; then
+	echo "test completed successfully"
+else
+	cat "$testdir/outcome"
+fi
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/1953.out b/tests/generic/1953.out
new file mode 100644
index 0000000000..8eaacb7ff0
--- /dev/null
+++ b/tests/generic/1953.out
@@ -0,0 +1,6 @@ 
+QA output created by 1953
+Format and mount
+Create a many-block file
+Reflink the big file
+Terminated
+test completed successfully