Triggering non-integrity writeback from userspace
diff mbox

Message ID 20151028232641.GS8773@dastard
State New
Headers show

Commit Message

Dave Chinner Oct. 28, 2015, 11:26 p.m. UTC
On Thu, Oct 29, 2015 at 07:48:34AM +1100, Dave Chinner wrote:
> Hi Andres,
> 
> On Wed, Oct 28, 2015 at 10:27:52AM +0100, Andres Freund wrote:
> > On 2015-10-25 08:39:12 +1100, Dave Chinner wrote:
> ....
> > > Data integrity operations require related file metadata (e.g. block
> > > allocation trnascations) to be forced to the journal/disk, and a
> > > device cache flush issued to ensure the data is on stable storage.
> > > SYNC_FILE_RANGE_WRITE does neither of these things, and hence while
> > > the IO might be the same pattern as a data integrity operation, it
> > > does not provide such guarantees.
> > 
> > Which is desired here - the actual integrity is still going to be done
> > via fsync().
> 
> OK, so you require data integrity, but....
> 
> > The idea of using SYNC_FILE_RANGE_WRITE beforehand is that
> > the fsync() will only have to do very little work. The language in
> > sync_file_range(2) doesn't inspire enough confidence for using it as an
> > actual integrity operation :/
> 
> So really you're trying to minimise the blocking/latency of fsync()?
> 
> > > You don't want to do writeback from the syscall, right? i.e. you'd
> > > like to expire the inode behind the fd, and schedule background
> > > writeback to run on it immediately?
> > 
> > Yes, that's exactly what we want. Blocking if a process has done too
> > much writes is fine tho.
> 
> OK, so it's really the latency of the fsync() operation that is what
> you are trying to avoid? I've been meaning to get back to a generic
> implementation of an aio fsync operation:
> 
> http://oss.sgi.com/archives/xfs/2014-06/msg00214.html
> 
> Would that be a better approach to solving your need for a
> non-blocking data integrity flush of a file?

Which was relatively trivial to do. Numbers below come from XFS, I
smoke tested ext4 and it kinda worked but behaviour was very
unpredictable and maxxed out at about 25000 IOPS with max
performance being at 4 threads @ an average of 20000 files/s...

Cheers,

Dave.

Patch
diff mbox

diff --git a/fs/aio.c b/fs/aio.c
index 155f842..19df3ec 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -188,6 +188,19 @@  struct aio_kiocb {
 	struct eventfd_ctx	*ki_eventfd;
 };
 
+/*
+ * Generic async fsync work structure.  If the file does not supply
+ * an ->aio_fsync method but has a ->fsync method, then the f(d)sync request is
+ * passed to the aio_fsync_wq workqueue and is executed there.
+ */
+struct aio_fsync_args {
+	struct work_struct	work;
+	struct kiocb		*iocb;
+	int			datasync;
+};
+
+static struct workqueue_struct *aio_fsync_wq;
+
 /*------ sysctl variables----*/
 static DEFINE_SPINLOCK(aio_nr_lock);
 unsigned long aio_nr;		/* current system wide number of aio requests */
@@ -257,6 +270,10 @@  static int __init aio_setup(void)
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
 
+	aio_fsync_wq = alloc_workqueue("aio-fsync", 0, 0);
+	if (!aio_fsync_wq)
+		panic("Failed to create aio fsync workqueue.");
+
 	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
@@ -1396,6 +1413,32 @@  static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
 				len, UIO_FASTIOV, iovec, iter);
 }
 
+static void generic_aio_fsync_work(struct work_struct *work)
+{
+	struct aio_fsync_args *args = container_of(work,
+						   struct aio_fsync_args, work);
+	int error;
+
+	error = vfs_fsync(args->iocb->ki_filp, args->datasync);
+	aio_complete(args->iocb, error, 0);
+	kfree(args);
+}
+
+static int generic_aio_fsync(struct kiocb *iocb, int datasync)
+{
+	struct aio_fsync_args	*args;
+
+	args = kzalloc(sizeof(struct aio_fsync_args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	INIT_WORK(&args->work, generic_aio_fsync_work);
+	args->iocb = iocb;
+	args->datasync = datasync;
+	queue_work(aio_fsync_wq, &args->work);
+	return -EIOCBQUEUED;
+}
+
 /*
  * aio_run_iocb:
  *	Performs the initial checks and io submission.
@@ -1410,6 +1453,7 @@  static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 	rw_iter_op *iter_op;
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
+	int datasync = 0;
 
 	switch (opcode) {
 	case IOCB_CMD_PREAD:
@@ -1460,17 +1504,15 @@  rw_common:
 		break;
 
 	case IOCB_CMD_FDSYNC:
-		if (!file->f_op->aio_fsync)
-			return -EINVAL;
-
-		ret = file->f_op->aio_fsync(req, 1);
-		break;
-
+		datasync = 1;
+		/* fall through */
 	case IOCB_CMD_FSYNC:
-		if (!file->f_op->aio_fsync)
+		if (file->f_op->aio_fsync)
+			ret = file->f_op->aio_fsync(req, datasync);
+		else if (file->f_op->fsync)
+			ret = generic_aio_fsync(req, datasync);
+		else
 			return -EINVAL;
-
-		ret = file->f_op->aio_fsync(req, 0);
 		break;
 
 	default: