Triggering non-integrity writeback from userspace
diff mbox

Message ID 20151029221022.GB10656@dastard
State New
Headers show

Commit Message

Dave Chinner Oct. 29, 2015, 10:10 p.m. UTC
On Thu, Oct 29, 2015 at 05:23:56PM +0100, Andres Freund wrote:
> On 2015-10-29 12:54:22 +1100, Dave Chinner wrote:
> > On Thu, Oct 29, 2015 at 12:23:12AM +0100, Andres Freund wrote:
> > > By calling sync_file_range() over small ranges of pages shortly after
> > > they've been written we make it unlikely (but still possible) that much
> > > data has to be flushed at fsync() time.
> > 
> > Right, but you still need the fsync call, whereas with a async fsync
> > call you don't - when you gather the completion, no further action
> > needs to be taken on that dirty range.
> 
> I assume that the actual IOs issued by the async fsync and a plain fsync
> would be pretty similar. So the problem that an fsync of large amounts
> of dirty data causes latency increases for other issuers of IO wouldn't
> be gone, no?

Yes, they'd be the same if the async operation is not range limited.

> > > At the moment using fdatasync() instead of fsync() is a considerable
> > > performance advantage... If I understand the above proposal correctly,
> > > it'd allow specifying ranges, is that right?
> > 
> > Well, the patch I sent doesn't do ranges, but it could easily be
> > passed in as the iocb has offset/len parameters that are used by
> > IOCB_CMD_PREAD/PWRITE.
> 
> That'd be cool. Then we could issue those for asynchronous transaction
> commits, and to have more wal writes concurrently in progress by the
> background wal writer.

Updated patch that allows ranged aio fsync below. In the
application, do this for a ranged fsync:

	io_prep_fsync(iocb, fd);
	iocb->u.c.offset = offset;	/* start of range */
	iocb->u.c.nbytes = len;		/* size (in bytes) to sync */
	error = io_submit(ctx, 1, &iocb);

> I'll try the patch from 20151028232641.GS8773@dastard and see whether I
> can make it be advantageous for throughput (for WAL flushing, not the
> checkpointer process).  Wish I had a better storage system; my guess is
> it'll be more advantageous there. We'll see.

A $100 SATA SSD is all you need to get IOPS rates in the
thousands for these sorts of tests...

Cheers,

Dave.

Patch
diff mbox

diff --git a/fs/aio.c b/fs/aio.c
index 155f842..109433b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -188,6 +188,20 @@  struct aio_kiocb {
 	struct eventfd_ctx	*ki_eventfd;
 };
 
+/*
+ * Generic async fsync work structure.  If the file does not supply
+ * an ->aio_fsync method but has a ->fsync method, then the f(d)sync request is
+ * passed to the aio_fsync_wq workqueue and is executed there.
+ */
+struct aio_fsync_args {
+	struct work_struct	work;
+	struct kiocb		*req;
+	size_t			len;		/* zero for full file fsync */
+	int			datasync;
+};
+
+static struct workqueue_struct *aio_fsync_wq;
+
 /*------ sysctl variables----*/
 static DEFINE_SPINLOCK(aio_nr_lock);
 unsigned long aio_nr;		/* current system wide number of aio requests */
@@ -257,6 +271,10 @@  static int __init aio_setup(void)
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
 
+	aio_fsync_wq = alloc_workqueue("aio-fsync", 0, 0);
+	if (!aio_fsync_wq)
+		panic("Failed to create aio fsync workqueue.");
+
 	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
@@ -1396,6 +1414,40 @@  static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
 				len, UIO_FASTIOV, iovec, iter);
 }
 
+static void generic_aio_fsync_work(struct work_struct *work)
+{
+	struct aio_fsync_args *args = container_of(work,
+						   struct aio_fsync_args, work);
+	struct kiocb *req = args->req;
+	int error;
+
+	if (!args->len)
+		error = vfs_fsync(req->ki_filp, args->datasync);
+	else
+		error = vfs_fsync_range(req->ki_filp, req->ki_pos,
+					req->ki_pos + args->len,
+					args->datasync);
+
+	aio_complete(req, error, 0);
+	kfree(args);
+}
+
+static int generic_aio_fsync(struct kiocb *req, size_t len, int datasync)
+{
+	struct aio_fsync_args	*args;
+
+	args = kzalloc(sizeof(struct aio_fsync_args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	INIT_WORK(&args->work, generic_aio_fsync_work);
+	args->req = req;
+	args->len = len;
+	args->datasync = datasync;
+	queue_work(aio_fsync_wq, &args->work);
+	return -EIOCBQUEUED;
+}
+
 /*
  * aio_run_iocb:
  *	Performs the initial checks and io submission.
@@ -1410,6 +1462,7 @@  static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 	rw_iter_op *iter_op;
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
+	int datasync = 0;
 
 	switch (opcode) {
 	case IOCB_CMD_PREAD:
@@ -1460,17 +1513,15 @@  rw_common:
 		break;
 
 	case IOCB_CMD_FDSYNC:
-		if (!file->f_op->aio_fsync)
-			return -EINVAL;
-
-		ret = file->f_op->aio_fsync(req, 1);
-		break;
-
+		datasync = 1;
+		/* fall through */
 	case IOCB_CMD_FSYNC:
-		if (!file->f_op->aio_fsync)
+		if (file->f_op->aio_fsync)
+			ret = file->f_op->aio_fsync(req, len, datasync);
+		else if (file->f_op->fsync)
+			ret = generic_aio_fsync(req, len, datasync);
+		else
 			return -EINVAL;
-
-		ret = file->f_op->aio_fsync(req, 0);
 		break;
 
 	default:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 72d8a84..8a74dfb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1626,7 +1626,7 @@  struct file_operations {
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
 	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
-	int (*aio_fsync) (struct kiocb *, int datasync);
+	int (*aio_fsync) (struct kiocb *, size_t len, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);