[2/4] iomap: iomap_dio_rw() handles all sync writes
diff mbox

Message ID 20180418040828.18165-3-david@fromorbit.com
State New
Headers show

Commit Message

Dave Chinner April 18, 2018, 4:08 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

Currently iomap_dio_rw() only handles (data)sync write completions
for AIO. This means we can't optimised non-AIO IO to minimise device
flushes as we can't tell the caller whether a flush is required or
not.

To solve this problem and enable further optimisations, make
iomap_dio_rw responsible for data sync behaviour for all IO, not
just AIO.

In doing so, the sync operation is now accounted as part of the DIO
IO by inode_dio_end(), hence post-IO data stability updates will no
long race against operations that serialise via inode_dio_wait()
such as truncate or hole punch.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap.c        | 22 +++++++++++++++-------
 fs/xfs/xfs_file.c |  5 -----
 2 files changed, 15 insertions(+), 12 deletions(-)

Comments

Jan Kara April 21, 2018, 1:03 p.m. UTC | #1
On Wed 18-04-18 14:08:26, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Currently iomap_dio_rw() only handles (data)sync write completions
> for AIO. This means we can't optimised non-AIO IO to minimise device
> flushes as we can't tell the caller whether a flush is required or
> not.
> 
> To solve this problem and enable further optimisations, make
> iomap_dio_rw responsible for data sync behaviour for all IO, not
> just AIO.
> 
> In doing so, the sync operation is now accounted as part of the DIO
> IO by inode_dio_end(), hence post-IO data stability updates will no
> long race against operations that serialise via inode_dio_wait()
> such as truncate or hole punch.
> 
> Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Looks good to me. You can add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/iomap.c        | 22 +++++++++++++++-------
>  fs/xfs/xfs_file.c |  5 -----
>  2 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index afd163586aa0..1f59c2d9ade6 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
>   * Private flags for iomap_dio, must not overlap with the public ones in
>   * iomap.h:
>   */
> +#define IOMAP_DIO_NEED_SYNC	(1 << 29)
>  #define IOMAP_DIO_WRITE		(1 << 30)
>  #define IOMAP_DIO_DIRTY		(1 << 31)
>  
> @@ -759,6 +760,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  			dio_warn_stale_pagecache(iocb->ki_filp);
>  	}
>  
> +	/*
> +	 * If this is a DSYNC write, make sure we push it to stable storage now
> +	 * that we've written data.
> +	 */
> +	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
> +		ret = generic_write_sync(iocb, ret);
> +
>  	inode_dio_end(file_inode(iocb->ki_filp));
>  	kfree(dio);
>  
> @@ -768,14 +776,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  static void iomap_dio_complete_work(struct work_struct *work)
>  {
>  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
> -	struct kiocb *iocb = dio->iocb;
> -	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
> -	ssize_t ret;
>  
> -	ret = iomap_dio_complete(dio);
> -	if (is_write && ret > 0)
> -		ret = generic_write_sync(iocb, ret);
> -	iocb->ki_complete(iocb, ret, 0);
> +	dio->iocb->ki_complete(dio->iocb, iomap_dio_complete(dio), 0);
>  }
>  
>  /*
> @@ -961,6 +963,10 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
>  	return copied;
>  }
>  
> +/*
> + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
> + * is being issued as AIO or not.
> + */
>  ssize_t
>  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
> @@ -1006,6 +1012,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  			dio->flags |= IOMAP_DIO_DIRTY;
>  	} else {
>  		dio->flags |= IOMAP_DIO_WRITE;
> +		if (iocb->ki_flags & IOCB_DSYNC)
> +			dio->flags |= IOMAP_DIO_NEED_SYNC;
>  		flags |= IOMAP_WRITE;
>  	}
>  
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 6f15027661b6..0c4b8313d544 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -570,11 +570,6 @@ xfs_file_dio_aio_write(
>  	 * complete fully or fail.
>  	 */
>  	ASSERT(ret < 0 || ret == count);
> -
> -	if (ret > 0) {
> -		/* Handle various SYNC-type writes */
> -		ret = generic_write_sync(iocb, ret);
> -	}
>  	return ret;
>  }
>  
> -- 
> 2.16.1
>
Dave Chinner May 2, 2018, 2:45 a.m. UTC | #2
On Sat, Apr 21, 2018 at 03:03:09PM +0200, Jan Kara wrote:
> On Wed 18-04-18 14:08:26, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Currently iomap_dio_rw() only handles (data)sync write completions
> > for AIO. This means we can't optimised non-AIO IO to minimise device
> > flushes as we can't tell the caller whether a flush is required or
> > not.
> > 
> > To solve this problem and enable further optimisations, make
> > iomap_dio_rw responsible for data sync behaviour for all IO, not
> > just AIO.
> > 
> > In doing so, the sync operation is now accounted as part of the DIO
> > IO by inode_dio_end(), hence post-IO data stability updates will no
> > long race against operations that serialise via inode_dio_wait()
> > such as truncate or hole punch.
> > 
> > Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> Looks good to me. You can add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>

It looks good, but it's broken in a subtle, nasty way. :/

> > @@ -768,14 +776,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
> >  static void iomap_dio_complete_work(struct work_struct *work)
> >  {
> >  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
> > -	struct kiocb *iocb = dio->iocb;
> > -	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
> > -	ssize_t ret;
> >  
> > -	ret = iomap_dio_complete(dio);
> > -	if (is_write && ret > 0)
> > -		ret = generic_write_sync(iocb, ret);
> > -	iocb->ki_complete(iocb, ret, 0);
> > +	dio->iocb->ki_complete(dio->iocb, iomap_dio_complete(dio), 0);

This generates a use after free from KASAN from generic/016. it
appears the compiler orders the code so that it dereferences
dio->iocb after iomap_dio_complete() has freed the dio structure
(yay!).

I'll post a new version of the patchset now that I've got changes to
2 of the 3 remaining patches in it....

Cheers,

Dave.
Robert Dorr May 2, 2018, 2:27 p.m. UTC | #3
Thanks for your continued effort Dave.

In the current implementation the first write to the location updates the metadata and must issue the flush.   In Windows SQL Server can avoid this behavior.   SQL Server can issue DeviceIoControl with SET_FILE_VALID_DATA and then SetEndOfFile.  The SetEndOfFile acquires space and saves metadata without requiring the actual write.   This allows us to quickly create a large file and the writes do not need the added flush.

Is this something that fallocate could accommodate to avoid having to write once (triggers flush for metadata) and then secondary writes can use FUA and avoid the flush?



-----Original Message-----
From: Dave Chinner <david@fromorbit.com> 
Sent: Tuesday, May 1, 2018 9:46 PM
To: Jan Kara <jack@suse.cz>
Cc: linux-xfs@vger.kernel.org; linux-fsdevel@vger.kernel.org; linux-block@vger.kernel.org; hch@lst.de; Robert Dorr <rdorr@microsoft.com>
Subject: Re: [PATCH 2/4] iomap: iomap_dio_rw() handles all sync writes

On Sat, Apr 21, 2018 at 03:03:09PM +0200, Jan Kara wrote:
> On Wed 18-04-18 14:08:26, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Currently iomap_dio_rw() only handles (data)sync write completions 
> > for AIO. This means we can't optimised non-AIO IO to minimise device 
> > flushes as we can't tell the caller whether a flush is required or 
> > not.
> > 
> > To solve this problem and enable further optimisations, make 
> > iomap_dio_rw responsible for data sync behaviour for all IO, not 
> > just AIO.
> > 
> > In doing so, the sync operation is now accounted as part of the DIO 
> > IO by inode_dio_end(), hence post-IO data stability updates will no 
> > long race against operations that serialise via inode_dio_wait() 
> > such as truncate or hole punch.
> > 
> > Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> > Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> Looks good to me. You can add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>

It looks good, but it's broken in a subtle, nasty way. :/

> > @@ -768,14 +776,8 @@ static ssize_t iomap_dio_complete(struct 
> > iomap_dio *dio)  static void iomap_dio_complete_work(struct 
> > work_struct *work)  {
> >  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
> > -	struct kiocb *iocb = dio->iocb;
> > -	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
> > -	ssize_t ret;
> >  
> > -	ret = iomap_dio_complete(dio);
> > -	if (is_write && ret > 0)
> > -		ret = generic_write_sync(iocb, ret);
> > -	iocb->ki_complete(iocb, ret, 0);
> > +	dio->iocb->ki_complete(dio->iocb, iomap_dio_complete(dio), 0);

This generates a use after free from KASAN from generic/016. it appears the compiler orders the code so that it dereferences
dio->iocb after iomap_dio_complete() has freed the dio structure
(yay!).

I'll post a new version of the patchset now that I've got changes to
2 of the 3 remaining patches in it....

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com
Jan Kara May 3, 2018, 12:51 p.m. UTC | #4
On Wed 02-05-18 12:45:40, Dave Chinner wrote:
> On Sat, Apr 21, 2018 at 03:03:09PM +0200, Jan Kara wrote:
> > On Wed 18-04-18 14:08:26, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Currently iomap_dio_rw() only handles (data)sync write completions
> > > for AIO. This means we can't optimised non-AIO IO to minimise device
> > > flushes as we can't tell the caller whether a flush is required or
> > > not.
> > > 
> > > To solve this problem and enable further optimisations, make
> > > iomap_dio_rw responsible for data sync behaviour for all IO, not
> > > just AIO.
> > > 
> > > In doing so, the sync operation is now accounted as part of the DIO
> > > IO by inode_dio_end(), hence post-IO data stability updates will no
> > > long race against operations that serialise via inode_dio_wait()
> > > such as truncate or hole punch.
> > > 
> > > Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > 
> > Looks good to me. You can add:
> > 
> > Reviewed-by: Jan Kara <jack@suse.cz>
> 
> It looks good, but it's broken in a subtle, nasty way. :/
> 
> > > @@ -768,14 +776,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
> > >  static void iomap_dio_complete_work(struct work_struct *work)
> > >  {
> > >  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
> > > -	struct kiocb *iocb = dio->iocb;
> > > -	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
> > > -	ssize_t ret;
> > >  
> > > -	ret = iomap_dio_complete(dio);
> > > -	if (is_write && ret > 0)
> > > -		ret = generic_write_sync(iocb, ret);
> > > -	iocb->ki_complete(iocb, ret, 0);
> > > +	dio->iocb->ki_complete(dio->iocb, iomap_dio_complete(dio), 0);
> 
> This generates a use after free from KASAN from generic/016. it
> appears the compiler orders the code so that it dereferences
> dio->iocb after iomap_dio_complete() has freed the dio structure
> (yay!).

Yeah, very subtle but the compiler is indeed free to do this (in C the
sequence point is only the function call but the order of evaluation of
function arguments is unspecified). Thanks for catching this.

								Honza
Jan Kara May 3, 2018, 1:34 p.m. UTC | #5
On Wed 02-05-18 14:27:37, Robert Dorr wrote:
> In the current implementation the first write to the location updates the
> metadata and must issue the flush.   In Windows SQL Server can avoid this
> behavior.   SQL Server can issue DeviceIoControl with SET_FILE_VALID_DATA
> and then SetEndOfFile.  The SetEndOfFile acquires space and saves
> metadata without requiring the actual write.   This allows us to quickly
> create a large file and the writes do not need the added flush.
> 
> Is this something that fallocate could accommodate to avoid having to
> write once (triggers flush for metadata) and then secondary writes can
> use FUA and avoid the flush?

Well, the question then is what do you see in the file if you read those
blocks before writing them or if the system crashes before writing the
data. As you describe the feature, you'd see there just the old block
contents which is a security issue (you could see for example old
/etc/passwd contents there). That's why we've refused such feature in the
past [1].

								Honza

[1] https://www.spinics.net/lists/linux-ext4/msg31637.html

> -----Original Message-----
> From: Dave Chinner <david@fromorbit.com> 
> Sent: Tuesday, May 1, 2018 9:46 PM
> To: Jan Kara <jack@suse.cz>
> Cc: linux-xfs@vger.kernel.org; linux-fsdevel@vger.kernel.org; linux-block@vger.kernel.org; hch@lst.de; Robert Dorr <rdorr@microsoft.com>
> Subject: Re: [PATCH 2/4] iomap: iomap_dio_rw() handles all sync writes
> 
> On Sat, Apr 21, 2018 at 03:03:09PM +0200, Jan Kara wrote:
> > On Wed 18-04-18 14:08:26, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > Currently iomap_dio_rw() only handles (data)sync write completions 
> > > for AIO. This means we can't optimised non-AIO IO to minimise device 
> > > flushes as we can't tell the caller whether a flush is required or 
> > > not.
> > > 
> > > To solve this problem and enable further optimisations, make 
> > > iomap_dio_rw responsible for data sync behaviour for all IO, not 
> > > just AIO.
> > > 
> > > In doing so, the sync operation is now accounted as part of the DIO 
> > > IO by inode_dio_end(), hence post-IO data stability updates will no 
> > > long race against operations that serialise via inode_dio_wait() 
> > > such as truncate or hole punch.
> > > 
> > > Signed-Off-By: Dave Chinner <dchinner@redhat.com>
> > > Reviewed-by: Christoph Hellwig <hch@lst.de>
> > 
> > Looks good to me. You can add:
> > 
> > Reviewed-by: Jan Kara <jack@suse.cz>
> 
> It looks good, but it's broken in a subtle, nasty way. :/
> 
> > > @@ -768,14 +776,8 @@ static ssize_t iomap_dio_complete(struct 
> > > iomap_dio *dio)  static void iomap_dio_complete_work(struct 
> > > work_struct *work)  {
> > >  	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
> > > -	struct kiocb *iocb = dio->iocb;
> > > -	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
> > > -	ssize_t ret;
> > >  
> > > -	ret = iomap_dio_complete(dio);
> > > -	if (is_write && ret > 0)
> > > -		ret = generic_write_sync(iocb, ret);
> > > -	iocb->ki_complete(iocb, ret, 0);
> > > +	dio->iocb->ki_complete(dio->iocb, iomap_dio_complete(dio), 0);
> 
> This generates a use after free from KASAN from generic/016. it appears the compiler orders the code so that it dereferences
> dio->iocb after iomap_dio_complete() has freed the dio structure
> (yay!).
> 
> I'll post a new version of the patchset now that I've got changes to
> 2 of the 3 remaining patches in it....
> 
> Cheers,
> 
> Dave.
> --
> Dave Chinner
> david@fromorbit.com

Patch
diff mbox

diff --git a/fs/iomap.c b/fs/iomap.c
index afd163586aa0..1f59c2d9ade6 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -685,6 +685,7 @@  EXPORT_SYMBOL_GPL(iomap_seek_data);
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_NEED_SYNC	(1 << 29)
 #define IOMAP_DIO_WRITE		(1 << 30)
 #define IOMAP_DIO_DIRTY		(1 << 31)
 
@@ -759,6 +760,13 @@  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 			dio_warn_stale_pagecache(iocb->ki_filp);
 	}
 
+	/*
+	 * If this is a DSYNC write, make sure we push it to stable storage now
+	 * that we've written data.
+	 */
+	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+		ret = generic_write_sync(iocb, ret);
+
 	inode_dio_end(file_inode(iocb->ki_filp));
 	kfree(dio);
 
@@ -768,14 +776,8 @@  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 static void iomap_dio_complete_work(struct work_struct *work)
 {
 	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
-	struct kiocb *iocb = dio->iocb;
-	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
-	ssize_t ret;
 
-	ret = iomap_dio_complete(dio);
-	if (is_write && ret > 0)
-		ret = generic_write_sync(iocb, ret);
-	iocb->ki_complete(iocb, ret, 0);
+	dio->iocb->ki_complete(dio->iocb, iomap_dio_complete(dio), 0);
 }
 
 /*
@@ -961,6 +963,10 @@  iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 	return copied;
 }
 
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not.
+ */
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
@@ -1006,6 +1012,8 @@  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			dio->flags |= IOMAP_DIO_DIRTY;
 	} else {
 		dio->flags |= IOMAP_DIO_WRITE;
+		if (iocb->ki_flags & IOCB_DSYNC)
+			dio->flags |= IOMAP_DIO_NEED_SYNC;
 		flags |= IOMAP_WRITE;
 	}
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6f15027661b6..0c4b8313d544 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -570,11 +570,6 @@  xfs_file_dio_aio_write(
 	 * complete fully or fail.
 	 */
 	ASSERT(ret < 0 || ret == count);
-
-	if (ret > 0) {
-		/* Handle various SYNC-type writes */
-		ret = generic_write_sync(iocb, ret);
-	}
 	return ret;
 }