Message ID | 20180301014144.28892-1-david@fromorbit.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Thu, Mar 01, 2018 at 12:41:44PM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@redhat.com> > > If we are doing direct IO writes with datasync semantics, we often > have to flush metadata changes along with the data write. However, > if we are overwriting existing data, there are no metadata changes > that we need to flush. In this case, optimising the IO by using > FUA write makes sense. > > We know from teh IOMAP_F_DIRTY flag as to whether a specific inode > requires a metadata flush - this is currently used by DAX to ensure > extent modi$fication as stable in page fault operations. For direct ^^^^ modification > IO writes, we can use it to determine if we need to flush metadata > or not once the data is on disk. > > Hence if we have been returned a mapped extent that is not new and > the IO mapping is not dirty, then we can use a FUA write to provide > datasync semantics. This allows us to short-cut the > generic_write_sync() call in IO completion and hence avoid > unnecessary operations. This makes pure direct IO data write > behaviour identical to the way block devices use REQ_FUA to provide > datasync semantics. > > Now that iomap_dio_rw() is determining if REQ_FUA can be used, we > have to stop issuing generic_write_sync() calls from the XFS code > when REQ_FUA is issued, otherwise it will still throw a cache flush > to the device via xfs_file_fsync(). To do this, we need to make > iomap_dio_rw() always responsible for issuing generic_write_sync() > when necessary, not just for AIO calls. This means the filesystem > doesn't have to guess when cache flushes are necessary now. > > On a FUA enabled device, a synchronous direct IO write workload > (sequential 4k overwrites in 32MB file) had the following results: > > # xfs_io -fd -c "pwrite -V 1 -D 0 32m" /mnt/scratch/boo > > kernel time write()s write iops Write b/w > ------ ---- -------- ---------- --------- > (no dsync) 4s 2173/s 2173 8.5MB/s > vanilla 22s 370/s 750 1.4MB/s > patched 19s 420/s 420 1.6MB/s > > The patched code clearly doesn't send cache flushes anymore, but > instead uses FUA (confirmed via blktrace), and performance improves > a bit as a result. However, the benefits will be higher on workloads > that mix O_DSYNC overwrites with other write IO as we won't be > flushing the entire device cache on every DSYNC overwrite IO > anymore. > > Signed-Off-By: Dave Chinner <dchinner@redhat.com> > --- > fs/iomap.c | 38 +++++++++++++++++++++++++++++++++----- > fs/xfs/xfs_file.c | 5 +++++ > 2 files changed, 38 insertions(+), 5 deletions(-) > > diff --git a/fs/iomap.c b/fs/iomap.c > index afd163586aa0..bcc90e3a2e3f 100644 > --- a/fs/iomap.c > +++ b/fs/iomap.c > @@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data); > * Private flags for iomap_dio, must not overlap with the public ones in > * iomap.h: > */ > +#define IOMAP_DIO_WRITE_FUA (1 << 29) > #define IOMAP_DIO_WRITE (1 << 30) > #define IOMAP_DIO_DIRTY (1 << 31) > > @@ -760,8 +761,19 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) > } > > inode_dio_end(file_inode(iocb->ki_filp)); > - kfree(dio); > > + /* > + * If a FUA write was done, then that is all we required for datasync > + * semantics -. we don't need to call generic_write_sync() to complete Minor nit: space-dash-period. Looks ok to me, but does anyone else have comments? (Looking at hch here. ;)) --D > + * the write. > + */ > + if (ret > 0 && > + (dio->flags & (IOMAP_DIO_WRITE|IOMAP_DIO_WRITE_FUA)) == > + IOMAP_DIO_WRITE) { > + ret = generic_write_sync(iocb, ret); > + } > + > + kfree(dio); > return ret; > } > > @@ -769,12 +781,9 @@ static void iomap_dio_complete_work(struct work_struct *work) > { > struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); > struct kiocb *iocb = dio->iocb; > - bool is_write = (dio->flags & IOMAP_DIO_WRITE); > ssize_t ret; > > ret = iomap_dio_complete(dio); > - if (is_write && ret > 0) > - ret = generic_write_sync(iocb, ret); > iocb->ki_complete(iocb, ret, 0); > } > > @@ -883,6 +892,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > dio->flags |= IOMAP_DIO_COW; > if (iomap->flags & IOMAP_F_NEW) > need_zeroout = true; > + /* > + * Use a FUA write if we need datasync semantics and this is a > + * pure data IO that doesn't require any metadata updates. This > + * allows us to avoid cache flushes on IO completion. > + */ > + else if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && > + (dio->flags & IOMAP_DIO_WRITE) && > + (dio->iocb->ki_flags & IOCB_DSYNC)) > + dio->flags |= IOMAP_DIO_WRITE_FUA; > break; > default: > WARN_ON_ONCE(1); > @@ -930,7 +948,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > > n = bio->bi_iter.bi_size; > if (dio->flags & IOMAP_DIO_WRITE) { > - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); > + int op_flags = REQ_SYNC | REQ_IDLE; > + > + if (dio->flags & IOMAP_DIO_WRITE_FUA) > + op_flags |= REQ_FUA; > + bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags); > task_io_account_write(n); > } else { > bio_set_op_attrs(bio, REQ_OP_READ, 0); > @@ -961,6 +983,12 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > return copied; > } > > +/* > + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO > + * is being issued as AIO or not. This allows us to optimise pure data writes to > + * use REQ_FUA rather than requiring generic_write_sync() to issue a REQ_FLUSH > + * post write. > + */ > ssize_t > iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > const struct iomap_ops *ops, iomap_dio_end_io_t end_io) > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 260ff5e5c264..81aa3b73471e 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -732,6 +732,11 @@ xfs_file_write_iter( > ret = xfs_file_dio_aio_write(iocb, from); > if (ret == -EREMCHG) > goto buffered; > + /* > + * Direct IO handles sync type writes internally on I/O > + * completion. > + */ > + return ret; > } else { > buffered: > ret = xfs_file_buffered_aio_write(iocb, from); > -- > 2.16.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> @@ -760,8 +761,19 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) > } > > inode_dio_end(file_inode(iocb->ki_filp)); > - kfree(dio); > > + /* > + * If a FUA write was done, then that is all we required for datasync > + * semantics -. we don't need to call generic_write_sync() to complete > + * the write. > + */ > + if (ret > 0 && > + (dio->flags & (IOMAP_DIO_WRITE|IOMAP_DIO_WRITE_FUA)) == > + IOMAP_DIO_WRITE) { > + ret = generic_write_sync(iocb, ret); > + } > + > + kfree(dio); Can please split the move of the generic_write_sync call into generic_write_sync a separate prep patch? It's enough of a logic change on its own that it warrants a separate commit with a separate explanation. Also I'd be tempted to invert the IOMAP_DIO_WRITE_FUA flag and replace it with an IOMAP_DIO_WRITE_SYNC flag to indicate we need the generic_write_sync call, as that should make the logic much more clear. > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 260ff5e5c264..81aa3b73471e 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -732,6 +732,11 @@ xfs_file_write_iter( > ret = xfs_file_dio_aio_write(iocb, from); > if (ret == -EREMCHG) > goto buffered; > + /* > + * Direct IO handles sync type writes internally on I/O > + * completion. > + */ > + return ret; > } else { > buffered: > ret = xfs_file_buffered_aio_write(iocb, from); The else is not needed and you can now have a much more sensible code flow here: ret = xfs_file_dio_aio_write(iocb, from); if (ret != -EREMCHG)) return ret; } ret = xfs_file_buffered_aio_write(iocb, from); -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Mar 02, 2018 at 11:20:31PM +0100, Christoph Hellwig wrote: > > --- a/fs/xfs/xfs_file.c > > +++ b/fs/xfs/xfs_file.c > > @@ -732,6 +732,11 @@ xfs_file_write_iter( > > ret = xfs_file_dio_aio_write(iocb, from); > > if (ret == -EREMCHG) > > goto buffered; > > + /* > > + * Direct IO handles sync type writes internally on I/O > > + * completion. > > + */ > > + return ret; > > } else { > > buffered: > > ret = xfs_file_buffered_aio_write(iocb, from); > > The else is not needed and you can now have a much more sensible > code flow here: > > ret = xfs_file_dio_aio_write(iocb, from); > if (ret != -EREMCHG)) > return ret; > } > > ret = xfs_file_buffered_aio_write(iocb, from); Actually a little more complicated due to the DAX case, and the fact that both your original and this new version miss the XFS_STATS_ADD call. But you get the idea. While we're at it we should probably also skil the generic_write_sync call for DAX pure overwrites while we're at it. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Mar 02, 2018 at 11:20:31PM +0100, Christoph Hellwig wrote: > > @@ -760,8 +761,19 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) > > } > > > > inode_dio_end(file_inode(iocb->ki_filp)); > > - kfree(dio); > > > > + /* > > + * If a FUA write was done, then that is all we required for datasync > > + * semantics -. we don't need to call generic_write_sync() to complete > > + * the write. > > + */ > > + if (ret > 0 && > > + (dio->flags & (IOMAP_DIO_WRITE|IOMAP_DIO_WRITE_FUA)) == > > + IOMAP_DIO_WRITE) { > > + ret = generic_write_sync(iocb, ret); > > + } > > + > > + kfree(dio); > > Can please split the move of the generic_write_sync call into > generic_write_sync a separate prep patch? It's enough of a logic change > on its own that it warrants a separate commit with a separate explanation. Ok, that makes sense. > Also I'd be tempted to invert the IOMAP_DIO_WRITE_FUA flag and replace > it with an IOMAP_DIO_WRITE_SYNC flag to indicate we need the > generic_write_sync call, as that should make the logic much more clear. Ah, much cleaner. > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > > index 260ff5e5c264..81aa3b73471e 100644 > > --- a/fs/xfs/xfs_file.c > > +++ b/fs/xfs/xfs_file.c > > @@ -732,6 +732,11 @@ xfs_file_write_iter( > > ret = xfs_file_dio_aio_write(iocb, from); > > if (ret == -EREMCHG) > > goto buffered; > > + /* > > + * Direct IO handles sync type writes internally on I/O > > + * completion. > > + */ > > + return ret; > > } else { > > buffered: > > ret = xfs_file_buffered_aio_write(iocb, from); > > The else is not needed and you can now have a much more sensible > code flow here: > > ret = xfs_file_dio_aio_write(iocb, from); > if (ret != -EREMCHG)) > return ret; > } > > ret = xfs_file_buffered_aio_write(iocb, from); I thought about that, but as you noticed the DAX case gets in the way so I didn't bother for a RFC patch. I'll have a look at making the DAX iomap code do something similar with generic_write_sync() so we can simplify this high level code. Thanks Christoph! Cheers, Dave.
On Sat, Mar 03, 2018 at 09:53:19AM +1100, Dave Chinner wrote: > I thought about that, but as you noticed the DAX case gets in the > way so I didn't bother for a RFC patch. > > I'll have a look at making the DAX iomap code do something similar > with generic_write_sync() so we can simplify this high level code. I suspect it might make sense to juse move generic_write_sync into each of the three write routines as a prep patch before the prep patch, which should clean everything up nicely. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Oh, and another thing: I think you want to make this new code dependent on the block devie actually supporting REQ_FUA natively. Otherwise you'll cause a flush for every emulated FUA write, which is only going make things worse, especially for ATA where FLUSH is not queued. And last time I check libata still disabled FUA by default. On Sat, Mar 03, 2018 at 09:53:19AM +1100, Dave Chinner wrote: > On Fri, Mar 02, 2018 at 11:20:31PM +0100, Christoph Hellwig wrote: > > > @@ -760,8 +761,19 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) > > > } > > > > > > inode_dio_end(file_inode(iocb->ki_filp)); > > > - kfree(dio); > > > > > > + /* > > > + * If a FUA write was done, then that is all we required for datasync > > > + * semantics -. we don't need to call generic_write_sync() to complete > > > + * the write. > > > + */ > > > + if (ret > 0 && > > > + (dio->flags & (IOMAP_DIO_WRITE|IOMAP_DIO_WRITE_FUA)) == > > > + IOMAP_DIO_WRITE) { > > > + ret = generic_write_sync(iocb, ret); > > > + } > > > + > > > + kfree(dio); > > > > Can please split the move of the generic_write_sync call into > > generic_write_sync a separate prep patch? It's enough of a logic change > > on its own that it warrants a separate commit with a separate explanation. > > Ok, that makes sense. > > > Also I'd be tempted to invert the IOMAP_DIO_WRITE_FUA flag and replace > > it with an IOMAP_DIO_WRITE_SYNC flag to indicate we need the > > generic_write_sync call, as that should make the logic much more clear. > > Ah, much cleaner. > > > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > > > index 260ff5e5c264..81aa3b73471e 100644 > > > --- a/fs/xfs/xfs_file.c > > > +++ b/fs/xfs/xfs_file.c > > > @@ -732,6 +732,11 @@ xfs_file_write_iter( > > > ret = xfs_file_dio_aio_write(iocb, from); > > > if (ret == -EREMCHG) > > > goto buffered; > > > + /* > > > + * Direct IO handles sync type writes internally on I/O > > > + * completion. > > > + */ > > > + return ret; > > > } else { > > > buffered: > > > ret = xfs_file_buffered_aio_write(iocb, from); > > > > The else is not needed and you can now have a much more sensible > > code flow here: > > > > ret = xfs_file_dio_aio_write(iocb, from); > > if (ret != -EREMCHG)) > > return ret; > > } > > > > ret = xfs_file_buffered_aio_write(iocb, from); > > I thought about that, but as you noticed the DAX case gets in the > way so I didn't bother for a RFC patch. > > I'll have a look at making the DAX iomap code do something similar > with generic_write_sync() so we can simplify this high level code. > > Thanks Christoph! > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com ---end quoted text--- -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Mar 03, 2018 at 12:00:42AM +0100, Christoph Hellwig wrote: > Oh, and another thing: I think you want to make this new code dependent > on the block devie actually supporting REQ_FUA natively. Otherwise > you'll cause a flush for every emulated FUA write, which is only going > make things worse, especially for ATA where FLUSH is not queued. And > last time I check libata still disabled FUA by default. Yup, but the issue we have right now is that for pure RWF_DSYNC data overwrites we are already doing a post-flush on every IO. It's being issued as a separate zero-length IO, which is why REQ_FUA is faster and results in lower overall IOPS. The flush comes from this path: generic_write_sync vfs_fsync_range xfs_file_fsync .... /* * If we only have a single device, and the log force about was * a no-op we might have to flush the data device cache here. * This can only happen for fdatasync/O_DSYNC if we were overwriting * an already allocated file and thus do not have any metadata to * commit. */ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && mp->m_logdev_targp == mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); So the end result of using REQ_FUA and not calling generic_write_sync() on devices that don't support REQ_FUA is that we get a post-flush attached to the IO rather than separately issuing the post-flush via generic_write_sync(). Hence I think we end up with the same behaviour on devices that don't support REQ_FUA, just via a slightly different mechanism. Cheers, Dave.
On Sat, Mar 03, 2018 at 10:15:17AM +1100, Dave Chinner wrote: > On Sat, Mar 03, 2018 at 12:00:42AM +0100, Christoph Hellwig wrote: > > Oh, and another thing: I think you want to make this new code dependent > > on the block devie actually supporting REQ_FUA natively. Otherwise > > you'll cause a flush for every emulated FUA write, which is only going > > make things worse, especially for ATA where FLUSH is not queued. And > > last time I check libata still disabled FUA by default. > > Yup, but the issue we have right now is that for pure RWF_DSYNC data > overwrites we are already doing a post-flush on every IO. It's being > issued as a separate zero-length IO, which is why REQ_FUA is faster > and results in lower overall IOPS. The flush comes from this path: That is only the case if your device actually supports FUA. If the device does notit is emulated by the block/flk-flush.c code by issuing a FLUSH once the write has returned. So for e.g. a direct I/O write() call with O_DSYNC that turns into e.g. four write calls on the wire you currently have: WRITE WRITE WRITE WRITE FLUSH with your patch and a device that supports FUA you get WRITE (FUA) WRITE (FUA) WRITE (FUA) WRITE (FUA) but with a device that does not support FUA you get WRITE FLUSH WRITE FLUSH WRITE FLUSH WRITE FLUSH with the additional pain point that on ATA FLUSH is not a queueable command, so it will have to wait for the completion of every other non-related command first, and no other command can be started. So we should absolutely use your new approach IFF the device actually supports FUA (aka QUEUE_FLAG_FUA is set), but it will not help much or even be harmful if the device does not actually support the FUA bit. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Mar 02, 2018 at 11:26:41PM +0100, Christoph Hellwig wrote: > While we're at it we should probably > also skil the generic_write_sync call for DAX pure overwrites while > we're at it. Now that I've looked at it, DAX is a friggin' mess. We can't push the generic_write_sync() call into dax_iomap_rw() because dax_iomap_rw is called with the inode_lock() held, and both ext2's and ext4's ->fsync implementation can call __generic_file_fsync() which takes the inode_lock(). Deadlock central right there - someone is going to have to screw with ext4's indoe sync code to before we can do this. Further, XFS has post-write metadata updates to do in the case of extending writes, and hence we'd still need the call to generic_write_sync() after we have returned from dax_iomap_rw(). I'm thinking we'd really need a dax write IO completion callback (say in the struct iomap_ops) to run the filesystem specific IO completions (like the DIO path) before we can start to think about optimisations like this for DAX for XFS.... Cheers, Dave.
Oh well, lets skip DAX for now then. I still think moving the generic_write_sync call into xfs_file_dax_write/xfs_file_dio_aio_write/ xfs_file_buffered_aio_write functions would be a useful cleanup. On Mon, Mar 05, 2018 at 10:00:49AM +1100, Dave Chinner wrote: > On Fri, Mar 02, 2018 at 11:26:41PM +0100, Christoph Hellwig wrote: > > While we're at it we should probably > > also skil the generic_write_sync call for DAX pure overwrites while > > we're at it. > > Now that I've looked at it, DAX is a friggin' mess. > > We can't push the generic_write_sync() call into dax_iomap_rw() > because dax_iomap_rw is called with the inode_lock() held, and both > ext2's and ext4's ->fsync implementation can call > __generic_file_fsync() which takes the inode_lock(). Deadlock > central right there - someone is going to have to screw with ext4's > indoe sync code to before we can do this. > > Further, XFS has post-write metadata updates to do in the case of > extending writes, and hence we'd still need the call to > generic_write_sync() after we have returned from dax_iomap_rw(). I'm > thinking we'd really need a dax write IO completion callback (say in > the struct iomap_ops) to run the filesystem specific IO completions > (like the DIO path) before we can start to think about optimisations > like this for DAX for XFS.... > > Cheers, > > Dave. > -- > Dave Chinner > david@fromorbit.com ---end quoted text--- -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ adding Robert who had a question about this patch, and Jan + Ted for the ext4 implications ] I talked with Robert offline and he had some questions about this patch that belong on the mailing list. On Wed, Feb 28, 2018 at 5:41 PM, Dave Chinner <david@fromorbit.com> wrote: > From: Dave Chinner <dchinner@redhat.com> > > If we are doing direct IO writes with datasync semantics, we often > have to flush metadata changes along with the data write. However, > if we are overwriting existing data, there are no metadata changes > that we need to flush. In this case, optimising the IO by using > FUA write makes sense. > > We know from teh IOMAP_F_DIRTY flag as to whether a specific inode > requires a metadata flush - this is currently used by DAX to ensure > extent modi$fication as stable in page fault operations. For direct > IO writes, we can use it to determine if we need to flush metadata > or not once the data is on disk. > > Hence if we have been returned a mapped extent that is not new and > the IO mapping is not dirty, then we can use a FUA write to provide > datasync semantics. This allows us to short-cut the > generic_write_sync() call in IO completion and hence avoid > unnecessary operations. This makes pure direct IO data write > behaviour identical to the way block devices use REQ_FUA to provide > datasync semantics. > > Now that iomap_dio_rw() is determining if REQ_FUA can be used, we > have to stop issuing generic_write_sync() calls from the XFS code > when REQ_FUA is issued, otherwise it will still throw a cache flush > to the device via xfs_file_fsync(). To do this, we need to make > iomap_dio_rw() always responsible for issuing generic_write_sync() > when necessary, not just for AIO calls. This means the filesystem > doesn't have to guess when cache flushes are necessary now. > > On a FUA enabled device, a synchronous direct IO write workload > (sequential 4k overwrites in 32MB file) had the following results: > > # xfs_io -fd -c "pwrite -V 1 -D 0 32m" /mnt/scratch/boo > > kernel time write()s write iops Write b/w > ------ ---- -------- ---------- --------- > (no dsync) 4s 2173/s 2173 8.5MB/s > vanilla 22s 370/s 750 1.4MB/s > patched 19s 420/s 420 1.6MB/s > > The patched code clearly doesn't send cache flushes anymore, but > instead uses FUA (confirmed via blktrace), and performance improves > a bit as a result. However, the benefits will be higher on workloads > that mix O_DSYNC overwrites with other write IO as we won't be > flushing the entire device cache on every DSYNC overwrite IO > anymore. > > Signed-Off-By: Dave Chinner <dchinner@redhat.com> > --- > fs/iomap.c | 38 +++++++++++++++++++++++++++++++++----- > fs/xfs/xfs_file.c | 5 +++++ > 2 files changed, 38 insertions(+), 5 deletions(-) > > diff --git a/fs/iomap.c b/fs/iomap.c > index afd163586aa0..bcc90e3a2e3f 100644 > --- a/fs/iomap.c > +++ b/fs/iomap.c > @@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data); > * Private flags for iomap_dio, must not overlap with the public ones in > * iomap.h: > */ > +#define IOMAP_DIO_WRITE_FUA (1 << 29) > #define IOMAP_DIO_WRITE (1 << 30) > #define IOMAP_DIO_DIRTY (1 << 31) > > @@ -760,8 +761,19 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) > } > > inode_dio_end(file_inode(iocb->ki_filp)); > - kfree(dio); > > + /* > + * If a FUA write was done, then that is all we required for datasync > + * semantics -. we don't need to call generic_write_sync() to complete > + * the write. > + */ > + if (ret > 0 && > + (dio->flags & (IOMAP_DIO_WRITE|IOMAP_DIO_WRITE_FUA)) == > + IOMAP_DIO_WRITE) { > + ret = generic_write_sync(iocb, ret); > + } > + > + kfree(dio); > return ret; > } > > @@ -769,12 +781,9 @@ static void iomap_dio_complete_work(struct work_struct *work) > { > struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); > struct kiocb *iocb = dio->iocb; > - bool is_write = (dio->flags & IOMAP_DIO_WRITE); > ssize_t ret; > > ret = iomap_dio_complete(dio); > - if (is_write && ret > 0) > - ret = generic_write_sync(iocb, ret); > iocb->ki_complete(iocb, ret, 0); > } > > @@ -883,6 +892,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > dio->flags |= IOMAP_DIO_COW; > if (iomap->flags & IOMAP_F_NEW) > need_zeroout = true; > + /* > + * Use a FUA write if we need datasync semantics and this is a > + * pure data IO that doesn't require any metadata updates. This > + * allows us to avoid cache flushes on IO completion. > + */ > + else if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && > + (dio->flags & IOMAP_DIO_WRITE) && > + (dio->iocb->ki_flags & IOCB_DSYNC)) > + dio->flags |= IOMAP_DIO_WRITE_FUA; > break; > default: > WARN_ON_ONCE(1); > @@ -930,7 +948,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > > n = bio->bi_iter.bi_size; > if (dio->flags & IOMAP_DIO_WRITE) { > - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); > + int op_flags = REQ_SYNC | REQ_IDLE; > + > + if (dio->flags & IOMAP_DIO_WRITE_FUA) > + op_flags |= REQ_FUA; > + bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags); > task_io_account_write(n); > } else { > bio_set_op_attrs(bio, REQ_OP_READ, 0); > @@ -961,6 +983,12 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > return copied; > } > > +/* > + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO > + * is being issued as AIO or not. This allows us to optimise pure data writes to > + * use REQ_FUA rather than requiring generic_write_sync() to issue a REQ_FLUSH > + * post write. > + */ > ssize_t > iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > const struct iomap_ops *ops, iomap_dio_end_io_t end_io) > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 260ff5e5c264..81aa3b73471e 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -732,6 +732,11 @@ xfs_file_write_iter( > ret = xfs_file_dio_aio_write(iocb, from); > if (ret == -EREMCHG) > goto buffered; > + /* > + * Direct IO handles sync type writes internally on I/O > + * completion. > + */ > + return ret; > } else { > buffered: > ret = xfs_file_buffered_aio_write(iocb, from); > -- > 2.16.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hello all. I have a couple of follow-up questions around this effort, thanks you all for all your kind inputs, patience and knowledge transfer. 1. How does xfs or ext4 make sure a pattern of WS followed by FWS does not allow the write (WS) completion to be visible before the flush completes? I suspected the write was held in iomap_dio_complete_work but with the generic_write_sync change in the patch would a O_DSYNC write request to a DpoFua=0 block queue allow T2 to see the completion via io_getevents before T1 completed the actual flush? 2. How will my application be able to dynamically determine if xfs and ext4 have the performance enhancement for FUA or I need engage alternate methods to use fsync/fdatasync at strategic locations? 3. Are there any plans yet to optimize ext4 as well? 4. Before the patched code the xfs_file_write_iter would call generic_write_sync and that calls submit_io_wait. Does this hold the thread issuing the io_submit so it is unable to drive more async I/O? P.S. I tested the xfs patch today and great results ______________________________________________________________ Stamp the file with all 0xCCs before I do the write timing, so each sector is used and metadata flushed. Throughput way up on my system. O_DIRECT | O_DSYNC WFS 19.19 sec (getting FUA behavior) O_DIRECT WS 18.65 sec -----Original Message----- From: dan.j.williams@gmail.com <dan.j.williams@gmail.com> On Behalf Of Dan Williams Sent: Monday, March 12, 2018 6:54 PM To: Dave Chinner <david@fromorbit.com> Cc: linux-xfs@vger.kernel.org; Christoph Hellwig <hch@lst.de>; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Robert Dorr <rdorr@microsoft.com>; Jan Kara <jack@suse.cz>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com> Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes [ adding Robert who had a question about this patch, and Jan + Ted for the ext4 implications ] I talked with Robert offline and he had some questions about this patch that belong on the mailing list. On Wed, Feb 28, 2018 at 5:41 PM, Dave Chinner <david@fromorbit.com> wrote: > From: Dave Chinner <dchinner@redhat.com> > > If we are doing direct IO writes with datasync semantics, we often > have to flush metadata changes along with the data write. However, if > we are overwriting existing data, there are no metadata changes that > we need to flush. In this case, optimising the IO by using FUA write > makes sense. > > We know from teh IOMAP_F_DIRTY flag as to whether a specific inode > requires a metadata flush - this is currently used by DAX to ensure > extent modi$fication as stable in page fault operations. For direct IO > writes, we can use it to determine if we need to flush metadata or not > once the data is on disk. > > Hence if we have been returned a mapped extent that is not new and the > IO mapping is not dirty, then we can use a FUA write to provide > datasync semantics. This allows us to short-cut the > generic_write_sync() call in IO completion and hence avoid unnecessary > operations. This makes pure direct IO data write behaviour identical > to the way block devices use REQ_FUA to provide datasync semantics. > > Now that iomap_dio_rw() is determining if REQ_FUA can be used, we have > to stop issuing generic_write_sync() calls from the XFS code when > REQ_FUA is issued, otherwise it will still throw a cache flush to the > device via xfs_file_fsync(). To do this, we need to make > iomap_dio_rw() always responsible for issuing generic_write_sync() > when necessary, not just for AIO calls. This means the filesystem > doesn't have to guess when cache flushes are necessary now. > > On a FUA enabled device, a synchronous direct IO write workload > (sequential 4k overwrites in 32MB file) had the following results: > > # xfs_io -fd -c "pwrite -V 1 -D 0 32m" /mnt/scratch/boo > > kernel time write()s write iops Write b/w > ------ ---- -------- ---------- --------- > (no dsync) 4s 2173/s 2173 8.5MB/s > vanilla 22s 370/s 750 1.4MB/s > patched 19s 420/s 420 1.6MB/s > > The patched code clearly doesn't send cache flushes anymore, but > instead uses FUA (confirmed via blktrace), and performance improves a > bit as a result. However, the benefits will be higher on workloads > that mix O_DSYNC overwrites with other write IO as we won't be > flushing the entire device cache on every DSYNC overwrite IO anymore. > > Signed-Off-By: Dave Chinner <dchinner@redhat.com> > --- > fs/iomap.c | 38 +++++++++++++++++++++++++++++++++----- > fs/xfs/xfs_file.c | 5 +++++ > 2 files changed, 38 insertions(+), 5 deletions(-) > > diff --git a/fs/iomap.c b/fs/iomap.c > index afd163586aa0..bcc90e3a2e3f 100644 > --- a/fs/iomap.c > +++ b/fs/iomap.c > @@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data); > * Private flags for iomap_dio, must not overlap with the public ones in > * iomap.h: > */ > +#define IOMAP_DIO_WRITE_FUA (1 << 29) > #define IOMAP_DIO_WRITE (1 << 30) > #define IOMAP_DIO_DIRTY (1 << 31) > > @@ -760,8 +761,19 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) > } > > inode_dio_end(file_inode(iocb->ki_filp)); > - kfree(dio); > > + /* > + * If a FUA write was done, then that is all we required for datasync > + * semantics -. we don't need to call generic_write_sync() to complete > + * the write. > + */ > + if (ret > 0 && > + (dio->flags & (IOMAP_DIO_WRITE|IOMAP_DIO_WRITE_FUA)) == > + IOMAP_DIO_WRITE) { > + ret = generic_write_sync(iocb, ret); > + } > + > + kfree(dio); > return ret; > } > > @@ -769,12 +781,9 @@ static void iomap_dio_complete_work(struct > work_struct *work) { > struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); > struct kiocb *iocb = dio->iocb; > - bool is_write = (dio->flags & IOMAP_DIO_WRITE); > ssize_t ret; > > ret = iomap_dio_complete(dio); > - if (is_write && ret > 0) > - ret = generic_write_sync(iocb, ret); > iocb->ki_complete(iocb, ret, 0); } > > @@ -883,6 +892,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > dio->flags |= IOMAP_DIO_COW; > if (iomap->flags & IOMAP_F_NEW) > need_zeroout = true; > + /* > + * Use a FUA write if we need datasync semantics and this is a > + * pure data IO that doesn't require any metadata updates. This > + * allows us to avoid cache flushes on IO completion. > + */ > + else if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && > + (dio->flags & IOMAP_DIO_WRITE) && > + (dio->iocb->ki_flags & IOCB_DSYNC)) > + dio->flags |= IOMAP_DIO_WRITE_FUA; > break; > default: > WARN_ON_ONCE(1); > @@ -930,7 +948,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, > loff_t length, > > n = bio->bi_iter.bi_size; > if (dio->flags & IOMAP_DIO_WRITE) { > - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); > + int op_flags = REQ_SYNC | REQ_IDLE; > + > + if (dio->flags & IOMAP_DIO_WRITE_FUA) > + op_flags |= REQ_FUA; > + bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags); > task_io_account_write(n); > } else { > bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ > -961,6 +983,12 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > return copied; > } > > +/* > + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of > +whether the IO > + * is being issued as AIO or not. This allows us to optimise pure > +data writes to > + * use REQ_FUA rather than requiring generic_write_sync() to issue a > +REQ_FLUSH > + * post write. > + */ > ssize_t > iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, > const struct iomap_ops *ops, iomap_dio_end_io_t > end_io) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index > 260ff5e5c264..81aa3b73471e 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -732,6 +732,11 @@ xfs_file_write_iter( > ret = xfs_file_dio_aio_write(iocb, from); > if (ret == -EREMCHG) > goto buffered; > + /* > + * Direct IO handles sync type writes internally on I/O > + * completion. > + */ > + return ret; > } else { > buffered: > ret = xfs_file_buffered_aio_write(iocb, from); > -- > 2.16.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" > in the body of a message to majordomo@vger.kernel.org More majordomo > info at > https://na01.safelinks.protection.outlook.com/?url=http%3A%2F%2Fvger.k > ernel.org%2Fmajordomo-info.html&data=04%7C01%7Crdorr%40microsoft.com%7 > C70271d8bc9c84c12e87808d5887485e5%7C72f988bf86f141af91ab2d7cd011db47%7 > C1%7C0%7C636564956441249840%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMD > AiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwifQ%3D%3D%7C-1&sdata=KqPTOI97YQxhrm%2 > BFu3MpoFMIWieH2nkool4tzs5IJ0M%3D&reserved=0
On Tue, Mar 13, 2018 at 12:15:28AM +0000, Robert Dorr wrote: > Hello all. I have a couple of follow-up questions around this > effort, thanks you all for all your kind inputs, patience and > knowledge transfer. > > 1. How does xfs or ext4 make sure a pattern of WS followed by > FWS does not allow the write (WS) completion to be visible before > the flush completes? I'm not sure what you are asking here. You need to be more precise about what these IOs are, who dispatched them and what their dependencies are. Where exactly did that FWS (REQ_FLUSH) come from? I think, though, you're asking questions about IO ordering at the wrong level - filesystems serialise and order IO, not the block layer. Hence what you see at the block layer is not necessary a reflection of the ordering the filesystem is doing. (I've already explained this earlier today in a different thread: https://marc.info/?l=linux-xfs&m=152091489100831&w=2) That's why I ask about the operation causing a REQ_FLUSH to be issued to the storage device, as that cannot be directly issued from userspace. It will occur as a side effect of a data integrity operation the filesystem is asked to perform, but without knowing the relationship between the integrity operation and the write in question an answer cannot be given. It may would be better to describe your IO ordering and integrity requirements at a higher level (e.g. the syscall layer), because then we know what you are trying to acheive rather that trying to understand your problem from context-less questions about "IO barriers" that don't actually exist... > I suspected the write was held in > iomap_dio_complete_work but with the generic_write_sync change in > the patch would a O_DSYNC write request to a DpoFua=0 block queue > allow T2 to see the completion via io_getevents before T1 > completed the actual flush? Yes, that can happen as concurrent data direct IO are not serialised against each other and will always race to completion without providing any ordering guarantees. IOWs, if you have an IO ordering dependency in your application, then that ordering dependency needs to be handled in the application. > 2. How will my application be able to dynamically determine if > xfs and ext4 have the performance enhancement for FUA or I need > engage alternate methods to use fsync/fdatasync at strategic > locations? You don't. The filesystem will provide the same integrity guarantees in either case - FUA is just a performance optimisation that will get used if your hardware supports it. Applications should not care what capabilities the storage hardawre has - the kernel should do what is fastest and most reliable for the underlying storage.... > 3. Are there any plans yet to optimize ext4 as well? Not from me. > 4. Before the patched code the xfs_file_write_iter would call > generic_write_sync and that calls submit_io_wait. Does this hold > the thread issuing the io_submit so it is unable to drive more > async I/O? No, -EIOCBQUEUED is returned to avoid blocking. AIO calls generic_write_sync() from the IO completion path via a worker thread so it's all done asynchronously. Cheers, Dave.
Very helpful, allow me to provide some concrete scenarios. SQL Server has data and log files and we honor WAL to provide for proper crash recovery and durability of the database. * http://www.microsoft.com/technet/prodtechnol/sql/2000/maintain/sqlIObasics.mspx * http://download.microsoft.com/download/4/f/8/4f8f2dc9-a9a7-4b68-98cb-163482c95e0b/SQLIOBasicsCh2.doc --------------------------------------------------------------------------------------------- #1 Scenario - Write vs Flush and io_getevents Ordering --------------------------------------------------------------------------------------------- If SQL Server opens data and log files with O_DIRECT | O_DSYNC 1. T1 - SQL Server issues io_submit (IOCB_CMD_WRITE) for data file Page xfs_write_file_iter issues xfs_file_dio_aio_write (W1) As you pointed out the return is -EIOCBQUEUED (from the block device direct I/O queue insert when !sync) so T1 can continue forward and would skip the generic_write_sync in xfs_write_file_iter 2. T2 - SQL Server is waiting on io_getevents for the I/O (W1) posted by T1 via the same async kernel I/O context When the W1 is considered complete SQL Server can truncate/reuse our database log file. I need to validate that the kernel only returns from io_getevents for W1 processed by hardware before F1 was issued and completed, then SQL Server can safely truncate the database transaction log. The code shows iomap_dio_complete_work that I expected to issue the REQ_PREFLUSH for the O_DSYNC based write and prevent the completion from getting to the io_getevents caller. The patch moved this to the iomap_dio_complete which I believe meets my needs. I think the answer is the iomap* logic makes sure to issue generic_write_sync for O_DSYNC W1 after W1 is completed by hardware and then waits for completion of the flush request (REQ_PREFLUSH) before W1 is returned to the AIO completion ring, preventing io_getevents from processing W1 before the flush occurs and completes. I just need proper confirmation from the experts on this code that this is the expected behavior. --------------------------------------------------------------------------------------------- #2 Dynamic determination of performant FUA capabilities --------------------------------------------------------------------------------------------- For SQL Server using O_DIRECT | O_DSYNC on current kernels is very performance impacting. Instead we enable a mode for SQL that opens O_DIRECT only and issues fsync/fdatasync when we are hardening log files or checkpointing data files. This reduces the write, flush, write, flush pattern allowing for write, write, write, ... then flush as we only issue flush requests when required to maintain the data integrity boundaries of SQL Server. The performance is significantly better then the device flush for each write as you can imagine. Testing shows the FUA enhancement is better then the write, flush pattern. For SQL Server we want to dynamically open with O_DIRECT | O_DSYNC when REQ_FUA can be properly used and open with O_DIRECT and leverage SQL Server's alternate flush scheme when running on older kernel or a system that does not support FUA (SATA, IDE, ...) -----Original Message----- From: Dave Chinner <david@fromorbit.com> Sent: Tuesday, March 13, 2018 12:11 AM To: Robert Dorr <rdorr@microsoft.com> Cc: Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; Christoph Hellwig <hch@lst.de>; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Jan Kara <jack@suse.cz>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes [This sender failed our fraud detection checks and may not be who they appear to be. Learn about spoofing at http://aka.ms/LearnAboutSpoofing] On Tue, Mar 13, 2018 at 12:15:28AM +0000, Robert Dorr wrote: > Hello all. I have a couple of follow-up questions around this effort, > thanks you all for all your kind inputs, patience and knowledge > transfer. > > 1. How does xfs or ext4 make sure a pattern of WS followed by > FWS does not allow the write (WS) completion to be visible before the > flush completes? I'm not sure what you are asking here. You need to be more precise about what these IOs are, who dispatched them and what their dependencies are. Where exactly did that FWS (REQ_FLUSH) come from? I think, though, you're asking questions about IO ordering at the wrong level - filesystems serialise and order IO, not the block layer. Hence what you see at the block layer is not necessary a reflection of the ordering the filesystem is doing. (I've already explained this earlier today in a different thread: https://na01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmarc.info%2F%3Fl%3Dlinux-xfs%26m%3D152091489100831%26w%3D2&data=04%7C01%7Crdorr%40microsoft.com%7C19bb64450eba4b0f2c8308d588a0c4e1%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C636565146506974722%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwifQ%3D%3D%7C100&sdata=o0CgiPt1dsfALFS5fXHRGy2jlD1t4HIBjjqD%2BqOKnb4%3D&reserved=0) That's why I ask about the operation causing a REQ_FLUSH to be issued to the storage device, as that cannot be directly issued from userspace. It will occur as a side effect of a data integrity operation the filesystem is asked to perform, but without knowing the relationship between the integrity operation and the write in question an answer cannot be given. It may would be better to describe your IO ordering and integrity requirements at a higher level (e.g. the syscall layer), because then we know what you are trying to acheive rather that trying to understand your problem from context-less questions about "IO barriers" that don't actually exist... > I suspected the write was held in > iomap_dio_complete_work but with the generic_write_sync change in the > patch would a O_DSYNC write request to a DpoFua=0 block queue allow T2 > to see the completion via io_getevents before T1 completed the actual > flush? Yes, that can happen as concurrent data direct IO are not serialised against each other and will always race to completion without providing any ordering guarantees. IOWs, if you have an IO ordering dependency in your application, then that ordering dependency needs to be handled in the application. > 2. How will my application be able to dynamically determine if > xfs and ext4 have the performance enhancement for FUA or I need engage > alternate methods to use fsync/fdatasync at strategic locations? You don't. The filesystem will provide the same integrity guarantees in either case - FUA is just a performance optimisation that will get used if your hardware supports it. Applications should not care what capabilities the storage hardawre has - the kernel should do what is fastest and most reliable for the underlying storage.... > 3. Are there any plans yet to optimize ext4 as well? Not from me. > 4. Before the patched code the xfs_file_write_iter would call > generic_write_sync and that calls submit_io_wait. Does this hold the > thread issuing the io_submit so it is unable to drive more async I/O? No, -EIOCBQUEUED is returned to avoid blocking. AIO calls generic_write_sync() from the IO completion path via a worker thread so it's all done asynchronously. Cheers, Dave. -- Dave Chinner david@fromorbit.com -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> I think the answer is the iomap* logic makes sure to issue generic_write_sync for O_DSYNC W1 after W1 is completed by hardware and then waits for completion of the flush request (REQ_PREFLUSH) before W1 is returned to the AIO completion ring, preventing io_getevents from processing W1 before the flush occurs and completes. I just need proper confirmation from the experts on this code that this is the expected behavior. Yes. that is the case. > For SQL Server using O_DIRECT | O_DSYNC on current kernels is very performance impacting. Instead we enable a mode for SQL that opens O_DIRECT only and issues fsync/fdatasync when we are hardening log files or checkpointing data files. This reduces the write, flush, write, flush pattern allowing for write, write, write, ... then flush as we only issue flush requests when required to maintain the data integrity boundaries of SQL Server. The performance is significantly better then the device flush for each write as you can imagine. > > Testing shows the FUA enhancement is better then the write, flush pattern. For SQL Server we want to dynamically open with O_DIRECT | O_DSYNC when REQ_FUA can be properly used and open with O_DIRECT and leverage SQL Server's alternate flush scheme when running on older kernel or a system that does not support FUA (SATA, IDE, ...) There is no really good way to figure this out except for benchmarking, given that the FUA use an implementation detail. Btw, another feature that might be interesting to you is the RWF_DSYNC flag to the pwritev2 syscall, which allows to apply O_DSYNC semantics on a per-I/O basis instead of using it at open time. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
I tried the RWF_ODSYNC and xfs seems to honor properly but it causes ext4 to go spin out a single CPU and hang the system.
On Tue 13-03-18 18:52:57, Robert Dorr wrote: > I tried the RWF_ODSYNC and xfs seems to honor properly but it causes ext4 > to go spin out a single CPU and hang the system.
Awesome news on the spin. Are you going to be able to make changes to EXT4 to accommodate REQ_FUA without generic_write_flush to improve performance like was done for xfs? -----Original Message----- From: Jan Kara <jack@suse.cz> Sent: Monday, March 19, 2018 11:07 AM To: Robert Dorr <rdorr@microsoft.com> Cc: Christoph Hellwig <hch@lst.de>; Dave Chinner <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Jan Kara <jack@suse.cz>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes On Tue 13-03-18 18:52:57, Robert Dorr wrote: > I tried the RWF_ODSYNC and xfs seems to honor properly but it causes > ext4 to go spin out a single CPU and hang the system.
SmFuIC0gYXJlIHlvdSBhYmxlIHRvIG1ha2UgdGhlIGV4dDQgcGVyZm9ybWFuY2UgY2hhbmdlcz8N Cg0KDQotLS0tLU9yaWdpbmFsIE1lc3NhZ2UtLS0tLQ0KRnJvbTogUm9iZXJ0IERvcnIgDQpTZW50 OiBNb25kYXksIE1hcmNoIDE5LCAyMDE4IDExOjE0IEFNDQpUbzogSmFuIEthcmEgPGphY2tAc3Vz ZS5jej4NCkNjOiBDaHJpc3RvcGggSGVsbHdpZyA8aGNoQGxzdC5kZT47IERhdmUgQ2hpbm5lciA8 ZGF2aWRAZnJvbW9yYml0LmNvbT47IERhbiBXaWxsaWFtcyA8ZGFuLmoud2lsbGlhbXNAaW50ZWwu Y29tPjsgbGludXgteGZzQHZnZXIua2VybmVsLm9yZzsgbGludXgtZnNkZXZlbCA8bGludXgtZnNk ZXZlbEB2Z2VyLmtlcm5lbC5vcmc+OyBUaGVvZG9yZSBUcydvIDx0eXRzb0BtaXQuZWR1PjsgTWF0 dGhldyBXaWxjb3ggPG1hd2lsY294QG1pY3Jvc29mdC5jb20+OyBTY290dCBLb25lcnNtYW5uIDxz Y290dGtvbkBtaWNyb3NvZnQuY29tPjsgU2xhdmEgT2tzIDxzbGF2YW9AbWljcm9zb2Z0LmNvbT47 IEphc3JhaiBEYW5nZSA8amFzcmFqZEBtaWNyb3NvZnQuY29tPjsgTWljaGFlbCBOZWxzb24gPG1p Y25AbWljcm9zb2Z0LmNvbT4NClN1YmplY3Q6IFJFOiBbUEFUQ0hdIFtSRkNdIGlvbWFwOiBVc2Ug RlVBIGZvciBwdXJlIGRhdGEgT19EU1lOQyBESU8gd3JpdGVzDQoNCkF3ZXNvbWUgbmV3cyBvbiB0 aGUgc3Bpbi4gICANCg0KQXJlIHlvdSBnb2luZyB0byBiZSBhYmxlIHRvIG1ha2UgY2hhbmdlcyB0 byBFWFQ0IHRvIGFjY29tbW9kYXRlIFJFUV9GVUEgd2l0aG91dCBnZW5lcmljX3dyaXRlX2ZsdXNo IHRvIGltcHJvdmUgcGVyZm9ybWFuY2UgbGlrZSB3YXMgZG9uZSBmb3IgeGZzPw0KDQotLS0tLU9y aWdpbmFsIE1lc3NhZ2UtLS0tLQ0KRnJvbTogSmFuIEthcmEgPGphY2tAc3VzZS5jej4NClNlbnQ6 IE1vbmRheSwgTWFyY2ggMTksIDIwMTggMTE6MDcgQU0NClRvOiBSb2JlcnQgRG9yciA8cmRvcnJA bWljcm9zb2Z0LmNvbT4NCkNjOiBDaHJpc3RvcGggSGVsbHdpZyA8aGNoQGxzdC5kZT47IERhdmUg Q2hpbm5lciA8ZGF2aWRAZnJvbW9yYml0LmNvbT47IERhbiBXaWxsaWFtcyA8ZGFuLmoud2lsbGlh bXNAaW50ZWwuY29tPjsgbGludXgteGZzQHZnZXIua2VybmVsLm9yZzsgbGludXgtZnNkZXZlbCA8 bGludXgtZnNkZXZlbEB2Z2VyLmtlcm5lbC5vcmc+OyBKYW4gS2FyYSA8amFja0BzdXNlLmN6Pjsg VGhlb2RvcmUgVHMnbyA8dHl0c29AbWl0LmVkdT47IE1hdHRoZXcgV2lsY294IDxtYXdpbGNveEBt aWNyb3NvZnQuY29tPjsgU2NvdHQgS29uZXJzbWFubiA8c2NvdHRrb25AbWljcm9zb2Z0LmNvbT47 IFNsYXZhIE9rcyA8c2xhdmFvQG1pY3Jvc29mdC5jb20+OyBKYXNyYWogRGFuZ2UgPGphc3JhamRA bWljcm9zb2Z0LmNvbT47IE1pY2hhZWwgTmVsc29uIDxtaWNuQG1pY3Jvc29mdC5jb20+DQpTdWJq ZWN0OiBSZTogW1BBVENIXSBbUkZDXSBpb21hcDogVXNlIEZVQSBmb3IgcHVyZSBkYXRhIE9fRFNZ TkMgRElPIHdyaXRlcw0KDQpPbiBUdWUgMTMtMDMtMTggMTg6NTI6NTcsIFJvYmVydCBEb3JyIHdy b3RlOg0KPiBJIHRyaWVkIHRoZSBSV0ZfT0RTWU5DIGFuZCB4ZnMgc2VlbXMgdG8gaG9ub3IgcHJv cGVybHkgYnV0IGl0IGNhdXNlcw0KPiBleHQ0IHRvIGdvIHNwaW4gb3V0IGEgc2luZ2xlIENQVSBh bmQgaGFuZyB0aGUgc3lzdGVtLiDwn5iKDQoNClllYWgsIHRoYXQncyBhIGJ1ZyBpbiBnZW5lcmlj IGRpcmVjdCBJTyBjb2RlIHRoYXQgSSd2ZSBqdXN0IHJlY2VudGx5IGZpeGVkDQooZDljMTBlNWI4 ODYzICJkaXJlY3QtaW86IEZpeCBzbGVlcCBpbiBhdG9taWMgZHVlIHRvIHN5bmMgQUlPIiBpbiA0 LjE2LXJjNCkuDQoNCgkJCQkJCQkJSG9uemENCg0KPiANCj4gDQo+IC0tLS0tT3JpZ2luYWwgTWVz c2FnZS0tLS0tDQo+IEZyb206IENocmlzdG9waCBIZWxsd2lnIDxoY2hAbHN0LmRlPg0KPiBTZW50 OiBUdWVzZGF5LCBNYXJjaCAxMywgMjAxOCAxMToxMyBBTQ0KPiBUbzogUm9iZXJ0IERvcnIgPHJk b3JyQG1pY3Jvc29mdC5jb20+DQo+IENjOiBEYXZlIENoaW5uZXIgPGRhdmlkQGZyb21vcmJpdC5j b20+OyBEYW4gV2lsbGlhbXMgDQo+IDxkYW4uai53aWxsaWFtc0BpbnRlbC5jb20+OyBsaW51eC14 ZnNAdmdlci5rZXJuZWwub3JnOyBDaHJpc3RvcGggDQo+IEhlbGx3aWcgPGhjaEBsc3QuZGU+OyBs aW51eC1mc2RldmVsIDxsaW51eC1mc2RldmVsQHZnZXIua2VybmVsLm9yZz47IA0KPiBKYW4gS2Fy YSA8amFja0BzdXNlLmN6PjsgVGhlb2RvcmUgVHMnbyA8dHl0c29AbWl0LmVkdT47IE1hdHRoZXcg V2lsY294IA0KPiA8bWF3aWxjb3hAbWljcm9zb2Z0LmNvbT47IFNjb3R0IEtvbmVyc21hbm4gPHNj b3R0a29uQG1pY3Jvc29mdC5jb20+OyANCj4gU2xhdmEgT2tzIDxzbGF2YW9AbWljcm9zb2Z0LmNv bT47IEphc3JhaiBEYW5nZSANCj4gPGphc3JhamRAbWljcm9zb2Z0LmNvbT47IE1pY2hhZWwgTmVs c29uIDxtaWNuQG1pY3Jvc29mdC5jb20+DQo+IFN1YmplY3Q6IFJlOiBbUEFUQ0hdIFtSRkNdIGlv bWFwOiBVc2UgRlVBIGZvciBwdXJlIGRhdGEgT19EU1lOQyBESU8gDQo+IHdyaXRlcw0KPiANCj4g PiBJIHRoaW5rIHRoZSBhbnN3ZXIgaXMgdGhlIGlvbWFwKiBsb2dpYyBtYWtlcyBzdXJlIHRvIGlz c3VlIGdlbmVyaWNfd3JpdGVfc3luYyBmb3IgT19EU1lOQyBXMSBhZnRlciBXMSBpcyBjb21wbGV0 ZWQgYnkgaGFyZHdhcmUgYW5kIHRoZW4gd2FpdHMgZm9yIGNvbXBsZXRpb24gb2YgdGhlIGZsdXNo IHJlcXVlc3QgKFJFUV9QUkVGTFVTSCkgYmVmb3JlIFcxIGlzIHJldHVybmVkIHRvIHRoZSBBSU8g Y29tcGxldGlvbiByaW5nLCBwcmV2ZW50aW5nIGlvX2dldGV2ZW50cyBmcm9tIHByb2Nlc3Npbmcg VzEgYmVmb3JlIHRoZSBmbHVzaCBvY2N1cnMgYW5kIGNvbXBsZXRlcy4gICBJIGp1c3QgbmVlZCBw cm9wZXIgY29uZmlybWF0aW9uIGZyb20gdGhlIGV4cGVydHMgb24gdGhpcyBjb2RlIHRoYXQgdGhp cyBpcyB0aGUgZXhwZWN0ZWQgYmVoYXZpb3IuDQo+IA0KPiBZZXMuIHRoYXQgaXMgdGhlIGNhc2Uu DQo+IA0KPiA+IEZvciBTUUwgU2VydmVyIHVzaW5nIE9fRElSRUNUIHwgT19EU1lOQyBvbiBjdXJy ZW50IGtlcm5lbHMgaXMgdmVyeSBwZXJmb3JtYW5jZSBpbXBhY3RpbmcuICAgSW5zdGVhZCB3ZSBl bmFibGUgYSBtb2RlIGZvciBTUUwgdGhhdCBvcGVucyBPX0RJUkVDVCBvbmx5IGFuZCBpc3N1ZXMg ZnN5bmMvZmRhdGFzeW5jIHdoZW4gd2UgYXJlIGhhcmRlbmluZyBsb2cgZmlsZXMgb3IgY2hlY2tw b2ludGluZyBkYXRhIGZpbGVzLiAgIFRoaXMgcmVkdWNlcyB0aGUgd3JpdGUsIGZsdXNoLCB3cml0 ZSwgZmx1c2ggcGF0dGVybiBhbGxvd2luZyBmb3Igd3JpdGUsIHdyaXRlLCB3cml0ZSwgLi4uIHRo ZW4gZmx1c2ggYXMgd2Ugb25seSBpc3N1ZSBmbHVzaCByZXF1ZXN0cyB3aGVuIHJlcXVpcmVkIHRv IG1haW50YWluIHRoZSBkYXRhIGludGVncml0eSBib3VuZGFyaWVzIG9mIFNRTCBTZXJ2ZXIuICBU aGUgcGVyZm9ybWFuY2UgaXMgc2lnbmlmaWNhbnRseSBiZXR0ZXIgdGhlbiB0aGUgZGV2aWNlIGZs dXNoIGZvciBlYWNoIHdyaXRlIGFzIHlvdSBjYW4gaW1hZ2luZS4NCj4gPiANCj4gPiBUZXN0aW5n IHNob3dzIHRoZSBGVUEgZW5oYW5jZW1lbnQgaXMgYmV0dGVyIHRoZW4gdGhlIHdyaXRlLCBmbHVz aCBwYXR0ZXJuLiAgIEZvciBTUUwgU2VydmVyIHdlIHdhbnQgdG8gZHluYW1pY2FsbHkgb3BlbiB3 aXRoIE9fRElSRUNUIHwgT19EU1lOQyB3aGVuIFJFUV9GVUEgY2FuIGJlIHByb3Blcmx5IHVzZWQg YW5kIG9wZW4gd2l0aCBPX0RJUkVDVCBhbmQgbGV2ZXJhZ2UgU1FMIFNlcnZlcidzIGFsdGVybmF0 ZSBmbHVzaCBzY2hlbWUgd2hlbiBydW5uaW5nIG9uIG9sZGVyIGtlcm5lbCBvciBhIHN5c3RlbSB0 aGF0IGRvZXMgbm90IHN1cHBvcnQgRlVBIChTQVRBLCBJREUsIC4uLikNCj4gDQo+IFRoZXJlIGlz IG5vIHJlYWxseSBnb29kIHdheSB0byBmaWd1cmUgdGhpcyBvdXQgZXhjZXB0IGZvciBiZW5jaG1h cmtpbmcsIGdpdmVuIHRoYXQgdGhlIEZVQSB1c2UgYW4gaW1wbGVtZW50YXRpb24gZGV0YWlsLg0K PiANCj4gQnR3LCBhbm90aGVyIGZlYXR1cmUgdGhhdCBtaWdodCBiZSBpbnRlcmVzdGluZyB0byB5 b3UgaXMgdGhlIFJXRl9EU1lOQyBmbGFnIHRvIHRoZSBwd3JpdGV2MiBzeXNjYWxsLCB3aGljaCBh bGxvd3MgdG8gYXBwbHkgT19EU1lOQyBzZW1hbnRpY3Mgb24gYSBwZXItSS9PIGJhc2lzIGluc3Rl YWQgb2YgdXNpbmcgaXQgYXQgb3BlbiB0aW1lLg0KLS0NCkphbiBLYXJhIDxqYWNrQHN1c2UuY29t Pg0KU1VTRSBMYWJzLCBDUg0K -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon 19-03-18 16:14:16, Robert Dorr wrote: > Awesome news on the spin. > > Are you going to be able to make changes to EXT4 to accommodate REQ_FUA > without generic_write_flush to improve performance like was done for xfs? I'm currently on a sick leave so I'm slow on replies. Sorry. I don't see a reason why ext4 could not have the same optimization as XFS for that case of direct IO. However I'm not sure when I get to implementing that. Honza > -----Original Message----- > From: Jan Kara <jack@suse.cz> > Sent: Monday, March 19, 2018 11:07 AM > To: Robert Dorr <rdorr@microsoft.com> > Cc: Christoph Hellwig <hch@lst.de>; Dave Chinner <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Jan Kara <jack@suse.cz>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> > Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes > > On Tue 13-03-18 18:52:57, Robert Dorr wrote: > > I tried the RWF_ODSYNC and xfs seems to honor properly but it causes > > ext4 to go spin out a single CPU and hang the system.
Sorry to hear that. Get well soon. Thanks for the response. -----Original Message----- From: Jan Kara <jack@suse.cz> Sent: Thursday, March 22, 2018 9:36 AM To: Robert Dorr <rdorr@microsoft.com> Cc: Jan Kara <jack@suse.cz>; Christoph Hellwig <hch@lst.de>; Dave Chinner <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes On Mon 19-03-18 16:14:16, Robert Dorr wrote: > Awesome news on the spin. > > Are you going to be able to make changes to EXT4 to accommodate > REQ_FUA without generic_write_flush to improve performance like was done for xfs? I'm currently on a sick leave so I'm slow on replies. Sorry. I don't see a reason why ext4 could not have the same optimization as XFS for that case of direct IO. However I'm not sure when I get to implementing that. Honza > -----Original Message----- > From: Jan Kara <jack@suse.cz> > Sent: Monday, March 19, 2018 11:07 AM > To: Robert Dorr <rdorr@microsoft.com> > Cc: Christoph Hellwig <hch@lst.de>; Dave Chinner > <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; > linux-xfs@vger.kernel.org; linux-fsdevel > <linux-fsdevel@vger.kernel.org>; Jan Kara <jack@suse.cz>; Theodore > Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott > Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; > Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson > <micn@microsoft.com> > Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO > writes > > On Tue 13-03-18 18:52:57, Robert Dorr wrote: > > I tried the RWF_ODSYNC and xfs seems to honor properly but it causes > > ext4 to go spin out a single CPU and hang the system.
SG9wZSB5b3UgYXJlIGZlZWxpbmcgYmV0dGVyLg0KDQoxLiBIb3cgY2FuIHdlIGVmZmVjdCBzaW1p bGFyIGNoYW5nZXMgaW4gZXh0ND8NCjIuIEhvdyBkbyB3ZSBnZXQgdGhlc2UgY2hhbmdlcyBwdXNo ZWQgaW50byBhbGwgY29tbW9uIHJlbGVhc2UgYnVpbGRzIGZvciBib3RoIHhmcyBvciBleHQ0IG9w dGltaXphdGlvbnM/DQoNCldlIGFyZSBzZWVpbmcgZHJhbWF0aWMgcGVyZm9ybWFuY2UgaW5jcmVh c2VzIHdpdGggdGhlIHhmcyBmaXhlcy4gICBodHRwczovL2J1Z3ppbGxhLnJlZGhhdC5jb20vc2hv d19idWcuY2dpP2lkPTE1NDg1MjQNCg0KNCBzb2NrZXQsIFRQQ0MNCjksMTA4IOKAkyBPbGQga2Vy bmVsDQozNiw2MTgg4oCTIFBhdGNoZWQga2VybmVsDQoNCjIgc29ja2V0LCBUUENDDQo5LDYwNiAt IE9sZCBrZXJuZWwNCjE4LDg0MCAtIFBhdGNoZWQga2VybmVsDQoNCi0tLS0tLS0tLS0tLS0tLS0t LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0t LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQotLSBUaW1lcyBpbiBtaWxsaS1zZWNvbmRzDQotLSB3 dD1TUUwgd3JpdGV0aHJvdWdoIHNldHRpbmcNCi0tIGF3dD1TUUwgYWx0ZXJuYXRld3JpdGV0aHJv dWdoIHNldHRpbmcNCi0tIC1UMzk3OT1TUUwgb3BlbiB3aXRoIEZ1YSBiZWhhdmlvcg0KLS0gLVQz OTgyPVNRTCBvcGVucyB3aXRob3V0IEZ1YSBhbmQgdXNlIGZzeW5jIChmb3JjZWQgZmx1c2gpIGJl aGF2aW9yDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0t LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KLS0NCi0t CUxpbnV4IHBhdGNoZWQga2VybmVsIGZvciBYRlMgRlVBIG9wdHMNCi0tICBTUUwgU2VydmVyIDIw MTcgQ1U2DQotLSAgYmxrdHJhY2UgYWN0aXZlDQotLSAgQ2VudG9zNyAtIDRwcm9jLCA4R0IgcmFt IHJ1bm5pbmcgb24gSHlwZXItVg0KLS0gIFdpbjEwLCA4cHJvYywgMzJHQiByYW0gaG9zdA0KLS0g IFNhbXN1bmcgU1NEIGRyaXZlDQotLSAgQ2xlYW4gU1FMIFNlcnZlciByZXN0YXJ0cw0KLS0tLS0t LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0t LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCi0tICBQQVRDSEVEIEtFUk5FTCA0 LjE2LngNCi0tCSAgICAgCQkJCQkJCQlDcmVhdGUgREIgSW5zZXJ0cyBDaGVja3BvaW50IE9wdGlv bnMNCi0tICAtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLSAtLS0tLS0tLS0gLS0t LS0tLSAtLS0tLS0tLS0tIC0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQotLSAg RGVmYXVsdCAoLVQzOTgyID0gZm9yY2VkIGZsdXNoKQkJMzc4NyAgICAgIDI5ODE0ICAgNDcJCSBP X0RJUkVDVCBhbmQgZnN5bmMgZnJvbSBTUUwNCi0tICAtVDM5NzkgYW5kIHd0PTEgYXd0PTEJCQkg ICAgNDIyNAkgIDI1ODgwICAgMzAgICAgICAgICBPX0RJUkVDVCBhbmQgc3BlY2lhbGl6ZWQgZmRh dGFzeW5jICAoQmVzdCB3aXRob3V0IEZVQSBpcyB+MTB4IHNsb3dlciB0aGFuIEZVQSkNCi0tDQot LSAgLVQzOTc5IGFuZCB3dD0wCQkJCQkJMjg2MAkgIDI1NzAJICAxMwkJIE9fRElSRUNUIA0KLS0g IC1UMzk3OSBhbmQgd3Q9MSBhd3Q9MAkJCQk0MjA0CSAgMjU3MwkgIDE3CQkgT19ESVJFQ1QgfCBP X0RTWU5DIC0gSW5zZXJ0cyBhbmQgY2hlY2twb2ludCBvbiBhbGxvY2F0ZWQgc3BhY2UgbmVhcnMg T19ESVJFQ1Qgb25seSAgPC0tLSAhISEgREVTSVJFRCBCRUhBVklPUiAhISENCi0tICAtVDM5Nzkg YW5kIHd0PTEgYXd0PTAgTk8gU1RBTVAJCTQxMjcJICAyODIzCSAgMTcJCSBPX0RJUkVDVCB8IE9f RFNZTkMgLSBJc3N1ZXMgaW50ZXJuYWwgZmx1c2hzIGZvciAxc3Qgd3JpdGVzLCBjcmVhdGUgZmFz dGVyIGJ1dCBpbnNlcnRzIHNsb3dlcg0KLS0NCi0tICBVTlBBVENIRUQgS0VSTkVMIDMueA0KLS0J ICAgICAJCQkJCQkJCUNyZWF0ZSBEQiBJbnNlcnRzIENoZWNrcG9pbnQgT3B0aW9ucw0KLS0gIC0t LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tIC0tLS0tLS0tLSAtLS0tLS0tIC0tLS0t LS0tLS0gLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCi0tICBEZWZhdWx0ICgt VDM5ODIgPSBmb3JjZWQgZmx1c2gpCQkzMDk3ICAgICAgMjk3MDAgICAyNgkJIE9fRElSRUNUIGFu ZCBmc3luYyBmcm9tIFNRTA0KLS0gIC1UMzk3OSBhbmQgd3Q9MSBhd3Q9MQkJCSAgICA0MjEwCSAg MjU4MDAgICAyNyAgICAgICAgIE9fRElSRUNUIGFuZCBzcGVjaWFsaXplZCBmZGF0YXN5bmMNCi0t DQotLSAgLVQzOTc5IGFuZCB3dD0wCQkJCQkJMjg4MAkgIDIxMzQJICA2CQkJIE9fRElSRUNUIA0K LS0gIC1UMzk3OSBhbmQgd3Q9MSBhd3Q9MCAJCSAgICAgICAgNDQ0MwkgIDI1MjkwCSAgNDIJCSBP X0RJUkVDVCB8IE9fRFNZTkMNCi0tICAtVDM5NzkgYW5kIHd0PTEgYXd0PTAgTk8gU1RBTVAJCTQx MjAJICAyNTY4MAkgIDQwCQkgT19ESVJFQ1QgfCBPX0RTWU5DDQotLQ0KDQoNCg0KDQotLS0tLU9y aWdpbmFsIE1lc3NhZ2UtLS0tLQ0KRnJvbTogUm9iZXJ0IERvcnIgDQpTZW50OiBUaHVyc2RheSwg TWFyY2ggMjIsIDIwMTggNzozOCBBTQ0KVG86IEphbiBLYXJhIDxqYWNrQHN1c2UuY3o+DQpDYzog Q2hyaXN0b3BoIEhlbGx3aWcgPGhjaEBsc3QuZGU+OyBEYXZlIENoaW5uZXIgPGRhdmlkQGZyb21v cmJpdC5jb20+OyBEYW4gV2lsbGlhbXMgPGRhbi5qLndpbGxpYW1zQGludGVsLmNvbT47IGxpbnV4 LXhmc0B2Z2VyLmtlcm5lbC5vcmc7IGxpbnV4LWZzZGV2ZWwgPGxpbnV4LWZzZGV2ZWxAdmdlci5r ZXJuZWwub3JnPjsgVGhlb2RvcmUgVHMnbyA8dHl0c29AbWl0LmVkdT47IE1hdHRoZXcgV2lsY294 IDxtYXdpbGNveEBtaWNyb3NvZnQuY29tPjsgU2NvdHQgS29uZXJzbWFubiA8c2NvdHRrb25AbWlj cm9zb2Z0LmNvbT47IFNsYXZhIE9rcyA8c2xhdmFvQG1pY3Jvc29mdC5jb20+OyBKYXNyYWogRGFu Z2UgPGphc3JhamRAbWljcm9zb2Z0LmNvbT47IE1pY2hhZWwgTmVsc29uIDxtaWNuQG1pY3Jvc29m dC5jb20+DQpTdWJqZWN0OiBSRTogW1BBVENIXSBbUkZDXSBpb21hcDogVXNlIEZVQSBmb3IgcHVy ZSBkYXRhIE9fRFNZTkMgRElPIHdyaXRlcw0KDQpTb3JyeSB0byBoZWFyIHRoYXQuICBHZXQgd2Vs bCBzb29uLg0KDQpUaGFua3MgZm9yIHRoZSByZXNwb25zZS4NCg0KDQotLS0tLU9yaWdpbmFsIE1l c3NhZ2UtLS0tLQ0KRnJvbTogSmFuIEthcmEgPGphY2tAc3VzZS5jej4NClNlbnQ6IFRodXJzZGF5 LCBNYXJjaCAyMiwgMjAxOCA5OjM2IEFNDQpUbzogUm9iZXJ0IERvcnIgPHJkb3JyQG1pY3Jvc29m dC5jb20+DQpDYzogSmFuIEthcmEgPGphY2tAc3VzZS5jej47IENocmlzdG9waCBIZWxsd2lnIDxo Y2hAbHN0LmRlPjsgRGF2ZSBDaGlubmVyIDxkYXZpZEBmcm9tb3JiaXQuY29tPjsgRGFuIFdpbGxp YW1zIDxkYW4uai53aWxsaWFtc0BpbnRlbC5jb20+OyBsaW51eC14ZnNAdmdlci5rZXJuZWwub3Jn OyBsaW51eC1mc2RldmVsIDxsaW51eC1mc2RldmVsQHZnZXIua2VybmVsLm9yZz47IFRoZW9kb3Jl IFRzJ28gPHR5dHNvQG1pdC5lZHU+OyBNYXR0aGV3IFdpbGNveCA8bWF3aWxjb3hAbWljcm9zb2Z0 LmNvbT47IFNjb3R0IEtvbmVyc21hbm4gPHNjb3R0a29uQG1pY3Jvc29mdC5jb20+OyBTbGF2YSBP a3MgPHNsYXZhb0BtaWNyb3NvZnQuY29tPjsgSmFzcmFqIERhbmdlIDxqYXNyYWpkQG1pY3Jvc29m dC5jb20+OyBNaWNoYWVsIE5lbHNvbiA8bWljbkBtaWNyb3NvZnQuY29tPg0KU3ViamVjdDogUmU6 IFtQQVRDSF0gW1JGQ10gaW9tYXA6IFVzZSBGVUEgZm9yIHB1cmUgZGF0YSBPX0RTWU5DIERJTyB3 cml0ZXMNCg0KT24gTW9uIDE5LTAzLTE4IDE2OjE0OjE2LCBSb2JlcnQgRG9yciB3cm90ZToNCj4g QXdlc29tZSBuZXdzIG9uIHRoZSBzcGluLiAgIA0KPiANCj4gQXJlIHlvdSBnb2luZyB0byBiZSBh YmxlIHRvIG1ha2UgY2hhbmdlcyB0byBFWFQ0IHRvIGFjY29tbW9kYXRlIA0KPiBSRVFfRlVBIHdp dGhvdXQgZ2VuZXJpY193cml0ZV9mbHVzaCB0byBpbXByb3ZlIHBlcmZvcm1hbmNlIGxpa2Ugd2Fz IGRvbmUgZm9yIHhmcz8NCg0KSSdtIGN1cnJlbnRseSBvbiBhIHNpY2sgbGVhdmUgc28gSSdtIHNs b3cgb24gcmVwbGllcy4gU29ycnkuIEkgZG9uJ3Qgc2VlIGEgcmVhc29uIHdoeSBleHQ0IGNvdWxk IG5vdCBoYXZlIHRoZSBzYW1lIG9wdGltaXphdGlvbiBhcyBYRlMgZm9yIHRoYXQgY2FzZSBvZiBk aXJlY3QgSU8uIEhvd2V2ZXIgSSdtIG5vdCBzdXJlIHdoZW4gSSBnZXQgdG8gaW1wbGVtZW50aW5n IHRoYXQuDQoNCgkJCQkJCQkJSG9uemENCg0KPiAtLS0tLU9yaWdpbmFsIE1lc3NhZ2UtLS0tLQ0K PiBGcm9tOiBKYW4gS2FyYSA8amFja0BzdXNlLmN6Pg0KPiBTZW50OiBNb25kYXksIE1hcmNoIDE5 LCAyMDE4IDExOjA3IEFNDQo+IFRvOiBSb2JlcnQgRG9yciA8cmRvcnJAbWljcm9zb2Z0LmNvbT4N Cj4gQ2M6IENocmlzdG9waCBIZWxsd2lnIDxoY2hAbHN0LmRlPjsgRGF2ZSBDaGlubmVyIA0KPiA8 ZGF2aWRAZnJvbW9yYml0LmNvbT47IERhbiBXaWxsaWFtcyA8ZGFuLmoud2lsbGlhbXNAaW50ZWwu Y29tPjsgDQo+IGxpbnV4LXhmc0B2Z2VyLmtlcm5lbC5vcmc7IGxpbnV4LWZzZGV2ZWwgDQo+IDxs aW51eC1mc2RldmVsQHZnZXIua2VybmVsLm9yZz47IEphbiBLYXJhIDxqYWNrQHN1c2UuY3o+OyBU aGVvZG9yZSANCj4gVHMnbyA8dHl0c29AbWl0LmVkdT47IE1hdHRoZXcgV2lsY294IDxtYXdpbGNv eEBtaWNyb3NvZnQuY29tPjsgU2NvdHQgDQo+IEtvbmVyc21hbm4gPHNjb3R0a29uQG1pY3Jvc29m dC5jb20+OyBTbGF2YSBPa3MgPHNsYXZhb0BtaWNyb3NvZnQuY29tPjsgDQo+IEphc3JhaiBEYW5n ZSA8amFzcmFqZEBtaWNyb3NvZnQuY29tPjsgTWljaGFlbCBOZWxzb24gDQo+IDxtaWNuQG1pY3Jv c29mdC5jb20+DQo+IFN1YmplY3Q6IFJlOiBbUEFUQ0hdIFtSRkNdIGlvbWFwOiBVc2UgRlVBIGZv ciBwdXJlIGRhdGEgT19EU1lOQyBESU8gDQo+IHdyaXRlcw0KPiANCj4gT24gVHVlIDEzLTAzLTE4 IDE4OjUyOjU3LCBSb2JlcnQgRG9yciB3cm90ZToNCj4gPiBJIHRyaWVkIHRoZSBSV0ZfT0RTWU5D IGFuZCB4ZnMgc2VlbXMgdG8gaG9ub3IgcHJvcGVybHkgYnV0IGl0IGNhdXNlcw0KPiA+IGV4dDQg dG8gZ28gc3BpbiBvdXQgYSBzaW5nbGUgQ1BVIGFuZCBoYW5nIHRoZSBzeXN0ZW0uIPCfmIoNCj4g DQo+IFllYWgsIHRoYXQncyBhIGJ1ZyBpbiBnZW5lcmljIGRpcmVjdCBJTyBjb2RlIHRoYXQgSSd2 ZSBqdXN0IHJlY2VudGx5IA0KPiBmaXhlZA0KPiAoZDljMTBlNWI4ODYzICJkaXJlY3QtaW86IEZp eCBzbGVlcCBpbiBhdG9taWMgZHVlIHRvIHN5bmMgQUlPIiBpbiA0LjE2LXJjNCkuDQo+IA0KPiAJ CQkJCQkJCUhvbnphDQo+IA0KPiA+IA0KPiA+IA0KPiA+IC0tLS0tT3JpZ2luYWwgTWVzc2FnZS0t LS0tDQo+ID4gRnJvbTogQ2hyaXN0b3BoIEhlbGx3aWcgPGhjaEBsc3QuZGU+DQo+ID4gU2VudDog VHVlc2RheSwgTWFyY2ggMTMsIDIwMTggMTE6MTMgQU0NCj4gPiBUbzogUm9iZXJ0IERvcnIgPHJk b3JyQG1pY3Jvc29mdC5jb20+DQo+ID4gQ2M6IERhdmUgQ2hpbm5lciA8ZGF2aWRAZnJvbW9yYml0 LmNvbT47IERhbiBXaWxsaWFtcyANCj4gPiA8ZGFuLmoud2lsbGlhbXNAaW50ZWwuY29tPjsgbGlu dXgteGZzQHZnZXIua2VybmVsLm9yZzsgQ2hyaXN0b3BoIA0KPiA+IEhlbGx3aWcgPGhjaEBsc3Qu ZGU+OyBsaW51eC1mc2RldmVsIDxsaW51eC1mc2RldmVsQHZnZXIua2VybmVsLm9yZz47IA0KPiA+ IEphbiBLYXJhIDxqYWNrQHN1c2UuY3o+OyBUaGVvZG9yZSBUcydvIDx0eXRzb0BtaXQuZWR1Pjsg TWF0dGhldyANCj4gPiBXaWxjb3ggPG1hd2lsY294QG1pY3Jvc29mdC5jb20+OyBTY290dCBLb25l cnNtYW5uIA0KPiA+IDxzY290dGtvbkBtaWNyb3NvZnQuY29tPjsgU2xhdmEgT2tzIDxzbGF2YW9A bWljcm9zb2Z0LmNvbT47IEphc3JhaiANCj4gPiBEYW5nZSA8amFzcmFqZEBtaWNyb3NvZnQuY29t PjsgTWljaGFlbCBOZWxzb24gPG1pY25AbWljcm9zb2Z0LmNvbT4NCj4gPiBTdWJqZWN0OiBSZTog W1BBVENIXSBbUkZDXSBpb21hcDogVXNlIEZVQSBmb3IgcHVyZSBkYXRhIE9fRFNZTkMgRElPIA0K PiA+IHdyaXRlcw0KPiA+IA0KPiA+ID4gSSB0aGluayB0aGUgYW5zd2VyIGlzIHRoZSBpb21hcCog bG9naWMgbWFrZXMgc3VyZSB0byBpc3N1ZSBnZW5lcmljX3dyaXRlX3N5bmMgZm9yIE9fRFNZTkMg VzEgYWZ0ZXIgVzEgaXMgY29tcGxldGVkIGJ5IGhhcmR3YXJlIGFuZCB0aGVuIHdhaXRzIGZvciBj b21wbGV0aW9uIG9mIHRoZSBmbHVzaCByZXF1ZXN0IChSRVFfUFJFRkxVU0gpIGJlZm9yZSBXMSBp cyByZXR1cm5lZCB0byB0aGUgQUlPIGNvbXBsZXRpb24gcmluZywgcHJldmVudGluZyBpb19nZXRl dmVudHMgZnJvbSBwcm9jZXNzaW5nIFcxIGJlZm9yZSB0aGUgZmx1c2ggb2NjdXJzIGFuZCBjb21w bGV0ZXMuICAgSSBqdXN0IG5lZWQgcHJvcGVyIGNvbmZpcm1hdGlvbiBmcm9tIHRoZSBleHBlcnRz IG9uIHRoaXMgY29kZSB0aGF0IHRoaXMgaXMgdGhlIGV4cGVjdGVkIGJlaGF2aW9yLg0KPiA+IA0K PiA+IFllcy4gdGhhdCBpcyB0aGUgY2FzZS4NCj4gPiANCj4gPiA+IEZvciBTUUwgU2VydmVyIHVz aW5nIE9fRElSRUNUIHwgT19EU1lOQyBvbiBjdXJyZW50IGtlcm5lbHMgaXMgdmVyeSBwZXJmb3Jt YW5jZSBpbXBhY3RpbmcuICAgSW5zdGVhZCB3ZSBlbmFibGUgYSBtb2RlIGZvciBTUUwgdGhhdCBv cGVucyBPX0RJUkVDVCBvbmx5IGFuZCBpc3N1ZXMgZnN5bmMvZmRhdGFzeW5jIHdoZW4gd2UgYXJl IGhhcmRlbmluZyBsb2cgZmlsZXMgb3IgY2hlY2twb2ludGluZyBkYXRhIGZpbGVzLiAgIFRoaXMg cmVkdWNlcyB0aGUgd3JpdGUsIGZsdXNoLCB3cml0ZSwgZmx1c2ggcGF0dGVybiBhbGxvd2luZyBm b3Igd3JpdGUsIHdyaXRlLCB3cml0ZSwgLi4uIHRoZW4gZmx1c2ggYXMgd2Ugb25seSBpc3N1ZSBm bHVzaCByZXF1ZXN0cyB3aGVuIHJlcXVpcmVkIHRvIG1haW50YWluIHRoZSBkYXRhIGludGVncml0 eSBib3VuZGFyaWVzIG9mIFNRTCBTZXJ2ZXIuICBUaGUgcGVyZm9ybWFuY2UgaXMgc2lnbmlmaWNh bnRseSBiZXR0ZXIgdGhlbiB0aGUgZGV2aWNlIGZsdXNoIGZvciBlYWNoIHdyaXRlIGFzIHlvdSBj YW4gaW1hZ2luZS4NCj4gPiA+IA0KPiA+ID4gVGVzdGluZyBzaG93cyB0aGUgRlVBIGVuaGFuY2Vt ZW50IGlzIGJldHRlciB0aGVuIHRoZSB3cml0ZSwgZmx1c2ggcGF0dGVybi4gICBGb3IgU1FMIFNl cnZlciB3ZSB3YW50IHRvIGR5bmFtaWNhbGx5IG9wZW4gd2l0aCBPX0RJUkVDVCB8IE9fRFNZTkMg d2hlbiBSRVFfRlVBIGNhbiBiZSBwcm9wZXJseSB1c2VkIGFuZCBvcGVuIHdpdGggT19ESVJFQ1Qg YW5kIGxldmVyYWdlIFNRTCBTZXJ2ZXIncyBhbHRlcm5hdGUgZmx1c2ggc2NoZW1lIHdoZW4gcnVu bmluZyBvbiBvbGRlciBrZXJuZWwgb3IgYSBzeXN0ZW0gdGhhdCBkb2VzIG5vdCBzdXBwb3J0IEZV QSAoU0FUQSwgSURFLCAuLi4pDQo+ID4gDQo+ID4gVGhlcmUgaXMgbm8gcmVhbGx5IGdvb2Qgd2F5 IHRvIGZpZ3VyZSB0aGlzIG91dCBleGNlcHQgZm9yIGJlbmNobWFya2luZywgZ2l2ZW4gdGhhdCB0 aGUgRlVBIHVzZSBhbiBpbXBsZW1lbnRhdGlvbiBkZXRhaWwuDQo+ID4gDQo+ID4gQnR3LCBhbm90 aGVyIGZlYXR1cmUgdGhhdCBtaWdodCBiZSBpbnRlcmVzdGluZyB0byB5b3UgaXMgdGhlIFJXRl9E U1lOQyBmbGFnIHRvIHRoZSBwd3JpdGV2MiBzeXNjYWxsLCB3aGljaCBhbGxvd3MgdG8gYXBwbHkg T19EU1lOQyBzZW1hbnRpY3Mgb24gYSBwZXItSS9PIGJhc2lzIGluc3RlYWQgb2YgdXNpbmcgaXQg YXQgb3BlbiB0aW1lLg0KPiAtLQ0KPiBKYW4gS2FyYSA8amFja0BzdXNlLmNvbT4NCj4gU1VTRSBM YWJzLCBDUg0KLS0NCkphbiBLYXJhIDxqYWNrQHN1c2UuY29tPg0KU1VTRSBMYWJzLCBDUg0K -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 24.04.2018 17:09, Robert Dorr wrote: > Hope you are feeling better. > > 1. How can we effect similar changes in ext4? > 2. How do we get these changes pushed into all common release builds for both xfs or ext4 optimizations? > > We are seeing dramatic performance increases with the xfs fixes. https://bugzilla.redhat.com/show_bug.cgi?id=1548524 > > 4 socket, TPCC > 9,108 – Old kernel > 36,618 – Patched kernel > > 2 socket, TPCC > 9,606 - Old kernel > 18,840 - Patched kernel > > -------------------------------------------------------------------------------------------------- > -- Times in milli-seconds > -- wt=SQL writethrough setting > -- awt=SQL alternatewritethrough setting > -- -T3979=SQL open with Fua behavior > -- -T3982=SQL opens without Fua and use fsync (forced flush) behavior > -------------------------------------------------------------------------------------------------- > -- > -- Linux patched kernel for XFS FUA opts > -- SQL Server 2017 CU6 > -- blktrace active > -- Centos7 - 4proc, 8GB ram running on Hyper-V > -- Win10, 8proc, 32GB ram host > -- Samsung SSD drive > -- Clean SQL Server restarts > -------------------------------------------------------------------------------------------------- > -- PATCHED KERNEL 4.16.x > -- Create DB Inserts Checkpoint Options > -- ----------------------------------- --------- ------- ---------- ----------------------------------- > -- Default (-T3982 = forced flush) 3787 29814 47 O_DIRECT and fsync from SQL > -- -T3979 and wt=1 awt=1 4224 25880 30 O_DIRECT and specialized fdatasync (Best without FUA is ~10x slower than FUA) > -- > -- -T3979 and wt=0 2860 2570 13 O_DIRECT > -- -T3979 and wt=1 awt=0 4204 2573 17 O_DIRECT | O_DSYNC - Inserts and checkpoint on allocated space nears O_DIRECT only <--- !!! DESIRED BEHAVIOR !!! > -- -T3979 and wt=1 awt=0 NO STAMP 4127 2823 17 O_DIRECT | O_DSYNC - Issues internal flushs for 1st writes, create faster but inserts slower > -- > -- UNPATCHED KERNEL 3.x > -- Create DB Inserts Checkpoint Options > -- ----------------------------------- --------- ------- ---------- ----------------------------------- > -- Default (-T3982 = forced flush) 3097 29700 26 O_DIRECT and fsync from SQL > -- -T3979 and wt=1 awt=1 4210 25800 27 O_DIRECT and specialized fdatasync > -- > -- -T3979 and wt=0 2880 2134 6 O_DIRECT > -- -T3979 and wt=1 awt=0 4443 25290 42 O_DIRECT | O_DSYNC > -- -T3979 and wt=1 awt=0 NO STAMP 4120 25680 40 O_DIRECT | O_DSYNC > -- > So these number should be taken with a VERY BIG grain of salt. Not because the change is not valid but because at the very least you are comparing kernel 3.x with latest 4.16. It's unlikely that this report will be the thing that tips the scaler to either "implement" or "not" that for ext4. But at the very least show number for the same kernel with the only difference being the patch. > > > > -----Original Message----- > From: Robert Dorr > Sent: Thursday, March 22, 2018 7:38 AM > To: Jan Kara <jack@suse.cz> > Cc: Christoph Hellwig <hch@lst.de>; Dave Chinner <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> > Subject: RE: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes > > Sorry to hear that. Get well soon. > > Thanks for the response. > > > -----Original Message----- > From: Jan Kara <jack@suse.cz> > Sent: Thursday, March 22, 2018 9:36 AM > To: Robert Dorr <rdorr@microsoft.com> > Cc: Jan Kara <jack@suse.cz>; Christoph Hellwig <hch@lst.de>; Dave Chinner <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> > Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes > > On Mon 19-03-18 16:14:16, Robert Dorr wrote: >> Awesome news on the spin. >> >> Are you going to be able to make changes to EXT4 to accommodate >> REQ_FUA without generic_write_flush to improve performance like was done for xfs? > > I'm currently on a sick leave so I'm slow on replies. Sorry. I don't see a reason why ext4 could not have the same optimization as XFS for that case of direct IO. However I'm not sure when I get to implementing that. > > Honza > >> -----Original Message----- >> From: Jan Kara <jack@suse.cz> >> Sent: Monday, March 19, 2018 11:07 AM >> To: Robert Dorr <rdorr@microsoft.com> >> Cc: Christoph Hellwig <hch@lst.de>; Dave Chinner >> <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; >> linux-xfs@vger.kernel.org; linux-fsdevel >> <linux-fsdevel@vger.kernel.org>; Jan Kara <jack@suse.cz>; Theodore >> Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott >> Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; >> Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson >> <micn@microsoft.com> >> Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO >> writes >> >> On Tue 13-03-18 18:52:57, Robert Dorr wrote: >>> I tried the RWF_ODSYNC and xfs seems to honor properly but it causes >>> ext4 to go spin out a single CPU and hang the system.
On Tue 24-04-18 14:09:53, Robert Dorr wrote: > Hope you are feeling better. Yes :) Thanks. > 1. How can we effect similar changes in ext4? So ext4 currently does not use the iomap infrastructure for direct IO and thus cannot directly use the optimization of using FUA. We'd like to switch ext4 direct IO to use iomap instead of the blkdev_direct_IO() helper. It should be pretty straightforward but not completely trivial. I'll look into it but it will take a while. > 2. How do we get these changes pushed into all common release builds for > both xfs or ext4 optimizations? I'm not sure what are you speaking about here. Do you mean how do you get it to distributions like SLES, RHEL, etc? Generally when they update to new enough kernel version. Which is rather fast (weeks after Linus releases a kernel with this improvement) for community distros such as Fedora or openSUSE Tumbleweed, it will take much longer for enterprise distros (generally only the next major release will pick up a new kernel so a year or two). Honza > -----Original Message----- > From: Robert Dorr > Sent: Thursday, March 22, 2018 7:38 AM > To: Jan Kara <jack@suse.cz> > Cc: Christoph Hellwig <hch@lst.de>; Dave Chinner <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> > Subject: RE: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes > > Sorry to hear that. Get well soon. > > Thanks for the response. > > > -----Original Message----- > From: Jan Kara <jack@suse.cz> > Sent: Thursday, March 22, 2018 9:36 AM > To: Robert Dorr <rdorr@microsoft.com> > Cc: Jan Kara <jack@suse.cz>; Christoph Hellwig <hch@lst.de>; Dave Chinner <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; linux-xfs@vger.kernel.org; linux-fsdevel <linux-fsdevel@vger.kernel.org>; Theodore Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson <micn@microsoft.com> > Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO writes > > On Mon 19-03-18 16:14:16, Robert Dorr wrote: > > Awesome news on the spin. > > > > Are you going to be able to make changes to EXT4 to accommodate > > REQ_FUA without generic_write_flush to improve performance like was done for xfs? > > I'm currently on a sick leave so I'm slow on replies. Sorry. I don't see a reason why ext4 could not have the same optimization as XFS for that case of direct IO. However I'm not sure when I get to implementing that. > > Honza > > > -----Original Message----- > > From: Jan Kara <jack@suse.cz> > > Sent: Monday, March 19, 2018 11:07 AM > > To: Robert Dorr <rdorr@microsoft.com> > > Cc: Christoph Hellwig <hch@lst.de>; Dave Chinner > > <david@fromorbit.com>; Dan Williams <dan.j.williams@intel.com>; > > linux-xfs@vger.kernel.org; linux-fsdevel > > <linux-fsdevel@vger.kernel.org>; Jan Kara <jack@suse.cz>; Theodore > > Ts'o <tytso@mit.edu>; Matthew Wilcox <mawilcox@microsoft.com>; Scott > > Konersmann <scottkon@microsoft.com>; Slava Oks <slavao@microsoft.com>; > > Jasraj Dange <jasrajd@microsoft.com>; Michael Nelson > > <micn@microsoft.com> > > Subject: Re: [PATCH] [RFC] iomap: Use FUA for pure data O_DSYNC DIO > > writes > > > > On Tue 13-03-18 18:52:57, Robert Dorr wrote: > > > I tried the RWF_ODSYNC and xfs seems to honor properly but it causes > > > ext4 to go spin out a single CPU and hang the system.
diff --git a/fs/iomap.c b/fs/iomap.c index afd163586aa0..bcc90e3a2e3f 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -685,6 +685,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_data); * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_WRITE_FUA (1 << 29) #define IOMAP_DIO_WRITE (1 << 30) #define IOMAP_DIO_DIRTY (1 << 31) @@ -760,8 +761,19 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) } inode_dio_end(file_inode(iocb->ki_filp)); - kfree(dio); + /* + * If a FUA write was done, then that is all we required for datasync + * semantics -. we don't need to call generic_write_sync() to complete + * the write. + */ + if (ret > 0 && + (dio->flags & (IOMAP_DIO_WRITE|IOMAP_DIO_WRITE_FUA)) == + IOMAP_DIO_WRITE) { + ret = generic_write_sync(iocb, ret); + } + + kfree(dio); return ret; } @@ -769,12 +781,9 @@ static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; - bool is_write = (dio->flags & IOMAP_DIO_WRITE); ssize_t ret; ret = iomap_dio_complete(dio); - if (is_write && ret > 0) - ret = generic_write_sync(iocb, ret); iocb->ki_complete(iocb, ret, 0); } @@ -883,6 +892,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, dio->flags |= IOMAP_DIO_COW; if (iomap->flags & IOMAP_F_NEW) need_zeroout = true; + /* + * Use a FUA write if we need datasync semantics and this is a + * pure data IO that doesn't require any metadata updates. This + * allows us to avoid cache flushes on IO completion. + */ + else if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && + (dio->flags & IOMAP_DIO_WRITE) && + (dio->iocb->ki_flags & IOCB_DSYNC)) + dio->flags |= IOMAP_DIO_WRITE_FUA; break; default: WARN_ON_ONCE(1); @@ -930,7 +948,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + int op_flags = REQ_SYNC | REQ_IDLE; + + if (dio->flags & IOMAP_DIO_WRITE_FUA) + op_flags |= REQ_FUA; + bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags); task_io_account_write(n); } else { bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -961,6 +983,12 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return copied; } +/* + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO + * is being issued as AIO or not. This allows us to optimise pure data writes to + * use REQ_FUA rather than requiring generic_write_sync() to issue a REQ_FLUSH + * post write. + */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 260ff5e5c264..81aa3b73471e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -732,6 +732,11 @@ xfs_file_write_iter( ret = xfs_file_dio_aio_write(iocb, from); if (ret == -EREMCHG) goto buffered; + /* + * Direct IO handles sync type writes internally on I/O + * completion. + */ + return ret; } else { buffered: ret = xfs_file_buffered_aio_write(iocb, from);