diff mbox

[v3,6/6] block: use dax_do_io() if blkdev_dax_capable()

Message ID 1455680059-20126-7-git-send-email-ross.zwisler@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ross Zwisler Feb. 17, 2016, 3:34 a.m. UTC
From: Dan Williams <dan.j.williams@intel.com>

Setting S_DAX on an inode requires that the inode participate in the
DAX-fsync mechanism which expects to use the pagecache for tracking
potentially dirty cpu cachelines.  However, dax_do_io() participates in
the standard pagecache sync semantics and arranges for dirty pages to be
flushed through the driver when a direct-IO operation accesses the same
ranges.

It should always be valid to use the dax_do_io() path regardless of
whether the block_device inode has S_DAX set.  In either case dirty
pages or dirty cachelines are made durable before the direct-IO
operation proceeds.

Cc: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@fb.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 block/ioctl.c      |  1 +
 fs/block_dev.c     |  3 ++-
 include/linux/fs.h | 31 +++++++++++++++++++++----------
 3 files changed, 24 insertions(+), 11 deletions(-)

Comments

Jan Kara Feb. 17, 2016, 9:54 p.m. UTC | #1
On Tue 16-02-16 20:34:19, Ross Zwisler wrote:
> From: Dan Williams <dan.j.williams@intel.com>
> 
> Setting S_DAX on an inode requires that the inode participate in the
> DAX-fsync mechanism which expects to use the pagecache for tracking
> potentially dirty cpu cachelines.  However, dax_do_io() participates in
> the standard pagecache sync semantics and arranges for dirty pages to be
> flushed through the driver when a direct-IO operation accesses the same
> ranges.
> 
> It should always be valid to use the dax_do_io() path regardless of
> whether the block_device inode has S_DAX set.  In either case dirty
> pages or dirty cachelines are made durable before the direct-IO
> operation proceeds.

Please no. I agree that going via DAX path for normal likely won't
introduce new data corruption issues. But I dislike having a special
case for block devices. Also you have no way of turning DAX off for block
devices AFAIU and as Dave said, DAX should be opt-in, not opt-out. Note
that you may actually want to go through the block layer for normal IO e.g.
because you use IO cgroups to limit processes so using DAX regresses some
functionality.

								Honza
 
> Cc: Jan Kara <jack@suse.cz>
> Cc: Jens Axboe <axboe@fb.com>
> Cc: Al Viro <viro@ftp.linux.org.uk>
> Cc: Dave Chinner <david@fromorbit.com>
> Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
> Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> ---
>  block/ioctl.c      |  1 +
>  fs/block_dev.c     |  3 ++-
>  include/linux/fs.h | 31 +++++++++++++++++++++----------
>  3 files changed, 24 insertions(+), 11 deletions(-)
> 
> diff --git a/block/ioctl.c b/block/ioctl.c
> index d8996bb..7c64286 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -434,6 +434,7 @@ bool blkdev_dax_capable(struct block_device *bdev)
>  
>  	return true;
>  }
> +EXPORT_SYMBOL(blkdev_dax_capable);
>  #endif
>  
>  static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 826b164..0e937dd 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -166,8 +166,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = bdev_file_inode(file);
> +	struct block_device *bdev = I_BDEV(inode);
>  
> -	if (IS_DAX(inode))
> +	if (blkdev_dax_capable(bdev))
>  		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
>  				NULL, DIO_SKIP_DIO_COUNT);
>  	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index ae68100..a3f5ee8 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -830,7 +830,14 @@ static inline unsigned imajor(const struct inode *inode)
>  	return MAJOR(inode->i_rdev);
>  }
>  
> +#ifdef CONFIG_BLOCK
>  extern struct block_device *I_BDEV(struct inode *inode);
> +#else
> +static inline struct block_device *I_BDEV(struct inode *inode)
> +{
> +	return NULL;
> +}
> +#endif
>  
>  struct fown_struct {
>  	rwlock_t lock;          /* protects pid, uid, euid fields */
> @@ -2306,15 +2313,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
>  extern void emergency_thaw_all(void);
>  extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
>  extern int fsync_bdev(struct block_device *);
> -#ifdef CONFIG_FS_DAX
> -extern bool blkdev_dax_capable(struct block_device *bdev);
> -#else
> -static inline bool blkdev_dax_capable(struct block_device *bdev)
> -{
> -	return false;
> -}
> -#endif
> -
>  extern struct super_block *blockdev_superblock;
>  
>  static inline bool sb_is_blkdev_sb(struct super_block *sb)
> @@ -2902,9 +2900,22 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
>  extern void save_mount_options(struct super_block *sb, char *options);
>  extern void replace_mount_options(struct super_block *sb, char *options);
>  
> +#ifdef CONFIG_FS_DAX
> +extern bool blkdev_dax_capable(struct block_device *bdev);
> +#else
> +static inline bool blkdev_dax_capable(struct block_device *bdev)
> +{
> +	return false;
> +}
> +#endif
> +
>  static inline bool io_is_direct(struct file *filp)
>  {
> -	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
> +	struct inode *inode = filp->f_mapping->host;
> +
> +	return (filp->f_flags & O_DIRECT) || IS_DAX(inode)
> +		|| (S_ISBLK(file_inode(filp)->i_mode)
> +				&& blkdev_dax_capable(I_BDEV(inode)));
>  }
>  
>  static inline int iocb_flags(struct file *file)
> -- 
> 2.5.0
>
Dan Williams Feb. 17, 2016, 10:18 p.m. UTC | #2
On Wed, Feb 17, 2016 at 1:54 PM, Jan Kara <jack@suse.cz> wrote:
> On Tue 16-02-16 20:34:19, Ross Zwisler wrote:
>> From: Dan Williams <dan.j.williams@intel.com>
>>
>> Setting S_DAX on an inode requires that the inode participate in the
>> DAX-fsync mechanism which expects to use the pagecache for tracking
>> potentially dirty cpu cachelines.  However, dax_do_io() participates in
>> the standard pagecache sync semantics and arranges for dirty pages to be
>> flushed through the driver when a direct-IO operation accesses the same
>> ranges.
>>
>> It should always be valid to use the dax_do_io() path regardless of
>> whether the block_device inode has S_DAX set.  In either case dirty
>> pages or dirty cachelines are made durable before the direct-IO
>> operation proceeds.
>
> Please no. I agree that going via DAX path for normal likely won't
> introduce new data corruption issues. But I dislike having a special
> case for block devices. Also you have no way of turning DAX off for block
> devices AFAIU and as Dave said, DAX should be opt-in, not opt-out. Note
> that you may actually want to go through the block layer for normal IO e.g.
> because you use IO cgroups to limit processes so using DAX regresses some
> functionality.
>

Sounds good.

As Ross mentioned in the cover letter, I'm fine with dropping this one
for now as we think through how to restore raw device DAX support.  In
the meantime we can still force CONFIG_BLK_DEV_DAX=y for testing.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/block/ioctl.c b/block/ioctl.c
index d8996bb..7c64286 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -434,6 +434,7 @@  bool blkdev_dax_capable(struct block_device *bdev)
 
 	return true;
 }
+EXPORT_SYMBOL(blkdev_dax_capable);
 #endif
 
 static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 826b164..0e937dd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -166,8 +166,9 @@  blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
+	struct block_device *bdev = I_BDEV(inode);
 
-	if (IS_DAX(inode))
+	if (blkdev_dax_capable(bdev))
 		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
 				NULL, DIO_SKIP_DIO_COUNT);
 	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae68100..a3f5ee8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -830,7 +830,14 @@  static inline unsigned imajor(const struct inode *inode)
 	return MAJOR(inode->i_rdev);
 }
 
+#ifdef CONFIG_BLOCK
 extern struct block_device *I_BDEV(struct inode *inode);
+#else
+static inline struct block_device *I_BDEV(struct inode *inode)
+{
+	return NULL;
+}
+#endif
 
 struct fown_struct {
 	rwlock_t lock;          /* protects pid, uid, euid fields */
@@ -2306,15 +2313,6 @@  extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-#ifdef CONFIG_FS_DAX
-extern bool blkdev_dax_capable(struct block_device *bdev);
-#else
-static inline bool blkdev_dax_capable(struct block_device *bdev)
-{
-	return false;
-}
-#endif
-
 extern struct super_block *blockdev_superblock;
 
 static inline bool sb_is_blkdev_sb(struct super_block *sb)
@@ -2902,9 +2900,22 @@  extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);
 extern void replace_mount_options(struct super_block *sb, char *options);
 
+#ifdef CONFIG_FS_DAX
+extern bool blkdev_dax_capable(struct block_device *bdev);
+#else
+static inline bool blkdev_dax_capable(struct block_device *bdev)
+{
+	return false;
+}
+#endif
+
 static inline bool io_is_direct(struct file *filp)
 {
-	return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
+	struct inode *inode = filp->f_mapping->host;
+
+	return (filp->f_flags & O_DIRECT) || IS_DAX(inode)
+		|| (S_ISBLK(file_inode(filp)->i_mode)
+				&& blkdev_dax_capable(I_BDEV(inode)));
 }
 
 static inline int iocb_flags(struct file *file)