diff mbox

[v2,2/7] dax: change bdev_dax_supported() to support boolean returns

Message ID 20180529195106.14268-3-ross.zwisler@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ross Zwisler May 29, 2018, 7:51 p.m. UTC
From: Dave Jiang <dave.jiang@intel.com>

The function return values are confusing with the way the function is
named. We expect a true or false return value but it actually returns
0/-errno.  This makes the code very confusing. Changing the return values
to return a bool where if DAX is supported then return true and no DAX
support returns false.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 drivers/dax/super.c | 16 ++++++++--------
 fs/ext2/super.c     |  3 +--
 fs/ext4/super.c     |  3 +--
 fs/xfs/xfs_ioctl.c  |  4 ++--
 fs/xfs/xfs_super.c  | 12 ++++++------
 include/linux/dax.h |  8 ++++----
 6 files changed, 22 insertions(+), 24 deletions(-)

Comments

Darrick J. Wong May 29, 2018, 9:25 p.m. UTC | #1
On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> From: Dave Jiang <dave.jiang@intel.com>
> 
> The function return values are confusing with the way the function is
> named. We expect a true or false return value but it actually returns
> 0/-errno.  This makes the code very confusing. Changing the return values
> to return a bool where if DAX is supported then return true and no DAX
> support returns false.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>

Looks ok, do you want me to pull the first two patches through the xfs
tree?

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

--D

> ---
>  drivers/dax/super.c | 16 ++++++++--------
>  fs/ext2/super.c     |  3 +--
>  fs/ext4/super.c     |  3 +--
>  fs/xfs/xfs_ioctl.c  |  4 ++--
>  fs/xfs/xfs_super.c  | 12 ++++++------
>  include/linux/dax.h |  8 ++++----
>  6 files changed, 22 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 3943feb9a090..1d7bd96511f0 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -80,9 +80,9 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
>   * This is a library function for filesystems to check if the block device
>   * can be mounted with dax option.
>   *
> - * Return: negative errno if unsupported, 0 if supported.
> + * Return: true if supported, false if unsupported
>   */
> -int __bdev_dax_supported(struct block_device *bdev, int blocksize)
> +bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
>  {
>  	struct dax_device *dax_dev;
>  	pgoff_t pgoff;
> @@ -95,21 +95,21 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize)
>  	if (blocksize != PAGE_SIZE) {
>  		pr_debug("%s: error: unsupported blocksize for dax\n",
>  				bdevname(bdev, buf));
> -		return -EINVAL;
> +		return false;
>  	}
>  
>  	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
>  	if (err) {
>  		pr_debug("%s: error: unaligned partition for dax\n",
>  				bdevname(bdev, buf));
> -		return err;
> +		return false;
>  	}
>  
>  	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
>  	if (!dax_dev) {
>  		pr_debug("%s: error: device does not support dax\n",
>  				bdevname(bdev, buf));
> -		return -EOPNOTSUPP;
> +		return false;
>  	}
>  
>  	id = dax_read_lock();
> @@ -121,7 +121,7 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize)
>  	if (len < 1) {
>  		pr_debug("%s: error: dax access failed (%ld)\n",
>  				bdevname(bdev, buf), len);
> -		return len < 0 ? len : -EIO;
> +		return false;
>  	}
>  
>  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
> @@ -139,10 +139,10 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize)
>  	} else {
>  		pr_debug("%s: error: dax support not enabled\n",
>  				bdevname(bdev, buf));
> -		return -EOPNOTSUPP;
> +		return false;
>  	}
>  
> -	return 0;
> +	return true;
>  }
>  EXPORT_SYMBOL_GPL(__bdev_dax_supported);
>  #endif
> diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> index 9627c3054b5c..c09289a42dc5 100644
> --- a/fs/ext2/super.c
> +++ b/fs/ext2/super.c
> @@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
>  
>  	if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
> -		err = bdev_dax_supported(sb->s_bdev, blocksize);
> -		if (err) {
> +		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
>  			ext2_msg(sb, KERN_ERR,
>  				"DAX unsupported by block device. Turning off DAX.");
>  			sbi->s_mount_opt &= ~EXT2_MOUNT_DAX;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 089170e99895..2e1622907f4a 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -3732,8 +3732,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  					" that may contain inline data");
>  			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
>  		}
> -		err = bdev_dax_supported(sb->s_bdev, blocksize);
> -		if (err) {
> +		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
>  			ext4_msg(sb, KERN_ERR,
>  				"DAX unsupported by block device. Turning off DAX.");
>  			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 0effd46b965f..2c70a0a4f59f 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1103,8 +1103,8 @@ xfs_ioctl_setattr_dax_invalidate(
>  	if (fa->fsx_xflags & FS_XFLAG_DAX) {
>  		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
>  			return -EINVAL;
> -		if (bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
> -				sb->s_blocksize) < 0)
> +		if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
> +				sb->s_blocksize))
>  			return -EINVAL;
>  	}
>  
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 62188c2a4c36..86915dc40eed 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1690,17 +1690,17 @@ xfs_fs_fill_super(
>  		sb->s_flags |= SB_I_VERSION;
>  
>  	if (mp->m_flags & XFS_MOUNT_DAX) {
> -		int	error2 = 0;
> +		bool rtdev_is_dax = false, datadev_is_dax;
>  
>  		xfs_warn(mp,
>  		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
>  
> -		error = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
> -				sb->s_blocksize);
> +		datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
> +			sb->s_blocksize);
>  		if (mp->m_rtdev_targp)
> -			error2 = bdev_dax_supported(mp->m_rtdev_targp->bt_bdev,
> -					sb->s_blocksize);
> -		if (error && error2) {
> +			rtdev_is_dax = bdev_dax_supported(
> +				mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
> +		if (!rtdev_is_dax && !datadev_is_dax) {
>  			xfs_alert(mp,
>  			"DAX unsupported by block device. Turning off DAX.");
>  			mp->m_flags &= ~XFS_MOUNT_DAX;
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 2a4f7295c12b..c99692ddd4b5 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -64,8 +64,8 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
>  struct writeback_control;
>  int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
>  #if IS_ENABLED(CONFIG_FS_DAX)
> -int __bdev_dax_supported(struct block_device *bdev, int blocksize);
> -static inline int bdev_dax_supported(struct block_device *bdev, int blocksize)
> +bool __bdev_dax_supported(struct block_device *bdev, int blocksize);
> +static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize)
>  {
>  	return __bdev_dax_supported(bdev, blocksize);
>  }
> @@ -84,10 +84,10 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
>  int dax_writeback_mapping_range(struct address_space *mapping,
>  		struct block_device *bdev, struct writeback_control *wbc);
>  #else
> -static inline int bdev_dax_supported(struct block_device *bdev,
> +static inline bool bdev_dax_supported(struct block_device *bdev,
>  		int blocksize)
>  {
> -	return -EOPNOTSUPP;
> +	return false;
>  }
>  
>  static inline struct dax_device *fs_dax_get_by_host(const char *host)
> -- 
> 2.14.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ross Zwisler May 29, 2018, 10:01 p.m. UTC | #2
On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > From: Dave Jiang <dave.jiang@intel.com>
> > 
> > The function return values are confusing with the way the function is
> > named. We expect a true or false return value but it actually returns
> > 0/-errno.  This makes the code very confusing. Changing the return values
> > to return a bool where if DAX is supported then return true and no DAX
> > support returns false.
> > 
> > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> 
> Looks ok, do you want me to pull the first two patches through the xfs
> tree?
> 
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

Thanks for the review.

I'm not sure what's best.  If you do that then Mike will need to have a DM
branch for the rest of the series based on your stable commits, yea?

Mike what would you prefer?
Darrick J. Wong May 31, 2018, 7:13 p.m. UTC | #3
On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > > From: Dave Jiang <dave.jiang@intel.com>
> > > 
> > > The function return values are confusing with the way the function is
> > > named. We expect a true or false return value but it actually returns
> > > 0/-errno.  This makes the code very confusing. Changing the return values
> > > to return a bool where if DAX is supported then return true and no DAX
> > > support returns false.
> > > 
> > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > 
> > Looks ok, do you want me to pull the first two patches through the xfs
> > tree?
> > 
> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Thanks for the review.
> 
> I'm not sure what's best.  If you do that then Mike will need to have a DM
> branch for the rest of the series based on your stable commits, yea?
> 
> Mike what would you prefer?

I /was/ about to say that I would pull in the first two patches, but now
I can't get xfs to mount with pmem at all, and have no way of testing
this...?

# echo 'file drivers/dax/* +p' > /sys/kernel/debug/dynamic_debug/control
# mount /dev/pmem3 -o rtdev=/dev/pmem4,dax /mnt
# dmesg
<snip>
SGI XFS with ACLs, security attributes, realtime, scrub, debug enabled
XFS (pmem3): DAX enabled. Warning: EXPERIMENTAL, use at your own risk
pmem3: error: dax support not enabled
pmem4: error: dax support not enabled
XFS (pmem3): DAX unsupported by block device. Turning off DAX.
XFS (pmem3): Mounting V5 Filesystem
XFS (pmem3): Ending clean mount

Evidently the pfn it picks up is missing PFN_MAP in flags because
ND_REGION_PAGEMAP isn't set, and looking at the kernel source, pmem that
comes in via NFIT never gets that set...?

relevant qemu pmem options:

-object memory-backend-file,id=memnvdimm0,prealloc=no,mem-path=/dev/shm/a.img,share=yes,size=13488881664
-device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0
(repeat for five more devices)

<confused>

--D

NFIT table contents:

0000000  4e  46  49  54  78  04  00  00  01  46  42  4f  43  48  53  20
          N   F   I   T   x 004  \0  \0 001   F   B   O   C   H   S    
0000016  42  58  50  43  4e  46  49  54  01  00  00  00  42  58  50  43
          B   X   P   C   N   F   I   T 001  \0  \0  \0   B   X   P   C
0000032  01  00  00  00  00  00  00  00  00  00  38  00  08  00  03  00
        001  \0  \0  \0  \0  \0  \0  \0  \0  \0   8  \0  \b  \0 003  \0
0000048  00  00  00  00  01  00  00  00  79  d3  f0  66  f3  b4  74  40
         \0  \0  \0  \0 001  \0  \0  \0   y 323 360   f 363 264   t   @
0000064  ac  43  0d  33  18  b7  8c  db  00  00  00  6c  0a  00  00  00
        254   C  \r   3 030 267 214 333  \0  \0  \0   l  \n  \0  \0  \0
0000080  00  00  00  24  03  00  00  00  08  80  00  00  00  00  00  00
         \0  \0  \0   $ 003  \0  \0  \0  \b 200  \0  \0  \0  \0  \0  \0
0000096  01  00  30  00  04  00  00  00  00  00  00  00  08  00  09  00
        001  \0   0  \0 004  \0  \0  \0  \0  \0  \0  \0  \b  \0  \t  \0
0000112  00  00  00  24  03  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0   $ 003  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000128  00  00  00  00  00  00  00  00  00  00  01  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0 001  \0  \0  \0  \0  \0
0000144  04  00  50  00  09  00  86  80  01  00  01  00  00  00  00  00
        004  \0   P  \0  \t  \0 206 200 001  \0 001  \0  \0  \0  \0  \0
0000160  00  00  00  00  00  00  00  00  59  34  12  00  01  03  00  00
         \0  \0  \0  \0  \0  \0  \0  \0   Y   4 022  \0 001 003  \0  \0
0000176  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
0000224  00  00  38  00  0a  00  03  00  00  00  00  00  00  00  00  00
         \0  \0   8  \0  \n  \0 003  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000240  79  d3  f0  66  f3  b4  74  40  ac  43  0d  33  18  b7  8c  db
          y 323 360   f 363 264   t   @ 254   C  \r   3 030 267 214 333
0000256  00  00  00  90  0d  00  00  00  00  00  00  24  03  00  00  00
         \0  \0  \0 220  \r  \0  \0  \0  \0  \0  \0   $ 003  \0  \0  \0
0000272  08  80  00  00  00  00  00  00  01  00  30  00  05  00  00  00
         \b 200  \0  \0  \0  \0  \0  \0 001  \0   0  \0 005  \0  \0  \0
0000288  00  00  00  00  0a  00  0b  00  00  00  00  24  03  00  00  00
         \0  \0  \0  \0  \n  \0  \v  \0  \0  \0  \0   $ 003  \0  \0  \0
0000304  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000320  00  00  01  00  00  00  00  00  04  00  50  00  0b  00  86  80
         \0  \0 001  \0  \0  \0  \0  \0 004  \0   P  \0  \v  \0 206 200
0000336  01  00  01  00  00  00  00  00  00  00  00  00  00  00  00  00
        001  \0 001  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000352  5a  34  12  00  01  03  00  00  00  00  00  00  00  00  00  00
          Z   4 022  \0 001 003  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000368  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
0000400  00  00  00  00  00  00  00  00  00  00  38  00  0c  00  03  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0   8  \0  \f  \0 003  \0
0000416  00  00  00  00  01  00  00  00  79  d3  f0  66  f3  b4  74  40
         \0  \0  \0  \0 001  \0  \0  \0   y 323 360   f 363 264   t   @
0000432  ac  43  0d  33  18  b7  8c  db  00  00  00  b4  10  00  00  00
        254   C  \r   3 030 267 214 333  \0  \0  \0 264 020  \0  \0  \0
0000448  00  00  00  24  03  00  00  00  08  80  00  00  00  00  00  00
         \0  \0  \0   $ 003  \0  \0  \0  \b 200  \0  \0  \0  \0  \0  \0
0000464  01  00  30  00  06  00  00  00  00  00  00  00  0c  00  0d  00
        001  \0   0  \0 006  \0  \0  \0  \0  \0  \0  \0  \f  \0  \r  \0
0000480  00  00  00  24  03  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0   $ 003  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000496  00  00  00  00  00  00  00  00  00  00  01  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0 001  \0  \0  \0  \0  \0
0000512  04  00  50  00  0d  00  86  80  01  00  01  00  00  00  00  00
        004  \0   P  \0  \r  \0 206 200 001  \0 001  \0  \0  \0  \0  \0
0000528  00  00  00  00  00  00  00  00  5b  34  12  00  01  03  00  00
         \0  \0  \0  \0  \0  \0  \0  \0   [   4 022  \0 001 003  \0  \0
0000544  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
0000592  00  00  38  00  02  00  03  00  00  00  00  00  00  00  00  00
         \0  \0   8  \0 002  \0 003  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000608  79  d3  f0  66  f3  b4  74  40  ac  43  0d  33  18  b7  8c  db
          y 323 360   f 363 264   t   @ 254   C  \r   3 030 267 214 333
0000624  00  00  00  00  01  00  00  00  00  00  00  24  03  00  00  00
         \0  \0  \0  \0 001  \0  \0  \0  \0  \0  \0   $ 003  \0  \0  \0
0000640  08  80  00  00  00  00  00  00  01  00  30  00  01  00  00  00
         \b 200  \0  \0  \0  \0  \0  \0 001  \0   0  \0 001  \0  \0  \0
0000656  00  00  00  00  02  00  03  00  00  00  00  24  03  00  00  00
         \0  \0  \0  \0 002  \0 003  \0  \0  \0  \0   $ 003  \0  \0  \0
0000672  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000688  00  00  01  00  00  00  00  00  04  00  50  00  03  00  86  80
         \0  \0 001  \0  \0  \0  \0  \0 004  \0   P  \0 003  \0 206 200
0000704  01  00  01  00  00  00  00  00  00  00  00  00  00  00  00  00
        001  \0 001  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000720  56  34  12  00  01  03  00  00  00  00  00  00  00  00  00  00
          V   4 022  \0 001 003  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000736  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
0000768  00  00  00  00  00  00  00  00  00  00  38  00  04  00  03  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0   8  \0 004  \0 003  \0
0000784  00  00  00  00  01  00  00  00  79  d3  f0  66  f3  b4  74  40
         \0  \0  \0  \0 001  \0  \0  \0   y 323 360   f 363 264   t   @
0000800  ac  43  0d  33  18  b7  8c  db  00  00  00  24  04  00  00  00
        254   C  \r   3 030 267 214 333  \0  \0  \0   $ 004  \0  \0  \0
0000816  00  00  00  24  03  00  00  00  08  80  00  00  00  00  00  00
         \0  \0  \0   $ 003  \0  \0  \0  \b 200  \0  \0  \0  \0  \0  \0
0000832  01  00  30  00  02  00  00  00  00  00  00  00  04  00  05  00
        001  \0   0  \0 002  \0  \0  \0  \0  \0  \0  \0 004  \0 005  \0
0000848  00  00  00  24  03  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0   $ 003  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000864  00  00  00  00  00  00  00  00  00  00  01  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0 001  \0  \0  \0  \0  \0
0000880  04  00  50  00  05  00  86  80  01  00  01  00  00  00  00  00
        004  \0   P  \0 005  \0 206 200 001  \0 001  \0  \0  \0  \0  \0
0000896  00  00  00  00  00  00  00  00  57  34  12  00  01  03  00  00
         \0  \0  \0  \0  \0  \0  \0  \0   W   4 022  \0 001 003  \0  \0
0000912  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
0000960  00  00  38  00  06  00  03  00  00  00  00  00  00  00  00  00
         \0  \0   8  \0 006  \0 003  \0  \0  \0  \0  \0  \0  \0  \0  \0
0000976  79  d3  f0  66  f3  b4  74  40  ac  43  0d  33  18  b7  8c  db
          y 323 360   f 363 264   t   @ 254   C  \r   3 030 267 214 333
0000992  00  00  00  48  07  00  00  00  00  00  00  24  03  00  00  00
         \0  \0  \0   H  \a  \0  \0  \0  \0  \0  \0   $ 003  \0  \0  \0
0001008  08  80  00  00  00  00  00  00  01  00  30  00  03  00  00  00
         \b 200  \0  \0  \0  \0  \0  \0 001  \0   0  \0 003  \0  \0  \0
0001024  00  00  00  00  06  00  07  00  00  00  00  24  03  00  00  00
         \0  \0  \0  \0 006  \0  \a  \0  \0  \0  \0   $ 003  \0  \0  \0
0001040  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0001056  00  00  01  00  00  00  00  00  04  00  50  00  07  00  86  80
         \0  \0 001  \0  \0  \0  \0  \0 004  \0   P  \0  \a  \0 206 200
0001072  01  00  01  00  00  00  00  00  00  00  00  00  00  00  00  00
        001  \0 001  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0001088  58  34  12  00  01  03  00  00  00  00  00  00  00  00  00  00
          X   4 022  \0 001 003  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
0001104  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0  \0
*
0001136  00  00  00  00  00  00  00  00
         \0  \0  \0  \0  \0  \0  \0  \0
0001144

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ross Zwisler May 31, 2018, 8:34 p.m. UTC | #4
On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote:
> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > > > From: Dave Jiang <dave.jiang@intel.com>
> > > > 
> > > > The function return values are confusing with the way the function is
> > > > named. We expect a true or false return value but it actually returns
> > > > 0/-errno.  This makes the code very confusing. Changing the return values
> > > > to return a bool where if DAX is supported then return true and no DAX
> > > > support returns false.
> > > > 
> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > 
> > > Looks ok, do you want me to pull the first two patches through the xfs
> > > tree?
> > > 
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Thanks for the review.
> > 
> > I'm not sure what's best.  If you do that then Mike will need to have a DM
> > branch for the rest of the series based on your stable commits, yea?
> > 
> > Mike what would you prefer?
> 
> I /was/ about to say that I would pull in the first two patches, but now
> I can't get xfs to mount with pmem at all, and have no way of testing
> this...?

I'm not sure what's up - I'll dig in and find out.
Dan Williams May 31, 2018, 8:35 p.m. UTC | #5
On Thu, May 31, 2018 at 12:13 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
>> On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
>> > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
>> > > From: Dave Jiang <dave.jiang@intel.com>
>> > >
>> > > The function return values are confusing with the way the function is
>> > > named. We expect a true or false return value but it actually returns
>> > > 0/-errno.  This makes the code very confusing. Changing the return values
>> > > to return a bool where if DAX is supported then return true and no DAX
>> > > support returns false.
>> > >
>> > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
>> >
>> > Looks ok, do you want me to pull the first two patches through the xfs
>> > tree?
>> >
>> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
>>
>> Thanks for the review.
>>
>> I'm not sure what's best.  If you do that then Mike will need to have a DM
>> branch for the rest of the series based on your stable commits, yea?
>>
>> Mike what would you prefer?
>
> I /was/ about to say that I would pull in the first two patches, but now
> I can't get xfs to mount with pmem at all, and have no way of testing
> this...?
>
> # echo 'file drivers/dax/* +p' > /sys/kernel/debug/dynamic_debug/control
> # mount /dev/pmem3 -o rtdev=/dev/pmem4,dax /mnt
> # dmesg
> <snip>
> SGI XFS with ACLs, security attributes, realtime, scrub, debug enabled
> XFS (pmem3): DAX enabled. Warning: EXPERIMENTAL, use at your own risk
> pmem3: error: dax support not enabled
> pmem4: error: dax support not enabled
> XFS (pmem3): DAX unsupported by block device. Turning off DAX.
> XFS (pmem3): Mounting V5 Filesystem
> XFS (pmem3): Ending clean mount
>
> Evidently the pfn it picks up is missing PFN_MAP in flags because
> ND_REGION_PAGEMAP isn't set, and looking at the kernel source, pmem that
> comes in via NFIT never gets that set...?
>
> relevant qemu pmem options:
>
> -object memory-backend-file,id=memnvdimm0,prealloc=no,mem-path=/dev/shm/a.img,share=yes,size=13488881664
> -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0
> (repeat for five more devices)
>
> <confused>
>

What does "ndctl list" say? The namespaces need to be in fsdax mode.
Ross Zwisler May 31, 2018, 8:41 p.m. UTC | #6
On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote:
> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > > > From: Dave Jiang <dave.jiang@intel.com>
> > > > 
> > > > The function return values are confusing with the way the function is
> > > > named. We expect a true or false return value but it actually returns
> > > > 0/-errno.  This makes the code very confusing. Changing the return values
> > > > to return a bool where if DAX is supported then return true and no DAX
> > > > support returns false.
> > > > 
> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > 
> > > Looks ok, do you want me to pull the first two patches through the xfs
> > > tree?
> > > 
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Thanks for the review.
> > 
> > I'm not sure what's best.  If you do that then Mike will need to have a DM
> > branch for the rest of the series based on your stable commits, yea?
> > 
> > Mike what would you prefer?
> 
> I /was/ about to say that I would pull in the first two patches, but now
> I can't get xfs to mount with pmem at all, and have no way of testing
> this...?
> 
> # echo 'file drivers/dax/* +p' > /sys/kernel/debug/dynamic_debug/control
> # mount /dev/pmem3 -o rtdev=/dev/pmem4,dax /mnt
> # dmesg
> <snip>
> SGI XFS with ACLs, security attributes, realtime, scrub, debug enabled
> XFS (pmem3): DAX enabled. Warning: EXPERIMENTAL, use at your own risk
> pmem3: error: dax support not enabled
> pmem4: error: dax support not enabled
> XFS (pmem3): DAX unsupported by block device. Turning off DAX.
> XFS (pmem3): Mounting V5 Filesystem
> XFS (pmem3): Ending clean mount

This same sequence worked fine in my QEMU setup.

My guess is the issue is that your namespaces are in raw mode:

# ndctl list
[
  {
    "dev":"namespace1.0",
    "mode":"raw",			<< this
    "size":18119393280,
    "sector_size":512,
    "blockdev":"pmem1",
    "numa_node":0
  },
  {
    "dev":"namespace0.0",
    "mode":"raw",			<< this
    "size":18119393280,
    "sector_size":512,
    "blockdev":"pmem0",
    "numa_node":0
  }
]

If so, you can solve this by putting them in fsdax mode with ndctl:

# ndctl create-namespace -f -e namespace0.0 --mode=fsdax
{
  "dev":"namespace0.0",
  "mode":"fsdax",
  "size":"16.61 GiB (17.83 GB)",
  "uuid":"4c193c83-c031-41d4-8f96-2ff5c7b59c69",
  "raw_uuid":"2f785f09-717a-4a5e-9e9d-88d489ef1030",
  "sector_size":512,
  "blockdev":"pmem0",
  "numa_node":0
}

# ndctl create-namespace -f -e namespace1.0 --mode=fsdax
{
  "dev":"namespace1.0",
  "mode":"fsdax",
  "size":"16.61 GiB (17.83 GB)",
  "uuid":"39ec9cc4-a7ef-4f99-8354-60105543bf47",
  "raw_uuid":"403fae76-d4f8-4bed-9a3f-ef40924762be",
  "sector_size":512,
  "blockdev":"pmem1",
  "numa_node":0
}

If that doesn't fix it for you, ping me on freenode and we'll figure it out.

Thanks,
- Ross
Mike Snitzer May 31, 2018, 8:52 p.m. UTC | #7
On Thu, May 31 2018 at  3:13pm -0400,
Darrick J. Wong <darrick.wong@oracle.com> wrote:

> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > > > From: Dave Jiang <dave.jiang@intel.com>
> > > > 
> > > > The function return values are confusing with the way the function is
> > > > named. We expect a true or false return value but it actually returns
> > > > 0/-errno.  This makes the code very confusing. Changing the return values
> > > > to return a bool where if DAX is supported then return true and no DAX
> > > > support returns false.
> > > > 
> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > 
> > > Looks ok, do you want me to pull the first two patches through the xfs
> > > tree?
> > > 
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Thanks for the review.
> > 
> > I'm not sure what's best.  If you do that then Mike will need to have a DM
> > branch for the rest of the series based on your stable commits, yea?
> > 
> > Mike what would you prefer?
> 
> I /was/ about to say that I would pull in the first two patches, but now
> I can't get xfs to mount with pmem at all, and have no way of testing
> this...?

Once you get this sorted out, please feel free to pull in the first 2.

I'm unlikely to get to reviewing the DM patches in this series until
tomorrow at the earliest.

Mike
Darrick J. Wong May 31, 2018, 10:26 p.m. UTC | #8
On Thu, May 31, 2018 at 04:52:06PM -0400, Mike Snitzer wrote:
> On Thu, May 31 2018 at  3:13pm -0400,
> Darrick J. Wong <darrick.wong@oracle.com> wrote:
> 
> > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > > > > From: Dave Jiang <dave.jiang@intel.com>
> > > > > 
> > > > > The function return values are confusing with the way the function is
> > > > > named. We expect a true or false return value but it actually returns
> > > > > 0/-errno.  This makes the code very confusing. Changing the return values
> > > > > to return a bool where if DAX is supported then return true and no DAX
> > > > > support returns false.
> > > > > 
> > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > > 
> > > > Looks ok, do you want me to pull the first two patches through the xfs
> > > > tree?
> > > > 
> > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Thanks for the review.
> > > 
> > > I'm not sure what's best.  If you do that then Mike will need to have a DM
> > > branch for the rest of the series based on your stable commits, yea?
> > > 
> > > Mike what would you prefer?
> > 
> > I /was/ about to say that I would pull in the first two patches, but now
> > I can't get xfs to mount with pmem at all, and have no way of testing
> > this...?
> 
> Once you get this sorted out, please feel free to pull in the first 2.

Sorted.  It'll be in Friday's for-next.  Ross helped me bang on the pmem
devices w/ ndctl to enable fsdax mode and twist qemu until everything
worked properly. ;)

--D

> I'm unlikely to get to reviewing the DM patches in this series until
> tomorrow at the earliest.
> 
> Mike
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
Dave Chinner June 1, 2018, 1:26 a.m. UTC | #9
On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote:
> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > > > From: Dave Jiang <dave.jiang@intel.com>
> > > > 
> > > > The function return values are confusing with the way the function is
> > > > named. We expect a true or false return value but it actually returns
> > > > 0/-errno.  This makes the code very confusing. Changing the return values
> > > > to return a bool where if DAX is supported then return true and no DAX
> > > > support returns false.
> > > > 
> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > 
> > > Looks ok, do you want me to pull the first two patches through the xfs
> > > tree?
> > > 
> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Thanks for the review.
> > 
> > I'm not sure what's best.  If you do that then Mike will need to have a DM
> > branch for the rest of the series based on your stable commits, yea?
> > 
> > Mike what would you prefer?
> 
> I /was/ about to say that I would pull in the first two patches, but now
> I can't get xfs to mount with pmem at all, and have no way of testing
> this...?

I have similar problems, too, but:

$ ndctl list
[
  {
    "dev":"namespace1.0",
    "mode":"raw",
    "size":8589934592,
    "sector_size":512,
    "blockdev":"pmem1"
  },
  {
    "dev":"namespace0.0",
    "mode":"raw",
    "size":8589934592,
    "sector_size":512,
    "blockdev":"pmem0"
  }
]
$ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax
  Error: operation failed, region0 fsdax mode not available

failed to reconfigure namespace: Invalid argument
$

I can't make head or tail of what is going wrong here - how am I
supposed to debug this and get it working again?

FWIW, XFS+DAX used to just work on this setup (I hadn't even
installed ndctl until this morning!) but after changing the kernel
it no longer works. That would make it a regression, yes?

Cheers,

Dave.
Dan Williams June 1, 2018, 1:57 a.m. UTC | #10
On Thu, May 31, 2018 at 6:26 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote:
>> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
>> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
>> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
>> > > > From: Dave Jiang <dave.jiang@intel.com>
>> > > >
>> > > > The function return values are confusing with the way the function is
>> > > > named. We expect a true or false return value but it actually returns
>> > > > 0/-errno.  This makes the code very confusing. Changing the return values
>> > > > to return a bool where if DAX is supported then return true and no DAX
>> > > > support returns false.
>> > > >
>> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
>> > >
>> > > Looks ok, do you want me to pull the first two patches through the xfs
>> > > tree?
>> > >
>> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
>> >
>> > Thanks for the review.
>> >
>> > I'm not sure what's best.  If you do that then Mike will need to have a DM
>> > branch for the rest of the series based on your stable commits, yea?
>> >
>> > Mike what would you prefer?
>>
>> I /was/ about to say that I would pull in the first two patches, but now
>> I can't get xfs to mount with pmem at all, and have no way of testing
>> this...?
>
> I have similar problems, too, but:
>
> $ ndctl list
> [
>   {
>     "dev":"namespace1.0",
>     "mode":"raw",
>     "size":8589934592,
>     "sector_size":512,
>     "blockdev":"pmem1"
>   },
>   {
>     "dev":"namespace0.0",
>     "mode":"raw",
>     "size":8589934592,
>     "sector_size":512,
>     "blockdev":"pmem0"
>   }
> ]
> $ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax
>   Error: operation failed, region0 fsdax mode not available
>
> failed to reconfigure namespace: Invalid argument
> $
>
> I can't make head or tail of what is going wrong here - how am I
> supposed to debug this and get it working again?
>
> FWIW, XFS+DAX used to just work on this setup (I hadn't even
> installed ndctl until this morning!) but after changing the kernel
> it no longer works. That would make it a regression, yes?

This commit caused the behavior change:

    569d0365f571 dax: require 'struct page' by default for filesystem dax

The justification is in that patch, but the short summary is we killed
off "pageless" dax because it had so many incomplete holes and
surprise behaviors. It needed to die on the path to making dax not
experimental, i.e. to close safety holes, and be feature complete for
all the ways userspace expects to use mappings (direct-io, fork,
poison handling, etc).

I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
has the following dependencies:

        depends on MEMORY_HOTPLUG
        depends on MEMORY_HOTREMOVE
        depends on SPARSEMEM_VMEMMAP

I've created a task to go improve ndctl's error reporting for
troubleshooting this failure.
Dave Chinner June 1, 2018, 2:24 a.m. UTC | #11
On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
> On Thu, May 31, 2018 at 6:26 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote:
> >> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> >> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> >> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> >> > > > From: Dave Jiang <dave.jiang@intel.com>
> >> > > >
> >> > > > The function return values are confusing with the way the function is
> >> > > > named. We expect a true or false return value but it actually returns
> >> > > > 0/-errno.  This makes the code very confusing. Changing the return values
> >> > > > to return a bool where if DAX is supported then return true and no DAX
> >> > > > support returns false.
> >> > > >
> >> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> >> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> >> > >
> >> > > Looks ok, do you want me to pull the first two patches through the xfs
> >> > > tree?
> >> > >
> >> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> >> >
> >> > Thanks for the review.
> >> >
> >> > I'm not sure what's best.  If you do that then Mike will need to have a DM
> >> > branch for the rest of the series based on your stable commits, yea?
> >> >
> >> > Mike what would you prefer?
> >>
> >> I /was/ about to say that I would pull in the first two patches, but now
> >> I can't get xfs to mount with pmem at all, and have no way of testing
> >> this...?
> >
> > I have similar problems, too, but:
> >
> > $ ndctl list
> > [
> >   {
> >     "dev":"namespace1.0",
> >     "mode":"raw",
> >     "size":8589934592,
> >     "sector_size":512,
> >     "blockdev":"pmem1"
> >   },
> >   {
> >     "dev":"namespace0.0",
> >     "mode":"raw",
> >     "size":8589934592,
> >     "sector_size":512,
> >     "blockdev":"pmem0"
> >   }
> > ]
> > $ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax
> >   Error: operation failed, region0 fsdax mode not available
> >
> > failed to reconfigure namespace: Invalid argument
> > $
> >
> > I can't make head or tail of what is going wrong here - how am I
> > supposed to debug this and get it working again?
> >
> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
> > installed ndctl until this morning!) but after changing the kernel
> > it no longer works. That would make it a regression, yes?
> 
> This commit caused the behavior change:
> 
>     569d0365f571 dax: require 'struct page' by default for filesystem dax
> 
> The justification is in that patch, but the short summary is we killed
> off "pageless" dax because it had so many incomplete holes and
> surprise behaviors. It needed to die on the path to making dax not
> experimental, i.e. to close safety holes, and be feature complete for
> all the ways userspace expects to use mappings (direct-io, fork,
> poison handling, etc).
> 
> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
> has the following dependencies:
> 
>         depends on MEMORY_HOTPLUG
>         depends on MEMORY_HOTREMOVE
>         depends on SPARSEMEM_VMEMMAP

Filesystem DAX now has a dependency on memory hotplug?

Fmeh. No wonder I never enabled CONFIG_ZONE_DEVICE. It's described in
menuconfig as "Device memory (pmem, HMM, etc...) hotplug support".
This isn't for hotplug support - it required for basic DAX
functionality. I've never enabled memory hotplug in any of my test
kernel configs, because the VMs I run the kernels on don't ever get
memory hotplugged....

OK, works now I've found the magic config incantantions to turn
everything I now need on.

Can we get this rationalised to a single top level config
option in the filesystems menu "Enable Filesystem DAX" that turns
on every knob that is required?

Cheers,

Dave.
Dan Williams June 1, 2018, 4:02 a.m. UTC | #12
On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
>> On Thu, May 31, 2018 at 6:26 PM, Dave Chinner <david@fromorbit.com> wrote:
>> > On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote:
>> >> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
>> >> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
>> >> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
>> >> > > > From: Dave Jiang <dave.jiang@intel.com>
>> >> > > >
>> >> > > > The function return values are confusing with the way the function is
>> >> > > > named. We expect a true or false return value but it actually returns
>> >> > > > 0/-errno.  This makes the code very confusing. Changing the return values
>> >> > > > to return a bool where if DAX is supported then return true and no DAX
>> >> > > > support returns false.
>> >> > > >
>> >> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> >> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
>> >> > >
>> >> > > Looks ok, do you want me to pull the first two patches through the xfs
>> >> > > tree?
>> >> > >
>> >> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
>> >> >
>> >> > Thanks for the review.
>> >> >
>> >> > I'm not sure what's best.  If you do that then Mike will need to have a DM
>> >> > branch for the rest of the series based on your stable commits, yea?
>> >> >
>> >> > Mike what would you prefer?
>> >>
>> >> I /was/ about to say that I would pull in the first two patches, but now
>> >> I can't get xfs to mount with pmem at all, and have no way of testing
>> >> this...?
>> >
>> > I have similar problems, too, but:
>> >
>> > $ ndctl list
>> > [
>> >   {
>> >     "dev":"namespace1.0",
>> >     "mode":"raw",
>> >     "size":8589934592,
>> >     "sector_size":512,
>> >     "blockdev":"pmem1"
>> >   },
>> >   {
>> >     "dev":"namespace0.0",
>> >     "mode":"raw",
>> >     "size":8589934592,
>> >     "sector_size":512,
>> >     "blockdev":"pmem0"
>> >   }
>> > ]
>> > $ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax
>> >   Error: operation failed, region0 fsdax mode not available
>> >
>> > failed to reconfigure namespace: Invalid argument
>> > $
>> >
>> > I can't make head or tail of what is going wrong here - how am I
>> > supposed to debug this and get it working again?
>> >
>> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
>> > installed ndctl until this morning!) but after changing the kernel
>> > it no longer works. That would make it a regression, yes?
>>
>> This commit caused the behavior change:
>>
>>     569d0365f571 dax: require 'struct page' by default for filesystem dax
>>
>> The justification is in that patch, but the short summary is we killed
>> off "pageless" dax because it had so many incomplete holes and
>> surprise behaviors. It needed to die on the path to making dax not
>> experimental, i.e. to close safety holes, and be feature complete for
>> all the ways userspace expects to use mappings (direct-io, fork,
>> poison handling, etc).
>>
>> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
>> has the following dependencies:
>>
>>         depends on MEMORY_HOTPLUG
>>         depends on MEMORY_HOTREMOVE
>>         depends on SPARSEMEM_VMEMMAP
>
> Filesystem DAX now has a dependency on memory hotplug?
>
> Fmeh. No wonder I never enabled CONFIG_ZONE_DEVICE. It's described in
> menuconfig as "Device memory (pmem, HMM, etc...) hotplug support".
> This isn't for hotplug support - it required for basic DAX
> functionality. I've never enabled memory hotplug in any of my test
> kernel configs, because the VMs I run the kernels on don't ever get
> memory hotplugged....
>
> OK, works now I've found the magic config incantantions to turn
> everything I now need on.
>
> Can we get this rationalised to a single top level config
> option in the filesystems menu "Enable Filesystem DAX" that turns
> on every knob that is required?

That sounds good to me. Will do.
Ross Zwisler June 1, 2018, 8:59 p.m. UTC | #13
On Thu, May 31, 2018 at 03:26:45PM -0700, Darrick J. Wong wrote:
> On Thu, May 31, 2018 at 04:52:06PM -0400, Mike Snitzer wrote:
> > On Thu, May 31 2018 at  3:13pm -0400,
> > Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > 
> > > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote:
> > > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote:
> > > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote:
> > > > > > From: Dave Jiang <dave.jiang@intel.com>
> > > > > > 
> > > > > > The function return values are confusing with the way the function is
> > > > > > named. We expect a true or false return value but it actually returns
> > > > > > 0/-errno.  This makes the code very confusing. Changing the return values
> > > > > > to return a bool where if DAX is supported then return true and no DAX
> > > > > > support returns false.
> > > > > > 
> > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com>
> > > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
> > > > > 
> > > > > Looks ok, do you want me to pull the first two patches through the xfs
> > > > > tree?
> > > > > 
> > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Thanks for the review.
> > > > 
> > > > I'm not sure what's best.  If you do that then Mike will need to have a DM
> > > > branch for the rest of the series based on your stable commits, yea?
> > > > 
> > > > Mike what would you prefer?
> > > 
> > > I /was/ about to say that I would pull in the first two patches, but now
> > > I can't get xfs to mount with pmem at all, and have no way of testing
> > > this...?
> > 
> > Once you get this sorted out, please feel free to pull in the first 2.
> 
> Sorted.  It'll be in Friday's for-next.  Ross helped me bang on the pmem
> devices w/ ndctl to enable fsdax mode and twist qemu until everything
> worked properly. ;)

For anyone else who would like to simulate persistent memory using QEMU, I've
added some hints here:

https://nvdimm.wiki.kernel.org/pmem_in_qemu
Dave Chinner June 3, 2018, 10:20 p.m. UTC | #14
On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote:
> On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
> >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
> >> > installed ndctl until this morning!) but after changing the kernel
> >> > it no longer works. That would make it a regression, yes?

[....]

> >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
> >> has the following dependencies:
> >>
> >>         depends on MEMORY_HOTPLUG
> >>         depends on MEMORY_HOTREMOVE
> >>         depends on SPARSEMEM_VMEMMAP
> >
> > Filesystem DAX now has a dependency on memory hotplug?

[....]

> > OK, works now I've found the magic config incantantions to turn
> > everything I now need on.

By enabling these options, my test VM now has a ~30s pause in the
boot very soon after the nvdimm subsystem is initialised.

[    1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
[    1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A
[    1.552175] Non-volatile memory driver v1.3
[    2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz
[    2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns
[   37.217453] brd: module loaded
[   37.225423] loop: module loaded
[   37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
[   37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
[   37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
[   37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes
[   37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes

The system does not appear to be consuming CPU, but it is blocking
NMIs so I can't get a CPU trace. For a VM that I rely on booting in
a few seconds because I reboot it tens of times a day, this is a
problem....

Cheers,

Dave.
Dave Chinner June 4, 2018, 12:25 a.m. UTC | #15
On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote:
> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote:
> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
> > >> > installed ndctl until this morning!) but after changing the kernel
> > >> > it no longer works. That would make it a regression, yes?
> 
> [....]
> 
> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
> > >> has the following dependencies:
> > >>
> > >>         depends on MEMORY_HOTPLUG
> > >>         depends on MEMORY_HOTREMOVE
> > >>         depends on SPARSEMEM_VMEMMAP
> > >
> > > Filesystem DAX now has a dependency on memory hotplug?
> 
> [....]
> 
> > > OK, works now I've found the magic config incantantions to turn
> > > everything I now need on.
> 
> By enabling these options, my test VM now has a ~30s pause in the
> boot very soon after the nvdimm subsystem is initialised.
> 
> [    1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
> [    1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A
> [    1.552175] Non-volatile memory driver v1.3
> [    2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz
> [    2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns
> [   37.217453] brd: module loaded
> [   37.225423] loop: module loaded
> [   37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
> [   37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
> [   37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
> [   37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes
> [   37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes
> 
> The system does not appear to be consuming CPU, but it is blocking
> NMIs so I can't get a CPU trace. For a VM that I rely on booting in
> a few seconds because I reboot it tens of times a day, this is a
> problem....

And when I turn on KASAN, the kernel fails to boot to a login prompt
because:

[   15.363583] Non-volatile memory driver v1.3
[   22.286787] brd: module loaded
[   22.347054] loop: module loaded
[   22.352158] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
[   22.373921] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
[   22.394363] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
[   22.410145] Loading iSCSI transport class v2.0-870.
[   22.411506] nd_pmem namespace0.0: unable to guarantee persistence of writes
[   22.412777] iscsi: registered transport (tcp)
[   22.412780] megaraid cmm: 2.20.2.7 (Release Date: Sun Jul 16 00:01:03 EST 2006)
[   22.412806] nd_pmem namespace1.0: unable to guarantee persistence of writes
[   22.413090] megaraid: 2.20.5.1 (Release Date: Thu Nov 16 15:32:35 EST 2006)
[   22.421485] kworker/u33:4 (1087) used greatest stack depth: 27704 bytes left
[   22.430664] scsi host0: ata_piix
[   22.711000] BUG: unable to handle kernel paging request at ffffed0078000000
[   22.712225] PGD 13ffd1067 P4D 13ffd1067 PUD 1247d2067 PMD 0 
[   22.713180] Oops: 0000 [#1] PREEMPT SMP KASAN
[   22.713884] CPU: 8 PID: 168 Comm: kworker/u33:3 Not tainted 4.17.0-rc7-dgc+ #549
[   22.714480] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[   22.714480] Workqueue: events_unbound async_run_entry_fn
[   22.714480] RIP: 0010:check_memory_region+0xdd/0x190
[   22.714480] RSP: 0000:ffff88007a1aec08 EFLAGS: 00010202
[   22.714480] RAX: ffffed0078000000 RBX: ffff8803c0000fff RCX: ffffffff81d41a12
[   22.714480] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff8803c0000000
[   22.714480] RBP: ffffed0078000200 R08: 0000000000000200 R09: 0000000000000040
[   22.714480] R10: 0000000000000200 R11: ffffed00780001ff R12: ffff8803c0000000
[   22.714480] R13: 000000000192f3f0 R14: 0000000000000000 R15: 0000000000000000
[   22.714480] FS:  0000000000000000(0000) GS:ffff88007f700000(0000) knlGS:0000000000000000
[   22.714480] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   22.714480] CR2: ffffed0078000000 CR3: 000000000300f001 CR4: 00000000000606e0
[   22.714480] Call Trace:
[   22.714480]  memcpy+0x1f/0x50
[   22.714480]  pmem_do_bvec+0x182/0x350
[   22.714480]  ? pmem_release_queue+0x10/0x10
[   22.714480]  ? __blkdev_get+0x484/0x7b0
[   22.714480]  ? blkdev_get+0x1e6/0x4f0
[   22.714480]  ? __device_add_disk+0x62a/0x780
[   22.714480]  ? pmem_attach_disk+0x580/0x7b0
[   22.714480]  ? do_raw_spin_unlock+0xa4/0x130
[   22.714480]  ? _raw_spin_unlock+0xa/0x20
[   22.714480]  ? create_task_io_context+0x195/0x1d0
[   22.714480]  ? generic_make_request_checks+0x33a/0x7a0
[   22.714480]  pmem_make_request+0x1d9/0x3c0
[   22.714480]  generic_make_request+0x2d8/0x6c0
[   22.714480]  ? blk_plug_queued_count+0xb0/0xb0
[   22.714480]  ? memset+0x1f/0x40
[   22.714480]  ? submit_bio+0xc8/0x1e0
[   22.714480]  submit_bio+0xc8/0x1e0
[   22.714480]  ? direct_make_request+0x150/0x150
[   22.714480]  ? alloc_buffer_head+0x3c/0x80
[   22.714480]  ? alloc_page_buffers+0x7f/0x120
[   22.714480]  ? __rcu_read_unlock+0x66/0x80
[   22.714480]  ? guard_bio_eod+0xa9/0x1e0
[   22.714480]  submit_bh_wbc.isra.41+0x257/0x280
[   22.714480]  ? end_buffer_read_nobh+0x10/0x10
[   22.714480]  block_read_full_page+0x307/0x470
[   22.714480]  ? block_llseek+0x80/0x80
[   22.714480]  ? block_page_mkwrite+0x150/0x150
[   22.714480]  ? add_to_page_cache_lru+0xd7/0x160
[   22.714480]  ? add_to_page_cache_locked+0x10/0x10
[   22.714480]  ? policy_node+0x56/0x60
[   22.714480]  do_read_cache_page+0x3e3/0x730
[   22.714480]  ? bus_probe_device+0xf3/0x120
[   22.714480]  ? async_run_entry_fn+0x75/0x1e0
[   22.714480]  ? blkdev_writepages+0x10/0x10
[   22.714480]  ? filemap_fault+0x940/0x940
[   22.714480]  ? unwind_next_frame+0x8cf/0x940
[   22.714480]  ? stack_access_ok+0x35/0x80
[   22.714480]  ? __alloc_pages_nodemask+0x18e/0x320
[   22.714480]  ? __alloc_pages_slowpath+0x12b0/0x12b0
[   22.714480]  read_dev_sector+0x61/0x120
[   22.753746] scsi host1: ata_piix
[   22.753296]  read_lba+0x260/0x2d0
[   22.753296]  ? sgi_partition+0x3c0/0x3c0
[   22.755203] ata1: PATA max MWDMA2 cmd 0x1f0 ctl 0x3f6 bmdma 0xc100 irq 14
[   22.753296]  ? do_raw_spin_unlock+0xa4/0x130
[   22.756637] ata2: PATA max MWDMA2 cmd 0x170 ctl 0x376 bmdma 0xc108 irq 15
[   22.753296]  ? is_gpt_valid.part.5+0x3c0/0x3c0
[   22.758498]  efi_partition+0x252/0xb40
[   22.758498]  ? __inc_numa_state+0x19/0x90
[   22.758498]  ? kasan_unpoison_shadow+0x30/0x40
[   22.758498]  ? vzalloc+0x58/0x60
[   22.758498]  ? rescan_partitions+0x133/0x510
[   22.758498]  ? is_gpt_valid.part.5+0x3c0/0x3c0
[   22.758498]  ? kasan_unpoison_shadow+0x30/0x40
[   22.764064]  ? get_page_from_freelist+0x16dd/0x1a50
[   22.764064]  ? widen_string+0x25/0x100
[   22.766067]  ? widen_string+0x25/0x100
[   22.766282] i8042: PNP: PS/2 Controller [PNP0303:KBD,PNP0f13:MOU] at 0x60,0x64 irq 1,12
[   22.766575]  ? string+0xc4/0xf0
[   22.766575]  ? format_decode+0x79/0x4b0
[   22.766575]  ? memcpy+0x34/0x50
[   22.770062] serio: i8042 KBD port at 0x60,0x64 irq 1
[   22.766575]  ? vsnprintf+0xf2/0x7d0
[   22.771264] serio: i8042 AUX port at 0x60,0x64 irq 12
[   22.766575]  ? num_to_str+0x160/0x160
[   22.772750]  ? __might_sleep+0x31/0xd0
[   22.773202] mousedev: PS/2 mouse device common for all mice
[   22.772750]  ? snprintf+0x8f/0xc0
[   22.772750]  ? vscnprintf+0x30/0x30
[   22.772750]  ? is_gpt_valid.part.5+0x3c0/0x3c0
[   22.776149] input: AT Translated Set 2 keyboard as /devices/platform/i8042/serio0/input/input1
[   22.776242]  ? check_partition+0x18f/0x310
[   22.776717] rtc_cmos 00:00: RTC can wake from S4
[   22.778723] rtc_cmos 00:00: registered as rtc0
[   22.779023]  check_partition+0x18f/0x310
[   22.779023]  rescan_partitions+0x133/0x510
[   22.780706] rtc_cmos 00:00: nvmem registration failed
[   22.779023]  ? __might_sleep+0x31/0xd0
[   22.782618] rtc_cmos 00:00: alarms up to one day, y3k, 114 bytes nvram, hpet irqs
[   22.779023]  ? kmem_cache_alloc+0xdd/0x5b0
[   22.779023]  ? ___might_sleep+0x7c/0x1a0
[   22.786069]  ? bd_set_size+0x104/0x160
[   22.786069]  __blkdev_get+0x484/0x7b0
[   22.786069]  ? bd_set_size+0x160/0x160
[   22.786069]  ? map_id_range_down+0x150/0x170
[   22.788981] device-mapper: ioctl: 4.39.0-ioctl (2018-04-03) initialised: dm-devel@redhat.com
[   22.786069]  ? free_user_ns+0x170/0x170
[   22.789183]  blkdev_get+0x1e6/0x4f0
[   22.789183]  ? __raw_spin_lock_init+0x42/0x50
[   22.789183]  ? __wake_up_bit+0x78/0xc0
[   22.792867] device-mapper: raid: Loading target version 1.13.2
[   22.789183]  ? __blkdev_get+0x7b0/0x7b0
[   22.789183]  ? refcount_sub_and_test+0xa7/0x120
[   22.789183]  ? refcount_inc+0x30/0x30
[   22.795220]  ? do_raw_spin_lock+0x9b/0x130
[   22.795991] hidraw: raw HID events driver (C) Jiri Kosina
[   22.796068]  ? do_raw_spin_unlock+0xa4/0x130
[   22.796068]  ? kobject_put+0x2d/0x100
[   22.796068]  __device_add_disk+0x62a/0x780
[   22.796068]  ? bdget_disk+0x50/0x50
[   22.796068]  ? alloc_dax+0x222/0x2e0
[   22.800041]  ? kill_dax+0xc0/0xc0
[   22.800041]  ? mutex_lock+0x2a/0x50
[   22.800041]  ? nvdimm_badblocks_populate+0x56/0x1d0
[   22.800041]  ? __raw_spin_lock_init+0x42/0x50
[   22.800041]  pmem_attach_disk+0x580/0x7b0
[   22.803358] oprofile: using NMI interrupt.
[   22.800041]  ? nd_pmem_notify+0x240/0x240
[   22.804217]  ? nd_dax_probe+0xef/0x110
[   22.805177] NET: Registered protocol family 17
[   22.804217]  nvdimm_bus_probe+0x76/0xe0
[   22.804217]  driver_probe_device+0x31d/0x450
[   22.806975] sctp: Hash tables configured (bind 128/128)
[   22.804217]  ? __driver_attach+0xd0/0xd0
[   22.808773] Key type dns_resolver registered
[   22.804217]  bus_for_each_drv+0xd8/0x130
[   22.809534] Key type ceph registered
[   22.804217]  ? subsys_find_device_by_id+0x1e0/0x1e0
[   22.804217]  ? do_raw_spin_unlock+0xa4/0x130
[   22.811472]  __device_attach+0x14d/0x1b0
[   22.812223]  ? device_bind_driver+0x70/0x70
[   22.812768] libceph: loaded (mon/osd proto 15/24)
[   22.812223]  ? kobject_uevent_env+0x15d/0x7f0
[   22.812223]  bus_probe_device+0xf3/0x120
[   22.812223]  device_add+0x696/0xa20
[   22.812223]  ? device_private_init+0xc0/0xc0
[   22.816158]  ? __wake_up_common_lock+0xcb/0x110
[   22.816158]  ? __wake_up_common+0x240/0x240
[   22.816158]  ? nd_async_device_unregister+0x30/0x30
[   22.816158]  nd_async_device_register+0xe/0x40
[   22.816158]  async_run_entry_fn+0x75/0x1e0
[   22.819536]  process_one_work+0x349/0x6d0
[   22.819536]  worker_thread+0x74/0x5c0
[   22.819536]  ? process_one_work+0x6d0/0x6d0
[   22.819536]  kthread+0x1b7/0x1e0
[   22.819536]  ? __kthread_bind_mask+0x70/0x70
[   22.819536]  ret_from_fork+0x24/0x30
[   22.819536] Code: 01 74 0b 41 80 38 00 74 f0 4d 85 c0 75 56 4c 01 c8 49 89 e8 49 29 c0 4d 8d 48 07 4d 85 c0 4d 0f 49 c8 49 c1 f9 03 45 85 c9 74 5b <48> 83 38 00 75 18 45 8d 41 ff 4e 8d 44 c0 08 48 8 
[   22.819536] RIP: check_memory_region+0xdd/0x190 RSP: ffff88007a1aec08
[   22.819536] CR2: ffffed0078000000
[   22.819536] ---[ end trace 2d3ad661d958a6e8 ]---
[   22.819536] BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:34
[   22.819536] in_atomic(): 1, irqs_disabled(): 1, pid: 168, name: kworker/u33:3
[   22.819536] CPU: 8 PID: 168 Comm: kworker/u33:3 Tainted: G    B D           4.17.0-rc7-dgc+ #549
[   22.819536] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[   22.819536] Workqueue: events_unbound async_run_entry_fn
[   22.819536] Call Trace:
[   22.819536]  dump_stack+0x9a/0xeb
[   22.819536]  ___might_sleep+0x169/0x1a0
[   22.819536]  exit_signals+0x67/0x310
[   22.819536]  ? get_signal+0x860/0x860
[   22.819536]  ? __wake_up_common+0x240/0x240
[   22.819536]  ? nd_async_device_unregister+0x30/0x30
[   22.819536]  ? blocking_notifier_call_chain+0x24/0x70
[   22.819536]  do_exit+0x14d/0x1370
[   22.819536]  ? is_current_pgrp_orphaned+0x60/0x60
[   22.819536]  ? worker_thread+0x74/0x5c0
[   22.819536]  ? process_one_work+0x6d0/0x6d0
[   22.819536]  ? kthread+0x1b7/0x1e0
[   22.819536]  rewind_stack_do_exit+0x17/0x20
[   22.819536] ==================================================================
[   22.819536] BUG: KASAN: stack-out-of-bounds in widen_string+0x25/0x100
[   22.819536] Read of size 8 at addr ffff88007a1afae0 by task kworker/u33:3/168
[   22.819536] 
[   22.819536] CPU: 8 PID: 168 Comm: kworker/u33:3 Tainted: G      D           4.17.0-rc7-dgc+ #549
[   22.819536] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[   22.819536] Workqueue: events_unbound async_run_entry_fn
[   22.819536] Call Trace:
[   22.819536]  dump_stack+0x9a/0xeb
[   22.819536]  ? widen_string+0x25/0x100
[   22.819536]  print_address_description+0x7e/0x290
[   22.819536]  ? widen_string+0x25/0x100
[   22.819536]  kasan_report+0x237/0x360
[   22.819536]  widen_string+0x25/0x100
[   22.819536]  string+0xc4/0xf0
[   22.819536]  vsnprintf+0x2a8/0x7d0
[   22.819536]  ? num_to_str+0x160/0x160
[   22.819536]  ? subsys_find_device_by_id+0x1e0/0x1e0
[   22.819536]  ? do_raw_spin_unlock+0xa4/0x130
[   22.819536]  vscnprintf+0x9/0x30
[   22.819536]  vprintk_emit+0x86/0x400
[   22.819536]  printk+0x94/0xb0
[   22.819536]  ? cpumask_weight.constprop.27+0x1e/0x1e
[   22.819536]  ? bus_probe_device+0xf3/0x120
[   22.819536]  ? ___might_sleep+0xa7/0x1a0
[   22.819536]  ___might_sleep+0xee/0x1a0
[   22.819536]  exit_signals+0x67/0x310
[   22.819536]  ? get_signal+0x860/0x860
[   22.819536]  ? __wake_up_common+0x240/0x240
[   22.819536]  ? nd_async_device_unregister+0x30/0x30
[   22.819536]  ? blocking_notifier_call_chain+0x24/0x70
[   22.819536]  do_exit+0x14d/0x1370
[   22.819536]  ? is_current_pgrp_orphaned+0x60/0x60
[   22.819536]  ? worker_thread+0x74/0x5c0
[   22.819536]  ? process_one_work+0x6d0/0x6d0
[   22.819536]  ? kthread+0x1b7/0x1e0
[   22.819536]  rewind_stack_do_exit+0x17/0x20
[   22.819536] 
[   22.819536] The buggy address belongs to the page:
[   22.819536] page:ffffea0001ab5e48 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[   22.819536] flags: 0xfffffc0000000()
[   22.819536] raw: 000fffffc0000000 0000000000000000 0000000000000000 00000000ffffffff
[   22.819536] raw: ffffea0001ab5e68 ffffea0001ab5e68 0000000000000000
[   22.819536] page dumped because: kasan: bad access detected
[   22.819536] 
[   22.819536] Memory state around the buggy address:
[   22.819536]  ffff88007a1af980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   22.819536]  ffff88007a1afa00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f1
[   22.819536] >ffff88007a1afa80: f1 f1 f1 00 00 00 00 00 00 00 00 f3 f3 f3 f3 00
[   22.819536]                                                        ^
[   22.819536]  ffff88007a1afb00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   22.819536]  ffff88007a1afb80: 00 00 f1 f1 f1 f1 00 f2 f2 f2 f3 f3 f3 f3 f3 f3
[   22.819536] ==================================================================
[   22.882914] hpet1: lost 5 rtc interrupts
[   22.883636] note: kworker/u33:3[168] exited with preempt_count 1
[   22.885240] kworker/u33:3 (168) used greatest stack depth: 24768 bytes left
[   22.887136] BUG: unable to handle kernel paging request at ffffed00c0000000
[   22.888327] PGD 13ffd1067 P4D 13ffd1067 PUD 0 
[   22.889043] Oops: 0000 [#2] PREEMPT SMP KASAN
[   22.889747] CPU: 12 PID: 159 Comm: kworker/u33:2 Tainted: G    B D W         4.17.0-rc7-dgc+ #549
[   22.890253] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[   22.890253] Workqueue: events_unbound async_run_entry_fn
[   22.890253] RIP: 0010:check_memory_region+0xdd/0x190
[   22.890253] RSP: 0000:ffff88007a16ec08 EFLAGS: 00010202
[   22.890253] RAX: ffffed00c0000000 RBX: ffff880600000fff RCX: ffffffff81d41a12
[   22.890253] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff880600000000
[   22.890253] RBP: ffffed00c0000200 R08: 0000000000000200 R09: 0000000000000040
[   22.890253] R10: 0000000000000200 R11: ffffed00c00001ff R12: ffff880600000000
[   22.890253] R13: 0000000001927f90 R14: 0000000000000000 R15: 0000000000000000
[   22.890253] FS:  0000000000000000(0000) GS:ffff88007f780000(0000) knlGS:0000000000000000
[   22.890253] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   22.890253] CR2: ffffed00c0000000 CR3: 000000000300f001 CR4: 00000000000606e0
[   22.890253] Call Trace:
[   22.890253]  memcpy+0x1f/0x50
[   22.890253]  pmem_do_bvec+0x182/0x350
[   22.890253]  ? pmem_release_queue+0x10/0x10
[   22.890253]  ? blkdev_get+0x1e6/0x4f0
[   22.890253]  ? __device_add_disk+0x62a/0x780
[   22.890253]  ? pmem_attach_disk+0x580/0x7b0
[   22.890253]  ? do_raw_spin_unlock+0xa4/0x130
[   22.890253]  ? _raw_spin_unlock+0xa/0x20
[   22.890253]  ? create_task_io_context+0x195/0x1d0
[   22.890253]  ? generic_make_request_checks+0x33a/0x7a0
[   22.890253]  pmem_make_request+0x1d9/0x3c0
[   22.890253]  generic_make_request+0x2d8/0x6c0
[   22.890253]  ? blk_plug_queued_count+0xb0/0xb0
[   22.890253]  ? memset+0x1f/0x40
[   22.890253]  ? submit_bio+0xc8/0x1e0
[   22.890253]  submit_bio+0xc8/0x1e0
[   22.890253]  ? direct_make_request+0x150/0x150
[   22.890253]  ? alloc_buffer_head+0x3c/0x80
[   22.890253]  ? alloc_page_buffers+0x7f/0x120
[   22.890253]  ? __rcu_read_unlock+0x66/0x80
[   22.917466] ata2.01: NODEV after polling detection
[   22.890253]  ? guard_bio_eod+0xa9/0x1e0
[   22.919184] ata1.01: NODEV after polling detection
[   22.890253]  submit_bh_wbc.isra.41+0x257/0x280
[   22.920719] ata2.00: ATAPI: QEMU DVD-ROM, 2.5+, max UDMA/100
[   22.890253]  ? end_buffer_read_nobh+0x10/0x10
[   22.922324] ata1.00: ATA-7: QEMU HARDDISK, 2.5+, max UDMA/100
[   22.890253]  block_read_full_page+0x307/0x470
[   22.923573] ata1.00: 20971520 sectors, multi 16: LBA48 
[   22.890253]  ? block_llseek+0x80/0x80
[   22.890253]  ? block_page_mkwrite+0x150/0x150
[   22.890253]  ? add_to_page_cache_lru+0xd7/0x160
[   22.890253]  ? add_to_page_cache_locked+0x10/0x10
[   22.890253]  ? policy_node+0x56/0x60
[   22.926515] ata2.00: configured for MWDMA2
[   22.890253]  do_read_cache_page+0x3e3/0x730
[   22.927886] ata1.00: configured for MWDMA2
[   22.890253]  ? bus_probe_device+0xf3/0x120
[   22.890253]  ? async_run_entry_fn+0x75/0x1e0
[   22.890253]  ? blkdev_writepages+0x10/0x10
[   22.890253]  ? filemap_fault+0x940/0x940
[   22.890253]  ? unwind_next_frame+0x8cf/0x940
[   22.890253]  ? stack_access_ok+0x35/0x80
[   22.890253]  ? deref_stack_reg+0x7f/0xb0
[   22.890253]  ? __read_once_size_nocheck.constprop.6+0x10/0x10
[   22.890253]  ? unwind_get_return_address_ptr+0x50/0x50
[   22.890253]  ? save_stack+0x89/0xb0
[   22.890253]  ? __orc_find+0x6b/0xc0
[   22.890253]  ? unwind_next_frame+0xc8/0x940
[   22.890253]  read_dev_sector+0x61/0x120
[   22.890253]  read_lba+0x260/0x2d0
[   22.890253]  ? sgi_partition+0x3c0/0x3c0
[   22.890253]  ? __save_stack_trace+0x73/0xd0
[   22.890253]  ? is_gpt_valid.part.5+0x3c0/0x3c0
[   22.890253]  efi_partition+0x252/0xb40
[   22.890253]  ? __inc_numa_state+0x19/0x90
[   22.890253]  ? kasan_unpoison_shadow+0x30/0x40
[   22.890253]  ? is_gpt_valid.part.5+0x3c0/0x3c0
[   22.890253]  ? kasan_unpoison_shadow+0x30/0x40
[   22.890253]  ? get_page_from_freelist+0x16dd/0x1a50
[   22.890253]  ? widen_string+0x25/0x100
[   22.890253]  ? widen_string+0x25/0x100
[   22.890253]  ? string+0xc4/0xf0
[   22.890253]  ? format_decode+0x79/0x4b0
[   22.890253]  ? memcpy+0x34/0x50
[   22.890253]  ? vsnprintf+0xf2/0x7d0
[   22.890253]  ? num_to_str+0x160/0x160
[   22.890253]  ? __might_sleep+0x31/0xd0
[   22.890253]  ? snprintf+0x8f/0xc0
[   22.890253]  ? vscnprintf+0x30/0x30
[   22.890253]  ? is_gpt_valid.part.5+0x3c0/0x3c0
[   22.890253]  ? check_partition+0x18f/0x310
[   22.890253]  check_partition+0x18f/0x310
[   22.890253]  rescan_partitions+0x133/0x510
[   22.890253]  ? __might_sleep+0x31/0xd0
[   22.890253]  ? ___might_sleep+0x7c/0x1a0
[   22.890253]  ? bd_set_size+0x104/0x160
[   22.890253]  __blkdev_get+0x484/0x7b0
[   22.890253]  ? __raw_spin_lock_init+0x42/0x50
[   22.890253]  ? bd_set_size+0x160/0x160
[   22.890253]  ? map_id_range_down+0x150/0x170
[   22.890253]  ? free_user_ns+0x170/0x170
[   22.890253]  blkdev_get+0x1e6/0x4f0
[   22.890253]  ? __raw_spin_lock_init+0x42/0x50
[   22.890253]  ? __wake_up_bit+0x78/0xc0
[   22.890253]  ? __blkdev_get+0x7b0/0x7b0
[   22.890253]  ? refcount_sub_and_test+0xa7/0x120
[   22.890253]  ? refcount_inc+0x30/0x30
[   22.890253]  ? do_raw_spin_lock+0x9b/0x130
[   22.890253]  ? do_raw_spin_unlock+0xa4/0x130
[   22.890253]  ? kobject_put+0x2d/0x100
[   22.890253]  __device_add_disk+0x62a/0x780
[   22.890253]  ? bdget_disk+0x50/0x50
[   22.890253]  ? alloc_dax+0x222/0x2e0
[   22.890253]  ? kill_dax+0xc0/0xc0
[   22.890253]  ? mutex_lock+0x2a/0x50
[   22.890253]  ? nvdimm_badblocks_populate+0x56/0x1d0
[   22.890253]  ? __raw_spin_lock_init+0x42/0x50
[   22.890253]  pmem_attach_disk+0x580/0x7b0
[   22.890253]  ? nd_pmem_notify+0x240/0x240
[   22.890253]  ? nd_dax_probe+0xef/0x110
[   22.890253]  nvdimm_bus_probe+0x76/0xe0
[   22.890253]  driver_probe_device+0x31d/0x450
[   22.890253]  ? __driver_attach+0xd0/0xd0
[   22.890253]  bus_for_each_drv+0xd8/0x130
[   22.890253]  ? subsys_find_device_by_id+0x1e0/0x1e0
[   22.890253]  ? do_raw_spin_unlock+0xa4/0x130
[   22.890253]  __device_attach+0x14d/0x1b0
[   22.890253]  ? device_bind_driver+0x70/0x70
[   22.890253]  ? kobject_uevent_env+0x15d/0x7f0
[   22.890253]  bus_probe_device+0xf3/0x120
[   22.890253]  device_add+0x696/0xa20
[   22.890253]  ? device_private_init+0xc0/0xc0
[   22.890253]  ? finish_task_switch+0xf8/0x3c0
[   22.890253]  ? nd_async_device_unregister+0x30/0x30
[   22.890253]  nd_async_device_register+0xe/0x40
[   22.890253]  async_run_entry_fn+0x75/0x1e0
[   22.890253]  process_one_work+0x349/0x6d0
[   22.890253]  worker_thread+0x74/0x5c0
[   22.890253]  ? process_one_work+0x6d0/0x6d0
[   22.890253]  kthread+0x1b7/0x1e0
[   22.890253]  ? __kthread_bind_mask+0x70/0x70
[   22.890253]  ret_from_fork+0x24/0x30
[   22.890253] Code: 01 74 0b 41 80 38 00 74 f0 4d 85 c0 75 56 4c 01 c8 49 89 e8 49 29 c0 4d 8d 48 07 4d 85 c0 4d 0f 49 c8 49 c1 f9 03 45 85 c9 74 5b <48> 83 38 00 75 18 45 8d 41 ff 4e 8d 44 c0 08 48 8 
[   22.890253] RIP: check_memory_region+0xdd/0x190 RSP: ffff88007a16ec08
[   22.890253] CR2: ffffed00c0000000
[   22.890253] ---[ end trace 2d3ad661d958a6e9 ]---

-Dave.
Dan Williams June 4, 2018, 1:48 a.m. UTC | #16
On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote:
>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote:
>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
>> > >> > installed ndctl until this morning!) but after changing the kernel
>> > >> > it no longer works. That would make it a regression, yes?
>>
>> [....]
>>
>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
>> > >> has the following dependencies:
>> > >>
>> > >>         depends on MEMORY_HOTPLUG
>> > >>         depends on MEMORY_HOTREMOVE
>> > >>         depends on SPARSEMEM_VMEMMAP
>> > >
>> > > Filesystem DAX now has a dependency on memory hotplug?
>>
>> [....]
>>
>> > > OK, works now I've found the magic config incantantions to turn
>> > > everything I now need on.
>>
>> By enabling these options, my test VM now has a ~30s pause in the
>> boot very soon after the nvdimm subsystem is initialised.
>>
>> [    1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
>> [    1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A
>> [    1.552175] Non-volatile memory driver v1.3
>> [    2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz
>> [    2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns
>> [   37.217453] brd: module loaded
>> [   37.225423] loop: module loaded
>> [   37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
>> [   37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
>> [   37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
>> [   37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes
>> [   37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes
>>
>> The system does not appear to be consuming CPU, but it is blocking
>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in
>> a few seconds because I reboot it tens of times a day, this is a
>> problem....
>
> And when I turn on KASAN, the kernel fails to boot to a login prompt
> because:

What's your qemu and kernel command line? I'll take look at this first
thing tomorrow.
Dan Williams June 4, 2018, 11:40 p.m. UTC | #17
On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote:
>> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote:
>>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote:
>>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
>>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
>>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
>>> > >> > installed ndctl until this morning!) but after changing the kernel
>>> > >> > it no longer works. That would make it a regression, yes?
>>>
>>> [....]
>>>
>>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
>>> > >> has the following dependencies:
>>> > >>
>>> > >>         depends on MEMORY_HOTPLUG
>>> > >>         depends on MEMORY_HOTREMOVE
>>> > >>         depends on SPARSEMEM_VMEMMAP
>>> > >
>>> > > Filesystem DAX now has a dependency on memory hotplug?
>>>
>>> [....]
>>>
>>> > > OK, works now I've found the magic config incantantions to turn
>>> > > everything I now need on.
>>>
>>> By enabling these options, my test VM now has a ~30s pause in the
>>> boot very soon after the nvdimm subsystem is initialised.
>>>
>>> [    1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
>>> [    1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A
>>> [    1.552175] Non-volatile memory driver v1.3
>>> [    2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz
>>> [    2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns
>>> [   37.217453] brd: module loaded
>>> [   37.225423] loop: module loaded
>>> [   37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
>>> [   37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
>>> [   37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
>>> [   37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes
>>> [   37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes
>>>
>>> The system does not appear to be consuming CPU, but it is blocking
>>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in
>>> a few seconds because I reboot it tens of times a day, this is a
>>> problem....
>>
>> And when I turn on KASAN, the kernel fails to boot to a login prompt
>> because:
>
> What's your qemu and kernel command line? I'll take look at this first
> thing tomorrow.

I was able to reproduce this crash by just turning on KASAN...
investigating. It would still help to have your config for our own
regression testing purposes it makes sense for us to prioritize
"Dave's test config", similar to the priority of not breaking Linus'
laptop.
Mike Snitzer June 5, 2018, 12:33 a.m. UTC | #18
On Mon, Jun 04 2018 at  7:40pm -0400,
Dan Williams <dan.j.williams@intel.com> wrote:

> On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> > On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote:
> >> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote:
> >>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote:
> >>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> >>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
> >>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
> >>> > >> > installed ndctl until this morning!) but after changing the kernel
> >>> > >> > it no longer works. That would make it a regression, yes?
> >>>
> >>> [....]
> >>>
> >>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
> >>> > >> has the following dependencies:
> >>> > >>
> >>> > >>         depends on MEMORY_HOTPLUG
> >>> > >>         depends on MEMORY_HOTREMOVE
> >>> > >>         depends on SPARSEMEM_VMEMMAP
> >>> > >
> >>> > > Filesystem DAX now has a dependency on memory hotplug?
> >>>
> >>> [....]
> >>>
> >>> > > OK, works now I've found the magic config incantantions to turn
> >>> > > everything I now need on.
> >>>
> >>> By enabling these options, my test VM now has a ~30s pause in the
> >>> boot very soon after the nvdimm subsystem is initialised.
> >>>
> >>> [    1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
> >>> [    1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A
> >>> [    1.552175] Non-volatile memory driver v1.3
> >>> [    2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz
> >>> [    2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns
> >>> [   37.217453] brd: module loaded
> >>> [   37.225423] loop: module loaded
> >>> [   37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
> >>> [   37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
> >>> [   37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
> >>> [   37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes
> >>> [   37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes
> >>>
> >>> The system does not appear to be consuming CPU, but it is blocking
> >>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in
> >>> a few seconds because I reboot it tens of times a day, this is a
> >>> problem....
> >>
> >> And when I turn on KASAN, the kernel fails to boot to a login prompt
> >> because:
> >
> > What's your qemu and kernel command line? I'll take look at this first
> > thing tomorrow.
> 
> I was able to reproduce this crash by just turning on KASAN...
> investigating. It would still help to have your config for our own
> regression testing purposes it makes sense for us to prioritize
> "Dave's test config", similar to the priority of not breaking Linus'
> laptop.

Dave, _this_ is when you know you've arrived! ;)
Dan Williams June 5, 2018, 3:32 a.m. UTC | #19
[ adding KASAN devs...]

On Mon, Jun 4, 2018 at 4:40 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote:
>> On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote:
>>> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote:
>>>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote:
>>>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
>>>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
>>>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
>>>> > >> > installed ndctl until this morning!) but after changing the kernel
>>>> > >> > it no longer works. That would make it a regression, yes?
>>>>
>>>> [....]
>>>>
>>>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
>>>> > >> has the following dependencies:
>>>> > >>
>>>> > >>         depends on MEMORY_HOTPLUG
>>>> > >>         depends on MEMORY_HOTREMOVE
>>>> > >>         depends on SPARSEMEM_VMEMMAP
>>>> > >
>>>> > > Filesystem DAX now has a dependency on memory hotplug?
>>>>
>>>> [....]
>>>>
>>>> > > OK, works now I've found the magic config incantantions to turn
>>>> > > everything I now need on.
>>>>
>>>> By enabling these options, my test VM now has a ~30s pause in the
>>>> boot very soon after the nvdimm subsystem is initialised.
>>>>
>>>> [    1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
>>>> [    1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A
>>>> [    1.552175] Non-volatile memory driver v1.3
>>>> [    2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz
>>>> [    2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns
>>>> [   37.217453] brd: module loaded
>>>> [   37.225423] loop: module loaded
>>>> [   37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
>>>> [   37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
>>>> [   37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
>>>> [   37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes
>>>> [   37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes
>>>>
>>>> The system does not appear to be consuming CPU, but it is blocking
>>>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in
>>>> a few seconds because I reboot it tens of times a day, this is a
>>>> problem....
>>>
>>> And when I turn on KASAN, the kernel fails to boot to a login prompt
>>> because:
>>
>> What's your qemu and kernel command line? I'll take look at this first
>> thing tomorrow.
>
> I was able to reproduce this crash by just turning on KASAN...
> investigating. It would still help to have your config for our own
> regression testing purposes it makes sense for us to prioritize
> "Dave's test config", similar to the priority of not breaking Linus'
> laptop.

I believe this is a bug in KASAN, or a bug in devm_memremap_pages(),
depends on your point of view. At the very least it is a mismatch of
assumptions. KASAN learns of hot added memory via the memory hotplug
notifier. However, the devm_memremap_pages() implementation is
intentionally limited to the "first half" of the memory hotplug
procedure. I.e. it does just enough to setup the linear map for
pfn_to_page() and initialize the "struct page" memmap, but then stops
short of onlining the pages. This is why we are getting a NULL ptr
deref and not a KASAN report, because KASAN has no shadow area setup
for the linearly mapped pmem range.

In terms of solving it we could refactor kasan_mem_notifier() so that
devm_memremap_pages() can call it outside of the notifier... I'll give
this a shot.
Dave Chinner June 5, 2018, 5:55 a.m. UTC | #20
On Mon, Jun 04, 2018 at 08:33:26PM -0400, Mike Snitzer wrote:
> On Mon, Jun 04 2018 at  7:40pm -0400,
> Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> > > On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote:
> > >> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote:
> > >>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote:
> > >>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote:
> > >>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote:
> > >>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even
> > >>> > >> > installed ndctl until this morning!) but after changing the kernel
> > >>> > >> > it no longer works. That would make it a regression, yes?
> > >>>
> > >>> [....]
> > >>>
> > >>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which
> > >>> > >> has the following dependencies:
> > >>> > >>
> > >>> > >>         depends on MEMORY_HOTPLUG
> > >>> > >>         depends on MEMORY_HOTREMOVE
> > >>> > >>         depends on SPARSEMEM_VMEMMAP
> > >>> > >
> > >>> > > Filesystem DAX now has a dependency on memory hotplug?
> > >>>
> > >>> [....]
> > >>>
> > >>> > > OK, works now I've found the magic config incantantions to turn
> > >>> > > everything I now need on.
> > >>>
> > >>> By enabling these options, my test VM now has a ~30s pause in the
> > >>> boot very soon after the nvdimm subsystem is initialised.
> > >>>
> > >>> [    1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
> > >>> [    1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A
> > >>> [    1.552175] Non-volatile memory driver v1.3
> > >>> [    2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz
> > >>> [    2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns
> > >>> [   37.217453] brd: module loaded
> > >>> [   37.225423] loop: module loaded
> > >>> [   37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB)
> > >>> [   37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB)
> > >>> [   37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB)
> > >>> [   37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes
> > >>> [   37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes
> > >>>
> > >>> The system does not appear to be consuming CPU, but it is blocking
> > >>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in
> > >>> a few seconds because I reboot it tens of times a day, this is a
> > >>> problem....
> > >>
> > >> And when I turn on KASAN, the kernel fails to boot to a login prompt
> > >> because:
> > >
> > > What's your qemu and kernel command line? I'll take look at this first
> > > thing tomorrow.
> > 
> > I was able to reproduce this crash by just turning on KASAN...
> > investigating. It would still help to have your config for our own
> > regression testing purposes it makes sense for us to prioritize
> > "Dave's test config", similar to the priority of not breaking Linus'
> > laptop.
> 
> Dave, _this_ is when you know you've arrived! ;)

Nah, this is when I know I've been a real Grouch. Someone paint
me green, hand me a garbage bin and call me Oscar. :P

I've attached the config below. At one point (probably around 3.0)
it was the barest minimum needed to boot and test XFS in a VM.  I
only tend to manually turn on the things I need for testing, but
I've never swept out stuff I don't need so who knows what crap it's
gathered over the years.

BTW, I found the init delay - it was another RNG initialisation perf
regression (this time on the fast init path) as a result of all the
OMG-THE-RNG-IS-B0RKEN changes that went into 4.17. Hint: use
/dev/urandom as the host side entropy source, not /dev/random.

That just leaves KASAN....

Cheers,

Dave.
diff mbox

Patch

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 3943feb9a090..1d7bd96511f0 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -80,9 +80,9 @@  EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
  * This is a library function for filesystems to check if the block device
  * can be mounted with dax option.
  *
- * Return: negative errno if unsupported, 0 if supported.
+ * Return: true if supported, false if unsupported
  */
-int __bdev_dax_supported(struct block_device *bdev, int blocksize)
+bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
 {
 	struct dax_device *dax_dev;
 	pgoff_t pgoff;
@@ -95,21 +95,21 @@  int __bdev_dax_supported(struct block_device *bdev, int blocksize)
 	if (blocksize != PAGE_SIZE) {
 		pr_debug("%s: error: unsupported blocksize for dax\n",
 				bdevname(bdev, buf));
-		return -EINVAL;
+		return false;
 	}
 
 	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
 	if (err) {
 		pr_debug("%s: error: unaligned partition for dax\n",
 				bdevname(bdev, buf));
-		return err;
+		return false;
 	}
 
 	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	if (!dax_dev) {
 		pr_debug("%s: error: device does not support dax\n",
 				bdevname(bdev, buf));
-		return -EOPNOTSUPP;
+		return false;
 	}
 
 	id = dax_read_lock();
@@ -121,7 +121,7 @@  int __bdev_dax_supported(struct block_device *bdev, int blocksize)
 	if (len < 1) {
 		pr_debug("%s: error: dax access failed (%ld)\n",
 				bdevname(bdev, buf), len);
-		return len < 0 ? len : -EIO;
+		return false;
 	}
 
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
@@ -139,10 +139,10 @@  int __bdev_dax_supported(struct block_device *bdev, int blocksize)
 	} else {
 		pr_debug("%s: error: dax support not enabled\n",
 				bdevname(bdev, buf));
-		return -EOPNOTSUPP;
+		return false;
 	}
 
-	return 0;
+	return true;
 }
 EXPORT_SYMBOL_GPL(__bdev_dax_supported);
 #endif
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9627c3054b5c..c09289a42dc5 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -961,8 +961,7 @@  static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
 	if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
-		err = bdev_dax_supported(sb->s_bdev, blocksize);
-		if (err) {
+		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
 			ext2_msg(sb, KERN_ERR,
 				"DAX unsupported by block device. Turning off DAX.");
 			sbi->s_mount_opt &= ~EXT2_MOUNT_DAX;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 089170e99895..2e1622907f4a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3732,8 +3732,7 @@  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 					" that may contain inline data");
 			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
 		}
-		err = bdev_dax_supported(sb->s_bdev, blocksize);
-		if (err) {
+		if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
 			ext4_msg(sb, KERN_ERR,
 				"DAX unsupported by block device. Turning off DAX.");
 			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0effd46b965f..2c70a0a4f59f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1103,8 +1103,8 @@  xfs_ioctl_setattr_dax_invalidate(
 	if (fa->fsx_xflags & FS_XFLAG_DAX) {
 		if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
 			return -EINVAL;
-		if (bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
-				sb->s_blocksize) < 0)
+		if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+				sb->s_blocksize))
 			return -EINVAL;
 	}
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 62188c2a4c36..86915dc40eed 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1690,17 +1690,17 @@  xfs_fs_fill_super(
 		sb->s_flags |= SB_I_VERSION;
 
 	if (mp->m_flags & XFS_MOUNT_DAX) {
-		int	error2 = 0;
+		bool rtdev_is_dax = false, datadev_is_dax;
 
 		xfs_warn(mp,
 		"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
 
-		error = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
-				sb->s_blocksize);
+		datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
+			sb->s_blocksize);
 		if (mp->m_rtdev_targp)
-			error2 = bdev_dax_supported(mp->m_rtdev_targp->bt_bdev,
-					sb->s_blocksize);
-		if (error && error2) {
+			rtdev_is_dax = bdev_dax_supported(
+				mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
+		if (!rtdev_is_dax && !datadev_is_dax) {
 			xfs_alert(mp,
 			"DAX unsupported by block device. Turning off DAX.");
 			mp->m_flags &= ~XFS_MOUNT_DAX;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 2a4f7295c12b..c99692ddd4b5 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -64,8 +64,8 @@  static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
 struct writeback_control;
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
-int __bdev_dax_supported(struct block_device *bdev, int blocksize);
-static inline int bdev_dax_supported(struct block_device *bdev, int blocksize)
+bool __bdev_dax_supported(struct block_device *bdev, int blocksize);
+static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize)
 {
 	return __bdev_dax_supported(bdev, blocksize);
 }
@@ -84,10 +84,10 @@  struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
 int dax_writeback_mapping_range(struct address_space *mapping,
 		struct block_device *bdev, struct writeback_control *wbc);
 #else
-static inline int bdev_dax_supported(struct block_device *bdev,
+static inline bool bdev_dax_supported(struct block_device *bdev,
 		int blocksize)
 {
-	return -EOPNOTSUPP;
+	return false;
 }
 
 static inline struct dax_device *fs_dax_get_by_host(const char *host)