Message ID | 20180529195106.14268-3-ross.zwisler@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > From: Dave Jiang <dave.jiang@intel.com> > > The function return values are confusing with the way the function is > named. We expect a true or false return value but it actually returns > 0/-errno. This makes the code very confusing. Changing the return values > to return a bool where if DAX is supported then return true and no DAX > support returns false. > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> Looks ok, do you want me to pull the first two patches through the xfs tree? Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> --D > --- > drivers/dax/super.c | 16 ++++++++-------- > fs/ext2/super.c | 3 +-- > fs/ext4/super.c | 3 +-- > fs/xfs/xfs_ioctl.c | 4 ++-- > fs/xfs/xfs_super.c | 12 ++++++------ > include/linux/dax.h | 8 ++++---- > 6 files changed, 22 insertions(+), 24 deletions(-) > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > index 3943feb9a090..1d7bd96511f0 100644 > --- a/drivers/dax/super.c > +++ b/drivers/dax/super.c > @@ -80,9 +80,9 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); > * This is a library function for filesystems to check if the block device > * can be mounted with dax option. > * > - * Return: negative errno if unsupported, 0 if supported. > + * Return: true if supported, false if unsupported > */ > -int __bdev_dax_supported(struct block_device *bdev, int blocksize) > +bool __bdev_dax_supported(struct block_device *bdev, int blocksize) > { > struct dax_device *dax_dev; > pgoff_t pgoff; > @@ -95,21 +95,21 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize) > if (blocksize != PAGE_SIZE) { > pr_debug("%s: error: unsupported blocksize for dax\n", > bdevname(bdev, buf)); > - return -EINVAL; > + return false; > } > > err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); > if (err) { > pr_debug("%s: error: unaligned partition for dax\n", > bdevname(bdev, buf)); > - return err; > + return false; > } > > dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); > if (!dax_dev) { > pr_debug("%s: error: device does not support dax\n", > bdevname(bdev, buf)); > - return -EOPNOTSUPP; > + return false; > } > > id = dax_read_lock(); > @@ -121,7 +121,7 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize) > if (len < 1) { > pr_debug("%s: error: dax access failed (%ld)\n", > bdevname(bdev, buf), len); > - return len < 0 ? len : -EIO; > + return false; > } > > if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { > @@ -139,10 +139,10 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize) > } else { > pr_debug("%s: error: dax support not enabled\n", > bdevname(bdev, buf)); > - return -EOPNOTSUPP; > + return false; > } > > - return 0; > + return true; > } > EXPORT_SYMBOL_GPL(__bdev_dax_supported); > #endif > diff --git a/fs/ext2/super.c b/fs/ext2/super.c > index 9627c3054b5c..c09289a42dc5 100644 > --- a/fs/ext2/super.c > +++ b/fs/ext2/super.c > @@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) > blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); > > if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { > - err = bdev_dax_supported(sb->s_bdev, blocksize); > - if (err) { > + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { > ext2_msg(sb, KERN_ERR, > "DAX unsupported by block device. Turning off DAX."); > sbi->s_mount_opt &= ~EXT2_MOUNT_DAX; > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 089170e99895..2e1622907f4a 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -3732,8 +3732,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > " that may contain inline data"); > sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; > } > - err = bdev_dax_supported(sb->s_bdev, blocksize); > - if (err) { > + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { > ext4_msg(sb, KERN_ERR, > "DAX unsupported by block device. Turning off DAX."); > sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c > index 0effd46b965f..2c70a0a4f59f 100644 > --- a/fs/xfs/xfs_ioctl.c > +++ b/fs/xfs/xfs_ioctl.c > @@ -1103,8 +1103,8 @@ xfs_ioctl_setattr_dax_invalidate( > if (fa->fsx_xflags & FS_XFLAG_DAX) { > if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) > return -EINVAL; > - if (bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), > - sb->s_blocksize) < 0) > + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), > + sb->s_blocksize)) > return -EINVAL; > } > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index 62188c2a4c36..86915dc40eed 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -1690,17 +1690,17 @@ xfs_fs_fill_super( > sb->s_flags |= SB_I_VERSION; > > if (mp->m_flags & XFS_MOUNT_DAX) { > - int error2 = 0; > + bool rtdev_is_dax = false, datadev_is_dax; > > xfs_warn(mp, > "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); > > - error = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, > - sb->s_blocksize); > + datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, > + sb->s_blocksize); > if (mp->m_rtdev_targp) > - error2 = bdev_dax_supported(mp->m_rtdev_targp->bt_bdev, > - sb->s_blocksize); > - if (error && error2) { > + rtdev_is_dax = bdev_dax_supported( > + mp->m_rtdev_targp->bt_bdev, sb->s_blocksize); > + if (!rtdev_is_dax && !datadev_is_dax) { > xfs_alert(mp, > "DAX unsupported by block device. Turning off DAX."); > mp->m_flags &= ~XFS_MOUNT_DAX; > diff --git a/include/linux/dax.h b/include/linux/dax.h > index 2a4f7295c12b..c99692ddd4b5 100644 > --- a/include/linux/dax.h > +++ b/include/linux/dax.h > @@ -64,8 +64,8 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) > struct writeback_control; > int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); > #if IS_ENABLED(CONFIG_FS_DAX) > -int __bdev_dax_supported(struct block_device *bdev, int blocksize); > -static inline int bdev_dax_supported(struct block_device *bdev, int blocksize) > +bool __bdev_dax_supported(struct block_device *bdev, int blocksize); > +static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) > { > return __bdev_dax_supported(bdev, blocksize); > } > @@ -84,10 +84,10 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); > int dax_writeback_mapping_range(struct address_space *mapping, > struct block_device *bdev, struct writeback_control *wbc); > #else > -static inline int bdev_dax_supported(struct block_device *bdev, > +static inline bool bdev_dax_supported(struct block_device *bdev, > int blocksize) > { > - return -EOPNOTSUPP; > + return false; > } > > static inline struct dax_device *fs_dax_get_by_host(const char *host) > -- > 2.14.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > From: Dave Jiang <dave.jiang@intel.com> > > > > The function return values are confusing with the way the function is > > named. We expect a true or false return value but it actually returns > > 0/-errno. This makes the code very confusing. Changing the return values > > to return a bool where if DAX is supported then return true and no DAX > > support returns false. > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > Looks ok, do you want me to pull the first two patches through the xfs > tree? > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Thanks for the review. I'm not sure what's best. If you do that then Mike will need to have a DM branch for the rest of the series based on your stable commits, yea? Mike what would you prefer?
On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > > From: Dave Jiang <dave.jiang@intel.com> > > > > > > The function return values are confusing with the way the function is > > > named. We expect a true or false return value but it actually returns > > > 0/-errno. This makes the code very confusing. Changing the return values > > > to return a bool where if DAX is supported then return true and no DAX > > > support returns false. > > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > Looks ok, do you want me to pull the first two patches through the xfs > > tree? > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > Thanks for the review. > > I'm not sure what's best. If you do that then Mike will need to have a DM > branch for the rest of the series based on your stable commits, yea? > > Mike what would you prefer? I /was/ about to say that I would pull in the first two patches, but now I can't get xfs to mount with pmem at all, and have no way of testing this...? # echo 'file drivers/dax/* +p' > /sys/kernel/debug/dynamic_debug/control # mount /dev/pmem3 -o rtdev=/dev/pmem4,dax /mnt # dmesg <snip> SGI XFS with ACLs, security attributes, realtime, scrub, debug enabled XFS (pmem3): DAX enabled. Warning: EXPERIMENTAL, use at your own risk pmem3: error: dax support not enabled pmem4: error: dax support not enabled XFS (pmem3): DAX unsupported by block device. Turning off DAX. XFS (pmem3): Mounting V5 Filesystem XFS (pmem3): Ending clean mount Evidently the pfn it picks up is missing PFN_MAP in flags because ND_REGION_PAGEMAP isn't set, and looking at the kernel source, pmem that comes in via NFIT never gets that set...? relevant qemu pmem options: -object memory-backend-file,id=memnvdimm0,prealloc=no,mem-path=/dev/shm/a.img,share=yes,size=13488881664 -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 (repeat for five more devices) <confused> --D NFIT table contents: 0000000 4e 46 49 54 78 04 00 00 01 46 42 4f 43 48 53 20 N F I T x 004 \0 \0 001 F B O C H S 0000016 42 58 50 43 4e 46 49 54 01 00 00 00 42 58 50 43 B X P C N F I T 001 \0 \0 \0 B X P C 0000032 01 00 00 00 00 00 00 00 00 00 38 00 08 00 03 00 001 \0 \0 \0 \0 \0 \0 \0 \0 \0 8 \0 \b \0 003 \0 0000048 00 00 00 00 01 00 00 00 79 d3 f0 66 f3 b4 74 40 \0 \0 \0 \0 001 \0 \0 \0 y 323 360 f 363 264 t @ 0000064 ac 43 0d 33 18 b7 8c db 00 00 00 6c 0a 00 00 00 254 C \r 3 030 267 214 333 \0 \0 \0 l \n \0 \0 \0 0000080 00 00 00 24 03 00 00 00 08 80 00 00 00 00 00 00 \0 \0 \0 $ 003 \0 \0 \0 \b 200 \0 \0 \0 \0 \0 \0 0000096 01 00 30 00 04 00 00 00 00 00 00 00 08 00 09 00 001 \0 0 \0 004 \0 \0 \0 \0 \0 \0 \0 \b \0 \t \0 0000112 00 00 00 24 03 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 $ 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000128 00 00 00 00 00 00 00 00 00 00 01 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 001 \0 \0 \0 \0 \0 0000144 04 00 50 00 09 00 86 80 01 00 01 00 00 00 00 00 004 \0 P \0 \t \0 206 200 001 \0 001 \0 \0 \0 \0 \0 0000160 00 00 00 00 00 00 00 00 59 34 12 00 01 03 00 00 \0 \0 \0 \0 \0 \0 \0 \0 Y 4 022 \0 001 003 \0 \0 0000176 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 * 0000224 00 00 38 00 0a 00 03 00 00 00 00 00 00 00 00 00 \0 \0 8 \0 \n \0 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000240 79 d3 f0 66 f3 b4 74 40 ac 43 0d 33 18 b7 8c db y 323 360 f 363 264 t @ 254 C \r 3 030 267 214 333 0000256 00 00 00 90 0d 00 00 00 00 00 00 24 03 00 00 00 \0 \0 \0 220 \r \0 \0 \0 \0 \0 \0 $ 003 \0 \0 \0 0000272 08 80 00 00 00 00 00 00 01 00 30 00 05 00 00 00 \b 200 \0 \0 \0 \0 \0 \0 001 \0 0 \0 005 \0 \0 \0 0000288 00 00 00 00 0a 00 0b 00 00 00 00 24 03 00 00 00 \0 \0 \0 \0 \n \0 \v \0 \0 \0 \0 $ 003 \0 \0 \0 0000304 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000320 00 00 01 00 00 00 00 00 04 00 50 00 0b 00 86 80 \0 \0 001 \0 \0 \0 \0 \0 004 \0 P \0 \v \0 206 200 0000336 01 00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 001 \0 001 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000352 5a 34 12 00 01 03 00 00 00 00 00 00 00 00 00 00 Z 4 022 \0 001 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000368 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 * 0000400 00 00 00 00 00 00 00 00 00 00 38 00 0c 00 03 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 8 \0 \f \0 003 \0 0000416 00 00 00 00 01 00 00 00 79 d3 f0 66 f3 b4 74 40 \0 \0 \0 \0 001 \0 \0 \0 y 323 360 f 363 264 t @ 0000432 ac 43 0d 33 18 b7 8c db 00 00 00 b4 10 00 00 00 254 C \r 3 030 267 214 333 \0 \0 \0 264 020 \0 \0 \0 0000448 00 00 00 24 03 00 00 00 08 80 00 00 00 00 00 00 \0 \0 \0 $ 003 \0 \0 \0 \b 200 \0 \0 \0 \0 \0 \0 0000464 01 00 30 00 06 00 00 00 00 00 00 00 0c 00 0d 00 001 \0 0 \0 006 \0 \0 \0 \0 \0 \0 \0 \f \0 \r \0 0000480 00 00 00 24 03 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 $ 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000496 00 00 00 00 00 00 00 00 00 00 01 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 001 \0 \0 \0 \0 \0 0000512 04 00 50 00 0d 00 86 80 01 00 01 00 00 00 00 00 004 \0 P \0 \r \0 206 200 001 \0 001 \0 \0 \0 \0 \0 0000528 00 00 00 00 00 00 00 00 5b 34 12 00 01 03 00 00 \0 \0 \0 \0 \0 \0 \0 \0 [ 4 022 \0 001 003 \0 \0 0000544 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 * 0000592 00 00 38 00 02 00 03 00 00 00 00 00 00 00 00 00 \0 \0 8 \0 002 \0 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000608 79 d3 f0 66 f3 b4 74 40 ac 43 0d 33 18 b7 8c db y 323 360 f 363 264 t @ 254 C \r 3 030 267 214 333 0000624 00 00 00 00 01 00 00 00 00 00 00 24 03 00 00 00 \0 \0 \0 \0 001 \0 \0 \0 \0 \0 \0 $ 003 \0 \0 \0 0000640 08 80 00 00 00 00 00 00 01 00 30 00 01 00 00 00 \b 200 \0 \0 \0 \0 \0 \0 001 \0 0 \0 001 \0 \0 \0 0000656 00 00 00 00 02 00 03 00 00 00 00 24 03 00 00 00 \0 \0 \0 \0 002 \0 003 \0 \0 \0 \0 $ 003 \0 \0 \0 0000672 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000688 00 00 01 00 00 00 00 00 04 00 50 00 03 00 86 80 \0 \0 001 \0 \0 \0 \0 \0 004 \0 P \0 003 \0 206 200 0000704 01 00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 001 \0 001 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000720 56 34 12 00 01 03 00 00 00 00 00 00 00 00 00 00 V 4 022 \0 001 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000736 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 * 0000768 00 00 00 00 00 00 00 00 00 00 38 00 04 00 03 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 8 \0 004 \0 003 \0 0000784 00 00 00 00 01 00 00 00 79 d3 f0 66 f3 b4 74 40 \0 \0 \0 \0 001 \0 \0 \0 y 323 360 f 363 264 t @ 0000800 ac 43 0d 33 18 b7 8c db 00 00 00 24 04 00 00 00 254 C \r 3 030 267 214 333 \0 \0 \0 $ 004 \0 \0 \0 0000816 00 00 00 24 03 00 00 00 08 80 00 00 00 00 00 00 \0 \0 \0 $ 003 \0 \0 \0 \b 200 \0 \0 \0 \0 \0 \0 0000832 01 00 30 00 02 00 00 00 00 00 00 00 04 00 05 00 001 \0 0 \0 002 \0 \0 \0 \0 \0 \0 \0 004 \0 005 \0 0000848 00 00 00 24 03 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 $ 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000864 00 00 00 00 00 00 00 00 00 00 01 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 001 \0 \0 \0 \0 \0 0000880 04 00 50 00 05 00 86 80 01 00 01 00 00 00 00 00 004 \0 P \0 005 \0 206 200 001 \0 001 \0 \0 \0 \0 \0 0000896 00 00 00 00 00 00 00 00 57 34 12 00 01 03 00 00 \0 \0 \0 \0 \0 \0 \0 \0 W 4 022 \0 001 003 \0 \0 0000912 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 * 0000960 00 00 38 00 06 00 03 00 00 00 00 00 00 00 00 00 \0 \0 8 \0 006 \0 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 0000976 79 d3 f0 66 f3 b4 74 40 ac 43 0d 33 18 b7 8c db y 323 360 f 363 264 t @ 254 C \r 3 030 267 214 333 0000992 00 00 00 48 07 00 00 00 00 00 00 24 03 00 00 00 \0 \0 \0 H \a \0 \0 \0 \0 \0 \0 $ 003 \0 \0 \0 0001008 08 80 00 00 00 00 00 00 01 00 30 00 03 00 00 00 \b 200 \0 \0 \0 \0 \0 \0 001 \0 0 \0 003 \0 \0 \0 0001024 00 00 00 00 06 00 07 00 00 00 00 24 03 00 00 00 \0 \0 \0 \0 006 \0 \a \0 \0 \0 \0 $ 003 \0 \0 \0 0001040 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0001056 00 00 01 00 00 00 00 00 04 00 50 00 07 00 86 80 \0 \0 001 \0 \0 \0 \0 \0 004 \0 P \0 \a \0 206 200 0001072 01 00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 001 \0 001 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0001088 58 34 12 00 01 03 00 00 00 00 00 00 00 00 00 00 X 4 022 \0 001 003 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 0001104 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 * 0001136 00 00 00 00 00 00 00 00 \0 \0 \0 \0 \0 \0 \0 \0 0001144 > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote: > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > > > From: Dave Jiang <dave.jiang@intel.com> > > > > > > > > The function return values are confusing with the way the function is > > > > named. We expect a true or false return value but it actually returns > > > > 0/-errno. This makes the code very confusing. Changing the return values > > > > to return a bool where if DAX is supported then return true and no DAX > > > > support returns false. > > > > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > > > Looks ok, do you want me to pull the first two patches through the xfs > > > tree? > > > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > Thanks for the review. > > > > I'm not sure what's best. If you do that then Mike will need to have a DM > > branch for the rest of the series based on your stable commits, yea? > > > > Mike what would you prefer? > > I /was/ about to say that I would pull in the first two patches, but now > I can't get xfs to mount with pmem at all, and have no way of testing > this...? I'm not sure what's up - I'll dig in and find out.
On Thu, May 31, 2018 at 12:13 PM, Darrick J. Wong <darrick.wong@oracle.com> wrote: > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: >> On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: >> > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: >> > > From: Dave Jiang <dave.jiang@intel.com> >> > > >> > > The function return values are confusing with the way the function is >> > > named. We expect a true or false return value but it actually returns >> > > 0/-errno. This makes the code very confusing. Changing the return values >> > > to return a bool where if DAX is supported then return true and no DAX >> > > support returns false. >> > > >> > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> >> > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> >> > >> > Looks ok, do you want me to pull the first two patches through the xfs >> > tree? >> > >> > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> >> >> Thanks for the review. >> >> I'm not sure what's best. If you do that then Mike will need to have a DM >> branch for the rest of the series based on your stable commits, yea? >> >> Mike what would you prefer? > > I /was/ about to say that I would pull in the first two patches, but now > I can't get xfs to mount with pmem at all, and have no way of testing > this...? > > # echo 'file drivers/dax/* +p' > /sys/kernel/debug/dynamic_debug/control > # mount /dev/pmem3 -o rtdev=/dev/pmem4,dax /mnt > # dmesg > <snip> > SGI XFS with ACLs, security attributes, realtime, scrub, debug enabled > XFS (pmem3): DAX enabled. Warning: EXPERIMENTAL, use at your own risk > pmem3: error: dax support not enabled > pmem4: error: dax support not enabled > XFS (pmem3): DAX unsupported by block device. Turning off DAX. > XFS (pmem3): Mounting V5 Filesystem > XFS (pmem3): Ending clean mount > > Evidently the pfn it picks up is missing PFN_MAP in flags because > ND_REGION_PAGEMAP isn't set, and looking at the kernel source, pmem that > comes in via NFIT never gets that set...? > > relevant qemu pmem options: > > -object memory-backend-file,id=memnvdimm0,prealloc=no,mem-path=/dev/shm/a.img,share=yes,size=13488881664 > -device nvdimm,node=0,memdev=memnvdimm0,id=nvdimm0,slot=0 > (repeat for five more devices) > > <confused> > What does "ndctl list" say? The namespaces need to be in fsdax mode.
On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote: > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > > > From: Dave Jiang <dave.jiang@intel.com> > > > > > > > > The function return values are confusing with the way the function is > > > > named. We expect a true or false return value but it actually returns > > > > 0/-errno. This makes the code very confusing. Changing the return values > > > > to return a bool where if DAX is supported then return true and no DAX > > > > support returns false. > > > > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > > > Looks ok, do you want me to pull the first two patches through the xfs > > > tree? > > > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > Thanks for the review. > > > > I'm not sure what's best. If you do that then Mike will need to have a DM > > branch for the rest of the series based on your stable commits, yea? > > > > Mike what would you prefer? > > I /was/ about to say that I would pull in the first two patches, but now > I can't get xfs to mount with pmem at all, and have no way of testing > this...? > > # echo 'file drivers/dax/* +p' > /sys/kernel/debug/dynamic_debug/control > # mount /dev/pmem3 -o rtdev=/dev/pmem4,dax /mnt > # dmesg > <snip> > SGI XFS with ACLs, security attributes, realtime, scrub, debug enabled > XFS (pmem3): DAX enabled. Warning: EXPERIMENTAL, use at your own risk > pmem3: error: dax support not enabled > pmem4: error: dax support not enabled > XFS (pmem3): DAX unsupported by block device. Turning off DAX. > XFS (pmem3): Mounting V5 Filesystem > XFS (pmem3): Ending clean mount This same sequence worked fine in my QEMU setup. My guess is the issue is that your namespaces are in raw mode: # ndctl list [ { "dev":"namespace1.0", "mode":"raw", << this "size":18119393280, "sector_size":512, "blockdev":"pmem1", "numa_node":0 }, { "dev":"namespace0.0", "mode":"raw", << this "size":18119393280, "sector_size":512, "blockdev":"pmem0", "numa_node":0 } ] If so, you can solve this by putting them in fsdax mode with ndctl: # ndctl create-namespace -f -e namespace0.0 --mode=fsdax { "dev":"namespace0.0", "mode":"fsdax", "size":"16.61 GiB (17.83 GB)", "uuid":"4c193c83-c031-41d4-8f96-2ff5c7b59c69", "raw_uuid":"2f785f09-717a-4a5e-9e9d-88d489ef1030", "sector_size":512, "blockdev":"pmem0", "numa_node":0 } # ndctl create-namespace -f -e namespace1.0 --mode=fsdax { "dev":"namespace1.0", "mode":"fsdax", "size":"16.61 GiB (17.83 GB)", "uuid":"39ec9cc4-a7ef-4f99-8354-60105543bf47", "raw_uuid":"403fae76-d4f8-4bed-9a3f-ef40924762be", "sector_size":512, "blockdev":"pmem1", "numa_node":0 } If that doesn't fix it for you, ping me on freenode and we'll figure it out. Thanks, - Ross
On Thu, May 31 2018 at 3:13pm -0400, Darrick J. Wong <darrick.wong@oracle.com> wrote: > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > > > From: Dave Jiang <dave.jiang@intel.com> > > > > > > > > The function return values are confusing with the way the function is > > > > named. We expect a true or false return value but it actually returns > > > > 0/-errno. This makes the code very confusing. Changing the return values > > > > to return a bool where if DAX is supported then return true and no DAX > > > > support returns false. > > > > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > > > Looks ok, do you want me to pull the first two patches through the xfs > > > tree? > > > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > Thanks for the review. > > > > I'm not sure what's best. If you do that then Mike will need to have a DM > > branch for the rest of the series based on your stable commits, yea? > > > > Mike what would you prefer? > > I /was/ about to say that I would pull in the first two patches, but now > I can't get xfs to mount with pmem at all, and have no way of testing > this...? Once you get this sorted out, please feel free to pull in the first 2. I'm unlikely to get to reviewing the DM patches in this series until tomorrow at the earliest. Mike
On Thu, May 31, 2018 at 04:52:06PM -0400, Mike Snitzer wrote: > On Thu, May 31 2018 at 3:13pm -0400, > Darrick J. Wong <darrick.wong@oracle.com> wrote: > > > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > > > > From: Dave Jiang <dave.jiang@intel.com> > > > > > > > > > > The function return values are confusing with the way the function is > > > > > named. We expect a true or false return value but it actually returns > > > > > 0/-errno. This makes the code very confusing. Changing the return values > > > > > to return a bool where if DAX is supported then return true and no DAX > > > > > support returns false. > > > > > > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > > > > > Looks ok, do you want me to pull the first two patches through the xfs > > > > tree? > > > > > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > Thanks for the review. > > > > > > I'm not sure what's best. If you do that then Mike will need to have a DM > > > branch for the rest of the series based on your stable commits, yea? > > > > > > Mike what would you prefer? > > > > I /was/ about to say that I would pull in the first two patches, but now > > I can't get xfs to mount with pmem at all, and have no way of testing > > this...? > > Once you get this sorted out, please feel free to pull in the first 2. Sorted. It'll be in Friday's for-next. Ross helped me bang on the pmem devices w/ ndctl to enable fsdax mode and twist qemu until everything worked properly. ;) --D > I'm unlikely to get to reviewing the DM patches in this series until > tomorrow at the earliest. > > Mike > > -- > dm-devel mailing list > dm-devel@redhat.com > https://www.redhat.com/mailman/listinfo/dm-devel
On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote: > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > > > From: Dave Jiang <dave.jiang@intel.com> > > > > > > > > The function return values are confusing with the way the function is > > > > named. We expect a true or false return value but it actually returns > > > > 0/-errno. This makes the code very confusing. Changing the return values > > > > to return a bool where if DAX is supported then return true and no DAX > > > > support returns false. > > > > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > > > Looks ok, do you want me to pull the first two patches through the xfs > > > tree? > > > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > Thanks for the review. > > > > I'm not sure what's best. If you do that then Mike will need to have a DM > > branch for the rest of the series based on your stable commits, yea? > > > > Mike what would you prefer? > > I /was/ about to say that I would pull in the first two patches, but now > I can't get xfs to mount with pmem at all, and have no way of testing > this...? I have similar problems, too, but: $ ndctl list [ { "dev":"namespace1.0", "mode":"raw", "size":8589934592, "sector_size":512, "blockdev":"pmem1" }, { "dev":"namespace0.0", "mode":"raw", "size":8589934592, "sector_size":512, "blockdev":"pmem0" } ] $ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax Error: operation failed, region0 fsdax mode not available failed to reconfigure namespace: Invalid argument $ I can't make head or tail of what is going wrong here - how am I supposed to debug this and get it working again? FWIW, XFS+DAX used to just work on this setup (I hadn't even installed ndctl until this morning!) but after changing the kernel it no longer works. That would make it a regression, yes? Cheers, Dave.
On Thu, May 31, 2018 at 6:26 PM, Dave Chinner <david@fromorbit.com> wrote: > On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote: >> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: >> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: >> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: >> > > > From: Dave Jiang <dave.jiang@intel.com> >> > > > >> > > > The function return values are confusing with the way the function is >> > > > named. We expect a true or false return value but it actually returns >> > > > 0/-errno. This makes the code very confusing. Changing the return values >> > > > to return a bool where if DAX is supported then return true and no DAX >> > > > support returns false. >> > > > >> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> >> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> >> > > >> > > Looks ok, do you want me to pull the first two patches through the xfs >> > > tree? >> > > >> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> >> > >> > Thanks for the review. >> > >> > I'm not sure what's best. If you do that then Mike will need to have a DM >> > branch for the rest of the series based on your stable commits, yea? >> > >> > Mike what would you prefer? >> >> I /was/ about to say that I would pull in the first two patches, but now >> I can't get xfs to mount with pmem at all, and have no way of testing >> this...? > > I have similar problems, too, but: > > $ ndctl list > [ > { > "dev":"namespace1.0", > "mode":"raw", > "size":8589934592, > "sector_size":512, > "blockdev":"pmem1" > }, > { > "dev":"namespace0.0", > "mode":"raw", > "size":8589934592, > "sector_size":512, > "blockdev":"pmem0" > } > ] > $ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax > Error: operation failed, region0 fsdax mode not available > > failed to reconfigure namespace: Invalid argument > $ > > I can't make head or tail of what is going wrong here - how am I > supposed to debug this and get it working again? > > FWIW, XFS+DAX used to just work on this setup (I hadn't even > installed ndctl until this morning!) but after changing the kernel > it no longer works. That would make it a regression, yes? This commit caused the behavior change: 569d0365f571 dax: require 'struct page' by default for filesystem dax The justification is in that patch, but the short summary is we killed off "pageless" dax because it had so many incomplete holes and surprise behaviors. It needed to die on the path to making dax not experimental, i.e. to close safety holes, and be feature complete for all the ways userspace expects to use mappings (direct-io, fork, poison handling, etc). I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which has the following dependencies: depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP I've created a task to go improve ndctl's error reporting for troubleshooting this failure.
On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: > On Thu, May 31, 2018 at 6:26 PM, Dave Chinner <david@fromorbit.com> wrote: > > On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote: > >> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > >> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > >> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > >> > > > From: Dave Jiang <dave.jiang@intel.com> > >> > > > > >> > > > The function return values are confusing with the way the function is > >> > > > named. We expect a true or false return value but it actually returns > >> > > > 0/-errno. This makes the code very confusing. Changing the return values > >> > > > to return a bool where if DAX is supported then return true and no DAX > >> > > > support returns false. > >> > > > > >> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > >> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > >> > > > >> > > Looks ok, do you want me to pull the first two patches through the xfs > >> > > tree? > >> > > > >> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > >> > > >> > Thanks for the review. > >> > > >> > I'm not sure what's best. If you do that then Mike will need to have a DM > >> > branch for the rest of the series based on your stable commits, yea? > >> > > >> > Mike what would you prefer? > >> > >> I /was/ about to say that I would pull in the first two patches, but now > >> I can't get xfs to mount with pmem at all, and have no way of testing > >> this...? > > > > I have similar problems, too, but: > > > > $ ndctl list > > [ > > { > > "dev":"namespace1.0", > > "mode":"raw", > > "size":8589934592, > > "sector_size":512, > > "blockdev":"pmem1" > > }, > > { > > "dev":"namespace0.0", > > "mode":"raw", > > "size":8589934592, > > "sector_size":512, > > "blockdev":"pmem0" > > } > > ] > > $ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax > > Error: operation failed, region0 fsdax mode not available > > > > failed to reconfigure namespace: Invalid argument > > $ > > > > I can't make head or tail of what is going wrong here - how am I > > supposed to debug this and get it working again? > > > > FWIW, XFS+DAX used to just work on this setup (I hadn't even > > installed ndctl until this morning!) but after changing the kernel > > it no longer works. That would make it a regression, yes? > > This commit caused the behavior change: > > 569d0365f571 dax: require 'struct page' by default for filesystem dax > > The justification is in that patch, but the short summary is we killed > off "pageless" dax because it had so many incomplete holes and > surprise behaviors. It needed to die on the path to making dax not > experimental, i.e. to close safety holes, and be feature complete for > all the ways userspace expects to use mappings (direct-io, fork, > poison handling, etc). > > I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which > has the following dependencies: > > depends on MEMORY_HOTPLUG > depends on MEMORY_HOTREMOVE > depends on SPARSEMEM_VMEMMAP Filesystem DAX now has a dependency on memory hotplug? Fmeh. No wonder I never enabled CONFIG_ZONE_DEVICE. It's described in menuconfig as "Device memory (pmem, HMM, etc...) hotplug support". This isn't for hotplug support - it required for basic DAX functionality. I've never enabled memory hotplug in any of my test kernel configs, because the VMs I run the kernels on don't ever get memory hotplugged.... OK, works now I've found the magic config incantantions to turn everything I now need on. Can we get this rationalised to a single top level config option in the filesystems menu "Enable Filesystem DAX" that turns on every knob that is required? Cheers, Dave.
On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: >> On Thu, May 31, 2018 at 6:26 PM, Dave Chinner <david@fromorbit.com> wrote: >> > On Thu, May 31, 2018 at 12:13:32PM -0700, Darrick J. Wong wrote: >> >> On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: >> >> > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: >> >> > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: >> >> > > > From: Dave Jiang <dave.jiang@intel.com> >> >> > > > >> >> > > > The function return values are confusing with the way the function is >> >> > > > named. We expect a true or false return value but it actually returns >> >> > > > 0/-errno. This makes the code very confusing. Changing the return values >> >> > > > to return a bool where if DAX is supported then return true and no DAX >> >> > > > support returns false. >> >> > > > >> >> > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> >> >> > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> >> >> > > >> >> > > Looks ok, do you want me to pull the first two patches through the xfs >> >> > > tree? >> >> > > >> >> > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> >> >> > >> >> > Thanks for the review. >> >> > >> >> > I'm not sure what's best. If you do that then Mike will need to have a DM >> >> > branch for the rest of the series based on your stable commits, yea? >> >> > >> >> > Mike what would you prefer? >> >> >> >> I /was/ about to say that I would pull in the first two patches, but now >> >> I can't get xfs to mount with pmem at all, and have no way of testing >> >> this...? >> > >> > I have similar problems, too, but: >> > >> > $ ndctl list >> > [ >> > { >> > "dev":"namespace1.0", >> > "mode":"raw", >> > "size":8589934592, >> > "sector_size":512, >> > "blockdev":"pmem1" >> > }, >> > { >> > "dev":"namespace0.0", >> > "mode":"raw", >> > "size":8589934592, >> > "sector_size":512, >> > "blockdev":"pmem0" >> > } >> > ] >> > $ sudo ndctl create-namespace -f -e namespace0.0 --mode=fsdax >> > Error: operation failed, region0 fsdax mode not available >> > >> > failed to reconfigure namespace: Invalid argument >> > $ >> > >> > I can't make head or tail of what is going wrong here - how am I >> > supposed to debug this and get it working again? >> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even >> > installed ndctl until this morning!) but after changing the kernel >> > it no longer works. That would make it a regression, yes? >> >> This commit caused the behavior change: >> >> 569d0365f571 dax: require 'struct page' by default for filesystem dax >> >> The justification is in that patch, but the short summary is we killed >> off "pageless" dax because it had so many incomplete holes and >> surprise behaviors. It needed to die on the path to making dax not >> experimental, i.e. to close safety holes, and be feature complete for >> all the ways userspace expects to use mappings (direct-io, fork, >> poison handling, etc). >> >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which >> has the following dependencies: >> >> depends on MEMORY_HOTPLUG >> depends on MEMORY_HOTREMOVE >> depends on SPARSEMEM_VMEMMAP > > Filesystem DAX now has a dependency on memory hotplug? > > Fmeh. No wonder I never enabled CONFIG_ZONE_DEVICE. It's described in > menuconfig as "Device memory (pmem, HMM, etc...) hotplug support". > This isn't for hotplug support - it required for basic DAX > functionality. I've never enabled memory hotplug in any of my test > kernel configs, because the VMs I run the kernels on don't ever get > memory hotplugged.... > > OK, works now I've found the magic config incantantions to turn > everything I now need on. > > Can we get this rationalised to a single top level config > option in the filesystems menu "Enable Filesystem DAX" that turns > on every knob that is required? That sounds good to me. Will do.
On Thu, May 31, 2018 at 03:26:45PM -0700, Darrick J. Wong wrote: > On Thu, May 31, 2018 at 04:52:06PM -0400, Mike Snitzer wrote: > > On Thu, May 31 2018 at 3:13pm -0400, > > Darrick J. Wong <darrick.wong@oracle.com> wrote: > > > > > On Tue, May 29, 2018 at 04:01:14PM -0600, Ross Zwisler wrote: > > > > On Tue, May 29, 2018 at 02:25:10PM -0700, Darrick J. Wong wrote: > > > > > On Tue, May 29, 2018 at 01:51:01PM -0600, Ross Zwisler wrote: > > > > > > From: Dave Jiang <dave.jiang@intel.com> > > > > > > > > > > > > The function return values are confusing with the way the function is > > > > > > named. We expect a true or false return value but it actually returns > > > > > > 0/-errno. This makes the code very confusing. Changing the return values > > > > > > to return a bool where if DAX is supported then return true and no DAX > > > > > > support returns false. > > > > > > > > > > > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > > > > > Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com> > > > > > > > > > > Looks ok, do you want me to pull the first two patches through the xfs > > > > > tree? > > > > > > > > > > Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> > > > > > > > > Thanks for the review. > > > > > > > > I'm not sure what's best. If you do that then Mike will need to have a DM > > > > branch for the rest of the series based on your stable commits, yea? > > > > > > > > Mike what would you prefer? > > > > > > I /was/ about to say that I would pull in the first two patches, but now > > > I can't get xfs to mount with pmem at all, and have no way of testing > > > this...? > > > > Once you get this sorted out, please feel free to pull in the first 2. > > Sorted. It'll be in Friday's for-next. Ross helped me bang on the pmem > devices w/ ndctl to enable fsdax mode and twist qemu until everything > worked properly. ;) For anyone else who would like to simulate persistent memory using QEMU, I've added some hints here: https://nvdimm.wiki.kernel.org/pmem_in_qemu
On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote: > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even > >> > installed ndctl until this morning!) but after changing the kernel > >> > it no longer works. That would make it a regression, yes? [....] > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which > >> has the following dependencies: > >> > >> depends on MEMORY_HOTPLUG > >> depends on MEMORY_HOTREMOVE > >> depends on SPARSEMEM_VMEMMAP > > > > Filesystem DAX now has a dependency on memory hotplug? [....] > > OK, works now I've found the magic config incantantions to turn > > everything I now need on. By enabling these options, my test VM now has a ~30s pause in the boot very soon after the nvdimm subsystem is initialised. [ 1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled [ 1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A [ 1.552175] Non-volatile memory driver v1.3 [ 2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz [ 2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns [ 37.217453] brd: module loaded [ 37.225423] loop: module loaded [ 37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) [ 37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) [ 37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) [ 37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes [ 37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes The system does not appear to be consuming CPU, but it is blocking NMIs so I can't get a CPU trace. For a VM that I rely on booting in a few seconds because I reboot it tens of times a day, this is a problem.... Cheers, Dave.
On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote: > On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote: > > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: > > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: > > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even > > >> > installed ndctl until this morning!) but after changing the kernel > > >> > it no longer works. That would make it a regression, yes? > > [....] > > > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which > > >> has the following dependencies: > > >> > > >> depends on MEMORY_HOTPLUG > > >> depends on MEMORY_HOTREMOVE > > >> depends on SPARSEMEM_VMEMMAP > > > > > > Filesystem DAX now has a dependency on memory hotplug? > > [....] > > > > OK, works now I've found the magic config incantantions to turn > > > everything I now need on. > > By enabling these options, my test VM now has a ~30s pause in the > boot very soon after the nvdimm subsystem is initialised. > > [ 1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled > [ 1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A > [ 1.552175] Non-volatile memory driver v1.3 > [ 2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz > [ 2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns > [ 37.217453] brd: module loaded > [ 37.225423] loop: module loaded > [ 37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) > [ 37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) > [ 37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) > [ 37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes > [ 37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes > > The system does not appear to be consuming CPU, but it is blocking > NMIs so I can't get a CPU trace. For a VM that I rely on booting in > a few seconds because I reboot it tens of times a day, this is a > problem.... And when I turn on KASAN, the kernel fails to boot to a login prompt because: [ 15.363583] Non-volatile memory driver v1.3 [ 22.286787] brd: module loaded [ 22.347054] loop: module loaded [ 22.352158] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) [ 22.373921] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) [ 22.394363] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) [ 22.410145] Loading iSCSI transport class v2.0-870. [ 22.411506] nd_pmem namespace0.0: unable to guarantee persistence of writes [ 22.412777] iscsi: registered transport (tcp) [ 22.412780] megaraid cmm: 2.20.2.7 (Release Date: Sun Jul 16 00:01:03 EST 2006) [ 22.412806] nd_pmem namespace1.0: unable to guarantee persistence of writes [ 22.413090] megaraid: 2.20.5.1 (Release Date: Thu Nov 16 15:32:35 EST 2006) [ 22.421485] kworker/u33:4 (1087) used greatest stack depth: 27704 bytes left [ 22.430664] scsi host0: ata_piix [ 22.711000] BUG: unable to handle kernel paging request at ffffed0078000000 [ 22.712225] PGD 13ffd1067 P4D 13ffd1067 PUD 1247d2067 PMD 0 [ 22.713180] Oops: 0000 [#1] PREEMPT SMP KASAN [ 22.713884] CPU: 8 PID: 168 Comm: kworker/u33:3 Not tainted 4.17.0-rc7-dgc+ #549 [ 22.714480] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 22.714480] Workqueue: events_unbound async_run_entry_fn [ 22.714480] RIP: 0010:check_memory_region+0xdd/0x190 [ 22.714480] RSP: 0000:ffff88007a1aec08 EFLAGS: 00010202 [ 22.714480] RAX: ffffed0078000000 RBX: ffff8803c0000fff RCX: ffffffff81d41a12 [ 22.714480] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff8803c0000000 [ 22.714480] RBP: ffffed0078000200 R08: 0000000000000200 R09: 0000000000000040 [ 22.714480] R10: 0000000000000200 R11: ffffed00780001ff R12: ffff8803c0000000 [ 22.714480] R13: 000000000192f3f0 R14: 0000000000000000 R15: 0000000000000000 [ 22.714480] FS: 0000000000000000(0000) GS:ffff88007f700000(0000) knlGS:0000000000000000 [ 22.714480] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 22.714480] CR2: ffffed0078000000 CR3: 000000000300f001 CR4: 00000000000606e0 [ 22.714480] Call Trace: [ 22.714480] memcpy+0x1f/0x50 [ 22.714480] pmem_do_bvec+0x182/0x350 [ 22.714480] ? pmem_release_queue+0x10/0x10 [ 22.714480] ? __blkdev_get+0x484/0x7b0 [ 22.714480] ? blkdev_get+0x1e6/0x4f0 [ 22.714480] ? __device_add_disk+0x62a/0x780 [ 22.714480] ? pmem_attach_disk+0x580/0x7b0 [ 22.714480] ? do_raw_spin_unlock+0xa4/0x130 [ 22.714480] ? _raw_spin_unlock+0xa/0x20 [ 22.714480] ? create_task_io_context+0x195/0x1d0 [ 22.714480] ? generic_make_request_checks+0x33a/0x7a0 [ 22.714480] pmem_make_request+0x1d9/0x3c0 [ 22.714480] generic_make_request+0x2d8/0x6c0 [ 22.714480] ? blk_plug_queued_count+0xb0/0xb0 [ 22.714480] ? memset+0x1f/0x40 [ 22.714480] ? submit_bio+0xc8/0x1e0 [ 22.714480] submit_bio+0xc8/0x1e0 [ 22.714480] ? direct_make_request+0x150/0x150 [ 22.714480] ? alloc_buffer_head+0x3c/0x80 [ 22.714480] ? alloc_page_buffers+0x7f/0x120 [ 22.714480] ? __rcu_read_unlock+0x66/0x80 [ 22.714480] ? guard_bio_eod+0xa9/0x1e0 [ 22.714480] submit_bh_wbc.isra.41+0x257/0x280 [ 22.714480] ? end_buffer_read_nobh+0x10/0x10 [ 22.714480] block_read_full_page+0x307/0x470 [ 22.714480] ? block_llseek+0x80/0x80 [ 22.714480] ? block_page_mkwrite+0x150/0x150 [ 22.714480] ? add_to_page_cache_lru+0xd7/0x160 [ 22.714480] ? add_to_page_cache_locked+0x10/0x10 [ 22.714480] ? policy_node+0x56/0x60 [ 22.714480] do_read_cache_page+0x3e3/0x730 [ 22.714480] ? bus_probe_device+0xf3/0x120 [ 22.714480] ? async_run_entry_fn+0x75/0x1e0 [ 22.714480] ? blkdev_writepages+0x10/0x10 [ 22.714480] ? filemap_fault+0x940/0x940 [ 22.714480] ? unwind_next_frame+0x8cf/0x940 [ 22.714480] ? stack_access_ok+0x35/0x80 [ 22.714480] ? __alloc_pages_nodemask+0x18e/0x320 [ 22.714480] ? __alloc_pages_slowpath+0x12b0/0x12b0 [ 22.714480] read_dev_sector+0x61/0x120 [ 22.753746] scsi host1: ata_piix [ 22.753296] read_lba+0x260/0x2d0 [ 22.753296] ? sgi_partition+0x3c0/0x3c0 [ 22.755203] ata1: PATA max MWDMA2 cmd 0x1f0 ctl 0x3f6 bmdma 0xc100 irq 14 [ 22.753296] ? do_raw_spin_unlock+0xa4/0x130 [ 22.756637] ata2: PATA max MWDMA2 cmd 0x170 ctl 0x376 bmdma 0xc108 irq 15 [ 22.753296] ? is_gpt_valid.part.5+0x3c0/0x3c0 [ 22.758498] efi_partition+0x252/0xb40 [ 22.758498] ? __inc_numa_state+0x19/0x90 [ 22.758498] ? kasan_unpoison_shadow+0x30/0x40 [ 22.758498] ? vzalloc+0x58/0x60 [ 22.758498] ? rescan_partitions+0x133/0x510 [ 22.758498] ? is_gpt_valid.part.5+0x3c0/0x3c0 [ 22.758498] ? kasan_unpoison_shadow+0x30/0x40 [ 22.764064] ? get_page_from_freelist+0x16dd/0x1a50 [ 22.764064] ? widen_string+0x25/0x100 [ 22.766067] ? widen_string+0x25/0x100 [ 22.766282] i8042: PNP: PS/2 Controller [PNP0303:KBD,PNP0f13:MOU] at 0x60,0x64 irq 1,12 [ 22.766575] ? string+0xc4/0xf0 [ 22.766575] ? format_decode+0x79/0x4b0 [ 22.766575] ? memcpy+0x34/0x50 [ 22.770062] serio: i8042 KBD port at 0x60,0x64 irq 1 [ 22.766575] ? vsnprintf+0xf2/0x7d0 [ 22.771264] serio: i8042 AUX port at 0x60,0x64 irq 12 [ 22.766575] ? num_to_str+0x160/0x160 [ 22.772750] ? __might_sleep+0x31/0xd0 [ 22.773202] mousedev: PS/2 mouse device common for all mice [ 22.772750] ? snprintf+0x8f/0xc0 [ 22.772750] ? vscnprintf+0x30/0x30 [ 22.772750] ? is_gpt_valid.part.5+0x3c0/0x3c0 [ 22.776149] input: AT Translated Set 2 keyboard as /devices/platform/i8042/serio0/input/input1 [ 22.776242] ? check_partition+0x18f/0x310 [ 22.776717] rtc_cmos 00:00: RTC can wake from S4 [ 22.778723] rtc_cmos 00:00: registered as rtc0 [ 22.779023] check_partition+0x18f/0x310 [ 22.779023] rescan_partitions+0x133/0x510 [ 22.780706] rtc_cmos 00:00: nvmem registration failed [ 22.779023] ? __might_sleep+0x31/0xd0 [ 22.782618] rtc_cmos 00:00: alarms up to one day, y3k, 114 bytes nvram, hpet irqs [ 22.779023] ? kmem_cache_alloc+0xdd/0x5b0 [ 22.779023] ? ___might_sleep+0x7c/0x1a0 [ 22.786069] ? bd_set_size+0x104/0x160 [ 22.786069] __blkdev_get+0x484/0x7b0 [ 22.786069] ? bd_set_size+0x160/0x160 [ 22.786069] ? map_id_range_down+0x150/0x170 [ 22.788981] device-mapper: ioctl: 4.39.0-ioctl (2018-04-03) initialised: dm-devel@redhat.com [ 22.786069] ? free_user_ns+0x170/0x170 [ 22.789183] blkdev_get+0x1e6/0x4f0 [ 22.789183] ? __raw_spin_lock_init+0x42/0x50 [ 22.789183] ? __wake_up_bit+0x78/0xc0 [ 22.792867] device-mapper: raid: Loading target version 1.13.2 [ 22.789183] ? __blkdev_get+0x7b0/0x7b0 [ 22.789183] ? refcount_sub_and_test+0xa7/0x120 [ 22.789183] ? refcount_inc+0x30/0x30 [ 22.795220] ? do_raw_spin_lock+0x9b/0x130 [ 22.795991] hidraw: raw HID events driver (C) Jiri Kosina [ 22.796068] ? do_raw_spin_unlock+0xa4/0x130 [ 22.796068] ? kobject_put+0x2d/0x100 [ 22.796068] __device_add_disk+0x62a/0x780 [ 22.796068] ? bdget_disk+0x50/0x50 [ 22.796068] ? alloc_dax+0x222/0x2e0 [ 22.800041] ? kill_dax+0xc0/0xc0 [ 22.800041] ? mutex_lock+0x2a/0x50 [ 22.800041] ? nvdimm_badblocks_populate+0x56/0x1d0 [ 22.800041] ? __raw_spin_lock_init+0x42/0x50 [ 22.800041] pmem_attach_disk+0x580/0x7b0 [ 22.803358] oprofile: using NMI interrupt. [ 22.800041] ? nd_pmem_notify+0x240/0x240 [ 22.804217] ? nd_dax_probe+0xef/0x110 [ 22.805177] NET: Registered protocol family 17 [ 22.804217] nvdimm_bus_probe+0x76/0xe0 [ 22.804217] driver_probe_device+0x31d/0x450 [ 22.806975] sctp: Hash tables configured (bind 128/128) [ 22.804217] ? __driver_attach+0xd0/0xd0 [ 22.808773] Key type dns_resolver registered [ 22.804217] bus_for_each_drv+0xd8/0x130 [ 22.809534] Key type ceph registered [ 22.804217] ? subsys_find_device_by_id+0x1e0/0x1e0 [ 22.804217] ? do_raw_spin_unlock+0xa4/0x130 [ 22.811472] __device_attach+0x14d/0x1b0 [ 22.812223] ? device_bind_driver+0x70/0x70 [ 22.812768] libceph: loaded (mon/osd proto 15/24) [ 22.812223] ? kobject_uevent_env+0x15d/0x7f0 [ 22.812223] bus_probe_device+0xf3/0x120 [ 22.812223] device_add+0x696/0xa20 [ 22.812223] ? device_private_init+0xc0/0xc0 [ 22.816158] ? __wake_up_common_lock+0xcb/0x110 [ 22.816158] ? __wake_up_common+0x240/0x240 [ 22.816158] ? nd_async_device_unregister+0x30/0x30 [ 22.816158] nd_async_device_register+0xe/0x40 [ 22.816158] async_run_entry_fn+0x75/0x1e0 [ 22.819536] process_one_work+0x349/0x6d0 [ 22.819536] worker_thread+0x74/0x5c0 [ 22.819536] ? process_one_work+0x6d0/0x6d0 [ 22.819536] kthread+0x1b7/0x1e0 [ 22.819536] ? __kthread_bind_mask+0x70/0x70 [ 22.819536] ret_from_fork+0x24/0x30 [ 22.819536] Code: 01 74 0b 41 80 38 00 74 f0 4d 85 c0 75 56 4c 01 c8 49 89 e8 49 29 c0 4d 8d 48 07 4d 85 c0 4d 0f 49 c8 49 c1 f9 03 45 85 c9 74 5b <48> 83 38 00 75 18 45 8d 41 ff 4e 8d 44 c0 08 48 8 [ 22.819536] RIP: check_memory_region+0xdd/0x190 RSP: ffff88007a1aec08 [ 22.819536] CR2: ffffed0078000000 [ 22.819536] ---[ end trace 2d3ad661d958a6e8 ]--- [ 22.819536] BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:34 [ 22.819536] in_atomic(): 1, irqs_disabled(): 1, pid: 168, name: kworker/u33:3 [ 22.819536] CPU: 8 PID: 168 Comm: kworker/u33:3 Tainted: G B D 4.17.0-rc7-dgc+ #549 [ 22.819536] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 22.819536] Workqueue: events_unbound async_run_entry_fn [ 22.819536] Call Trace: [ 22.819536] dump_stack+0x9a/0xeb [ 22.819536] ___might_sleep+0x169/0x1a0 [ 22.819536] exit_signals+0x67/0x310 [ 22.819536] ? get_signal+0x860/0x860 [ 22.819536] ? __wake_up_common+0x240/0x240 [ 22.819536] ? nd_async_device_unregister+0x30/0x30 [ 22.819536] ? blocking_notifier_call_chain+0x24/0x70 [ 22.819536] do_exit+0x14d/0x1370 [ 22.819536] ? is_current_pgrp_orphaned+0x60/0x60 [ 22.819536] ? worker_thread+0x74/0x5c0 [ 22.819536] ? process_one_work+0x6d0/0x6d0 [ 22.819536] ? kthread+0x1b7/0x1e0 [ 22.819536] rewind_stack_do_exit+0x17/0x20 [ 22.819536] ================================================================== [ 22.819536] BUG: KASAN: stack-out-of-bounds in widen_string+0x25/0x100 [ 22.819536] Read of size 8 at addr ffff88007a1afae0 by task kworker/u33:3/168 [ 22.819536] [ 22.819536] CPU: 8 PID: 168 Comm: kworker/u33:3 Tainted: G D 4.17.0-rc7-dgc+ #549 [ 22.819536] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 22.819536] Workqueue: events_unbound async_run_entry_fn [ 22.819536] Call Trace: [ 22.819536] dump_stack+0x9a/0xeb [ 22.819536] ? widen_string+0x25/0x100 [ 22.819536] print_address_description+0x7e/0x290 [ 22.819536] ? widen_string+0x25/0x100 [ 22.819536] kasan_report+0x237/0x360 [ 22.819536] widen_string+0x25/0x100 [ 22.819536] string+0xc4/0xf0 [ 22.819536] vsnprintf+0x2a8/0x7d0 [ 22.819536] ? num_to_str+0x160/0x160 [ 22.819536] ? subsys_find_device_by_id+0x1e0/0x1e0 [ 22.819536] ? do_raw_spin_unlock+0xa4/0x130 [ 22.819536] vscnprintf+0x9/0x30 [ 22.819536] vprintk_emit+0x86/0x400 [ 22.819536] printk+0x94/0xb0 [ 22.819536] ? cpumask_weight.constprop.27+0x1e/0x1e [ 22.819536] ? bus_probe_device+0xf3/0x120 [ 22.819536] ? ___might_sleep+0xa7/0x1a0 [ 22.819536] ___might_sleep+0xee/0x1a0 [ 22.819536] exit_signals+0x67/0x310 [ 22.819536] ? get_signal+0x860/0x860 [ 22.819536] ? __wake_up_common+0x240/0x240 [ 22.819536] ? nd_async_device_unregister+0x30/0x30 [ 22.819536] ? blocking_notifier_call_chain+0x24/0x70 [ 22.819536] do_exit+0x14d/0x1370 [ 22.819536] ? is_current_pgrp_orphaned+0x60/0x60 [ 22.819536] ? worker_thread+0x74/0x5c0 [ 22.819536] ? process_one_work+0x6d0/0x6d0 [ 22.819536] ? kthread+0x1b7/0x1e0 [ 22.819536] rewind_stack_do_exit+0x17/0x20 [ 22.819536] [ 22.819536] The buggy address belongs to the page: [ 22.819536] page:ffffea0001ab5e48 count:0 mapcount:0 mapping:0000000000000000 index:0x0 [ 22.819536] flags: 0xfffffc0000000() [ 22.819536] raw: 000fffffc0000000 0000000000000000 0000000000000000 00000000ffffffff [ 22.819536] raw: ffffea0001ab5e68 ffffea0001ab5e68 0000000000000000 [ 22.819536] page dumped because: kasan: bad access detected [ 22.819536] [ 22.819536] Memory state around the buggy address: [ 22.819536] ffff88007a1af980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 22.819536] ffff88007a1afa00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 [ 22.819536] >ffff88007a1afa80: f1 f1 f1 00 00 00 00 00 00 00 00 f3 f3 f3 f3 00 [ 22.819536] ^ [ 22.819536] ffff88007a1afb00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 22.819536] ffff88007a1afb80: 00 00 f1 f1 f1 f1 00 f2 f2 f2 f3 f3 f3 f3 f3 f3 [ 22.819536] ================================================================== [ 22.882914] hpet1: lost 5 rtc interrupts [ 22.883636] note: kworker/u33:3[168] exited with preempt_count 1 [ 22.885240] kworker/u33:3 (168) used greatest stack depth: 24768 bytes left [ 22.887136] BUG: unable to handle kernel paging request at ffffed00c0000000 [ 22.888327] PGD 13ffd1067 P4D 13ffd1067 PUD 0 [ 22.889043] Oops: 0000 [#2] PREEMPT SMP KASAN [ 22.889747] CPU: 12 PID: 159 Comm: kworker/u33:2 Tainted: G B D W 4.17.0-rc7-dgc+ #549 [ 22.890253] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 22.890253] Workqueue: events_unbound async_run_entry_fn [ 22.890253] RIP: 0010:check_memory_region+0xdd/0x190 [ 22.890253] RSP: 0000:ffff88007a16ec08 EFLAGS: 00010202 [ 22.890253] RAX: ffffed00c0000000 RBX: ffff880600000fff RCX: ffffffff81d41a12 [ 22.890253] RDX: 0000000000000000 RSI: 0000000000001000 RDI: ffff880600000000 [ 22.890253] RBP: ffffed00c0000200 R08: 0000000000000200 R09: 0000000000000040 [ 22.890253] R10: 0000000000000200 R11: ffffed00c00001ff R12: ffff880600000000 [ 22.890253] R13: 0000000001927f90 R14: 0000000000000000 R15: 0000000000000000 [ 22.890253] FS: 0000000000000000(0000) GS:ffff88007f780000(0000) knlGS:0000000000000000 [ 22.890253] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 22.890253] CR2: ffffed00c0000000 CR3: 000000000300f001 CR4: 00000000000606e0 [ 22.890253] Call Trace: [ 22.890253] memcpy+0x1f/0x50 [ 22.890253] pmem_do_bvec+0x182/0x350 [ 22.890253] ? pmem_release_queue+0x10/0x10 [ 22.890253] ? blkdev_get+0x1e6/0x4f0 [ 22.890253] ? __device_add_disk+0x62a/0x780 [ 22.890253] ? pmem_attach_disk+0x580/0x7b0 [ 22.890253] ? do_raw_spin_unlock+0xa4/0x130 [ 22.890253] ? _raw_spin_unlock+0xa/0x20 [ 22.890253] ? create_task_io_context+0x195/0x1d0 [ 22.890253] ? generic_make_request_checks+0x33a/0x7a0 [ 22.890253] pmem_make_request+0x1d9/0x3c0 [ 22.890253] generic_make_request+0x2d8/0x6c0 [ 22.890253] ? blk_plug_queued_count+0xb0/0xb0 [ 22.890253] ? memset+0x1f/0x40 [ 22.890253] ? submit_bio+0xc8/0x1e0 [ 22.890253] submit_bio+0xc8/0x1e0 [ 22.890253] ? direct_make_request+0x150/0x150 [ 22.890253] ? alloc_buffer_head+0x3c/0x80 [ 22.890253] ? alloc_page_buffers+0x7f/0x120 [ 22.890253] ? __rcu_read_unlock+0x66/0x80 [ 22.917466] ata2.01: NODEV after polling detection [ 22.890253] ? guard_bio_eod+0xa9/0x1e0 [ 22.919184] ata1.01: NODEV after polling detection [ 22.890253] submit_bh_wbc.isra.41+0x257/0x280 [ 22.920719] ata2.00: ATAPI: QEMU DVD-ROM, 2.5+, max UDMA/100 [ 22.890253] ? end_buffer_read_nobh+0x10/0x10 [ 22.922324] ata1.00: ATA-7: QEMU HARDDISK, 2.5+, max UDMA/100 [ 22.890253] block_read_full_page+0x307/0x470 [ 22.923573] ata1.00: 20971520 sectors, multi 16: LBA48 [ 22.890253] ? block_llseek+0x80/0x80 [ 22.890253] ? block_page_mkwrite+0x150/0x150 [ 22.890253] ? add_to_page_cache_lru+0xd7/0x160 [ 22.890253] ? add_to_page_cache_locked+0x10/0x10 [ 22.890253] ? policy_node+0x56/0x60 [ 22.926515] ata2.00: configured for MWDMA2 [ 22.890253] do_read_cache_page+0x3e3/0x730 [ 22.927886] ata1.00: configured for MWDMA2 [ 22.890253] ? bus_probe_device+0xf3/0x120 [ 22.890253] ? async_run_entry_fn+0x75/0x1e0 [ 22.890253] ? blkdev_writepages+0x10/0x10 [ 22.890253] ? filemap_fault+0x940/0x940 [ 22.890253] ? unwind_next_frame+0x8cf/0x940 [ 22.890253] ? stack_access_ok+0x35/0x80 [ 22.890253] ? deref_stack_reg+0x7f/0xb0 [ 22.890253] ? __read_once_size_nocheck.constprop.6+0x10/0x10 [ 22.890253] ? unwind_get_return_address_ptr+0x50/0x50 [ 22.890253] ? save_stack+0x89/0xb0 [ 22.890253] ? __orc_find+0x6b/0xc0 [ 22.890253] ? unwind_next_frame+0xc8/0x940 [ 22.890253] read_dev_sector+0x61/0x120 [ 22.890253] read_lba+0x260/0x2d0 [ 22.890253] ? sgi_partition+0x3c0/0x3c0 [ 22.890253] ? __save_stack_trace+0x73/0xd0 [ 22.890253] ? is_gpt_valid.part.5+0x3c0/0x3c0 [ 22.890253] efi_partition+0x252/0xb40 [ 22.890253] ? __inc_numa_state+0x19/0x90 [ 22.890253] ? kasan_unpoison_shadow+0x30/0x40 [ 22.890253] ? is_gpt_valid.part.5+0x3c0/0x3c0 [ 22.890253] ? kasan_unpoison_shadow+0x30/0x40 [ 22.890253] ? get_page_from_freelist+0x16dd/0x1a50 [ 22.890253] ? widen_string+0x25/0x100 [ 22.890253] ? widen_string+0x25/0x100 [ 22.890253] ? string+0xc4/0xf0 [ 22.890253] ? format_decode+0x79/0x4b0 [ 22.890253] ? memcpy+0x34/0x50 [ 22.890253] ? vsnprintf+0xf2/0x7d0 [ 22.890253] ? num_to_str+0x160/0x160 [ 22.890253] ? __might_sleep+0x31/0xd0 [ 22.890253] ? snprintf+0x8f/0xc0 [ 22.890253] ? vscnprintf+0x30/0x30 [ 22.890253] ? is_gpt_valid.part.5+0x3c0/0x3c0 [ 22.890253] ? check_partition+0x18f/0x310 [ 22.890253] check_partition+0x18f/0x310 [ 22.890253] rescan_partitions+0x133/0x510 [ 22.890253] ? __might_sleep+0x31/0xd0 [ 22.890253] ? ___might_sleep+0x7c/0x1a0 [ 22.890253] ? bd_set_size+0x104/0x160 [ 22.890253] __blkdev_get+0x484/0x7b0 [ 22.890253] ? __raw_spin_lock_init+0x42/0x50 [ 22.890253] ? bd_set_size+0x160/0x160 [ 22.890253] ? map_id_range_down+0x150/0x170 [ 22.890253] ? free_user_ns+0x170/0x170 [ 22.890253] blkdev_get+0x1e6/0x4f0 [ 22.890253] ? __raw_spin_lock_init+0x42/0x50 [ 22.890253] ? __wake_up_bit+0x78/0xc0 [ 22.890253] ? __blkdev_get+0x7b0/0x7b0 [ 22.890253] ? refcount_sub_and_test+0xa7/0x120 [ 22.890253] ? refcount_inc+0x30/0x30 [ 22.890253] ? do_raw_spin_lock+0x9b/0x130 [ 22.890253] ? do_raw_spin_unlock+0xa4/0x130 [ 22.890253] ? kobject_put+0x2d/0x100 [ 22.890253] __device_add_disk+0x62a/0x780 [ 22.890253] ? bdget_disk+0x50/0x50 [ 22.890253] ? alloc_dax+0x222/0x2e0 [ 22.890253] ? kill_dax+0xc0/0xc0 [ 22.890253] ? mutex_lock+0x2a/0x50 [ 22.890253] ? nvdimm_badblocks_populate+0x56/0x1d0 [ 22.890253] ? __raw_spin_lock_init+0x42/0x50 [ 22.890253] pmem_attach_disk+0x580/0x7b0 [ 22.890253] ? nd_pmem_notify+0x240/0x240 [ 22.890253] ? nd_dax_probe+0xef/0x110 [ 22.890253] nvdimm_bus_probe+0x76/0xe0 [ 22.890253] driver_probe_device+0x31d/0x450 [ 22.890253] ? __driver_attach+0xd0/0xd0 [ 22.890253] bus_for_each_drv+0xd8/0x130 [ 22.890253] ? subsys_find_device_by_id+0x1e0/0x1e0 [ 22.890253] ? do_raw_spin_unlock+0xa4/0x130 [ 22.890253] __device_attach+0x14d/0x1b0 [ 22.890253] ? device_bind_driver+0x70/0x70 [ 22.890253] ? kobject_uevent_env+0x15d/0x7f0 [ 22.890253] bus_probe_device+0xf3/0x120 [ 22.890253] device_add+0x696/0xa20 [ 22.890253] ? device_private_init+0xc0/0xc0 [ 22.890253] ? finish_task_switch+0xf8/0x3c0 [ 22.890253] ? nd_async_device_unregister+0x30/0x30 [ 22.890253] nd_async_device_register+0xe/0x40 [ 22.890253] async_run_entry_fn+0x75/0x1e0 [ 22.890253] process_one_work+0x349/0x6d0 [ 22.890253] worker_thread+0x74/0x5c0 [ 22.890253] ? process_one_work+0x6d0/0x6d0 [ 22.890253] kthread+0x1b7/0x1e0 [ 22.890253] ? __kthread_bind_mask+0x70/0x70 [ 22.890253] ret_from_fork+0x24/0x30 [ 22.890253] Code: 01 74 0b 41 80 38 00 74 f0 4d 85 c0 75 56 4c 01 c8 49 89 e8 49 29 c0 4d 8d 48 07 4d 85 c0 4d 0f 49 c8 49 c1 f9 03 45 85 c9 74 5b <48> 83 38 00 75 18 45 8d 41 ff 4e 8d 44 c0 08 48 8 [ 22.890253] RIP: check_memory_region+0xdd/0x190 RSP: ffff88007a16ec08 [ 22.890253] CR2: ffffed00c0000000 [ 22.890253] ---[ end trace 2d3ad661d958a6e9 ]--- -Dave.
On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote: > On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote: >> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote: >> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: >> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: >> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even >> > >> > installed ndctl until this morning!) but after changing the kernel >> > >> > it no longer works. That would make it a regression, yes? >> >> [....] >> >> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which >> > >> has the following dependencies: >> > >> >> > >> depends on MEMORY_HOTPLUG >> > >> depends on MEMORY_HOTREMOVE >> > >> depends on SPARSEMEM_VMEMMAP >> > > >> > > Filesystem DAX now has a dependency on memory hotplug? >> >> [....] >> >> > > OK, works now I've found the magic config incantantions to turn >> > > everything I now need on. >> >> By enabling these options, my test VM now has a ~30s pause in the >> boot very soon after the nvdimm subsystem is initialised. >> >> [ 1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled >> [ 1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A >> [ 1.552175] Non-volatile memory driver v1.3 >> [ 2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz >> [ 2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns >> [ 37.217453] brd: module loaded >> [ 37.225423] loop: module loaded >> [ 37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) >> [ 37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) >> [ 37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) >> [ 37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes >> [ 37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes >> >> The system does not appear to be consuming CPU, but it is blocking >> NMIs so I can't get a CPU trace. For a VM that I rely on booting in >> a few seconds because I reboot it tens of times a day, this is a >> problem.... > > And when I turn on KASAN, the kernel fails to boot to a login prompt > because: What's your qemu and kernel command line? I'll take look at this first thing tomorrow.
On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote: > On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote: >> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote: >>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote: >>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: >>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: >>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even >>> > >> > installed ndctl until this morning!) but after changing the kernel >>> > >> > it no longer works. That would make it a regression, yes? >>> >>> [....] >>> >>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which >>> > >> has the following dependencies: >>> > >> >>> > >> depends on MEMORY_HOTPLUG >>> > >> depends on MEMORY_HOTREMOVE >>> > >> depends on SPARSEMEM_VMEMMAP >>> > > >>> > > Filesystem DAX now has a dependency on memory hotplug? >>> >>> [....] >>> >>> > > OK, works now I've found the magic config incantantions to turn >>> > > everything I now need on. >>> >>> By enabling these options, my test VM now has a ~30s pause in the >>> boot very soon after the nvdimm subsystem is initialised. >>> >>> [ 1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled >>> [ 1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A >>> [ 1.552175] Non-volatile memory driver v1.3 >>> [ 2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz >>> [ 2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns >>> [ 37.217453] brd: module loaded >>> [ 37.225423] loop: module loaded >>> [ 37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) >>> [ 37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) >>> [ 37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) >>> [ 37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes >>> [ 37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes >>> >>> The system does not appear to be consuming CPU, but it is blocking >>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in >>> a few seconds because I reboot it tens of times a day, this is a >>> problem.... >> >> And when I turn on KASAN, the kernel fails to boot to a login prompt >> because: > > What's your qemu and kernel command line? I'll take look at this first > thing tomorrow. I was able to reproduce this crash by just turning on KASAN... investigating. It would still help to have your config for our own regression testing purposes it makes sense for us to prioritize "Dave's test config", similar to the priority of not breaking Linus' laptop.
On Mon, Jun 04 2018 at 7:40pm -0400, Dan Williams <dan.j.williams@intel.com> wrote: > On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote: > > On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote: > >> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote: > >>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote: > >>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: > >>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: > >>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even > >>> > >> > installed ndctl until this morning!) but after changing the kernel > >>> > >> > it no longer works. That would make it a regression, yes? > >>> > >>> [....] > >>> > >>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which > >>> > >> has the following dependencies: > >>> > >> > >>> > >> depends on MEMORY_HOTPLUG > >>> > >> depends on MEMORY_HOTREMOVE > >>> > >> depends on SPARSEMEM_VMEMMAP > >>> > > > >>> > > Filesystem DAX now has a dependency on memory hotplug? > >>> > >>> [....] > >>> > >>> > > OK, works now I've found the magic config incantantions to turn > >>> > > everything I now need on. > >>> > >>> By enabling these options, my test VM now has a ~30s pause in the > >>> boot very soon after the nvdimm subsystem is initialised. > >>> > >>> [ 1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled > >>> [ 1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A > >>> [ 1.552175] Non-volatile memory driver v1.3 > >>> [ 2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz > >>> [ 2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns > >>> [ 37.217453] brd: module loaded > >>> [ 37.225423] loop: module loaded > >>> [ 37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) > >>> [ 37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) > >>> [ 37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) > >>> [ 37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes > >>> [ 37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes > >>> > >>> The system does not appear to be consuming CPU, but it is blocking > >>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in > >>> a few seconds because I reboot it tens of times a day, this is a > >>> problem.... > >> > >> And when I turn on KASAN, the kernel fails to boot to a login prompt > >> because: > > > > What's your qemu and kernel command line? I'll take look at this first > > thing tomorrow. > > I was able to reproduce this crash by just turning on KASAN... > investigating. It would still help to have your config for our own > regression testing purposes it makes sense for us to prioritize > "Dave's test config", similar to the priority of not breaking Linus' > laptop. Dave, _this_ is when you know you've arrived! ;)
[ adding KASAN devs...] On Mon, Jun 4, 2018 at 4:40 PM, Dan Williams <dan.j.williams@intel.com> wrote: > On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote: >> On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote: >>> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote: >>>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote: >>>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: >>>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: >>>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even >>>> > >> > installed ndctl until this morning!) but after changing the kernel >>>> > >> > it no longer works. That would make it a regression, yes? >>>> >>>> [....] >>>> >>>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which >>>> > >> has the following dependencies: >>>> > >> >>>> > >> depends on MEMORY_HOTPLUG >>>> > >> depends on MEMORY_HOTREMOVE >>>> > >> depends on SPARSEMEM_VMEMMAP >>>> > > >>>> > > Filesystem DAX now has a dependency on memory hotplug? >>>> >>>> [....] >>>> >>>> > > OK, works now I've found the magic config incantantions to turn >>>> > > everything I now need on. >>>> >>>> By enabling these options, my test VM now has a ~30s pause in the >>>> boot very soon after the nvdimm subsystem is initialised. >>>> >>>> [ 1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled >>>> [ 1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A >>>> [ 1.552175] Non-volatile memory driver v1.3 >>>> [ 2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz >>>> [ 2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns >>>> [ 37.217453] brd: module loaded >>>> [ 37.225423] loop: module loaded >>>> [ 37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) >>>> [ 37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) >>>> [ 37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) >>>> [ 37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes >>>> [ 37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes >>>> >>>> The system does not appear to be consuming CPU, but it is blocking >>>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in >>>> a few seconds because I reboot it tens of times a day, this is a >>>> problem.... >>> >>> And when I turn on KASAN, the kernel fails to boot to a login prompt >>> because: >> >> What's your qemu and kernel command line? I'll take look at this first >> thing tomorrow. > > I was able to reproduce this crash by just turning on KASAN... > investigating. It would still help to have your config for our own > regression testing purposes it makes sense for us to prioritize > "Dave's test config", similar to the priority of not breaking Linus' > laptop. I believe this is a bug in KASAN, or a bug in devm_memremap_pages(), depends on your point of view. At the very least it is a mismatch of assumptions. KASAN learns of hot added memory via the memory hotplug notifier. However, the devm_memremap_pages() implementation is intentionally limited to the "first half" of the memory hotplug procedure. I.e. it does just enough to setup the linear map for pfn_to_page() and initialize the "struct page" memmap, but then stops short of onlining the pages. This is why we are getting a NULL ptr deref and not a KASAN report, because KASAN has no shadow area setup for the linearly mapped pmem range. In terms of solving it we could refactor kasan_mem_notifier() so that devm_memremap_pages() can call it outside of the notifier... I'll give this a shot.
On Mon, Jun 04, 2018 at 08:33:26PM -0400, Mike Snitzer wrote: > On Mon, Jun 04 2018 at 7:40pm -0400, > Dan Williams <dan.j.williams@intel.com> wrote: > > > On Sun, Jun 3, 2018 at 6:48 PM, Dan Williams <dan.j.williams@intel.com> wrote: > > > On Sun, Jun 3, 2018 at 5:25 PM, Dave Chinner <david@fromorbit.com> wrote: > > >> On Mon, Jun 04, 2018 at 08:20:38AM +1000, Dave Chinner wrote: > > >>> On Thu, May 31, 2018 at 09:02:52PM -0700, Dan Williams wrote: > > >>> > On Thu, May 31, 2018 at 7:24 PM, Dave Chinner <david@fromorbit.com> wrote: > > >>> > > On Thu, May 31, 2018 at 06:57:33PM -0700, Dan Williams wrote: > > >>> > >> > FWIW, XFS+DAX used to just work on this setup (I hadn't even > > >>> > >> > installed ndctl until this morning!) but after changing the kernel > > >>> > >> > it no longer works. That would make it a regression, yes? > > >>> > > >>> [....] > > >>> > > >>> > >> I suspect your kernel does not have CONFIG_ZONE_DEVICE enabled which > > >>> > >> has the following dependencies: > > >>> > >> > > >>> > >> depends on MEMORY_HOTPLUG > > >>> > >> depends on MEMORY_HOTREMOVE > > >>> > >> depends on SPARSEMEM_VMEMMAP > > >>> > > > > >>> > > Filesystem DAX now has a dependency on memory hotplug? > > >>> > > >>> [....] > > >>> > > >>> > > OK, works now I've found the magic config incantantions to turn > > >>> > > everything I now need on. > > >>> > > >>> By enabling these options, my test VM now has a ~30s pause in the > > >>> boot very soon after the nvdimm subsystem is initialised. > > >>> > > >>> [ 1.523718] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled > > >>> [ 1.550353] 00:05: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A > > >>> [ 1.552175] Non-volatile memory driver v1.3 > > >>> [ 2.332045] tsc: Refined TSC clocksource calibration: 2199.909 MHz > > >>> [ 2.333280] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x1fb5dcd4620, max_idle_ns: 440795264143 ns > > >>> [ 37.217453] brd: module loaded > > >>> [ 37.225423] loop: module loaded > > >>> [ 37.228441] virtio_blk virtio2: [vda] 10485760 512-byte logical blocks (5.37 GB/5.00 GiB) > > >>> [ 37.245418] virtio_blk virtio3: [vdb] 146800640 512-byte logical blocks (75.2 GB/70.0 GiB) > > >>> [ 37.255794] virtio_blk virtio4: [vdc] 1073741824000 512-byte logical blocks (550 TB/500 TiB) > > >>> [ 37.265403] nd_pmem namespace1.0: unable to guarantee persistence of writes > > >>> [ 37.265618] nd_pmem namespace0.0: unable to guarantee persistence of writes > > >>> > > >>> The system does not appear to be consuming CPU, but it is blocking > > >>> NMIs so I can't get a CPU trace. For a VM that I rely on booting in > > >>> a few seconds because I reboot it tens of times a day, this is a > > >>> problem.... > > >> > > >> And when I turn on KASAN, the kernel fails to boot to a login prompt > > >> because: > > > > > > What's your qemu and kernel command line? I'll take look at this first > > > thing tomorrow. > > > > I was able to reproduce this crash by just turning on KASAN... > > investigating. It would still help to have your config for our own > > regression testing purposes it makes sense for us to prioritize > > "Dave's test config", similar to the priority of not breaking Linus' > > laptop. > > Dave, _this_ is when you know you've arrived! ;) Nah, this is when I know I've been a real Grouch. Someone paint me green, hand me a garbage bin and call me Oscar. :P I've attached the config below. At one point (probably around 3.0) it was the barest minimum needed to boot and test XFS in a VM. I only tend to manually turn on the things I need for testing, but I've never swept out stuff I don't need so who knows what crap it's gathered over the years. BTW, I found the init delay - it was another RNG initialisation perf regression (this time on the fast init path) as a result of all the OMG-THE-RNG-IS-B0RKEN changes that went into 4.17. Hint: use /dev/urandom as the host side entropy source, not /dev/random. That just leaves KASAN.... Cheers, Dave.
diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 3943feb9a090..1d7bd96511f0 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -80,9 +80,9 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); * This is a library function for filesystems to check if the block device * can be mounted with dax option. * - * Return: negative errno if unsupported, 0 if supported. + * Return: true if supported, false if unsupported */ -int __bdev_dax_supported(struct block_device *bdev, int blocksize) +bool __bdev_dax_supported(struct block_device *bdev, int blocksize) { struct dax_device *dax_dev; pgoff_t pgoff; @@ -95,21 +95,21 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize) if (blocksize != PAGE_SIZE) { pr_debug("%s: error: unsupported blocksize for dax\n", bdevname(bdev, buf)); - return -EINVAL; + return false; } err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); if (err) { pr_debug("%s: error: unaligned partition for dax\n", bdevname(bdev, buf)); - return err; + return false; } dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); if (!dax_dev) { pr_debug("%s: error: device does not support dax\n", bdevname(bdev, buf)); - return -EOPNOTSUPP; + return false; } id = dax_read_lock(); @@ -121,7 +121,7 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize) if (len < 1) { pr_debug("%s: error: dax access failed (%ld)\n", bdevname(bdev, buf), len); - return len < 0 ? len : -EIO; + return false; } if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { @@ -139,10 +139,10 @@ int __bdev_dax_supported(struct block_device *bdev, int blocksize) } else { pr_debug("%s: error: dax support not enabled\n", bdevname(bdev, buf)); - return -EOPNOTSUPP; + return false; } - return 0; + return true; } EXPORT_SYMBOL_GPL(__bdev_dax_supported); #endif diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9627c3054b5c..c09289a42dc5 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - err = bdev_dax_supported(sb->s_bdev, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); sbi->s_mount_opt &= ~EXT2_MOUNT_DAX; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 089170e99895..2e1622907f4a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3732,8 +3732,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " that may contain inline data"); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; } - err = bdev_dax_supported(sb->s_bdev, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0effd46b965f..2c70a0a4f59f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1103,8 +1103,8 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), - sb->s_blocksize) < 0) + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), + sb->s_blocksize)) return -EINVAL; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 62188c2a4c36..86915dc40eed 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1690,17 +1690,17 @@ xfs_fs_fill_super( sb->s_flags |= SB_I_VERSION; if (mp->m_flags & XFS_MOUNT_DAX) { - int error2 = 0; + bool rtdev_is_dax = false, datadev_is_dax; xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - error = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, - sb->s_blocksize); + datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, + sb->s_blocksize); if (mp->m_rtdev_targp) - error2 = bdev_dax_supported(mp->m_rtdev_targp->bt_bdev, - sb->s_blocksize); - if (error && error2) { + rtdev_is_dax = bdev_dax_supported( + mp->m_rtdev_targp->bt_bdev, sb->s_blocksize); + if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; diff --git a/include/linux/dax.h b/include/linux/dax.h index 2a4f7295c12b..c99692ddd4b5 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -64,8 +64,8 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) -int __bdev_dax_supported(struct block_device *bdev, int blocksize); -static inline int bdev_dax_supported(struct block_device *bdev, int blocksize) +bool __bdev_dax_supported(struct block_device *bdev, int blocksize); +static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) { return __bdev_dax_supported(bdev, blocksize); } @@ -84,10 +84,10 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc); #else -static inline int bdev_dax_supported(struct block_device *bdev, +static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) { - return -EOPNOTSUPP; + return false; } static inline struct dax_device *fs_dax_get_by_host(const char *host)