diff mbox

[1/2] xfs: cache dax_device lookup result

Message ID 150362134833.39142.12423673180125514125.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dan Williams Aug. 25, 2017, 12:35 a.m. UTC
The ->iomap_begin() operation is a hot path, so cache the
fs_dax_get_by_host() result to avoid the incurring the hash lookup
overhead.

Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/xfs/xfs_aops.c  |   24 ++++++++++++++++++++++++
 fs/xfs/xfs_aops.h  |    1 +
 fs/xfs/xfs_buf.h   |    1 +
 fs/xfs/xfs_iomap.c |    9 +--------
 4 files changed, 27 insertions(+), 8 deletions(-)

Comments

Darrick J. Wong Aug. 25, 2017, 5:32 a.m. UTC | #1
On Thu, Aug 24, 2017 at 05:35:48PM -0700, Dan Williams wrote:
> The ->iomap_begin() operation is a hot path, so cache the
> fs_dax_get_by_host() result to avoid the incurring the hash lookup
> overhead.

Just out of curiosity (and sorry if this has already been discussed to
death and I'm merely ignorant) but I was wondering why the daxdev isn't
simply attached to the block_device?  Is it not a 1:1 mapping?

--D

> 
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Reported-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  fs/xfs/xfs_aops.c  |   24 ++++++++++++++++++++++++
>  fs/xfs/xfs_aops.h  |    1 +
>  fs/xfs/xfs_buf.h   |    1 +
>  fs/xfs/xfs_iomap.c |    9 +--------
>  4 files changed, 27 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 6bf120bb1a17..d16e013e1b8c 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -80,6 +80,30 @@ xfs_find_bdev_for_inode(
>  		return mp->m_ddev_targp->bt_bdev;
>  }
>  
> +struct dax_device *
> +xfs_find_daxdev_for_inode(
> +	struct inode		*inode)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct block_device	*bdev;
> +	xfs_buftarg_t		*bt;
> +
> +	if (XFS_IS_REALTIME_INODE(ip))
> +		bt = mp->m_rtdev_targp;
> +	else
> +		bt = mp->m_ddev_targp;
> +
> +	bdev = bt->bt_bdev;
> +	if (!blk_queue_dax(bdev->bd_queue))
> +		return NULL;
> +
> +	if (!bt->bt_daxdev)
> +		bt->bt_daxdev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
> +
> +	return bt->bt_daxdev;
> +}
> +
>  /*
>   * We're now finished for good with this page.  Update the page state via the
>   * associated buffer_heads, paying attention to the start and end offsets that
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index cc174ec6c2fd..88c85ea63da0 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -59,5 +59,6 @@ int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
>  
>  extern void xfs_count_page_state(struct page *, int *, int *);
>  extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
> +extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
>  
>  #endif /* __XFS_AOPS_H__ */
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 20721261dae5..fc95a02f272a 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -108,6 +108,7 @@ typedef unsigned int xfs_buf_flags_t;
>  typedef struct xfs_buftarg {
>  	dev_t			bt_dev;
>  	struct block_device	*bt_bdev;
> +	struct dax_device	*bt_daxdev;
>  	struct xfs_mount	*bt_mount;
>  	unsigned int		bt_meta_sectorsize;
>  	size_t			bt_meta_sectormask;
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 813394c62849..3e8b9ec9e802 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -69,6 +69,7 @@ xfs_bmbt_to_iomap(
>  	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
>  	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
>  	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
> +	iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
>  }
>  
>  xfs_extlen_t
> @@ -976,7 +977,6 @@ xfs_file_iomap_begin(
>  	int			nimaps = 1, error = 0;
>  	bool			shared = false, trimmed = false;
>  	unsigned		lockmode;
> -	struct block_device	*bdev;
>  
>  	if (XFS_FORCED_SHUTDOWN(mp))
>  		return -EIO;
> @@ -1087,13 +1087,6 @@ xfs_file_iomap_begin(
>  
>  	xfs_bmbt_to_iomap(ip, iomap, &imap);
>  
> -	/* optionally associate a dax device with the iomap bdev */
> -	bdev = iomap->bdev;
> -	if (blk_queue_dax(bdev->bd_queue))
> -		iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
> -	else
> -		iomap->dax_dev = NULL;
> -
>  	if (shared)
>  		iomap->flags |= IOMAP_F_SHARED;
>  	return 0;
>
Dan Williams Aug. 25, 2017, 6:24 a.m. UTC | #2
On Thu, Aug 24, 2017 at 10:32 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Thu, Aug 24, 2017 at 05:35:48PM -0700, Dan Williams wrote:
>> The ->iomap_begin() operation is a hot path, so cache the
>> fs_dax_get_by_host() result to avoid the incurring the hash lookup
>> overhead.
>
> Just out of curiosity (and sorry if this has already been discussed to
> death and I'm merely ignorant) but I was wondering why the daxdev isn't
> simply attached to the block_device?  Is it not a 1:1 mapping?

The motivation was to remove all dax details from the block layer.
However, we still have the QUEUE_DAX flag which defeats that goal.
Hmm, a dax_device is similar to a request_queue as a path to access
capacity, so I think it might make sense to just replace the queue
flag with a full ->dax_dev pointer hanging off the gendisk.

This would also fix the bug I just realized was in these patches
around matching dax_get_by_host() with put_dax(). In fact, if it's
acceptable to hang a dax_dev off a gendisk then we don't need
dax_get_by_host() at all.
Christoph Hellwig Aug. 25, 2017, 7:01 a.m. UTC | #3
> +	bdev = bt->bt_bdev;
> +	if (!blk_queue_dax(bdev->bd_queue))
> +		return NULL;
> +
> +	if (!bt->bt_daxdev)
> +		bt->bt_daxdev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
> +
> +	return bt->bt_daxdev;

Please do this at mount time.
Christoph Hellwig Aug. 25, 2017, 7:02 a.m. UTC | #4
On Thu, Aug 24, 2017 at 11:24:10PM -0700, Dan Williams wrote:
> This would also fix the bug I just realized was in these patches
> around matching dax_get_by_host() with put_dax(). In fact, if it's
> acceptable to hang a dax_dev off a gendisk then we don't need
> dax_get_by_host() at all.

I want to get the block code out of the loop entirely.  I'm a little
overloaded with work at the moment, but my plan was to add a mount_dax
equivalent to mount_bdev, and add support to the XFS buffer cache and
log code to go straight to DAX.  For the buffer cache we'd still have
to double buffer to provide transaction gurantees, but for the log
formatting it directly to memory should also provide nice speedups.
diff mbox

Patch

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6bf120bb1a17..d16e013e1b8c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -80,6 +80,30 @@  xfs_find_bdev_for_inode(
 		return mp->m_ddev_targp->bt_bdev;
 }
 
+struct dax_device *
+xfs_find_daxdev_for_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct block_device	*bdev;
+	xfs_buftarg_t		*bt;
+
+	if (XFS_IS_REALTIME_INODE(ip))
+		bt = mp->m_rtdev_targp;
+	else
+		bt = mp->m_ddev_targp;
+
+	bdev = bt->bt_bdev;
+	if (!blk_queue_dax(bdev->bd_queue))
+		return NULL;
+
+	if (!bt->bt_daxdev)
+		bt->bt_daxdev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
+
+	return bt->bt_daxdev;
+}
+
 /*
  * We're now finished for good with this page.  Update the page state via the
  * associated buffer_heads, paying attention to the start and end offsets that
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index cc174ec6c2fd..88c85ea63da0 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -59,5 +59,6 @@  int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
+extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 20721261dae5..fc95a02f272a 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -108,6 +108,7 @@  typedef unsigned int xfs_buf_flags_t;
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
+	struct dax_device	*bt_daxdev;
 	struct xfs_mount	*bt_mount;
 	unsigned int		bt_meta_sectorsize;
 	size_t			bt_meta_sectormask;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 813394c62849..3e8b9ec9e802 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -69,6 +69,7 @@  xfs_bmbt_to_iomap(
 	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
 	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+	iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
 }
 
 xfs_extlen_t
@@ -976,7 +977,6 @@  xfs_file_iomap_begin(
 	int			nimaps = 1, error = 0;
 	bool			shared = false, trimmed = false;
 	unsigned		lockmode;
-	struct block_device	*bdev;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -1087,13 +1087,6 @@  xfs_file_iomap_begin(
 
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
 
-	/* optionally associate a dax device with the iomap bdev */
-	bdev = iomap->bdev;
-	if (blk_queue_dax(bdev->bd_queue))
-		iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
-	else
-		iomap->dax_dev = NULL;
-
 	if (shared)
 		iomap->flags |= IOMAP_F_SHARED;
 	return 0;