diff mbox

xfs: rewrite getbmap using the xfs_iext_* helpers

Message ID 20170828150612.16437-1-hch@lst.de (mailing list archive)
State Superseded
Headers show

Commit Message

Christoph Hellwig Aug. 28, 2017, 3:06 p.m. UTC
Currently getbmap uses xfs_bmapi_read to query the extent map, and then
fixes up various bits that are eventually reported to userspace.

This patch instead rewrites it to use xfs_iext_lookup_extent and
xfs_iext_get_extent to iteratively process the extent map.  This not
only avoids the need to allocate a map for the returned xfs_bmbt_irec
structures but also greatly simplifies the code.

There are two intentional behavior changes compared to the old code:

 - the current code reports unwritten extents that don't directly border
   a written one as unwritten even when not passing the BMV_IF_PREALLOC
   option, contrary to the documentation.  The new code requires the
   BMV_IF_PREALLOC flag to report the unwritten extent bit.
 - The new code never merges consecutive extents, unlike the old
   code that sometimes does it based on the boundaries of the
   xfs_bmapi_read calls.  Note that the extent merging behavior was
   entirely undocumented.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_bmap_util.c | 499 ++++++++++++++++++-------------------------------
 1 file changed, 185 insertions(+), 314 deletions(-)

Comments

Darrick J. Wong Aug. 28, 2017, 6:31 p.m. UTC | #1
On Mon, Aug 28, 2017 at 05:06:12PM +0200, Christoph Hellwig wrote:
> Currently getbmap uses xfs_bmapi_read to query the extent map, and then
> fixes up various bits that are eventually reported to userspace.
> 
> This patch instead rewrites it to use xfs_iext_lookup_extent and
> xfs_iext_get_extent to iteratively process the extent map.  This not
> only avoids the need to allocate a map for the returned xfs_bmbt_irec
> structures but also greatly simplifies the code.
> 
> There are two intentional behavior changes compared to the old code:
> 
>  - the current code reports unwritten extents that don't directly border
>    a written one as unwritten even when not passing the BMV_IF_PREALLOC
>    option, contrary to the documentation.  The new code requires the
>    BMV_IF_PREALLOC flag to report the unwritten extent bit.
>  - The new code never merges consecutive extents, unlike the old
>    code that sometimes does it based on the boundaries of the
>    xfs_bmapi_read calls.  Note that the extent merging behavior was
>    entirely undocumented.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Hmm... this causes at least a couple of xfstests regressions:

xfs/310, because we no longer merge adjacent bmap records.  I think the
solution here is to change the extent count check to allow 2 extents.

xfs/245 because zero-mapping forks are no longer reported as having one
big "hole" extent; instead zero extents are reported back.  How do we
want to handle this case?

--D

> ---
>  fs/xfs/xfs_bmap_util.c | 499 ++++++++++++++++++-------------------------------
>  1 file changed, 185 insertions(+), 314 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 93e955262d07..5962f119d4ff 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -404,125 +404,69 @@ xfs_bmap_count_blocks(
>  	return 0;
>  }
>  
> -/*
> - * returns 1 for success, 0 if we failed to map the extent.
> - */
> -STATIC int
> -xfs_getbmapx_fix_eof_hole(
> -	xfs_inode_t		*ip,		/* xfs incore inode pointer */
> -	int			whichfork,
> -	struct getbmapx		*out,		/* output structure */
> -	int			prealloced,	/* this is a file with
> -						 * preallocated data space */
> -	int64_t			end,		/* last block requested */
> -	xfs_fsblock_t		startblock,
> -	bool			moretocome)
> +static void
> +xfs_getbmap_report_one(
> +	struct xfs_inode		*ip,
> +	struct getbmapx			*bmv,
> +	int64_t				bmv_end,
> +	struct xfs_bmbt_irec		*got,
> +	struct getbmapx			*p)
>  {
> -	int64_t			fixlen;
> -	xfs_mount_t		*mp;		/* file system mount point */
> -	xfs_ifork_t		*ifp;		/* inode fork pointer */
> -	xfs_extnum_t		lastx;		/* last extent pointer */
> -	xfs_fileoff_t		fileblock;
> -
> -	if (startblock == HOLESTARTBLOCK) {
> -		mp = ip->i_mount;
> -		out->bmv_block = -1;
> -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
> -		fixlen -= out->bmv_offset;
> -		if (prealloced && out->bmv_offset + out->bmv_length == end) {
> -			/* Came to hole at EOF. Trim it. */
> -			if (fixlen <= 0)
> -				return 0;
> -			out->bmv_length = fixlen;
> -		}
> +	if (isnullstartblock(got->br_startblock) ||
> +	    got->br_startblock == DELAYSTARTBLOCK) {
> +		/*
> +		 * Delalloc extents that start beyond EOF can occur due to
> +		 * speculative EOF allocation when the delalloc extent is larger
> +		 * than the largest freespace extent at conversion time.  These
> +		 * extents cannot be converted by data writeback, so can exist
> +		 * here even if we are not supposed to be finding delalloc
> +		 * extents.
> +		 */
> +		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
> +			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
> +
> +		p->bmv_oflags |= BMV_OF_DELALLOC;
> +		p->bmv_block = -2;
>  	} else {
> -		if (startblock == DELAYSTARTBLOCK)
> -			out->bmv_block = -2;
> -		else
> -			out->bmv_block = xfs_fsb_to_db(ip, startblock);
> -		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
> -		ifp = XFS_IFORK_PTR(ip, whichfork);
> -		if (!moretocome &&
> -		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
> -		   (lastx == xfs_iext_count(ifp) - 1))
> -			out->bmv_oflags |= BMV_OF_LAST;
> +		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
>  	}
>  
> -	return 1;
> +	if (got->br_state == XFS_EXT_UNWRITTEN &&
> +	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
> +		p->bmv_oflags |= BMV_OF_PREALLOC;
> +
> +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
> +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
> +
> +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> +	bmv->bmv_entries++;
>  }
>  
> -/* Adjust the reported bmap around shared/unshared extent transitions. */
> -STATIC int
> -xfs_getbmap_adjust_shared(
> +static void
> +xfs_getbmap_report_hole(
>  	struct xfs_inode		*ip,
> -	int				whichfork,
> -	struct xfs_bmbt_irec		*map,
> -	struct getbmapx			*out,
> -	struct xfs_bmbt_irec		*next_map)
> +	struct getbmapx			*bmv,
> +	int64_t				bmv_end,
> +	xfs_fileoff_t			bno,
> +	xfs_fileoff_t			end,
> +	struct getbmapx			*p)
>  {
> -	struct xfs_mount		*mp = ip->i_mount;
> -	xfs_agnumber_t			agno;
> -	xfs_agblock_t			agbno;
> -	xfs_agblock_t			ebno;
> -	xfs_extlen_t			elen;
> -	xfs_extlen_t			nlen;
> -	int				error;
> +	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
> +		return;
>  
> -	next_map->br_startblock = NULLFSBLOCK;
> -	next_map->br_startoff = NULLFILEOFF;
> -	next_map->br_blockcount = 0;
> +	p->bmv_block = -1;
> +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
> +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
>  
> -	/* Only written data blocks can be shared. */
> -	if (!xfs_is_reflink_inode(ip) ||
> -	    whichfork != XFS_DATA_FORK ||
> -	    !xfs_bmap_is_real_extent(map))
> -		return 0;
> -
> -	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
> -	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
> -	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
> -			map->br_blockcount, &ebno, &elen, true);
> -	if (error)
> -		return error;
> -
> -	if (ebno == NULLAGBLOCK) {
> -		/* No shared blocks at all. */
> -		return 0;
> -	} else if (agbno == ebno) {
> -		/*
> -		 * Shared extent at (agbno, elen).  Shrink the reported
> -		 * extent length and prepare to move the start of map[i]
> -		 * to agbno+elen, with the aim of (re)formatting the new
> -		 * map[i] the next time through the inner loop.
> -		 */
> -		out->bmv_length = XFS_FSB_TO_BB(mp, elen);
> -		out->bmv_oflags |= BMV_OF_SHARED;
> -		if (elen != map->br_blockcount) {
> -			*next_map = *map;
> -			next_map->br_startblock += elen;
> -			next_map->br_startoff += elen;
> -			next_map->br_blockcount -= elen;
> -		}
> -		map->br_blockcount -= elen;
> -	} else {
> -		/*
> -		 * There's an unshared extent (agbno, ebno - agbno)
> -		 * followed by shared extent at (ebno, elen).  Shrink
> -		 * the reported extent length to cover only the unshared
> -		 * extent and prepare to move up the start of map[i] to
> -		 * ebno, with the aim of (re)formatting the new map[i]
> -		 * the next time through the inner loop.
> -		 */
> -		*next_map = *map;
> -		nlen = ebno - agbno;
> -		out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
> -		next_map->br_startblock += nlen;
> -		next_map->br_startoff += nlen;
> -		next_map->br_blockcount -= nlen;
> -		map->br_blockcount -= nlen;
> -	}
> +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> +	bmv->bmv_entries++;
> +}
>  
> -	return 0;
> +static inline bool xfs_getbmap_full(struct getbmapx *bmv, int nr_entries)
> +{
> +	return bmv->bmv_length == 0 || nr_entries >= bmv->bmv_count - 1;
>  }
>  
>  /*
> @@ -539,119 +483,72 @@ xfs_getbmap(
>  	xfs_bmap_format_t	formatter,	/* format to user */
>  	void			*arg)		/* formatter arg */
>  {
> -	int64_t			bmvend;		/* last block requested */
> -	int			error = 0;	/* return value */
> -	int64_t			fixlen;		/* length for -1 case */
> -	int			i;		/* extent number */
> -	int			lock;		/* lock state */
> -	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
> -	xfs_mount_t		*mp;		/* file system mount point */
> -	int			nex;		/* # of user extents can do */
> -	int			subnex;		/* # of bmapi's can do */
> -	int			nmap;		/* number of map entries */
> -	struct getbmapx		*out;		/* output structure */
> -	int			whichfork;	/* data or attr fork */
> -	int			prealloced;	/* this is a file with
> -						 * preallocated data space */
> -	int			iflags;		/* interface flags */
> -	int			bmapi_flags;	/* flags for xfs_bmapi */
> -	int			cur_ext = 0;
> -	struct xfs_bmbt_irec	inject_map;
> -
> -	mp = ip->i_mount;
> -	iflags = bmv->bmv_iflags;
> +	struct xfs_mount	*mp = ip->i_mount;
> +	int			iflags = bmv->bmv_iflags;
> +	int			whichfork, lock, i, nr_entries = 0, error = 0;
> +	int64_t			bmv_end, max_len;
> +	xfs_fileoff_t		bno, first_bno;
> +	struct xfs_ifork	*ifp;
> +	struct getbmapx		*out;
> +	struct xfs_bmbt_irec	got, rec;
> +	xfs_filblks_t		len;
> +	xfs_extnum_t		idx;
>  
>  #ifndef DEBUG
>  	/* Only allow CoW fork queries if we're debugging. */
>  	if (iflags & BMV_IF_COWFORK)
>  		return -EINVAL;
>  #endif
> +
>  	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
>  		return -EINVAL;
>  
> +	if (bmv->bmv_count <= 1)
> +		return -EINVAL;
> +	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> +		return -ENOMEM;
> +
> +	if (bmv->bmv_length < -1)
> +		return -EINVAL;
> +
> +	bmv->bmv_entries = 0;
> +	if (bmv->bmv_length == 0)
> +		return 0;
> +
> +	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> +	if (!out)
> +		return -ENOMEM;
> +
>  	if (iflags & BMV_IF_ATTRFORK)
>  		whichfork = XFS_ATTR_FORK;
>  	else if (iflags & BMV_IF_COWFORK)
>  		whichfork = XFS_COW_FORK;
>  	else
>  		whichfork = XFS_DATA_FORK;
> +	ifp = XFS_IFORK_PTR(ip, whichfork);
>  
> +	xfs_ilock(ip, XFS_IOLOCK_SHARED);
>  	switch (whichfork) {
>  	case XFS_ATTR_FORK:
> -		if (XFS_IFORK_Q(ip)) {
> -			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
> -			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
> -			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
> -				return -EINVAL;
> -		} else if (unlikely(
> -			   ip->i_d.di_aformat != 0 &&
> -			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
> -			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
> -					 ip->i_mount);
> -			return -EFSCORRUPTED;
> -		}
> +		if (!XFS_IFORK_Q(ip))
> +			goto out_unlock_iolock;
>  
> -		prealloced = 0;
> -		fixlen = 1LL << 32;
> +		max_len = 1LL << 32;
> +		lock = xfs_ilock_attr_map_shared(ip);
>  		break;
>  	case XFS_COW_FORK:
> -		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
> -			return -EINVAL;
> +		/* No CoW fork? Just return */
> +		if (!ifp)
> +			goto out_unlock_iolock;
>  
> -		if (xfs_get_cowextsz_hint(ip)) {
> -			prealloced = 1;
> -			fixlen = mp->m_super->s_maxbytes;
> -		} else {
> -			prealloced = 0;
> -			fixlen = XFS_ISIZE(ip);
> -		}
> -		break;
> -	default:
> -		/* Local format data forks report no extents. */
> -		if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
> -			bmv->bmv_entries = 0;
> -			return 0;
> -		}
> -		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
> -		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
> -			return -EINVAL;
> +		if (xfs_get_cowextsz_hint(ip))
> +			max_len = mp->m_super->s_maxbytes;
> +		else
> +			max_len = XFS_ISIZE(ip);
>  
> -		if (xfs_get_extsz_hint(ip) ||
> -		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
> -			prealloced = 1;
> -			fixlen = mp->m_super->s_maxbytes;
> -		} else {
> -			prealloced = 0;
> -			fixlen = XFS_ISIZE(ip);
> -		}
> +		lock = XFS_ILOCK_SHARED;
> +		xfs_ilock(ip, lock);
>  		break;
> -	}
> -
> -	if (bmv->bmv_length == -1) {
> -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
> -		bmv->bmv_length =
> -			max_t(int64_t, fixlen - bmv->bmv_offset, 0);
> -	} else if (bmv->bmv_length == 0) {
> -		bmv->bmv_entries = 0;
> -		return 0;
> -	} else if (bmv->bmv_length < 0) {
> -		return -EINVAL;
> -	}
> -
> -	nex = bmv->bmv_count - 1;
> -	if (nex <= 0)
> -		return -EINVAL;
> -	bmvend = bmv->bmv_offset + bmv->bmv_length;
> -
> -
> -	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> -		return -ENOMEM;
> -	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> -	if (!out)
> -		return -ENOMEM;
> -
> -	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -	switch (whichfork) {
>  	case XFS_DATA_FORK:
>  		if (!(iflags & BMV_IF_DELALLOC) &&
>  		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
> @@ -669,147 +566,121 @@ xfs_getbmap(
>  			 */
>  		}
>  
> +		if (xfs_get_extsz_hint(ip) ||
> +		    (ip->i_d.di_flags &
> +		     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
> +			max_len = mp->m_super->s_maxbytes;
> +		else
> +			max_len = XFS_ISIZE(ip);
> +
>  		lock = xfs_ilock_data_map_shared(ip);
>  		break;
> -	case XFS_COW_FORK:
> -		lock = XFS_ILOCK_SHARED;
> -		xfs_ilock(ip, lock);
> -		break;
> -	case XFS_ATTR_FORK:
> -		lock = xfs_ilock_attr_map_shared(ip);
> +	}
> +
> +	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
> +	case XFS_DINODE_FMT_EXTENTS:
> +	case XFS_DINODE_FMT_BTREE:
>  		break;
> +	case XFS_DINODE_FMT_LOCAL:
> +		/* Local format inode forks report no extents. */
> +		goto out_unlock_ilock;
> +	default:
> +		error = -EINVAL;
> +		goto out_unlock_ilock;
>  	}
>  
> -	/*
> -	 * Don't let nex be bigger than the number of extents
> -	 * we can have assuming alternating holes and real extents.
> -	 */
> -	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
> -		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
> +	if (bmv->bmv_length == -1) {
> +		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
> +		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
> +	}
>  
> -	bmapi_flags = xfs_bmapi_aflag(whichfork);
> -	if (!(iflags & BMV_IF_PREALLOC))
> -		bmapi_flags |= XFS_BMAPI_IGSTATE;
> +	bmv_end = bmv->bmv_offset + bmv->bmv_length;
>  
> -	/*
> -	 * Allocate enough space to handle "subnex" maps at a time.
> -	 */
> -	error = -ENOMEM;
> -	subnex = 16;
> -	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
> -	if (!map)
> +	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
> +	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
> +
> +	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> +		error = xfs_iread_extents(NULL, ip, whichfork);
> +		if (error)
> +			goto out_unlock_ilock;
> +	}
> +
> +	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
>  		goto out_unlock_ilock;
>  
> -	bmv->bmv_entries = 0;
> +	while (!xfs_getbmap_full(bmv, nr_entries)) {
> +		struct getbmapx		*p = &out[nr_entries];
>  
> -	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
> -	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
> -		error = 0;
> -		goto out_free_map;
> -	}
> +		xfs_trim_extent(&got, first_bno, len);
>  
> -	do {
> -		nmap = (nex> subnex) ? subnex : nex;
> -		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
> -				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
> -				       map, &nmap, bmapi_flags);
> -		if (error)
> -			goto out_free_map;
> -		ASSERT(nmap <= subnex);
> -
> -		for (i = 0; i < nmap && bmv->bmv_length &&
> -				cur_ext < bmv->bmv_count - 1; i++) {
> -			out[cur_ext].bmv_oflags = 0;
> -			if (map[i].br_state == XFS_EXT_UNWRITTEN)
> -				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
> -			else if (map[i].br_startblock == DELAYSTARTBLOCK)
> -				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
> -			out[cur_ext].bmv_offset =
> -				XFS_FSB_TO_BB(mp, map[i].br_startoff);
> -			out[cur_ext].bmv_length =
> -				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
> -			out[cur_ext].bmv_unused1 = 0;
> -			out[cur_ext].bmv_unused2 = 0;
> +		/*
> +		 * Report an entry for a hole if this extent doesn't directly
> +		 * follow the previous one.
> +		 */
> +		if (got.br_startoff > bno) {
> +			xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
> +					got.br_startoff, p++);
> +			if (xfs_getbmap_full(bmv, ++nr_entries))
> +				break;
> +		}
>  
> -			/*
> -			 * delayed allocation extents that start beyond EOF can
> -			 * occur due to speculative EOF allocation when the
> -			 * delalloc extent is larger than the largest freespace
> -			 * extent at conversion time. These extents cannot be
> -			 * converted by data writeback, so can exist here even
> -			 * if we are not supposed to be finding delalloc
> -			 * extents.
> -			 */
> -			if (map[i].br_startblock == DELAYSTARTBLOCK &&
> -			    map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
> -				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
> -
> -                        if (map[i].br_startblock == HOLESTARTBLOCK &&
> -			    whichfork == XFS_ATTR_FORK) {
> -				/* came to the end of attribute fork */
> -				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
> -				goto out_free_map;
> -			}
> +		/*
> +		 * In order to report shared extents accurately, we report each
> +		 * distinct shared / unshared part of a single bmbt record with
> +		 * an individual getbmapx record.
> +		 */
> +		rec = got;
> +		for (;;) {
> +			bool			shared = false, trimmed = false;
> +			xfs_fileoff_t		len;
>  
> -			/* Is this a shared block? */
> -			error = xfs_getbmap_adjust_shared(ip, whichfork,
> -					&map[i], &out[cur_ext], &inject_map);
> +			error = xfs_reflink_trim_around_shared(ip, &rec,
> +					&shared, &trimmed);
>  			if (error)
> -				goto out_free_map;
> +				goto out_unlock_ilock;
>  
> -			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
> -					&out[cur_ext], prealloced, bmvend,
> -					map[i].br_startblock,
> -					inject_map.br_startblock != NULLFSBLOCK))
> -				goto out_free_map;
> +			xfs_getbmap_report_one(ip, bmv, bmv_end, &rec, p);
> +			if (shared)
> +				p->bmv_oflags |= BMV_OF_SHARED;
> +			if (!trimmed)
> +				break;
>  
> -			bmv->bmv_offset =
> -				out[cur_ext].bmv_offset +
> -				out[cur_ext].bmv_length;
> -			bmv->bmv_length =
> -				max_t(int64_t, 0, bmvend - bmv->bmv_offset);
> +			len = got.br_startoff + got.br_blockcount -
> +				(rec.br_startoff + rec.br_blockcount);
>  
> -			/*
> -			 * In case we don't want to return the hole,
> -			 * don't increase cur_ext so that we can reuse
> -			 * it in the next loop.
> -			 */
> -			if ((iflags & BMV_IF_NO_HOLES) &&
> -			    map[i].br_startblock == HOLESTARTBLOCK) {
> -				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
> -				continue;
> -			}
> +			rec.br_startoff += rec.br_blockcount;
> +			if (rec.br_startblock != DELAYSTARTBLOCK)
> +				rec.br_startblock += rec.br_blockcount;
> +			rec.br_blockcount = len;
> +		}
>  
> -			/*
> -			 * In order to report shared extents accurately,
> -			 * we report each distinct shared/unshared part
> -			 * of a single bmbt record using multiple bmap
> -			 * extents.  To make that happen, we iterate the
> -			 * same map array item multiple times, each
> -			 * time trimming out the subextent that we just
> -			 * reported.
> -			 *
> -			 * Because of this, we must check the out array
> -			 * index (cur_ext) directly against bmv_count-1
> -			 * to avoid overflows.
> -			 */
> -			if (inject_map.br_startblock != NULLFSBLOCK) {
> -				map[i] = inject_map;
> -				i--;
> +		bno = got.br_startoff + got.br_blockcount;
> +		nr_entries++;
> +
> +		if (!xfs_iext_get_extent(ifp, ++idx, &got)) {
> +			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
> +
> +			p->bmv_oflags |= BMV_OF_LAST;
> +
> +			if (whichfork != XFS_ATTR_FORK && bno < end &&
> +			    !xfs_getbmap_full(bmv, nr_entries)) {
> +				xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
> +						end, ++p);
> +				nr_entries++;
>  			}
> -			bmv->bmv_entries++;
> -			cur_ext++;
> +			break;
>  		}
> -	} while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
>  
> - out_free_map:
> -	kmem_free(map);
> - out_unlock_ilock:
> +		if (bno >= first_bno + len)
> +			break;
> +	}
> +
> +out_unlock_ilock:
>  	xfs_iunlock(ip, lock);
> - out_unlock_iolock:
> +out_unlock_iolock:
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
> -	for (i = 0; i < cur_ext; i++) {
> +	for (i = 0; i < nr_entries; i++) {
>  		/* format results & advance arg */
>  		error = formatter(&arg, &out[i]);
>  		if (error)
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Aug. 28, 2017, 7:35 p.m. UTC | #2
On Mon, Aug 28, 2017 at 11:31:42AM -0700, Darrick J. Wong wrote:
> Hmm... this causes at least a couple of xfstests regressions:

Oh.  Looks like I didn't do a rmap run as none of my runs showed
a regression.

> xfs/310, because we no longer merge adjacent bmap records.  I think the
> solution here is to change the extent count check to allow 2 extents.

Yeah.

> xfs/245 because zero-mapping forks are no longer reported as having one
> big "hole" extent; instead zero extents are reported back.  How do we
> want to handle this case?

Hmm.  Looks like that is the COW fork, as even the current code
never reports a hole at the end when there are no extents at all.
It does however when we have at least one extent:

root@brick:/home/hch/work/xfs# uname -a
Linux brick 4.9.0-3-amd64 #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26) x86_64 GNU/Linux
root@brick:/home/hch/work/xfs# truncate --size 10g foo
root@brick:/home/hch/work/xfs# xfs_bmap foo
foo: no extents
root@brick:/home/hch/work/xfs# fallocate -l 10m foo
root@brick:/home/hch/work/xfs# xfs_bmap foo
foo:
	0: [0..20479]: 786860592..786881071
	1: [20480..20971519]: hole

in this case I suspect we should try to treat the COW fork the
same.  But let me take a more detailed look at xfs/245 on what's
going on there.

--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 28, 2017, 9:01 p.m. UTC | #3
On Mon, Aug 28, 2017 at 09:35:07PM +0200, Christoph Hellwig wrote:
> On Mon, Aug 28, 2017 at 11:31:42AM -0700, Darrick J. Wong wrote:
> > Hmm... this causes at least a couple of xfstests regressions:
> 
> Oh.  Looks like I didn't do a rmap run as none of my runs showed
> a regression.
> 
> > xfs/310, because we no longer merge adjacent bmap records.  I think the
> > solution here is to change the extent count check to allow 2 extents.
> 
> Yeah.
> 
> > xfs/245 because zero-mapping forks are no longer reported as having one
> > big "hole" extent; instead zero extents are reported back.  How do we
> > want to handle this case?
> 
> Hmm.  Looks like that is the COW fork, as even the current code
> never reports a hole at then end when there are no extents at all.
> It does however when we have at least one extent:
> 
> root@brick:/home/hch/work/xfs# uname -a
> Linux brick 4.9.0-3-amd64 #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26) x86_64 GNU/Linux
> root@brick:/home/hch/work/xfs# truncate --size 10g foo
> root@brick:/home/hch/work/xfs# xfs_bmap foo
> foo: no extents
> root@brick:/home/hch/work/xfs# fallocate -l 10m foo
> root@brick:/home/hch/work/xfs# xfs_bmap foo
> foo:
> 	0: [0..20479]: 786860592..786881071
> 	1: [20480..20971519]: hole
> 
> in this case I suspect we should try to treat the COW fork the
> same.  But let me take a more detailed look at xfs/245 on what's
> going on there.

Aha, I see, it's only if you pass in -e to xfs_bmap then you get a hole record:

$ truncate -s 50m /storage/tmp/a
$ xfs_io -c 'bmap -elpv' /storage/tmp/a
/storage/tmp/a:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET         TOTAL FLAGS
   0: [0..102399]:     hole                                 102400
$ xfs_bmap /storage/tmp/a
/storage/tmp/a: no extents
$ xfs_io -c 'bmap -lpv' /storage/tmp/a
/storage/tmp/a: no extents

(eesh.)

--D

> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Aug. 28, 2017, 9:20 p.m. UTC | #4
On Mon, Aug 28, 2017 at 05:06:12PM +0200, Christoph Hellwig wrote:
> Currently getbmap uses xfs_bmapi_read to query the extent map, and then
> fixes up various bits that are eventually reported to userspace.
> 
> This patch instead rewrites it to use xfs_iext_lookup_extent and
> xfs_iext_get_extent to iteratively process the extent map.  This not
> only avoids the need to allocate a map for the returned xfs_bmbt_irec
> structures but also greatly simplifies the code.
> 
> There are two intentional behavior changes compared to the old code:
> 
>  - the current code reports unwritten extents that don't directly border
>    a written one as unwritten even when not passing the BMV_IF_PREALLOC
>    option, contrary to the documentation.  The new code requires the
>    BMV_IF_PREALLOC flag to report the unwritten extent bit.
>  - The new code never merges consecutive extents, unlike the old
>    code that sometimes does it based on the boundaries of the
>    xfs_bmapi_read calls.  Note that the extent merging behavior was
>    entirely undocumented.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/xfs_bmap_util.c | 499 ++++++++++++++++++-------------------------------
>  1 file changed, 185 insertions(+), 314 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 93e955262d07..5962f119d4ff 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -404,125 +404,69 @@ xfs_bmap_count_blocks(
>  	return 0;
>  }
>  
> -/*
> - * returns 1 for success, 0 if we failed to map the extent.
> - */
> -STATIC int
> -xfs_getbmapx_fix_eof_hole(
> -	xfs_inode_t		*ip,		/* xfs incore inode pointer */
> -	int			whichfork,
> -	struct getbmapx		*out,		/* output structure */
> -	int			prealloced,	/* this is a file with
> -						 * preallocated data space */
> -	int64_t			end,		/* last block requested */
> -	xfs_fsblock_t		startblock,
> -	bool			moretocome)
> +static void
> +xfs_getbmap_report_one(
> +	struct xfs_inode		*ip,
> +	struct getbmapx			*bmv,
> +	int64_t				bmv_end,
> +	struct xfs_bmbt_irec		*got,
> +	struct getbmapx			*p)
>  {
> -	int64_t			fixlen;
> -	xfs_mount_t		*mp;		/* file system mount point */
> -	xfs_ifork_t		*ifp;		/* inode fork pointer */
> -	xfs_extnum_t		lastx;		/* last extent pointer */
> -	xfs_fileoff_t		fileblock;
> -
> -	if (startblock == HOLESTARTBLOCK) {
> -		mp = ip->i_mount;
> -		out->bmv_block = -1;
> -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
> -		fixlen -= out->bmv_offset;
> -		if (prealloced && out->bmv_offset + out->bmv_length == end) {
> -			/* Came to hole at EOF. Trim it. */
> -			if (fixlen <= 0)
> -				return 0;
> -			out->bmv_length = fixlen;
> -		}
> +	if (isnullstartblock(got->br_startblock) ||
> +	    got->br_startblock == DELAYSTARTBLOCK) {
> +		/*
> +		 * Delalloc extents that start beyond EOF can occur due to
> +		 * speculative EOF allocation when the delalloc extent is larger
> +		 * than the largest freespace extent at conversion time.  These
> +		 * extents cannot be converted by data writeback, so can exist
> +		 * here even if we are not supposed to be finding delalloc
> +		 * extents.
> +		 */
> +		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
> +			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
> +
> +		p->bmv_oflags |= BMV_OF_DELALLOC;
> +		p->bmv_block = -2;
>  	} else {
> -		if (startblock == DELAYSTARTBLOCK)
> -			out->bmv_block = -2;
> -		else
> -			out->bmv_block = xfs_fsb_to_db(ip, startblock);
> -		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
> -		ifp = XFS_IFORK_PTR(ip, whichfork);
> -		if (!moretocome &&
> -		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
> -		   (lastx == xfs_iext_count(ifp) - 1))
> -			out->bmv_oflags |= BMV_OF_LAST;
> +		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
>  	}
>  
> -	return 1;
> +	if (got->br_state == XFS_EXT_UNWRITTEN &&
> +	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
> +		p->bmv_oflags |= BMV_OF_PREALLOC;
> +
> +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
> +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
> +
> +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> +	bmv->bmv_entries++;
>  }
>  
> -/* Adjust the reported bmap around shared/unshared extent transitions. */
> -STATIC int
> -xfs_getbmap_adjust_shared(
> +static void
> +xfs_getbmap_report_hole(
>  	struct xfs_inode		*ip,
> -	int				whichfork,
> -	struct xfs_bmbt_irec		*map,
> -	struct getbmapx			*out,
> -	struct xfs_bmbt_irec		*next_map)
> +	struct getbmapx			*bmv,
> +	int64_t				bmv_end,
> +	xfs_fileoff_t			bno,
> +	xfs_fileoff_t			end,
> +	struct getbmapx			*p)
>  {
> -	struct xfs_mount		*mp = ip->i_mount;
> -	xfs_agnumber_t			agno;
> -	xfs_agblock_t			agbno;
> -	xfs_agblock_t			ebno;
> -	xfs_extlen_t			elen;
> -	xfs_extlen_t			nlen;
> -	int				error;
> +	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
> +		return;
>  
> -	next_map->br_startblock = NULLFSBLOCK;
> -	next_map->br_startoff = NULLFILEOFF;
> -	next_map->br_blockcount = 0;
> +	p->bmv_block = -1;
> +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
> +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
>  
> -	/* Only written data blocks can be shared. */
> -	if (!xfs_is_reflink_inode(ip) ||
> -	    whichfork != XFS_DATA_FORK ||
> -	    !xfs_bmap_is_real_extent(map))
> -		return 0;
> -
> -	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
> -	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
> -	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
> -			map->br_blockcount, &ebno, &elen, true);
> -	if (error)
> -		return error;
> -
> -	if (ebno == NULLAGBLOCK) {
> -		/* No shared blocks at all. */
> -		return 0;
> -	} else if (agbno == ebno) {
> -		/*
> -		 * Shared extent at (agbno, elen).  Shrink the reported
> -		 * extent length and prepare to move the start of map[i]
> -		 * to agbno+elen, with the aim of (re)formatting the new
> -		 * map[i] the next time through the inner loop.
> -		 */
> -		out->bmv_length = XFS_FSB_TO_BB(mp, elen);
> -		out->bmv_oflags |= BMV_OF_SHARED;
> -		if (elen != map->br_blockcount) {
> -			*next_map = *map;
> -			next_map->br_startblock += elen;
> -			next_map->br_startoff += elen;
> -			next_map->br_blockcount -= elen;
> -		}
> -		map->br_blockcount -= elen;
> -	} else {
> -		/*
> -		 * There's an unshared extent (agbno, ebno - agbno)
> -		 * followed by shared extent at (ebno, elen).  Shrink
> -		 * the reported extent length to cover only the unshared
> -		 * extent and prepare to move up the start of map[i] to
> -		 * ebno, with the aim of (re)formatting the new map[i]
> -		 * the next time through the inner loop.
> -		 */
> -		*next_map = *map;
> -		nlen = ebno - agbno;
> -		out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
> -		next_map->br_startblock += nlen;
> -		next_map->br_startoff += nlen;
> -		next_map->br_blockcount -= nlen;
> -		map->br_blockcount -= nlen;
> -	}
> +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> +	bmv->bmv_entries++;
> +}
>  
> -	return 0;
> +static inline bool xfs_getbmap_full(struct getbmapx *bmv, int nr_entries)
> +{
> +	return bmv->bmv_length == 0 || nr_entries >= bmv->bmv_count - 1;
>  }
>  
>  /*
> @@ -539,119 +483,72 @@ xfs_getbmap(
>  	xfs_bmap_format_t	formatter,	/* format to user */
>  	void			*arg)		/* formatter arg */
>  {
> -	int64_t			bmvend;		/* last block requested */
> -	int			error = 0;	/* return value */
> -	int64_t			fixlen;		/* length for -1 case */
> -	int			i;		/* extent number */
> -	int			lock;		/* lock state */
> -	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
> -	xfs_mount_t		*mp;		/* file system mount point */
> -	int			nex;		/* # of user extents can do */
> -	int			subnex;		/* # of bmapi's can do */
> -	int			nmap;		/* number of map entries */
> -	struct getbmapx		*out;		/* output structure */
> -	int			whichfork;	/* data or attr fork */
> -	int			prealloced;	/* this is a file with
> -						 * preallocated data space */
> -	int			iflags;		/* interface flags */
> -	int			bmapi_flags;	/* flags for xfs_bmapi */
> -	int			cur_ext = 0;
> -	struct xfs_bmbt_irec	inject_map;
> -
> -	mp = ip->i_mount;
> -	iflags = bmv->bmv_iflags;
> +	struct xfs_mount	*mp = ip->i_mount;
> +	int			iflags = bmv->bmv_iflags;
> +	int			whichfork, lock, i, nr_entries = 0, error = 0;
> +	int64_t			bmv_end, max_len;
> +	xfs_fileoff_t		bno, first_bno;
> +	struct xfs_ifork	*ifp;
> +	struct getbmapx		*out;
> +	struct xfs_bmbt_irec	got, rec;
> +	xfs_filblks_t		len;
> +	xfs_extnum_t		idx;
>  
>  #ifndef DEBUG
>  	/* Only allow CoW fork queries if we're debugging. */
>  	if (iflags & BMV_IF_COWFORK)
>  		return -EINVAL;
>  #endif
> +
>  	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
>  		return -EINVAL;
>  
> +	if (bmv->bmv_count <= 1)
> +		return -EINVAL;
> +	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> +		return -ENOMEM;
> +
> +	if (bmv->bmv_length < -1)
> +		return -EINVAL;
> +
> +	bmv->bmv_entries = 0;
> +	if (bmv->bmv_length == 0)
> +		return 0;
> +
> +	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> +	if (!out)
> +		return -ENOMEM;
> +
>  	if (iflags & BMV_IF_ATTRFORK)
>  		whichfork = XFS_ATTR_FORK;
>  	else if (iflags & BMV_IF_COWFORK)
>  		whichfork = XFS_COW_FORK;
>  	else
>  		whichfork = XFS_DATA_FORK;
> +	ifp = XFS_IFORK_PTR(ip, whichfork);
>  
> +	xfs_ilock(ip, XFS_IOLOCK_SHARED);
>  	switch (whichfork) {
>  	case XFS_ATTR_FORK:
> -		if (XFS_IFORK_Q(ip)) {
> -			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
> -			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
> -			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
> -				return -EINVAL;
> -		} else if (unlikely(
> -			   ip->i_d.di_aformat != 0 &&
> -			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
> -			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
> -					 ip->i_mount);
> -			return -EFSCORRUPTED;
> -		}
> +		if (!XFS_IFORK_Q(ip))
> +			goto out_unlock_iolock;
>  
> -		prealloced = 0;
> -		fixlen = 1LL << 32;
> +		max_len = 1LL << 32;
> +		lock = xfs_ilock_attr_map_shared(ip);
>  		break;
>  	case XFS_COW_FORK:
> -		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
> -			return -EINVAL;
> +		/* No CoW fork? Just return */
> +		if (!ifp)
> +			goto out_unlock_iolock;
>  
> -		if (xfs_get_cowextsz_hint(ip)) {
> -			prealloced = 1;
> -			fixlen = mp->m_super->s_maxbytes;
> -		} else {
> -			prealloced = 0;
> -			fixlen = XFS_ISIZE(ip);
> -		}
> -		break;
> -	default:
> -		/* Local format data forks report no extents. */
> -		if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
> -			bmv->bmv_entries = 0;
> -			return 0;
> -		}
> -		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
> -		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
> -			return -EINVAL;
> +		if (xfs_get_cowextsz_hint(ip))
> +			max_len = mp->m_super->s_maxbytes;
> +		else
> +			max_len = XFS_ISIZE(ip);
>  
> -		if (xfs_get_extsz_hint(ip) ||
> -		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
> -			prealloced = 1;
> -			fixlen = mp->m_super->s_maxbytes;
> -		} else {
> -			prealloced = 0;
> -			fixlen = XFS_ISIZE(ip);
> -		}
> +		lock = XFS_ILOCK_SHARED;
> +		xfs_ilock(ip, lock);
>  		break;
> -	}
> -
> -	if (bmv->bmv_length == -1) {
> -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
> -		bmv->bmv_length =
> -			max_t(int64_t, fixlen - bmv->bmv_offset, 0);
> -	} else if (bmv->bmv_length == 0) {
> -		bmv->bmv_entries = 0;
> -		return 0;
> -	} else if (bmv->bmv_length < 0) {
> -		return -EINVAL;
> -	}
> -
> -	nex = bmv->bmv_count - 1;
> -	if (nex <= 0)
> -		return -EINVAL;
> -	bmvend = bmv->bmv_offset + bmv->bmv_length;
> -
> -
> -	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> -		return -ENOMEM;
> -	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> -	if (!out)
> -		return -ENOMEM;
> -
> -	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> -	switch (whichfork) {
>  	case XFS_DATA_FORK:
>  		if (!(iflags & BMV_IF_DELALLOC) &&
>  		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
> @@ -669,147 +566,121 @@ xfs_getbmap(
>  			 */
>  		}
>  
> +		if (xfs_get_extsz_hint(ip) ||
> +		    (ip->i_d.di_flags &
> +		     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
> +			max_len = mp->m_super->s_maxbytes;
> +		else
> +			max_len = XFS_ISIZE(ip);
> +
>  		lock = xfs_ilock_data_map_shared(ip);
>  		break;
> -	case XFS_COW_FORK:
> -		lock = XFS_ILOCK_SHARED;
> -		xfs_ilock(ip, lock);
> -		break;
> -	case XFS_ATTR_FORK:
> -		lock = xfs_ilock_attr_map_shared(ip);
> +	}
> +
> +	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
> +	case XFS_DINODE_FMT_EXTENTS:
> +	case XFS_DINODE_FMT_BTREE:
>  		break;
> +	case XFS_DINODE_FMT_LOCAL:
> +		/* Local format inode forks report no extents. */
> +		goto out_unlock_ilock;
> +	default:
> +		error = -EINVAL;
> +		goto out_unlock_ilock;
>  	}
>  
> -	/*
> -	 * Don't let nex be bigger than the number of extents
> -	 * we can have assuming alternating holes and real extents.
> -	 */
> -	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
> -		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
> +	if (bmv->bmv_length == -1) {
> +		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
> +		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
> +	}
>  
> -	bmapi_flags = xfs_bmapi_aflag(whichfork);
> -	if (!(iflags & BMV_IF_PREALLOC))
> -		bmapi_flags |= XFS_BMAPI_IGSTATE;
> +	bmv_end = bmv->bmv_offset + bmv->bmv_length;
>  
> -	/*
> -	 * Allocate enough space to handle "subnex" maps at a time.
> -	 */
> -	error = -ENOMEM;
> -	subnex = 16;
> -	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
> -	if (!map)
> +	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
> +	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
> +
> +	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> +		error = xfs_iread_extents(NULL, ip, whichfork);
> +		if (error)
> +			goto out_unlock_ilock;
> +	}
> +
> +	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
>  		goto out_unlock_ilock;
>  
> -	bmv->bmv_entries = 0;
> +	while (!xfs_getbmap_full(bmv, nr_entries)) {
> +		struct getbmapx		*p = &out[nr_entries];
>  
> -	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
> -	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
> -		error = 0;
> -		goto out_free_map;
> -	}
> +		xfs_trim_extent(&got, first_bno, len);
>  
> -	do {
> -		nmap = (nex> subnex) ? subnex : nex;
> -		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
> -				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
> -				       map, &nmap, bmapi_flags);
> -		if (error)
> -			goto out_free_map;
> -		ASSERT(nmap <= subnex);
> -
> -		for (i = 0; i < nmap && bmv->bmv_length &&
> -				cur_ext < bmv->bmv_count - 1; i++) {
> -			out[cur_ext].bmv_oflags = 0;
> -			if (map[i].br_state == XFS_EXT_UNWRITTEN)
> -				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
> -			else if (map[i].br_startblock == DELAYSTARTBLOCK)
> -				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
> -			out[cur_ext].bmv_offset =
> -				XFS_FSB_TO_BB(mp, map[i].br_startoff);
> -			out[cur_ext].bmv_length =
> -				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
> -			out[cur_ext].bmv_unused1 = 0;
> -			out[cur_ext].bmv_unused2 = 0;
> +		/*
> +		 * Report an entry for a hole if this extent doesn't directly
> +		 * follow the previous one.
> +		 */
> +		if (got.br_startoff > bno) {
> +			xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
> +					got.br_startoff, p++);
> +			if (xfs_getbmap_full(bmv, ++nr_entries))
> +				break;
> +		}
>  
> -			/*
> -			 * delayed allocation extents that start beyond EOF can
> -			 * occur due to speculative EOF allocation when the
> -			 * delalloc extent is larger than the largest freespace
> -			 * extent at conversion time. These extents cannot be
> -			 * converted by data writeback, so can exist here even
> -			 * if we are not supposed to be finding delalloc
> -			 * extents.
> -			 */
> -			if (map[i].br_startblock == DELAYSTARTBLOCK &&
> -			    map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
> -				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
> -
> -                        if (map[i].br_startblock == HOLESTARTBLOCK &&
> -			    whichfork == XFS_ATTR_FORK) {
> -				/* came to the end of attribute fork */
> -				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
> -				goto out_free_map;
> -			}
> +		/*
> +		 * In order to report shared extents accurately, we report each
> +		 * distinct shared / unshared part of a single bmbt record with
> +		 * an individual getbmapx record.
> +		 */
> +		rec = got;
> +		for (;;) {

while (!xfs_getbmap_full()) ?

> +			bool			shared = false, trimmed = false;
> +			xfs_fileoff_t		len;
>  
> -			/* Is this a shared block? */
> -			error = xfs_getbmap_adjust_shared(ip, whichfork,
> -					&map[i], &out[cur_ext], &inject_map);
> +			error = xfs_reflink_trim_around_shared(ip, &rec,
> +					&shared, &trimmed);
>  			if (error)
> -				goto out_free_map;
> +				goto out_unlock_ilock;
>  
> -			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
> -					&out[cur_ext], prealloced, bmvend,
> -					map[i].br_startblock,
> -					inject_map.br_startblock != NULLFSBLOCK))
> -				goto out_free_map;
> +			xfs_getbmap_report_one(ip, bmv, bmv_end, &rec, p);
> +			if (shared)
> +				p->bmv_oflags |= BMV_OF_SHARED;

Shouldn't we advance p/nr_entries?  What if we have a single partially
shared extent?  Also, what's the difference between nr_entries and
bmv->bmv_entries?  They both track the number of bmv entries we've
filled out, right?

I tried a simple test (which I guess we should turn into an xfstest, sigh):

$ xfs_io -c 'pwrite 0 1m' /opt/a -f
wrote 1048576/1048576 bytes at offset 0
1 MiB, 256 ops; 0.0000 sec (180.571 MiB/sec and 46226.0744 ops/sec)
$ cp --reflink=always /opt/a /opt/b
$ xfs_io -c 'bmap -elpv' /opt/a
/opt/a:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..2047]:       192..2239         0 (192..2239)       2048 100000
$ xfs_io -c 'bmap -elpv' /opt/b
/opt/b:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..2047]:       192..2239         0 (192..2239)       2048 100000

Then tried to CoW the middle of the extent:

$ xfs_io -c 'pwrite 200k 4k' -c 'pwrite 700k 4k' -c fsync /opt/b
wrote 4096/4096 bytes at offset 204800
4 KiB, 1 ops; 0.0000 sec (5.964 MiB/sec and 1526.7176 ops/sec)
wrote 4096/4096 bytes at offset 716800
4 KiB, 1 ops; 0.0000 sec (19.829 MiB/sec and 5076.1421 ops/sec)
$ xfs_io -c 'bmap -elpv' /opt/a
/opt/a:
 EXT: FILE-OFFSET                     BLOCK-RANGE             AG AG-OFFSET                                TOTAL FLAGS
   0: [1408..2047]:                   1600..2239               0 (1600..2239)                               640 100000
   1: [4289600..8495807]:             delalloc                                                          4206208
   2: [4289432..8591138]:             4206160..8507866         2 (929360..5231066)                      4301707 000000
   3: [4611686027017322496..461168602 4301714..8602913         2 (1024914..5326113)                     4301200 000000
   4: [4226304..-4611686009833227009] 0..-4611686009837453313  0 (0..-4611686009837453313) -4611686009837453312 000000
$ xfs_io -c 'bmap -elpv' /opt/b
/opt/b:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL FLAGS
   0: [0..399]:        192..591          0 (192..591)         400 100000
   1: [400..407]:      2384..2391        0 (2384..2391)         8 000000
   2: [408..1399]:     600..1591         0 (600..1591)        992 100000
   3: [1400..1407]:    3384..3391        0 (3384..3391)         8 000000
   4: [1408..2047]:    1600..2239        0 (1600..2239)       640 100000

Ugh, something is seriously messed up here.  By comparison, FIEMAP works fine:

$ filefrag -v /opt/a
Filesystem type is: 58465342
File size of /opt/a is 1048576 (256 blocks of 4096 bytes)
 ext:     logical_offset:        physical_offset: length:   expected: flags:
   0:        0..      49:         24..        73:     50:             shared
   1:       50..      50:         74..        74:      1:            
   2:       51..     174:         75..       198:    124:             shared
   3:      175..     175:        199..       199:      1:            
   4:      176..     255:        200..       279:     80:             last,shared,eof
/opt/a: 1 extent found
$ filefrag -v /opt/b
Filesystem type is: 58465342
File size of /opt/b is 1048576 (256 blocks of 4096 bytes)
 ext:     logical_offset:        physical_offset: length:   expected: flags:
   0:        0..      49:         24..        73:     50:             shared
   1:       50..      50:        298..       298:      1:         74:
   2:       51..     174:         75..       198:    124:        299: shared
   3:      175..     175:        423..       423:      1:        199:
   4:      176..     255:        200..       279:     80:        424: last,shared,eof
/opt/b: 5 extents found

> +			if (!trimmed)
> +				break;
>  
> -			bmv->bmv_offset =
> -				out[cur_ext].bmv_offset +
> -				out[cur_ext].bmv_length;
> -			bmv->bmv_length =
> -				max_t(int64_t, 0, bmvend - bmv->bmv_offset);
> +			len = got.br_startoff + got.br_blockcount -
> +				(rec.br_startoff + rec.br_blockcount);
>  
> -			/*
> -			 * In case we don't want to return the hole,
> -			 * don't increase cur_ext so that we can reuse
> -			 * it in the next loop.
> -			 */
> -			if ((iflags & BMV_IF_NO_HOLES) &&
> -			    map[i].br_startblock == HOLESTARTBLOCK) {
> -				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
> -				continue;
> -			}
> +			rec.br_startoff += rec.br_blockcount;
> +			if (rec.br_startblock != DELAYSTARTBLOCK)
> +				rec.br_startblock += rec.br_blockcount;
> +			rec.br_blockcount = len;
> +		}
>  
> -			/*
> -			 * In order to report shared extents accurately,
> -			 * we report each distinct shared/unshared part
> -			 * of a single bmbt record using multiple bmap
> -			 * extents.  To make that happen, we iterate the
> -			 * same map array item multiple times, each
> -			 * time trimming out the subextent that we just
> -			 * reported.
> -			 *
> -			 * Because of this, we must check the out array
> -			 * index (cur_ext) directly against bmv_count-1
> -			 * to avoid overflows.
> -			 */
> -			if (inject_map.br_startblock != NULLFSBLOCK) {
> -				map[i] = inject_map;
> -				i--;
> +		bno = got.br_startoff + got.br_blockcount;
> +		nr_entries++;
> +
> +		if (!xfs_iext_get_extent(ifp, ++idx, &got)) {
> +			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
> +
> +			p->bmv_oflags |= BMV_OF_LAST;

Isn't BMV_OF_LAST supposed to be set only on the last extent of the
dataset returned?  If I ask for the mappings for blocks 100-200 and
there's a hole from 190-200, shouldn't OF_LAST be set on the 190-200
extent?

--D

> +
> +			if (whichfork != XFS_ATTR_FORK && bno < end &&
> +			    !xfs_getbmap_full(bmv, nr_entries)) {
> +				xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
> +						end, ++p);
> +				nr_entries++;
>  			}
> -			bmv->bmv_entries++;
> -			cur_ext++;
> +			break;
>  		}
> -	} while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
>  
> - out_free_map:
> -	kmem_free(map);
> - out_unlock_ilock:
> +		if (bno >= first_bno + len)
> +			break;
> +	}
> +
> +out_unlock_ilock:
>  	xfs_iunlock(ip, lock);
> - out_unlock_iolock:
> +out_unlock_iolock:
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
> -	for (i = 0; i < cur_ext; i++) {
> +	for (i = 0; i < nr_entries; i++) {
>  		/* format results & advance arg */
>  		error = formatter(&arg, &out[i]);
>  		if (error)
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Aug. 29, 2017, 2:38 p.m. UTC | #5
On Mon, Aug 28, 2017 at 02:20:24PM -0700, Darrick J. Wong wrote:
> On Mon, Aug 28, 2017 at 05:06:12PM +0200, Christoph Hellwig wrote:
> > Currently getbmap uses xfs_bmapi_read to query the extent map, and then
> > fixes up various bits that are eventually reported to userspace.
> > 
> > This patch instead rewrites it to use xfs_iext_lookup_extent and
> > xfs_iext_get_extent to iteratively process the extent map.  This not
> > only avoids the need to allocate a map for the returned xfs_bmbt_irec
> > structures but also greatly simplified the code.
> > 
> > There are two intentional behavior changes compared to the old code:
> > 
> >  - the current code reports unwritten extents that don't directly border
> >    a written one as unwritten even when not passing the BMV_IF_PREALLOC
> >    option, contrary to the documentation.  The new code requires the
> >    BMV_IF_PREALLOC flag to report the unwrittent extent bit.
> >  - The new code does never merges consecutive extents, unlike the old
> >    code that sometimes does it based on the boundaries of the
> >    xfs_bmapi_read calls.  Note that the extent merging behavior was
> >    entirely undocumented.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  fs/xfs/xfs_bmap_util.c | 499 ++++++++++++++++++-------------------------------
> >  1 file changed, 185 insertions(+), 314 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> > index 93e955262d07..5962f119d4ff 100644
> > --- a/fs/xfs/xfs_bmap_util.c
> > +++ b/fs/xfs/xfs_bmap_util.c
> > @@ -404,125 +404,69 @@ xfs_bmap_count_blocks(
> >  	return 0;
> >  }
> >  
> > -/*
> > - * returns 1 for success, 0 if we failed to map the extent.
> > - */
> > -STATIC int
> > -xfs_getbmapx_fix_eof_hole(
> > -	xfs_inode_t		*ip,		/* xfs incore inode pointer */
> > -	int			whichfork,
> > -	struct getbmapx		*out,		/* output structure */
> > -	int			prealloced,	/* this is a file with
> > -						 * preallocated data space */
> > -	int64_t			end,		/* last block requested */
> > -	xfs_fsblock_t		startblock,
> > -	bool			moretocome)
> > +static void
> > +xfs_getbmap_report_one(
> > +	struct xfs_inode		*ip,
> > +	struct getbmapx			*bmv,
> > +	int64_t				bmv_end,
> > +	struct xfs_bmbt_irec		*got,
> > +	struct getbmapx			*p)
> >  {
> > -	int64_t			fixlen;
> > -	xfs_mount_t		*mp;		/* file system mount point */
> > -	xfs_ifork_t		*ifp;		/* inode fork pointer */
> > -	xfs_extnum_t		lastx;		/* last extent pointer */
> > -	xfs_fileoff_t		fileblock;
> > -
> > -	if (startblock == HOLESTARTBLOCK) {
> > -		mp = ip->i_mount;
> > -		out->bmv_block = -1;
> > -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
> > -		fixlen -= out->bmv_offset;
> > -		if (prealloced && out->bmv_offset + out->bmv_length == end) {
> > -			/* Came to hole at EOF. Trim it. */
> > -			if (fixlen <= 0)
> > -				return 0;
> > -			out->bmv_length = fixlen;
> > -		}
> > +	if (isnullstartblock(got->br_startblock) ||
> > +	    got->br_startblock == DELAYSTARTBLOCK) {
> > +		/*
> > +		 * Delalloc extents that start beyond EOF can occur due to
> > +		 * speculative EOF allocation when the delalloc extent is larger
> > +		 * than the largest freespace extent at conversion time.  These
> > +		 * extents cannot be converted by data writeback, so can exist
> > +		 * here even if we are not supposed to be finding delalloc
> > +		 * extents.
> > +		 */
> > +		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
> > +			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
> > +
> > +		p->bmv_oflags |= BMV_OF_DELALLOC;
> > +		p->bmv_block = -2;
> >  	} else {
> > -		if (startblock == DELAYSTARTBLOCK)
> > -			out->bmv_block = -2;
> > -		else
> > -			out->bmv_block = xfs_fsb_to_db(ip, startblock);
> > -		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
> > -		ifp = XFS_IFORK_PTR(ip, whichfork);
> > -		if (!moretocome &&
> > -		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
> > -		   (lastx == xfs_iext_count(ifp) - 1))
> > -			out->bmv_oflags |= BMV_OF_LAST;
> > +		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
> >  	}
> >  
> > -	return 1;
> > +	if (got->br_state == XFS_EXT_UNWRITTEN &&
> > +	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
> > +		p->bmv_oflags |= BMV_OF_PREALLOC;
> > +
> > +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
> > +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
> > +
> > +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> > +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> > +	bmv->bmv_entries++;
> >  }
> >  
> > -/* Adjust the reported bmap around shared/unshared extent transitions. */
> > -STATIC int
> > -xfs_getbmap_adjust_shared(
> > +static void
> > +xfs_getbmap_report_hole(
> >  	struct xfs_inode		*ip,
> > -	int				whichfork,
> > -	struct xfs_bmbt_irec		*map,
> > -	struct getbmapx			*out,
> > -	struct xfs_bmbt_irec		*next_map)
> > +	struct getbmapx			*bmv,
> > +	int64_t				bmv_end,
> > +	xfs_fileoff_t			bno,
> > +	xfs_fileoff_t			end,
> > +	struct getbmapx			*p)
> >  {
> > -	struct xfs_mount		*mp = ip->i_mount;
> > -	xfs_agnumber_t			agno;
> > -	xfs_agblock_t			agbno;
> > -	xfs_agblock_t			ebno;
> > -	xfs_extlen_t			elen;
> > -	xfs_extlen_t			nlen;
> > -	int				error;
> > +	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
> > +		return;
> >  
> > -	next_map->br_startblock = NULLFSBLOCK;
> > -	next_map->br_startoff = NULLFILEOFF;
> > -	next_map->br_blockcount = 0;
> > +	p->bmv_block = -1;
> > +	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
> > +	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
> >  
> > -	/* Only written data blocks can be shared. */
> > -	if (!xfs_is_reflink_inode(ip) ||
> > -	    whichfork != XFS_DATA_FORK ||
> > -	    !xfs_bmap_is_real_extent(map))
> > -		return 0;
> > -
> > -	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
> > -	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
> > -	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
> > -			map->br_blockcount, &ebno, &elen, true);
> > -	if (error)
> > -		return error;
> > -
> > -	if (ebno == NULLAGBLOCK) {
> > -		/* No shared blocks at all. */
> > -		return 0;
> > -	} else if (agbno == ebno) {
> > -		/*
> > -		 * Shared extent at (agbno, elen).  Shrink the reported
> > -		 * extent length and prepare to move the start of map[i]
> > -		 * to agbno+elen, with the aim of (re)formatting the new
> > -		 * map[i] the next time through the inner loop.
> > -		 */
> > -		out->bmv_length = XFS_FSB_TO_BB(mp, elen);
> > -		out->bmv_oflags |= BMV_OF_SHARED;
> > -		if (elen != map->br_blockcount) {
> > -			*next_map = *map;
> > -			next_map->br_startblock += elen;
> > -			next_map->br_startoff += elen;
> > -			next_map->br_blockcount -= elen;
> > -		}
> > -		map->br_blockcount -= elen;
> > -	} else {
> > -		/*
> > -		 * There's an unshared extent (agbno, ebno - agbno)
> > -		 * followed by shared extent at (ebno, elen).  Shrink
> > -		 * the reported extent length to cover only the unshared
> > -		 * extent and prepare to move up the start of map[i] to
> > -		 * ebno, with the aim of (re)formatting the new map[i]
> > -		 * the next time through the inner loop.
> > -		 */
> > -		*next_map = *map;
> > -		nlen = ebno - agbno;
> > -		out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
> > -		next_map->br_startblock += nlen;
> > -		next_map->br_startoff += nlen;
> > -		next_map->br_blockcount -= nlen;
> > -		map->br_blockcount -= nlen;
> > -	}
> > +	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
> > +	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
> > +	bmv->bmv_entries++;
> > +}
> >  
> > -	return 0;
> > +static inline bool xfs_getbmap_full(struct getbmapx *bmv, int nr_entries)
> > +{
> > +	return bmv->bmv_length == 0 || nr_entries >= bmv->bmv_count - 1;
> >  }
> >  
> >  /*
> > @@ -539,119 +483,72 @@ xfs_getbmap(
> >  	xfs_bmap_format_t	formatter,	/* format to user */
> >  	void			*arg)		/* formatter arg */
> >  {
> > -	int64_t			bmvend;		/* last block requested */
> > -	int			error = 0;	/* return value */
> > -	int64_t			fixlen;		/* length for -1 case */
> > -	int			i;		/* extent number */
> > -	int			lock;		/* lock state */
> > -	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
> > -	xfs_mount_t		*mp;		/* file system mount point */
> > -	int			nex;		/* # of user extents can do */
> > -	int			subnex;		/* # of bmapi's can do */
> > -	int			nmap;		/* number of map entries */
> > -	struct getbmapx		*out;		/* output structure */
> > -	int			whichfork;	/* data or attr fork */
> > -	int			prealloced;	/* this is a file with
> > -						 * preallocated data space */
> > -	int			iflags;		/* interface flags */
> > -	int			bmapi_flags;	/* flags for xfs_bmapi */
> > -	int			cur_ext = 0;
> > -	struct xfs_bmbt_irec	inject_map;
> > -
> > -	mp = ip->i_mount;
> > -	iflags = bmv->bmv_iflags;
> > +	struct xfs_mount	*mp = ip->i_mount;
> > +	int			iflags = bmv->bmv_iflags;
> > +	int			whichfork, lock, i, nr_entries = 0, error = 0;
> > +	int64_t			bmv_end, max_len;
> > +	xfs_fileoff_t		bno, first_bno;
> > +	struct xfs_ifork	*ifp;
> > +	struct getbmapx		*out;
> > +	struct xfs_bmbt_irec	got, rec;
> > +	xfs_filblks_t		len;
> > +	xfs_extnum_t		idx;
> >  
> >  #ifndef DEBUG
> >  	/* Only allow CoW fork queries if we're debugging. */
> >  	if (iflags & BMV_IF_COWFORK)
> >  		return -EINVAL;
> >  #endif
> > +
> >  	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
> >  		return -EINVAL;
> >  
> > +	if (bmv->bmv_count <= 1)
> > +		return -EINVAL;
> > +	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> > +		return -ENOMEM;
> > +
> > +	if (bmv->bmv_length < -1)
> > +		return -EINVAL;
> > +
> > +	bmv->bmv_entries = 0;
> > +	if (bmv->bmv_length == 0)
> > +		return 0;
> > +
> > +	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> > +	if (!out)
> > +		return -ENOMEM;
> > +
> >  	if (iflags & BMV_IF_ATTRFORK)
> >  		whichfork = XFS_ATTR_FORK;
> >  	else if (iflags & BMV_IF_COWFORK)
> >  		whichfork = XFS_COW_FORK;
> >  	else
> >  		whichfork = XFS_DATA_FORK;
> > +	ifp = XFS_IFORK_PTR(ip, whichfork);
> >  
> > +	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> >  	switch (whichfork) {
> >  	case XFS_ATTR_FORK:
> > -		if (XFS_IFORK_Q(ip)) {
> > -			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
> > -			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
> > -			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
> > -				return -EINVAL;
> > -		} else if (unlikely(
> > -			   ip->i_d.di_aformat != 0 &&
> > -			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
> > -			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
> > -					 ip->i_mount);
> > -			return -EFSCORRUPTED;
> > -		}
> > +		if (!XFS_IFORK_Q(ip))
> > +			goto out_unlock_iolock;
> >  
> > -		prealloced = 0;
> > -		fixlen = 1LL << 32;
> > +		max_len = 1LL << 32;
> > +		lock = xfs_ilock_attr_map_shared(ip);
> >  		break;
> >  	case XFS_COW_FORK:
> > -		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
> > -			return -EINVAL;
> > +		/* No CoW fork? Just return */
> > +		if (!ifp)
> > +			goto out_unlock_iolock;
> >  
> > -		if (xfs_get_cowextsz_hint(ip)) {
> > -			prealloced = 1;
> > -			fixlen = mp->m_super->s_maxbytes;
> > -		} else {
> > -			prealloced = 0;
> > -			fixlen = XFS_ISIZE(ip);
> > -		}
> > -		break;
> > -	default:
> > -		/* Local format data forks report no extents. */
> > -		if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
> > -			bmv->bmv_entries = 0;
> > -			return 0;
> > -		}
> > -		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
> > -		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
> > -			return -EINVAL;
> > +		if (xfs_get_cowextsz_hint(ip))
> > +			max_len = mp->m_super->s_maxbytes;
> > +		else
> > +			max_len = XFS_ISIZE(ip);
> >  
> > -		if (xfs_get_extsz_hint(ip) ||
> > -		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
> > -			prealloced = 1;
> > -			fixlen = mp->m_super->s_maxbytes;
> > -		} else {
> > -			prealloced = 0;
> > -			fixlen = XFS_ISIZE(ip);
> > -		}
> > +		lock = XFS_ILOCK_SHARED;
> > +		xfs_ilock(ip, lock);
> >  		break;
> > -	}
> > -
> > -	if (bmv->bmv_length == -1) {
> > -		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
> > -		bmv->bmv_length =
> > -			max_t(int64_t, fixlen - bmv->bmv_offset, 0);
> > -	} else if (bmv->bmv_length == 0) {
> > -		bmv->bmv_entries = 0;
> > -		return 0;
> > -	} else if (bmv->bmv_length < 0) {
> > -		return -EINVAL;
> > -	}
> > -
> > -	nex = bmv->bmv_count - 1;
> > -	if (nex <= 0)
> > -		return -EINVAL;
> > -	bmvend = bmv->bmv_offset + bmv->bmv_length;
> > -
> > -
> > -	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
> > -		return -ENOMEM;
> > -	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
> > -	if (!out)
> > -		return -ENOMEM;
> > -
> > -	xfs_ilock(ip, XFS_IOLOCK_SHARED);
> > -	switch (whichfork) {
> >  	case XFS_DATA_FORK:
> >  		if (!(iflags & BMV_IF_DELALLOC) &&
> >  		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
> > @@ -669,147 +566,121 @@ xfs_getbmap(
> >  			 */
> >  		}
> >  
> > +		if (xfs_get_extsz_hint(ip) ||
> > +		    (ip->i_d.di_flags &
> > +		     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
> > +			max_len = mp->m_super->s_maxbytes;
> > +		else
> > +			max_len = XFS_ISIZE(ip);
> > +
> >  		lock = xfs_ilock_data_map_shared(ip);
> >  		break;
> > -	case XFS_COW_FORK:
> > -		lock = XFS_ILOCK_SHARED;
> > -		xfs_ilock(ip, lock);
> > -		break;
> > -	case XFS_ATTR_FORK:
> > -		lock = xfs_ilock_attr_map_shared(ip);
> > +	}
> > +
> > +	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
> > +	case XFS_DINODE_FMT_EXTENTS:
> > +	case XFS_DINODE_FMT_BTREE:
> >  		break;
> > +	case XFS_DINODE_FMT_LOCAL:
> > +		/* Local format inode forks report no extents. */
> > +		goto out_unlock_ilock;
> > +	default:
> > +		error = -EINVAL;
> > +		goto out_unlock_ilock;
> >  	}
> >  
> > -	/*
> > -	 * Don't let nex be bigger than the number of extents
> > -	 * we can have assuming alternating holes and real extents.
> > -	 */
> > -	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
> > -		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
> > +	if (bmv->bmv_length == -1) {
> > +		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
> > +		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
> > +	}
> >  
> > -	bmapi_flags = xfs_bmapi_aflag(whichfork);
> > -	if (!(iflags & BMV_IF_PREALLOC))
> > -		bmapi_flags |= XFS_BMAPI_IGSTATE;
> > +	bmv_end = bmv->bmv_offset + bmv->bmv_length;
> >  
> > -	/*
> > -	 * Allocate enough space to handle "subnex" maps at a time.
> > -	 */
> > -	error = -ENOMEM;
> > -	subnex = 16;
> > -	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
> > -	if (!map)
> > +	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
> > +	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
> > +
> > +	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
> > +		error = xfs_iread_extents(NULL, ip, whichfork);
> > +		if (error)
> > +			goto out_unlock_ilock;
> > +	}
> > +
> > +	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
> >  		goto out_unlock_ilock;
> >  
> > -	bmv->bmv_entries = 0;
> > +	while (!xfs_getbmap_full(bmv, nr_entries)) {
> > +		struct getbmapx		*p = &out[nr_entries];
> >  
> > -	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
> > -	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
> > -		error = 0;
> > -		goto out_free_map;
> > -	}
> > +		xfs_trim_extent(&got, first_bno, len);
> >  
> > -	do {
> > -		nmap = (nex> subnex) ? subnex : nex;
> > -		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
> > -				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
> > -				       map, &nmap, bmapi_flags);
> > -		if (error)
> > -			goto out_free_map;
> > -		ASSERT(nmap <= subnex);
> > -
> > -		for (i = 0; i < nmap && bmv->bmv_length &&
> > -				cur_ext < bmv->bmv_count - 1; i++) {
> > -			out[cur_ext].bmv_oflags = 0;
> > -			if (map[i].br_state == XFS_EXT_UNWRITTEN)
> > -				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
> > -			else if (map[i].br_startblock == DELAYSTARTBLOCK)
> > -				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
> > -			out[cur_ext].bmv_offset =
> > -				XFS_FSB_TO_BB(mp, map[i].br_startoff);
> > -			out[cur_ext].bmv_length =
> > -				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
> > -			out[cur_ext].bmv_unused1 = 0;
> > -			out[cur_ext].bmv_unused2 = 0;
> > +		/*
> > +		 * Report an entry for a hole if this extent doesn't directly
> > +		 * follow the previous one.
> > +		 */
> > +		if (got.br_startoff > bno) {
> > +			xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
> > +					got.br_startoff, p++);
> > +			if (xfs_getbmap_full(bmv, ++nr_entries))
> > +				break;
> > +		}
> >  
> > -			/*
> > -			 * delayed allocation extents that start beyond EOF can
> > -			 * occur due to speculative EOF allocation when the
> > -			 * delalloc extent is larger than the largest freespace
> > -			 * extent at conversion time. These extents cannot be
> > -			 * converted by data writeback, so can exist here even
> > -			 * if we are not supposed to be finding delalloc
> > -			 * extents.
> > -			 */
> > -			if (map[i].br_startblock == DELAYSTARTBLOCK &&
> > -			    map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
> > -				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
> > -
> > -                        if (map[i].br_startblock == HOLESTARTBLOCK &&
> > -			    whichfork == XFS_ATTR_FORK) {
> > -				/* came to the end of attribute fork */
> > -				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
> > -				goto out_free_map;
> > -			}
> > +		/*
> > +		 * In order to report shared extents accurately, we report each
> > +		 * distinct shared / unshared part of a single bmbt record with
> > +		 * an individual getbmapx record.
> > +		 */
> > +		rec = got;
> > +		for (;;) {
> 
> while (!xfs_getbmap_full()) ?

Yeah.

> > -			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
> > -					&out[cur_ext], prealloced, bmvend,
> > -					map[i].br_startblock,
> > -					inject_map.br_startblock != NULLFSBLOCK))
> > -				goto out_free_map;
> > +			xfs_getbmap_report_one(ip, bmv, bmv_end, &rec, p);
> > +			if (shared)
> > +				p->bmv_oflags |= BMV_OF_SHARED;
> 
> Shouldn't we advance p/nr_entries?  What if we have a single partially
> shared extent?  Also, what's the difference between nr_entries and
> bmv->bmv_entries?  They both track the number of bmv entries we've
> filled out, right?

We should, but I messed it up.  And no test caught it..

> > +		if (!xfs_iext_get_extent(ifp, ++idx, &got)) {
> > +			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
> > +
> > +			p->bmv_oflags |= BMV_OF_LAST;
> 
> Isn't BMV_OF_LAST supposed to be set only on the last extent of the
> dataset returned?  If I ask for the mappings for blocks 100-200 and
> there's a hole from 190-200, shouldn't OF_LAST be set on the 190-200
> extent?

BMV_OF_LAST doesn't seem to be documented, and isn't checked by
any userspace I could find.  But my interpretation was that it's
supposed to be set on the last extent of a file, based on:

#define BMV_OF_LAST             0x4     /* segment is the last in the file */

now of course we could argue whether a hole is an extent in a file;
by any normal definition it isn't, but in terms of getbmap it
could be considered one.

The old code did this to set it:

	if (startblock == HOLESTARTBLOCK) {
		...
	} else {
		....
		if (!moretocome &&
		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
		   (lastx == xfs_iext_count(ifp) - 1))
			out->bmv_oflags |= BMV_OF_LAST;
	}

which means it sets it on the last actually allocated extent in a file
for the general case.

But it also does a weird thing for holes in attr forks where it sets
BMV_OF_LAST on an entry that is beyond bmv_entries and this should
not even be considered valid by the caller.

--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Aug. 29, 2017, 2:41 p.m. UTC | #6
On Mon, Aug 28, 2017 at 02:01:25PM -0700, Darrick J. Wong wrote:
> Aha, I see, it's only if you pass in -e to xfs_bmap then you get a hole record:

Good luck passing -e to xfs_bmap:

root@brick:/home/hch/work/xfsprogs# xfs_bmap -e foo
Illegal option -e
Usage: xfs_bmap [-adlpvV] [-n nx] file...

:)


> 
> $ truncate -s 50m /storage/tmp/a
> $ xfs_io -c 'bmap -elpv' /storage/tmp/a
> /storage/tmp/a:
>  EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET         TOTAL FLAGS
>    0: [0..102399]:     hole                                 102400
> $ xfs_bmap /storage/tmp/a
> /storage/tmp/a: no extents
> $ xfs_io -c 'bmap -lpv' /storage/tmp/a
> /storage/tmp/a: no extents
> 
> (eesh.)

But only if there are no other extents:

root@brick:/home/hch/work/xfsprogs# truncate -s 50m foo
root@brick:/home/hch/work/xfsprogs# xfs_io -c 'bmap -elpv' foo
foo:
 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET         TOTAL FLAGS
   0: [0..102399]:     hole                                 102400
root@brick:/home/hch/work/xfsprogs# echo bar > foo
root@brick:/home/hch/work/xfsprogs# xfs_io -c 'bmap -elpv' foo
foo:
 EXT: FILE-OFFSET      BLOCK-RANGE          AG AG-OFFSET        TOTAL FLAGS
   0: [0..7]:          730691408..730691415  3 (390992..390999)     8 000000

I think we need to actually define getbmap semantics, as it seems
to be a collection of accidental corner cases..
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 93e955262d07..5962f119d4ff 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -404,125 +404,69 @@  xfs_bmap_count_blocks(
 	return 0;
 }
 
-/*
- * returns 1 for success, 0 if we failed to map the extent.
- */
-STATIC int
-xfs_getbmapx_fix_eof_hole(
-	xfs_inode_t		*ip,		/* xfs incore inode pointer */
-	int			whichfork,
-	struct getbmapx		*out,		/* output structure */
-	int			prealloced,	/* this is a file with
-						 * preallocated data space */
-	int64_t			end,		/* last block requested */
-	xfs_fsblock_t		startblock,
-	bool			moretocome)
+static void
+xfs_getbmap_report_one(
+	struct xfs_inode		*ip,
+	struct getbmapx			*bmv,
+	int64_t				bmv_end,
+	struct xfs_bmbt_irec		*got,
+	struct getbmapx			*p)
 {
-	int64_t			fixlen;
-	xfs_mount_t		*mp;		/* file system mount point */
-	xfs_ifork_t		*ifp;		/* inode fork pointer */
-	xfs_extnum_t		lastx;		/* last extent pointer */
-	xfs_fileoff_t		fileblock;
-
-	if (startblock == HOLESTARTBLOCK) {
-		mp = ip->i_mount;
-		out->bmv_block = -1;
-		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
-		fixlen -= out->bmv_offset;
-		if (prealloced && out->bmv_offset + out->bmv_length == end) {
-			/* Came to hole at EOF. Trim it. */
-			if (fixlen <= 0)
-				return 0;
-			out->bmv_length = fixlen;
-		}
+	if (isnullstartblock(got->br_startblock) ||
+	    got->br_startblock == DELAYSTARTBLOCK) {
+		/*
+		 * Delalloc extents that start beyond EOF can occur due to
+		 * speculative EOF allocation when the delalloc extent is larger
+		 * than the largest freespace extent at conversion time.  These
+		 * extents cannot be converted by data writeback, so can exist
+		 * here even if we are not supposed to be finding delalloc
+		 * extents.
+		 */
+		if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
+			ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
+
+		p->bmv_oflags |= BMV_OF_DELALLOC;
+		p->bmv_block = -2;
 	} else {
-		if (startblock == DELAYSTARTBLOCK)
-			out->bmv_block = -2;
-		else
-			out->bmv_block = xfs_fsb_to_db(ip, startblock);
-		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
-		ifp = XFS_IFORK_PTR(ip, whichfork);
-		if (!moretocome &&
-		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
-		   (lastx == xfs_iext_count(ifp) - 1))
-			out->bmv_oflags |= BMV_OF_LAST;
+		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
 	}
 
-	return 1;
+	if (got->br_state == XFS_EXT_UNWRITTEN &&
+	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
+		p->bmv_oflags |= BMV_OF_PREALLOC;
+
+	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
+	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);
+
+	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+	bmv->bmv_entries++;
 }
 
-/* Adjust the reported bmap around shared/unshared extent transitions. */
-STATIC int
-xfs_getbmap_adjust_shared(
+static void
+xfs_getbmap_report_hole(
 	struct xfs_inode		*ip,
-	int				whichfork,
-	struct xfs_bmbt_irec		*map,
-	struct getbmapx			*out,
-	struct xfs_bmbt_irec		*next_map)
+	struct getbmapx			*bmv,
+	int64_t				bmv_end,
+	xfs_fileoff_t			bno,
+	xfs_fileoff_t			end,
+	struct getbmapx			*p)
 {
-	struct xfs_mount		*mp = ip->i_mount;
-	xfs_agnumber_t			agno;
-	xfs_agblock_t			agbno;
-	xfs_agblock_t			ebno;
-	xfs_extlen_t			elen;
-	xfs_extlen_t			nlen;
-	int				error;
+	if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
+		return;
 
-	next_map->br_startblock = NULLFSBLOCK;
-	next_map->br_startoff = NULLFILEOFF;
-	next_map->br_blockcount = 0;
+	p->bmv_block = -1;
+	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
+	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
 
-	/* Only written data blocks can be shared. */
-	if (!xfs_is_reflink_inode(ip) ||
-	    whichfork != XFS_DATA_FORK ||
-	    !xfs_bmap_is_real_extent(map))
-		return 0;
-
-	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
-	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
-	error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
-			map->br_blockcount, &ebno, &elen, true);
-	if (error)
-		return error;
-
-	if (ebno == NULLAGBLOCK) {
-		/* No shared blocks at all. */
-		return 0;
-	} else if (agbno == ebno) {
-		/*
-		 * Shared extent at (agbno, elen).  Shrink the reported
-		 * extent length and prepare to move the start of map[i]
-		 * to agbno+elen, with the aim of (re)formatting the new
-		 * map[i] the next time through the inner loop.
-		 */
-		out->bmv_length = XFS_FSB_TO_BB(mp, elen);
-		out->bmv_oflags |= BMV_OF_SHARED;
-		if (elen != map->br_blockcount) {
-			*next_map = *map;
-			next_map->br_startblock += elen;
-			next_map->br_startoff += elen;
-			next_map->br_blockcount -= elen;
-		}
-		map->br_blockcount -= elen;
-	} else {
-		/*
-		 * There's an unshared extent (agbno, ebno - agbno)
-		 * followed by shared extent at (ebno, elen).  Shrink
-		 * the reported extent length to cover only the unshared
-		 * extent and prepare to move up the start of map[i] to
-		 * ebno, with the aim of (re)formatting the new map[i]
-		 * the next time through the inner loop.
-		 */
-		*next_map = *map;
-		nlen = ebno - agbno;
-		out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
-		next_map->br_startblock += nlen;
-		next_map->br_startoff += nlen;
-		next_map->br_blockcount -= nlen;
-		map->br_blockcount -= nlen;
-	}
+	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
+	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
+	bmv->bmv_entries++;
+}
 
-	return 0;
+static inline bool xfs_getbmap_full(struct getbmapx *bmv, int nr_entries)
+{
+	return bmv->bmv_length == 0 || nr_entries >= bmv->bmv_count - 1;
 }
 
 /*
@@ -539,119 +483,72 @@  xfs_getbmap(
 	xfs_bmap_format_t	formatter,	/* format to user */
 	void			*arg)		/* formatter arg */
 {
-	int64_t			bmvend;		/* last block requested */
-	int			error = 0;	/* return value */
-	int64_t			fixlen;		/* length for -1 case */
-	int			i;		/* extent number */
-	int			lock;		/* lock state */
-	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
-	xfs_mount_t		*mp;		/* file system mount point */
-	int			nex;		/* # of user extents can do */
-	int			subnex;		/* # of bmapi's can do */
-	int			nmap;		/* number of map entries */
-	struct getbmapx		*out;		/* output structure */
-	int			whichfork;	/* data or attr fork */
-	int			prealloced;	/* this is a file with
-						 * preallocated data space */
-	int			iflags;		/* interface flags */
-	int			bmapi_flags;	/* flags for xfs_bmapi */
-	int			cur_ext = 0;
-	struct xfs_bmbt_irec	inject_map;
-
-	mp = ip->i_mount;
-	iflags = bmv->bmv_iflags;
+	struct xfs_mount	*mp = ip->i_mount;
+	int			iflags = bmv->bmv_iflags;
+	int			whichfork, lock, i, nr_entries = 0, error = 0;
+	int64_t			bmv_end, max_len;
+	xfs_fileoff_t		bno, first_bno;
+	struct xfs_ifork	*ifp;
+	struct getbmapx		*out;
+	struct xfs_bmbt_irec	got, rec;
+	xfs_filblks_t		len;
+	xfs_extnum_t		idx;
 
 #ifndef DEBUG
 	/* Only allow CoW fork queries if we're debugging. */
 	if (iflags & BMV_IF_COWFORK)
 		return -EINVAL;
 #endif
+
 	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
 		return -EINVAL;
 
+	if (bmv->bmv_count <= 1)
+		return -EINVAL;
+	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+		return -ENOMEM;
+
+	if (bmv->bmv_length < -1)
+		return -EINVAL;
+
+	bmv->bmv_entries = 0;
+	if (bmv->bmv_length == 0)
+		return 0;
+
+	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
+	if (!out)
+		return -ENOMEM;
+
 	if (iflags & BMV_IF_ATTRFORK)
 		whichfork = XFS_ATTR_FORK;
 	else if (iflags & BMV_IF_COWFORK)
 		whichfork = XFS_COW_FORK;
 	else
 		whichfork = XFS_DATA_FORK;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
 
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	switch (whichfork) {
 	case XFS_ATTR_FORK:
-		if (XFS_IFORK_Q(ip)) {
-			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
-			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
-			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
-				return -EINVAL;
-		} else if (unlikely(
-			   ip->i_d.di_aformat != 0 &&
-			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
-			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
-					 ip->i_mount);
-			return -EFSCORRUPTED;
-		}
+		if (!XFS_IFORK_Q(ip))
+			goto out_unlock_iolock;
 
-		prealloced = 0;
-		fixlen = 1LL << 32;
+		max_len = 1LL << 32;
+		lock = xfs_ilock_attr_map_shared(ip);
 		break;
 	case XFS_COW_FORK:
-		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
-			return -EINVAL;
+		/* No CoW fork? Just return */
+		if (!ifp)
+			goto out_unlock_iolock;
 
-		if (xfs_get_cowextsz_hint(ip)) {
-			prealloced = 1;
-			fixlen = mp->m_super->s_maxbytes;
-		} else {
-			prealloced = 0;
-			fixlen = XFS_ISIZE(ip);
-		}
-		break;
-	default:
-		/* Local format data forks report no extents. */
-		if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-			bmv->bmv_entries = 0;
-			return 0;
-		}
-		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
-		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
-			return -EINVAL;
+		if (xfs_get_cowextsz_hint(ip))
+			max_len = mp->m_super->s_maxbytes;
+		else
+			max_len = XFS_ISIZE(ip);
 
-		if (xfs_get_extsz_hint(ip) ||
-		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
-			prealloced = 1;
-			fixlen = mp->m_super->s_maxbytes;
-		} else {
-			prealloced = 0;
-			fixlen = XFS_ISIZE(ip);
-		}
+		lock = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, lock);
 		break;
-	}
-
-	if (bmv->bmv_length == -1) {
-		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
-		bmv->bmv_length =
-			max_t(int64_t, fixlen - bmv->bmv_offset, 0);
-	} else if (bmv->bmv_length == 0) {
-		bmv->bmv_entries = 0;
-		return 0;
-	} else if (bmv->bmv_length < 0) {
-		return -EINVAL;
-	}
-
-	nex = bmv->bmv_count - 1;
-	if (nex <= 0)
-		return -EINVAL;
-	bmvend = bmv->bmv_offset + bmv->bmv_length;
-
-
-	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
-		return -ENOMEM;
-	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
-	if (!out)
-		return -ENOMEM;
-
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	switch (whichfork) {
 	case XFS_DATA_FORK:
 		if (!(iflags & BMV_IF_DELALLOC) &&
 		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
@@ -669,147 +566,121 @@  xfs_getbmap(
 			 */
 		}
 
+		if (xfs_get_extsz_hint(ip) ||
+		    (ip->i_d.di_flags &
+		     (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))
+			max_len = mp->m_super->s_maxbytes;
+		else
+			max_len = XFS_ISIZE(ip);
+
 		lock = xfs_ilock_data_map_shared(ip);
 		break;
-	case XFS_COW_FORK:
-		lock = XFS_ILOCK_SHARED;
-		xfs_ilock(ip, lock);
-		break;
-	case XFS_ATTR_FORK:
-		lock = xfs_ilock_attr_map_shared(ip);
+	}
+
+	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
 		break;
+	case XFS_DINODE_FMT_LOCAL:
+		/* Local format inode forks report no extents. */
+		goto out_unlock_ilock;
+	default:
+		error = -EINVAL;
+		goto out_unlock_ilock;
 	}
 
-	/*
-	 * Don't let nex be bigger than the number of extents
-	 * we can have assuming alternating holes and real extents.
-	 */
-	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
-		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+	if (bmv->bmv_length == -1) {
+		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
+		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
+	}
 
-	bmapi_flags = xfs_bmapi_aflag(whichfork);
-	if (!(iflags & BMV_IF_PREALLOC))
-		bmapi_flags |= XFS_BMAPI_IGSTATE;
+	bmv_end = bmv->bmv_offset + bmv->bmv_length;
 
-	/*
-	 * Allocate enough space to handle "subnex" maps at a time.
-	 */
-	error = -ENOMEM;
-	subnex = 16;
-	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
-	if (!map)
+	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
+	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, whichfork);
+		if (error)
+			goto out_unlock_ilock;
+	}
+
+	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
 		goto out_unlock_ilock;
 
-	bmv->bmv_entries = 0;
+	while (!xfs_getbmap_full(bmv, nr_entries)) {
+		struct getbmapx		*p = &out[nr_entries];
 
-	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
-	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
-		error = 0;
-		goto out_free_map;
-	}
+		xfs_trim_extent(&got, first_bno, len);
 
-	do {
-		nmap = (nex> subnex) ? subnex : nex;
-		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
-				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
-				       map, &nmap, bmapi_flags);
-		if (error)
-			goto out_free_map;
-		ASSERT(nmap <= subnex);
-
-		for (i = 0; i < nmap && bmv->bmv_length &&
-				cur_ext < bmv->bmv_count - 1; i++) {
-			out[cur_ext].bmv_oflags = 0;
-			if (map[i].br_state == XFS_EXT_UNWRITTEN)
-				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
-			else if (map[i].br_startblock == DELAYSTARTBLOCK)
-				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
-			out[cur_ext].bmv_offset =
-				XFS_FSB_TO_BB(mp, map[i].br_startoff);
-			out[cur_ext].bmv_length =
-				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-			out[cur_ext].bmv_unused1 = 0;
-			out[cur_ext].bmv_unused2 = 0;
+		/*
+		 * Report an entry for a hole if this extent doesn't directly
+		 * follow the previous one.
+		 */
+		if (got.br_startoff > bno) {
+			xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
+					got.br_startoff, p++);
+			if (xfs_getbmap_full(bmv, ++nr_entries))
+				break;
+		}
 
-			/*
-			 * delayed allocation extents that start beyond EOF can
-			 * occur due to speculative EOF allocation when the
-			 * delalloc extent is larger than the largest freespace
-			 * extent at conversion time. These extents cannot be
-			 * converted by data writeback, so can exist here even
-			 * if we are not supposed to be finding delalloc
-			 * extents.
-			 */
-			if (map[i].br_startblock == DELAYSTARTBLOCK &&
-			    map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
-				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
-
-                        if (map[i].br_startblock == HOLESTARTBLOCK &&
-			    whichfork == XFS_ATTR_FORK) {
-				/* came to the end of attribute fork */
-				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
-				goto out_free_map;
-			}
+		/*
+		 * In order to report shared extents accurately, we report each
+		 * distinct shared / unshared part of a single bmbt record with
+		 * an individual getbmapx record.
+		 */
+		rec = got;
+		for (;;) {
+			bool			shared = false, trimmed = false;
+			xfs_fileoff_t		len;
 
-			/* Is this a shared block? */
-			error = xfs_getbmap_adjust_shared(ip, whichfork,
-					&map[i], &out[cur_ext], &inject_map);
+			error = xfs_reflink_trim_around_shared(ip, &rec,
+					&shared, &trimmed);
 			if (error)
-				goto out_free_map;
+				goto out_unlock_ilock;
 
-			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
-					&out[cur_ext], prealloced, bmvend,
-					map[i].br_startblock,
-					inject_map.br_startblock != NULLFSBLOCK))
-				goto out_free_map;
+			xfs_getbmap_report_one(ip, bmv, bmv_end, &rec, p);
+			if (shared)
+				p->bmv_oflags |= BMV_OF_SHARED;
+			if (!trimmed)
+				break;
 
-			bmv->bmv_offset =
-				out[cur_ext].bmv_offset +
-				out[cur_ext].bmv_length;
-			bmv->bmv_length =
-				max_t(int64_t, 0, bmvend - bmv->bmv_offset);
+			len = got.br_startoff + got.br_blockcount -
+				(rec.br_startoff + rec.br_blockcount);
 
-			/*
-			 * In case we don't want to return the hole,
-			 * don't increase cur_ext so that we can reuse
-			 * it in the next loop.
-			 */
-			if ((iflags & BMV_IF_NO_HOLES) &&
-			    map[i].br_startblock == HOLESTARTBLOCK) {
-				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
-				continue;
-			}
+			rec.br_startoff += rec.br_blockcount;
+			if (rec.br_startblock != DELAYSTARTBLOCK)
+				rec.br_startblock += rec.br_blockcount;
+			rec.br_blockcount = len;
+		}
 
-			/*
-			 * In order to report shared extents accurately,
-			 * we report each distinct shared/unshared part
-			 * of a single bmbt record using multiple bmap
-			 * extents.  To make that happen, we iterate the
-			 * same map array item multiple times, each
-			 * time trimming out the subextent that we just
-			 * reported.
-			 *
-			 * Because of this, we must check the out array
-			 * index (cur_ext) directly against bmv_count-1
-			 * to avoid overflows.
-			 */
-			if (inject_map.br_startblock != NULLFSBLOCK) {
-				map[i] = inject_map;
-				i--;
+		bno = got.br_startoff + got.br_blockcount;
+		nr_entries++;
+
+		if (!xfs_iext_get_extent(ifp, ++idx, &got)) {
+			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+			p->bmv_oflags |= BMV_OF_LAST;
+
+			if (whichfork != XFS_ATTR_FORK && bno < end &&
+			    !xfs_getbmap_full(bmv, nr_entries)) {
+				xfs_getbmap_report_hole(ip, bmv, bmv_end, bno,
+						end, ++p);
+				nr_entries++;
 			}
-			bmv->bmv_entries++;
-			cur_ext++;
+			break;
 		}
-	} while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
 
- out_free_map:
-	kmem_free(map);
- out_unlock_ilock:
+		if (bno >= first_bno + len)
+			break;
+	}
+
+out_unlock_ilock:
 	xfs_iunlock(ip, lock);
- out_unlock_iolock:
+out_unlock_iolock:
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
-	for (i = 0; i < cur_ext; i++) {
+	for (i = 0; i < nr_entries; i++) {
 		/* format results & advance arg */
 		error = formatter(&arg, &out[i]);
 		if (error)