Message ID | 1480971924-4864-3-git-send-email-hch@lst.de (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
On Mon, Dec 05, 2016 at 10:05:23PM +0100, Christoph Hellwig wrote: > When we allocate COW fork blocks for direct I/O writes we currently first > create a delayed allocation, and then convert it to a real allocation > once we've got the delayed one. > > As there is no good reason for that this patch instead makes use call > xfs_bmapi_write from the COW allocation path. The only interesting bits > are a few tweaks the low-level allocator to allow for this, most notably > the need to remove the call to xfs_bmap_extsize_align for the cowextsize > in xfs_bmap_btalloc - for the existing convert case it's a no-op, but > for the direct allocation case it would blow up our block reservation > way beyond what we reserved for the transaction. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/libxfs/xfs_bmap.c | 9 ++--- > fs/xfs/xfs_reflink.c | 101 ++++++++++++++++++++++++++++++++++------------- > 2 files changed, 76 insertions(+), 34 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c > index 6f28814..a4a4327 100644 > --- a/fs/xfs/libxfs/xfs_bmap.c > +++ b/fs/xfs/libxfs/xfs_bmap.c > @@ -2893,13 +2893,14 @@ xfs_bmap_add_extent_hole_real( > ASSERT(!isnullstartblock(new->br_startblock)); > ASSERT(!bma->cur || > !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); > - ASSERT(whichfork != XFS_COW_FORK); > > XFS_STATS_INC(mp, xs_add_exlist); > > state = 0; > if (whichfork == XFS_ATTR_FORK) > state |= BMAP_ATTRFORK; > + if (whichfork == XFS_COW_FORK) > + state |= BMAP_COWFORK; > > /* > * Check and set flags if this segment has a left neighbor. > @@ -3619,9 +3620,7 @@ xfs_bmap_btalloc( > else if (mp->m_dalign) > stripe_align = mp->m_dalign; > > - if (ap->flags & XFS_BMAPI_COWFORK) > - align = xfs_get_cowextsz_hint(ap->ip); > - else if (xfs_alloc_is_userdata(ap->datatype)) > + if (xfs_alloc_is_userdata(ap->datatype)) Doesn't this defeat the purpose (i.e., fragmentation avoidance via wider allocations) of the cowextszhint for direct I/O? I think it would be better to be consistent with the approach for traditional I/O + extsz and incorporate the alignment into the reservation. Perhaps the hunk of code that already does just that in xfs_iomap_write_direct() could be converted to a small helper and reused..? > align = xfs_get_extsz_hint(ap->ip); > if (unlikely(align)) { > error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, > @@ -4608,8 +4607,6 @@ xfs_bmapi_write( > */ > if (flags & XFS_BMAPI_REMAP) > ASSERT(inhole); > - if (flags & XFS_BMAPI_COWFORK) > - ASSERT(!inhole); > > /* > * First, deal with the hole before the allocated space > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > index 88fd03c..0a077e80 100644 > --- a/fs/xfs/xfs_reflink.c > +++ b/fs/xfs/xfs_reflink.c > @@ -304,59 +304,104 @@ __xfs_reflink_allocate_cow( > xfs_fileoff_t end_fsb) > { > struct xfs_mount *mp = ip->i_mount; > - struct xfs_bmbt_irec imap; > + struct xfs_bmbt_irec imap, got; > struct xfs_defer_ops dfops; > struct xfs_trans *tp; > xfs_fsblock_t first_block; > - int nimaps = 1, error; > - bool shared; > + int nimaps, error, lockmode; > + bool shared, trimmed; > + xfs_extlen_t resblks, align; > + xfs_extnum_t idx; > > - xfs_defer_init(&dfops, &first_block); > + align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); > + if (align) > + end_fsb = roundup_64(end_fsb, align); > > - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, > - XFS_TRANS_RESERVE, &tp); > + resblks = XFS_DIOSTRAT_SPACE_RES(mp, end_fsb - *offset_fsb); > + > + error = xfs_qm_dqattach(ip, 0); This is already in the (only) caller. Brian > if (error) > return error; > > - xfs_ilock(ip, XFS_ILOCK_EXCL); > - > - /* Read extent from the source file. */ > - nimaps = 1; > - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, > - &imap, &nimaps, 0); > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); > if (error) > - goto out_unlock; > - ASSERT(nimaps == 1); > + return error; > > - error = xfs_reflink_reserve_cow(ip, &imap, &shared); > - if (error) > - goto out_trans_cancel; > + lockmode = XFS_ILOCK_EXCL; > + xfs_ilock(ip, lockmode); > > - if (!shared) { > - *offset_fsb = imap.br_startoff + imap.br_blockcount; > - goto out_trans_cancel; > + /* > + * Even if the extent is not shared we might have a preallocation for > + * it in the COW fork. If so use it. > + */ > + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, *offset_fsb, &idx, &got) && > + got.br_startoff <= *offset_fsb) { > + /* If we have a real allocation in the COW fork we're done. */ > + if (!isnullstartblock(got.br_startblock)) { > + xfs_trim_extent(&got, *offset_fsb, > + end_fsb - *offset_fsb); > + *offset_fsb = got.br_startoff + got.br_blockcount; > + goto out_trans_cancel; > + } > + } else { > + nimaps = 1; > + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, > + &imap, &nimaps, 0); > + if (error) > + goto out_trans_cancel; > + ASSERT(nimaps == 1); > + > + /* Trim the mapping to the nearest shared extent boundary. */ > + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, > + &trimmed); > + if (error) > + goto out_trans_cancel; > + > + if (!shared) { > + *offset_fsb = imap.br_startoff + imap.br_blockcount; > + goto out_trans_cancel; > + } > + > + *offset_fsb = imap.br_startoff; > + end_fsb = imap.br_startoff + imap.br_blockcount; > + if (align) > + end_fsb = roundup_64(end_fsb, align); > } > > - xfs_trans_ijoin(tp, ip, 0); > - error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, > - XFS_BMAPI_COWFORK, &first_block, > - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), > - &imap, &nimaps, &dfops); > + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, > + XFS_QMOPT_RES_REGBLKS); > if (error) > goto out_trans_cancel; > > + xfs_trans_ijoin(tp, ip, 0); > + > + xfs_defer_init(&dfops, &first_block); > + nimaps = 1; > + error = xfs_bmapi_write(tp, ip, *offset_fsb, end_fsb - *offset_fsb, > + XFS_BMAPI_COWFORK, &first_block, resblks, &imap, > + &nimaps, &dfops); > + if (error) > + goto out_bmap_cancel; > + > error = xfs_defer_finish(&tp, &dfops, NULL); > if (error) > - goto out_trans_cancel; > + goto out_bmap_cancel; > > error = xfs_trans_commit(tp); > + if (error) > + goto out_unlock; > > *offset_fsb = imap.br_startoff + imap.br_blockcount; > + > out_unlock: > - xfs_iunlock(ip, XFS_ILOCK_EXCL); > + xfs_iunlock(ip, lockmode); > return error; > -out_trans_cancel: > + > +out_bmap_cancel: > xfs_defer_cancel(&dfops); > + xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, > + XFS_QMOPT_RES_REGBLKS); > +out_trans_cancel: > xfs_trans_cancel(tp); > goto out_unlock; > } > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Dec 07, 2016 at 02:00:09PM -0500, Brian Foster wrote: > > - if (ap->flags & XFS_BMAPI_COWFORK) > > - align = xfs_get_cowextsz_hint(ap->ip); > > - else if (xfs_alloc_is_userdata(ap->datatype)) > > + if (xfs_alloc_is_userdata(ap->datatype)) > > Doesn't this defeat the purpose (i.e., fragmentation avoidance via wider > allocations) of the cowextszhint for direct I/O? I think it would be > better to be consistent with the approach for traditional I/O + extsz > and incorporate the alignment into the reservation. Perhaps the hunk of > code that already does just that in xfs_iomap_write_direct() could be > converted to a small helper and reused..? We're already doing the alignment to the cowextsize hint in __xfs_reflink_allocate_cow so that we can take the cowextsize into account. > > + resblks = XFS_DIOSTRAT_SPACE_RES(mp, end_fsb - *offset_fsb); > > + > > + error = xfs_qm_dqattach(ip, 0); > > This is already in the (only) caller. Yes, it can be dropped, although a superflous xfs_qm_dqattach is totally harmless anyway. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Dec 07, 2016 at 08:37:09PM +0100, Christoph Hellwig wrote: > On Wed, Dec 07, 2016 at 02:00:09PM -0500, Brian Foster wrote: > > > - if (ap->flags & XFS_BMAPI_COWFORK) > > > - align = xfs_get_cowextsz_hint(ap->ip); > > > - else if (xfs_alloc_is_userdata(ap->datatype)) > > > + if (xfs_alloc_is_userdata(ap->datatype)) > > > > Doesn't this defeat the purpose (i.e., fragmentation avoidance via wider > > allocations) of the cowextszhint for direct I/O? I think it would be > > better to be consistent with the approach for traditional I/O + extsz > > and incorporate the alignment into the reservation. Perhaps the hunk of > > code that already does just that in xfs_iomap_write_direct() could be > > converted to a small helper and reused..? > > We're already doing the alignment to the cowextsize hint in > __xfs_reflink_allocate_cow so that we can take the cowextsize into > account. > Only for end_fsb... xfs_bmap_btalloc() calls xfs_bmap_extsize_align() with the alignment, which rounds out the start and end offsets. This same end_fsb code has already been removed from xfs_reflink_reserve_cow() for similar reasons. It should probably be removed from xfs_reflink_allocate_cow() as well. Brian > > > + resblks = XFS_DIOSTRAT_SPACE_RES(mp, end_fsb - *offset_fsb); > > > + > > > + error = xfs_qm_dqattach(ip, 0); > > > > This is already in the (only) caller. > > Yes, it can be dropped, although a superflous xfs_qm_dqattach is > totally harmless anyway. > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Dec 07, 2016 at 02:46:34PM -0500, Brian Foster wrote: > On Wed, Dec 07, 2016 at 08:37:09PM +0100, Christoph Hellwig wrote: > > On Wed, Dec 07, 2016 at 02:00:09PM -0500, Brian Foster wrote: > > > > - if (ap->flags & XFS_BMAPI_COWFORK) > > > > - align = xfs_get_cowextsz_hint(ap->ip); > > > > - else if (xfs_alloc_is_userdata(ap->datatype)) > > > > + if (xfs_alloc_is_userdata(ap->datatype)) > > > > > > Doesn't this defeat the purpose (i.e., fragmentation avoidance via wider > > > allocations) of the cowextszhint for direct I/O? I think it would be > > > better to be consistent with the approach for traditional I/O + extsz > > > and incorporate the alignment into the reservation. Perhaps the hunk of > > > code that already does just that in xfs_iomap_write_direct() could be > > > converted to a small helper and reused..? > > > > We're already doing the alignment to the cowextsize hint in > > __xfs_reflink_allocate_cow so that we can take the cowextsize into > > account. > > > > Only for end_fsb... xfs_bmap_btalloc() calls xfs_bmap_extsize_align() > with the alignment, which rounds out the start and end offsets. Seconded; I finally got a chance to build a kernel with these three patches and ran xfs/214 with a tracepointed kernel; it doesn't round the start down to the previous cowextsize alignment like 4.9 does... and thus the quota accounting is off. --D > > This same end_fsb code has already been removed from > xfs_reflink_reserve_cow() for similar reasons. It should probably be > removed from xfs_reflink_allocate_cow() as well. > > Brian > > > > > + resblks = XFS_DIOSTRAT_SPACE_RES(mp, end_fsb - *offset_fsb); > > > > + > > > > + error = xfs_qm_dqattach(ip, 0); > > > > > > This is already in the (only) caller. > > > > Yes, it can be dropped, although a superflous xfs_qm_dqattach is > > totally harmless anyway. > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Dec 07, 2016 at 02:46:34PM -0500, Brian Foster wrote: > Only for end_fsb... xfs_bmap_btalloc() calls xfs_bmap_extsize_align() > with the alignment, which rounds out the start and end offsets. ... and corrupts data in the direct I/O case. The problem is that the down-alignment in xfs_bmap_extsize_align will now create a real extent that spans before the extent that we have to COW in this write_begin call. But the area before might have been a hole before the dio write that had just before been filled with an allocation in the data fork. And due to the direct I/O end_io interface that only covers the range of the whole write we don't know at that point where exactly the COW operation started and will happily splice back our front pad into the data fork, replacing the just written data with garbage. xfs/228 and sometimes generic/199 reproduce this nicely. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 09:37:32AM +0100, Christoph Hellwig wrote: > On Wed, Dec 07, 2016 at 02:46:34PM -0500, Brian Foster wrote: > > Only for end_fsb... xfs_bmap_btalloc() calls xfs_bmap_extsize_align() > > with the alignment, which rounds out the start and end offsets. > > ... and corrupts data in the direct I/O case. > > The problem is that the down-alignment in xfs_bmap_extsize_align will > now create a real extent that spans before the extent that we have > to COW in this write_begin call. But the area before might have been > a hole before the dio write that had just before been filled with > an allocation in the data fork. And due to the direct I/O end_io > interface that only covers the range of the whole write we don't > know at that point where exactly the COW operation started and will > happily splice back our front pad into the data fork, replacing > the just written data with garbage. xfs/228 and sometimes generic/199 > reproduce this nicely. Is this reproducible on the current tree or only with this patch series? Also, shouldn't the end_io handler only remap the range of the write, regardless of whether the initial allocation ended up preallocating over holes or purely a shared range? Perhaps what you are saying here is that we have a single dio write that spans wider than a shared data fork extent..? In that case, we iterate the range in xfs_reflink_reserve_cow(). We'd skip the start of the range that is a hole in the data fork, but as you say, the xfs_bmapi_reserve_delalloc() call for the part of the I/O on the shared extent can widen the COW fork allocation to before the extent in the data fork, possibly to before the start of the I/O. (Thus we end up allocating COW blocks over the hole anyways...). From there we are going to drop into iomap_dio_rw(), which looks like it's going to check the COW fork for blocks and if found, write to those blocks (as opposed to doing a data fork allocation). AFAICT, that means the extent size hint shouldn't be a problem. What am I missing? Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 08:50:45AM -0500, Brian Foster wrote: > Is this reproducible on the current tree or only with this patch series? It's only reproducible with the series modified to your review comments. > Also, shouldn't the end_io handler only remap the range of the write, > regardless of whether the initial allocation ended up preallocating over > holes or purely a shared range? The end_io handler is caller for the whole size of the write. That's mostly because we don't have an object corresponding to a write_begin call. > Perhaps what you are saying here is that we have a single dio write that > spans wider than a shared data fork extent..? Yes. > In that case, we iterate > the range in xfs_reflink_reserve_cow(). We'd skip the start of the range > that is a hole in the data fork, but as you say, the > xfs_bmapi_reserve_delalloc() call for the part of the I/O on the shared > extent can widen the COW fork allocation to before the extent in the > data fork, possibly to before the start of the I/O. (Thus we end up > allocating COW blocks over the hole anyways...). The problem is the following. We have a file with the following layout HHHHHHHHHHHHDDDDDDDDDDDDDD where H is hole and D is data. The H to D boundary is not aligned to the cowextsize. The direct I/O code now does a first pass allocating an extent for H and copies data to it. Then in the next step it goes on to D and unshares it. It then enlarges the extent into the end of the previously H range. It does however not copy data into H again, as the iomap iterator is past it. The ->end_io routine however is called for the hole range, and will move the just allocated rounding before H back into the data fork, replacing the valid data writtent just before. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 02:59:37PM +0100, Christoph Hellwig wrote: > On Tue, Jan 24, 2017 at 08:50:45AM -0500, Brian Foster wrote: > > Is this reproducible on the current tree or only with this patch series? > > It's only reproducible with the series modified to your review comments. > Ok, well then I'm probably not going to be able to follow the details well enough to try and provide constructive feedback without seeing the code. Looking back, my comments were generally about the tradeoff of bypassing the extent size hint mechanism that has been built into reflink to avoid cow fragmention. > > Also, shouldn't the end_io handler only remap the range of the write, > > regardless of whether the initial allocation ended up preallocating over > > holes or purely a shared range? > > The end_io handler is caller for the whole size of the write. That's > mostly because we don't have an object corresponding to a write_begin > call. > > > Perhaps what you are saying here is that we have a single dio write that > > spans wider than a shared data fork extent..? > > Yes. > > > In that case, we iterate > > the range in xfs_reflink_reserve_cow(). We'd skip the start of the range > > that is a hole in the data fork, but as you say, the > > xfs_bmapi_reserve_delalloc() call for the part of the I/O on the shared > > extent can widen the COW fork allocation to before the extent in the > > data fork, possibly to before the start of the I/O. (Thus we end up > > allocating COW blocks over the hole anyways...). > > The problem is the following. > > We have a file with the following layout > > > HHHHHHHHHHHHDDDDDDDDDDDDDD > > where H is hole and D is data. The H to D boundary is not aligned > to the cowextsize. > > The direct I/O code now does a first pass allocating an extent for > H and copies data to it. Then in the next step it goes on to D > and unshares it. It then enlarges the extent into the end of the > previously H range. It does however not copy data into H again, > as the iomap iterator is past it. The ->end_io routine however > is called for the hole range, and will move the just allocated > rounding before H back into the data fork, replacing the valid data > writtent just before. > Without seeing the code, perhaps we need to pull up the cow extent size hint mechanism from the bmapi layer to something similar to how xfs_iomap_direct_write() handles the traditional extent size hints..? That may allow us to more intelligently consider the current state across the data and cow forks in such cases (to not preallocate over existing blocks, for example, without having to kill off the extent size hint entirely). Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 10:02:22AM -0500, Brian Foster wrote: > Ok, well then I'm probably not going to be able to follow the details > well enough to try and provide constructive feedback without seeing the > code. Looking back, my comments were generally about the tradeoff of > bypassing the extent size hint mechanism that has been built into > reflink to avoid cow fragmention. Ok - really very little change here - only the call to the extsize alignment in xfs_bmap_btalloc reinstated and the according reservation changes in the caller. Note that with the previously posted series there is no change in handling the cowextsize hint for real on-disk allocations. While the current code rounds down based on the cowextsize for creating the delayed extent we will never convert the alignment before the write start to a real extent - it will just get cleaned up later at inode eviction time or using the timer. > Without seeing the code, perhaps we need to pull up the cow extent size > hint mechanism from the bmapi layer to something similar to how > xfs_iomap_direct_write() handles the traditional extent size hints..? > That may allow us to more intelligently consider the current state > across the data and cow forks in such cases (to not preallocate over > existing blocks, for example, without having to kill off the extent size > hint entirely). We could. On the other hand I'd love to get the current series in first as the only thing it change in behavior is not allocating additional delayed extent space that never gets used, and not writing data twice if it's sub-blocksize, all of which seem like a clear improvement. And it's also the base for my pending DAX reflink support. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 04:09:59PM +0100, Christoph Hellwig wrote: > On Tue, Jan 24, 2017 at 10:02:22AM -0500, Brian Foster wrote: > > Ok, well then I'm probably not going to be able to follow the details > > well enough to try and provide constructive feedback without seeing the > > code. Looking back, my comments were generally about the tradeoff of > > bypassing the extent size hint mechanism that has been built into > > reflink to avoid cow fragmention. > > Ok - really very little change here - only the call to the extsize > alignment in xfs_bmap_btalloc reinstated and the according reservation > changes in the caller. > > Note that with the previously posted series there is no change in > handling the cowextsize hint for real on-disk allocations. While > the current code rounds down based on the cowextsize for creating > the delayed extent we will never convert the alignment before the > write start to a real extent - it will just get cleaned up later > at inode eviction time or using the timer. > I thought that while not necessarily guaranteed, generally the entire extent gets converted from delalloc to real blocks. IIRC, that's what I've seen in the past when looking into the cow fork with bmap. After all, isn't that the point of the extent size hint? Allocate wider than the write to accommodate potential subsequent writes into a more contiguous range. > > Without seeing the code, perhaps we need to pull up the cow extent size > > hint mechanism from the bmapi layer to something similar to how > > xfs_iomap_direct_write() handles the traditional extent size hints..? > > That may allow us to more intelligently consider the current state > > across the data and cow forks in such cases (to not preallocate over > > existing blocks, for example, without having to kill off the extent size > > hint entirely). > > We could. On the other hand I'd love to get the current series in > first as the only thing it change in behavior is not allocating > additional delayed extent space that never gets used, and not writing > data twice if it's sub-blocksize, all of which seem like a clear > improvement. And it's also the base for my pending DAX reflink support. Fair enough, that's a different discussion. Personally, I'd prefer to fix up the stuff we know that needs fixing before piling more stuff on top. Particularly since something like cow I/O to a reflinked vm image file (taking full advantage of cowextszhint) might be a more common use case than DAX reflink at the moment. All of this stuff is "experimental," however, so I don't feel strongly about the order so long as we agree the regression can/should be fixed up. So I'll defer to the maintainer on that one. ;) Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 11:17:20AM -0500, Brian Foster wrote: > I thought that while not necessarily guaranteed, generally the entire > extent gets converted from delalloc to real blocks. For buffered I/O that's the case. See the discussion on my recent xfs_bmapi_write patch. > IIRC, that's what > I've seen in the past when looking into the cow fork with bmap. After > all, isn't that the point of the extent size hint? Allocate wider than > the write to accommodate potential subsequent writes into a more > contiguous range. Well, for direct I/O that's not what the current code does. Implementing it might be useful, but I'm not sure how much the front alignment is going to help in usual worksloads - you'd need a backwards write or random writes that happen to look almost backwards for it to make a difference. I suspect most of them time we'd just allocate blocks to reclaim them again a little later. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 05:21:56PM +0100, Christoph Hellwig wrote: > On Tue, Jan 24, 2017 at 11:17:20AM -0500, Brian Foster wrote: > > I thought that while not necessarily guaranteed, generally the entire > > extent gets converted from delalloc to real blocks. > > For buffered I/O that's the case. See the discussion on my recent > xfs_bmapi_write patch. > Yeah, that's the behavior I would have expected... > > IIRC, that's what > > I've seen in the past when looking into the cow fork with bmap. After > > all, isn't that the point of the extent size hint? Allocate wider than > > the write to accommodate potential subsequent writes into a more > > contiguous range. > > Well, for direct I/O that's not what the current code does. Implementing > it might be useful, but I'm not sure how much the front alignment is > going to help in usual worksloads - you'd need a backwards write or > random writes that happen to look almost backwards for it to make > a difference. I suspect most of them time we'd just allocate blocks to > reclaim them again a little later. Hmm, that's not what I'm seeing (not that it really matters, but I'm curious if I'm missing something): # xfs_io -d -c "bmap -v" -c "pwrite 8k 4k" -c "bmap -v" -c "bmap -cv" ./file ./file: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..15]: 80..95 0 (80..95) 16 100000 1: [16..2047]: 160..2191 0 (160..2191) 2032 100000 wrote 4096/4096 bytes at offset 8192 4 KiB, 1 ops; 0.0000 sec (12.440 MiB/sec and 3184.7134 ops/sec) ./file: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..15]: 80..95 0 (80..95) 16 100000 1: [16..23]: 2208..2215 0 (2208..2215) 8 2: [24..2047]: 168..2191 0 (168..2191) 2024 100000 ./file: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..15]: 2192..2207 0 (2192..2207) 16 1: [16..23]: hole 8 2: [24..255]: 2216..2447 0 (2216..2447) 232 3: [256..2047]: hole 1792 That's on a fairly recent for-next on a fully reflinked file. It looks to me that the cow fork extent is fully allocated immediately (otherwise, I'm not sure what else would have converted it). Brian > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 12:43:18PM -0500, Brian Foster wrote: > Hmm, that's not what I'm seeing (not that it really matters, but I'm > curious if I'm missing something): Yeah, I can reproduce this on mainline. Turns out the it was done by the align call in xfs_bmap_btalloc that even my before run had removed. Took me some time to spin my head around this. Btw, I think we have a nasty race in the current DIO code that might expose stale data, but this is just the same kind of head spinning exercise for now: Thread 1 writes a range from B to c B --------- C a little later thread 2 writes from A to B A --------- B but the code preallocates beyond B into the range where thread 1 has just written, but ->end_io hasn't been called yet. But once ->end_io is called thread 2 has already allocated up to the extent size hint into the write range of thread 1, so the end_io handler will splice the unintialized blocks from that preallocation back into the file right after B. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 09:08:55PM +0100, Christoph Hellwig wrote: > On Tue, Jan 24, 2017 at 12:43:18PM -0500, Brian Foster wrote: > > Hmm, that's not what I'm seeing (not that it really matters, but I'm > > curious if I'm missing something): > > Yeah, I can reproduce this on mainline. Turns out the it was done s/can/can't/, sorry. It's getting late for the day. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 09:08:55PM +0100, Christoph Hellwig wrote: > On Tue, Jan 24, 2017 at 12:43:18PM -0500, Brian Foster wrote: > > Hmm, that's not what I'm seeing (not that it really matters, but I'm > > curious if I'm missing something): > > Yeah, I can reproduce this on mainline. Turns out the it was done > by the align call in xfs_bmap_btalloc that even my before run had > removed. > > Took me some time to spin my head around this. > > Btw, I think we have a nasty race in the current DIO code that might > expose stale data, but this is just the same kind of head spinning > exercise for now: > > Thread 1 writes a range from B to c > > B --------- C A --------- B --------- C ^ ^ d e I'm assuming B-C has no shared blocks, d-B has shared blocks, and that both d & e are cowextsize boundaries. > a little later thread 2 writes from A to B > > A --------- B > but the code preallocates beyond B into the range where thread > 1 has just written, but ->end_io hasn't been called yet. > But once ->end_io is called thread 2 has already allocated > up to the extent size hint into the write range of thread 1, > so the end_io handler will splice the unintialized blocks from > that preallocation back into the file right after B. I think you're right about there being a dio race here. I'm not sure what the solution here is -- certainly we could disregard the cowextsize hint, though that has a fragmentation cost that we already know about. We could also change the dio write setup to extend the range that it checks for shared blocks up and down to the nearest cowextsize boundary and set up the delalloc reservations in the cow fork as necessary. If our thread2 comes along then it'll find the reservations already set up for a cow so that we avoid the situation where B-C changes between iomap_begin and dio_write_end_io does the wrong thing. That's more in the spirit of cowextsize since we'd promote future writes to CoW. However it's more wasteful of blocks since we have no idea if those future writes are ever actually going to happen, and it also adds more bmap records if we don't use all of the reservation. Ugh, my head hurts, I'm going to go for a walk to ponder this some more, and to try to work out whether the buffered path is all right. --D > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2017 at 04:09:34PM -0800, Darrick J. Wong wrote: > On Tue, Jan 24, 2017 at 09:08:55PM +0100, Christoph Hellwig wrote: > > On Tue, Jan 24, 2017 at 12:43:18PM -0500, Brian Foster wrote: > > > Hmm, that's not what I'm seeing (not that it really matters, but I'm > > > curious if I'm missing something): > > > > Yeah, I can reproduce this on mainline. Turns out the it was done > > by the align call in xfs_bmap_btalloc that even my before run had > > removed. > > > > Took me some time to spin my head around this. > > > > Btw, I think we have a nasty race in the current DIO code that might > > expose stale data, but this is just the same kind of head spinning > > exercise for now: > > > > Thread 1 writes a range from B to c > > > > B --------- C > > A --------- B --------- C > ^ ^ > d e > > I'm assuming B-C has no shared blocks, d-B has shared blocks, and that > both d & e are cowextsize boundaries. > > > a little later thread 2 writes from A to B > > > > A --------- B > > but the code preallocates beyond B into the range where thread > > 1 has just written, but ->end_io hasn't been called yet. > > But once ->end_io is called thread 2 has already allocated > > up to the extent size hint into the write range of thread 1, > > so the end_io handler will splice the unintialized blocks from > > that preallocation back into the file right after B. > > I think you're right about there being a dio race here. I'm not sure > what the solution here is -- certainly we could disregard the cowextsize > hint, though that has a fragmentation cost that we already know about. > > We could also change the dio write setup to extend the range that it > checks for shared blocks up and down to the nearest cowextsize boundary > and set up the delalloc reservations in the cow fork as necessary. If > our thread2 comes along then it'll find the reservations already set > up for a cow so that we avoid the situation where B-C changes between > iomap_begin and dio_write_end_io does the wrong thing. That's more in > the spirit of cowextsize since we'd promote future writes to CoW. > However it's more wasteful of blocks since we have no idea if those > future writes are ever actually going to happen, and it also adds more > bmap records if we don't use all of the reservation. > > Ugh, my head hurts, I'm going to go for a walk to ponder this some more, > and to try to work out whether the buffered path is all right. I think I came up with a better idea overnight: take advantage of the unwritten bit. Currently, all extents in the cow fork are either delalloc or normal extents. When we allocate the blocks to fill a cow extent, we'll flag the extent as unwritten. When we go to write the data to disk, we convert the unwritten extent to a real extent, and the post-write remap (since it only takes a file offset and length) will be changed to remap only real, written extents into the data fork. --D > > --D > > > -- > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 27, 2017 at 09:44:47AM -0800, Darrick J. Wong wrote: > I think I came up with a better idea overnight: take advantage of the > unwritten bit. Currently, all extents in the cow fork are either > delalloc or normal extents. When we allocate the blocks to fill a cow > extent, we'll flag the extent as unwritten. When we go to write the > data to disk, we convert the unwritten extent to a real extent, and the > post-write remap (since it only takes a file offset and length) will > be changed to remap only real, written extents into the data fork. That sounds like a possibility. My other idea was to force COW all direct I/O writes, that way we can't have a mismatch between extents in the data and the COW fork. But so far this is just an idea.. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6f28814..a4a4327 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2893,13 +2893,14 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); state = 0; if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; /* * Check and set flags if this segment has a left neighbor. @@ -3619,9 +3620,7 @@ xfs_bmap_btalloc( else if (mp->m_dalign) stripe_align = mp->m_dalign; - if (ap->flags & XFS_BMAPI_COWFORK) - align = xfs_get_cowextsz_hint(ap->ip); - else if (xfs_alloc_is_userdata(ap->datatype)) + if (xfs_alloc_is_userdata(ap->datatype)) align = xfs_get_extsz_hint(ap->ip); if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -4608,8 +4607,6 @@ xfs_bmapi_write( */ if (flags & XFS_BMAPI_REMAP) ASSERT(inhole); - if (flags & XFS_BMAPI_COWFORK) - ASSERT(!inhole); /* * First, deal with the hole before the allocated space diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 88fd03c..0a077e80 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -304,59 +304,104 @@ __xfs_reflink_allocate_cow( xfs_fileoff_t end_fsb) { struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; + struct xfs_bmbt_irec imap, got; struct xfs_defer_ops dfops; struct xfs_trans *tp; xfs_fsblock_t first_block; - int nimaps = 1, error; - bool shared; + int nimaps, error, lockmode; + bool shared, trimmed; + xfs_extlen_t resblks, align; + xfs_extnum_t idx; - xfs_defer_init(&dfops, &first_block); + align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); + if (align) + end_fsb = roundup_64(end_fsb, align); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, - XFS_TRANS_RESERVE, &tp); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, end_fsb - *offset_fsb); + + error = xfs_qm_dqattach(ip, 0); if (error) return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* Read extent from the source file. */ - nimaps = 1; - error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, - &imap, &nimaps, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) - goto out_unlock; - ASSERT(nimaps == 1); + return error; - error = xfs_reflink_reserve_cow(ip, &imap, &shared); - if (error) - goto out_trans_cancel; + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); - if (!shared) { - *offset_fsb = imap.br_startoff + imap.br_blockcount; - goto out_trans_cancel; + /* + * Even if the extent is not shared we might have a preallocation for + * it in the COW fork. If so use it. + */ + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, *offset_fsb, &idx, &got) && + got.br_startoff <= *offset_fsb) { + /* If we have a real allocation in the COW fork we're done. */ + if (!isnullstartblock(got.br_startblock)) { + xfs_trim_extent(&got, *offset_fsb, + end_fsb - *offset_fsb); + *offset_fsb = got.br_startoff + got.br_blockcount; + goto out_trans_cancel; + } + } else { + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_trans_cancel; + ASSERT(nimaps == 1); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, + &trimmed); + if (error) + goto out_trans_cancel; + + if (!shared) { + *offset_fsb = imap.br_startoff + imap.br_blockcount; + goto out_trans_cancel; + } + + *offset_fsb = imap.br_startoff; + end_fsb = imap.br_startoff + imap.br_blockcount; + if (align) + end_fsb = roundup_64(end_fsb, align); } - xfs_trans_ijoin(tp, ip, 0); - error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, - XFS_BMAPI_COWFORK, &first_block, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), - &imap, &nimaps, &dfops); + error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto out_trans_cancel; + xfs_trans_ijoin(tp, ip, 0); + + xfs_defer_init(&dfops, &first_block); + nimaps = 1; + error = xfs_bmapi_write(tp, ip, *offset_fsb, end_fsb - *offset_fsb, + XFS_BMAPI_COWFORK, &first_block, resblks, &imap, + &nimaps, &dfops); + if (error) + goto out_bmap_cancel; + error = xfs_defer_finish(&tp, &dfops, NULL); if (error) - goto out_trans_cancel; + goto out_bmap_cancel; error = xfs_trans_commit(tp); + if (error) + goto out_unlock; *offset_fsb = imap.br_startoff + imap.br_blockcount; + out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; -out_trans_cancel: + +out_bmap_cancel: xfs_defer_cancel(&dfops); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, + XFS_QMOPT_RES_REGBLKS); +out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; }
When we allocate COW fork blocks for direct I/O writes we currently first create a delayed allocation, and then convert it to a real allocation once we've got the delayed one. As there is no good reason for that this patch instead makes use call xfs_bmapi_write from the COW allocation path. The only interesting bits are a few tweaks the low-level allocator to allow for this, most notably the need to remove the call to xfs_bmap_extsize_align for the cowextsize in xfs_bmap_btalloc - for the existing convert case it's a no-op, but for the direct allocation case it would blow up our block reservation way beyond what we reserved for the transaction. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/libxfs/xfs_bmap.c | 9 ++--- fs/xfs/xfs_reflink.c | 101 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 76 insertions(+), 34 deletions(-)