diff mbox

[05/11] xfs: track CoW blocks separately in the inode

Message ID 151676030942.12349.14467032190779795677.stgit@magnolia (mailing list archive)
State Superseded
Headers show

Commit Message

Darrick J. Wong Jan. 24, 2018, 2:18 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

Track the number of blocks reserved in the CoW fork so that we can
move the quota reservations whenever we chown, and don't account for
CoW fork delalloc reservations in i_delayed_blks.  This should make
chown work properly for quota reservations, enables us to fully
account for real extents in the cow fork in the file stat info, and
improves the post-eof scanning decisions because we're no longer
confusing data fork delalloc extents with cow fork delalloc extents.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c      |   16 ++++++++++++----
 fs/xfs/libxfs/xfs_inode_buf.c |    1 +
 fs/xfs/xfs_bmap_util.c        |    5 +++++
 fs/xfs/xfs_icache.c           |    3 ++-
 fs/xfs/xfs_inode.c            |   11 +++++------
 fs/xfs/xfs_inode.h            |    1 +
 fs/xfs/xfs_iops.c             |    3 ++-
 fs/xfs/xfs_itable.c           |    3 ++-
 fs/xfs/xfs_qm.c               |    2 +-
 fs/xfs/xfs_reflink.c          |    4 ++--
 fs/xfs/xfs_super.c            |    1 +
 11 files changed, 34 insertions(+), 16 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Brian Foster Jan. 25, 2018, 1:06 p.m. UTC | #1
On Tue, Jan 23, 2018 at 06:18:29PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Track the number of blocks reserved in the CoW fork so that we can
> move the quota reservations whenever we chown, and don't account for
> CoW fork delalloc reservations in i_delayed_blks.  This should make
> chown work properly for quota reservations, enables us to fully
> account for real extents in the cow fork in the file stat info, and
> improves the post-eof scanning decisions because we're no longer
> confusing data fork delalloc extents with cow fork delalloc extents.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_bmap.c      |   16 ++++++++++++----
>  fs/xfs/libxfs/xfs_inode_buf.c |    1 +
>  fs/xfs/xfs_bmap_util.c        |    5 +++++
>  fs/xfs/xfs_icache.c           |    3 ++-
>  fs/xfs/xfs_inode.c            |   11 +++++------
>  fs/xfs/xfs_inode.h            |    1 +
>  fs/xfs/xfs_iops.c             |    3 ++-
>  fs/xfs/xfs_itable.c           |    3 ++-
>  fs/xfs/xfs_qm.c               |    2 +-
>  fs/xfs/xfs_reflink.c          |    4 ++--
>  fs/xfs/xfs_super.c            |    1 +
>  11 files changed, 34 insertions(+), 16 deletions(-)
> 
> 
...
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 4a38cfc..a208825 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
...
> @@ -1669,7 +1667,7 @@ xfs_release(
>  		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
>  		if (truncated) {
>  			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
> -			if (ip->i_delayed_blks > 0) {
> +			if (ip->i_delayed_blks > 0 || ip->i_cow_blocks > 0) {
>  				error = filemap_flush(VFS_I(ip)->i_mapping);
>  				if (error)
>  					return error;

Is having cowblocks really relevant to this hunk? I thought this was
purely a delalloc vs. file size thing, but I could be wrong. 

Brian

> @@ -1909,7 +1907,8 @@ xfs_inactive(
>  
>  	if (S_ISREG(VFS_I(ip)->i_mode) &&
>  	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
> -	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
> +	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0 ||
> +	     ip->i_cow_blocks > 0))
>  		truncate = 1;
>  
>  	error = xfs_qm_dqattach(ip, 0);
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index ff56486..6feee8a 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -62,6 +62,7 @@ typedef struct xfs_inode {
>  	/* Miscellaneous state. */
>  	unsigned long		i_flags;	/* see defined flags below */
>  	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
> +	unsigned int		i_cow_blocks;	/* count of cow fork blocks */
>  
>  	struct xfs_icdinode	i_d;		/* most of ondisk inode */
>  
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 56475fc..6c3381c 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -513,7 +513,8 @@ xfs_vn_getattr(
>  	stat->mtime = inode->i_mtime;
>  	stat->ctime = inode->i_ctime;
>  	stat->blocks =
> -		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
> +		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks +
> +				  ip->i_cow_blocks);
>  
>  	if (ip->i_d.di_version == 3) {
>  		if (request_mask & STATX_BTIME) {
> diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
> index d583105..412d7eb 100644
> --- a/fs/xfs/xfs_itable.c
> +++ b/fs/xfs/xfs_itable.c
> @@ -122,7 +122,8 @@ xfs_bulkstat_one_int(
>  	case XFS_DINODE_FMT_BTREE:
>  		buf->bs_rdev = 0;
>  		buf->bs_blksize = mp->m_sb.sb_blocksize;
> -		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
> +		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks +
> +				 ip->i_cow_blocks;
>  		break;
>  	}
>  	xfs_iunlock(ip, XFS_ILOCK_SHARED);
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index 5b848f4..28f12f8 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -1847,7 +1847,7 @@ xfs_qm_vop_chown_reserve(
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
>  	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
>  
> -	delblks = ip->i_delayed_blks;
> +	delblks = ip->i_delayed_blks + ip->i_cow_blocks;
>  	blkflags = XFS_IS_REALTIME_INODE(ip) ?
>  			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
>  
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index e367351..f875ea7 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -619,7 +619,7 @@ xfs_reflink_cancel_cow_blocks(
>  	}
>  
>  	/* clear tag if cow fork is emptied */
> -	if (!ifp->if_bytes)
> +	if (ip->i_cow_blocks == 0)
>  		xfs_inode_clear_cowblocks_tag(ip);
>  
>  	return error;
> @@ -704,7 +704,7 @@ xfs_reflink_end_cow(
>  	trace_xfs_reflink_end_cow(ip, offset, count);
>  
>  	/* No COW extents?  That's easy! */
> -	if (ifp->if_bytes == 0)
> +	if (ip->i_cow_blocks == 0)
>  		return 0;
>  
>  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index f3e0001..9d04cfb 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -989,6 +989,7 @@ xfs_fs_destroy_inode(
>  	xfs_inactive(ip);
>  
>  	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
> +	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_cow_blocks == 0);
>  	XFS_STATS_INC(ip->i_mount, vn_reclaim);
>  
>  	/*
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 25, 2018, 7:21 p.m. UTC | #2
On Thu, Jan 25, 2018 at 08:06:45AM -0500, Brian Foster wrote:
> On Tue, Jan 23, 2018 at 06:18:29PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Track the number of blocks reserved in the CoW fork so that we can
> > move the quota reservations whenever we chown, and don't account for
> > CoW fork delalloc reservations in i_delayed_blks.  This should make
> > chown work properly for quota reservations, enables us to fully
> > account for real extents in the cow fork in the file stat info, and
> > improves the post-eof scanning decisions because we're no longer
> > confusing data fork delalloc extents with cow fork delalloc extents.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/xfs/libxfs/xfs_bmap.c      |   16 ++++++++++++----
> >  fs/xfs/libxfs/xfs_inode_buf.c |    1 +
> >  fs/xfs/xfs_bmap_util.c        |    5 +++++
> >  fs/xfs/xfs_icache.c           |    3 ++-
> >  fs/xfs/xfs_inode.c            |   11 +++++------
> >  fs/xfs/xfs_inode.h            |    1 +
> >  fs/xfs/xfs_iops.c             |    3 ++-
> >  fs/xfs/xfs_itable.c           |    3 ++-
> >  fs/xfs/xfs_qm.c               |    2 +-
> >  fs/xfs/xfs_reflink.c          |    4 ++--
> >  fs/xfs/xfs_super.c            |    1 +
> >  11 files changed, 34 insertions(+), 16 deletions(-)
> > 
> > 
> ...
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 4a38cfc..a208825 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> ...
> > @@ -1669,7 +1667,7 @@ xfs_release(
> >  		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
> >  		if (truncated) {
> >  			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
> > -			if (ip->i_delayed_blks > 0) {
> > +			if (ip->i_delayed_blks > 0 || ip->i_cow_blocks > 0) {
> >  				error = filemap_flush(VFS_I(ip)->i_mapping);
> >  				if (error)
> >  					return error;
> 
> Is having cowblocks really relevant to this hunk? I thought this was
> purely a delalloc vs. file size thing, but I could be wrong. 

AFAICT, if we (1) use truncate to reduce a file's size, (2) write
somewhere past eof, (3) make some delalloc reservations for the post-eof
write, and (4) close the file, then this chunk flushes the dirty data to
disk so that if we crash after the close() call returns, the file will
still have all the data that was written out.  IOWs, this provides for
flush-on-close after a file size reduction.

So I was thinking that if a write to a lower offset causes the creation
of a speculative cow extent of some kind that extends past eof, we'd
still want to flush the dirty data to disk on close even if there are no
delalloc reservations in the data fork.

Of course, now I see that xfs_file_iomap_begin_delay will create the data
fork delalloc (da) reservation for a non-shared block even if a cow fork
extent already exists (the write is promoted to cow), so perhaps this
isn't strictly necessary... but adding a data fork da extent when there's
already a cow fork extent seems like a (mostly harmless) bug to me.

--D

> 
> Brian
> 
> > @@ -1909,7 +1907,8 @@ xfs_inactive(
> >  
> >  	if (S_ISREG(VFS_I(ip)->i_mode) &&
> >  	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
> > -	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
> > +	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0 ||
> > +	     ip->i_cow_blocks > 0))
> >  		truncate = 1;
> >  
> >  	error = xfs_qm_dqattach(ip, 0);
> > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > index ff56486..6feee8a 100644
> > --- a/fs/xfs/xfs_inode.h
> > +++ b/fs/xfs/xfs_inode.h
> > @@ -62,6 +62,7 @@ typedef struct xfs_inode {
> >  	/* Miscellaneous state. */
> >  	unsigned long		i_flags;	/* see defined flags below */
> >  	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
> > +	unsigned int		i_cow_blocks;	/* count of cow fork blocks */
> >  
> >  	struct xfs_icdinode	i_d;		/* most of ondisk inode */
> >  
> > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > index 56475fc..6c3381c 100644
> > --- a/fs/xfs/xfs_iops.c
> > +++ b/fs/xfs/xfs_iops.c
> > @@ -513,7 +513,8 @@ xfs_vn_getattr(
> >  	stat->mtime = inode->i_mtime;
> >  	stat->ctime = inode->i_ctime;
> >  	stat->blocks =
> > -		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
> > +		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks +
> > +				  ip->i_cow_blocks);
> >  
> >  	if (ip->i_d.di_version == 3) {
> >  		if (request_mask & STATX_BTIME) {
> > diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
> > index d583105..412d7eb 100644
> > --- a/fs/xfs/xfs_itable.c
> > +++ b/fs/xfs/xfs_itable.c
> > @@ -122,7 +122,8 @@ xfs_bulkstat_one_int(
> >  	case XFS_DINODE_FMT_BTREE:
> >  		buf->bs_rdev = 0;
> >  		buf->bs_blksize = mp->m_sb.sb_blocksize;
> > -		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
> > +		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks +
> > +				 ip->i_cow_blocks;
> >  		break;
> >  	}
> >  	xfs_iunlock(ip, XFS_ILOCK_SHARED);
> > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > index 5b848f4..28f12f8 100644
> > --- a/fs/xfs/xfs_qm.c
> > +++ b/fs/xfs/xfs_qm.c
> > @@ -1847,7 +1847,7 @@ xfs_qm_vop_chown_reserve(
> >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
> >  	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
> >  
> > -	delblks = ip->i_delayed_blks;
> > +	delblks = ip->i_delayed_blks + ip->i_cow_blocks;
> >  	blkflags = XFS_IS_REALTIME_INODE(ip) ?
> >  			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
> >  
> > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > index e367351..f875ea7 100644
> > --- a/fs/xfs/xfs_reflink.c
> > +++ b/fs/xfs/xfs_reflink.c
> > @@ -619,7 +619,7 @@ xfs_reflink_cancel_cow_blocks(
> >  	}
> >  
> >  	/* clear tag if cow fork is emptied */
> > -	if (!ifp->if_bytes)
> > +	if (ip->i_cow_blocks == 0)
> >  		xfs_inode_clear_cowblocks_tag(ip);
> >  
> >  	return error;
> > @@ -704,7 +704,7 @@ xfs_reflink_end_cow(
> >  	trace_xfs_reflink_end_cow(ip, offset, count);
> >  
> >  	/* No COW extents?  That's easy! */
> > -	if (ifp->if_bytes == 0)
> > +	if (ip->i_cow_blocks == 0)
> >  		return 0;
> >  
> >  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index f3e0001..9d04cfb 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -989,6 +989,7 @@ xfs_fs_destroy_inode(
> >  	xfs_inactive(ip);
> >  
> >  	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
> > +	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_cow_blocks == 0);
> >  	XFS_STATS_INC(ip->i_mount, vn_reclaim);
> >  
> >  	/*
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Christoph Hellwig Jan. 26, 2018, 12:15 p.m. UTC | #3
On Tue, Jan 23, 2018 at 06:18:29PM -0800, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@oracle.com>
> 
> Track the number of blocks reserved in the CoW fork so that we can
> move the quota reservations whenever we chown, and don't account for
> CoW fork delalloc reservations in i_delayed_blks.  This should make
> chown work properly for quota reservations, enables us to fully
> account for real extents in the cow fork in the file stat info, and
> improves the post-eof scanning decisions because we're no longer
> confusing data fork delalloc extents with cow fork delalloc extents.

Just curious:  is there any good reason we can't just have an
i_extra_blocks field for the delayed and cow blocks?  Or is there
a place where we care about the difference between the two?
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Foster Jan. 26, 2018, 1:04 p.m. UTC | #4
On Thu, Jan 25, 2018 at 11:21:42AM -0800, Darrick J. Wong wrote:
> On Thu, Jan 25, 2018 at 08:06:45AM -0500, Brian Foster wrote:
> > On Tue, Jan 23, 2018 at 06:18:29PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Track the number of blocks reserved in the CoW fork so that we can
> > > move the quota reservations whenever we chown, and don't account for
> > > CoW fork delalloc reservations in i_delayed_blks.  This should make
> > > chown work properly for quota reservations, enables us to fully
> > > account for real extents in the cow fork in the file stat info, and
> > > improves the post-eof scanning decisions because we're no longer
> > > confusing data fork delalloc extents with cow fork delalloc extents.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > >  fs/xfs/libxfs/xfs_bmap.c      |   16 ++++++++++++----
> > >  fs/xfs/libxfs/xfs_inode_buf.c |    1 +
> > >  fs/xfs/xfs_bmap_util.c        |    5 +++++
> > >  fs/xfs/xfs_icache.c           |    3 ++-
> > >  fs/xfs/xfs_inode.c            |   11 +++++------
> > >  fs/xfs/xfs_inode.h            |    1 +
> > >  fs/xfs/xfs_iops.c             |    3 ++-
> > >  fs/xfs/xfs_itable.c           |    3 ++-
> > >  fs/xfs/xfs_qm.c               |    2 +-
> > >  fs/xfs/xfs_reflink.c          |    4 ++--
> > >  fs/xfs/xfs_super.c            |    1 +
> > >  11 files changed, 34 insertions(+), 16 deletions(-)
> > > 
> > > 
> > ...
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index 4a38cfc..a208825 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > ...
> > > @@ -1669,7 +1667,7 @@ xfs_release(
> > >  		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
> > >  		if (truncated) {
> > >  			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
> > > -			if (ip->i_delayed_blks > 0) {
> > > +			if (ip->i_delayed_blks > 0 || ip->i_cow_blocks > 0) {
> > >  				error = filemap_flush(VFS_I(ip)->i_mapping);
> > >  				if (error)
> > >  					return error;
> > 
> > Is having cowblocks really relevant to this hunk? I thought this was
> > purely a delalloc vs. file size thing, but I could be wrong. 
> 
> AFAICT, if we (1) use truncate to reduce a file's size, (2) write
> somewhere past eof, (3) make some delalloc reservations for the post-eof
> write, and (4) close the file, then this chunk flushes the dirty data to
> disk so that if we crash after the close() call returns, the file will
> still have all the data that was written out.  IOWs, this provides for
> flush-on-close after a file size reduction.
> 

I think it goes back to problems where those subsequent buffered writes
increase the file size again and the fs crashes before all data is
written out. E.g., the problem described by commit ba87ea699e ("[XFS]
Fix to prevent the notorious 'NULL files' problem after a crash."). It's
not totally clear to me whether that fixed the problem and this
particular hack is still needed.

FWIW, the flush code looks like it goes back to commit 7d4fb40ad7
("[XFS] Start writeout earlier (on last close) ...").

> So I was thinking that if a write to a lower offset causes the creation
> of a speculative cow extent of some kind that extends past eof, we'd
> still want to flush the dirty data to disk on close even if there are no
> delalloc reservations in the data fork.
> 

This whole stanza still depends on a truncate having happened in the
first place, though, right?

I guess I'm not necessarily against doing this, I just think we should
verify whether it's actually useful to prevent some kind of similar
crash-recovery problem it was intended to help mitigate. If not, then
we're subjecting ourselves to the tradeoff, which appears to be that
we'll initiate writeback of any file with cowblocks on close that has
been truncated.

Granted the truncate operation is probably infrequent with respect to
close() so it's probably not that big of a deal, but in the delalloc
case a flush is at least generally expected to clear the file of delayed
allocation. It's my understanding that the same is not necessarily true
for cowblocks: cow prealloc means blocks can sit around in the cow fork
for a while in anticipation of future copy-on-writes, right?

Brian

> Ofc now I see that xfs_file_iomap_begin_delay will create the data fork
> da reservation for a non-shared block even if a cow fork extent already
> exists (the write is promoted to cow), so perhaps this isn't strictly
> necessary... but adding a data fork da extent when there's already a cow
> fork extent seems like a (mostly harmless) bug to me.
> 
> --D
> 
> > 
> > Brian
> > 
> > > @@ -1909,7 +1907,8 @@ xfs_inactive(
> > >  
> > >  	if (S_ISREG(VFS_I(ip)->i_mode) &&
> > >  	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
> > > -	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
> > > +	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0 ||
> > > +	     ip->i_cow_blocks > 0))
> > >  		truncate = 1;
> > >  
> > >  	error = xfs_qm_dqattach(ip, 0);
> > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > > index ff56486..6feee8a 100644
> > > --- a/fs/xfs/xfs_inode.h
> > > +++ b/fs/xfs/xfs_inode.h
> > > @@ -62,6 +62,7 @@ typedef struct xfs_inode {
> > >  	/* Miscellaneous state. */
> > >  	unsigned long		i_flags;	/* see defined flags below */
> > >  	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
> > > +	unsigned int		i_cow_blocks;	/* count of cow fork blocks */
> > >  
> > >  	struct xfs_icdinode	i_d;		/* most of ondisk inode */
> > >  
> > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > > index 56475fc..6c3381c 100644
> > > --- a/fs/xfs/xfs_iops.c
> > > +++ b/fs/xfs/xfs_iops.c
> > > @@ -513,7 +513,8 @@ xfs_vn_getattr(
> > >  	stat->mtime = inode->i_mtime;
> > >  	stat->ctime = inode->i_ctime;
> > >  	stat->blocks =
> > > -		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
> > > +		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks +
> > > +				  ip->i_cow_blocks);
> > >  
> > >  	if (ip->i_d.di_version == 3) {
> > >  		if (request_mask & STATX_BTIME) {
> > > diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
> > > index d583105..412d7eb 100644
> > > --- a/fs/xfs/xfs_itable.c
> > > +++ b/fs/xfs/xfs_itable.c
> > > @@ -122,7 +122,8 @@ xfs_bulkstat_one_int(
> > >  	case XFS_DINODE_FMT_BTREE:
> > >  		buf->bs_rdev = 0;
> > >  		buf->bs_blksize = mp->m_sb.sb_blocksize;
> > > -		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
> > > +		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks +
> > > +				 ip->i_cow_blocks;
> > >  		break;
> > >  	}
> > >  	xfs_iunlock(ip, XFS_ILOCK_SHARED);
> > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > > index 5b848f4..28f12f8 100644
> > > --- a/fs/xfs/xfs_qm.c
> > > +++ b/fs/xfs/xfs_qm.c
> > > @@ -1847,7 +1847,7 @@ xfs_qm_vop_chown_reserve(
> > >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
> > >  	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
> > >  
> > > -	delblks = ip->i_delayed_blks;
> > > +	delblks = ip->i_delayed_blks + ip->i_cow_blocks;
> > >  	blkflags = XFS_IS_REALTIME_INODE(ip) ?
> > >  			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
> > >  
> > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > index e367351..f875ea7 100644
> > > --- a/fs/xfs/xfs_reflink.c
> > > +++ b/fs/xfs/xfs_reflink.c
> > > @@ -619,7 +619,7 @@ xfs_reflink_cancel_cow_blocks(
> > >  	}
> > >  
> > >  	/* clear tag if cow fork is emptied */
> > > -	if (!ifp->if_bytes)
> > > +	if (ip->i_cow_blocks == 0)
> > >  		xfs_inode_clear_cowblocks_tag(ip);
> > >  
> > >  	return error;
> > > @@ -704,7 +704,7 @@ xfs_reflink_end_cow(
> > >  	trace_xfs_reflink_end_cow(ip, offset, count);
> > >  
> > >  	/* No COW extents?  That's easy! */
> > > -	if (ifp->if_bytes == 0)
> > > +	if (ip->i_cow_blocks == 0)
> > >  		return 0;
> > >  
> > >  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > index f3e0001..9d04cfb 100644
> > > --- a/fs/xfs/xfs_super.c
> > > +++ b/fs/xfs/xfs_super.c
> > > @@ -989,6 +989,7 @@ xfs_fs_destroy_inode(
> > >  	xfs_inactive(ip);
> > >  
> > >  	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
> > > +	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_cow_blocks == 0);
> > >  	XFS_STATS_INC(ip->i_mount, vn_reclaim);
> > >  
> > >  	/*
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 26, 2018, 7 p.m. UTC | #5
On Fri, Jan 26, 2018 at 04:15:46AM -0800, Christoph Hellwig wrote:
> On Tue, Jan 23, 2018 at 06:18:29PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> > 
> > Track the number of blocks reserved in the CoW fork so that we can
> > move the quota reservations whenever we chown, and don't account for
> > CoW fork delalloc reservations in i_delayed_blks.  This should make
> > chown work properly for quota reservations, enables us to fully
> > account for real extents in the cow fork in the file stat info, and
> > improves the post-eof scanning decisions because we're no longer
> > confusing data fork delalloc extents with cow fork delalloc extents.
> 
> Just curious:  is there any good reason we can't just have an
> i_extra_blocks field for the delayed and cow blocks?  Or is there
> a place where we care about the difference between the two?

"cow blocks" now includes real and unwritten extents sitting around in
the cow fork in addition to delalloc extents in the cow fork, and I
didn't want the field to have overlapping meanings.  On a practical
level, it also means we avoid eofblocks scans on inodes that have cow
blocks but no da blocks.

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 26, 2018, 7:08 p.m. UTC | #6
On Fri, Jan 26, 2018 at 08:04:29AM -0500, Brian Foster wrote:
> On Thu, Jan 25, 2018 at 11:21:42AM -0800, Darrick J. Wong wrote:
> > On Thu, Jan 25, 2018 at 08:06:45AM -0500, Brian Foster wrote:
> > > On Tue, Jan 23, 2018 at 06:18:29PM -0800, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > > 
> > > > Track the number of blocks reserved in the CoW fork so that we can
> > > > move the quota reservations whenever we chown, and don't account for
> > > > CoW fork delalloc reservations in i_delayed_blks.  This should make
> > > > chown work properly for quota reservations, enables us to fully
> > > > account for real extents in the cow fork in the file stat info, and
> > > > improves the post-eof scanning decisions because we're no longer
> > > > confusing data fork delalloc extents with cow fork delalloc extents.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > >  fs/xfs/libxfs/xfs_bmap.c      |   16 ++++++++++++----
> > > >  fs/xfs/libxfs/xfs_inode_buf.c |    1 +
> > > >  fs/xfs/xfs_bmap_util.c        |    5 +++++
> > > >  fs/xfs/xfs_icache.c           |    3 ++-
> > > >  fs/xfs/xfs_inode.c            |   11 +++++------
> > > >  fs/xfs/xfs_inode.h            |    1 +
> > > >  fs/xfs/xfs_iops.c             |    3 ++-
> > > >  fs/xfs/xfs_itable.c           |    3 ++-
> > > >  fs/xfs/xfs_qm.c               |    2 +-
> > > >  fs/xfs/xfs_reflink.c          |    4 ++--
> > > >  fs/xfs/xfs_super.c            |    1 +
> > > >  11 files changed, 34 insertions(+), 16 deletions(-)
> > > > 
> > > > 
> > > ...
> > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > index 4a38cfc..a208825 100644
> > > > --- a/fs/xfs/xfs_inode.c
> > > > +++ b/fs/xfs/xfs_inode.c
> > > ...
> > > > @@ -1669,7 +1667,7 @@ xfs_release(
> > > >  		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
> > > >  		if (truncated) {
> > > >  			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
> > > > -			if (ip->i_delayed_blks > 0) {
> > > > +			if (ip->i_delayed_blks > 0 || ip->i_cow_blocks > 0) {
> > > >  				error = filemap_flush(VFS_I(ip)->i_mapping);
> > > >  				if (error)
> > > >  					return error;
> > > 
> > > Is having cowblocks really relevant to this hunk? I thought this was
> > > purely a delalloc vs. file size thing, but I could be wrong. 
> > 
> > AFAICT, if we (1) use truncate to reduce a file's size, (2) write
> > somewhere past eof, (3) make some delalloc reservations for the post-eof
> > write, and (4) close the file, then this chunk flushes the dirty data to
> > disk so that if we crash after the close() call returns, the file will
> > still have all the data that was written out.  IOWs, this provides for
> > flush-on-close after a file size reduction.
> > 
> 
> I think it goes back to problems where those subsequent buffered writes
> increase the file size again and the fs crashes before all data is
> written out. E.g., the problem described by commit ba87ea699e ("[XFS]
> Fix to prevent the notorious 'NULL files' problem after a crash."). It's
> not totally clear to me whether that fixed the problem and this
> particular hack is still needed.

Me neither.  It looks like deferring the size update until the write
end_io would have closed this bug... but on the other hand maybe the
flush's function is more to avoid disappointing the people who expect
flush-on-close behavior...

> FWIW, the flush code looks like it goes back to commit 7d4fb40ad7
> ("[XFS] Start writeout earlier (on last close) ...").
> 
> > So I was thinking that if a write to a lower offset causes the creation
> > of a speculative cow extent of some kind that extends past eof, we'd
> > still want to flush the dirty data to disk on close even if there are no
> > delalloc reservations in the data fork.
> > 
> 
> This whole stanza still depends on a truncate in the first place
> though..?
> 
> I guess I'm not necessarily against doing this, I just think we should
> verify whether it's actually useful to prevent some kind of similar
> crash-recovery problem it was intended to help mitigate. If not, then
> we're subjecting ourselves to the tradeoff, which appears to be that
> we'll initiate writeback of any file with cowblocks on close that has
> been truncated.
> 
> Granted the truncate operation is probably infrequent with respect to
> close() so it's probably not that big of a deal, but in the delalloc

It's probably infrequent with respect to cow-and-close, but
"echo foo > existingfile" would trigger this for the regular delalloc
case.  I don't really mind dropping this hunk either, aside from my
sense of paranoia. :P

> case a flush is at least generally expected to clear the file of delayed
> allocation. It's my understanding that the same is not necessarily true
> for cowblocks.. cow prealloc means blocks can sit around in the cow fork
> for a while in anticipation of future copy-on-writes, right?

Yes.

--D

> 
> Brian
> 
> > Ofc now I see that xfs_file_iomap_begin_delay will create the data fork
> > da reservation for a non-shared block even if a cow fork extent already
> > exists (the write is promoted to cow), so perhaps this isn't strictly
> > necessary... but adding a data fork da extent when there's already a cow
> > fork extent seems like a (mostly harmless) bug to me.
> > 
> > --D
> > 
> > > 
> > > Brian
> > > 
> > > > @@ -1909,7 +1907,8 @@ xfs_inactive(
> > > >  
> > > >  	if (S_ISREG(VFS_I(ip)->i_mode) &&
> > > >  	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
> > > > -	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
> > > > +	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0 ||
> > > > +	     ip->i_cow_blocks > 0))
> > > >  		truncate = 1;
> > > >  
> > > >  	error = xfs_qm_dqattach(ip, 0);
> > > > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> > > > index ff56486..6feee8a 100644
> > > > --- a/fs/xfs/xfs_inode.h
> > > > +++ b/fs/xfs/xfs_inode.h
> > > > @@ -62,6 +62,7 @@ typedef struct xfs_inode {
> > > >  	/* Miscellaneous state. */
> > > >  	unsigned long		i_flags;	/* see defined flags below */
> > > >  	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
> > > > +	unsigned int		i_cow_blocks;	/* count of cow fork blocks */
> > > >  
> > > >  	struct xfs_icdinode	i_d;		/* most of ondisk inode */
> > > >  
> > > > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > > > index 56475fc..6c3381c 100644
> > > > --- a/fs/xfs/xfs_iops.c
> > > > +++ b/fs/xfs/xfs_iops.c
> > > > @@ -513,7 +513,8 @@ xfs_vn_getattr(
> > > >  	stat->mtime = inode->i_mtime;
> > > >  	stat->ctime = inode->i_ctime;
> > > >  	stat->blocks =
> > > > -		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
> > > > +		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks +
> > > > +				  ip->i_cow_blocks);
> > > >  
> > > >  	if (ip->i_d.di_version == 3) {
> > > >  		if (request_mask & STATX_BTIME) {
> > > > diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
> > > > index d583105..412d7eb 100644
> > > > --- a/fs/xfs/xfs_itable.c
> > > > +++ b/fs/xfs/xfs_itable.c
> > > > @@ -122,7 +122,8 @@ xfs_bulkstat_one_int(
> > > >  	case XFS_DINODE_FMT_BTREE:
> > > >  		buf->bs_rdev = 0;
> > > >  		buf->bs_blksize = mp->m_sb.sb_blocksize;
> > > > -		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
> > > > +		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks +
> > > > +				 ip->i_cow_blocks;
> > > >  		break;
> > > >  	}
> > > >  	xfs_iunlock(ip, XFS_ILOCK_SHARED);
> > > > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > > > index 5b848f4..28f12f8 100644
> > > > --- a/fs/xfs/xfs_qm.c
> > > > +++ b/fs/xfs/xfs_qm.c
> > > > @@ -1847,7 +1847,7 @@ xfs_qm_vop_chown_reserve(
> > > >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
> > > >  	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
> > > >  
> > > > -	delblks = ip->i_delayed_blks;
> > > > +	delblks = ip->i_delayed_blks + ip->i_cow_blocks;
> > > >  	blkflags = XFS_IS_REALTIME_INODE(ip) ?
> > > >  			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
> > > >  
> > > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> > > > index e367351..f875ea7 100644
> > > > --- a/fs/xfs/xfs_reflink.c
> > > > +++ b/fs/xfs/xfs_reflink.c
> > > > @@ -619,7 +619,7 @@ xfs_reflink_cancel_cow_blocks(
> > > >  	}
> > > >  
> > > >  	/* clear tag if cow fork is emptied */
> > > > -	if (!ifp->if_bytes)
> > > > +	if (ip->i_cow_blocks == 0)
> > > >  		xfs_inode_clear_cowblocks_tag(ip);
> > > >  
> > > >  	return error;
> > > > @@ -704,7 +704,7 @@ xfs_reflink_end_cow(
> > > >  	trace_xfs_reflink_end_cow(ip, offset, count);
> > > >  
> > > >  	/* No COW extents?  That's easy! */
> > > > -	if (ifp->if_bytes == 0)
> > > > +	if (ip->i_cow_blocks == 0)
> > > >  		return 0;
> > > >  
> > > >  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
> > > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > > index f3e0001..9d04cfb 100644
> > > > --- a/fs/xfs/xfs_super.c
> > > > +++ b/fs/xfs/xfs_super.c
> > > > @@ -989,6 +989,7 @@ xfs_fs_destroy_inode(
> > > >  	xfs_inactive(ip);
> > > >  
> > > >  	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
> > > > +	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_cow_blocks == 0);
> > > >  	XFS_STATS_INC(ip->i_mount, vn_reclaim);
> > > >  
> > > >  	/*
> > > > 
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Darrick J. Wong Jan. 26, 2018, 11:51 p.m. UTC | #7
On Fri, Jan 26, 2018 at 11:00:58AM -0800, Darrick J. Wong wrote:
> On Fri, Jan 26, 2018 at 04:15:46AM -0800, Christoph Hellwig wrote:
> > On Tue, Jan 23, 2018 at 06:18:29PM -0800, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@oracle.com>
> > > 
> > > Track the number of blocks reserved in the CoW fork so that we can
> > > move the quota reservations whenever we chown, and don't account for
> > > CoW fork delalloc reservations in i_delayed_blks.  This should make
> > > chown work properly for quota reservations, enables us to fully
> > > account for real extents in the cow fork in the file stat info, and
> > > improves the post-eof scanning decisions because we're no longer
> > > confusing data fork delalloc extents with cow fork delalloc extents.
> > 
> > Just curious:  is there any good reason we can't just have an
> > i_extra_blocks field for the delayed and cow blocks?  Or is there
> > a place where we care about the difference between the two?
> 
> "cow blocks" now includes real and unwritten extents sitting around in
> the cow fork in addition to delalloc extents in the cow fork, and I
> didn't want the field to have overlapping meanings.  On a practical
> level, it also means we avoid eofblocks scans on inodes that have cow
> blocks but no da blocks.

Oh. Duh, we have the inode tags for that.  Ok, dropping this patch;
I'll fold the two i_delayed_blks changes we still need into the patch
that fixes the quota accounting.

--D

> --D
> 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index e3e8f7c..93ce2c6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3505,6 +3505,7 @@  xfs_bmap_btalloc_cow(
 	 * the q_res_bcount blocks, so no quota accounting update is needed
 	 * here.
 	 */
+	ap->ip->i_cow_blocks += args->len;
 	xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS,
 			-(long)args->len);
 }
@@ -3743,13 +3744,13 @@  xfs_bmap_btalloc(
 			*ap->firstblock = args.fsbno;
 		ASSERT(nullfb || fb_agno <= args.agno);
 		ap->length = args.len;
-		if (ap->wasdel)
-			ap->ip->i_delayed_blks -= args.len;
 		if (ap->flags & XFS_BMAPI_COWFORK) {
 			xfs_bmap_btalloc_cow(ap, &args);
 		} else {
 			ap->ip->i_d.di_nblocks += args.len;
 			xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+			if (ap->wasdel)
+				ap->ip->i_delayed_blks -= args.len;
 			/*
 			 * Adjust the disk quota also. This was reserved
 			 * earlier.
@@ -4116,7 +4117,10 @@  xfs_bmapi_reserve_delalloc(
 		goto out_unreserve_blocks;
 
 
-	ip->i_delayed_blks += alen;
+	if (whichfork == XFS_COW_FORK)
+		ip->i_cow_blocks += alen;
+	else
+		ip->i_delayed_blks += alen;
 
 	got->br_startoff = aoff;
 	got->br_startblock = nullstartblock(indlen);
@@ -4859,7 +4863,10 @@  xfs_bmap_del_extent_delay(
 			isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
 	if (error)
 		return error;
-	ip->i_delayed_blks -= del->br_blockcount;
+	if (whichfork == XFS_COW_FORK)
+		ip->i_cow_blocks -= del->br_blockcount;
+	else
+		ip->i_delayed_blks -= del->br_blockcount;
 
 	if (got->br_startoff == del->br_startoff)
 		state |= BMAP_LEFT_FILLING;
@@ -5010,6 +5017,7 @@  xfs_bmap_del_extent_cow(
 	}
 
 	/* Remove the quota reservation */
+	ip->i_cow_blocks -= del->br_blockcount;
 	error = xfs_trans_reserve_quota_nblks(NULL, ip,
 			-(long)del->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
 	ASSERT(error == 0);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 4035b5d..6e9dcdb 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -624,6 +624,7 @@  xfs_iread(
 
 	ASSERT(ip->i_d.di_version >= 2);
 	ip->i_delayed_blks = 0;
+	ip->i_cow_blocks = 0;
 
 	/*
 	 * Mark the buffer containing the inode as something to keep
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 6d37ab4..c572789 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1991,6 +1991,7 @@  xfs_swap_extents(
 	/* Swap the cow forks. */
 	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
 		xfs_extnum_t	extnum;
+		unsigned int	cowblocks;
 
 		ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
 		ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
@@ -2011,6 +2012,10 @@  xfs_swap_extents(
 			xfs_inode_set_cowblocks_tag(tip);
 		else
 			xfs_inode_clear_cowblocks_tag(tip);
+
+		cowblocks = tip->i_cow_blocks;
+		tip->i_cow_blocks = ip->i_cow_blocks;
+		ip->i_cow_blocks = cowblocks;
 	}
 
 	xfs_trans_log_inode(tp, ip,  src_log_flags);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 2da7a2e..1344206 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -80,6 +80,7 @@  xfs_inode_alloc(
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
 	ip->i_delayed_blks = 0;
+	ip->i_cow_blocks = 0;
 	memset(&ip->i_d, 0, sizeof(ip->i_d));
 
 	return ip;
@@ -1668,7 +1669,7 @@  xfs_prep_free_cowblocks(
 	 * Just clear the tag if we have an empty cow fork or none at all. It's
 	 * possible the inode was fully unshared since it was originally tagged.
 	 */
-	if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) {
+	if (!xfs_is_reflink_inode(ip) || ip->i_cow_blocks == 0) {
 		trace_xfs_inode_free_cowblocks_invalid(ip);
 		xfs_inode_clear_cowblocks_tag(ip);
 		return false;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4a38cfc..a208825 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1508,15 +1508,13 @@  xfs_itruncate_clear_reflink_flags(
 	struct xfs_inode	*ip)
 {
 	struct xfs_ifork	*dfork;
-	struct xfs_ifork	*cfork;
 
 	if (!xfs_is_reflink_inode(ip))
 		return;
 	dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
-	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
+	if (dfork->if_bytes == 0 && ip->i_cow_blocks == 0)
 		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
-	if (cfork->if_bytes == 0)
+	if (ip->i_cow_blocks == 0)
 		xfs_inode_clear_cowblocks_tag(ip);
 }
 
@@ -1669,7 +1667,7 @@  xfs_release(
 		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 		if (truncated) {
 			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-			if (ip->i_delayed_blks > 0) {
+			if (ip->i_delayed_blks > 0 || ip->i_cow_blocks > 0) {
 				error = filemap_flush(VFS_I(ip)->i_mapping);
 				if (error)
 					return error;
@@ -1909,7 +1907,8 @@  xfs_inactive(
 
 	if (S_ISREG(VFS_I(ip)->i_mode) &&
 	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
-	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0 ||
+	     ip->i_cow_blocks > 0))
 		truncate = 1;
 
 	error = xfs_qm_dqattach(ip, 0);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff56486..6feee8a 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -62,6 +62,7 @@  typedef struct xfs_inode {
 	/* Miscellaneous state. */
 	unsigned long		i_flags;	/* see defined flags below */
 	unsigned int		i_delayed_blks;	/* count of delay alloc blks */
+	unsigned int		i_cow_blocks;	/* count of cow fork blocks */
 
 	struct xfs_icdinode	i_d;		/* most of ondisk inode */
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 56475fc..6c3381c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -513,7 +513,8 @@  xfs_vn_getattr(
 	stat->mtime = inode->i_mtime;
 	stat->ctime = inode->i_ctime;
 	stat->blocks =
-		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks +
+				  ip->i_cow_blocks);
 
 	if (ip->i_d.di_version == 3) {
 		if (request_mask & STATX_BTIME) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index d583105..412d7eb 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -122,7 +122,8 @@  xfs_bulkstat_one_int(
 	case XFS_DINODE_FMT_BTREE:
 		buf->bs_rdev = 0;
 		buf->bs_blksize = mp->m_sb.sb_blocksize;
-		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
+		buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks +
+				 ip->i_cow_blocks;
 		break;
 	}
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 5b848f4..28f12f8 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1847,7 +1847,7 @@  xfs_qm_vop_chown_reserve(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-	delblks = ip->i_delayed_blks;
+	delblks = ip->i_delayed_blks + ip->i_cow_blocks;
 	blkflags = XFS_IS_REALTIME_INODE(ip) ?
 			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e367351..f875ea7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -619,7 +619,7 @@  xfs_reflink_cancel_cow_blocks(
 	}
 
 	/* clear tag if cow fork is emptied */
-	if (!ifp->if_bytes)
+	if (ip->i_cow_blocks == 0)
 		xfs_inode_clear_cowblocks_tag(ip);
 
 	return error;
@@ -704,7 +704,7 @@  xfs_reflink_end_cow(
 	trace_xfs_reflink_end_cow(ip, offset, count);
 
 	/* No COW extents?  That's easy! */
-	if (ifp->if_bytes == 0)
+	if (ip->i_cow_blocks == 0)
 		return 0;
 
 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f3e0001..9d04cfb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -989,6 +989,7 @@  xfs_fs_destroy_inode(
 	xfs_inactive(ip);
 
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_cow_blocks == 0);
 	XFS_STATS_INC(ip->i_mount, vn_reclaim);
 
 	/*