diff mbox series

[V3,09/12] xfs: Enable bulkstat ioctl to support 64-bit per-inode extent counters

Message ID 20210916100647.176018-10-chandan.babu@oracle.com (mailing list archive)
State Superseded, archived
Headers show
Series xfs: Extend per-inode extent counters | expand

Commit Message

Chandan Babu R Sept. 16, 2021, 10:06 a.m. UTC
The following changes are made to enable userspace to obtain 64-bit extent
counters:
1. To hold 64-bit extent counters, carve out the new 64-bit field
   xfs_bulkstat->bs_extents64 from xfs_bulkstat->bs_pad[].
2. Carve out a new 64-bit field xfs_bulk_ireq->bulkstat_flags from
   xfs_bulk_ireq->reserved[] to hold bulkstat specific operational flags.  As of
   this commit, XFS_BULK_IREQ_BULKSTAT_NREXT64 is the only valid flag that this
   field can hold. It indicates that userspace has the necessary infrastructure
   to receive 64-bit extent counters.
3. Define the new flag XFS_BULK_IREQ_BULKSTAT for userspace to indicate that
   xfs_bulk_ireq->bulkstat_flags has valid flags set.

Suggested-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
---
 fs/xfs/libxfs/xfs_fs.h | 19 ++++++++++++++-----
 fs/xfs/xfs_ioctl.c     |  7 +++++++
 fs/xfs/xfs_itable.c    | 25 +++++++++++++++++++++++--
 fs/xfs/xfs_itable.h    |  2 ++
 fs/xfs/xfs_iwalk.h     |  7 +++++--
 5 files changed, 51 insertions(+), 9 deletions(-)

Comments

Dave Chinner Sept. 27, 2021, 11:06 p.m. UTC | #1
On Thu, Sep 16, 2021 at 03:36:44PM +0530, Chandan Babu R wrote:
> The following changes are made to enable userspace to obtain 64-bit extent
> counters,
> 1. To hold 64-bit extent counters, carve out the new 64-bit field
>    xfs_bulkstat->bs_extents64 from xfs_bulkstat->bs_pad[].
> 2. Carve out a new 64-bit field xfs_bulk_ireq->bulkstat_flags from
>    xfs_bulk_ireq->reserved[] to hold bulkstat specific operational flags.  As of
>    this commit, XFS_IBULK_NREXT64 is the only valid flag that this field can
>    hold. It indicates that userspace has the necessary infrastructure to
>    receive 64-bit extent counters.
> 3. Define the new flag XFS_BULK_IREQ_BULKSTAT for userspace to indicate that
>    xfs_bulk_ireq->bulkstat_flags has valid flags set.

This seems unnecessarily complex. It adds a new flag to signal that a new
flag field in the same structure is valid, and then defines a new flag in
that new flag field to select the new behaviour.

Why can't this be done with just a single new flag in the existing
flags field?

> Suggested-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
> ---
>  fs/xfs/libxfs/xfs_fs.h | 19 ++++++++++++++-----
>  fs/xfs/xfs_ioctl.c     |  7 +++++++
>  fs/xfs/xfs_itable.c    | 25 +++++++++++++++++++++++--
>  fs/xfs/xfs_itable.h    |  2 ++
>  fs/xfs/xfs_iwalk.h     |  7 +++++--
>  5 files changed, 51 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
> index 2594fb647384..b76906914d89 100644
> --- a/fs/xfs/libxfs/xfs_fs.h
> +++ b/fs/xfs/libxfs/xfs_fs.h
> @@ -394,7 +394,7 @@ struct xfs_bulkstat {
>  	uint32_t	bs_extsize_blks; /* extent size hint, blocks	*/
>  
>  	uint32_t	bs_nlink;	/* number of links		*/
> -	uint32_t	bs_extents;	/* number of extents		*/
> +	uint32_t	bs_extents32;	/* 32-bit data fork extent counter */
>  	uint32_t	bs_aextents;	/* attribute number of extents	*/
>  	uint16_t	bs_version;	/* structure version		*/
>  	uint16_t	bs_forkoff;	/* inode fork offset in bytes	*/

I don't think renaming structure members is a good idea - it breaks
the user API and forces applications to require source level
modifications just to compile on both old and new xfsprogs installs.

> @@ -403,8 +403,9 @@ struct xfs_bulkstat {
>  	uint16_t	bs_checked;	/* checked inode metadata	*/
>  	uint16_t	bs_mode;	/* type and mode		*/
>  	uint16_t	bs_pad2;	/* zeroed			*/
> +	uint64_t	bs_extents64;	/* 64-bit data fork extent counter */
>  
> -	uint64_t	bs_pad[7];	/* zeroed			*/
> +	uint64_t	bs_pad[6];	/* zeroed			*/
>  };
>  
>  #define XFS_BULKSTAT_VERSION_V1	(1)
> @@ -469,7 +470,8 @@ struct xfs_bulk_ireq {
>  	uint32_t	icount;		/* I: count of entries in buffer */
>  	uint32_t	ocount;		/* O: count of entries filled out */
>  	uint32_t	agno;		/* I: see comment for IREQ_AGNO	*/
> -	uint64_t	reserved[5];	/* must be zero			*/
> +	uint64_t	bulkstat_flags; /* I: Bulkstat operation flags */
> +	uint64_t	reserved[4];	/* must be zero			*/
>  };
>  
>  /*
> @@ -492,9 +494,16 @@ struct xfs_bulk_ireq {
>   */
>  #define XFS_BULK_IREQ_METADIR	(1 << 2)
>  
> -#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO | \
> +#define XFS_BULK_IREQ_BULKSTAT	(1 << 3)
> +
> +#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO |	 \
>  				 XFS_BULK_IREQ_SPECIAL | \
> -				 XFS_BULK_IREQ_METADIR)
> +				 XFS_BULK_IREQ_METADIR | \
> +				 XFS_BULK_IREQ_BULKSTAT)

What's this XFS_BULK_IREQ_METADIR thing? I haven't noticed that when
scanning any recent proposed patch series....

> +#define XFS_BULK_IREQ_BULKSTAT_NREXT64 (1 << 0)
> +
> +#define XFS_BULK_IREQ_BULKSTAT_FLAGS_ALL (XFS_BULK_IREQ_BULKSTAT_NREXT64)

As per above, this seems unnecessarily complex.

> @@ -134,7 +136,26 @@ xfs_bulkstat_one_int(
>  
>  	buf->bs_xflags = xfs_ip2xflags(ip);
>  	buf->bs_extsize_blks = ip->i_extsize;
> -	buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
> +
> +	nextents = xfs_ifork_nextents(&ip->i_df);
> +	if (!(bc->breq->flags & XFS_IBULK_NREXT64)) {
> +		xfs_extnum_t max_nextents = XFS_IFORK_EXTCNT_MAXS32;
> +
> +		if (unlikely(XFS_TEST_ERROR(false, mp,
> +				XFS_ERRTAG_REDUCE_MAX_IEXTENTS)))
> +			max_nextents = 10;
> +
> +		if (nextents > max_nextents) {
> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> +			xfs_irele(ip);
> +			error = -EINVAL;
> +			goto out_advance;
> +		}

So we return an EINVAL error if any inode's extent count overflows the
32 bit counter? Why isn't this -EOVERFLOW?

> +		buf->bs_extents32 = nextents;
> +	} else {
> +		buf->bs_extents64 = nextents;
> +	}
> +
>  	xfs_bulkstat_health(ip, buf);
>  	buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
>  	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
> @@ -356,7 +377,7 @@ xfs_bulkstat_to_bstat(
>  	bs1->bs_blocks = bstat->bs_blocks;
>  	bs1->bs_xflags = bstat->bs_xflags;
>  	bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks);
> -	bs1->bs_extents = bstat->bs_extents;
> +	bs1->bs_extents = bstat->bs_extents32;
>  	bs1->bs_gen = bstat->bs_gen;
>  	bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF;
>  	bs1->bs_forkoff = bstat->bs_forkoff;
> diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
> index f5a13f69883a..f61685da3837 100644
> --- a/fs/xfs/xfs_itable.h
> +++ b/fs/xfs/xfs_itable.h
> @@ -22,6 +22,8 @@ struct xfs_ibulk {
>  /* Signal that we can return metadata directories. */
>  #define XFS_IBULK_METADIR	(XFS_IWALK_METADIR)
>  
> +#define XFS_IBULK_NREXT64	(XFS_IWALK_NREXT64)
> +
>  /*
>   * Advance the user buffer pointer by one record of the given size.  If the
>   * buffer is now full, return the appropriate error code.
> diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
> index d7a082e45cbf..27a6842a1bb5 100644
> --- a/fs/xfs/xfs_iwalk.h
> +++ b/fs/xfs/xfs_iwalk.h
> @@ -31,8 +31,11 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
>  /* Signal that we can return metadata directories. */
>  #define XFS_IWALK_METADIR	(0x2)
>  
> -#define XFS_IWALK_FLAGS_ALL	(XFS_IWALK_SAME_AG | \
> -				 XFS_IWALK_METADIR)
> +#define XFS_IWALK_NREXT64	(0x4)

Can we use '(1 << 2)' style notation for new bit field defines?

Cheers,

Dave.
Chandan Babu R Sept. 28, 2021, 9:49 a.m. UTC | #2
On 28 Sep 2021 at 04:36, Dave Chinner wrote:
> On Thu, Sep 16, 2021 at 03:36:44PM +0530, Chandan Babu R wrote:
>> The following changes are made to enable userspace to obtain 64-bit extent
>> counters,
>> 1. To hold 64-bit extent counters, carve out the new 64-bit field
>>    xfs_bulkstat->bs_extents64 from xfs_bulkstat->bs_pad[].
>> 2. Carve out a new 64-bit field xfs_bulk_ireq->bulkstat_flags from
>>    xfs_bulk_ireq->reserved[] to hold bulkstat specific operational flags.  As of
>>    this commit, XFS_IBULK_NREXT64 is the only valid flag that this field can
>>    hold. It indicates that userspace has the necessary infrastructure to
>>    receive 64-bit extent counters.
>> 3. Define the new flag XFS_BULK_IREQ_BULKSTAT for userspace to indicate that
>>    xfs_bulk_ireq->bulkstat_flags has valid flags set.
>
> This seems unnecessarily complex. It adds a new flag to define a new
> flag field in the same structure and then define a new and a new
> flag in the new flag field to define a new behaviour.
>
> Why can't this be done with just a single new flag in the existing
> flags field?
>

Yes, this can be implemented with just one flag. I will make the relevant
changes before posting the next version.

>> Suggested-by: Darrick J. Wong <djwong@kernel.org>
>> Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
>> ---
>>  fs/xfs/libxfs/xfs_fs.h | 19 ++++++++++++++-----
>>  fs/xfs/xfs_ioctl.c     |  7 +++++++
>>  fs/xfs/xfs_itable.c    | 25 +++++++++++++++++++++++--
>>  fs/xfs/xfs_itable.h    |  2 ++
>>  fs/xfs/xfs_iwalk.h     |  7 +++++--
>>  5 files changed, 51 insertions(+), 9 deletions(-)
>> 
>> diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
>> index 2594fb647384..b76906914d89 100644
>> --- a/fs/xfs/libxfs/xfs_fs.h
>> +++ b/fs/xfs/libxfs/xfs_fs.h
>> @@ -394,7 +394,7 @@ struct xfs_bulkstat {
>>  	uint32_t	bs_extsize_blks; /* extent size hint, blocks	*/
>>  
>>  	uint32_t	bs_nlink;	/* number of links		*/
>> -	uint32_t	bs_extents;	/* number of extents		*/
>> +	uint32_t	bs_extents32;	/* 32-bit data fork extent counter */
>>  	uint32_t	bs_aextents;	/* attribute number of extents	*/
>>  	uint16_t	bs_version;	/* structure version		*/
>>  	uint16_t	bs_forkoff;	/* inode fork offset in bytes	*/
>
> I don't think renaming structure members is a good idea - it breaks
> the user API and forces applications to require source level
> modifications just to compile on both old and new xfsprogs installs.
>

Ok. I will revert the rename.

>> @@ -403,8 +403,9 @@ struct xfs_bulkstat {
>>  	uint16_t	bs_checked;	/* checked inode metadata	*/
>>  	uint16_t	bs_mode;	/* type and mode		*/
>>  	uint16_t	bs_pad2;	/* zeroed			*/
>> +	uint64_t	bs_extents64;	/* 64-bit data fork extent counter */
>>  
>> -	uint64_t	bs_pad[7];	/* zeroed			*/
>> +	uint64_t	bs_pad[6];	/* zeroed			*/
>>  };
>>  
>>  #define XFS_BULKSTAT_VERSION_V1	(1)
>> @@ -469,7 +470,8 @@ struct xfs_bulk_ireq {
>>  	uint32_t	icount;		/* I: count of entries in buffer */
>>  	uint32_t	ocount;		/* O: count of entries filled out */
>>  	uint32_t	agno;		/* I: see comment for IREQ_AGNO	*/
>> -	uint64_t	reserved[5];	/* must be zero			*/
>> +	uint64_t	bulkstat_flags; /* I: Bulkstat operation flags */
>> +	uint64_t	reserved[4];	/* must be zero			*/
>>  };
>>  
>>  /*
>> @@ -492,9 +494,16 @@ struct xfs_bulk_ireq {
>>   */
>>  #define XFS_BULK_IREQ_METADIR	(1 << 2)
>>  
>> -#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO | \
>> +#define XFS_BULK_IREQ_BULKSTAT	(1 << 3)
>> +
>> +#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO |	 \
>>  				 XFS_BULK_IREQ_SPECIAL | \
>> -				 XFS_BULK_IREQ_METADIR)
>> +				 XFS_BULK_IREQ_METADIR | \
>> +				 XFS_BULK_IREQ_BULKSTAT)
>
> What's this XFS_BULK_IREQ_METADIR thing? I haven't noticed that when
> scanning any recent proposed patch series....
>

XFS_BULK_IREQ_METADIR is from Darrick's tree. His "Kill XFS_BTREE_MAXLEVELS"
patch series is based on his other patchsets. His recent "xfs: support dynamic
btree cursor height" patch series rebases only the required patchset on top of
the v5.15-rc1 kernel, eliminating the others.

>> +#define XFS_BULK_IREQ_BULKSTAT_NREXT64 (1 << 0)
>> +
>> +#define XFS_BULK_IREQ_BULKSTAT_FLAGS_ALL (XFS_BULK_IREQ_BULKSTAT_NREXT64)
>
> As per above, this seems unnecessarily complex.
>
>> @@ -134,7 +136,26 @@ xfs_bulkstat_one_int(
>>  
>>  	buf->bs_xflags = xfs_ip2xflags(ip);
>>  	buf->bs_extsize_blks = ip->i_extsize;
>> -	buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
>> +
>> +	nextents = xfs_ifork_nextents(&ip->i_df);
>> +	if (!(bc->breq->flags & XFS_IBULK_NREXT64)) {
>> +		xfs_extnum_t max_nextents = XFS_IFORK_EXTCNT_MAXS32;
>> +
>> +		if (unlikely(XFS_TEST_ERROR(false, mp,
>> +				XFS_ERRTAG_REDUCE_MAX_IEXTENTS)))
>> +			max_nextents = 10;
>> +
>> +		if (nextents > max_nextents) {
>> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
>> +			xfs_irele(ip);
>> +			error = -EINVAL;
>> +			goto out_advance;
>> +		}
>
> So we return an EINVAL error if any extent overflows the 32 bit
> counter? Why isn't this -EOVERFLOW?
>

Returning -EINVAL causes xfs_bulkstat_iwalk() to skip inodes whose extent
count is larger than that which can be fitted into a 32-bit field. Returning
-EOVERFLOW causes the bulkstat ioctl to stop reporting remaining inodes.

>> +		buf->bs_extents32 = nextents;
>> +	} else {
>> +		buf->bs_extents64 = nextents;
>> +	}
>> +
>>  	xfs_bulkstat_health(ip, buf);
>>  	buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
>>  	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
>> @@ -356,7 +377,7 @@ xfs_bulkstat_to_bstat(
>>  	bs1->bs_blocks = bstat->bs_blocks;
>>  	bs1->bs_xflags = bstat->bs_xflags;
>>  	bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks);
>> -	bs1->bs_extents = bstat->bs_extents;
>> +	bs1->bs_extents = bstat->bs_extents32;
>>  	bs1->bs_gen = bstat->bs_gen;
>>  	bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF;
>>  	bs1->bs_forkoff = bstat->bs_forkoff;
>> diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
>> index f5a13f69883a..f61685da3837 100644
>> --- a/fs/xfs/xfs_itable.h
>> +++ b/fs/xfs/xfs_itable.h
>> @@ -22,6 +22,8 @@ struct xfs_ibulk {
>>  /* Signal that we can return metadata directories. */
>>  #define XFS_IBULK_METADIR	(XFS_IWALK_METADIR)
>>  
>> +#define XFS_IBULK_NREXT64	(XFS_IWALK_NREXT64)
>> +
>>  /*
>>   * Advance the user buffer pointer by one record of the given size.  If the
>>   * buffer is now full, return the appropriate error code.
>> diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
>> index d7a082e45cbf..27a6842a1bb5 100644
>> --- a/fs/xfs/xfs_iwalk.h
>> +++ b/fs/xfs/xfs_iwalk.h
>> @@ -31,8 +31,11 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
>>  /* Signal that we can return metadata directories. */
>>  #define XFS_IWALK_METADIR	(0x2)
>>  
>> -#define XFS_IWALK_FLAGS_ALL	(XFS_IWALK_SAME_AG | \
>> -				 XFS_IWALK_METADIR)
>> +#define XFS_IWALK_NREXT64	(0x4)
>
> Can we use '(1 << 2)' style notation for new bit field defines?

Sure, I will change this.
Dave Chinner Sept. 28, 2021, 11:39 p.m. UTC | #3
On Tue, Sep 28, 2021 at 03:19:29PM +0530, Chandan Babu R wrote:
> On 28 Sep 2021 at 04:36, Dave Chinner wrote:
> > On Thu, Sep 16, 2021 at 03:36:44PM +0530, Chandan Babu R wrote:
> >> @@ -492,9 +494,16 @@ struct xfs_bulk_ireq {
> >>   */
> >>  #define XFS_BULK_IREQ_METADIR	(1 << 2)
> >>  
> >> -#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO | \
> >> +#define XFS_BULK_IREQ_BULKSTAT	(1 << 3)
> >> +
> >> +#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO |	 \
> >>  				 XFS_BULK_IREQ_SPECIAL | \
> >> -				 XFS_BULK_IREQ_METADIR)
> >> +				 XFS_BULK_IREQ_METADIR | \
> >> +				 XFS_BULK_IREQ_BULKSTAT)
> >
> > What's this XFS_BULK_IREQ_METADIR thing? I haven't noticed that when
> > scanning any recent proposed patch series....
> >
> 
> XFS_BULK_IREQ_METADIR is from Darrick's tree. His "Kill XFS_BTREE_MAXLEVELS"
> patch series is based on his other patchsets. His recent "xfs: support dynamic
> btree cursor height" patch series rebases only the required patchset on top of
> v5.15-rc1 kernel eliminating the others.

OK, so how much testing has this had on just a straight v5.15-rcX
kernel?

> >> @@ -134,7 +136,26 @@ xfs_bulkstat_one_int(
> >>  
> >>  	buf->bs_xflags = xfs_ip2xflags(ip);
> >>  	buf->bs_extsize_blks = ip->i_extsize;
> >> -	buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
> >> +
> >> +	nextents = xfs_ifork_nextents(&ip->i_df);
> >> +	if (!(bc->breq->flags & XFS_IBULK_NREXT64)) {
> >> +		xfs_extnum_t max_nextents = XFS_IFORK_EXTCNT_MAXS32;
> >> +
> >> +		if (unlikely(XFS_TEST_ERROR(false, mp,
> >> +				XFS_ERRTAG_REDUCE_MAX_IEXTENTS)))
> >> +			max_nextents = 10;
> >> +
> >> +		if (nextents > max_nextents) {
> >> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
> >> +			xfs_irele(ip);
> >> +			error = -EINVAL;
> >> +			goto out_advance;
> >> +		}
> >
> > So we return an EINVAL error if any extent overflows the 32 bit
> > counter? Why isn't this -EOVERFLOW?
> >
> 
> Returning -EINVAL causes xfs_bulkstat_iwalk() to skip inodes whose extent
> count is larger than that which can be fitted into a 32-bit field. Returning
> -EOVERFLOW causes the bulkstat ioctl to stop reporting remaining inodes.

Ok, that's a bad behaviour we need to fix because it will cause
things like old versions of xfs_dump to miss inodes that
have overflowing extent counts. i.e. it will cause incomplete
backups, and the failure will likely be silent.

I asked about -EOVERFLOW because that's what stat() returns when an
inode attribute value doesn't fit in the stat_buf field (e.g. 64 bit
inode number on 32 bit kernel), and if we are overflowing the
bulkstat field then we really should be telling userspace that an
overflow occurred.

/me has a sudden realisation that the xfs_dump format may not
support large extents counts and goes looking...

Yeah, xfsdump doesn't support extent counts greater than 2^32. So
that means we really do need -EOVERFLOW errors here.  i.e., if we get
an extent count overflow with a !(bc->breq->flags &
XFS_IBULK_NREXT64) bulkstat walk, xfs_dump needs bulkstat to fill
out the inode with the overflow with all the fields that aren't
overflowed, then error out with -EOVERFLOW.

Bulkstat itself should not silently skip the inode because it would
overflow a field in struct xfs_bstat - the decision of what to
do with the overflow is something xfsdump needs to handle, not the
kernel.  Hence we need to return -EOVERFLOW here so that userspace
can decide what to do with an inode it can't handle...

Cheers,

Dave.
Chandan Babu R Sept. 29, 2021, 5:04 p.m. UTC | #4
On 29 Sep 2021 at 05:09, Dave Chinner wrote:
> On Tue, Sep 28, 2021 at 03:19:29PM +0530, Chandan Babu R wrote:
>> On 28 Sep 2021 at 04:36, Dave Chinner wrote:
>> > On Thu, Sep 16, 2021 at 03:36:44PM +0530, Chandan Babu R wrote:
>> >> @@ -492,9 +494,16 @@ struct xfs_bulk_ireq {
>> >>   */
>> >>  #define XFS_BULK_IREQ_METADIR	(1 << 2)
>> >>  
>> >> -#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO | \
>> >> +#define XFS_BULK_IREQ_BULKSTAT	(1 << 3)
>> >> +
>> >> +#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO |	 \
>> >>  				 XFS_BULK_IREQ_SPECIAL | \
>> >> -				 XFS_BULK_IREQ_METADIR)
>> >> +				 XFS_BULK_IREQ_METADIR | \
>> >> +				 XFS_BULK_IREQ_BULKSTAT)
>> >
>> > What's this XFS_BULK_IREQ_METADIR thing? I haven't noticed that when
>> > scanning any recent proposed patch series....
>> >
>> 
>> XFS_BULK_IREQ_METADIR is from Darrick's tree. His "Kill XFS_BTREE_MAXLEVELS"
>> patch series is based on his other patchsets. His recent "xfs: support dynamic
>> btree cursor height" patch series rebases only the required patchset on top of
>> v5.15-rc1 kernel eliminating the others.
>
> OK, so how much testing has this had on just a straight v5.15-rcX
> kernel?
>

I haven't tested this patchset on v5.15-rcX yet. I will have to rebase my
patchset on top of Darrick's patchset and would also require xfsprogs' version
of "xfs: support dynamic btree cursor height".

>> >> @@ -134,7 +136,26 @@ xfs_bulkstat_one_int(
>> >>  
>> >>  	buf->bs_xflags = xfs_ip2xflags(ip);
>> >>  	buf->bs_extsize_blks = ip->i_extsize;
>> >> -	buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
>> >> +
>> >> +	nextents = xfs_ifork_nextents(&ip->i_df);
>> >> +	if (!(bc->breq->flags & XFS_IBULK_NREXT64)) {
>> >> +		xfs_extnum_t max_nextents = XFS_IFORK_EXTCNT_MAXS32;
>> >> +
>> >> +		if (unlikely(XFS_TEST_ERROR(false, mp,
>> >> +				XFS_ERRTAG_REDUCE_MAX_IEXTENTS)))
>> >> +			max_nextents = 10;
>> >> +
>> >> +		if (nextents > max_nextents) {
>> >> +			xfs_iunlock(ip, XFS_ILOCK_SHARED);
>> >> +			xfs_irele(ip);
>> >> +			error = -EINVAL;
>> >> +			goto out_advance;
>> >> +		}
>> >
>> > So we return an EINVAL error if any extent overflows the 32 bit
>> > counter? Why isn't this -EOVERFLOW?
>> >
>> 
>> Returning -EINVAL causes xfs_bulkstat_iwalk() to skip inodes whose extent
>> count is larger than that which can be fitted into a 32-bit field. Returning
>> -EOVERFLOW causes the bulkstat ioctl to stop reporting remaining inodes.
>
> Ok, that's a bad behaviour we need to fix because it will cause
> things like old versions of xfs_dump to miss inodes that
> have overflowing extent counts. i.e. it will cause incomplete
> backups, and the failure will likely be silent.
>
> I asked about -EOVERFLOW because that's what stat() returns when an
> inode attribute value doesn't fit in the stat_buf field (e.g. 64 bit
> inode number on 32 bit kernel), and if we are overflowing the
> bulkstat field then we really should be telling userspace that an
> overflow occurred.
>
> /me has a sudden realisation that the xfs_dump format may not
> support large extents counts and goes looking...
>
> Yeah, xfsdump doesn't support extent counts greater than 2^32. So
> that means we really do need -EOVERFLOW errors here.  i.e, if we get
> an extent count overflow with a !(bc->breq->flags &
> XFS_IBULK_NREXT64) bulkstat walk, xfs_dump needs bulkstat to fill
> out the inode with the overflow with all the fileds that aren't
> overflowed, then error out with -EOVERFLOW.
>
> Bulkstat itself should not silently skip the inode because it would
> overflow a field in the struct xfs-bstat - the decision of what to
> do with the overflow is something xfsdump needs to handle, not the
> kernel.  Hence we need to return -EOVERFLOW here so that userspace
> can decide what to do with an inode it can't handle...
>

Ok. I had never thought of the xfsdump use case. I will fix this issue as
well.

I guess adding the ability for xfsdump to work with 64-bit extent counters can
be done after I address all the issues pointed out with the current patchset.

Thanks a lot for reviewing this patchset.
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2594fb647384..b76906914d89 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -394,7 +394,7 @@  struct xfs_bulkstat {
 	uint32_t	bs_extsize_blks; /* extent size hint, blocks	*/
 
 	uint32_t	bs_nlink;	/* number of links		*/
-	uint32_t	bs_extents;	/* number of extents		*/
+	uint32_t	bs_extents32;	/* 32-bit data fork extent counter */
 	uint32_t	bs_aextents;	/* attribute number of extents	*/
 	uint16_t	bs_version;	/* structure version		*/
 	uint16_t	bs_forkoff;	/* inode fork offset in bytes	*/
@@ -403,8 +403,9 @@  struct xfs_bulkstat {
 	uint16_t	bs_checked;	/* checked inode metadata	*/
 	uint16_t	bs_mode;	/* type and mode		*/
 	uint16_t	bs_pad2;	/* zeroed			*/
+	uint64_t	bs_extents64;	/* 64-bit data fork extent counter */
 
-	uint64_t	bs_pad[7];	/* zeroed			*/
+	uint64_t	bs_pad[6];	/* zeroed			*/
 };
 
 #define XFS_BULKSTAT_VERSION_V1	(1)
@@ -469,7 +470,8 @@  struct xfs_bulk_ireq {
 	uint32_t	icount;		/* I: count of entries in buffer */
 	uint32_t	ocount;		/* O: count of entries filled out */
 	uint32_t	agno;		/* I: see comment for IREQ_AGNO	*/
-	uint64_t	reserved[5];	/* must be zero			*/
+	uint64_t	bulkstat_flags; /* I: Bulkstat operation flags */
+	uint64_t	reserved[4];	/* must be zero			*/
 };
 
 /*
@@ -492,9 +494,16 @@  struct xfs_bulk_ireq {
  */
 #define XFS_BULK_IREQ_METADIR	(1 << 2)
 
-#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO | \
+#define XFS_BULK_IREQ_BULKSTAT	(1 << 3)
+
+#define XFS_BULK_IREQ_FLAGS_ALL	(XFS_BULK_IREQ_AGNO |	 \
 				 XFS_BULK_IREQ_SPECIAL | \
-				 XFS_BULK_IREQ_METADIR)
+				 XFS_BULK_IREQ_METADIR | \
+				 XFS_BULK_IREQ_BULKSTAT)
+
+#define XFS_BULK_IREQ_BULKSTAT_NREXT64 (1 << 0)
+
+#define XFS_BULK_IREQ_BULKSTAT_FLAGS_ALL (XFS_BULK_IREQ_BULKSTAT_NREXT64)
 
 /* Operate on the root directory inode. */
 #define XFS_BULK_IREQ_SPECIAL_ROOT	(1)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 4077862fa806..207c96bbc729 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -839,6 +839,10 @@  xfs_bulk_ireq_setup(
 {
 	if (hdr->icount == 0 ||
 	    (hdr->flags & ~XFS_BULK_IREQ_FLAGS_ALL) ||
+	    ((hdr->flags & XFS_BULK_IREQ_BULKSTAT) &&
+	     (hdr->bulkstat_flags & ~XFS_BULK_IREQ_BULKSTAT_FLAGS_ALL)) ||
+	    (!(hdr->flags & XFS_BULK_IREQ_BULKSTAT) &&
+	     (hdr->bulkstat_flags != 0)) ||
 	    memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
 		return -EINVAL;
 
@@ -897,6 +901,9 @@  xfs_bulk_ireq_setup(
 	if (hdr->flags & XFS_BULK_IREQ_METADIR)
 		breq->flags |= XFS_IWALK_METADIR;
 
+	if (hdr->flags & XFS_BULK_IREQ_BULKSTAT)
+		if (hdr->bulkstat_flags & XFS_BULK_IREQ_BULKSTAT_NREXT64)
+			breq->flags |= XFS_IBULK_NREXT64;
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f92057ad686b..5dce090f8f65 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -20,6 +20,7 @@ 
 #include "xfs_icache.h"
 #include "xfs_health.h"
 #include "xfs_trans.h"
+#include "xfs_errortag.h"
 
 /*
  * Bulk Stat
@@ -74,6 +75,7 @@  xfs_bulkstat_one_int(
 	struct xfs_inode	*ip;		/* incore inode pointer */
 	struct inode		*inode;
 	struct xfs_bulkstat	*buf = bc->buf;
+	xfs_extnum_t		nextents;
 	int			error = -EINVAL;
 
 	error = xfs_iget(mp, tp, ino,
@@ -134,7 +136,26 @@  xfs_bulkstat_one_int(
 
 	buf->bs_xflags = xfs_ip2xflags(ip);
 	buf->bs_extsize_blks = ip->i_extsize;
-	buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
+
+	nextents = xfs_ifork_nextents(&ip->i_df);
+	if (!(bc->breq->flags & XFS_IBULK_NREXT64)) {
+		xfs_extnum_t max_nextents = XFS_IFORK_EXTCNT_MAXS32;
+
+		if (unlikely(XFS_TEST_ERROR(false, mp,
+				XFS_ERRTAG_REDUCE_MAX_IEXTENTS)))
+			max_nextents = 10;
+
+		if (nextents > max_nextents) {
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			xfs_irele(ip);
+			error = -EINVAL;
+			goto out_advance;
+		}
+		buf->bs_extents32 = nextents;
+	} else {
+		buf->bs_extents64 = nextents;
+	}
+
 	xfs_bulkstat_health(ip, buf);
 	buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
 	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
@@ -356,7 +377,7 @@  xfs_bulkstat_to_bstat(
 	bs1->bs_blocks = bstat->bs_blocks;
 	bs1->bs_xflags = bstat->bs_xflags;
 	bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks);
-	bs1->bs_extents = bstat->bs_extents;
+	bs1->bs_extents = bstat->bs_extents32;
 	bs1->bs_gen = bstat->bs_gen;
 	bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF;
 	bs1->bs_forkoff = bstat->bs_forkoff;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index f5a13f69883a..f61685da3837 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -22,6 +22,8 @@  struct xfs_ibulk {
 /* Signal that we can return metadata directories. */
 #define XFS_IBULK_METADIR	(XFS_IWALK_METADIR)
 
+#define XFS_IBULK_NREXT64	(XFS_IWALK_NREXT64)
+
 /*
  * Advance the user buffer pointer by one record of the given size.  If the
  * buffer is now full, return the appropriate error code.
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index d7a082e45cbf..27a6842a1bb5 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -31,8 +31,11 @@  int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
 /* Signal that we can return metadata directories. */
 #define XFS_IWALK_METADIR	(0x2)
 
-#define XFS_IWALK_FLAGS_ALL	(XFS_IWALK_SAME_AG | \
-				 XFS_IWALK_METADIR)
+#define XFS_IWALK_NREXT64	(0x4)
+
+#define XFS_IWALK_FLAGS_ALL	(XFS_IWALK_SAME_AG |	\
+				 XFS_IWALK_METADIR |	\
+				 XFS_IWALK_NREXT64)
 
 /* Walk all inode btree records in the filesystem starting from @startino. */
 typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,