[RFC] xfs: extended timestamp range
diff mbox series

Message ID 20191111213630.14680-1-amir73il@gmail.com
State New
Headers show
Series
  • [RFC] xfs: extended timestamp range
Related show

Commit Message

Amir Goldstein Nov. 11, 2019, 9:36 p.m. UTC
Use similar on-disk encoding for timestamps as ext4 to push back
the y2038 deadline to 2446.

The encoding uses the 2 free MSB in 32bit nsec field to extend the
seconds field storage size to 34bit.

Those 2 bits should be zero on existing xfs inodes, so the extended
timestamp range feature is declared read-only compatible with old
on-disk format.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---

Hi,

This is a very lightly tested RFC patch.
Naming and splitting to patches aside, I'd like to know what folks think
about this direction for fixing y2038 xfs support.

With XFS_TIMESTAMP_DEBUG defined, it provides correct output for test
generic/402 (timestamp range test), when matching xfs expected timestamp
ranges to those of ext4 (_filesystem_timestamp_range).
And then the test (naturally) fails on corrupted fs check.

If this direction is acceptable, I will proceed with patching xfsprogs.

I'd also like to hear your thoughts about migration process.
Should the new feature be ro_compat as I defined it or incompat?
In pricipal, all user would need to do is set the feature flag, but
I am not aware of any precedent of a similar format upgrade in xfs.

Thanks,
Amir.

 fs/xfs/libxfs/xfs_format.h      |  14 ++-
 fs/xfs/libxfs/xfs_inode_buf.c   |  36 ++++----
 fs/xfs/libxfs/xfs_log_format.h  |   4 +-
 fs/xfs/libxfs/xfs_timestamp.h   | 151 ++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_trans_inode.c |   7 +-
 fs/xfs/scrub/inode.c            |  11 ++-
 fs/xfs/xfs_inode.c              |   4 +-
 fs/xfs/xfs_inode_item.c         |  13 ++-
 fs/xfs/xfs_iops.c               |   5 +-
 fs/xfs/xfs_itable.c             |   3 +-
 fs/xfs/xfs_super.c              |   5 +-
 11 files changed, 208 insertions(+), 45 deletions(-)
 create mode 100644 fs/xfs/libxfs/xfs_timestamp.h

Comments

Darrick J. Wong Nov. 11, 2019, 10:35 p.m. UTC | #1
On Mon, Nov 11, 2019 at 11:36:30PM +0200, Amir Goldstein wrote:
> Use similar on-disk encoding for timestamps as ext4 to push back
> the y2038 deadline to 2446.
> 
> The encoding uses the 2 free MSB in 32bit nsec field to extend the
> seconds field storage size to 34bit.
> 
> Those 2 bits should be zero on existing xfs inodes, so the extended
> timestamp range feature is declared read-only compatible with old
> on-disk format.

What do you think about making the timestamp field a uint64_t counting
nanoseconds since Dec 14 09:15:53 UTC 1901 (a.k.a. the minimum datetime
we support with the existing encoding scheme)?  Instead of using the
upper 2 bits of the nsec field for an epoch encoding, which ext4 screwed
up years ago and has not fully fixed?

Also, please change struct xfs_inode.i_crtime to a timespec64 to match
the other three timestamps in struct inode...

...and following the usual xfs indenting style for all the new functions.

--D

> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
> 
> Hi,
> 
> This is a very lightly tested RFC patch.
> Naming and splitting to patches aside, I'd like to know what folks think
> about this direction for fixing y2038 xfs support.
> 
> With XFS_TIMESTAMP_DEBUG defined, it provides correct output for test
> generic/402 (timestamp range test), when matching xfs expected timestamp
> ranges to those of ext4 (_filesystem_timestamp_range).
> And then the test (naturally) fails on corrupted fs check.
> 
> If this direction is acceptable, I will proceed with patching xfsprogs.
> 
> I'd also like to hear your thoughts about migration process.
> Should the new feature be ro_compat as I defined it or incompat?
> In pricipal, all user would need to do is set the feature flag, but
> I am not aware of any precedent of a similar format upgrade in xfs.
> 
> Thanks,
> Amir.
> 
>  fs/xfs/libxfs/xfs_format.h      |  14 ++-
>  fs/xfs/libxfs/xfs_inode_buf.c   |  36 ++++----
>  fs/xfs/libxfs/xfs_log_format.h  |   4 +-
>  fs/xfs/libxfs/xfs_timestamp.h   | 151 ++++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_trans_inode.c |   7 +-
>  fs/xfs/scrub/inode.c            |  11 ++-
>  fs/xfs/xfs_inode.c              |   4 +-
>  fs/xfs/xfs_inode_item.c         |  13 ++-
>  fs/xfs/xfs_iops.c               |   5 +-
>  fs/xfs/xfs_itable.c             |   3 +-
>  fs/xfs/xfs_super.c              |   5 +-
>  11 files changed, 208 insertions(+), 45 deletions(-)
>  create mode 100644 fs/xfs/libxfs/xfs_timestamp.h
> 
> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> index c968b60cee15..227a775a9889 100644
> --- a/fs/xfs/libxfs/xfs_format.h
> +++ b/fs/xfs/libxfs/xfs_format.h
> @@ -449,10 +449,12 @@ xfs_sb_has_compat_feature(
>  #define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)		/* free inode btree */
>  #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
>  #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
> +#define XFS_SB_FEAT_RO_COMPAT_EXTTIME  (1 << 3)		/* extended time_max */
>  #define XFS_SB_FEAT_RO_COMPAT_ALL \
>  		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
>  		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
> -		 XFS_SB_FEAT_RO_COMPAT_REFLINK)
> +		 XFS_SB_FEAT_RO_COMPAT_REFLINK | \
> +		 XFS_SB_FEAT_RO_COMPAT_EXTTIME)
>  #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
>  static inline bool
>  xfs_sb_has_ro_compat_feature(
> @@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
>  		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
>  }
>  
> +static inline bool xfs_sb_version_hasexttime(struct xfs_sb *sbp)
> +{
> +	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
> +		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_EXTTIME);
> +}
> +
>  /*
>   * end of superblock version macros
>   */
> @@ -824,8 +832,8 @@ typedef struct xfs_agfl {
>  		   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
>  
>  typedef struct xfs_timestamp {
> -	__be32		t_sec;		/* timestamp seconds */
> -	__be32		t_nsec;		/* timestamp nanoseconds */
> +	__be32	t_sec;		/* timestamp seconds */
> +	__be32	t_nsec_epoch;	/* timestamp nanoseconds | extra epoch */
>  } xfs_timestamp_t;
>  
>  /*
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> index 28ab3c5255e1..aaf411da6263 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.c
> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> @@ -17,6 +17,7 @@
>  #include "xfs_trans.h"
>  #include "xfs_ialloc.h"
>  #include "xfs_dir2.h"
> +#include "xfs_timestamp.h"
>  
>  #include <linux/iversion.h>
>  
> @@ -204,6 +205,7 @@ xfs_inode_from_disk(
>  {
>  	struct xfs_icdinode	*to = &ip->i_d;
>  	struct inode		*inode = VFS_I(ip);
> +	struct xfs_sb		*sbp = &ip->i_mount->m_sb;
>  
>  
>  	/*
> @@ -233,12 +235,9 @@ xfs_inode_from_disk(
>  	 * a time before epoch is converted to a time long after epoch
>  	 * on 64 bit systems.
>  	 */
> -	inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
> -	inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
> -	inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
> -	inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
> -	inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
> -	inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
> +	xfs_timestamp_di_decode(sbp, &inode->i_atime, &from->di_atime);
> +	xfs_timestamp_di_decode(sbp, &inode->i_mtime, &from->di_mtime);
> +	xfs_timestamp_di_decode(sbp, &inode->i_ctime, &from->di_ctime);
>  	inode->i_generation = be32_to_cpu(from->di_gen);
>  	inode->i_mode = be16_to_cpu(from->di_mode);
>  
> @@ -257,7 +256,8 @@ xfs_inode_from_disk(
>  		inode_set_iversion_queried(inode,
>  					   be64_to_cpu(from->di_changecount));
>  		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
> -		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
> +		to->di_crtime.t_nsec_epoch =
> +			be32_to_cpu(from->di_crtime.t_nsec_epoch);
>  		to->di_flags2 = be64_to_cpu(from->di_flags2);
>  		to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
>  	}
> @@ -271,6 +271,7 @@ xfs_inode_to_disk(
>  {
>  	struct xfs_icdinode	*from = &ip->i_d;
>  	struct inode		*inode = VFS_I(ip);
> +	struct xfs_sb		*sbp = &ip->i_mount->m_sb;
>  
>  	to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
>  	to->di_onlink = 0;
> @@ -283,12 +284,9 @@ xfs_inode_to_disk(
>  	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
>  
>  	memset(to->di_pad, 0, sizeof(to->di_pad));
> -	to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
> -	to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
> -	to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
> -	to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
> -	to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
> -	to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
> +	xfs_timestamp_di_encode(sbp, &inode->i_atime, &to->di_atime);
> +	xfs_timestamp_di_encode(sbp, &inode->i_mtime, &to->di_mtime);
> +	xfs_timestamp_di_encode(sbp, &inode->i_ctime, &to->di_ctime);
>  	to->di_nlink = cpu_to_be32(inode->i_nlink);
>  	to->di_gen = cpu_to_be32(inode->i_generation);
>  	to->di_mode = cpu_to_be16(inode->i_mode);
> @@ -307,7 +305,8 @@ xfs_inode_to_disk(
>  	if (from->di_version == 3) {
>  		to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
>  		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
> -		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
> +		to->di_crtime.t_nsec_epoch =
> +			cpu_to_be32(from->di_crtime.t_nsec_epoch);
>  		to->di_flags2 = cpu_to_be64(from->di_flags2);
>  		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
>  		to->di_ino = cpu_to_be64(ip->i_ino);
> @@ -338,11 +337,11 @@ xfs_log_dinode_to_disk(
>  	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
>  
>  	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
> -	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
> +	to->di_atime.t_nsec_epoch = cpu_to_be32(from->di_atime.t_nsec_epoch);
>  	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
> -	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
> +	to->di_mtime.t_nsec_epoch = cpu_to_be32(from->di_mtime.t_nsec_epoch);
>  	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
> -	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
> +	to->di_ctime.t_nsec_epoch = cpu_to_be32(from->di_ctime.t_nsec_epoch);
>  
>  	to->di_size = cpu_to_be64(from->di_size);
>  	to->di_nblocks = cpu_to_be64(from->di_nblocks);
> @@ -359,7 +358,8 @@ xfs_log_dinode_to_disk(
>  	if (from->di_version == 3) {
>  		to->di_changecount = cpu_to_be64(from->di_changecount);
>  		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
> -		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
> +		to->di_crtime.t_nsec_epoch =
> +			cpu_to_be32(from->di_crtime.t_nsec_epoch);
>  		to->di_flags2 = cpu_to_be64(from->di_flags2);
>  		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
>  		to->di_ino = cpu_to_be64(from->di_ino);
> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> index e5f97c69b320..08f9d119e0d5 100644
> --- a/fs/xfs/libxfs/xfs_log_format.h
> +++ b/fs/xfs/libxfs/xfs_log_format.h
> @@ -369,8 +369,8 @@ static inline int xfs_ilog_fdata(int w)
>   * information.
>   */
>  typedef struct xfs_ictimestamp {
> -	int32_t		t_sec;		/* timestamp seconds */
> -	int32_t		t_nsec;		/* timestamp nanoseconds */
> +	int32_t t_sec;		/* timestamp seconds */
> +	uint32_t t_nsec_epoch;	/* timestamp nanoseconds | extra epoch */
>  } xfs_ictimestamp_t;
>  
>  /*
> diff --git a/fs/xfs/libxfs/xfs_timestamp.h b/fs/xfs/libxfs/xfs_timestamp.h
> new file mode 100644
> index 000000000000..b514a9f40704
> --- /dev/null
> +++ b/fs/xfs/libxfs/xfs_timestamp.h
> @@ -0,0 +1,151 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2019 CTERA Networks.
> + * All Rights Reserved.
> + */
> +#ifndef __XFS_TIMESTAMP_H__
> +#define __XFS_TIMESTAMP_H__
> +
> +//#define XFS_TIMESTAMP_DEBUG
> +
> +#ifdef XFS_TIMESTAMP_DEBUG
> +#define XFS_TIMESTAMP_EXTENDED(sbp) 1
> +#else
> +#define XFS_TIMESTAMP_EXTENDED(sbp) xfs_sb_version_hasexttime(sbp)
> +#endif
> +
> +/*
> + * We use 2 unused msb of 32bit t_nsec to encode time ranges beyond y2038.
> + *
> + * We use an encoding that preserves the times for extra epoch "00":
> + *
> + * extra  msb of                         adjust for signed
> + * epoch  32-bit                         32-bit tv_sec to
> + * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
> + * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
> + * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
> + * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
> + * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
> + * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
> + * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
> + * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
> + * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
> + */
> +
> +#define XFS_TIMESTAMP_NSEC_BITS		30
> +#define XFS_TIMESTAMP_NSEC_MASK		((1U << XFS_TIMESTAMP_NSEC_BITS) - 1)
> +#define XFS_TIMESTAMP_NSEC(nsec_epoch)	((nsec_epoch) & XFS_TIMESTAMP_NSEC_MASK)
> +#define XFS_TIMESTAMP_EPOCH_SHIFT	XFS_TIMESTAMP_NSEC_BITS
> +#define XFS_TIMESTAMP_EPOCH_BITS	(32 - XFS_TIMESTAMP_NSEC_BITS)
> +#define XFS_TIMESTAMP_EPOCH_MASK	(((1U << XFS_TIMESTAMP_EPOCH_BITS) \
> +					  - 1) << XFS_TIMESTAMP_EPOCH_SHIFT)
> +#define XFS_TIMESTAMP_SEC_BITS		(32 + XFS_TIMESTAMP_EPOCH_BITS)
> +
> +#define XFS_TIMESTAMP_SEC_MIN		S32_MIN
> +#define XFS_TIMESTAMP_SEC32_MAX		S32_MAX
> +#define XFS_TIMESTAMP_SEC64_MAX		((1LL << XFS_TIMESTAMP_SEC_BITS) \
> +					 - 1  + S32_MIN)
> +#define XFS_TIMESTAMP_SEC_MAX(sbp) \
> +	(XFS_TIMESTAMP_EXTENDED(sbp) ? XFS_TIMESTAMP_SEC64_MAX : \
> +					XFS_TIMESTAMP_SEC32_MAX)
> +
> +
> +static inline int64_t xfs_timestamp_decode_sec64(int32_t sec32,
> +						 uint32_t nsec_epoch)
> +{
> +	int64_t sec64 = sec32;
> +
> +	if (unlikely(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK)) {
> +		sec64 += ((int64_t)(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK)) <<
> +			XFS_TIMESTAMP_EPOCH_BITS;
> +#ifdef XFS_TIMESTAMP_DEBUG
> +		pr_info("%s: %lld.%d epoch=%x sec32=%d", __func__, sec64,
> +			XFS_TIMESTAMP_NSEC(nsec_epoch),
> +			(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK), sec32);
> +#endif
> +	}
> +	return sec64;
> +}
> +
> +static inline int64_t xfs_timestamp_sec64(struct xfs_sb *sbp, int32_t sec32,
> +					  uint32_t nsec_epoch)
> +{
> +	return XFS_TIMESTAMP_EXTENDED(sbp) ?
> +		xfs_timestamp_decode_sec64(sec32, nsec_epoch) : sec32;
> +}
> +
> +static inline bool xfs_timestamp_nsec_is_valid(struct xfs_sb *sbp,
> +					       uint32_t nsec_epoch)
> +{
> +	if (!XFS_TIMESTAMP_EXTENDED(sbp) &&
> +	    (nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK))
> +		return false;
> +
> +	return XFS_TIMESTAMP_NSEC(nsec_epoch) < NSEC_PER_SEC;
> +}
> +
> +static inline bool xfs_timestamp_is_valid(struct xfs_sb *sbp,
> +					  xfs_timestamp_t *dtsp)
> +{
> +	return xfs_timestamp_nsec_is_valid(sbp,
> +				be32_to_cpu(dtsp->t_nsec_epoch));
> +}
> +
> +static inline void xfs_timestamp_ic_decode(struct xfs_sb *sbp,
> +					   struct timespec64 *time,
> +					   xfs_ictimestamp_t *itsp)
> +{
> +	time->tv_sec = xfs_timestamp_sec64(sbp, itsp->t_sec,
> +					   itsp->t_nsec_epoch);
> +	time->tv_nsec = XFS_TIMESTAMP_NSEC(itsp->t_nsec_epoch);
> +}
> +
> +static inline void xfs_timestamp_di_decode(struct xfs_sb *sbp,
> +					   struct timespec64 *time,
> +					   xfs_timestamp_t *dtsp)
> +{
> +	time->tv_sec = xfs_timestamp_sec64(sbp, be32_to_cpu(dtsp->t_sec),
> +					   be32_to_cpu(dtsp->t_nsec_epoch));
> +	time->tv_nsec = XFS_TIMESTAMP_NSEC(be32_to_cpu(dtsp->t_nsec_epoch));
> +}
> +
> +static inline int32_t xfs_timestamp_encode_nsec_epoch(int64_t sec64,
> +						      int32_t nsec)
> +{
> +	int32_t epoch = ((sec64 - (int32_t)sec64) >> XFS_TIMESTAMP_EPOCH_BITS) &
> +			XFS_TIMESTAMP_EPOCH_MASK;
> +
> +#ifdef XFS_TIMESTAMP_DEBUG
> +	if (epoch)
> +		pr_info("%s: %lld.%d epoch=%x sec32=%d", __func__, sec64, nsec,
> +			epoch, (int32_t)sec64);
> +#endif
> +	return (nsec & XFS_TIMESTAMP_NSEC_MASK) | epoch;
> +}
> +
> +static inline int32_t xfs_timestamp_nsec_epoch(struct xfs_sb *sbp,
> +					       int64_t sec64, int32_t nsec)
> +{
> +	return XFS_TIMESTAMP_EXTENDED(sbp) ?
> +		xfs_timestamp_encode_nsec_epoch(sec64, nsec) : nsec;
> +}
> +
> +static inline void xfs_timestamp_ic_encode(struct xfs_sb *sbp,
> +					   struct timespec64 *time,
> +					   xfs_ictimestamp_t *itsp)
> +{
> +	itsp->t_sec = (int32_t)time->tv_sec;
> +	itsp->t_nsec_epoch = xfs_timestamp_nsec_epoch(sbp, time->tv_sec,
> +						      time->tv_nsec);
> +}
> +
> +static inline void xfs_timestamp_di_encode(struct xfs_sb *sbp,
> +					   struct timespec64 *time,
> +					   xfs_timestamp_t *dtsp)
> +{
> +	dtsp->t_sec = cpu_to_be32(time->tv_sec);
> +	dtsp->t_nsec_epoch = cpu_to_be32(xfs_timestamp_nsec_epoch(sbp,
> +						time->tv_sec, time->tv_nsec));
> +}
> +
> +#endif /* __XFS_TIMESTAMP_H__ */
> diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
> index a9ad90926b87..48c9c9e3654d 100644
> --- a/fs/xfs/libxfs/xfs_trans_inode.c
> +++ b/fs/xfs/libxfs/xfs_trans_inode.c
> @@ -8,10 +8,13 @@
>  #include "xfs_shared.h"
>  #include "xfs_format.h"
>  #include "xfs_log_format.h"
> +#include "xfs_trans_resv.h"
> +#include "xfs_mount.h"
>  #include "xfs_inode.h"
>  #include "xfs_trans.h"
>  #include "xfs_trans_priv.h"
>  #include "xfs_inode_item.h"
> +#include "xfs_timestamp.h"
>  
>  #include <linux/iversion.h>
>  
> @@ -67,8 +70,8 @@ xfs_trans_ichgtime(
>  	if (flags & XFS_ICHGTIME_CHG)
>  		inode->i_ctime = tv;
>  	if (flags & XFS_ICHGTIME_CREATE) {
> -		ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
> -		ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
> +		xfs_timestamp_ic_encode(&ip->i_mount->m_sb, &tv,
> +					&ip->i_d.di_crtime);
>  	}
>  }
>  
> diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
> index 6d483ab29e63..981f86387dc3 100644
> --- a/fs/xfs/scrub/inode.c
> +++ b/fs/xfs/scrub/inode.c
> @@ -17,6 +17,7 @@
>  #include "xfs_reflink.h"
>  #include "xfs_rmap.h"
>  #include "xfs_bmap_util.h"
> +#include "xfs_timestamp.h"
>  #include "scrub/scrub.h"
>  #include "scrub/common.h"
>  #include "scrub/btree.h"
> @@ -293,11 +294,9 @@ xchk_dinode(
>  	}
>  
>  	/* di_[amc]time.nsec */
> -	if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
> -		xchk_ino_set_corrupt(sc, ino);
> -	if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
> -		xchk_ino_set_corrupt(sc, ino);
> -	if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
> +	if (!xfs_timestamp_is_valid(&mp->m_sb, &dip->di_atime) ||
> +	    !xfs_timestamp_is_valid(&mp->m_sb, &dip->di_mtime) ||
> +	    !xfs_timestamp_is_valid(&mp->m_sb, &dip->di_ctime))
>  		xchk_ino_set_corrupt(sc, ino);
>  
>  	/*
> @@ -403,7 +402,7 @@ xchk_dinode(
>  	}
>  
>  	if (dip->di_version >= 3) {
> -		if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
> +		if (!xfs_timestamp_is_valid(&mp->m_sb, &dip->di_crtime))
>  			xchk_ino_set_corrupt(sc, ino);
>  		xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
>  		xchk_inode_cowextsize(sc, dip, ino, mode, flags,
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index a92d4521748d..ca1538b31170 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -35,6 +35,7 @@
>  #include "xfs_log.h"
>  #include "xfs_bmap_btree.h"
>  #include "xfs_reflink.h"
> +#include "xfs_timestamp.h"
>  
>  kmem_zone_t *xfs_inode_zone;
>  
> @@ -851,8 +852,7 @@ xfs_ialloc(
>  		inode_set_iversion(inode, 1);
>  		ip->i_d.di_flags2 = 0;
>  		ip->i_d.di_cowextsize = 0;
> -		ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
> -		ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
> +		xfs_timestamp_ic_encode(&mp->m_sb, &tv, &ip->i_d.di_crtime);
>  	}
>  
>  
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 726aa3bfd6e8..2b5db9af6a9d 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -18,6 +18,7 @@
>  #include "xfs_buf_item.h"
>  #include "xfs_log.h"
>  #include "xfs_error.h"
> +#include "xfs_timestamp.h"
>  
>  #include <linux/iversion.h>
>  
> @@ -303,6 +304,7 @@ xfs_inode_to_log_dinode(
>  {
>  	struct xfs_icdinode	*from = &ip->i_d;
>  	struct inode		*inode = VFS_I(ip);
> +	struct xfs_sb		*sbp = &ip->i_mount->m_sb;
>  
>  	to->di_magic = XFS_DINODE_MAGIC;
>  
> @@ -315,12 +317,9 @@ xfs_inode_to_log_dinode(
>  
>  	memset(to->di_pad, 0, sizeof(to->di_pad));
>  	memset(to->di_pad3, 0, sizeof(to->di_pad3));
> -	to->di_atime.t_sec = inode->i_atime.tv_sec;
> -	to->di_atime.t_nsec = inode->i_atime.tv_nsec;
> -	to->di_mtime.t_sec = inode->i_mtime.tv_sec;
> -	to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
> -	to->di_ctime.t_sec = inode->i_ctime.tv_sec;
> -	to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
> +	xfs_timestamp_ic_encode(sbp, &inode->i_atime, &to->di_atime);
> +	xfs_timestamp_ic_encode(sbp, &inode->i_mtime, &to->di_mtime);
> +	xfs_timestamp_ic_encode(sbp, &inode->i_ctime, &to->di_ctime);
>  	to->di_nlink = inode->i_nlink;
>  	to->di_gen = inode->i_generation;
>  	to->di_mode = inode->i_mode;
> @@ -342,7 +341,7 @@ xfs_inode_to_log_dinode(
>  	if (from->di_version == 3) {
>  		to->di_changecount = inode_peek_iversion(inode);
>  		to->di_crtime.t_sec = from->di_crtime.t_sec;
> -		to->di_crtime.t_nsec = from->di_crtime.t_nsec;
> +		to->di_crtime.t_nsec_epoch = from->di_crtime.t_nsec_epoch;
>  		to->di_flags2 = from->di_flags2;
>  		to->di_cowextsize = from->di_cowextsize;
>  		to->di_ino = ip->i_ino;
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 4c7962ccb0c4..aa294597d16f 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -21,6 +21,7 @@
>  #include "xfs_dir2.h"
>  #include "xfs_iomap.h"
>  #include "xfs_error.h"
> +#include "xfs_timestamp.h"
>  
>  #include <linux/xattr.h>
>  #include <linux/posix_acl.h>
> @@ -556,8 +557,8 @@ xfs_vn_getattr(
>  	if (ip->i_d.di_version == 3) {
>  		if (request_mask & STATX_BTIME) {
>  			stat->result_mask |= STATX_BTIME;
> -			stat->btime.tv_sec = ip->i_d.di_crtime.t_sec;
> -			stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec;
> +			xfs_timestamp_ic_decode(&mp->m_sb, &stat->btime,
> +						&ip->i_d.di_crtime);
>  		}
>  	}
>  
> diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
> index 884950adbd16..3e55cf029414 100644
> --- a/fs/xfs/xfs_itable.c
> +++ b/fs/xfs/xfs_itable.c
> @@ -19,6 +19,7 @@
>  #include "xfs_error.h"
>  #include "xfs_icache.h"
>  #include "xfs_health.h"
> +#include "xfs_timestamp.h"
>  
>  /*
>   * Bulk Stat
> @@ -98,7 +99,7 @@ xfs_bulkstat_one_int(
>  	buf->bs_ctime = inode->i_ctime.tv_sec;
>  	buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
>  	buf->bs_btime = dic->di_crtime.t_sec;
> -	buf->bs_btime_nsec = dic->di_crtime.t_nsec;
> +	buf->bs_btime_nsec = XFS_TIMESTAMP_NSEC(dic->di_crtime.t_nsec_epoch);
>  	buf->bs_gen = inode->i_generation;
>  	buf->bs_mode = inode->i_mode;
>  
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b3188ea49413..b940ce6dac07 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -35,6 +35,7 @@
>  #include "xfs_refcount_item.h"
>  #include "xfs_bmap_item.h"
>  #include "xfs_reflink.h"
> +#include "xfs_timestamp.h"
>  
>  #include <linux/magic.h>
>  #include <linux/fs_context.h>
> @@ -1438,8 +1439,8 @@ xfs_fc_fill_super(
>  	sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
>  	sb->s_max_links = XFS_MAXLINK;
>  	sb->s_time_gran = 1;
> -	sb->s_time_min = S32_MIN;
> -	sb->s_time_max = S32_MAX;
> +	sb->s_time_min = XFS_TIMESTAMP_SEC_MIN;
> +	sb->s_time_max = XFS_TIMESTAMP_SEC_MAX(&mp->m_sb);
>  	sb->s_iflags |= SB_I_CGROUPWB;
>  
>  	set_posix_acl_flag(sb);
> -- 
> 2.17.1
>
Amir Goldstein Nov. 12, 2019, 4 a.m. UTC | #2
On Tue, Nov 12, 2019 at 12:35 AM Darrick J. Wong
<darrick.wong@oracle.com> wrote:
>
> On Mon, Nov 11, 2019 at 11:36:30PM +0200, Amir Goldstein wrote:
> > Use similar on-disk encoding for timestamps as ext4 to push back
> > the y2038 deadline to 2446.
> >
> > The encoding uses the 2 free MSB in 32bit nsec field to extend the
> > seconds field storage size to 34bit.
> >
> > Those 2 bits should be zero on existing xfs inodes, so the extended
> > timestamp range feature is declared read-only compatible with old
> > on-disk format.
>
> What do you think about making the timestamp field a uint64_t counting
> nanoseconds since Dec 14 09:15:53 UTC 1901 (a.k.a. the minimum datetime
> we support with the existing encoding scheme)?  Instead of using the
> upper 2 bits of the nsec field for an epoch encoding, which ext4 screwed
> up years ago and has not fully fixed?

The advantage of the ext4 scheme is that it is more backward compatible.
I would like to have an upgrade procedure that is simple and I don't like
the idea of having two completely different time encodings in the code
(and on disk) if I can help it.

IIUC, you are implying that the ext4 scheme is more prone to human
programming errors? that should be addressed with proper unit testing
IMO and besides, we can learn from ext4 past bugs (not sure that my
implementation did), so those could be listed also in the pros column.

One thing I wasn't certain about is that it seems that xfs (and xfs_repair)
allows for negative nsec value. Not sure if that is intentional and why.
I suppose it is an oversight? That is something that xfs_repair would
need to check and fix before upgrade if we do go with the epoch bits.

>
> Also, please change struct xfs_inode.i_crtime to a timespec64 to match
> the other three timestamps in struct inode...

Makes sense.

>
> ...and following the usual xfs indenting style for all the new functions.
>

I'd be surprised if that's the only xfs-ism that I missed ;-)

Thanks,
Amir.

>
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> >
> > Hi,
> >
> > This is a very lightly tested RFC patch.
> > Naming and splitting to patches aside, I'd like to know what folks think
> > about this direction for fixing y2038 xfs support.
> >
> > With XFS_TIMESTAMP_DEBUG defined, it provides correct output for test
> > generic/402 (timestamp range test), when matching xfs expected timestamp
> > ranges to those of ext4 (_filesystem_timestamp_range).
> > And then the test (naturally) fails on corrupted fs check.
> >
> > If this direction is acceptable, I will proceed with patching xfsprogs.
> >
> > I'd also like to hear your thoughts about migration process.
> > Should the new feature be ro_compat as I defined it or incompat?
> > In pricipal, all user would need to do is set the feature flag, but
> > I am not aware of any precedent of a similar format upgrade in xfs.
> >
> > Thanks,
> > Amir.
> >
> >  fs/xfs/libxfs/xfs_format.h      |  14 ++-
> >  fs/xfs/libxfs/xfs_inode_buf.c   |  36 ++++----
> >  fs/xfs/libxfs/xfs_log_format.h  |   4 +-
> >  fs/xfs/libxfs/xfs_timestamp.h   | 151 ++++++++++++++++++++++++++++++++
> >  fs/xfs/libxfs/xfs_trans_inode.c |   7 +-
> >  fs/xfs/scrub/inode.c            |  11 ++-
> >  fs/xfs/xfs_inode.c              |   4 +-
> >  fs/xfs/xfs_inode_item.c         |  13 ++-
> >  fs/xfs/xfs_iops.c               |   5 +-
> >  fs/xfs/xfs_itable.c             |   3 +-
> >  fs/xfs/xfs_super.c              |   5 +-
> >  11 files changed, 208 insertions(+), 45 deletions(-)
> >  create mode 100644 fs/xfs/libxfs/xfs_timestamp.h
> >
> > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > index c968b60cee15..227a775a9889 100644
> > --- a/fs/xfs/libxfs/xfs_format.h
> > +++ b/fs/xfs/libxfs/xfs_format.h
> > @@ -449,10 +449,12 @@ xfs_sb_has_compat_feature(
> >  #define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)              /* free inode btree */
> >  #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)              /* reverse map btree */
> >  #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)              /* reflinked files */
> > +#define XFS_SB_FEAT_RO_COMPAT_EXTTIME  (1 << 3)              /* extended time_max */
> >  #define XFS_SB_FEAT_RO_COMPAT_ALL \
> >               (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
> >                XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
> > -              XFS_SB_FEAT_RO_COMPAT_REFLINK)
> > +              XFS_SB_FEAT_RO_COMPAT_REFLINK | \
> > +              XFS_SB_FEAT_RO_COMPAT_EXTTIME)
> >  #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN        ~XFS_SB_FEAT_RO_COMPAT_ALL
> >  static inline bool
> >  xfs_sb_has_ro_compat_feature(
> > @@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
> >               (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
> >  }
> >
> > +static inline bool xfs_sb_version_hasexttime(struct xfs_sb *sbp)
> > +{
> > +     return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
> > +             (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_EXTTIME);
> > +}
> > +
> >  /*
> >   * end of superblock version macros
> >   */
> > @@ -824,8 +832,8 @@ typedef struct xfs_agfl {
> >                  xfs_daddr_to_agno(mp, (d) + (len) - 1)))
> >
> >  typedef struct xfs_timestamp {
> > -     __be32          t_sec;          /* timestamp seconds */
> > -     __be32          t_nsec;         /* timestamp nanoseconds */
> > +     __be32  t_sec;          /* timestamp seconds */
> > +     __be32  t_nsec_epoch;   /* timestamp nanoseconds | extra epoch */
> >  } xfs_timestamp_t;
> >
> >  /*
> > diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> > index 28ab3c5255e1..aaf411da6263 100644
> > --- a/fs/xfs/libxfs/xfs_inode_buf.c
> > +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> > @@ -17,6 +17,7 @@
> >  #include "xfs_trans.h"
> >  #include "xfs_ialloc.h"
> >  #include "xfs_dir2.h"
> > +#include "xfs_timestamp.h"
> >
> >  #include <linux/iversion.h>
> >
> > @@ -204,6 +205,7 @@ xfs_inode_from_disk(
> >  {
> >       struct xfs_icdinode     *to = &ip->i_d;
> >       struct inode            *inode = VFS_I(ip);
> > +     struct xfs_sb           *sbp = &ip->i_mount->m_sb;
> >
> >
> >       /*
> > @@ -233,12 +235,9 @@ xfs_inode_from_disk(
> >        * a time before epoch is converted to a time long after epoch
> >        * on 64 bit systems.
> >        */
> > -     inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
> > -     inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
> > -     inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
> > -     inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
> > -     inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
> > -     inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
> > +     xfs_timestamp_di_decode(sbp, &inode->i_atime, &from->di_atime);
> > +     xfs_timestamp_di_decode(sbp, &inode->i_mtime, &from->di_mtime);
> > +     xfs_timestamp_di_decode(sbp, &inode->i_ctime, &from->di_ctime);
> >       inode->i_generation = be32_to_cpu(from->di_gen);
> >       inode->i_mode = be16_to_cpu(from->di_mode);
> >
> > @@ -257,7 +256,8 @@ xfs_inode_from_disk(
> >               inode_set_iversion_queried(inode,
> >                                          be64_to_cpu(from->di_changecount));
> >               to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
> > -             to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
> > +             to->di_crtime.t_nsec_epoch =
> > +                     be32_to_cpu(from->di_crtime.t_nsec_epoch);
> >               to->di_flags2 = be64_to_cpu(from->di_flags2);
> >               to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
> >       }
> > @@ -271,6 +271,7 @@ xfs_inode_to_disk(
> >  {
> >       struct xfs_icdinode     *from = &ip->i_d;
> >       struct inode            *inode = VFS_I(ip);
> > +     struct xfs_sb           *sbp = &ip->i_mount->m_sb;
> >
> >       to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
> >       to->di_onlink = 0;
> > @@ -283,12 +284,9 @@ xfs_inode_to_disk(
> >       to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
> >
> >       memset(to->di_pad, 0, sizeof(to->di_pad));
> > -     to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
> > -     to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
> > -     to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
> > -     to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
> > -     to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
> > -     to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
> > +     xfs_timestamp_di_encode(sbp, &inode->i_atime, &to->di_atime);
> > +     xfs_timestamp_di_encode(sbp, &inode->i_mtime, &to->di_mtime);
> > +     xfs_timestamp_di_encode(sbp, &inode->i_ctime, &to->di_ctime);
> >       to->di_nlink = cpu_to_be32(inode->i_nlink);
> >       to->di_gen = cpu_to_be32(inode->i_generation);
> >       to->di_mode = cpu_to_be16(inode->i_mode);
> > @@ -307,7 +305,8 @@ xfs_inode_to_disk(
> >       if (from->di_version == 3) {
> >               to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
> >               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
> > -             to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
> > +             to->di_crtime.t_nsec_epoch =
> > +                     cpu_to_be32(from->di_crtime.t_nsec_epoch);
> >               to->di_flags2 = cpu_to_be64(from->di_flags2);
> >               to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
> >               to->di_ino = cpu_to_be64(ip->i_ino);
> > @@ -338,11 +337,11 @@ xfs_log_dinode_to_disk(
> >       memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
> >
> >       to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
> > -     to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
> > +     to->di_atime.t_nsec_epoch = cpu_to_be32(from->di_atime.t_nsec_epoch);
> >       to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
> > -     to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
> > +     to->di_mtime.t_nsec_epoch = cpu_to_be32(from->di_mtime.t_nsec_epoch);
> >       to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
> > -     to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
> > +     to->di_ctime.t_nsec_epoch = cpu_to_be32(from->di_ctime.t_nsec_epoch);
> >
> >       to->di_size = cpu_to_be64(from->di_size);
> >       to->di_nblocks = cpu_to_be64(from->di_nblocks);
> > @@ -359,7 +358,8 @@ xfs_log_dinode_to_disk(
> >       if (from->di_version == 3) {
> >               to->di_changecount = cpu_to_be64(from->di_changecount);
> >               to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
> > -             to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
> > +             to->di_crtime.t_nsec_epoch =
> > +                     cpu_to_be32(from->di_crtime.t_nsec_epoch);
> >               to->di_flags2 = cpu_to_be64(from->di_flags2);
> >               to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
> >               to->di_ino = cpu_to_be64(from->di_ino);
> > diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
> > index e5f97c69b320..08f9d119e0d5 100644
> > --- a/fs/xfs/libxfs/xfs_log_format.h
> > +++ b/fs/xfs/libxfs/xfs_log_format.h
> > @@ -369,8 +369,8 @@ static inline int xfs_ilog_fdata(int w)
> >   * information.
> >   */
> >  typedef struct xfs_ictimestamp {
> > -     int32_t         t_sec;          /* timestamp seconds */
> > -     int32_t         t_nsec;         /* timestamp nanoseconds */
> > +     int32_t t_sec;          /* timestamp seconds */
> > +     uint32_t t_nsec_epoch;  /* timestamp nanoseconds | extra epoch */
> >  } xfs_ictimestamp_t;
> >
> >  /*
> > diff --git a/fs/xfs/libxfs/xfs_timestamp.h b/fs/xfs/libxfs/xfs_timestamp.h
> > new file mode 100644
> > index 000000000000..b514a9f40704
> > --- /dev/null
> > +++ b/fs/xfs/libxfs/xfs_timestamp.h
> > @@ -0,0 +1,151 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright (C) 2019 CTERA Networks.
> > + * All Rights Reserved.
> > + */
> > +#ifndef __XFS_TIMESTAMP_H__
> > +#define __XFS_TIMESTAMP_H__
> > +
> > +//#define XFS_TIMESTAMP_DEBUG
> > +
> > +#ifdef XFS_TIMESTAMP_DEBUG
> > +#define XFS_TIMESTAMP_EXTENDED(sbp) 1
> > +#else
> > +#define XFS_TIMESTAMP_EXTENDED(sbp) xfs_sb_version_hasexttime(sbp)
> > +#endif
> > +
> > +/*
> > + * We use 2 unused msb of 32bit t_nsec to encode time ranges beyond y2038.
> > + *
> > + * We use an encoding that preserves the times for extra epoch "00":
> > + *
> > + * extra  msb of                         adjust for signed
> > + * epoch  32-bit                         32-bit tv_sec to
> > + * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
> > + * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
> > + * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
> > + * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
> > + * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
> > + * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
> > + * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
> > + * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
> > + * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
> > + */
> > +
> > +#define XFS_TIMESTAMP_NSEC_BITS              30
> > +#define XFS_TIMESTAMP_NSEC_MASK              ((1U << XFS_TIMESTAMP_NSEC_BITS) - 1)
> > +#define XFS_TIMESTAMP_NSEC(nsec_epoch)       ((nsec_epoch) & XFS_TIMESTAMP_NSEC_MASK)
> > +#define XFS_TIMESTAMP_EPOCH_SHIFT    XFS_TIMESTAMP_NSEC_BITS
> > +#define XFS_TIMESTAMP_EPOCH_BITS     (32 - XFS_TIMESTAMP_NSEC_BITS)
> > +#define XFS_TIMESTAMP_EPOCH_MASK     (((1U << XFS_TIMESTAMP_EPOCH_BITS) \
> > +                                       - 1) << XFS_TIMESTAMP_EPOCH_SHIFT)
> > +#define XFS_TIMESTAMP_SEC_BITS               (32 + XFS_TIMESTAMP_EPOCH_BITS)
> > +
> > +#define XFS_TIMESTAMP_SEC_MIN                S32_MIN
> > +#define XFS_TIMESTAMP_SEC32_MAX              S32_MAX
> > +#define XFS_TIMESTAMP_SEC64_MAX              ((1LL << XFS_TIMESTAMP_SEC_BITS) \
> > +                                      - 1  + S32_MIN)
> > +#define XFS_TIMESTAMP_SEC_MAX(sbp) \
> > +     (XFS_TIMESTAMP_EXTENDED(sbp) ? XFS_TIMESTAMP_SEC64_MAX : \
> > +                                     XFS_TIMESTAMP_SEC32_MAX)
> > +
> > +
> > +static inline int64_t xfs_timestamp_decode_sec64(int32_t sec32,
> > +                                              uint32_t nsec_epoch)
> > +{
> > +     int64_t sec64 = sec32;
> > +
> > +     if (unlikely(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK)) {
> > +             sec64 += ((int64_t)(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK)) <<
> > +                     XFS_TIMESTAMP_EPOCH_BITS;
> > +#ifdef XFS_TIMESTAMP_DEBUG
> > +             pr_info("%s: %lld.%d epoch=%x sec32=%d", __func__, sec64,
> > +                     XFS_TIMESTAMP_NSEC(nsec_epoch),
> > +                     (nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK), sec32);
> > +#endif
> > +     }
> > +     return sec64;
> > +}
> > +
> > +static inline int64_t xfs_timestamp_sec64(struct xfs_sb *sbp, int32_t sec32,
> > +                                       uint32_t nsec_epoch)
> > +{
> > +     return XFS_TIMESTAMP_EXTENDED(sbp) ?
> > +             xfs_timestamp_decode_sec64(sec32, nsec_epoch) : sec32;
> > +}
> > +
> > +static inline bool xfs_timestamp_nsec_is_valid(struct xfs_sb *sbp,
> > +                                            uint32_t nsec_epoch)
> > +{
> > +     if (!XFS_TIMESTAMP_EXTENDED(sbp) &&
> > +         (nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK))
> > +             return false;
> > +
> > +     return XFS_TIMESTAMP_NSEC(nsec_epoch) < NSEC_PER_SEC;
> > +}
> > +
> > +static inline bool xfs_timestamp_is_valid(struct xfs_sb *sbp,
> > +                                       xfs_timestamp_t *dtsp)
> > +{
> > +     return xfs_timestamp_nsec_is_valid(sbp,
> > +                             be32_to_cpu(dtsp->t_nsec_epoch));
> > +}
> > +
> > +static inline void xfs_timestamp_ic_decode(struct xfs_sb *sbp,
> > +                                        struct timespec64 *time,
> > +                                        xfs_ictimestamp_t *itsp)
> > +{
> > +     time->tv_sec = xfs_timestamp_sec64(sbp, itsp->t_sec,
> > +                                        itsp->t_nsec_epoch);
> > +     time->tv_nsec = XFS_TIMESTAMP_NSEC(itsp->t_nsec_epoch);
> > +}
> > +
> > +static inline void xfs_timestamp_di_decode(struct xfs_sb *sbp,
> > +                                        struct timespec64 *time,
> > +                                        xfs_timestamp_t *dtsp)
> > +{
> > +     time->tv_sec = xfs_timestamp_sec64(sbp, be32_to_cpu(dtsp->t_sec),
> > +                                        be32_to_cpu(dtsp->t_nsec_epoch));
> > +     time->tv_nsec = XFS_TIMESTAMP_NSEC(be32_to_cpu(dtsp->t_nsec_epoch));
> > +}
> > +
> > +static inline int32_t xfs_timestamp_encode_nsec_epoch(int64_t sec64,
> > +                                                   int32_t nsec)
> > +{
> > +     int32_t epoch = ((sec64 - (int32_t)sec64) >> XFS_TIMESTAMP_EPOCH_BITS) &
> > +                     XFS_TIMESTAMP_EPOCH_MASK;
> > +
> > +#ifdef XFS_TIMESTAMP_DEBUG
> > +     if (epoch)
> > +             pr_info("%s: %lld.%d epoch=%x sec32=%d", __func__, sec64, nsec,
> > +                     epoch, (int32_t)sec64);
> > +#endif
> > +     return (nsec & XFS_TIMESTAMP_NSEC_MASK) | epoch;
> > +}
> > +
> > +static inline int32_t xfs_timestamp_nsec_epoch(struct xfs_sb *sbp,
> > +                                            int64_t sec64, int32_t nsec)
> > +{
> > +     return XFS_TIMESTAMP_EXTENDED(sbp) ?
> > +             xfs_timestamp_encode_nsec_epoch(sec64, nsec) : nsec;
> > +}
> > +
> > +static inline void xfs_timestamp_ic_encode(struct xfs_sb *sbp,
> > +                                        struct timespec64 *time,
> > +                                        xfs_ictimestamp_t *itsp)
> > +{
> > +     itsp->t_sec = (int32_t)time->tv_sec;
> > +     itsp->t_nsec_epoch = xfs_timestamp_nsec_epoch(sbp, time->tv_sec,
> > +                                                   time->tv_nsec);
> > +}
> > +
> > +static inline void xfs_timestamp_di_encode(struct xfs_sb *sbp,
> > +                                        struct timespec64 *time,
> > +                                        xfs_timestamp_t *dtsp)
> > +{
> > +     dtsp->t_sec = cpu_to_be32(time->tv_sec);
> > +     dtsp->t_nsec_epoch = cpu_to_be32(xfs_timestamp_nsec_epoch(sbp,
> > +                                             time->tv_sec, time->tv_nsec));
> > +}
> > +
> > +#endif /* __XFS_TIMESTAMP_H__ */
> > diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
> > index a9ad90926b87..48c9c9e3654d 100644
> > --- a/fs/xfs/libxfs/xfs_trans_inode.c
> > +++ b/fs/xfs/libxfs/xfs_trans_inode.c
> > @@ -8,10 +8,13 @@
> >  #include "xfs_shared.h"
> >  #include "xfs_format.h"
> >  #include "xfs_log_format.h"
> > +#include "xfs_trans_resv.h"
> > +#include "xfs_mount.h"
> >  #include "xfs_inode.h"
> >  #include "xfs_trans.h"
> >  #include "xfs_trans_priv.h"
> >  #include "xfs_inode_item.h"
> > +#include "xfs_timestamp.h"
> >
> >  #include <linux/iversion.h>
> >
> > @@ -67,8 +70,8 @@ xfs_trans_ichgtime(
> >       if (flags & XFS_ICHGTIME_CHG)
> >               inode->i_ctime = tv;
> >       if (flags & XFS_ICHGTIME_CREATE) {
> > -             ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
> > -             ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
> > +             xfs_timestamp_ic_encode(&ip->i_mount->m_sb, &tv,
> > +                                     &ip->i_d.di_crtime);
> >       }
> >  }
> >
> > diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
> > index 6d483ab29e63..981f86387dc3 100644
> > --- a/fs/xfs/scrub/inode.c
> > +++ b/fs/xfs/scrub/inode.c
> > @@ -17,6 +17,7 @@
> >  #include "xfs_reflink.h"
> >  #include "xfs_rmap.h"
> >  #include "xfs_bmap_util.h"
> > +#include "xfs_timestamp.h"
> >  #include "scrub/scrub.h"
> >  #include "scrub/common.h"
> >  #include "scrub/btree.h"
> > @@ -293,11 +294,9 @@ xchk_dinode(
> >       }
> >
> >       /* di_[amc]time.nsec */
> > -     if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
> > -             xchk_ino_set_corrupt(sc, ino);
> > -     if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
> > -             xchk_ino_set_corrupt(sc, ino);
> > -     if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
> > +     if (!xfs_timestamp_is_valid(&mp->m_sb, &dip->di_atime) ||
> > +         !xfs_timestamp_is_valid(&mp->m_sb, &dip->di_mtime) ||
> > +         !xfs_timestamp_is_valid(&mp->m_sb, &dip->di_ctime))
> >               xchk_ino_set_corrupt(sc, ino);
> >
> >       /*
> > @@ -403,7 +402,7 @@ xchk_dinode(
> >       }
> >
> >       if (dip->di_version >= 3) {
> > -             if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
> > +             if (!xfs_timestamp_is_valid(&mp->m_sb, &dip->di_crtime))
> >                       xchk_ino_set_corrupt(sc, ino);
> >               xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
> >               xchk_inode_cowextsize(sc, dip, ino, mode, flags,
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index a92d4521748d..ca1538b31170 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -35,6 +35,7 @@
> >  #include "xfs_log.h"
> >  #include "xfs_bmap_btree.h"
> >  #include "xfs_reflink.h"
> > +#include "xfs_timestamp.h"
> >
> >  kmem_zone_t *xfs_inode_zone;
> >
> > @@ -851,8 +852,7 @@ xfs_ialloc(
> >               inode_set_iversion(inode, 1);
> >               ip->i_d.di_flags2 = 0;
> >               ip->i_d.di_cowextsize = 0;
> > -             ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
> > -             ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
> > +             xfs_timestamp_ic_encode(&mp->m_sb, &tv, &ip->i_d.di_crtime);
> >       }
> >
> >
> > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> > index 726aa3bfd6e8..2b5db9af6a9d 100644
> > --- a/fs/xfs/xfs_inode_item.c
> > +++ b/fs/xfs/xfs_inode_item.c
> > @@ -18,6 +18,7 @@
> >  #include "xfs_buf_item.h"
> >  #include "xfs_log.h"
> >  #include "xfs_error.h"
> > +#include "xfs_timestamp.h"
> >
> >  #include <linux/iversion.h>
> >
> > @@ -303,6 +304,7 @@ xfs_inode_to_log_dinode(
> >  {
> >       struct xfs_icdinode     *from = &ip->i_d;
> >       struct inode            *inode = VFS_I(ip);
> > +     struct xfs_sb           *sbp = &ip->i_mount->m_sb;
> >
> >       to->di_magic = XFS_DINODE_MAGIC;
> >
> > @@ -315,12 +317,9 @@ xfs_inode_to_log_dinode(
> >
> >       memset(to->di_pad, 0, sizeof(to->di_pad));
> >       memset(to->di_pad3, 0, sizeof(to->di_pad3));
> > -     to->di_atime.t_sec = inode->i_atime.tv_sec;
> > -     to->di_atime.t_nsec = inode->i_atime.tv_nsec;
> > -     to->di_mtime.t_sec = inode->i_mtime.tv_sec;
> > -     to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
> > -     to->di_ctime.t_sec = inode->i_ctime.tv_sec;
> > -     to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
> > +     xfs_timestamp_ic_encode(sbp, &inode->i_atime, &to->di_atime);
> > +     xfs_timestamp_ic_encode(sbp, &inode->i_mtime, &to->di_mtime);
> > +     xfs_timestamp_ic_encode(sbp, &inode->i_ctime, &to->di_ctime);
> >       to->di_nlink = inode->i_nlink;
> >       to->di_gen = inode->i_generation;
> >       to->di_mode = inode->i_mode;
> > @@ -342,7 +341,7 @@ xfs_inode_to_log_dinode(
> >       if (from->di_version == 3) {
> >               to->di_changecount = inode_peek_iversion(inode);
> >               to->di_crtime.t_sec = from->di_crtime.t_sec;
> > -             to->di_crtime.t_nsec = from->di_crtime.t_nsec;
> > +             to->di_crtime.t_nsec_epoch = from->di_crtime.t_nsec_epoch;
> >               to->di_flags2 = from->di_flags2;
> >               to->di_cowextsize = from->di_cowextsize;
> >               to->di_ino = ip->i_ino;
> > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > index 4c7962ccb0c4..aa294597d16f 100644
> > --- a/fs/xfs/xfs_iops.c
> > +++ b/fs/xfs/xfs_iops.c
> > @@ -21,6 +21,7 @@
> >  #include "xfs_dir2.h"
> >  #include "xfs_iomap.h"
> >  #include "xfs_error.h"
> > +#include "xfs_timestamp.h"
> >
> >  #include <linux/xattr.h>
> >  #include <linux/posix_acl.h>
> > @@ -556,8 +557,8 @@ xfs_vn_getattr(
> >       if (ip->i_d.di_version == 3) {
> >               if (request_mask & STATX_BTIME) {
> >                       stat->result_mask |= STATX_BTIME;
> > -                     stat->btime.tv_sec = ip->i_d.di_crtime.t_sec;
> > -                     stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec;
> > +                     xfs_timestamp_ic_decode(&mp->m_sb, &stat->btime,
> > +                                             &ip->i_d.di_crtime);
> >               }
> >       }
> >
> > diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
> > index 884950adbd16..3e55cf029414 100644
> > --- a/fs/xfs/xfs_itable.c
> > +++ b/fs/xfs/xfs_itable.c
> > @@ -19,6 +19,7 @@
> >  #include "xfs_error.h"
> >  #include "xfs_icache.h"
> >  #include "xfs_health.h"
> > +#include "xfs_timestamp.h"
> >
> >  /*
> >   * Bulk Stat
> > @@ -98,7 +99,7 @@ xfs_bulkstat_one_int(
> >       buf->bs_ctime = inode->i_ctime.tv_sec;
> >       buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
> >       buf->bs_btime = dic->di_crtime.t_sec;
> > -     buf->bs_btime_nsec = dic->di_crtime.t_nsec;
> > +     buf->bs_btime_nsec = XFS_TIMESTAMP_NSEC(dic->di_crtime.t_nsec_epoch);
> >       buf->bs_gen = inode->i_generation;
> >       buf->bs_mode = inode->i_mode;
> >
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index b3188ea49413..b940ce6dac07 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -35,6 +35,7 @@
> >  #include "xfs_refcount_item.h"
> >  #include "xfs_bmap_item.h"
> >  #include "xfs_reflink.h"
> > +#include "xfs_timestamp.h"
> >
> >  #include <linux/magic.h>
> >  #include <linux/fs_context.h>
> > @@ -1438,8 +1439,8 @@ xfs_fc_fill_super(
> >       sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
> >       sb->s_max_links = XFS_MAXLINK;
> >       sb->s_time_gran = 1;
> > -     sb->s_time_min = S32_MIN;
> > -     sb->s_time_max = S32_MAX;
> > +     sb->s_time_min = XFS_TIMESTAMP_SEC_MIN;
> > +     sb->s_time_max = XFS_TIMESTAMP_SEC_MAX(&mp->m_sb);
> >       sb->s_iflags |= SB_I_CGROUPWB;
> >
> >       set_posix_acl_flag(sb);
> > --
> > 2.17.1
> >
Christoph Hellwig Nov. 12, 2019, 8:25 a.m. UTC | #3
On Mon, Nov 11, 2019 at 02:35:08PM -0800, Darrick J. Wong wrote:
> Also, please change struct xfs_inode.i_crtime to a timespec64 to match
> the other three timestamps in struct inode...

Or you could just merge my outstanding patch to do just that..
Darrick J. Wong Nov. 12, 2019, 4:09 p.m. UTC | #4
On Tue, Nov 12, 2019 at 12:25:24AM -0800, Christoph Hellwig wrote:
> On Mon, Nov 11, 2019 at 02:35:08PM -0800, Darrick J. Wong wrote:
> > Also, please change struct xfs_inode.i_crtime to a timespec64 to match
> > the other three timestamps in struct inode...
> 
> Or you could just merge my outstanding patch to do just that..

I must've missed it, got a link?

/me digs...

--D
Christoph Hellwig Nov. 12, 2019, 4:12 p.m. UTC | #5
On Tue, Nov 12, 2019 at 08:09:13AM -0800, Darrick J. Wong wrote:
> On Tue, Nov 12, 2019 at 12:25:24AM -0800, Christoph Hellwig wrote:
> > On Mon, Nov 11, 2019 at 02:35:08PM -0800, Darrick J. Wong wrote:
> > > Also, please change struct xfs_inode.i_crtime to a timespec64 to match
> > > the other three timestamps in struct inode...
> > 
> > Or you could just merge my outstanding patch to do just that..
> 
> I must've missed it, got a link?

The series is at:

https://www.spinics.net/lists/linux-xfs/msg32997.html

Dave didn't like the last patch, but the first three should be fine.
Dave Chinner Nov. 12, 2019, 9:11 p.m. UTC | #6
On Tue, Nov 12, 2019 at 06:00:19AM +0200, Amir Goldstein wrote:
> On Tue, Nov 12, 2019 at 12:35 AM Darrick J. Wong
> <darrick.wong@oracle.com> wrote:
> >
> > On Mon, Nov 11, 2019 at 11:36:30PM +0200, Amir Goldstein wrote:
> > > Use similar on-disk encoding for timestamps as ext4 to push back
> > > the y2038 deadline to 2446.
> > >
> > > The encoding uses the 2 free MSB in 32bit nsec field to extend the
> > > seconds field storage size to 34bit.
> > >
> > > Those 2 bits should be zero on existing xfs inodes, so the extended
> > > timestamp range feature is declared read-only compatible with old
> > > on-disk format.
> >
> > What do you think about making the timestamp field a uint64_t counting
> > nanoseconds since Dec 14 09:15:53 UTC 1901 (a.k.a. the minimum datetime
> > we support with the existing encoding scheme)?  Instead of using the
> > upper 2 bits of the nsec field for an epoch encoding, which ext4 screwed
> > up years ago and has not fully fixed?
> 
> The advantage of the ext4 scheme is that it is more backward compatible.

Darrick an I had a long discussion about this on #xfs a few weeks
ago (22nd october). 

Discussion went like this:

 <djwong>   btw, dchinner, were one to try to solve the y2038 problem on xfs, how would one do it?
 <djwong>   1) write tests to make sure we can store/retrieve the extreme ends of the existing timestamps; then
 <djwong>   2) use empty di_pad bytes to extend each timestamp field width; then
 <djwong>   3) figure out what the values of the extra byte are (epochs moving forward from the unix epoch; and we don't care about supporting dates before 1900); then
 <djwong>   4) expand test to cover new intervals?
 <dchinner> djwong: pretty much      
 <dchinner> the epoch extending patch I originally proposed is here: https://lore.kernel.org/linux-xfs/20140602002822.GQ14410@dastard/
 <djwong>   also it occurred to me that one could use the top two bits of the nsec fields to make it a 10-bit extension of the seconds fields
 <dchinner> I've probably got a more recent version somewhere in a stack somewhere around here
 <dchinner> didn't ext4 use part of the nsec field like that?
 <djwong>   yeah, they have 34 bit dates now
 <dchinner> ISTR it's got some horrifically complex encoding of the timestamp
 <djwong>   yes, it does
 <djwong>   they did epochs rolling forward from the current one
 <dchinner> we could just do the 34 bit second time in a sane way
 <dchinner> because all the timestamps are contiguous on disk
 <dchinner> i.e. if a SB flag is set, the timestamp on disk is an opaque 64 bit field
 <dchinner> upper 30 bits are the nsec field, lower 34 bits are the seconds field
 <dchinner> similar to how we encode BMBT extent records
 <dchinner> always unsigned, so we don't support dates before 1970 at all....
 <djwong>   hmm, with that scheme (uint t_sec:34; uint t_nsec: 30;} I guess that gets us to the year 2514
 <djwong>   and provided nobody cares about pre-1970 dates or the ability to store negative t_nsec
 <dchinner> djwong: if XFS is still in use in 2514, then I'm not going to care about it :)
 <djwong>   [narrator] But what Dave doesn't know is that the three XFS maintainers will be uploaded into the Cloud in 2046 to maintain XFS in perpetuity.... :D
 <dchinner> The current Dave doesn't care about that :)
 <djwong>   hmm even if we did {signed int t_sec:34;} that would still get us to 2242
 <dchinner> djwong: I just don't see the point of having signed timestamps
 <djwong>   admittedly, we don't need timestamps dating back to the 1700s
 <djwong>   but then, what if we set the new epoch to 1993?
 <djwong>   (or, heck, 2000?)
 <djwong>   i mean, i guess it doesn't matter if we have dates going to 2514 or 2544
 <dchinner> what, have an on-disk epoch that is different to the unix epoch?
 <djwong>   yes :D
 <djwong>   "In the year 2525, if XFS is still alive..." ♪♪
 <dchinner> then we definitely have unsigned timestamps on disk
 <dchinner> set the epoch to ~1900, and we handle the legacy negative 32-bit int timestamp range as well.
 <djwong>   that could also work
 <djwong>   I don't anticipate being around in 2444

Basically, we've both looked at what ext4 has done and don't want to
do that - it's an awful, complex hack and it's had quite a few bugs
in it since it was introduced that went a long time before being
noticed.

> I would like to have an upgrade procedure that is simple and I don't like
> the idea of having two completely different time encodings in the code
> (and on disk) if I can help it.

Backwards comaptible in-place upgrades are largely a myth: we don't
allow changes to the on-disk format without feature bits that limit
what old kernels can do with new formats. In this case it requires
an incompat bit because the moment an upper bit in the ns field is
set then the timestamps go bad on old kernels. Hence it's not a
compatible change and filesytems with this format cannot be mounted
on kernels that don't support it.

So, an in-place upgrade process is a one-way operation - once you
start converting and have >2038 dates, there is no going back
without an offline admin operation. That means there's no real need
to try to retain the old format at all. IOWs, for in-place upgrade,
all we need is an inode flag to indicate what format the timestamp
is in once the superblock incompat flag is set. Eventually the SB
flag becomes the mkfs default, and then eventually it becomes the
only supported format. This is what we've done before for things
like NLINK, ATTR2, etc.

> IIUC, you are implying that the ext4 scheme is more prone to human
> programming errors? that should be addressed with proper unit testing
> IMO and besides, we can learn from ext4 past bugs (not sure that my
> implementation did), so those could be listed also in the pros column.

We're not implying anything - there's been several actual bugs in
the encoding scheme that weren't noticed or fixed for quite a long
time. What we've learnt from this is that complexity in timestamp
encoding only leads to bugs, so given the choice we should really do
something simpler.

> One thing I wasn't certain about is that it seems that xfs (and xfs_repair)
> allows for negative nsec value. Not sure if that is intentional and why.
> I suppose it is an oversight? That is something that xfs_repair would
> need to check and fix before upgrade if we do go with the epoch bits.

It's not an oversight - it's somethign the on-disk format allowed.
Who knows if it ever got used (or how it got used), but it's
somethign we can only fix by changing the on-disk format (as you can
see from the discussion above).

IOWs, we pretty much decided on a new 64 bit encoding format using a
epoch of 1900 and a unsigned 64bit nanosecond counter to get us a
range of almost 600 years from year 1900-2500. It's simple, easy to
encode/decode, and very easy to validate. It's also trivially easy
to do in-place upgrades with an inode flag....

Cheers,

Dave.
Darrick J. Wong Nov. 13, 2019, 3:56 a.m. UTC | #7
On Wed, Nov 13, 2019 at 08:11:53AM +1100, Dave Chinner wrote:
> On Tue, Nov 12, 2019 at 06:00:19AM +0200, Amir Goldstein wrote:
> > On Tue, Nov 12, 2019 at 12:35 AM Darrick J. Wong
> > <darrick.wong@oracle.com> wrote:
> > >
> > > On Mon, Nov 11, 2019 at 11:36:30PM +0200, Amir Goldstein wrote:
> > > > Use similar on-disk encoding for timestamps as ext4 to push back
> > > > the y2038 deadline to 2446.
> > > >
> > > > The encoding uses the 2 free MSB in 32bit nsec field to extend the
> > > > seconds field storage size to 34bit.
> > > >
> > > > Those 2 bits should be zero on existing xfs inodes, so the extended
> > > > timestamp range feature is declared read-only compatible with old
> > > > on-disk format.
> > >
> > > What do you think about making the timestamp field a uint64_t counting
> > > nanoseconds since Dec 14 09:15:53 UTC 1901 (a.k.a. the minimum datetime
> > > we support with the existing encoding scheme)?  Instead of using the
> > > upper 2 bits of the nsec field for an epoch encoding, which ext4 screwed
> > > up years ago and has not fully fixed?
> > 
> > The advantage of the ext4 scheme is that it is more backward compatible.
> 
> Darrick an I had a long discussion about this on #xfs a few weeks
> ago (22nd october). 
> 
> Discussion went like this:
> 
>  <djwong>   btw, dchinner, were one to try to solve the y2038 problem on xfs, how would one do it?
>  <djwong>   1) write tests to make sure we can store/retrieve the extreme ends of the existing timestamps; then
>  <djwong>   2) use empty di_pad bytes to extend each timestamp field width; then
>  <djwong>   3) figure out what the values of the extra byte are (epochs moving forward from the unix epoch; and we don't care about supporting dates before 1900); then
>  <djwong>   4) expand test to cover new intervals?
>  <dchinner> djwong: pretty much      
>  <dchinner> the epoch extending patch I originally proposed is here: https://lore.kernel.org/linux-xfs/20140602002822.GQ14410@dastard/
>  <djwong>   also it occurred to me that one could use the top two bits of the nsec fields to make it a 10-bit extension of the seconds fields
>  <dchinner> I've probably got a more recent version somewhere in a stack somewhere around here
>  <dchinner> didn't ext4 use part of the nsec field like that?
>  <djwong>   yeah, they have 34 bit dates now
>  <dchinner> ISTR it's got some horrifically complex encoding of the timestamp
>  <djwong>   yes, it does
>  <djwong>   they did epochs rolling forward from the current one
>  <dchinner> we could just do the 34 bit second time in a sane way
>  <dchinner> because all the timestamps are contiguous on disk
>  <dchinner> i.e. if a SB flag is set, the timestamp on disk is an opaque 64 bit field
>  <dchinner> upper 30 bits are the nsec field, lower 34 bits are the seconds field
>  <dchinner> similar to how we encode BMBT extent records
>  <dchinner> always unsigned, so we don't support dates before 1970 at all....
>  <djwong>   hmm, with that scheme (uint t_sec:34; uint t_nsec: 30;} I guess that gets us to the year 2514
>  <djwong>   and provided nobody cares about pre-1970 dates or the ability to store negative t_nsec
>  <dchinner> djwong: if XFS is still in use in 2514, then I'm not going to care about it :)
>  <djwong>   [narrator] But what Dave doesn't know is that the three XFS maintainers will be uploaded into the Cloud in 2046 to maintain XFS in perpetuity.... :D
>  <dchinner> The current Dave doesn't care about that :)
>  <djwong>   hmm even if we did {signed int t_sec:34;} that would still get us to 2242
>  <dchinner> djwong: I just don't see the point of having signed timestamps
>  <djwong>   admittedly, we don't need timestamps dating back to the 1700s
>  <djwong>   but then, what if we set the new epoch to 1993?
>  <djwong>   (or, heck, 2000?)
>  <djwong>   i mean, i guess it doesn't matter if we have dates going to 2514 or 2544
>  <dchinner> what, have an on-disk epoch that is different to the unix epoch?
>  <djwong>   yes :D
>  <djwong>   "In the year 2525, if XFS is still alive..." ♪♪
>  <dchinner> then we definitely have unsigned timestamps on disk
>  <dchinner> set the epoch to ~1900, and we handle the legacy negative 32-bit int timestamp range as well.
>  <djwong>   that could also work
>  <djwong>   I don't anticipate being around in 2444
> 
> Basically, we've both looked at what ext4 has done and don't want to
> do that - it's an awful, complex hack and it's had quite a few bugs
> in it since it was introduced that went a long time before being
> noticed.

When that conversation happened, I was still thinking of using the top
34 bits for seconds and the bottom 30 bits for nsec, which is not where
my brain went this month.

I've since moved on to a u64 nsec counter, which gets us to 2486, which
is a whole forty more years past ext4!!!

> > I would like to have an upgrade procedure that is simple and I don't like
> > the idea of having two completely different time encodings in the code
> > (and on disk) if I can help it.
> 
> Backwards comaptible in-place upgrades are largely a myth: we don't
> allow changes to the on-disk format without feature bits that limit
> what old kernels can do with new formats. In this case it requires
> an incompat bit because the moment an upper bit in the ns field is
> set then the timestamps go bad on old kernels. Hence it's not a
> compatible change and filesytems with this format cannot be mounted
> on kernels that don't support it.
> 
> So, an in-place upgrade process is a one-way operation - once you
> start converting and have >2038 dates, there is no going back
> without an offline admin operation. That means there's no real need
> to try to retain the old format at all. IOWs, for in-place upgrade,
> all we need is an inode flag to indicate what format the timestamp
> is in once the superblock incompat flag is set. Eventually the SB
> flag becomes the mkfs default, and then eventually it becomes the
> only supported format. This is what we've done before for things
> like NLINK, ATTR2, etc.

/me is confused, are you advocating for an upgrade path that is: (1)
admin sets incompat feature; (2) as people try to store dates beyond
2038 we set an inode flag and write the timestamp in the new format?

I guess we could do that.  I'd kinda thought that we'd just set an
incompat flag and users simply have to backup, reformat, and reinstall.
OTOH it's a fairly minor update so maybe we can support one way upgrade.

> > IIUC, you are implying that the ext4 scheme is more prone to human
> > programming errors? that should be addressed with proper unit testing
> > IMO and besides, we can learn from ext4 past bugs (not sure that my
> > implementation did), so those could be listed also in the pros column.

Well... Ted added a comment to ext4 about how the encoding had been
screwed up, along with some #if'd out code that would some day take its
place and do the encoding correctly... but Christoph later ripped it out
since it's basically an incompat format change...

> We're not implying anything - there's been several actual bugs in
> the encoding scheme that weren't noticed or fixed for quite a long
> time. What we've learnt from this is that complexity in timestamp
> encoding only leads to bugs, so given the choice we should really do
> something simpler.

...so yes, let's try for something simpler.

> > One thing I wasn't certain about is that it seems that xfs (and xfs_repair)
> > allows for negative nsec value. Not sure if that is intentional and why.
> > I suppose it is an oversight? That is something that xfs_repair would
> > need to check and fix before upgrade if we do go with the epoch bits.
> 
> It's not an oversight - it's somethign the on-disk format allowed.
> Who knows if it ever got used (or how it got used), but it's
> somethign we can only fix by changing the on-disk format (as you can
> see from the discussion above).

The disk format allows it; scrub warns about it, and the kernel at least
in theory clamps the nsec value to 0...1e9.

> IOWs, we pretty much decided on a new 64 bit encoding format using a
> epoch of 1900 and a unsigned 64bit nanosecond counter to get us a
> range of almost 600 years from year 1900-2500. It's simple, easy to
> encode/decode, and very easy to validate. It's also trivially easy
> to do in-place upgrades with an inode flag....
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
Dave Chinner Nov. 13, 2019, 4:08 a.m. UTC | #8
On Tue, Nov 12, 2019 at 07:56:11PM -0800, Darrick J. Wong wrote:
> On Wed, Nov 13, 2019 at 08:11:53AM +1100, Dave Chinner wrote:
> > Backwards comaptible in-place upgrades are largely a myth: we don't
> > allow changes to the on-disk format without feature bits that limit
> > what old kernels can do with new formats. In this case it requires
> > an incompat bit because the moment an upper bit in the ns field is
> > set then the timestamps go bad on old kernels. Hence it's not a
> > compatible change and filesytems with this format cannot be mounted
> > on kernels that don't support it.
> > 
> > So, an in-place upgrade process is a one-way operation - once you
> > start converting and have >2038 dates, there is no going back
> > without an offline admin operation. That means there's no real need
> > to try to retain the old format at all. IOWs, for in-place upgrade,
> > all we need is an inode flag to indicate what format the timestamp
> > is in once the superblock incompat flag is set. Eventually the SB
> > flag becomes the mkfs default, and then eventually it becomes the
> > only supported format. This is what we've done before for things
> > like NLINK, ATTR2, etc.
> 
> /me is confused, are you advocating for an upgrade path that is: (1)
> admin sets incompat feature; (2) as people try to store dates beyond
> 2038 we set an inode flag and write the timestamp in the new format?

If we decide that we want to allow in-place upgrade, then this is
the way we've done it in the past. remember how long we supported
reading v1 inodes but only supported writing v2 inodes? i.e. we
converted old 16 bit nlink inodes to 32 bit nlink inodes silently
and automatically when they were dirtied.

i.e. once the superblock feature bit is set, we can convert inode
the timestamp format on inode writeback.

> I guess we could do that.  I'd kinda thought that we'd just set an
> incompat flag and users simply have to backup, reformat, and reinstall.
> OTOH it's a fairly minor update so maybe we can support one way upgrade.

Sure, we can do that, too, but the kernel code still has to support
reading and write both formats. Given that, on-the fly conversion is
a trivial addition to the code...

> > > One thing I wasn't certain about is that it seems that xfs (and xfs_repair)
> > > allows for negative nsec value. Not sure if that is intentional and why.
> > > I suppose it is an oversight? That is something that xfs_repair would
> > > need to check and fix before upgrade if we do go with the epoch bits.
> > 
> > It's not an oversight - it's somethign the on-disk format allowed.
> > Who knows if it ever got used (or how it got used), but it's
> > somethign we can only fix by changing the on-disk format (as you can
> > see from the discussion above).
> 
> The disk format allows it; scrub warns about it, and the kernel at least
> in theory clamps the nsec value to 0...1e9.

Yup, but those are relatively recent things when compared to when
the XFS on-disk format was defined... :P

Cheers,

Dave.
Amir Goldstein Nov. 13, 2019, 4:42 a.m. UTC | #9
[CC:Arnd,Deepa]

On Wed, Nov 13, 2019 at 5:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> On Wed, Nov 13, 2019 at 08:11:53AM +1100, Dave Chinner wrote:
> > On Tue, Nov 12, 2019 at 06:00:19AM +0200, Amir Goldstein wrote:
> > > On Tue, Nov 12, 2019 at 12:35 AM Darrick J. Wong
> > > <darrick.wong@oracle.com> wrote:
> > > >
> > > > On Mon, Nov 11, 2019 at 11:36:30PM +0200, Amir Goldstein wrote:
> > > > > Use similar on-disk encoding for timestamps as ext4 to push back
> > > > > the y2038 deadline to 2446.
> > > > >
> > > > > The encoding uses the 2 free MSB in 32bit nsec field to extend the
> > > > > seconds field storage size to 34bit.
> > > > >
> > > > > Those 2 bits should be zero on existing xfs inodes, so the extended
> > > > > timestamp range feature is declared read-only compatible with old
> > > > > on-disk format.
> > > >
> > > > What do you think about making the timestamp field a uint64_t counting
> > > > nanoseconds since Dec 14 09:15:53 UTC 1901 (a.k.a. the minimum datetime
> > > > we support with the existing encoding scheme)?  Instead of using the
> > > > upper 2 bits of the nsec field for an epoch encoding, which ext4 screwed
> > > > up years ago and has not fully fixed?
> > >
> > > The advantage of the ext4 scheme is that it is more backward compatible.
> >
> > Darrick an I had a long discussion about this on #xfs a few weeks
> > ago (22nd october).
> >
> > Discussion went like this:
> >
> >  <djwong>   btw, dchinner, were one to try to solve the y2038 problem on xfs, how would one do it?
> >  <djwong>   1) write tests to make sure we can store/retrieve the extreme ends of the existing timestamps; then
> >  <djwong>   2) use empty di_pad bytes to extend each timestamp field width; then
> >  <djwong>   3) figure out what the values of the extra byte are (epochs moving forward from the unix epoch; and we don't care about supporting dates before 1900); then
> >  <djwong>   4) expand test to cover new intervals?
> >  <dchinner> djwong: pretty much
> >  <dchinner> the epoch extending patch I originally proposed is here: https://lore.kernel.org/linux-xfs/20140602002822.GQ14410@dastard/
> >  <djwong>   also it occurred to me that one could use the top two bits of the nsec fields to make it a 10-bit extension of the seconds fields
> >  <dchinner> I've probably got a more recent version somewhere in a stack somewhere around here
> >  <dchinner> didn't ext4 use part of the nsec field like that?
> >  <djwong>   yeah, they have 34 bit dates now
> >  <dchinner> ISTR it's got some horrifically complex encoding of the timestamp
> >  <djwong>   yes, it does
> >  <djwong>   they did epochs rolling forward from the current one
> >  <dchinner> we could just do the 34 bit second time in a sane way
> >  <dchinner> because all the timestamps are contiguous on disk
> >  <dchinner> i.e. if a SB flag is set, the timestamp on disk is an opaque 64 bit field
> >  <dchinner> upper 30 bits are the nsec field, lower 34 bits are the seconds field
> >  <dchinner> similar to how we encode BMBT extent records
> >  <dchinner> always unsigned, so we don't support dates before 1970 at all....
> >  <djwong>   hmm, with that scheme (uint t_sec:34; uint t_nsec: 30;} I guess that gets us to the year 2514
> >  <djwong>   and provided nobody cares about pre-1970 dates or the ability to store negative t_nsec
> >  <dchinner> djwong: if XFS is still in use in 2514, then I'm not going to care about it :)
> >  <djwong>   [narrator] But what Dave doesn't know is that the three XFS maintainers will be uploaded into the Cloud in 2046 to maintain XFS in perpetuity.... :D
> >  <dchinner> The current Dave doesn't care about that :)
> >  <djwong>   hmm even if we did {signed int t_sec:34;} that would still get us to 2242
> >  <dchinner> djwong: I just don't see the point of having signed timestamps
> >  <djwong>   admittedly, we don't need timestamps dating back to the 1700s
> >  <djwong>   but then, what if we set the new epoch to 1993?
> >  <djwong>   (or, heck, 2000?)
> >  <djwong>   i mean, i guess it doesn't matter if we have dates going to 2514 or 2544
> >  <dchinner> what, have an on-disk epoch that is different to the unix epoch?
> >  <djwong>   yes :D
> >  <djwong>   "In the year 2525, if XFS is still alive..." ♪♪
> >  <dchinner> then we definitely have unsigned timestamps on disk
> >  <dchinner> set the epoch to ~1900, and we handle the legacy negative 32-bit int timestamp range as well.
> >  <djwong>   that could also work
> >  <djwong>   I don't anticipate being around in 2444
> >
> > Basically, we've both looked at what ext4 has done and don't want to
> > do that - it's an awful, complex hack and it's had quite a few bugs
> > in it since it was introduced that went a long time before being
> > noticed.
>
> When that conversation happened, I was still thinking of using the top
> 34 bits for seconds and the bottom 30 bits for nsec, which is not where
> my brain went this month.
>
> I've since moved on to a u64 nsec counter, which gets us to 2486, which
> is a whole forty more years past ext4!!!
>

I didn't get why you dropped the di_pad idea (Arnd's and Dave's patches).
Is it to save the pad for another rainy day?

> > > I would like to have an upgrade procedure that is simple and I don't like
> > > the idea of having two completely different time encodings in the code
> > > (and on disk) if I can help it.
> >
> > Backwards comaptible in-place upgrades are largely a myth: we don't
> > allow changes to the on-disk format without feature bits that limit
> > what old kernels can do with new formats. In this case it requires
> > an incompat bit because the moment an upper bit in the ns field is
> > set then the timestamps go bad on old kernels. Hence it's not a
> > compatible change and filesytems with this format cannot be mounted
> > on kernels that don't support it.
> >
> > So, an in-place upgrade process is a one-way operation - once you
> > start converting and have >2038 dates, there is no going back
> > without an offline admin operation. That means there's no real need
> > to try to retain the old format at all. IOWs, for in-place upgrade,
> > all we need is an inode flag to indicate what format the timestamp
> > is in once the superblock incompat flag is set. Eventually the SB
> > flag becomes the mkfs default, and then eventually it becomes the
> > only supported format. This is what we've done before for things
> > like NLINK, ATTR2, etc.
>
> /me is confused, are you advocating for an upgrade path that is: (1)
> admin sets incompat feature; (2) as people try to store dates beyond
> 2038 we set an inode flag and write the timestamp in the new format?
>
> I guess we could do that.  I'd kinda thought that we'd just set an
> incompat flag and users simply have to backup, reformat, and reinstall.
> OTOH it's a fairly minor update so maybe we can support one way upgrade.
>

That is not very sympathetic to users. Please go have lunch with one of
the filed engineers in your organization.

y2038 support is not something that *some* users will need to migrate to.
It is going to be a major pain for IT departments I see our duty as developers
to do our best to minimize that pain -
Certainly when it is quite easy to do - go offline, set an incompat
flag, go online
downtime has been minimized.
The concept of backup/restore as a means to upgrade should not be on the table
for this sort of upgrade that everyone will be mandated to go through -
not unless you are looking for people to stop using xfs - y2038 problem solved!

> > > IIUC, you are implying that the ext4 scheme is more prone to human
> > > programming errors? that should be addressed with proper unit testing
> > > IMO and besides, we can learn from ext4 past bugs (not sure that my
> > > implementation did), so those could be listed also in the pros column.
>
> Well... Ted added a comment to ext4 about how the encoding had been
> screwed up, along with some #if'd out code that would some day take its
> place and do the encoding correctly... but Christoph later ripped it out
> since it's basically an incompat format change...
>
> > We're not implying anything - there's been several actual bugs in
> > the encoding scheme that weren't noticed or fixed for quite a long
> > time. What we've learnt from this is that complexity in timestamp
> > encoding only leads to bugs, so given the choice we should really do
> > something simpler.
>
> ...so yes, let's try for something simpler.
>

I'm all for simple. It's a completely new scheme that makes me nervous.
See you can say that the ext4 bugs are due to a "complex" encoding,
but I say it is due to "something completely new".
So yes, nanosec since 1900 is simple, but it's yet another standard, so
makes me unease, but I will get through it.

> > > One thing I wasn't certain about is that it seems that xfs (and xfs_repair)
> > > allows for negative nsec value. Not sure if that is intentional and why.
> > > I suppose it is an oversight? That is something that xfs_repair would
> > > need to check and fix before upgrade if we do go with the epoch bits.
> >
> > It's not an oversight - it's somethign the on-disk format allowed.
> > Who knows if it ever got used (or how it got used), but it's
> > somethign we can only fix by changing the on-disk format (as you can
> > see from the discussion above).
>
> The disk format allows it; scrub warns about it, and the kernel at least
> in theory clamps the nsec value to 0...1e9.
>
> > IOWs, we pretty much decided on a new 64 bit encoding format using a
> > epoch of 1900 and a unsigned 64bit nanosecond counter to get us a
> > range of almost 600 years from year 1900-2500. It's simple, easy to
> > encode/decode, and very easy to validate. It's also trivially easy
> > to do in-place upgrades with an inode flag....
> >

All right, so how do we proceed?
Arnd, do you want to re-work your series according to this scheme?
Is there any core xfs developer that was going to tackle this?

I'm here, so if you need my help moving things forward let me know.

Thanks,
Amir.
Darrick J. Wong Nov. 13, 2019, 4:58 a.m. UTC | #10
On Wed, Nov 13, 2019 at 06:42:09AM +0200, Amir Goldstein wrote:
> [CC:Arnd,Deepa]
> 
> On Wed, Nov 13, 2019 at 5:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> >
> > On Wed, Nov 13, 2019 at 08:11:53AM +1100, Dave Chinner wrote:
> > > On Tue, Nov 12, 2019 at 06:00:19AM +0200, Amir Goldstein wrote:
> > > > On Tue, Nov 12, 2019 at 12:35 AM Darrick J. Wong
> > > > <darrick.wong@oracle.com> wrote:
> > > > >
> > > > > On Mon, Nov 11, 2019 at 11:36:30PM +0200, Amir Goldstein wrote:
> > > > > > Use similar on-disk encoding for timestamps as ext4 to push back
> > > > > > the y2038 deadline to 2446.
> > > > > >
> > > > > > The encoding uses the 2 free MSB in 32bit nsec field to extend the
> > > > > > seconds field storage size to 34bit.
> > > > > >
> > > > > > Those 2 bits should be zero on existing xfs inodes, so the extended
> > > > > > timestamp range feature is declared read-only compatible with old
> > > > > > on-disk format.
> > > > >
> > > > > What do you think about making the timestamp field a uint64_t counting
> > > > > nanoseconds since Dec 14 09:15:53 UTC 1901 (a.k.a. the minimum datetime
> > > > > we support with the existing encoding scheme)?  Instead of using the
> > > > > upper 2 bits of the nsec field for an epoch encoding, which ext4 screwed
> > > > > up years ago and has not fully fixed?
> > > >
> > > > The advantage of the ext4 scheme is that it is more backward compatible.
> > >
> > > Darrick an I had a long discussion about this on #xfs a few weeks
> > > ago (22nd october).
> > >
> > > Discussion went like this:
> > >
> > >  <djwong>   btw, dchinner, were one to try to solve the y2038 problem on xfs, how would one do it?
> > >  <djwong>   1) write tests to make sure we can store/retrieve the extreme ends of the existing timestamps; then
> > >  <djwong>   2) use empty di_pad bytes to extend each timestamp field width; then
> > >  <djwong>   3) figure out what the values of the extra byte are (epochs moving forward from the unix epoch; and we don't care about supporting dates before 1900); then
> > >  <djwong>   4) expand test to cover new intervals?
> > >  <dchinner> djwong: pretty much
> > >  <dchinner> the epoch extending patch I originally proposed is here: https://lore.kernel.org/linux-xfs/20140602002822.GQ14410@dastard/
> > >  <djwong>   also it occurred to me that one could use the top two bits of the nsec fields to make it a 10-bit extension of the seconds fields
> > >  <dchinner> I've probably got a more recent version somewhere in a stack somewhere around here
> > >  <dchinner> didn't ext4 use part of the nsec field like that?
> > >  <djwong>   yeah, they have 34 bit dates now
> > >  <dchinner> ISTR it's got some horrifically complex encoding of the timestamp
> > >  <djwong>   yes, it does
> > >  <djwong>   they did epochs rolling forward from the current one
> > >  <dchinner> we could just do the 34 bit second time in a sane way
> > >  <dchinner> because all the timestamps are contiguous on disk
> > >  <dchinner> i.e. if a SB flag is set, the timestamp on disk is an opaque 64 bit field
> > >  <dchinner> upper 30 bits are the nsec field, lower 34 bits are the seconds field
> > >  <dchinner> similar to how we encode BMBT extent records
> > >  <dchinner> always unsigned, so we don't support dates before 1970 at all....
> > >  <djwong>   hmm, with that scheme (uint t_sec:34; uint t_nsec: 30;} I guess that gets us to the year 2514
> > >  <djwong>   and provided nobody cares about pre-1970 dates or the ability to store negative t_nsec
> > >  <dchinner> djwong: if XFS is still in use in 2514, then I'm not going to care about it :)
> > >  <djwong>   [narrator] But what Dave doesn't know is that the three XFS maintainers will be uploaded into the Cloud in 2046 to maintain XFS in perpetuity.... :D
> > >  <dchinner> The current Dave doesn't care about that :)
> > >  <djwong>   hmm even if we did {signed int t_sec:34;} that would still get us to 2242
> > >  <dchinner> djwong: I just don't see the point of having signed timestamps
> > >  <djwong>   admittedly, we don't need timestamps dating back to the 1700s
> > >  <djwong>   but then, what if we set the new epoch to 1993?
> > >  <djwong>   (or, heck, 2000?)
> > >  <djwong>   i mean, i guess it doesn't matter if we have dates going to 2514 or 2544
> > >  <dchinner> what, have an on-disk epoch that is different to the unix epoch?
> > >  <djwong>   yes :D
> > >  <djwong>   "In the year 2525, if XFS is still alive..." ♪♪
> > >  <dchinner> then we definitely have unsigned timestamps on disk
> > >  <dchinner> set the epoch to ~1900, and we handle the legacy negative 32-bit int timestamp range as well.
> > >  <djwong>   that could also work
> > >  <djwong>   I don't anticipate being around in 2444
> > >
> > > Basically, we've both looked at what ext4 has done and don't want to
> > > do that - it's an awful, complex hack and it's had quite a few bugs
> > > in it since it was introduced that went a long time before being
> > > noticed.
> >
> > When that conversation happened, I was still thinking of using the top
> > 34 bits for seconds and the bottom 30 bits for nsec, which is not where
> > my brain went this month.
> >
> > I've since moved on to a u64 nsec counter, which gets us to 2486, which
> > is a whole forty more years past ext4!!!
> >
> 
> I didn't get why you dropped the di_pad idea (Arnd's and Dave's patches).
> Is it to save the pad for another rainy day?

Something like that.  Someone /else/ can worry about the y2486 problem.

<duck>

Practically speaking I'd almost rather drop the precision in order to
extend the seconds range, since timestamp updates are only precise to
HZ anyway.

> > > > I would like to have an upgrade procedure that is simple and I don't like
> > > > the idea of having two completely different time encodings in the code
> > > > (and on disk) if I can help it.
> > >
> > > Backwards comaptible in-place upgrades are largely a myth: we don't
> > > allow changes to the on-disk format without feature bits that limit
> > > what old kernels can do with new formats. In this case it requires
> > > an incompat bit because the moment an upper bit in the ns field is
> > > set then the timestamps go bad on old kernels. Hence it's not a
> > > compatible change and filesytems with this format cannot be mounted
> > > on kernels that don't support it.
> > >
> > > So, an in-place upgrade process is a one-way operation - once you
> > > start converting and have >2038 dates, there is no going back
> > > without an offline admin operation. That means there's no real need
> > > to try to retain the old format at all. IOWs, for in-place upgrade,
> > > all we need is an inode flag to indicate what format the timestamp
> > > is in once the superblock incompat flag is set. Eventually the SB
> > > flag becomes the mkfs default, and then eventually it becomes the
> > > only supported format. This is what we've done before for things
> > > like NLINK, ATTR2, etc.
> >
> > /me is confused, are you advocating for an upgrade path that is: (1)
> > admin sets incompat feature; (2) as people try to store dates beyond
> > 2038 we set an inode flag and write the timestamp in the new format?
> >
> > I guess we could do that.  I'd kinda thought that we'd just set an
> > incompat flag and users simply have to backup, reformat, and reinstall.
> > OTOH it's a fairly minor update so maybe we can support one way upgrade.
> >
> 
> That is not very sympathetic to users. Please go have lunch with one of
> the filed engineers in your organization.
> 
> y2038 support is not something that *some* users will need to migrate to.
> It is going to be a major pain for IT departments I see our duty as developers
> to do our best to minimize that pain -
> Certainly when it is quite easy to do - go offline, set an incompat
> flag, go online
> downtime has been minimized.
> The concept of backup/restore as a means to upgrade should not be on the table
> for this sort of upgrade that everyone will be mandated to go through -
> not unless you are looking for people to stop using xfs - y2038 problem solved!

Heh, ok.  I'll add an inode flag and kernel auto-upgrade of timestamps
to the feature list.  It's not like we're trying to add an rmap btree to
the filesystem. :)

> > > > IIUC, you are implying that the ext4 scheme is more prone to human
> > > > programming errors? that should be addressed with proper unit testing
> > > > IMO and besides, we can learn from ext4 past bugs (not sure that my
> > > > implementation did), so those could be listed also in the pros column.
> >
> > Well... Ted added a comment to ext4 about how the encoding had been
> > screwed up, along with some #if'd out code that would some day take its
> > place and do the encoding correctly... but Christoph later ripped it out
> > since it's basically an incompat format change...
> >
> > > We're not implying anything - there's been several actual bugs in
> > > the encoding scheme that weren't noticed or fixed for quite a long
> > > time. What we've learnt from this is that complexity in timestamp
> > > encoding only leads to bugs, so given the choice we should really do
> > > something simpler.
> >
> > ...so yes, let's try for something simpler.
> >
> 
> I'm all for simple. It's a completely new scheme that makes me nervous.
> See you can say that the ext4 bugs are due to a "complex" encoding,
> but I say it is due to "something completely new".
> So yes, nanosec since 1900 is simple, but it's yet another standard, so
> makes me unease, but I will get through it.
> 
> > > > One thing I wasn't certain about is that it seems that xfs (and xfs_repair)
> > > > allows for negative nsec value. Not sure if that is intentional and why.
> > > > I suppose it is an oversight? That is something that xfs_repair would
> > > > need to check and fix before upgrade if we do go with the epoch bits.
> > >
> > > It's not an oversight - it's somethign the on-disk format allowed.
> > > Who knows if it ever got used (or how it got used), but it's
> > > somethign we can only fix by changing the on-disk format (as you can
> > > see from the discussion above).
> >
> > The disk format allows it; scrub warns about it, and the kernel at least
> > in theory clamps the nsec value to 0...1e9.
> >
> > > IOWs, we pretty much decided on a new 64 bit encoding format using a
> > > epoch of 1900 and a unsigned 64bit nanosecond counter to get us a
> > > range of almost 600 years from year 1900-2500. It's simple, easy to
> > > encode/decode, and very easy to validate. It's also trivially easy
> > > to do in-place upgrades with an inode flag....
> > >
> 
> All right, so how do we proceed?
> Arnd, do you want to re-work your series according to this scheme?

Lemme read them over again. :)

> Is there any core xfs developer that was going to tackle this?
> 
> I'm here, so if you need my help moving things forward let me know.

I wrote a trivial garbage version this afternoon, will have something
more polished tomorrow.  None of this is 5.6 material, we have time.

--D

> Thanks,
> Amir.
Amir Goldstein Nov. 13, 2019, 5:17 a.m. UTC | #11
>
> Practically speaking I'd almost rather drop the precision in order to
> extend the seconds range, since timestamp updates are only precise to
> HZ anyway.
>

FWIW, NTFS and CIFS standard is unsigned 64bit of 100 nanoseconds
counting from Jan 1, 1601.

>
> Heh, ok.  I'll add an inode flag and kernel auto-upgrade of timestamps
> to the feature list.  It's not like we're trying to add an rmap btree to
> the filesystem. :)
>

Exactly.

> >
> > All right, so how do we proceed?
> > Arnd, do you want to re-work your series according to this scheme?
>
> Lemme read them over again. :)
>
> > Is there any core xfs developer that was going to tackle this?
> >
> > I'm here, so if you need my help moving things forward let me know.
>
> I wrote a trivial garbage version this afternoon, will have something
> more polished tomorrow.  None of this is 5.6 material, we have time.
>

I wonder if your version has struct xfs_dinode_v3 or it could avoid it.
There is a benefit in terms of code complexity and test coverage
to keep the only difference between inode versions in the on-disk
parsers, while reading into the same struct, the same way as
old inode versions are read into struct xfs_dinode.

Oh well, I can wait for tomorrow to see the polished version :-)

Thanks,
Amir.
Darrick J. Wong Nov. 13, 2019, 5:20 a.m. UTC | #12
On Wed, Nov 13, 2019 at 07:17:06AM +0200, Amir Goldstein wrote:
> >
> > Practically speaking I'd almost rather drop the precision in order to
> > extend the seconds range, since timestamp updates are only precise to
> > HZ anyway.
> >
> 
> FWIW, NTFS and CIFS standard is unsigned 64bit of 100 nanoseconds
> counting from Jan 1, 1601.
> 
> >
> > Heh, ok.  I'll add an inode flag and kernel auto-upgrade of timestamps
> > to the feature list.  It's not like we're trying to add an rmap btree to
> > the filesystem. :)
> >
> 
> Exactly.
> 
> > >
> > > All right, so how do we proceed?
> > > Arnd, do you want to re-work your series according to this scheme?
> >
> > Lemme read them over again. :)
> >
> > > Is there any core xfs developer that was going to tackle this?
> > >
> > > I'm here, so if you need my help moving things forward let me know.
> >
> > I wrote a trivial garbage version this afternoon, will have something
> > more polished tomorrow.  None of this is 5.6 material, we have time.
> >
> 
> I wonder if your version has struct xfs_dinode_v3 or it could avoid it.
> There is a benefit in terms of code complexity and test coverage
> to keep the only difference between inode versions in the on-disk
> parsers, while reading into the same struct, the same way as
> old inode versions are read into struct xfs_dinode.
> 
> Oh well, I can wait for tomorrow to see the polished version :-)

Well now we noticed that Arnd also changed the disk quota structure
format too, so that'll slow things down as we try to figure out how to
reconcile 34-bit inode seconds vs. 40-bit quota timer seconds.

(Or whatever happens with that)

--D

> Thanks,
> Amir.
Arnd Bergmann Nov. 13, 2019, 1:20 p.m. UTC | #13
On Wed, Nov 13, 2019 at 5:59 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Wed, Nov 13, 2019 at 06:42:09AM +0200, Amir Goldstein wrote:
> > [CC:Arnd,Deepa]
> > On Wed, Nov 13, 2019 at 5:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > >
> > > On Wed, Nov 13, 2019 at 08:11:53AM +1100, Dave Chinner wrote:
> > > > On Tue, Nov 12, 2019 at 06:00:19AM +0200, Amir Goldstein wrote:
> > > > > On Tue, Nov 12, 2019 at 12:35 AM Darrick J. Wong
> > > > > <darrick.wong@oracle.com> wrote:
> > >
> >
> > I didn't get why you dropped the di_pad idea (Arnd's and Dave's patches).
> > Is it to save the pad for another rainy day?
>
> Something like that.  Someone /else/ can worry about the y2486 problem.
>
> <duck>
>
> Practically speaking I'd almost rather drop the precision in order to
> extend the seconds range, since timestamp updates are only precise to
> HZ anyway.

I think there would be noticeable overhead from anything that requires
a 64-bit division to update a timestamp, or to read it into an in-memory
inode, especially on 32-bit architectures.

I think the reason why separate seconds/nanoseconds are easy is that
this is what both the in-kernel and user space interface are based around.
We clearly don't use the precision, but anything else is more expensive
at runtime.

> > > > IOWs, we pretty much decided on a new 64 bit encoding format using a
> > > > epoch of 1900 and a unsigned 64bit nanosecond counter to get us a
> > > > range of almost 600 years from year 1900-2500. It's simple, easy to
> > > > encode/decode, and very easy to validate. It's also trivially easy
> > > > to do in-place upgrades with an inode flag....
> > > >
> >
> > All right, so how do we proceed?
> > Arnd, do you want to re-work your series according to this scheme?
>
> Lemme read them over again. :)
>
> > Is there any core xfs developer that was going to tackle this?
> >
> > I'm here, so if you need my help moving things forward let me know.
>
> I wrote a trivial garbage version this afternoon, will have something
> more polished tomorrow.  None of this is 5.6 material, we have time.

I think from a user perspective, it would be the nicest to just add the
extra high bits (however many, I don't care) and treat it as an ro-compatible
extension, possibly even as completely compatible both ways.

Note that for any released kernel (5.4 changes this), this matches
the existing behavior: setting a timestamp after 2038 using utimensat()
silently wraps the seconds back to the regular epoch. With the
extension patch, you get the correct results as long as the inode was
both written and read on a new enough kernel, while all pre-5.4
kernels produce the same incorrect data that they always have.

       Arnd
Dave Chinner Nov. 13, 2019, 8:40 p.m. UTC | #14
On Wed, Nov 13, 2019 at 02:20:44PM +0100, Arnd Bergmann wrote:
> On Wed, Nov 13, 2019 at 5:59 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Wed, Nov 13, 2019 at 06:42:09AM +0200, Amir Goldstein wrote:
> > > [CC:Arnd,Deepa]
> > > On Wed, Nov 13, 2019 at 5:56 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > >
> > > > On Wed, Nov 13, 2019 at 08:11:53AM +1100, Dave Chinner wrote:
> > > > > On Tue, Nov 12, 2019 at 06:00:19AM +0200, Amir Goldstein wrote:
> > > > > > On Tue, Nov 12, 2019 at 12:35 AM Darrick J. Wong
> > > > > > <darrick.wong@oracle.com> wrote:
> > > >
> > >
> > > I didn't get why you dropped the di_pad idea (Arnd's and Dave's patches).
> > > Is it to save the pad for another rainy day?
> >
> > Something like that.  Someone /else/ can worry about the y2486 problem.
> >
> > <duck>
> >
> > Practically speaking I'd almost rather drop the precision in order to
> > extend the seconds range, since timestamp updates are only precise to
> > HZ anyway.
> 
> I think there would be noticeable overhead from anything that requires
> a 64-bit division to update a timestamp, or to read it into an in-memory
> inode, especially on 32-bit architectures.

Comapred to actually allocating the inode, initialising it, etc?
The cost of a division is noise and, as such, is completely
irrelevant. It's classic premature micro-optimisation of CPU usage
without any regard for the context in which the division might
occur.

Seriously, XFS is not a filesystem for small, poorly performing 32
bit CPUs. It's a filesystem for 64 bit servers with hundreds of
CPUs, terabytes of memory and storage capable of millions of IOPs.
All I care about is that it /works/ on tiny systems and doesn't
corrupt filesystems - we've already made so many compromises in
terms of algorithmic scalability and memory usage that XFS
performance does not scale down effectively to tiny machines and
tiny filesystems.

IOWs, if you have a slow, poorly implemented 32 bit CPU with limited
RAM and storage capability, then XFS is not the filesystem for your
hardware. And in that context, the cost of a 64bit integer division
is largely irrelevant to us.

> I think the reason why separate seconds/nanoseconds are easy is that
> this is what both the in-kernel and user space interface are based around.

That's also largely irrelevant to what we store on disk. We store
lots of stuff in encoded data structures because it's more space
efficient to do so. We trade off CPU time for space efficiency
-everywhere- in the XFS filesystem, because CPU is far cheaper than
the RAM to store it and cheaper than moving information to/from disk.

> We clearly don't use the precision, but anything else is more expensive
> at runtime.

It's noise in runtime CPU profiles. It's far more important for the
filesystem on-disk format to be sane, scalable, maintainable and
repairable than it is to save a CPU cycle or two in a path
that is several thousand CPU instructions long already....

> > > Is there any core xfs developer that was going to tackle this?
> > >
> > > I'm here, so if you need my help moving things forward let me know.
> >
> > I wrote a trivial garbage version this afternoon, will have something
> > more polished tomorrow.  None of this is 5.6 material, we have time.
> 
> I think from a user perspective, it would be the nicest to just add the

The *user* does not see or know anything to do with what is on disk,
nor does the kernel code outside the disk->memory translation
functions. What the users passes to/from the kernel is irrelevant
when discussing how we store something on disk. Indeed, what the
kernel passes to the filesystem is largely irrelevant, too :)

> the existing behavior: setting a timestamp after 2038 using utimensat()
> silently wraps the seconds back to the regular epoch. With the
> extension patch, you get the correct results as long as the inode was
> both written and read on a new enough kernel, while all pre-5.4
> kernels produce the same incorrect data that they always have.

Feature bits prevent the new format being read on old kernels.
We can't allow an old kernel to parse an structure that it will
present to the user incorrectly because it was written by a more
recent kernel. Changing timestamp formats means old kernels can no
longer mount that filesystem, and that goes for your changes as well
- an old kernel will silently mishandle a >2038 date, and may even
detect it as corruption because we expect padding to be zero on
unused on-disk fields. Worse, older kernels will overwrite the new
epoch fields with zeros when the inode is re-written, destroying
the >y2038 timestamp information that is in that padding.

IOWs, old kernels do not preserve stuff written into on-disk
strucutre padding - the expect it to be zero and will write zeros
there whenever the structure is written to disk. That means these
epoch based y2038k format changes are not forwards or backwards
compatible - you can't interchange the filesystem between kernels
that do/don't support y2038k timestamps and expect it to works
correctly - feature bits are needed to prevent the timestamps from
being mis-interpretted or silently corrupted.

That means it's a one-way conversion. Hence if the old format
wrapped back to <1970, then new value written to disk will encode
that <1970 date to disk *in the new format*. Nothing gets lost or
changed in the process, and it's clear that an old kernel cannot
mangle it because it can no longer mount the filesystem that holds
new format timestamps....

Cheers,

Dave.
Amir Goldstein Nov. 18, 2019, 4:52 a.m. UTC | #15
> >
> > I wonder if your version has struct xfs_dinode_v3 or it could avoid it.
> > There is a benefit in terms of code complexity and test coverage
> > to keep the only difference between inode versions in the on-disk
> > parsers, while reading into the same struct, the same way as
> > old inode versions are read into struct xfs_dinode.
> >
> > Oh well, I can wait for tomorrow to see the polished version :-)
>
> Well now we noticed that Arnd also changed the disk quota structure
> format too, so that'll slow things down as we try to figure out how to
> reconcile 34-bit inode seconds vs. 40-bit quota timer seconds.
>
> (Or whatever happens with that)
>

Sigh. FWIW, I liked Arnd's 40-bit inode time patch because it
keeps the patch LoC for this conversion minimal.
I am *not* promoting backward compat migration, but using the
most of existing on-disk/in-core parser code and only adding
parsing of new fields in inode format v4 reduces code complexity
and improves test coverage.

Please let me know if there is anything I can do to help pushing
things forward.

Thanks,
Amir.
Dave Chinner Nov. 18, 2019, 8:22 a.m. UTC | #16
On Mon, Nov 18, 2019 at 06:52:39AM +0200, Amir Goldstein wrote:
> > >
> > > I wonder if your version has struct xfs_dinode_v3 or it could avoid it.
> > > There is a benefit in terms of code complexity and test coverage
> > > to keep the only difference between inode versions in the on-disk
> > > parsers, while reading into the same struct, the same way as
> > > old inode versions are read into struct xfs_dinode.
> > >
> > > Oh well, I can wait for tomorrow to see the polished version :-)
> >
> > Well now we noticed that Arnd also changed the disk quota structure
> > format too, so that'll slow things down as we try to figure out how to
> > reconcile 34-bit inode seconds vs. 40-bit quota timer seconds.
> >
> > (Or whatever happens with that)
> >
> 
> Sigh. FWIW, I liked Arnd's 40-bit inode time patch because it
> keeps the patch LoC for this conversion minimal.

We can extend the quota warning range without changing the on-disk
structures, and with much less code than changing the on-disk
structures.

We only need a ~500 year range for the warning expiry timestamp, and
we don't really care about fine grained resolution of the timer
expiry.

We've already got a 70 year range with the signed second counter. So
let's just redefine the timeout value on disk to use units of 10s
instead of 1s when the bigtime superblock feature bit is set. ANd
now we have our >500 year range requirement.

That shouldn't need much more than 5-10 lines of new code
translating the units when we read/write them from/to disk....

Cheers,

Dave.
Amir Goldstein Nov. 18, 2019, 9:30 a.m. UTC | #17
On Mon, Nov 18, 2019 at 10:22 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Mon, Nov 18, 2019 at 06:52:39AM +0200, Amir Goldstein wrote:
> > > >
> > > > I wonder if your version has struct xfs_dinode_v3 or it could avoid it.
> > > > There is a benefit in terms of code complexity and test coverage
> > > > to keep the only difference between inode versions in the on-disk
> > > > parsers, while reading into the same struct, the same way as
> > > > old inode versions are read into struct xfs_dinode.
> > > >
> > > > Oh well, I can wait for tomorrow to see the polished version :-)
> > >
> > > Well now we noticed that Arnd also changed the disk quota structure
> > > format too, so that'll slow things down as we try to figure out how to
> > > reconcile 34-bit inode seconds vs. 40-bit quota timer seconds.
> > >
> > > (Or whatever happens with that)
> > >
> >
> > Sigh. FWIW, I liked Arnd's 40-bit inode time patch because it
> > keeps the patch LoC for this conversion minimal.
>
> We can extend the quota warning range without changing the on-disk
> structures, and with much less code than changing the on-disk
> structures.
>
> We only need a ~500 year range for the warning expiry timestamp, and
> we don't really care about fine grained resolution of the timer
> expiry.
>
> We've already got a 70 year range with the signed second counter. So
> let's just redefine the timeout value on disk to use units of 10s
> instead of 1s when the bigtime superblock feature bit is set. ANd
> now we have our >500 year range requirement.
>
> That shouldn't need much more than 5-10 lines of new code
> translating the units when we read/write them from/to disk....
>

Sounds good.

What is your take on the issue of keeping struct xfs_dinode
and struct xfs_log_dinode common to v3..v4?

If we make struct xfs_timestamp_t/xfs_ictimestamp_t a union
of {{t_sec32;t_nsec32}, {t_nsec64}} then xfs_log_dinode_to_disk()
conversion code is conditional to di_version.
If we store v4 on-disk as {t_nsec32_hi;t_nsec32_lo} then the
conversion code from disk to log is unconditional to di_version.

Am I overthinking this?

Darrick,

I am assuming you are working on the patch.
If you would like me to re-post my patch with the decided
on-disk formats for inode and a quota patch let me know.

Thanks,
Amir.
Darrick J. Wong Nov. 19, 2019, 5:34 a.m. UTC | #18
On Mon, Nov 18, 2019 at 11:30:58AM +0200, Amir Goldstein wrote:
> On Mon, Nov 18, 2019 at 10:22 AM Dave Chinner <david@fromorbit.com> wrote:
> >
> > On Mon, Nov 18, 2019 at 06:52:39AM +0200, Amir Goldstein wrote:
> > > > >
> > > > > I wonder if your version has struct xfs_dinode_v3 or it could avoid it.
> > > > > There is a benefit in terms of code complexity and test coverage
> > > > > to keep the only difference between inode versions in the on-disk
> > > > > parsers, while reading into the same struct, the same way as
> > > > > old inode versions are read into struct xfs_dinode.
> > > > >
> > > > > Oh well, I can wait for tomorrow to see the polished version :-)
> > > >
> > > > Well now we noticed that Arnd also changed the disk quota structure
> > > > format too, so that'll slow things down as we try to figure out how to
> > > > reconcile 34-bit inode seconds vs. 40-bit quota timer seconds.
> > > >
> > > > (Or whatever happens with that)
> > > >
> > >
> > > Sigh. FWIW, I liked Arnd's 40-bit inode time patch because it
> > > keeps the patch LoC for this conversion minimal.
> >
> > We can extend the quota warning range without changing the on-disk
> > structures, and with much less code than changing the on-disk
> > structures.
> >
> > We only need a ~500 year range for the warning expiry timestamp, and
> > we don't really care about fine grained resolution of the timer
> > expiry.
> >
> > We've already got a 70 year range with the signed second counter. So
> > let's just redefine the timeout value on disk to use units of 10s
> > instead of 1s when the bigtime superblock feature bit is set. ANd
> > now we have our >500 year range requirement.
> >
> > That shouldn't need much more than 5-10 lines of new code
> > translating the units when we read/write them from/to disk....
> >
> 
> Sounds good.
> 
> What is your take on the issue of keeping struct xfs_dinode
> and struct xfs_log_dinode common to v3..v4?
> 
> If we make struct xfs_timestamp_t/xfs_ictimestamp_t a union
> of {{t_sec32;t_nsec32}, {t_nsec64}} then xfs_log_dinode_to_disk()
> conversion code is conditional to di_version.
> If we store v4 on-disk as {t_nsec32_hi;t_nsec32_lo} then the
> conversion code from disk to log is unconditional to di_version.
> 
> Am I overthinking this?

Probably not.  I changed both disk and log inodes to a union of the old
timestamp struct and a t_nsec64 (as you put it) and it works fine.

The quota timers I've simply reduced the timer resolution from 1s to 4s
to get us an extra two bits, and added a full-size in-core dquot timer
so that we only lose resolution if the fs gets unmounted.  It's kind of
a dirty trick, though.

> Darrick,
> 
> I am assuming you are working on the patch.
> If you would like me to re-post my patch with the decided
> on-disk formats for inode and a quota patch let me know.

Still working on it.  In the meantime, I have a question for you and
Arnd: I've started writing more fstests.  How does userspace query the
kernel to find out the supported range of timestamps?  fstests currently
hardcodes it, but yuck.  XFS could find an ioctl/sysfs knob to export
that info, but it really belongs in statvfs or something.

--D

> Thanks,
> Amir.
Arnd Bergmann Nov. 19, 2019, 1:37 p.m. UTC | #19
On Tue, Nov 19, 2019 at 6:36 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> On Mon, Nov 18, 2019 at 11:30:58AM +0200, Amir Goldstein wrote:
> > I am assuming you are working on the patch.
> > If you would like me to re-post my patch with the decided
> > on-disk formats for inode and a quota patch let me know.
>
> Still working on it.  In the meantime, I have a question for you and
> Arnd: I've started writing more fstests.  How does userspace query the
> kernel to find out the supported range of timestamps?  fstests currently
> hardcodes it, but yuck.  XFS could find an ioctl/sysfs knob to export
> that info, but it really belongs in statvfs or something.

David Howells has gone through 15 revisions of his fsinfo syscall,
the most recent post was this summer:

https://lwn.net/Articles/792628/
https://lore.kernel.org/lkml/156173661696.14042.17822154531324224780.stgit@warthog.procyon.org.uk/

I don't know whether he has a timeline for completing the work.

The last version was waiting on the new mount API, which has gone into
v5.2, so maybe the patches just need to be refreshed on top of v5.4.

We had previously discussed adding a generic ioctl in do_vfs_ioctl
or using up the last 128 bits of padding in struct statfs, but I think the
fsinfo() syscall would be best.

       Arnd

Patch
diff mbox series

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index c968b60cee15..227a775a9889 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -449,10 +449,12 @@  xfs_sb_has_compat_feature(
 #define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)		/* free inode btree */
 #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
 #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
+#define XFS_SB_FEAT_RO_COMPAT_EXTTIME  (1 << 3)		/* extended time_max */
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
 		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
 		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
-		 XFS_SB_FEAT_RO_COMPAT_REFLINK)
+		 XFS_SB_FEAT_RO_COMPAT_REFLINK | \
+		 XFS_SB_FEAT_RO_COMPAT_EXTTIME)
 #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
 static inline bool
 xfs_sb_has_ro_compat_feature(
@@ -546,6 +548,12 @@  static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
 		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
 }
 
+static inline bool xfs_sb_version_hasexttime(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_EXTTIME);
+}
+
 /*
  * end of superblock version macros
  */
@@ -824,8 +832,8 @@  typedef struct xfs_agfl {
 		   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
 
 typedef struct xfs_timestamp {
-	__be32		t_sec;		/* timestamp seconds */
-	__be32		t_nsec;		/* timestamp nanoseconds */
+	__be32	t_sec;		/* timestamp seconds */
+	__be32	t_nsec_epoch;	/* timestamp nanoseconds | extra epoch */
 } xfs_timestamp_t;
 
 /*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 28ab3c5255e1..aaf411da6263 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -17,6 +17,7 @@ 
 #include "xfs_trans.h"
 #include "xfs_ialloc.h"
 #include "xfs_dir2.h"
+#include "xfs_timestamp.h"
 
 #include <linux/iversion.h>
 
@@ -204,6 +205,7 @@  xfs_inode_from_disk(
 {
 	struct xfs_icdinode	*to = &ip->i_d;
 	struct inode		*inode = VFS_I(ip);
+	struct xfs_sb		*sbp = &ip->i_mount->m_sb;
 
 
 	/*
@@ -233,12 +235,9 @@  xfs_inode_from_disk(
 	 * a time before epoch is converted to a time long after epoch
 	 * on 64 bit systems.
 	 */
-	inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
-	inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
-	inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
-	inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
-	inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
-	inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
+	xfs_timestamp_di_decode(sbp, &inode->i_atime, &from->di_atime);
+	xfs_timestamp_di_decode(sbp, &inode->i_mtime, &from->di_mtime);
+	xfs_timestamp_di_decode(sbp, &inode->i_ctime, &from->di_ctime);
 	inode->i_generation = be32_to_cpu(from->di_gen);
 	inode->i_mode = be16_to_cpu(from->di_mode);
 
@@ -257,7 +256,8 @@  xfs_inode_from_disk(
 		inode_set_iversion_queried(inode,
 					   be64_to_cpu(from->di_changecount));
 		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
-		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+		to->di_crtime.t_nsec_epoch =
+			be32_to_cpu(from->di_crtime.t_nsec_epoch);
 		to->di_flags2 = be64_to_cpu(from->di_flags2);
 		to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
 	}
@@ -271,6 +271,7 @@  xfs_inode_to_disk(
 {
 	struct xfs_icdinode	*from = &ip->i_d;
 	struct inode		*inode = VFS_I(ip);
+	struct xfs_sb		*sbp = &ip->i_mount->m_sb;
 
 	to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 	to->di_onlink = 0;
@@ -283,12 +284,9 @@  xfs_inode_to_disk(
 	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 
 	memset(to->di_pad, 0, sizeof(to->di_pad));
-	to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
-	to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
-	to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
-	to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
-	to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
-	to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+	xfs_timestamp_di_encode(sbp, &inode->i_atime, &to->di_atime);
+	xfs_timestamp_di_encode(sbp, &inode->i_mtime, &to->di_mtime);
+	xfs_timestamp_di_encode(sbp, &inode->i_ctime, &to->di_ctime);
 	to->di_nlink = cpu_to_be32(inode->i_nlink);
 	to->di_gen = cpu_to_be32(inode->i_generation);
 	to->di_mode = cpu_to_be16(inode->i_mode);
@@ -307,7 +305,8 @@  xfs_inode_to_disk(
 	if (from->di_version == 3) {
 		to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
 		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
-		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+		to->di_crtime.t_nsec_epoch =
+			cpu_to_be32(from->di_crtime.t_nsec_epoch);
 		to->di_flags2 = cpu_to_be64(from->di_flags2);
 		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
 		to->di_ino = cpu_to_be64(ip->i_ino);
@@ -338,11 +337,11 @@  xfs_log_dinode_to_disk(
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 
 	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
-	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
+	to->di_atime.t_nsec_epoch = cpu_to_be32(from->di_atime.t_nsec_epoch);
 	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
-	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
+	to->di_mtime.t_nsec_epoch = cpu_to_be32(from->di_mtime.t_nsec_epoch);
 	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
-	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+	to->di_ctime.t_nsec_epoch = cpu_to_be32(from->di_ctime.t_nsec_epoch);
 
 	to->di_size = cpu_to_be64(from->di_size);
 	to->di_nblocks = cpu_to_be64(from->di_nblocks);
@@ -359,7 +358,8 @@  xfs_log_dinode_to_disk(
 	if (from->di_version == 3) {
 		to->di_changecount = cpu_to_be64(from->di_changecount);
 		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
-		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+		to->di_crtime.t_nsec_epoch =
+			cpu_to_be32(from->di_crtime.t_nsec_epoch);
 		to->di_flags2 = cpu_to_be64(from->di_flags2);
 		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
 		to->di_ino = cpu_to_be64(from->di_ino);
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e5f97c69b320..08f9d119e0d5 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -369,8 +369,8 @@  static inline int xfs_ilog_fdata(int w)
  * information.
  */
 typedef struct xfs_ictimestamp {
-	int32_t		t_sec;		/* timestamp seconds */
-	int32_t		t_nsec;		/* timestamp nanoseconds */
+	int32_t t_sec;		/* timestamp seconds */
+	uint32_t t_nsec_epoch;	/* timestamp nanoseconds | extra epoch */
 } xfs_ictimestamp_t;
 
 /*
diff --git a/fs/xfs/libxfs/xfs_timestamp.h b/fs/xfs/libxfs/xfs_timestamp.h
new file mode 100644
index 000000000000..b514a9f40704
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_timestamp.h
@@ -0,0 +1,151 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 CTERA Networks.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_TIMESTAMP_H__
+#define __XFS_TIMESTAMP_H__
+
+//#define XFS_TIMESTAMP_DEBUG
+
+#ifdef XFS_TIMESTAMP_DEBUG
+#define XFS_TIMESTAMP_EXTENDED(sbp) 1
+#else
+#define XFS_TIMESTAMP_EXTENDED(sbp) xfs_sb_version_hasexttime(sbp)
+#endif
+
+/*
+ * We use 2 unused msb of 32bit t_nsec to encode time ranges beyond y2038.
+ *
+ * We use an encoding that preserves the times for extra epoch "00":
+ *
+ * extra  msb of                         adjust for signed
+ * epoch  32-bit                         32-bit tv_sec to
+ * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
+ * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
+ * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
+ * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
+ * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
+ * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
+ * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
+ * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
+ * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
+ */
+
+#define XFS_TIMESTAMP_NSEC_BITS		30
+#define XFS_TIMESTAMP_NSEC_MASK		((1U << XFS_TIMESTAMP_NSEC_BITS) - 1)
+#define XFS_TIMESTAMP_NSEC(nsec_epoch)	((nsec_epoch) & XFS_TIMESTAMP_NSEC_MASK)
+#define XFS_TIMESTAMP_EPOCH_SHIFT	XFS_TIMESTAMP_NSEC_BITS
+#define XFS_TIMESTAMP_EPOCH_BITS	(32 - XFS_TIMESTAMP_NSEC_BITS)
+#define XFS_TIMESTAMP_EPOCH_MASK	(((1U << XFS_TIMESTAMP_EPOCH_BITS) \
+					  - 1) << XFS_TIMESTAMP_EPOCH_SHIFT)
+#define XFS_TIMESTAMP_SEC_BITS		(32 + XFS_TIMESTAMP_EPOCH_BITS)
+
+#define XFS_TIMESTAMP_SEC_MIN		S32_MIN
+#define XFS_TIMESTAMP_SEC32_MAX		S32_MAX
+#define XFS_TIMESTAMP_SEC64_MAX		((1LL << XFS_TIMESTAMP_SEC_BITS) \
+					 - 1  + S32_MIN)
+#define XFS_TIMESTAMP_SEC_MAX(sbp) \
+	(XFS_TIMESTAMP_EXTENDED(sbp) ? XFS_TIMESTAMP_SEC64_MAX : \
+					XFS_TIMESTAMP_SEC32_MAX)
+
+
+static inline int64_t xfs_timestamp_decode_sec64(int32_t sec32,
+						 uint32_t nsec_epoch)
+{
+	int64_t sec64 = sec32;
+
+	if (unlikely(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK)) {
+		sec64 += ((int64_t)(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK)) <<
+			XFS_TIMESTAMP_EPOCH_BITS;
+#ifdef XFS_TIMESTAMP_DEBUG
+		pr_info("%s: %lld.%d epoch=%x sec32=%d", __func__, sec64,
+			XFS_TIMESTAMP_NSEC(nsec_epoch),
+			(nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK), sec32);
+#endif
+	}
+	return sec64;
+}
+
+static inline int64_t xfs_timestamp_sec64(struct xfs_sb *sbp, int32_t sec32,
+					  uint32_t nsec_epoch)
+{
+	return XFS_TIMESTAMP_EXTENDED(sbp) ?
+		xfs_timestamp_decode_sec64(sec32, nsec_epoch) : sec32;
+}
+
+static inline bool xfs_timestamp_nsec_is_valid(struct xfs_sb *sbp,
+					       uint32_t nsec_epoch)
+{
+	if (!XFS_TIMESTAMP_EXTENDED(sbp) &&
+	    (nsec_epoch & XFS_TIMESTAMP_EPOCH_MASK))
+		return false;
+
+	return XFS_TIMESTAMP_NSEC(nsec_epoch) < NSEC_PER_SEC;
+}
+
+static inline bool xfs_timestamp_is_valid(struct xfs_sb *sbp,
+					  xfs_timestamp_t *dtsp)
+{
+	return xfs_timestamp_nsec_is_valid(sbp,
+				be32_to_cpu(dtsp->t_nsec_epoch));
+}
+
+static inline void xfs_timestamp_ic_decode(struct xfs_sb *sbp,
+					   struct timespec64 *time,
+					   xfs_ictimestamp_t *itsp)
+{
+	time->tv_sec = xfs_timestamp_sec64(sbp, itsp->t_sec,
+					   itsp->t_nsec_epoch);
+	time->tv_nsec = XFS_TIMESTAMP_NSEC(itsp->t_nsec_epoch);
+}
+
+static inline void xfs_timestamp_di_decode(struct xfs_sb *sbp,
+					   struct timespec64 *time,
+					   xfs_timestamp_t *dtsp)
+{
+	time->tv_sec = xfs_timestamp_sec64(sbp, be32_to_cpu(dtsp->t_sec),
+					   be32_to_cpu(dtsp->t_nsec_epoch));
+	time->tv_nsec = XFS_TIMESTAMP_NSEC(be32_to_cpu(dtsp->t_nsec_epoch));
+}
+
+static inline int32_t xfs_timestamp_encode_nsec_epoch(int64_t sec64,
+						      int32_t nsec)
+{
+	int32_t epoch = ((sec64 - (int32_t)sec64) >> XFS_TIMESTAMP_EPOCH_BITS) &
+			XFS_TIMESTAMP_EPOCH_MASK;
+
+#ifdef XFS_TIMESTAMP_DEBUG
+	if (epoch)
+		pr_info("%s: %lld.%d epoch=%x sec32=%d", __func__, sec64, nsec,
+			epoch, (int32_t)sec64);
+#endif
+	return (nsec & XFS_TIMESTAMP_NSEC_MASK) | epoch;
+}
+
+static inline int32_t xfs_timestamp_nsec_epoch(struct xfs_sb *sbp,
+					       int64_t sec64, int32_t nsec)
+{
+	return XFS_TIMESTAMP_EXTENDED(sbp) ?
+		xfs_timestamp_encode_nsec_epoch(sec64, nsec) : nsec;
+}
+
+static inline void xfs_timestamp_ic_encode(struct xfs_sb *sbp,
+					   struct timespec64 *time,
+					   xfs_ictimestamp_t *itsp)
+{
+	itsp->t_sec = (int32_t)time->tv_sec;
+	itsp->t_nsec_epoch = xfs_timestamp_nsec_epoch(sbp, time->tv_sec,
+						      time->tv_nsec);
+}
+
+static inline void xfs_timestamp_di_encode(struct xfs_sb *sbp,
+					   struct timespec64 *time,
+					   xfs_timestamp_t *dtsp)
+{
+	dtsp->t_sec = cpu_to_be32(time->tv_sec);
+	dtsp->t_nsec_epoch = cpu_to_be32(xfs_timestamp_nsec_epoch(sbp,
+						time->tv_sec, time->tv_nsec));
+}
+
+#endif /* __XFS_TIMESTAMP_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index a9ad90926b87..48c9c9e3654d 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -8,10 +8,13 @@ 
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_inode_item.h"
+#include "xfs_timestamp.h"
 
 #include <linux/iversion.h>
 
@@ -67,8 +70,8 @@  xfs_trans_ichgtime(
 	if (flags & XFS_ICHGTIME_CHG)
 		inode->i_ctime = tv;
 	if (flags & XFS_ICHGTIME_CREATE) {
-		ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
-		ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
+		xfs_timestamp_ic_encode(&ip->i_mount->m_sb, &tv,
+					&ip->i_d.di_crtime);
 	}
 }
 
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 6d483ab29e63..981f86387dc3 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -17,6 +17,7 @@ 
 #include "xfs_reflink.h"
 #include "xfs_rmap.h"
 #include "xfs_bmap_util.h"
+#include "xfs_timestamp.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
@@ -293,11 +294,9 @@  xchk_dinode(
 	}
 
 	/* di_[amc]time.nsec */
-	if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
-		xchk_ino_set_corrupt(sc, ino);
-	if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
-		xchk_ino_set_corrupt(sc, ino);
-	if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
+	if (!xfs_timestamp_is_valid(&mp->m_sb, &dip->di_atime) ||
+	    !xfs_timestamp_is_valid(&mp->m_sb, &dip->di_mtime) ||
+	    !xfs_timestamp_is_valid(&mp->m_sb, &dip->di_ctime))
 		xchk_ino_set_corrupt(sc, ino);
 
 	/*
@@ -403,7 +402,7 @@  xchk_dinode(
 	}
 
 	if (dip->di_version >= 3) {
-		if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
+		if (!xfs_timestamp_is_valid(&mp->m_sb, &dip->di_crtime))
 			xchk_ino_set_corrupt(sc, ino);
 		xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
 		xchk_inode_cowextsize(sc, dip, ino, mode, flags,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a92d4521748d..ca1538b31170 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -35,6 +35,7 @@ 
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_timestamp.h"
 
 kmem_zone_t *xfs_inode_zone;
 
@@ -851,8 +852,7 @@  xfs_ialloc(
 		inode_set_iversion(inode, 1);
 		ip->i_d.di_flags2 = 0;
 		ip->i_d.di_cowextsize = 0;
-		ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
-		ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
+		xfs_timestamp_ic_encode(&mp->m_sb, &tv, &ip->i_d.di_crtime);
 	}
 
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 726aa3bfd6e8..2b5db9af6a9d 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -18,6 +18,7 @@ 
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 #include "xfs_error.h"
+#include "xfs_timestamp.h"
 
 #include <linux/iversion.h>
 
@@ -303,6 +304,7 @@  xfs_inode_to_log_dinode(
 {
 	struct xfs_icdinode	*from = &ip->i_d;
 	struct inode		*inode = VFS_I(ip);
+	struct xfs_sb		*sbp = &ip->i_mount->m_sb;
 
 	to->di_magic = XFS_DINODE_MAGIC;
 
@@ -315,12 +317,9 @@  xfs_inode_to_log_dinode(
 
 	memset(to->di_pad, 0, sizeof(to->di_pad));
 	memset(to->di_pad3, 0, sizeof(to->di_pad3));
-	to->di_atime.t_sec = inode->i_atime.tv_sec;
-	to->di_atime.t_nsec = inode->i_atime.tv_nsec;
-	to->di_mtime.t_sec = inode->i_mtime.tv_sec;
-	to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
-	to->di_ctime.t_sec = inode->i_ctime.tv_sec;
-	to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+	xfs_timestamp_ic_encode(sbp, &inode->i_atime, &to->di_atime);
+	xfs_timestamp_ic_encode(sbp, &inode->i_mtime, &to->di_mtime);
+	xfs_timestamp_ic_encode(sbp, &inode->i_ctime, &to->di_ctime);
 	to->di_nlink = inode->i_nlink;
 	to->di_gen = inode->i_generation;
 	to->di_mode = inode->i_mode;
@@ -342,7 +341,7 @@  xfs_inode_to_log_dinode(
 	if (from->di_version == 3) {
 		to->di_changecount = inode_peek_iversion(inode);
 		to->di_crtime.t_sec = from->di_crtime.t_sec;
-		to->di_crtime.t_nsec = from->di_crtime.t_nsec;
+		to->di_crtime.t_nsec_epoch = from->di_crtime.t_nsec_epoch;
 		to->di_flags2 = from->di_flags2;
 		to->di_cowextsize = from->di_cowextsize;
 		to->di_ino = ip->i_ino;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4c7962ccb0c4..aa294597d16f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -21,6 +21,7 @@ 
 #include "xfs_dir2.h"
 #include "xfs_iomap.h"
 #include "xfs_error.h"
+#include "xfs_timestamp.h"
 
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
@@ -556,8 +557,8 @@  xfs_vn_getattr(
 	if (ip->i_d.di_version == 3) {
 		if (request_mask & STATX_BTIME) {
 			stat->result_mask |= STATX_BTIME;
-			stat->btime.tv_sec = ip->i_d.di_crtime.t_sec;
-			stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec;
+			xfs_timestamp_ic_decode(&mp->m_sb, &stat->btime,
+						&ip->i_d.di_crtime);
 		}
 	}
 
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 884950adbd16..3e55cf029414 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -19,6 +19,7 @@ 
 #include "xfs_error.h"
 #include "xfs_icache.h"
 #include "xfs_health.h"
+#include "xfs_timestamp.h"
 
 /*
  * Bulk Stat
@@ -98,7 +99,7 @@  xfs_bulkstat_one_int(
 	buf->bs_ctime = inode->i_ctime.tv_sec;
 	buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
 	buf->bs_btime = dic->di_crtime.t_sec;
-	buf->bs_btime_nsec = dic->di_crtime.t_nsec;
+	buf->bs_btime_nsec = XFS_TIMESTAMP_NSEC(dic->di_crtime.t_nsec_epoch);
 	buf->bs_gen = inode->i_generation;
 	buf->bs_mode = inode->i_mode;
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b3188ea49413..b940ce6dac07 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -35,6 +35,7 @@ 
 #include "xfs_refcount_item.h"
 #include "xfs_bmap_item.h"
 #include "xfs_reflink.h"
+#include "xfs_timestamp.h"
 
 #include <linux/magic.h>
 #include <linux/fs_context.h>
@@ -1438,8 +1439,8 @@  xfs_fc_fill_super(
 	sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
 	sb->s_max_links = XFS_MAXLINK;
 	sb->s_time_gran = 1;
-	sb->s_time_min = S32_MIN;
-	sb->s_time_max = S32_MAX;
+	sb->s_time_min = XFS_TIMESTAMP_SEC_MIN;
+	sb->s_time_max = XFS_TIMESTAMP_SEC_MAX(&mp->m_sb);
 	sb->s_iflags |= SB_I_CGROUPWB;
 
 	set_posix_acl_flag(sb);