diff mbox series

xfs: introduce "metasync" api to sync metadata to fsblock

Message ID 1570977420-3944-1-git-send-email-kernelfans@gmail.com (mailing list archive)
State New, archived
Headers show
Series xfs: introduce "metasync" api to sync metadata to fsblock | expand

Commit Message

Pingfan Liu Oct. 13, 2019, 2:37 p.m. UTC
When using fadump (firmware-assisted dump) mode on powerpc, a mismatch
between grub xfs driver and kernel xfs driver has been observed.  Note:
fadump boots up in the following sequence: firmware -> grub reads kernel
and initramfs -> kernel boots.

The process to reproduce this mismatch:
  - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
  - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
    restart" to rebuild the initramfs. Detail about the rebuilding looks
    like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
          mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
          sync
  - "echo c >/proc/sysrq-trigger".

The result:
The dump image will not be saved under /var/crashnew/* as expected, but
still saved under /var/crash.

The root cause:
As Eric pointed out, on xfs 'sync' ensures consistency by writing
metadata back to the xlog, but not necessarily to fsblock. This raises an
issue if grub can not replay the xlog before accessing the xfs files. Since
the above dir entry of initramfs should be saved as inline data with
xfs_inode, xfs_fs_sync_fs() does not guarantee it is written to fsblock.

umount can be used to write metadata to fsblock, but the filesystem can not
be umounted if it is still in use.

There are two ways to fix this mismatch: either in grub or in xfs. It may be
easier to do this on the xfs side by introducing an interface to flush
metadata to fsblock explicitly.

With this patch, metadata can be written to fsblock by:
  # update AIL
  sync
  # new introduced interface to flush metadata to fsblock
  mount -o remount,metasync mountpoint

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Eric Sandeen <esandeen@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
To: linux-xfs@vger.kernel.org
---
 fs/xfs/xfs_mount.h      |  1 +
 fs/xfs/xfs_super.c      | 15 ++++++++++++++-
 fs/xfs/xfs_trans.h      |  2 ++
 fs/xfs/xfs_trans_ail.c  | 26 +++++++++++++++++++++++++-
 fs/xfs/xfs_trans_priv.h |  1 +
 5 files changed, 43 insertions(+), 2 deletions(-)

Comments

Darrick J. Wong Oct. 13, 2019, 4:34 p.m. UTC | #1
On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> When using fadump (fireware assist dump) mode on powerpc, a mismatch
> between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> fadump boots up in the following sequence: fireware -> grub reads kernel
> and initramfs -> kernel boots.
> 
> The process to reproduce this mismatch:
>   - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
>   - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
>     restart" to rebuild the initramfs. Detail about the rebuilding looks
>     like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
>           mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
>           sync
>   - "echo c >/proc/sysrq-trigger".
> 
> The result:
> The dump image will not be saved under /var/crashnew/* as expected, but
> still saved under /var/crash.
> 
> The root cause:
> As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
> back metadata to xlog, but not necessary to fsblock. This raises issue if
> grub can not replay the xlog before accessing the xfs files. Since the
> above dir entry of initramfs should be saved as inline data with xfs_inode,
> so xfs_fs_sync_fs() does not guarantee it written to fsblock.
> 
> umount can be used to write metadata fsblock, but the filesystem can not be
> umounted if still in use.
> 
> There are two ways to fix this mismatch, either grub or xfs. It may be
> easier to do this in xfs side by introducing an interface to flush metadata
> to fsblock explicitly.
> 
> With this patch, metadata can be written to fsblock by:
>   # update AIL
>   sync
>   # new introduced interface to flush metadata to fsblock
>   mount -o remount,metasync mountpoint

I think this ought to be an ioctl or some sort of generic call since the
jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
is too dumb to recover logs but still wants to write to the fs"
checkpointing problem.

(Or maybe we should just put all that stuff in a vfat filesystem, I
don't know...)

--D

> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Eric Sandeen <esandeen@redhat.com>
> Cc: Hari Bathini <hbathini@linux.ibm.com>
> Cc: linuxppc-dev@lists.ozlabs.org
> To: linux-xfs@vger.kernel.org
> ---
>  fs/xfs/xfs_mount.h      |  1 +
>  fs/xfs/xfs_super.c      | 15 ++++++++++++++-
>  fs/xfs/xfs_trans.h      |  2 ++
>  fs/xfs/xfs_trans_ail.c  | 26 +++++++++++++++++++++++++-
>  fs/xfs/xfs_trans_priv.h |  1 +
>  5 files changed, 43 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index fdb60e0..85f32e6 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -243,6 +243,7 @@ typedef struct xfs_mount {
>  #define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
>  						   allocator */
>  #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
> +#define XFS_MOUNT_METASYNC	(1ull << 26)	/* write meta to fsblock */
>  
>  #define XFS_MOUNT_DAX		(1ULL << 62)	/* TEST ONLY! */
>  
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 8d1df9f..41df810 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -59,7 +59,7 @@ enum {
>  	Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
>  	Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
>  	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
> -	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
> +	Opt_discard, Opt_nodiscard, Opt_dax, Opt_metasync, Opt_err
>  };
>  
>  static const match_table_t tokens = {
> @@ -106,6 +106,7 @@ static const match_table_t tokens = {
>  	{Opt_discard,	"discard"},	/* Discard unused blocks */
>  	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
>  	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
> +	{Opt_metasync,	"metasync"},	/* one shot to write meta to fsblock */
>  	{Opt_err,	NULL},
>  };
>  
> @@ -338,6 +339,9 @@ xfs_parseargs(
>  			mp->m_flags |= XFS_MOUNT_DAX;
>  			break;
>  #endif
> +		case Opt_metasync:
> +			mp->m_flags |= XFS_MOUNT_METASYNC;
> +			break;
>  		default:
>  			xfs_warn(mp, "unknown mount option [%s].", p);
>  			return -EINVAL;
> @@ -1259,6 +1263,9 @@ xfs_fs_remount(
>  			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
>  			mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
>  			break;
> +		case Opt_metasync:
> +			mp->m_flags |= XFS_MOUNT_METASYNC;
> +			break;
>  		default:
>  			/*
>  			 * Logically we would return an error here to prevent
> @@ -1286,6 +1293,12 @@ xfs_fs_remount(
>  		}
>  	}
>  
> +	if (mp->m_flags & XFS_MOUNT_METASYNC) {
> +		xfs_ail_push_sync(mp->m_ail);
> +		/* one shot flag */
> +		mp->m_flags &= ~XFS_MOUNT_METASYNC;

Wait, so the mount flag magically disables itself?

This really sounds like a system call, not the kinds of long term
behavioral modifications that mount options are for.

> +	}
> +
>  	/* ro -> rw */
>  	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) {
>  		if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index 64d7f17..fcdb902 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -242,6 +242,8 @@ void		xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
>  void		xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
>  					struct xfs_buf *src_bp);
>  
> +void		xfs_ail_push_sync(struct xfs_ail *ailp);
> +
>  extern kmem_zone_t	*xfs_trans_zone;
>  
>  #endif	/* __XFS_TRANS_H__ */
> diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
> index 6ccfd75..b8d8df1 100644
> --- a/fs/xfs/xfs_trans_ail.c
> +++ b/fs/xfs/xfs_trans_ail.c
> @@ -488,7 +488,11 @@ xfsaild_push(
>  	xfs_trans_ail_cursor_done(&cur);
>  	spin_unlock(&ailp->ail_lock);
>  
> -	if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
> +	if (unlikely(mp->m_flags & XFS_MOUNT_METASYNC)) {
> +		xfs_buf_delwri_submit(&ailp->ail_buf_list);

I guess this never fails, because write IO is 100% successful?

--D

> +		ailp->ail_log_flush++;
> +		wake_up_all(&ailp->pushed_que);
> +	} else if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
>  		ailp->ail_log_flush++;
>  
>  	if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
> @@ -641,6 +645,25 @@ xfs_ail_push(
>  	wake_up_process(ailp->ail_task);
>  }
>  
> +void
> +xfs_ail_push_sync(
> +	struct xfs_ail		*ailp)
> +{
> +	xfs_lsn_t		sync_lsn;
> +	DEFINE_WAIT(wait);
> +
> +	sync_lsn = xfs_ail_max_lsn(ailp);
> +	for (;;) {
> +		xfs_ail_push(ailp, sync_lsn);
> +		prepare_to_wait(&ailp->pushed_que, &wait, TASK_INTERRUPTIBLE);
> +		if (XFS_LSN_CMP(READ_ONCE(ailp->ail_target_prev),
> +			sync_lsn) >= 0)
> +			break;
> +		schedule();
> +	}
> +	finish_wait(&ailp->pushed_que, &wait);
> +}
> +
>  /*
>   * Push out all items in the AIL immediately
>   */
> @@ -834,6 +857,7 @@ xfs_trans_ail_init(
>  	spin_lock_init(&ailp->ail_lock);
>  	INIT_LIST_HEAD(&ailp->ail_buf_list);
>  	init_waitqueue_head(&ailp->ail_empty);
> +	init_waitqueue_head(&ailp->pushed_que);
>  
>  	ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
>  			ailp->ail_mount->m_fsname);
> diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
> index 2e073c1..9fe3cc6 100644
> --- a/fs/xfs/xfs_trans_priv.h
> +++ b/fs/xfs/xfs_trans_priv.h
> @@ -61,6 +61,7 @@ struct xfs_ail {
>  	int			ail_log_flush;
>  	struct list_head	ail_buf_list;
>  	wait_queue_head_t	ail_empty;
> +	wait_queue_head_t	pushed_que;
>  };
>  
>  /*
> -- 
> 2.7.5
>
Pingfan Liu Oct. 14, 2019, 8:33 a.m. UTC | #2
On Sun, Oct 13, 2019 at 09:34:17AM -0700, Darrick J. Wong wrote:
> On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> > When using fadump (fireware assist dump) mode on powerpc, a mismatch
> > between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> > fadump boots up in the following sequence: fireware -> grub reads kernel
> > and initramfs -> kernel boots.
> > 
> > The process to reproduce this mismatch:
> >   - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
> >   - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
> >     restart" to rebuild the initramfs. Detail about the rebuilding looks
> >     like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
> >           mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
> >           sync
> >   - "echo c >/proc/sysrq-trigger".
> > 
> > The result:
> > The dump image will not be saved under /var/crashnew/* as expected, but
> > still saved under /var/crash.
> > 
> > The root cause:
> > As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
> > back metadata to xlog, but not necessary to fsblock. This raises issue if
> > grub can not replay the xlog before accessing the xfs files. Since the
> > above dir entry of initramfs should be saved as inline data with xfs_inode,
> > so xfs_fs_sync_fs() does not guarantee it written to fsblock.
> > 
> > umount can be used to write metadata fsblock, but the filesystem can not be
> > umounted if still in use.
> > 
> > There are two ways to fix this mismatch, either grub or xfs. It may be
> > easier to do this in xfs side by introducing an interface to flush metadata
> > to fsblock explicitly.
> > 
> > With this patch, metadata can be written to fsblock by:
> >   # update AIL
> >   sync
> >   # new introduced interface to flush metadata to fsblock
> >   mount -o remount,metasync mountpoint
> 
> I think this ought to be an ioctl or some sort of generic call since the
> jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
> is too dumb to recover logs but still wants to write to the fs"
> checkpointing problem.
Yes, a syscall sounds more reasonable.
> 
> (Or maybe we should just put all that stuff in a vfat filesystem, I
> don't know...)
I think it is unavoidable to involve each fs's implementation. What
about introducing an interface sync_to_fsblock(struct super_block *sb) in
the struct super_operations, then letting each fs manage its own case?
> 
> --D
> 
> > Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> > Cc: Dave Chinner <dchinner@redhat.com>
> > Cc: Eric Sandeen <esandeen@redhat.com>
> > Cc: Hari Bathini <hbathini@linux.ibm.com>
> > Cc: linuxppc-dev@lists.ozlabs.org
> > To: linux-xfs@vger.kernel.org
> > ---
> >  fs/xfs/xfs_mount.h      |  1 +
> >  fs/xfs/xfs_super.c      | 15 ++++++++++++++-
> >  fs/xfs/xfs_trans.h      |  2 ++
> >  fs/xfs/xfs_trans_ail.c  | 26 +++++++++++++++++++++++++-
> >  fs/xfs/xfs_trans_priv.h |  1 +
> >  5 files changed, 43 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> > index fdb60e0..85f32e6 100644
> > --- a/fs/xfs/xfs_mount.h
> > +++ b/fs/xfs/xfs_mount.h
> > @@ -243,6 +243,7 @@ typedef struct xfs_mount {
> >  #define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
> >  						   allocator */
> >  #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
> > +#define XFS_MOUNT_METASYNC	(1ull << 26)	/* write meta to fsblock */
> >  
> >  #define XFS_MOUNT_DAX		(1ULL << 62)	/* TEST ONLY! */
> >  
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index 8d1df9f..41df810 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -59,7 +59,7 @@ enum {
> >  	Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
> >  	Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
> >  	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
> > -	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
> > +	Opt_discard, Opt_nodiscard, Opt_dax, Opt_metasync, Opt_err
> >  };
> >  
> >  static const match_table_t tokens = {
> > @@ -106,6 +106,7 @@ static const match_table_t tokens = {
> >  	{Opt_discard,	"discard"},	/* Discard unused blocks */
> >  	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
> >  	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
> > +	{Opt_metasync,	"metasync"},	/* one shot to write meta to fsblock */
> >  	{Opt_err,	NULL},
> >  };
> >  
> > @@ -338,6 +339,9 @@ xfs_parseargs(
> >  			mp->m_flags |= XFS_MOUNT_DAX;
> >  			break;
> >  #endif
> > +		case Opt_metasync:
> > +			mp->m_flags |= XFS_MOUNT_METASYNC;
> > +			break;
> >  		default:
> >  			xfs_warn(mp, "unknown mount option [%s].", p);
> >  			return -EINVAL;
> > @@ -1259,6 +1263,9 @@ xfs_fs_remount(
> >  			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
> >  			mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
> >  			break;
> > +		case Opt_metasync:
> > +			mp->m_flags |= XFS_MOUNT_METASYNC;
> > +			break;
> >  		default:
> >  			/*
> >  			 * Logically we would return an error here to prevent
> > @@ -1286,6 +1293,12 @@ xfs_fs_remount(
> >  		}
> >  	}
> >  
> > +	if (mp->m_flags & XFS_MOUNT_METASYNC) {
> > +		xfs_ail_push_sync(mp->m_ail);
> > +		/* one shot flag */
> > +		mp->m_flags &= ~XFS_MOUNT_METASYNC;
> 
> Wait, so the mount flag magically disables itself?
> 
> This really sounds like a system call, not the kinds of long term
> behavioral modifications that mount options are for.
> 
Yeah, this one-shot behavior is not suitable for mount.
> > +	}
> > +
> >  	/* ro -> rw */
> >  	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) {
> >  		if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
> > diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> > index 64d7f17..fcdb902 100644
> > --- a/fs/xfs/xfs_trans.h
> > +++ b/fs/xfs/xfs_trans.h
> > @@ -242,6 +242,8 @@ void		xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
> >  void		xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
> >  					struct xfs_buf *src_bp);
> >  
> > +void		xfs_ail_push_sync(struct xfs_ail *ailp);
> > +
> >  extern kmem_zone_t	*xfs_trans_zone;
> >  
> >  #endif	/* __XFS_TRANS_H__ */
> > diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
> > index 6ccfd75..b8d8df1 100644
> > --- a/fs/xfs/xfs_trans_ail.c
> > +++ b/fs/xfs/xfs_trans_ail.c
> > @@ -488,7 +488,11 @@ xfsaild_push(
> >  	xfs_trans_ail_cursor_done(&cur);
> >  	spin_unlock(&ailp->ail_lock);
> >  
> > -	if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
> > +	if (unlikely(mp->m_flags & XFS_MOUNT_METASYNC)) {
> > +		xfs_buf_delwri_submit(&ailp->ail_buf_list);
> 
> I guess this never fails, because write IO is 100% successful?
No, unfortunately, xfs_buf_delwri_submit() can return an error. Do you
suggest passing the error to the sync call?

Thanks for your kind review.

Regards,
	Pingfan

[Keeping the following original content unchanged, and cc'ing Jan Kara and
linux-fsdevel because this design may touch the vfs layer]
> 
> --D
> 
> > +		ailp->ail_log_flush++;
> > +		wake_up_all(&ailp->pushed_que);
> > +	} else if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
> >  		ailp->ail_log_flush++;
> >  
> >  	if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
> > @@ -641,6 +645,25 @@ xfs_ail_push(
> >  	wake_up_process(ailp->ail_task);
> >  }
> >  
> > +void
> > +xfs_ail_push_sync(
> > +	struct xfs_ail		*ailp)
> > +{
> > +	xfs_lsn_t		sync_lsn;
> > +	DEFINE_WAIT(wait);
> > +
> > +	sync_lsn = xfs_ail_max_lsn(ailp);
> > +	for (;;) {
> > +		xfs_ail_push(ailp, sync_lsn);
> > +		prepare_to_wait(&ailp->pushed_que, &wait, TASK_INTERRUPTIBLE);
> > +		if (XFS_LSN_CMP(READ_ONCE(ailp->ail_target_prev),
> > +			sync_lsn) >= 0)
> > +			break;
> > +		schedule();
> > +	}
> > +	finish_wait(&ailp->pushed_que, &wait);
> > +}
> > +
> >  /*
> >   * Push out all items in the AIL immediately
> >   */
> > @@ -834,6 +857,7 @@ xfs_trans_ail_init(
> >  	spin_lock_init(&ailp->ail_lock);
> >  	INIT_LIST_HEAD(&ailp->ail_buf_list);
> >  	init_waitqueue_head(&ailp->ail_empty);
> > +	init_waitqueue_head(&ailp->pushed_que);
> >  
> >  	ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
> >  			ailp->ail_mount->m_fsname);
> > diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
> > index 2e073c1..9fe3cc6 100644
> > --- a/fs/xfs/xfs_trans_priv.h
> > +++ b/fs/xfs/xfs_trans_priv.h
> > @@ -61,6 +61,7 @@ struct xfs_ail {
> >  	int			ail_log_flush;
> >  	struct list_head	ail_buf_list;
> >  	wait_queue_head_t	ail_empty;
> > +	wait_queue_head_t	pushed_que;
> >  };
> >  
> >  /*
> > -- 
> > 2.7.5
> >
Christoph Hellwig Oct. 14, 2019, 8:40 a.m. UTC | #3
On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> When using fadump (fireware assist dump) mode on powerpc, a mismatch
> between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> fadump boots up in the following sequence: fireware -> grub reads kernel
> and initramfs -> kernel boots.

This isn't something new.  To fundamentally fix this you need to
implement (in-memory) log recovery in grub.  That is the only really safe
long-term solution.  But the equivalent of your patch you can already
get by freezing and unfreezing the file system using the FIFREEZE and
FITHAW ioctls.  And if my memory is serving me correctly Dave has been
preaching that to the bootloader folks for a long time, but apparently
without visible results.
Jan Kara Oct. 14, 2019, 9:43 a.m. UTC | #4
On Mon 14-10-19 16:33:15, Pingfan Liu wrote:
> On Sun, Oct 13, 2019 at 09:34:17AM -0700, Darrick J. Wong wrote:
> > On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> > > When using fadump (fireware assist dump) mode on powerpc, a mismatch
> > > between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> > > fadump boots up in the following sequence: fireware -> grub reads kernel
> > > and initramfs -> kernel boots.
> > > 
> > > The process to reproduce this mismatch:
> > >   - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
> > >   - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
> > >     restart" to rebuild the initramfs. Detail about the rebuilding looks
> > >     like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
> > >           mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
> > >           sync
> > >   - "echo c >/proc/sysrq-trigger".
> > > 
> > > The result:
> > > The dump image will not be saved under /var/crashnew/* as expected, but
> > > still saved under /var/crash.
> > > 
> > > The root cause:
> > > As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
> > > back metadata to xlog, but not necessary to fsblock. This raises issue if
> > > grub can not replay the xlog before accessing the xfs files. Since the
> > > above dir entry of initramfs should be saved as inline data with xfs_inode,
> > > so xfs_fs_sync_fs() does not guarantee it written to fsblock.
> > > 
> > > umount can be used to write metadata fsblock, but the filesystem can not be
> > > umounted if still in use.
> > > 
> > > There are two ways to fix this mismatch, either grub or xfs. It may be
> > > easier to do this in xfs side by introducing an interface to flush metadata
> > > to fsblock explicitly.
> > > 
> > > With this patch, metadata can be written to fsblock by:
> > >   # update AIL
> > >   sync
> > >   # new introduced interface to flush metadata to fsblock
> > >   mount -o remount,metasync mountpoint
> > 
> > I think this ought to be an ioctl or some sort of generic call since the
> > jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
> > is too dumb to recover logs but still wants to write to the fs"
> > checkpointing problem.
> Yes, a syscall sounds more reasonable.
> > 
> > (Or maybe we should just put all that stuff in a vfat filesystem, I
> > don't know...)
> I think it is unavoidable to involve in each fs' implementation. What
> about introducing an interface sync_to_fsblock(struct super_block *sb) in
> the struct super_operations, then let each fs manage its own case?

Well, we already have a way to achieve what you need: fsfreeze.
Traditionally, that is guaranteed to put fs into a "clean" state very much
equivalent to the fs being unmounted and that seems to be what the
bootloader wants so that it can access the filesystem without worrying
about some recovery details. So do you see any problem with replacing
'sync' in your example above with 'fsfreeze /boot && fsfreeze -u /boot'?

								Honza
Eric Sandeen Oct. 14, 2019, 1:23 p.m. UTC | #5
On 10/14/19 4:43 AM, Jan Kara wrote:
> On Mon 14-10-19 16:33:15, Pingfan Liu wrote:
>> On Sun, Oct 13, 2019 at 09:34:17AM -0700, Darrick J. Wong wrote:
>>> On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
>>>> When using fadump (fireware assist dump) mode on powerpc, a mismatch
>>>> between grub xfs driver and kernel xfs driver has been obsevered.  Note:
>>>> fadump boots up in the following sequence: fireware -> grub reads kernel
>>>> and initramfs -> kernel boots.
>>>>
>>>> The process to reproduce this mismatch:
>>>>    - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
>>>>    - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
>>>>      restart" to rebuild the initramfs. Detail about the rebuilding looks
>>>>      like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
>>>>            mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
>>>>            sync
>>>>    - "echo c >/proc/sysrq-trigger".
>>>>
>>>> The result:
>>>> The dump image will not be saved under /var/crashnew/* as expected, but
>>>> still saved under /var/crash.
>>>>
>>>> The root cause:
>>>> As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
>>>> back metadata to xlog, but not necessary to fsblock. This raises issue if
>>>> grub can not replay the xlog before accessing the xfs files. Since the
>>>> above dir entry of initramfs should be saved as inline data with xfs_inode,
>>>> so xfs_fs_sync_fs() does not guarantee it written to fsblock.
>>>>
>>>> umount can be used to write metadata fsblock, but the filesystem can not be
>>>> umounted if still in use.
>>>>
>>>> There are two ways to fix this mismatch, either grub or xfs. It may be
>>>> easier to do this in xfs side by introducing an interface to flush metadata
>>>> to fsblock explicitly.
>>>>
>>>> With this patch, metadata can be written to fsblock by:
>>>>    # update AIL
>>>>    sync
>>>>    # new introduced interface to flush metadata to fsblock
>>>>    mount -o remount,metasync mountpoint
>>>
>>> I think this ought to be an ioctl or some sort of generic call since the
>>> jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
>>> is too dumb to recover logs but still wants to write to the fs"
>>> checkpointing problem.
>> Yes, a syscall sounds more reasonable.
>>>
>>> (Or maybe we should just put all that stuff in a vfat filesystem, I
>>> don't know...)
>> I think it is unavoidable to involve in each fs' implementation. What
>> about introducing an interface sync_to_fsblock(struct super_block *sb) in
>> the struct super_operations, then let each fs manage its own case?
> 
> Well, we already have a way to achieve what you need: fsfreeze.
> Traditionally, that is guaranteed to put fs into a "clean" state very much
> equivalent to the fs being unmounted and that seems to be what the
> bootloader wants so that it can access the filesystem without worrying
> about some recovery details. So do you see any problem with replacing
> 'sync' in your example above with 'fsfreeze /boot && fsfreeze -u /boot'?
> 
> 								Honza

The problem with fsfreeze is that if the device you want to quiesce is, say,
the root fs, freeze isn't really a good option.

But the other thing I want to highlight about this approach is that it does not
solve the root problem: something is trying to read the block device without
first replaying the log.

A call such as the proposal here is only going to leave consistent metadata at
the time the call returns; at any time after that, all guarantees are off again,
so the problem hasn't been solved.

-Eric
Jan Kara Oct. 14, 2019, 8:03 p.m. UTC | #6
On Mon 14-10-19 08:23:39, Eric Sandeen wrote:
> On 10/14/19 4:43 AM, Jan Kara wrote:
> > On Mon 14-10-19 16:33:15, Pingfan Liu wrote:
> > > On Sun, Oct 13, 2019 at 09:34:17AM -0700, Darrick J. Wong wrote:
> > > > On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> > > > > When using fadump (fireware assist dump) mode on powerpc, a mismatch
> > > > > between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> > > > > fadump boots up in the following sequence: fireware -> grub reads kernel
> > > > > and initramfs -> kernel boots.
> > > > > 
> > > > > The process to reproduce this mismatch:
> > > > >    - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
> > > > >    - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
> > > > >      restart" to rebuild the initramfs. Detail about the rebuilding looks
> > > > >      like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
> > > > >            mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
> > > > >            sync
> > > > >    - "echo c >/proc/sysrq-trigger".
> > > > > 
> > > > > The result:
> > > > > The dump image will not be saved under /var/crashnew/* as expected, but
> > > > > still saved under /var/crash.
> > > > > 
> > > > > The root cause:
> > > > > As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
> > > > > back metadata to xlog, but not necessary to fsblock. This raises issue if
> > > > > grub can not replay the xlog before accessing the xfs files. Since the
> > > > > above dir entry of initramfs should be saved as inline data with xfs_inode,
> > > > > so xfs_fs_sync_fs() does not guarantee it written to fsblock.
> > > > > 
> > > > > umount can be used to write metadata fsblock, but the filesystem can not be
> > > > > umounted if still in use.
> > > > > 
> > > > > There are two ways to fix this mismatch, either grub or xfs. It may be
> > > > > easier to do this in xfs side by introducing an interface to flush metadata
> > > > > to fsblock explicitly.
> > > > > 
> > > > > With this patch, metadata can be written to fsblock by:
> > > > >    # update AIL
> > > > >    sync
> > > > >    # new introduced interface to flush metadata to fsblock
> > > > >    mount -o remount,metasync mountpoint
> > > > 
> > > > I think this ought to be an ioctl or some sort of generic call since the
> > > > jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
> > > > is too dumb to recover logs but still wants to write to the fs"
> > > > checkpointing problem.
> > > Yes, a syscall sounds more reasonable.
> > > > 
> > > > (Or maybe we should just put all that stuff in a vfat filesystem, I
> > > > don't know...)
> > > I think it is unavoidable to involve in each fs' implementation. What
> > > about introducing an interface sync_to_fsblock(struct super_block *sb) in
> > > the struct super_operations, then let each fs manage its own case?
> > 
> > Well, we already have a way to achieve what you need: fsfreeze.
> > Traditionally, that is guaranteed to put fs into a "clean" state very much
> > equivalent to the fs being unmounted and that seems to be what the
> > bootloader wants so that it can access the filesystem without worrying
> > about some recovery details. So do you see any problem with replacing
> > 'sync' in your example above with 'fsfreeze /boot && fsfreeze -u /boot'?
> > 
> > 								Honza
> 
> The problem with fsfreeze is that if the device you want to quiesce is, say,
> the root fs, freeze isn't really a good option.

I agree you need to be really careful not to deadlock against yourself in
that case. But this particular use actually has a chance to work.

> But the other thing I want to highlight about this approach is that it does not
> solve the root problem: something is trying to read the block device without
> first replaying the log.
> 
> A call such as the proposal here is only going to leave consistent metadata at
> the time the call returns; at any time after that, all guarantees are off again,
> so the problem hasn't been solved.

Oh, absolutely agreed. I was also thinking about this before sending my
reply. Once you unfreeze, the log can start filling with changes and
there's no guarantee that e.g. inode does not move as part of these
changes. But to be fair, replaying the log isn't easy either, even more so
from a bootloader. You cannot write the changes from the log back into the
filesystem as e.g. in case of suspend-to-disk the resumed kernel gets
surprised and corrupts the fs under its hands (been there, tried that). So
you must keep changes only in memory and that's not really easy in the
constrained bootloader environment.

So I guess we are left with hacks that kind of mostly work and fsfreeze is
one of those. If you don't mess with the files after fsfreeze, you're
likely to find what you need even without replaying the log.

								Honza
Eric Sandeen Oct. 14, 2019, 8:09 p.m. UTC | #7
On 10/14/19 3:03 PM, Jan Kara wrote:
> On Mon 14-10-19 08:23:39, Eric Sandeen wrote:
>> On 10/14/19 4:43 AM, Jan Kara wrote:
>>> On Mon 14-10-19 16:33:15, Pingfan Liu wrote:
>>>> On Sun, Oct 13, 2019 at 09:34:17AM -0700, Darrick J. Wong wrote:
>>>>> On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
>>>>>> When using fadump (fireware assist dump) mode on powerpc, a mismatch
>>>>>> between grub xfs driver and kernel xfs driver has been obsevered.  Note:
>>>>>> fadump boots up in the following sequence: fireware -> grub reads kernel
>>>>>> and initramfs -> kernel boots.
>>>>>>
>>>>>> The process to reproduce this mismatch:
>>>>>>     - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
>>>>>>     - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
>>>>>>       restart" to rebuild the initramfs. Detail about the rebuilding looks
>>>>>>       like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
>>>>>>             mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
>>>>>>             sync
>>>>>>     - "echo c >/proc/sysrq-trigger".
>>>>>>
>>>>>> The result:
>>>>>> The dump image will not be saved under /var/crashnew/* as expected, but
>>>>>> still saved under /var/crash.
>>>>>>
>>>>>> The root cause:
>>>>>> As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
>>>>>> back metadata to xlog, but not necessary to fsblock. This raises issue if
>>>>>> grub can not replay the xlog before accessing the xfs files. Since the
>>>>>> above dir entry of initramfs should be saved as inline data with xfs_inode,
>>>>>> so xfs_fs_sync_fs() does not guarantee it written to fsblock.
>>>>>>
>>>>>> umount can be used to write metadata fsblock, but the filesystem can not be
>>>>>> umounted if still in use.
>>>>>>
>>>>>> There are two ways to fix this mismatch, either grub or xfs. It may be
>>>>>> easier to do this in xfs side by introducing an interface to flush metadata
>>>>>> to fsblock explicitly.
>>>>>>
>>>>>> With this patch, metadata can be written to fsblock by:
>>>>>>     # update AIL
>>>>>>     sync
>>>>>>     # new introduced interface to flush metadata to fsblock
>>>>>>     mount -o remount,metasync mountpoint
>>>>>
>>>>> I think this ought to be an ioctl or some sort of generic call since the
>>>>> jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
>>>>> is too dumb to recover logs but still wants to write to the fs"
>>>>> checkpointing problem.
>>>> Yes, a syscall sounds more reasonable.
>>>>>
>>>>> (Or maybe we should just put all that stuff in a vfat filesystem, I
>>>>> don't know...)
>>>> I think it is unavoidable to involve in each fs' implementation. What
>>>> about introducing an interface sync_to_fsblock(struct super_block *sb) in
>>>> the struct super_operations, then let each fs manage its own case?
>>>
>>> Well, we already have a way to achieve what you need: fsfreeze.
>>> Traditionally, that is guaranteed to put fs into a "clean" state very much
>>> equivalent to the fs being unmounted and that seems to be what the
>>> bootloader wants so that it can access the filesystem without worrying
>>> about some recovery details. So do you see any problem with replacing
>>> 'sync' in your example above with 'fsfreeze /boot && fsfreeze -u /boot'?
>>>
>>> 								Honza
>>
>> The problem with fsfreeze is that if the device you want to quiesce is, say,
>> the root fs, freeze isn't really a good option.
> 
> I agree you need to be really careful not to deadlock against yourself in
> that case. But this particular use actually has a chance to work.
> 
>> But the other thing I want to highlight about this approach is that it does not
>> solve the root problem: something is trying to read the block device without
>> first replaying the log.
>>
>> A call such as the proposal here is only going to leave consistent metadata at
>> the time the call returns; at any time after that, all guarantees are off again,
>> so the problem hasn't been solved.
> 
> Oh, absolutely agreed. I was also thinking about this before sending my
> reply. Once you unfreeze, the log can start filling with changes and
> there's no guarantee that e.g. inode does not move as part of these
> changes. But to be fair, replaying the log isn't easy either, even more so
> from a bootloader. You cannot write the changes from the log back into the
> filesystem as e.g. in case of suspend-to-disk the resumed kernel gets
> surprised and corrupts the fs under its hands (been there, tried that). So
> you must keep changes only in memory and that's not really easy in the
> constrained bootloader environment.
> 
> So I guess we are left with hacks that kind of mostly work and fsfreeze is
> one of those. If you don't mess with the files after fsfreeze, you're
> likely to find what you need even without replaying the log.

We're in agreement here.  ;)  I only worry about implementing things like this
which sound like guarantees, but aren't, and end up encouraging bad behavior
or promoting misconceptions.

More and more, I think we should reconsider Darrick's "bootfs" (ext2 by another
name, but with extra-sync-iness) proposal...

-Eric

> 								Honza
>
Pingfan Liu Oct. 15, 2019, 1:56 a.m. UTC | #8
On Mon, Oct 14, 2019 at 01:40:27AM -0700, Christoph Hellwig wrote:
> On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> > When using fadump (fireware assist dump) mode on powerpc, a mismatch
> > between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> > fadump boots up in the following sequence: fireware -> grub reads kernel
> > and initramfs -> kernel boots.
> 
> This isn't something new.  To fundamentally fix this you need to
> implement (in-memory) log recovery in grub.  That is the only really safe
> long-term solutioin.  But the equivalent of your patch you can already
Agree. For the consistency of the whole fs, we need grub to be aware of
log. While this patch just assumes that files accessed by grub are
known, and the consistency is forced only on these files.
> get by freezing and unfreezing the file system using the FIFREEZE and
> FITHAW ioctls.  And if my memory is serving me correctly Dave has been
Freeze will block any further modification to the fs. That is different
from my patch, which does not have such a limitation.
> preaching that to the bootloader folks for a long time, but apparently
> without visible results.
Yes, it is a pity. And maybe it is not easy to do.

Thanks and regards,
	Pingfan
Pingfan Liu Oct. 15, 2019, 2:12 a.m. UTC | #9
On Mon, Oct 14, 2019 at 08:23:39AM -0500, Eric Sandeen wrote:
> On 10/14/19 4:43 AM, Jan Kara wrote:
> > On Mon 14-10-19 16:33:15, Pingfan Liu wrote:
> > > On Sun, Oct 13, 2019 at 09:34:17AM -0700, Darrick J. Wong wrote:
> > > > On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> > > > > When using fadump (fireware assist dump) mode on powerpc, a mismatch
> > > > > between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> > > > > fadump boots up in the following sequence: fireware -> grub reads kernel
> > > > > and initramfs -> kernel boots.
> > > > > 
> > > > > The process to reproduce this mismatch:
> > > > >    - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
> > > > >    - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
> > > > >      restart" to rebuild the initramfs. Detail about the rebuilding looks
> > > > >      like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
> > > > >            mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
> > > > >            sync
> > > > >    - "echo c >/proc/sysrq-trigger".
> > > > > 
> > > > > The result:
> > > > > The dump image will not be saved under /var/crashnew/* as expected, but
> > > > > still saved under /var/crash.
> > > > > 
> > > > > The root cause:
> > > > > As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
> > > > > back metadata to xlog, but not necessary to fsblock. This raises issue if
> > > > > grub can not replay the xlog before accessing the xfs files. Since the
> > > > > above dir entry of initramfs should be saved as inline data with xfs_inode,
> > > > > so xfs_fs_sync_fs() does not guarantee it written to fsblock.
> > > > > 
> > > > > umount can be used to write metadata fsblock, but the filesystem can not be
> > > > > umounted if still in use.
> > > > > 
> > > > > There are two ways to fix this mismatch, either grub or xfs. It may be
> > > > > easier to do this in xfs side by introducing an interface to flush metadata
> > > > > to fsblock explicitly.
> > > > > 
> > > > > With this patch, metadata can be written to fsblock by:
> > > > >    # update AIL
> > > > >    sync
> > > > >    # new introduced interface to flush metadata to fsblock
> > > > >    mount -o remount,metasync mountpoint
> > > > 
> > > > I think this ought to be an ioctl or some sort of generic call since the
> > > > jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
> > > > is too dumb to recover logs but still wants to write to the fs"
> > > > checkpointing problem.
> > > Yes, a syscall sounds more reasonable.
> > > > 
> > > > (Or maybe we should just put all that stuff in a vfat filesystem, I
> > > > don't know...)
> > > I think it is unavoidable to involve in each fs' implementation. What
> > > about introducing an interface sync_to_fsblock(struct super_block *sb) in
> > > the struct super_operations, then let each fs manage its own case?
> > 
> > Well, we already have a way to achieve what you need: fsfreeze.
> > Traditionally, that is guaranteed to put fs into a "clean" state very much
> > equivalent to the fs being unmounted and that seems to be what the
> > bootloader wants so that it can access the filesystem without worrying
> > about some recovery details. So do you see any problem with replacing
> > 'sync' in your example above with 'fsfreeze /boot && fsfreeze -u /boot'?
> > 
> > 								Honza
> 
> The problem with fsfreeze is that if the device you want to quiesce is, say,
> the root fs, freeze isn't really a good option.
Yes, that is the difference between my patch and fsfreeze.  But
honestly, it is a rare case where a system does not have a /boot partition.
Since the activity on /boot is very low, fsfreeze may meet the need, or we
can repeatedly retry fsfreeze until it succeeds.
> 
> But the other thing I want to highlight about this approach is that it does not
> solve the root problem: something is trying to read the block device without
> first replaying the log.
> 
> A call such as the proposal here is only going to leave consistent metadata at
> the time the call returns; at any time after that, all guarantees are off again,
My patch assumes that grub only accesses a limited set of files and ensures
consistency only for those files (kernel, initramfs).
> so the problem hasn't been solved.
Agreed. The perfect solution would be a log-aware bootloader.

Thanks and regards,
	Pingfan
Pingfan Liu Oct. 15, 2019, 2:20 a.m. UTC | #10
On Mon, Oct 14, 2019 at 10:03:03PM +0200, Jan Kara wrote:
> On Mon 14-10-19 08:23:39, Eric Sandeen wrote:
> > On 10/14/19 4:43 AM, Jan Kara wrote:
> > > On Mon 14-10-19 16:33:15, Pingfan Liu wrote:
> > > > On Sun, Oct 13, 2019 at 09:34:17AM -0700, Darrick J. Wong wrote:
> > > > > On Sun, Oct 13, 2019 at 10:37:00PM +0800, Pingfan Liu wrote:
> > > > > > When using fadump (fireware assist dump) mode on powerpc, a mismatch
> > > > > > between grub xfs driver and kernel xfs driver has been obsevered.  Note:
> > > > > > fadump boots up in the following sequence: fireware -> grub reads kernel
> > > > > > and initramfs -> kernel boots.
> > > > > > 
> > > > > > The process to reproduce this mismatch:
> > > > > >    - On powerpc, boot kernel with fadump=on and edit /etc/kdump.conf.
> > > > > >    - Replacing "path /var/crash" with "path /var/crashnew", then, "kdumpctl
> > > > > >      restart" to rebuild the initramfs. Detail about the rebuilding looks
> > > > > >      like: mkdumprd /boot/initramfs-`uname -r`.img.tmp;
> > > > > >            mv /boot/initramfs-`uname -r`.img.tmp /boot/initramfs-`uname -r`.img
> > > > > >            sync
> > > > > >    - "echo c >/proc/sysrq-trigger".
> > > > > > 
> > > > > > The result:
> > > > > > The dump image will not be saved under /var/crashnew/* as expected, but
> > > > > > still saved under /var/crash.
> > > > > > 
> > > > > > The root cause:
> > > > > > As Eric pointed out that on xfs, 'sync' ensures the consistency by writing
> > > > > > back metadata to xlog, but not necessary to fsblock. This raises issue if
> > > > > > grub can not replay the xlog before accessing the xfs files. Since the
> > > > > > above dir entry of initramfs should be saved as inline data with xfs_inode,
> > > > > > so xfs_fs_sync_fs() does not guarantee it written to fsblock.
> > > > > > 
> > > > > > umount can be used to write metadata fsblock, but the filesystem can not be
> > > > > > umounted if still in use.
> > > > > > 
> > > > > > There are two ways to fix this mismatch, either grub or xfs. It may be
> > > > > > easier to do this in xfs side by introducing an interface to flush metadata
> > > > > > to fsblock explicitly.
> > > > > > 
> > > > > > With this patch, metadata can be written to fsblock by:
> > > > > >    # update AIL
> > > > > >    sync
> > > > > >    # new introduced interface to flush metadata to fsblock
> > > > > >    mount -o remount,metasync mountpoint
> > > > > 
> > > > > I think this ought to be an ioctl or some sort of generic call since the
> > > > > jbd2 filesystems (ext3, ext4, ocfs2) suffer from the same "$BOOTLOADER
> > > > > is too dumb to recover logs but still wants to write to the fs"
> > > > > checkpointing problem.
> > > > Yes, a syscall sounds more reasonable.
> > > > > 
> > > > > (Or maybe we should just put all that stuff in a vfat filesystem, I
> > > > > don't know...)
> > > > I think it is unavoidable to involve in each fs' implementation. What
> > > > about introducing an interface sync_to_fsblock(struct super_block *sb) in
> > > > the struct super_operations, then let each fs manage its own case?
> > > 
> > > Well, we already have a way to achieve what you need: fsfreeze.
> > > Traditionally, that is guaranteed to put fs into a "clean" state very much
> > > equivalent to the fs being unmounted and that seems to be what the
> > > bootloader wants so that it can access the filesystem without worrying
> > > about some recovery details. So do you see any problem with replacing
> > > 'sync' in your example above with 'fsfreeze /boot && fsfreeze -u /boot'?
> > > 
> > > 								Honza
> > 
> > The problem with fsfreeze is that if the device you want to quiesce is, say,
> > the root fs, freeze isn't really a good option.
> 
> I agree you need to be really careful not to deadlock against yourself in
> that case. But this particular use actually has a chance to work.
> 
Yeah, normally there is a /boot partition in system, and if so, fsfreeze
can work.
> > But the other thing I want to highlight about this approach is that it does not
> > solve the root problem: something is trying to read the block device without
> > first replaying the log.
> > 
> > A call such as the proposal here is only going to leave consistent metadata at
> > the time the call returns; at any time after that, all guarantees are off again,
> > so the problem hasn't been solved.
> 
> Oh, absolutely agreed. I was also thinking about this before sending my
> reply. Once you unfreeze, the log can start filling with changes and
> there's no guarantee that e.g. inode does not move as part of these
But just as with fsync, we only guarantee the consistency before a sync. If
the involved files change again, we need another sync.
> changes. But to be fair, replaying the log isn't easy either, even more so
> from a bootloader. You cannot write the changes from the log back into the
> filesystem as e.g. in case of suspend-to-disk the resumed kernel gets
> surprised and corrupts the fs under its hands (been there, tried that). So
> you must keep changes only in memory and that's not really easy in the
> constrained bootloader environment.
Sigh, this is more complicated than I had thought. I guess we will have to
live with this bug for a long time, and use fsfreeze as a workaround.

Thanks and regards,
	Pingfan
> 
> So I guess we are left with hacks that kind of mostly work and fsfreeze is
> one of those. If you don't mess with the files after fsfreeze, you're
> likely to find what you need even without replaying the log.
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Christoph Hellwig Oct. 15, 2019, 8:01 a.m. UTC | #11
On Mon, Oct 14, 2019 at 03:09:48PM -0500, Eric Sandeen wrote:
> We're in agreement here.  ;)  I only worry about implementing things like this
> which sound like guarantees, but aren't, and end up encouraging bad behavior
> or promoting misconceptions.
> 
> More and more, I think we should reconsider Darrick's "bootfs" (ext2 by another
> name, but with extra-sync-iness) proposal...

Having a separate simple file system for the boot loader makes a lot of
sense.  Note that vfat of EFI is the best choice, but at least it is
something.  SysV Unix from the 90s actually had a special file system just
for that, and fs/bfs/ in Linux supports that.  So this isn't really a new
thing either.
Christoph Hellwig Oct. 15, 2019, 8:01 a.m. UTC | #12
On Tue, Oct 15, 2019 at 09:56:20AM +0800, Pingfan Liu wrote:
> Agree. For the consistency of the whole fs, we need grub to be aware of
> log. While this patch just assumes that files accessed by grub are
> known, and the consistency is forced only on these files.
> > get by freezing and unfreezing the file system using the FIFREEZE and
> > FITHAW ioctls.  And if my memory is serving me correctly Dave has been
> freeze will block any further modification to the fs. That is different
> from my patch, which does not have such limitation.

So you freeze and immediately unfreeze.
Theodore Ts'o Oct. 15, 2019, 1:10 p.m. UTC | #13
On Tue, Oct 15, 2019 at 01:01:02AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 14, 2019 at 03:09:48PM -0500, Eric Sandeen wrote:
> > We're in agreement here.  ;)  I only worry about implementing things like this
> > which sound like guarantees, but aren't, and end up encouraging bad behavior
> > or promoting misconceptions.
> > 
> > More and more, I think we should reconsider Darrick's "bootfs" (ext2 by another
> > name, but with extra-sync-iness) proposal...
> 
> Having a separate simple file system for the boot loader makes a lot of
> sense.  Note that vfat of EFI is the best choice, but at least it is
> something.  SysV Unix from the 90s actually had a special file system just
> for that, and fs/bfs/ in Linux supports that.  So this isn't really a new
> thing either.

Did you mean to say "vfat of EFI isn't the best choice"?

If we were going to be doing something like "bootfs", what sort of
semantics would be sufficient?  Is doing an implied fsync() on every
close(2) enough, or do we need to do something even more conservative?

	 	       	       	     - Ted
Darrick J. Wong Oct. 15, 2019, 4:18 p.m. UTC | #14
On Tue, Oct 15, 2019 at 09:10:04AM -0400, Theodore Y. Ts'o wrote:
> On Tue, Oct 15, 2019 at 01:01:02AM -0700, Christoph Hellwig wrote:
> > On Mon, Oct 14, 2019 at 03:09:48PM -0500, Eric Sandeen wrote:
> > > We're in agreement here.  ;)  I only worry about implementing things like this
> > > which sound like guarantees, but aren't, and end up encouraging bad behavior
> > > or promoting misconceptions.
> > > 
> > > More and more, I think we should reconsider Darrick's "bootfs" (ext2 by another
> > > name, but with extra-sync-iness) proposal...
> > 
> > Having a separate simple file system for the boot loader makes a lot of
> > sense.  Note that vfat of EFI is the best choice, but at least it is
> > something.  SysV Unix from the 90s actually had a special file system just
> > for that, and fs/bfs/ in Linux supports that.  So this isn't really a new
> > thing either.
> 
> Did you mean to say "vfaat of EFI isn't the best choice"?
> 
> If we were going to be doing something like "bootfs", what sort of
> semantics would be sufficient?  Is doing an implied fsync() on every
> close(2) enough, or do we need to do something even more conservative?

I'm assuming you'd also want to make sure the journal checkpoints as
part of fsync, right?  Aside from being an April Fools joke, bootfs[1]
does implement the semantics I needed to fix all the complaining about
grub being broken. 8-)

Granted there's also the systemd bootloader spec[2] which says
FAT{16,32}...

[1] https://lore.kernel.org/linux-fsdevel/20190401070001.GJ1173@magnolia/
[2] https://systemd.io/BOOT_LOADER_SPECIFICATION.html

--D

> 	 	       	       	     - Ted
diff mbox series

Patch

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fdb60e0..85f32e6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -243,6 +243,7 @@  typedef struct xfs_mount {
 #define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
 						   allocator */
 #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
+#define XFS_MOUNT_METASYNC	(1ull << 26)	/* write meta to fsblock */
 
 #define XFS_MOUNT_DAX		(1ULL << 62)	/* TEST ONLY! */
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8d1df9f..41df810 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -59,7 +59,7 @@  enum {
 	Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
 	Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
 	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
-	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
+	Opt_discard, Opt_nodiscard, Opt_dax, Opt_metasync, Opt_err
 };
 
 static const match_table_t tokens = {
@@ -106,6 +106,7 @@  static const match_table_t tokens = {
 	{Opt_discard,	"discard"},	/* Discard unused blocks */
 	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
 	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
+	{Opt_metasync,	"metasync"},	/* one shot to write meta to fsblock */
 	{Opt_err,	NULL},
 };
 
@@ -338,6 +339,9 @@  xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DAX;
 			break;
 #endif
+		case Opt_metasync:
+			mp->m_flags |= XFS_MOUNT_METASYNC;
+			break;
 		default:
 			xfs_warn(mp, "unknown mount option [%s].", p);
 			return -EINVAL;
@@ -1259,6 +1263,9 @@  xfs_fs_remount(
 			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
 			mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
 			break;
+		case Opt_metasync:
+			mp->m_flags |= XFS_MOUNT_METASYNC;
+			break;
 		default:
 			/*
 			 * Logically we would return an error here to prevent
@@ -1286,6 +1293,12 @@  xfs_fs_remount(
 		}
 	}
 
+	if (mp->m_flags & XFS_MOUNT_METASYNC) {
+		xfs_ail_push_sync(mp->m_ail);
+		/* one shot flag */
+		mp->m_flags &= ~XFS_MOUNT_METASYNC;
+	}
+
 	/* ro -> rw */
 	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) {
 		if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 64d7f17..fcdb902 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -242,6 +242,8 @@  void		xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
 void		xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
 					struct xfs_buf *src_bp);
 
+void		xfs_ail_push_sync(struct xfs_ail *ailp);
+
 extern kmem_zone_t	*xfs_trans_zone;
 
 #endif	/* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 6ccfd75..b8d8df1 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -488,7 +488,11 @@  xfsaild_push(
 	xfs_trans_ail_cursor_done(&cur);
 	spin_unlock(&ailp->ail_lock);
 
-	if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
+	if (unlikely(mp->m_flags & XFS_MOUNT_METASYNC)) {
+		xfs_buf_delwri_submit(&ailp->ail_buf_list);
+		ailp->ail_log_flush++;
+		wake_up_all(&ailp->pushed_que);
+	} else if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list))
 		ailp->ail_log_flush++;
 
 	if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
@@ -641,6 +645,25 @@  xfs_ail_push(
 	wake_up_process(ailp->ail_task);
 }
 
+/* Push to xfs_ail_max_lsn(); sleep, ignoring signals, until it is reached. */
+void
+xfs_ail_push_sync(
+	struct xfs_ail		*ailp)
+{
+	xfs_lsn_t		lsn = xfs_ail_max_lsn(ailp);
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		xfs_ail_push(ailp, lsn);
+		prepare_to_wait(&ailp->pushed_que, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (XFS_LSN_CMP(READ_ONCE(ailp->ail_target_prev), lsn) >= 0)
+			break;
+		schedule();
+	}
+	finish_wait(&ailp->pushed_que, &wait);
+}
+
 /*
  * Push out all items in the AIL immediately
  */
@@ -834,6 +857,7 @@  xfs_trans_ail_init(
 	spin_lock_init(&ailp->ail_lock);
 	INIT_LIST_HEAD(&ailp->ail_buf_list);
 	init_waitqueue_head(&ailp->ail_empty);
+	init_waitqueue_head(&ailp->pushed_que);
 
 	ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
 			ailp->ail_mount->m_fsname);
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 2e073c1..9fe3cc6 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -61,6 +61,7 @@  struct xfs_ail {
 	int			ail_log_flush;
 	struct list_head	ail_buf_list;
 	wait_queue_head_t	ail_empty;
+	wait_queue_head_t	pushed_que;
 };
 
 /*