diff mbox series

[v1,2/4] xfs: implement custom freeze/thaw functions

Message ID 20230314042109.82161-3-catherine.hoang@oracle.com (mailing list archive)
State New, archived
Headers show
Series setting uuid of online filesystems | expand

Commit Message

Catherine Hoang March 14, 2023, 4:21 a.m. UTC
Implement internal freeze/thaw functions and prevent other threads from changing
the freeze level by adding a new SB_FREEZE_ECLUSIVE level. This is required to
prevent concurrent transactions while we are updating the uuid.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/xfs/xfs_super.c | 112 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_super.h |   5 ++
 2 files changed, 117 insertions(+)

Comments

Amir Goldstein March 14, 2023, 5:11 a.m. UTC | #1
On Tue, Mar 14, 2023 at 6:25 AM Catherine Hoang
<catherine.hoang@oracle.com> wrote:
>
> Implement internal freeze/thaw functions and prevent other threads from changing
> the freeze level by adding a new SB_FREEZE_ECLUSIVE level. This is required to

This looks troubling in several ways:
- Layering violation
- Duplication of subtle vfs code

> prevent concurrent transactions while we are updating the uuid.
>

Wouldn't it be easier to hold s_umount while updating the uuid?
Let userspace freeze before XFS_IOC_SETFSUUID and let
XFS_IOC_SETFSUUID take s_umount and verify that fs is frozen.

> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
>  fs/xfs/xfs_super.c | 112 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_super.h |   5 ++
>  2 files changed, 117 insertions(+)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 2479b5cbd75e..6a52ae660810 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -2279,6 +2279,118 @@ static inline int xfs_cpu_hotplug_init(void) { return 0; }
>  static inline void xfs_cpu_hotplug_destroy(void) {}
>  #endif
>
> +/*
> + * We need to disable all writer threads, which means taking the first two
> + * freeze levels to put userspace to sleep, and the third freeze level to
> + * prevent background threads from starting new transactions.  Take one level
> + * more to prevent other callers from unfreezing the filesystem while we run.
> + */
> +int
> +xfs_internal_freeze(
> +       struct xfs_mount        *mp)
> +{
> +       struct super_block      *sb = mp->m_super;
> +       int                     level;
> +       int                     error = 0;
> +
> +       /* Wait until we're ready to freeze. */
> +       down_write(&sb->s_umount);
> +       while (sb->s_writers.frozen != SB_UNFROZEN) {
> +               up_write(&sb->s_umount);
> +               delay(HZ / 10);
> +               down_write(&sb->s_umount);
> +       }

That can easily wait forever, without any task holding any lock.

> +
> +       if (sb_rdonly(sb)) {
> +               sb->s_writers.frozen = SB_FREEZE_EXCLUSIVE;
> +               goto done;
> +       }
> +
> +       sb->s_writers.frozen = SB_FREEZE_WRITE;
> +       /* Release s_umount to preserve sb_start_write -> s_umount ordering */
> +       up_write(&sb->s_umount);
> +       percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
> +       down_write(&sb->s_umount);
> +
> +       /* Now we go and block page faults... */
> +       sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
> +       percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_PAGEFAULT - 1);
> +
> +       /*
> +        * All writers are done so after syncing there won't be dirty data.
> +        * Let xfs_fs_sync_fs flush dirty data so the VFS won't start writeback
> +        * and to disable the background gc workers.
> +        */
> +       error = sync_filesystem(sb);
> +       if (error) {
> +               sb->s_writers.frozen = SB_UNFROZEN;
> +               for (level = SB_FREEZE_PAGEFAULT - 1; level >= 0; level--)
> +                       percpu_up_write(sb->s_writers.rw_sem + level);
> +               wake_up(&sb->s_writers.wait_unfrozen);
> +               up_write(&sb->s_umount);
> +               return error;
> +       }
> +
> +       /* Now wait for internal filesystem counter */
> +       sb->s_writers.frozen = SB_FREEZE_FS;
> +       percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_FS - 1);
> +
> +       xfs_log_clean(mp);
> +
> +       /*
> +        * To prevent anyone else from unfreezing us, set the VFS freeze
> +        * level to one higher than FREEZE_COMPLETE.
> +        */
> +       sb->s_writers.frozen = SB_FREEZE_EXCLUSIVE;
> +       for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
> +               percpu_rwsem_release(sb->s_writers.rw_sem + level, 0,
> +                               _THIS_IP_);

If you really must introduce a new freeze level, you should do it in vfs
and not inside xfs, even if xfs is the only current user of the new leve.

Thanks,
Amir.
Darrick J. Wong March 14, 2023, 5:25 a.m. UTC | #2
On Tue, Mar 14, 2023 at 07:11:56AM +0200, Amir Goldstein wrote:
> On Tue, Mar 14, 2023 at 6:25 AM Catherine Hoang
> <catherine.hoang@oracle.com> wrote:
> >
> > Implement internal freeze/thaw functions and prevent other threads from changing
> > the freeze level by adding a new SB_FREEZE_ECLUSIVE level. This is required to
> 
> This looks troubling in several ways:
> - Layering violation
> - Duplication of subtle vfs code
> 
> > prevent concurrent transactions while we are updating the uuid.
> >
> 
> Wouldn't it be easier to hold s_umount while updating the uuid?

Why?  Userspace holds an open file descriptor, the fs won't get
unmounted.

> Let userspace freeze before XFS_IOC_SETFSUUID and let
> XFS_IOC_SETFSUUID take s_umount and verify that fs is frozen.

Ugh, no, I don't want *userspace* to have to know how to do that.

> > Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> > ---
> >  fs/xfs/xfs_super.c | 112 +++++++++++++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_super.h |   5 ++
> >  2 files changed, 117 insertions(+)
> >
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index 2479b5cbd75e..6a52ae660810 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -2279,6 +2279,118 @@ static inline int xfs_cpu_hotplug_init(void) { return 0; }
> >  static inline void xfs_cpu_hotplug_destroy(void) {}
> >  #endif
> >
> > +/*
> > + * We need to disable all writer threads, which means taking the first two
> > + * freeze levels to put userspace to sleep, and the third freeze level to
> > + * prevent background threads from starting new transactions.  Take one level
> > + * more to prevent other callers from unfreezing the filesystem while we run.
> > + */
> > +int
> > +xfs_internal_freeze(
> > +       struct xfs_mount        *mp)
> > +{
> > +       struct super_block      *sb = mp->m_super;
> > +       int                     level;
> > +       int                     error = 0;
> > +
> > +       /* Wait until we're ready to freeze. */
> > +       down_write(&sb->s_umount);
> > +       while (sb->s_writers.frozen != SB_UNFROZEN) {
> > +               up_write(&sb->s_umount);
> > +               delay(HZ / 10);
> > +               down_write(&sb->s_umount);
> > +       }
> 
> That can easily wait forever, without any task holding any lock.

Indeed, this needs at a bare minimum some kind of fatal_signal_pending
check every time through the loop.

> > +
> > +       if (sb_rdonly(sb)) {
> > +               sb->s_writers.frozen = SB_FREEZE_EXCLUSIVE;
> > +               goto done;
> > +       }
> > +
> > +       sb->s_writers.frozen = SB_FREEZE_WRITE;
> > +       /* Release s_umount to preserve sb_start_write -> s_umount ordering */
> > +       up_write(&sb->s_umount);
> > +       percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
> > +       down_write(&sb->s_umount);
> > +
> > +       /* Now we go and block page faults... */
> > +       sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
> > +       percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_PAGEFAULT - 1);
> > +
> > +       /*
> > +        * All writers are done so after syncing there won't be dirty data.
> > +        * Let xfs_fs_sync_fs flush dirty data so the VFS won't start writeback
> > +        * and to disable the background gc workers.
> > +        */
> > +       error = sync_filesystem(sb);
> > +       if (error) {
> > +               sb->s_writers.frozen = SB_UNFROZEN;
> > +               for (level = SB_FREEZE_PAGEFAULT - 1; level >= 0; level--)
> > +                       percpu_up_write(sb->s_writers.rw_sem + level);
> > +               wake_up(&sb->s_writers.wait_unfrozen);
> > +               up_write(&sb->s_umount);
> > +               return error;
> > +       }
> > +
> > +       /* Now wait for internal filesystem counter */
> > +       sb->s_writers.frozen = SB_FREEZE_FS;
> > +       percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_FS - 1);
> > +
> > +       xfs_log_clean(mp);

Hmm... some of these calls really ought to be returning errors.

> > +
> > +       /*
> > +        * To prevent anyone else from unfreezing us, set the VFS freeze
> > +        * level to one higher than FREEZE_COMPLETE.
> > +        */
> > +       sb->s_writers.frozen = SB_FREEZE_EXCLUSIVE;
> > +       for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
> > +               percpu_rwsem_release(sb->s_writers.rw_sem + level, 0,
> > +                               _THIS_IP_);
> 
> If you really must introduce a new freeze level, you should do it in vfs
> and not inside xfs, even if xfs is the only current user of the new leve.

Luis is already trying to do something similar to this.  So far Jan and
I seem to be the only ones who have taken a look at this fs-internal
freeze...

https://lore.kernel.org/linux-fsdevel/20230114003409.1168311-4-mcgrof@kernel.org/

--D

> Thanks,
> Amir.
Amir Goldstein March 14, 2023, 6 a.m. UTC | #3
On Tue, Mar 14, 2023 at 7:25 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Tue, Mar 14, 2023 at 07:11:56AM +0200, Amir Goldstein wrote:
> > On Tue, Mar 14, 2023 at 6:25 AM Catherine Hoang
> > <catherine.hoang@oracle.com> wrote:
> > >
> > > Implement internal freeze/thaw functions and prevent other threads from changing
> > > the freeze level by adding a new SB_FREEZE_ECLUSIVE level. This is required to
> >
> > This looks troubling in several ways:
> > - Layering violation
> > - Duplication of subtle vfs code
> >
> > > prevent concurrent transactions while we are updating the uuid.
> > >
> >
> > Wouldn't it be easier to hold s_umount while updating the uuid?
>
> Why?  Userspace holds an open file descriptor, the fs won't get
> unmounted.

"Implement internal freeze/thaw functions and prevent other threads
from changing
the freeze level..."

holding s_umount prevents changing freeze levels.

The special thing about the frozen state is that userspace is allowed
to leave the fs frozen without holding any open fd, but because there
is no such requirement from SET_FSUUID I don't understand the need
for a new freeze level.

Thanks,
Amir.
Darrick J. Wong March 16, 2023, 5:16 a.m. UTC | #4
On Tue, Mar 14, 2023 at 08:00:21AM +0200, Amir Goldstein wrote:
> On Tue, Mar 14, 2023 at 7:25 AM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Tue, Mar 14, 2023 at 07:11:56AM +0200, Amir Goldstein wrote:
> > > On Tue, Mar 14, 2023 at 6:25 AM Catherine Hoang
> > > <catherine.hoang@oracle.com> wrote:
> > > >
> > > > Implement internal freeze/thaw functions and prevent other threads from changing
> > > > the freeze level by adding a new SB_FREEZE_ECLUSIVE level. This is required to
> > >
> > > This looks troubling in several ways:
> > > - Layering violation
> > > - Duplication of subtle vfs code
> > >
> > > > prevent concurrent transactions while we are updating the uuid.
> > > >
> > >
> > > Wouldn't it be easier to hold s_umount while updating the uuid?
> >
> > Why?  Userspace holds an open file descriptor, the fs won't get
> > unmounted.
> 
> "Implement internal freeze/thaw functions and prevent other threads
> from changing
> the freeze level..."
> 
> holding s_umount prevents changing freeze levels.
> 
> The special thing about the frozen state is that userspace is allowed
> to leave the fs frozen without holding any open fd, but because there
> is no such requirement from SET_FSUUID I don't understand the need
> for a new freeze level.

Hm.  I think you could be right, since we can certainly run transactions
with s_umount held.  I wonder if we're going to run into problems with
the weird xlog things that fsuuid setting will want, but I guess we have
lockdep on our side, right? ;)

--D

> Thanks,
> Amir.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2479b5cbd75e..6a52ae660810 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2279,6 +2279,118 @@  static inline int xfs_cpu_hotplug_init(void) { return 0; }
 static inline void xfs_cpu_hotplug_destroy(void) {}
 #endif
 
+/*
+ * We need to disable all writer threads, which means taking the first two
+ * freeze levels to put userspace to sleep, and the third freeze level to
+ * prevent background threads from starting new transactions.  Take one level
+ * more to prevent other callers from unfreezing the filesystem while we run.
+ */
+int
+xfs_internal_freeze(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+	int			level;
+	int			error = 0;
+
+	/* Wait until we're ready to freeze. */
+	down_write(&sb->s_umount);
+	while (sb->s_writers.frozen != SB_UNFROZEN) {
+		up_write(&sb->s_umount);
+		delay(HZ / 10);
+		down_write(&sb->s_umount);
+	}
+
+	if (sb_rdonly(sb)) {
+		sb->s_writers.frozen = SB_FREEZE_EXCLUSIVE;
+		goto done;
+	}
+
+	sb->s_writers.frozen = SB_FREEZE_WRITE;
+	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
+	up_write(&sb->s_umount);
+	percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+	down_write(&sb->s_umount);
+
+	/* Now we go and block page faults... */
+	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
+	percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_PAGEFAULT - 1);
+
+	/*
+	 * All writers are done so after syncing there won't be dirty data.
+	 * Let xfs_fs_sync_fs flush dirty data so the VFS won't start writeback
+	 * and to disable the background gc workers.
+	 */
+	error = sync_filesystem(sb);
+	if (error) {
+		sb->s_writers.frozen = SB_UNFROZEN;
+		for (level = SB_FREEZE_PAGEFAULT - 1; level >= 0; level--)
+			percpu_up_write(sb->s_writers.rw_sem + level);
+		wake_up(&sb->s_writers.wait_unfrozen);
+		up_write(&sb->s_umount);
+		return error;
+	}
+
+	/* Now wait for internal filesystem counter */
+	sb->s_writers.frozen = SB_FREEZE_FS;
+	percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_FS - 1);
+
+	xfs_log_clean(mp);
+
+	/*
+	 * To prevent anyone else from unfreezing us, set the VFS freeze
+	 * level to one higher than FREEZE_COMPLETE.
+	 */
+	sb->s_writers.frozen = SB_FREEZE_EXCLUSIVE;
+	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+		percpu_rwsem_release(sb->s_writers.rw_sem + level, 0,
+				_THIS_IP_);
+done:
+	up_write(&sb->s_umount);
+	return 0;
+}
+
+void
+xfs_internal_unfreeze(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+	int			level;
+
+	down_write(&sb->s_umount);
+	if (sb->s_writers.frozen != SB_FREEZE_EXCLUSIVE) {
+		/* somebody snuck in and unfroze us? */
+		ASSERT(0);
+		up_write(&sb->s_umount);
+		return;
+	}
+
+	if (sb_rdonly(sb)) {
+		sb->s_writers.frozen = SB_UNFROZEN;
+		goto out;
+	}
+
+	for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+		percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0,
+				_THIS_IP_);
+
+	/*
+	 * We didn't call xfs_fs_freeze, so we can't call xfs_fs_thaw.  Start
+	 * the background gc workers that were shut down by xfs_fs_sync_fs
+	 * when we froze.
+	 */
+	xfs_blockgc_start(mp);
+	xfs_inodegc_start(mp);
+
+	sb->s_writers.frozen = SB_UNFROZEN;
+	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+		percpu_up_write(sb->s_writers.rw_sem + level);
+out:
+	wake_up(&sb->s_writers.wait_unfrozen);
+	up_write(&sb->s_umount);
+	return;
+}
+
 STATIC int __init
 init_xfs_fs(void)
 {
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 364e2c2648a8..0817cfc340ef 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -81,6 +81,8 @@  extern void xfs_qm_exit(void);
 # define XFS_WQFLAGS(wqflags)	(wqflags)
 #endif
 
+#define SB_FREEZE_EXCLUSIVE	(SB_FREEZE_COMPLETE + 1)
+
 struct xfs_inode;
 struct xfs_mount;
 struct xfs_buftarg;
@@ -98,6 +100,9 @@  extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
 
 extern struct workqueue_struct *xfs_discard_wq;
 
+extern int xfs_internal_freeze(struct xfs_mount *mp);
+extern void xfs_internal_unfreeze(struct xfs_mount *mp);
+
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
 #endif	/* __XFS_SUPER_H__ */