diff mbox

[3/3] xfs: freeze rw filesystems just prior to reboot

Message ID 20170518013242.GW4519@birch.djwong.org (mailing list archive)
State New, archived
Headers show

Commit Message

Darrick J. Wong May 18, 2017, 1:32 a.m. UTC
Apparently there are certain system software configurations that do odd
things like update the kernel and reboot without umounting the /boot fs
or remounting it readonly, either of which would push all the AIL items
out to disk.  As a result, a subsequent invocation of something like
grub (which has a frightening willingness to read a fs with a dirty log)
can read stale disk contents and/or miss files the metadata for which
have been written to the log but not checkpointed into the filesystem.

Granted, most of the time /boot is a separate partition and
systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
This "fix" is only needed for people who have one giant filesystem.

Therefore, add a reboot hook to freeze the rw filesystems (which
checkpoints the log) just prior to reboot.  This is an unfortunate and
insufficient workaround for multiple layers of inadequate external
software, but at least it will reduce boot time surprises for the "OS
updater failed to disengage the filesystem before rebooting" case.

Seeing as grub is unlikely ever to learn to replay the XFS log (and we
probably don't want it doing that), *LILO has been discontinued for at
least 18 months, and we're not quite to the point of putting kernel
files directly on the EFI System Partition, this seems like the least
crappy solution to this problem.

Yes, you're still screwed in grub if the system crashes. :)  I don't
anticipate this patch will make it upstream, but the idea could get at
least a single hearing.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_super.c |   40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Christoph Hellwig May 18, 2017, 6:28 a.m. UTC | #1
NAK.  If the answer is a reboot notifier the question is always wrong.

IF XFS has a problem with this others doe as well and we'll need to fix
it in common code.

On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
> Apparently there are certain system software configurations that do odd
> things like update the kernel and reboot without umounting the /boot fs
> or remounting it readonly, either of which would push all the AIL items
> out to disk.  As a result, a subsequent invocation of something like
> grub (which has a frightening willingness to read a fs with a dirty log)
> can read stale disk contents and/or miss files the metadata for which
> have been written to the log but not checkpointed into the filesystem.
> 
> Granted, most of the time /boot is a separate partition and
> systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
> This "fix" is only needed for people who have one giant filesystem.
> 
> Therefore, add a reboot hook to freeze the rw filesystems (which
> checkpoints the log) just prior to reboot.  This is an unfortunate and
> insufficient workaround for multiple layers of inadequate external
> software, but at least it will reduce boot time surprises for the "OS
> updater failed to disengage the filesystem before rebooting" case.
> 
> Seeing as grub is unlikely ever to learn to replay the XFS log (and we
> probably don't want it doing that), *LILO has been discontinued for at
> least 18 months, and we're not quite to the point of putting kernel
> files directly on the EFI System Partition, this seems like the least
> crappy solution to this problem.
> 
> Yes, you're still screwed in grub if the system crashes. :)  I don't
> anticipate this patch will make it upstream, but the idea could get at
> least a single hearing.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/xfs/xfs_super.c |   40 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 40 insertions(+)
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 455a575..415b1e8 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -61,6 +61,7 @@
>  #include <linux/kthread.h>
>  #include <linux/freezer.h>
>  #include <linux/parser.h>
> +#include <linux/reboot.h>
>  
>  static const struct super_operations xfs_super_operations;
>  struct bio_set *xfs_ioend_bioset;
> @@ -1982,6 +1983,38 @@ xfs_destroy_workqueues(void)
>  	destroy_workqueue(xfs_alloc_wq);
>  }
>  
> +STATIC void
> +xfs_reboot_fs(
> +	struct super_block	*sb,
> +	void			*priv)
> +{
> +	int			error;
> +
> +	if (sb->s_flags & MS_RDONLY)
> +		return;
> +	xfs_info(XFS_M(sb), "Freezing prior to reboot.");
> +	up_read(&sb->s_umount);
> +	error = freeze_super(sb);
> +	down_read(&sb->s_umount);
> +	if (error && error != -EBUSY)
> +		xfs_info(XFS_M(sb), "Unable to freeze, error=%d", error);
> +}
> +
> +STATIC int
> +xfs_reboot(
> +	struct notifier_block	*nb,
> +	ulong			event,
> +	void			*buf)
> +{
> +	iterate_supers_type(&xfs_fs_type, xfs_reboot_fs, NULL);
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block xfs_reboot_notifier = {
> +	.notifier_call = xfs_reboot,
> +	.priority = INT_MAX,
> +};
> +
>  STATIC int __init
>  init_xfs_fs(void)
>  {
> @@ -2056,8 +2089,14 @@ init_xfs_fs(void)
>  	error = register_filesystem(&xfs_fs_type);
>  	if (error)
>  		goto out_qm_exit;
> +
> +	error = register_reboot_notifier(&xfs_reboot_notifier);
> +	if (error)
> +		goto out_register_fs;
>  	return 0;
>  
> + out_register_fs:
> +	unregister_filesystem(&xfs_fs_type);
>   out_qm_exit:
>  	xfs_qm_exit();
>   out_remove_dbg_kobj:
> @@ -2089,6 +2128,7 @@ init_xfs_fs(void)
>  STATIC void __exit
>  exit_xfs_fs(void)
>  {
> +	unregister_reboot_notifier(&xfs_reboot_notifier);
>  	xfs_qm_exit();
>  	unregister_filesystem(&xfs_fs_type);
>  #ifdef DEBUG
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
---end quoted text---
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner May 18, 2017, 8:34 a.m. UTC | #2
On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
> Apparently there are certain system software configurations that do odd
> things like update the kernel and reboot without umounting the /boot fs
> or remounting it readonly, either of which would push all the AIL items
> out to disk.  As a result, a subsequent invocation of something like
> grub (which has a frightening willingness to read a fs with a dirty log)
> can read stale disk contents and/or miss files the metadata for which
> have been written to the log but not checkpointed into the filesystem.

> Granted, most of the time /boot is a separate partition and
> systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
> This "fix" is only needed for people who have one giant filesystem.

Let me guess the series of events: grub calls "sync" and says "I'm
done", then user runs an immediate reboot/shutdown and something
still running after init has killed everything but PID 1 has an open
writeable file descriptor causing the remount-ro of / to return
EBUSY and so it just shuts down/restarts with an unflushed log?

> Therefore, add a reboot hook to freeze the rw filesystems (which
> checkpoints the log) just prior to reboot.  This is an unfortunate and
> insufficient workaround for multiple layers of inadequate external
> software, but at least it will reduce boot time surprises for the "OS
> updater failed to disengage the filesystem before rebooting" case.
> 
> Seeing as grub is unlikely ever to learn to replay the XFS log (and we
> probably don't want it doing that),

If anything other than XFS code modifies the filesystem (log,
metadata or data) then we have a tainted, unsuportable filesystem
image.....

> *LILO has been discontinued for at least 18 months,

Yet Lilo still works just fine.

> and we're not quite to the point of putting kernel
> files directly on the EFI System Partition,

Really? How have we not got there yet - we were doing this almost
15 years ago with ia64 and elilo via mounting the EFI partition on
/boot....

> this seems like the least
> crappy solution to this problem.
> 
> Yes, you're still screwed in grub if the system crashes. :)

This really sounds like the perennial "grub doesn't ensure the
information it requires to boot is safely on stable storage before
reboot" problem combined with some sub-optimal init behaviour to
expose the grub issue....

Cheers,

Dave.
Darrick J. Wong May 18, 2017, 10:30 p.m. UTC | #3
On Thu, May 18, 2017 at 06:34:05PM +1000, Dave Chinner wrote:
> On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
> > Apparently there are certain system software configurations that do odd
> > things like update the kernel and reboot without umounting the /boot fs
> > or remounting it readonly, either of which would push all the AIL items
> > out to disk.  As a result, a subsequent invocation of something like
> > grub (which has a frightening willingness to read a fs with a dirty log)
> > can read stale disk contents and/or miss files the metadata for which
> > have been written to the log but not checkpointed into the filesystem.
> 
> > Granted, most of the time /boot is a separate partition and
> > systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
> > This "fix" is only needed for people who have one giant filesystem.
> 
> Let me guess the series of events: grub calls "sync" and says "I'm

dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes :)

> done", then user runs an immediate reboot/shutdown and something
> still running after init has killed everything but PID 1 has an open

Worse than that, actually -- it was plymouthd, aka the splash screen.
If plymouthd isn't running, then the ro remount succeeds (not that
systemd actually checks) and grub is fine afterwards.

> writeable file descriptor causing the remount-ro of / to return
> EBUSY and so it just shuts down/restarts with an unflushed log?

Yes, it's /that/ problem again, that you and I were going 'round and
'round about a month or two ago.  I decided that I could at least try to
get something merged to reduce the user pain, even if the real problem
is herpy derpy userspace.

> > Therefore, add a reboot hook to freeze the rw filesystems (which
> > checkpoints the log) just prior to reboot.  This is an unfortunate and
> > insufficient workaround for multiple layers of inadequate external
> > software, but at least it will reduce boot time surprises for the "OS
> > updater failed to disengage the filesystem before rebooting" case.
> > 
> > Seeing as grub is unlikely ever to learn to replay the XFS log (and we
> > probably don't want it doing that),
> 
> If anything other than XFS code modifies the filesystem (log,
> metadata or data) then we have a tainted, unsuportable filesystem
> image.....

Indeed.

> > *LILO has been discontinued for at least 18 months,
> 
> Yet Lilo still works just fine.

Ok fine it's been /totally stable/ for 18 months. ;)
https://lilo.alioth.debian.org/

FWIW lilo isn't compatible with reflinked inodes (admittedly unlikely on
/boot) but 

> > and we're not quite to the point of putting kernel
> > files directly on the EFI System Partition,
> 
> Really? How have we not got there yet - we were doing this almost
> 15 years ago with ia64 and elilo via mounting the EFI partition on
> /boot....

elilo also seems dead, according to its SF page.
https://sourceforge.net/projects/elilo/

I'm not sure why we don't just drop kernel+initrd into the ESP and
create a bootloader entry via efibootmgr, other than the ages ago
reports about EFI firmwares bricking if the nvram fills up with data:
https://mjg59.dreamwidth.org/22855.html

Though I imagine certain distros don't want to have to get their kernels
signed by Microsoft so that Secure Boot works or whatever, etc. etc.
I guess elilo was a nice shim for that.

> > this seems like the least
> > crappy solution to this problem.
> > 
> > Yes, you're still screwed in grub if the system crashes. :)
> 
> This really sounds like the perennial "grub doesn't ensure the
> information it requires to boot is safely on stable storage before
> reboot" problem combined with some sub-optimal init behaviour to
> expose the grub issue....

Yep!  Anyway Christoph is right, this isn't something that plagues only
XFS; Ted was also musing that ext4 likely needs the same workaround, so
I'll go move this to fsdevel. :)

--D

> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Murphy May 19, 2017, 7:09 p.m. UTC | #4
On Thu, May 18, 2017 at 4:30 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:
> On Thu, May 18, 2017 at 06:34:05PM +1000, Dave Chinner wrote:
>> On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
>> > Apparently there are certain system software configurations that do odd
>> > things like update the kernel and reboot without umounting the /boot fs
>> > or remounting it readonly, either of which would push all the AIL items
>> > out to disk.  As a result, a subsequent invocation of something like
>> > grub (which has a frightening willingness to read a fs with a dirty log)
>> > can read stale disk contents and/or miss files the metadata for which
>> > have been written to the log but not checkpointed into the filesystem.
>>
>> > Granted, most of the time /boot is a separate partition and
>> > systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
>> > This "fix" is only needed for people who have one giant filesystem.
>>
>> Let me guess the series of events: grub calls "sync" and says "I'm
>
> dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes :)


It has nothing to do with GRUB. The exact same problem would happen
regardless of bootloader because the thing writing out bootloader
configuration prior to reboot is grubby, which is not at all in any
way related to GRUB.

I've explained this before, and now Dave's continued misstatements are
propagating to Darrick. If you guys want to believe in untrue things,
have at it. But please stop repeating untrue things.


>
>> done", then user runs an immediate reboot/shutdown and something
>> still running after init has killed everything but PID 1 has an open
>
> Worse than that, actually -- it was plymouthd, aka the splash screen.
> If plymouthd isn't running, then the ro remount succeeds (not that
> systemd actually checks) and grub is fine afterwards.
>
>> writeable file descriptor causing the remount-ro of / to return
>> EBUSY and so it just shuts down/restarts with an unflushed log?
>
> Yes, it's /that/ problem again, that you and I were going 'round and
> 'round about a month or two ago.  I decided that I could at least try to
> get something merged to reduce the user pain, even if the real problem
> is herpy derpy userspace.


Note that plymouth is doing the wrong thing per systemd's own
documentation. Plymouth has asked systemd to be exempt from being
killed, which systemd honors, but documentation says that programs
should not request such exemption on root file systems and should
instead run from the initramfs if they must be non-killable.

Both systemd and plymouth upstreams are aware of this and have been
looking into their own solution the problem, I don't know why they
consider the fix invasive, but that's how it's been characterized.

And I've argued to systemd folks that they need to take some
responsibility for this because this happens during their
offline-update.target which which a particular boot mode that is
explicitly designed for system software updates, and it's used because
it's supposed to be a safe, stable, known environment, compared to
doing updates while a bunch of stuff including a desktop environment
is running. And yet - poof - it yanks the file system out from under
itself.

Now originally they were blaming the file systems, saying that sync()
is supposed to guarantee everything, data and metadata is completely
written to stable media. But I think that definition of sync()
predates journaled file systems, so now there's broad understanding in
fs circles that journaled file systems only guarantee the journal and
data are committed to stable media, not the fs metadata itself. And do
require sync() to apply to fs metadata, I suspect means file systems
would become slower than molasses in winter.


>
>> > Therefore, add a reboot hook to freeze the rw filesystems (which
>> > checkpoints the log) just prior to reboot.  This is an unfortunate and
>> > insufficient workaround for multiple layers of inadequate external
>> > software, but at least it will reduce boot time surprises for the "OS
>> > updater failed to disengage the filesystem before rebooting" case.
>> >
>> > Seeing as grub is unlikely ever to learn to replay the XFS log (and we
>> > probably don't want it doing that),
>>
>> If anything other than XFS code modifies the filesystem (log,
>> metadata or data) then we have a tainted, unsuportable filesystem
>> image.....
>
> Indeed.

Doesn't mount -o ro still do journal replay but then doesn't write any
fixes back to stable media? Why can't the bootloader do this? GRUB2
for a rather long time now has a 4GiB memory limit on 32-bit and GRUB
devs have said this could be lifted higher on 64-bit. There is no
640KiB limit for GRUB.


>
>> > *LILO has been discontinued for at least 18 months,
>>
>> Yet Lilo still works just fine.
>
> Ok fine it's been /totally stable/ for 18 months. ;)
> https://lilo.alioth.debian.org/
>
> FWIW lilo isn't compatible with reflinked inodes (admittedly unlikely on
> /boot) but


This whole LILO thing is irritating. I don't know how many times I
have to say it...

grubby is the sole thing responsible for writing bootloader
configuration changes, no matter the bootloader, on Red Hat and Fedora
systems. There is absolutely no difference between LILO and GRUB
bootloader configuration changes on these distros.


>
>> > and we're not quite to the point of putting kernel
>> > files directly on the EFI System Partition,
>>
>> Really? How have we not got there yet - we were doing this almost
>> 15 years ago with ia64 and elilo via mounting the EFI partition on
>> /boot....
>
> elilo also seems dead, according to its SF page.
> https://sourceforge.net/projects/elilo/
>
> I'm not sure why we don't just drop kernel+initrd into the ESP and
> create a bootloader entry via efibootmgr,


I explained this too already.

So long as there is dual boot, this is a dead end. There isn't enough
room on ESP's for this, and it can't be grown, and it's unreliable to
have two ESPs on  the same system due to myriad UEFI bugs, and also it
confuses Windows. So it's not ever going to happen except on Linux
only systems.



>> This really sounds like the perennial "grub doesn't ensure the
>> information it requires to boot is safely on stable storage before
>> reboot" problem combined with some sub-optimal init behaviour to
>> expose the grub issue....
>
> Yep!  Anyway Christoph is right, this isn't something that plagues only
> XFS; Ted was also musing that ext4 likely needs the same workaround, so
> I'll go move this to fsdevel. :)


*facepalm* You guys are driving me crazy.

1. The grub.cfg is modified by grubby. Not grub. The same damn problem
would happen no matter what bootloader is used.
2. It's not just the grub.cfg that cannot be found by GRUB. It can't
find the new kernel file, any of its modules, or the new initramfs.
None of that has XFS file system metadata either. It's all simply not
there as far as the bootloader is concerned. And all of those things
were written by RPM.

So to be consistent you have to blame RPM for not ensuring its writes
are safely on stable storage either, before reboot.

Jesus Christ...

I want Dave to write "this problem has nothing to do with GRUB" 50
times on a chalkboard.

And then I want him to strace grub2-mkconfig (which grubby does not
use, which no Fedora or Red Hat system uses except one time during the
original installation of the system) to prove his claim that grub
isn't ensuring bootloader configuration info isn't getting to stable
storage. Otherwise this is just handwaiving without evidence. If the
GRUB folks are doing something wrong, seeing as all other distros do
rely upon it, then it needs to get fixed. But claiming things without
evidence is super shitty.

Now I just tried to strace grub2-mkconfig with -ff and I get literally
2880+ files. That is batshit crazy, but aside from that I think the
most relevant child process that writes out the actual final grub.cfg
is this:

https://paste.fedoraproject.org/paste/iksyeiYhxAIgbrbrugqOzV5M1UNdIGYhyRLivL9gydE=

I don't know what its not doing that it should be doing, but then also
RPM must also not being doing it.
Darrick J. Wong May 19, 2017, 9 p.m. UTC | #5
On Fri, May 19, 2017 at 01:09:31PM -0600, Chris Murphy wrote:
> On Thu, May 18, 2017 at 4:30 PM, Darrick J. Wong
> <darrick.wong@oracle.com> wrote:
> > On Thu, May 18, 2017 at 06:34:05PM +1000, Dave Chinner wrote:
> >> On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
> >> > Apparently there are certain system software configurations that do odd
> >> > things like update the kernel and reboot without umounting the /boot fs
> >> > or remounting it readonly, either of which would push all the AIL items
> >> > out to disk.  As a result, a subsequent invocation of something like
> >> > grub (which has a frightening willingness to read a fs with a dirty log)
> >> > can read stale disk contents and/or miss files the metadata for which
> >> > have been written to the log but not checkpointed into the filesystem.
> >>
> >> > Granted, most of the time /boot is a separate partition and
> >> > systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
> >> > This "fix" is only needed for people who have one giant filesystem.
> >>
> >> Let me guess the series of events: grub calls "sync" and says "I'm
> >
> > dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes :)
> 
> 
> It has nothing to do with GRUB. The exact same problem would happen
> regardless of bootloader because the thing writing out bootloader
> configuration prior to reboot is grubby, which is not at all in any
> way related to GRUB.
> 
> I've explained this before, and now Dave's continued misstatements are
> propagating to Darrick. If you guys want to believe in untrue things,
> have at it. But please stop repeating untrue things.

I need to clarify here what I meant by
"dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes". is that the thing
that writes to the filesystem prior to the reboot -- the package manager
and the boot configuration file writer.  Not the grub stage1/2/3, not
the lilo binary, not any of the bootloaders.  I won't speak for Dave but
I think you and I are roughly on the same page here.

> >> done", then user runs an immediate reboot/shutdown and something
> >> still running after init has killed everything but PID 1 has an open
> >
> > Worse than that, actually -- it was plymouthd, aka the splash screen.
> > If plymouthd isn't running, then the ro remount succeeds (not that
> > systemd actually checks) and grub is fine afterwards.
> >
> >> writeable file descriptor causing the remount-ro of / to return
> >> EBUSY and so it just shuts down/restarts with an unflushed log?
> >
> > Yes, it's /that/ problem again, that you and I were going 'round and
> > 'round about a month or two ago.  I decided that I could at least try to
> > get something merged to reduce the user pain, even if the real problem
> > is herpy derpy userspace.
> 
> 
> Note that plymouth is doing the wrong thing per systemd's own
> documentation. Plymouth has asked systemd to be exempt from being
> killed, which systemd honors, but documentation says that programs
> should not request such exemption on root file systems and should
> instead run from the initramfs if they must be non-killable.
> 
> Both systemd and plymouth upstreams are aware of this and have been
> looking into their own solution the problem, I don't know why they
> consider the fix invasive, but that's how it's been characterized.
> 
> And I've argued to systemd folks that they need to take some
> responsibility for this because this happens during their
> offline-update.target which which a particular boot mode that is
> explicitly designed for system software updates, and it's used because
> it's supposed to be a safe, stable, known environment, compared to
> doing updates while a bunch of stuff including a desktop environment
> is running. And yet - poof - it yanks the file system out from under
> itself.
> 
> Now originally they were blaming the file systems, saying that sync()
> is supposed to guarantee everything, data and metadata is completely
> written to stable media. But I think that definition of sync()
> predates journaled file systems, so now there's broad understanding in
> fs circles that journaled file systems only guarantee the journal and
> data are committed to stable media, not the fs metadata itself.

Right.  Everything's committed to stable media /somewhere/, but the
metadata isn't necessarily where an fs driver will find it if that
driver ignores the dirty log.

> And do require sync() to apply to fs metadata, I suspect means file
> systems would become slower than molasses in winter.

Probably.

> >> > Therefore, add a reboot hook to freeze the rw filesystems (which
> >> > checkpoints the log) just prior to reboot.  This is an unfortunate and
> >> > insufficient workaround for multiple layers of inadequate external
> >> > software, but at least it will reduce boot time surprises for the "OS
> >> > updater failed to disengage the filesystem before rebooting" case.
> >> >
> >> > Seeing as grub is unlikely ever to learn to replay the XFS log (and we
> >> > probably don't want it doing that),
> >>
> >> If anything other than XFS code modifies the filesystem (log,
> >> metadata or data) then we have a tainted, unsuportable filesystem
> >> image.....
> >
> > Indeed.
> 
> Doesn't mount -o ro still do journal replay but then doesn't write any
> fixes back to stable media?

There are multiple opinions about what an ro mount means.  The most
common I've heard are "userspace cannot change files but the fs can
write to the underlying block device" and "userspace cannot change files
and the fs cannot write to the underlying block device".  XFS does the
second if the block device is ro, and the first if it is rw.

> Why can't the bootloader do this? GRUB2 for a rather long time now has
> a 4GiB memory limit on 32-bit and GRUB devs have said this could be
> lifted higher on 64-bit. There is no 640KiB limit for GRUB.

I don't think the XFS community will look fondly on a second competing
log implementation since the XFS log is rather more complex than jbd2.

> >> > *LILO has been discontinued for at least 18 months,
> >>
> >> Yet Lilo still works just fine.
> >
> > Ok fine it's been /totally stable/ for 18 months. ;)
> > https://lilo.alioth.debian.org/
> >
> > FWIW lilo isn't compatible with reflinked inodes (admittedly unlikely on
> > /boot) but
> 
> 
> This whole LILO thing is irritating. I don't know how many times I
> have to say it...
> 
> grubby is the sole thing responsible for writing bootloader
> configuration changes, no matter the bootloader, on Red Hat and Fedora
> systems. There is absolutely no difference between LILO and GRUB
> bootloader configuration changes on these distros.

I'm tired of getting beat over the head about this.  There's no grubby
on Debian!

# apt-cache search grubby
<nothing>

# dpkg -S /etc/kernel/postinst.d/zz-update-grub 
grub-efi-amd64: /etc/kernel/postinst.d/zz-update-grub

zz-update-grub calls update-grub:

# dpkg -S $(which update-grub)
grub2-common: /usr/sbin/update-grub

update-grub calls grub-mkconfig:

# dpkg -S $(which grub-mkconfig)
grub-common: /usr/sbin/grub-mkconfig

# head -n30 /usr/sbin/grub-mkconfig
head -n30 !$
head -n30 /usr/sbin/grub-mkconfig
#! /bin/sh
set -e

# Generate grub.cfg by inspecting /boot contents.
# Copyright (C) 2006,2007,2008,2009,2010 Free Software Foundation, Inc.
#
# GRUB is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# GRUB is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# strace -f -o /tmp/a update-grub
# egrep '(/boot.*cfg|sync)' /tmp/a
20879 execve("/usr/share/djwong-colorgcc/grub-mkconfig", ["grub-mkconfig", "-o", "/boot/grub/grub.cfg"], [/* 31 vars */]) = -1 ENOENT (No such file or directory)
20879 execve("/usr/local/sbin/grub-mkconfig", ["grub-mkconfig", "-o", "/boot/grub/grub.cfg"], [/* 31 vars */]) = -1 ENOENT (No such file or directory)
20879 execve("/usr/local/bin/grub-mkconfig", ["grub-mkconfig", "-o", "/boot/grub/grub.cfg"], [/* 31 vars */]) = -1 ENOENT (No such file or directory)
20879 execve("/usr/sbin/grub-mkconfig", ["grub-mkconfig", "-o", "/boot/grub/grub.cfg"], [/* 31 vars */]) = 0
20882 write(1, "/boot/grub/grub.cfg\n", 20 <unfinished ...>
20879 <... read resumed> "/boot/grub/grub.cfg\n", 128) = 20
20961 execve("/bin/rm", ["rm", "-f", "/boot/grub/grub.cfg.new"], [/* 50 vars */]) = 0
20961 newfstatat(AT_FDCWD, "/boot/grub/grub.cfg.new", 0x1f48948, AT_SYMLINK_NOFOLLOW) = -1 ENOENT (No such file or directory)
20961 unlinkat(AT_FDCWD, "/boot/grub/grub.cfg.new", 0) = -1 ENOENT (No such file or directory)
20879 open("/boot/grub/grub.cfg.new", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 3
22199 execve("/bin/grep", ["grep", "^password", "/boot/grub/grub.cfg.new"], [/* 50 vars */]) = 0
22199 openat(AT_FDCWD, "/boot/grub/grub.cfg.new", O_RDONLY|O_NOCTTY) = 3
22200 execve("/bin/chmod", ["chmod", "444", "/boot/grub/grub.cfg.new"], [/* 50 vars */]) = 0
22200 stat("/boot/grub/grub.cfg.new", {st_mode=S_IFREG|0600, st_size=10329, ...}) = 0
22200 fchmodat(AT_FDCWD, "/boot/grub/grub.cfg.new", 0444) = 0
22201 execve("/usr/bin/grub-script-check", ["/usr/bin/grub-script-check", "/boot/grub/grub.cfg.new"], [/* 50 vars */]) = 0
22201 open("/boot/grub/grub.cfg.new", O_RDONLY) = 3
22202 execve("/bin/mv", ["mv", "-f", "/boot/grub/grub.cfg.new", "/boot/grub/grub.cfg"], [/* 50 vars */]) = 0
22202 stat("/boot/grub/grub.cfg", {st_mode=S_IFREG|0444, st_size=10329, ...}) = 0
22202 lstat("/boot/grub/grub.cfg.new", {st_mode=S_IFREG|0444, st_size=10329, ...}) = 0
22202 lstat("/boot/grub/grub.cfg", {st_mode=S_IFREG|0444, st_size=10329, ...}) = 0
22202 rename("/boot/grub/grub.cfg.new", "/boot/grub/grub.cfg") = 0

(No *sync(), huh?)

So, no, grub actually /does/ generate grub.cfg on Debian.  Let's go have
a look at RHEL7.3, which I agree does have grubby:

# rpm -qf /etc/kernel/postinst.d/51-dracut-rescue-postinst.sh 
dracut-config-rescue-033-463.0.2.el7.x86_64

51-dracut-rescue-postinst.sh calls new-kernel-pkg:

# rpm -qf $(which new-kernel-pkg)
grubby-8.28-21.0.1.el7_3.x86_64
# grub-mkconfig
-bash: grub-mkconfig: command not found
# strace -f -o /tmp/a grubby --update-kernel /boot/vmlinuz-3.10.0-514.16.1.el7.x86_64
...
# grep /boot.*cfg /tmp/a
14568 readlink("grub2.cfg", "../boot/grub2/grub.cfg", 256) = 22
14568 open("../boot/grub2/grub.cfg-", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 3
14568 stat("../boot/grub2/grub.cfg", {st_mode=S_IFREG|0644, st_size=8731, ...}) = 0
14568 chmod("../boot/grub2/grub.cfg-", 0644) = 0
14572 fsync(4)
14568 rename("../boot/grub2/grub.cfg-", "../boot/grub2/grub.cfg") = 0

(we only fsync the grubenv file??)

So yes, grubby does handle things for RHEL and Fedora, but RHEL and
Fedora are not the entire universe.

> >> > and we're not quite to the point of putting kernel
> >> > files directly on the EFI System Partition,
> >>
> >> Really? How have we not got there yet - we were doing this almost
> >> 15 years ago with ia64 and elilo via mounting the EFI partition on
> >> /boot....
> >
> > elilo also seems dead, according to its SF page.
> > https://sourceforge.net/projects/elilo/
> >
> > I'm not sure why we don't just drop kernel+initrd into the ESP and
> > create a bootloader entry via efibootmgr,
> 
> 
> I explained this too already.
> 
> So long as there is dual boot, this is a dead end. There isn't enough
> room on ESP's for this, and it can't be grown, and it's unreliable to
> have two ESPs on  the same system due to myriad UEFI bugs, and also it
> confuses Windows. So it's not ever going to happen except on Linux
> only systems.

Fair enough.  Though truth be told, /boot can be small and unresizable
which just leads to problems if the distro doesn't take care of reaping
old kernel packages before installing new ones.

Though /boot tends to be bigger than the default ESPs I see.

> >> This really sounds like the perennial "grub doesn't ensure the
> >> information it requires to boot is safely on stable storage before
> >> reboot" problem combined with some sub-optimal init behaviour to
> >> expose the grub issue....
> >
> > Yep!  Anyway Christoph is right, this isn't something that plagues only
> > XFS; Ted was also musing that ext4 likely needs the same workaround, so
> > I'll go move this to fsdevel. :)
> 
> 
> *facepalm* You guys are driving me crazy.
> 
> 1. The grub.cfg is modified by grubby. Not grub.  The same damn
> problem would happen no matter what bootloader is used.
>
> 2. It's not just the grub.cfg that cannot be found by GRUB. It can't
> find the new kernel file, any of its modules, or the new initramfs.
> None of that has XFS file system metadata either.

Those new files /do/ have XFS metadata; the metadata is sitting
(uncheckpointed) in the log.

> It's all simply not there as far as the bootloader is concerned.

Yes, because the bootloader ignores dirty logs and goes looking for
directories and files, the metadata for which is sitting
(uncheckpointed) in the log.

> And all of those things were written by RPM.

Would it help if I mentioned this --

*sync() -> write data to disk, write metadata to log
FIFREEZE() -> sync() and write log contents to fs.
unmount() -> sync() write log contents to fs.
reboot() -> sync() and reboot.

So if you're going to write data to a filesystem that has to be read by
a piece of software that /ignores dirty logs/, you have to write the log
to disk before you reboot.  This can be unmount or freeze, take your
pick.  You cannot sync, because sync only ensures that everything is
/somewhere/ on the stable medium.  It does not guarantee that log
contents have been written to the fs.

> So to be consistent you have to blame RPM for not ensuring its writes
> are safely on stable storage either, before reboot.
> 
> Jesus Christ...
> 
> I want Dave to write "this problem has nothing to do with GRUB" 50
> times on a chalkboard.

Frankly, all you have to do is tweak the grub config file writer to
"fsfreeze -f /boot && fsfreeze -u /boot" after it writes the grub.cfg and
before it exits.  That's both grubby and grub-mkconfig.

Or systemd/sysvinit could always ensure that the fs containing the boot
files is unmounted no matter what.  Or systemd/sysvinit could freeze the
fs for us before asking for a reboot.  Or the kernel could freeze the fs
for us before actually doing the reboot.  Any one of those options is
sufficient, but not one of them is reliably performed by any of the
pieces.  systemd *almost* always succeed at unmounting /boot, which is
why few people see this problem.

> And then I want him to strace grub2-mkconfig (which grubby does not
> use, which no Fedora or Red Hat system uses except one time during the
> original installation of the system) to prove his claim that grub
> isn't ensuring bootloader configuration info isn't getting to stable
> storage. Otherwise this is just handwaiving without evidence. If the
> GRUB folks are doing something wrong, seeing as all other distros do
> rely upon it, then it needs to get fixed. But claiming things without
> evidence is super shitty.
> 
> Now I just tried to strace grub2-mkconfig with -ff and I get literally
> 2880+ files. That is batshit crazy, but aside from that I think the
> most relevant child process that writes out the actual final grub.cfg
> is this:
> 
> https://paste.fedoraproject.org/paste/iksyeiYhxAIgbrbrugqOzV5M1UNdIGYhyRLivL9gydE=
> 
> I don't know what its not doing that it should be doing, but then also
> RPM must also not being doing it.
> 
> -- 
> Chris Murphy
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Murphy May 20, 2017, 12:27 a.m. UTC | #6
On Fri, May 19, 2017 at 3:00 PM, Darrick J. Wong
<darrick.wong@oracle.com> wrote:


> Would it help if I mentioned this --
>
> *sync() -> write data to disk, write metadata to log
> FIFREEZE() -> sync() and write log contents to fs.
> unmount() -> sync() write log contents to fs.
> reboot() -> sync() and reboot.
>
> So if you're going to write data to a filesystem that has to be read by
> a piece of software that /ignores dirty logs/, you have to write the log
> to disk before you reboot.  This can be unmount or freeze, take your
> pick.  You cannot sync, because sync only ensures that everything is
> /somewhere/ on the stable medium.  It does not guarantee that log
> contents have been written to the fs.

Yes that helps quite a bit.


> Frankly, all you have to do is tweak the grub config file writer to
> "fsfreeze -f /boot && fsfreeze -u /boot" after it writes the grub.cfg and
> before it exits.  That's both grubby and grub-mkconfig.

I am willing to bet that their assumption has been, since ancient
times, that PID 1 umounts the file system before reboot, thereby
writing log contents to the file system. And it's not just these two
programs, there's probably bunches of kernel updates that are done by
myriad scripts that likewise assume on umount happening before reboot.


For sure systemd devs did not previously understand that sync() does
not write the log contents to the file system, and I think they are
open to the idea of falling back to FIFREEZE if remount-ro or umount
fail, before doing a reboot. That is functionally equivalent to what
they thought sync() was doing, after all.

Does that that alter your assessment of where to fix this?


> Or systemd/sysvinit could always ensure that the fs containing the boot
> files is unmounted no matter what.

systemd has an explicit feature allowing programs to opt out and so
far the feature can be abused contrary to documentation. So it's
definitely not reasonable (anymore) that PID 1 guarantees remount-ro
or umount prior to reboot.


>Or systemd/sysvinit could freeze the
> fs for us before asking for a reboot.

I think that's very plausible, but I do not speak for the systemd folks.

>Or the kernel could freeze the fs
> for us before actually doing the reboot.  Any one of those options is
> sufficient, but not one of them is reliably performed by any of the
> pieces.  systemd *almost* always succeed at unmounting /boot, which is
> why few people see this problem.

I can't assess what is better, systemd doing freeze, vs the kernel, vs
bootloader scripts. I think the path of least resistance is systemd
doing it, that more closely mimics historical assumptions anyway.

If there is a better way of doing it than history proposes, that's
fine also. But I think the fsdevel folks should actually have a direct
conversation with the systemd folks, to arrest the wrong assumptions
about sync() vs FIFREEZE(), and avoid duplicate efforts.
Dave Chinner May 22, 2017, 2:01 a.m. UTC | #7
On Fri, May 19, 2017 at 02:00:40PM -0700, Darrick J. Wong wrote:
> On Fri, May 19, 2017 at 01:09:31PM -0600, Chris Murphy wrote:
> > On Thu, May 18, 2017 at 4:30 PM, Darrick J. Wong
> > <darrick.wong@oracle.com> wrote:
> > > On Thu, May 18, 2017 at 06:34:05PM +1000, Dave Chinner wrote:
> > >> On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
> > >> > Apparently there are certain system software configurations that do odd
> > >> > things like update the kernel and reboot without umounting the /boot fs
> > >> > or remounting it readonly, either of which would push all the AIL items
> > >> > out to disk.  As a result, a subsequent invocation of something like
> > >> > grub (which has a frightening willingness to read a fs with a dirty log)
> > >> > can read stale disk contents and/or miss files the metadata for which
> > >> > have been written to the log but not checkpointed into the filesystem.
> > >>
> > >> > Granted, most of the time /boot is a separate partition and
> > >> > systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
> > >> > This "fix" is only needed for people who have one giant filesystem.
> > >>
> > >> Let me guess the series of events: grub calls "sync" and says "I'm
> > >
> > > dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes :)
> > 
> > 
> > It has nothing to do with GRUB. The exact same problem would happen
> > regardless of bootloader because the thing writing out bootloader
> > configuration prior to reboot is grubby, which is not at all in any
> > way related to GRUB.

Chris, I've told you previously this is wrong (it's incorrect for
lilo) and too narrow (grubby is not the only update infrastructure
around), but I'll repeat it again because we need to move past your
single-distro focus on grubby and discuss the underlying problems.

> > I've explained this before, and now Dave's continued misstatements are
> > propagating to Darrick. If you guys want to believe in untrue things,
> > have at it. But please stop repeating untrue things.
> 
> I need to clarify here what I meant by
> "dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes". is that the thing
> that writes to the filesystem prior to the reboot -- the package manager
> and the boot configuration file writer.  Not the grub stage1/2/3, not
> the lilo binary, not any of the bootloaders.

> I won't speak for Dave but
> I think you and I are roughly on the same page here.

Mostly. If you s/grub/grub update infrastructure/ in what I wrote,
then the first aspect of the description is the same.

However, IMO problem does indeed lie with the bootloader and not the
distro packaging mechanism/scripts, and so we need to talk a bit out
the architectural differences between bootloaders like lilo and
grub/petitboot and why I consider update durability to be something
the bootloader needs to provide, not thrid party packagers or the
init system.

> > Now originally they were blaming the file systems, saying that sync()
> > is supposed to guarantee everything, data and metadata is completely
> > written to stable media. But I think that definition of sync()
> > predates journaled file systems, so now there's broad understanding in
> > fs circles that journaled file systems only guarantee the journal and
> > data are committed to stable media, not the fs metadata itself.
> 
> Right.  Everything's committed to stable media /somewhere/, but the
> metadata isn't necessarily where an fs driver will find it if that
> driver ignores the dirty log.

Right. ISTR that early FS journalling papers from the late 80s
talked about this in detail, and why they didn't need to write back
metadata to provide sync(1) guarantees.  All journalling filesystems
since then have simply written the data+journal on sync(1).  These
sorts of basic OS concepts should be known by *everyone* writing low
level OS infrastructure...

> > This whole LILO thing is irritating. I don't know how many times I
> > have to say it...
> > 
> > grubby is the sole thing responsible for writing bootloader
> > configuration changes, no matter the bootloader, on Red Hat and Fedora
> > systems. There is absolutely no difference between LILO and GRUB
> > bootloader configuration changes on these distros.
> 
> I'm tired of getting beat over the head about this.  There's no grubby
> on Debian!

I've told Chris that several times, too. He's still shouting at me
(and you!) about grubby, though...

> (No *sync(), huh?)
...
> (we only fsync the grubenv file??)

Yikes! It's worse than I thought.

So, with that all out of the way, lets look at a lilo update.
My laptop (debian) running lilo:

$ sudo vi /etc/lilo.conf
<modify and save new config file to simulate dpkg installing a new
 kernel package, updating /boot and various links and modifying the
 /etc/lilo.conf file>
$ sudo strace -o t.t lilo
Added Linux  *
Added Linux.old
$ grep sync t.t
sync()                                  = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
fdatasync(6)                            = 0
sync()                                  = 0
sync()                                  = 0
$ 

The files decriptors of note are:

open("/dev/disk/by-id/ata-C400-MTFDDAK256MAM_000000001305036641EA", O_RDWR) = 5
open("/boot/map~", O_RDWR)              = 6

And here's the last ~10 operations done:

write(6, "\353N\0\0\0\0LILO\30\2\3360\"Y\2\3\0\0\266\0\0\0\0\0\225;\340V\2\0", 32) = 32
close(6)                                = 0
lseek(5, 0, SEEK_SET)                   = 0
sync()                                  = 0
write(5, "\372\353!\1\264\1LILO\30\2\3360\"Y\0\0\0\0\303\3049Q\224\236\7\0\241\0\200`"..., 512) = 512
close(5)                                = 0
rename("/boot/map~", "/boot/map")       = 0
sync()                                  = 0
close(4)                                = 0
exit_group(0)

This says that lilo uses a safe overwrite proceedure to do the
on-disk bootloader update. i.e. the initial sync ensures all the new
kernel updates and config file mods are written to disk. It then
maps all the files and writes the /boot/map~ file and syncs
everything. At this point, the new boot information is on the block
device. It then writes the new boot sector, renames /boot/map~ to it's
proper name, then runs sync. This atomic update of the boot sector
switches to the new boot map and makes the switch permanent.

Hence when the lilo binary exits, the information required by the
bootloader is *completely on stable storage*. At each stage, right
up to the atomic boot sector overwrite, a crash or failure leaves
the old boot map and boot sector intact. IOWs, lilo has a *fail-safe
update procedure*.

As a result, you do *not* need to call sync or freeze the filesystem
after running the lilo command because it has already guaranteed the
updated bootloader information is on stable storage.  And because
you have to run lilo to update the bootloader, distro package
managers *can't fuck it up* because the bootloader has full control
of the update process.

IOWs, Lilo updates are based on a fool-proof design.  It's also
fail-safe on many levels because not only is the update mechanism
fail-safe, config errors/failures will be caught during update
rather before the system is restarted rather than when the config
files are parsed during bootloader execution when it errors may be
difficult/impossible to fix.

Grub, unfortunately, does not have a fail-safe update procedure, and
it clearly does not have a fool-proof update architecture (as this
thread demonstrates). If we expect distro packaging tools to do
reliable boot loader updates, then the bootloader needs to provide
both fail-safe updates and do it via a fool-proof mechanism. It's no
good having a fail-safe update mechanism if it does not get used to
do updates. Grub currently fails on both accounts....

> > So to be consistent you have to blame RPM for not ensuring its writes
> > are safely on stable storage either, before reboot.
> > 
> > Jesus Christ...
> > 
> > I want Dave to write "this problem has nothing to do with GRUB" 50
> > times on a chalkboard.
> 
> Frankly, all you have to do is tweak the grub config file writer to
> "fsfreeze -f /boot && fsfreeze -u /boot" after it writes the grub.cfg and
> before it exits.  That's both grubby and grub-mkconfig.

Yup, that's the work-around I've previously suggested should be
added to grub and distro update mechanisms.  It doesn't fix the
underlying architectural issues grub has (e.g. updates are still
not fail-safe), but it provides the durability guarantee that is
required if the whole update process completes.

> Or systemd/sysvinit could always ensure that the fs containing the boot
> files is unmounted no matter what.  Or systemd/sysvinit could freeze the
> fs for us before asking for a reboot.  Or the kernel could freeze the fs
> for us before actually doing the reboot.  Any one of those options is
> sufficient, but not one of them is reliably performed by any of the
> pieces.  systemd *almost* always succeed at unmounting /boot, which is
> why few people see this problem.

Design principles of modularity and coupling say it's really up to
the bootloader to provide the durability mechanisms it needs.  IOWs,
the *bootloader update infrastructure* needs to provide the
durability guarantee/mechanism the bootloader infrastructure
requires (as per the lilo example) and not the init system.

Cheers,

Dave.
Dave Chinner May 22, 2017, 2:07 a.m. UTC | #8
On Fri, May 19, 2017 at 06:27:53PM -0600, Chris Murphy wrote:
> On Fri, May 19, 2017 at 3:00 PM, Darrick J. Wong
> <darrick.wong@oracle.com> wrote:
> > Frankly, all you have to do is tweak the grub config file writer to
> > "fsfreeze -f /boot && fsfreeze -u /boot" after it writes the grub.cfg and
> > before it exits.  That's both grubby and grub-mkconfig.
> 
> I am willing to bet that their assumption has been, since ancient
> times, that PID 1 umounts the file system before reboot, thereby
> writing log contents to the file system. And it's not just these two
> programs, there's probably bunches of kernel updates that are done by
> myriad scripts that likewise assume on umount happening before reboot.

This is not relevant to the discussion, because sync(1) is
sufficient for those updates to be accessible after reboot because
the filesystem replays the log at mount time and the changes are not
accessed until the filesystem is mounted.

IOWs, the problem is purely a bootloader issue due to the fact it is
the only software on the system that directly accesses the
filesystem metadata prior to log recovery being performed at boot
time.

Cheers,

Dave.
Chris Murphy May 22, 2017, 8:46 p.m. UTC | #9
On Sun, May 21, 2017 at 8:01 PM, Dave Chinner <david@fromorbit.com> wrote:
> On Fri, May 19, 2017 at 02:00:40PM -0700, Darrick J. Wong wrote:
>> On Fri, May 19, 2017 at 01:09:31PM -0600, Chris Murphy wrote:
>> > On Thu, May 18, 2017 at 4:30 PM, Darrick J. Wong
>> > <darrick.wong@oracle.com> wrote:
>> > > On Thu, May 18, 2017 at 06:34:05PM +1000, Dave Chinner wrote:
>> > >> On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
>> > >> > Apparently there are certain system software configurations that do odd
>> > >> > things like update the kernel and reboot without umounting the /boot fs
>> > >> > or remounting it readonly, either of which would push all the AIL items
>> > >> > out to disk.  As a result, a subsequent invocation of something like
>> > >> > grub (which has a frightening willingness to read a fs with a dirty log)
>> > >> > can read stale disk contents and/or miss files the metadata for which
>> > >> > have been written to the log but not checkpointed into the filesystem.
>> > >>
>> > >> > Granted, most of the time /boot is a separate partition and
>> > >> > systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
>> > >> > This "fix" is only needed for people who have one giant filesystem.
>> > >>
>> > >> Let me guess the series of events: grub calls "sync" and says "I'm
>> > >
>> > > dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes :)
>> >
>> >
>> > It has nothing to do with GRUB. The exact same problem would happen
>> > regardless of bootloader because the thing writing out bootloader
>> > configuration prior to reboot is grubby, which is not at all in any
>> > way related to GRUB.
>
> Chris, I've told you previously this is wrong (it's incorrect for
> lilo) and too narrow (grubby is not the only update infrastructure
> around), but I'll repeat it again because we need to move past your
> single-distro focus on grubby and discuss the underlying problems.


And I'm trying to get you to move past your ancient legacy bootloader
focus and discuss things that matter in 2017.

First, nothing works like LILO, not even ELILO works like LILO.

Second, I have only been able to reproduce this problem with grubby +
XFS. If I manually do grub2-mkconfig + reboot -f instead, the problem
does not happen. It might be that I'm too slow, and the marginal
amount of extra time permits the new grub.cfg to fully commit, but I
don't know.

Third, basic bootloader concepts:

The bootloader is the executable that's loaded by the firmware after POST.

The bootloader installer is not the bootloader. /sbin/lilo,
/sbin/extlinux, /sbin/grub-install are not bootloaders, they are
bootloader installers. Only lilo totally reinstalls the bootloader
binary when configurations change. GRUB, extlinux, uboot, zipl don't
do that. Their configuration files change and that's it, and they do
not depend on boot sectors for this, they don't write outside the file
system to the drive like lilo. They depend on the file system.

So what you are stuck with, what you have in 2017, are bootloaders
that do not work the way you describe, and will not ever work like
what you describe.



> However, IMO problem does indeed lie with the bootloader and not the
> distro packaging mechanism/scripts, and so we need to talk a bit out
> the architectural differences between bootloaders like lilo and
> grub/petitboot and why I consider update durability to be something
> the bootloader needs to provide, not thrid party packagers or the
> init system.

Either the bootloader needs to learn how to read dirty logs, or there
can be no dirty log entries related to three things: kernel,
initramfs, and bootloader configuration. If any of those three things
are not fully committed to the file system metadata,  the bootloader
will fail to find them.

GRUB, extlinux, uboot - they are reading the file system to find their
configuration files, the kernel and the initramfs. That's not how LILO
works. And putting it on some pedestal as if that's how these other
bootloader ought to work in order to be fail safe, is ridiculous on
its face. It's basically demanding they do a total architectural
change and rewrite from scratch.

If GRUB or grubby are not being used, then the bootloader
configuration file is most likely modified by a script in the kernel
package. How do you avoid burdening the kernel package from update
durability when it is responsible for writing the kernel, initramfs,
and the third most likely thing to modify the bootloader
configuration?

You're also saying that sync() is not sufficient, and also haven't
answered the question if it's sane for the kernel package manager as
well as the bootloader configuration modifier (be it the kernel
package, grubby, or grub-mkconfig) to assume that something else is
going to reliably remount-ro or umount or freeze the file system and
thus fully commit these changes.

If that is not sane, then you're burdening all of them with freeze
because they aren't in a position to remount or umount.


>> > Now originally they were blaming the file systems, saying that sync()
>> > is supposed to guarantee everything, data and metadata is completely
>> > written to stable media. But I think that definition of sync()
>> > predates journaled file systems, so now there's broad understanding in
>> > fs circles that journaled file systems only guarantee the journal and
>> > data are committed to stable media, not the fs metadata itself.
>>
>> Right.  Everything's committed to stable media /somewhere/, but the
>> metadata isn't necessarily where an fs driver will find it if that
>> driver ignores the dirty log.
>
> Right. ISTR that early FS journalling papers from the late 80s
> talked about this in detail, and why they didn't need to write back
> metadata to provide sync(1) guarantees.  All journalling filesystems
> since then have simply written the data+journal on sync(1).  These
> sorts of basic OS concepts should be known by *everyone* writing low
> level OS infrastructure...



*shrug* ok well there's evidence they do not in fact know this. So
start imagining what other confusion exists from wrong assumptions
about basic concepts, and how to go about mitigating this.







>> > This whole LILO thing is irritating. I don't know how many times I
>> > have to say it...
>> >
>> > grubby is the sole thing responsible for writing bootloader
>> > configuration changes, no matter the bootloader, on Red Hat and Fedora
>> > systems. There is absolutely no difference between LILO and GRUB
>> > bootloader configuration changes on these distros.
>>
>> I'm tired of getting beat over the head about this.  There's no grubby
>> on Debian!
>
> I've told Chris that several times, too. He's still shouting at me
> (and you!) about grubby, though...
>
>> (No *sync(), huh?)
> ...
>> (we only fsync the grubenv file??)
>
> Yikes! It's worse than I thought.
>
> So, with that all out of the way, lets look at a lilo update.
> My laptop (debian) running lilo:
>
> $ sudo vi /etc/lilo.conf
> <modify and save new config file to simulate dpkg installing a new
>  kernel package, updating /boot and various links and modifying the
>  /etc/lilo.conf file>
> $ sudo strace -o t.t lilo
> Added Linux  *
> Added Linux.old
> $ grep sync t.t
> sync()                                  = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> fdatasync(6)                            = 0
> sync()                                  = 0
> sync()                                  = 0
> $
>
> The files decriptors of note are:
>
> open("/dev/disk/by-id/ata-C400-MTFDDAK256MAM_000000001305036641EA", O_RDWR) = 5
> open("/boot/map~", O_RDWR)              = 6
>
> And here's the last ~10 operations done:
>
> write(6, "\353N\0\0\0\0LILO\30\2\3360\"Y\2\3\0\0\266\0\0\0\0\0\225;\340V\2\0", 32) = 32
> close(6)                                = 0
> lseek(5, 0, SEEK_SET)                   = 0
> sync()                                  = 0
> write(5, "\372\353!\1\264\1LILO\30\2\3360\"Y\0\0\0\0\303\3049Q\224\236\7\0\241\0\200`"..., 512) = 512
> close(5)                                = 0
> rename("/boot/map~", "/boot/map")       = 0
> sync()                                  = 0
> close(4)                                = 0
> exit_group(0)
>
> This says that lilo uses a safe overwrite proceedure to do the
> on-disk bootloader update. i.e. the initial sync ensures all the new
> kernel updates and config file mods are written to disk. It then
> maps all the files and writes the /boot/map~ file and syncs
> everything. At this point, the new boot information is on the block
> device. It then writes the new boot sector, renames /boot/map~ to it's
> proper name, then runs sync. This atomic update of the boot sector
> switches to the new boot map and makes the switch permanent.
>
> Hence when the lilo binary exits, the information required by the
> bootloader is *completely on stable storage*. At each stage, right
> up to the atomic boot sector overwrite, a crash or failure leaves
> the old boot map and boot sector intact. IOWs, lilo has a *fail-safe
> update procedure*.
>
> As a result, you do *not* need to call sync or freeze the filesystem
> after running the lilo command because it has already guaranteed the
> updated bootloader information is on stable storage.  And because
> you have to run lilo to update the bootloader, distro package
> managers *can't fuck it up* because the bootloader has full control
> of the update process.

Haha. Well it can't fuck it up because it's doing a total end run
around the file system. Not even ELILO can do that. There is no such
thing as boot sectors, and writing data outside the file system on a
UEFI computer.




>
> IOWs, Lilo updates are based on a fool-proof design.  It's also
> fail-safe on many levels because not only is the update mechanism
> fail-safe, config errors/failures will be caught during update
> rather before the system is restarted rather than when the config
> files are parsed during bootloader execution when it errors may be
> difficult/impossible to fix.
>
> Grub, unfortunately, does not have a fail-safe update procedure, and
> it clearly does not have a fool-proof update architecture (as this
> thread demonstrates). If we expect distro packaging tools to do
> reliable boot loader updates, then the bootloader needs to provide
> both fail-safe updates and do it via a fool-proof mechanism. It's no
> good having a fail-safe update mechanism if it does not get used to
> do updates. Grub currently fails on both accounts....


Fool proof design because it doesn't trust the file system, at all, for writing.

Actually GRUB has a functional equivalent, it's just that it's not the
upstream default, and no distro appears to want to use it by default:
create configuration file, create core image with that configuration
file baked into it, and then install new core.img to MBR gap or
bios-grub partition. And this can be done atomically and fail safe if
the gap is big enough, with the last step being to write a new
boot.img in the first 440 bytes of LBA 0 to point to the new core.img.

That works and is fail safe why? Because it depends on the file system
as much as lilo. Zero.




>
>> > So to be consistent you have to blame RPM for not ensuring its writes
>> > are safely on stable storage either, before reboot.
>> >
>> > Jesus Christ...
>> >
>> > I want Dave to write "this problem has nothing to do with GRUB" 50
>> > times on a chalkboard.
>>
>> Frankly, all you have to do is tweak the grub config file writer to
>> "fsfreeze -f /boot && fsfreeze -u /boot" after it writes the grub.cfg and
>> before it exits.  That's both grubby and grub-mkconfig.
>
> Yup, that's the work-around I've previously suggested should be
> added to grub and distro update mechanisms.  It doesn't fix the
> underlying architectural issues grub has (e.g. updates are still
> not fail-safe), but it provides the durability guarantee that is
> required if the whole update process completes.

It assumes that the kernel package executes grub-mkconfig or grubby
only once the kernel and initramfs have finished writing. I've seen
this happen in parallel with RPM and new-kernel-package, so you could
conceivably get this scenario:

1. kernel is written and sync()
2. initramfs is opened and dracut is creating it
3. grub-mkconfig runs and then fifreeze/fiunfreeze
4. dracut continues to append to initramfs then completes
5. reboot without remount-ro or umount

And now you have a borked boot because the initramfs isn't fully
committed, part of its writes are only in the journal which the
bootloader will not see. So it'll end up loading only part of the
initramfs into memory.


>
>> Or systemd/sysvinit could always ensure that the fs containing the boot
>> files is unmounted no matter what.  Or systemd/sysvinit could freeze the
>> fs for us before asking for a reboot.  Or the kernel could freeze the fs
>> for us before actually doing the reboot.  Any one of those options is
>> sufficient, but not one of them is reliably performed by any of the
>> pieces.  systemd *almost* always succeed at unmounting /boot, which is
>> why few people see this problem.
>
> Design principles of modularity and coupling say it's really up to
> the bootloader to provide the durability mechanisms it needs.  IOWs,
> the *bootloader update infrastructure* needs to provide the
> durability guarantee/mechanism the bootloader infrastructure
> requires (as per the lilo example) and not the init system.


The lilo example needs to be banned from the discussion. Nothing else
works like it does, nor will it. And it also ignores the fact that the
kernel or initramfs could possibly only be in the dirty log, without
being in fs metadata itself and thus not locatable by the bootloader.



Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chris Murphy May 23, 2017, 3:56 a.m. UTC | #10
On Mon, May 22, 2017 at 2:46 PM, Chris Murphy <lists@colorremedies.com> wrote:

> Second, I have only been able to reproduce this problem with grubby +
> XFS.

OK so this is just on Fedora/RH systems, which is the example case I
have. The kernel RPM is running a script called new-kernel-pkg which
is part of grubby. It's found here:

https://github.com/rhinstaller/grubby/blob/master/new-kernel-pkg

# make sure changes make it to the disk.
# if /boot is a mountpoint, force the meta data on disk
# to by-pass writeback delay.
# PPC64LE-only to deal with Petitboot issues
if [ "$ARCH" = "ppc64le" ]; then
    sync && mountpoint -q /boot && fsfreeze -f /boot && fsfreeze -u /boot
fi


So why only Petitboot on ppc64 is considered to need this?

Also I put zipl in the bootloader category that doesn't need to be totally
reinstalled, which is incorrect it's more like lilo. I guess I was thinking of
yaboot. In any case the main point remains that the common
bootloaders being used these days depend only on modification of a
bootloader configuration file via the file system that file is located
on.
Eric Sandeen May 23, 2017, 4:04 a.m. UTC | #11
On 5/22/17 10:56 PM, Chris Murphy wrote:
> On Mon, May 22, 2017 at 2:46 PM, Chris Murphy <lists@colorremedies.com> wrote:
> 
>> Second, I have only been able to reproduce this problem with grubby +
>> XFS.
> 
> OK so this is just on Fedora/RH systems, which is the example case I
> have. The kernel RPM is running a script called new-kernel-pkg which
> is part of grubby. It's found here:
> 
> https://github.com/rhinstaller/grubby/blob/master/new-kernel-pkg
> 
> # make sure changes make it to the disk.
> # if /boot is a mountpoint, force the meta data on disk
> # to by-pass writeback delay.
> # PPC64LE-only to deal with Petitboot issues
> if [ "$ARCH" = "ppc64le" ]; then
>     sync && mountpoint -q /boot && fsfreeze -f /boot && fsfreeze -u /boot
> fi
> 
> 
> So why only Petitboot on ppc64 is considered to need this?

because petitboot essentially a full kernel & userspace, and it is
(may be?) a different endianness than the os and the filesystem it's booting...
so normally it actually /could/ replay the log, but it can't due to the
different endianness.  So, freeze / unfreeze to quiesce, and then
petitboot does mount -o norecovery to ignore the unmount record we leave
lying around on a frozen fs.

Hacks upon hacks.

-Eric

> Also I put zipl in the bootloader category that doesn't need to be totally
> reinstalled, which is incorrect it's more like lilo. I guess I was thinking of
> yaboot. In any case the main point remains that the common
> bootloaders being used these days depend only on modification of a
> bootloader configuration file via the file system that file is located
> on.
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner May 23, 2017, 11:44 a.m. UTC | #12
On Mon, May 22, 2017 at 11:04:54PM -0500, Eric Sandeen wrote:
> On 5/22/17 10:56 PM, Chris Murphy wrote:
> > On Mon, May 22, 2017 at 2:46 PM, Chris Murphy <lists@colorremedies.com> wrote:
> > 
> >> Second, I have only been able to reproduce this problem with grubby +
> >> XFS.
> > 
> > OK so this is just on Fedora/RH systems, which is the example case I
> > have. The kernel RPM is running a script called new-kernel-pkg which
> > is part of grubby. It's found here:
> > 
> > https://github.com/rhinstaller/grubby/blob/master/new-kernel-pkg
> > 
> > # make sure changes make it to the disk.
> > # if /boot is a mountpoint, force the meta data on disk
> > # to by-pass writeback delay.
> > # PPC64LE-only to deal with Petitboot issues
> > if [ "$ARCH" = "ppc64le" ]; then
> >     sync && mountpoint -q /boot && fsfreeze -f /boot && fsfreeze -u /boot
> > fi
> > 
> > 
> > So why only Petitboot on ppc64 is considered to need this?

> because petitboot essentially a full kernel & userspace, and it is
> (may be?) a different endianness than the os and the filesystem it's booting...
> so normally it actually /could/ replay the log, but it can't due to the
> different endianness.  So, freeze / unfreeze to quiesce, and then
> petitboot does mount -o norecovery to ignore the unmount record we leave
> lying around on a frozen fs.

Actually, IIUC what Stewart told me at LCA earlier this year, what
petitboot does now is far more convoluted and whacky than mount -o
norecovery. It overlays a write-capturing ram disk on top of the
block device, mounts the filesystem and performs log recovery.  All
writes from log recovery go into the ram disk so it never modifies
the underlying block device. It then traverses the recovered
filesystem to find the kernel images, and once they are found it
boots from them, throwing away the ramdisk that contained the
recovered changes from the log.

> Hacks upon hacks.

It's hacks all the way down....

Cheers,

Dave.
Dave Chinner May 24, 2017, 3:19 a.m. UTC | #13
On Mon, May 22, 2017 at 02:46:42PM -0600, Chris Murphy wrote:
> On Sun, May 21, 2017 at 8:01 PM, Dave Chinner <david@fromorbit.com> wrote:
> > On Fri, May 19, 2017 at 02:00:40PM -0700, Darrick J. Wong wrote:
> >> On Fri, May 19, 2017 at 01:09:31PM -0600, Chris Murphy wrote:
> >> > On Thu, May 18, 2017 at 4:30 PM, Darrick J. Wong
> >> > <darrick.wong@oracle.com> wrote:
> >> > > On Thu, May 18, 2017 at 06:34:05PM +1000, Dave Chinner wrote:
> >> > >> On Wed, May 17, 2017 at 06:32:42PM -0700, Darrick J. Wong wrote:
> >> > >> > Apparently there are certain system software configurations that do odd
> >> > >> > things like update the kernel and reboot without umounting the /boot fs
> >> > >> > or remounting it readonly, either of which would push all the AIL items
> >> > >> > out to disk.  As a result, a subsequent invocation of something like
> >> > >> > grub (which has a frightening willingness to read a fs with a dirty log)
> >> > >> > can read stale disk contents and/or miss files the metadata for which
> >> > >> > have been written to the log but not checkpointed into the filesystem.
> >> > >>
> >> > >> > Granted, most of the time /boot is a separate partition and
> >> > >> > systemd/sysvinit/whatever actually /do/ unmount /boot before rebooting.
> >> > >> > This "fix" is only needed for people who have one giant filesystem.
> >> > >>
> >> > >> Let me guess the series of events: grub calls "sync" and says "I'm
> >> > >
> >> > > dpkg/rpm/systemd/CABEXTRACT.EXE/whatever, but yes :)
> >> >
> >> >
> >> > It has nothing to do with GRUB. The exact same problem would happen
> >> > regardless of bootloader because the thing writing out bootloader
> >> > configuration prior to reboot is grubby, which is not at all in any
> >> > way related to GRUB.
> >
> > Chris, I've told you previously this is wrong (it's incorrect for
> > lilo) and too narrow (grubby is not the only update infrastructure
> > around), but I'll repeat it again because we need to move past your
> > single-distro focus on grubby and discuss the underlying problems.
> 
> 
> And I'm trying to get you to move past your ancient legacy bootloader
> focus and discuss things that matter in 2017.
> 
> First, nothing works like LILO, not even ELILO works like LILO.

Yeah, Elilo just used the lilo config file format and parser and
threw everyhting else away. I learnt how simple it is when I first
started working on ia64 machines back in 2004. Or was it 2005? It's
so long ago now I can't remember exactly.

> Second, I have only been able to reproduce this problem with grubby +
> XFS. If I manually do grub2-mkconfig + reboot -f instead, the problem
> does not happen. It might be that I'm too slow, and the marginal
> amount of extra time permits the new grub.cfg to fully commit, but I
> don't know.

Yup, those are typical writeback/reboot race symptoms.

I think Darrick's traces also hinted at the possibility that
grub2-mkconfig doesn't run sync at the correct time so the new
config file may not have even made it to disk and so it may be that
you are seeing it boot from the old config which was still valid...

> Third, basic bootloader concepts:
> 
> The bootloader is the executable that's loaded by the firmware after POST.

> The bootloader installer is not the bootloader. /sbin/lilo,

Well, yes, it's usually obvious which bootloader part is being
discussed from the context of the discussion. But seeing as you seem
to have trouble with this sort of thing, I've taken the time to
specify which one I'm talking about...

> /sbin/extlinux, /sbin/grub-install are not bootloaders, they are
> bootloader installers. Only lilo totally reinstalls the bootloader
> binary when configurations change. GRUB, extlinux, uboot, zipl don't
> do that. Their configuration files change and that's it, and they do
> not depend on boot sectors for this,

Sure, but they still all depend on writing their bootloader
executable to the boot sector to be able to bootstrap the boot
process.  They are no different to LILO in this respect - the only
difference is that LILO also writes a pointer to the pre-processed
boot config into the boot sector...

> they don't write outside the file
> system to the drive like lilo.

Hmmm - I demonstrated this assertion to be false in the email you
replied to. You've used this reasoning to repeatedly dismiss my
comments with no other technical information, so I hope you just
misread/misunderstood my reply. I'll try again to make it absolutely
clear how lilo works below, because.....

> They depend on the file system.

.... all the bootloaders (except ELILO) depend on the filesystem to
provide them with the physical block mapping information needed to
read the kernel/initramfs to boot them.  The only real difference is
how they get that physical location information. To recap:

The Lilo installer gets the physical location information from the
filesystem via the OS provided FIBMAP ioctl(), which it then writes
it into a file in the filesysetm (boot.map), which it then it maps
with FIBMAP and writes that map into the boot sector on the block
device.  At no point does it "go around the filesystem" to obtain or
write this information to stable storage for the bootloader
executable to use. The bootloader executable then just reads the
block map information directly from the block device to load the
kernel.

The grub installer writes a config file and that's about it. On
boot, the stage 1 loader in the boot sector bootstraps
the larger stage 2 loader. That then loads all the modules needed to
probe the booting block device contents and load the modules needed
to traverse the metadata structure it contains to find the block
map/extent list of the config file.  Then it decodes the block map,
reads the config file direct from the block device and parses it. It
then repeats this metadata traversal and extent list decoding for
each file it needs to read to boot.

IOWs, the information they use is exactly the same, but LILO avoids
all the bootloader executable complexity by doing all the mapping
work n the installer through generic, filesystem agnostic OS
interfaces before reboot. 

In contrast, the "parse the block device" architecture of grub and
similar bootloaders ensures that the bootloader executable is locked
into an endless game of whack-a-mole as filesystems, volume managers
and other storage management applications change formats and
behaviours.....

> > However, IMO problem does indeed lie with the bootloader and not the
> > distro packaging mechanism/scripts, and so we need to talk a bit out
> > the architectural differences between bootloaders like lilo and
> > grub/petitboot and why I consider update durability to be something
> > the bootloader needs to provide, not thrid party packagers or the
> > init system.
> 
> Either the bootloader needs to learn how to read dirty logs,

I don't think anyone wants to expend the time and effort needed to
do this for each filesystem that the grub bootloader supports,
especially as it is not necessary.

> or there
> can be no dirty log entries related to three things: kernel,
> initramfs, and bootloader configuration. If any of those three things
> are not fully committed to the file system metadata,  the bootloader
> will fail to find them.

Yup - I've been trying to tell you that these are the exact
guarantees that freezing the fs will provide the bootloader
installer....

> If GRUB or grubby are not being used, then the bootloader
> configuration file is most likely modified by a script in the kernel
> package. How do you avoid burdening the kernel package from update
> durability when it is responsible for writing the kernel, initramfs,
> and the third most likely thing to modify the bootloader
> configuration?

Solved problem via post-inst package scripts.

$ sudo apt-get install linux-image-amd64
....
Preparing to unpack .../linux-image-4.9.0-3-amd64_4.9.25-1_amd64.deb ...
Unpacking linux-image-4.9.0-3-amd64 (4.9.25-1) ...
Preparing to unpack .../linux-image-amd64_4.9+80_amd64.deb ...
Unpacking linux-image-amd64 (4.9+80) over (4.9+78) ...
Setting up linux-image-4.9.0-3-amd64 (4.9.25-1) ...
I: /vmlinuz.old is now a symlink to boot/vmlinuz-4.9.0-1-amd64
I: /initrd.img.old is now a symlink to boot/initrd.img-4.9.0-1-amd64
I: /vmlinuz is now a symlink to boot/vmlinuz-4.9.0-3-amd64
I: /initrd.img is now a symlink to boot/initrd.img-4.9.0-3-amd64
/etc/kernel/postinst.d/initramfs-tools:
update-initramfs: Generating /boot/initrd.img-4.9.0-3-amd64
/etc/kernel/postinst.d/zz-runlilo:
Added Linux  *
Added Linux.old
Setting up linux-image-amd64 (4.9+80) ...
$

> > As a result, you do *not* need to call sync or freeze the filesystem
> > after running the lilo command because it has already guaranteed the
> > updated bootloader information is on stable storage.  And because
> > you have to run lilo to update the bootloader, distro package
> > managers *can't fuck it up* because the bootloader has full control
> > of the update process.
> 
> Haha. Well it can't fuck it up because it's doing a total end run
> around the file system.

FYI: the canonical example of "doing a total end run around the
filesystem" is to write your own filesystem parsers to directly walk
filesystem structures on a block device without going through the
supported OS filesystem channels for accessing that filesystem
information.

> Actually GRUB has a functional equivalent, it's just that it's not the
> upstream default, and no distro appears to want to use it by default:

IIRC, that's the "filesystem install" mode and there's good reason
it's not used: LBA 0 of the block device belongs to the filesystem
on that device, not the bootloader. Hence if you have a filesystem
that uses LBA 0 (e.g. XFS), using grub in this mode on that block
device will trash it and then you've got bigger problems....

The Lilo installer avoids this issue by asking the filesystem to map
the mapping file and writing it's LBA to the boot sector, hence
allowing the bootloader info to be placed anywhere in the block
device rather than being fixed to a hard coded location as per the
grub functionality...

Cheers,

Dave.
Chris Murphy May 24, 2017, 6:22 a.m. UTC | #14
On Mon, May 22, 2017 at 2:46 PM, Chris Murphy <lists@colorremedies.com> wrote:

>
> Second, I have only been able to reproduce this problem with grubby +
> XFS. If I manually do grub2-mkconfig + reboot -f instead, the problem
> does not happen. It might be that I'm too slow, and the marginal
> amount of extra time permits the new grub.cfg to fully commit, but I
> don't know.

This is wrong. I retested this and 2 for 2 attempts I can reproduce
the problem either with grubby + reboot -f, or grub2-mkconfig + reboot
-f; so I must have been doing a normal reboot and somehow systemd
must've been successful at umounting or remount-ro'ing the file system
before the reboot.

But I did notice something else in both grubby and grub-mkconfig
cases. The initramfs and/or the kernel are variably either missing or
are zero length files. Here's an example from an updated system that
fails to boot due to zero length grub.cfg;

-rw-------. 1 root root 59650987 May 23 18:16
initramfs-0-rescue-a0269ef67a5f4c1ca97e0817ac1c4a6d.img
-rw-------. 1 root root 19764807 May 23 18:16 initramfs-4.11.0-2.fc26.x86_64.img
-rw-r--r--. 1 root root   182704 Feb 10 22:58 memtest86+-5.01
-rw-------. 1 root root  3548950 May  9 09:42 System.map-4.11.0-2.fc26.x86_64
-rw-------. 1 root root        0 May 15 13:46 System.map-4.11.1-300.fc26.x86_64
-rwxr-xr-x. 1 root root  7282776 May 23 18:16
vmlinuz-0-rescue-a0269ef67a5f4c1ca97e0817ac1c4a6d
-rwxr-xr-x. 1 root root  7282776 May  9 09:43 vmlinuz-4.11.0-2.fc26.x86_64
-rwxr-xr-x. 1 root root        0 May 15 13:46 vmlinuz-4.11.1-300.fc26.x86_64

-rw-rw-r--. 1 root root    0 May 23 18:44 grub.cfg

The proper initramfs is not there at all. The kernel is zero bytes.
And as previously reported the grub.cfg is zero bytes, hence failure
in grub. Upon normal mount, all of them appear as you'd expect.

If grubby and grub-mkconfig are to be blamed for not freezing the
system, on the basis that remount-ro or umount cannot be depended on,
then that means non-GRUB and non-grubby setups need their kernel
package postinstall script to fsfreeze.

Consider this scenario on UEFI. The grub.cfg on Fedora/RH systems goes
on the FAT EFI System partition (unlike upstream which calls for it
going on /boot), and it's got a pretty good chance of fully committing
even if systemd fails to remount-ro /boot. So we get a valid grub.cfg
pointing to a new kernel and initramfs that may not be locatable by
the bootloader because it can't read the dirty log. And again it
implicates the kernel package for not having done an fsfreeze, knowing
full well the limitations of the most common bootloaders which don't
read logs.

Something seems really out of order here. The kernel is installed
first, then the initramfs is built, grub.cfg.new is created, grub.cfg
is deleted, grub.cfg.new is renamed to grub.cfg. How is it the first
two items are in the dirty log, not fully committed to fs metadata;
but the grub.cfg is zero length? Why isn't the old one still there as
far as grub is concerned? If that were the case, the old kernel and
initramfs could be booted and then the log replayed to fix up
everything. The ordering here seems pretty screwy.
Chris Murphy May 24, 2017, 6:25 a.m. UTC | #15
On Wed, May 24, 2017 at 12:22 AM, Chris Murphy <lists@colorremedies.com> wrote:
> Here's an example from an updated system that
> fails to boot due to zero length grub.cfg;
>
> -rw-------. 1 root root 59650987 May 23 18:16
> initramfs-0-rescue-a0269ef67a5f4c1ca97e0817ac1c4a6d.img
> -rw-------. 1 root root 19764807 May 23 18:16 initramfs-4.11.0-2.fc26.x86_64.img
> -rw-r--r--. 1 root root   182704 Feb 10 22:58 memtest86+-5.01
> -rw-------. 1 root root  3548950 May  9 09:42 System.map-4.11.0-2.fc26.x86_64
> -rw-------. 1 root root        0 May 15 13:46 System.map-4.11.1-300.fc26.x86_64
> -rwxr-xr-x. 1 root root  7282776 May 23 18:16
> vmlinuz-0-rescue-a0269ef67a5f4c1ca97e0817ac1c4a6d
> -rwxr-xr-x. 1 root root  7282776 May  9 09:43 vmlinuz-4.11.0-2.fc26.x86_64
> -rwxr-xr-x. 1 root root        0 May 15 13:46 vmlinuz-4.11.1-300.fc26.x86_64
>
> -rw-rw-r--. 1 root root    0 May 23 18:44 grub.cfg

FWIW this was mounted with -o ro,norecovery following an update that
resulted in the problem of hitting the grub prompt instead of a boot
menu.
Chris Murphy May 24, 2017, 8:06 a.m. UTC | #16
On Tue, May 23, 2017 at 9:19 PM, Dave Chinner <david@fromorbit.com> wrote:



>
>> they don't write outside the file
>> system to the drive like lilo.
>
> Hmmm - I demonstrated this assertion to be false in the email you
> replied to.

They = grubby, grub-mkconfig. They do not write a block list outside
the file system like lilo does. All they do is modify the bootloader
configuration file, via the file system, they do not write directly to
the block device like lilo.


>
> The Lilo installer gets the physical location information from the
> filesystem via the OS provided FIBMAP ioctl(), which it then writes
> it into a file in the filesysetm (boot.map), which it then it maps
> with FIBMAP and writes that map into the boot sector on the block
> device.  At no point does it "go around the filesystem" to obtain or
> write this information to stable storage for the bootloader
> executable to use. The bootloader executable then just reads the
> block map information directly from the block device to load the
> kernel.


Yes it's basically a block list and it has no idea what a file system
is. i get that. When I say do an end run around the file system, I
mean it writes that block list direct to the block device, not as a
file in the file system. When the computer boots, it reads a sequence
of blocks without any understanding of the file system, again it goes
around the file system.


> The grub installer writes a config file and that's about it.

 grub-install only writes out boot.img to LBA 0, and core.img to the
MBR gap or bios-grub partition, and copies its additional support
modules from /usr to /boot. That's it. No configuration file is
created with grub-install.

grub-mkconfig only writes a config file, it does not modify any binary data.

LILO is combining these functions to make its blocklist, the whole
thing is written out each time.


 On
> boot, the stage 1 loader in the boot sector bootstraps
> the larger stage 2 loader. That then loads all the modules needed to
> probe the booting block device contents and load the modules needed
> to traverse the metadata structure it contains to find the block
> map/extent list of the config file.  Then it decodes the block map,
> reads the config file direct from the block device and parses it. It
> then repeats this metadata traversal and extent list decoding for
> each file it needs to read to boot.
>
> IOWs, the information they use is exactly the same, but LILO avoids
> all the bootloader executable complexity by doing all the mapping
> work n the installer through generic, filesystem agnostic OS
> interfaces before reboot.
>
> In contrast, the "parse the block device" architecture of grub and
> similar bootloaders ensures that the bootloader executable is locked
> into an endless game of whack-a-mole as filesystems, volume managers
> and other storage management applications change formats and
> behaviours.....


Yes, I'm aware.

Here is the difference. I'm accepting reality, and you keep
complaining about reality while pining for the glory days of LILO and
how everything else is Doing It Wrong (TM). It doesn't matter. We have
what we have.



>> > However, IMO problem does indeed lie with the bootloader and not the
>> > distro packaging mechanism/scripts, and so we need to talk a bit out
>> > the architectural differences between bootloaders like lilo and
>> > grub/petitboot and why I consider update durability to be something
>> > the bootloader needs to provide, not thrid party packagers or the
>> > init system.
>>
>> Either the bootloader needs to learn how to read dirty logs,
>
> I don't think anyone wants to expend the time and effort needed to
> do this for each filesystem that the grub bootloader supports,
> especially as it is not necessary.
>
>> or there
>> can be no dirty log entries related to three things: kernel,
>> initramfs, and bootloader configuration. If any of those three things
>> are not fully committed to the file system metadata,  the bootloader
>> will fail to find them.
>
> Yup - I've been trying to tell you that these are the exact
> guarantees that freezing the fs will provide the bootloader
> installer....

Except the bootloader can't do that. And yet you've been trying to pin
the entire problem on grub, and even if that's a generic term, it
isn't the bootloader generally that can do anything about this.

As far as I know, only GRUB comes with a utility for updating its
bootloader configuration file and it could be modified to do fsfreeze.
Other bootloaders depend on the kernel package postinstall script (I
assume) to modify the bootloader configuration. For sure extlinux
doesn't come with such a utility.




>
>> If GRUB or grubby are not being used, then the bootloader
>> configuration file is most likely modified by a script in the kernel
>> package. How do you avoid burdening the kernel package from update
>> durability when it is responsible for writing the kernel, initramfs,
>> and the third most likely thing to modify the bootloader
>> configuration?
>
> Solved problem via post-inst package scripts.

Ergo it is not the problem of the bootloader and no solution can be
found there like I've been saying this whole damn time.

OK so you're saying that every kernel package post install script
needs to do fsfreeze, rather than systemd doing it when remount-ro or
umount fail? Really? You really think they're all going to do this? I
would be shocked if it took systemd folks more than a month to
implement this, and shocked if it took less than a year for either
grubby or GRUB folks, let alone myriad kernel packaging teams to grok
the reason for this. I can just hear it now: What? sync doesn't do it?
Fuck!!! times every distro...

Yeah solved problem. Hilarious.






>> Haha. Well it can't fuck it up because it's doing a total end run
>> around the file system.
>
> FYI: the canonical example of "doing a total end run around the
> filesystem" is to write your own filesystem parsers to directly walk
> filesystem structures on a block device without going through the
> supported OS filesystem channels for accessing that filesystem
> information.

Yeah well you're coming at it from a certain perspective and I'm
coming at it from another. You see the code as the file system. When I
say the file system, i mean the file system volume, the actual
instance of a file system on a drive. That's what's being run around
by writing directly to blocks, and directly reading blocks.


>
>> Actually GRUB has a functional equivalent, it's just that it's not the
>> upstream default, and no distro appears to want to use it by default:
>
> IIRC, that's the "filesystem install" mode and there's good reason
> it's not used: LBA 0 of the block device belongs to the filesystem
> on that device, not the bootloader. Hence if you have a filesystem
> that uses LBA 0 (e.g. XFS), using grub in this mode on that block
> device will trash it and then you've got bigger problems....

No, the baking in of the configuration file is available regardless of
whether the installation happens to the whole block device or to a
partition/volume. The file system install mode is basically block
listing the core.img file rather than it being embedded in the MBR
gap. I am not a fan of this complexity but it still has a bunch of
fanboys from a bygone era...

The first sector of the block device, and each partition, is really
owned by the firmware on x86. It's the boot sector, has been since
ancient times. It is the only sectors the firmware will blindly read
at boot time. It's the original way multiboot worked, jump code in LBA
0 plus an active bit on a partition told the firmware to read and
execute the contents of the first sector at that partition's CHS
address. To switch OS's, you only needed to change the active bit to a
different partition.
Dave Chinner May 24, 2017, 11:13 p.m. UTC | #17
On Wed, May 24, 2017 at 12:25:10AM -0600, Chris Murphy wrote:
> On Wed, May 24, 2017 at 12:22 AM, Chris Murphy <lists@colorremedies.com> wrote:
> > Here's an example from an updated system that
> > fails to boot due to zero length grub.cfg;
> >
> > -rw-------. 1 root root 59650987 May 23 18:16
> > initramfs-0-rescue-a0269ef67a5f4c1ca97e0817ac1c4a6d.img
> > -rw-------. 1 root root 19764807 May 23 18:16 initramfs-4.11.0-2.fc26.x86_64.img
> > -rw-r--r--. 1 root root   182704 Feb 10 22:58 memtest86+-5.01
> > -rw-------. 1 root root  3548950 May  9 09:42 System.map-4.11.0-2.fc26.x86_64
> > -rw-------. 1 root root        0 May 15 13:46 System.map-4.11.1-300.fc26.x86_64
> > -rwxr-xr-x. 1 root root  7282776 May 23 18:16
> > vmlinuz-0-rescue-a0269ef67a5f4c1ca97e0817ac1c4a6d
> > -rwxr-xr-x. 1 root root  7282776 May  9 09:43 vmlinuz-4.11.0-2.fc26.x86_64
> > -rwxr-xr-x. 1 root root        0 May 15 13:46 vmlinuz-4.11.1-300.fc26.x86_64
> >
> > -rw-rw-r--. 1 root root    0 May 23 18:44 grub.cfg
> 
> FWIW this was mounted with -o ro,norecovery following an update that
> resulted in the problem of hitting the grub prompt instead of a boot
> menu.

Yup, if the files were sync()d then the file size updates are still
in the journal which has not been replayed. This is what we've been
saying is the problem all along and that a post-update freeze will
work around. If won't fix the fact that the updates are not
fail-safe (crash before freeze will still leave the config file
update in this state), but it will avoid the failure in the "update
successful, reboot without unmounting" scenario being used here.

Cheers,

Dave.
Dave Chinner May 25, 2017, 12:03 a.m. UTC | #18
On Wed, May 24, 2017 at 12:22:13AM -0600, Chris Murphy wrote:
> Something seems really out of order here.

Intentionally so.

Filesystems do screwy, complex things to extract maximum performance
whilst maintaining operational ordering.  We use log sequence
numbers to ensure the order of operations is maintained correctly in
the active portion of the journal, but we provide no ordering
guarantees for metadata writeback operations from the journal.

> The kernel is installed
> first, then the initramfs is built, grub.cfg.new is created, grub.cfg
> is deleted, grub.cfg.new is renamed to grub.cfg. How is it the first
> two items are in the dirty log, not fully committed to fs metadata;
> but the grub.cfg is zero length?  Why isn't the old one still there as
> far as grub is concerned? If that were the case, the old kernel and
> initramfs could be booted and then the log replayed to fix up
> everything. The ordering here seems pretty screwy.

By mounting "-o ro,norecovery" what you are seeing is evidence of
metadata being flushed from the journal in optimal IO order rather
than strict sequence order. The filesystem is not in a consistent
state if you haven't recovered the journal. Recovering the journal
(i.e. a normal mount) replays all the recorded changes and intents
in strict sequence order, hence providing the high level operational
ordering semantics we guarantee users.

Cheers,

Dave.
diff mbox

Patch

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 455a575..415b1e8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -61,6 +61,7 @@ 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/parser.h>
+#include <linux/reboot.h>
 
 static const struct super_operations xfs_super_operations;
 struct bio_set *xfs_ioend_bioset;
@@ -1982,6 +1983,38 @@  xfs_destroy_workqueues(void)
 	destroy_workqueue(xfs_alloc_wq);
 }
 
+STATIC void
+xfs_reboot_fs(
+	struct super_block	*sb,
+	void			*priv)
+{
+	int			error;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	xfs_info(XFS_M(sb), "Freezing prior to reboot.");
+	up_read(&sb->s_umount);
+	error = freeze_super(sb);
+	down_read(&sb->s_umount);
+	if (error && error != -EBUSY)
+		xfs_info(XFS_M(sb), "Unable to freeze, error=%d", error);
+}
+
+STATIC int
+xfs_reboot(
+	struct notifier_block	*nb,
+	ulong			event,
+	void			*buf)
+{
+	iterate_supers_type(&xfs_fs_type, xfs_reboot_fs, NULL);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block xfs_reboot_notifier = {
+	.notifier_call = xfs_reboot,
+	.priority = INT_MAX,
+};
+
 STATIC int __init
 init_xfs_fs(void)
 {
@@ -2056,8 +2089,14 @@  init_xfs_fs(void)
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
 		goto out_qm_exit;
+
+	error = register_reboot_notifier(&xfs_reboot_notifier);
+	if (error)
+		goto out_register_fs;
 	return 0;
 
+ out_register_fs:
+	unregister_filesystem(&xfs_fs_type);
  out_qm_exit:
 	xfs_qm_exit();
  out_remove_dbg_kobj:
@@ -2089,6 +2128,7 @@  init_xfs_fs(void)
 STATIC void __exit
 exit_xfs_fs(void)
 {
+	unregister_reboot_notifier(&xfs_reboot_notifier);
 	xfs_qm_exit();
 	unregister_filesystem(&xfs_fs_type);
 #ifdef DEBUG