diff mbox series

[v4,1/5] btrfs: add compat_flags to btrfs_inode_item

Message ID 6ed83a27f88e18f295f7d661e9c87e7ec7d33428.1620241221.git.boris@bur.io (mailing list archive)
State New, archived
Headers show
Series btrfs: support fsverity | expand

Commit Message

Boris Burkov May 5, 2021, 7:20 p.m. UTC
The tree checker currently rejects unrecognized flags when it reads
btrfs_inode_item. Practically, this means that adding a new flag makes
the change backwards incompatible if the flag is ever set on a file.

Take up one of the 4 reserved u64 fields in the btrfs_inode_item as a
new "compat_flags". These flags are zero on inode creation in btrfs and
mkfs and are ignored by an older kernel, so it should be safe to use
them in this way.

Signed-off-by: Boris Burkov <boris@bur.io>
---
 fs/btrfs/btrfs_inode.h          | 1 +
 fs/btrfs/ctree.h                | 2 ++
 fs/btrfs/delayed-inode.c        | 2 ++
 fs/btrfs/inode.c                | 3 +++
 fs/btrfs/ioctl.c                | 7 ++++---
 fs/btrfs/tree-log.c             | 1 +
 include/uapi/linux/btrfs_tree.h | 7 ++++++-
 7 files changed, 19 insertions(+), 4 deletions(-)

Comments

David Sterba May 11, 2021, 7:11 p.m. UTC | #1
On Wed, May 05, 2021 at 12:20:39PM -0700, Boris Burkov wrote:
> The tree checker currently rejects unrecognized flags when it reads
> btrfs_inode_item. Practically, this means that adding a new flag makes
> the change backwards incompatible if the flag is ever set on a file.

Is there any other known problem when the verity flag is set? The tree
checker is naturally the first instance where it gets noticed and I
haven't found any other place as the flag would be just another one.

Why am I asking: allocating 8 bytes for incompat bits where we know
there will be likely just one used is wasteful. I'm exploring
possibilities if the incompat flags can be squeezed to existing flags.
In the end the size can be reduced to u16, u64 is really too much.

> Take up one of the 4 reserved u64 fields in the btrfs_inode_item as a
> new "compat_flags". These flags are zero on inode creation in btrfs and
> mkfs and are ignored by an older kernel, so it should be safe to use
> them in this way.

Yeah this should be safe.

> Signed-off-by: Boris Burkov <boris@bur.io>
> ---
>  fs/btrfs/btrfs_inode.h          | 1 +
>  fs/btrfs/ctree.h                | 2 ++
>  fs/btrfs/delayed-inode.c        | 2 ++
>  fs/btrfs/inode.c                | 3 +++
>  fs/btrfs/ioctl.c                | 7 ++++---
>  fs/btrfs/tree-log.c             | 1 +
>  include/uapi/linux/btrfs_tree.h | 7 ++++++-
>  7 files changed, 19 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index c652e19ad74e..e8dbc8e848ce 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -191,6 +191,7 @@ struct btrfs_inode {
>  
>  	/* flags field from the on disk inode */
>  	u32 flags;
> +	u64 compat_flags;

This got me curious, u32 flags is for the in-memory inode, but the
on-disk inode_item::flags is u64

>  BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> +BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item, compat_flags, 64);

>  	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);

Which means we currently use only 32 bits and half of the on-disk
inode_item::flags is always zero. So the idea is to repurpose this for
the incompat bits (say upper 16 bits). With a minimal patch to tree
checker we can make old kernels accept a verity-enabled kernel.

It could be tricky, but for backport only additional bitmask would be
added to BTRFS_INODE_FLAG_MASK to ignore bits 48-63.

For proper support the inode_item::flags can be simply used as one space
where the split would be just logical, and IMO manageable.

> +	btrfs_set_stack_inode_compat_flags(inode_item, BTRFS_I(inode)->compat_flags);
>  	btrfs_set_stack_inode_block_group(inode_item, 0);
>  
>  	btrfs_set_stack_timespec_sec(&inode_item->atime,
> @@ -1776,6 +1777,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
>  	inode->i_rdev = 0;
>  	*rdev = btrfs_stack_inode_rdev(inode_item);
>  	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);

As another example, the stack inode flags get trimmed from u64 to u32,
so old kernels won't notice.

> +	BTRFS_I(inode)->compat_flags = btrfs_stack_inode_compat_flags(inode_item);
>  
>  	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
>  	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 69fcdf8f0b1c..d89000577f7f 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3627,6 +3627,7 @@ static int btrfs_read_locked_inode(struct inode *inode,
>  
>  	BTRFS_I(inode)->index_cnt = (u64)-1;
>  	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
> +	BTRFS_I(inode)->compat_flags = btrfs_inode_compat_flags(leaf, inode_item);
>  
>  cache_index:
>  	/*
> @@ -3793,6 +3794,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
>  	btrfs_set_token_inode_transid(&token, item, trans->transid);
>  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
>  	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
> +	btrfs_set_token_inode_compat_flags(&token, item, BTRFS_I(inode)->compat_flags);
>  	btrfs_set_token_inode_block_group(&token, item, 0);
>  }
>  
> @@ -8857,6 +8859,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
>  	ei->defrag_bytes = 0;
>  	ei->disk_i_size = 0;
>  	ei->flags = 0;
> +	ei->compat_flags = 0;
>  	ei->csum_bytes = 0;
>  	ei->index_cnt = (u64)-1;
>  	ei->dir_index = 0;
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 0ba0e4ddaf6b..ff335c192170 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -102,8 +102,9 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
>   * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
>   * ioctl.
>   */
> -static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
> +static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
>  {
> +	unsigned int flags = binode->flags;

So things like the above must be careful and store the variables to
properly sized integers.

>  	unsigned int iflags = 0;
>  
>  	if (flags & BTRFS_INODE_SYNC)
> @@ -156,7 +157,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
>  static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
>  {
>  	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
> -	unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
> +	unsigned int flags = btrfs_inode_flags_to_fsflags(binode);

This now does not apply to 5.13-rc1 as there was a patchset converting
all the file attributes to a common API and this hunk now does not apply
as the btrfs_ioctl_getflags is handled by fileattr_fill_flags in
btrfs_fileattr_get.

The fix seem to be simple as it's using the same helpers but I did not
get far enough to resolve the conflict compeletely, so please rebase and
resend.
David Sterba May 17, 2021, 9:48 p.m. UTC | #2
On Tue, May 11, 2021 at 09:11:08PM +0200, David Sterba wrote:
> On Wed, May 05, 2021 at 12:20:39PM -0700, Boris Burkov wrote:
> > --- a/fs/btrfs/btrfs_inode.h
> > +++ b/fs/btrfs/btrfs_inode.h
> > @@ -191,6 +191,7 @@ struct btrfs_inode {
> >  
> >  	/* flags field from the on disk inode */
> >  	u32 flags;
> > +	u64 compat_flags;
> 
> This got me curious, u32 flags is for the in-memory inode, but the
> on-disk inode_item::flags is u64
> 
> >  BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
>                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> > +BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item, compat_flags, 64);
> 
> >  	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
> 
> Which means we currently use only 32 bits and half of the on-disk
> inode_item::flags is always zero. So the idea is to repurpose this for
> the incompat bits (say upper 16 bits). With a minimal patch to tree
> checker we can make old kernels accept a verity-enabled kernel.
> 
> It could be tricky, but for backport only additional bitmask would be
> added to BTRFS_INODE_FLAG_MASK to ignore bits 48-63.
> 
> For proper support the inode_item::flags can be simply used as one space
> where the split would be just logical, and IMO manageable.

To demonstrate the idea, here's a compile-tested patch, based on
current misc-next but the verity bits are easy to match to your
patchset:

- btrfs_inode::ro_flags - in-memory representation of the ro flags
- tree-checker verifies the flags separately
  - errors if there are unkonwn flags (u32)
  - errors if ro_flags don't match fs ro_compat bits
- inode_item::flags gets synced with btrfs_inode::flags + ro_flags
- the split of inode_item::flags is 32/32 for simplicity as it matches
  the current type, we can make it 48/16 if that would work (maybe not)

---

--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -189,8 +189,10 @@ struct btrfs_inode {
 	 */
 	u64 csum_bytes;
 
-	/* flags field from the on disk inode */
+	/* Flags field from the on disk inode, lower half of inode_item::flags  */
 	u32 flags;
+	/* Read-only compatibility flags, upper half of inode_item::flags */
+	u32 ro_flags;
 
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -281,7 +281,8 @@ struct btrfs_super_block {
 
 #define BTRFS_FEATURE_COMPAT_RO_SUPP			\
 	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
-	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
+	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
+	 BTRFS_FEATURE_COMPAT_RO_VERITY)
 
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
@@ -1490,6 +1491,8 @@ do {                                                                   \
 
 #define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
 
+#define BTRFS_INODE_RO_VERITY		(1ULL << 32)
+
 #define BTRFS_INODE_FLAG_MASK						\
 	(BTRFS_INODE_NODATASUM |					\
 	 BTRFS_INODE_NODATACOW |					\
@@ -1505,6 +1508,9 @@ do {                                                                   \
 	 BTRFS_INODE_COMPRESS |						\
 	 BTRFS_INODE_ROOT_ITEM_INIT)
 
+#define BTRFS_INODE_FLAG_INCOMPAT_MASK			(0x00000000FFFFFFFF)
+#define BTRFS_INODE_FLAG_RO_COMPAT_MASK			(0xFFFFFFFF00000000)
+
 struct btrfs_map_token {
 	struct extent_buffer *eb;
 	char *kaddr;
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1717,7 +1717,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 				       inode_peek_iversion(inode));
 	btrfs_set_stack_inode_transid(inode_item, trans->transid);
 	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
-	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags |
+			((u64)BTRFS_I(inode)->ro_flags << 32));
 	btrfs_set_stack_inode_block_group(inode_item, 0);
 
 	btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1775,7 +1776,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 				   btrfs_stack_inode_sequence(inode_item));
 	inode->i_rdev = 0;
 	*rdev = btrfs_stack_inode_rdev(inode_item);
-	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+	BTRFS_I(inode)->flags = (u32)btrfs_stack_inode_flags(inode_item);
+	BTRFS_I(inode)->ro_flags = (u32)(btrfs_stack_inode_flags(inode_item) >> 32);
 
 	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
 	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3630,7 +3630,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
-	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+	BTRFS_I(inode)->flags = (u32)btrfs_inode_flags(leaf, inode_item);
+	BTRFS_I(inode)->ro_flags = (u32)(btrfs_inode_flags(leaf, inode_item) >> 32);
 
 cache_index:
 	/*
@@ -3796,7 +3797,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
 	btrfs_set_token_inode_transid(&token, item, trans->transid);
 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
-	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags |
+			((u64)BTRFS_I(inode)->ro_flags << 32));
 	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -378,7 +378,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
 
 /* Inode item error output has the same format as dir_item_err() */
 #define inode_item_err(eb, slot, fmt, ...)			\
-	dir_item_err(eb, slot, fmt, __VA_ARGS__)
+	dir_item_err(eb, slot, fmt, ## __VA_ARGS__)
 
 static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
 			   int slot)
@@ -999,6 +999,7 @@ static int check_inode_item(struct extent_buffer *leaf,
 	u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
 	u32 mode;
 	int ret;
+	u64 inode_flags;
 
 	ret = check_inode_key(leaf, key, slot);
 	if (unlikely(ret < 0))
@@ -1054,13 +1055,22 @@ static int check_inode_item(struct extent_buffer *leaf,
 			btrfs_inode_nlink(leaf, iitem));
 		return -EUCLEAN;
 	}
-	if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
+	inode_flags = btrfs_inode_flags(leaf, iitem);
+	if (unlikely(inode_flags & ~BTRFS_INODE_FLAG_INCOMPAT_MASK)) {
 		inode_item_err(leaf, slot,
-			       "unknown flags detected: 0x%llx",
-			       btrfs_inode_flags(leaf, iitem) &
-			       ~BTRFS_INODE_FLAG_MASK);
+			       "unknown incompat flags detected: 0x%llx",
+			       inode_flags & ~BTRFS_INODE_FLAG_INCOMPAT_MASK);
 		return -EUCLEAN;
 	}
+	if (unlikely(inode_flags & ~BTRFS_INODE_FLAG_RO_COMPAT_MASK)) {
+		if (unlikely(inode_flags & BTRFS_INODE_RO_VERITY)) {
+			if (btrfs_fs_compat_ro(fs_info, VERITY)) {
+				inode_item_err(leaf, slot,
+			"inode ro compat VERITY flag set but not on filesystem");
+				return -EUCLEAN;
+			}
+		}
+	}
 	return 0;
 }
 
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3941,7 +3941,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
 	btrfs_set_token_inode_transid(&token, item, trans->transid);
 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
-	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags |
+			((u64)BTRFS_I(inode)->ro_flags << 32));
 	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -288,6 +288,7 @@ struct btrfs_ioctl_fs_info_args {
  * first mount when booting older kernel versions.
  */
 #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID	(1ULL << 1)
+#define BTRFS_FEATURE_COMPAT_RO_VERITY			(1ULL << 2)
 
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
Boris Burkov May 19, 2021, 9:45 p.m. UTC | #3
On Mon, May 17, 2021 at 11:48:59PM +0200, David Sterba wrote:
> On Tue, May 11, 2021 at 09:11:08PM +0200, David Sterba wrote:
> > On Wed, May 05, 2021 at 12:20:39PM -0700, Boris Burkov wrote:
> > > --- a/fs/btrfs/btrfs_inode.h
> > > +++ b/fs/btrfs/btrfs_inode.h
> > > @@ -191,6 +191,7 @@ struct btrfs_inode {
> > >  
> > >  	/* flags field from the on disk inode */
> > >  	u32 flags;
> > > +	u64 compat_flags;
> > 
> > This got me curious, u32 flags is for the in-memory inode, but the
> > on-disk inode_item::flags is u64
> > 
> > >  BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
> >                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > 
> > > +BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item, compat_flags, 64);
> > 
> > >  	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
> > 
> > Which means we currently use only 32 bits and half of the on-disk
> > inode_item::flags is always zero. So the idea is to repurpose this for
> > the incompat bits (say upper 16 bits). With a minimal patch to tree
> > checker we can make old kernels accept a verity-enabled kernel.
> > 
> > It could be tricky, but for backport only additional bitmask would be
> > added to BTRFS_INODE_FLAG_MASK to ignore bits 48-63.
> > 
> > For proper support the inode_item::flags can be simply used as one space
> > where the split would be just logical, and IMO manageable.
> 
> To demonstrate the idea, here's a compile-tested patch, based on
> current misc-next but the verity bits are easy to match to your
> patchset:

Thanks for taking the time to prove this idea out. However, I'd still
like to discuss the pros/cons of this approach for this application.

As far as I can tell, the two issues at hand are ensuring compatibility
and using fewer of the reserved bits. Your proposal uses 0 reserved
bits, which is great, but is still quite a headache for compatibility,
as an administrator would have to backport the compat patch on any kernel
they wanted to roll back to before the one this went out on.

This is especially painful for less well-loved things like
dracut/systemd mounting the root filesystem and doing a pivot_root during
boot. You would have to make sure that any machine using fsverity btrfs
files has an updated initramfs kernel or it won't be able to boot.

Alternatively, we could have our cake and eat it too if we separate the
idea of unlocking the top 32 bits of the inode flags from adding compat
flags.

If we:
1. take a u16 or a u32 out of reserved and make it compat flags (my
patch, but shrinking from u64)
2. implement something similar to your patch, but don't use those 32
bits just yet

Then we are setup to more conveniently use the freed-up 32 bits in the
future, as the application which wants reserved bytes then will have a
buffer of kernel versions to trivially roll back into, which may cover
most practical rollbacks.

For what it's worth, I do like that your proposal stuffs inode flags and
inode compat flags together, which is certainly neater than turning the
upper 32 of inode flags into general reserved bits. But I'm just not
sure that the aesthetic benefit is worth the real pain now.

> 
> - btrfs_inode::ro_flags - in-memory representation of the ro flags
> - tree-checker verifies the flags separately
>   - errors if there are unkonwn flags (u32)
>   - errors if ro_flags don't match fs ro_compat bits
> - inode_item::flags gets synced with btrfs_inode::flags + ro_flags
> - the split of inode_item::flags is 32/32 for simplicity as it matches
>   the current type, we can make it 48/16 if that would work (maybe not)
> 
> ---
> 
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -189,8 +189,10 @@ struct btrfs_inode {
>  	 */
>  	u64 csum_bytes;
>  
> -	/* flags field from the on disk inode */
> +	/* Flags field from the on disk inode, lower half of inode_item::flags  */
>  	u32 flags;
> +	/* Read-only compatibility flags, upper half of inode_item::flags */
> +	u32 ro_flags;
>  
>  	/*
>  	 * Counters to keep track of the number of extent item's we may use due
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -281,7 +281,8 @@ struct btrfs_super_block {
>  
>  #define BTRFS_FEATURE_COMPAT_RO_SUPP			\
>  	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
> -	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID)
> +	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
> +	 BTRFS_FEATURE_COMPAT_RO_VERITY)
>  
>  #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
>  #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
> @@ -1490,6 +1491,8 @@ do {                                                                   \
>  
>  #define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
>  
> +#define BTRFS_INODE_RO_VERITY		(1ULL << 32)
> +
>  #define BTRFS_INODE_FLAG_MASK						\
>  	(BTRFS_INODE_NODATASUM |					\
>  	 BTRFS_INODE_NODATACOW |					\
> @@ -1505,6 +1508,9 @@ do {                                                                   \
>  	 BTRFS_INODE_COMPRESS |						\
>  	 BTRFS_INODE_ROOT_ITEM_INIT)
>  
> +#define BTRFS_INODE_FLAG_INCOMPAT_MASK			(0x00000000FFFFFFFF)
> +#define BTRFS_INODE_FLAG_RO_COMPAT_MASK			(0xFFFFFFFF00000000)
> +
>  struct btrfs_map_token {
>  	struct extent_buffer *eb;
>  	char *kaddr;
> --- a/fs/btrfs/delayed-inode.c
> +++ b/fs/btrfs/delayed-inode.c
> @@ -1717,7 +1717,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
>  				       inode_peek_iversion(inode));
>  	btrfs_set_stack_inode_transid(inode_item, trans->transid);
>  	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
> -	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
> +	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags |
> +			((u64)BTRFS_I(inode)->ro_flags << 32));
>  	btrfs_set_stack_inode_block_group(inode_item, 0);
>  
>  	btrfs_set_stack_timespec_sec(&inode_item->atime,
> @@ -1775,7 +1776,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
>  				   btrfs_stack_inode_sequence(inode_item));
>  	inode->i_rdev = 0;
>  	*rdev = btrfs_stack_inode_rdev(inode_item);
> -	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
> +	BTRFS_I(inode)->flags = (u32)btrfs_stack_inode_flags(inode_item);
> +	BTRFS_I(inode)->ro_flags = (u32)(btrfs_stack_inode_flags(inode_item) >> 32);
>  
>  	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
>  	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3630,7 +3630,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
>  	rdev = btrfs_inode_rdev(leaf, inode_item);
>  
>  	BTRFS_I(inode)->index_cnt = (u64)-1;
> -	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
> +	BTRFS_I(inode)->flags = (u32)btrfs_inode_flags(leaf, inode_item);
> +	BTRFS_I(inode)->ro_flags = (u32)(btrfs_inode_flags(leaf, inode_item) >> 32);
>  
>  cache_index:
>  	/*
> @@ -3796,7 +3797,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
>  	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
>  	btrfs_set_token_inode_transid(&token, item, trans->transid);
>  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
> -	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
> +	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags |
> +			((u64)BTRFS_I(inode)->ro_flags << 32));
>  	btrfs_set_token_inode_block_group(&token, item, 0);
>  }
>  
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -378,7 +378,7 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
>  
>  /* Inode item error output has the same format as dir_item_err() */
>  #define inode_item_err(eb, slot, fmt, ...)			\
> -	dir_item_err(eb, slot, fmt, __VA_ARGS__)
> +	dir_item_err(eb, slot, fmt, ## __VA_ARGS__)
>  
>  static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
>  			   int slot)
> @@ -999,6 +999,7 @@ static int check_inode_item(struct extent_buffer *leaf,
>  	u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
>  	u32 mode;
>  	int ret;
> +	u64 inode_flags;
>  
>  	ret = check_inode_key(leaf, key, slot);
>  	if (unlikely(ret < 0))
> @@ -1054,13 +1055,22 @@ static int check_inode_item(struct extent_buffer *leaf,
>  			btrfs_inode_nlink(leaf, iitem));
>  		return -EUCLEAN;
>  	}
> -	if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) {
> +	inode_flags = btrfs_inode_flags(leaf, iitem);
> +	if (unlikely(inode_flags & ~BTRFS_INODE_FLAG_INCOMPAT_MASK)) {
>  		inode_item_err(leaf, slot,
> -			       "unknown flags detected: 0x%llx",
> -			       btrfs_inode_flags(leaf, iitem) &
> -			       ~BTRFS_INODE_FLAG_MASK);
> +			       "unknown incompat flags detected: 0x%llx",
> +			       inode_flags & ~BTRFS_INODE_FLAG_INCOMPAT_MASK);
>  		return -EUCLEAN;
>  	}
> +	if (unlikely(inode_flags & ~BTRFS_INODE_FLAG_RO_COMPAT_MASK)) {
> +		if (unlikely(inode_flags & BTRFS_INODE_RO_VERITY)) {
> +			if (btrfs_fs_compat_ro(fs_info, VERITY)) {
> +				inode_item_err(leaf, slot,
> +			"inode ro compat VERITY flag set but not on filesystem");
> +				return -EUCLEAN;
> +			}
> +		}
> +	}
>  	return 0;
>  }
>  
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -3941,7 +3941,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
>  	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
>  	btrfs_set_token_inode_transid(&token, item, trans->transid);
>  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
> -	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
> +	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags |
> +			((u64)BTRFS_I(inode)->ro_flags << 32));
>  	btrfs_set_token_inode_block_group(&token, item, 0);
>  }
>  
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -288,6 +288,7 @@ struct btrfs_ioctl_fs_info_args {
>   * first mount when booting older kernel versions.
>   */
>  #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID	(1ULL << 1)
> +#define BTRFS_FEATURE_COMPAT_RO_VERITY			(1ULL << 2)
>  
>  #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
>  #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
> -- 
> 2.29.2
>
Eric Biggers May 25, 2021, 6:12 p.m. UTC | #4
On Wed, May 05, 2021 at 12:20:39PM -0700, Boris Burkov wrote:
> The tree checker currently rejects unrecognized flags when it reads
> btrfs_inode_item. Practically, this means that adding a new flag makes
> the change backwards incompatible if the flag is ever set on a file.
> 
> Take up one of the 4 reserved u64 fields in the btrfs_inode_item as a
> new "compat_flags". These flags are zero on inode creation in btrfs and
> mkfs and are ignored by an older kernel, so it should be safe to use
> them in this way.
> 
> Signed-off-by: Boris Burkov <boris@bur.io>

This patchset doesn't have a cover letter anymore for some reason.

Also, please mention where this patchset applies to.  I tried mainline and
btrfs/for-next, but neither works.

- Eric
David Sterba June 7, 2021, 9:10 p.m. UTC | #5
On Tue, May 25, 2021 at 11:12:21AM -0700, Eric Biggers wrote:
> On Wed, May 05, 2021 at 12:20:39PM -0700, Boris Burkov wrote:
> > The tree checker currently rejects unrecognized flags when it reads
> > btrfs_inode_item. Practically, this means that adding a new flag makes
> > the change backwards incompatible if the flag is ever set on a file.
> > 
> > Take up one of the 4 reserved u64 fields in the btrfs_inode_item as a
> > new "compat_flags". These flags are zero on inode creation in btrfs and
> > mkfs and are ignored by an older kernel, so it should be safe to use
> > them in this way.
> > 
> > Signed-off-by: Boris Burkov <boris@bur.io>
> 
> This patchset doesn't have a cover letter anymore for some reason.
> 
> Also, please mention where this patchset applies to.  I tried mainline and
> btrfs/for-next, but neither works.

There was a parallel change updating file attributes causing conflict
with the patchset as sent. Boris is aware of that and the new version
will be on top of something that appalies on top of the btrfs
development branch again.
David Sterba June 7, 2021, 9:43 p.m. UTC | #6
Hi,

sorry I did not notice you replied some time ago.

On Wed, May 19, 2021 at 02:45:23PM -0700, Boris Burkov wrote:
> On Mon, May 17, 2021 at 11:48:59PM +0200, David Sterba wrote:
> > On Tue, May 11, 2021 at 09:11:08PM +0200, David Sterba wrote:
> > > On Wed, May 05, 2021 at 12:20:39PM -0700, Boris Burkov wrote:
> > > > --- a/fs/btrfs/btrfs_inode.h
> > > > +++ b/fs/btrfs/btrfs_inode.h
> > > > @@ -191,6 +191,7 @@ struct btrfs_inode {
> > > >  
> > > >  	/* flags field from the on disk inode */
> > > >  	u32 flags;
> > > > +	u64 compat_flags;
> > > 
> > > This got me curious, u32 flags is for the in-memory inode, but the
> > > on-disk inode_item::flags is u64
> > > 
> > > >  BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
> > >                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > > 
> > > > +BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item, compat_flags, 64);
> > > 
> > > >  	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
> > > 
> > > Which means we currently use only 32 bits and half of the on-disk
> > > inode_item::flags is always zero. So the idea is to repurpose this for
> > > the incompat bits (say upper 16 bits). With a minimal patch to tree
> > > checker we can make old kernels accept a verity-enabled kernel.
> > > 
> > > It could be tricky, but for backport only additional bitmask would be
> > > added to BTRFS_INODE_FLAG_MASK to ignore bits 48-63.
> > > 
> > > For proper support the inode_item::flags can be simply used as one space
> > > where the split would be just logical, and IMO manageable.
> > 
> > To demonstrate the idea, here's a compile-tested patch, based on
> > current misc-next but the verity bits are easy to match to your
> > patchset:
> 
> Thanks for taking the time to prove this idea out. However, I'd still
> like to discuss the pros/cons of this approach for this application.
> 
> As far as I can tell, the two issues at hand are ensuring compatibility
> and using fewer of the reserved bits. Your proposal uses 0 reserved
> bits, which is great, but is still quite a headache for compatibility,
> as an administrator would have to backport the compat patch on any kernel
> they wanted to roll back to before the one this went out on.

The compatibility problems are there for any new feature and usually
it's strict no mount, while here we can do a read-only compat mode at
least. Deploying a new feature should always take the fallback mount
into account, so it's advisable to wait a few releases eg. up to the
next stable release.

Luckily in that case we can backport the compatibility to the older
stable trees so the fallback would work after a minor release.

> This is especially painful for less well-loved things like
> dracut/systemd mounting the root filesystem and doing a pivot_root during
> boot. You would have to make sure that any machine using fsverity btrfs
> files has an updated initramfs kernel or it won't be able to boot.

So I hope this would get covered by the backports, as discussed, to 5.4
and 5.10.

> Alternatively, we could have our cake and eat it too if we separate the
> idea of unlocking the top 32 bits of the inode flags from adding compat
> flags.
> 
> If we:
> 1. take a u16 or a u32 out of reserved and make it compat flags (my
> patch, but shrinking from u64)
> 2. implement something similar to your patch, but don't use those 32
> bits just yet
> 
> Then we are setup to more conveniently use the freed-up 32 bits in the
> future, as the application which wants reserved bytes then will have a
> buffer of kernel versions to trivially roll back into, which may cover
> most practical rollbacks.
> 
> For what it's worth, I do like that your proposal stuffs inode flags and
> inode compat flags together, which is certainly neater than turning the
> upper 32 of inode flags into general reserved bits. But I'm just not
> sure that the aesthetic benefit is worth the real pain now.

My motivation is not aesthetic, rather I'm very conservative when
on-disk structures get changed, and inode is the core structure.
Curiously, you can thank Josef who switched the per-inode compat flags
to whole-filesystem only in f2b636e80d8206dd40 "Btrfs: add support for
compat flags to btrfs". But that was in 2008 and was a hard incompatible
change that lead to the last major format change (the _BHRfS_M
signature).

If the incompat change can be squeezed into existing structure, it
leaves the reserved fileds untouched. Right now we have 4x u64. Any
other change requires increasing the item size which is ultimately
possible but brings other problems. So if there's a possibily not to go
to the next level, I'll pursue it. Right now the major objection is the
problem with deployment and fallback mount, but I think this is solved.

Until now I haven't found any problem with the ro compat flags merged to
normal flags on itself, so as agreed offline, we're going to do that.
diff mbox series

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c652e19ad74e..e8dbc8e848ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -191,6 +191,7 @@  struct btrfs_inode {
 
 	/* flags field from the on disk inode */
 	u32 flags;
+	u64 compat_flags;
 
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0f5b0b12762b..0546273a520b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1786,6 +1786,7 @@  BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
 BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item, compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
@@ -1803,6 +1804,7 @@  BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_compat_flags, struct btrfs_inode_item, compat_flags, 64);
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1a88f6214ebc..ef4e0265dbe3 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1718,6 +1718,7 @@  static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_inode_transid(inode_item, trans->transid);
 	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
 	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+	btrfs_set_stack_inode_compat_flags(inode_item, BTRFS_I(inode)->compat_flags);
 	btrfs_set_stack_inode_block_group(inode_item, 0);
 
 	btrfs_set_stack_timespec_sec(&inode_item->atime,
@@ -1776,6 +1777,7 @@  int btrfs_fill_inode(struct inode *inode, u32 *rdev)
 	inode->i_rdev = 0;
 	*rdev = btrfs_stack_inode_rdev(inode_item);
 	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
+	BTRFS_I(inode)->compat_flags = btrfs_stack_inode_compat_flags(inode_item);
 
 	inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
 	inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 69fcdf8f0b1c..d89000577f7f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3627,6 +3627,7 @@  static int btrfs_read_locked_inode(struct inode *inode,
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+	BTRFS_I(inode)->compat_flags = btrfs_inode_compat_flags(leaf, inode_item);
 
 cache_index:
 	/*
@@ -3793,6 +3794,7 @@  static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_transid(&token, item, trans->transid);
 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
 	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	btrfs_set_token_inode_compat_flags(&token, item, BTRFS_I(inode)->compat_flags);
 	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
@@ -8857,6 +8859,7 @@  struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->defrag_bytes = 0;
 	ei->disk_i_size = 0;
 	ei->flags = 0;
+	ei->compat_flags = 0;
 	ei->csum_bytes = 0;
 	ei->index_cnt = (u64)-1;
 	ei->dir_index = 0;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0ba0e4ddaf6b..ff335c192170 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -102,8 +102,9 @@  static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
  * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
  * ioctl.
  */
-static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
+static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
 {
+	unsigned int flags = binode->flags;
 	unsigned int iflags = 0;
 
 	if (flags & BTRFS_INODE_SYNC)
@@ -156,7 +157,7 @@  void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
 {
 	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
-	unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
+	unsigned int flags = btrfs_inode_flags_to_fsflags(binode);
 
 	if (copy_to_user(arg, &flags, sizeof(flags)))
 		return -EFAULT;
@@ -228,7 +229,7 @@  static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 	btrfs_inode_lock(inode, 0);
 	fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
-	old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+	old_fsflags = btrfs_inode_flags_to_fsflags(binode);
 
 	ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
 	if (ret)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a0fc3a1390ab..3ef166a3485a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3944,6 +3944,7 @@  static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_token_inode_transid(&token, item, trans->transid);
 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
 	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+	btrfs_set_token_inode_compat_flags(&token, item, BTRFS_I(inode)->compat_flags);
 	btrfs_set_token_inode_block_group(&token, item, 0);
 }
 
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 58d7cff9afb1..ae25280316bd 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -574,11 +574,16 @@  struct btrfs_inode_item {
 	/* modification sequence number for NFS */
 	__le64 sequence;
 
+	/*
+	 * flags which aren't checked for corruption at mount
+	 * and can be added in a backwards compatible way
+	 */
+	__le64 compat_flags;
 	/*
 	 * a little future expansion, for more than this we can
 	 * just grow the inode item and version it
 	 */
-	__le64 reserved[4];
+	__le64 reserved[3];
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;