[2/2] xfs: clear XFS_ILOG_[AD]OWNER for non-btree formats
diff mbox series

Message ID 31aaeb46-8fb5-79f5-66bd-d594b9a6cc89@redhat.com
State New
Headers show
Series
  • xfs: swapext replay fixes
Related show

Commit Message

Eric Sandeen Dec. 18, 2018, 7:09 p.m. UTC
If an inode had been in btree format and had a data fork owner change
logged, that flag must be cleared if the inode changes format to
non-btree.  Otherwise we hit the old ASSERT (now changed to
corruption detection) in xfs_recover_inode_owner_change() which
enforces that if XFS_ILOG_[AD]OWNER is set, XFS_ILOG_[AD]BROOT
must be as well.

(Or, put more plainly, changing the fork owner is nonsensical
for inodes which have no such fork.)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---

Comments

Darrick J. Wong Dec. 18, 2018, 7:28 p.m. UTC | #1
On Tue, Dec 18, 2018 at 01:09:40PM -0600, Eric Sandeen wrote:
> If an inode had been in btree format and had a data fork owner change
> logged, that flag must be cleared if the inode changes format to
> non-btree.  Otherwise we hit the old ASSERT (now changed to
> corruption detection) in xfs_recover_inode_owner_change() which
> enforces that if XFS_ILOG_[AD]OWNER is set, XFS_ILOG_[AD]BROOT
> must be as well.
> 
> (Or, put more plainly, changing the fork owner is nonsensical
> for inodes which have no such fork.)

...and now I learn why not to review a series until all the patches show
up. :(

> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
> ---
> 
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index fa1c4fe2ffbf..82a13b7ad321 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -144,7 +144,8 @@ xfs_inode_item_format_data_fork(
>  	switch (ip->i_d.di_format) {
>  	case XFS_DINODE_FMT_EXTENTS:
>  		iip->ili_fields &=
> -			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
> +			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
> +			  XFS_ILOG_DEV | XFS_ILOG_DOWNER);

So ... existing kernels could accidentally log an inode with an extents
format data fork and (DOWNER | DBROOT).  If a kernel encounters such a
log item, it would assert and kaboom, and now it'll fail the rest of log
recovery, which forces the admin to zap the log.

However, if the data fork is in extents format there isn't any work
required to change the owner anyway, so why do we have to abort the log
recovery in that case?

IOWs, why doesn't the previous patch do:

/*
 * Some old kernels mistakenly log inode items with DOWNER set on a
 * inode with a non-btree format data fork.  DBROOT won't be set in this
 * non-btree case, but since the work that DOWNER does is only required
 * for the btree case we can safely ignore this one weird combination.
 */
if ((ili_fields & DOWNER) && di_format == FMT_BTREE) {
	if (!(ili_fields & DBROOT))
		return -EFSCORRUPTED;
	error = xfs_bmbt_change_owner(...);
}

--D

>  
>  		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
>  		    ip->i_d.di_nextents > 0 &&
> @@ -185,7 +186,8 @@ xfs_inode_item_format_data_fork(
>  		break;
>  	case XFS_DINODE_FMT_LOCAL:
>  		iip->ili_fields &=
> -			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
> +			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
> +			  XFS_ILOG_DEV | XFS_ILOG_DOWNER);
>  		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
>  		    ip->i_df.if_bytes > 0) {
>  			/*
> @@ -206,7 +208,8 @@ xfs_inode_item_format_data_fork(
>  		break;
>  	case XFS_DINODE_FMT_DEV:
>  		iip->ili_fields &=
> -			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT);
> +			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
> +			  XFS_ILOG_DEXT | XFS_ILOG_DOWNER);
>  		if (iip->ili_fields & XFS_ILOG_DEV)
>  			ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev);
>  		break;
> @@ -229,7 +232,7 @@ xfs_inode_item_format_attr_fork(
>  	switch (ip->i_d.di_aformat) {
>  	case XFS_DINODE_FMT_EXTENTS:
>  		iip->ili_fields &=
> -			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
> +			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AOWNER);
>  
>  		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
>  		    ip->i_d.di_anextents > 0 &&
> @@ -268,7 +271,7 @@ xfs_inode_item_format_attr_fork(
>  		break;
>  	case XFS_DINODE_FMT_LOCAL:
>  		iip->ili_fields &=
> -			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
> +			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT | XFS_ILOG_AOWNER);
>  
>  		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
>  		    ip->i_afp->if_bytes > 0) {
>
Eric Sandeen Dec. 18, 2018, 7:42 p.m. UTC | #2
On 12/18/18 1:28 PM, Darrick J. Wong wrote:
> On Tue, Dec 18, 2018 at 01:09:40PM -0600, Eric Sandeen wrote:
>> If an inode had been in btree format and had a data fork owner change
>> logged, that flag must be cleared if the inode changes format to
>> non-btree.  Otherwise we hit the old ASSERT (now changed to
>> corruption detection) in xfs_recover_inode_owner_change() which
>> enforces that if XFS_ILOG_[AD]OWNER is set, XFS_ILOG_[AD]BROOT
>> must be as well.
>>
>> (Or, put more plainly, changing the fork owner is nonsensical
>> for inodes which have no such fork.)
> 
> ...and now I learn why not to review a series until all the patches show
> up. :(
> 
>> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
>> ---
>>
>> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
>> index fa1c4fe2ffbf..82a13b7ad321 100644
>> --- a/fs/xfs/xfs_inode_item.c
>> +++ b/fs/xfs/xfs_inode_item.c
>> @@ -144,7 +144,8 @@ xfs_inode_item_format_data_fork(
>>  	switch (ip->i_d.di_format) {
>>  	case XFS_DINODE_FMT_EXTENTS:
>>  		iip->ili_fields &=
>> -			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
>> +			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
>> +			  XFS_ILOG_DEV | XFS_ILOG_DOWNER);
> 
> So ... existing kernels could accidentally log an inode with an extents
> format data fork and (DOWNER | DBROOT).  If a kernel encounters such a
> log item, it would assert and kaboom, and now it'll fail the rest of log
> recovery, which forces the admin to zap the log.
> 
> However, if the data fork is in extents format there isn't any work
> required to change the owner anyway, so why do we have to abort the log
> recovery in that case?

Because I didn't think we were in the business of guessing which bits of
inconsistency indicate corruption and which bits are "probably ok"
during log recovery...

We have a mismatch between the logged fields and the format.  Your
suggestion seems to put a lot of faith in the format being correct,
and therefore deciding to ignore the inconsistency in the fields, no?

i.e. if we don't see FMT_BTREE, what justification do we have for
completely ignoring the flag inconsistency?  What if the format was
wrong? 

I don't really see how we can make that prioritization.  Maybe I'm
not thinking creatively enough and you can convince me.  :)

(Granted, the admin recovery path of "throw it all away" does suck,
but such is life.)

-Eric

> IOWs, why doesn't the previous patch do:
> 
> /*
>  * Some old kernels mistakenly log inode items with DOWNER set on a
>  * inode with a non-btree format data fork.  DBROOT won't be set in this
>  * non-btree case, but since the work that DOWNER does is only required
>  * for the btree case we can safely ignore this one weird combination.
>  */
> if ((ili_fields & DOWNER) && di_format == FMT_BTREE) {
> 	if (!(ili_fields & DBROOT))
> 		return -EFSCORRUPTED;
> 	error = xfs_bmbt_change_owner(...);
> }
> 
> --D

Patch
diff mbox series

diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fa1c4fe2ffbf..82a13b7ad321 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -144,7 +144,8 @@  xfs_inode_item_format_data_fork(
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		iip->ili_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_DOWNER);
 
 		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
 		    ip->i_d.di_nextents > 0 &&
@@ -185,7 +186,8 @@  xfs_inode_item_format_data_fork(
 		break;
 	case XFS_DINODE_FMT_LOCAL:
 		iip->ili_fields &=
-			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
+			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_DOWNER);
 		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
 		    ip->i_df.if_bytes > 0) {
 			/*
@@ -206,7 +208,8 @@  xfs_inode_item_format_data_fork(
 		break;
 	case XFS_DINODE_FMT_DEV:
 		iip->ili_fields &=
-			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT);
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEXT | XFS_ILOG_DOWNER);
 		if (iip->ili_fields & XFS_ILOG_DEV)
 			ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev);
 		break;
@@ -229,7 +232,7 @@  xfs_inode_item_format_attr_fork(
 	switch (ip->i_d.di_aformat) {
 	case XFS_DINODE_FMT_EXTENTS:
 		iip->ili_fields &=
-			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AOWNER);
 
 		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
 		    ip->i_d.di_anextents > 0 &&
@@ -268,7 +271,7 @@  xfs_inode_item_format_attr_fork(
 		break;
 	case XFS_DINODE_FMT_LOCAL:
 		iip->ili_fields &=
-			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT | XFS_ILOG_AOWNER);
 
 		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
 		    ip->i_afp->if_bytes > 0) {