[RESEND] jbd2: fix ocfs2 corrupt when updating journal superblock fails
diff mbox

Message ID 557E703D.2060709@huawei.com
State New
Headers show

Commit Message

Joseph Qi June 15, 2015, 6:27 a.m. UTC
If updating journal superblock fails after journal data has been flushed,
the error is omitted and this will mislead the caller as a normal case.
In ocfs2, the checkpoint will be treated successfully and the other node
can get the lock to update. Since the sb_start is still pointing to the
old log block, it will rewrite the journal data during journal recovery
by the other node. Thus the new updates will be overwritten and ocfs2
corrupts.
So in above case we have to return the error, and ocfs2_commit_cache will
take care of the error and prevent the other node to do update first.
And only after recovering journal it can do the new updates.

The issue discussion mail can be found at:
https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html
http://comments.gmane.org/gmane.comp.file-systems.ext4/48841

Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Tested-by: Yiwen Jiang <jiangyiwen@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: <stable@vger.kernel.org>
---
 fs/jbd2/checkpoint.c |  5 ++---
 fs/jbd2/journal.c    | 37 ++++++++++++++++++++++++++++++-------
 include/linux/jbd2.h |  4 ++--
 3 files changed, 34 insertions(+), 12 deletions(-)

Comments

Jan Kara June 15, 2015, 1:08 p.m. UTC | #1
On Mon 15-06-15 14:27:09, Joseph Qi wrote:
> If updating journal superblock fails after journal data has been flushed,
> the error is omitted and this will mislead the caller as a normal case.
> In ocfs2, the checkpoint will be treated successfully and the other node
> can get the lock to update. Since the sb_start is still pointing to the
> old log block, it will rewrite the journal data during journal recovery
> by the other node. Thus the new updates will be overwritten and ocfs2
> corrupts.
> So in above case we have to return the error, and ocfs2_commit_cache will
> take care of the error and prevent the other node to do update first.
> And only after recovering journal it can do the new updates.
> 
> The issue discussion mail can be found at:
> https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html
> http://comments.gmane.org/gmane.comp.file-systems.ext4/48841
> 
> Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
> Tested-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Cc: Junxiao Bi <junxiao.bi@oracle.com>
> Cc: <stable@vger.kernel.org>
  The patch looks good but it seems to be against relatively old kernel
version. Can you please rebase your patch against current kernel? Thanks!

								Honza

> ---
>  fs/jbd2/checkpoint.c |  5 ++---
>  fs/jbd2/journal.c    | 37 ++++++++++++++++++++++++++++++-------
>  include/linux/jbd2.h |  4 ++--
>  3 files changed, 34 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
> index 988b32e..82e5b7d 100644
> --- a/fs/jbd2/checkpoint.c
> +++ b/fs/jbd2/checkpoint.c
> @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
>  	unsigned long	blocknr;
> 
>  	if (is_journal_aborted(journal))
> -		return 1;
> +		return -EIO;
> 
>  	if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
>  		return 1;
> @@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
>  	if (journal->j_flags & JBD2_BARRIER)
>  		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
> 
> -	__jbd2_update_log_tail(journal, first_tid, blocknr);
> -	return 0;
> +	return __jbd2_update_log_tail(journal, first_tid, blocknr);
>  }
> 
> 
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index b96bd80..6b33a42 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -885,9 +885,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
>   *
>   * Requires j_checkpoint_mutex
>   */
> -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
> +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
>  {
>  	unsigned long freed;
> +	int ret;
> 
>  	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
> 
> @@ -897,7 +898,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
>  	 * space and if we lose sb update during power failure we'd replay
>  	 * old transaction with possibly newly overwritten data.
>  	 */
> -	jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
> +	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
> +	if (ret)
> +		goto out;
> +
>  	write_lock(&journal->j_state_lock);
>  	freed = block - journal->j_tail;
>  	if (block < journal->j_tail)
> @@ -913,6 +917,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
>  	journal->j_tail_sequence = tid;
>  	journal->j_tail = block;
>  	write_unlock(&journal->j_state_lock);
> +
> +out:
> +	return ret;
>  }
> 
>  /*
> @@ -1331,7 +1338,7 @@ static int journal_reset(journal_t *journal)
>  	return jbd2_journal_start_thread(journal);
>  }
> 
> -static void jbd2_write_superblock(journal_t *journal, int write_op)
> +static int jbd2_write_superblock(journal_t *journal, int write_op)
>  {
>  	struct buffer_head *bh = journal->j_sb_buffer;
>  	journal_superblock_t *sb = journal->j_superblock;
> @@ -1370,7 +1377,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
>  		printk(KERN_ERR "JBD2: Error %d detected when updating "
>  		       "journal superblock for %s.\n", ret,
>  		       journal->j_devname);
> +		jbd2_journal_abort(journal, ret);
>  	}
> +
> +	return ret;
>  }
> 
>  /**
> @@ -1383,10 +1393,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
>   * Update a journal's superblock information about log tail and write it to
>   * disk, waiting for the IO to complete.
>   */
> -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
> +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
>  				     unsigned long tail_block, int write_op)
>  {
>  	journal_superblock_t *sb = journal->j_superblock;
> +	int ret;
> 
>  	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
>  	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
> @@ -1395,13 +1406,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
>  	sb->s_sequence = cpu_to_be32(tail_tid);
>  	sb->s_start    = cpu_to_be32(tail_block);
> 
> -	jbd2_write_superblock(journal, write_op);
> +	ret = jbd2_write_superblock(journal, write_op);
> +	if (ret)
> +		goto out;
> 
>  	/* Log is no longer empty */
>  	write_lock(&journal->j_state_lock);
>  	WARN_ON(!sb->s_sequence);
>  	journal->j_flags &= ~JBD2_FLUSHED;
>  	write_unlock(&journal->j_state_lock);
> +
> +out:
> +	return ret;
>  }
> 
>  /**
> @@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal)
>  		return -EIO;
> 
>  	mutex_lock(&journal->j_checkpoint_mutex);
> -	jbd2_cleanup_journal_tail(journal);
> +	if (!err) {
> +		err = jbd2_cleanup_journal_tail(journal);
> +		if (err < 0) {
> +			mutex_unlock(&journal->j_checkpoint_mutex);
> +			goto out;
> +		}
> +	}
> 
>  	/* Finally, mark the journal as really needing no recovery.
>  	 * This sets s_start==0 in the underlying superblock, which is
> @@ -1966,7 +1988,8 @@ int jbd2_journal_flush(journal_t *journal)
>  	J_ASSERT(journal->j_head == journal->j_tail);
>  	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
>  	write_unlock(&journal->j_state_lock);
> -	return 0;
> +out:
> +	return err;
>  }
> 
>  /**
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 20e7f78..edb640a 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
>  int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
>  int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
>  			      unsigned long *block);
> -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
> +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
>  void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
> 
>  /* Commit management */
> @@ -1157,7 +1157,7 @@ extern int	   jbd2_journal_recover    (journal_t *journal);
>  extern int	   jbd2_journal_wipe       (journal_t *, int);
>  extern int	   jbd2_journal_skip_recovery	(journal_t *);
>  extern void	   jbd2_journal_update_sb_errno(journal_t *);
> -extern void	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
> +extern int	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
>  				unsigned long, int);
>  extern void	   __jbd2_journal_abort_hard	(journal_t *);
>  extern void	   jbd2_journal_abort      (journal_t *, int);
> -- 
> 1.8.4.3
> 
>
Theodore Y. Ts'o June 15, 2015, 6:47 p.m. UTC | #2
On Mon, Jun 15, 2015 at 02:27:09PM +0800, Joseph Qi wrote:
> If updating journal superblock fails after journal data has been flushed,
> the error is omitted and this will mislead the caller as a normal case.
> In ocfs2, the checkpoint will be treated successfully and the other node
> can get the lock to update. Since the sb_start is still pointing to the
> old log block, it will rewrite the journal data during journal recovery
> by the other node. Thus the new updates will be overwritten and ocfs2
> corrupts.
> So in above case we have to return the error, and ocfs2_commit_cache will
> take care of the error and prevent the other node to do update first.
> And only after recovering journal it can do the new updates.
> 
> The issue discussion mail can be found at:
> https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html
> http://comments.gmane.org/gmane.comp.file-systems.ext4/48841
> 
> Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
> Tested-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Cc: Junxiao Bi <junxiao.bi@oracle.com>
> Cc: <stable@vger.kernel.org>

Thanks, applied.

					- Ted
Theodore Y. Ts'o June 15, 2015, 7:02 p.m. UTC | #3
On Mon, Jun 15, 2015 at 03:08:44PM +0200, Jan Kara wrote:
>   The patch looks good but it seems to be against relatively old kernel
> version. Can you please rebase your patch against current kernel? Thanks!

It applied cleanly for me, but there's quite a few commits building up
in the ext4.git tree for the next merge cycle (see shortlog below).
I'm guessing that perhaps Joseph generated his patch versus the
linux-next tree?

							- Ted

Andreas Dilger (1):
      ext4: improve warning directory handling messages

Chao Yu (1):
      ext4 crypto: release crypto resource on module exit

David Moore (1):
      ext4: BUG_ON assertion repeated for inode1, not done for inode2

Dmitry Monakhov (1):
      jbd2: use GFP_NOFS in jbd2_cleanup_journal_tail()

Fabian Frederick (3):
      ext4 crypto: fix sparse warnings in fs/ext4/ioctl.c
      ext4: use swap() in memswap()
      ext4: use swap() in mext_page_double_lock()

Jan Kara (4):
      jbd2: simplify code flow in do_get_write_access()
      jbd2: simplify error path on allocation failure in do_get_write_access()
      jbd2: more simplifications in do_get_write_access()
      jbd2: speedup jbd2_journal_get_[write|undo]_access()

Joseph Qi (1):
      jbd2: fix ocfs2 corrupt when updating journal superblock fails

Lukas Czerner (5):
      ext4: verify block bitmap even after fresh initialization
      ext4: try to initialize all groups we can in case of failure on ppc64
      ext4: return error code from ext4_mb_good_group()
      ext4: recalculate journal credits as inode depth changes
      ext4: wait for existing dio workers in ext4_alloc_file_blocks()

Michal Hocko (1):
      jbd2: revert must-not-fail allocation loops back to GFP_NOFAIL

Namjae Jeon (1):
      ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate

Rasmus Villemoes (1):
      ext4: mballoc: avoid 20-argument function call

Theodore Ts'o (23):
      ext4 crypto: optimize filename encryption
      ext4 crypto: don't allocate a page when encrypting/decrypting file names
      ext4 crypto: separate kernel and userspace structure for the key
      ext4 crypto: reorganize how we store keys in the inode
      ext4: clean up superblock encryption mode fields
      ext4 crypto: use slab caches
      ext4 crypto: get rid of ci_mode from struct ext4_crypt_info
      ext4 crypto: shrink size of the ext4_crypto_ctx structure
      ext4 crypto: require CONFIG_CRYPTO_CTR if ext4 encryption is enabled
      ext4 crypto: use per-inode tfm structure
      ext4 crypto: fix memory leaks in ext4_encrypted_zeroout
      ext4 crypto: set up encryption info for new inodes in ext4_inherit_context()
      ext4 crypto: make sure the encryption info is initialized on opendir(2)
      ext4 crypto: encrypt tmpfile located in encryption protected directory
      ext4 crypto: enforce crypto policy restrictions on cross-renames
      ext4 crypto: policies may only be set on directories
      ext4 crypto: clean up error handling in ext4_fname_setup_filename
      ext4 crypto: allocate the right amount of memory for the on-disk symlink
      ext4 crypto: handle unexpected lack of encryption keys
      ext4 crypto: allocate bounce pages using GFP_NOWAIT
      ext4 crypto: fix ext4_get_crypto_ctx()'s calling convention in ext4_decrypt_one
      ext4 crypto: fail the mount if blocksize != pagesize
      ext4: fix race between truncate and __ext4_journalled_writepage()

Patch
diff mbox

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 988b32e..82e5b7d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -390,7 +390,7 @@  int jbd2_cleanup_journal_tail(journal_t *journal)
 	unsigned long	blocknr;

 	if (is_journal_aborted(journal))
-		return 1;
+		return -EIO;

 	if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
 		return 1;
@@ -407,8 +407,7 @@  int jbd2_cleanup_journal_tail(journal_t *journal)
 	if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);

-	__jbd2_update_log_tail(journal, first_tid, blocknr);
-	return 0;
+	return __jbd2_update_log_tail(journal, first_tid, blocknr);
 }


diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b96bd80..6b33a42 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -885,9 +885,10 @@  int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
  *
  * Requires j_checkpoint_mutex
  */
-void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 {
 	unsigned long freed;
+	int ret;

 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

@@ -897,7 +898,10 @@  void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 	 * space and if we lose sb update during power failure we'd replay
 	 * old transaction with possibly newly overwritten data.
 	 */
-	jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+	if (ret)
+		goto out;
+
 	write_lock(&journal->j_state_lock);
 	freed = block - journal->j_tail;
 	if (block < journal->j_tail)
@@ -913,6 +917,9 @@  void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 	journal->j_tail_sequence = tid;
 	journal->j_tail = block;
 	write_unlock(&journal->j_state_lock);
+
+out:
+	return ret;
 }

 /*
@@ -1331,7 +1338,7 @@  static int journal_reset(journal_t *journal)
 	return jbd2_journal_start_thread(journal);
 }

-static void jbd2_write_superblock(journal_t *journal, int write_op)
+static int jbd2_write_superblock(journal_t *journal, int write_op)
 {
 	struct buffer_head *bh = journal->j_sb_buffer;
 	journal_superblock_t *sb = journal->j_superblock;
@@ -1370,7 +1377,10 @@  static void jbd2_write_superblock(journal_t *journal, int write_op)
 		printk(KERN_ERR "JBD2: Error %d detected when updating "
 		       "journal superblock for %s.\n", ret,
 		       journal->j_devname);
+		jbd2_journal_abort(journal, ret);
 	}
+
+	return ret;
 }

 /**
@@ -1383,10 +1393,11 @@  static void jbd2_write_superblock(journal_t *journal, int write_op)
  * Update a journal's superblock information about log tail and write it to
  * disk, waiting for the IO to complete.
  */
-void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 				     unsigned long tail_block, int write_op)
 {
 	journal_superblock_t *sb = journal->j_superblock;
+	int ret;

 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
@@ -1395,13 +1406,18 @@  void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 	sb->s_sequence = cpu_to_be32(tail_tid);
 	sb->s_start    = cpu_to_be32(tail_block);

-	jbd2_write_superblock(journal, write_op);
+	ret = jbd2_write_superblock(journal, write_op);
+	if (ret)
+		goto out;

 	/* Log is no longer empty */
 	write_lock(&journal->j_state_lock);
 	WARN_ON(!sb->s_sequence);
 	journal->j_flags &= ~JBD2_FLUSHED;
 	write_unlock(&journal->j_state_lock);
+
+out:
+	return ret;
 }

 /**
@@ -1950,7 +1966,13 @@  int jbd2_journal_flush(journal_t *journal)
 		return -EIO;

 	mutex_lock(&journal->j_checkpoint_mutex);
-	jbd2_cleanup_journal_tail(journal);
+	if (!err) {
+		err = jbd2_cleanup_journal_tail(journal);
+		if (err < 0) {
+			mutex_unlock(&journal->j_checkpoint_mutex);
+			goto out;
+		}
+	}

 	/* Finally, mark the journal as really needing no recovery.
 	 * This sets s_start==0 in the underlying superblock, which is
@@ -1966,7 +1988,8 @@  int jbd2_journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	write_unlock(&journal->j_state_lock);
-	return 0;
+out:
+	return err;
 }

 /**
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 20e7f78..edb640a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1035,7 +1035,7 @@  struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
 int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
 int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
 			      unsigned long *block);
-void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
+int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);

 /* Commit management */
@@ -1157,7 +1157,7 @@  extern int	   jbd2_journal_recover    (journal_t *journal);
 extern int	   jbd2_journal_wipe       (journal_t *, int);
 extern int	   jbd2_journal_skip_recovery	(journal_t *);
 extern void	   jbd2_journal_update_sb_errno(journal_t *);
-extern void	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
+extern int	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
 				unsigned long, int);
 extern void	   __jbd2_journal_abort_hard	(journal_t *);
 extern void	   jbd2_journal_abort      (journal_t *, int);