Message ID | 557E703D.2060709@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon 15-06-15 14:27:09, Joseph Qi wrote: > If updating journal superblock fails after journal data has been flushed, > the error is omitted and this will mislead the caller as a normal case. > In ocfs2, the checkpoint will be treated successfully and the other node > can get the lock to update. Since the sb_start is still pointing to the > old log block, it will rewrite the journal data during journal recovery > by the other node. Thus the new updates will be overwritten and ocfs2 > corrupts. > So in above case we have to return the error, and ocfs2_commit_cache will > take care of the error and prevent the other node to do update first. > And only after recovering journal it can do the new updates. > > The issue discussion mail can be found at: > https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html > http://comments.gmane.org/gmane.comp.file-systems.ext4/48841 > > Reported-by: Yiwen Jiang <jiangyiwen@huawei.com> > Signed-off-by: Joseph Qi <joseph.qi@huawei.com> > Tested-by: Yiwen Jiang <jiangyiwen@huawei.com> > Cc: Junxiao Bi <junxiao.bi@oracle.com> > Cc: <stable@vger.kernel.org> The patch looks good but it seems to be against relatively old kernel version. Can you please rebase your patch against current kernel? Thanks! Honza > --- > fs/jbd2/checkpoint.c | 5 ++--- > fs/jbd2/journal.c | 37 ++++++++++++++++++++++++++++++------- > include/linux/jbd2.h | 4 ++-- > 3 files changed, 34 insertions(+), 12 deletions(-) > > diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c > index 988b32e..82e5b7d 100644 > --- a/fs/jbd2/checkpoint.c > +++ b/fs/jbd2/checkpoint.c > @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) > unsigned long blocknr; > > if (is_journal_aborted(journal)) > - return 1; > + return -EIO; > > if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) > return 1; > @@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) > if (journal->j_flags & JBD2_BARRIER) > blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); > > - __jbd2_update_log_tail(journal, first_tid, blocknr); > - return 0; > + return __jbd2_update_log_tail(journal, first_tid, blocknr); > } > > > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c > index b96bd80..6b33a42 100644 > --- a/fs/jbd2/journal.c > +++ b/fs/jbd2/journal.c > @@ -885,9 +885,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, > * > * Requires j_checkpoint_mutex > */ > -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > { > unsigned long freed; > + int ret; > > BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); > > @@ -897,7 +898,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > * space and if we lose sb update during power failure we'd replay > * old transaction with possibly newly overwritten data. > */ > - jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); > + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); > + if (ret) > + goto out; > + > write_lock(&journal->j_state_lock); > freed = block - journal->j_tail; > if (block < journal->j_tail) > @@ -913,6 +917,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) > journal->j_tail_sequence = tid; > journal->j_tail = block; > write_unlock(&journal->j_state_lock); > + > +out: > + return ret; > } > > /* > @@ -1331,7 +1338,7 @@ static int journal_reset(journal_t *journal) > return jbd2_journal_start_thread(journal); > } > > -static void jbd2_write_superblock(journal_t *journal, int write_op) > +static int jbd2_write_superblock(journal_t *journal, int write_op) > { > struct buffer_head *bh = journal->j_sb_buffer; > journal_superblock_t *sb = journal->j_superblock; > @@ -1370,7 +1377,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) > printk(KERN_ERR "JBD2: Error %d detected when updating " > "journal superblock for %s.\n", ret, > journal->j_devname); > + jbd2_journal_abort(journal, ret); > } > + > + return ret; > } > > /** > @@ -1383,10 +1393,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) > * Update a journal's superblock information about log tail and write it to > * disk, waiting for the IO to complete. > */ > -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, > +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, > unsigned long tail_block, int write_op) > { > journal_superblock_t *sb = journal->j_superblock; > + int ret; > > BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); > jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", > @@ -1395,13 +1406,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, > sb->s_sequence = cpu_to_be32(tail_tid); > sb->s_start = cpu_to_be32(tail_block); > > - jbd2_write_superblock(journal, write_op); > + ret = jbd2_write_superblock(journal, write_op); > + if (ret) > + goto out; > > /* Log is no longer empty */ > write_lock(&journal->j_state_lock); > WARN_ON(!sb->s_sequence); > journal->j_flags &= ~JBD2_FLUSHED; > write_unlock(&journal->j_state_lock); > + > +out: > + return ret; > } > > /** > @@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal) > return -EIO; > > mutex_lock(&journal->j_checkpoint_mutex); > - jbd2_cleanup_journal_tail(journal); > + if (!err) { > + err = jbd2_cleanup_journal_tail(journal); > + if (err < 0) { > + mutex_unlock(&journal->j_checkpoint_mutex); > + goto out; > + } > + } > > /* Finally, mark the journal as really needing no recovery. > * This sets s_start==0 in the underlying superblock, which is > @@ -1966,7 +1988,8 @@ int jbd2_journal_flush(journal_t *journal) > J_ASSERT(journal->j_head == journal->j_tail); > J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); > write_unlock(&journal->j_state_lock); > - return 0; > +out: > + return err; > } > > /** > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h > index 20e7f78..edb640a 100644 > --- a/include/linux/jbd2.h > +++ b/include/linux/jbd2.h > @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); > int jbd2_journal_next_log_block(journal_t *, unsigned long long *); > int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, > unsigned long *block); > -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); > +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); > void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); > > /* Commit management */ > @@ -1157,7 +1157,7 @@ extern int jbd2_journal_recover (journal_t *journal); > extern int jbd2_journal_wipe (journal_t *, int); > extern int jbd2_journal_skip_recovery (journal_t *); > extern void jbd2_journal_update_sb_errno(journal_t *); > -extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, > +extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, > unsigned long, int); > extern void __jbd2_journal_abort_hard (journal_t *); > extern void jbd2_journal_abort (journal_t *, int); > -- > 1.8.4.3 > >
On Mon, Jun 15, 2015 at 02:27:09PM +0800, Joseph Qi wrote: > If updating journal superblock fails after journal data has been flushed, > the error is omitted and this will mislead the caller as a normal case. > In ocfs2, the checkpoint will be treated successfully and the other node > can get the lock to update. Since the sb_start is still pointing to the > old log block, it will rewrite the journal data during journal recovery > by the other node. Thus the new updates will be overwritten and ocfs2 > corrupts. > So in above case we have to return the error, and ocfs2_commit_cache will > take care of the error and prevent the other node to do update first. > And only after recovering journal it can do the new updates. > > The issue discussion mail can be found at: > https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html > http://comments.gmane.org/gmane.comp.file-systems.ext4/48841 > > Reported-by: Yiwen Jiang <jiangyiwen@huawei.com> > Signed-off-by: Joseph Qi <joseph.qi@huawei.com> > Tested-by: Yiwen Jiang <jiangyiwen@huawei.com> > Cc: Junxiao Bi <junxiao.bi@oracle.com> > Cc: <stable@vger.kernel.org> Thanks, applied. - Ted
On Mon, Jun 15, 2015 at 03:08:44PM +0200, Jan Kara wrote: > The patch looks good but it seems to be against relatively old kernel > version. Can you please rebase your patch against current kernel? Thanks! It applied cleanly for me, but there's quite a few commits building up in the ext4.git tree for the next merge cycle (see shortlog below). I'm guessing that perhaps Joseph generated his patch versus the linux-next tree? - Ted Andreas Dilger (1): ext4: improve warning directory handling messages Chao Yu (1): ext4 crypto: release crypto resource on module exit David Moore (1): ext4: BUG_ON assertion repeated for inode1, not done for inode2 Dmitry Monakhov (1): jbd2: use GFP_NOFS in jbd2_cleanup_journal_tail() Fabian Frederick (3): ext4 crypto: fix sparse warnings in fs/ext4/ioctl.c ext4: use swap() in memswap() ext4: use swap() in mext_page_double_lock() Jan Kara (4): jbd2: simplify code flow in do_get_write_access() jbd2: simplify error path on allocation failure in do_get_write_access() jbd2: more simplifications in do_get_write_access() jbd2: speedup jbd2_journal_get_[write|undo]_access() Joseph Qi (1): jbd2: fix ocfs2 corrupt when updating journal superblock fails Lukas Czerner (5): ext4: verify block bitmap even after fresh initialization ext4: try to initialize all groups we can in case of failure on ppc64 ext4: return error code from ext4_mb_good_group() ext4: recalculate journal credits as inode depth changes ext4: wait for existing dio workers in ext4_alloc_file_blocks() Michal Hocko (1): jbd2: revert must-not-fail allocation loops back to GFP_NOFAIL Namjae Jeon (1): ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate Rasmus Villemoes (1): ext4: mballoc: avoid 20-argument function call Theodore Ts'o (23): ext4 crypto: optimize filename encryption ext4 crypto: don't allocate a page when encrypting/decrypting file names ext4 crypto: separate kernel and userspace structure for the key ext4 crypto: reorganize how we store keys in the inode ext4: clean up superblock encryption mode fields ext4 crypto: use slab caches ext4 crypto: get rid of ci_mode from struct ext4_crypt_info ext4 crypto: shrink size of the ext4_crypto_ctx structure ext4 crypto: require CONFIG_CRYPTO_CTR if ext4 encryption is enabled ext4 crypto: use per-inode tfm structure ext4 crypto: fix memory leaks in ext4_encrypted_zeroout ext4 crypto: set up encryption info for new inodes in ext4_inherit_context() ext4 crypto: make sure the encryption info is initialized on opendir(2) ext4 crypto: encrypt tmpfile located in encryption protected directory ext4 crypto: enforce crypto policy restrictions on cross-renames ext4 crypto: policies may only be set on directories ext4 crypto: clean up error handling in ext4_fname_setup_filename ext4 crypto: allocate the right amount of memory for the on-disk symlink ext4 crypto: handle unexpected lack of encryption keys ext4 crypto: allocate bounce pages using GFP_NOWAIT ext4 crypto: fix ext4_get_crypto_ctx()'s calling convention in ext4_decrypt_one ext4 crypto: fail the mount if blocksize != pagesize ext4: fix race between truncate and __ext4_journalled_writepage()
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 988b32e..82e5b7d 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) unsigned long blocknr; if (is_journal_aborted(journal)) - return 1; + return -EIO; if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; @@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); - __jbd2_update_log_tail(journal, first_tid, blocknr); - return 0; + return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b96bd80..6b33a42 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -885,9 +885,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, * * Requires j_checkpoint_mutex */ -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { unsigned long freed; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); @@ -897,7 +898,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + if (ret) + goto out; + write_lock(&journal->j_state_lock); freed = block - journal->j_tail; if (block < journal->j_tail) @@ -913,6 +917,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) journal->j_tail_sequence = tid; journal->j_tail = block; write_unlock(&journal->j_state_lock); + +out: + return ret; } /* @@ -1331,7 +1338,7 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -static void jbd2_write_superblock(journal_t *journal, int write_op) +static int jbd2_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; @@ -1370,7 +1377,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) printk(KERN_ERR "JBD2: Error %d detected when updating " "journal superblock for %s.\n", ret, journal->j_devname); + jbd2_journal_abort(journal, ret); } + + return ret; } /** @@ -1383,10 +1393,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. */ -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, unsigned long tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1395,13 +1406,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); - jbd2_write_superblock(journal, write_op); + ret = jbd2_write_superblock(journal, write_op); + if (ret) + goto out; /* Log is no longer empty */ write_lock(&journal->j_state_lock); WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); + +out: + return ret; } /** @@ -1950,7 +1966,13 @@ int jbd2_journal_flush(journal_t *journal) return -EIO; mutex_lock(&journal->j_checkpoint_mutex); - jbd2_cleanup_journal_tail(journal); + if (!err) { + err = jbd2_cleanup_journal_tail(journal); + if (err < 0) { + mutex_unlock(&journal->j_checkpoint_mutex); + goto out; + } + } /* Finally, mark the journal as really needing no recovery. * This sets s_start==0 in the underlying superblock, which is @@ -1966,7 +1988,8 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); write_unlock(&journal->j_state_lock); - return 0; +out: + return err; } /** diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 20e7f78..edb640a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); int jbd2_journal_next_log_block(journal_t *, unsigned long long *); int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, unsigned long *block); -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); /* Commit management */ @@ -1157,7 +1157,7 @@ extern int jbd2_journal_recover (journal_t *journal); extern int jbd2_journal_wipe (journal_t *, int); extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); -extern void jbd2_journal_update_sb_log_tail (journal_t *, tid_t, +extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); extern void __jbd2_journal_abort_hard (journal_t *); extern void jbd2_journal_abort (journal_t *, int);