diff mbox

ocfs2: fix deadlock due to wrong locking order

Message ID 1410313732-27840-1-git-send-email-junxiao.bi@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Junxiao Bi Sept. 10, 2014, 1:48 a.m. UTC
For commit ocfs2 journal, ocfs2 journal thread will acquire the mutex
osb->journal->j_trans_barrier and wake up jbd2 commit thread, then it
will wait until jbd2 commit thread done. In order journal mode, jbd2
needs flushing dirty data pages first, and this needs get page lock.
So osb->journal->j_trans_barrier should be got before page lock.

But ocfs2_write_zero_page() and ocfs2_write_begin_inline() obey this
locking order, and this will cause deadlock and hung the whole cluster.

One deadlock catched is the following:

PID: 13449  TASK: ffff8802e2f08180  CPU: 31  COMMAND: "oracle"
 #0 [ffff8802ee3f79b0] __schedule at ffffffff8150a524
 #1 [ffff8802ee3f7a58] schedule at ffffffff8150acbf
 #2 [ffff8802ee3f7a68] rwsem_down_failed_common at ffffffff8150cb85
 #3 [ffff8802ee3f7ad8] rwsem_down_read_failed at ffffffff8150cc55
 #4 [ffff8802ee3f7ae8] call_rwsem_down_read_failed at ffffffff812617a4
 #5 [ffff8802ee3f7b50] ocfs2_start_trans at ffffffffa0498919 [ocfs2]
 #6 [ffff8802ee3f7ba0] ocfs2_zero_start_ordered_transaction at ffffffffa048b2b8 [ocfs2]
 #7 [ffff8802ee3f7bf0] ocfs2_write_zero_page at ffffffffa048e9bd [ocfs2]
 #8 [ffff8802ee3f7c80] ocfs2_zero_extend_range at ffffffffa048ec83 [ocfs2]
 #9 [ffff8802ee3f7ce0] ocfs2_zero_extend at ffffffffa048edfd [ocfs2]
 #10 [ffff8802ee3f7d50] ocfs2_extend_file at ffffffffa049079e [ocfs2]
 #11 [ffff8802ee3f7da0] ocfs2_setattr at ffffffffa04910ed [ocfs2]
 #12 [ffff8802ee3f7e70] notify_change at ffffffff81187d29
 #13 [ffff8802ee3f7ee0] do_truncate at ffffffff8116bbc1
 #14 [ffff8802ee3f7f50] sys_ftruncate at ffffffff8116bcbd
 #15 [ffff8802ee3f7f80] system_call_fastpath at ffffffff81515142
    RIP: 00007f8de750c6f7  RSP: 00007fffe786e478  RFLAGS: 00000206
    RAX: 000000000000004d  RBX: ffffffff81515142  RCX: 0000000000000000
    RDX: 0000000000000200  RSI: 0000000000028400  RDI: 000000000000000d
    RBP: 00007fffe786e040   R8: 0000000000000000   R9: 000000000000000d
    R10: 0000000000000000  R11: 0000000000000206  R12: 000000000000000d
    R13: 00007fffe786e710  R14: 00007f8de70f8340  R15: 0000000000028400
    ORIG_RAX: 000000000000004d  CS: 0033  SS: 002b

crash64> bt
PID: 7610   TASK: ffff88100fd56140  CPU: 1   COMMAND: "ocfs2cmt"
 #0 [ffff88100f4d1c50] __schedule at ffffffff8150a524
 #1 [ffff88100f4d1cf8] schedule at ffffffff8150acbf
 #2 [ffff88100f4d1d08] jbd2_log_wait_commit at ffffffffa01274fd [jbd2]
 #3 [ffff88100f4d1d98] jbd2_journal_flush at ffffffffa01280b4 [jbd2]
 #4 [ffff88100f4d1dd8] ocfs2_commit_cache at ffffffffa0499b14 [ocfs2]
 #5 [ffff88100f4d1e38] ocfs2_commit_thread at ffffffffa0499d38 [ocfs2]
 #6 [ffff88100f4d1ee8] kthread at ffffffff81090db6
 #7 [ffff88100f4d1f48] kernel_thread_helper at ffffffff81516284

crash64> bt
PID: 7609   TASK: ffff88100f2d4480  CPU: 0   COMMAND: "jbd2/dm-20-86"
 #0 [ffff88100def3920] __schedule at ffffffff8150a524
 #1 [ffff88100def39c8] schedule at ffffffff8150acbf
 #2 [ffff88100def39d8] io_schedule at ffffffff8150ad6c
 #3 [ffff88100def39f8] sleep_on_page at ffffffff8111069e
 #4 [ffff88100def3a08] __wait_on_bit_lock at ffffffff8150b30a
 #5 [ffff88100def3a58] __lock_page at ffffffff81110687
 #6 [ffff88100def3ab8] write_cache_pages at ffffffff8111b752
 #7 [ffff88100def3be8] generic_writepages at ffffffff8111b901
 #8 [ffff88100def3c48] journal_submit_data_buffers at ffffffffa0120f67 [jbd2]
 #9 [ffff88100def3cf8] jbd2_journal_commit_transaction at ffffffffa0121372[jbd2]
 #10 [ffff88100def3e68] kjournald2 at ffffffffa0127a86 [jbd2]
 #11 [ffff88100def3ee8] kthread at ffffffff81090db6
 #12 [ffff88100def3f48] kernel_thread_helper at ffffffff81516284

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
---
 fs/ocfs2/aops.c |   15 ++++++++-------
 fs/ocfs2/file.c |   52 ++++++++++++++++++++++++----------------------------
 2 files changed, 32 insertions(+), 35 deletions(-)

Comments

Alex Chen Sept. 24, 2014, 8:49 a.m. UTC | #1
Hi, Junxiao:

On 2014/9/10 9:48, Junxiao Bi wrote:
> For commit ocfs2 journal, ocfs2 journal thread will acquire the mutex
> osb->journal->j_trans_barrier and wake up jbd2 commit thread, then it
> will wait until jbd2 commit thread done. In order journal mode, jbd2
> needs flushing dirty data pages first, and this needs get page lock.
> So osb->journal->j_trans_barrier should be got before page lock.
> 
> But ocfs2_write_zero_page() and ocfs2_write_begin_inline() obey this
> locking order, and this will cause deadlock and hung the whole cluster.
> 
> One deadlock catched is the following:
> 
> PID: 13449  TASK: ffff8802e2f08180  CPU: 31  COMMAND: "oracle"
>  #0 [ffff8802ee3f79b0] __schedule at ffffffff8150a524
>  #1 [ffff8802ee3f7a58] schedule at ffffffff8150acbf
>  #2 [ffff8802ee3f7a68] rwsem_down_failed_common at ffffffff8150cb85
>  #3 [ffff8802ee3f7ad8] rwsem_down_read_failed at ffffffff8150cc55
>  #4 [ffff8802ee3f7ae8] call_rwsem_down_read_failed at ffffffff812617a4
>  #5 [ffff8802ee3f7b50] ocfs2_start_trans at ffffffffa0498919 [ocfs2]
>  #6 [ffff8802ee3f7ba0] ocfs2_zero_start_ordered_transaction at ffffffffa048b2b8 [ocfs2]
>  #7 [ffff8802ee3f7bf0] ocfs2_write_zero_page at ffffffffa048e9bd [ocfs2]
>  #8 [ffff8802ee3f7c80] ocfs2_zero_extend_range at ffffffffa048ec83 [ocfs2]
>  #9 [ffff8802ee3f7ce0] ocfs2_zero_extend at ffffffffa048edfd [ocfs2]
>  #10 [ffff8802ee3f7d50] ocfs2_extend_file at ffffffffa049079e [ocfs2]
>  #11 [ffff8802ee3f7da0] ocfs2_setattr at ffffffffa04910ed [ocfs2]
>  #12 [ffff8802ee3f7e70] notify_change at ffffffff81187d29
>  #13 [ffff8802ee3f7ee0] do_truncate at ffffffff8116bbc1
>  #14 [ffff8802ee3f7f50] sys_ftruncate at ffffffff8116bcbd
>  #15 [ffff8802ee3f7f80] system_call_fastpath at ffffffff81515142
>     RIP: 00007f8de750c6f7  RSP: 00007fffe786e478  RFLAGS: 00000206
>     RAX: 000000000000004d  RBX: ffffffff81515142  RCX: 0000000000000000
>     RDX: 0000000000000200  RSI: 0000000000028400  RDI: 000000000000000d
>     RBP: 00007fffe786e040   R8: 0000000000000000   R9: 000000000000000d
>     R10: 0000000000000000  R11: 0000000000000206  R12: 000000000000000d
>     R13: 00007fffe786e710  R14: 00007f8de70f8340  R15: 0000000000028400
>     ORIG_RAX: 000000000000004d  CS: 0033  SS: 002b
> 
> crash64> bt
> PID: 7610   TASK: ffff88100fd56140  CPU: 1   COMMAND: "ocfs2cmt"
>  #0 [ffff88100f4d1c50] __schedule at ffffffff8150a524
>  #1 [ffff88100f4d1cf8] schedule at ffffffff8150acbf
>  #2 [ffff88100f4d1d08] jbd2_log_wait_commit at ffffffffa01274fd [jbd2]
>  #3 [ffff88100f4d1d98] jbd2_journal_flush at ffffffffa01280b4 [jbd2]
>  #4 [ffff88100f4d1dd8] ocfs2_commit_cache at ffffffffa0499b14 [ocfs2]
>  #5 [ffff88100f4d1e38] ocfs2_commit_thread at ffffffffa0499d38 [ocfs2]
>  #6 [ffff88100f4d1ee8] kthread at ffffffff81090db6
>  #7 [ffff88100f4d1f48] kernel_thread_helper at ffffffff81516284
> 
> crash64> bt
> PID: 7609   TASK: ffff88100f2d4480  CPU: 0   COMMAND: "jbd2/dm-20-86"
>  #0 [ffff88100def3920] __schedule at ffffffff8150a524
>  #1 [ffff88100def39c8] schedule at ffffffff8150acbf
>  #2 [ffff88100def39d8] io_schedule at ffffffff8150ad6c
>  #3 [ffff88100def39f8] sleep_on_page at ffffffff8111069e
>  #4 [ffff88100def3a08] __wait_on_bit_lock at ffffffff8150b30a
>  #5 [ffff88100def3a58] __lock_page at ffffffff81110687
>  #6 [ffff88100def3ab8] write_cache_pages at ffffffff8111b752
>  #7 [ffff88100def3be8] generic_writepages at ffffffff8111b901
>  #8 [ffff88100def3c48] journal_submit_data_buffers at ffffffffa0120f67 [jbd2]
>  #9 [ffff88100def3cf8] jbd2_journal_commit_transaction at ffffffffa0121372[jbd2]
>  #10 [ffff88100def3e68] kjournald2 at ffffffffa0127a86 [jbd2]
>  #11 [ffff88100def3ee8] kthread at ffffffff81090db6
>  #12 [ffff88100def3f48] kernel_thread_helper at ffffffff81516284
> 
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> ---
>  fs/ocfs2/aops.c |   15 ++++++++-------
>  fs/ocfs2/file.c |   52 ++++++++++++++++++++++++----------------------------
>  2 files changed, 32 insertions(+), 35 deletions(-)
> 
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index c04183b..c71174a 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -1485,8 +1485,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>  	handle_t *handle;
>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
>  
> +	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
>  	page = find_or_create_page(mapping, 0, GFP_NOFS);
>  	if (!page) {
> +		ocfs2_commit_trans(osb, handle);
>  		ret = -ENOMEM;
>  		mlog_errno(ret);
>  		goto out;
> @@ -1498,13 +1506,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>  	wc->w_pages[0] = wc->w_target_page = page;
>  	wc->w_num_pages = 1;
>  
> -	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> -	if (IS_ERR(handle)) {
> -		ret = PTR_ERR(handle);
> -		mlog_errno(ret);
> -		goto out;
> -	}
> -
>  	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
>  				      OCFS2_JOURNAL_ACCESS_WRITE);
>  	if (ret) {
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 2930e23..c534cb0 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  	struct address_space *mapping = inode->i_mapping;
>  	struct page *page;
>  	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
> -	handle_t *handle = NULL;
> +	handle_t *handle;
>  	int ret = 0;
>  	unsigned zero_from, zero_to, block_start, block_end;
>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
> @@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
>  	BUG_ON(abs_from & (inode->i_blkbits - 1));
>  
> +	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		goto out;
> +	}
> +

In ocfs2_zero_start_ordered_transaction, the inode will be filed into
inode list of transaction by ocfs2_jbd2_file_inode.
The relationship between ocfs2_jbd2_file_inode and lock_page as
follow:
	find_or_create_page
		->lock_page
	ocfs2_zero_start_ordered_transaction
		->ocfs2_jbd2_file_inode
	unlock_page

But in your patch, the inode will be filed into inode list before the
lock_page, is there a problem?

Thanks.
	--Alex

>  	page = find_or_create_page(mapping, index, GFP_NOFS);
>  	if (!page) {
>  		ret = -ENOMEM;
>  		mlog_errno(ret);
> -		goto out;
> +		goto out_commit_trans;
>  	}
>  
>  	/* Get the offsets within the page that we want to zero */
> @@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  			goto out_unlock;
>  		}
>  
> -		if (!handle) {
> -			handle = ocfs2_zero_start_ordered_transaction(inode,
> -								      di_bh);
> -			if (IS_ERR(handle)) {
> -				ret = PTR_ERR(handle);
> -				handle = NULL;
> -				break;
> -			}
> -		}
>  
>  		/* must not update i_size! */
>  		ret = block_commit_write(page, block_start + 1,
> @@ -824,27 +821,26 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  			ret = 0;
>  	}
>  
> -	if (handle) {
> -		/*
> -		 * fs-writeback will release the dirty pages without page lock
> -		 * whose offset are over inode size, the release happens at
> -		 * block_write_full_page().
> -		 */
> -		i_size_write(inode, abs_to);
> -		inode->i_blocks = ocfs2_inode_sector_count(inode);
> -		di->i_size = cpu_to_le64((u64)i_size_read(inode));
> -		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> -		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
> -		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
> -		di->i_mtime_nsec = di->i_ctime_nsec;
> -		ocfs2_journal_dirty(handle, di_bh);
> -		ocfs2_update_inode_fsync_trans(handle, inode, 1);
> -		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
> -	}
> +	/*
> +	 * fs-writeback will release the dirty pages without page lock
> +	 * whose offset are over inode size, the release happens at
> +	 * block_write_full_page().
> +	 */
> +	i_size_write(inode, abs_to);
> +	inode->i_blocks = ocfs2_inode_sector_count(inode);
> +	di->i_size = cpu_to_le64((u64)i_size_read(inode));
> +	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> +	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
> +	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
> +	di->i_mtime_nsec = di->i_ctime_nsec;
> +	ocfs2_journal_dirty(handle, di_bh);
> +	ocfs2_update_inode_fsync_trans(handle, inode, 1);
>  
>  out_unlock:
>  	unlock_page(page);
>  	page_cache_release(page);
> +out_commit_trans:
> +	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
>  out:
>  	return ret;
>  }
>
Alex Chen Sept. 24, 2014, 9:41 a.m. UTC | #2
Hi Junxiao,

On 2014/9/10 9:48, Junxiao Bi wrote:
> For commit ocfs2 journal, ocfs2 journal thread will acquire the mutex
> osb->journal->j_trans_barrier and wake up jbd2 commit thread, then it
> will wait until jbd2 commit thread done. In order journal mode, jbd2
> needs flushing dirty data pages first, and this needs get page lock.
> So osb->journal->j_trans_barrier should be got before page lock.
> 
> But ocfs2_write_zero_page() and ocfs2_write_begin_inline() obey this
> locking order, and this will cause deadlock and hung the whole cluster.
> 
> One deadlock catched is the following:
> 
> PID: 13449  TASK: ffff8802e2f08180  CPU: 31  COMMAND: "oracle"
>  #0 [ffff8802ee3f79b0] __schedule at ffffffff8150a524
>  #1 [ffff8802ee3f7a58] schedule at ffffffff8150acbf
>  #2 [ffff8802ee3f7a68] rwsem_down_failed_common at ffffffff8150cb85
>  #3 [ffff8802ee3f7ad8] rwsem_down_read_failed at ffffffff8150cc55
>  #4 [ffff8802ee3f7ae8] call_rwsem_down_read_failed at ffffffff812617a4
>  #5 [ffff8802ee3f7b50] ocfs2_start_trans at ffffffffa0498919 [ocfs2]
>  #6 [ffff8802ee3f7ba0] ocfs2_zero_start_ordered_transaction at ffffffffa048b2b8 [ocfs2]
>  #7 [ffff8802ee3f7bf0] ocfs2_write_zero_page at ffffffffa048e9bd [ocfs2]
>  #8 [ffff8802ee3f7c80] ocfs2_zero_extend_range at ffffffffa048ec83 [ocfs2]
>  #9 [ffff8802ee3f7ce0] ocfs2_zero_extend at ffffffffa048edfd [ocfs2]
>  #10 [ffff8802ee3f7d50] ocfs2_extend_file at ffffffffa049079e [ocfs2]
>  #11 [ffff8802ee3f7da0] ocfs2_setattr at ffffffffa04910ed [ocfs2]
>  #12 [ffff8802ee3f7e70] notify_change at ffffffff81187d29
>  #13 [ffff8802ee3f7ee0] do_truncate at ffffffff8116bbc1
>  #14 [ffff8802ee3f7f50] sys_ftruncate at ffffffff8116bcbd
>  #15 [ffff8802ee3f7f80] system_call_fastpath at ffffffff81515142
>     RIP: 00007f8de750c6f7  RSP: 00007fffe786e478  RFLAGS: 00000206
>     RAX: 000000000000004d  RBX: ffffffff81515142  RCX: 0000000000000000
>     RDX: 0000000000000200  RSI: 0000000000028400  RDI: 000000000000000d
>     RBP: 00007fffe786e040   R8: 0000000000000000   R9: 000000000000000d
>     R10: 0000000000000000  R11: 0000000000000206  R12: 000000000000000d
>     R13: 00007fffe786e710  R14: 00007f8de70f8340  R15: 0000000000028400
>     ORIG_RAX: 000000000000004d  CS: 0033  SS: 002b
> 
> crash64> bt
> PID: 7610   TASK: ffff88100fd56140  CPU: 1   COMMAND: "ocfs2cmt"
>  #0 [ffff88100f4d1c50] __schedule at ffffffff8150a524
>  #1 [ffff88100f4d1cf8] schedule at ffffffff8150acbf
>  #2 [ffff88100f4d1d08] jbd2_log_wait_commit at ffffffffa01274fd [jbd2]
>  #3 [ffff88100f4d1d98] jbd2_journal_flush at ffffffffa01280b4 [jbd2]
>  #4 [ffff88100f4d1dd8] ocfs2_commit_cache at ffffffffa0499b14 [ocfs2]
>  #5 [ffff88100f4d1e38] ocfs2_commit_thread at ffffffffa0499d38 [ocfs2]
>  #6 [ffff88100f4d1ee8] kthread at ffffffff81090db6
>  #7 [ffff88100f4d1f48] kernel_thread_helper at ffffffff81516284
> 
> crash64> bt
> PID: 7609   TASK: ffff88100f2d4480  CPU: 0   COMMAND: "jbd2/dm-20-86"
>  #0 [ffff88100def3920] __schedule at ffffffff8150a524
>  #1 [ffff88100def39c8] schedule at ffffffff8150acbf
>  #2 [ffff88100def39d8] io_schedule at ffffffff8150ad6c
>  #3 [ffff88100def39f8] sleep_on_page at ffffffff8111069e
>  #4 [ffff88100def3a08] __wait_on_bit_lock at ffffffff8150b30a
>  #5 [ffff88100def3a58] __lock_page at ffffffff81110687
>  #6 [ffff88100def3ab8] write_cache_pages at ffffffff8111b752
>  #7 [ffff88100def3be8] generic_writepages at ffffffff8111b901
>  #8 [ffff88100def3c48] journal_submit_data_buffers at ffffffffa0120f67 [jbd2]
>  #9 [ffff88100def3cf8] jbd2_journal_commit_transaction at ffffffffa0121372[jbd2]
>  #10 [ffff88100def3e68] kjournald2 at ffffffffa0127a86 [jbd2]
>  #11 [ffff88100def3ee8] kthread at ffffffff81090db6
>  #12 [ffff88100def3f48] kernel_thread_helper at ffffffff81516284
> 
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> ---
>  fs/ocfs2/aops.c |   15 ++++++++-------
>  fs/ocfs2/file.c |   52 ++++++++++++++++++++++++----------------------------
>  2 files changed, 32 insertions(+), 35 deletions(-)
> 
> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
> index c04183b..c71174a 100644
> --- a/fs/ocfs2/aops.c
> +++ b/fs/ocfs2/aops.c
> @@ -1485,8 +1485,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>  	handle_t *handle;
>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
>  
> +	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		mlog_errno(ret);
> +		goto out;
> +	}
> +
>  	page = find_or_create_page(mapping, 0, GFP_NOFS);
>  	if (!page) {
> +		ocfs2_commit_trans(osb, handle);
>  		ret = -ENOMEM;
>  		mlog_errno(ret);
>  		goto out;
> @@ -1498,13 +1506,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>  	wc->w_pages[0] = wc->w_target_page = page;
>  	wc->w_num_pages = 1;
>  
> -	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
> -	if (IS_ERR(handle)) {
> -		ret = PTR_ERR(handle);
> -		mlog_errno(ret);
> -		goto out;
> -	}
> -
>  	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
>  				      OCFS2_JOURNAL_ACCESS_WRITE);
>  	if (ret) {
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 2930e23..c534cb0 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  	struct address_space *mapping = inode->i_mapping;
>  	struct page *page;
>  	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
> -	handle_t *handle = NULL;
> +	handle_t *handle;
>  	int ret = 0;
>  	unsigned zero_from, zero_to, block_start, block_end;
>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
> @@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
>  	BUG_ON(abs_from & (inode->i_blkbits - 1));
>  
> +	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
> +	if (IS_ERR(handle)) {
> +		ret = PTR_ERR(handle);
> +		goto out;
> +	}
> +

The ocfs2_zero_start_ordered_transaction may return NULL.

>  	page = find_or_create_page(mapping, index, GFP_NOFS);
>  	if (!page) {
>  		ret = -ENOMEM;
>  		mlog_errno(ret);
> -		goto out;
> +		goto out_commit_trans;
>  	}
>  
>  	/* Get the offsets within the page that we want to zero */
> @@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  			goto out_unlock;
>  		}
>  
> -		if (!handle) {
> -			handle = ocfs2_zero_start_ordered_transaction(inode,
> -								      di_bh);
> -			if (IS_ERR(handle)) {
> -				ret = PTR_ERR(handle);
> -				handle = NULL;
> -				break;
> -			}
> -		}
>  
>  		/* must not update i_size! */
>  		ret = block_commit_write(page, block_start + 1,
> @@ -824,27 +821,26 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>  			ret = 0;
>  	}
>  
> -	if (handle) {
> -		/*
> -		 * fs-writeback will release the dirty pages without page lock
> -		 * whose offset are over inode size, the release happens at
> -		 * block_write_full_page().
> -		 */
> -		i_size_write(inode, abs_to);
> -		inode->i_blocks = ocfs2_inode_sector_count(inode);
> -		di->i_size = cpu_to_le64((u64)i_size_read(inode));
> -		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> -		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
> -		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
> -		di->i_mtime_nsec = di->i_ctime_nsec;
> -		ocfs2_journal_dirty(handle, di_bh);
> -		ocfs2_update_inode_fsync_trans(handle, inode, 1);
> -		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
> -	}
> +	/*
> +	 * fs-writeback will release the dirty pages without page lock
> +	 * whose offset are over inode size, the release happens at
> +	 * block_write_full_page().
> +	 */
> +	i_size_write(inode, abs_to);
> +	inode->i_blocks = ocfs2_inode_sector_count(inode);
> +	di->i_size = cpu_to_le64((u64)i_size_read(inode));
> +	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> +	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
> +	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
> +	di->i_mtime_nsec = di->i_ctime_nsec;
> +	ocfs2_journal_dirty(handle, di_bh);
> +	ocfs2_update_inode_fsync_trans(handle, inode, 1);

if ocfs2_zero_start_ordered_transaction return NULL, here will BUG.
so should judge if handle is NULL.

>  
>  out_unlock:
>  	unlock_page(page);
>  	page_cache_release(page);
> +out_commit_trans:
> +	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);

if ocfs2_zero_start_ordered_transaction return NULL, here will BUG.
so should judge if handle is NULL.

>  out:
>  	return ret;
>  }
>
Junxiao Bi Sept. 25, 2014, 6:41 a.m. UTC | #3
On 09/24/2014 04:49 PM, alex chen wrote:
> Hi, Junxiao:
> 
> On 2014/9/10 9:48, Junxiao Bi wrote:
>> For commit ocfs2 journal, ocfs2 journal thread will acquire the mutex
>> osb->journal->j_trans_barrier and wake up jbd2 commit thread, then it
>> will wait until jbd2 commit thread done. In order journal mode, jbd2
>> needs flushing dirty data pages first, and this needs get page lock.
>> So osb->journal->j_trans_barrier should be got before page lock.
>>
>> But ocfs2_write_zero_page() and ocfs2_write_begin_inline() obey this
>> locking order, and this will cause deadlock and hung the whole cluster.
>>
>> One deadlock catched is the following:
>>
>> PID: 13449  TASK: ffff8802e2f08180  CPU: 31  COMMAND: "oracle"
>>  #0 [ffff8802ee3f79b0] __schedule at ffffffff8150a524
>>  #1 [ffff8802ee3f7a58] schedule at ffffffff8150acbf
>>  #2 [ffff8802ee3f7a68] rwsem_down_failed_common at ffffffff8150cb85
>>  #3 [ffff8802ee3f7ad8] rwsem_down_read_failed at ffffffff8150cc55
>>  #4 [ffff8802ee3f7ae8] call_rwsem_down_read_failed at ffffffff812617a4
>>  #5 [ffff8802ee3f7b50] ocfs2_start_trans at ffffffffa0498919 [ocfs2]
>>  #6 [ffff8802ee3f7ba0] ocfs2_zero_start_ordered_transaction at ffffffffa048b2b8 [ocfs2]
>>  #7 [ffff8802ee3f7bf0] ocfs2_write_zero_page at ffffffffa048e9bd [ocfs2]
>>  #8 [ffff8802ee3f7c80] ocfs2_zero_extend_range at ffffffffa048ec83 [ocfs2]
>>  #9 [ffff8802ee3f7ce0] ocfs2_zero_extend at ffffffffa048edfd [ocfs2]
>>  #10 [ffff8802ee3f7d50] ocfs2_extend_file at ffffffffa049079e [ocfs2]
>>  #11 [ffff8802ee3f7da0] ocfs2_setattr at ffffffffa04910ed [ocfs2]
>>  #12 [ffff8802ee3f7e70] notify_change at ffffffff81187d29
>>  #13 [ffff8802ee3f7ee0] do_truncate at ffffffff8116bbc1
>>  #14 [ffff8802ee3f7f50] sys_ftruncate at ffffffff8116bcbd
>>  #15 [ffff8802ee3f7f80] system_call_fastpath at ffffffff81515142
>>     RIP: 00007f8de750c6f7  RSP: 00007fffe786e478  RFLAGS: 00000206
>>     RAX: 000000000000004d  RBX: ffffffff81515142  RCX: 0000000000000000
>>     RDX: 0000000000000200  RSI: 0000000000028400  RDI: 000000000000000d
>>     RBP: 00007fffe786e040   R8: 0000000000000000   R9: 000000000000000d
>>     R10: 0000000000000000  R11: 0000000000000206  R12: 000000000000000d
>>     R13: 00007fffe786e710  R14: 00007f8de70f8340  R15: 0000000000028400
>>     ORIG_RAX: 000000000000004d  CS: 0033  SS: 002b
>>
>> crash64> bt
>> PID: 7610   TASK: ffff88100fd56140  CPU: 1   COMMAND: "ocfs2cmt"
>>  #0 [ffff88100f4d1c50] __schedule at ffffffff8150a524
>>  #1 [ffff88100f4d1cf8] schedule at ffffffff8150acbf
>>  #2 [ffff88100f4d1d08] jbd2_log_wait_commit at ffffffffa01274fd [jbd2]
>>  #3 [ffff88100f4d1d98] jbd2_journal_flush at ffffffffa01280b4 [jbd2]
>>  #4 [ffff88100f4d1dd8] ocfs2_commit_cache at ffffffffa0499b14 [ocfs2]
>>  #5 [ffff88100f4d1e38] ocfs2_commit_thread at ffffffffa0499d38 [ocfs2]
>>  #6 [ffff88100f4d1ee8] kthread at ffffffff81090db6
>>  #7 [ffff88100f4d1f48] kernel_thread_helper at ffffffff81516284
>>
>> crash64> bt
>> PID: 7609   TASK: ffff88100f2d4480  CPU: 0   COMMAND: "jbd2/dm-20-86"
>>  #0 [ffff88100def3920] __schedule at ffffffff8150a524
>>  #1 [ffff88100def39c8] schedule at ffffffff8150acbf
>>  #2 [ffff88100def39d8] io_schedule at ffffffff8150ad6c
>>  #3 [ffff88100def39f8] sleep_on_page at ffffffff8111069e
>>  #4 [ffff88100def3a08] __wait_on_bit_lock at ffffffff8150b30a
>>  #5 [ffff88100def3a58] __lock_page at ffffffff81110687
>>  #6 [ffff88100def3ab8] write_cache_pages at ffffffff8111b752
>>  #7 [ffff88100def3be8] generic_writepages at ffffffff8111b901
>>  #8 [ffff88100def3c48] journal_submit_data_buffers at ffffffffa0120f67 [jbd2]
>>  #9 [ffff88100def3cf8] jbd2_journal_commit_transaction at ffffffffa0121372[jbd2]
>>  #10 [ffff88100def3e68] kjournald2 at ffffffffa0127a86 [jbd2]
>>  #11 [ffff88100def3ee8] kthread at ffffffff81090db6
>>  #12 [ffff88100def3f48] kernel_thread_helper at ffffffff81516284
>>
>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>> ---
>>  fs/ocfs2/aops.c |   15 ++++++++-------
>>  fs/ocfs2/file.c |   52 ++++++++++++++++++++++++----------------------------
>>  2 files changed, 32 insertions(+), 35 deletions(-)
>>
>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
>> index c04183b..c71174a 100644
>> --- a/fs/ocfs2/aops.c
>> +++ b/fs/ocfs2/aops.c
>> @@ -1485,8 +1485,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>>  	handle_t *handle;
>>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
>>  
>> +	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
>> +	if (IS_ERR(handle)) {
>> +		ret = PTR_ERR(handle);
>> +		mlog_errno(ret);
>> +		goto out;
>> +	}
>> +
>>  	page = find_or_create_page(mapping, 0, GFP_NOFS);
>>  	if (!page) {
>> +		ocfs2_commit_trans(osb, handle);
>>  		ret = -ENOMEM;
>>  		mlog_errno(ret);
>>  		goto out;
>> @@ -1498,13 +1506,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
>>  	wc->w_pages[0] = wc->w_target_page = page;
>>  	wc->w_num_pages = 1;
>>  
>> -	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
>> -	if (IS_ERR(handle)) {
>> -		ret = PTR_ERR(handle);
>> -		mlog_errno(ret);
>> -		goto out;
>> -	}
>> -
>>  	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
>>  				      OCFS2_JOURNAL_ACCESS_WRITE);
>>  	if (ret) {
>> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
>> index 2930e23..c534cb0 100644
>> --- a/fs/ocfs2/file.c
>> +++ b/fs/ocfs2/file.c
>> @@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>>  	struct address_space *mapping = inode->i_mapping;
>>  	struct page *page;
>>  	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
>> -	handle_t *handle = NULL;
>> +	handle_t *handle;
>>  	int ret = 0;
>>  	unsigned zero_from, zero_to, block_start, block_end;
>>  	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
>> @@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>>  	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
>>  	BUG_ON(abs_from & (inode->i_blkbits - 1));
>>  
>> +	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
>> +	if (IS_ERR(handle)) {
>> +		ret = PTR_ERR(handle);
>> +		goto out;
>> +	}
>> +
> 
> In ocfs2_zero_start_ordered_transaction, the inode will be filed into
> inode list of transaction by ocfs2_jbd2_file_inode.
> The relationship between ocfs2_jbd2_file_inode and lock_page as
> follow:
> 	find_or_create_page
> 		->lock_page
> 	ocfs2_zero_start_ordered_transaction
> 		->ocfs2_jbd2_file_inode
> 	unlock_page
> 
> But in your patch, the inode will be filed into inode list before the
> lock_page, is there a problem?

Could you detail more about what harm will be after changing the order?

Thanks,
Junxiao.
> 
> Thanks.
> 	--Alex
> 
>>  	page = find_or_create_page(mapping, index, GFP_NOFS);
>>  	if (!page) {
>>  		ret = -ENOMEM;
>>  		mlog_errno(ret);
>> -		goto out;
>> +		goto out_commit_trans;
>>  	}
>>  
>>  	/* Get the offsets within the page that we want to zero */
>> @@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>>  			goto out_unlock;
>>  		}
>>  
>> -		if (!handle) {
>> -			handle = ocfs2_zero_start_ordered_transaction(inode,
>> -								      di_bh);
>> -			if (IS_ERR(handle)) {
>> -				ret = PTR_ERR(handle);
>> -				handle = NULL;
>> -				break;
>> -			}
>> -		}
>>  
>>  		/* must not update i_size! */
>>  		ret = block_commit_write(page, block_start + 1,
>> @@ -824,27 +821,26 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
>>  			ret = 0;
>>  	}
>>  
>> -	if (handle) {
>> -		/*
>> -		 * fs-writeback will release the dirty pages without page lock
>> -		 * whose offset are over inode size, the release happens at
>> -		 * block_write_full_page().
>> -		 */
>> -		i_size_write(inode, abs_to);
>> -		inode->i_blocks = ocfs2_inode_sector_count(inode);
>> -		di->i_size = cpu_to_le64((u64)i_size_read(inode));
>> -		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
>> -		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
>> -		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
>> -		di->i_mtime_nsec = di->i_ctime_nsec;
>> -		ocfs2_journal_dirty(handle, di_bh);
>> -		ocfs2_update_inode_fsync_trans(handle, inode, 1);
>> -		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
>> -	}
>> +	/*
>> +	 * fs-writeback will release the dirty pages without page lock
>> +	 * whose offset are over inode size, the release happens at
>> +	 * block_write_full_page().
>> +	 */
>> +	i_size_write(inode, abs_to);
>> +	inode->i_blocks = ocfs2_inode_sector_count(inode);
>> +	di->i_size = cpu_to_le64((u64)i_size_read(inode));
>> +	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
>> +	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
>> +	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
>> +	di->i_mtime_nsec = di->i_ctime_nsec;
>> +	ocfs2_journal_dirty(handle, di_bh);
>> +	ocfs2_update_inode_fsync_trans(handle, inode, 1);
>>  
>>  out_unlock:
>>  	unlock_page(page);
>>  	page_cache_release(page);
>> +out_commit_trans:
>> +	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
>>  out:
>>  	return ret;
>>  }
>>
>
diff mbox

Patch

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c04183b..c71174a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1485,8 +1485,16 @@  static int ocfs2_write_begin_inline(struct address_space *mapping,
 	handle_t *handle;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, 0, GFP_NOFS);
 	if (!page) {
+		ocfs2_commit_trans(osb, handle);
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -1498,13 +1506,6 @@  static int ocfs2_write_begin_inline(struct address_space *mapping,
 	wc->w_pages[0] = wc->w_target_page = page;
 	wc->w_num_pages = 1;
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e23..c534cb0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -760,7 +760,7 @@  static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
-	handle_t *handle = NULL;
+	handle_t *handle;
 	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -769,11 +769,17 @@  static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
+	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, index, GFP_NOFS);
 	if (!page) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
-		goto out;
+		goto out_commit_trans;
 	}
 
 	/* Get the offsets within the page that we want to zero */
@@ -805,15 +811,6 @@  static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 			goto out_unlock;
 		}
 
-		if (!handle) {
-			handle = ocfs2_zero_start_ordered_transaction(inode,
-								      di_bh);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				handle = NULL;
-				break;
-			}
-		}
 
 		/* must not update i_size! */
 		ret = block_commit_write(page, block_start + 1,
@@ -824,27 +821,26 @@  static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 			ret = 0;
 	}
 
-	if (handle) {
-		/*
-		 * fs-writeback will release the dirty pages without page lock
-		 * whose offset are over inode size, the release happens at
-		 * block_write_full_page().
-		 */
-		i_size_write(inode, abs_to);
-		inode->i_blocks = ocfs2_inode_sector_count(inode);
-		di->i_size = cpu_to_le64((u64)i_size_read(inode));
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-		di->i_mtime_nsec = di->i_ctime_nsec;
-		ocfs2_journal_dirty(handle, di_bh);
-		ocfs2_update_inode_fsync_trans(handle, inode, 1);
-		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-	}
+	/*
+	 * fs-writeback will release the dirty pages without page lock
+	 * whose offset are over inode size, the release happens at
+	 * block_write_full_page().
+	 */
+	i_size_write(inode, abs_to);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	di->i_mtime_nsec = di->i_ctime_nsec;
+	ocfs2_journal_dirty(handle, di_bh);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 
 out_unlock:
 	unlock_page(page);
 	page_cache_release(page);
+out_commit_trans:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }