Message ID | 1373597724-28242-1-git-send-email-junxiao.bi@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Joel, Mark, Sunil, Could you help review this patch? Now it's in mmotm tree and need your ack to go in mainline. This bug can cause sparse file hole with non-zero data, and cause a big confuse to customer who think they have a corrupt file. Thanks, Junxiao. On 07/12/2013 10:55 AM, Junxiao Bi wrote: > fs-writeback will release the dirty pages without page lock > whose offset are over inode size, the release happens at > block_write_full_page_endio(). If not update, dirty pages > in file holes may be released before flushed to the disk, > then file holes will contain some non-zero data, this will > cause sparse file md5sum error. > > To reproduce the bug, find a big sparse file with many holes, > like vm image file, its actual size should be bigger than > available mem size to make writeback work more frequently, > tar it with -S option, then keep untar it and check its md5sum > again and again until you get a wrong md5sum. > > Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> > --- > fs/ocfs2/file.c | 40 ++++++++++++++++++++++++++++++++-------- > 1 file changed, 32 insertions(+), 8 deletions(-) > > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index ff54014..3ce6a8b 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -723,7 +723,8 @@ leave: > * While a write will already be ordering the data, a truncate will not. > * Thus, we need to explicitly order the zeroed pages. > */ > -static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) > +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, > + struct buffer_head *di_bh) > { > struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); > handle_t *handle = NULL; > @@ -740,7 +741,14 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) > } > > ret = ocfs2_jbd2_file_inode(handle, inode); > - if (ret < 0) > + if (ret < 0) { > + mlog_errno(ret); > + goto out; > + } > + > + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, > + OCFS2_JOURNAL_ACCESS_WRITE); > + if (ret) > mlog_errno(ret); > > out: > @@ -756,7 +764,7 @@ out: > * to be too fragile to do exactly what we need without us having to > * worry about recursive locking in ->write_begin() and ->write_end(). */ > static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, > - u64 abs_to) > + u64 abs_to, struct buffer_head *di_bh) > { > struct address_space *mapping = inode->i_mapping; > struct page *page; > @@ -764,6 +772,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, > handle_t *handle = NULL; > int ret = 0; > unsigned zero_from, zero_to, block_start, block_end; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; > > BUG_ON(abs_from >= abs_to); > BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); > @@ -806,7 +815,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, > } > > if (!handle) { > - handle = ocfs2_zero_start_ordered_transaction(inode); > + handle = ocfs2_zero_start_ordered_transaction(inode, > + di_bh); > if (IS_ERR(handle)) { > ret = PTR_ERR(handle); > handle = NULL; > @@ -823,8 +833,22 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, > ret = 0; > } > > - if (handle) > + if (handle) { > + /* > + * fs-writeback will release the dirty pages without page lock > + * whose offset are over inode size, the release happens at > + * block_write_full_page_endio(). > + */ > + i_size_write(inode, abs_to); > + inode->i_blocks = ocfs2_inode_sector_count(inode); > + di->i_size = cpu_to_le64((u64)i_size_read(inode)); > + inode->i_mtime = inode->i_ctime = CURRENT_TIME; > + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); > + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); > + di->i_mtime_nsec = di->i_ctime_nsec; > + ocfs2_journal_dirty(handle, di_bh); > ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); > + } > > out_unlock: > unlock_page(page); > @@ -920,7 +944,7 @@ out: > * has made sure that the entire range needs zeroing. > */ > static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, > - u64 range_end) > + u64 range_end, struct buffer_head *di_bh) > { > int rc = 0; > u64 next_pos; > @@ -936,7 +960,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, > next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; > if (next_pos > range_end) > next_pos = range_end; > - rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); > + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); > if (rc < 0) { > mlog_errno(rc); > break; > @@ -982,7 +1006,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, > range_end = zero_to_size; > > ret = ocfs2_zero_extend_range(inode, range_start, > - range_end); > + range_end, di_bh); > if (ret) { > mlog_errno(ret); > break;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ff54014..3ce6a8b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -723,7 +723,8 @@ leave: * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. */ -static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, + struct buffer_head *di_bh) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -740,7 +741,14 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) } ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) mlog_errno(ret); out: @@ -756,7 +764,7 @@ out: * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, - u64 abs_to) + u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -764,6 +772,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, handle_t *handle = NULL; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; BUG_ON(abs_from >= abs_to); BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); @@ -806,7 +815,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } if (!handle) { - handle = ocfs2_zero_start_ordered_transaction(inode); + handle = ocfs2_zero_start_ordered_transaction(inode, + di_bh); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; @@ -823,8 +833,22 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, ret = 0; } - if (handle) + if (handle) { + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_page_endio(). + */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + di->i_mtime_nsec = di->i_ctime_nsec; + ocfs2_journal_dirty(handle, di_bh); ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + } out_unlock: unlock_page(page); @@ -920,7 +944,7 @@ out: * has made sure that the entire range needs zeroing. */ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, - u64 range_end) + u64 range_end, struct buffer_head *di_bh) { int rc = 0; u64 next_pos; @@ -936,7 +960,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; if (next_pos > range_end) next_pos = range_end; - rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh); if (rc < 0) { mlog_errno(rc); break; @@ -982,7 +1006,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, range_end = zero_to_size; ret = ocfs2_zero_extend_range(inode, range_start, - range_end); + range_end, di_bh); if (ret) { mlog_errno(ret); break;
fs-writeback will release the dirty pages without page lock whose offset are over inode size, the release happens at block_write_full_page_endio(). If not update, dirty pages in file holes may be released before flushed to the disk, then file holes will contain some non-zero data, this will cause sparse file md5sum error. To reproduce the bug, find a big sparse file with many holes, like vm image file, its actual size should be bigger than available mem size to make writeback work more frequently, tar it with -S option, then keep untar it and check its md5sum again and again until you get a wrong md5sum. Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> --- fs/ocfs2/file.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-)