Message ID | 20240314073353.11489-1-heming.zhao@suse.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ocfs2: fix journal protection issues | expand |
Hi, On 3/14/24 3:33 PM, Heming Zhao wrote: > This patch resolves 3 journal-related issues: > 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal > protection for modifying dinode. > 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide > journal protection for writing fe->i_size. > 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). > This adjustment ensures that only content requiring protection is > included in the journal. > Any real issue you've found for above cases? Take ocfs2_rollback_alloc_dinode_counts() for example, it will always be pair used with ocfs2_alloc_dinode_update_counts(), more precisely, in the error case of ocfs2_block_group_set_bits(). And the 'di_bh' is already dirtied before. Since they are in the same transaction, I don't see any problem here. Thanks, Joseph > Signed-off-by: Heming Zhao <heming.zhao@suse.com> > --- > fs/ocfs2/move_extents.c | 2 +- > fs/ocfs2/resize.c | 7 +++---- > fs/ocfs2/suballoc.c | 25 +++++++++++++++++-------- > fs/ocfs2/suballoc.h | 3 ++- > 4 files changed, 23 insertions(+), 14 deletions(-) > > diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c > index 1f9ed117e78b..a631f9cf0c05 100644 > --- a/fs/ocfs2/move_extents.c > +++ b/fs/ocfs2/move_extents.c > @@ -687,7 +687,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, > ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, > goal_bit, len); > if (ret) { > - ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, > + ocfs2_rollback_alloc_dinode_counts(handle, gb_inode, gb_bh, len, > le16_to_cpu(gd->bg_chain)); > mlog_errno(ret); > } > diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c > index d65d43c61857..2c7e3548ae82 100644 > --- a/fs/ocfs2/resize.c > +++ b/fs/ocfs2/resize.c > @@ -143,15 +143,14 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, > le32_add_cpu(&cr->c_free, -1 * backups); > le32_add_cpu(&fe->id1.bitmap1.i_used, backups); > } > + le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); > + ocfs2_journal_dirty(handle, bm_bh); > > spin_lock(&OCFS2_I(bm_inode)->ip_lock); > OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); > - le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); > spin_unlock(&OCFS2_I(bm_inode)->ip_lock); > i_size_write(bm_inode, le64_to_cpu(fe->i_size)); > > - ocfs2_journal_dirty(handle, bm_bh); > - > out_rollback: > if (ret < 0) { > ocfs2_calc_new_backup_super(bm_inode, > @@ -551,12 +550,12 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) > le32_add_cpu(&fe->id1.bitmap1.i_used, > (input->clusters - input->frees) * cl_bpc); > le32_add_cpu(&fe->i_clusters, input->clusters); > + le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); > > ocfs2_journal_dirty(handle, main_bm_bh); > > spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); > OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); > - le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); > spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); > i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); > > diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c > index 166c8918c825..e1168db659df 100644 > --- a/fs/ocfs2/suballoc.c > +++ b/fs/ocfs2/suballoc.c > @@ -727,17 +727,16 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, > le16_to_cpu(bg->bg_free_bits_count)); > le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); > le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); > - > + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, > + le32_to_cpu(fe->i_clusters))); > ocfs2_journal_dirty(handle, bh); > + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); > > spin_lock(&OCFS2_I(alloc_inode)->ip_lock); > OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); > - fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, > - le32_to_cpu(fe->i_clusters))); > spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); > i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); > alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); > - ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); > > status = 0; > > @@ -1601,19 +1600,28 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode, > return ret; > } > > -void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, > +void ocfs2_rollback_alloc_dinode_counts(handle_t *handle, > + struct inode *inode, > struct buffer_head *di_bh, > u32 num_bits, > u16 chain) > { > + int ret; > u32 tmp_used; > struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; > struct ocfs2_chain_list *cl; > > + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, > + OCFS2_JOURNAL_ACCESS_WRITE); > + if (ret < 0) { > + mlog_errno(ret); > + return; > + } > cl = (struct ocfs2_chain_list *)&di->id2.i_chain; > tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); > di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); > le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); > + ocfs2_journal_dirty(handle, di_bh); > } > > static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, > @@ -1717,7 +1725,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, > ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, > res->sr_bit_offset, res->sr_bits); > if (ret < 0) { > - ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, > + ocfs2_rollback_alloc_dinode_counts(handle, > + alloc_inode, ac->ac_bh, > res->sr_bits, > le16_to_cpu(gd->bg_chain)); > mlog_errno(ret); > @@ -1851,7 +1860,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, > res->sr_bit_offset, > res->sr_bits); > if (status < 0) { > - ocfs2_rollback_alloc_dinode_counts(alloc_inode, > + ocfs2_rollback_alloc_dinode_counts(handle, alloc_inode, > ac->ac_bh, res->sr_bits, chain); > mlog_errno(status); > goto bail; > @@ -2165,7 +2174,7 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, > res->sr_bit_offset, > res->sr_bits); > if (ret < 0) { > - ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, > + ocfs2_rollback_alloc_dinode_counts(handle, ac->ac_inode, > ac->ac_bh, res->sr_bits, chain); > mlog_errno(ret); > goto out; > diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h > index 9c74eace3adc..0c56ddce0752 100644 > --- a/fs/ocfs2/suballoc.h > +++ b/fs/ocfs2/suballoc.h > @@ -75,7 +75,8 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode, > struct buffer_head *di_bh, > u32 num_bits, > u16 chain); > -void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, > +void ocfs2_rollback_alloc_dinode_counts(handle_t *handle, > + struct inode *inode, > struct buffer_head *di_bh, > u32 num_bits, > u16 chain);
On 3/15/24 09:09, Joseph Qi wrote: > Hi, > > On 3/14/24 3:33 PM, Heming Zhao wrote: >> This patch resolves 3 journal-related issues: >> 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal >> protection for modifying dinode. >> 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide >> journal protection for writing fe->i_size. >> 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). >> This adjustment ensures that only content requiring protection is >> included in the journal. >> > > Any real issue you've found for above cases? No. I just found these issues when I was reading code. > > Take ocfs2_rollback_alloc_dinode_counts() for example, it will always be > pair used with ocfs2_alloc_dinode_update_counts(), more precisely, in > the error case of ocfs2_block_group_set_bits(). And the 'di_bh' is > already dirtied before. Since they are in the same transaction, I don't > see any problem here. Yes, they share the same transaction. However: In ocfs2_alloc_dinode_update_counts(), there already uses a jbd2 access/dirty pair to protect metadata. In my view, jbd2_journal_access starts the protection stage, jbd2_journal_dirty closes the protection stage. the rollback function should start a new access/dirty pair to info jbd2 the di_bh had been modified again. -Heming
On 3/15/24 10:25 AM, Heming Zhao wrote: > On 3/15/24 09:09, Joseph Qi wrote: >> Hi, >> >> On 3/14/24 3:33 PM, Heming Zhao wrote: >>> This patch resolves 3 journal-related issues: >>> 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal >>> protection for modifying dinode. >>> 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide >>> journal protection for writing fe->i_size. >>> 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). >>> This adjustment ensures that only content requiring protection is >>> included in the journal. >>> >> >> Any real issue you've found for above cases? > > No. I just found these issues when I was reading code. >> >> Take ocfs2_rollback_alloc_dinode_counts() for example, it will always be >> pair used with ocfs2_alloc_dinode_update_counts(), more precisely, in >> the error case of ocfs2_block_group_set_bits(). And the 'di_bh' is >> already dirtied before. Since they are in the same transaction, I don't >> see any problem here. > > Yes, they share the same transaction. > However: > In ocfs2_alloc_dinode_update_counts(), there already uses a jbd2 access/dirty > pair to protect metadata. In my view, jbd2_journal_access starts the > protection stage, jbd2_journal_dirty closes the protection stage. the rollback > function should start a new access/dirty pair to info jbd2 the di_bh had been > modified again. > Refer jbd2_journal_dirty_metadata(): /* * fastpath, to avoid expensive locking. If this buffer is already * on the running transaction's metadata list there is nothing to do. * Nobody can take it off again because there is a handle open. * I _think_ we're OK here with SMP barriers - a mistaken decision will * result in this test being false, so we go in and take the locks. */ So IMO, we don't have to access/dirty again since the buffer head is already added before. Joseph
On 3/15/24 15:22, Joseph Qi wrote: > > > On 3/15/24 10:25 AM, Heming Zhao wrote: >> On 3/15/24 09:09, Joseph Qi wrote: >>> Hi, >>> >>> On 3/14/24 3:33 PM, Heming Zhao wrote: >>>> This patch resolves 3 journal-related issues: >>>> 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal >>>> protection for modifying dinode. >>>> 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide >>>> journal protection for writing fe->i_size. >>>> 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). >>>> This adjustment ensures that only content requiring protection is >>>> included in the journal. >>>> >>> >>> Any real issue you've found for above cases? >> >> No. I just found these issues when I was reading code. >>> >>> Take ocfs2_rollback_alloc_dinode_counts() for example, it will always be >>> pair used with ocfs2_alloc_dinode_update_counts(), more precisely, in >>> the error case of ocfs2_block_group_set_bits(). And the 'di_bh' is >>> already dirtied before. Since they are in the same transaction, I don't >>> see any problem here. >> >> Yes, they share the same transaction. >> However: >> In ocfs2_alloc_dinode_update_counts(), there already uses a jbd2 access/dirty >> pair to protect metadata. In my view, jbd2_journal_access starts the >> protection stage, jbd2_journal_dirty closes the protection stage. the rollback >> function should start a new access/dirty pair to info jbd2 the di_bh had been >> modified again. >> > Refer jbd2_journal_dirty_metadata(): > > /* > * fastpath, to avoid expensive locking. If this buffer is already > * on the running transaction's metadata list there is nothing to do. > * Nobody can take it off again because there is a handle open. > * I _think_ we're OK here with SMP barriers - a mistaken decision will > * result in this test being false, so we go in and take the locks. > */ > > So IMO, we don't have to access/dirty again since the buffer head is > already added before. > > Joseph Thanks for the detailed info. Regarding the jbd2 code, you are right. What is your opinion about other modifications of my patch? (the modifications about the jbd2 protection scope.) under above buffer protection rule of access/dirty, in function, the bh is same. It seems like the jbd2 dirty function could be called anywhere regardless of the end-use point of buffer_head. it's counterintuitive. -Heming
On 3/15/24 8:26 PM, Heming Zhao wrote: > On 3/15/24 15:22, Joseph Qi wrote: >> >> >> On 3/15/24 10:25 AM, Heming Zhao wrote: >>> On 3/15/24 09:09, Joseph Qi wrote: >>>> Hi, >>>> >>>> On 3/14/24 3:33 PM, Heming Zhao wrote: >>>>> This patch resolves 3 journal-related issues: >>>>> 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal >>>>> protection for modifying dinode. >>>>> 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide >>>>> journal protection for writing fe->i_size. >>>>> 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). >>>>> This adjustment ensures that only content requiring protection is >>>>> included in the journal. >>>>> >>>> >>>> Any real issue you've found for above cases? >>> >>> No. I just found these issues when I was reading code. >>>> >>>> Take ocfs2_rollback_alloc_dinode_counts() for example, it will always be >>>> pair used with ocfs2_alloc_dinode_update_counts(), more precisely, in >>>> the error case of ocfs2_block_group_set_bits(). And the 'di_bh' is >>>> already dirtied before. Since they are in the same transaction, I don't >>>> see any problem here. >>> >>> Yes, they share the same transaction. >>> However: >>> In ocfs2_alloc_dinode_update_counts(), there already uses a jbd2 access/dirty >>> pair to protect metadata. In my view, jbd2_journal_access starts the >>> protection stage, jbd2_journal_dirty closes the protection stage. the rollback >>> function should start a new access/dirty pair to info jbd2 the di_bh had been >>> modified again. >>> >> Refer jbd2_journal_dirty_metadata(): >> >> /* >> * fastpath, to avoid expensive locking. If this buffer is already >> * on the running transaction's metadata list there is nothing to do. >> * Nobody can take it off again because there is a handle open. >> * I _think_ we're OK here with SMP barriers - a mistaken decision will >> * result in this test being false, so we go in and take the locks. >> */ >> >> So IMO, we don't have to access/dirty again since the buffer head is >> already added before. >> >> Joseph > > Thanks for the detailed info. Regarding the jbd2 code, you are right. > > What is your opinion about other modifications of my patch? > (the modifications about the jbd2 protection scope.) > under above buffer protection rule of access/dirty, in function, > the bh is same. It seems like the jbd2 dirty function could be > called anywhere regardless of the end-use point of buffer_head. > it's counterintuitive. > Which part do you refer? It seems that all of them are changing the same buffer head which is dirtied before, so it won't be a problem since the transaction has been committed yet. BTW, dinode size change should be protected under ip_lock, so we can't move it out. Thanks, Joseph
On 3/18/24 09:26, Joseph Qi wrote: > > > On 3/15/24 8:26 PM, Heming Zhao wrote: >> On 3/15/24 15:22, Joseph Qi wrote: >>> >>> >>> On 3/15/24 10:25 AM, Heming Zhao wrote: >>>> On 3/15/24 09:09, Joseph Qi wrote: >>>>> Hi, >>>>> >>>>> On 3/14/24 3:33 PM, Heming Zhao wrote: >>>>>> This patch resolves 3 journal-related issues: >>>>>> 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal >>>>>> protection for modifying dinode. >>>>>> 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide >>>>>> journal protection for writing fe->i_size. >>>>>> 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). >>>>>> This adjustment ensures that only content requiring protection is >>>>>> included in the journal. >>>>>> >>>>> >>>>> Any real issue you've found for above cases? >>>> >>>> No. I just found these issues when I was reading code. >>>>> >>>>> Take ocfs2_rollback_alloc_dinode_counts() for example, it will always be >>>>> pair used with ocfs2_alloc_dinode_update_counts(), more precisely, in >>>>> the error case of ocfs2_block_group_set_bits(). And the 'di_bh' is >>>>> already dirtied before. Since they are in the same transaction, I don't >>>>> see any problem here. >>>> >>>> Yes, they share the same transaction. >>>> However: >>>> In ocfs2_alloc_dinode_update_counts(), there already uses a jbd2 access/dirty >>>> pair to protect metadata. In my view, jbd2_journal_access starts the >>>> protection stage, jbd2_journal_dirty closes the protection stage. the rollback >>>> function should start a new access/dirty pair to info jbd2 the di_bh had been >>>> modified again. >>>> >>> Refer jbd2_journal_dirty_metadata(): >>> >>> /* >>> * fastpath, to avoid expensive locking. If this buffer is already >>> * on the running transaction's metadata list there is nothing to do. >>> * Nobody can take it off again because there is a handle open. >>> * I _think_ we're OK here with SMP barriers - a mistaken decision will >>> * result in this test being false, so we go in and take the locks. >>> */ >>> >>> So IMO, we don't have to access/dirty again since the buffer head is >>> already added before. >>> >>> Joseph >> >> Thanks for the detailed info. Regarding the jbd2 code, you are right. >> >> What is your opinion about other modifications of my patch? >> (the modifications about the jbd2 protection scope.) >> under above buffer protection rule of access/dirty, in function, >> the bh is same. It seems like the jbd2 dirty function could be >> called anywhere regardless of the end-use point of buffer_head. >> it's counterintuitive. >> > Which part do you refer? > It seems that all of them are changing the same buffer head which is > dirtied before, so it won't be a problem since the transaction has been > committed yet. > BTW, dinode size change should be protected under ip_lock, so we can't > move it out. > > Thanks, > Joseph > Thank you for your explanation. For my patch, it's wrong to move out fe->i_size from spin_[un]lock() area. But for above info, I have a different view: ->ip_lock is not used to protect dinode i_size. From the comment in 'struct ocfs2_inode_info', 'spinlock_t ip_lock' is used to protect below items: ``` /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; struct list_head ip_io_markers; u32 ip_clusters; u16 ip_dyn_features; struct mutex ip_io_mutex; //zhm: should not include this u32 ip_flags; /* see below */ u32 ip_attr; /* inode attributes */ ``` from the code of ocfs2_block_group_alloc(): ``` 737 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); 738 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 739 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, 740 le32_to_cpu(fe->i_clusters))); 741 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); ``` we can see that ->ip_lock is protecting line 738, the assignment operation of "oi->ip_clusters". Because the cluster length is obtained from 'fe->i_clusters', if we move line 739 out of spin_[un]lock(), fe->clusters may change, resulting in a different value for fe->i_size. So author Mark put both 738 and 739 into spin_[un]lock(). From another viewpoint, it appears that Mark think fe->i_size should keep consistent with oi->ip_clusters. we can also verify my analysis from ocfs2_mark_inode_dirty(). In this function, ->ip_lock doesn't protect fe->i_size. because the assignment action reads data from 'inode', and not related with any 'ocfs2_inode_info' ip_xx items. ------------------ Let's discuss with jbd2 access/dirty. Because these code has already been running almost twenty years, I accept no changes for existing code. But it's better to unify the jbd2 access/dirty code style. e.g: ocfs2_group_add() & ocfs2_block_group_alloc(): - jbd2 dirty *before* spin_lock area. ocfs2_update_last_group_and_inode(): - jbd2 dirty *after* spin_lock area. ocfs2_mark_inode_dirty(): - jbd2 dirty *after* spin_lock area. could we unify the code style? For example, placing jbd2 dirty *after* spin_lock area. e.g: ``` @@ -559,13 +559,12 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) (input->clusters - input->frees) * cl_bpc); le32_add_cpu(&fe->i_clusters, input->clusters); - ocfs2_journal_dirty(handle, main_bm_bh); - spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); + ocfs2_journal_dirty(handle, main_bm_bh); ocfs2_update_super_and_backups(main_bm_inode, input->clusters); ``` Thanks, Heming
On 3/18/24 11:16 AM, Heming Zhao wrote: > On 3/18/24 09:26, Joseph Qi wrote: >> >> >> On 3/15/24 8:26 PM, Heming Zhao wrote: >>> On 3/15/24 15:22, Joseph Qi wrote: >>>> >>>> >>>> On 3/15/24 10:25 AM, Heming Zhao wrote: >>>>> On 3/15/24 09:09, Joseph Qi wrote: >>>>>> Hi, >>>>>> >>>>>> On 3/14/24 3:33 PM, Heming Zhao wrote: >>>>>>> This patch resolves 3 journal-related issues: >>>>>>> 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal >>>>>>> protection for modifying dinode. >>>>>>> 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide >>>>>>> journal protection for writing fe->i_size. >>>>>>> 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). >>>>>>> This adjustment ensures that only content requiring protection is >>>>>>> included in the journal. >>>>>>> >>>>>> >>>>>> Any real issue you've found for above cases? >>>>> >>>>> No. I just found these issues when I was reading code. >>>>>> >>>>>> Take ocfs2_rollback_alloc_dinode_counts() for example, it will always be >>>>>> pair used with ocfs2_alloc_dinode_update_counts(), more precisely, in >>>>>> the error case of ocfs2_block_group_set_bits(). And the 'di_bh' is >>>>>> already dirtied before. Since they are in the same transaction, I don't >>>>>> see any problem here. >>>>> >>>>> Yes, they share the same transaction. >>>>> However: >>>>> In ocfs2_alloc_dinode_update_counts(), there already uses a jbd2 access/dirty >>>>> pair to protect metadata. In my view, jbd2_journal_access starts the >>>>> protection stage, jbd2_journal_dirty closes the protection stage. the rollback >>>>> function should start a new access/dirty pair to info jbd2 the di_bh had been >>>>> modified again. >>>>> >>>> Refer jbd2_journal_dirty_metadata(): >>>> >>>> /* >>>> * fastpath, to avoid expensive locking. If this buffer is already >>>> * on the running transaction's metadata list there is nothing to do. >>>> * Nobody can take it off again because there is a handle open. >>>> * I _think_ we're OK here with SMP barriers - a mistaken decision will >>>> * result in this test being false, so we go in and take the locks. >>>> */ >>>> >>>> So IMO, we don't have to access/dirty again since the buffer head is >>>> already added before. >>>> >>>> Joseph >>> >>> Thanks for the detailed info. Regarding the jbd2 code, you are right. >>> >>> What is your opinion about other modifications of my patch? >>> (the modifications about the jbd2 protection scope.) >>> under above buffer protection rule of access/dirty, in function, >>> the bh is same. It seems like the jbd2 dirty function could be >>> called anywhere regardless of the end-use point of buffer_head. >>> it's counterintuitive. >>> >> Which part do you refer? >> It seems that all of them are changing the same buffer head which is >> dirtied before, so it won't be a problem since the transaction has been >> committed yet. >> BTW, dinode size change should be protected under ip_lock, so we can't >> move it out. >> >> Thanks, >> Joseph >> > > Thank you for your explanation. > For my patch, it's wrong to move out fe->i_size from spin_[un]lock() > area. > > But for above info, I have a different view: > ->ip_lock is not used to protect dinode i_size. > > From the comment in 'struct ocfs2_inode_info', 'spinlock_t ip_lock' > is used to protect below items: > ``` > /* These fields are protected by ip_lock */ > spinlock_t ip_lock; > u32 ip_open_count; > struct list_head ip_io_markers; > u32 ip_clusters; > > u16 ip_dyn_features; > struct mutex ip_io_mutex; //zhm: should not include this > u32 ip_flags; /* see below */ > u32 ip_attr; /* inode attributes */ > ``` > > from the code of ocfs2_block_group_alloc(): > ``` > 737 spin_lock(&OCFS2_I(alloc_inode)->ip_lock); > 738 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); > 739 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, > 740 le32_to_cpu(fe->i_clusters))); > 741 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); > ``` > > we can see that ->ip_lock is protecting line 738, the assignment > operation of "oi->ip_clusters". Because the cluster length is > obtained from 'fe->i_clusters', if we move line 739 out of spin_[un]lock(), > fe->clusters may change, resulting in a different value for fe->i_size. > So author Mark put both 738 and 739 into spin_[un]lock(). From another > viewpoint, it appears that Mark think fe->i_size should keep consistent > with oi->ip_clusters. > Make sense. > we can also verify my analysis from ocfs2_mark_inode_dirty(). > In this function, ->ip_lock doesn't protect fe->i_size. because > the assignment action reads data from 'inode', and not related > with any 'ocfs2_inode_info' ip_xx items. > > ------------------ > Let's discuss with jbd2 access/dirty. > Because these code has already been running almost twenty years, > I accept no changes for existing code. But it's better to unify > the jbd2 access/dirty code style. > > e.g: > ocfs2_group_add() & ocfs2_block_group_alloc(): > - jbd2 dirty *before* spin_lock area. > > ocfs2_update_last_group_and_inode(): > - jbd2 dirty *after* spin_lock area. > > ocfs2_mark_inode_dirty(): > - jbd2 dirty *after* spin_lock area. > > could we unify the code style? For example, placing jbd2 dirty > *after* spin_lock area. > > e.g: > ``` > @@ -559,13 +559,12 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) > (input->clusters - input->frees) * cl_bpc); > le32_add_cpu(&fe->i_clusters, input->clusters); > > - ocfs2_journal_dirty(handle, main_bm_bh); > - > spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); > OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); > le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); > spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); > i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); > + ocfs2_journal_dirty(handle, main_bm_bh); > > ocfs2_update_super_and_backups(main_bm_inode, input->clusters); > ``` > The above change looks sane. Joseph
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 1f9ed117e78b..a631f9cf0c05 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -687,7 +687,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, goal_bit, len); if (ret) { - ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, + ocfs2_rollback_alloc_dinode_counts(handle, gb_inode, gb_bh, len, le16_to_cpu(gd->bg_chain)); mlog_errno(ret); } diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d65d43c61857..2c7e3548ae82 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -143,15 +143,14 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, le32_add_cpu(&cr->c_free, -1 * backups); le32_add_cpu(&fe->id1.bitmap1.i_used, backups); } + le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); + ocfs2_journal_dirty(handle, bm_bh); spin_lock(&OCFS2_I(bm_inode)->ip_lock); OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(bm_inode)->ip_lock); i_size_write(bm_inode, le64_to_cpu(fe->i_size)); - ocfs2_journal_dirty(handle, bm_bh); - out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, @@ -551,12 +550,12 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) le32_add_cpu(&fe->id1.bitmap1.i_used, (input->clusters - input->frees) * cl_bpc); le32_add_cpu(&fe->i_clusters, input->clusters); + le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); ocfs2_journal_dirty(handle, main_bm_bh); spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 166c8918c825..e1168db659df 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -727,17 +727,16 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le16_to_cpu(bg->bg_free_bits_count)); le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); - + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); ocfs2_journal_dirty(handle, bh); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); spin_lock(&OCFS2_I(alloc_inode)->ip_lock); OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, - le32_to_cpu(fe->i_clusters))); spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); - ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; @@ -1601,19 +1600,28 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode, return ret; } -void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, +void ocfs2_rollback_alloc_dinode_counts(handle_t *handle, + struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain) { + int ret; u32 tmp_used; struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + return; + } cl = (struct ocfs2_chain_list *)&di->id2.i_chain; tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); + ocfs2_journal_dirty(handle, di_bh); } static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, @@ -1717,7 +1725,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, res->sr_bit_offset, res->sr_bits); if (ret < 0) { - ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + ocfs2_rollback_alloc_dinode_counts(handle, + alloc_inode, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); mlog_errno(ret); @@ -1851,7 +1860,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, res->sr_bit_offset, res->sr_bits); if (status < 0) { - ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ocfs2_rollback_alloc_dinode_counts(handle, alloc_inode, ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; @@ -2165,7 +2174,7 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, res->sr_bit_offset, res->sr_bits); if (ret < 0) { - ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ocfs2_rollback_alloc_dinode_counts(handle, ac->ac_inode, ac->ac_bh, res->sr_bits, chain); mlog_errno(ret); goto out; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 9c74eace3adc..0c56ddce0752 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -75,7 +75,8 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain); -void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, +void ocfs2_rollback_alloc_dinode_counts(handle_t *handle, + struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain);
This patch resolves 3 journal-related issues: 1. ocfs2_rollback_alloc_dinode_counts() forgets to provide journal protection for modifying dinode. 2. ocfs2_block_group_alloc() & ocfs2_group_add() forget to provide journal protection for writing fe->i_size. 3. adjusted journal_dirty scope for ocfs2_update_last_group_and_inode(). This adjustment ensures that only content requiring protection is included in the journal. Signed-off-by: Heming Zhao <heming.zhao@suse.com> --- fs/ocfs2/move_extents.c | 2 +- fs/ocfs2/resize.c | 7 +++---- fs/ocfs2/suballoc.c | 25 +++++++++++++++++-------- fs/ocfs2/suballoc.h | 3 ++- 4 files changed, 23 insertions(+), 14 deletions(-)