diff mbox series

ocfs2: protect extent tree in the ocfs2_prepare_inode_for_write

Message ID 1568195880-6569-1-git-send-email-sunny.s.zhang@oracle.com (mailing list archive)
State New, archived
Headers show
Series ocfs2: protect extent tree in the ocfs2_prepare_inode_for_write | expand

Commit Message

sunnyZhang Sept. 11, 2019, 9:58 a.m. UTC
When the extent tree is modified, it shuold be protected by
inode cluster lock and ip_alloc_sem.

The extent tree is accessed and modified in the ocfs2_prepare_inode_for_write,
but isn't protected by ip_alloc_sem.

The following is a case. The function ocfs2_fiemap is accessing
the extent tree, which is modified at the same time.

[47145.974472] kernel BUG at fs/ocfs2/extent_map.c:475!
[47145.974480] invalid opcode: 0000 [#1] SMP
[47145.974489] Modules linked in: tun ocfs2 ocfs2_nodemanager configfs
ocfs2_stackglue xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn
xenfs xen_privcmd vfat fat bnx2fc fcoe libfcoe libfc scsi_transport_fc sunrpc
bridge 8021q mrp garp stp llc ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad
ib_core ib_addr dm_round_robin dm_multipath sg pcspkr raid1 shpchp
ipmi_devintf ipmi_msghandler ext4 jbd2 mbcache2 sd_mod nvme nvme_core bnxt_en
xhci_pci xhci_hcd crc32c_intel be2iscsi bnx2i cnic uio cxgb4i cxgb4 cxgb3i
libcxgbi ipv6 cxgb3 mdio qla4xxx wmi dm_mirror dm_region_hash dm_log dm_mod
iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi iscsi_ibft
iscsi_boot_sysfs
[47145.974636] CPU: 16 PID: 14047 Comm: o2info Not tainted
4.1.12-124.23.1.el6uek.x86_64 #2
[47145.974646] Hardware name: Oracle Corporation ORACLE SERVER X7-2L/ASM, MB
MECH, X7-2L, BIOS 42040600 10/19/2018
[47145.974658] task: ffff88019487e200 ti: ffff88003daa4000 task.ti:
ffff88003daa4000
[47145.974667] RIP: e030:[<ffffffffa05e4840>]  [<ffffffffa05e4840>]
ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
[47145.974708] RSP: e02b:ffff88003daa7d88  EFLAGS: 00010287
[47145.974713] RAX: 00000000000000de RBX: ffff8801d1104030 RCX:
ffff8801d1104e10
[47145.974719] RDX: 00000000000000de RSI: 000000000009ec40 RDI:
ffff8801d1104e24
[47145.974725] RBP: ffff88003daa7df8 R08: ffff88003daa7e38 R09:
0000000000000000
[47145.974732] R10: 000000000009ec3f R11: 0000000000000246 R12:
000000000009ec3f
[47145.974739] R13: ffff88004c419000 R14: 0000000000000002 R15:
ffff88003daa7e28
[47145.974754] FS:  00007fdbccc92720(0000) GS:ffff880358800000(0000)
knlGS:ffff880358800000
[47145.974764] CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
[47145.974772] CR2: 00007fd5dfcd8350 CR3: 0000000208677000 CR4:
0000000000042660
[47145.974785] Stack:
[47145.974790]  ffff88003daa7df8 00002000a05e249b ffff8801d1104000
ffff88003daa7e2c
[47145.974802]  ffff88003daa7e38 ffff88000cc484c0 ffff880145f5b478
0000000000000000
[47145.974811]  0000000000002000 ffff88000cc484c0 ffff88003daa7ea0
0000000000000000
[47145.974820] Call Trace:
[47145.974837]  [<ffffffffa05e53e3>] ocfs2_fiemap+0x1e3/0x430 [ocfs2]
[47145.974848]  [<ffffffff816f644f>] ? xen_hypervisor_callback+0x7f/0x120
[47145.974855]  [<ffffffff816f6448>] ? xen_hypervisor_callback+0x78/0x120
[47145.974861]  [<ffffffff816f64a3>] ? xen_hypervisor_callback+0xd3/0x120
[47145.974872]  [<ffffffff81220905>] do_vfs_ioctl+0x155/0x510
[47145.974878]  [<ffffffff81220d41>] SyS_ioctl+0x81/0xa0
[47145.974885]  [<ffffffff816f1b9f>] ? system_call_after_swapgs+0xe9/0x190
[47145.974891]  [<ffffffff816f1b98>] ? system_call_after_swapgs+0xe2/0x190
[47145.974899]  [<ffffffff816f1b91>] ? system_call_after_swapgs+0xdb/0x190
[47145.974905]  [<ffffffff816f1c5e>] system_call_fastpath+0x18/0xd8
[47145.974910] Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff ff 48 8b 4a 40
48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff 0f 1f 00 <0f>
0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
[47145.975000] RIP  [<ffffffffa05e4840>]
ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
[47145.975018]  RSP <ffff88003daa7d88>
[47145.989999] ---[ end trace c8aa0c8180e869dc ]---
[47146.087579] Kernel panic - not syncing: Fatal exception
[47146.087691] Kernel Offset: disabled

Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
---
 fs/ocfs2/file.c | 79 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 25 deletions(-)

Comments

Junxiao Bi Sept. 11, 2019, 10:30 p.m. UTC | #1
Hi Shuning,

See comments inlined.

On 9/11/19 2:58 AM, Shuning Zhang wrote:
> When the extent tree is modified, it shuold be protected by
> inode cluster lock and ip_alloc_sem.
>
> The extent tree is accessed and modified in the ocfs2_prepare_inode_for_write,
> but isn't protected by ip_alloc_sem.
>
> The following is a case. The function ocfs2_fiemap is accessing
> the extent tree, which is modified at the same time.
>
> [47145.974472] kernel BUG at fs/ocfs2/extent_map.c:475!
> [47145.974480] invalid opcode: 0000 [#1] SMP
> [47145.974489] Modules linked in: tun ocfs2 ocfs2_nodemanager configfs
> ocfs2_stackglue xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn
> xenfs xen_privcmd vfat fat bnx2fc fcoe libfcoe libfc scsi_transport_fc sunrpc
> bridge 8021q mrp garp stp llc ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad
> ib_core ib_addr dm_round_robin dm_multipath sg pcspkr raid1 shpchp
> ipmi_devintf ipmi_msghandler ext4 jbd2 mbcache2 sd_mod nvme nvme_core bnxt_en
> xhci_pci xhci_hcd crc32c_intel be2iscsi bnx2i cnic uio cxgb4i cxgb4 cxgb3i
> libcxgbi ipv6 cxgb3 mdio qla4xxx wmi dm_mirror dm_region_hash dm_log dm_mod
> iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi iscsi_ibft
> iscsi_boot_sysfs
> [47145.974636] CPU: 16 PID: 14047 Comm: o2info Not tainted
> 4.1.12-124.23.1.el6uek.x86_64 #2
> [47145.974646] Hardware name: Oracle Corporation ORACLE SERVER X7-2L/ASM, MB
> MECH, X7-2L, BIOS 42040600 10/19/2018
> [47145.974658] task: ffff88019487e200 ti: ffff88003daa4000 task.ti:
> ffff88003daa4000
> [47145.974667] RIP: e030:[<ffffffffa05e4840>]  [<ffffffffa05e4840>]
> ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
> [47145.974708] RSP: e02b:ffff88003daa7d88  EFLAGS: 00010287
> [47145.974713] RAX: 00000000000000de RBX: ffff8801d1104030 RCX:
> ffff8801d1104e10
> [47145.974719] RDX: 00000000000000de RSI: 000000000009ec40 RDI:
> ffff8801d1104e24
> [47145.974725] RBP: ffff88003daa7df8 R08: ffff88003daa7e38 R09:
> 0000000000000000
> [47145.974732] R10: 000000000009ec3f R11: 0000000000000246 R12:
> 000000000009ec3f
> [47145.974739] R13: ffff88004c419000 R14: 0000000000000002 R15:
> ffff88003daa7e28
> [47145.974754] FS:  00007fdbccc92720(0000) GS:ffff880358800000(0000)
> knlGS:ffff880358800000
> [47145.974764] CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
> [47145.974772] CR2: 00007fd5dfcd8350 CR3: 0000000208677000 CR4:
> 0000000000042660
> [47145.974785] Stack:
> [47145.974790]  ffff88003daa7df8 00002000a05e249b ffff8801d1104000
> ffff88003daa7e2c
> [47145.974802]  ffff88003daa7e38 ffff88000cc484c0 ffff880145f5b478
> 0000000000000000
> [47145.974811]  0000000000002000 ffff88000cc484c0 ffff88003daa7ea0
> 0000000000000000
> [47145.974820] Call Trace:
> [47145.974837]  [<ffffffffa05e53e3>] ocfs2_fiemap+0x1e3/0x430 [ocfs2]
> [47145.974848]  [<ffffffff816f644f>] ? xen_hypervisor_callback+0x7f/0x120
> [47145.974855]  [<ffffffff816f6448>] ? xen_hypervisor_callback+0x78/0x120
> [47145.974861]  [<ffffffff816f64a3>] ? xen_hypervisor_callback+0xd3/0x120
> [47145.974872]  [<ffffffff81220905>] do_vfs_ioctl+0x155/0x510
> [47145.974878]  [<ffffffff81220d41>] SyS_ioctl+0x81/0xa0
> [47145.974885]  [<ffffffff816f1b9f>] ? system_call_after_swapgs+0xe9/0x190
> [47145.974891]  [<ffffffff816f1b98>] ? system_call_after_swapgs+0xe2/0x190
> [47145.974899]  [<ffffffff816f1b91>] ? system_call_after_swapgs+0xdb/0x190
> [47145.974905]  [<ffffffff816f1c5e>] system_call_fastpath+0x18/0xd8
> [47145.974910] Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff ff 48 8b 4a 40
> 48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff 0f 1f 00 <0f>
> 0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
> [47145.975000] RIP  [<ffffffffa05e4840>]
> ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
> [47145.975018]  RSP <ffff88003daa7d88>
> [47145.989999] ---[ end trace c8aa0c8180e869dc ]---
> [47146.087579] Kernel panic - not syncing: Fatal exception
> [47146.087691] Kernel Offset: disabled
>
> Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
> ---
>   fs/ocfs2/file.c | 79 +++++++++++++++++++++++++++++++++++++++------------------
>   1 file changed, 54 insertions(+), 25 deletions(-)
>
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index 4435df3..85cddd2 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -2093,29 +2093,19 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
>   }
>   
>   static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
> +					    struct buffer_head *di_bh,
>   					    struct file *file,
> -					    loff_t pos, size_t count,
> -					    int *meta_level)
> +					    loff_t pos, size_t count)
>   {
>   	int ret;
> -	struct buffer_head *di_bh = NULL;
>   	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
>   	u32 clusters =
>   		ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
>   
> -	ret = ocfs2_inode_lock(inode, &di_bh, 1);
> -	if (ret) {
> -		mlog_errno(ret);
> -		goto out;
> -	}
> -
> -	*meta_level = 1;
> -
>   	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
>   	if (ret)
>   		mlog_errno(ret);
> -out:
> -	brelse(di_bh);
> +
>   	return ret;
>   }
Since you moved out cluster from ocfs2_prepare_inode_for_refcount() , 
only ocfs2_refcount_cow left, can we go even further to remove it?

>   
> @@ -2123,6 +2113,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
>   					 loff_t pos, size_t count, int wait)
>   {
>   	int ret = 0, meta_level = 0, overwrite_io = 0;
> +	int write_sem = 0;
>   	struct dentry *dentry = file->f_path.dentry;
>   	struct inode *inode = d_inode(dentry);
>   	struct buffer_head *di_bh = NULL;
> @@ -2145,25 +2136,30 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
>   			goto out;
>   		}
>   
> +		if (wait)
> +			down_read(&OCFS2_I(inode)->ip_alloc_sem);
> +		else {
> +			ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
> +			if (!ret) {
> +				ret = -EAGAIN;
> +				goto out_unlock;
> +			}
> +		}

can we do it like the following?

if(wait) {

     lock(inode_lock);

     lock(ip_alloc_sem);

} else {

     try_lock();

}

> +
>   		/*
>   		 * Check if IO will overwrite allocated blocks in case
>   		 * IOCB_NOWAIT flag is set.
>   		 */
>   		if (!wait && !overwrite_io) {
>   			overwrite_io = 1;
> -			if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
> -				ret = -EAGAIN;
> -				goto out_unlock;
> -			}
>   
>   			ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
>   			brelse(di_bh);
>   			di_bh = NULL;
> -			up_read(&OCFS2_I(inode)->ip_alloc_sem);
>   			if (ret < 0) {
>   				if (ret != -EAGAIN)
>   					mlog_errno(ret);
> -				goto out_unlock;
> +				goto out_up_sem;
>   			}
>   		}
>   
> @@ -2178,6 +2174,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
>   		 * set inode->i_size at the end of a write. */
>   		if (should_remove_suid(dentry)) {
>   			if (meta_level == 0) {
> +				up_read(&OCFS2_I(inode)->ip_alloc_sem);
>   				ocfs2_inode_unlock(inode, meta_level);
>   				meta_level = 1;
>   				continue;
> @@ -2186,7 +2183,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
>   			ret = ocfs2_write_remove_suid(inode);
>   			if (ret < 0) {
>   				mlog_errno(ret);
> -				goto out_unlock;
> +				goto out_up_sem;
>   			}
>   		}
>   
> @@ -2194,24 +2191,56 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
>   
>   		ret = ocfs2_check_range_for_refcount(inode, pos, count);
>   		if (ret == 1) {
> +			up_read(&OCFS2_I(inode)->ip_alloc_sem);
>   			ocfs2_inode_unlock(inode, meta_level);
> -			meta_level = -1;
> +			brelse(di_bh);
> +			di_bh = NULL;
> +
> +			if (wait)
> +				ret = ocfs2_inode_lock(inode, &di_bh, 1);
> +			else
> +				ret = ocfs2_try_inode_lock(inode, &di_bh, 1);
You seemed miss call brelse(di_bh)? That will cause buffer_head leak. 
ocfs2_assign_bh() get it.
> +			if (ret) {
> +				if (ret != -EAGAIN)
> +					mlog_errno(ret);
> +				goto out;
> +			}
> +
> +			meta_level = 1;
> +
> +			if (wait)
> +				down_write(&OCFS2_I(inode)->ip_alloc_sem);
> +			else {
> +				ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
> +				if (!ret) {
> +					ret = -EAGAIN;
> +					goto out_unlock;
> +				}
> +			}
The same style issue like above. Maybe we can abstract it in a function?
> +
> +			write_sem = 1;
>   
>   			ret = ocfs2_prepare_inode_for_refcount(inode,
> +							       di_bh,
>   							       file,
>   							       pos,
> -							       count,
> -							       &meta_level);
> +							       count);
>   		}
>   
>   		if (ret < 0) {
> -			mlog_errno(ret);
> -			goto out_unlock;
> +			if (ret != -EAGAIN)
> +				mlog_errno(ret);
> +			goto out_up_sem;
>   		}
>   
>   		break;
>   	}
>   
> +out_up_sem:
> +	if (write_sem)
> +		up_write(&OCFS2_I(inode)->ip_alloc_sem);
> +	else
> +		up_read(&OCFS2_I(inode)->ip_alloc_sem);

Where were inode cluster lock unlocked?

Thanks,

Junxiao.

>   out_unlock:
>   	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
>   					    pos, count, wait);
sunnyZhang Sept. 12, 2019, 2:01 a.m. UTC | #2
Hi Junxiao,

Thank you for review.

在 2019/9/12 上午6:30, Junxiao Bi 写道:
> Hi Shuning,
>
> See comments inlined.
>
> On 9/11/19 2:58 AM, Shuning Zhang wrote:
>> When the extent tree is modified, it shuold be protected by
>> inode cluster lock and ip_alloc_sem.
>>
>> The extent tree is accessed and modified in the 
>> ocfs2_prepare_inode_for_write,
>> but isn't protected by ip_alloc_sem.
>>
>> The following is a case. The function ocfs2_fiemap is accessing
>> the extent tree, which is modified at the same time.
>>
>> [47145.974472] kernel BUG at fs/ocfs2/extent_map.c:475!
>> [47145.974480] invalid opcode: 0000 [#1] SMP
>> [47145.974489] Modules linked in: tun ocfs2 ocfs2_nodemanager configfs
>> ocfs2_stackglue xen_netback xen_blkback xen_gntalloc xen_gntdev 
>> xen_evtchn
>> xenfs xen_privcmd vfat fat bnx2fc fcoe libfcoe libfc 
>> scsi_transport_fc sunrpc
>> bridge 8021q mrp garp stp llc ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad
>> ib_core ib_addr dm_round_robin dm_multipath sg pcspkr raid1 shpchp
>> ipmi_devintf ipmi_msghandler ext4 jbd2 mbcache2 sd_mod nvme nvme_core 
>> bnxt_en
>> xhci_pci xhci_hcd crc32c_intel be2iscsi bnx2i cnic uio cxgb4i cxgb4 
>> cxgb3i
>> libcxgbi ipv6 cxgb3 mdio qla4xxx wmi dm_mirror dm_region_hash dm_log 
>> dm_mod
>> iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi iscsi_ibft
>> iscsi_boot_sysfs
>> [47145.974636] CPU: 16 PID: 14047 Comm: o2info Not tainted
>> 4.1.12-124.23.1.el6uek.x86_64 #2
>> [47145.974646] Hardware name: Oracle Corporation ORACLE SERVER 
>> X7-2L/ASM, MB
>> MECH, X7-2L, BIOS 42040600 10/19/2018
>> [47145.974658] task: ffff88019487e200 ti: ffff88003daa4000 task.ti:
>> ffff88003daa4000
>> [47145.974667] RIP: e030:[<ffffffffa05e4840>] [<ffffffffa05e4840>]
>> ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
>> [47145.974708] RSP: e02b:ffff88003daa7d88  EFLAGS: 00010287
>> [47145.974713] RAX: 00000000000000de RBX: ffff8801d1104030 RCX:
>> ffff8801d1104e10
>> [47145.974719] RDX: 00000000000000de RSI: 000000000009ec40 RDI:
>> ffff8801d1104e24
>> [47145.974725] RBP: ffff88003daa7df8 R08: ffff88003daa7e38 R09:
>> 0000000000000000
>> [47145.974732] R10: 000000000009ec3f R11: 0000000000000246 R12:
>> 000000000009ec3f
>> [47145.974739] R13: ffff88004c419000 R14: 0000000000000002 R15:
>> ffff88003daa7e28
>> [47145.974754] FS:  00007fdbccc92720(0000) GS:ffff880358800000(0000)
>> knlGS:ffff880358800000
>> [47145.974764] CS:  e033 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [47145.974772] CR2: 00007fd5dfcd8350 CR3: 0000000208677000 CR4:
>> 0000000000042660
>> [47145.974785] Stack:
>> [47145.974790]  ffff88003daa7df8 00002000a05e249b ffff8801d1104000
>> ffff88003daa7e2c
>> [47145.974802]  ffff88003daa7e38 ffff88000cc484c0 ffff880145f5b478
>> 0000000000000000
>> [47145.974811]  0000000000002000 ffff88000cc484c0 ffff88003daa7ea0
>> 0000000000000000
>> [47145.974820] Call Trace:
>> [47145.974837]  [<ffffffffa05e53e3>] ocfs2_fiemap+0x1e3/0x430 [ocfs2]
>> [47145.974848]  [<ffffffff816f644f>] ? 
>> xen_hypervisor_callback+0x7f/0x120
>> [47145.974855]  [<ffffffff816f6448>] ? 
>> xen_hypervisor_callback+0x78/0x120
>> [47145.974861]  [<ffffffff816f64a3>] ? 
>> xen_hypervisor_callback+0xd3/0x120
>> [47145.974872]  [<ffffffff81220905>] do_vfs_ioctl+0x155/0x510
>> [47145.974878]  [<ffffffff81220d41>] SyS_ioctl+0x81/0xa0
>> [47145.974885]  [<ffffffff816f1b9f>] ? 
>> system_call_after_swapgs+0xe9/0x190
>> [47145.974891]  [<ffffffff816f1b98>] ? 
>> system_call_after_swapgs+0xe2/0x190
>> [47145.974899]  [<ffffffff816f1b91>] ? 
>> system_call_after_swapgs+0xdb/0x190
>> [47145.974905]  [<ffffffff816f1c5e>] system_call_fastpath+0x18/0xd8
>> [47145.974910] Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff ff 48 
>> 8b 4a 40
>> 48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff 0f 1f 
>> 00 <0f>
>> 0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
>> [47145.975000] RIP  [<ffffffffa05e4840>]
>> ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
>> [47145.975018]  RSP <ffff88003daa7d88>
>> [47145.989999] ---[ end trace c8aa0c8180e869dc ]---
>> [47146.087579] Kernel panic - not syncing: Fatal exception
>> [47146.087691] Kernel Offset: disabled
>>
>> Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
>> ---
>>   fs/ocfs2/file.c | 79 
>> +++++++++++++++++++++++++++++++++++++++------------------
>>   1 file changed, 54 insertions(+), 25 deletions(-)
>>
>> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
>> index 4435df3..85cddd2 100644
>> --- a/fs/ocfs2/file.c
>> +++ b/fs/ocfs2/file.c
>> @@ -2093,29 +2093,19 @@ static int ocfs2_is_io_unaligned(struct inode 
>> *inode, size_t count, loff_t pos)
>>   }
>>     static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
>> +                        struct buffer_head *di_bh,
>>                           struct file *file,
>> -                        loff_t pos, size_t count,
>> -                        int *meta_level)
>> +                        loff_t pos, size_t count)
>>   {
>>       int ret;
>> -    struct buffer_head *di_bh = NULL;
>>       u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
>>       u32 clusters =
>>           ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
>>   -    ret = ocfs2_inode_lock(inode, &di_bh, 1);
>> -    if (ret) {
>> -        mlog_errno(ret);
>> -        goto out;
>> -    }
>> -
>> -    *meta_level = 1;
>> -
>>       ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
>>       if (ret)
>>           mlog_errno(ret);
>> -out:
>> -    brelse(di_bh);
>> +
>>       return ret;
>>   }
> Since you moved out cluster from ocfs2_prepare_inode_for_refcount() , 
> only ocfs2_refcount_cow left, can we go even further to remove it?
>
That is a good idea. I will have a try.
>>   @@ -2123,6 +2113,7 @@ static int 
>> ocfs2_prepare_inode_for_write(struct file *file,
>>                        loff_t pos, size_t count, int wait)
>>   {
>>       int ret = 0, meta_level = 0, overwrite_io = 0;
>> +    int write_sem = 0;
>>       struct dentry *dentry = file->f_path.dentry;
>>       struct inode *inode = d_inode(dentry);
>>       struct buffer_head *di_bh = NULL;
>> @@ -2145,25 +2136,30 @@ static int 
>> ocfs2_prepare_inode_for_write(struct file *file,
>>               goto out;
>>           }
>>   +        if (wait)
>> +            down_read(&OCFS2_I(inode)->ip_alloc_sem);
>> +        else {
>> +            ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
>> +            if (!ret) {
>> +                ret = -EAGAIN;
>> +                goto out_unlock;
>> +            }
>> +        }
>
> can we do it like the following?
>
> if(wait) {
>
>     lock(inode_lock);
>
>     lock(ip_alloc_sem);
>
> } else {
>
>     try_lock();
>
> }
>
Is that to make the code concise?

If using this mode, the code will be more redundant. That is because we 
should

check the retrun value of each function.


if(wait) {
     ret = lock(inode_lock);
if (!ret) {

     }
lock(ip_alloc_sem);
} else {
     ret = try_lock(inode_lock);

if (!ret) {

     }


     ret = try_lock(ip_alloc_sem);

if (!ret) {

     }

}

>> +
>>           /*
>>            * Check if IO will overwrite allocated blocks in case
>>            * IOCB_NOWAIT flag is set.
>>            */
>>           if (!wait && !overwrite_io) {
>>               overwrite_io = 1;
>> -            if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
>> -                ret = -EAGAIN;
>> -                goto out_unlock;
>> -            }
>>                 ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
>>               brelse(di_bh);
>>               di_bh = NULL;
>> -            up_read(&OCFS2_I(inode)->ip_alloc_sem);
>>               if (ret < 0) {
>>                   if (ret != -EAGAIN)
>>                       mlog_errno(ret);
>> -                goto out_unlock;
>> +                goto out_up_sem;
>>               }
>>           }
>>   @@ -2178,6 +2174,7 @@ static int 
>> ocfs2_prepare_inode_for_write(struct file *file,
>>            * set inode->i_size at the end of a write. */
>>           if (should_remove_suid(dentry)) {
>>               if (meta_level == 0) {
>> +                up_read(&OCFS2_I(inode)->ip_alloc_sem);
>>                   ocfs2_inode_unlock(inode, meta_level);
>>                   meta_level = 1;
>>                   continue;
>> @@ -2186,7 +2183,7 @@ static int ocfs2_prepare_inode_for_write(struct 
>> file *file,
>>               ret = ocfs2_write_remove_suid(inode);
>>               if (ret < 0) {
>>                   mlog_errno(ret);
>> -                goto out_unlock;
>> +                goto out_up_sem;
>>               }
>>           }
>>   @@ -2194,24 +2191,56 @@ static int 
>> ocfs2_prepare_inode_for_write(struct file *file,
>>             ret = ocfs2_check_range_for_refcount(inode, pos, count);
>>           if (ret == 1) {
>> +            up_read(&OCFS2_I(inode)->ip_alloc_sem);
>>               ocfs2_inode_unlock(inode, meta_level);
>> -            meta_level = -1;
>> +            brelse(di_bh);
>> +            di_bh = NULL;
>> +
>> +            if (wait)
>> +                ret = ocfs2_inode_lock(inode, &di_bh, 1);
>> +            else
>> +                ret = ocfs2_try_inode_lock(inode, &di_bh, 1);
> You seemed miss call brelse(di_bh)? That will cause buffer_head leak. 
> ocfs2_assign_bh() get it.
The function brelse is at end of ocfs2_prepare_inode_for_write.
>> +            if (ret) {
>> +                if (ret != -EAGAIN)
>> +                    mlog_errno(ret);
>> +                goto out;
>> +            }
>> +
>> +            meta_level = 1;
>> +
>> +            if (wait)
>> + down_write(&OCFS2_I(inode)->ip_alloc_sem);
>> +            else {
>> +                ret = 
>> down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
>> +                if (!ret) {
>> +                    ret = -EAGAIN;
>> +                    goto out_unlock;
>> +                }
>> +            }
> The same style issue like above. Maybe we can abstract it in a function?
Yes, there are some redundancy. I will have a try.
>> +
>> +            write_sem = 1;
>>                 ret = ocfs2_prepare_inode_for_refcount(inode,
>> +                                   di_bh,
>>                                      file,
>>                                      pos,
>> -                                   count,
>> -                                   &meta_level);
>> +                                   count);
>>           }
>>             if (ret < 0) {
>> -            mlog_errno(ret);
>> -            goto out_unlock;
>> +            if (ret != -EAGAIN)
>> +                mlog_errno(ret);
>> +            goto out_up_sem;
>>           }
>>             break;
>>       }
>>   +out_up_sem:
>> +    if (write_sem)
>> +        up_write(&OCFS2_I(inode)->ip_alloc_sem);
>> +    else
>> +        up_read(&OCFS2_I(inode)->ip_alloc_sem);
>
> Where were inode cluster lock unlocked?
>
This lock is unlocked at the end of function.

2250         if (meta_level >= 0)
2251                 ocfs2_inode_unlock(inode, meta_level);

> Thanks,
>
> Junxiao.
>
>>   out_unlock:
>> trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
>>                           pos, count, wait);
diff mbox series

Patch

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4435df3..85cddd2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2093,29 +2093,19 @@  static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
 }
 
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+					    struct buffer_head *di_bh,
 					    struct file *file,
-					    loff_t pos, size_t count,
-					    int *meta_level)
+					    loff_t pos, size_t count)
 {
 	int ret;
-	struct buffer_head *di_bh = NULL;
 	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 	u32 clusters =
 		ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
 
-	ret = ocfs2_inode_lock(inode, &di_bh, 1);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	*meta_level = 1;
-
 	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
 	if (ret)
 		mlog_errno(ret);
-out:
-	brelse(di_bh);
+
 	return ret;
 }
 
@@ -2123,6 +2113,7 @@  static int ocfs2_prepare_inode_for_write(struct file *file,
 					 loff_t pos, size_t count, int wait)
 {
 	int ret = 0, meta_level = 0, overwrite_io = 0;
+	int write_sem = 0;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
 	struct buffer_head *di_bh = NULL;
@@ -2145,25 +2136,30 @@  static int ocfs2_prepare_inode_for_write(struct file *file,
 			goto out;
 		}
 
+		if (wait)
+			down_read(&OCFS2_I(inode)->ip_alloc_sem);
+		else {
+			ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+			if (!ret) {
+				ret = -EAGAIN;
+				goto out_unlock;
+			}
+		}
+
 		/*
 		 * Check if IO will overwrite allocated blocks in case
 		 * IOCB_NOWAIT flag is set.
 		 */
 		if (!wait && !overwrite_io) {
 			overwrite_io = 1;
-			if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
-				ret = -EAGAIN;
-				goto out_unlock;
-			}
 
 			ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
 			brelse(di_bh);
 			di_bh = NULL;
-			up_read(&OCFS2_I(inode)->ip_alloc_sem);
 			if (ret < 0) {
 				if (ret != -EAGAIN)
 					mlog_errno(ret);
-				goto out_unlock;
+				goto out_up_sem;
 			}
 		}
 
@@ -2178,6 +2174,7 @@  static int ocfs2_prepare_inode_for_write(struct file *file,
 		 * set inode->i_size at the end of a write. */
 		if (should_remove_suid(dentry)) {
 			if (meta_level == 0) {
+				up_read(&OCFS2_I(inode)->ip_alloc_sem);
 				ocfs2_inode_unlock(inode, meta_level);
 				meta_level = 1;
 				continue;
@@ -2186,7 +2183,7 @@  static int ocfs2_prepare_inode_for_write(struct file *file,
 			ret = ocfs2_write_remove_suid(inode);
 			if (ret < 0) {
 				mlog_errno(ret);
-				goto out_unlock;
+				goto out_up_sem;
 			}
 		}
 
@@ -2194,24 +2191,56 @@  static int ocfs2_prepare_inode_for_write(struct file *file,
 
 		ret = ocfs2_check_range_for_refcount(inode, pos, count);
 		if (ret == 1) {
+			up_read(&OCFS2_I(inode)->ip_alloc_sem);
 			ocfs2_inode_unlock(inode, meta_level);
-			meta_level = -1;
+			brelse(di_bh);
+			di_bh = NULL;
+
+			if (wait)
+				ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			else
+				ret = ocfs2_try_inode_lock(inode, &di_bh, 1);
+			if (ret) {
+				if (ret != -EAGAIN)
+					mlog_errno(ret);
+				goto out;
+			}
+
+			meta_level = 1;
+
+			if (wait)
+				down_write(&OCFS2_I(inode)->ip_alloc_sem);
+			else {
+				ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+				if (!ret) {
+					ret = -EAGAIN;
+					goto out_unlock;
+				}
+			}
+
+			write_sem = 1;
 
 			ret = ocfs2_prepare_inode_for_refcount(inode,
+							       di_bh,
 							       file,
 							       pos,
-							       count,
-							       &meta_level);
+							       count);
 		}
 
 		if (ret < 0) {
-			mlog_errno(ret);
-			goto out_unlock;
+			if (ret != -EAGAIN)
+				mlog_errno(ret);
+			goto out_up_sem;
 		}
 
 		break;
 	}
 
+out_up_sem:
+	if (write_sem)
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	else
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
 out_unlock:
 	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
 					    pos, count, wait);