diff mbox series

ocfs2: reflink deadlock when clone file to the same directory simultaneously

Message ID 20210729110230.18983-1-ghe@suse.com (mailing list archive)
State New, archived
Headers show
Series ocfs2: reflink deadlock when clone file to the same directory simultaneously | expand

Commit Message

Gang He July 29, 2021, 11:02 a.m. UTC
Running reflink from multiple nodes simultaneously to clone a file
to the same directory probably triggers a deadlock issue.
For example, there is a three node ocfs2 cluster, each node mounts
the ocfs2 file system to /mnt/shared, and run the reflink command
from each node repeatedly, like
  reflink "/mnt/shared/test" \
  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
then, reflink command process will be hung on each node, and you
can't list this file system directory.
The problematic reflink command process is blocked at one node,
task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
Call Trace:
  __schedule+0x2fd/0x750
  schedule+0x2f/0xa0
  schedule_timeout+0x1cc/0x310
  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
  ? 0xffffffffc0e3e000
  wait_for_completion+0xba/0x140
  ? wake_up_q+0xa0/0xa0
  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
  ocfs2_reflink+0x436/0x4c0 [ocfs2]
  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_ioctl+0x25e/0x670 [ocfs2]
  do_vfs_ioctl+0xa0/0x680
  ksys_ioctl+0x70/0x80
  __x64_sys_ioctl+0x16/0x20
  do_syscall_64+0x5b/0x1e0
The other reflink command processes are blocked at other nodes,
task:reflink         state:D stack:    0 pid:29759 ppid:  4088
Call Trace:
  __schedule+0x2fd/0x750
  schedule+0x2f/0xa0
  schedule_timeout+0x1cc/0x310
  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
  ? 0xffffffffc0b19000
  wait_for_completion+0xba/0x140
  ? wake_up_q+0xa0/0xa0
  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
  ocfs2_reflink+0x335/0x4c0 [ocfs2]
  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_ioctl+0x25e/0x670 [ocfs2]
  do_vfs_ioctl+0xa0/0x680
  ksys_ioctl+0x70/0x80
  __x64_sys_ioctl+0x16/0x20
  do_syscall_64+0x5b/0x1e0
or
task:reflink         state:D stack:    0 pid:18465 ppid:  4156
Call Trace:
  __schedule+0x302/0x940
  ? usleep_range+0x80/0x80
  schedule+0x46/0xb0
  schedule_timeout+0xff/0x140
  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
  ? 0xffffffffc0c3b000
  __wait_for_common+0xb9/0x170
  __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
  ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
  ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
  ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
  ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
  ? dput+0x32/0x2f0
  ocfs2_permission+0x45/0xe0 [ocfs2]
  inode_permission+0xcc/0x170
  link_path_walk.part.0.constprop.0+0x2a2/0x380
  ? path_init+0x2c1/0x3f0
  path_parentat+0x3c/0x90
  filename_parentat+0xc1/0x1d0
  ? filename_lookup+0x138/0x1c0
  filename_create+0x43/0x160
  ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
  ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
  ? do_sys_openat2+0x81/0x150
  __x64_sys_ioctl+0x82/0xb0
  do_syscall_64+0x61/0xb0

The deadlock is caused by multiple acquiring the destination directory
inode dlm lock in ocfs2_reflink function, we should acquire this
directory inode dlm lock at the beginning, and hold this dlm lock until
end of the function.

Signed-off-by: Gang He <ghe@suse.com>
---
 fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
 fs/ocfs2/namei.h        |  2 ++
 fs/ocfs2/refcounttree.c | 15 +++++++++++----
 fs/ocfs2/xattr.c        | 12 +-----------
 fs/ocfs2/xattr.h        |  1 +
 5 files changed, 28 insertions(+), 34 deletions(-)

Comments

Wengang Wang July 29, 2021, 10:07 p.m. UTC | #1
Hi Gang,

I’d suggest you list the call paths on the related nodes, Say call path 1 on node one granted lock A and is requesting for lock B, at the same time, path2 on node two granted lock B and now is requesting for lock A.  
With that, the problem would be easier to understand.

thanks,
wengang

> On Jul 29, 2021, at 4:02 AM, Gang He <ghe@suse.com> wrote:
> 
> Running reflink from multiple nodes simultaneously to clone a file
> to the same directory probably triggers a deadlock issue.
> For example, there is a three node ocfs2 cluster, each node mounts
> the ocfs2 file system to /mnt/shared, and run the reflink command
> from each node repeatedly, like
>  reflink "/mnt/shared/test" \
>  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
> then, reflink command process will be hung on each node, and you
> can't list this file system directory.
> The problematic reflink command process is blocked at one node,
> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
> Call Trace:
>  __schedule+0x2fd/0x750
>  schedule+0x2f/0xa0
>  schedule_timeout+0x1cc/0x310
>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>  ? 0xffffffffc0e3e000
>  wait_for_completion+0xba/0x140
>  ? wake_up_q+0xa0/0xa0
>  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>  do_vfs_ioctl+0xa0/0x680
>  ksys_ioctl+0x70/0x80
>  __x64_sys_ioctl+0x16/0x20
>  do_syscall_64+0x5b/0x1e0
> The other reflink command processes are blocked at other nodes,
> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
> Call Trace:
>  __schedule+0x2fd/0x750
>  schedule+0x2f/0xa0
>  schedule_timeout+0x1cc/0x310
>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>  ? 0xffffffffc0b19000
>  wait_for_completion+0xba/0x140
>  ? wake_up_q+0xa0/0xa0
>  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>  ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>  ocfs2_reflink+0x335/0x4c0 [ocfs2]
>  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>  do_vfs_ioctl+0xa0/0x680
>  ksys_ioctl+0x70/0x80
>  __x64_sys_ioctl+0x16/0x20
>  do_syscall_64+0x5b/0x1e0
> or
> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
> Call Trace:
>  __schedule+0x302/0x940
>  ? usleep_range+0x80/0x80
>  schedule+0x46/0xb0
>  schedule_timeout+0xff/0x140
>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>  ? 0xffffffffc0c3b000
>  __wait_for_common+0xb9/0x170
>  __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>  ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>  ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>  ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>  ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>  ? dput+0x32/0x2f0
>  ocfs2_permission+0x45/0xe0 [ocfs2]
>  inode_permission+0xcc/0x170
>  link_path_walk.part.0.constprop.0+0x2a2/0x380
>  ? path_init+0x2c1/0x3f0
>  path_parentat+0x3c/0x90
>  filename_parentat+0xc1/0x1d0
>  ? filename_lookup+0x138/0x1c0
>  filename_create+0x43/0x160
>  ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>  ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>  ? do_sys_openat2+0x81/0x150
>  __x64_sys_ioctl+0x82/0xb0
>  do_syscall_64+0x61/0xb0
> 
> The deadlock is caused by multiple acquiring the destination directory
> inode dlm lock in ocfs2_reflink function, we should acquire this
> directory inode dlm lock at the beginning, and hold this dlm lock until
> end of the function.
> 
> Signed-off-by: Gang He <ghe@suse.com>
> ---
> fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
> fs/ocfs2/namei.h        |  2 ++
> fs/ocfs2/refcounttree.c | 15 +++++++++++----
> fs/ocfs2/xattr.c        | 12 +-----------
> fs/ocfs2/xattr.h        |  1 +
> 5 files changed, 28 insertions(+), 34 deletions(-)
> 
> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
> index 2c46ff6ba4ea..f8bbb22cc60b 100644
> --- a/fs/ocfs2/namei.c
> +++ b/fs/ocfs2/namei.c
> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
> }
> 
> int ocfs2_create_inode_in_orphan(struct inode *dir,
> +				 struct buffer_head **dir_bh,
> 				 int mode,
> 				 struct inode **new_inode)
> {
> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
> 
> 	brelse(new_di_bh);
> 
> -	if (!status)
> -		*new_inode = inode;
> -
> 	ocfs2_free_dir_lookup_result(&orphan_insert);
> 
> -	ocfs2_inode_unlock(dir, 1);
> -	brelse(parent_di_bh);
> +	if (!status) {
> +		*new_inode = inode;
> +		*dir_bh = parent_di_bh;
> +	} else {
> +		ocfs2_inode_unlock(dir, 1);
> +		brelse(parent_di_bh);
> +	}
> +
> 	return status;
> }
> 
> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
> }
> 
> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> +				   struct buffer_head *dir_bh,
> 				   struct inode *inode,
> 				   struct dentry *dentry)
> {
> 	int status = 0;
> -	struct buffer_head *parent_di_bh = NULL;
> 	handle_t *handle = NULL;
> 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
> 	struct ocfs2_dinode *dir_di, *di;
> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
> 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
> 
> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
> -	if (status < 0) {
> -		if (status != -ENOENT)
> -			mlog_errno(status);
> -		return status;
> -	}
> -
> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
> 	if (!dir_di->i_links_count) {
> 		/* can't make a file in a deleted directory. */
> 		status = -ENOENT;
> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> 		goto leave;
> 
> 	/* get a spot inside the dir. */
> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
> 					      dentry->d_name.name,
> 					      dentry->d_name.len, &lookup);
> 	if (status < 0) {
> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> 	ocfs2_journal_dirty(handle, di_bh);
> 
> 	status = ocfs2_add_entry(handle, dentry, inode,
> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
> 				 &lookup);
> 	if (status < 0) {
> 		mlog_errno(status);
> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> 	iput(orphan_dir_inode);
> leave:
> 
> -	ocfs2_inode_unlock(dir, 1);
> -
> 	brelse(di_bh);
> -	brelse(parent_di_bh);
> 	brelse(orphan_dir_bh);
> 
> 	ocfs2_free_dir_lookup_result(&lookup);
> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
> index 9cc891eb874e..03a2c526e2c1 100644
> --- a/fs/ocfs2/namei.h
> +++ b/fs/ocfs2/namei.h
> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
> 		     struct buffer_head *orphan_dir_bh,
> 		     bool dio);
> int ocfs2_create_inode_in_orphan(struct inode *dir,
> +				 struct buffer_head **dir_bh,
> 				 int mode,
> 				 struct inode **new_inode);
> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
> 		struct inode *inode, struct buffer_head *di_bh,
> 		int update_isize, loff_t end);
> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> +				   struct buffer_head *dir_bh,
> 				   struct inode *new_inode,
> 				   struct dentry *new_dentry);
> 
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index 7f6355cbb587..a9a0c7c37e8e 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
> {
> 	int error, had_lock;
> 	struct inode *inode = d_inode(old_dentry);
> -	struct buffer_head *old_bh = NULL;
> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
> 	struct inode *new_orphan_inode = NULL;
> 	struct ocfs2_lock_holder oh;
> 
> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
> 		return -EOPNOTSUPP;
> 
> 
> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
> 					     &new_orphan_inode);
> 	if (error) {
> 		mlog_errno(error);
> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
> 
> 	/* If the security isn't preserved, we need to re-initialize them. */
> 	if (!preserve) {
> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
> +						    new_orphan_inode,
> 						    &new_dentry->d_name);
> 		if (error)
> 			mlog_errno(error);
> 	}
> 	if (!error) {
> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
> +						       new_orphan_inode,
> 						       new_dentry);
> 		if (error)
> 			mlog_errno(error);
> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
> 			iput(new_orphan_inode);
> 	}
> 
> +	if (dir_bh) {
> +		ocfs2_inode_unlock(dir, 1);
> +		brelse(dir_bh);
> +	}
> +
> 	return error;
> }
> 
> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
> index dd784eb0cd7c..3f23e3a5018c 100644
> --- a/fs/ocfs2/xattr.c
> +++ b/fs/ocfs2/xattr.c
> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
> /*
>  * Initialize security and acl for a already created inode.
>  * Used for reflink a non-preserve-security file.
> - *
> - * It uses common api like ocfs2_xattr_set, so the caller
> - * must not hold any lock expect i_mutex.
>  */
> int ocfs2_init_security_and_acl(struct inode *dir,
> +				struct buffer_head *dir_bh,
> 				struct inode *inode,
> 				const struct qstr *qstr)
> {
> 	int ret = 0;
> -	struct buffer_head *dir_bh = NULL;
> 
> 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
> 	if (ret) {
> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
> 		goto leave;
> 	}
> 
> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
> -	if (ret) {
> -		mlog_errno(ret);
> -		goto leave;
> -	}
> 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
> 	if (ret)
> 		mlog_errno(ret);
> 
> -	ocfs2_inode_unlock(dir, 0);
> -	brelse(dir_bh);
> leave:
> 	return ret;
> }
> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
> index 00308b57f64f..b27fd8ba0019 100644
> --- a/fs/ocfs2/xattr.h
> +++ b/fs/ocfs2/xattr.h
> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
> 			 struct buffer_head *new_bh,
> 			 bool preserve_security);
> int ocfs2_init_security_and_acl(struct inode *dir,
> +				struct buffer_head *dir_bh,
> 				struct inode *inode,
> 				const struct qstr *qstr);
> #endif /* OCFS2_XATTR_H */
> -- 
> 2.21.0
> 
> 
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Gang He July 30, 2021, 6:16 a.m. UTC | #2
Hello Wengang and all,

This issue can be reproduced stably when you run the below reflink 
command line(maybe you also can follow a "rm this file" command line and 
  sleep some usecs) from each node repeatedly for a while.
Based on my observation, the reflink processes are always blocked at the 
below points.
 From dlm_tool output and crash analysis, the node1 has acquired 
.snapshots directory inode EX dlm lock, but the reflink process is 
blocked at ocfs2_init_security_and_acl+0xbe/0x1d0 to acqure it's inode 
dlm lock again.
On the other two nodes, the reflink processes are blocked at acquire 
.snapshots directory inode dlm lock, then the whole file system is hung,
you can not list this file again.

The problem looks like acquiring the destination direcory multiple 
during ocfs2_reflink, dlm glue layer cannot downconvert lock in some case.
e.g.
kernel: (ocfs2dc-F50B203,1593,0):ocfs2_downconvert_lock:3674 ERROR: DLM 
error -16 while calling ocfs2_dlm_lock on resource 
M000000000000000004661c00000000
kernel: (ocfs2dc-F50B203,1593,0):ocfs2_unblock_lock:3918 ERROR: status = -16
kernel: (ocfs2dc-F50B203,1593,0):ocfs2_process_blocked_lock:4317 ERROR: 
status = -16

Then, I change the code to acquire this destination direcory dlm lock, 
and hold the lock until the end of ocfs2_reflink function.
After this change, I did not encounter this hang problem again after 
lots of testing. Second, I find the code change also improve reflink 
performance, since the code avoids the previous ping-pong effect.

Thanks
Gang


On 2021/7/30 6:07, Wengang Wang wrote:
> Hi Gang,
> 
> I’d suggest you list the call paths on the related nodes, Say call path 1 on node one granted lock A and is requesting for lock B, at the same time, path2 on node two granted lock B and now is requesting for lock A.
> With that, the problem would be easier to understand.
> 
> thanks,
> wengang
> 
>> On Jul 29, 2021, at 4:02 AM, Gang He <ghe@suse.com> wrote:
>>
>> Running reflink from multiple nodes simultaneously to clone a file
>> to the same directory probably triggers a deadlock issue.
>> For example, there is a three node ocfs2 cluster, each node mounts
>> the ocfs2 file system to /mnt/shared, and run the reflink command
>> from each node repeatedly, like
>>   reflink "/mnt/shared/test" \
>>   "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>> then, reflink command process will be hung on each node, and you
>> can't list this file system directory.
>> The problematic reflink command process is blocked at one node,
>> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
>> Call Trace:
>>   __schedule+0x2fd/0x750
>>   schedule+0x2f/0xa0
>>   schedule_timeout+0x1cc/0x310
>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>   ? 0xffffffffc0e3e000
>>   wait_for_completion+0xba/0x140
>>   ? wake_up_q+0xa0/0xa0
>>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>   ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>   ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>   do_vfs_ioctl+0xa0/0x680
>>   ksys_ioctl+0x70/0x80
>>   __x64_sys_ioctl+0x16/0x20
>>   do_syscall_64+0x5b/0x1e0
>> The other reflink command processes are blocked at other nodes,
>> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
>> Call Trace:
>>   __schedule+0x2fd/0x750
>>   schedule+0x2f/0xa0
>>   schedule_timeout+0x1cc/0x310
>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>   ? 0xffffffffc0b19000
>>   wait_for_completion+0xba/0x140
>>   ? wake_up_q+0xa0/0xa0
>>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>   ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>>   ocfs2_reflink+0x335/0x4c0 [ocfs2]
>>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>   do_vfs_ioctl+0xa0/0x680
>>   ksys_ioctl+0x70/0x80
>>   __x64_sys_ioctl+0x16/0x20
>>   do_syscall_64+0x5b/0x1e0
>> or
>> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
>> Call Trace:
>>   __schedule+0x302/0x940
>>   ? usleep_range+0x80/0x80
>>   schedule+0x46/0xb0
>>   schedule_timeout+0xff/0x140
>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>   ? 0xffffffffc0c3b000
>>   __wait_for_common+0xb9/0x170
>>   __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>>   ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>>   ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>   ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>   ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>>   ? dput+0x32/0x2f0
>>   ocfs2_permission+0x45/0xe0 [ocfs2]
>>   inode_permission+0xcc/0x170
>>   link_path_walk.part.0.constprop.0+0x2a2/0x380
>>   ? path_init+0x2c1/0x3f0
>>   path_parentat+0x3c/0x90
>>   filename_parentat+0xc1/0x1d0
>>   ? filename_lookup+0x138/0x1c0
>>   filename_create+0x43/0x160
>>   ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>>   ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>>   ? do_sys_openat2+0x81/0x150
>>   __x64_sys_ioctl+0x82/0xb0
>>   do_syscall_64+0x61/0xb0
>>
>> The deadlock is caused by multiple acquiring the destination directory
>> inode dlm lock in ocfs2_reflink function, we should acquire this
>> directory inode dlm lock at the beginning, and hold this dlm lock until
>> end of the function.
>>
>> Signed-off-by: Gang He <ghe@suse.com>
>> ---
>> fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
>> fs/ocfs2/namei.h        |  2 ++
>> fs/ocfs2/refcounttree.c | 15 +++++++++++----
>> fs/ocfs2/xattr.c        | 12 +-----------
>> fs/ocfs2/xattr.h        |  1 +
>> 5 files changed, 28 insertions(+), 34 deletions(-)
>>
>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
>> index 2c46ff6ba4ea..f8bbb22cc60b 100644
>> --- a/fs/ocfs2/namei.c
>> +++ b/fs/ocfs2/namei.c
>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
>> }
>>
>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>> +				 struct buffer_head **dir_bh,
>> 				 int mode,
>> 				 struct inode **new_inode)
>> {
>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
>>
>> 	brelse(new_di_bh);
>>
>> -	if (!status)
>> -		*new_inode = inode;
>> -
>> 	ocfs2_free_dir_lookup_result(&orphan_insert);
>>
>> -	ocfs2_inode_unlock(dir, 1);
>> -	brelse(parent_di_bh);
>> +	if (!status) {
>> +		*new_inode = inode;
>> +		*dir_bh = parent_di_bh;
>> +	} else {
>> +		ocfs2_inode_unlock(dir, 1);
>> +		brelse(parent_di_bh);
>> +	}
>> +
>> 	return status;
>> }
>>
>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>> }
>>
>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> +				   struct buffer_head *dir_bh,
>> 				   struct inode *inode,
>> 				   struct dentry *dentry)
>> {
>> 	int status = 0;
>> -	struct buffer_head *parent_di_bh = NULL;
>> 	handle_t *handle = NULL;
>> 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
>> 	struct ocfs2_dinode *dir_di, *di;
>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
>> 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
>>
>> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
>> -	if (status < 0) {
>> -		if (status != -ENOENT)
>> -			mlog_errno(status);
>> -		return status;
>> -	}
>> -
>> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
>> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
>> 	if (!dir_di->i_links_count) {
>> 		/* can't make a file in a deleted directory. */
>> 		status = -ENOENT;
>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> 		goto leave;
>>
>> 	/* get a spot inside the dir. */
>> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
>> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
>> 					      dentry->d_name.name,
>> 					      dentry->d_name.len, &lookup);
>> 	if (status < 0) {
>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> 	ocfs2_journal_dirty(handle, di_bh);
>>
>> 	status = ocfs2_add_entry(handle, dentry, inode,
>> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
>> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
>> 				 &lookup);
>> 	if (status < 0) {
>> 		mlog_errno(status);
>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> 	iput(orphan_dir_inode);
>> leave:
>>
>> -	ocfs2_inode_unlock(dir, 1);
>> -
>> 	brelse(di_bh);
>> -	brelse(parent_di_bh);
>> 	brelse(orphan_dir_bh);
>>
>> 	ocfs2_free_dir_lookup_result(&lookup);
>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
>> index 9cc891eb874e..03a2c526e2c1 100644
>> --- a/fs/ocfs2/namei.h
>> +++ b/fs/ocfs2/namei.h
>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
>> 		     struct buffer_head *orphan_dir_bh,
>> 		     bool dio);
>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>> +				 struct buffer_head **dir_bh,
>> 				 int mode,
>> 				 struct inode **new_inode);
>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>> 		struct inode *inode, struct buffer_head *di_bh,
>> 		int update_isize, loff_t end);
>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> +				   struct buffer_head *dir_bh,
>> 				   struct inode *new_inode,
>> 				   struct dentry *new_dentry);
>>
>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
>> index 7f6355cbb587..a9a0c7c37e8e 100644
>> --- a/fs/ocfs2/refcounttree.c
>> +++ b/fs/ocfs2/refcounttree.c
>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>> {
>> 	int error, had_lock;
>> 	struct inode *inode = d_inode(old_dentry);
>> -	struct buffer_head *old_bh = NULL;
>> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
>> 	struct inode *new_orphan_inode = NULL;
>> 	struct ocfs2_lock_holder oh;
>>
>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>> 		return -EOPNOTSUPP;
>>
>>
>> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
>> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
>> 					     &new_orphan_inode);
>> 	if (error) {
>> 		mlog_errno(error);
>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>
>> 	/* If the security isn't preserved, we need to re-initialize them. */
>> 	if (!preserve) {
>> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
>> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
>> +						    new_orphan_inode,
>> 						    &new_dentry->d_name);
>> 		if (error)
>> 			mlog_errno(error);
>> 	}
>> 	if (!error) {
>> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
>> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
>> +						       new_orphan_inode,
>> 						       new_dentry);
>> 		if (error)
>> 			mlog_errno(error);
>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>> 			iput(new_orphan_inode);
>> 	}
>>
>> +	if (dir_bh) {
>> +		ocfs2_inode_unlock(dir, 1);
>> +		brelse(dir_bh);
>> +	}
>> +
>> 	return error;
>> }
>>
>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
>> index dd784eb0cd7c..3f23e3a5018c 100644
>> --- a/fs/ocfs2/xattr.c
>> +++ b/fs/ocfs2/xattr.c
>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>> /*
>>   * Initialize security and acl for a already created inode.
>>   * Used for reflink a non-preserve-security file.
>> - *
>> - * It uses common api like ocfs2_xattr_set, so the caller
>> - * must not hold any lock expect i_mutex.
>>   */
>> int ocfs2_init_security_and_acl(struct inode *dir,
>> +				struct buffer_head *dir_bh,
>> 				struct inode *inode,
>> 				const struct qstr *qstr)
>> {
>> 	int ret = 0;
>> -	struct buffer_head *dir_bh = NULL;
>>
>> 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
>> 	if (ret) {
>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
>> 		goto leave;
>> 	}
>>
>> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
>> -	if (ret) {
>> -		mlog_errno(ret);
>> -		goto leave;
>> -	}
>> 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
>> 	if (ret)
>> 		mlog_errno(ret);
>>
>> -	ocfs2_inode_unlock(dir, 0);
>> -	brelse(dir_bh);
>> leave:
>> 	return ret;
>> }
>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
>> index 00308b57f64f..b27fd8ba0019 100644
>> --- a/fs/ocfs2/xattr.h
>> +++ b/fs/ocfs2/xattr.h
>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>> 			 struct buffer_head *new_bh,
>> 			 bool preserve_security);
>> int ocfs2_init_security_and_acl(struct inode *dir,
>> +				struct buffer_head *dir_bh,
>> 				struct inode *inode,
>> 				const struct qstr *qstr);
>> #endif /* OCFS2_XATTR_H */
>> -- 
>> 2.21.0
>>
>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel@oss.oracle.com
>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Wengang Wang July 30, 2021, 4:35 p.m. UTC | #3
Hi Gang,

> On Jul 29, 2021, at 11:16 PM, Gang He <ghe@suse.com> wrote:
> 
> Hello Wengang and all,
> 
> This issue can be reproduced stably when you run the below reflink command line(maybe you also can follow a "rm this file" command line and  sleep some usecs) from each node repeatedly for a while.
> Based on my observation, the reflink processes are always blocked at the below points.
> From dlm_tool output and crash analysis, the node1 has acquired .snapshots directory inode EX dlm lock, but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0 to acqure it's inode dlm lock again.

So the reflink path on node 1 have .snapshots inode lock granted and is blocking at the new created inode under orphan directory.  BTW, what’s .snapshots directory? What’s the call path to lock that .snapshots inode?

> On the other two nodes, the reflink processes are blocked at acquire .snapshots directory inode dlm lock, then the whole file system is hung,
> you can not list this file again.

So there are reflink paths on the other two nodes blocking at .snapshots inode. But what lock they are granted already?

For a typical ABBA deadlock,
path 1 granted lock A and blocks at lock B
path 2 granted lock B and blocks at lock A

Per your description, I see this:
reflink path on node1  granted .snapshots lock and blocks at new inode lock
reflnk paths on onde2/3 block at .snapshots lock.

I don't see how deadlock formed… the new inode lock is granted to any of the reflink path on node2/3? how?

thanks,
wengang

> 
> The problem looks like acquiring the destination direcory multiple during ocfs2_reflink, dlm glue layer cannot downconvert lock in some case.
> e.g.
> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_downconvert_lock:3674 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M000000000000000004661c00000000
> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_unblock_lock:3918 ERROR: status = -16
> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_process_blocked_lock:4317 ERROR: status = -16
> 
> Then, I change the code to acquire this destination direcory dlm lock, and hold the lock until the end of ocfs2_reflink function.
> After this change, I did not encounter this hang problem again after lots of testing. Second, I find the code change also improve reflink performance, since the code avoids the previous ping-pong effect.
> 
> Thanks
> Gang
> 
> 
> On 2021/7/30 6:07, Wengang Wang wrote:
>> Hi Gang,
>> I’d suggest you list the call paths on the related nodes, Say call path 1 on node one granted lock A and is requesting for lock B, at the same time, path2 on node two granted lock B and now is requesting for lock A.
>> With that, the problem would be easier to understand.
>> thanks,
>> wengang
>>> On Jul 29, 2021, at 4:02 AM, Gang He <ghe@suse.com> wrote:
>>> 
>>> Running reflink from multiple nodes simultaneously to clone a file
>>> to the same directory probably triggers a deadlock issue.
>>> For example, there is a three node ocfs2 cluster, each node mounts
>>> the ocfs2 file system to /mnt/shared, and run the reflink command
>>> from each node repeatedly, like
>>>  reflink "/mnt/shared/test" \
>>>  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>> then, reflink command process will be hung on each node, and you
>>> can't list this file system directory.
>>> The problematic reflink command process is blocked at one node,
>>> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
>>> Call Trace:
>>>  __schedule+0x2fd/0x750
>>>  schedule+0x2f/0xa0
>>>  schedule_timeout+0x1cc/0x310
>>>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>  ? 0xffffffffc0e3e000
>>>  wait_for_completion+0xba/0x140
>>>  ? wake_up_q+0xa0/0xa0
>>>  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>  do_vfs_ioctl+0xa0/0x680
>>>  ksys_ioctl+0x70/0x80
>>>  __x64_sys_ioctl+0x16/0x20
>>>  do_syscall_64+0x5b/0x1e0
>>> The other reflink command processes are blocked at other nodes,
>>> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
>>> Call Trace:
>>>  __schedule+0x2fd/0x750
>>>  schedule+0x2f/0xa0
>>>  schedule_timeout+0x1cc/0x310
>>>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>  ? 0xffffffffc0b19000
>>>  wait_for_completion+0xba/0x140
>>>  ? wake_up_q+0xa0/0xa0
>>>  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>  ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>>>  ocfs2_reflink+0x335/0x4c0 [ocfs2]
>>>  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>  do_vfs_ioctl+0xa0/0x680
>>>  ksys_ioctl+0x70/0x80
>>>  __x64_sys_ioctl+0x16/0x20
>>>  do_syscall_64+0x5b/0x1e0
>>> or
>>> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
>>> Call Trace:
>>>  __schedule+0x302/0x940
>>>  ? usleep_range+0x80/0x80
>>>  schedule+0x46/0xb0
>>>  schedule_timeout+0xff/0x140
>>>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>  ? 0xffffffffc0c3b000
>>>  __wait_for_common+0xb9/0x170
>>>  __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>>>  ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>>>  ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>  ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>  ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>>>  ? dput+0x32/0x2f0
>>>  ocfs2_permission+0x45/0xe0 [ocfs2]
>>>  inode_permission+0xcc/0x170
>>>  link_path_walk.part.0.constprop.0+0x2a2/0x380
>>>  ? path_init+0x2c1/0x3f0
>>>  path_parentat+0x3c/0x90
>>>  filename_parentat+0xc1/0x1d0
>>>  ? filename_lookup+0x138/0x1c0
>>>  filename_create+0x43/0x160
>>>  ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>>>  ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>>>  ? do_sys_openat2+0x81/0x150
>>>  __x64_sys_ioctl+0x82/0xb0
>>>  do_syscall_64+0x61/0xb0
>>> 
>>> The deadlock is caused by multiple acquiring the destination directory
>>> inode dlm lock in ocfs2_reflink function, we should acquire this
>>> directory inode dlm lock at the beginning, and hold this dlm lock until
>>> end of the function.
>>> 
>>> Signed-off-by: Gang He <ghe@suse.com>
>>> ---
>>> fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
>>> fs/ocfs2/namei.h        |  2 ++
>>> fs/ocfs2/refcounttree.c | 15 +++++++++++----
>>> fs/ocfs2/xattr.c        | 12 +-----------
>>> fs/ocfs2/xattr.h        |  1 +
>>> 5 files changed, 28 insertions(+), 34 deletions(-)
>>> 
>>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
>>> index 2c46ff6ba4ea..f8bbb22cc60b 100644
>>> --- a/fs/ocfs2/namei.c
>>> +++ b/fs/ocfs2/namei.c
>>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
>>> }
>>> 
>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>> +				 struct buffer_head **dir_bh,
>>> 				 int mode,
>>> 				 struct inode **new_inode)
>>> {
>>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
>>> 
>>> 	brelse(new_di_bh);
>>> 
>>> -	if (!status)
>>> -		*new_inode = inode;
>>> -
>>> 	ocfs2_free_dir_lookup_result(&orphan_insert);
>>> 
>>> -	ocfs2_inode_unlock(dir, 1);
>>> -	brelse(parent_di_bh);
>>> +	if (!status) {
>>> +		*new_inode = inode;
>>> +		*dir_bh = parent_di_bh;
>>> +	} else {
>>> +		ocfs2_inode_unlock(dir, 1);
>>> +		brelse(parent_di_bh);
>>> +	}
>>> +
>>> 	return status;
>>> }
>>> 
>>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>> }
>>> 
>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>> +				   struct buffer_head *dir_bh,
>>> 				   struct inode *inode,
>>> 				   struct dentry *dentry)
>>> {
>>> 	int status = 0;
>>> -	struct buffer_head *parent_di_bh = NULL;
>>> 	handle_t *handle = NULL;
>>> 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
>>> 	struct ocfs2_dinode *dir_di, *di;
>>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>> 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
>>> 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
>>> 
>>> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
>>> -	if (status < 0) {
>>> -		if (status != -ENOENT)
>>> -			mlog_errno(status);
>>> -		return status;
>>> -	}
>>> -
>>> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
>>> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
>>> 	if (!dir_di->i_links_count) {
>>> 		/* can't make a file in a deleted directory. */
>>> 		status = -ENOENT;
>>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>> 		goto leave;
>>> 
>>> 	/* get a spot inside the dir. */
>>> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
>>> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
>>> 					      dentry->d_name.name,
>>> 					      dentry->d_name.len, &lookup);
>>> 	if (status < 0) {
>>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>> 	ocfs2_journal_dirty(handle, di_bh);
>>> 
>>> 	status = ocfs2_add_entry(handle, dentry, inode,
>>> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
>>> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
>>> 				 &lookup);
>>> 	if (status < 0) {
>>> 		mlog_errno(status);
>>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>> 	iput(orphan_dir_inode);
>>> leave:
>>> 
>>> -	ocfs2_inode_unlock(dir, 1);
>>> -
>>> 	brelse(di_bh);
>>> -	brelse(parent_di_bh);
>>> 	brelse(orphan_dir_bh);
>>> 
>>> 	ocfs2_free_dir_lookup_result(&lookup);
>>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
>>> index 9cc891eb874e..03a2c526e2c1 100644
>>> --- a/fs/ocfs2/namei.h
>>> +++ b/fs/ocfs2/namei.h
>>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
>>> 		     struct buffer_head *orphan_dir_bh,
>>> 		     bool dio);
>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>> +				 struct buffer_head **dir_bh,
>>> 				 int mode,
>>> 				 struct inode **new_inode);
>>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
>>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>> 		struct inode *inode, struct buffer_head *di_bh,
>>> 		int update_isize, loff_t end);
>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>> +				   struct buffer_head *dir_bh,
>>> 				   struct inode *new_inode,
>>> 				   struct dentry *new_dentry);
>>> 
>>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
>>> index 7f6355cbb587..a9a0c7c37e8e 100644
>>> --- a/fs/ocfs2/refcounttree.c
>>> +++ b/fs/ocfs2/refcounttree.c
>>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>> {
>>> 	int error, had_lock;
>>> 	struct inode *inode = d_inode(old_dentry);
>>> -	struct buffer_head *old_bh = NULL;
>>> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
>>> 	struct inode *new_orphan_inode = NULL;
>>> 	struct ocfs2_lock_holder oh;
>>> 
>>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>> 		return -EOPNOTSUPP;
>>> 
>>> 
>>> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
>>> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
>>> 					     &new_orphan_inode);
>>> 	if (error) {
>>> 		mlog_errno(error);
>>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>> 
>>> 	/* If the security isn't preserved, we need to re-initialize them. */
>>> 	if (!preserve) {
>>> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
>>> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
>>> +						    new_orphan_inode,
>>> 						    &new_dentry->d_name);
>>> 		if (error)
>>> 			mlog_errno(error);
>>> 	}
>>> 	if (!error) {
>>> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
>>> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
>>> +						       new_orphan_inode,
>>> 						       new_dentry);
>>> 		if (error)
>>> 			mlog_errno(error);
>>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>> 			iput(new_orphan_inode);
>>> 	}
>>> 
>>> +	if (dir_bh) {
>>> +		ocfs2_inode_unlock(dir, 1);
>>> +		brelse(dir_bh);
>>> +	}
>>> +
>>> 	return error;
>>> }
>>> 
>>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
>>> index dd784eb0cd7c..3f23e3a5018c 100644
>>> --- a/fs/ocfs2/xattr.c
>>> +++ b/fs/ocfs2/xattr.c
>>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>> /*
>>>  * Initialize security and acl for a already created inode.
>>>  * Used for reflink a non-preserve-security file.
>>> - *
>>> - * It uses common api like ocfs2_xattr_set, so the caller
>>> - * must not hold any lock expect i_mutex.
>>>  */
>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>> +				struct buffer_head *dir_bh,
>>> 				struct inode *inode,
>>> 				const struct qstr *qstr)
>>> {
>>> 	int ret = 0;
>>> -	struct buffer_head *dir_bh = NULL;
>>> 
>>> 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
>>> 	if (ret) {
>>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
>>> 		goto leave;
>>> 	}
>>> 
>>> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
>>> -	if (ret) {
>>> -		mlog_errno(ret);
>>> -		goto leave;
>>> -	}
>>> 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
>>> 	if (ret)
>>> 		mlog_errno(ret);
>>> 
>>> -	ocfs2_inode_unlock(dir, 0);
>>> -	brelse(dir_bh);
>>> leave:
>>> 	return ret;
>>> }
>>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
>>> index 00308b57f64f..b27fd8ba0019 100644
>>> --- a/fs/ocfs2/xattr.h
>>> +++ b/fs/ocfs2/xattr.h
>>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>> 			 struct buffer_head *new_bh,
>>> 			 bool preserve_security);
>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>> +				struct buffer_head *dir_bh,
>>> 				struct inode *inode,
>>> 				const struct qstr *qstr);
>>> #endif /* OCFS2_XATTR_H */
>>> -- 
>>> 2.21.0
>>> 
>>> 
>>> _______________________________________________
>>> Ocfs2-devel mailing list
>>> Ocfs2-devel@oss.oracle.com
>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Gang He Aug. 2, 2021, 4:35 a.m. UTC | #4
Hi Wengang,

The ocfs2 file system is mounted at /mnt/shared,
The test file(e.g. 40G VM image) is located at /mnt/shared/test, the 
snapshot direcory is /mnt/shared/.snapshots, then the user runs a 
reflink command from each node to generate a clone file repeatedly like,
reflink "/mnt/shared/test" \
    "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"

after a while, the reflink process on each node is hung, the user
cannot list the file system again.

The problematic reflink command process is blocked at one node, like
__schedule+0x2fd/0x750
schedule+0x2f/0xa0
schedule_timeout+0x1cc/0x310
? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
? 0xffffffffc0e3e000
wait_for_completion+0xba/0x140
? wake_up_q+0xa0/0xa0
__ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
ocfs2_reflink+0x436/0x4c0 [ocfs2]
? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
ocfs2_ioctl+0x25e/0x670 [ocfs2]
do_vfs_ioctl+0xa0/0x680
ksys_ioctl+0x70/0x80
__x64_sys_ioctl+0x16/0x20

In fact, this node has acquired .snapshots directory inode dlm lock,
but the reflink process is blocked at 
ocfs2_init_security_and_acl+0xbe/0x1d0, the process is wait for getting
.snapshots directory inode dlm lock.
The reflink process on other nodes is also blocked at getting .snapshots
directory inode dlm lock, then the file system cannot be listed since 
"ls -l" command also need to get .snapshot directory's attributes.

Why the the reflink process is blocked at 
ocfs2_init_security_and_acl+0xbe/0x1d0? I feel it is caused by multiple 
acquiring the .snapshots directory inode dlm lock in ocfs2_reflink 
function. For the deeper reasons, it is probably related to dlmglue 
layer or there is any factor which blocked .snapshots inode dlm lock to 
downconvert.

Thanks
Gang




On 2021/7/31 0:35, Wengang Wang wrote:
> Hi Gang,
> 
>> On Jul 29, 2021, at 11:16 PM, Gang He <ghe@suse.com> wrote:
>>
>> Hello Wengang and all,
>>
>> This issue can be reproduced stably when you run the below reflink command line(maybe you also can follow a "rm this file" command line and  sleep some usecs) from each node repeatedly for a while.
>> Based on my observation, the reflink processes are always blocked at the below points.
>>  From dlm_tool output and crash analysis, the node1 has acquired .snapshots directory inode EX dlm lock, but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0 to acqure it's inode dlm lock again.
> 
> So the reflink path on node 1 have .snapshots inode lock granted and is blocking at the new created inode under orphan directory.  BTW, what’s .snapshots directory? What’s the call path to lock that .snapshots inode?
> 
>> On the other two nodes, the reflink processes are blocked at acquire .snapshots directory inode dlm lock, then the whole file system is hung,
>> you can not list this file again.
> 
> So there are reflink paths on the other two nodes blocking at .snapshots inode. But what lock they are granted already?
> 
> For a typical ABBA deadlock,
> path 1 granted lock A and blocks at lock B
> path 2 granted lock B and blocks at lock A
> 
> Per your description, I see this:
> reflink path on node1  granted .snapshots lock and blocks at new inode lock
> reflnk paths on onde2/3 block at .snapshots lock.
> 
> I don't see how deadlock formed… the new inode lock is granted to any of the reflink path on node2/3? how?
> 
> thanks,
> wengang
> 
>>
>> The problem looks like acquiring the destination direcory multiple during ocfs2_reflink, dlm glue layer cannot downconvert lock in some case.
>> e.g.
>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_downconvert_lock:3674 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M000000000000000004661c00000000
>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_unblock_lock:3918 ERROR: status = -16
>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_process_blocked_lock:4317 ERROR: status = -16
>>
>> Then, I change the code to acquire this destination direcory dlm lock, and hold the lock until the end of ocfs2_reflink function.
>> After this change, I did not encounter this hang problem again after lots of testing. Second, I find the code change also improve reflink performance, since the code avoids the previous ping-pong effect.
>>
>> Thanks
>> Gang
>>
>>
>> On 2021/7/30 6:07, Wengang Wang wrote:
>>> Hi Gang,
>>> I’d suggest you list the call paths on the related nodes, Say call path 1 on node one granted lock A and is requesting for lock B, at the same time, path2 on node two granted lock B and now is requesting for lock A.
>>> With that, the problem would be easier to understand.
>>> thanks,
>>> wengang
>>>> On Jul 29, 2021, at 4:02 AM, Gang He <ghe@suse.com> wrote:
>>>>
>>>> Running reflink from multiple nodes simultaneously to clone a file
>>>> to the same directory probably triggers a deadlock issue.
>>>> For example, there is a three node ocfs2 cluster, each node mounts
>>>> the ocfs2 file system to /mnt/shared, and run the reflink command
>>>> from each node repeatedly, like
>>>>   reflink "/mnt/shared/test" \
>>>>   "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>> then, reflink command process will be hung on each node, and you
>>>> can't list this file system directory.
>>>> The problematic reflink command process is blocked at one node,
>>>> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
>>>> Call Trace:
>>>>   __schedule+0x2fd/0x750
>>>>   schedule+0x2f/0xa0
>>>>   schedule_timeout+0x1cc/0x310
>>>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>   ? 0xffffffffc0e3e000
>>>>   wait_for_completion+0xba/0x140
>>>>   ? wake_up_q+0xa0/0xa0
>>>>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>   ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>>   ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>   do_vfs_ioctl+0xa0/0x680
>>>>   ksys_ioctl+0x70/0x80
>>>>   __x64_sys_ioctl+0x16/0x20
>>>>   do_syscall_64+0x5b/0x1e0
>>>> The other reflink command processes are blocked at other nodes,
>>>> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
>>>> Call Trace:
>>>>   __schedule+0x2fd/0x750
>>>>   schedule+0x2f/0xa0
>>>>   schedule_timeout+0x1cc/0x310
>>>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>   ? 0xffffffffc0b19000
>>>>   wait_for_completion+0xba/0x140
>>>>   ? wake_up_q+0xa0/0xa0
>>>>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>   ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>>>>   ocfs2_reflink+0x335/0x4c0 [ocfs2]
>>>>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>   do_vfs_ioctl+0xa0/0x680
>>>>   ksys_ioctl+0x70/0x80
>>>>   __x64_sys_ioctl+0x16/0x20
>>>>   do_syscall_64+0x5b/0x1e0
>>>> or
>>>> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
>>>> Call Trace:
>>>>   __schedule+0x302/0x940
>>>>   ? usleep_range+0x80/0x80
>>>>   schedule+0x46/0xb0
>>>>   schedule_timeout+0xff/0x140
>>>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>   ? 0xffffffffc0c3b000
>>>>   __wait_for_common+0xb9/0x170
>>>>   __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>>>>   ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>>>>   ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>>   ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>>   ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>>>>   ? dput+0x32/0x2f0
>>>>   ocfs2_permission+0x45/0xe0 [ocfs2]
>>>>   inode_permission+0xcc/0x170
>>>>   link_path_walk.part.0.constprop.0+0x2a2/0x380
>>>>   ? path_init+0x2c1/0x3f0
>>>>   path_parentat+0x3c/0x90
>>>>   filename_parentat+0xc1/0x1d0
>>>>   ? filename_lookup+0x138/0x1c0
>>>>   filename_create+0x43/0x160
>>>>   ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>>>>   ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>>>>   ? do_sys_openat2+0x81/0x150
>>>>   __x64_sys_ioctl+0x82/0xb0
>>>>   do_syscall_64+0x61/0xb0
>>>>
>>>> The deadlock is caused by multiple acquiring the destination directory
>>>> inode dlm lock in ocfs2_reflink function, we should acquire this
>>>> directory inode dlm lock at the beginning, and hold this dlm lock until
>>>> end of the function.
>>>>
>>>> Signed-off-by: Gang He <ghe@suse.com>
>>>> ---
>>>> fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
>>>> fs/ocfs2/namei.h        |  2 ++
>>>> fs/ocfs2/refcounttree.c | 15 +++++++++++----
>>>> fs/ocfs2/xattr.c        | 12 +-----------
>>>> fs/ocfs2/xattr.h        |  1 +
>>>> 5 files changed, 28 insertions(+), 34 deletions(-)
>>>>
>>>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
>>>> index 2c46ff6ba4ea..f8bbb22cc60b 100644
>>>> --- a/fs/ocfs2/namei.c
>>>> +++ b/fs/ocfs2/namei.c
>>>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
>>>> }
>>>>
>>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>> +				 struct buffer_head **dir_bh,
>>>> 				 int mode,
>>>> 				 struct inode **new_inode)
>>>> {
>>>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>>
>>>> 	brelse(new_di_bh);
>>>>
>>>> -	if (!status)
>>>> -		*new_inode = inode;
>>>> -
>>>> 	ocfs2_free_dir_lookup_result(&orphan_insert);
>>>>
>>>> -	ocfs2_inode_unlock(dir, 1);
>>>> -	brelse(parent_di_bh);
>>>> +	if (!status) {
>>>> +		*new_inode = inode;
>>>> +		*dir_bh = parent_di_bh;
>>>> +	} else {
>>>> +		ocfs2_inode_unlock(dir, 1);
>>>> +		brelse(parent_di_bh);
>>>> +	}
>>>> +
>>>> 	return status;
>>>> }
>>>>
>>>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>>> }
>>>>
>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>> +				   struct buffer_head *dir_bh,
>>>> 				   struct inode *inode,
>>>> 				   struct dentry *dentry)
>>>> {
>>>> 	int status = 0;
>>>> -	struct buffer_head *parent_di_bh = NULL;
>>>> 	handle_t *handle = NULL;
>>>> 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
>>>> 	struct ocfs2_dinode *dir_di, *di;
>>>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>> 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
>>>> 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
>>>>
>>>> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
>>>> -	if (status < 0) {
>>>> -		if (status != -ENOENT)
>>>> -			mlog_errno(status);
>>>> -		return status;
>>>> -	}
>>>> -
>>>> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
>>>> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
>>>> 	if (!dir_di->i_links_count) {
>>>> 		/* can't make a file in a deleted directory. */
>>>> 		status = -ENOENT;
>>>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>> 		goto leave;
>>>>
>>>> 	/* get a spot inside the dir. */
>>>> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
>>>> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
>>>> 					      dentry->d_name.name,
>>>> 					      dentry->d_name.len, &lookup);
>>>> 	if (status < 0) {
>>>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>> 	ocfs2_journal_dirty(handle, di_bh);
>>>>
>>>> 	status = ocfs2_add_entry(handle, dentry, inode,
>>>> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
>>>> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
>>>> 				 &lookup);
>>>> 	if (status < 0) {
>>>> 		mlog_errno(status);
>>>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>> 	iput(orphan_dir_inode);
>>>> leave:
>>>>
>>>> -	ocfs2_inode_unlock(dir, 1);
>>>> -
>>>> 	brelse(di_bh);
>>>> -	brelse(parent_di_bh);
>>>> 	brelse(orphan_dir_bh);
>>>>
>>>> 	ocfs2_free_dir_lookup_result(&lookup);
>>>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
>>>> index 9cc891eb874e..03a2c526e2c1 100644
>>>> --- a/fs/ocfs2/namei.h
>>>> +++ b/fs/ocfs2/namei.h
>>>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
>>>> 		     struct buffer_head *orphan_dir_bh,
>>>> 		     bool dio);
>>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>> +				 struct buffer_head **dir_bh,
>>>> 				 int mode,
>>>> 				 struct inode **new_inode);
>>>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
>>>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>>> 		struct inode *inode, struct buffer_head *di_bh,
>>>> 		int update_isize, loff_t end);
>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>> +				   struct buffer_head *dir_bh,
>>>> 				   struct inode *new_inode,
>>>> 				   struct dentry *new_dentry);
>>>>
>>>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
>>>> index 7f6355cbb587..a9a0c7c37e8e 100644
>>>> --- a/fs/ocfs2/refcounttree.c
>>>> +++ b/fs/ocfs2/refcounttree.c
>>>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>> {
>>>> 	int error, had_lock;
>>>> 	struct inode *inode = d_inode(old_dentry);
>>>> -	struct buffer_head *old_bh = NULL;
>>>> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
>>>> 	struct inode *new_orphan_inode = NULL;
>>>> 	struct ocfs2_lock_holder oh;
>>>>
>>>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>> 		return -EOPNOTSUPP;
>>>>
>>>>
>>>> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
>>>> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
>>>> 					     &new_orphan_inode);
>>>> 	if (error) {
>>>> 		mlog_errno(error);
>>>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>
>>>> 	/* If the security isn't preserved, we need to re-initialize them. */
>>>> 	if (!preserve) {
>>>> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
>>>> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
>>>> +						    new_orphan_inode,
>>>> 						    &new_dentry->d_name);
>>>> 		if (error)
>>>> 			mlog_errno(error);
>>>> 	}
>>>> 	if (!error) {
>>>> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
>>>> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
>>>> +						       new_orphan_inode,
>>>> 						       new_dentry);
>>>> 		if (error)
>>>> 			mlog_errno(error);
>>>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>> 			iput(new_orphan_inode);
>>>> 	}
>>>>
>>>> +	if (dir_bh) {
>>>> +		ocfs2_inode_unlock(dir, 1);
>>>> +		brelse(dir_bh);
>>>> +	}
>>>> +
>>>> 	return error;
>>>> }
>>>>
>>>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
>>>> index dd784eb0cd7c..3f23e3a5018c 100644
>>>> --- a/fs/ocfs2/xattr.c
>>>> +++ b/fs/ocfs2/xattr.c
>>>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>>> /*
>>>>   * Initialize security and acl for a already created inode.
>>>>   * Used for reflink a non-preserve-security file.
>>>> - *
>>>> - * It uses common api like ocfs2_xattr_set, so the caller
>>>> - * must not hold any lock expect i_mutex.
>>>>   */
>>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>>> +				struct buffer_head *dir_bh,
>>>> 				struct inode *inode,
>>>> 				const struct qstr *qstr)
>>>> {
>>>> 	int ret = 0;
>>>> -	struct buffer_head *dir_bh = NULL;
>>>>
>>>> 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
>>>> 	if (ret) {
>>>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
>>>> 		goto leave;
>>>> 	}
>>>>
>>>> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
>>>> -	if (ret) {
>>>> -		mlog_errno(ret);
>>>> -		goto leave;
>>>> -	}
>>>> 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
>>>> 	if (ret)
>>>> 		mlog_errno(ret);
>>>>
>>>> -	ocfs2_inode_unlock(dir, 0);
>>>> -	brelse(dir_bh);
>>>> leave:
>>>> 	return ret;
>>>> }
>>>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
>>>> index 00308b57f64f..b27fd8ba0019 100644
>>>> --- a/fs/ocfs2/xattr.h
>>>> +++ b/fs/ocfs2/xattr.h
>>>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>>> 			 struct buffer_head *new_bh,
>>>> 			 bool preserve_security);
>>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>>> +				struct buffer_head *dir_bh,
>>>> 				struct inode *inode,
>>>> 				const struct qstr *qstr);
>>>> #endif /* OCFS2_XATTR_H */
>>>> -- 
>>>> 2.21.0
>>>>
>>>>
>>>> _______________________________________________
>>>> Ocfs2-devel mailing list
>>>> Ocfs2-devel@oss.oracle.com
>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>
>
Wengang Wang Aug. 2, 2021, 6:22 p.m. UTC | #5
> On Aug 1, 2021, at 9:35 PM, Gang He <ghe@suse.com> wrote:
> 
> Hi Wengang,
> 
> The ocfs2 file system is mounted at /mnt/shared,
> The test file(e.g. 40G VM image) is located at /mnt/shared/test, the snapshot direcory is /mnt/shared/.snapshots, then the user runs a reflink command from each node to generate a clone file repeatedly like,
> reflink "/mnt/shared/test" \
>   "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
> 
> after a while, the reflink process on each node is hung, the user
> cannot list the file system again.
> 
> The problematic reflink command process is blocked at one node, like
> __schedule+0x2fd/0x750
> schedule+0x2f/0xa0
> schedule_timeout+0x1cc/0x310
> ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
> ? 0xffffffffc0e3e000
> wait_for_completion+0xba/0x140
> ? wake_up_q+0xa0/0xa0
> __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
> ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
> ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
> ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
> ocfs2_reflink+0x436/0x4c0 [ocfs2]
> ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
> ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
> ocfs2_ioctl+0x25e/0x670 [ocfs2]
> do_vfs_ioctl+0xa0/0x680
> ksys_ioctl+0x70/0x80
> __x64_sys_ioctl+0x16/0x20
> 
> In fact, this node has acquired .snapshots directory inode dlm lock,
> but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0, the process is wait for getting
> .snapshots directory inode dlm lock.
> The reflink process on other nodes is also blocked at getting .snapshots
> directory inode dlm lock, then the file system cannot be listed since "ls -l" command also need to get .snapshot directory's attributes.
> 

I still didn’t get which locks are involved in the dead lock party.
If that is not clear, it’s hard to review your patch further.


> Why the the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0? I feel it is caused by multiple acquiring the .snapshots directory inode dlm lock in ocfs2_reflink function. For the deeper reasons, it is probably related to dlmglue layer or there is any factor which blocked .snapshots inode dlm lock to downconvert.

I suggest you take tcpdump in your system and get clear what was going on.  So you can remove the words like “feel” or “probably” next time.
If the problem is in downconvert, we should fix that instead.

thanks,
wengang

> 
> Thanks
> Gang
> 
> 
> 
> 
> On 2021/7/31 0:35, Wengang Wang wrote:
>> Hi Gang,
>>> On Jul 29, 2021, at 11:16 PM, Gang He <ghe@suse.com> wrote:
>>> 
>>> Hello Wengang and all,
>>> 
>>> This issue can be reproduced stably when you run the below reflink command line(maybe you also can follow a "rm this file" command line and  sleep some usecs) from each node repeatedly for a while.
>>> Based on my observation, the reflink processes are always blocked at the below points.
>>> From dlm_tool output and crash analysis, the node1 has acquired .snapshots directory inode EX dlm lock, but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0 to acqure it's inode dlm lock again.
>> So the reflink path on node 1 have .snapshots inode lock granted and is blocking at the new created inode under orphan directory.  BTW, what’s .snapshots directory? What’s the call path to lock that .snapshots inode?
>>> On the other two nodes, the reflink processes are blocked at acquire .snapshots directory inode dlm lock, then the whole file system is hung,
>>> you can not list this file again.
>> So there are reflink paths on the other two nodes blocking at .snapshots inode. But what lock they are granted already?
>> For a typical ABBA deadlock,
>> path 1 granted lock A and blocks at lock B
>> path 2 granted lock B and blocks at lock A
>> Per your description, I see this:
>> reflink path on node1  granted .snapshots lock and blocks at new inode lock
>> reflnk paths on onde2/3 block at .snapshots lock.
>> I don't see how deadlock formed… the new inode lock is granted to any of the reflink path on node2/3? how?
>> thanks,
>> wengang
>>> 
>>> The problem looks like acquiring the destination direcory multiple during ocfs2_reflink, dlm glue layer cannot downconvert lock in some case.
>>> e.g.
>>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_downconvert_lock:3674 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M000000000000000004661c00000000
>>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_unblock_lock:3918 ERROR: status = -16
>>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_process_blocked_lock:4317 ERROR: status = -16
>>> 
>>> Then, I change the code to acquire this destination direcory dlm lock, and hold the lock until the end of ocfs2_reflink function.
>>> After this change, I did not encounter this hang problem again after lots of testing. Second, I find the code change also improve reflink performance, since the code avoids the previous ping-pong effect.
>>> 
>>> Thanks
>>> Gang
>>> 
>>> 
>>> On 2021/7/30 6:07, Wengang Wang wrote:
>>>> Hi Gang,
>>>> I’d suggest you list the call paths on the related nodes, Say call path 1 on node one granted lock A and is requesting for lock B, at the same time, path2 on node two granted lock B and now is requesting for lock A.
>>>> With that, the problem would be easier to understand.
>>>> thanks,
>>>> wengang
>>>>> On Jul 29, 2021, at 4:02 AM, Gang He <ghe@suse.com> wrote:
>>>>> 
>>>>> Running reflink from multiple nodes simultaneously to clone a file
>>>>> to the same directory probably triggers a deadlock issue.
>>>>> For example, there is a three node ocfs2 cluster, each node mounts
>>>>> the ocfs2 file system to /mnt/shared, and run the reflink command
>>>>> from each node repeatedly, like
>>>>>  reflink "/mnt/shared/test" \
>>>>>  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>>> then, reflink command process will be hung on each node, and you
>>>>> can't list this file system directory.
>>>>> The problematic reflink command process is blocked at one node,
>>>>> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
>>>>> Call Trace:
>>>>>  __schedule+0x2fd/0x750
>>>>>  schedule+0x2f/0xa0
>>>>>  schedule_timeout+0x1cc/0x310
>>>>>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>>  ? 0xffffffffc0e3e000
>>>>>  wait_for_completion+0xba/0x140
>>>>>  ? wake_up_q+0xa0/0xa0
>>>>>  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>>  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>>>  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>>>  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>>  do_vfs_ioctl+0xa0/0x680
>>>>>  ksys_ioctl+0x70/0x80
>>>>>  __x64_sys_ioctl+0x16/0x20
>>>>>  do_syscall_64+0x5b/0x1e0
>>>>> The other reflink command processes are blocked at other nodes,
>>>>> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
>>>>> Call Trace:
>>>>>  __schedule+0x2fd/0x750
>>>>>  schedule+0x2f/0xa0
>>>>>  schedule_timeout+0x1cc/0x310
>>>>>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>>  ? 0xffffffffc0b19000
>>>>>  wait_for_completion+0xba/0x140
>>>>>  ? wake_up_q+0xa0/0xa0
>>>>>  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>>  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>  ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>>>>>  ocfs2_reflink+0x335/0x4c0 [ocfs2]
>>>>>  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>>  do_vfs_ioctl+0xa0/0x680
>>>>>  ksys_ioctl+0x70/0x80
>>>>>  __x64_sys_ioctl+0x16/0x20
>>>>>  do_syscall_64+0x5b/0x1e0
>>>>> or
>>>>> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
>>>>> Call Trace:
>>>>>  __schedule+0x302/0x940
>>>>>  ? usleep_range+0x80/0x80
>>>>>  schedule+0x46/0xb0
>>>>>  schedule_timeout+0xff/0x140
>>>>>  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>>  ? 0xffffffffc0c3b000
>>>>>  __wait_for_common+0xb9/0x170
>>>>>  __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>>>>>  ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>>>>>  ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>>>  ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>>>  ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>>>>>  ? dput+0x32/0x2f0
>>>>>  ocfs2_permission+0x45/0xe0 [ocfs2]
>>>>>  inode_permission+0xcc/0x170
>>>>>  link_path_walk.part.0.constprop.0+0x2a2/0x380
>>>>>  ? path_init+0x2c1/0x3f0
>>>>>  path_parentat+0x3c/0x90
>>>>>  filename_parentat+0xc1/0x1d0
>>>>>  ? filename_lookup+0x138/0x1c0
>>>>>  filename_create+0x43/0x160
>>>>>  ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>>>>>  ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>>>>>  ? do_sys_openat2+0x81/0x150
>>>>>  __x64_sys_ioctl+0x82/0xb0
>>>>>  do_syscall_64+0x61/0xb0
>>>>> 
>>>>> The deadlock is caused by multiple acquiring the destination directory
>>>>> inode dlm lock in ocfs2_reflink function, we should acquire this
>>>>> directory inode dlm lock at the beginning, and hold this dlm lock until
>>>>> end of the function.
>>>>> 
>>>>> Signed-off-by: Gang He <ghe@suse.com>
>>>>> ---
>>>>> fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
>>>>> fs/ocfs2/namei.h        |  2 ++
>>>>> fs/ocfs2/refcounttree.c | 15 +++++++++++----
>>>>> fs/ocfs2/xattr.c        | 12 +-----------
>>>>> fs/ocfs2/xattr.h        |  1 +
>>>>> 5 files changed, 28 insertions(+), 34 deletions(-)
>>>>> 
>>>>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
>>>>> index 2c46ff6ba4ea..f8bbb22cc60b 100644
>>>>> --- a/fs/ocfs2/namei.c
>>>>> +++ b/fs/ocfs2/namei.c
>>>>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
>>>>> }
>>>>> 
>>>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>>> +				 struct buffer_head **dir_bh,
>>>>> 				 int mode,
>>>>> 				 struct inode **new_inode)
>>>>> {
>>>>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>>> 
>>>>> 	brelse(new_di_bh);
>>>>> 
>>>>> -	if (!status)
>>>>> -		*new_inode = inode;
>>>>> -
>>>>> 	ocfs2_free_dir_lookup_result(&orphan_insert);
>>>>> 
>>>>> -	ocfs2_inode_unlock(dir, 1);
>>>>> -	brelse(parent_di_bh);
>>>>> +	if (!status) {
>>>>> +		*new_inode = inode;
>>>>> +		*dir_bh = parent_di_bh;
>>>>> +	} else {
>>>>> +		ocfs2_inode_unlock(dir, 1);
>>>>> +		brelse(parent_di_bh);
>>>>> +	}
>>>>> +
>>>>> 	return status;
>>>>> }
>>>>> 
>>>>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>>>> }
>>>>> 
>>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>> +				   struct buffer_head *dir_bh,
>>>>> 				   struct inode *inode,
>>>>> 				   struct dentry *dentry)
>>>>> {
>>>>> 	int status = 0;
>>>>> -	struct buffer_head *parent_di_bh = NULL;
>>>>> 	handle_t *handle = NULL;
>>>>> 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
>>>>> 	struct ocfs2_dinode *dir_di, *di;
>>>>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>> 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
>>>>> 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
>>>>> 
>>>>> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
>>>>> -	if (status < 0) {
>>>>> -		if (status != -ENOENT)
>>>>> -			mlog_errno(status);
>>>>> -		return status;
>>>>> -	}
>>>>> -
>>>>> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
>>>>> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
>>>>> 	if (!dir_di->i_links_count) {
>>>>> 		/* can't make a file in a deleted directory. */
>>>>> 		status = -ENOENT;
>>>>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>> 		goto leave;
>>>>> 
>>>>> 	/* get a spot inside the dir. */
>>>>> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
>>>>> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
>>>>> 					      dentry->d_name.name,
>>>>> 					      dentry->d_name.len, &lookup);
>>>>> 	if (status < 0) {
>>>>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>> 	ocfs2_journal_dirty(handle, di_bh);
>>>>> 
>>>>> 	status = ocfs2_add_entry(handle, dentry, inode,
>>>>> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
>>>>> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
>>>>> 				 &lookup);
>>>>> 	if (status < 0) {
>>>>> 		mlog_errno(status);
>>>>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>> 	iput(orphan_dir_inode);
>>>>> leave:
>>>>> 
>>>>> -	ocfs2_inode_unlock(dir, 1);
>>>>> -
>>>>> 	brelse(di_bh);
>>>>> -	brelse(parent_di_bh);
>>>>> 	brelse(orphan_dir_bh);
>>>>> 
>>>>> 	ocfs2_free_dir_lookup_result(&lookup);
>>>>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
>>>>> index 9cc891eb874e..03a2c526e2c1 100644
>>>>> --- a/fs/ocfs2/namei.h
>>>>> +++ b/fs/ocfs2/namei.h
>>>>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
>>>>> 		     struct buffer_head *orphan_dir_bh,
>>>>> 		     bool dio);
>>>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>>> +				 struct buffer_head **dir_bh,
>>>>> 				 int mode,
>>>>> 				 struct inode **new_inode);
>>>>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
>>>>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>>>> 		struct inode *inode, struct buffer_head *di_bh,
>>>>> 		int update_isize, loff_t end);
>>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>> +				   struct buffer_head *dir_bh,
>>>>> 				   struct inode *new_inode,
>>>>> 				   struct dentry *new_dentry);
>>>>> 
>>>>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
>>>>> index 7f6355cbb587..a9a0c7c37e8e 100644
>>>>> --- a/fs/ocfs2/refcounttree.c
>>>>> +++ b/fs/ocfs2/refcounttree.c
>>>>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>> {
>>>>> 	int error, had_lock;
>>>>> 	struct inode *inode = d_inode(old_dentry);
>>>>> -	struct buffer_head *old_bh = NULL;
>>>>> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
>>>>> 	struct inode *new_orphan_inode = NULL;
>>>>> 	struct ocfs2_lock_holder oh;
>>>>> 
>>>>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>> 		return -EOPNOTSUPP;
>>>>> 
>>>>> 
>>>>> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
>>>>> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
>>>>> 					     &new_orphan_inode);
>>>>> 	if (error) {
>>>>> 		mlog_errno(error);
>>>>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>> 
>>>>> 	/* If the security isn't preserved, we need to re-initialize them. */
>>>>> 	if (!preserve) {
>>>>> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
>>>>> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
>>>>> +						    new_orphan_inode,
>>>>> 						    &new_dentry->d_name);
>>>>> 		if (error)
>>>>> 			mlog_errno(error);
>>>>> 	}
>>>>> 	if (!error) {
>>>>> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
>>>>> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
>>>>> +						       new_orphan_inode,
>>>>> 						       new_dentry);
>>>>> 		if (error)
>>>>> 			mlog_errno(error);
>>>>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>> 			iput(new_orphan_inode);
>>>>> 	}
>>>>> 
>>>>> +	if (dir_bh) {
>>>>> +		ocfs2_inode_unlock(dir, 1);
>>>>> +		brelse(dir_bh);
>>>>> +	}
>>>>> +
>>>>> 	return error;
>>>>> }
>>>>> 
>>>>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
>>>>> index dd784eb0cd7c..3f23e3a5018c 100644
>>>>> --- a/fs/ocfs2/xattr.c
>>>>> +++ b/fs/ocfs2/xattr.c
>>>>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>>>> /*
>>>>>  * Initialize security and acl for a already created inode.
>>>>>  * Used for reflink a non-preserve-security file.
>>>>> - *
>>>>> - * It uses common api like ocfs2_xattr_set, so the caller
>>>>> - * must not hold any lock expect i_mutex.
>>>>>  */
>>>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>>>> +				struct buffer_head *dir_bh,
>>>>> 				struct inode *inode,
>>>>> 				const struct qstr *qstr)
>>>>> {
>>>>> 	int ret = 0;
>>>>> -	struct buffer_head *dir_bh = NULL;
>>>>> 
>>>>> 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
>>>>> 	if (ret) {
>>>>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
>>>>> 		goto leave;
>>>>> 	}
>>>>> 
>>>>> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
>>>>> -	if (ret) {
>>>>> -		mlog_errno(ret);
>>>>> -		goto leave;
>>>>> -	}
>>>>> 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
>>>>> 	if (ret)
>>>>> 		mlog_errno(ret);
>>>>> 
>>>>> -	ocfs2_inode_unlock(dir, 0);
>>>>> -	brelse(dir_bh);
>>>>> leave:
>>>>> 	return ret;
>>>>> }
>>>>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
>>>>> index 00308b57f64f..b27fd8ba0019 100644
>>>>> --- a/fs/ocfs2/xattr.h
>>>>> +++ b/fs/ocfs2/xattr.h
>>>>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>>>> 			 struct buffer_head *new_bh,
>>>>> 			 bool preserve_security);
>>>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>>>> +				struct buffer_head *dir_bh,
>>>>> 				struct inode *inode,
>>>>> 				const struct qstr *qstr);
>>>>> #endif /* OCFS2_XATTR_H */
>>>>> -- 
>>>>> 2.21.0
>>>>> 
>>>>> 
>>>>> _______________________________________________
>>>>> Ocfs2-devel mailing list
>>>>> Ocfs2-devel@oss.oracle.com
>>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>> 
>
Gang He Aug. 3, 2021, 1 a.m. UTC | #6
On 2021/8/3 2:22, Wengang Wang wrote:
> 
> 
>> On Aug 1, 2021, at 9:35 PM, Gang He <ghe@suse.com> wrote:
>>
>> Hi Wengang,
>>
>> The ocfs2 file system is mounted at /mnt/shared,
>> The test file(e.g. 40G VM image) is located at /mnt/shared/test, the snapshot direcory is /mnt/shared/.snapshots, then the user runs a reflink command from each node to generate a clone file repeatedly like,
>> reflink "/mnt/shared/test" \
>>    "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>
>> after a while, the reflink process on each node is hung, the user
>> cannot list the file system again.
>>
>> The problematic reflink command process is blocked at one node, like
>> __schedule+0x2fd/0x750
>> schedule+0x2f/0xa0
>> schedule_timeout+0x1cc/0x310
>> ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>> ? 0xffffffffc0e3e000
>> wait_for_completion+0xba/0x140
>> ? wake_up_q+0xa0/0xa0
>> __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>> ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>> ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>> ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>> ocfs2_reflink+0x436/0x4c0 [ocfs2]
>> ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>> ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>> ocfs2_ioctl+0x25e/0x670 [ocfs2]
>> do_vfs_ioctl+0xa0/0x680
>> ksys_ioctl+0x70/0x80
>> __x64_sys_ioctl+0x16/0x20
>>
>> In fact, this node has acquired .snapshots directory inode dlm lock,
>> but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0, the process is wait for getting
>> .snapshots directory inode dlm lock.
>> The reflink process on other nodes is also blocked at getting .snapshots
>> directory inode dlm lock, then the file system cannot be listed since "ls -l" command also need to get .snapshot directory's attributes.
>>
> 
> I still didn’t get which locks are involved in the dead lock party.
> If that is not clear, it’s hard to review your patch further.
.snapshot inode dlm lock, the node has acquired this lock, then
acquire it again, this is a deadlock key point.


> 
> 
>> Why the the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0? I feel it is caused by multiple acquiring the .snapshots directory inode dlm lock in ocfs2_reflink function. For the deeper reasons, it is probably related to dlmglue layer or there is any factor which blocked .snapshots inode dlm lock to downconvert.
> 
> I suggest you take tcpdump in your system and get clear what was going on.  So you can remove the words like “feel” or “probably” next time.
> If the problem is in downconvert, we should fix that instead.
I think we can fix it via using one time acquire the lock before trying 
to investigate further. In fact, this way also can improve the 
performance in case the user run the reflink to clone file to the same 
directory simultaneously.


Thanks
Gang

> 
> thanks,
> wengang
> 
>>
>> Thanks
>> Gang
>>
>>
>>
>>
>> On 2021/7/31 0:35, Wengang Wang wrote:
>>> Hi Gang,
>>>> On Jul 29, 2021, at 11:16 PM, Gang He <ghe@suse.com> wrote:
>>>>
>>>> Hello Wengang and all,
>>>>
>>>> This issue can be reproduced stably when you run the below reflink command line(maybe you also can follow a "rm this file" command line and  sleep some usecs) from each node repeatedly for a while.
>>>> Based on my observation, the reflink processes are always blocked at the below points.
>>>>  From dlm_tool output and crash analysis, the node1 has acquired .snapshots directory inode EX dlm lock, but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0 to acqure it's inode dlm lock again.
>>> So the reflink path on node 1 have .snapshots inode lock granted and is blocking at the new created inode under orphan directory.  BTW, what’s .snapshots directory? What’s the call path to lock that .snapshots inode?
>>>> On the other two nodes, the reflink processes are blocked at acquire .snapshots directory inode dlm lock, then the whole file system is hung,
>>>> you can not list this file again.
>>> So there are reflink paths on the other two nodes blocking at .snapshots inode. But what lock they are granted already?
>>> For a typical ABBA deadlock,
>>> path 1 granted lock A and blocks at lock B
>>> path 2 granted lock B and blocks at lock A
>>> Per your description, I see this:
>>> reflink path on node1  granted .snapshots lock and blocks at new inode lock
>>> reflnk paths on onde2/3 block at .snapshots lock.
>>> I don't see how deadlock formed… the new inode lock is granted to any of the reflink path on node2/3? how?
>>> thanks,
>>> wengang
>>>>
>>>> The problem looks like acquiring the destination direcory multiple during ocfs2_reflink, dlm glue layer cannot downconvert lock in some case.
>>>> e.g.
>>>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_downconvert_lock:3674 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M000000000000000004661c00000000
>>>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_unblock_lock:3918 ERROR: status = -16
>>>> kernel: (ocfs2dc-F50B203,1593,0):ocfs2_process_blocked_lock:4317 ERROR: status = -16
>>>>
>>>> Then, I change the code to acquire this destination direcory dlm lock, and hold the lock until the end of ocfs2_reflink function.
>>>> After this change, I did not encounter this hang problem again after lots of testing. Second, I find the code change also improve reflink performance, since the code avoids the previous ping-pong effect.
>>>>
>>>> Thanks
>>>> Gang
>>>>
>>>>
>>>> On 2021/7/30 6:07, Wengang Wang wrote:
>>>>> Hi Gang,
>>>>> I’d suggest you list the call paths on the related nodes, Say call path 1 on node one granted lock A and is requesting for lock B, at the same time, path2 on node two granted lock B and now is requesting for lock A.
>>>>> With that, the problem would be easier to understand.
>>>>> thanks,
>>>>> wengang
>>>>>> On Jul 29, 2021, at 4:02 AM, Gang He <ghe@suse.com> wrote:
>>>>>>
>>>>>> Running reflink from multiple nodes simultaneously to clone a file
>>>>>> to the same directory probably triggers a deadlock issue.
>>>>>> For example, there is a three node ocfs2 cluster, each node mounts
>>>>>> the ocfs2 file system to /mnt/shared, and run the reflink command
>>>>>> from each node repeatedly, like
>>>>>>   reflink "/mnt/shared/test" \
>>>>>>   "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>>>> then, reflink command process will be hung on each node, and you
>>>>>> can't list this file system directory.
>>>>>> The problematic reflink command process is blocked at one node,
>>>>>> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
>>>>>> Call Trace:
>>>>>>   __schedule+0x2fd/0x750
>>>>>>   schedule+0x2f/0xa0
>>>>>>   schedule_timeout+0x1cc/0x310
>>>>>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>>>   ? 0xffffffffc0e3e000
>>>>>>   wait_for_completion+0xba/0x140
>>>>>>   ? wake_up_q+0xa0/0xa0
>>>>>>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>>>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>>   ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>>>>   ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>>>>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>>>   do_vfs_ioctl+0xa0/0x680
>>>>>>   ksys_ioctl+0x70/0x80
>>>>>>   __x64_sys_ioctl+0x16/0x20
>>>>>>   do_syscall_64+0x5b/0x1e0
>>>>>> The other reflink command processes are blocked at other nodes,
>>>>>> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
>>>>>> Call Trace:
>>>>>>   __schedule+0x2fd/0x750
>>>>>>   schedule+0x2f/0xa0
>>>>>>   schedule_timeout+0x1cc/0x310
>>>>>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>>>   ? 0xffffffffc0b19000
>>>>>>   wait_for_completion+0xba/0x140
>>>>>>   ? wake_up_q+0xa0/0xa0
>>>>>>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>>>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>>   ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>>>>>>   ocfs2_reflink+0x335/0x4c0 [ocfs2]
>>>>>>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>>>   do_vfs_ioctl+0xa0/0x680
>>>>>>   ksys_ioctl+0x70/0x80
>>>>>>   __x64_sys_ioctl+0x16/0x20
>>>>>>   do_syscall_64+0x5b/0x1e0
>>>>>> or
>>>>>> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
>>>>>> Call Trace:
>>>>>>   __schedule+0x302/0x940
>>>>>>   ? usleep_range+0x80/0x80
>>>>>>   schedule+0x46/0xb0
>>>>>>   schedule_timeout+0xff/0x140
>>>>>>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>>>>>   ? 0xffffffffc0c3b000
>>>>>>   __wait_for_common+0xb9/0x170
>>>>>>   __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>>>>>>   ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>>>>>>   ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>>>>   ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>>>>>   ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>>>>>>   ? dput+0x32/0x2f0
>>>>>>   ocfs2_permission+0x45/0xe0 [ocfs2]
>>>>>>   inode_permission+0xcc/0x170
>>>>>>   link_path_walk.part.0.constprop.0+0x2a2/0x380
>>>>>>   ? path_init+0x2c1/0x3f0
>>>>>>   path_parentat+0x3c/0x90
>>>>>>   filename_parentat+0xc1/0x1d0
>>>>>>   ? filename_lookup+0x138/0x1c0
>>>>>>   filename_create+0x43/0x160
>>>>>>   ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>>>>>>   ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>>>>>>   ? do_sys_openat2+0x81/0x150
>>>>>>   __x64_sys_ioctl+0x82/0xb0
>>>>>>   do_syscall_64+0x61/0xb0
>>>>>>
>>>>>> The deadlock is caused by multiple acquiring the destination directory
>>>>>> inode dlm lock in ocfs2_reflink function, we should acquire this
>>>>>> directory inode dlm lock at the beginning, and hold this dlm lock until
>>>>>> end of the function.
>>>>>>
>>>>>> Signed-off-by: Gang He <ghe@suse.com>
>>>>>> ---
>>>>>> fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
>>>>>> fs/ocfs2/namei.h        |  2 ++
>>>>>> fs/ocfs2/refcounttree.c | 15 +++++++++++----
>>>>>> fs/ocfs2/xattr.c        | 12 +-----------
>>>>>> fs/ocfs2/xattr.h        |  1 +
>>>>>> 5 files changed, 28 insertions(+), 34 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
>>>>>> index 2c46ff6ba4ea..f8bbb22cc60b 100644
>>>>>> --- a/fs/ocfs2/namei.c
>>>>>> +++ b/fs/ocfs2/namei.c
>>>>>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
>>>>>> }
>>>>>>
>>>>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>>>> +				 struct buffer_head **dir_bh,
>>>>>> 				 int mode,
>>>>>> 				 struct inode **new_inode)
>>>>>> {
>>>>>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>>>>
>>>>>> 	brelse(new_di_bh);
>>>>>>
>>>>>> -	if (!status)
>>>>>> -		*new_inode = inode;
>>>>>> -
>>>>>> 	ocfs2_free_dir_lookup_result(&orphan_insert);
>>>>>>
>>>>>> -	ocfs2_inode_unlock(dir, 1);
>>>>>> -	brelse(parent_di_bh);
>>>>>> +	if (!status) {
>>>>>> +		*new_inode = inode;
>>>>>> +		*dir_bh = parent_di_bh;
>>>>>> +	} else {
>>>>>> +		ocfs2_inode_unlock(dir, 1);
>>>>>> +		brelse(parent_di_bh);
>>>>>> +	}
>>>>>> +
>>>>>> 	return status;
>>>>>> }
>>>>>>
>>>>>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>>>>> }
>>>>>>
>>>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>>> +				   struct buffer_head *dir_bh,
>>>>>> 				   struct inode *inode,
>>>>>> 				   struct dentry *dentry)
>>>>>> {
>>>>>> 	int status = 0;
>>>>>> -	struct buffer_head *parent_di_bh = NULL;
>>>>>> 	handle_t *handle = NULL;
>>>>>> 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
>>>>>> 	struct ocfs2_dinode *dir_di, *di;
>>>>>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>>> 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
>>>>>> 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
>>>>>>
>>>>>> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
>>>>>> -	if (status < 0) {
>>>>>> -		if (status != -ENOENT)
>>>>>> -			mlog_errno(status);
>>>>>> -		return status;
>>>>>> -	}
>>>>>> -
>>>>>> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
>>>>>> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
>>>>>> 	if (!dir_di->i_links_count) {
>>>>>> 		/* can't make a file in a deleted directory. */
>>>>>> 		status = -ENOENT;
>>>>>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>>> 		goto leave;
>>>>>>
>>>>>> 	/* get a spot inside the dir. */
>>>>>> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
>>>>>> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
>>>>>> 					      dentry->d_name.name,
>>>>>> 					      dentry->d_name.len, &lookup);
>>>>>> 	if (status < 0) {
>>>>>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>>> 	ocfs2_journal_dirty(handle, di_bh);
>>>>>>
>>>>>> 	status = ocfs2_add_entry(handle, dentry, inode,
>>>>>> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
>>>>>> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
>>>>>> 				 &lookup);
>>>>>> 	if (status < 0) {
>>>>>> 		mlog_errno(status);
>>>>>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>>> 	iput(orphan_dir_inode);
>>>>>> leave:
>>>>>>
>>>>>> -	ocfs2_inode_unlock(dir, 1);
>>>>>> -
>>>>>> 	brelse(di_bh);
>>>>>> -	brelse(parent_di_bh);
>>>>>> 	brelse(orphan_dir_bh);
>>>>>>
>>>>>> 	ocfs2_free_dir_lookup_result(&lookup);
>>>>>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
>>>>>> index 9cc891eb874e..03a2c526e2c1 100644
>>>>>> --- a/fs/ocfs2/namei.h
>>>>>> +++ b/fs/ocfs2/namei.h
>>>>>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
>>>>>> 		     struct buffer_head *orphan_dir_bh,
>>>>>> 		     bool dio);
>>>>>> int ocfs2_create_inode_in_orphan(struct inode *dir,
>>>>>> +				 struct buffer_head **dir_bh,
>>>>>> 				 int mode,
>>>>>> 				 struct inode **new_inode);
>>>>>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
>>>>>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>>>>> 		struct inode *inode, struct buffer_head *di_bh,
>>>>>> 		int update_isize, loff_t end);
>>>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>>>>> +				   struct buffer_head *dir_bh,
>>>>>> 				   struct inode *new_inode,
>>>>>> 				   struct dentry *new_dentry);
>>>>>>
>>>>>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
>>>>>> index 7f6355cbb587..a9a0c7c37e8e 100644
>>>>>> --- a/fs/ocfs2/refcounttree.c
>>>>>> +++ b/fs/ocfs2/refcounttree.c
>>>>>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>>> {
>>>>>> 	int error, had_lock;
>>>>>> 	struct inode *inode = d_inode(old_dentry);
>>>>>> -	struct buffer_head *old_bh = NULL;
>>>>>> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
>>>>>> 	struct inode *new_orphan_inode = NULL;
>>>>>> 	struct ocfs2_lock_holder oh;
>>>>>>
>>>>>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>>> 		return -EOPNOTSUPP;
>>>>>>
>>>>>>
>>>>>> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
>>>>>> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
>>>>>> 					     &new_orphan_inode);
>>>>>> 	if (error) {
>>>>>> 		mlog_errno(error);
>>>>>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>>>
>>>>>> 	/* If the security isn't preserved, we need to re-initialize them. */
>>>>>> 	if (!preserve) {
>>>>>> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
>>>>>> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
>>>>>> +						    new_orphan_inode,
>>>>>> 						    &new_dentry->d_name);
>>>>>> 		if (error)
>>>>>> 			mlog_errno(error);
>>>>>> 	}
>>>>>> 	if (!error) {
>>>>>> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
>>>>>> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
>>>>>> +						       new_orphan_inode,
>>>>>> 						       new_dentry);
>>>>>> 		if (error)
>>>>>> 			mlog_errno(error);
>>>>>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>>>>> 			iput(new_orphan_inode);
>>>>>> 	}
>>>>>>
>>>>>> +	if (dir_bh) {
>>>>>> +		ocfs2_inode_unlock(dir, 1);
>>>>>> +		brelse(dir_bh);
>>>>>> +	}
>>>>>> +
>>>>>> 	return error;
>>>>>> }
>>>>>>
>>>>>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
>>>>>> index dd784eb0cd7c..3f23e3a5018c 100644
>>>>>> --- a/fs/ocfs2/xattr.c
>>>>>> +++ b/fs/ocfs2/xattr.c
>>>>>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>>>>> /*
>>>>>>   * Initialize security and acl for a already created inode.
>>>>>>   * Used for reflink a non-preserve-security file.
>>>>>> - *
>>>>>> - * It uses common api like ocfs2_xattr_set, so the caller
>>>>>> - * must not hold any lock expect i_mutex.
>>>>>>   */
>>>>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>>>>> +				struct buffer_head *dir_bh,
>>>>>> 				struct inode *inode,
>>>>>> 				const struct qstr *qstr)
>>>>>> {
>>>>>> 	int ret = 0;
>>>>>> -	struct buffer_head *dir_bh = NULL;
>>>>>>
>>>>>> 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
>>>>>> 	if (ret) {
>>>>>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
>>>>>> 		goto leave;
>>>>>> 	}
>>>>>>
>>>>>> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
>>>>>> -	if (ret) {
>>>>>> -		mlog_errno(ret);
>>>>>> -		goto leave;
>>>>>> -	}
>>>>>> 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
>>>>>> 	if (ret)
>>>>>> 		mlog_errno(ret);
>>>>>>
>>>>>> -	ocfs2_inode_unlock(dir, 0);
>>>>>> -	brelse(dir_bh);
>>>>>> leave:
>>>>>> 	return ret;
>>>>>> }
>>>>>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
>>>>>> index 00308b57f64f..b27fd8ba0019 100644
>>>>>> --- a/fs/ocfs2/xattr.h
>>>>>> +++ b/fs/ocfs2/xattr.h
>>>>>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>>>>> 			 struct buffer_head *new_bh,
>>>>>> 			 bool preserve_security);
>>>>>> int ocfs2_init_security_and_acl(struct inode *dir,
>>>>>> +				struct buffer_head *dir_bh,
>>>>>> 				struct inode *inode,
>>>>>> 				const struct qstr *qstr);
>>>>>> #endif /* OCFS2_XATTR_H */
>>>>>> -- 
>>>>>> 2.21.0
>>>>>>
>>>>>>
>>>>>> _______________________________________________
>>>>>> Ocfs2-devel mailing list
>>>>>> Ocfs2-devel@oss.oracle.com
>>>>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>>>
>>
>
Wengang Wang Aug. 3, 2021, 8:08 p.m. UTC | #7
On Aug 2, 2021, at 6:00 PM, Gang He <ghe@suse.com<mailto:ghe@suse.com>> wrote:



On 2021/8/3 2:22, Wengang Wang wrote:
On Aug 1, 2021, at 9:35 PM, Gang He <ghe@suse.com<mailto:ghe@suse.com>> wrote:

Hi Wengang,

The ocfs2 file system is mounted at /mnt/shared,
The test file(e.g. 40G VM image) is located at /mnt/shared/test, the snapshot direcory is /mnt/shared/.snapshots, then the user runs a reflink command from each node to generate a clone file repeatedly like,
reflink "/mnt/shared/test" \
  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"

after a while, the reflink process on each node is hung, the user
cannot list the file system again.

The problematic reflink command process is blocked at one node, like
__schedule+0x2fd/0x750
schedule+0x2f/0xa0
schedule_timeout+0x1cc/0x310
? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
? 0xffffffffc0e3e000
wait_for_completion+0xba/0x140
? wake_up_q+0xa0/0xa0
__ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
ocfs2_reflink+0x436/0x4c0 [ocfs2]
? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
ocfs2_ioctl+0x25e/0x670 [ocfs2]
do_vfs_ioctl+0xa0/0x680
ksys_ioctl+0x70/0x80
__x64_sys_ioctl+0x16/0x20

In fact, this node has acquired .snapshots directory inode dlm lock,
but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0, the process is wait for getting
.snapshots directory inode dlm lock.
The reflink process on other nodes is also blocked at getting .snapshots
directory inode dlm lock, then the file system cannot be listed since "ls -l" command also need to get .snapshot directory's attributes.

I still didn’t get which locks are involved in the dead lock party.
If that is not clear, it’s hard to review your patch further.
.snapshot inode dlm lock, the node has acquired this lock, then
acquire it again, this is a deadlock key point.



If the .snapshot inode lock is already granted to node1, it shouldn’t block when the processes on node 1 lock it again. It just increases the in use counter.

Why the the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0? I feel it is caused by multiple acquiring the .snapshots directory inode dlm lock in ocfs2_reflink function. For the deeper reasons, it is probably related to dlmglue layer or there is any factor which blocked .snapshots inode dlm lock to downconvert.
I suggest you take tcpdump in your system and get clear what was going on.  So you can remove the words like “feel” or “probably” next time.
If the problem is in downconvert, we should fix that instead.
I think we can fix it via using one time acquire the lock before trying to investigate further. In fact, this way also can improve the performance in case the user run the reflink to clone file to the same directory simultaneously.

If the downconvert has problem, it also affect other locks (not only for your case), we have to fix it.  For performance consideration, it can be in a separated patch.

thanks,
wengang


Thanks
Gang

thanks,
wengang

Thanks
Gang




On 2021/7/31 0:35, Wengang Wang wrote:
Hi Gang,
On Jul 29, 2021, at 11:16 PM, Gang He <ghe@suse.com<mailto:ghe@suse.com>> wrote:

Hello Wengang and all,

This issue can be reproduced stably when you run the below reflink command line(maybe you also can follow a "rm this file" command line and  sleep some usecs) from each node repeatedly for a while.
Based on my observation, the reflink processes are always blocked at the below points.
From dlm_tool output and crash analysis, the node1 has acquired .snapshots directory inode EX dlm lock, but the reflink process is blocked at ocfs2_init_security_and_acl+0xbe/0x1d0 to acqure it's inode dlm lock again.
So the reflink path on node 1 have .snapshots inode lock granted and is blocking at the new created inode under orphan directory.  BTW, what’s .snapshots directory? What’s the call path to lock that .snapshots inode?
On the other two nodes, the reflink processes are blocked at acquire .snapshots directory inode dlm lock, then the whole file system is hung,
you can not list this file again.
So there are reflink paths on the other two nodes blocking at .snapshots inode. But what lock they are granted already?
For a typical ABBA deadlock,
path 1 granted lock A and blocks at lock B
path 2 granted lock B and blocks at lock A
Per your description, I see this:
reflink path on node1  granted .snapshots lock and blocks at new inode lock
reflnk paths on onde2/3 block at .snapshots lock.
I don't see how deadlock formed… the new inode lock is granted to any of the reflink path on node2/3? how?
thanks,
wengang

The problem looks like acquiring the destination direcory multiple during ocfs2_reflink, dlm glue layer cannot downconvert lock in some case.
e.g.
kernel: (ocfs2dc-F50B203,1593,0):ocfs2_downconvert_lock:3674 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M000000000000000004661c00000000
kernel: (ocfs2dc-F50B203,1593,0):ocfs2_unblock_lock:3918 ERROR: status = -16
kernel: (ocfs2dc-F50B203,1593,0):ocfs2_process_blocked_lock:4317 ERROR: status = -16

Then, I change the code to acquire this destination direcory dlm lock, and hold the lock until the end of ocfs2_reflink function.
After this change, I did not encounter this hang problem again after lots of testing. Second, I find the code change also improve reflink performance, since the code avoids the previous ping-pong effect.

Thanks
Gang


On 2021/7/30 6:07, Wengang Wang wrote:
Hi Gang,
I’d suggest you list the call paths on the related nodes, Say call path 1 on node one granted lock A and is requesting for lock B, at the same time, path2 on node two granted lock B and now is requesting for lock A.
With that, the problem would be easier to understand.
thanks,
wengang
On Jul 29, 2021, at 4:02 AM, Gang He <ghe@suse.com<mailto:ghe@suse.com>> wrote:

Running reflink from multiple nodes simultaneously to clone a file
to the same directory probably triggers a deadlock issue.
For example, there is a three node ocfs2 cluster, each node mounts
the ocfs2 file system to /mnt/shared, and run the reflink command
from each node repeatedly, like
 reflink "/mnt/shared/test" \
 "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
then, reflink command process will be hung on each node, and you
can't list this file system directory.
The problematic reflink command process is blocked at one node,
task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
Call Trace:
 __schedule+0x2fd/0x750
 schedule+0x2f/0xa0
 schedule_timeout+0x1cc/0x310
 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
 ? 0xffffffffc0e3e000
 wait_for_completion+0xba/0x140
 ? wake_up_q+0xa0/0xa0
 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
 ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
 ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
 ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
 ocfs2_reflink+0x436/0x4c0 [ocfs2]
 ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
 ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
 ocfs2_ioctl+0x25e/0x670 [ocfs2]
 do_vfs_ioctl+0xa0/0x680
 ksys_ioctl+0x70/0x80
 __x64_sys_ioctl+0x16/0x20
 do_syscall_64+0x5b/0x1e0
The other reflink command processes are blocked at other nodes,
task:reflink         state:D stack:    0 pid:29759 ppid:  4088
Call Trace:
 __schedule+0x2fd/0x750
 schedule+0x2f/0xa0
 schedule_timeout+0x1cc/0x310
 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
 ? 0xffffffffc0b19000
 wait_for_completion+0xba/0x140
 ? wake_up_q+0xa0/0xa0
 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
 ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
 ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
 ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
 ocfs2_reflink+0x335/0x4c0 [ocfs2]
 ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
 ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
 ocfs2_ioctl+0x25e/0x670 [ocfs2]
 do_vfs_ioctl+0xa0/0x680
 ksys_ioctl+0x70/0x80
 __x64_sys_ioctl+0x16/0x20
 do_syscall_64+0x5b/0x1e0
or
task:reflink         state:D stack:    0 pid:18465 ppid:  4156
Call Trace:
 __schedule+0x302/0x940
 ? usleep_range+0x80/0x80
 schedule+0x46/0xb0
 schedule_timeout+0xff/0x140
 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
 ? 0xffffffffc0c3b000
 __wait_for_common+0xb9/0x170
 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
 ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
 ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
 ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
 ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
 ? dput+0x32/0x2f0
 ocfs2_permission+0x45/0xe0 [ocfs2]
 inode_permission+0xcc/0x170
 link_path_walk.part.0.constprop.0+0x2a2/0x380
 ? path_init+0x2c1/0x3f0
 path_parentat+0x3c/0x90
 filename_parentat+0xc1/0x1d0
 ? filename_lookup+0x138/0x1c0
 filename_create+0x43/0x160
 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
 ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
 ? do_sys_openat2+0x81/0x150
 __x64_sys_ioctl+0x82/0xb0
 do_syscall_64+0x61/0xb0

The deadlock is caused by multiple acquiring the destination directory
inode dlm lock in ocfs2_reflink function, we should acquire this
directory inode dlm lock at the beginning, and hold this dlm lock until
end of the function.

Signed-off-by: Gang He <ghe@suse.com<mailto:ghe@suse.com>>
---
fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
fs/ocfs2/namei.h        |  2 ++
fs/ocfs2/refcounttree.c | 15 +++++++++++----
fs/ocfs2/xattr.c        | 12 +-----------
fs/ocfs2/xattr.h        |  1 +
5 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea..f8bbb22cc60b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
}

int ocfs2_create_inode_in_orphan(struct inode *dir,
+  struct buffer_head **dir_bh,
 int mode,
 struct inode **new_inode)
{
@@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,

brelse(new_di_bh);

- if (!status)
- *new_inode = inode;
-
ocfs2_free_dir_lookup_result(&orphan_insert);

- ocfs2_inode_unlock(dir, 1);
- brelse(parent_di_bh);
+ if (!status) {
+ *new_inode = inode;
+ *dir_bh = parent_di_bh;
+ } else {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(parent_di_bh);
+ }
+
return status;
}

@@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
}

int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+    struct buffer_head *dir_bh,
   struct inode *inode,
   struct dentry *dentry)
{
int status = 0;
- struct buffer_head *parent_di_bh = NULL;
handle_t *handle = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *dir_di, *di;
@@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);

- status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
- if (status < 0) {
- if (status != -ENOENT)
- mlog_errno(status);
- return status;
- }
-
- dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+ dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
if (!dir_di->i_links_count) {
/* can't make a file in a deleted directory. */
status = -ENOENT;
@@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto leave;

/* get a spot inside the dir. */
- status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+ status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
      dentry->d_name.name,
      dentry->d_name.len, &lookup);
if (status < 0) {
@@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ocfs2_journal_dirty(handle, di_bh);

status = ocfs2_add_entry(handle, dentry, inode,
-  OCFS2_I(inode)->ip_blkno, parent_di_bh,
+  OCFS2_I(inode)->ip_blkno, dir_bh,
 &lookup);
if (status < 0) {
mlog_errno(status);
@@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
iput(orphan_dir_inode);
leave:

- ocfs2_inode_unlock(dir, 1);
-
brelse(di_bh);
- brelse(parent_di_bh);
brelse(orphan_dir_bh);

ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 9cc891eb874e..03a2c526e2c1 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
     struct buffer_head *orphan_dir_bh,
     bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
+  struct buffer_head **dir_bh,
 int mode,
 struct inode **new_inode);
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
@@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
struct inode *inode, struct buffer_head *di_bh,
int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+    struct buffer_head *dir_bh,
   struct inode *new_inode,
   struct dentry *new_dentry);

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7f6355cbb587..a9a0c7c37e8e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
{
int error, had_lock;
struct inode *inode = d_inode(old_dentry);
- struct buffer_head *old_bh = NULL;
+ struct buffer_head *old_bh = NULL, *dir_bh = NULL;
struct inode *new_orphan_inode = NULL;
struct ocfs2_lock_holder oh;

@@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
return -EOPNOTSUPP;


- error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+ error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
     &new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,

/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
- error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+ error = ocfs2_init_security_and_acl(dir, dir_bh,
+     new_orphan_inode,
    &new_dentry->d_name);
if (error)
mlog_errno(error);
}
if (!error) {
- error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+ error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
+        new_orphan_inode,
       new_dentry);
if (error)
mlog_errno(error);
@@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
iput(new_orphan_inode);
}

+ if (dir_bh) {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(dir_bh);
+ }
+
return error;
}

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c..3f23e3a5018c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
/*
 * Initialize security and acl for a already created inode.
 * Used for reflink a non-preserve-security file.
- *
- * It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
 */
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr)
{
int ret = 0;
- struct buffer_head *dir_bh = NULL;

ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
goto leave;
}

- ret = ocfs2_inode_lock(dir, &dir_bh, 0);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
if (ret)
mlog_errno(ret);

- ocfs2_inode_unlock(dir, 0);
- brelse(dir_bh);
leave:
return ret;
}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f..b27fd8ba0019 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
 struct buffer_head *new_bh,
 bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
--
2.21.0
Joseph Qi Aug. 4, 2021, 1:57 a.m. UTC | #8
Hi Gang,
As I mentioned in your previous mail, I'd like we describe the issue in
the following way:

Node 1		Node 2		Node3
...
		...
				...

That will be more easier to understand the lock sequence.

Thanks,
Joseph

On 7/29/21 7:02 PM, Gang He wrote:
> Running reflink from multiple nodes simultaneously to clone a file
> to the same directory probably triggers a deadlock issue.
> For example, there is a three node ocfs2 cluster, each node mounts
> the ocfs2 file system to /mnt/shared, and run the reflink command
> from each node repeatedly, like
>   reflink "/mnt/shared/test" \
>   "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
> then, reflink command process will be hung on each node, and you
> can't list this file system directory.
> The problematic reflink command process is blocked at one node,
> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
> Call Trace:
>   __schedule+0x2fd/0x750
>   schedule+0x2f/0xa0
>   schedule_timeout+0x1cc/0x310
>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>   ? 0xffffffffc0e3e000
>   wait_for_completion+0xba/0x140
>   ? wake_up_q+0xa0/0xa0
>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>   ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>   ocfs2_reflink+0x436/0x4c0 [ocfs2]
>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>   do_vfs_ioctl+0xa0/0x680
>   ksys_ioctl+0x70/0x80
>   __x64_sys_ioctl+0x16/0x20
>   do_syscall_64+0x5b/0x1e0
> The other reflink command processes are blocked at other nodes,
> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
> Call Trace:
>   __schedule+0x2fd/0x750
>   schedule+0x2f/0xa0
>   schedule_timeout+0x1cc/0x310
>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>   ? 0xffffffffc0b19000
>   wait_for_completion+0xba/0x140
>   ? wake_up_q+0xa0/0xa0
>   __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>   ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>   ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>   ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>   ocfs2_reflink+0x335/0x4c0 [ocfs2]
>   ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>   ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>   ocfs2_ioctl+0x25e/0x670 [ocfs2]
>   do_vfs_ioctl+0xa0/0x680
>   ksys_ioctl+0x70/0x80
>   __x64_sys_ioctl+0x16/0x20
>   do_syscall_64+0x5b/0x1e0
> or
> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
> Call Trace:
>   __schedule+0x302/0x940
>   ? usleep_range+0x80/0x80
>   schedule+0x46/0xb0
>   schedule_timeout+0xff/0x140
>   ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>   ? 0xffffffffc0c3b000
>   __wait_for_common+0xb9/0x170
>   __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>   ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>   ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>   ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>   ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>   ? dput+0x32/0x2f0
>   ocfs2_permission+0x45/0xe0 [ocfs2]
>   inode_permission+0xcc/0x170
>   link_path_walk.part.0.constprop.0+0x2a2/0x380
>   ? path_init+0x2c1/0x3f0
>   path_parentat+0x3c/0x90
>   filename_parentat+0xc1/0x1d0
>   ? filename_lookup+0x138/0x1c0
>   filename_create+0x43/0x160
>   ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>   ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>   ? do_sys_openat2+0x81/0x150
>   __x64_sys_ioctl+0x82/0xb0
>   do_syscall_64+0x61/0xb0
> 
> The deadlock is caused by multiple acquiring the destination directory
> inode dlm lock in ocfs2_reflink function, we should acquire this
> directory inode dlm lock at the beginning, and hold this dlm lock until
> end of the function.
> 
> Signed-off-by: Gang He <ghe@suse.com>
> ---
>  fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
>  fs/ocfs2/namei.h        |  2 ++
>  fs/ocfs2/refcounttree.c | 15 +++++++++++----
>  fs/ocfs2/xattr.c        | 12 +-----------
>  fs/ocfs2/xattr.h        |  1 +
>  5 files changed, 28 insertions(+), 34 deletions(-)
> 
> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
> index 2c46ff6ba4ea..f8bbb22cc60b 100644
> --- a/fs/ocfs2/namei.c
> +++ b/fs/ocfs2/namei.c
> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
>  }
>  
>  int ocfs2_create_inode_in_orphan(struct inode *dir,
> +				 struct buffer_head **dir_bh,
>  				 int mode,
>  				 struct inode **new_inode)
>  {
> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
>  
>  	brelse(new_di_bh);
>  
> -	if (!status)
> -		*new_inode = inode;
> -
>  	ocfs2_free_dir_lookup_result(&orphan_insert);
>  
> -	ocfs2_inode_unlock(dir, 1);
> -	brelse(parent_di_bh);
> +	if (!status) {
> +		*new_inode = inode;
> +		*dir_bh = parent_di_bh;
> +	} else {
> +		ocfs2_inode_unlock(dir, 1);
> +		brelse(parent_di_bh);
> +	}
> +
>  	return status;
>  }
>  
> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>  }
>  
>  int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> +				   struct buffer_head *dir_bh,
>  				   struct inode *inode,
>  				   struct dentry *dentry)
>  {
>  	int status = 0;
> -	struct buffer_head *parent_di_bh = NULL;
>  	handle_t *handle = NULL;
>  	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
>  	struct ocfs2_dinode *dir_di, *di;
> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>  				(unsigned long long)OCFS2_I(dir)->ip_blkno,
>  				(unsigned long long)OCFS2_I(inode)->ip_blkno);
>  
> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
> -	if (status < 0) {
> -		if (status != -ENOENT)
> -			mlog_errno(status);
> -		return status;
> -	}
> -
> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
>  	if (!dir_di->i_links_count) {
>  		/* can't make a file in a deleted directory. */
>  		status = -ENOENT;
> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>  		goto leave;
>  
>  	/* get a spot inside the dir. */
> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
>  					      dentry->d_name.name,
>  					      dentry->d_name.len, &lookup);
>  	if (status < 0) {
> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>  	ocfs2_journal_dirty(handle, di_bh);
>  
>  	status = ocfs2_add_entry(handle, dentry, inode,
> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
>  				 &lookup);
>  	if (status < 0) {
>  		mlog_errno(status);
> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>  	iput(orphan_dir_inode);
>  leave:
>  
> -	ocfs2_inode_unlock(dir, 1);
> -
>  	brelse(di_bh);
> -	brelse(parent_di_bh);
>  	brelse(orphan_dir_bh);
>  
>  	ocfs2_free_dir_lookup_result(&lookup);
> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
> index 9cc891eb874e..03a2c526e2c1 100644
> --- a/fs/ocfs2/namei.h
> +++ b/fs/ocfs2/namei.h
> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
>  		     struct buffer_head *orphan_dir_bh,
>  		     bool dio);
>  int ocfs2_create_inode_in_orphan(struct inode *dir,
> +				 struct buffer_head **dir_bh,
>  				 int mode,
>  				 struct inode **new_inode);
>  int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>  		struct inode *inode, struct buffer_head *di_bh,
>  		int update_isize, loff_t end);
>  int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
> +				   struct buffer_head *dir_bh,
>  				   struct inode *new_inode,
>  				   struct dentry *new_dentry);
>  
> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
> index 7f6355cbb587..a9a0c7c37e8e 100644
> --- a/fs/ocfs2/refcounttree.c
> +++ b/fs/ocfs2/refcounttree.c
> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>  {
>  	int error, had_lock;
>  	struct inode *inode = d_inode(old_dentry);
> -	struct buffer_head *old_bh = NULL;
> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
>  	struct inode *new_orphan_inode = NULL;
>  	struct ocfs2_lock_holder oh;
>  
> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>  		return -EOPNOTSUPP;
>  
>  
> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
>  					     &new_orphan_inode);
>  	if (error) {
>  		mlog_errno(error);
> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>  
>  	/* If the security isn't preserved, we need to re-initialize them. */
>  	if (!preserve) {
> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
> +						    new_orphan_inode,
>  						    &new_dentry->d_name);
>  		if (error)
>  			mlog_errno(error);
>  	}
>  	if (!error) {
> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
> +						       new_orphan_inode,
>  						       new_dentry);
>  		if (error)
>  			mlog_errno(error);
> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>  			iput(new_orphan_inode);
>  	}
>  
> +	if (dir_bh) {
> +		ocfs2_inode_unlock(dir, 1);
> +		brelse(dir_bh);
> +	}
> +
>  	return error;
>  }
>  
> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
> index dd784eb0cd7c..3f23e3a5018c 100644
> --- a/fs/ocfs2/xattr.c
> +++ b/fs/ocfs2/xattr.c
> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>  /*
>   * Initialize security and acl for a already created inode.
>   * Used for reflink a non-preserve-security file.
> - *
> - * It uses common api like ocfs2_xattr_set, so the caller
> - * must not hold any lock expect i_mutex.
>   */
>  int ocfs2_init_security_and_acl(struct inode *dir,
> +				struct buffer_head *dir_bh,
>  				struct inode *inode,
>  				const struct qstr *qstr)
>  {
>  	int ret = 0;
> -	struct buffer_head *dir_bh = NULL;
>  
>  	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
>  	if (ret) {
> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
>  		goto leave;
>  	}
>  
> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
> -	if (ret) {
> -		mlog_errno(ret);
> -		goto leave;
> -	}
>  	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
>  	if (ret)
>  		mlog_errno(ret);
>  
> -	ocfs2_inode_unlock(dir, 0);
> -	brelse(dir_bh);
>  leave:
>  	return ret;
>  }
> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
> index 00308b57f64f..b27fd8ba0019 100644
> --- a/fs/ocfs2/xattr.h
> +++ b/fs/ocfs2/xattr.h
> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>  			 struct buffer_head *new_bh,
>  			 bool preserve_security);
>  int ocfs2_init_security_and_acl(struct inode *dir,
> +				struct buffer_head *dir_bh,
>  				struct inode *inode,
>  				const struct qstr *qstr);
>  #endif /* OCFS2_XATTR_H */
>
Gang He Aug. 9, 2021, 10:08 a.m. UTC | #9
Hi Joseph and All,

The deadlock is caused by self-locking on one node.
There is three node cluster (mounted to /mnt/shared), the user run 
reflink command to clone the file to the same directory repeatedly,
e.g.
  reflink "/mnt/shared/test" \
  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"

After a while, the reflink process on each node is hung, the file system 
cannot be listed.
The problematic reflink command process is blocked by itself, e.g. the 
reflink process is hung at ghe-sle15sp2-nd2,
kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
kernel: Call Trace:
kernel:  __schedule+0x2fd/0x750
kernel:  ? try_to_wake_up+0x17b/0x4e0
kernel:  schedule+0x2f/0xa0
kernel:  schedule_timeout+0x1cc/0x310
kernel:  ? __wake_up_common+0x74/0x120
kernel:  wait_for_completion+0xba/0x140
kernel:  ? wake_up_q+0xa0/0xa0
kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
kernel:  do_vfs_ioctl+0xa0/0x680
kernel:  ksys_ioctl+0x70/0x80

In fact, the destination directory(.snapshots) inode dlm lock was 
acquired by ghe-sle15sp2-nd2, next there is bast message from other 
nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation 
failed, the kernel message is printed like,
kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM 
error -16 while calling ocfs2_dlm_lock on resource 
M0000000000000000046e0200000000
kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: 
status = -16

Then, the reflink process tries to acquire this directory inode dlm 
lock, the process is blocked, the dlm lock resource in memory looks like

     l_name = "M0000000000000000046e0200000000",
     l_ro_holders = 0,
     l_ex_holders = 0,
     l_level = 5 '\005',
     l_requested = 0 '\000',
     l_blocking = 5 '\005',
     l_type = 0 '\000',
     l_action = 0 '\000',
     l_unlock_action = 0 '\000',
     l_pending_gen = 645948,


So far, I do not know what makes dlm lock function failed, it also looks 
we do not handle this failure case in dlmglue layer, but I always 
reproduce this hang with my test script, e.g.

   loop=1
   while ((loop++)) ; do
         for i in `seq 1 100`; do
           reflink "/mnt/shared/test" "/mnt/shared/.snapshots 
/test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
         done
         usleep 500000
         rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
   done

My patch changes multiple acquiring dest directory inode dlm lock during 
in ocfs2_reflink function, it avoids the hang issue happen again.The 
code change also can improve reflink performance in this case.

Thanks
Gang


On 2021/8/4 9:57, Joseph Qi wrote:
> Hi Gang,
> As I mentioned in your previous mail, I'd like we describe the issue in
> the following way:
> 
> Node 1		Node 2		Node3
> ...
> 		...
> 				...
> 
> That will be more easier to understand the lock sequence.
> 
> Thanks,
> Joseph
> 
> On 7/29/21 7:02 PM, Gang He wrote:
>> Running reflink from multiple nodes simultaneously to clone a file
>> to the same directory probably triggers a deadlock issue.
>> For example, there is a three node ocfs2 cluster, each node mounts
>> the ocfs2 file system to /mnt/shared, and run the reflink command
>> from each node repeatedly, like
>>    reflink "/mnt/shared/test" \
>>    "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>> then, reflink command process will be hung on each node, and you
>> can't list this file system directory.
>> The problematic reflink command process is blocked at one node,
>> task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
>> Call Trace:
>>    __schedule+0x2fd/0x750
>>    schedule+0x2f/0xa0
>>    schedule_timeout+0x1cc/0x310
>>    ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>    ? 0xffffffffc0e3e000
>>    wait_for_completion+0xba/0x140
>>    ? wake_up_q+0xa0/0xa0
>>    __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>    ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>    ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>    ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>    ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>    ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>    ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>    ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>    do_vfs_ioctl+0xa0/0x680
>>    ksys_ioctl+0x70/0x80
>>    __x64_sys_ioctl+0x16/0x20
>>    do_syscall_64+0x5b/0x1e0
>> The other reflink command processes are blocked at other nodes,
>> task:reflink         state:D stack:    0 pid:29759 ppid:  4088
>> Call Trace:
>>    __schedule+0x2fd/0x750
>>    schedule+0x2f/0xa0
>>    schedule_timeout+0x1cc/0x310
>>    ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>    ? 0xffffffffc0b19000
>>    wait_for_completion+0xba/0x140
>>    ? wake_up_q+0xa0/0xa0
>>    __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>    ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>    ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>    ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
>>    ocfs2_reflink+0x335/0x4c0 [ocfs2]
>>    ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>    ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>    ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>    do_vfs_ioctl+0xa0/0x680
>>    ksys_ioctl+0x70/0x80
>>    __x64_sys_ioctl+0x16/0x20
>>    do_syscall_64+0x5b/0x1e0
>> or
>> task:reflink         state:D stack:    0 pid:18465 ppid:  4156
>> Call Trace:
>>    __schedule+0x302/0x940
>>    ? usleep_range+0x80/0x80
>>    schedule+0x46/0xb0
>>    schedule_timeout+0xff/0x140
>>    ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
>>    ? 0xffffffffc0c3b000
>>    __wait_for_common+0xb9/0x170
>>    __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
>>    ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
>>    ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>    ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
>>    ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
>>    ? dput+0x32/0x2f0
>>    ocfs2_permission+0x45/0xe0 [ocfs2]
>>    inode_permission+0xcc/0x170
>>    link_path_walk.part.0.constprop.0+0x2a2/0x380
>>    ? path_init+0x2c1/0x3f0
>>    path_parentat+0x3c/0x90
>>    filename_parentat+0xc1/0x1d0
>>    ? filename_lookup+0x138/0x1c0
>>    filename_create+0x43/0x160
>>    ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
>>    ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
>>    ? do_sys_openat2+0x81/0x150
>>    __x64_sys_ioctl+0x82/0xb0
>>    do_syscall_64+0x61/0xb0
>>
>> The deadlock is caused by multiple acquiring the destination directory
>> inode dlm lock in ocfs2_reflink function, we should acquire this
>> directory inode dlm lock at the beginning, and hold this dlm lock until
>> end of the function.
>>
>> Signed-off-by: Gang He <ghe@suse.com>
>> ---
>>   fs/ocfs2/namei.c        | 32 +++++++++++++-------------------
>>   fs/ocfs2/namei.h        |  2 ++
>>   fs/ocfs2/refcounttree.c | 15 +++++++++++----
>>   fs/ocfs2/xattr.c        | 12 +-----------
>>   fs/ocfs2/xattr.h        |  1 +
>>   5 files changed, 28 insertions(+), 34 deletions(-)
>>
>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
>> index 2c46ff6ba4ea..f8bbb22cc60b 100644
>> --- a/fs/ocfs2/namei.c
>> +++ b/fs/ocfs2/namei.c
>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
>>   }
>>   
>>   int ocfs2_create_inode_in_orphan(struct inode *dir,
>> +				 struct buffer_head **dir_bh,
>>   				 int mode,
>>   				 struct inode **new_inode)
>>   {
>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
>>   
>>   	brelse(new_di_bh);
>>   
>> -	if (!status)
>> -		*new_inode = inode;
>> -
>>   	ocfs2_free_dir_lookup_result(&orphan_insert);
>>   
>> -	ocfs2_inode_unlock(dir, 1);
>> -	brelse(parent_di_bh);
>> +	if (!status) {
>> +		*new_inode = inode;
>> +		*dir_bh = parent_di_bh;
>> +	} else {
>> +		ocfs2_inode_unlock(dir, 1);
>> +		brelse(parent_di_bh);
>> +	}
>> +
>>   	return status;
>>   }
>>   
>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>   }
>>   
>>   int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> +				   struct buffer_head *dir_bh,
>>   				   struct inode *inode,
>>   				   struct dentry *dentry)
>>   {
>>   	int status = 0;
>> -	struct buffer_head *parent_di_bh = NULL;
>>   	handle_t *handle = NULL;
>>   	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
>>   	struct ocfs2_dinode *dir_di, *di;
>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>   				(unsigned long long)OCFS2_I(dir)->ip_blkno,
>>   				(unsigned long long)OCFS2_I(inode)->ip_blkno);
>>   
>> -	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
>> -	if (status < 0) {
>> -		if (status != -ENOENT)
>> -			mlog_errno(status);
>> -		return status;
>> -	}
>> -
>> -	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
>> +	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
>>   	if (!dir_di->i_links_count) {
>>   		/* can't make a file in a deleted directory. */
>>   		status = -ENOENT;
>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>   		goto leave;
>>   
>>   	/* get a spot inside the dir. */
>> -	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
>> +	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
>>   					      dentry->d_name.name,
>>   					      dentry->d_name.len, &lookup);
>>   	if (status < 0) {
>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>   	ocfs2_journal_dirty(handle, di_bh);
>>   
>>   	status = ocfs2_add_entry(handle, dentry, inode,
>> -				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
>> +				 OCFS2_I(inode)->ip_blkno, dir_bh,
>>   				 &lookup);
>>   	if (status < 0) {
>>   		mlog_errno(status);
>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>>   	iput(orphan_dir_inode);
>>   leave:
>>   
>> -	ocfs2_inode_unlock(dir, 1);
>> -
>>   	brelse(di_bh);
>> -	brelse(parent_di_bh);
>>   	brelse(orphan_dir_bh);
>>   
>>   	ocfs2_free_dir_lookup_result(&lookup);
>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
>> index 9cc891eb874e..03a2c526e2c1 100644
>> --- a/fs/ocfs2/namei.h
>> +++ b/fs/ocfs2/namei.h
>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
>>   		     struct buffer_head *orphan_dir_bh,
>>   		     bool dio);
>>   int ocfs2_create_inode_in_orphan(struct inode *dir,
>> +				 struct buffer_head **dir_bh,
>>   				 int mode,
>>   				 struct inode **new_inode);
>>   int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
>>   		struct inode *inode, struct buffer_head *di_bh,
>>   		int update_isize, loff_t end);
>>   int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
>> +				   struct buffer_head *dir_bh,
>>   				   struct inode *new_inode,
>>   				   struct dentry *new_dentry);
>>   
>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
>> index 7f6355cbb587..a9a0c7c37e8e 100644
>> --- a/fs/ocfs2/refcounttree.c
>> +++ b/fs/ocfs2/refcounttree.c
>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>   {
>>   	int error, had_lock;
>>   	struct inode *inode = d_inode(old_dentry);
>> -	struct buffer_head *old_bh = NULL;
>> +	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
>>   	struct inode *new_orphan_inode = NULL;
>>   	struct ocfs2_lock_holder oh;
>>   
>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>   		return -EOPNOTSUPP;
>>   
>>   
>> -	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
>> +	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
>>   					     &new_orphan_inode);
>>   	if (error) {
>>   		mlog_errno(error);
>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>   
>>   	/* If the security isn't preserved, we need to re-initialize them. */
>>   	if (!preserve) {
>> -		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
>> +		error = ocfs2_init_security_and_acl(dir, dir_bh,
>> +						    new_orphan_inode,
>>   						    &new_dentry->d_name);
>>   		if (error)
>>   			mlog_errno(error);
>>   	}
>>   	if (!error) {
>> -		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
>> +		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
>> +						       new_orphan_inode,
>>   						       new_dentry);
>>   		if (error)
>>   			mlog_errno(error);
>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
>>   			iput(new_orphan_inode);
>>   	}
>>   
>> +	if (dir_bh) {
>> +		ocfs2_inode_unlock(dir, 1);
>> +		brelse(dir_bh);
>> +	}
>> +
>>   	return error;
>>   }
>>   
>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
>> index dd784eb0cd7c..3f23e3a5018c 100644
>> --- a/fs/ocfs2/xattr.c
>> +++ b/fs/ocfs2/xattr.c
>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>   /*
>>    * Initialize security and acl for a already created inode.
>>    * Used for reflink a non-preserve-security file.
>> - *
>> - * It uses common api like ocfs2_xattr_set, so the caller
>> - * must not hold any lock expect i_mutex.
>>    */
>>   int ocfs2_init_security_and_acl(struct inode *dir,
>> +				struct buffer_head *dir_bh,
>>   				struct inode *inode,
>>   				const struct qstr *qstr)
>>   {
>>   	int ret = 0;
>> -	struct buffer_head *dir_bh = NULL;
>>   
>>   	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
>>   	if (ret) {
>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
>>   		goto leave;
>>   	}
>>   
>> -	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
>> -	if (ret) {
>> -		mlog_errno(ret);
>> -		goto leave;
>> -	}
>>   	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
>>   	if (ret)
>>   		mlog_errno(ret);
>>   
>> -	ocfs2_inode_unlock(dir, 0);
>> -	brelse(dir_bh);
>>   leave:
>>   	return ret;
>>   }
>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
>> index 00308b57f64f..b27fd8ba0019 100644
>> --- a/fs/ocfs2/xattr.h
>> +++ b/fs/ocfs2/xattr.h
>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
>>   			 struct buffer_head *new_bh,
>>   			 bool preserve_security);
>>   int ocfs2_init_security_and_acl(struct inode *dir,
>> +				struct buffer_head *dir_bh,
>>   				struct inode *inode,
>>   				const struct qstr *qstr);
>>   #endif /* OCFS2_XATTR_H */
>>
>
Joseph Qi Aug. 13, 2021, 9:54 a.m. UTC | #10
On 8/9/21 6:08 PM, Gang He wrote:
> Hi Joseph and All,
> 
> The deadlock is caused by self-locking on one node.
> There is three node cluster (mounted to /mnt/shared), the user run reflink command to clone the file to the same directory repeatedly,
> e.g.
>  reflink "/mnt/shared/test" \
>  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
> 
> After a while, the reflink process on each node is hung, the file system cannot be listed.
> The problematic reflink command process is blocked by itself, e.g. the reflink process is hung at ghe-sle15sp2-nd2,
> kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
> kernel: Call Trace:
> kernel:  __schedule+0x2fd/0x750
> kernel:  ? try_to_wake_up+0x17b/0x4e0
> kernel:  schedule+0x2f/0xa0
> kernel:  schedule_timeout+0x1cc/0x310
> kernel:  ? __wake_up_common+0x74/0x120
> kernel:  wait_for_completion+0xba/0x140
> kernel:  ? wake_up_q+0xa0/0xa0
> kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
> kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
> kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
> kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
> kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
> kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
> kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
> kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
> kernel:  do_vfs_ioctl+0xa0/0x680
> kernel:  ksys_ioctl+0x70/0x80
> 
> In fact, the destination directory(.snapshots) inode dlm lock was acquired by ghe-sle15sp2-nd2, next there is bast message from other nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation failed, the kernel message is printed like,
> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M0000000000000000046e0200000000
> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: status = -16
> 
> Then, the reflink process tries to acquire this directory inode dlm lock, the process is blocked, the dlm lock resource in memory looks like
> 
>     l_name = "M0000000000000000046e0200000000",
>     l_ro_holders = 0,
>     l_ex_holders = 0,
>     l_level = 5 '\005',
>     l_requested = 0 '\000',
>     l_blocking = 5 '\005',
>     l_type = 0 '\000',
>     l_action = 0 '\000',
>     l_unlock_action = 0 '\000',
>     l_pending_gen = 645948,
> 
> 
> So far, I do not know what makes dlm lock function failed, it also looks we do not handle this failure case in dlmglue layer, but I always reproduce this hang with my test script, e.g.
> 
>   loop=1
>   while ((loop++)) ; do
>         for i in `seq 1 100`; do
>           reflink "/mnt/shared/test" "/mnt/shared/.snapshots /test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
>         done
>         usleep 500000
>         rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
>   done
> 
> My patch changes multiple acquiring dest directory inode dlm lock during in ocfs2_reflink function, it avoids the hang issue happen again.The code change also can improve reflink performance in this case.
> 
> Thanks
> Gang

'status = -16' implies DLM_CANCELGRANT.
Do you use stack user instead of o2cb? If yes, can you try o2cb with
your reproducer?

Thanks,
Joseph
Gang He Aug. 18, 2021, 9:20 a.m. UTC | #11
On 2021/8/13 17:54, Joseph Qi wrote:
> 
> 
> On 8/9/21 6:08 PM, Gang He wrote:
>> Hi Joseph and All,
>>
>> The deadlock is caused by self-locking on one node.
>> There is three node cluster (mounted to /mnt/shared), the user run reflink command to clone the file to the same directory repeatedly,
>> e.g.
>>   reflink "/mnt/shared/test" \
>>   "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>
>> After a while, the reflink process on each node is hung, the file system cannot be listed.
>> The problematic reflink command process is blocked by itself, e.g. the reflink process is hung at ghe-sle15sp2-nd2,
>> kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
>> kernel: Call Trace:
>> kernel:  __schedule+0x2fd/0x750
>> kernel:  ? try_to_wake_up+0x17b/0x4e0
>> kernel:  schedule+0x2f/0xa0
>> kernel:  schedule_timeout+0x1cc/0x310
>> kernel:  ? __wake_up_common+0x74/0x120
>> kernel:  wait_for_completion+0xba/0x140
>> kernel:  ? wake_up_q+0xa0/0xa0
>> kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>> kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>> kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>> kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>> kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>> kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>> kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>> kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>> kernel:  do_vfs_ioctl+0xa0/0x680
>> kernel:  ksys_ioctl+0x70/0x80
>>
>> In fact, the destination directory(.snapshots) inode dlm lock was acquired by ghe-sle15sp2-nd2, next there is bast message from other nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation failed, the kernel message is printed like,
>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M0000000000000000046e0200000000
>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: status = -16
>>
>> Then, the reflink process tries to acquire this directory inode dlm lock, the process is blocked, the dlm lock resource in memory looks like
>>
>>      l_name = "M0000000000000000046e0200000000",
>>      l_ro_holders = 0,
>>      l_ex_holders = 0,
>>      l_level = 5 '\005',
>>      l_requested = 0 '\000',
>>      l_blocking = 5 '\005',
>>      l_type = 0 '\000',
>>      l_action = 0 '\000',
>>      l_unlock_action = 0 '\000',
>>      l_pending_gen = 645948,
>>
>>
>> So far, I do not know what makes dlm lock function failed, it also looks we do not handle this failure case in dlmglue layer, but I always reproduce this hang with my test script, e.g.
>>
>>    loop=1
>>    while ((loop++)) ; do
>>          for i in `seq 1 100`; do
>>            reflink "/mnt/shared/test" "/mnt/shared/.snapshots /test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
>>          done
>>          usleep 500000
>>          rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
>>    done
>>
>> My patch changes multiple acquiring dest directory inode dlm lock during in ocfs2_reflink function, it avoids the hang issue happen again.The code change also can improve reflink performance in this case.
>>
>> Thanks
>> Gang
> 
> 'status = -16' implies DLM_CANCELGRANT.
> Do you use stack user instead of o2cb? If yes, can you try o2cb with
> your reproducer?

I setup o2cb based ocfs2 clusters with sle15sp2 and oracleLinux8u4.
After two day testing with the same script, I did not encounter dlm_lock 
downconvert failure, the hang issue did not happen.
After my patch was applied, there was not any side effect, the reflink 
performance was doubled in the case.

Thanks
Gang

> 
> Thanks,
> Joseph
>
Joseph Qi Aug. 18, 2021, 11:20 a.m. UTC | #12
On 8/18/21 5:20 PM, Gang He wrote:
> 
> 
> On 2021/8/13 17:54, Joseph Qi wrote:
>>
>>
>> On 8/9/21 6:08 PM, Gang He wrote:
>>> Hi Joseph and All,
>>>
>>> The deadlock is caused by self-locking on one node.
>>> There is three node cluster (mounted to /mnt/shared), the user run reflink command to clone the file to the same directory repeatedly,
>>> e.g.
>>>   reflink "/mnt/shared/test" \
>>>   "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>
>>> After a while, the reflink process on each node is hung, the file system cannot be listed.
>>> The problematic reflink command process is blocked by itself, e.g. the reflink process is hung at ghe-sle15sp2-nd2,
>>> kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
>>> kernel: Call Trace:
>>> kernel:  __schedule+0x2fd/0x750
>>> kernel:  ? try_to_wake_up+0x17b/0x4e0
>>> kernel:  schedule+0x2f/0xa0
>>> kernel:  schedule_timeout+0x1cc/0x310
>>> kernel:  ? __wake_up_common+0x74/0x120
>>> kernel:  wait_for_completion+0xba/0x140
>>> kernel:  ? wake_up_q+0xa0/0xa0
>>> kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>> kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>> kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>> kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>> kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>> kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>> kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>> kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>> kernel:  do_vfs_ioctl+0xa0/0x680
>>> kernel:  ksys_ioctl+0x70/0x80
>>>
>>> In fact, the destination directory(.snapshots) inode dlm lock was acquired by ghe-sle15sp2-nd2, next there is bast message from other nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation failed, the kernel message is printed like,
>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M0000000000000000046e0200000000
>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: status = -16
>>>
>>> Then, the reflink process tries to acquire this directory inode dlm lock, the process is blocked, the dlm lock resource in memory looks like
>>>
>>>      l_name = "M0000000000000000046e0200000000",
>>>      l_ro_holders = 0,
>>>      l_ex_holders = 0,
>>>      l_level = 5 '\005',
>>>      l_requested = 0 '\000',
>>>      l_blocking = 5 '\005',
>>>      l_type = 0 '\000',
>>>      l_action = 0 '\000',
>>>      l_unlock_action = 0 '\000',
>>>      l_pending_gen = 645948,
>>>
>>>
>>> So far, I do not know what makes dlm lock function failed, it also looks we do not handle this failure case in dlmglue layer, but I always reproduce this hang with my test script, e.g.
>>>
>>>    loop=1
>>>    while ((loop++)) ; do
>>>          for i in `seq 1 100`; do
>>>            reflink "/mnt/shared/test" "/mnt/shared/.snapshots /test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
>>>          done
>>>          usleep 500000
>>>          rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
>>>    done
>>>
>>> My patch changes multiple acquiring dest directory inode dlm lock during in ocfs2_reflink function, it avoids the hang issue happen again.The code change also can improve reflink performance in this case.
>>>
>>> Thanks
>>> Gang
>>
>> 'status = -16' implies DLM_CANCELGRANT.
>> Do you use stack user instead of o2cb? If yes, can you try o2cb with
>> your reproducer?
> 
> I setup o2cb based ocfs2 clusters with sle15sp2 and oracleLinux8u4.
> After two day testing with the same script, I did not encounter dlm_lock downconvert failure, the hang issue did not happen.
> After my patch was applied, there was not any side effect, the reflink performance was doubled in the case.
> 

Do you mean the hang only happens on stack user?
IMO, we'd better directly fix the stack user hang first, and then optimize
reflink performance.

Thanks,
Joseph
Gang He Aug. 19, 2021, 1:51 a.m. UTC | #13
On 2021/8/18 19:20, Joseph Qi wrote:
> 
> 
> On 8/18/21 5:20 PM, Gang He wrote:
>>
>>
>> On 2021/8/13 17:54, Joseph Qi wrote:
>>>
>>>
>>> On 8/9/21 6:08 PM, Gang He wrote:
>>>> Hi Joseph and All,
>>>>
>>>> The deadlock is caused by self-locking on one node.
>>>> There is three node cluster (mounted to /mnt/shared), the user run reflink command to clone the file to the same directory repeatedly,
>>>> e.g.
>>>>    reflink "/mnt/shared/test" \
>>>>    "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>>
>>>> After a while, the reflink process on each node is hung, the file system cannot be listed.
>>>> The problematic reflink command process is blocked by itself, e.g. the reflink process is hung at ghe-sle15sp2-nd2,
>>>> kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
>>>> kernel: Call Trace:
>>>> kernel:  __schedule+0x2fd/0x750
>>>> kernel:  ? try_to_wake_up+0x17b/0x4e0
>>>> kernel:  schedule+0x2f/0xa0
>>>> kernel:  schedule_timeout+0x1cc/0x310
>>>> kernel:  ? __wake_up_common+0x74/0x120
>>>> kernel:  wait_for_completion+0xba/0x140
>>>> kernel:  ? wake_up_q+0xa0/0xa0
>>>> kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>> kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>> kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>> kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>> kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>> kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>> kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>> kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>> kernel:  do_vfs_ioctl+0xa0/0x680
>>>> kernel:  ksys_ioctl+0x70/0x80
>>>>
>>>> In fact, the destination directory(.snapshots) inode dlm lock was acquired by ghe-sle15sp2-nd2, next there is bast message from other nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation failed, the kernel message is printed like,
>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M0000000000000000046e0200000000
>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: status = -16
>>>>
>>>> Then, the reflink process tries to acquire this directory inode dlm lock, the process is blocked, the dlm lock resource in memory looks like
>>>>
>>>>       l_name = "M0000000000000000046e0200000000",
>>>>       l_ro_holders = 0,
>>>>       l_ex_holders = 0,
>>>>       l_level = 5 '\005',
>>>>       l_requested = 0 '\000',
>>>>       l_blocking = 5 '\005',
>>>>       l_type = 0 '\000',
>>>>       l_action = 0 '\000',
>>>>       l_unlock_action = 0 '\000',
>>>>       l_pending_gen = 645948,
>>>>
>>>>
>>>> So far, I do not know what makes dlm lock function failed, it also looks we do not handle this failure case in dlmglue layer, but I always reproduce this hang with my test script, e.g.
>>>>
>>>>     loop=1
>>>>     while ((loop++)) ; do
>>>>           for i in `seq 1 100`; do
>>>>             reflink "/mnt/shared/test" "/mnt/shared/.snapshots /test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
>>>>           done
>>>>           usleep 500000
>>>>           rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
>>>>     done
>>>>
>>>> My patch changes multiple acquiring dest directory inode dlm lock during in ocfs2_reflink function, it avoids the hang issue happen again.The code change also can improve reflink performance in this case.
>>>>
>>>> Thanks
>>>> Gang
>>>
>>> 'status = -16' implies DLM_CANCELGRANT.
>>> Do you use stack user instead of o2cb? If yes, can you try o2cb with
>>> your reproducer?
>>
>> I setup o2cb based ocfs2 clusters with sle15sp2 and oracleLinux8u4.
>> After two day testing with the same script, I did not encounter dlm_lock downconvert failure, the hang issue did not happen.
>> After my patch was applied, there was not any side effect, the reflink performance was doubled in the case.
>>
> 
> Do you mean the hang only happens on stack user?
Yes.
Why? since o2cb based dlm_lock did not return error -16 when downcovert 
dlm lock during the whole testing.
But pmck based dlm_lock retuned error -16 during the testing, then we 
did not handle this error further in dlmglue layer, next encounter the 
hang issue when dlm_lock acquire the lock. Maybe there is a race 
condition when using dlm_lock/dlm_unlock(cancel) in dlmglue layer.
Anyway, the problem belongs to ocfs2 own parts.

> IMO, we'd better directly fix the stack user hang first, and then optimize
> reflink performance.
Before we fix the potential problem in dlmglue layer, I suggest to 
accept this patch first. Since the dlmglue layer code is kind of 
compliant, we need to carefully modify this part of the code. But this 
patch can workaround dlm_lock failure(-16) problem for pcmk stack, 
meanwhile, it also can improve the performance.

Thanks
Gang


> 
> Thanks,
> Joseph
>
Joseph Qi Aug. 19, 2021, 2:02 a.m. UTC | #14
On 8/19/21 9:51 AM, Gang He wrote:
> 
> 
> On 2021/8/18 19:20, Joseph Qi wrote:
>>
>>
>> On 8/18/21 5:20 PM, Gang He wrote:
>>>
>>>
>>> On 2021/8/13 17:54, Joseph Qi wrote:
>>>>
>>>>
>>>> On 8/9/21 6:08 PM, Gang He wrote:
>>>>> Hi Joseph and All,
>>>>>
>>>>> The deadlock is caused by self-locking on one node.
>>>>> There is three node cluster (mounted to /mnt/shared), the user run reflink command to clone the file to the same directory repeatedly,
>>>>> e.g.
>>>>>    reflink "/mnt/shared/test" \
>>>>>    "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>>>
>>>>> After a while, the reflink process on each node is hung, the file system cannot be listed.
>>>>> The problematic reflink command process is blocked by itself, e.g. the reflink process is hung at ghe-sle15sp2-nd2,
>>>>> kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
>>>>> kernel: Call Trace:
>>>>> kernel:  __schedule+0x2fd/0x750
>>>>> kernel:  ? try_to_wake_up+0x17b/0x4e0
>>>>> kernel:  schedule+0x2f/0xa0
>>>>> kernel:  schedule_timeout+0x1cc/0x310
>>>>> kernel:  ? __wake_up_common+0x74/0x120
>>>>> kernel:  wait_for_completion+0xba/0x140
>>>>> kernel:  ? wake_up_q+0xa0/0xa0
>>>>> kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>> kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>> kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>> kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>>> kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>>> kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>> kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>> kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>> kernel:  do_vfs_ioctl+0xa0/0x680
>>>>> kernel:  ksys_ioctl+0x70/0x80
>>>>>
>>>>> In fact, the destination directory(.snapshots) inode dlm lock was acquired by ghe-sle15sp2-nd2, next there is bast message from other nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation failed, the kernel message is printed like,
>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M0000000000000000046e0200000000
>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: status = -16
>>>>>
>>>>> Then, the reflink process tries to acquire this directory inode dlm lock, the process is blocked, the dlm lock resource in memory looks like
>>>>>
>>>>>       l_name = "M0000000000000000046e0200000000",
>>>>>       l_ro_holders = 0,
>>>>>       l_ex_holders = 0,
>>>>>       l_level = 5 '\005',
>>>>>       l_requested = 0 '\000',
>>>>>       l_blocking = 5 '\005',
>>>>>       l_type = 0 '\000',
>>>>>       l_action = 0 '\000',
>>>>>       l_unlock_action = 0 '\000',
>>>>>       l_pending_gen = 645948,
>>>>>
>>>>>
>>>>> So far, I do not know what makes dlm lock function failed, it also looks we do not handle this failure case in dlmglue layer, but I always reproduce this hang with my test script, e.g.
>>>>>
>>>>>     loop=1
>>>>>     while ((loop++)) ; do
>>>>>           for i in `seq 1 100`; do
>>>>>             reflink "/mnt/shared/test" "/mnt/shared/.snapshots /test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
>>>>>           done
>>>>>           usleep 500000
>>>>>           rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
>>>>>     done
>>>>>
>>>>> My patch changes multiple acquiring dest directory inode dlm lock during in ocfs2_reflink function, it avoids the hang issue happen again.The code change also can improve reflink performance in this case.
>>>>>
>>>>> Thanks
>>>>> Gang
>>>>
>>>> 'status = -16' implies DLM_CANCELGRANT.
>>>> Do you use stack user instead of o2cb? If yes, can you try o2cb with
>>>> your reproducer?
>>>
>>> I setup o2cb based ocfs2 clusters with sle15sp2 and oracleLinux8u4.
>>> After two day testing with the same script, I did not encounter dlm_lock downconvert failure, the hang issue did not happen.
>>> After my patch was applied, there was not any side effect, the reflink performance was doubled in the case.
>>>
>>
>> Do you mean the hang only happens on stack user?
> Yes.
> Why? since o2cb based dlm_lock did not return error -16 when downcovert dlm lock during the whole testing.
> But pmck based dlm_lock retuned error -16 during the testing, then we did not handle this error further in dlmglue layer, next encounter the hang issue when dlm_lock acquire the lock. Maybe there is a race condition when using dlm_lock/dlm_unlock(cancel) in dlmglue layer.
> Anyway, the problem belongs to ocfs2 own parts.
> 
I meant if DLM_CANCELGRANT is not the expected return code, we'd
better fix the issue in stack_user.c but not dlmglue, e.g. some specific
wrapper.

Thanks,
Joseph
Gang He Aug. 19, 2021, 6:31 a.m. UTC | #15
On 2021/8/19 10:02, Joseph Qi wrote:
> 
> 
> On 8/19/21 9:51 AM, Gang He wrote:
>>
>>
>> On 2021/8/18 19:20, Joseph Qi wrote:
>>>
>>>
>>> On 8/18/21 5:20 PM, Gang He wrote:
>>>>
>>>>
>>>> On 2021/8/13 17:54, Joseph Qi wrote:
>>>>>
>>>>>
>>>>> On 8/9/21 6:08 PM, Gang He wrote:
>>>>>> Hi Joseph and All,
>>>>>>
>>>>>> The deadlock is caused by self-locking on one node.
>>>>>> There is three node cluster (mounted to /mnt/shared), the user run reflink command to clone the file to the same directory repeatedly,
>>>>>> e.g.
>>>>>>     reflink "/mnt/shared/test" \
>>>>>>     "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>>>>
>>>>>> After a while, the reflink process on each node is hung, the file system cannot be listed.
>>>>>> The problematic reflink command process is blocked by itself, e.g. the reflink process is hung at ghe-sle15sp2-nd2,
>>>>>> kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
>>>>>> kernel: Call Trace:
>>>>>> kernel:  __schedule+0x2fd/0x750
>>>>>> kernel:  ? try_to_wake_up+0x17b/0x4e0
>>>>>> kernel:  schedule+0x2f/0xa0
>>>>>> kernel:  schedule_timeout+0x1cc/0x310
>>>>>> kernel:  ? __wake_up_common+0x74/0x120
>>>>>> kernel:  wait_for_completion+0xba/0x140
>>>>>> kernel:  ? wake_up_q+0xa0/0xa0
>>>>>> kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>>> kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>> kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>> kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>>>> kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>>>> kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>> kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>> kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>>> kernel:  do_vfs_ioctl+0xa0/0x680
>>>>>> kernel:  ksys_ioctl+0x70/0x80
>>>>>>
>>>>>> In fact, the destination directory(.snapshots) inode dlm lock was acquired by ghe-sle15sp2-nd2, next there is bast message from other nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation failed, the kernel message is printed like,
>>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M0000000000000000046e0200000000
>>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
>>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: status = -16
>>>>>>
>>>>>> Then, the reflink process tries to acquire this directory inode dlm lock, the process is blocked, the dlm lock resource in memory looks like
>>>>>>
>>>>>>        l_name = "M0000000000000000046e0200000000",
>>>>>>        l_ro_holders = 0,
>>>>>>        l_ex_holders = 0,
>>>>>>        l_level = 5 '\005',
>>>>>>        l_requested = 0 '\000',
>>>>>>        l_blocking = 5 '\005',
>>>>>>        l_type = 0 '\000',
>>>>>>        l_action = 0 '\000',
>>>>>>        l_unlock_action = 0 '\000',
>>>>>>        l_pending_gen = 645948,
>>>>>>
>>>>>>
>>>>>> So far, I do not know what makes dlm lock function failed, it also looks we do not handle this failure case in dlmglue layer, but I always reproduce this hang with my test script, e.g.
>>>>>>
>>>>>>      loop=1
>>>>>>      while ((loop++)) ; do
>>>>>>            for i in `seq 1 100`; do
>>>>>>              reflink "/mnt/shared/test" "/mnt/shared/.snapshots /test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
>>>>>>            done
>>>>>>            usleep 500000
>>>>>>            rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
>>>>>>      done
>>>>>>
>>>>>> My patch changes multiple acquiring dest directory inode dlm lock during in ocfs2_reflink function, it avoids the hang issue happen again.The code change also can improve reflink performance in this case.
>>>>>>
>>>>>> Thanks
>>>>>> Gang
>>>>>
>>>>> 'status = -16' implies DLM_CANCELGRANT.
>>>>> Do you use stack user instead of o2cb? If yes, can you try o2cb with
>>>>> your reproducer?
>>>>
>>>> I setup o2cb based ocfs2 clusters with sle15sp2 and oracleLinux8u4.
>>>> After two day testing with the same script, I did not encounter dlm_lock downconvert failure, the hang issue did not happen.
>>>> After my patch was applied, there was not any side effect, the reflink performance was doubled in the case.
>>>>
>>>
>>> Do you mean the hang only happens on stack user?
>> Yes.
>> Why? since o2cb based dlm_lock did not return error -16 when downcovert dlm lock during the whole testing.
>> But pmck based dlm_lock retuned error -16 during the testing, then we did not handle this error further in dlmglue layer, next encounter the hang issue when dlm_lock acquire the lock. Maybe there is a race condition when using dlm_lock/dlm_unlock(cancel) in dlmglue layer.
>> Anyway, the problem belongs to ocfs2 own parts.
>>
> I meant if DLM_CANCELGRANT is not the expected return code, we'd
> better fix the issue in stack_user.c but not dlmglue, e.g. some specific
> wrapper.
We cannot wrapper(or ignore) this error in stack_user, otherwise it will 
lead to a hang problem when the next dlm_lock is invoked.
Based on comments from fs/dlm maintainer, the error -16 is returned by 
dlm_lock in case ocfs2 calls dlm_unlock(CANCEL) to cancel an in-progress 
dlm_lock() request.
In fact, if you read the code comments in dlmglue.c, it also talked 
about the similar situation, but I feel the current code should still 
has a race condition, then trigger dlm_lock return -16 error.
For o2cb stack, it's dlm_lock did not expose this error, maybe it is 
different in dlm implementation.

Thanks
Gang

> 
> Thanks,
> Joseph
>
Gang He Aug. 26, 2021, 5:56 a.m. UTC | #16
On 2021/8/19 14:31, Gang He wrote:
> 
> 
> On 2021/8/19 10:02, Joseph Qi wrote:
>>
>>
>> On 8/19/21 9:51 AM, Gang He wrote:
>>>
>>>
>>> On 2021/8/18 19:20, Joseph Qi wrote:
>>>>
>>>>
>>>> On 8/18/21 5:20 PM, Gang He wrote:
>>>>>
>>>>>
>>>>> On 2021/8/13 17:54, Joseph Qi wrote:
>>>>>>
>>>>>>
>>>>>> On 8/9/21 6:08 PM, Gang He wrote:
>>>>>>> Hi Joseph and All,
>>>>>>>
>>>>>>> The deadlock is caused by self-locking on one node.
>>>>>>> There is three node cluster (mounted to /mnt/shared), the user run reflink command to clone the file to the same directory repeatedly,
>>>>>>> e.g.
>>>>>>>      reflink "/mnt/shared/test" \
>>>>>>>      "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
>>>>>>>
>>>>>>> After a while, the reflink process on each node is hung, the file system cannot be listed.
>>>>>>> The problematic reflink command process is blocked by itself, e.g. the reflink process is hung at ghe-sle15sp2-nd2,
>>>>>>> kernel: task:reflink         state:D stack:    0 pid:16992 ppid:  4530
>>>>>>> kernel: Call Trace:
>>>>>>> kernel:  __schedule+0x2fd/0x750
>>>>>>> kernel:  ? try_to_wake_up+0x17b/0x4e0
>>>>>>> kernel:  schedule+0x2f/0xa0
>>>>>>> kernel:  schedule_timeout+0x1cc/0x310
>>>>>>> kernel:  ? __wake_up_common+0x74/0x120
>>>>>>> kernel:  wait_for_completion+0xba/0x140
>>>>>>> kernel:  ? wake_up_q+0xa0/0xa0
>>>>>>> kernel:  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
>>>>>>> kernel:  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>>> kernel:  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
>>>>>>> kernel:  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
>>>>>>> kernel:  ocfs2_reflink+0x436/0x4c0 [ocfs2]
>>>>>>> kernel:  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>>> kernel:  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
>>>>>>> kernel:  ocfs2_ioctl+0x25e/0x670 [ocfs2]
>>>>>>> kernel:  do_vfs_ioctl+0xa0/0x680
>>>>>>> kernel:  ksys_ioctl+0x70/0x80
>>>>>>>
>>>>>>> In fact, the destination directory(.snapshots) inode dlm lock was acquired by ghe-sle15sp2-nd2, next there is bast message from other nodes to ask ghe-sle15sp2-nd2 downconvert lock, but the operation failed, the kernel message is printed like,
>>>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_downconvert_lock:3660 ERROR: DLM error -16 while calling ocfs2_dlm_lock on resource M0000000000000000046e0200000000
>>>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_unblock_lock:3904 ERROR: status = -16
>>>>>>> kernel: (ocfs2dc-AA35DD9,2560,3):ocfs2_process_blocked_lock:4303 ERROR: status = -16
>>>>>>>
>>>>>>> Then, the reflink process tries to acquire this directory inode dlm lock, the process is blocked, the dlm lock resource in memory looks like
>>>>>>>
>>>>>>>         l_name = "M0000000000000000046e0200000000",
>>>>>>>         l_ro_holders = 0,
>>>>>>>         l_ex_holders = 0,
>>>>>>>         l_level = 5 '\005',
>>>>>>>         l_requested = 0 '\000',
>>>>>>>         l_blocking = 5 '\005',
>>>>>>>         l_type = 0 '\000',
>>>>>>>         l_action = 0 '\000',
>>>>>>>         l_unlock_action = 0 '\000',
>>>>>>>         l_pending_gen = 645948,
>>>>>>>
>>>>>>>
>>>>>>> So far, I do not know what makes dlm lock function failed, it also looks we do not handle this failure case in dlmglue layer, but I always reproduce this hang with my test script, e.g.
>>>>>>>
>>>>>>>       loop=1
>>>>>>>       while ((loop++)) ; do
>>>>>>>             for i in `seq 1 100`; do
>>>>>>>               reflink "/mnt/shared/test" "/mnt/shared/.snapshots /test.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
>>>>>>>             done
>>>>>>>             usleep 500000
>>>>>>>             rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
>>>>>>>       done
>>>>>>>
>>>>>>> My patch changes multiple acquiring dest directory inode dlm lock during in ocfs2_reflink function, it avoids the hang issue happen again.The code change also can improve reflink performance in this case.
>>>>>>>
>>>>>>> Thanks
>>>>>>> Gang
>>>>>>
>>>>>> 'status = -16' implies DLM_CANCELGRANT.
>>>>>> Do you use stack user instead of o2cb? If yes, can you try o2cb with
>>>>>> your reproducer?
>>>>>
>>>>> I setup o2cb based ocfs2 clusters with sle15sp2 and oracleLinux8u4.
>>>>> After two day testing with the same script, I did not encounter dlm_lock downconvert failure, the hang issue did not happen.
>>>>> After my patch was applied, there was not any side effect, the reflink performance was doubled in the case.
>>>>>
>>>>
>>>> Do you mean the hang only happens on stack user?
>>> Yes.
>>> Why? since o2cb based dlm_lock did not return error -16 when downcovert dlm lock during the whole testing.
>>> But pmck based dlm_lock retuned error -16 during the testing, then we did not handle this error further in dlmglue layer, next encounter the hang issue when dlm_lock acquire the lock. Maybe there is a race condition when using dlm_lock/dlm_unlock(cancel) in dlmglue layer.
>>> Anyway, the problem belongs to ocfs2 own parts.
>>>
>> I meant if DLM_CANCELGRANT is not the expected return code, we'd
>> better fix the issue in stack_user.c but not dlmglue, e.g. some specific
>> wrapper.
> We cannot wrapper(or ignore) this error in stack_user, otherwise it will
> lead to a hang problem when the next dlm_lock is invoked.
> Based on comments from fs/dlm maintainer, the error -16 is returned by
> dlm_lock in case ocfs2 calls dlm_unlock(CANCEL) to cancel an in-progress
> dlm_lock() request.
> In fact, if you read the code comments in dlmglue.c, it also talked
> about the similar situation, but I feel the current code should still
> has a race condition, then trigger dlm_lock return -16 error.
> For o2cb stack, it's dlm_lock did not expose this error, maybe it is
> different in dlm implementation.

Based on my further analysis and BASTS traces, I found there are some 
differences in behavior between o2dlm and fsdlm.
Usually, ocfs2_downconvert_lock() function always downconverts
dlm lock to the expected level for satisfy dlm bast requests
from the other nodes.
But there is a rare situation. When dlm lock conversion is being
canceled, ocfs2_downconvert_lock() function will return -EBUSY.
You need to be aware that ocfs2_cancel_convert() function is
asynchronous in fsdlm implementation, that means ocfs2_cancel_convert()
will return directly, then the actual dlm lock cancel is executed in 
background.
For o2dlm implementation, ocfs2_cancel_convert() function is 
synchronous, that means this function will return after the dlm lock is
cancelled and ast callback function is invoked.

If we does not requeue this lockres entry, ocfs2 downconvert
thread no longer handles this dlm lock bast request. Then, the
other nodes will not get the dlm lock again, the current node's
process will be blocked when acquire this dlm lock again.

So, I will send a new patch to fix this deadlock problem via dlmglue layer.
For this patch, I want to change the patch comments as a reflink 
improvement patch.

Thanks
Gang



> 
> Thanks
> Gang
> 
>>
>> Thanks,
>> Joseph
>>
> 
> 
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>
Andrew Morton Oct. 16, 2022, 11:29 p.m. UTC | #17
On Thu, 26 Aug 2021 13:56:16 +0800 Gang He <ghe@suse.com> wrote:

> So, I will send a new patch to fix this deadlock problem via dlmglue layer.
> For this patch, I want to change the patch comments as a reflink 
> improvement patch.

Did this ever happen?  I've been sitting on this patch for ages.


From: Gang He <ghe@suse.com>
Subject: ocfs2: reflink deadlock when clone file to the same directory simultaneously

Running reflink from multiple nodes simultaneously to clone a file to the
same directory probably triggers a deadlock issue.  For example, there is
a three node ocfs2 cluster, each node mounts the ocfs2 file system to
/mnt/shared, and run the reflink command from each node repeatedly, like

  reflink "/mnt/shared/test" \
  "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`"
then, reflink command process will be hung on each node, and you
can't list this file system directory.
The problematic reflink command process is blocked at one node,
task:reflink         state:D stack:    0 pid: 1283 ppid:  4154
Call Trace:
  __schedule+0x2fd/0x750
  schedule+0x2f/0xa0
  schedule_timeout+0x1cc/0x310
  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
  ? 0xffffffffc0e3e000
  wait_for_completion+0xba/0x140
  ? wake_up_q+0xa0/0xa0
  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2]
  ocfs2_reflink+0x436/0x4c0 [ocfs2]
  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_ioctl+0x25e/0x670 [ocfs2]
  do_vfs_ioctl+0xa0/0x680
  ksys_ioctl+0x70/0x80
  __x64_sys_ioctl+0x16/0x20
  do_syscall_64+0x5b/0x1e0
The other reflink command processes are blocked at other nodes,
task:reflink         state:D stack:    0 pid:29759 ppid:  4088
Call Trace:
  __schedule+0x2fd/0x750
  schedule+0x2f/0xa0
  schedule_timeout+0x1cc/0x310
  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
  ? 0xffffffffc0b19000
  wait_for_completion+0xba/0x140
  ? wake_up_q+0xa0/0xa0
  __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2]
  ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2]
  ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2]
  ocfs2_reflink+0x335/0x4c0 [ocfs2]
  ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2]
  ocfs2_ioctl+0x25e/0x670 [ocfs2]
  do_vfs_ioctl+0xa0/0x680
  ksys_ioctl+0x70/0x80
  __x64_sys_ioctl+0x16/0x20
  do_syscall_64+0x5b/0x1e0
or
task:reflink         state:D stack:    0 pid:18465 ppid:  4156
Call Trace:
  __schedule+0x302/0x940
  ? usleep_range+0x80/0x80
  schedule+0x46/0xb0
  schedule_timeout+0xff/0x140
  ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user]
  ? 0xffffffffc0c3b000
  __wait_for_common+0xb9/0x170
  __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2]
  ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2]
  ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
  ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2]
  ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2]
  ? dput+0x32/0x2f0
  ocfs2_permission+0x45/0xe0 [ocfs2]
  inode_permission+0xcc/0x170
  link_path_walk.part.0.constprop.0+0x2a2/0x380
  ? path_init+0x2c1/0x3f0
  path_parentat+0x3c/0x90
  filename_parentat+0xc1/0x1d0
  ? filename_lookup+0x138/0x1c0
  filename_create+0x43/0x160
  ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2]
  ocfs2_ioctl+0x1ea/0x2c0 [ocfs2]
  ? do_sys_openat2+0x81/0x150
  __x64_sys_ioctl+0x82/0xb0
  do_syscall_64+0x61/0xb0

The deadlock is caused by multiple acquiring the destination directory
inode dlm lock in ocfs2_reflink function, we should acquire this directory
inode dlm lock at the beginning, and hold this dlm lock until end of the
function.

Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com
Signed-off-by: Gang He <ghe@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 fs/ocfs2/namei.c        |   32 +++++++++++++-------------------
 fs/ocfs2/namei.h        |    2 ++
 fs/ocfs2/refcounttree.c |   15 +++++++++++----
 fs/ocfs2/xattr.c        |   12 +-----------
 fs/ocfs2/xattr.h        |    1 +
 5 files changed, 28 insertions(+), 34 deletions(-)

--- a/fs/ocfs2/namei.c~ocfs2-reflink-deadlock-when-clone-file-to-the-same-directory-simultaneously
+++ a/fs/ocfs2/namei.c
@@ -2490,6 +2490,7 @@ out:
 }
 
 int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 struct buffer_head **dir_bh,
 				 int mode,
 				 struct inode **new_inode)
 {
@@ -2598,13 +2599,16 @@ leave:
 
 	brelse(new_di_bh);
 
-	if (!status)
-		*new_inode = inode;
-
 	ocfs2_free_dir_lookup_result(&orphan_insert);
 
-	ocfs2_inode_unlock(dir, 1);
-	brelse(parent_di_bh);
+	if (!status) {
+		*new_inode = inode;
+		*dir_bh = parent_di_bh;
+	} else {
+		ocfs2_inode_unlock(dir, 1);
+		brelse(parent_di_bh);
+	}
+
 	return status;
 }
 
@@ -2761,11 +2765,11 @@ bail:
 }
 
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct buffer_head *dir_bh,
 				   struct inode *inode,
 				   struct dentry *dentry)
 {
 	int status = 0;
-	struct buffer_head *parent_di_bh = NULL;
 	handle_t *handle = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_dinode *dir_di, *di;
@@ -2779,14 +2783,7 @@ int ocfs2_mv_orphaned_inode_to_new(struc
 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
-	if (status < 0) {
-		if (status != -ENOENT)
-			mlog_errno(status);
-		return status;
-	}
-
-	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
 	if (!dir_di->i_links_count) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
@@ -2799,7 +2796,7 @@ int ocfs2_mv_orphaned_inode_to_new(struc
 		goto leave;
 
 	/* get a spot inside the dir. */
-	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
 					      dentry->d_name.name,
 					      dentry->d_name.len, &lookup);
 	if (status < 0) {
@@ -2863,7 +2860,7 @@ int ocfs2_mv_orphaned_inode_to_new(struc
 	ocfs2_journal_dirty(handle, di_bh);
 
 	status = ocfs2_add_entry(handle, dentry, inode,
-				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
+				 OCFS2_I(inode)->ip_blkno, dir_bh,
 				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);
@@ -2887,10 +2884,7 @@ orphan_unlock:
 	iput(orphan_dir_inode);
 leave:
 
-	ocfs2_inode_unlock(dir, 1);
-
 	brelse(di_bh);
-	brelse(parent_di_bh);
 	brelse(orphan_dir_bh);
 
 	ocfs2_free_dir_lookup_result(&lookup);
--- a/fs/ocfs2/namei.h~ocfs2-reflink-deadlock-when-clone-file-to-the-same-directory-simultaneously
+++ a/fs/ocfs2/namei.h
@@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super
 		     struct buffer_head *orphan_dir_bh,
 		     bool dio);
 int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 struct buffer_head **dir_bh,
 				 int mode,
 				 struct inode **new_inode);
 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
@@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct o
 		struct inode *inode, struct buffer_head *di_bh,
 		int update_isize, loff_t end);
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct buffer_head *dir_bh,
 				   struct inode *new_inode,
 				   struct dentry *new_dentry);
 
--- a/fs/ocfs2/refcounttree.c~ocfs2-reflink-deadlock-when-clone-file-to-the-same-directory-simultaneously
+++ a/fs/ocfs2/refcounttree.c
@@ -4222,7 +4222,7 @@ static int ocfs2_reflink(struct dentry *
 {
 	int error, had_lock;
 	struct inode *inode = d_inode(old_dentry);
-	struct buffer_head *old_bh = NULL;
+	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
 	struct inode *new_orphan_inode = NULL;
 	struct ocfs2_lock_holder oh;
 
@@ -4230,7 +4230,7 @@ static int ocfs2_reflink(struct dentry *
 		return -EOPNOTSUPP;
 
 
-	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
 					     &new_orphan_inode);
 	if (error) {
 		mlog_errno(error);
@@ -4276,13 +4276,15 @@ static int ocfs2_reflink(struct dentry *
 
 	/* If the security isn't preserved, we need to re-initialize them. */
 	if (!preserve) {
-		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+		error = ocfs2_init_security_and_acl(dir, dir_bh,
+						    new_orphan_inode,
 						    &new_dentry->d_name);
 		if (error)
 			mlog_errno(error);
 	}
 	if (!error) {
-		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
+						       new_orphan_inode,
 						       new_dentry);
 		if (error)
 			mlog_errno(error);
@@ -4300,6 +4302,11 @@ out:
 			iput(new_orphan_inode);
 	}
 
+	if (dir_bh) {
+		ocfs2_inode_unlock(dir, 1);
+		brelse(dir_bh);
+	}
+
 	return error;
 }
 
--- a/fs/ocfs2/xattr.c~ocfs2-reflink-deadlock-when-clone-file-to-the-same-directory-simultaneously
+++ a/fs/ocfs2/xattr.c
@@ -7203,16 +7203,13 @@ out:
 /*
  * Initialize security and acl for a already created inode.
  * Used for reflink a non-preserve-security file.
- *
- * It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_rwsem.
  */
 int ocfs2_init_security_and_acl(struct inode *dir,
+				struct buffer_head *dir_bh,
 				struct inode *inode,
 				const struct qstr *qstr)
 {
 	int ret = 0;
-	struct buffer_head *dir_bh = NULL;
 
 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
 	if (ret) {
@@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct i
 		goto leave;
 	}
 
-	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto leave;
-	}
 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
 	if (ret)
 		mlog_errno(ret);
 
-	ocfs2_inode_unlock(dir, 0);
-	brelse(dir_bh);
 leave:
 	return ret;
 }
--- a/fs/ocfs2/xattr.h~ocfs2-reflink-deadlock-when-clone-file-to-the-same-directory-simultaneously
+++ a/fs/ocfs2/xattr.h
@@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *o
 			 struct buffer_head *new_bh,
 			 bool preserve_security);
 int ocfs2_init_security_and_acl(struct inode *dir,
+				struct buffer_head *dir_bh,
 				struct inode *inode,
 				const struct qstr *qstr);
 #endif /* OCFS2_XATTR_H */
Joseph Qi Oct. 17, 2022, 1:51 a.m. UTC | #18
Hi Andrew,

On 10/17/22 7:29 AM, Andrew Morton via Ocfs2-devel wrote:
> On Thu, 26 Aug 2021 13:56:16 +0800 Gang He <ghe@suse.com> wrote:
> 
>> So, I will send a new patch to fix this deadlock problem via dlmglue layer.
>> For this patch, I want to change the patch comments as a reflink 
>> improvement patch.
> 
> Did this ever happen?  I've been sitting on this patch for ages.
> 
Looked back this thread, the root cause of deadlock seems to be the
asynchronous of fsdlm handling. So this is not a right fix.
While for the performance improvement, I need some test data as well.

Thanks,
Joseph
Andrew Morton Oct. 17, 2022, 2:29 a.m. UTC | #19
On Mon, 17 Oct 2022 09:51:14 +0800 Joseph Qi <jiangqi903@gmail.com> wrote:

> Hi Andrew,
> 
> On 10/17/22 7:29 AM, Andrew Morton via Ocfs2-devel wrote:
> > On Thu, 26 Aug 2021 13:56:16 +0800 Gang He <ghe@suse.com> wrote:
> > 
> >> So, I will send a new patch to fix this deadlock problem via dlmglue layer.
> >> For this patch, I want to change the patch comments as a reflink 
> >> improvement patch.
> > 
> > Did this ever happen?  I've been sitting on this patch for ages.
> > 
> Looked back this thread, the root cause of deadlock seems to be the
> asynchronous of fsdlm handling. So this is not a right fix.
> While for the performance improvement, I need some test data as well.

OK, thanks, I dropped the patch.
diff mbox series

Patch

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea..f8bbb22cc60b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2489,6 +2489,7 @@  static int ocfs2_prep_new_orphaned_file(struct inode *dir,
 }
 
 int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 struct buffer_head **dir_bh,
 				 int mode,
 				 struct inode **new_inode)
 {
@@ -2597,13 +2598,16 @@  int ocfs2_create_inode_in_orphan(struct inode *dir,
 
 	brelse(new_di_bh);
 
-	if (!status)
-		*new_inode = inode;
-
 	ocfs2_free_dir_lookup_result(&orphan_insert);
 
-	ocfs2_inode_unlock(dir, 1);
-	brelse(parent_di_bh);
+	if (!status) {
+		*new_inode = inode;
+		*dir_bh = parent_di_bh;
+	} else {
+		ocfs2_inode_unlock(dir, 1);
+		brelse(parent_di_bh);
+	}
+
 	return status;
 }
 
@@ -2760,11 +2764,11 @@  int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
 }
 
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct buffer_head *dir_bh,
 				   struct inode *inode,
 				   struct dentry *dentry)
 {
 	int status = 0;
-	struct buffer_head *parent_di_bh = NULL;
 	handle_t *handle = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_dinode *dir_di, *di;
@@ -2778,14 +2782,7 @@  int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 				(unsigned long long)OCFS2_I(dir)->ip_blkno,
 				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
-	if (status < 0) {
-		if (status != -ENOENT)
-			mlog_errno(status);
-		return status;
-	}
-
-	dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+	dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
 	if (!dir_di->i_links_count) {
 		/* can't make a file in a deleted directory. */
 		status = -ENOENT;
@@ -2798,7 +2795,7 @@  int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 		goto leave;
 
 	/* get a spot inside the dir. */
-	status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+	status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
 					      dentry->d_name.name,
 					      dentry->d_name.len, &lookup);
 	if (status < 0) {
@@ -2862,7 +2859,7 @@  int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	ocfs2_journal_dirty(handle, di_bh);
 
 	status = ocfs2_add_entry(handle, dentry, inode,
-				 OCFS2_I(inode)->ip_blkno, parent_di_bh,
+				 OCFS2_I(inode)->ip_blkno, dir_bh,
 				 &lookup);
 	if (status < 0) {
 		mlog_errno(status);
@@ -2886,10 +2883,7 @@  int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	iput(orphan_dir_inode);
 leave:
 
-	ocfs2_inode_unlock(dir, 1);
-
 	brelse(di_bh);
-	brelse(parent_di_bh);
 	brelse(orphan_dir_bh);
 
 	ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 9cc891eb874e..03a2c526e2c1 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -24,6 +24,7 @@  int ocfs2_orphan_del(struct ocfs2_super *osb,
 		     struct buffer_head *orphan_dir_bh,
 		     bool dio);
 int ocfs2_create_inode_in_orphan(struct inode *dir,
+				 struct buffer_head **dir_bh,
 				 int mode,
 				 struct inode **new_inode);
 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
@@ -32,6 +33,7 @@  int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
 		struct inode *inode, struct buffer_head *di_bh,
 		int update_isize, loff_t end);
 int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+				   struct buffer_head *dir_bh,
 				   struct inode *new_inode,
 				   struct dentry *new_dentry);
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7f6355cbb587..a9a0c7c37e8e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4250,7 +4250,7 @@  static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 {
 	int error, had_lock;
 	struct inode *inode = d_inode(old_dentry);
-	struct buffer_head *old_bh = NULL;
+	struct buffer_head *old_bh = NULL, *dir_bh = NULL;
 	struct inode *new_orphan_inode = NULL;
 	struct ocfs2_lock_holder oh;
 
@@ -4258,7 +4258,7 @@  static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 		return -EOPNOTSUPP;
 
 
-	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+	error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
 					     &new_orphan_inode);
 	if (error) {
 		mlog_errno(error);
@@ -4304,13 +4304,15 @@  static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 
 	/* If the security isn't preserved, we need to re-initialize them. */
 	if (!preserve) {
-		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+		error = ocfs2_init_security_and_acl(dir, dir_bh,
+						    new_orphan_inode,
 						    &new_dentry->d_name);
 		if (error)
 			mlog_errno(error);
 	}
 	if (!error) {
-		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+		error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
+						       new_orphan_inode,
 						       new_dentry);
 		if (error)
 			mlog_errno(error);
@@ -4328,6 +4330,11 @@  static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 			iput(new_orphan_inode);
 	}
 
+	if (dir_bh) {
+		ocfs2_inode_unlock(dir, 1);
+		brelse(dir_bh);
+	}
+
 	return error;
 }
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c..3f23e3a5018c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7203,16 +7203,13 @@  int ocfs2_reflink_xattrs(struct inode *old_inode,
 /*
  * Initialize security and acl for a already created inode.
  * Used for reflink a non-preserve-security file.
- *
- * It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
  */
 int ocfs2_init_security_and_acl(struct inode *dir,
+				struct buffer_head *dir_bh,
 				struct inode *inode,
 				const struct qstr *qstr)
 {
 	int ret = 0;
-	struct buffer_head *dir_bh = NULL;
 
 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
 	if (ret) {
@@ -7220,17 +7217,10 @@  int ocfs2_init_security_and_acl(struct inode *dir,
 		goto leave;
 	}
 
-	ret = ocfs2_inode_lock(dir, &dir_bh, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto leave;
-	}
 	ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
 	if (ret)
 		mlog_errno(ret);
 
-	ocfs2_inode_unlock(dir, 0);
-	brelse(dir_bh);
 leave:
 	return ret;
 }
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f..b27fd8ba0019 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -83,6 +83,7 @@  int ocfs2_reflink_xattrs(struct inode *old_inode,
 			 struct buffer_head *new_bh,
 			 bool preserve_security);
 int ocfs2_init_security_and_acl(struct inode *dir,
+				struct buffer_head *dir_bh,
 				struct inode *inode,
 				const struct qstr *qstr);
 #endif /* OCFS2_XATTR_H */