
unlink performance: wait for open lock in case of dirs

Message ID 20130615020510.GA4487@shrek.cartoons (mailing list archive)
State New, archived

Commit Message

Goldwyn Rodrigues June 15, 2013, 2:05 a.m. UTC
This patch improves unlink performance. Here is the scenario:

On node A, create multiple directories, say d1-d8, each containing 3
files: f1, f2 and f3.
On node B, delete all directories using rm -Rf d*

The FS first unlinks f1, f2 and f3. However, when it performs
ocfs2_evict_inode() -> ocfs2_delete_inode() ->
ocfs2_query_inode_wipe() -> ocfs2_try_open_lock() on d1, it fails
with -EAGAIN. The open lock fails because on the remote node
a PR->EX convert takes longer than a simple EX grant.

This starts a checkpoint because the OCFS2_INODE_DELETED flag is not
set on the directory inode. The checkpoint then interferes with the
journaling of the inodes deleted in the following unlinks, in our case
directories d2-d8 and the files contained in them.

With this patch, we wait on a directory's EX lock only if we already
hold the open_lock in PR mode. This way we avoid ABBA locking.
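
In code terms, the new wait decision reduces to the following (a
simplified restatement of the dlmglue.c hunk below):

	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
	if (!wait || (write && lockres->l_level != DLM_LOCK_PR))
		flags = DLM_LKF_NOQUEUE;  /* trylock: -EAGAIN instead of blocking */
	else
		flags = 0;                /* safe to wait on the PR->EX convert */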

By waiting for the open_lock on the directory, I am seeing an unlink
performance improvement of 50-60% for rm -Rf in the usual case.

Also, ocfs2_open_lock and ocfs2_try_open_lock have been folded into one.
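
Callers select the old behaviours through the two new arguments, e.g.
(taken from the hunks below):

	ocfs2_open_lock(inode, 0, 1); /* PR, blocking: old ocfs2_open_lock() */
	ocfs2_open_lock(inode, 0, 0); /* PR, trylock: old ocfs2_try_open_lock() */
	ocfs2_open_lock(inode, 1, S_ISDIR(inode->i_mode)); /* EX, wait for dirs only */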

Let me know if you would like to see the test case.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>

Comments

Shen Canquan June 18, 2013, 1:30 a.m. UTC | #1
On 2013/6/15 10:05, Goldwyn Rodrigues wrote:

> This patch improves unlink performance. Here is the scenario:
> 
> On node A, create multiple directories, say d1-d8, each containing 3
> files: f1, f2 and f3.
> On node B, delete all directories using rm -Rf d*
> 
> The FS first unlinks f1, f2 and f3. However, when it performs
> ocfs2_evict_inode() -> ocfs2_delete_inode() ->
> ocfs2_query_inode_wipe() -> ocfs2_try_open_lock() on d1, it fails
> with -EAGAIN. The open lock fails because on the remote node
> a PR->EX convert takes longer than a simple EX grant.


Why does a PR->EX convert take longer than a simple EX grant?

> 
> This starts a checkpoint because the OCFS2_INODE_DELETED flag is not
> set on the directory inode. The checkpoint then interferes with the
> journaling of the inodes deleted in the following unlinks, in our case
> directories d2-d8 and the files contained in them.
> 
> With this patch, we wait on a directory's EX lock only if we already
> hold the open_lock in PR mode. This way we avoid ABBA locking.

Why does this avoid the ABBA locking?

> 
> By waiting for the open_lock on the directory, I am seeing an unlink
> performance improvement of 50-60% for rm -Rf in the usual case.
> 
> Also, ocfs2_open_lock and ocfs2_try_open_lock have been folded into one.
> 
> Let me know if you would like to see the test case.
> 

Please send the test case. Thanks.

> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> Index: linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.c
> ===================================================================
> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/dlmglue.c	2013-06-14 06:47:29.322506695 -0500
> +++ linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.c	2013-06-14 09:19:48.651924037 -0500
> @@ -1681,9 +1681,9 @@ void ocfs2_rw_unlock(struct inode *inode
>  /*
>   * ocfs2_open_lock always get PR mode lock.
>   */
> -int ocfs2_open_lock(struct inode *inode)
> +int ocfs2_open_lock(struct inode *inode, int write, int wait)
>  {
> -	int status = 0;
> +	int status = 0, level, flags;
>  	struct ocfs2_lock_res *lockres;
>  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>  
> @@ -1696,43 +1696,25 @@ int ocfs2_open_lock(struct inode *inode)
>  		goto out;
>  
>  	lockres = &OCFS2_I(inode)->ip_open_lockres;
> -
> -	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
> -				    DLM_LOCK_PR, 0, 0);
> -	if (status < 0)
> -		mlog_errno(status);
> -
> -out:
> -	return status;
> -}
> -
> -int ocfs2_try_open_lock(struct inode *inode, int write)
> -{
> -	int status = 0, level;
> -	struct ocfs2_lock_res *lockres;
> -	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> -
> -	BUG_ON(!inode);
> -
> -	mlog(0, "inode %llu try to take %s open lock\n",
> -	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
> -	     write ? "EXMODE" : "PRMODE");
> -
> -	if (ocfs2_mount_local(osb))
> -		goto out;
> -
> -	lockres = &OCFS2_I(inode)->ip_open_lockres;
> -
>  	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
> +	if (wait) {
> +		flags = 0;
> +		/* If we don't already have the lock in PR mode,
> +		 * don't wait.
> +		 *
> +		 * This should avoid ABBA locking.
> +		 */
> +		if ((lockres->l_level != DLM_LOCK_PR) && write)
> +			flags = DLM_LKF_NOQUEUE;
> +
> +	} else
> +		flags = DLM_LKF_NOQUEUE;
>  
> -	/*
> -	 * The file system may already holding a PRMODE/EXMODE open lock.
> -	 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
> -	 * other nodes and the -EAGAIN will indicate to the caller that
> -	 * this inode is still in use.
> -	 */
>  	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
> -				    level, DLM_LKF_NOQUEUE, 0);
> +				    level, flags, 0);
> +
> +	if ((status < 0) && (flags && (status != -EAGAIN)))
> +		mlog_errno(status);
>  
>  out:
>  	return status;
> Index: linux-3.0-SLE11-SP3/fs/ocfs2/inode.c
> ===================================================================
> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/inode.c	2013-06-13 22:54:32.527606012 -0500
> +++ linux-3.0-SLE11-SP3/fs/ocfs2/inode.c	2013-06-14 07:33:53.960196648 -0500
> @@ -455,7 +455,7 @@ static int ocfs2_read_locked_inode(struc
>  				  0, inode);
>  
>  	if (can_lock) {
> -		status = ocfs2_open_lock(inode);
> +		status = ocfs2_open_lock(inode, 0, 1);
>  		if (status) {
>  			make_bad_inode(inode);
>  			mlog_errno(status);
> @@ -470,7 +470,7 @@ static int ocfs2_read_locked_inode(struc
>  	}
>  
>  	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
> -		status = ocfs2_try_open_lock(inode, 0);
> +		status = ocfs2_open_lock(inode, 0, 0);
>  		if (status) {
>  			make_bad_inode(inode);
>  			return status;
> @@ -923,7 +923,8 @@ static int ocfs2_query_inode_wipe(struct
>  	 * Though we call this with the meta data lock held, the
>  	 * trylock keeps us from ABBA deadlock.
>  	 */
> -	status = ocfs2_try_open_lock(inode, 1);
> +	status = ocfs2_open_lock(inode, 1, S_ISDIR(inode->i_mode));
> +
>  	if (status == -EAGAIN) {
>  		status = 0;
>  		reason = 3;
> Index: linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.h
> ===================================================================
> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/dlmglue.h	2013-06-10 09:45:20.787386504 -0500
> +++ linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.h	2013-06-14 07:38:49.861576515 -0500
> @@ -110,8 +110,7 @@ int ocfs2_create_new_inode_locks(struct
>  int ocfs2_drop_inode_locks(struct inode *inode);
>  int ocfs2_rw_lock(struct inode *inode, int write);
>  void ocfs2_rw_unlock(struct inode *inode, int write);
> -int ocfs2_open_lock(struct inode *inode);
> -int ocfs2_try_open_lock(struct inode *inode, int write);
> +int ocfs2_open_lock(struct inode *inode, int write, int wait);
>  void ocfs2_open_unlock(struct inode *inode);
>  int ocfs2_inode_lock_atime(struct inode *inode,
>  			  struct vfsmount *vfsmnt,
> Index: linux-3.0-SLE11-SP3/fs/ocfs2/namei.c
> ===================================================================
> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/namei.c	2013-06-13 22:54:32.527606012 -0500
> +++ linux-3.0-SLE11-SP3/fs/ocfs2/namei.c	2013-06-14 07:40:06.623785914 -0500
> @@ -2302,7 +2302,7 @@ int ocfs2_create_inode_in_orphan(struct
>  	}
>  
>  	/* get open lock so that only nodes can't remove it from orphan dir. */
> -	status = ocfs2_open_lock(inode);
> +	status = ocfs2_open_lock(inode, 0, 1);
>  	if (status < 0)
>  		mlog_errno(status);
>  
> 
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
> 
> .
>
Goldwyn Rodrigues June 18, 2013, 2:17 a.m. UTC | #2
On Mon, Jun 17, 2013 at 8:30 PM, shencanquan <shencanquan@huawei.com> wrote:
> On 2013/6/15 10:05, Goldwyn Rodrigues wrote:
>
>> This patch improves unlink performance. Here is the scenario:
>>
>> On node A, create multiple directories, say d1-d8, each containing 3
>> files: f1, f2 and f3.
>> On node B, delete all directories using rm -Rf d*
>>
>> The FS first unlinks f1, f2 and f3. However, when it performs
>> ocfs2_evict_inode() -> ocfs2_delete_inode() ->
>> ocfs2_query_inode_wipe() -> ocfs2_try_open_lock() on d1, it fails
>> with -EAGAIN. The open lock fails because on the remote node
>> a PR->EX convert takes longer than a simple EX grant.
>
>
> Why does a PR->EX convert take longer than a simple EX grant?

This is best answered by the DLM. In short, a fresh EX request goes to
the lock master and, if no conflicting lock is found, is granted
immediately. In the case of PR->EX, the request first goes to the lock
master; the lock master checks the holders and asks each one to unlock.
Only once it gets a response from all PR holders does it grant the lock.
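
Roughly, in message terms (a simplified sketch; the real DLM protocol
has more states than shown here):

	/*
	 * Fresh EX grant:   requester -> master : request EX
	 *                   master -> requester : grant (no conflicting holders)
	 *
	 * PR->EX convert:   requester -> master : convert PR -> EX
	 *                   master -> each PR holder : blocking AST
	 *                   each PR holder -> master : downconvert/unlock done
	 *                   master -> requester : grant
	 */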

>
>>
>> This starts a checkpoint because the OCFS2_INODE_DELETED flag is not
>> set on the directory inode. The checkpoint then interferes with the
>> journaling of the inodes deleted in the following unlinks, in our case
>> directories d2-d8 and the files contained in them.
>>
>> With this patch, we wait on a directory's EX lock only if we already
>> hold the open_lock in PR mode. This way we avoid ABBA locking.
>
> Why does this avoid the ABBA locking?

The locking sequence in inode.c:ocfs2_read_locked_inode() is the open
lock first, then the meta lock (ocfs2_inode_lock). ocfs2_try_open_lock
was a non-blocking call and was always made with the meta lock held, so
a blocking open_lock there would deadlock. OTOH, if we already possess
the PR lock, the open_lock->meta_lock sequence has already happened and
the request is just an upgrade from PR to EX, hence we can wait.
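
To make the ordering concrete (a simplified sketch of the two paths):

	/*
	 * ocfs2_read_locked_inode():   ocfs2_query_inode_wipe():
	 *   open lock  [A]               meta lock already held [B]
	 *   meta lock  [B]               open lock EX request   [A]
	 *
	 * A blocking EX request in the second path inverts the A->B order
	 * (ABBA).  If this node already holds the open lock in PR, the
	 * A->B ordering already happened, so waiting on the PR->EX
	 * upgrade cannot deadlock.
	 */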

>
>>
>> By waiting for the open_lock on the directory, I am seeing an unlink
>> performance improvement of 50-60% for rm -Rf in the usual case.
>>
>> Also, ocfs2_open_lock and ocfs2_try_open_lock have been folded into one.
>>
>> Let me know if you would like to see the test case.
>>
>
> Please send the test case. Thanks.

Attached. You will have to modify the names of the other nodes in the
script; in my case, they were "sles11a" and "sles11c". Also remember to
set up .ssh/authorized_keys for automatic logins. You may see delays in
later calls because the ocfs2cmt thread wakes up with a big backlog,
but it is still much better than frequent checkpoints.

Create a directory, copy the script there, and execute it. Make sure
the ocfs2 partition is mounted at the same path on all the machines.

Hope that clarifies your doubts. Let me know if you need further clarifications.

>
>> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>
>> Index: linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.c
>> ===================================================================
>> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/dlmglue.c       2013-06-14 06:47:29.322506695 -0500
>> +++ linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.c    2013-06-14 09:19:48.651924037 -0500
>> @@ -1681,9 +1681,9 @@ void ocfs2_rw_unlock(struct inode *inode
>>  /*
>>   * ocfs2_open_lock always get PR mode lock.
>>   */
>> -int ocfs2_open_lock(struct inode *inode)
>> +int ocfs2_open_lock(struct inode *inode, int write, int wait)
>>  {
>> -     int status = 0;
>> +     int status = 0, level, flags;
>>       struct ocfs2_lock_res *lockres;
>>       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>>
>> @@ -1696,43 +1696,25 @@ int ocfs2_open_lock(struct inode *inode)
>>               goto out;
>>
>>       lockres = &OCFS2_I(inode)->ip_open_lockres;
>> -
>> -     status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
>> -                                 DLM_LOCK_PR, 0, 0);
>> -     if (status < 0)
>> -             mlog_errno(status);
>> -
>> -out:
>> -     return status;
>> -}
>> -
>> -int ocfs2_try_open_lock(struct inode *inode, int write)
>> -{
>> -     int status = 0, level;
>> -     struct ocfs2_lock_res *lockres;
>> -     struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>> -
>> -     BUG_ON(!inode);
>> -
>> -     mlog(0, "inode %llu try to take %s open lock\n",
>> -          (unsigned long long)OCFS2_I(inode)->ip_blkno,
>> -          write ? "EXMODE" : "PRMODE");
>> -
>> -     if (ocfs2_mount_local(osb))
>> -             goto out;
>> -
>> -     lockres = &OCFS2_I(inode)->ip_open_lockres;
>> -
>>       level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
>> +     if (wait) {
>> +             flags = 0;
>> +             /* If we don't already have the lock in PR mode,
>> +              * don't wait.
>> +              *
>> +              * This should avoid ABBA locking.
>> +              */
>> +             if ((lockres->l_level != DLM_LOCK_PR) && write)
>> +                     flags = DLM_LKF_NOQUEUE;
>> +
>> +     } else
>> +             flags = DLM_LKF_NOQUEUE;
>>
>> -     /*
>> -      * The file system may already holding a PRMODE/EXMODE open lock.
>> -      * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
>> -      * other nodes and the -EAGAIN will indicate to the caller that
>> -      * this inode is still in use.
>> -      */
>>       status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
>> -                                 level, DLM_LKF_NOQUEUE, 0);
>> +                                 level, flags, 0);
>> +
>> +     if ((status < 0) && (flags && (status != -EAGAIN)))
>> +             mlog_errno(status);
>>
>>  out:
>>       return status;
>> Index: linux-3.0-SLE11-SP3/fs/ocfs2/inode.c
>> ===================================================================
>> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/inode.c 2013-06-13 22:54:32.527606012 -0500
>> +++ linux-3.0-SLE11-SP3/fs/ocfs2/inode.c      2013-06-14 07:33:53.960196648 -0500
>> @@ -455,7 +455,7 @@ static int ocfs2_read_locked_inode(struc
>>                                 0, inode);
>>
>>       if (can_lock) {
>> -             status = ocfs2_open_lock(inode);
>> +             status = ocfs2_open_lock(inode, 0, 1);
>>               if (status) {
>>                       make_bad_inode(inode);
>>                       mlog_errno(status);
>> @@ -470,7 +470,7 @@ static int ocfs2_read_locked_inode(struc
>>       }
>>
>>       if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
>> -             status = ocfs2_try_open_lock(inode, 0);
>> +             status = ocfs2_open_lock(inode, 0, 0);
>>               if (status) {
>>                       make_bad_inode(inode);
>>                       return status;
>> @@ -923,7 +923,8 @@ static int ocfs2_query_inode_wipe(struct
>>        * Though we call this with the meta data lock held, the
>>        * trylock keeps us from ABBA deadlock.
>>        */
>> -     status = ocfs2_try_open_lock(inode, 1);
>> +     status = ocfs2_open_lock(inode, 1, S_ISDIR(inode->i_mode));
>> +
>>       if (status == -EAGAIN) {
>>               status = 0;
>>               reason = 3;
>> Index: linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.h
>> ===================================================================
>> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/dlmglue.h       2013-06-10 09:45:20.787386504 -0500
>> +++ linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.h    2013-06-14 07:38:49.861576515 -0500
>> @@ -110,8 +110,7 @@ int ocfs2_create_new_inode_locks(struct
>>  int ocfs2_drop_inode_locks(struct inode *inode);
>>  int ocfs2_rw_lock(struct inode *inode, int write);
>>  void ocfs2_rw_unlock(struct inode *inode, int write);
>> -int ocfs2_open_lock(struct inode *inode);
>> -int ocfs2_try_open_lock(struct inode *inode, int write);
>> +int ocfs2_open_lock(struct inode *inode, int write, int wait);
>>  void ocfs2_open_unlock(struct inode *inode);
>>  int ocfs2_inode_lock_atime(struct inode *inode,
>>                         struct vfsmount *vfsmnt,
>> Index: linux-3.0-SLE11-SP3/fs/ocfs2/namei.c
>> ===================================================================
>> --- linux-3.0-SLE11-SP3.orig/fs/ocfs2/namei.c 2013-06-13 22:54:32.527606012 -0500
>> +++ linux-3.0-SLE11-SP3/fs/ocfs2/namei.c      2013-06-14 07:40:06.623785914 -0500
>> @@ -2302,7 +2302,7 @@ int ocfs2_create_inode_in_orphan(struct
>>       }
>>
>>       /* get open lock so that only nodes can't remove it from orphan dir. */
>> -     status = ocfs2_open_lock(inode);
>> +     status = ocfs2_open_lock(inode, 0, 1);
>>       if (status < 0)
>>               mlog_errno(status);
>>
>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel@oss.oracle.com
>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>
>> .
>>
>
>
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel@oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel



--
Goldwyn
Andrew Morton June 28, 2013, 11:12 p.m. UTC | #3
On Fri, 14 Jun 2013 21:05:10 -0500 Goldwyn Rodrigues <rgoldwyn@gmail.com> wrote:

> This patch improves unlink performance. Here is the scenario:
> 
> On node A, create multiple directories, say d1-d8, each containing 3
> files: f1, f2 and f3.
> On node B, delete all directories using rm -Rf d*
> 
> The FS first unlinks f1, f2 and f3. However, when it performs
> ocfs2_evict_inode() -> ocfs2_delete_inode() ->
> ocfs2_query_inode_wipe() -> ocfs2_try_open_lock() on d1, it fails
> with -EAGAIN. The open lock fails because on the remote node
> a PR->EX convert takes longer than a simple EX grant.
> 
> This starts a checkpoint because the OCFS2_INODE_DELETED flag is not
> set on the directory inode. The checkpoint then interferes with the
> journaling of the inodes deleted in the following unlinks, in our case
> directories d2-d8 and the files contained in them.
> 
> With this patch, we wait on a directory's EX lock only if we already
> hold the open_lock in PR mode. This way we avoid ABBA locking.
> 
> By waiting for the open_lock on the directory, I am seeing an unlink
> performance improvement of 50-60% for rm -Rf in the usual case.
> 
> Also, ocfs2_open_lock and ocfs2_try_open_lock have been folded into one.
> 
> Let me know if you would like to see the test case.

We need some more review and test of this patch, please.

The patch doesn't apply to current kernels - please redo, retest and
resend it.

In particular, the kernel you're patching doesn't have the
ocfs2_is_hard_readonly() tests in ocfs2_open_lock() and
ocfs2_try_open_lock().

And looking at those tests, I wonder about ocfs2_open_lock().

: int ocfs2_open_lock(struct inode *inode)
: {
: 	int status = 0;
: 	struct ocfs2_lock_res *lockres;
: 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
: 
: 	BUG_ON(!inode);
: 
: 	mlog(0, "inode %llu take PRMODE open lock\n",
: 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
: 
: 	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
: 		goto out;
: 
: 	lockres = &OCFS2_I(inode)->ip_open_lockres;
: 
: 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
: 				    DLM_LOCK_PR, 0, 0);
: 	if (status < 0)
: 		mlog_errno(status);
: 
: out:
: 	return status;
: }

It will return zero if ocfs2_is_hard_readonly().  Is that correct? 
Should it return -EROFS for writes?  After your patch, this code does
know whether the attempt was for a write.
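
Something like this untested sketch (not part of the posted patch)
would do it:

	if (ocfs2_is_hard_readonly(osb)) {
		if (write)
			status = -EROFS;
		goto out;
	}

	if (ocfs2_mount_local(osb))
		goto out;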

Please check all that.
jeff.liu June 30, 2013, 6:44 a.m. UTC | #4
On 06/29/2013 07:12 AM, Andrew Morton wrote:

> On Fri, 14 Jun 2013 21:05:10 -0500 Goldwyn Rodrigues <rgoldwyn@gmail.com> wrote:
> 
>> This patch improves unlink performance. Here is the scenario:
>>
>> On node A, create multiple directories, say d1-d8, each containing 3
>> files: f1, f2 and f3.
>> On node B, delete all directories using rm -Rf d*
>>
>> The FS first unlinks f1, f2 and f3. However, when it performs
>> ocfs2_evict_inode() -> ocfs2_delete_inode() ->
>> ocfs2_query_inode_wipe() -> ocfs2_try_open_lock() on d1, it fails
>> with -EAGAIN. The open lock fails because on the remote node
>> a PR->EX convert takes longer than a simple EX grant.
>>
>> This starts a checkpoint because the OCFS2_INODE_DELETED flag is not
>> set on the directory inode. The checkpoint then interferes with the
>> journaling of the inodes deleted in the following unlinks, in our case
>> directories d2-d8 and the files contained in them.
>>
>> With this patch, we wait on a directory's EX lock only if we already
>> hold the open_lock in PR mode. This way we avoid ABBA locking.
>>
>> By waiting for the open_lock on the directory, I am seeing an unlink
>> performance improvement of 50-60% for rm -Rf in the usual case.
>>
>> Also, ocfs2_open_lock and ocfs2_try_open_lock have been folded into one.
>>
>> Let me know if you would like to see the test case.
> 
> We need some more review and test of this patch, please.
> 
> The patch doesn't apply to current kernels - please redo, retest and
> resend it.
> 
> In particular, the kernel you're patching doesn't have the
> ocfs2_is_hard_readonly() tests in ocfs2_open_lock() and
> ocfs2_try_open_lock().
> 
> And looking at those tests, I wonder about ocfs2_open_lock().
> 
> : int ocfs2_open_lock(struct inode *inode)
> : {
> : 	int status = 0;
> : 	struct ocfs2_lock_res *lockres;
> : 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
> : 
> : 	BUG_ON(!inode);
> : 
> : 	mlog(0, "inode %llu take PRMODE open lock\n",
> : 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
> : 
> : 	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
> : 		goto out;
> : 
> : 	lockres = &OCFS2_I(inode)->ip_open_lockres;
> : 
> : 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
> : 				    DLM_LOCK_PR, 0, 0);
> : 	if (status < 0)
> : 		mlog_errno(status);
> : 
> : out:
> : 	return status;
> : }
> 
> It will return zero if ocfs2_is_hard_readonly().  Is that correct? 

> Should it return -EROFS for writes? 

Hi Andrew,

That is correct for the current design, because ocfs2_open_lock() is
implemented to grant the lock in protected read (PR) mode.

The ocfs2_is_hard_readonly() check was introduced to ocfs2_open_lock()
by commit 03efed8a2a1b8e00164eb4720a82a7dd5e368a8e:

    ocfs2: Bugfix for hard readonly mount
        ocfs2 cannot currently mount a device that is readonly at the media
    ("hard readonly").  Fix the broken places.
    see detail: http://oss.oracle.com/bugzilla/show_bug.cgi?id=1322

> After your patch, this code does
> know whether the attempt was for a write.



Yes, with this change it indeed needs a fix.

> 
> Please check all that.




Hi Goldwyn,

Sorry for the very late response; I had mentioned that I would take a
look at it, but some internal tasks kept me fully occupied over the
past couple of weeks.

As Andrew commented, this patch cannot be applied properly against the
upstream development tree; could you please rebase it?

If we merge ocfs2_open_lock()/ocfs2_try_open_lock() into one, it looks
like the semantics of ocfs2_open_lock() are broken once the 'write'
option is introduced. That is not a big issue, as we can tweak the
comments as well as the affected functions, but we need to run
ocfs2-test (at least covering all the code paths involved in this
change) to avoid any regression; it can be downloaded from:
https://oss.oracle.com/projects/ocfs2-test/source.html
https://oss.oracle.com/osswiki/OCFS2/Ocfs2TestList.html

Another thing comes to mind: this patch affects ocfs2_evict_inode()
via ocfs2_try_open_lock(), i.e., the following call chain:

ocfs2_evict_inode()
|
|-------ocfs2_delete_inode()
	|
	|-------ocfs2_query_inode_wipe()
	|	|
	|	|-------ocfs2_try_open_lock()
	|	|
	|-------ocfs2_clear_inode()
		|
		|---------ocfs2_open_unlock()

OCFS2 has a deadlock issue when creating/removing tons of files in
parallel with disk quotas enabled; please refer to:
https://oss.oracle.com/bugzilla/show_bug.cgi?id=1339

As per the old discussions, Jan Kara pointed out that there is a race
in ocfs2_evict_inode() if you consider the following lock sequence:

ocfs2_evict_inode()
|
|-------ocfs2_inode_lock()
	|
	|-------ocfs2_try_open_lock() [ try to get open lock in EX mode ]
		|
		|-------ocfs2_inode_unlock()
			|
			|-------ocfs2_open_unlock() [ drops shared open lock that is hold ]

Quoting Jan:
"""
Now if two nodes happen to execute ocfs2_evict_inode() in parallel and
ocfs2_try_open_lock() happens on both nodes before ocfs2_open_unlock() is
called on any of them, ocfs2_try_open_lock() fails for both nodes...

I would think that the code should be reorganized so that shared open
lock is dropped before we drop inode lock. Then the race could not happen.
But I'm not sure if something else would not break.
"""

This deserves a try IMHO, especially since there is a kind of
dependency with your fix; it would be wonderful if we could have your
improvement as well as the lock-sequence adjustments that might fix
the quota problems.
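
A rough sketch of that reordering in the eviction path (untested; the
function names follow the existing code, the ordering change is the
only point):

	/* Drop the shared open lock before the inode (meta) lock, so a
	 * racing node's EX trylock cannot observe both nodes still
	 * holding PR. */
	ocfs2_open_unlock(inode);
	ocfs2_inode_unlock(inode, 1);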


Thanks,
-Jeff
Goldwyn Rodrigues July 1, 2013, 4:44 p.m. UTC | #5
On Sun, Jun 30, 2013 at 1:44 AM, Jeff Liu <jeff.liu@oracle.com> wrote:
> On 06/29/2013 07:12 AM, Andrew Morton wrote:
>
>> On Fri, 14 Jun 2013 21:05:10 -0500 Goldwyn Rodrigues <rgoldwyn@gmail.com> wrote:
>>
>>> This patch improves unlink performance. Here is the scenario:
>>>
>>> On node A, create multiple directories, say d1-d8, each containing 3
>>> files: f1, f2 and f3.
>>> On node B, delete all directories using rm -Rf d*
>>>
>>> The FS first unlinks f1, f2 and f3. However, when it performs
>>> ocfs2_evict_inode() -> ocfs2_delete_inode() ->
>>> ocfs2_query_inode_wipe() -> ocfs2_try_open_lock() on d1, it fails
>>> with -EAGAIN. The open lock fails because on the remote node
>>> a PR->EX convert takes longer than a simple EX grant.
>>>
>>> This starts a checkpoint because the OCFS2_INODE_DELETED flag is not
>>> set on the directory inode. The checkpoint then interferes with the
>>> journaling of the inodes deleted in the following unlinks, in our case
>>> directories d2-d8 and the files contained in them.
>>>
>>> With this patch, we wait on a directory's EX lock only if we already
>>> hold the open_lock in PR mode. This way we avoid ABBA locking.
>>>
>>> By waiting for the open_lock on the directory, I am seeing an unlink
>>> performance improvement of 50-60% for rm -Rf in the usual case.
>>>
>>> Also, ocfs2_open_lock and ocfs2_try_open_lock have been folded into one.
>>>
>>> Let me know if you would like to see the test case.
>>
>> We need some more review and test of this patch, please.
>>
>> The patch doesn't apply to current kernels - please redo, retest and
>> resend it.

Yes, I sent it against an older kernel. I will update it once I
incorporate the other review comments. Thanks for this comment.

>>
>> In particular, the kernel you're patching doesn't have the
>> ocfs2_is_hard_readonly() tests in ocfs2_open_lock() and
>> ocfs2_try_open_lock().
>>
>> And looking at those tests, I wonder about ocfs2_open_lock().
>>
>> : int ocfs2_open_lock(struct inode *inode)
>> : {
>> :     int status = 0;
>> :     struct ocfs2_lock_res *lockres;
>> :     struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>> :
>> :     BUG_ON(!inode);
>> :
>> :     mlog(0, "inode %llu take PRMODE open lock\n",
>> :          (unsigned long long)OCFS2_I(inode)->ip_blkno);
>> :
>> :     if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
>> :             goto out;
>> :
>> :     lockres = &OCFS2_I(inode)->ip_open_lockres;
>> :
>> :     status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
>> :                                 DLM_LOCK_PR, 0, 0);
>> :     if (status < 0)
>> :             mlog_errno(status);
>> :
>> : out:
>> :     return status;
>> : }
>>
>> It will return zero if ocfs2_is_hard_readonly().  Is that correct?
>
>> Should it return -EROFS for writes?

Yes, it should.


>
> Hi Andrew,
>
> That is correct for the current design, because ocfs2_open_lock() is
> implemented to grant the lock in protected read (PR) mode.
>
> The ocfs2_is_hard_readonly() check was introduced to ocfs2_open_lock()
> by commit 03efed8a2a1b8e00164eb4720a82a7dd5e368a8e:
>
>     ocfs2: Bugfix for hard readonly mount
>         ocfs2 cannot currently mount a device that is readonly at the media
>     ("hard readonly").  Fix the broken places.
>     see detail: http://oss.oracle.com/bugzilla/show_bug.cgi?id=1322
>
>> After your patch, this code does
>> know whether the attempt was for a write.
>
>
>
> Yes, with this change it indeed needs a fix.
>
>>
>> Please check all that.
>
>
>
>
> Hi Goldwyn,
>
> Sorry for the very late response; I had mentioned that I would take a
> look at it, but some internal tasks kept me fully occupied over the
> past couple of weeks.

No problem.

>
> As Andrew commented, this patch cannot be applied properly against the
> upstream development tree; could you please rebase it?

Yes, will do. I have other comments from Mark as well.

>
> If we merge ocfs2_open_lock()/ocfs2_try_open_lock() into one, it looks
> like the semantics of ocfs2_open_lock() are broken once the 'write'
> option is introduced. That is not a big issue, as we can tweak the
> comments as well as the affected functions, but we need to run
> ocfs2-test (at least covering all the code paths involved in this
> change) to avoid any regression; it can be downloaded from:
> https://oss.oracle.com/projects/ocfs2-test/source.html
> https://oss.oracle.com/osswiki/OCFS2/Ocfs2TestList.html
>
> Another thing comes to mind: this patch affects ocfs2_evict_inode()
> via ocfs2_try_open_lock(), i.e., the following call chain:
>
> ocfs2_evict_inode()
> |
> |-------ocfs2_delete_inode()
>         |
>         |-------ocfs2_query_inode_wipe()
>         |       |
>         |       |-------ocfs2_try_open_lock()
>         |       |
>         |-------ocfs2_clear_inode()
>                 |
>                 |---------ocfs2_open_unlock()
>
> OCFS2 has a deadlock issue when creating/removing tons of files in
> parallel with disk quotas enabled; please refer to:
> https://oss.oracle.com/bugzilla/show_bug.cgi?id=1339
>
> As per the old discussions, Jan Kara pointed out that there is a race
> in ocfs2_evict_inode() if you consider the following lock sequence:
>
> ocfs2_evict_inode()
> |
> |-------ocfs2_inode_lock()
>         |
>         |-------ocfs2_try_open_lock() [ try to get open lock in EX mode ]
>                 |
>                 |-------ocfs2_inode_unlock()
>                         |
>                         |-------ocfs2_open_unlock() [ drops shared open lock that is hold ]
>
> Quoting Jan:
> """
> Now if two nodes happen to execute ocfs2_evict_inode() in parallel and
> ocfs2_try_open_lock() happens on both nodes before ocfs2_open_unlock() is
> called on any of them, ocfs2_try_open_lock() fails for both nodes...
>
> I would think that the code should be reorganized so that shared open
> lock is dropped before we drop inode lock. Then the race could not happen.
> But I'm not sure if something else would not break.
> """

I could not find a reference to this in the bug. Was it discussed on
the ML? Do you have a link so I can look at the history?

>
> This deserves a try IMHO, especially since there is a kind of
> dependency with your fix; it would be wonderful if we could have your
> improvement as well as the lock-sequence adjustments that might fix
> the quota problems.
>

Yes, I understand. Please provide the references if possible so I get
the bigger picture.


--
Goldwyn
jeff.liu July 2, 2013, 7:11 a.m. UTC | #6
On 07/02/2013 12:44 AM, Goldwyn Rodrigues wrote:

> On Sun, Jun 30, 2013 at 1:44 AM, Jeff Liu <jeff.liu@oracle.com> wrote:
>> On 06/29/2013 07:12 AM, Andrew Morton wrote:
>>
>>> On Fri, 14 Jun 2013 21:05:10 -0500 Goldwyn Rodrigues <rgoldwyn@gmail.com> wrote:
>>>
>>>> This patch improves unlink performance. Here is the scenario:
>>>>
>>>> On node A, create multiple directories, say d1-d8, each containing 3
>>>> files: f1, f2 and f3.
>>>> On node B, delete all directories using rm -Rf d*
>>>>
>>>> The FS first unlinks f1, f2 and f3. However, when it performs
>>>> ocfs2_evict_inode() -> ocfs2_delete_inode() ->
>>>> ocfs2_query_inode_wipe() -> ocfs2_try_open_lock() on d1, it fails
>>>> with -EAGAIN. The open lock fails because on the remote node
>>>> a PR->EX convert takes longer than a simple EX grant.
>>>>
>>>> This starts a checkpoint because the OCFS2_INODE_DELETED flag is not
>>>> set on the directory inode. The checkpoint then interferes with the
>>>> journaling of the inodes deleted in the following unlinks, in our case
>>>> directories d2-d8 and the files contained in them.
>>>>
>>>> With this patch, we wait on a directory's EX lock only if we already
>>>> hold the open_lock in PR mode. This way we avoid ABBA locking.
>>>>
>>>> By waiting for the open_lock on the directory, I am seeing an unlink
>>>> performance improvement of 50-60% for rm -Rf in the usual case.
>>>>
>>>> Also, ocfs2_open_lock and ocfs2_try_open_lock have been folded into one.
>>>>
>>>> Let me know if you would like to see the test case.
>>>
>>> We need some more review and test of this patch, please.
>>>
>>> The patch doesn't apply to current kernels - please redo, retest and
>>> resend it.
> 
> Yes, I sent it against an older kernel. I will update it once I
> incorporate the other review comments. Thanks for this comment.
> 
>>>
>>> In particular, the kernel you're patching doesn't have the
>>> ocfs2_is_hard_readonly() tests in ocfs2_open_lock() and
>>> ocfs2_try_open_lock().
>>>
>>> And looking at those tests, I wonder about ocfs2_open_lock().
>>>
>>> : int ocfs2_open_lock(struct inode *inode)
>>> : {
>>> :     int status = 0;
>>> :     struct ocfs2_lock_res *lockres;
>>> :     struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
>>> :
>>> :     BUG_ON(!inode);
>>> :
>>> :     mlog(0, "inode %llu take PRMODE open lock\n",
>>> :          (unsigned long long)OCFS2_I(inode)->ip_blkno);
>>> :
>>> :     if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
>>> :             goto out;
>>> :
>>> :     lockres = &OCFS2_I(inode)->ip_open_lockres;
>>> :
>>> :     status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
>>> :                                 DLM_LOCK_PR, 0, 0);
>>> :     if (status < 0)
>>> :             mlog_errno(status);
>>> :
>>> : out:
>>> :     return status;
>>> : }
>>>
>>> It will return zero if ocfs2_is_hard_readonly().  Is that correct?
>>
>>> Should it return -EROFS for writes?
> 
> Yes, it should.
> 
> 
>>
>> Hi Andrew,
>>
>> That is correct for the current design, because ocfs2_open_lock() is
>> implemented to grant the lock in protected read (PR) mode.
>>
>> The ocfs2_is_hard_readonly() check was introduced to ocfs2_open_lock()
>> by commit 03efed8a2a1b8e00164eb4720a82a7dd5e368a8e:
>>
>>     ocfs2: Bugfix for hard readonly mount
>>         ocfs2 cannot currently mount a device that is readonly at the media
>>     ("hard readonly").  Fix the broken places.
>>     see detail: http://oss.oracle.com/bugzilla/show_bug.cgi?id=1322
>>
>>> After your patch, this code does
>>> know whether the attempt was for a write.
>>
>>
>>
>> Yes, with this change it indeed needs a fix.
>>
>>>
>>> Please check all that.
>>
>>
>>
>>
>> Hi Goldwyn,
>>
>> Sorry for the very late response; I had mentioned that I would take a
>> look at it, but some internal tasks kept me fully occupied over the
>> past couple of weeks.
> 
> No problem.
> 
>>
>> As Andrew commented, this patch cannot be applied properly against the
>> upstream development tree; could you please rebase it?
> 
> Yes, will do. I have other comments from Mark as well.
> 
>>
>> If we merge ocfs2_open_lock()/ocfs2_try_open_lock() into one, it looks
>> like the semantics of ocfs2_open_lock() are broken once the 'write'
>> option is introduced. That is not a big issue, as we can tweak the
>> comments as well as the affected functions, but we need to run
>> ocfs2-test (at least covering all the code paths involved in this
>> change) to avoid any regression; it can be downloaded from:
>> https://oss.oracle.com/projects/ocfs2-test/source.html
>> https://oss.oracle.com/osswiki/OCFS2/Ocfs2TestList.html
>>
>> Another thing comes to mind: this patch affects ocfs2_evict_inode()
>> via ocfs2_try_open_lock(), i.e., the following call chain:
>>
>> ocfs2_evict_inode()
>> |
>> |-------ocfs2_delete_inode()
>>         |
>>         |-------ocfs2_query_inode_wipe()
>>         |       |
>>         |       |-------ocfs2_try_open_lock()
>>         |       |
>>         |-------ocfs2_clear_inode()
>>                 |
>>                 |---------ocfs2_open_unlock()
>>
>> OCFS2 has a deadlock issue when creating/removing tons of files in
>> parallel with disk quotas enabled; please refer to:
>> https://oss.oracle.com/bugzilla/show_bug.cgi?id=1339
>>
>> As per the old discussions, Jan Kara pointed out that there is a race
>> in ocfs2_evict_inode() if you consider the following lock sequence:
>>
>> ocfs2_evict_inode()
>> |
>> |-------ocfs2_inode_lock()
>>         |
>>         |-------ocfs2_try_open_lock() [ try to get open lock in EX mode ]
>>                 |
>>                 |-------ocfs2_inode_unlock()
>>                         |
>>                         |-------ocfs2_open_unlock() [ drops shared open lock that is hold ]
>>
>> Quoting Jan:
>> """
>> Now if two nodes happen to execute ocfs2_evict_inode() in parallel and
>> ocfs2_try_open_lock() happens on both nodes before ocfs2_open_unlock() is
>> called on any of them, ocfs2_try_open_lock() fails for both nodes...
>>
>> I would think that the code should be reorganized so that shared open
>> lock is dropped before we drop inode lock. Then the race could not happen.
>> But I'm not sure if something else would not break.
>> """
> 
> I could not find a reference to this in the bug. Was it discussed on
> the ML? Do you have a link so I can look at the history?
> 
>>
>> This deserves a try IMHO, especially since there is a kind of
>> dependency with your fix; it would be wonderful if we could have your
>> improvement as well as the lock-sequence adjustments that might fix
>> the quota problems.
>>
> 
> Yes, I understand. Please provide the references if possible so I get
> the bigger picture.

https://oss.oracle.com/pipermail/ocfs2-devel/2012-August/008709.html

Thanks,
-Jeff

Patch

Index: linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.c
===================================================================
--- linux-3.0-SLE11-SP3.orig/fs/ocfs2/dlmglue.c	2013-06-14 06:47:29.322506695 -0500
+++ linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.c	2013-06-14 09:19:48.651924037 -0500
@@ -1681,9 +1681,9 @@  void ocfs2_rw_unlock(struct inode *inode
 /*
  * ocfs2_open_lock always get PR mode lock.
  */
-int ocfs2_open_lock(struct inode *inode)
+int ocfs2_open_lock(struct inode *inode, int write, int wait)
 {
-	int status = 0;
+	int status = 0, level, flags;
 	struct ocfs2_lock_res *lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
@@ -1696,43 +1696,25 @@  int ocfs2_open_lock(struct inode *inode)
 		goto out;
 
 	lockres = &OCFS2_I(inode)->ip_open_lockres;
-
-	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-				    DLM_LOCK_PR, 0, 0);
-	if (status < 0)
-		mlog_errno(status);
-
-out:
-	return status;
-}
-
-int ocfs2_try_open_lock(struct inode *inode, int write)
-{
-	int status = 0, level;
-	struct ocfs2_lock_res *lockres;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	BUG_ON(!inode);
-
-	mlog(0, "inode %llu try to take %s open lock\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     write ? "EXMODE" : "PRMODE");
-
-	if (ocfs2_mount_local(osb))
-		goto out;
-
-	lockres = &OCFS2_I(inode)->ip_open_lockres;
-
 	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+	if (wait) {
+		flags = 0;
+		/* If we don't already have the lock in PR mode,
+		 * don't wait.
+		 *
+		 * This should avoid ABBA locking.
+		 */
+		if ((lockres->l_level != DLM_LOCK_PR) && write)
+			flags = DLM_LKF_NOQUEUE;
+
+	} else
+		flags = DLM_LKF_NOQUEUE;
 
-	/*
-	 * The file system may already holding a PRMODE/EXMODE open lock.
-	 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
-	 * other nodes and the -EAGAIN will indicate to the caller that
-	 * this inode is still in use.
-	 */
 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-				    level, DLM_LKF_NOQUEUE, 0);
+				    level, flags, 0);
+
+	if ((status < 0) && (flags && (status != -EAGAIN)))
+		mlog_errno(status);
 
 out:
 	return status;
Index: linux-3.0-SLE11-SP3/fs/ocfs2/inode.c
===================================================================
--- linux-3.0-SLE11-SP3.orig/fs/ocfs2/inode.c	2013-06-13 22:54:32.527606012 -0500
+++ linux-3.0-SLE11-SP3/fs/ocfs2/inode.c	2013-06-14 07:33:53.960196648 -0500
@@ -455,7 +455,7 @@  static int ocfs2_read_locked_inode(struc
 				  0, inode);
 
 	if (can_lock) {
-		status = ocfs2_open_lock(inode);
+		status = ocfs2_open_lock(inode, 0, 1);
 		if (status) {
 			make_bad_inode(inode);
 			mlog_errno(status);
@@ -470,7 +470,7 @@  static int ocfs2_read_locked_inode(struc
 	}
 
 	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
-		status = ocfs2_try_open_lock(inode, 0);
+		status = ocfs2_open_lock(inode, 0, 0);
 		if (status) {
 			make_bad_inode(inode);
 			return status;
@@ -923,7 +923,8 @@  static int ocfs2_query_inode_wipe(struct
 	 * Though we call this with the meta data lock held, the
 	 * trylock keeps us from ABBA deadlock.
 	 */
-	status = ocfs2_try_open_lock(inode, 1);
+	status = ocfs2_open_lock(inode, 1, S_ISDIR(inode->i_mode));
+
 	if (status == -EAGAIN) {
 		status = 0;
 		reason = 3;
Index: linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.h
===================================================================
--- linux-3.0-SLE11-SP3.orig/fs/ocfs2/dlmglue.h	2013-06-10 09:45:20.787386504 -0500
+++ linux-3.0-SLE11-SP3/fs/ocfs2/dlmglue.h	2013-06-14 07:38:49.861576515 -0500
@@ -110,8 +110,7 @@  int ocfs2_create_new_inode_locks(struct
 int ocfs2_drop_inode_locks(struct inode *inode);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
-int ocfs2_open_lock(struct inode *inode);
-int ocfs2_try_open_lock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode, int write, int wait);
 void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_inode_lock_atime(struct inode *inode,
 			  struct vfsmount *vfsmnt,
Index: linux-3.0-SLE11-SP3/fs/ocfs2/namei.c
===================================================================
--- linux-3.0-SLE11-SP3.orig/fs/ocfs2/namei.c	2013-06-13 22:54:32.527606012 -0500
+++ linux-3.0-SLE11-SP3/fs/ocfs2/namei.c	2013-06-14 07:40:06.623785914 -0500
@@ -2302,7 +2302,7 @@  int ocfs2_create_inode_in_orphan(struct
 	}
 
 	/* get open lock so that only nodes can't remove it from orphan dir. */
-	status = ocfs2_open_lock(inode);
+	status = ocfs2_open_lock(inode, 0, 1);
 	if (status < 0)
 		mlog_errno(status);