diff mbox

[12/12] mds: Avoid creating unnecessary snaprealm

Message ID 1349168152-13281-12-git-send-email-zheng.z.yan@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yan, Zheng Oct. 2, 2012, 8:55 a.m. UTC
From: "Yan, Zheng" <zheng.z.yan@intel.com>

When moving a directory between snaprealms, we can avoid creating a snaprealm
if the directory doesn't have its own snaprealm and the directory was created
after both realms' newest snapshot.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/Server.cc | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

Comments

Sage Weil Oct. 2, 2012, 6:31 p.m. UTC | #1
Hi Yan,

This whole series looks great!  Sticking it in wip-mds and running it 
through the fs qa suite before merging it.

How are you testing these?  If you haven't seen it yet, there is an 'mds
thrash exports' option that will make MDSs randomly migrate subtrees to each
other, which is great for shaking out bugs.  That and periodic daemon
restarts (one of the first things we need to do on the clustered mds front
is to get daemon restarting integrated into teuthology).

Thanks!
sage


On Tue, 2 Oct 2012, Yan, Zheng wrote:

> From: "Yan, Zheng" <zheng.z.yan@intel.com>
> 
> When moving directory between snaprealms, we can avoid creating snaprealm
> if the directory doesn't has its own snaprealm and directory was created
> after both realms' newest snapshot.
> 
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/Server.cc | 25 +++++++++++++++++--------
>  1 file changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/src/mds/Server.cc b/src/mds/Server.cc
> index e16800e..b706b5a 100644
> --- a/src/mds/Server.cc
> +++ b/src/mds/Server.cc
> @@ -4577,7 +4577,8 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
>      mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
>  
>      // project snaprealm, too
> -    in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
> +    if (in->snaprealm || follows + 1 > dn->first)
> +      in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
>  
>      le->metablob.add_primary_dentry(straydn, true, in);
>    } else {
> @@ -5247,11 +5248,16 @@ void Server::handle_client_rename(MDRequest *mdr)
>    }
>  
>    // moving between snaprealms?
> -  if (srcdnl->is_primary() && !srci->snaprealm &&
> -      srci->find_snaprealm() != destdn->get_dir()->inode->find_snaprealm()) {
> -    dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
> -    mds->mdcache->snaprealm_create(mdr, srci);
> -    return;
> +  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
> +    SnapRealm *srcrealm = srci->find_snaprealm();
> +    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
> +    if (srcrealm != destrealm &&
> +	(srcrealm->get_newest_seq() + 1 > srcdn->first ||
> +	 destrealm->get_newest_seq() + 1 > srcdn->first)) {
> +      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
> +      mds->mdcache->snaprealm_create(mdr, srci);
> +      return;
> +    }
>    }
>  
>    assert(g_conf->mds_kill_rename_at != 1);
> @@ -5650,6 +5656,7 @@ void Server::_rename_prepare(MDRequest *mdr,
>    if (destdn->is_auth())
>      mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
>  
> +  SnapRealm *src_realm = srci->find_snaprealm();
>    SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
>    snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
>  
> @@ -5659,7 +5666,8 @@ void Server::_rename_prepare(MDRequest *mdr,
>      if (destdnl->is_primary()) {
>        if (destdn->is_auth()) {
>  	// project snaprealm, too
> -	oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
> +	if (oldin->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first)
> +	  oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
>  	straydn->first = MAX(oldin->first, next_dest_snap);
>  	metablob->add_primary_dentry(straydn, true, oldin);
>        }
> @@ -5703,7 +5711,8 @@ void Server::_rename_prepare(MDRequest *mdr,
>      }
>    } else if (srcdnl->is_primary()) {
>      // project snap parent update?
> -    if (destdn->is_auth() && srci->snaprealm)
> +    if (destdn->is_auth() &&
> +        (srci->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first))
>        srci->project_past_snaprealm_parent(dest_realm);
>      
>      if (destdn->is_auth() && !destdnl->is_null())
> -- 
> 1.7.11.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yan, Zheng Oct. 2, 2012, 11:45 p.m. UTC | #2
On 10/03/2012 02:31 AM, Sage Weil wrote:
> Hi Yan,
> 
> This whole series looks great!  Sticking it in wip-mds and running it 
> through the fs qa suite before merging it.
> 
> How are you testing these?  If you haven't seen it yet, there is an 'mds 
> thrash exports' option that will make MDSs random migrate subtrees to each 
> other that is great for shaking out bugs.  That and periodic daemon 
> restarts (one of the first things we need to do on the clustered mds front 
> is to get daemon restarting integrated into teuthology).
> 

The patches are fixes for problems I encountered while playing with MDS shutdown.
I set up a 2-MDS cephfs and copied some data into it, deleted some directories
whose authority is MDS.1, then shut down MDS.1.

Most patches in this series are obvious. The two snaprealm-related patches are
workarounds for a bug: a replica inode's snaprealm->open is not true. The bug triggers
an assertion in CInode::pop_projected_snaprealm() if a snaprealm is involved in a
cross-authority rename.

Regards
Yan, Zheng
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sage Weil Oct. 3, 2012, 12:12 a.m. UTC | #3
On Wed, 3 Oct 2012, Yan, Zheng wrote:
> On 10/03/2012 02:31 AM, Sage Weil wrote:
> > Hi Yan,
> > 
> > This whole series looks great!  Sticking it in wip-mds and running it 
> > through the fs qa suite before merging it.
> > 
> > How are you testing these?  If you haven't seen it yet, there is an 'mds 
> > thrash exports' option that will make MDSs random migrate subtrees to each 
> > other that is great for shaking out bugs.  That and periodic daemon 
> > restarts (one of the first things we need to do on the clustered mds front 
> > is to get daemon restarting integrated into teuthology).
> > 
> 
> The patches are fixes for problems I encountered during playing MDS shutdown.
> I setup a 2 MDS cephfs and copied some data into it, deleted some directories
> whose authority is MDS.1, then shutdown MDS.1.
> 
> Most patches in this series are obvious. The two snaprealm related patches are
> workaround for a bug: replica inode's snaprealm->open is not true. The bug triggers
> assertion in CInode::pop_projected_snaprealm() if snaprealm is involved in cross
> authority rename.

Do you mind opening a ticket at tracker.newdream.net so we don't lose 
track of it?

Fsstress on a single mds turned up this:

2012-10-02T17:09:09.359 INFO:teuthology.task.ceph.mds.a.err:*** Caught signal (Segmentation fault) **
2012-10-02T17:09:09.359 INFO:teuthology.task.ceph.mds.a.err: in thread 7f8873a41700
2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: ceph version 0.52-949-ge8df6a7 (commit:e8df6a74cae66accb6682129c9c5ad33797f458c)
2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 1: /tmp/cephtest/binary/usr/local/bin/ceph-mds() [0x812b21]
2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 2: (()+0xfcb0) [0x7f88787b3cb0]
2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 3: (Server::handle_client_rename(MDRequest*)+0xa28) [0x53dc88]
2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 4: (Server::dispatch_client_request(MDRequest*)+0x4fb) [0x54123b]
2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 5: (Server::handle_client_request(MClientRequest*)+0x51d) [0x544a6d]
2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 6: (Server::dispatch(Message*)+0x2d3) [0x5452e3]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 7: (MDS::handle_deferrable_message(Message*)+0x91f) [0x4bc32f]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 8: (MDS::_dispatch(Message*)+0x9b6) [0x4cf8b6]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 9: (MDS::ms_dispatch(Message*)+0x21b) [0x4d0c3b]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 10: (DispatchQueue::entry()+0x711) [0x7eb301]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 11: (DispatchQueue::DispatchThread::entry()+0xd) [0x7713dd]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 12: (()+0x7e9a) [0x7f88787abe9a]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 13: (clone()+0x6d) [0x7f8876d534bd]
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err:2012-10-02 17:09:09.349272 7f8873a41700 -1 *** Caught signal (Segmentation fault) **
2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: in thread 7f8873a41700

I don't have time right now to hunt this down, but you should be able to 
reproduce with qa/workunits/suites/fsstress.sh on top of ceph-fuse with 1 
mds.

Thanks!
sage

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yan, Zheng Oct. 3, 2012, 11:44 a.m. UTC | #4
On 10/03/2012 08:12 AM, Sage Weil wrote:
> On Wed, 3 Oct 2012, Yan, Zheng wrote:
>> On 10/03/2012 02:31 AM, Sage Weil wrote:
>>> Hi Yan,
>>>
>>> This whole series looks great!  Sticking it in wip-mds and running it 
>>> through the fs qa suite before merging it.
>>>
>>> How are you testing these?  If you haven't seen it yet, there is an 'mds 
>>> thrash exports' option that will make MDSs random migrate subtrees to each 
>>> other that is great for shaking out bugs.  That and periodic daemon 
>>> restarts (one of the first things we need to do on the clustered mds front 
>>> is to get daemon restarting integrated into teuthology).
>>>
>>
>> The patches are fixes for problems I encountered during playing MDS shutdown.
>> I setup a 2 MDS cephfs and copied some data into it, deleted some directories
>> whose authority is MDS.1, then shutdown MDS.1.
>>
>> Most patches in this series are obvious. The two snaprealm related patches are
>> workaround for a bug: replica inode's snaprealm->open is not true. The bug triggers
>> assertion in CInode::pop_projected_snaprealm() if snaprealm is involved in cross
>> authority rename.
> 
> Do you mind opening a ticket at tracker.newdream.net so we don't lose 
> track of it?

will do
> 
> Fsstress on a single mds turned up this:
> 
> 2012-10-02T17:09:09.359 INFO:teuthology.task.ceph.mds.a.err:*** Caught signal (Segmentation fault) **
> 2012-10-02T17:09:09.359 INFO:teuthology.task.ceph.mds.a.err: in thread 7f8873a41700
> 2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: ceph version 0.52-949-ge8df6a7 (commit:e8df6a74cae66accb6682129c9c5ad33797f458c)
> 2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 1: /tmp/cephtest/binary/usr/local/bin/ceph-mds() [0x812b21]
> 2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 2: (()+0xfcb0) [0x7f88787b3cb0]
> 2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 3: (Server::handle_client_rename(MDRequest*)+0xa28) [0x53dc88]
> 2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 4: (Server::dispatch_client_request(MDRequest*)+0x4fb) [0x54123b]
> 2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 5: (Server::handle_client_request(MClientRequest*)+0x51d) [0x544a6d]
> 2012-10-02T17:09:09.361 INFO:teuthology.task.ceph.mds.a.err: 6: (Server::dispatch(Message*)+0x2d3) [0x5452e3]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 7: (MDS::handle_deferrable_message(Message*)+0x91f) [0x4bc32f]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 8: (MDS::_dispatch(Message*)+0x9b6) [0x4cf8b6]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 9: (MDS::ms_dispatch(Message*)+0x21b) [0x4d0c3b]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 10: (DispatchQueue::entry()+0x711) [0x7eb301]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 11: (DispatchQueue::DispatchThread::entry()+0xd) [0x7713dd]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 12: (()+0x7e9a) [0x7f88787abe9a]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: 13: (clone()+0x6d) [0x7f8876d534bd]
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err:2012-10-02 17:09:09.349272 7f8873a41700 -1 *** Caught signal (Segmentation fault) **
> 2012-10-02T17:09:09.362 INFO:teuthology.task.ceph.mds.a.err: in thread 7f8873a41700
> 
> I don't have time right now to hunt this down, but you should be able to 
> reproduce with qa/workunits/suites/fsstress.sh on top of ceph-fuse with 1 
> mds.
> 

This is an old stray reintegration bug; I just sent a patch to fix it.

Regards
Yan, Zheng
 

> Thanks!
> sage
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index e16800e..b706b5a 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4577,7 +4577,8 @@  void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
     mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
 
     // project snaprealm, too
-    in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
+    if (in->snaprealm || follows + 1 > dn->first)
+      in->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
 
     le->metablob.add_primary_dentry(straydn, true, in);
   } else {
@@ -5247,11 +5248,16 @@  void Server::handle_client_rename(MDRequest *mdr)
   }
 
   // moving between snaprealms?
-  if (srcdnl->is_primary() && !srci->snaprealm &&
-      srci->find_snaprealm() != destdn->get_dir()->inode->find_snaprealm()) {
-    dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
-    mds->mdcache->snaprealm_create(mdr, srci);
-    return;
+  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
+    SnapRealm *srcrealm = srci->find_snaprealm();
+    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
+    if (srcrealm != destrealm &&
+	(srcrealm->get_newest_seq() + 1 > srcdn->first ||
+	 destrealm->get_newest_seq() + 1 > srcdn->first)) {
+      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
+      mds->mdcache->snaprealm_create(mdr, srci);
+      return;
+    }
   }
 
   assert(g_conf->mds_kill_rename_at != 1);
@@ -5650,6 +5656,7 @@  void Server::_rename_prepare(MDRequest *mdr,
   if (destdn->is_auth())
     mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
 
+  SnapRealm *src_realm = srci->find_snaprealm();
   SnapRealm *dest_realm = destdn->get_dir()->inode->find_snaprealm();
   snapid_t next_dest_snap = dest_realm->get_newest_seq() + 1;
 
@@ -5659,7 +5666,8 @@  void Server::_rename_prepare(MDRequest *mdr,
     if (destdnl->is_primary()) {
       if (destdn->is_auth()) {
 	// project snaprealm, too
-	oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
+	if (oldin->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first)
+	  oldin->project_past_snaprealm_parent(straydn->get_dir()->inode->find_snaprealm());
 	straydn->first = MAX(oldin->first, next_dest_snap);
 	metablob->add_primary_dentry(straydn, true, oldin);
       }
@@ -5703,7 +5711,8 @@  void Server::_rename_prepare(MDRequest *mdr,
     }
   } else if (srcdnl->is_primary()) {
     // project snap parent update?
-    if (destdn->is_auth() && srci->snaprealm)
+    if (destdn->is_auth() &&
+        (srci->snaprealm || src_realm->get_newest_seq() + 1 > srcdn->first))
       srci->project_past_snaprealm_parent(dest_realm);
     
     if (destdn->is_auth() && !destdnl->is_null())