diff mbox

[13/39] mds: don't send resolve message between active MDS

Message ID 1363531902-24909-14-git-send-email-zheng.z.yan@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yan, Zheng March 17, 2013, 2:51 p.m. UTC
From: "Yan, Zheng" <zheng.z.yan@intel.com>

When MDS cluster is resolving, current behavior is sending subtree resolve
message to all other MDS and waiting for all other MDS' resolve message.
The problem is that active MDS can have diffent subtree map due to rename.
Besides gathering active MDS's resolve messages are also racy. The only
function for these messages is disambiguate other MDS' import. We can
replace it by import finish notification.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc  | 12 +++++++++---
 src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
 src/mds/Migrator.h  |  3 ++-
 3 files changed, 34 insertions(+), 6 deletions(-)

Comments

Gregory Farnum March 20, 2013, 9:56 p.m. UTC | #1
On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> When MDS cluster is resolving, current behavior is sending subtree resolve
> message to all other MDS and waiting for all other MDS' resolve message.
> The problem is that active MDS can have diffent subtree map due to rename.
> Besides gathering active MDS's resolve messages are also racy. The only
> function for these messages is disambiguate other MDS' import. We can
> replace it by import finish notification.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc  | 12 +++++++++---
>  src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
>  src/mds/Migrator.h  |  3 ++-
>  3 files changed, 34 insertions(+), 6 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index c455a20..73c1d59 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -2517,7 +2517,8 @@ void MDCache::send_subtree_resolves()
>         ++p) {
>      if (*p == mds->whoami)
>        continue;
> -    resolves[*p] = new MMDSResolve;
> +    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
> +      resolves[*p] = new MMDSResolve;
>    }
>
>    // known
> @@ -2837,7 +2838,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>           migrator->import_reverse(dir);
>         } else {
>           dout(7) << "ambiguous import succeeded on " << *dir << dendl;
> -         migrator->import_finish(dir);
> +         migrator->import_finish(dir, true);
>         }
>         my_ambiguous_imports.erase(p);  // no longer ambiguous.
>        }
> @@ -3432,7 +3433,12 @@ void MDCache::rejoin_send_rejoins()
>         ++p) {
>      CDir *dir = p->first;
>      assert(dir->is_subtree_root());
> -    assert(!dir->is_ambiguous_dir_auth());
> +    if (dir->is_ambiguous_dir_auth()) {
> +      // exporter is recovering, importer is survivor.

The importer has to be the MDS this code is running on, right?

> +      assert(rejoins.count(dir->authority().first));
> +      assert(!rejoins.count(dir->authority().second));
> +      continue;
> +    }
>
>      // my subtree?
>      if (dir->is_auth())
> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
> index 5e53803..833df12 100644
> --- a/src/mds/Migrator.cc
> +++ b/src/mds/Migrator.cc
> @@ -2088,6 +2088,23 @@ void Migrator::import_reverse(CDir *dir)
>    }
>  }
>
> +void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
> +{
> +  dout(7) << "import_notify_finish " << *dir << dendl;
> +
> +  for (set<int>::iterator p = import_bystanders[dir].begin();
> +       p != import_bystanders[dir].end();
> +       ++p) {
> +    MExportDirNotify *notify =
> +      new MExportDirNotify(dir->dirfrag(), false,
> +                          pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
> +                          pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));

I don't think this is quite right — we're notifying them that we've
just finished importing data from somebody, right? And so we know that
we're the auth node...

> +    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
> +      notify->get_bounds().push_back((*i)->dirfrag());
> +    mds->send_message_mds(notify, *p);
> +  }
> +}
> +
>  void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
>  {
>    dout(7) << "import_notify_abort " << *dir << dendl;
> @@ -2183,11 +2200,11 @@ void Migrator::handle_export_finish(MExportDirFinish *m)
>    CDir *dir = cache->get_dirfrag(m->get_dirfrag());
>    assert(dir);
>    dout(7) << "handle_export_finish on " << *dir << dendl;
> -  import_finish(dir);
> +  import_finish(dir, false);
>    m->put();
>  }
>
> -void Migrator::import_finish(CDir *dir)
> +void Migrator::import_finish(CDir *dir, bool notify)
>  {
>    dout(7) << "import_finish on " << *dir << dendl;
>
> @@ -2205,6 +2222,10 @@ void Migrator::import_finish(CDir *dir)
>    // remove pins
>    set<CDir*> bounds;
>    cache->get_subtree_bounds(dir, bounds);
> +
> +  if (notify)
> +    import_notify_finish(dir, bounds);
> +
>    import_remove_pins(dir, bounds);
>
>    map<CInode*, map<client_t,Capability::Export> > cap_imports;
> diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
> index 7988f32..2889a74 100644
> --- a/src/mds/Migrator.h
> +++ b/src/mds/Migrator.h
> @@ -273,12 +273,13 @@ protected:
>    void import_reverse_unfreeze(CDir *dir);
>    void import_reverse_final(CDir *dir);
>    void import_notify_abort(CDir *dir, set<CDir*>& bounds);
> +  void import_notify_finish(CDir *dir, set<CDir*>& bounds);
>    void import_logged_start(dirfrag_t df, CDir *dir, int from,
>                            map<client_t,entity_inst_t> &imported_client_map,
>                            map<client_t,uint64_t>& sseqmap);
>    void handle_export_finish(MExportDirFinish *m);
>  public:
> -  void import_finish(CDir *dir);
> +  void import_finish(CDir *dir, bool notify);
>  protected:
>
>    void handle_export_caps(MExportCaps *m);
> --
> 1.7.11.7
>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yan, Zheng March 21, 2013, 2:55 a.m. UTC | #2
On 03/21/2013 05:56 AM, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>
>> When MDS cluster is resolving, current behavior is sending subtree resolve
>> message to all other MDS and waiting for all other MDS' resolve message.
>> The problem is that active MDS can have diffent subtree map due to rename.
>> Besides gathering active MDS's resolve messages are also racy. The only
>> function for these messages is disambiguate other MDS' import. We can
>> replace it by import finish notification.
>>
>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>> ---
>>  src/mds/MDCache.cc  | 12 +++++++++---
>>  src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
>>  src/mds/Migrator.h  |  3 ++-
>>  3 files changed, 34 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>> index c455a20..73c1d59 100644
>> --- a/src/mds/MDCache.cc
>> +++ b/src/mds/MDCache.cc
>> @@ -2517,7 +2517,8 @@ void MDCache::send_subtree_resolves()
>>         ++p) {
>>      if (*p == mds->whoami)
>>        continue;
>> -    resolves[*p] = new MMDSResolve;
>> +    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
>> +      resolves[*p] = new MMDSResolve;
>>    }
>>
>>    // known
>> @@ -2837,7 +2838,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>>           migrator->import_reverse(dir);
>>         } else {
>>           dout(7) << "ambiguous import succeeded on " << *dir << dendl;
>> -         migrator->import_finish(dir);
>> +         migrator->import_finish(dir, true);
>>         }
>>         my_ambiguous_imports.erase(p);  // no longer ambiguous.
>>        }
>> @@ -3432,7 +3433,12 @@ void MDCache::rejoin_send_rejoins()
>>         ++p) {
>>      CDir *dir = p->first;
>>      assert(dir->is_subtree_root());
>> -    assert(!dir->is_ambiguous_dir_auth());
>> +    if (dir->is_ambiguous_dir_auth()) {
>> +      // exporter is recovering, importer is survivor.
> 
> The importer has to be the MDS this code is running on, right?

This code is for bystanders. The exporter is recovering, and its resolve message didn't claim
the subtree. So the export must succeed.

> 
>> +      assert(rejoins.count(dir->authority().first));
>> +      assert(!rejoins.count(dir->authority().second));
>> +      continue;
>> +    }
>>
>>      // my subtree?
>>      if (dir->is_auth())
>> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
>> index 5e53803..833df12 100644
>> --- a/src/mds/Migrator.cc
>> +++ b/src/mds/Migrator.cc
>> @@ -2088,6 +2088,23 @@ void Migrator::import_reverse(CDir *dir)
>>    }
>>  }
>>
>> +void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
>> +{
>> +  dout(7) << "import_notify_finish " << *dir << dendl;
>> +
>> +  for (set<int>::iterator p = import_bystanders[dir].begin();
>> +       p != import_bystanders[dir].end();
>> +       ++p) {
>> +    MExportDirNotify *notify =
>> +      new MExportDirNotify(dir->dirfrag(), false,
>> +                          pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
>> +                          pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
> 
> I don't think this is quite right — we're notifying them that we've
> just finished importing data from somebody, right? And so we know that
> we're the auth node...

Yes. In normal case, exporter notifies the bystanders. But if exporter crashes, the importer notifies
the bystanders after it confirms ambiguous import succeeds.

Thanks
Yan, Zheng

> 
>> +    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
>> +      notify->get_bounds().push_back((*i)->dirfrag());
>> +    mds->send_message_mds(notify, *p);
>> +  }
>> +}
>> +
>>  void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
>>  {
>>    dout(7) << "import_notify_abort " << *dir << dendl;
>> @@ -2183,11 +2200,11 @@ void Migrator::handle_export_finish(MExportDirFinish *m)
>>    CDir *dir = cache->get_dirfrag(m->get_dirfrag());
>>    assert(dir);
>>    dout(7) << "handle_export_finish on " << *dir << dendl;
>> -  import_finish(dir);
>> +  import_finish(dir, false);
>>    m->put();
>>  }
>>
>> -void Migrator::import_finish(CDir *dir)
>> +void Migrator::import_finish(CDir *dir, bool notify)
>>  {
>>    dout(7) << "import_finish on " << *dir << dendl;
>>
>> @@ -2205,6 +2222,10 @@ void Migrator::import_finish(CDir *dir)
>>    // remove pins
>>    set<CDir*> bounds;
>>    cache->get_subtree_bounds(dir, bounds);
>> +
>> +  if (notify)
>> +    import_notify_finish(dir, bounds);
>> +
>>    import_remove_pins(dir, bounds);
>>
>>    map<CInode*, map<client_t,Capability::Export> > cap_imports;
>> diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
>> index 7988f32..2889a74 100644
>> --- a/src/mds/Migrator.h
>> +++ b/src/mds/Migrator.h
>> @@ -273,12 +273,13 @@ protected:
>>    void import_reverse_unfreeze(CDir *dir);
>>    void import_reverse_final(CDir *dir);
>>    void import_notify_abort(CDir *dir, set<CDir*>& bounds);
>> +  void import_notify_finish(CDir *dir, set<CDir*>& bounds);
>>    void import_logged_start(dirfrag_t df, CDir *dir, int from,
>>                            map<client_t,entity_inst_t> &imported_client_map,
>>                            map<client_t,uint64_t>& sseqmap);
>>    void handle_export_finish(MExportDirFinish *m);
>>  public:
>> -  void import_finish(CDir *dir);
>> +  void import_finish(CDir *dir, bool notify);
>>  protected:
>>
>>    void handle_export_caps(MExportCaps *m);
>> --
>> 1.7.11.7
>>

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gregory Farnum March 21, 2013, 9:55 p.m. UTC | #3
On Wed, Mar 20, 2013 at 7:55 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> On 03/21/2013 05:56 AM, Gregory Farnum wrote:
>> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
>>> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>>>
>>> When MDS cluster is resolving, current behavior is sending subtree resolve
>>> message to all other MDS and waiting for all other MDS' resolve message.
>>> The problem is that active MDS can have diffent subtree map due to rename.
>>> Besides gathering active MDS's resolve messages are also racy. The only
>>> function for these messages is disambiguate other MDS' import. We can
>>> replace it by import finish notification.
>>>
>>> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
>>> ---
>>>  src/mds/MDCache.cc  | 12 +++++++++---
>>>  src/mds/Migrator.cc | 25 +++++++++++++++++++++++--
>>>  src/mds/Migrator.h  |  3 ++-
>>>  3 files changed, 34 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
>>> index c455a20..73c1d59 100644
>>> --- a/src/mds/MDCache.cc
>>> +++ b/src/mds/MDCache.cc
>>> @@ -2517,7 +2517,8 @@ void MDCache::send_subtree_resolves()
>>>         ++p) {
>>>      if (*p == mds->whoami)
>>>        continue;
>>> -    resolves[*p] = new MMDSResolve;
>>> +    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
>>> +      resolves[*p] = new MMDSResolve;
>>>    }
>>>
>>>    // known
>>> @@ -2837,7 +2838,7 @@ void MDCache::handle_resolve(MMDSResolve *m)
>>>           migrator->import_reverse(dir);
>>>         } else {
>>>           dout(7) << "ambiguous import succeeded on " << *dir << dendl;
>>> -         migrator->import_finish(dir);
>>> +         migrator->import_finish(dir, true);
>>>         }
>>>         my_ambiguous_imports.erase(p);  // no longer ambiguous.
>>>        }
>>> @@ -3432,7 +3433,12 @@ void MDCache::rejoin_send_rejoins()
>>>         ++p) {
>>>      CDir *dir = p->first;
>>>      assert(dir->is_subtree_root());
>>> -    assert(!dir->is_ambiguous_dir_auth());
>>> +    if (dir->is_ambiguous_dir_auth()) {
>>> +      // exporter is recovering, importer is survivor.
>>
>> The importer has to be the MDS this code is running on, right?
>
> This code is for bystanders. The exporter is recovering, and its resolve message didn't claim
> the subtree. So the export must succeed.

Ah, yep. That's what I get for eyeing just the diff.

>
>>
>>> +      assert(rejoins.count(dir->authority().first));
>>> +      assert(!rejoins.count(dir->authority().second));
>>> +      continue;
>>> +    }
>>>
>>>      // my subtree?
>>>      if (dir->is_auth())
>>> diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
>>> index 5e53803..833df12 100644
>>> --- a/src/mds/Migrator.cc
>>> +++ b/src/mds/Migrator.cc
>>> @@ -2088,6 +2088,23 @@ void Migrator::import_reverse(CDir *dir)
>>>    }
>>>  }
>>>
>>> +void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
>>> +{
>>> +  dout(7) << "import_notify_finish " << *dir << dendl;
>>> +
>>> +  for (set<int>::iterator p = import_bystanders[dir].begin();
>>> +       p != import_bystanders[dir].end();
>>> +       ++p) {
>>> +    MExportDirNotify *notify =
>>> +      new MExportDirNotify(dir->dirfrag(), false,
>>> +                          pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
>>> +                          pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
>>
>> I don't think this is quite right — we're notifying them that we've
>> just finished importing data from somebody, right? And so we know that
>> we're the auth node...
>
> Yes. In normal case, exporter notifies the bystanders. But if exporter crashes, the importer notifies
> the bystanders after it confirms ambiguous import succeeds.

Never mind — I had the semantic meaning of these pairs wrong.

Reviewed-by: Greg Farnum <greg@inktank.com>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index c455a20..73c1d59 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2517,7 +2517,8 @@  void MDCache::send_subtree_resolves()
        ++p) {
     if (*p == mds->whoami)
       continue;
-    resolves[*p] = new MMDSResolve;
+    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
+      resolves[*p] = new MMDSResolve;
   }
 
   // known
@@ -2837,7 +2838,7 @@  void MDCache::handle_resolve(MMDSResolve *m)
 	  migrator->import_reverse(dir);
 	} else {
 	  dout(7) << "ambiguous import succeeded on " << *dir << dendl;
-	  migrator->import_finish(dir);
+	  migrator->import_finish(dir, true);
 	}
 	my_ambiguous_imports.erase(p);  // no longer ambiguous.
       }
@@ -3432,7 +3433,12 @@  void MDCache::rejoin_send_rejoins()
        ++p) {
     CDir *dir = p->first;
     assert(dir->is_subtree_root());
-    assert(!dir->is_ambiguous_dir_auth());
+    if (dir->is_ambiguous_dir_auth()) {
+      // exporter is recovering, importer is survivor.
+      assert(rejoins.count(dir->authority().first));
+      assert(!rejoins.count(dir->authority().second));
+      continue;
+    }
 
     // my subtree?
     if (dir->is_auth())
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 5e53803..833df12 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -2088,6 +2088,23 @@  void Migrator::import_reverse(CDir *dir)
   }
 }
 
+void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
+{
+  dout(7) << "import_notify_finish " << *dir << dendl;
+
+  for (set<int>::iterator p = import_bystanders[dir].begin();
+       p != import_bystanders[dir].end();
+       ++p) {
+    MExportDirNotify *notify =
+      new MExportDirNotify(dir->dirfrag(), false,
+			   pair<int,int>(import_peer[dir->dirfrag()], mds->get_nodeid()),
+			   pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
+    for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); i++)
+      notify->get_bounds().push_back((*i)->dirfrag());
+    mds->send_message_mds(notify, *p);
+  }
+}
+
 void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
 {
   dout(7) << "import_notify_abort " << *dir << dendl;
@@ -2183,11 +2200,11 @@  void Migrator::handle_export_finish(MExportDirFinish *m)
   CDir *dir = cache->get_dirfrag(m->get_dirfrag());
   assert(dir);
   dout(7) << "handle_export_finish on " << *dir << dendl;
-  import_finish(dir);
+  import_finish(dir, false);
   m->put();
 }
 
-void Migrator::import_finish(CDir *dir) 
+void Migrator::import_finish(CDir *dir, bool notify)
 {
   dout(7) << "import_finish on " << *dir << dendl;
 
@@ -2205,6 +2222,10 @@  void Migrator::import_finish(CDir *dir)
   // remove pins
   set<CDir*> bounds;
   cache->get_subtree_bounds(dir, bounds);
+
+  if (notify)
+    import_notify_finish(dir, bounds);
+
   import_remove_pins(dir, bounds);
 
   map<CInode*, map<client_t,Capability::Export> > cap_imports;
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index 7988f32..2889a74 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -273,12 +273,13 @@  protected:
   void import_reverse_unfreeze(CDir *dir);
   void import_reverse_final(CDir *dir);
   void import_notify_abort(CDir *dir, set<CDir*>& bounds);
+  void import_notify_finish(CDir *dir, set<CDir*>& bounds);
   void import_logged_start(dirfrag_t df, CDir *dir, int from,
 			   map<client_t,entity_inst_t> &imported_client_map,
 			   map<client_t,uint64_t>& sseqmap);
   void handle_export_finish(MExportDirFinish *m);
 public:
-  void import_finish(CDir *dir);
+  void import_finish(CDir *dir, bool notify);
 protected:
 
   void handle_export_caps(MExportCaps *m);