diff mbox

[1/2] osd: force restart peering when osd is marked down

Message ID 1527500180-17479-2-git-send-email-kouya@jp.fujitsu.com (mailing list archive)
State New, archived
Headers show

Commit Message

Kouya Shimura May 28, 2018, 9:36 a.m. UTC
Even though an OSD is marked down, a replica OSD might stay in 'ReplicaActive'
and discard peering requests from the primary OSD.
This causes a PG to be stuck forever in 'unfound_recovery'.

With this patch, every PG on a downed OSD is forced to transition to 'Reset'
and restart peering.

Signed-off-by: Kouya Shimura <kouya@jp.fujitsu.com>
---
 src/osd/OSD.cc | 9 +++++++--
 src/osd/PG.cc  | 6 ++++++
 src/osd/PG.h   | 6 ++++++
 3 files changed, 19 insertions(+), 2 deletions(-)
diff mbox

Patch

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 26c28d6..8b87fa6 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -7666,9 +7666,14 @@  void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
     else
       start_boot();
   }
-  else if (do_restart)
+  else if (do_restart) {
+    vector<PGRef> pgs;
+    _get_pgs(&pgs);
+    for (auto pg : pgs) {
+      pg->force_restart_peering();
+    }
     start_boot();
-
+  }
 }
 
 void OSD::check_osdmap_features()
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 8088f99..13ed169 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -335,6 +335,7 @@  PG::PG(OSDService *o, OSDMapRef curmap,
   role(-1),
   state(0),
   send_notify(false),
+  restart_peering(false),
   pg_whoami(osd->whoami, p.shard),
   need_up_thru(false),
   last_peering_reset(0),
@@ -5743,6 +5744,9 @@  bool PG::should_restart_peering(
   OSDMapRef lastmap,
   OSDMapRef osdmap)
 {
+  if (restart_peering) {
+    return true;
+  }
   if (PastIntervals::is_new_interval(
 	primary.osd,
 	newactingprimary,
@@ -5832,6 +5836,8 @@  void PG::start_peering_interval(
 {
   const OSDMapRef osdmap = get_osdmap();
 
+  restart_peering = false;
+
   set_last_peering_reset();
 
   vector<int> oldacting, oldup;
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 7b4f26d..cad66ac 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -942,6 +942,12 @@  protected:
 
   bool send_notify;    ///< true if we are non-primary and should notify the primary
 
+  bool restart_peering;
+public:
+  void force_restart_peering() {
+    restart_peering = true;
+  }
+
 protected:
   eversion_t  last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
   eversion_t  last_complete_ondisk;  // last_complete that has committed.