@@ -7666,9 +7666,14 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
else
start_boot();
}
- else if (do_restart)
+ else if (do_restart) {
+ vector<PGRef> pgs;
+ _get_pgs(&pgs);
+ for (auto pg : pgs) {
+ pg->force_restart_peering();
+ }
start_boot();
-
+ }
}
void OSD::check_osdmap_features()
@@ -335,6 +335,7 @@ PG::PG(OSDService *o, OSDMapRef curmap,
role(-1),
state(0),
send_notify(false),
+ restart_peering(false),
pg_whoami(osd->whoami, p.shard),
need_up_thru(false),
last_peering_reset(0),
@@ -5743,6 +5744,9 @@ bool PG::should_restart_peering(
OSDMapRef lastmap,
OSDMapRef osdmap)
{
+ if (restart_peering) {
+ return true;
+ }
if (PastIntervals::is_new_interval(
primary.osd,
newactingprimary,
@@ -5832,6 +5836,8 @@ void PG::start_peering_interval(
{
const OSDMapRef osdmap = get_osdmap();
+ restart_peering = false;
+
set_last_peering_reset();
vector<int> oldacting, oldup;
@@ -942,6 +942,12 @@ protected:
bool send_notify; ///< true if we are non-primary and should notify the primary
+ bool restart_peering;
+public:
+ void force_restart_peering() { ///< request a peering restart: should_restart_peering() returns true while this flag is set
+ restart_peering = true; // cleared again at the top of start_peering_interval()
+ }
+
protected:
eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
eversion_t last_complete_ondisk; // last_complete that has committed.
Even after an OSD is marked down, a replica OSD might stay in 'ReplicaActive' and discard peering requests from the primary OSD. This causes a PG to be stuck in 'unfound_recovery' forever. With this patch, every PG on the downed OSD is forced to transition to 'Reset' and restart peering. Signed-off-by: Kouya Shimura <kouya@jp.fujitsu.com> --- src/osd/OSD.cc | 9 +++++++-- src/osd/PG.cc | 6 ++++++ src/osd/PG.h | 6 ++++++ 3 files changed, 19 insertions(+), 2 deletions(-)