[v3,2/3] mdsmap: fix mdsmap cluster available check based on laggy number
diff mbox series

Message ID 20191126122422.12396-3-xiubli@redhat.com
State New
Headers show
Series
  • mdsmap: fix mds choosing
Related show

Commit Message

Xiubo Li Nov. 26, 2019, 12:24 p.m. UTC
From: Xiubo Li <xiubli@redhat.com>

In case the max_mds > 1 in MDS cluster and there is no any standby
MDS and all the max_mds MDSs are in up:active state, if one of the
up:active MDSs is dead, the m->m_num_laggy in kclient will be 1.
Then the mount will fail without considering other healthy MDSs.

There manybe some MDSs still "in" the cluster but not in up:active
state, we will ignore them. Only when all the up:active MDSs in
the cluster are laggy will treat the cluster as not be available.

In case decreasing the max_mds, the cluster will not stop the extra
up:active MDSs immediately and there will be a latency. During it
the up:active MDS number will be larger than the max_mds, so later
the m_info memories will 100% be reallocated.

Here will pick out the up:active MDSs as the m_num_mds and allocate
the needed memories once.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mdsmap.c            | 38 +++++++++++++++++--------------------
 include/linux/ceph/mdsmap.h |  5 +++--
 2 files changed, 20 insertions(+), 23 deletions(-)

Patch
diff mbox series

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 471bac335fae..3418cf2c6a12 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -138,14 +138,21 @@  struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 	m->m_session_autoclose = ceph_decode_32(p);
 	m->m_max_file_size = ceph_decode_64(p);
 	m->m_max_mds = ceph_decode_32(p);
-	m->m_num_mds = m->m_max_mds;
+
+	/*
+	 * pick out the active nodes as the m_num_mds, the m_num_mds
+	 * maybe larger than m_max_mds when decreasing the max_mds in
+	 * cluster side, in other case it should less than or equal
+	 * to m_max_mds.
+	 */
+	m->m_num_mds = n = ceph_decode_32(p);
+	m->m_num_active_mds = m->m_num_mds;
 
 	m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
 	if (!m->m_info)
 		goto nomem;
 
 	/* pick out active nodes from mds_info (state > 0) */
-	n = ceph_decode_32(p);
 	for (i = 0; i < n; i++) {
 		u64 global_id;
 		u32 namelen;
@@ -215,18 +222,15 @@  struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		     ceph_mds_state_name(state),
 		     laggy ? "(laggy)" : "");
 
-		if (mds < 0 || state <= 0)
+		if (mds < 0 || mds >= m->m_num_mds) {
+			pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
 			continue;
+		}
 
-		if (mds >= m->m_num_mds) {
-			int new_num = max(mds + 1, m->m_num_mds * 2);
-			void *new_m_info = krealloc(m->m_info,
-						new_num * sizeof(*m->m_info),
-						GFP_NOFS | __GFP_ZERO);
-			if (!new_m_info)
-				goto nomem;
-			m->m_info = new_m_info;
-			m->m_num_mds = new_num;
+		if (state <= 0) {
+			pr_warn("mdsmap_decode got incorrect state(%s)\n",
+				ceph_mds_state_name(state));
+			continue;
 		}
 
 		info = &m->m_info[mds];
@@ -247,14 +251,6 @@  struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 			info->export_targets = NULL;
 		}
 	}
-	if (m->m_num_mds > m->m_max_mds) {
-		/* find max up mds */
-		for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
-			if (i == 0 || m->m_info[i-1].state > 0)
-				break;
-		}
-		m->m_num_mds = i;
-	}
 
 	/* pg_pools */
 	ceph_decode_32_safe(p, end, n, bad);
@@ -396,7 +392,7 @@  bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
 		return false;
 	if (m->m_damaged)
 		return false;
-	if (m->m_num_laggy > 0)
+	if (m->m_num_laggy == m->m_num_active_mds)
 		return false;
 	for (i = 0; i < m->m_num_mds; i++) {
 		if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 0067d767c9ae..3a66f4f926ce 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -25,8 +25,9 @@  struct ceph_mdsmap {
 	u32 m_session_timeout;          /* seconds */
 	u32 m_session_autoclose;        /* seconds */
 	u64 m_max_file_size;
-	u32 m_max_mds;                  /* size of m_addr, m_state arrays */
-	int m_num_mds;
+	u32 m_max_mds;			/* expected up:active mds number */
+	int m_num_active_mds;		/* actual up:active mds number */
+	int m_num_mds;                  /* size of m_info array */
 	struct ceph_mds_info *m_info;
 
 	/* which object pools file data can be stored in */