From patchwork Mon Nov 25 11:08:25 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xiubo Li X-Patchwork-Id: 11260231 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 98C0C1393 for ; Mon, 25 Nov 2019 11:09:07 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 79E352075C for ; Mon, 25 Nov 2019 11:09:07 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="A9ThN6Nw" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727527AbfKYLJG (ORCPT ); Mon, 25 Nov 2019 06:09:06 -0500 Received: from us-smtp-delivery-1.mimecast.com ([207.211.31.120]:46880 "EHLO us-smtp-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1727278AbfKYLJG (ORCPT ); Mon, 25 Nov 2019 06:09:06 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1574680145; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=0434QUXjPvIaHT1EaIkqlZLBAS2xooOlYe242h7NpmM=; b=A9ThN6Nwg6fntVodY2S2iKGWPikI9z7KbwYjgVwizps/fZA2GT90KIjPIA4oS6UFEtX9wx PFpnrCBE0itFdBJrtKDOs+uY7uXGzkLLXre1JzA0SytBeNc1RBx0THIGjwlekh1MnjRuQE +mkgjxaGnAdqAYdm0/VoPn02BCgQ89Y= Received: from mimecast-mx01.redhat.com (mimecast-mx01.redhat.com [209.132.183.4]) (Using TLS) by relay.mimecast.com with ESMTP id us-mta-97-uWMptwYgMDCjVoPJbc_g4A-1; Mon, 25 Nov 2019 06:09:03 -0500 Received: from smtp.corp.redhat.com (int-mx04.intmail.prod.int.phx2.redhat.com [10.5.11.14]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mimecast-mx01.redhat.com (Postfix) with ESMTPS id 7D51F800686; Mon, 25 Nov 2019 11:09:02 +0000 (UTC) Received: from localhost.localdomain (ovpn-12-66.pek2.redhat.com [10.72.12.66]) by smtp.corp.redhat.com (Postfix) with ESMTP id 53BB75D9CA; Mon, 25 Nov 2019 11:08:57 +0000 (UTC) From: xiubli@redhat.com To: jlayton@kernel.org Cc: sage@redhat.com, idryomov@gmail.com, zyan@redhat.com, pdonnell@redhat.com, ceph-devel@vger.kernel.org, Xiubo Li Subject: [PATCH v2 1/3] mdsmap: add more debug info when decoding Date: Mon, 25 Nov 2019 06:08:25 -0500 Message-Id: <20191125110827.12827-2-xiubli@redhat.com> In-Reply-To: <20191125110827.12827-1-xiubli@redhat.com> References: <20191125110827.12827-1-xiubli@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.79 on 10.5.11.14 X-MC-Unique: uWMptwYgMDCjVoPJbc_g4A-1 X-Mimecast-Spam-Score: 0 Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org From: Xiubo Li Show the laggy state. Signed-off-by: Xiubo Li --- fs/ceph/mdsmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index aeec1d6e3769..471bac335fae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -158,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; + bool laggy; ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); @@ -190,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (err) goto corrupt; ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -207,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p = info_end; } - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr), - ceph_mds_state_name(state)); + ceph_mds_state_name(state), + laggy ? "(laggy)" : ""); if (mds < 0 || state <= 0) continue; @@ -230,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->global_id = global_id; info->state = state; info->addr = addr; - info->laggy = (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); + info->laggy = laggy; info->num_export_targets = num_export_targets; if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, @@ -355,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_damaged = false; } bad_ext: + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; From patchwork Mon Nov 25 11:08:26 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xiubo Li X-Patchwork-Id: 11260235 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id DB89615AC for ; Mon, 25 Nov 2019 11:09:25 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id BCAE22075C for ; Mon, 25 Nov 2019 11:09:25 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="UCMC37QJ" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727648AbfKYLJY (ORCPT ); Mon, 25 Nov 2019 06:09:24 -0500 Received: from us-smtp-delivery-1.mimecast.com ([205.139.110.120]:54713 "EHLO us-smtp-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1727495AbfKYLJY (ORCPT ); Mon, 25 Nov 2019 06:09:24 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1574680162; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=myFxpDrAzanEG01Ae41+b1PiW1Sqeoj4x/3weLu2EX8=; b=UCMC37QJy7Q1zXPLysxIKEz2upwY5y6Ga7A9DReRavKa5le04oAiDaqagaSdehLzVmtMll M0EVnMvveqFKH/4U1MeBxEfRT5Uwc34Y7pBnTi3LTG6/0KXa31/D+eiJycSQL/utMEDVOL iHy7slaXvsgS9sneIy5ZQWPNIPnw2Kc= Received: from mimecast-mx01.redhat.com (mimecast-mx01.redhat.com [209.132.183.4]) (Using TLS) by relay.mimecast.com with ESMTP id us-mta-150-UCeHMbNJOL-aRMxoHyd-nQ-1; Mon, 25 Nov 2019 06:09:18 -0500 Received: from smtp.corp.redhat.com (int-mx04.intmail.prod.int.phx2.redhat.com [10.5.11.14]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mimecast-mx01.redhat.com (Postfix) with ESMTPS id 67E46DB65; Mon, 25 Nov 2019 11:09:17 +0000 (UTC) Received: from localhost.localdomain (ovpn-12-66.pek2.redhat.com [10.72.12.66]) by smtp.corp.redhat.com (Postfix) with ESMTP id 11FC55D9CA; Mon, 25 Nov 2019 11:09:02 +0000 (UTC) From: xiubli@redhat.com To: jlayton@kernel.org Cc: sage@redhat.com, idryomov@gmail.com, zyan@redhat.com, pdonnell@redhat.com, ceph-devel@vger.kernel.org, Xiubo Li Subject: [PATCH v2 2/3] mdsmap: fix mdsmap cluster available check based on laggy number Date: Mon, 25 Nov 2019 06:08:26 -0500 Message-Id: <20191125110827.12827-3-xiubli@redhat.com> In-Reply-To: <20191125110827.12827-1-xiubli@redhat.com> References: <20191125110827.12827-1-xiubli@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.79 on 10.5.11.14 X-MC-Unique: UCeHMbNJOL-aRMxoHyd-nQ-1 X-Mimecast-Spam-Score: 0 Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org From: Xiubo Li In case the max_mds > 1 in MDS cluster and there is no any standby MDS and all the max_mds MDSs are in up:active state, if one of the up:active MDSs is dead, the m->m_num_laggy in kclient will be 1. Then the mount will fail without considering other healthy MDSs. There manybe some MDSs still "in" the cluster but not in up:active state, we will ignore them. Only when all the up:active MDSs in the cluster are laggy will treat the cluster as not be available. In case decreasing the max_mds, the cluster will not stop the extra up:active MDSs immediately and there will be a latency. During it the up:active MDS number will be larger than the max_mds, so later the m_info memories will 100% be reallocated. Here will pick out the up:active MDSs as the m_num_mds and allocate the needed memories once. Signed-off-by: Xiubo Li --- fs/ceph/mdsmap.c | 32 ++++++++++---------------------- include/linux/ceph/mdsmap.h | 5 +++-- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 471bac335fae..cc9ec959fe46 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -138,14 +138,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); - m->m_num_mds = m->m_max_mds; + + /* + * pick out the active nodes as the m_num_mds, the m_num_mds + * maybe larger than m_max_mds when decreasing the max_mds in + * cluster side, in other case it should less than or equal + * to m_max_mds. + */ + m->m_num_mds = n = ceph_decode_32(p); + m->m_num_active_mds = m->m_num_mds; m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); if (!m->m_info) goto nomem; /* pick out active nodes from mds_info (state > 0) */ - n = ceph_decode_32(p); for (i = 0; i < n; i++) { u64 global_id; u32 namelen; @@ -218,17 +225,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (mds < 0 || state <= 0) continue; - if (mds >= m->m_num_mds) { - int new_num = max(mds + 1, m->m_num_mds * 2); - void *new_m_info = krealloc(m->m_info, - new_num * sizeof(*m->m_info), - GFP_NOFS | __GFP_ZERO); - if (!new_m_info) - goto nomem; - m->m_info = new_m_info; - m->m_num_mds = new_num; - } - info = &m->m_info[mds]; info->global_id = global_id; info->state = state; @@ -247,14 +243,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } - if (m->m_num_mds > m->m_max_mds) { - /* find max up mds */ - for (i = m->m_num_mds; i >= m->m_max_mds; i--) { - if (i == 0 || m->m_info[i-1].state > 0) - break; - } - m->m_num_mds = i; - } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -396,7 +384,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_damaged) return false; - if (m->m_num_laggy > 0) + if (m->m_num_laggy == m->m_num_active_mds) return false; for (i = 0; i < m->m_num_mds; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 0067d767c9ae..3a66f4f926ce 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,8 +25,9 @@ struct ceph_mdsmap { u32 m_session_timeout; /* seconds */ u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; - u32 m_max_mds; /* size of m_addr, m_state arrays */ - int m_num_mds; + u32 m_max_mds; /* expected up:active mds number */ + int m_num_active_mds; /* actual up:active mds number */ + int m_num_mds; /* size of m_info array */ struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ From patchwork Mon Nov 25 11:08:27 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xiubo Li X-Patchwork-Id: 11260233 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 8648A13A4 for ; Mon, 25 Nov 2019 11:09:25 +0000 (UTC) Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by mail.kernel.org (Postfix) with ESMTP id 666E4207FD for ; Mon, 25 Nov 2019 11:09:25 +0000 (UTC) Authentication-Results: mail.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="XRcaXT2x" Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1727655AbfKYLJY (ORCPT ); Mon, 25 Nov 2019 06:09:24 -0500 Received: from us-smtp-delivery-1.mimecast.com ([207.211.31.120]:35434 "EHLO us-smtp-1.mimecast.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1727633AbfKYLJY (ORCPT ); Mon, 25 Nov 2019 06:09:24 -0500 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1574680163; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=aUB1SOEzee/uLv95tCGE3sqZ+srfP2ZV5gKnHlcz8MU=; b=XRcaXT2xBNpNvsweZsxDSjwEKvx22zKKNe2A/Oe4cpPD3Fxl/EwX26IhKd9YdZ7Mzt4nIl T0T5uwA3TPFzuam0O4VC1jLmA5oM7bdWna2Sb5pVY3d9y1Mz4QMaT9rOKtKmWcjxUXMSTH NQGKD2KTrqZ06IQq8xUDQCOpvcpgfEc= Received: from mimecast-mx01.redhat.com (mimecast-mx01.redhat.com [209.132.183.4]) (Using TLS) by relay.mimecast.com with ESMTP id us-mta-67-IZAEJYZwOD2FYEXJ6geodg-1; Mon, 25 Nov 2019 06:09:21 -0500 Received: from smtp.corp.redhat.com (int-mx04.intmail.prod.int.phx2.redhat.com [10.5.11.14]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mimecast-mx01.redhat.com (Postfix) with ESMTPS id A9321800585; Mon, 25 Nov 2019 11:09:20 +0000 (UTC) Received: from localhost.localdomain (ovpn-12-66.pek2.redhat.com [10.72.12.66]) by smtp.corp.redhat.com (Postfix) with ESMTP id F27E35D9CA; Mon, 25 Nov 2019 11:09:17 +0000 (UTC) From: xiubli@redhat.com To: jlayton@kernel.org Cc: sage@redhat.com, idryomov@gmail.com, zyan@redhat.com, pdonnell@redhat.com, ceph-devel@vger.kernel.org, Xiubo Li Subject: [PATCH v2 3/3] mdsmap: only choose one MDS who is in up:active state without laggy Date: Mon, 25 Nov 2019 06:08:27 -0500 Message-Id: <20191125110827.12827-4-xiubli@redhat.com> In-Reply-To: <20191125110827.12827-1-xiubli@redhat.com> References: <20191125110827.12827-1-xiubli@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 2.79 on 10.5.11.14 X-MC-Unique: IZAEJYZwOD2FYEXJ6geodg-1 X-Mimecast-Spam-Score: 0 Sender: ceph-devel-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: ceph-devel@vger.kernel.org From: Xiubo Li Even the MDS is in up:active state, but it also maybe laggy. Here will skip the laggy MDSs. Signed-off-by: Xiubo Li --- fs/ceph/mds_client.c | 13 +++++++++---- fs/ceph/mdsmap.c | 30 +++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0444288fe87e..2c92a1452876 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -972,14 +972,14 @@ static int __choose_mds(struct ceph_mds_client *mdsc, frag.frag, mds, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) + CEPH_MDS_STATE_ACTIVE && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } /* since this file/dir wasn't known to be * replicated, then we want to look for the * authoritative mds. */ - mode = USE_AUTH_MDS; if (frag.mds >= 0) { /* choose auth mds */ mds = frag.mds; @@ -987,9 +987,14 @@ static int __choose_mds(struct ceph_mds_client *mdsc, "frag %u mds%d (auth)\n", inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= - CEPH_MDS_STATE_ACTIVE) - goto out; + CEPH_MDS_STATE_ACTIVE) { + if (mode == USE_ANY_MDS && + !ceph_mdsmap_is_laggy(mdsc->mdsmap, + mds)) + goto out; + } } + mode = USE_AUTH_MDS; } } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index cc9ec959fe46..1e32cf486825 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -13,22 +13,24 @@ #include "super.h" +#define CEPH_MDS_IS_READY(i, ignore_laggy) \ + (m->m_info[i].state > 0 && (ignore_laggy ? true : !m->m_info[i].laggy)) -/* - * choose a random mds that is "up" (i.e. has a state > 0), or -1. - */ -int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) { int n = 0; int i, j; - /* special case for one mds */ + /* + * special case for one mds, no matter it is laggy or + * not we have no choice + */ if (1 == m->m_num_mds && m->m_info[0].state > 0) return 0; /* count */ for (i = 0; i < m->m_num_mds; i++) - if (m->m_info[i].state > 0) + if (CEPH_MDS_IS_READY(i, ignore_laggy)) n++; if (n == 0) return -1; @@ -36,7 +38,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) /* pick */ n = prandom_u32() % n; for (j = 0, i = 0; i < m->m_num_mds; i++) { - if (m->m_info[i].state > 0) + if (CEPH_MDS_IS_READY(i, ignore_laggy)) j++; if (j > n) break; @@ -45,6 +47,20 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return i; } +/* + * choose a random mds that is "up" (i.e. has a state > 0), or -1. + */ +int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) +{ + int mds; + + mds = __mdsmap_get_random_mds(m, false); + if (mds == m->m_num_mds || mds == -1) + mds = __mdsmap_get_random_mds(m, true); + + return mds == m->m_num_mds ? -1 : mds; +} + #define __decode_and_drop_type(p, end, type, bad) \ do { \ if (*p + sizeof(type) > end) \