From patchwork Sat Jun 18 13:51:55 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: James Simmons X-Patchwork-Id: 12886405 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id D6E12C43334 for ; Sat, 18 Jun 2022 14:00:47 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233293AbiFRNxL (ORCPT ); Sat, 18 Jun 2022 09:53:11 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:48178 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234164AbiFRNw6 (ORCPT ); Sat, 18 Jun 2022 09:52:58 -0400 Received: from smtp3.ccs.ornl.gov (smtp3.ccs.ornl.gov [160.91.203.39]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id C31BCE2E for ; Sat, 18 Jun 2022 06:52:55 -0700 (PDT) Received: from star.ccs.ornl.gov (star.ccs.ornl.gov [160.91.202.134]) by smtp3.ccs.ornl.gov (Postfix) with ESMTP id 1C98313E7; Sat, 18 Jun 2022 09:52:14 -0400 (EDT) Received: by star.ccs.ornl.gov (Postfix, from userid 2004) id 19855DC803; Sat, 18 Jun 2022 09:52:14 -0400 (EDT) From: James Simmons To: Eric Biggers , Andreas Dilger , NeilBrown Cc: linux-fscrypt@vger.kernel.org, Cyril Bordage , Chris Horn , James Simmons Subject: [PATCH 13/28] lnet: set max recovery interval duration Date: Sat, 18 Jun 2022 09:51:55 -0400 Message-Id: <1655560330-30743-14-git-send-email-jsimmons@infradead.org> X-Mailer: git-send-email 1.8.3.1 In-Reply-To: <1655560330-30743-1-git-send-email-jsimmons@infradead.org> References: <1655560330-30743-1-git-send-email-jsimmons@infradead.org> Precedence: bulk List-ID: X-Mailing-List: linux-fscrypt@vger.kernel.org From: Cyril Bordage Add a tunable parameter to limit the recovery ping interval which was previously statically set to 900. 
This can be done by using: lnetctl set max_recovery_ping_interval <value> WC-bug-id: https://jira.whamcloud.com/browse/LU-14979 Lustre-commit: 4027395fe463b6ea1 ("LU-14979 lnet: set max recovery interval duration") Signed-off-by: Cyril Bordage Signed-off-by: Chris Horn Reviewed-on: https://review.whamcloud.com/44927 Reviewed-by: Serguei Smirnov Reviewed-by: Frank Sehr Reviewed-by: Oleg Drokin Signed-off-by: James Simmons --- include/linux/lnet/lib-lnet.h | 9 ++++---- net/lnet/lnet/api-ni.c | 49 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/include/linux/lnet/lib-lnet.h b/include/linux/lnet/lib-lnet.h index ceb12b1..e21866b 100644 --- a/include/linux/lnet/lib-lnet.h +++ b/include/linux/lnet/lib-lnet.h @@ -559,6 +559,8 @@ unsigned int lnet_nid_cpt_hash(struct lnet_nid *nid, extern unsigned int lnet_recovery_limit; extern unsigned int lnet_peer_discovery_disabled; extern unsigned int lnet_drop_asym_route; +extern unsigned int lnet_max_recovery_ping_interval; +extern unsigned int lnet_max_recovery_ping_count; extern unsigned int router_sensitivity_percentage; extern int alive_router_check_interval; extern int live_router_check_interval; @@ -1009,15 +1011,14 @@ int lnet_get_peer_ni_info(u32 peer_index, u64 *nid, return false; } -#define LNET_RECOVERY_INTERVAL_MAX 900 static inline unsigned int lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now) { unsigned int interval; - /* 2^9 = 512, 2^10 = 1024 */ - if (ping_count > 9) - interval = LNET_RECOVERY_INTERVAL_MAX; + /* lnet_max_recovery_interval <= 2^lnet_max_recovery_ping_count */ + if (ping_count > lnet_max_recovery_ping_count) + interval = lnet_max_recovery_ping_interval; else interval = 1 << ping_count; diff --git a/net/lnet/lnet/api-ni.c b/net/lnet/lnet/api-ni.c index 8643ac8d..165728d 100644 --- a/net/lnet/lnet/api-ni.c +++ b/net/lnet/lnet/api-ni.c @@ -117,6 +117,22 @@ static int recovery_interval_set(const char *val, 
MODULE_PARM_DESC(lnet_recovery_limit, "How long to attempt recovery of unhealthy peer interfaces in seconds. Set to 0 to allow indefinite recovery"); +unsigned int lnet_max_recovery_ping_interval = 900; +unsigned int lnet_max_recovery_ping_count = 9; +static int max_recovery_ping_interval_set(const char *val, + const struct kernel_param *kp); + +#define param_check_max_recovery_ping_interval(name, p) \ + __param_check(name, p, int) + +static struct kernel_param_ops param_ops_max_recovery_ping_interval = { + .set = max_recovery_ping_interval_set, + .get = param_get_int, +}; +module_param(lnet_max_recovery_ping_interval, max_recovery_ping_interval, 0644); +MODULE_PARM_DESC(lnet_max_recovery_ping_interval, + "The max interval between LNet recovery pings, in seconds"); + static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT; static int intf_max_set(const char *val, const struct kernel_param *kp); module_param_call(lnet_interfaces_max, intf_max_set, param_get_int, @@ -258,6 +274,39 @@ static int lnet_discover(struct lnet_process_id id, u32 force, } static int +max_recovery_ping_interval_set(const char *val, const struct kernel_param *kp) +{ + int rc; + unsigned long value; + + rc = kstrtoul(val, 0, &value); + if (rc) { + CERROR("Invalid module parameter value for 'lnet_max_recovery_ping_interval'\n"); + return rc; + } + + if (!value) { + CERROR("Invalid max ping timeout. Must be strictly positive\n"); + return -EINVAL; + } + + /* The purpose of locking the api_mutex here is to ensure that + * the correct value ends up stored properly. + */ + mutex_lock(&the_lnet.ln_api_mutex); + lnet_max_recovery_ping_interval = value; + lnet_max_recovery_ping_count = 0; + value >>= 1; + while (value) { + lnet_max_recovery_ping_count++; + value >>= 1; + } + mutex_unlock(&the_lnet.ln_api_mutex); + + return 0; +} + +static int discovery_set(const char *val, const struct kernel_param *kp) { int rc;