From patchwork Thu Feb 27 21:13:07 2020 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: James Simmons X-Patchwork-Id: 11410199 Return-Path: Received: from mail.kernel.org (pdx-korg-mail-1.web.codeaurora.org [172.30.200.123]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 858E2138D for ; Thu, 27 Feb 2020 21:32:33 +0000 (UTC) Received: from pdx1-mailman02.dreamhost.com (pdx1-mailman02.dreamhost.com [64.90.62.194]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.kernel.org (Postfix) with ESMTPS id 6DBAF24677 for ; Thu, 27 Feb 2020 21:32:33 +0000 (UTC) DMARC-Filter: OpenDMARC Filter v1.3.2 mail.kernel.org 6DBAF24677 Authentication-Results: mail.kernel.org; dmarc=none (p=none dis=none) header.from=infradead.org Authentication-Results: mail.kernel.org; spf=none smtp.mailfrom=lustre-devel-bounces@lists.lustre.org Received: from pdx1-mailman02.dreamhost.com (localhost [IPv6:::1]) by pdx1-mailman02.dreamhost.com (Postfix) with ESMTP id 6AA1D348B99; Thu, 27 Feb 2020 13:27:35 -0800 (PST) X-Original-To: lustre-devel@lists.lustre.org Delivered-To: lustre-devel-lustre.org@pdx1-mailman02.dreamhost.com Received: from smtp3.ccs.ornl.gov (smtp3.ccs.ornl.gov [160.91.203.39]) by pdx1-mailman02.dreamhost.com (Postfix) with ESMTP id 675FD21FCC3 for ; Thu, 27 Feb 2020 13:19:56 -0800 (PST) Received: from star.ccs.ornl.gov (star.ccs.ornl.gov [160.91.202.134]) by smtp3.ccs.ornl.gov (Postfix) with ESMTP id 165308A54; Thu, 27 Feb 2020 16:18:17 -0500 (EST) Received: by star.ccs.ornl.gov (Postfix, from userid 2004) id 1517846D; Thu, 27 Feb 2020 16:18:17 -0500 (EST) From: James Simmons To: Andreas Dilger , Oleg Drokin , NeilBrown Date: Thu, 27 Feb 2020 16:13:07 -0500 Message-Id: <1582838290-17243-320-git-send-email-jsimmons@infradead.org> X-Mailer: git-send-email 1.8.3.1 In-Reply-To: <1582838290-17243-1-git-send-email-jsimmons@infradead.org> 
References: <1582838290-17243-1-git-send-email-jsimmons@infradead.org> Subject: [lustre-devel] [PATCH 319/622] lnet: setup health timeout defaults X-BeenThere: lustre-devel@lists.lustre.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: "For discussing Lustre software development." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: Amir Shehata , Lustre Development List MIME-Version: 1.0 Errors-To: lustre-devel-bounces@lists.lustre.org Sender: "lustre-devel" From: Amir Shehata Enable the health feature by default. Set the transaction timeout to a default of 10 seconds and the retry count to 3 when health is enabled. When health is disabled, set the default transaction timeout to 50 seconds and the retry count to 0. When toggling between health enabled and disabled, the matching defaults are always applied. WC-bug-id: https://jira.whamcloud.com/browse/LU-11816 Lustre-commit: 8632e94aeb7e ("LU-11816 lnet: setup health timeout defaults") Signed-off-by: Amir Shehata Reviewed-on: https://review.whamcloud.com/34252 Reviewed-by: Olaf Weber Reviewed-by: Sebastien Buisson Reviewed-by: Chris Horn Signed-off-by: James Simmons --- net/lnet/lnet/api-ni.c | 55 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/net/lnet/lnet/api-ni.c b/net/lnet/lnet/api-ni.c index 1388bd4..aeb9d92 100644 --- a/net/lnet/lnet/api-ni.c +++ b/net/lnet/lnet/api-ni.c @@ -79,10 +79,10 @@ struct lnet the_lnet = { "NUMA range to consider during Multi-Rail selection"); /* lnet_health_sensitivity determines by how much we decrement the health - * value on sending error. The value defaults to 0, which means health - * checking is turned off by default. + * value on sending error. The value defaults to 100, which means + * interface health is decremented by 100 points on every failure. 
*/ -unsigned int lnet_health_sensitivity; +unsigned int lnet_health_sensitivity = 100; static int sensitivity_set(const char *val, const struct kernel_param *kp); static struct kernel_param_ops param_ops_health_sensitivity = { .set = sensitivity_set, @@ -140,7 +140,10 @@ static int recovery_interval_set(const char *val, MODULE_PARM_DESC(lnet_drop_asym_route, "Set to 1 to drop asymmetrical route messages."); -unsigned int lnet_transaction_timeout = 50; +#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 +#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10 + +unsigned int lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; static int transaction_to_set(const char *val, const struct kernel_param *kp); static struct kernel_param_ops param_ops_transaction_timeout = { .set = transaction_to_set, @@ -153,7 +156,8 @@ static int recovery_interval_set(const char *val, MODULE_PARM_DESC(lnet_transaction_timeout, "Maximum number of seconds to wait for a peer response."); -unsigned int lnet_retry_count; +#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3 +unsigned int lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; static int retry_count_set(const char *val, const struct kernel_param *kp); static struct kernel_param_ops param_ops_retry_count = { .set = retry_count_set, @@ -201,11 +205,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force, */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - if (value > LNET_MAX_HEALTH_VALUE) { mutex_unlock(&the_lnet.ln_api_mutex); CERROR("Invalid health value. Maximum: %d value = %lu\n", @@ -213,6 +212,22 @@ static int lnet_discover(struct lnet_process_id id, u32 force, return -EINVAL; } + /* if we're turning on health then use the health timeout + * defaults. 
+ */ + if (*sensitivity == 0 && value != 0) { + lnet_transaction_timeout = + LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; + lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; + /* if we're turning off health then use the no health timeout + * default. + */ + } else if (*sensitivity != 0 && value == 0) { + lnet_transaction_timeout = + LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; + lnet_retry_count = 0; + } + *sensitivity = value; mutex_unlock(&the_lnet.ln_api_mutex); @@ -243,11 +258,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force, */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - *interval = value; mutex_unlock(&the_lnet.ln_api_mutex); @@ -353,11 +363,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force, */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - if (value < lnet_retry_count || value == 0) { mutex_unlock(&the_lnet.ln_api_mutex); CERROR("Invalid value for lnet_transaction_timeout (%lu). Has to be greater than lnet_retry_count (%u)\n", @@ -399,9 +404,10 @@ static int lnet_discover(struct lnet_process_id id, u32 force, */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { + if (lnet_health_sensitivity == 0) { mutex_unlock(&the_lnet.ln_api_mutex); - return 0; + CERROR("Can not set retry_count when health feature is turned off\n"); + return -EINVAL; } if (value > lnet_transaction_timeout) { @@ -411,11 +417,6 @@ static int lnet_discover(struct lnet_process_id id, u32 force, return -EINVAL; } - if (value == *retry_count) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - *retry_count = value; if (value == 0)