diff mbox series

[15/15] lustre: mgc: configurable wait-to-reprocess time

Message ID 1625685076-1964-16-git-send-email-jsimmons@infradead.org (mailing list archive)
State New, archived
Headers show
Series lustre: updates to OpenSFS tree as of July 7 2021 | expand

Commit Message

James Simmons July 7, 2021, 7:11 p.m. UTC
From: Alex Zhuravlev <bzzz@whamcloud.com>

so we can set it shorter, for testing purposes at least. to change
minimal wait time MGC module option 'mgc_requeue_timeout_min'
should be used (in seconds). additionally a random value upto
mgc_requeue_timeout_min is added to avoid a flood of config re-read
requests from clients. if mgc_requeue_timeout_min is set to 0,
then random part will be upto 1 second.

ost-pools: before: 5840s, after:a 3474s
sanity-flr: before: 1575s, after: 1381s
sanity-quota: before: 10679s, after: 9703s

WC-bug-id: https://jira.whamcloud.com/browse/LU-14516
Lustre-commit: 04b2da6180d3c8eda ("LU-14516 mgc: configurable wait-to-reprocess time")
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/42020
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Aurelien Degremont <degremoa@amazon.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/mgc/mgc_internal.h |  8 ++++++++
 fs/lustre/mgc/mgc_request.c  | 44 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 41 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/fs/lustre/mgc/mgc_internal.h b/fs/lustre/mgc/mgc_internal.h
index a2a09d4..91f5fa1 100644
--- a/fs/lustre/mgc/mgc_internal.h
+++ b/fs/lustre/mgc/mgc_internal.h
@@ -43,6 +43,14 @@ 
 
 int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld);
 
+/* this timeout represents how many seconds MGC should wait before
+ * requeue config and recover lock to the MGS. We need to randomize this
+ * in order to not flood the MGS.
+ */
+#define MGC_TIMEOUT_MIN_SECONDS		5
+
+extern unsigned int mgc_requeue_timeout_min;
+
 static inline bool cld_is_sptlrpc(struct config_llog_data *cld)
 {
 	return cld->cld_type == MGS_CFG_T_SPTLRPC;
diff --git a/fs/lustre/mgc/mgc_request.c b/fs/lustre/mgc/mgc_request.c
index 1dfc74b..50044aa2 100644
--- a/fs/lustre/mgc/mgc_request.c
+++ b/fs/lustre/mgc/mgc_request.c
@@ -530,13 +530,6 @@  static void do_requeue(struct config_llog_data *cld)
 	up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem);
 }
 
-/* this timeout represents how many seconds MGC should wait before
- * requeue config and recover lock to the MGS. We need to randomize this
- * in order to not flood the MGS.
- */
-#define MGC_TIMEOUT_MIN_SECONDS   5
-#define MGC_TIMEOUT_RAND_CENTISEC 500
-
 static int mgc_requeue_thread(void *data)
 {
 	bool first = true;
@@ -548,7 +541,6 @@  static int mgc_requeue_thread(void *data)
 	rq_state |= RQ_RUNNING;
 	while (!(rq_state & RQ_STOP)) {
 		struct config_llog_data *cld, *cld_prev;
-		int rand = prandom_u32_max(MGC_TIMEOUT_RAND_CENTISEC);
 		int to;
 
 		/* Any new or requeued lostlocks will change the state */
@@ -565,11 +557,11 @@  static int mgc_requeue_thread(void *data)
 		 * random so everyone doesn't try to reconnect at once.
 		 */
 		/* rand is centi-seconds, "to" is in centi-HZ */
-		to = MGC_TIMEOUT_MIN_SECONDS * HZ * 100;
-		to += rand * HZ;
+		to = mgc_requeue_timeout_min == 0 ? 1 : mgc_requeue_timeout_min;
+		to = mgc_requeue_timeout_min * HZ + prandom_u32_max(to * HZ);
 		wait_event_idle_timeout(rq_waitq,
 					rq_state & (RQ_STOP | RQ_PRECLEANUP),
-					to/100);
+					to);
 
 		/*
 		 * iterate & processing through the list. for each cld, process
@@ -1835,6 +1827,36 @@  static int mgc_process_config(struct obd_device *obd, u32 len, void *buf)
 	.process_config	= mgc_process_config,
 };
 
+static int mgc_param_requeue_timeout_min_set(const char *val,
+					     const struct kernel_param *kp)
+{
+	int rc;
+	unsigned int num;
+
+	rc = kstrtouint(val, 0, &num);
+	if (rc < 0)
+		return rc;
+	if (num > 120)
+		return -EINVAL;
+
+	mgc_requeue_timeout_min = num;
+
+	return 0;
+}
+
+static struct kernel_param_ops param_ops_requeue_timeout_min = {
+	.set = mgc_param_requeue_timeout_min_set,
+	.get = param_get_uint,
+};
+
+#define param_check_requeue_timeout_min(name, p) \
+		__param_check(name, p, unsigned int)
+
+unsigned int mgc_requeue_timeout_min = MGC_TIMEOUT_MIN_SECONDS;
+module_param_call(mgc_requeue_timeout_min, mgc_param_requeue_timeout_min_set,
+		  param_get_uint, &param_ops_requeue_timeout_min, 0644);
+MODULE_PARM_DESC(mgc_requeue_timeout_min, "Minimal requeue time to refresh logs");
+
 static int __init mgc_init(void)
 {
 	int rc;