diff mbox series

[12/27] lustre: obdclass: fix rpc slot leakage

Message ID 1681739243-29375-13-git-send-email-jsimmons@infradead.org (mailing list archive)
State New, archived
Headers show
Series lustre: sync to OpenSFS branch April 17, 2023 | expand

Commit Message

James Simmons April 17, 2023, 1:47 p.m. UTC
From: Alex Zhuravlev <bzzz@whamcloud.com>

obd_get_mod_rpc_slot() can race with obd_put_mod_rpc_slot():
finishing wait_woken() resets WQ_FLAG_WOKEN (which is set
when the corresponding thread gets a slot, incrementing
cl_mod_rpcs_in_flight). Then another thread executing
__wake_up_locked_key() may find that wq_entry again and call
claim_mod_rpc_function() one more time, again incrementing
cl_mod_rpcs_in_flight. Thus it's incremented twice for a
single obd_get_mod_rpc_slot().

flags &= ~WQ_FLAG_WOKEN
list_add()
wait_woken()
  schedule              claim_mod_rpc_function()
                                cl_mod_rpcs_in_flight++
                                wake_up()

  flags &= ~WQ_FLAG_WOKEN

                        #3: obd_put_mod_rpc_slot()
                        claim_mod_rpc_function()
                                cl_mod_rpcs_in_flight++
                                wake_up()
list_del()

The patch introduces a replacement for WQ_FLAG_WOKEN, a 'woken' flag in
struct mod_waiter, which is never reset once set.

Fixes: 6d398c0843 ("lustre: obdclass: improve precision of wakeups for mod_rpcs")
WC-bug-id: https://jira.whamcloud.com/browse/LU-16633
Lustre-commit: 91a3726f313df33e09 ("LU-16633 obdclass: fix rpc slot leakage")
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/50261
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
---
 fs/lustre/mdc/mdc_request.c |  3 +++
 fs/lustre/obdclass/genops.c | 11 +++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/fs/lustre/mdc/mdc_request.c b/fs/lustre/mdc/mdc_request.c
index 58ea982..15e58e8 100644
--- a/fs/lustre/mdc/mdc_request.c
+++ b/fs/lustre/mdc/mdc_request.c
@@ -2964,6 +2964,9 @@  static int mdc_precleanup(struct obd_device *obd)
 
 static int mdc_cleanup(struct obd_device *obd)
 {
+	struct client_obd *cli = &obd->u.cli;
+
+	LASSERT(cli->cl_mod_rpcs_in_flight == 0);
 	return osc_cleanup_common(obd);
 }
 
diff --git a/fs/lustre/obdclass/genops.c b/fs/lustre/obdclass/genops.c
index b6bde00..43772aa 100644
--- a/fs/lustre/obdclass/genops.c
+++ b/fs/lustre/obdclass/genops.c
@@ -1487,6 +1487,7 @@  int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq)
 struct mod_waiter {
 	struct client_obd *cli;
 	bool close_req;
+	bool woken;
 	wait_queue_entry_t wqe;
 };
 static int claim_mod_rpc_function(wait_queue_entry_t *wq_entry,
@@ -1499,10 +1500,9 @@  static int claim_mod_rpc_function(wait_queue_entry_t *wq_entry,
 	int ret;
 
 	/* As woken_wake_function() doesn't remove us from the wait_queue,
-	 * we could get called twice for the same thread - take care.
+	 * we use own flag to ensure we're called just once.
 	 */
-	if (wq_entry->flags & WQ_FLAG_WOKEN)
-		/* Already woke this thread, don't try again */
+	if (w->woken)
 		return 0;
 
 	/* A slot is available if
@@ -1516,6 +1516,7 @@  static int claim_mod_rpc_function(wait_queue_entry_t *wq_entry,
 		if (w->close_req)
 			cli->cl_close_rpcs_in_flight++;
 		ret = woken_wake_function(wq_entry, mode, flags, key);
+		w->woken = true;
 	} else if (cli->cl_close_rpcs_in_flight)
 		/* No other waiter could be woken */
 		ret = -1;
@@ -1543,6 +1544,7 @@  u16 obd_get_mod_rpc_slot(struct client_obd *cli, u32 opc)
 	struct mod_waiter wait = {
 		.cli = cli,
 		.close_req = (opc == MDS_CLOSE),
+		.woken = false,
 	};
 	u16 i, max;
 
@@ -1556,7 +1558,8 @@  u16 obd_get_mod_rpc_slot(struct client_obd *cli, u32 opc)
 	 * and there will be no need to wait.
 	 */
 	wake_up_locked(&cli->cl_mod_rpcs_waitq);
-	if (!(wait.wqe.flags & WQ_FLAG_WOKEN)) {
+	/* XXX: handle spurious wakeups (from unknown yet source */
+	while (wait.woken == false) {
 		spin_unlock_irq(&cli->cl_mod_rpcs_waitq.lock);
 		wait_woken(&wait.wqe, TASK_UNINTERRUPTIBLE,
 			   MAX_SCHEDULE_TIMEOUT);