diff mbox series

sched/wait: introduce endmark in __wake_up_common

Message ID 20231010032833.398033-1-robinlai@tencent.com (mailing list archive)
State New
Headers show
Series sched/wait: introduce endmark in __wake_up_common | expand

Commit Message

lai bin Oct. 10, 2023, 3:28 a.m. UTC
Without this patch applied, it can cause the waker to fall into an
infinite loop in some cases. The commit 2554db916586 ("sched/wait: Break
up long wake list walk") introduces WQ_FLAG_BOOKMARK to break up long
wake list walk. When the number of walked entries reach 64, the waker
will record scan position and release the queue lock, which reduces
interrupts and rescheduling latency.

Let's take an example, ltp-aiodio case of runltp-ng create 100 processes
for io testing. These processes write the same test file and wait for page
writing complete. Because these processes are all writing to the same
file, they may all be waiting for the writeback bit of the same page
to be cleared. When the page writeback is completed, the end_page_writeback
will clear the writeback bit of the page and wake up all processes on the
wake list for the page. At the same time, another process could submit
the page and set the writeback bit again. When the awakened processes find
that the writeback bit has not been cleared, thery will try to add
themselves to the wake list immediately. Because of the WQ_FLAG_BOOKMARK
feature, the awakened processes will be added to the tail of wake list
again after the waker releases the queue lock. It causes the waker to
fall into an infinite loop.

Therefore, we introduce the endmark to indicate the end of bookmark state.
When we get the endmark entry, stop placing the bookmark flag and hold the
lock until all remaining entries have been walked.

Signed-off-by: Bin Lai <robinlai@tencent.com>

Comments

Matthew Wilcox (Oracle) Oct. 10, 2023, 3:57 a.m. UTC | #1
On Tue, Oct 10, 2023 at 11:28:33AM +0800, Bin Lai wrote:
> Without this patch applied, it can cause the waker to fall into an
> infinite loop in some cases. The commit 2554db916586 ("sched/wait: Break
> up long wake list walk") introduces WQ_FLAG_BOOKMARK to break up long
> wake list walk. When the number of walked entries reach 64, the waker
> will record scan position and release the queue lock, which reduces
> interrupts and rescheduling latency.

Maybe we could try removing bookmarks instead?  There was a thread a
while ago where we agreed they weren't actually useful any more.

Patches to follow ...
lai bin Oct. 10, 2023, 7:04 a.m. UTC | #2
Yes,  you are right. Now that bookmarks weren't actually useful any
more, removing  bookmarks  is a better way to address the livelocks.

Matthew Wilcox <willy@infradead.org> 于2023年10月10日周二 11:58写道:
>
> On Tue, Oct 10, 2023 at 11:28:33AM +0800, Bin Lai wrote:
> > Without this patch applied, it can cause the waker to fall into an
> > infinite loop in some cases. The commit 2554db916586 ("sched/wait: Break
> > up long wake list walk") introduces WQ_FLAG_BOOKMARK to break up long
> > wake list walk. When the number of walked entries reach 64, the waker
> > will record scan position and release the queue lock, which reduces
> > interrupts and rescheduling latency.
>
> Maybe we could try removing bookmarks instead?  There was a thread a
> while ago where we agreed they weren't actually useful any more.
>
> Patches to follow ...
diff mbox series

Patch

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3413babd2db4 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -213,7 +213,8 @@  int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *
 void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark,
+		wait_queue_entry_t *endmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..9ecb59193710 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -79,10 +79,11 @@  EXPORT_SYMBOL(remove_wait_queue);
  */
 static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, int wake_flags, void *key,
-			wait_queue_entry_t *bookmark)
+			wait_queue_entry_t *bookmark,
+			wait_queue_entry_t *endmark)
 {
 	wait_queue_entry_t *curr, *next;
-	int cnt = 0;
+	int cnt = 0, touch_endmark = 0;
 
 	lockdep_assert_held(&wq_head->lock);
 
@@ -95,12 +96,17 @@  static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
 
 	if (&curr->entry == &wq_head->head)
-		return nr_exclusive;
+		goto out;
 
 	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
 		int ret;
 
+		if (curr == endmark) {
+			touch_endmark = 1;
+			continue;
+		}
+
 		if (flags & WQ_FLAG_BOOKMARK)
 			continue;
 
@@ -110,14 +116,24 @@  static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
 
-		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+		if (bookmark && !touch_endmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
 				(&next->entry != &wq_head->head)) {
 			bookmark->flags = WQ_FLAG_BOOKMARK;
 			list_add_tail(&bookmark->entry, &next->entry);
-			break;
+
+			if (endmark && !(endmark->flags & WQ_FLAG_BOOKMARK)) {
+				endmark->flags = WQ_FLAG_BOOKMARK;
+				list_add_tail(&endmark->entry, &wq_head->head);
+			}
+
+			return nr_exclusive;
 		}
 	}
 
+out:
+	if (endmark && (endmark->flags & WQ_FLAG_BOOKMARK))
+		list_del(&endmark->entry);
+
 	return nr_exclusive;
 }
 
@@ -125,7 +141,7 @@  static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
 			int nr_exclusive, int wake_flags, void *key)
 {
 	unsigned long flags;
-	wait_queue_entry_t bookmark;
+	wait_queue_entry_t bookmark, endmark;
 	int remaining = nr_exclusive;
 
 	bookmark.flags = 0;
@@ -133,10 +149,15 @@  static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
 	bookmark.func = NULL;
 	INIT_LIST_HEAD(&bookmark.entry);
 
+	endmark.flags = 0;
+	endmark.private = NULL;
+	endmark.func = NULL;
+	INIT_LIST_HEAD(&endmark.entry);
+
 	do {
 		spin_lock_irqsave(&wq_head->lock, flags);
 		remaining = __wake_up_common(wq_head, mode, remaining,
-						wake_flags, key, &bookmark);
+					     wake_flags, key, &bookmark, &endmark);
 		spin_unlock_irqrestore(&wq_head->lock, flags);
 	} while (bookmark.flags & WQ_FLAG_BOOKMARK);
 
@@ -171,20 +192,21 @@  void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
+	__wake_up_common(wq_head, mode, 1, 0, key, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark,
+		wait_queue_entry_t *endmark)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+	__wake_up_common(wq_head, mode, 1, 0, key, bookmark, endmark);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
 
@@ -233,7 +255,7 @@  EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
 			       unsigned int mode, void *key)
 {
-        __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+	__wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ea4387053e8..49dc8620271d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1135,7 +1135,7 @@  static void folio_wake_bit(struct folio *folio, int bit_nr)
 	wait_queue_head_t *q = folio_waitqueue(folio);
 	struct wait_page_key key;
 	unsigned long flags;
-	wait_queue_entry_t bookmark;
+	wait_queue_entry_t bookmark, endmark;
 
 	key.folio = folio;
 	key.bit_nr = bit_nr;
@@ -1146,8 +1146,13 @@  static void folio_wake_bit(struct folio *folio, int bit_nr)
 	bookmark.func = NULL;
 	INIT_LIST_HEAD(&bookmark.entry);
 
+	endmark.flags = 0;
+	endmark.private = NULL;
+	endmark.func = NULL;
+	INIT_LIST_HEAD(&endmark.entry);
+
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark, &endmark);
 
 	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
 		/*
@@ -1159,7 +1164,7 @@  static void folio_wake_bit(struct folio *folio, int bit_nr)
 		spin_unlock_irqrestore(&q->lock, flags);
 		cpu_relax();
 		spin_lock_irqsave(&q->lock, flags);
-		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark, &endmark);
 	}
 
 	/*