
v5.8.x stability on parisc

Message ID bd1c6298-db1a-79ab-279b-60daad6bed22@bell.net (mailing list archive)
State Accepted, archived
Series v5.8.x stability on parisc

Commit Message

John David Anglin Sept. 27, 2020, 2:17 p.m. UTC
With this patch collection, I haven't seen any stalls on my rp3440 and c8000 running v5.8.x.  Further,
I haven't had any signing failures on the buildd machine.  The rp3440 (mx3210) has also been successful in
building several packages that failed multiple times on the c8000 buildd.

I think this is due to the three changes at the bottom from Linus.

Regards,
Dave

Comments

Helge Deller Sept. 27, 2020, 7:45 p.m. UTC | #1
On 9/27/20 4:17 PM, John David Anglin wrote:
> With this patch collection, I haven't seen any stalls on my rp3440 and c8000 running v5.8.x.  Further,
> I haven't had any signing failures on the buildd machine.  The rp3440 (mx3210) has also been successful in
> building several packages that failed multiple times on the c8000 buildd.
>
> I think this is due to the three changes at the bottom from Linus.

Weren't those already added upstream (or variants of them)?

> diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h
> index 640d46edf32e..c705decf2bed 100644
> --- a/arch/parisc/include/asm/barrier.h
> +++ b/arch/parisc/include/asm/barrier.h
> @@ -2,11 +2,15 @@
>  /* The synchronize caches instruction executes as a nop on systems in
>     which all memory references are performed in order. */
> -#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory")
> +#define synchronize_caches() asm volatile("sync" \
> +	ALTERNATIVE(ALT_COND_NO_SMP, INSN_NOP) \
> +	: : : "memory")

I wonder if this has any impact.
If it's a UP system, won't this "sync" just be a "nop" anyway?
Did you run some performance tests, just to check?

Would you mind sending separate patches for each logical change?

Thanks for your great work!
Helge
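
For context on the ALTERNATIVE() mechanism under discussion: parisc patches such
instructions at boot, once the number of online CPUs is known.  Below is a minimal
sketch of that patching pass, loosely modeled on arch/parisc/kernel/alternative.c;
the struct layout and helper signature are illustrative assumptions, not the exact
kernel code.

struct alt_instr {
	s32 orig_offset;	/* self-relative offset to the insn to patch */
	s16 len;		/* extent of the original instructions */
	u16 cond;		/* condition, e.g. ALT_COND_NO_SMP */
	u32 replacement;	/* replacement insn, e.g. INSN_NOP */
};

static void __init apply_alternatives(struct alt_instr *start,
				      struct alt_instr *end, int num_cpus)
{
	struct alt_instr *a;

	for (a = start; a < end; a++) {
		/* Recover the address of the original instruction. */
		u32 *insn = (u32 *)((char *)&a->orig_offset + a->orig_offset);

		/* ALT_COND_NO_SMP only fires when a single CPU is online. */
		if (a->cond == ALT_COND_NO_SMP && num_cpus != 1)
			continue;

		*insn = a->replacement;	/* e.g. "sync" becomes INSN_NOP */
	}
	/* The real implementation also flushes the patched I-cache range. */
}

The win is thus for a CONFIG_SMP kernel that boots and finds only one CPU, which
Dave's reply below spells out.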
John David Anglin Sept. 27, 2020, 9:24 p.m. UTC | #2
On 2020-09-27 3:45 p.m., Helge Deller wrote:
> On 9/27/20 4:17 PM, John David Anglin wrote:
>> With this patch collection, I haven't seen any stalls on my rp3440 and c8000 running v5.8.x.  Further,
>> I haven't had any signing failures on the buildd machine.  The rp3440 (mx3210) has also been successful in
>> building several packages that failed multiple times on the c8000 buildd.
>>
>> I think this is due to the three changes at the bottom from Linus.
> Weren't those already added upstream (or variants of them)?
I thought so, but nothing has been committed to v5.8.x that conflicts with the changes.  I also searched the git log
for similar commits but I didn't find anything.  So, I'm puzzled as to what happened to these changes.  This is the
main reason I threw out my patch list for discussion.

The following packages built successfully on mx3210 after failing on c8000: tigervnc, postgresql-13, verse, bali-phy
and libbpf.  I also tried mariadb-10.5 but it had the same testsuite failure as before.

>
>> diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h
>> index 640d46edf32e..c705decf2bed 100644
>> --- a/arch/parisc/include/asm/barrier.h
>> +++ b/arch/parisc/include/asm/barrier.h
>> @@ -2,11 +2,15 @@
>>  /* The synchronize caches instruction executes as a nop on systems in
>>     which all memory references are performed in order. */
>> -#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory")
>> +#define synchronize_caches() asm volatile("sync" \
>> +	ALTERNATIVE(ALT_COND_NO_SMP, INSN_NOP) \
>> +	: : : "memory")
> I wonder if this has any impact.
> If it's a UP system, won't this "sync" just be a "nop" anyway?
I believe we want this when an SMP kernel is run on a UP machine or an SMP machine running with only one CPU.
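
Put differently, ALT_COND_NO_SMP is a boot-time test rather than a compile-time one.
A rough sketch of the condition (num_online_cpus() is the real kernel helper; this
wrapper is hypothetical):

static bool alt_cond_no_smp_applies(void)
{
	/*
	 * A CONFIG_SMP kernel still gets "sync" patched to a nop when
	 * it finds a single CPU at boot: either a true UP machine, or
	 * an SMP-capable box with only one CPU online.
	 */
	return num_online_cpus() == 1;
}
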
> Did you run some performance tests, just to check?
No.
>
> Would you mind sending separate patches for each logical change?
>
I'll try to get there but I'm currently focusing on toolchain issues.

Regards,
Dave
John David Anglin Sept. 29, 2020, 6 p.m. UTC | #3
On 2020-09-27 5:24 p.m., John David Anglin wrote:
> I thought so, but nothing has been committed to v5.8.x that conflicts with the changes.  I also searched the git log
> for similar commits but I didn't find anything.  So, I'm puzzled as to what happened to these changes.  This is the
> main reason I threw out my patch list for discussion.
A revised patch is in v5.9-rc7.

Patch

diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h
index 640d46edf32e..c705decf2bed 100644
--- a/arch/parisc/include/asm/barrier.h
+++ b/arch/parisc/include/asm/barrier.h
@@ -2,11 +2,15 @@ 
 #ifndef __ASM_BARRIER_H
 #define __ASM_BARRIER_H

+#include <asm/alternative.h>
+
 #ifndef __ASSEMBLY__

 /* The synchronize caches instruction executes as a nop on systems in
    which all memory references are performed in order. */
-#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory")
+#define synchronize_caches() asm volatile("sync" \
+	ALTERNATIVE(ALT_COND_NO_SMP, INSN_NOP) \
+	: : : "memory")

 #if defined(CONFIG_SMP)
 #define mb()		do { synchronize_caches(); } while (0)
diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h
index 068958575871..cf5ee9b0b393 100644
--- a/arch/parisc/include/asm/cmpxchg.h
+++ b/arch/parisc/include/asm/cmpxchg.h
@@ -14,22 +14,22 @@ 
 extern void __xchg_called_with_bad_pointer(void);

 /* __xchg32/64 defined in arch/parisc/lib/bitops.c */
-extern unsigned long __xchg8(char, char *);
-extern unsigned long __xchg32(int, int *);
+extern unsigned long __xchg8(char, volatile char *);
+extern unsigned long __xchg32(int, volatile int *);
 #ifdef CONFIG_64BIT
-extern unsigned long __xchg64(unsigned long, unsigned long *);
+extern unsigned long __xchg64(unsigned long, volatile unsigned long *);
 #endif

 /* optimizer better get rid of switch since size is a constant */
 static inline unsigned long
-__xchg(unsigned long x, __volatile__ void *ptr, int size)
+__xchg(unsigned long x, volatile void *ptr, int size)
 {
 	switch (size) {
 #ifdef CONFIG_64BIT
-	case 8: return __xchg64(x, (unsigned long *) ptr);
+	case 8: return __xchg64(x, (volatile unsigned long *) ptr);
 #endif
-	case 4: return __xchg32((int) x, (int *) ptr);
-	case 1: return __xchg8((char) x, (char *) ptr);
+	case 4: return __xchg32((int) x, (volatile int *) ptr);
+	case 1: return __xchg8((char) x, (volatile char *) ptr);
 	}
 	__xchg_called_with_bad_pointer();
 	return x;
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index c459f656c8c3..fceb9cf02fb3 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -16,7 +16,7 @@  static inline void
 _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
 {
 	extern u32 lws_lock_start[];
-	long index = ((long)uaddr & 0xf0) >> 2;
+	long index = ((long)uaddr & 0x3f8) >> 1;
 	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
 	local_irq_save(*flags);
 	arch_spin_lock(s);
@@ -26,7 +26,7 @@  static inline void
 _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
 {
 	extern u32 lws_lock_start[];
-	long index = ((long)uaddr & 0xf0) >> 2;
+	long index = ((long)uaddr & 0x3f8) >> 1;
 	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
 	arch_spin_unlock(s);
 	local_irq_restore(*flags);
diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h
index e080143e79a3..0b182450a2fb 100644
--- a/arch/parisc/include/asm/ldcw.h
+++ b/arch/parisc/include/asm/ldcw.h
@@ -51,6 +51,19 @@ 
 	__ret;							\
 })

+/* LDCW is strongly ordered and can be used as a memory barrier
+   when a suitably aligned address is available. */
+#ifdef CONFIG_SMP
+#define __ldcw_mb(a) ({						\
+	unsigned __tmp;						\
+	__asm__ __volatile__(__LDCW " 0(%1),%0"			\
+	ALTERNATIVE(ALT_COND_NO_SMP, INSN_NOP)			\
+	: "=r" (__tmp) : "r" (a) : "memory");			\
+})
+#else
+#define __ldcw_mb(a) barrier();
+#endif
+
 #ifdef CONFIG_SMP
 # define __lock_aligned __section(.data..lock_aligned)
 #endif
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index 51b6c47f802f..fa5ee8a45dbd 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -10,13 +10,21 @@ 
 static inline int arch_spin_is_locked(arch_spinlock_t *x)
 {
 	volatile unsigned int *a = __ldcw_align(x);
-	return *a == 0;
+	return READ_ONCE(*a) == 0;
 }

-#define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
+static inline void arch_spin_lock(arch_spinlock_t *x)
+{
+	volatile unsigned int *a;
+
+	a = __ldcw_align(x);
+	while (__ldcw(a) == 0)
+		while (*a == 0)
+			continue;
+}

 static inline void arch_spin_lock_flags(arch_spinlock_t *x,
-					 unsigned long flags)
+					unsigned long flags)
 {
 	volatile unsigned int *a;

@@ -25,10 +33,8 @@  static inline void arch_spin_lock_flags(arch_spinlock_t *x,
 		while (*a == 0)
 			if (flags & PSW_SM_I) {
 				local_irq_enable();
-				cpu_relax();
 				local_irq_disable();
-			} else
-				cpu_relax();
+			}
 }
 #define arch_spin_lock_flags arch_spin_lock_flags

@@ -44,12 +50,9 @@  static inline void arch_spin_unlock(arch_spinlock_t *x)
 static inline int arch_spin_trylock(arch_spinlock_t *x)
 {
 	volatile unsigned int *a;
-	int ret;

 	a = __ldcw_align(x);
-        ret = __ldcw(a) != 0;
-
-	return ret;
+	return __ldcw(a) != 0;
 }

 /*
diff --git a/arch/parisc/install.sh b/arch/parisc/install.sh
index 6f68784fea25..056d588befdd 100644
--- a/arch/parisc/install.sh
+++ b/arch/parisc/install.sh
@@ -43,7 +43,7 @@  fi

 # Default install

-if [ "$(basename $2)" = "zImage" ]; then
+if [ "$(basename $2)" = "vmlinuz" ]; then
 # Compressed install
   echo "Installing compressed kernel"
   base=vmlinuz
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 1eedfecc5137..3686f496fd27 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -384,12 +384,12 @@  EXPORT_SYMBOL(flush_kernel_icache_range_asm);
 static unsigned long parisc_cache_flush_threshold __ro_after_init = FLUSH_THRESHOLD;

 #define FLUSH_TLB_THRESHOLD (16*1024) /* 16 KiB minimum TLB threshold */
-static unsigned long parisc_tlb_flush_threshold __ro_after_init = FLUSH_TLB_THRESHOLD;
+static unsigned long parisc_tlb_flush_threshold __ro_after_init = ~0UL;

 void __init parisc_setup_cache_timing(void)
 {
 	unsigned long rangetime, alltime;
-	unsigned long size, start;
+	unsigned long size;
 	unsigned long threshold;

 	alltime = mfctl(16);
@@ -423,14 +423,9 @@  void __init parisc_setup_cache_timing(void)
 		goto set_tlb_threshold;
 	}

-	size = 0;
-	start = (unsigned long) _text;
+	size = (unsigned long)_end - (unsigned long)_text;
 	rangetime = mfctl(16);
-	while (start < (unsigned long) _end) {
-		flush_tlb_kernel_range(start, start + PAGE_SIZE);
-		start += PAGE_SIZE;
-		size += PAGE_SIZE;
-	}
+	flush_tlb_kernel_range((unsigned long)_text, (unsigned long)_end);
 	rangetime = mfctl(16) - rangetime;

 	alltime = mfctl(16);
@@ -445,8 +440,11 @@  void __init parisc_setup_cache_timing(void)
 		threshold/1024);

 set_tlb_threshold:
-	if (threshold > parisc_tlb_flush_threshold)
+	if (threshold > FLUSH_TLB_THRESHOLD)
 		parisc_tlb_flush_threshold = threshold;
+	else
+		parisc_tlb_flush_threshold = FLUSH_TLB_THRESHOLD;
+
 	printk(KERN_INFO "TLB flush threshold set to %lu KiB\n",
 		parisc_tlb_flush_threshold/1024);
 }
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 519f9056fd00..f6f28e41bb5e 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -899,20 +899,20 @@  intr_check_sig:
 	 * Only do signals if we are returning to user space
 	 */
 	LDREG	PT_IASQ0(%r16), %r20
-	cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* backward */
+	cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* forward */
 	LDREG	PT_IASQ1(%r16), %r20
-	cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* backward */
-
-	/* NOTE: We need to enable interrupts if we have to deliver
-	 * signals. We used to do this earlier but it caused kernel
-	 * stack overflows. */
-	ssm     PSW_SM_I, %r0
+	cmpib,COND(=),n LINUX_GATEWAY_SPACE, %r20, intr_restore /* forward */

 	copy	%r0, %r25			/* long in_syscall = 0 */
 #ifdef CONFIG_64BIT
 	ldo	-16(%r30),%r29			/* Reference param save area */
 #endif

+	/* NOTE: We need to enable interrupts if we have to deliver
+	 * signals. We used to do this earlier but it caused kernel
+	 * stack overflows. */
+	ssm	PSW_SM_I, %r0
+
 	BL	do_notify_resume,%r2
 	copy	%r16, %r26			/* struct pt_regs *regs */

diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S
index aa93d775c34d..cd3e5fd39410 100644
--- a/arch/parisc/kernel/head.S
+++ b/arch/parisc/kernel/head.S
@@ -340,6 +340,19 @@  smp_slave_stext:
 	bv              (%rp)
 	ssm             PSW_SM_W,%r0
 2:
+#endif
+
+#ifdef CONFIG_64BIT
+	/*
+	 *  Enable Wide mode early, in case smp_init_current_idle_task was
+	 *  allocated above 4GB.
+         */
+1:	mfia            %rp             /* clear upper part of pcoq */
+	ldo             2f-1b(%rp),%rp
+	depdi           0,31,32,%rp
+	bv              (%rp)
+	ssm             PSW_SM_W,%r0
+2:
 #endif

 	/*  Initialize the SP - monarch sets up smp_init_current_idle_task */
diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S
index 81de5e2b391c..c2981401775c 100644
--- a/arch/parisc/kernel/hpmc.S
+++ b/arch/parisc/kernel/hpmc.S
@@ -289,13 +289,3 @@  os_hpmc_6:
 	b .
 	nop
 	.align 16	/* make function length multiple of 16 bytes */
-.os_hpmc_end:
-
-
-	__INITRODATA
-.globl os_hpmc_size
-	.align 4
-	.type   os_hpmc_size, @object
-	.size   os_hpmc_size, 4
-os_hpmc_size:
-	.word .os_hpmc_end-.os_hpmc
diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c
index 6e8550fefad6..fcc761b0e11b 100644
--- a/arch/parisc/kernel/pdt.c
+++ b/arch/parisc/kernel/pdt.c
@@ -18,6 +18,8 @@ 
 #include <linux/kthread.h>
 #include <linux/initrd.h>
 #include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>

 #include <asm/pdc.h>
 #include <asm/pdcpat.h>
@@ -230,6 +232,7 @@  void __init pdc_pdt_init(void)

 		/* mark memory page bad */
 		memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE);
+		num_poisoned_pages_inc();
 	}
 }

diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 3ad61a177f5b..322503780db6 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -571,8 +571,8 @@  lws_compare_and_swap:
 	ldil	L%lws_lock_start, %r20
 	ldo	R%lws_lock_start(%r20), %r28

-	/* Extract four bits from r26 and hash lock (Bits 4-7) */
-	extru  %r26, 27, 4, %r20
+	/* Extract eight bits from r26 and hash lock (Bits 3-11) */
+	extru  %r26, 28, 8, %r20

 	/* Find lock to use, the hash is either one of 0 to
 	   15, multiplied by 16 (keep it 16-byte aligned)
@@ -761,8 +761,8 @@  cas2_lock_start:
 	ldil	L%lws_lock_start, %r20
 	ldo	R%lws_lock_start(%r20), %r28

-	/* Extract four bits from r26 and hash lock (Bits 4-7) */
-	extru  %r26, 27, 4, %r20
+	/* Extract eight bits from r26 and hash lock (Bits 3-11) */
+	extru  %r26, 28, 8, %r20

 	/* Find lock to use, the hash is either one of 0 to
 	   15, multiplied by 16 (keep it 16-byte aligned)
@@ -950,7 +950,7 @@  END(sys_call_table64)
 	.align	L1_CACHE_BYTES
 ENTRY(lws_lock_start)
 	/* lws locks */
-	.rept 16
+	.rept 256
 	/* Keep locks aligned at 16-bytes */
 	.word 1
 	.word 0
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 5400e23a77a1..0c3064a927b8 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -799,14 +799,13 @@  void notrace handle_interruption(int code, struct pt_regs *regs)

 void __init initialize_ivt(const void *iva)
 {
-	extern u32 os_hpmc_size;
 	extern const u32 os_hpmc[];

 	int i;
 	u32 check = 0;
 	u32 *ivap;
 	u32 *hpmcp;
-	u32 length, instr;
+	u32 instr;

 	if (strcmp((const char *)iva, "cows can fly"))
 		panic("IVT invalid");
@@ -837,18 +836,14 @@  void __init initialize_ivt(const void *iva)

 	/* Setup IVA and compute checksum for HPMC handler */
 	ivap[6] = (u32)__pa(os_hpmc);
-	length = os_hpmc_size;
-	ivap[7] = length;

 	hpmcp = (u32 *)os_hpmc;

-	for (i=0; i<length/4; i++)
-	    check += *hpmcp++;
-
 	for (i=0; i<8; i++)
 	    check += ivap[i];

 	ivap[5] = -check;
+	pr_info("initialize_ivt: IVA[6] = 0x%08x\n", ivap[6]);
 }
 	

diff --git a/arch/parisc/lib/bitops.c b/arch/parisc/lib/bitops.c
index 2e4d1f05a926..9ac683bf6ae7 100644
--- a/arch/parisc/lib/bitops.c
+++ b/arch/parisc/lib/bitops.c
@@ -18,7 +18,7 @@  arch_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned = {
 #endif

 #ifdef CONFIG_64BIT
-unsigned long __xchg64(unsigned long x, unsigned long *ptr)
+unsigned long __xchg64(unsigned long x, volatile unsigned long *ptr)
 {
 	unsigned long temp, flags;

@@ -30,7 +30,7 @@  unsigned long __xchg64(unsigned long x, unsigned long *ptr)
 }
 #endif

-unsigned long __xchg32(int x, int *ptr)
+unsigned long __xchg32(int x, volatile int *ptr)
 {
 	unsigned long flags;
 	long temp;
@@ -43,7 +43,7 @@  unsigned long __xchg32(int x, int *ptr)
 }


-unsigned long __xchg8(char x, char *ptr)
+unsigned long __xchg8(char x, volatile char *ptr)
 {
 	unsigned long flags;
 	long temp;
diff --git a/include/linux/list.h b/include/linux/list.h
index aff44d34f4e4..0d0d17a10d25 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -282,6 +282,24 @@  static inline int list_empty(const struct list_head *head)
 	return READ_ONCE(head->next) == head;
 }

+/**
+ * list_del_init_careful - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ *
+ * This is the same as list_del_init(), except designed to be used
+ * together with list_empty_careful() in a way to guarantee ordering
+ * of other memory operations.
+ *
+ * Any memory operations done before a list_del_init_careful() are
+ * guaranteed to be visible after a list_empty_careful() test.
+ */
+static inline void list_del_init_careful(struct list_head *entry)
+{
+	__list_del_entry(entry);
+	entry->prev = entry;
+	smp_store_release(&entry->next, entry);
+}
+
 /**
  * list_empty_careful - tests whether a list is empty and not being modified
  * @head: the list to test
@@ -297,7 +315,7 @@  static inline int list_empty(const struct list_head *head)
  */
 static inline int list_empty_careful(const struct list_head *head)
 {
-	struct list_head *next = head->next;
+	struct list_head *next = smp_load_acquire(&head->next);
 	return (next == head) && (next == head->prev);
 }

diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index ba059fbfc53a..01f5d3020589 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -389,7 +389,7 @@  int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 	int ret = default_wake_function(wq_entry, mode, sync, key);

 	if (ret)
-		list_del_init(&wq_entry->entry);
+		list_del_init_careful(&wq_entry->entry);

 	return ret;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 385759c4ce4b..239d156a38ea 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1002,6 +1002,7 @@  struct wait_page_queue {

 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
 {
+	int ret;
 	struct wait_page_key *key = arg;
 	struct wait_page_queue *wait_page
 		= container_of(wait, struct wait_page_queue, wait);
@@ -1013,18 +1014,35 @@  static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 	if (wait_page->bit_nr != key->bit_nr)
 		return 0;

+	/* Stop walking if it's locked */
+	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+		if (test_and_set_bit(key->bit_nr, &key->page->flags))
+			return -1;
+	} else {
+		if (test_bit(key->bit_nr, &key->page->flags))
+			return -1;
+	}
+
 	/*
-	 * Stop walking if it's locked.
-	 * Is this safe if put_and_wait_on_page_locked() is in use?
-	 * Yes: the waker must hold a reference to this page, and if PG_locked
-	 * has now already been set by another task, that task must also hold
-	 * a reference to the *same usage* of this page; so there is no need
-	 * to walk on to wake even the put_and_wait_on_page_locked() callers.
+	 * Let the waiter know we have done the page flag
+	 * handling for it (and the return value lets the
+	 * wakeup logic count exclusive wakeup events).
 	 */
-	if (test_bit(key->bit_nr, &key->page->flags))
-		return -1;
+	ret = (wait->flags & WQ_FLAG_EXCLUSIVE) != 0;
+	wait->flags |= WQ_FLAG_WOKEN;
+	wake_up_state(wait->private, mode);

-	return autoremove_wake_function(wait, mode, sync, key);
+	/*
+	 * Ok, we have successfully done what we're waiting for,
+	 * and we can unconditionally remove the wait entry.
+	 *
+	 * Note that this has to be the absolute last thing we do,
+	 * since after list_del_init(&wait->entry) the wait entry
+	 * might be de-allocated and the process might even have
+	 * exited.
+	 */
+	list_del_init_careful(&wait->entry);
+	return ret;
 }

 static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1103,16 +1121,22 @@  enum behavior {
 			 */
 };

+static inline int trylock_page_bit_common(struct page *page, int bit_nr,
+	enum behavior behavior)
+{
+	return behavior == EXCLUSIVE ?
+		!test_and_set_bit(bit_nr, &page->flags) :
+		!test_bit(bit_nr, &page->flags);
+}
+
 static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	struct page *page, int bit_nr, int state, enum behavior behavior)
 {
 	struct wait_page_queue wait_page;
 	wait_queue_entry_t *wait = &wait_page.wait;
-	bool bit_is_set;
 	bool thrashing = false;
 	bool delayacct = false;
 	unsigned long pflags;
-	int ret = 0;

 	if (bit_nr == PG_locked &&
 	    !PageUptodate(page) && PageWorkingset(page)) {
@@ -1130,48 +1154,44 @@  static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	wait_page.page = page;
 	wait_page.bit_nr = bit_nr;

-	for (;;) {
-		spin_lock_irq(&q->lock);
+	/*
+	 * Add ourselves to the wait queue.
+	 *
+	 * NOTE! This is where we also check the page
+	 * state synchronously the last time to see that
+	 * somebody didn't just clear the bit. Do the
+	 * SetPageWaiters() before that to let anybody
+	 * we just miss know they need to wake us up.
+	 */
+	spin_lock_irq(&q->lock);
+	SetPageWaiters(page);
+	if (!trylock_page_bit_common(page, bit_nr, behavior))
+		__add_wait_queue_entry_tail(q, wait);
+	else
+		wait->flags |= WQ_FLAG_WOKEN;
+	spin_unlock_irq(&q->lock);

-		if (likely(list_empty(&wait->entry))) {
-			__add_wait_queue_entry_tail(q, wait);
-			SetPageWaiters(page);
-		}
+	/*
+	 * From now on, all the logic will be based on
+	 * whether the wait entry is on the queue or not,
+	 * and the page bit testing (and setting) will be
+	 * done by the wake function, not us.
+	 *
+	 * We can drop our reference to the page.
+	 */
+	if (behavior == DROP)
+		put_page(page);

+	for (;;) {
 		set_current_state(state);

-		spin_unlock_irq(&q->lock);
-
-		bit_is_set = test_bit(bit_nr, &page->flags);
-		if (behavior == DROP)
-			put_page(page);
-
-		if (likely(bit_is_set))
-			io_schedule();
-
-		if (behavior == EXCLUSIVE) {
-			if (!test_and_set_bit_lock(bit_nr, &page->flags))
-				break;
-		} else if (behavior == SHARED) {
-			if (!test_bit(bit_nr, &page->flags))
-				break;
-		}
-
-		if (signal_pending_state(state, current)) {
-			ret = -EINTR;
+		if (signal_pending_state(state, current))
 			break;
-		}

-		if (behavior == DROP) {
-			/*
-			 * We can no longer safely access page->flags:
-			 * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
-			 * there is a risk of waiting forever on a page reused
-			 * for something that keeps it locked indefinitely.
-			 * But best check for -EINTR above before breaking.
-			 */
+		if (wait->flags & WQ_FLAG_WOKEN)
 			break;
-		}
+
+		io_schedule();
 	}

 	finish_wait(q, wait);
@@ -1190,7 +1210,7 @@  static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	 * bother with signals either.
 	 */

-	return ret;
+	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
 }

 void wait_on_page_bit(struct page *page, int bit_nr)
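
A side note on the include/linux/list.h hunks above: the smp_store_release() in
list_del_init_careful() pairs with the smp_load_acquire() in list_empty_careful(),
which is what makes writes done before the delete visible to the careful-empty
test.  A stand-alone sketch of the same pairing in plain C11 atomics (not the
kernel primitives; the __list_del_entry() unlink step is elided):

#include <stdatomic.h>
#include <stdbool.h>

struct node {
	_Atomic(struct node *) next;
	struct node *prev;
};

static void del_init_careful(struct node *entry)
{
	/* ...unlink from neighbours elided... */
	entry->prev = entry;
	/* Release store: every write above is ordered before this
	 * store becomes visible to a matching acquire load. */
	atomic_store_explicit(&entry->next, entry, memory_order_release);
}

static bool empty_careful(struct node *head)
{
	/* Acquire load: pairs with the release store above, so a
	 * reader that sees next == head also sees the earlier writes. */
	struct node *next = atomic_load_explicit(&head->next,
						 memory_order_acquire);
	return next == head && next == head->prev;
}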