
hugetlbfs: Disable IRQ when taking hugetlb_lock in set_max_huge_pages()

Message ID 20191209160150.18064-1-longman@redhat.com (mailing list archive)
State New, archived
Series hugetlbfs: Disable IRQ when taking hugetlb_lock in set_max_huge_pages()

Commit Message

Waiman Long Dec. 9, 2019, 4:01 p.m. UTC
The following lockdep splat was observed in some test runs:

[  612.388273] ================================
[  612.411273] WARNING: inconsistent lock state
[  612.432273] 4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
[  612.469273] --------------------------------
[  612.489273] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[  612.517273] swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[  612.541273] ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
[  612.576273] {SOFTIRQ-ON-W} state was registered at:
[  612.598273]   lock_acquire+0x14f/0x3b0
[  612.616273]   _raw_spin_lock+0x30/0x70
[  612.634273]   __nr_hugepages_store_common+0x11b/0xb30
[  612.657273]   hugetlb_sysctl_handler_common+0x209/0x2d0
[  612.681273]   proc_sys_call_handler+0x37f/0x450
[  612.703273]   vfs_write+0x157/0x460
[  612.719273]   ksys_write+0xb8/0x170
[  612.736273]   do_syscall_64+0xa5/0x4d0
[  612.753273]   entry_SYSCALL_64_after_hwframe+0x6a/0xdf
[  612.777273] irq event stamp: 691296
[  612.794273] hardirqs last  enabled at (691296): [<ffffffff99bb034b>] _raw_spin_unlock_irqrestore+0x4b/0x60
[  612.839273] hardirqs last disabled at (691295): [<ffffffff99bb0ad2>] _raw_spin_lock_irqsave+0x22/0x81
[  612.882273] softirqs last  enabled at (691284): [<ffffffff97ff0c63>] irq_enter+0xc3/0xe0
[  612.922273] softirqs last disabled at (691285): [<ffffffff97ff0ebe>] irq_exit+0x23e/0x2b0
[  612.962273]
[  612.962273] other info that might help us debug this:
[  612.993273]  Possible unsafe locking scenario:
[  612.993273]
[  613.020273]        CPU0
[  613.031273]        ----
[  613.042273]   lock(hugetlb_lock);
[  613.057273]   <Interrupt>
[  613.069273]     lock(hugetlb_lock);
[  613.085273]
[  613.085273]  *** DEADLOCK ***
      :
[  613.245273] Call Trace:
[  613.256273]  <IRQ>
[  613.265273]  dump_stack+0x9a/0xf0
[  613.281273]  mark_lock+0xd0c/0x12f0
[  613.297273]  ? print_shortest_lock_dependencies+0x80/0x80
[  613.322273]  ? sched_clock_cpu+0x18/0x1e0
[  613.341273]  __lock_acquire+0x146b/0x48c0
[  613.360273]  ? trace_hardirqs_on+0x10/0x10
[  613.379273]  ? trace_hardirqs_on_caller+0x27b/0x580
[  613.401273]  lock_acquire+0x14f/0x3b0
[  613.419273]  ? free_huge_page+0x36f/0xaa0
[  613.440273]  _raw_spin_lock+0x30/0x70
[  613.458273]  ? free_huge_page+0x36f/0xaa0
[  613.477273]  free_huge_page+0x36f/0xaa0
[  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
[  613.516273]  clone_endio+0x17f/0x670 [dm_mod]
[  613.536273]  ? disable_discard+0x90/0x90 [dm_mod]
[  613.558273]  ? bio_endio+0x4ba/0x930
[  613.575273]  ? blk_account_io_completion+0x400/0x530
[  613.598273]  blk_update_request+0x276/0xe50
[  613.617273]  scsi_end_request+0x7b/0x6a0
[  613.636273]  ? lock_downgrade+0x6f0/0x6f0
[  613.654273]  scsi_io_completion+0x1c6/0x1570
[  613.674273]  ? sd_completed_bytes+0x3a0/0x3a0 [sd_mod]
[  613.698273]  ? scsi_mq_requeue_cmd+0xc0/0xc0
[  613.718273]  blk_done_softirq+0x22e/0x350
[  613.737273]  ? blk_softirq_cpu_dead+0x230/0x230
[  613.758273]  __do_softirq+0x23d/0xad8
[  613.776273]  irq_exit+0x23e/0x2b0
[  613.792273]  do_IRQ+0x11a/0x200
[  613.806273]  common_interrupt+0xf/0xf
[  613.823273]  </IRQ>

Since hugetlb_lock can be taken from both process and IRQ contexts, we
need to prevent nested locking (and the resulting self-deadlock) by
disabling IRQs before taking the lock.  For functions that are only
called from process context, the spin_lock_irq()/spin_unlock_irq() pair
is used. For functions that may be called from both process and IRQ
contexts, the spin_lock_irqsave()/spin_unlock_irqrestore() pair is used.

For now, I am assuming that free_huge_page() is the only function that
will be called from IRQ context.
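
As a minimal illustration of the two patterns (the example_* helpers below
are hypothetical, not functions in mm/hugetlb.c):

/* Only ever called from process context: IRQs are known to be enabled. */
static void example_process_context(struct hstate *h)
{
	spin_lock_irq(&hugetlb_lock);
	/* ... update free_huge_pages, surplus_huge_pages, ... */
	spin_unlock_irq(&hugetlb_lock);
}

/* May be called from either process or IRQ context. */
static void example_any_context(struct hstate *h)
{
	unsigned long flags;

	spin_lock_irqsave(&hugetlb_lock, flags);	/* saves and restores IRQ state */
	/* ... update free_huge_pages, surplus_huge_pages, ... */
	spin_unlock_irqrestore(&hugetlb_lock, flags);
}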

Signed-off-by: Waiman Long <longman@redhat.com>
---
 mm/hugetlb.c | 116 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 45 deletions(-)

Comments

Waiman Long Dec. 9, 2019, 4:05 p.m. UTC | #1
On 12/9/19 11:01 AM, Waiman Long wrote:
> The following lockdep splat was observed in some test runs:
>
> [  612.388273] ================================
> [  612.411273] WARNING: inconsistent lock state
> [  612.432273] 4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
> [  612.469273] --------------------------------
> [  612.489273] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
> [  612.517273] swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
> [  612.541273] ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
> [  612.576273] {SOFTIRQ-ON-W} state was registered at:
> [  612.598273]   lock_acquire+0x14f/0x3b0
> [  612.616273]   _raw_spin_lock+0x30/0x70
> [  612.634273]   __nr_hugepages_store_common+0x11b/0xb30
> [  612.657273]   hugetlb_sysctl_handler_common+0x209/0x2d0
> [  612.681273]   proc_sys_call_handler+0x37f/0x450
> [  612.703273]   vfs_write+0x157/0x460
> [  612.719273]   ksys_write+0xb8/0x170
> [  612.736273]   do_syscall_64+0xa5/0x4d0
> [  612.753273]   entry_SYSCALL_64_after_hwframe+0x6a/0xdf
> [  612.777273] irq event stamp: 691296
> [  612.794273] hardirqs last  enabled at (691296): [<ffffffff99bb034b>] _raw_spin_unlock_irqrestore+0x4b/0x60
> [  612.839273] hardirqs last disabled at (691295): [<ffffffff99bb0ad2>] _raw_spin_lock_irqsave+0x22/0x81
> [  612.882273] softirqs last  enabled at (691284): [<ffffffff97ff0c63>] irq_enter+0xc3/0xe0
> [  612.922273] softirqs last disabled at (691285): [<ffffffff97ff0ebe>] irq_exit+0x23e/0x2b0
> [  612.962273]
> [  612.962273] other info that might help us debug this:
> [  612.993273]  Possible unsafe locking scenario:
> [  612.993273]
> [  613.020273]        CPU0
> [  613.031273]        ----
> [  613.042273]   lock(hugetlb_lock);
> [  613.057273]   <Interrupt>
> [  613.069273]     lock(hugetlb_lock);
> [  613.085273]
> [  613.085273]  *** DEADLOCK ***
>       :
> [  613.245273] Call Trace:
> [  613.256273]  <IRQ>
> [  613.265273]  dump_stack+0x9a/0xf0
> [  613.281273]  mark_lock+0xd0c/0x12f0
> [  613.297273]  ? print_shortest_lock_dependencies+0x80/0x80
> [  613.322273]  ? sched_clock_cpu+0x18/0x1e0
> [  613.341273]  __lock_acquire+0x146b/0x48c0
> [  613.360273]  ? trace_hardirqs_on+0x10/0x10
> [  613.379273]  ? trace_hardirqs_on_caller+0x27b/0x580
> [  613.401273]  lock_acquire+0x14f/0x3b0
> [  613.419273]  ? free_huge_page+0x36f/0xaa0
> [  613.440273]  _raw_spin_lock+0x30/0x70
> [  613.458273]  ? free_huge_page+0x36f/0xaa0
> [  613.477273]  free_huge_page+0x36f/0xaa0
> [  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
> [  613.516273]  clone_endio+0x17f/0x670 [dm_mod]
> [  613.536273]  ? disable_discard+0x90/0x90 [dm_mod]
> [  613.558273]  ? bio_endio+0x4ba/0x930
> [  613.575273]  ? blk_account_io_completion+0x400/0x530
> [  613.598273]  blk_update_request+0x276/0xe50
> [  613.617273]  scsi_end_request+0x7b/0x6a0
> [  613.636273]  ? lock_downgrade+0x6f0/0x6f0
> [  613.654273]  scsi_io_completion+0x1c6/0x1570
> [  613.674273]  ? sd_completed_bytes+0x3a0/0x3a0 [sd_mod]
> [  613.698273]  ? scsi_mq_requeue_cmd+0xc0/0xc0
> [  613.718273]  blk_done_softirq+0x22e/0x350
> [  613.737273]  ? blk_softirq_cpu_dead+0x230/0x230
> [  613.758273]  __do_softirq+0x23d/0xad8
> [  613.776273]  irq_exit+0x23e/0x2b0
> [  613.792273]  do_IRQ+0x11a/0x200
> [  613.806273]  common_interrupt+0xf/0xf
> [  613.823273]  </IRQ>
>
> Since hugetlb_lock can be taken from both process and IRQ contexts, we
> need to protect the lock from nested locking by disabling IRQ before
> taking it.  So for functions that are only called from the process
> context, the spin_lock_irq()/spin_unlock_irq() pair is used. For
> functions that may be called from both process and IRQ contexts, the
> spin_lock_irqsave()/spin_unlock_irqrestore() pair is used.
>
> For now, I am assuming that only free_huge_page() will be called from
> IRQ context.
>
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  mm/hugetlb.c | 116 +++++++++++++++++++++++++++++++--------------------
>  1 file changed, 71 insertions(+), 45 deletions(-)

Sorry, I forgot to update the patch title. IRQs are now disabled in all
the places where hugetlb_lock is acquired, so "in set_max_huge_pages()"
should be removed from the title.

Thanks,
Longman
Matthew Wilcox Dec. 9, 2019, 4:49 p.m. UTC | #2
On Mon, Dec 09, 2019 at 11:01:50AM -0500, Waiman Long wrote:
> [  613.245273] Call Trace:
> [  613.256273]  <IRQ>
> [  613.265273]  dump_stack+0x9a/0xf0
> [  613.281273]  mark_lock+0xd0c/0x12f0
> [  613.341273]  __lock_acquire+0x146b/0x48c0
> [  613.401273]  lock_acquire+0x14f/0x3b0
> [  613.440273]  _raw_spin_lock+0x30/0x70
> [  613.477273]  free_huge_page+0x36f/0xaa0
> [  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0

Oh, this is fun.  So we kicked off IO to a hugepage, then truncated or
otherwise caused the page to come free.  Then the IO finished and did the
final put on the page ... from interrupt context.  Splat.  Not something
that's going to happen often, but can happen if a process dies during
IO or due to a userspace bug.

Maybe we should schedule work to do the actual freeing of the page
instead of this rather large patch.  It doesn't seem like a case we need
to optimise for.
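
For illustration only, here is roughly what that deferral could look like,
draining a lockless list from a workqueue.  The reuse of page->mapping to
carry the llist_node, the __free_huge_page() name for the current function
body, and the other helper names are assumptions of this sketch, not an
existing implementation:

#include <linux/llist.h>
#include <linux/workqueue.h>

static LLIST_HEAD(hpage_freelist);

static void free_hpage_workfn(struct work_struct *work)
{
	struct llist_node *node = llist_del_all(&hpage_freelist);

	while (node) {
		struct page *page = container_of((struct address_space **)node,
						 struct page, mapping);

		/* read the next node before the page is recycled */
		node = node->next;
		__free_huge_page(page);		/* the old free_huge_page() body */
	}
}
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

void free_huge_page(struct page *page)
{
	if (in_interrupt()) {
		/* the page is unused here, so ->mapping can hold the llist_node */
		if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
			schedule_work(&free_hpage_work);
		return;
	}
	__free_huge_page(page);
}

Carrying the list node inside the free page itself avoids having to allocate
anything from interrupt context.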

Further, this path is called from softirq context, not hardirq context.
That means the whole mess could use spin_lock_bh(), which is rather
cheaper than spin_lock_irq().  Anyway, I think the
schedule-freeing-of-a-page-if-in-irq-context approach is likely to be
better.

> +/*
> + * Check to make sure that IRQ is enabled before calling spin_lock_irq()
> + * so that after a matching spin_unlock_irq() the system won't be in an
> + * incorrect state.
> + */
> +static __always_inline void spin_lock_irq_check(spinlock_t *lock)
> +{
> +	lockdep_assert_irqs_enabled();
> +	spin_lock_irq(lock);
> +}
> +#ifdef spin_lock_irq
> +#undef spin_lock_irq
> +#endif
> +#define spin_lock_irq(lock)	spin_lock_irq_check(lock)

Don't leave your debugging code in the patch you submit for merging.

> @@ -1775,7 +1793,11 @@ static void return_unused_surplus_pages(struct hstate *h,
>  		unused_resv_pages--;
>  		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
>  			goto out;
> -		cond_resched_lock(&hugetlb_lock);
> +		if (need_resched()) {
> +			spin_unlock_irq(&hugetlb_lock);
> +			cond_resched();
> +			spin_lock_irq(&hugetlb_lock);
> +		}

This doesn't work.  need_resched() is only going to be set due to things
happening in interrupt context.  But you've disabled interrupts, so
need_resched() is never going to be set.
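
For illustration, one pattern that does still yield in such a loop is to drop
the lock unconditionally every few iterations instead of testing
need_resched() under the lock (the "iter" counter here is an assumption of
this sketch, declared in the enclosing function):

	if (!(++iter % 64)) {
		/* Re-enable IRQs so the tick and pending softirqs can run,
		 * then give up the CPU if a reschedule is pending. */
		spin_unlock_irq(&hugetlb_lock);
		cond_resched();
		spin_lock_irq(&hugetlb_lock);
	}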
Waiman Long Dec. 10, 2019, 12:46 a.m. UTC | #3
On 12/9/19 11:49 AM, Matthew Wilcox wrote:
> On Mon, Dec 09, 2019 at 11:01:50AM -0500, Waiman Long wrote:
>> [  613.245273] Call Trace:
>> [  613.256273]  <IRQ>
>> [  613.265273]  dump_stack+0x9a/0xf0
>> [  613.281273]  mark_lock+0xd0c/0x12f0
>> [  613.341273]  __lock_acquire+0x146b/0x48c0
>> [  613.401273]  lock_acquire+0x14f/0x3b0
>> [  613.440273]  _raw_spin_lock+0x30/0x70
>> [  613.477273]  free_huge_page+0x36f/0xaa0
>> [  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
> Oh, this is fun.  So we kicked off IO to a hugepage, then truncated or
> otherwise caused the page to come free.  Then the IO finished and did the
> final put on the page ... from interrupt context.  Splat.  Not something
> that's going to happen often, but can happen if a process dies during
> IO or due to a userspace bug.
>
> Maybe we should schedule work to do the actual freeing of the page
> instead of this rather large patch.  It doesn't seem like a case we need
> to optimise for.
I think that may be a good idea to try out.
>
> Further, this path is called from softirq context, not hardirq context.
> That means the whole mess could be a spin_lock_bh(), not spin_lock_irq()
> which is rather cheaper.  Anyway, I think the schedule-freeing-of-a-page-
> if-in-irq-context approach is likely to be better.

Yes, using spin_lock_bh() may be better.
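
As a rough sketch (assuming the softirq-only analysis above holds), a
process-context path such as nr_overcommit_hugepages_store() would then
become:

	spin_lock_bh(&hugetlb_lock);
	h->nr_overcommit_huge_pages = input;
	spin_unlock_bh(&hugetlb_lock);

free_huge_page() itself could keep a plain spin_lock()/spin_unlock(), since a
softirq cannot be preempted by another softirq on the same CPU.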


>> +/*
>> + * Check to make sure that IRQ is enabled before calling spin_lock_irq()
>> + * so that after a matching spin_unlock_irq() the system won't be in an
>> + * incorrect state.
>> + */
>> +static __always_inline void spin_lock_irq_check(spinlock_t *lock)
>> +{
>> +	lockdep_assert_irqs_enabled();
>> +	spin_lock_irq(lock);
>> +}
>> +#ifdef spin_lock_irq
>> +#undef spin_lock_irq
>> +#endif
>> +#define spin_lock_irq(lock)	spin_lock_irq_check(lock)
> Don't leave your debugging code in the patch you submit for merging.

As I am not 100% sure that free_huge_page() is the only lock-taking
function that will be called from interrupt context, I purposely added
this to catch other possible code paths when lockdep is enabled in a
debug kernel. It has no side effect when lockdep isn't enabled. In the
future, if other lock-taking functions are called from interrupt
context, we can spot that more easily. I don't see this preventive
debug code as a problem.

This change, however, should be in a separate patch.

>> @@ -1775,7 +1793,11 @@ static void return_unused_surplus_pages(struct hstate *h,
>>  		unused_resv_pages--;
>>  		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
>>  			goto out;
>> -		cond_resched_lock(&hugetlb_lock);
>> +		if (need_resched()) {
>> +			spin_unlock_irq(&hugetlb_lock);
>> +			cond_resched();
>> +			spin_lock_irq(&hugetlb_lock);
>> +		}
> This doesn't work.  need_resched() is only going to be set due to things
> happening in interrupt context.  But you've disabled interrupts, so
> need_resched() is never going to be set.

You are probably right. Thanks for pointing this out.

Cheers,
Longman
Waiman Long Dec. 10, 2019, 2:37 a.m. UTC | #4
On 12/9/19 7:46 PM, Waiman Long wrote:
> On 12/9/19 11:49 AM, Matthew Wilcox wrote:
>> On Mon, Dec 09, 2019 at 11:01:50AM -0500, Waiman Long wrote:
>>> [  613.245273] Call Trace:
>>> [  613.256273]  <IRQ>
>>> [  613.265273]  dump_stack+0x9a/0xf0
>>> [  613.281273]  mark_lock+0xd0c/0x12f0
>>> [  613.341273]  __lock_acquire+0x146b/0x48c0
>>> [  613.401273]  lock_acquire+0x14f/0x3b0
>>> [  613.440273]  _raw_spin_lock+0x30/0x70
>>> [  613.477273]  free_huge_page+0x36f/0xaa0
>>> [  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
>> Oh, this is fun.  So we kicked off IO to a hugepage, then truncated or
>> otherwise caused the page to come free.  Then the IO finished and did the
>> final put on the page ... from interrupt context.  Splat.  Not something
>> that's going to happen often, but can happen if a process dies during
>> IO or due to a userspace bug.
>>
>> Maybe we should schedule work to do the actual freeing of the page
>> instead of this rather large patch.  It doesn't seem like a case we need
>> to optimise for.
> I think that may be a good idea to try it out.

It turns out that using a workqueue is more complex than I originally
thought. I have come up with the following untested changes to do that:

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..629ac000318b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1136,7 +1136,7 @@ static inline void ClearPageHugeTemporary(struct page *page)
        page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
 {
        /*
         * Can't pass hstate in here because it is called from the
@@ -1199,6 +1199,82 @@ void free_huge_page(struct page *page)
        spin_unlock(&hugetlb_lock);
 }
 
+/*
+ * As free_huge_page() can be called from softIRQ context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ */
+struct hpage_to_free {
+       struct page             *page;
+       struct hpage_to_free    *next;
+};
+static struct hpage_to_free *hpage_freelist;
+#define NEXT_PENDING   ((struct hpage_to_free *)-1)
+
+/*
+ * This work function locklessly retrieves the pages to be freed and
+ * frees them one-by-one.
+ */
+static void free_huge_page_workfn(struct work_struct *work)
+{
+       struct hpage_to_free *curr, *next;
+
+recheck:
+       curr = xchg(&hpage_freelist, NULL);
+       if (!curr)
+               return;
+
+       while (curr) {
+               __free_huge_page(curr->page);
+               next = READ_ONCE(curr->next);
+               while (next == NEXT_PENDING) {
+                       cpu_relax();
+                       next = READ_ONCE(curr->next);
+               }
+               kfree(curr);
+               curr = next;
+       }
+       if (work && READ_ONCE(hpage_freelist))
+               goto recheck;
+}
+static DECLARE_WORK(free_huge_page_work, free_huge_page_workfn);
+
+void free_huge_page(struct page *page)
+{
+       /*
+        * Defer freeing in softIRQ context to avoid hugetlb_lock deadlock.
+        */
+       if (in_serving_softirq()) {
+               struct hpage_to_free *item, *next;
+
+               /*
+                * We cannot sleep in softIRQ context, so the allocation
+                * has to be atomic. We are in serious trouble if it fails;
+                * in that case, we hope for the best and do the freeing now.
+                */
+               item = kmalloc(sizeof(*item), GFP_ATOMIC);
+               if (WARN_ON_ONCE(!item))
+                       goto free_page_now;
+
+               item->page = page;
+               item->next = NEXT_PENDING;
+               next = xchg(&hpage_freelist, item);
+               WRITE_ONCE(item->next, next);
+               schedule_work(&free_huge_page_work);
+               return;
+       }
+
+       /*
+        * Racing may prevent some of deferred huge pages in hpage_freelist
+        * from being freed. Check here and call schedule_work() if that
+        * is the case.
+        */
+       if (hpage_freelist && !work_pending(&free_huge_page_work))
+               schedule_work(&free_huge_page_work);
+
+free_page_now:
+       __free_huge_page(page);
+}
+
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
        INIT_LIST_HEAD(&page->lru);

-----------------------------------------------------------------------------------------------------

I think it may be simpler and less risky to use spin_lock_bh() as you
have suggested. Also, the above code will not be good enough if more
lock-taking functions end up being called from softIRQ context in the
future.

So what do you think?

Cheers,
Longman


Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..f86788d384f9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -59,7 +59,9 @@  static bool __initdata parsed_valid_hugepagesz = true;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
- * free_huge_pages, and surplus_huge_pages.
+ * free_huge_pages, and surplus_huge_pages. This data can be accessed from
+ * both process and IRQ contexts. Therefore, the spinlock must be taken with
+ * IRQs disabled to prevent nested locking.
  */
 DEFINE_SPINLOCK(hugetlb_lock);
 
@@ -70,6 +72,21 @@  DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+/*
+ * Check to make sure that IRQ is enabled before calling spin_lock_irq()
+ * so that after a matching spin_unlock_irq() the system won't be in an
+ * incorrect state.
+ */
+static __always_inline void spin_lock_irq_check(spinlock_t *lock)
+{
+	lockdep_assert_irqs_enabled();
+	spin_lock_irq(lock);
+}
+#ifdef spin_lock_irq
+#undef spin_lock_irq
+#endif
+#define spin_lock_irq(lock)	spin_lock_irq_check(lock)
+
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 
@@ -1147,6 +1164,7 @@  void free_huge_page(struct page *page)
 	struct hugepage_subpool *spool =
 		(struct hugepage_subpool *)page_private(page);
 	bool restore_reserve;
+	unsigned long flags;
 
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(page_mapcount(page), page);
@@ -1175,7 +1193,7 @@  void free_huge_page(struct page *page)
 			restore_reserve = true;
 	}
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irqsave(&hugetlb_lock, flags);
 	clear_page_huge_active(page);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
 				     pages_per_huge_page(h), page);
@@ -1196,18 +1214,18 @@  void free_huge_page(struct page *page)
 		arch_clear_hugepage_flags(page);
 		enqueue_huge_page(h, page);
 	}
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irqrestore(&hugetlb_lock, flags);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1438,7 +1456,7 @@  int dissolve_free_huge_page(struct page *page)
 	if (!PageHuge(page))
 		return 0;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	if (!PageHuge(page)) {
 		rc = 0;
 		goto out;
@@ -1466,7 +1484,7 @@  int dissolve_free_huge_page(struct page *page)
 		rc = 0;
 	}
 out:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 	return rc;
 }
 
@@ -1508,16 +1526,16 @@  static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	if (hstate_is_gigantic(h))
 		return NULL;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
 		goto out_unlock;
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
 	if (!page)
 		return NULL;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	/*
 	 * We could have raced with the pool size change.
 	 * Double check that and simply deallocate the new page
@@ -1527,7 +1545,7 @@  static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	 */
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
 		SetPageHugeTemporary(page);
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_irq(&hugetlb_lock);
 		put_page(page);
 		return NULL;
 	} else {
@@ -1536,7 +1554,7 @@  static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	}
 
 out_unlock:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	return page;
 }
@@ -1591,10 +1609,10 @@  struct page *alloc_huge_page_node(struct hstate *h, int nid)
 	if (nid != NUMA_NO_NODE)
 		gfp_mask |= __GFP_THISNODE;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	if (h->free_huge_pages - h->resv_huge_pages > 0)
 		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	if (!page)
 		page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
@@ -1608,17 +1626,17 @@  struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 {
 	gfp_t gfp_mask = htlb_alloc_mask(h);
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	if (h->free_huge_pages - h->resv_huge_pages > 0) {
 		struct page *page;
 
 		page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
 		if (page) {
-			spin_unlock(&hugetlb_lock);
+			spin_unlock_irq(&hugetlb_lock);
 			return page;
 		}
 	}
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
 }
@@ -1664,7 +1682,7 @@  static int gather_surplus_pages(struct hstate *h, int delta)
 
 	ret = -ENOMEM;
 retry:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
 		page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
 				NUMA_NO_NODE, NULL);
@@ -1681,7 +1699,7 @@  static int gather_surplus_pages(struct hstate *h, int delta)
 	 * After retaking hugetlb_lock, we need to recalculate 'needed'
 	 * because either resv_huge_pages or free_huge_pages may have changed.
 	 */
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	needed = (h->resv_huge_pages + delta) -
 			(h->free_huge_pages + allocated);
 	if (needed > 0) {
@@ -1719,12 +1737,12 @@  static int gather_surplus_pages(struct hstate *h, int delta)
 		enqueue_huge_page(h, page);
 	}
 free:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	/* Free unnecessary surplus pages to the buddy allocator */
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
 		put_page(page);
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 
 	return ret;
 }
@@ -1738,7 +1756,7 @@  static int gather_surplus_pages(struct hstate *h, int delta)
  *    the reservation.  As many as unused_resv_pages may be freed.
  *
  * Called with hugetlb_lock held.  However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
+ * reacquired) around calls to cond_resched().  Whenever dropping the lock,
  * we must make sure nobody else can claim pages we are in the process of
  * freeing.  Do this by ensuring resv_huge_page always is greater than the
  * number of huge pages we plan to free when dropping the lock.
@@ -1775,7 +1793,11 @@  static void return_unused_surplus_pages(struct hstate *h,
 		unused_resv_pages--;
 		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
 			goto out;
-		cond_resched_lock(&hugetlb_lock);
+		if (need_resched()) {
+			spin_unlock_irq(&hugetlb_lock);
+			cond_resched();
+			spin_lock_irq(&hugetlb_lock);
+		}
 	}
 
 out:
@@ -1994,7 +2016,7 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (ret)
 		goto out_subpool_put;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	/*
 	 * glb_chg is passed to indicate whether or not a page must be taken
 	 * from the global free pool (global change).  gbl_chg == 0 indicates
@@ -2002,7 +2024,7 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
 	if (!page) {
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_irq(&hugetlb_lock);
 		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
 		if (!page)
 			goto out_uncharge_cgroup;
@@ -2010,12 +2032,12 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 			SetPagePrivate(page);
 			h->resv_huge_pages--;
 		}
-		spin_lock(&hugetlb_lock);
+		spin_lock_irq(&hugetlb_lock);
 		list_move(&page->lru, &h->hugepage_activelist);
 		/* Fall through */
 	}
 	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	set_page_private(page, (unsigned long)spool);
 
@@ -2269,7 +2291,7 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	else
 		return -ENOMEM;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 
 	/*
 	 * Check for a node specific request.
@@ -2300,7 +2322,7 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	 */
 	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
 		if (count > persistent_huge_pages(h)) {
-			spin_unlock(&hugetlb_lock);
+			spin_unlock_irq(&hugetlb_lock);
 			NODEMASK_FREE(node_alloc_noretry);
 			return -EINVAL;
 		}
@@ -2329,14 +2351,14 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 		 * page, free_huge_page will handle it by freeing the page
 		 * and reducing the surplus.
 		 */
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_irq(&hugetlb_lock);
 
 		/* yield cpu to avoid soft lockup */
 		cond_resched();
 
 		ret = alloc_pool_huge_page(h, nodes_allowed,
 						node_alloc_noretry);
-		spin_lock(&hugetlb_lock);
+		spin_lock_irq(&hugetlb_lock);
 		if (!ret)
 			goto out;
 
@@ -2366,7 +2388,11 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	while (min_count < persistent_huge_pages(h)) {
 		if (!free_pool_huge_page(h, nodes_allowed, 0))
 			break;
-		cond_resched_lock(&hugetlb_lock);
+		if (need_resched()) {
+			spin_unlock_irq(&hugetlb_lock);
+			cond_resched();
+			spin_lock_irq(&hugetlb_lock);
+		}
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2374,7 +2400,7 @@  static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 	}
 out:
 	h->max_huge_pages = persistent_huge_pages(h);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	NODEMASK_FREE(node_alloc_noretry);
 
@@ -2528,9 +2554,9 @@  static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 	if (err)
 		return err;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	h->nr_overcommit_huge_pages = input;
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 
 	return count;
 }
@@ -2978,9 +3004,9 @@  int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 		goto out;
 
 	if (write) {
-		spin_lock(&hugetlb_lock);
+		spin_lock_irq(&hugetlb_lock);
 		h->nr_overcommit_huge_pages = tmp;
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_irq(&hugetlb_lock);
 	}
 out:
 	return ret;
@@ -3071,7 +3097,7 @@  static int hugetlb_acct_memory(struct hstate *h, long delta)
 {
 	int ret = -ENOMEM;
 
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	/*
 	 * When cpuset is configured, it breaks the strict hugetlb page
 	 * reservation as the accounting is done on a global variable. Such
@@ -3104,7 +3130,7 @@  static int hugetlb_acct_memory(struct hstate *h, long delta)
 		return_unused_surplus_pages(h, (unsigned long) -delta);
 
 out:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 	return ret;
 }
 
@@ -4970,7 +4996,7 @@  bool isolate_huge_page(struct page *page, struct list_head *list)
 	bool ret = true;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
 		ret = false;
 		goto unlock;
@@ -4978,17 +5004,17 @@  bool isolate_huge_page(struct page *page, struct list_head *list)
 	clear_page_huge_active(page);
 	list_move_tail(&page->lru, list);
 unlock:
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 	return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageHead(page), page);
-	spin_lock(&hugetlb_lock);
+	spin_lock_irq(&hugetlb_lock);
 	set_page_huge_active(page);
 	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
-	spin_unlock(&hugetlb_lock);
+	spin_unlock_irq(&hugetlb_lock);
 	put_page(page);
 }
 
@@ -5016,11 +5042,11 @@  void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 		SetPageHugeTemporary(oldpage);
 		ClearPageHugeTemporary(newpage);
 
-		spin_lock(&hugetlb_lock);
+		spin_lock_irq(&hugetlb_lock);
 		if (h->surplus_huge_pages_node[old_nid]) {
 			h->surplus_huge_pages_node[old_nid]--;
 			h->surplus_huge_pages_node[new_nid]++;
 		}
-		spin_unlock(&hugetlb_lock);
+		spin_unlock_irq(&hugetlb_lock);
 	}
 }