
mm/hugetlb: Defer freeing of huge pages if in non-task context

Message ID 20191216182739.26880-1-longman@redhat.com (mailing list archive)
State New, archived
Series mm/hugetlb: Defer freeing of huge pages if in non-task context

Commit Message

Waiman Long Dec. 16, 2019, 6:27 p.m. UTC
The following lockdep splat was observed when a certain hugetlbfs test
was run:

[  612.388273] ================================
[  612.411273] WARNING: inconsistent lock state
[  612.432273] 4.18.0-159.el8.x86_64+debug #1 Tainted: G        W --------- -  -
[  612.469273] --------------------------------
[  612.489273] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[  612.517273] swapper/30/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[  612.541273] ffffffff9acdc038 (hugetlb_lock){+.?.}, at: free_huge_page+0x36f/0xaa0
[  612.576273] {SOFTIRQ-ON-W} state was registered at:
[  612.598273]   lock_acquire+0x14f/0x3b0
[  612.616273]   _raw_spin_lock+0x30/0x70
[  612.634273]   __nr_hugepages_store_common+0x11b/0xb30
[  612.657273]   hugetlb_sysctl_handler_common+0x209/0x2d0
[  612.681273]   proc_sys_call_handler+0x37f/0x450
[  612.703273]   vfs_write+0x157/0x460
[  612.719273]   ksys_write+0xb8/0x170
[  612.736273]   do_syscall_64+0xa5/0x4d0
[  612.753273]   entry_SYSCALL_64_after_hwframe+0x6a/0xdf
[  612.777273] irq event stamp: 691296
[  612.794273] hardirqs last  enabled at (691296): [<ffffffff99bb034b>] _raw_spin_unlock_irqrestore+0x4b/0x60
[  612.839273] hardirqs last disabled at (691295): [<ffffffff99bb0ad2>] _raw_spin_lock_irqsave+0x22/0x81
[  612.882273] softirqs last  enabled at (691284): [<ffffffff97ff0c63>] irq_enter+0xc3/0xe0
[  612.922273] softirqs last disabled at (691285): [<ffffffff97ff0ebe>] irq_exit+0x23e/0x2b0
[  612.962273]
[  612.962273] other info that might help us debug this:
[  612.993273]  Possible unsafe locking scenario:
[  612.993273]
[  613.020273]        CPU0
[  613.031273]        ----
[  613.042273]   lock(hugetlb_lock);
[  613.057273]   <Interrupt>
[  613.069273]     lock(hugetlb_lock);
[  613.085273]
[  613.085273]  *** DEADLOCK ***
      :
[  613.245273] Call Trace:
[  613.256273]  <IRQ>
[  613.265273]  dump_stack+0x9a/0xf0
[  613.281273]  mark_lock+0xd0c/0x12f0
[  613.297273]  ? print_shortest_lock_dependencies+0x80/0x80
[  613.322273]  ? sched_clock_cpu+0x18/0x1e0
[  613.341273]  __lock_acquire+0x146b/0x48c0
[  613.360273]  ? trace_hardirqs_on+0x10/0x10
[  613.379273]  ? trace_hardirqs_on_caller+0x27b/0x580
[  613.401273]  lock_acquire+0x14f/0x3b0
[  613.419273]  ? free_huge_page+0x36f/0xaa0
[  613.440273]  _raw_spin_lock+0x30/0x70
[  613.458273]  ? free_huge_page+0x36f/0xaa0
[  613.477273]  free_huge_page+0x36f/0xaa0
[  613.495273]  bio_check_pages_dirty+0x2fc/0x5c0
[  613.516273]  clone_endio+0x17f/0x670 [dm_mod]
[  613.536273]  ? disable_discard+0x90/0x90 [dm_mod]
[  613.558273]  ? bio_endio+0x4ba/0x930
[  613.575273]  ? blk_account_io_completion+0x400/0x530
[  613.598273]  blk_update_request+0x276/0xe50
[  613.617273]  scsi_end_request+0x7b/0x6a0
[  613.636273]  ? lock_downgrade+0x6f0/0x6f0
[  613.654273]  scsi_io_completion+0x1c6/0x1570
[  613.674273]  ? sd_completed_bytes+0x3a0/0x3a0 [sd_mod]
[  613.698273]  ? scsi_mq_requeue_cmd+0xc0/0xc0
[  613.718273]  blk_done_softirq+0x22e/0x350
[  613.737273]  ? blk_softirq_cpu_dead+0x230/0x230
[  613.758273]  __do_softirq+0x23d/0xad8
[  613.776273]  irq_exit+0x23e/0x2b0
[  613.792273]  do_IRQ+0x11a/0x200
[  613.806273]  common_interrupt+0xf/0xf
[  613.823273]  </IRQ>

Both the hugetlb_lock and the subpool lock can be acquired in
free_huge_page(). One way to solve the problem is to make both locks
irq-safe. Another alternative is to defer the freeing to a workqueue job.

This patch implements the deferred freeing by adding a
free_hpage_workfn() work function to do the actual freeing. The
free_huge_page() call in a non-task context saves the page to be freed
in the hpage_freelist linked list in a lockless manner.

The generic workqueue is used to process the work, but a dedicated
workqueue can be used instead if it is desirable to have the huge page
freed ASAP.
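
For illustration, the lockless queueing boils down to the following
standalone sketch (C11 atomics stand in for the kernel's xchg(),
READ_ONCE() and WRITE_ONCE(); the node/push_node()/drain_nodes() names
are made up for the example, and the workqueue scheduling plus the
workfn's re-check loop are omitted):

#include <stdatomic.h>
#include <stdio.h>

struct node {
	struct node *_Atomic next;	/* stands in for page->mapping */
	int id;
};

#define NEXT_PENDING	((struct node *)-1)

static struct node *_Atomic freelist;	/* stands in for hpage_freelist */

/* Producer side: what free_huge_page() does in a non-task context. */
static void push_node(struct node *n)
{
	/* Not yet published; in the kernel a plain store suffices here. */
	atomic_store(&n->next, NEXT_PENDING);
	/* Publish the node; atomic_exchange() is a full barrier like xchg(). */
	struct node *old = atomic_exchange(&freelist, n);
	/* The node is now shared, so the final link must be a single store. */
	atomic_store(&n->next, old);
}

/* Consumer side: the core loop of free_hpage_workfn(). */
static void drain_nodes(void)
{
	struct node *curr = atomic_exchange(&freelist, NULL);

	while (curr) {
		struct node *next = atomic_load(&curr->next);

		if (next == NEXT_PENDING)
			continue;	/* producer not done yet; spin like cpu_relax() */
		printf("freeing node %d\n", curr->id);	/* __free_huge_page() */
		curr = next;
	}
}

int main(void)
{
	struct node a = { .id = 1 }, b = { .id = 2 };

	push_node(&a);
	push_node(&b);
	drain_nodes();		/* LIFO: frees node 2, then node 1 */
	return 0;
}

Pushes are LIFO, and a consumer that observes NEXT_PENDING simply spins
until the producer's final store to ->next lands.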

Signed-off-by: Waiman Long <longman@redhat.com>
---
 mm/hugetlb.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 1 deletion(-)

Comments

Andrew Morton Dec. 16, 2019, 9:51 p.m. UTC | #1
On Mon, 16 Dec 2019 13:27:39 -0500 Waiman Long <longman@redhat.com> wrote:

> The following lockdep splat was observed when a certain hugetlbfs test
> was run:
> 
> ...
> 
> Both the hugetlb_lock and the subpool lock can be acquired in
> free_huge_page(). One way to solve the problem is to make both locks
> irq-safe. Another alternative is to defer the freeing to a workqueue job.
> 
> This patch implements the deferred freeing by adding a
> free_hpage_workfn() work function to do the actual freeing. The
> free_huge_page() call in a non-task context saves the page to be freed
> in the hpage_freelist linked list in a lockless manner.
> 
> The generic workqueue is used to process the work, but a dedicated
> workqueue can be used instead if it is desirable to have the huge page
> freed ASAP.
>
> ...
>
> @@ -1199,6 +1199,73 @@ void free_huge_page(struct page *page)
>  	spin_unlock(&hugetlb_lock);
>  }
>  
> +/*
> + * As free_huge_page() can be called from a non-task context, we have
> + * to defer the actual freeing in a workqueue to prevent potential
> + * hugetlb_lock deadlock.
> + *
> + * free_hpage_workfn() locklessly retrieves the linked list of pages to
> + * be freed and frees them one-by-one. As the page->mapping pointer is
> + * going to be cleared in __free_huge_page() anyway, it is reused as the
> + * next pointer of a singly linked list of huge pages to be freed.
> + */
> +#define NEXT_PENDING	((struct page *)-1)
> +static struct page *hpage_freelist;
> +
> +static void free_hpage_workfn(struct work_struct *work)
> +{
> +	struct page *curr, *next;
> +	int cnt = 0;
> +
> +	do {
> +		curr = xchg(&hpage_freelist, NULL);
> +		if (!curr)
> +			break;
> +
> +		while (curr) {
> +			next = (struct page *)READ_ONCE(curr->mapping);
> +			if (next == NEXT_PENDING) {
> +				cpu_relax();
> +				continue;
> +			}
> +			__free_huge_page(curr);
> +			curr = next;
> +			cnt++;
> +		}
> +	} while (!READ_ONCE(hpage_freelist));
> +
> +	if (!cnt)
> +		return;
> +	pr_debug("HugeTLB: free_hpage_workfn() frees %d huge page(s)\n", cnt);
> +}
> +static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
> +
> +void free_huge_page(struct page *page)
> +{
> +	/*
> +	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
> +	 */
> +	if (!in_task()) {
> +		struct page *next;
> +
> +		page->mapping = (struct address_space *)NEXT_PENDING;
> +		next = xchg(&hpage_freelist, page);
> +		WRITE_ONCE(page->mapping, (struct address_space *)next);

The NEXT_PENDING stuff could do with some commenting, I think.  It's
reasonably obvious, but not obvious enough.  For example, why does the
second write to page->mapping use WRITE_ONCE() but the first does not?
Please spell out the design, fully.

> +		schedule_work(&free_hpage_work);
> +		return;
> +	}
> +
> +	/*
> +	 * Racing may prevent some deferred huge pages in hpage_freelist
> +	 * from being freed. Check here and call schedule_work() if that
> +	 * is the case.
> +	 */
> +	if (unlikely(hpage_freelist && !work_pending(&free_hpage_work)))
> +		schedule_work(&free_hpage_work);
> +
> +	__free_huge_page(page);
> +}
> +
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
>  {
>  	INIT_LIST_HEAD(&page->lru);

Otherwise it looks OK to me.  Deferring freeing in this way is
generally lame and gives rise to concerns about memory exhaustion in
strange situations, and to concerns about various memory accounting
stats being logically wrong for short periods.  But we already do this
in (too) many places, so fingers crossed :(
Mike Kravetz Dec. 16, 2019, 10:40 p.m. UTC | #2
On 12/16/19 10:27 AM, Waiman Long wrote:
> The following lockdep splat was observed when a certain hugetlbfs test
> was run:
<snip>
> This patch implements the deferred freeing by adding a
> free_hpage_workfn() work function to do the actual freeing. The
> free_huge_page() call in a non-task context saves the page to be freed
> in the hpage_freelist linked list in a lockless manner.
> 
> The generic workqueue is used to process the work, but a dedicated
> workqueue can be used instead if it is desirable to have the huge page
> freed ASAP.
> 
<snip>
>  
> +/*
> + * As free_huge_page() can be called from a non-task context, we have
> + * to defer the actual freeing in a workqueue to prevent potential
> + * hugetlb_lock deadlock.
> + *
> + * free_hpage_workfn() locklessly retrieves the linked list of pages to
> + * be freed and frees them one-by-one. As the page->mapping pointer is
> + * going to be cleared in __free_huge_page() anyway, it is reused as the
> + * next pointer of a singly linked list of huge pages to be freed.
> + */
> +#define NEXT_PENDING	((struct page *)-1)
> +static struct page *hpage_freelist;
> +
> +static void free_hpage_workfn(struct work_struct *work)
> +{
> +	struct page *curr, *next;
> +	int cnt = 0;
> +
> +	do {
> +		curr = xchg(&hpage_freelist, NULL);
> +		if (!curr)
> +			break;
> +
> +		while (curr) {
> +			next = (struct page *)READ_ONCE(curr->mapping);
> +			if (next == NEXT_PENDING) {
> +				cpu_relax();
> +				continue;
> +			}
> +			__free_huge_page(curr);
> +			curr = next;
> +			cnt++;
> +		}
> +	} while (!READ_ONCE(hpage_freelist));
> +
> +	if (!cnt)
> +		return;
> +	pr_debug("HugeTLB: free_hpage_workfn() frees %d huge page(s)\n", cnt);
> +}
> +static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
> +
> +void free_huge_page(struct page *page)
> +{
> +	/*
> +	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
> +	 */
> +	if (!in_task()) {
> +		struct page *next;
> +
> +		page->mapping = (struct address_space *)NEXT_PENDING;
> +		next = xchg(&hpage_freelist, page);
> +		WRITE_ONCE(page->mapping, (struct address_space *)next);
> +		schedule_work(&free_hpage_work);
> +		return;
> +	}

As Andrew mentioned, the design for the lockless queueing could use more
explanation.  I had to draw some diagrams before I felt relatively confident
in the design.

> +
> +	/*
> +	 * Racing may prevent some deferred huge pages in hpage_freelist
> +	 * from being freed. Check here and call schedule_work() if that
> +	 * is the case.
> +	 */
> +	if (unlikely(hpage_freelist && !work_pending(&free_hpage_work)))
> +		schedule_work(&free_hpage_work);

Can you describe the race which would leave deferred huge pages on
hpage_freelist?  I am having a hard time determining how that can happen.

And, if this indeed can happen then I would have to ask what happens if
a page is 'stuck' and we do not call free_huge_page?  Do we need to take
that case into account?

Overall, I like the design and hope this will work.  I have been testing
a 'modified' version of the patch to always do the deferred freeing.  The
modification is simply to stress the code.   So far, I have not found any
issues in any of my testing.
Waiman Long Dec. 16, 2019, 10:52 p.m. UTC | #3
On 12/16/19 4:51 PM, Andrew Morton wrote:
> On Mon, 16 Dec 2019 13:27:39 -0500 Waiman Long <longman@redhat.com> wrote:
>
>> The following lockdep splat was observed when a certain hugetlbfs test
>> was run:
>>
>> ...
>>
>> Both the hugetlb_lock and the subpool lock can be acquired in
>> free_huge_page(). One way to solve the problem is to make both locks
>> irq-safe. Another alternative is to defer the freeing to a workqueue job.
>>
>> This patch implements the deferred freeing by adding a
>> free_hpage_workfn() work function to do the actual freeing. The
>> free_huge_page() call in a non-task context saves the page to be freed
>> in the hpage_freelist linked list in a lockless manner.
>>
>> The generic workqueue is used to process the work, but a dedicated
>> workqueue can be used instead if it is desirable to have the huge page
>> freed ASAP.
>>
>> ...
>>
>> @@ -1199,6 +1199,73 @@ void free_huge_page(struct page *page)
>>  	spin_unlock(&hugetlb_lock);
>>  }
>>  
>> +/*
>> + * As free_huge_page() can be called from a non-task context, we have
>> + * to defer the actual freeing in a workqueue to prevent potential
>> + * hugetlb_lock deadlock.
>> + *
>> + * free_hpage_workfn() locklessly retrieves the linked list of pages to
>> + * be freed and frees them one-by-one. As the page->mapping pointer is
>> + * going to be cleared in __free_huge_page() anyway, it is reused as the
>> + * next pointer of a singly linked list of huge pages to be freed.
>> + */
>> +#define NEXT_PENDING	((struct page *)-1)
>> +static struct page *hpage_freelist;
>> +
>> +static void free_hpage_workfn(struct work_struct *work)
>> +{
>> +	struct page *curr, *next;
>> +	int cnt = 0;
>> +
>> +	do {
>> +		curr = xchg(&hpage_freelist, NULL);
>> +		if (!curr)
>> +			break;
>> +
>> +		while (curr) {
>> +			next = (struct page *)READ_ONCE(curr->mapping);
>> +			if (next == NEXT_PENDING) {
>> +				cpu_relax();
>> +				continue;
>> +			}
>> +			__free_huge_page(curr);
>> +			curr = next;
>> +			cnt++;
>> +		}
>> +	} while (!READ_ONCE(hpage_freelist));
>> +
>> +	if (!cnt)
>> +		return;
>> +	pr_debug("HugeTLB: free_hpage_workfn() frees %d huge page(s)\n", cnt);
>> +}
>> +static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
>> +
>> +void free_huge_page(struct page *page)
>> +{
>> +	/*
>> +	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
>> +	 */
>> +	if (!in_task()) {
>> +		struct page *next;
>> +
>> +		page->mapping = (struct address_space *)NEXT_PENDING;
>> +		next = xchg(&hpage_freelist, page);
>> +		WRITE_ONCE(page->mapping, (struct address_space *)next);
> The NEXT_PENDING stuff could do with some commenting, I think.  It's
> reasonably obvious, but not obvious enough.  For example, why does the
> second write to page->mapping use WRITE_ONCE() but the first does not?
> Please spell out the design, fully.

Sure. The idea is that the setting of the next pointer and the writing
to hpage_freelist cannot be done atomically without using a lock. Before
the xchg(), the page isn't visible to a concurrent work function, so no
special write is needed; the full memory barrier implied by xchg() ensures
that the page->mapping write is visible to all once the page is published.
After the xchg(), page->mapping is subject to concurrent access, so
WRITE_ONCE() is used to make sure there is no write tearing.
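
In code terms, the intent is roughly the following (the comments are
just a sketch of what I plan to add):

	/*
	 * The page is private to this CPU until the xchg() below
	 * publishes it, so a plain store is good enough.
	 */
	page->mapping = (struct address_space *)NEXT_PENDING;

	/*
	 * xchg() implies a full memory barrier, so the NEXT_PENDING
	 * store above is visible to everyone by the time the page
	 * shows up on hpage_freelist.
	 */
	next = xchg(&hpage_freelist, page);

	/*
	 * From here on free_hpage_workfn() may be reading ->mapping
	 * concurrently; WRITE_ONCE() avoids store tearing. A reader
	 * that still sees NEXT_PENDING spins until this store lands.
	 */
	WRITE_ONCE(page->mapping, (struct address_space *)next);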

I will update the patch with more comments once I gather feedback from
the other reviewers.

>
>> +		schedule_work(&free_hpage_work);
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * Racing may prevent some deferred huge pages in hpage_freelist
>> +	 * from being freed. Check here and call schedule_work() if that
>> +	 * is the case.
>> +	 */
>> +	if (unlikely(hpage_freelist && !work_pending(&free_hpage_work)))
>> +		schedule_work(&free_hpage_work);
>> +
>> +	__free_huge_page(page);
>> +}
>> +
>>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
>>  {
>>  	INIT_LIST_HEAD(&page->lru);
> Otherwise it looks OK to me.  Deferring freeing in this way is
> generally lame and gives rise to concerns about memory exhaustion in
> strange situations, and to concerns about various memory accounting
> stats being logically wrong for short periods.  But we already do this
> in (too) many places, so fingers crossed :(
>
It is actually quite rare to hit the condition where a huge page has to
be freed in an irq context; otherwise, this problem would have been found
earlier. Hopefully the workfn won't be invoked on many occasions.

Cheers,
Longman
Waiman Long Dec. 16, 2019, 11:20 p.m. UTC | #4
On 12/16/19 5:40 PM, Mike Kravetz wrote:
> On 12/16/19 10:27 AM, Waiman Long wrote:
>> The following lockdep splat was observed when a certain hugetlbfs test
>> was run:
> <snip>
>> This patch implements the deferred freeing by adding a
>> free_hpage_workfn() work function to do the actual freeing. The
>> free_huge_page() call in a non-task context saves the page to be freed
>> in the hpage_freelist linked list in a lockless manner.
>>
>> The generic workqueue is used to process the work, but a dedicated
>> workqueue can be used instead if it is desirable to have the huge page
>> freed ASAP.
>>
> <snip>
>>  
>> +/*
>> + * As free_huge_page() can be called from a non-task context, we have
>> + * to defer the actual freeing in a workqueue to prevent potential
>> + * hugetlb_lock deadlock.
>> + *
>> + * free_hpage_workfn() locklessly retrieves the linked list of pages to
>> + * be freed and frees them one-by-one. As the page->mapping pointer is
>> + * going to be cleared in __free_huge_page() anyway, it is reused as the
>> + * next pointer of a singly linked list of huge pages to be freed.
>> + */
>> +#define NEXT_PENDING	((struct page *)-1)
>> +static struct page *hpage_freelist;
>> +
>> +static void free_hpage_workfn(struct work_struct *work)
>> +{
>> +	struct page *curr, *next;
>> +	int cnt = 0;
>> +
>> +	do {
>> +		curr = xchg(&hpage_freelist, NULL);
>> +		if (!curr)
>> +			break;
>> +
>> +		while (curr) {
>> +			next = (struct page *)READ_ONCE(curr->mapping);
>> +			if (next == NEXT_PENDING) {
>> +				cpu_relax();
>> +				continue;
>> +			}
>> +			__free_huge_page(curr);
>> +			curr = next;
>> +			cnt++;
>> +		}
>> +	} while (!READ_ONCE(hpage_freelist));
>> +
>> +	if (!cnt)
>> +		return;
>> +	pr_debug("HugeTLB: free_hpage_workfn() frees %d huge page(s)\n", cnt);
>> +}
>> +static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
>> +
>> +void free_huge_page(struct page *page)
>> +{
>> +	/*
>> +	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
>> +	 */
>> +	if (!in_task()) {
>> +		struct page *next;
>> +
>> +		page->mapping = (struct address_space *)NEXT_PENDING;
>> +		next = xchg(&hpage_freelist, page);
>> +		WRITE_ONCE(page->mapping, (struct address_space *)next);
>> +		schedule_work(&free_hpage_work);
>> +		return;
>> +	}
> As Andrew mentioned, the design for the lockless queueing could use more
> explanation.  I had to draw some diagrams before I felt relatively confident
> in the design.
>
>> +
>> +	/*
>> +	 * Racing may prevent some deferred huge pages in hpage_freelist
>> +	 * from being freed. Check here and call schedule_work() if that
>> +	 * is the case.
>> +	 */
>> +	if (unlikely(hpage_freelist && !work_pending(&free_hpage_work)))
>> +		schedule_work(&free_hpage_work);
> Can you describe the race which would leave deferred huge pages on
> hpage_freelist?  I am having a hard time determining how that can happen.
I am being cautious here. It is related to how the workqueue works. Whether
a call to schedule_work() has any effect depends on the pending bit in
the work structure. I suppose that it is cleared once the work is
done. So depending on when the bit is cleared, there may be a small
timing window where free_hpage_workfn() is done but the bit has not been
cleared yet. A concurrent softIRQ task may update hpage_freelist and
call schedule_work() without actually queuing it. Perhaps I can check
the return status of schedule_work() and wait there until the queuing
succeeds or the free list changes. I will need to look more carefully
at the workqueue code to see how big this timing window is.
> And, if this indeed can happen then I would have to ask what happens if
> a page is 'stuck' and we do not call free_huge_page?  Do we need to take
> that case into account?

As said above, there may be a way to reduce the racing window or eliminate
it altogether. I need a bit more time to investigate that. If there is
no way to eliminate the racing window, it is possible that a huge page
may get stuck in the free list for a while.

Cheers,
Longman
Waiman Long Dec. 17, 2019, 12:29 a.m. UTC | #5
On 12/16/19 6:20 PM, Waiman Long wrote:
>>> +
>>> +	/*
>>> +	 * Racing may prevent some deferred huge pages in hpage_freelist
>>> +	 * from being freed. Check here and call schedule_work() if that
>>> +	 * is the case.
>>> +	 */
>>> +	if (unlikely(hpage_freelist && !work_pending(&free_hpage_work)))
>>> +		schedule_work(&free_hpage_work);
>> Can you describe the race which would leave deferred huge pages on
>> hpage_freelist?  I am having a hard time determining how that can happen.
> I am being cautious here. It is related to how the workqueue works. Whether
> a call to schedule_work() has any effect depends on the pending bit in
> the work structure. I suppose that it is cleared once the work is
> done. So depending on when the bit is cleared, there may be a small
> timing window where free_hpage_workfn() is done but the bit has not been
> cleared yet. A concurrent softIRQ task may update hpage_freelist and
> call schedule_work() without actually queuing it. Perhaps I can check
> the return status of schedule_work() and wait there until the queuing
> succeeds or the free list changes. I will need to look more carefully
> at the workqueue code to see how big this timing window is.
>> And, if this indeed can happen then I would have to ask what happens if
>> a page is 'stuck' and we do not call free_huge_page?  Do we need to take
>> that case into account?
> As said above, there may be a way to reduce the racing window or eliminate
> it altogether. I need a bit more time to investigate that. If there is
> no way to eliminate the racing window, it is possible that a huge page
> may get stuck in the free list for a while.

My mistake. The pending bit is actually cleared before calling the
workfn. That shows I don't fully understand the work queue
functionality. In this case, there should be no race. I will remove the
unnecessary check.
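
For reference, the relevant ordering in the workqueue core is roughly
(a simplified sketch of process_one_work() in kernel/workqueue.c, not
the actual code):

	/*
	 * process_one_work():
	 *
	 *	set_work_pool_and_clear_pending(work, pool->id);
	 *		^ the PENDING bit is cleared here
	 *	...
	 *	worker->current_func(work);
	 *		^ free_hpage_workfn() runs only after that
	 *
	 * So any free_huge_page() that pushes a page onto hpage_freelist
	 * after the workfn has started sees PENDING clear, and its
	 * schedule_work() re-queues the work; no page can be left
	 * stranded on the list.
	 */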

Cheers,
Longman

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..1dfebd898943 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1136,7 +1136,7 @@  static inline void ClearPageHugeTemporary(struct page *page)
 	page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
 {
 	/*
 	 * Can't pass hstate in here because it is called from the
@@ -1199,6 +1199,73 @@  void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
+/*
+ * As free_huge_page() can be called from a non-task context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to
+ * be freed and frees them one-by-one. As the page->mapping pointer is
+ * going to be cleared in __free_huge_page() anyway, it is reused as the
+ * next pointer of a singly linked list of huge pages to be freed.
+ */
+#define NEXT_PENDING	((struct page *)-1)
+static struct page *hpage_freelist;
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+	struct page *curr, *next;
+	int cnt = 0;
+
+	do {
+		curr = xchg(&hpage_freelist, NULL);
+		if (!curr)
+			break;
+
+		while (curr) {
+			next = (struct page *)READ_ONCE(curr->mapping);
+			if (next == NEXT_PENDING) {
+				cpu_relax();
+				continue;
+			}
+			__free_huge_page(curr);
+			curr = next;
+			cnt++;
+		}
+	} while (!READ_ONCE(hpage_freelist));
+
+	if (!cnt)
+		return;
+	pr_debug("HugeTLB: free_hpage_workfn() frees %d huge page(s)\n", cnt);
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+void free_huge_page(struct page *page)
+{
+	/*
+	 * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
+	 */
+	if (!in_task()) {
+		struct page *next;
+
+		page->mapping = (struct address_space *)NEXT_PENDING;
+		next = xchg(&hpage_freelist, page);
+		WRITE_ONCE(page->mapping, (struct address_space *)next);
+		schedule_work(&free_hpage_work);
+		return;
+	}
+
+	/*
+	 * Racing may prevent some deferred huge pages in hpage_freelist
+	 * from being freed. Check here and call schedule_work() if that
+	 * is the case.
+	 */
+	if (unlikely(hpage_freelist && !work_pending(&free_hpage_work)))
+		schedule_work(&free_hpage_work);
+
+	__free_huge_page(page);
+}
+
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);