diff mbox series

[v5,1/4] mm,hwpoison: drain pcplists before bailing out for non-buddy zero-refcount page

Message ID 20201013144447.6706-2-osalvador@suse.de (mailing list archive)
State New, archived
Headers show
Series HWpoison: further fixes and cleanups | expand

Commit Message

Oscar Salvador Oct. 13, 2020, 2:44 p.m. UTC
A page with a zero refcount that is !PageBuddy could perfectly well be a
pcppage.  Currently, we bail out with an error if we encounter such a page,
meaning that we handle pcppages neither from the hard-offline nor from the
soft-offline path.

Fix this by draining the pcplists whenever we find this kind of page and then
retrying the check.  The drain may have spilled the page from its pcplist into
the buddy allocator, in which case we can handle it.

Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
---
 mm/memory-failure.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

Comments

Vlastimil Babka Nov. 25, 2020, 4:26 p.m. UTC | #1
On 10/13/20 4:44 PM, Oscar Salvador wrote:
> A page with 0-refcount and !PageBuddy could perfectly be a pcppage.
> Currently, we bail out with an error if we encounter such a page, meaning
> that we do not handle pcppages neither from hard-offline nor from
> soft-offline path.
> 
> Fix this by draining pcplists whenever we find this kind of page and retry
> the check again.  It might be that pcplists have been spilled into the
> buddy allocator and so we can handle it.
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>   mm/memory-failure.c | 24 ++++++++++++++++++++++--
>   1 file changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index c0bb186bba62..e2f12410c594 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -946,13 +946,13 @@ static int page_action(struct page_state *ps, struct page *p,
>   }
>   
>   /**
> - * get_hwpoison_page() - Get refcount for memory error handling:
> + * __get_hwpoison_page() - Get refcount for memory error handling:
>    * @page:	raw error page (hit by memory error)
>    *
>    * Return: return 0 if failed to grab the refcount, otherwise true (some
>    * non-zero value.)
>    */
> -static int get_hwpoison_page(struct page *page)
> +static int __get_hwpoison_page(struct page *page)
>   {
>   	struct page *head = compound_head(page);
>   
> @@ -982,6 +982,26 @@ static int get_hwpoison_page(struct page *page)
>   	return 0;
>   }
>   
> +static int get_hwpoison_page(struct page *p)
> +{
> +	int ret;
> +	bool drained = false;
> +
> +retry:
> +	ret = __get_hwpoison_page(p);
> +	if (!ret && !is_free_buddy_page(p) && !page_count(p) && !drained) {
> +		/*
> +		 * The page might be in a pcplist, so try to drain those
> +		 * and see if we are lucky.
> +		 */
> +		drain_all_pages(page_zone(p));
> +		drained = true;
> +		goto retry;
> +	}
> +
> +	return ret;
> +}
> +
>   /*
>    * Do all that is necessary to remove user space mappings. Unmap
>    * the pages and send SIGBUS to the processes if the data was dirty.
>
diff mbox series

Patch

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c0bb186bba62..e2f12410c594 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -946,13 +946,13 @@  static int page_action(struct page_state *ps, struct page *p,
 }
 
 /**
- * get_hwpoison_page() - Get refcount for memory error handling:
+ * __get_hwpoison_page() - Get refcount for memory error handling:
  * @page:	raw error page (hit by memory error)
  *
  * Return: return 0 if failed to grab the refcount, otherwise true (some
  * non-zero value.)
  */
-static int get_hwpoison_page(struct page *page)
+static int __get_hwpoison_page(struct page *page)
 {
 	struct page *head = compound_head(page);
 
@@ -982,6 +982,26 @@  static int get_hwpoison_page(struct page *page)
 	return 0;
 }
 
+static int get_hwpoison_page(struct page *p)
+{
+	int ret;
+	bool drained = false;
+
+retry:
+	ret = __get_hwpoison_page(p);
+	if (!ret && !is_free_buddy_page(p) && !page_count(p) && !drained) {
+		/*
+		 * The page might be in a pcplist, so try to drain those
+		 * and see if we are lucky.
+		 */
+		drain_all_pages(page_zone(p));
+		drained = true;
+		goto retry;
+	}
+
+	return ret;
+}
+
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.