diff mbox series

[v2,1/4] mm/hwpoison: mf_mutex for soft offline and unpoison

Message ID 20211025230503.2650970-2-naoya.horiguchi@linux.dev (mailing list archive)
State New
Headers show
Series mm/hwpoison: fix unpoison_memory() | expand

Commit Message

Naoya Horiguchi Oct. 25, 2021, 11:05 p.m. UTC
From: Naoya Horiguchi <naoya.horiguchi@nec.com>

Originally mf_mutex was introduced to serialize multiple MCE events, but
it's also helpful to exclude races between soft_offline_page() and
unpoison_memory().  So apply mf_mutex to them.

Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
---
ChangeLog v2:
- add mutex_unlock() in "page already poisoned" path in soft_offline_page().
  (Thanks to Ding Hui)
---
 mm/memory-failure.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

Comments

Yang Shi Oct. 27, 2021, 1:32 a.m. UTC | #1
On Mon, Oct 25, 2021 at 4:06 PM Naoya Horiguchi
<naoya.horiguchi@linux.dev> wrote:
>
> From: Naoya Horiguchi <naoya.horiguchi@nec.com>
>
> Originally mf_mutex is introduced to serialize multiple MCE events, but
> it's also helpful to exclude races among  soft_offline_page() and
> unpoison_memory().  So apply mf_mutex to them.

My understanding is that it is not that useful to let unpoison run in
parallel with memory_failure() and soft offline, so they can be
serialized by mf_mutex and we could make the memory failure handler
and soft offline simpler.

If the above statement is correct, could you please tweak this commit
log to reflect it with patch #2 squashed into this patch?

> Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
> ---
> ChangeLog v2:
> - add mutex_unlock() in "page already poisoned" path in soft_offline_page().
>   (Thanks to Ding Hui)
> ---
>  mm/memory-failure.c | 27 +++++++++++++++++++--------
>  1 file changed, 19 insertions(+), 8 deletions(-)
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index fa9dda95a2a2..97297edfbd8e 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1628,6 +1628,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>         return rc;
>  }
>
> +static DEFINE_MUTEX(mf_mutex);
> +
>  /**
>   * memory_failure - Handle memory failure of a page.
>   * @pfn: Page Number of the corrupted page
> @@ -1654,7 +1656,6 @@ int memory_failure(unsigned long pfn, int flags)
>         int res = 0;
>         unsigned long page_flags;
>         bool retry = true;
> -       static DEFINE_MUTEX(mf_mutex);
>
>         if (!sysctl_memory_failure_recovery)
>                 panic("Memory failure on page %lx", pfn);
> @@ -1978,6 +1979,7 @@ int unpoison_memory(unsigned long pfn)
>         struct page *page;
>         struct page *p;
>         int freeit = 0;
> +       int ret = 0;
>         unsigned long flags = 0;
>         static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
>                                         DEFAULT_RATELIMIT_BURST);
> @@ -1988,28 +1990,30 @@ int unpoison_memory(unsigned long pfn)
>         p = pfn_to_page(pfn);
>         page = compound_head(p);
>
> +       mutex_lock(&mf_mutex);
> +
>         if (!PageHWPoison(p)) {
>                 unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
>                                  pfn, &unpoison_rs);
> -               return 0;
> +               goto unlock_mutex;
>         }
>
>         if (page_count(page) > 1) {
>                 unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
>                                  pfn, &unpoison_rs);
> -               return 0;
> +               goto unlock_mutex;
>         }
>
>         if (page_mapped(page)) {
>                 unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
>                                  pfn, &unpoison_rs);
> -               return 0;
> +               goto unlock_mutex;
>         }
>
>         if (page_mapping(page)) {
>                 unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
>                                  pfn, &unpoison_rs);
> -               return 0;
> +               goto unlock_mutex;
>         }
>
>         /*
> @@ -2020,7 +2024,7 @@ int unpoison_memory(unsigned long pfn)
>         if (!PageHuge(page) && PageTransHuge(page)) {
>                 unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
>                                  pfn, &unpoison_rs);
> -               return 0;
> +               goto unlock_mutex;
>         }
>
>         if (!get_hwpoison_page(p, flags)) {
> @@ -2028,7 +2032,7 @@ int unpoison_memory(unsigned long pfn)
>                         num_poisoned_pages_dec();
>                 unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
>                                  pfn, &unpoison_rs);
> -               return 0;
> +               goto unlock_mutex;
>         }
>
>         lock_page(page);
> @@ -2050,7 +2054,9 @@ int unpoison_memory(unsigned long pfn)
>         if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
>                 put_page(page);
>
> -       return 0;
> +unlock_mutex:
> +       mutex_unlock(&mf_mutex);
> +       return ret;
>  }
>  EXPORT_SYMBOL(unpoison_memory);
>
> @@ -2231,9 +2237,12 @@ int soft_offline_page(unsigned long pfn, int flags)
>                 return -EIO;
>         }
>
> +       mutex_lock(&mf_mutex);
> +
>         if (PageHWPoison(page)) {
>                 pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
>                 put_ref_page(ref_page);
> +               mutex_unlock(&mf_mutex);
>                 return 0;
>         }
>
> @@ -2251,5 +2260,7 @@ int soft_offline_page(unsigned long pfn, int flags)
>                 }
>         }
>
> +       mutex_unlock(&mf_mutex);
> +
>         return ret;
>  }
> --
> 2.25.1
>
Naoya Horiguchi Oct. 27, 2021, 2:31 a.m. UTC | #2
On Tue, Oct 26, 2021 at 06:32:36PM -0700, Yang Shi wrote:
> On Mon, Oct 25, 2021 at 4:06 PM Naoya Horiguchi
> <naoya.horiguchi@linux.dev> wrote:
> >
> > From: Naoya Horiguchi <naoya.horiguchi@nec.com>
> >
> > Originally mf_mutex is introduced to serialize multiple MCE events, but
> > it's also helpful to exclude races among  soft_offline_page() and
> > unpoison_memory().  So apply mf_mutex to them.
> 
> My understanding is it is not that useful to make unpoison run
> parallel with memory_failure() and soft offline, so they can be
> serialized by mf_mutex and we could make the memory failure handler
> and soft offline simpler.

Thank you for the suggestion, this sounds correct and more specific.

> 
> If the above statement is correct, could you please tweak this commit
> log to reflect it with patch #2 squashed into this patch?

Sure, I'm thinking of revising like below:

  Originally mf_mutex was introduced to serialize multiple MCE events, but
  it is not that useful to allow unpoison to run in parallel with memory_failure()
  and soft offline.  So apply mf_mutex to soft offline and unpoison.
  The memory failure handler and soft offline handler get simpler with this.

Thanks,
Naoya Horiguchi
diff mbox series

Patch

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fa9dda95a2a2..97297edfbd8e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1628,6 +1628,8 @@  static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	return rc;
 }
 
+static DEFINE_MUTEX(mf_mutex);
+
 /**
  * memory_failure - Handle memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -1654,7 +1656,6 @@  int memory_failure(unsigned long pfn, int flags)
 	int res = 0;
 	unsigned long page_flags;
 	bool retry = true;
-	static DEFINE_MUTEX(mf_mutex);
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure on page %lx", pfn);
@@ -1978,6 +1979,7 @@  int unpoison_memory(unsigned long pfn)
 	struct page *page;
 	struct page *p;
 	int freeit = 0;
+	int ret = 0;
 	unsigned long flags = 0;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
@@ -1988,28 +1990,30 @@  int unpoison_memory(unsigned long pfn)
 	p = pfn_to_page(pfn);
 	page = compound_head(p);
 
+	mutex_lock(&mf_mutex);
+
 	if (!PageHWPoison(p)) {
 		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (page_count(page) > 1) {
 		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (page_mapped(page)) {
 		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (page_mapping(page)) {
 		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	/*
@@ -2020,7 +2024,7 @@  int unpoison_memory(unsigned long pfn)
 	if (!PageHuge(page) && PageTransHuge(page)) {
 		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (!get_hwpoison_page(p, flags)) {
@@ -2028,7 +2032,7 @@  int unpoison_memory(unsigned long pfn)
 			num_poisoned_pages_dec();
 		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	lock_page(page);
@@ -2050,7 +2054,9 @@  int unpoison_memory(unsigned long pfn)
 	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
 		put_page(page);
 
-	return 0;
+unlock_mutex:
+	mutex_unlock(&mf_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(unpoison_memory);
 
@@ -2231,9 +2237,12 @@  int soft_offline_page(unsigned long pfn, int flags)
 		return -EIO;
 	}
 
+	mutex_lock(&mf_mutex);
+
 	if (PageHWPoison(page)) {
 		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
 		put_ref_page(ref_page);
+		mutex_unlock(&mf_mutex);
 		return 0;
 	}
 
@@ -2251,5 +2260,7 @@  int soft_offline_page(unsigned long pfn, int flags)
 		}
 	}
 
+	mutex_unlock(&mf_mutex);
+
 	return ret;
 }