
[RFC,v2,10/16] mm,hwpoison: Rework soft offline for free pages

Message ID 20191017142123.24245-11-osalvador@suse.de (mailing list archive)
State New, archived
Series Hwpoison rework {hard,soft}-offline

Commit Message

Oscar Salvador Oct. 17, 2019, 2:21 p.m. UTC
When trying to soft-offline a free page, we need to first take it off
the buddy allocator.
Once we know it is out of reach, we can safely flag it as poisoned.

take_page_off_buddy will be used to take a page meant to be poisoned
off the buddy allocator.
take_page_off_buddy calls break_down_buddy_pages, which splits a
higher-order page in case our page belongs to one.

Once the page is under our control, we call page_set_poison to set it
as poisoned and grab a refcount on it.

Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
 mm/memory-failure.c | 20 +++++++++++-----
 mm/page_alloc.c     | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 6 deletions(-)
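
For orientation, a condensed sketch of the flow this patch introduces for free
pages (simplified from the diff at the bottom of this page, not the literal hunk):

/* Simplified view of the new soft-offline path for free pages. */
static int soft_offline_free_page(struct page *page)
{
        /*
         * Hugetlb pages are dissolved first; then the raw page is
         * pulled out of the buddy allocator under zone->lock.
         */
        if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
                return -EBUSY;

        /* Only once the page is out of reach do we flag it. */
        page_handle_poison(page);       /* SetPageHWPoison + refcount + counter */
        return 0;
}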

Comments

Michal Hocko Oct. 18, 2019, 12:06 p.m. UTC | #1
On Thu 17-10-19 16:21:17, Oscar Salvador wrote:
[...]
> +bool take_page_off_buddy(struct page *page)
> + {
> +	struct zone *zone = page_zone(page);
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long flags;
> +	unsigned int order;
> +	bool ret = false;
> +
> +	spin_lock_irqsave(&zone->lock, flags);

What prevents the page from being allocated in the meantime? Also what about
free pages on the pcp lists? Also the page could be gone by the time you
have reached here.

> +	for (order = 0; order < MAX_ORDER; order++) {
> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> +		int buddy_order = page_order(page_head);
> +		struct free_area *area = &(zone->free_area[buddy_order]);
> +
> +		if (PageBuddy(page_head) && buddy_order >= order) {
> +			unsigned long pfn_head = page_to_pfn(page_head);
> +			int migratetype = get_pfnblock_migratetype(page_head,
> +		                                                   pfn_head);
> +
> +			del_page_from_free_area(page_head, area);
> +			break_down_buddy_pages(zone, page_head, page, 0,
> +		                               buddy_order, area, migratetype);
> +			ret = true;
> +		        break;
> +		 }
> +	}
> +	spin_unlock_irqrestore(&zone->lock, flags);
> +	return ret;
> + }
> +
> +/*
>   * Set PG_hwpoison flag if a given page is confirmed to be a free page.  This
>   * test is performed under the zone lock to prevent a race against page
>   * allocation.
> -- 
> 2.12.3
Naoya Horiguchi Oct. 21, 2019, 7:45 a.m. UTC | #2
On Thu, Oct 17, 2019 at 04:21:17PM +0200, Oscar Salvador wrote:
> When trying to soft-offline a free page, we need to first take it off
> the buddy allocator.
> Once we know it is out of reach, we can safely flag it as poisoned.
> 
> take_page_off_buddy will be used to take a page meant to be poisoned
> off the buddy allocator.
> take_page_off_buddy calls break_down_buddy_pages, which splits a
> higher-order page in case our page belongs to one.
> 
> Once the page is under our control, we call page_set_poison to set it

I guess you mean page_handle_poison here.

> as poisoned and grab a refcount on it.
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
>  mm/memory-failure.c | 20 +++++++++++-----
>  mm/page_alloc.c     | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 82 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 37b230b8cfe7..1d986580522d 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -78,6 +78,15 @@ EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
>  EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
>  EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
>  
> +extern bool take_page_off_buddy(struct page *page);
> +
> +static void page_handle_poison(struct page *page)

hwpoison is a separate idea from page poisoning, so I think it's better
to name it something like page_handle_hwpoison().

> +{
> +	SetPageHWPoison(page);
> +	page_ref_inc(page);
> +	num_poisoned_pages_inc();
> +}
> +
>  static int hwpoison_filter_dev(struct page *p)
>  {
>  	struct address_space *mapping;
> @@ -1830,14 +1839,13 @@ static int soft_offline_in_use_page(struct page *page)
>  
>  static int soft_offline_free_page(struct page *page)
>  {
> -	int rc = dissolve_free_huge_page(page);
> +	int rc = -EBUSY;
>  
> -	if (!rc) {
> -		if (set_hwpoison_free_buddy_page(page))
> -			num_poisoned_pages_inc();
> -		else
> -			rc = -EBUSY;
> +	if (!dissolve_free_huge_page(page) && take_page_off_buddy(page)) {
> +		page_handle_poison(page);
> +		rc = 0;
>  	}
> +
>  	return rc;
>  }
>  
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index cd1dd0712624..255df0c76a40 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -8632,6 +8632,74 @@ bool is_free_buddy_page(struct page *page)
>  
>  #ifdef CONFIG_MEMORY_FAILURE
>  /*
> + * Break down a higher-order page in sub-pages, and keep our target out of
> + * buddy allocator.
> + */
> +static void break_down_buddy_pages(struct zone *zone, struct page *page,
> +				   struct page *target, int low, int high,
> +				   struct free_area *area, int migratetype)
> +{
> +	unsigned long size = 1 << high;
> +	struct page *current_buddy, *next_page;
> +
> +	while (high > low) {
> +		area--;
> +		high--;
> +		size >>= 1;
> +
> +		if (target >= &page[size]) {
> +			next_page = page + size;
> +			current_buddy = page;
> +		} else {
> +			next_page = page;
> +			current_buddy = page + size;
> +		}
> +
> +		if (set_page_guard(zone, current_buddy, high, migratetype))
> +			continue;
> +
> +		if (current_buddy != target) {
> +			add_to_free_area(current_buddy, area, migratetype);
> +			set_page_order(current_buddy, high);
> +			page = next_page;
> +		}
> +	}
> +}
> +
> +/*
> + * Take a page that will be marked as poisoned off the buddy allocator.
> + */
> +bool take_page_off_buddy(struct page *page)
> + {
> +	struct zone *zone = page_zone(page);
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long flags;
> +	unsigned int order;
> +	bool ret = false;
> +
> +	spin_lock_irqsave(&zone->lock, flags);
> +	for (order = 0; order < MAX_ORDER; order++) {
> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> +		int buddy_order = page_order(page_head);
> +		struct free_area *area = &(zone->free_area[buddy_order]);
> +
> +		if (PageBuddy(page_head) && buddy_order >= order) {
> +			unsigned long pfn_head = page_to_pfn(page_head);
> +			int migratetype = get_pfnblock_migratetype(page_head,
> +		                                                   pfn_head);
> +
> +			del_page_from_free_area(page_head, area);
> +			break_down_buddy_pages(zone, page_head, page, 0,
> +		                               buddy_order, area, migratetype);
> +			ret = true;
> +		        break;

indent with whitespace?
And you can find a few more coding style warnings with checkpatch.pl.

BTW, if we consider making the unpoison mechanism keep up with the
new semantics, we will need the reverse operation of take_page_off_buddy().
Do you think that part will come as separate work?

Thanks,
Naoya Horiguchi

> +		 }
> +	}
> +	spin_unlock_irqrestore(&zone->lock, flags);
> +	return ret;
> + }
> +
> +/*
>   * Set PG_hwpoison flag if a given page is confirmed to be a free page.  This
>   * test is performed under the zone lock to prevent a race against page
>   * allocation.
> -- 
> 2.12.3
> 
>
Oscar Salvador Oct. 21, 2019, 12:58 p.m. UTC | #3
On Fri, Oct 18, 2019 at 02:06:15PM +0200, Michal Hocko wrote:
> On Thu 17-10-19 16:21:17, Oscar Salvador wrote:
> [...]
> > +bool take_page_off_buddy(struct page *page)
> > + {
> > +	struct zone *zone = page_zone(page);
> > +	unsigned long pfn = page_to_pfn(page);
> > +	unsigned long flags;
> > +	unsigned int order;
> > +	bool ret = false;
> > +
> > +	spin_lock_irqsave(&zone->lock, flags);
> 
> What prevents the page from being allocated in the meantime? Also what about
> free pages on the pcp lists? Also the page could be gone by the time you
> have reached here.

Nothing prevents the page from being allocated in the meantime.
We would just bail out and return -EBUSY to userspace.
Since we do not do __anything__ to the page until we are sure we took it off,
and it is completely isolated from the memory, there is no danger.

Since soft-offline is kinda "best effort" mode, it is something like:
"Sorry, could not poison the page, try again".

Now, thinking about this a bit more, I guess we could be more clever here
and call the routine that handles in-use pages if we see that the page
was allocated by the time we reach take_page_off_buddy.
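
Something along these lines, perhaps (a rough sketch only; it glosses over the
extra reference that soft_offline_in_use_page() normally expects its caller to
hold):

/*
 * Hypothetical fallback: if the page got allocated before we could pull
 * it off the buddy lists, treat it as an in-use page instead of failing
 * with -EBUSY.
 */
static int soft_offline_free_page(struct page *page)
{
        if (dissolve_free_huge_page(page))
                return -EBUSY;

        if (take_page_off_buddy(page)) {
                page_handle_poison(page);
                return 0;
        }

        /* Lost the race: somebody allocated the page in the meantime. */
        if (page_count(page))
                return soft_offline_in_use_page(page);

        return -EBUSY;
}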

About pcp pages, you are right.
I thought that we were already handling that case, and we do, but looking closer the
call to shake_page() (that among other things spills pcppages into buddy)
is performed at a later stage.
I think we need to adjust __get_any_page to recognize pcp pages as well.

I will do some work here.

Thanks for comments.
Michal Hocko Oct. 21, 2019, 3:41 p.m. UTC | #4
On Mon 21-10-19 14:58:49, Oscar Salvador wrote:
> On Fri, Oct 18, 2019 at 02:06:15PM +0200, Michal Hocko wrote:
> > On Thu 17-10-19 16:21:17, Oscar Salvador wrote:
> > [...]
> > > +bool take_page_off_buddy(struct page *page)
> > > + {
> > > +	struct zone *zone = page_zone(page);
> > > +	unsigned long pfn = page_to_pfn(page);
> > > +	unsigned long flags;
> > > +	unsigned int order;
> > > +	bool ret = false;
> > > +
> > > +	spin_lock_irqsave(&zone->lock, flags);
> > 
> > What prevents the page from being allocated in the meantime? Also what about
> > free pages on the pcp lists? Also the page could be gone by the time you
> > have reached here.
> 
> Nothing prevents the page from being allocated in the meantime.
> We would just bail out and return -EBUSY to userspace.
> Since we do not do __anything__ to the page until we are sure we took it off,
> and it is completely isolated from the memory, there is no danger.

Wouldn't it be better to simply check the PageBuddy state after the lock
has been taken?

> Since soft-offline is kinda "best effort" mode, it is something like:
> "Sorry, could not poison the page, try again".

Well, I would disagree here. While madvise is indeed a best effort
operation please keep in mind that the sole purpose of this interface is
to allow real MCE behavior. And that operation should better try
_really_ hard to make sure we try to recover as gracefully as possible.

> Now, thinking about this a bit more, I guess we could be more clever here
> and call the routine that handles in-use pages if we see that the page
> was allocated by the time we reach take_page_off_buddy.
> 
> About pcp pages, you are right.
> I thought that we were already handling that case, and we do, but looking closer the
> call to shake_page() (that among other things spills pcppages into buddy)
> is performed at a later stage.
> I think we need to adjust __get_any_page to recognize pcp pages as well.

Yeah, pcp pages are PITA. We cannot really recognize them now. Dropping
all pcp pages is certainly a way to go but we need to mark the page
before that happens.
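
For illustration, the "mark first, then drain" idea could look roughly like the
sketch below (hypothetical helper, not part of this series; how the free path
reacts to the mark is exactly the open question):

/*
 * Sketch: flag the page before spilling the pcp lists back into the
 * buddy allocator, then retry the normal free-page handling.
 */
static bool soft_offline_drain_pcplists(struct page *page)
{
        struct zone *zone = page_zone(page);

        SetPageHWPoison(page);          /* mark before the drain */
        drain_all_pages(zone);          /* flush pcp pages into buddy */

        if (take_page_off_buddy(page))
                return true;

        ClearPageHWPoison(page);        /* page was not free after all */
        return false;
}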
Oscar Salvador Oct. 22, 2019, 7:46 a.m. UTC | #5
On Mon, Oct 21, 2019 at 05:41:58PM +0200, Michal Hocko wrote:
> On Mon 21-10-19 14:58:49, Oscar Salvador wrote:
> > Nothing prevents the page from being allocated in the meantime.
> > We would just bail out and return -EBUSY to userspace.
> > Since we do not do __anything__ to the page until we are sure we took it off,
> > and it is completely isolated from the memory, there is no danger.
> 
> Wouldn't it be better to simply check the PageBuddy state after the lock
> has been taken?

We already do that:

bool take_page_off_buddy(struct page *page)
 {
	... 
        spin_lock_irqsave(&zone->lock, flags);
        for (order = 0; order < MAX_ORDER; order++) {
                struct page *page_head = page - (pfn & ((1 << order) - 1));
                int buddy_order = page_order(page_head);
                struct free_area *area = &(zone->free_area[buddy_order]);

                if (PageBuddy(page_head) && buddy_order >= order) {
	...
 }

Actually, we __only__ call break_down_buddy_pages() (which breaks down a higher-order page
and keeps our page out of buddy) if that is true.

> > Since soft-offline is kinda "best effort" mode, it is something like:
> > "Sorry, could not poison the page, try again".
> 
> Well, I would disagree here. While madvise is indeed a best effort
> operation please keep in mind that the sole purpose of this interface is
> to allow real MCE behavior. And that operation should better try
> _really_ hard to make sure we try to recover as gracefully as possible.

In this case, there is nothing to be recovered from.
If we wanted to soft-offline a page that was free, and then it was allocated
in the meantime, there is no harm in that as we do not flag the page
until we are sure it is under our control.
That means:

 - for free pages: was successfully taken off buddy
 - for in-use pages: was freed or migrated

So, opposite to hard-offline, in soft-offline we do not fiddle with pages
unless we are sure the page is not reachable anymore by any means.

> > Now, thinking about this a bit more, I guess we could be more clever here
> > and call the routine that handles in-use pages if we see that the page
> > was allocated by the time we reach take_page_off_buddy.
> > 
> > About pcp pages, you are right.
> > I thought that we were already handling that case, and we do, but looking closer the
> > call to shake_page() (that among other things spills pcppages into buddy)
> > is performed at a later stage.
> > I think we need to adjust __get_any_page to recognize pcp pages as well.
> 
> Yeah, pcp pages are PITA. We cannot really recognize them now. Dropping
> all pcp pages is certainly a way to go but we need to mark the page
> before that happens.

I will work on that.
Oscar Salvador Oct. 22, 2019, 8 a.m. UTC | #6
On Mon, Oct 21, 2019 at 07:45:33AM +0000, Naoya Horiguchi wrote:
> > +extern bool take_page_off_buddy(struct page *page);
> > +
> > +static void page_handle_poison(struct page *page)
> 
> hwpoison is a separate idea from page poisoning, so I think it's better
> to name it something like page_handle_hwpoison().

Yeah, that sounds better.
 
> BTW, if we consider making the unpoison mechanism keep up with the
> new semantics, we will need the reverse operation of take_page_off_buddy().
> Do you think that part will come as separate work?

Well, I am not really sure.
Since we grab a refcount in page_handle_poison, all the unpoison mechanism does
is a put_page(), which should send the page back to the buddy/pcp lists.
I did not spot any problem when testing it, but I will double check.
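
In other words, the reverse path could be as small as the sketch below
(hypothetical helper name; today unpoison_memory() is where the flag gets
cleared and the reference dropped):

/*
 * Hypothetical reverse of page_handle_poison(): clear the flag, fix up
 * the counter and drop the reference we took, which sends the page back
 * to the buddy/pcp lists through the normal free path.
 */
static void page_unhandle_poison(struct page *page)
{
        if (TestClearPageHWPoison(page))
                num_poisoned_pages_dec();
        put_page(page);
}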

Thanks Naoya.
Michal Hocko Oct. 22, 2019, 8:26 a.m. UTC | #7
On Tue 22-10-19 09:46:20, Oscar Salvador wrote:
[...]
> So, opposite to hard-offline, in soft-offline we do not fiddle with pages
> unless we are sure the page is not reachable anymore by any means.

I have to say I do not follow. Is there any _real_ reason for
soft-offline to behave differently from MCE (hard-offline)?
Oscar Salvador Oct. 22, 2019, 8:35 a.m. UTC | #8
On Tue, Oct 22, 2019 at 10:26:11AM +0200, Michal Hocko wrote:
> On Tue 22-10-19 09:46:20, Oscar Salvador wrote:
> [...]
> > So, opposite to hard-offline, in soft-offline we do not fiddle with pages
> > unless we are sure the page is not reachable anymore by any means.
> 
> I have to say I do not follow. Is there any _real_ reason for
> soft-offline to behave differently from MCE (hard-offline)?

Yes.
Do not take it as 100% true as I read that in some code/Documentation
a while ago.

But I think that it boils down to:

soft-offline: "We have seen some erros in the underlying page, but
               it is still usable, so we have a chance to keep the
               the contents (via migration)"
hard-offline: "The underlying page is dead, we cannot trust it, so
               we shut it down, killing whoever is holding it
               along the way".

Am I wrong Naoya?
Michal Hocko Oct. 22, 2019, 9:22 a.m. UTC | #9
On Tue 22-10-19 10:35:17, Oscar Salvador wrote:
> On Tue, Oct 22, 2019 at 10:26:11AM +0200, Michal Hocko wrote:
> > On Tue 22-10-19 09:46:20, Oscar Salvador wrote:
> > [...]
> > > So, opposite to hard-offline, in soft-offline we do not fiddle with pages
> > > unless we are sure the page is not reachable anymore by any means.
> > 
> > I have to say I do not follow. Is there any _real_ reason for
> > soft-offline to behave differently from MCE (hard-offline)?
> 
> Yes.
> Do not take it as 100% true as I read that in some code/Documentation
> a while ago.
> 
> But I think that it boils down to:
> 
> soft-offline: "We have seen some errors in the underlying page, but
>                it is still usable, so we have a chance to keep the
>                contents (via migration)"
> hard-offline: "The underlying page is dead, we cannot trust it, so
>                we shut it down, killing whoever is holding it
>                along the way".

Hmm, that might be a misunderstanding on my end. I thought that it is
the MCE handler that says whether the failure is recoverable or not. If yes,
then we can touch the content of the memory (which would imply the
migration). Other than that, both paths should be essentially the same,
no? Well, the unrecoverable case would essentially be a forced migration
failure path.

MADV_HWPOISON is explicitly documented to test MCE handling IIUC:
: This feature is intended for testing of memory error-handling
: code; it is available only if the kernel was configured with
: CONFIG_MEMORY_FAILURE.

There is no explicit note about the type of the error that is injected
but I think it is reasonably safe to assume this is a recoverable one.
Oscar Salvador Oct. 22, 2019, 9:58 a.m. UTC | #10
On Tue, Oct 22, 2019 at 11:22:56AM +0200, Michal Hocko wrote:
> Hmm, that might be a misunderstanding on my end. I thought that it is
> the MCE handler that says whether the failure is recoverable or not. If yes,
> then we can touch the content of the memory (which would imply the
> migration). Other than that, both paths should be essentially the same,
> no? Well, the unrecoverable case would essentially be a forced migration
> failure path.
> 
> MADV_HWPOISON is explicitly documented to test MCE handling IIUC:
> : This feature is intended for testing of memory error-handling
> : code; it is available only if the kernel was configured with
> : CONFIG_MEMORY_FAILURE.
> 
> There is no explicit note about the type of the error that is injected
> but I think it is reasonably safe to assume this is a recoverable one.

MADV_HWPOISON stands for hard-offline.
MADV_SOFT_OFFLINE stands for soft-offline.

MADV_SOFT_OFFLINE (since Linux 2.6.33)
              Soft offline the pages in the range specified by addr and
              length.  The memory of each page in the specified range is
              preserved (i.e., when next accessed, the same content will be
              visible, but in a new physical page frame), and the original
              page is offlined (i.e., no longer used, and taken out of
              normal memory management).  The effect of the
              MADV_SOFT_OFFLINE operation is invisible to (i.e., does not
              change the semantics of) the calling process.

              This feature is intended for testing of memory error-handling
              code; it is available only if the kernel was configured with
              CONFIG_MEMORY_FAILURE.
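
For reference, a minimal userspace sketch of the soft-offline side (assumes
CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE; MADV_SOFT_OFFLINE may need _GNU_SOURCE
or <linux/mman.h> depending on the libc):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long pagesz = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        p[0] = 42;      /* fault the page in so there is content to migrate */

        if (madvise(p, pagesz, MADV_SOFT_OFFLINE))
                perror("madvise(MADV_SOFT_OFFLINE)");
        else            /* same content, new physical page frame */
                printf("still readable after soft offline: %d\n", p[0]);

        return 0;
}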


But both follow different approaches.

I think it is up to some controllers to trigger soft-offline or hard-offline:

static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
{
#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
	...
        /* iff following two events can be handled properly by now */
        if (sec_sev == GHES_SEV_CORRECTED &&
            (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
                flags = MF_SOFT_OFFLINE;
        if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
                flags = 0;

        if (flags != -1)
                memory_failure_queue(pfn, flags);
	...
#endif
 }


static void memory_failure_work_func(struct work_struct *work)
{
	...
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}
 }

AFAICS, for hard-offline case, a recovered event would be if:

- the page to shut down is already free
- the page was unmapped

In some cases we need to kill the process if it holds dirty pages.

But we never migrate contents in hard-offline path.
I guess it is because we cannot really trust the contents anymore.
Michal Hocko Oct. 22, 2019, 10:24 a.m. UTC | #11
On Tue 22-10-19 11:58:52, Oscar Salvador wrote:
> On Tue, Oct 22, 2019 at 11:22:56AM +0200, Michal Hocko wrote:
> > Hmm, that might be a misunderstanding on my end. I thought that it is
> > the MCE handler that says whether the failure is recoverable or not. If yes,
> > then we can touch the content of the memory (which would imply the
> > migration). Other than that, both paths should be essentially the same,
> > no? Well, the unrecoverable case would essentially be a forced migration
> > failure path.
> > 
> > MADV_HWPOISON is explicitly documented to test MCE handling IIUC:
> > : This feature is intended for testing of memory error-handling
> > : code; it is available only if the kernel was configured with
> > : CONFIG_MEMORY_FAILURE.
> > 
> > There is no explicit note about the type of the error that is injected
> > but I think it is reasonably safe to assume this is a recoverable one.
> 
> MADV_HWPOISON stands for hard-offline.
> MADV_SOFT_OFFLINE stands for soft-offline.
> 
> MADV_SOFT_OFFLINE (since Linux 2.6.33)
>               Soft offline the pages in the range specified by addr and
>               length.  The memory of each page in the specified range is
>               preserved (i.e., when next accessed, the same content will be
>               visible, but in a new physical page frame), and the original
>               page is offlined (i.e., no longer used, and taken out of
>               normal memory management).  The effect of the
>               MADV_SOFT_OFFLINE operation is invisible to (i.e., does not
>               change the semantics of) the calling process.
> 
>               This feature is intended for testing of memory error-handling
>               code; it is available only if the kernel was configured with
>               CONFIG_MEMORY_FAILURE.

I had missed that one somehow. Thanks for pointing it out.

[...]

> AFAICS, for hard-offline case, a recovered event would be if:
> 
> - the page to shut down is already free
> - the page was unmapped
> 
> In some cases we need to kill the process if it holds dirty pages.

Yes, I would expect that the page table would be poisoned and the
process would receive a SIGBUS when accessing that memory.
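
(That is what already happens for hwpoison swap entries: the fault path fails
the access with VM_FAULT_HWPOISON and the architecture fault handler delivers a
SIGBUS. A heavily simplified sketch of that check, modelled on the swap-entry
handling in do_swap_page():)

/*
 * Simplified sketch of how a poisoned page-table entry surfaces as
 * SIGBUS: a hwpoison swap entry fails the fault with VM_FAULT_HWPOISON.
 */
static vm_fault_t fault_on_poisoned_pte(swp_entry_t entry)
{
        if (non_swap_entry(entry) && is_hwpoison_entry(entry))
                return VM_FAULT_HWPOISON;

        return 0;       /* otherwise the normal swap-in path continues */
}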

> But we never migrate contents in hard-offline path.
> I guess it is because we cannot really trust the contents anymore.

Yes, that makes perfect sense. What I am saying is that the migration
(aka trying to recover) is the main and only difference. The soft
offline should poison page tables when not able to migrate as well,
IIUC.
Oscar Salvador Oct. 22, 2019, 10:33 a.m. UTC | #12
On Tue, Oct 22, 2019 at 12:24:57PM +0200, Michal Hocko wrote:
> Yes, that makes perfect sense. What I am saying is that the migration
> (aka trying to recover) is the main and only difference. The soft
> offline should poison page tables when not able to migrate as well,
> IIUC.

Yeah, I see your point.
I do not really know why soft-offline strove so hard to leave the page
untouched unless it was able to contain the problem.

Note that if we now start poisoning pages even when we could not
contain them (in soft-offline mode), that is a big and visible
change for userspace.

Not saying it is wrong, but something to consider.

Anyway, I would like to put that aside as a follow-up
rework after this one, as this one already changes quite
a few things.
Naoya Horiguchi Oct. 23, 2019, 2:01 a.m. UTC | #13
On Tue, Oct 22, 2019 at 11:58:52AM +0200, Oscar Salvador wrote:
> On Tue, Oct 22, 2019 at 11:22:56AM +0200, Michal Hocko wrote:
> > Hmm, that might be a misunderstanding on my end. I thought that it is
> > the MCE handler that says whether the failure is recoverable or not. If yes,
> > then we can touch the content of the memory (which would imply the
> > migration). Other than that, both paths should be essentially the same,
> > no? Well, the unrecoverable case would essentially be a forced migration
> > failure path.
> > 
> > MADV_HWPOISON is explicitly documented to test MCE handling IIUC:
> > : This feature is intended for testing of memory error-handling
> > : code; it is available only if the kernel was configured with
> > : CONFIG_MEMORY_FAILURE.
> > 
> > There is no explicit note about the type of the error that is injected
> > but I think it is reasonably safe to assume this is a recoverable one.
> 
> MADV_HWPOISON stands for hard-offline.
> MADV_SOFT_OFFLINE stands for soft-offline.

Maybe MADV_HWPOISON should have been named something like MADV_HARD_OFFLINE,
although it's an API and hard to change once implemented.

> 
> MADV_SOFT_OFFLINE (since Linux 2.6.33)
>               Soft offline the pages in the range specified by addr and
>               length.  The memory of each page in the specified range is
>               preserved (i.e., when next accessed, the same content will be
>               visible, but in a new physical page frame), and the original
>               page is offlined (i.e., no longer used, and taken out of
>               normal memory management).  The effect of the
>               MADV_SOFT_OFFLINE operation is invisible to (i.e., does not
>               change the semantics of) the calling process.
> 
>               This feature is intended for testing of memory error-handling
>               code;

Although this wording might not be clear enough, madvise(MADV_HWPOISON or
MADV_SOFT_OFFLINE) only covers the memory error handling part, not the MCE
handling part.  We have some other injection methods in the lower layers like
mce-inject and APEI.

> it is available only if the kernel was configured with
>               CONFIG_MEMORY_FAILURE.
> 
> 
> But both follow different approaches.
> 
> I think it is up to some controllers to trigger soft-offline or hard-offline:

Yes, I think so.  One use case of soft offline is being triggered by the CMCI
interrupt on Intel CPUs.  The CMCI handler stores corrected error events in
/dev/mcelog.  mcelogd polls on this device file, and if corrected errors occur
often enough (IIRC the default threshold is "10 events in 24 hours"), mcelogd
triggers soft-offline via soft_offline_page under /sys.

OTOH, hard-offline is triggered directly (more precisely, over a ring buffer to a
separate context) from the MCE handler.  mcelogd logs MCE events but is not
involved in the page offline logic.
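
For completeness, a sketch of that sysfs trigger (the file takes a physical
address and needs root; treat this as an illustration, not a tested tool):

#include <stdio.h>

int main(void)
{
        /* Example physical address of the page to soft-offline. */
        unsigned long long paddr = 0x1234000ULL;
        FILE *f = fopen("/sys/devices/system/memory/soft_offline_page", "w");

        if (!f) {
                perror("soft_offline_page");
                return 1;
        }
        fprintf(f, "0x%llx\n", paddr);
        return fclose(f) ? 1 : 0;
}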

> 
> static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
> {
> #ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
> 	...
>         /* iff following two events can be handled properly by now */
>         if (sec_sev == GHES_SEV_CORRECTED &&
>             (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
>                 flags = MF_SOFT_OFFLINE;
>         if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>                 flags = 0;
> 
>         if (flags != -1)
>                 memory_failure_queue(pfn, flags);
> 	...
> #endif
>  }
> 
> 
> static void memory_failure_work_func(struct work_struct *work)
> {
> 	...
> 	for (;;) {
> 		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
> 		gotten = kfifo_get(&mf_cpu->fifo, &entry);
> 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
> 		if (!gotten)
> 			break;
> 		if (entry.flags & MF_SOFT_OFFLINE)
> 			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
> 		else
> 			memory_failure(entry.pfn, entry.flags);
> 	}
>  }
> 
> AFAICS, for hard-offline case, a recovered event would be if:
> 
> - the page to shut down is already free
> - the page was unmapped
> 
> In some cases we need to kill the process if it holds dirty pages.

One caveat is that even if the process maps dirty error pages, we
don't have to kill it unless the error data is consumed.

Thanks,
Naoya Horiguchi
Naoya Horiguchi Oct. 23, 2019, 2:15 a.m. UTC | #14
On Tue, Oct 22, 2019 at 12:33:25PM +0200, Oscar Salvador wrote:
> On Tue, Oct 22, 2019 at 12:24:57PM +0200, Michal Hocko wrote:
> > Yes, that makes perfect sense. What I am saying is that the migration
> > (aka trying to recover) is the main and only difference. The soft
> > offline should poison page tables when not able to migrate as well,
> > IIUC.
> 
> Yeah, I see your point.
> I do not really know why soft-offline strove so hard to leave the page
> untouched unless it was able to contain the problem.
> 
> Note that if we now start poisoning pages even when we could not
> contain them (in soft-offline mode), that is a big and visible
> change for userspace.

Soft offline is declared never to disrupt userspace by design,
so if we poisoned page tables on migration failure, we could break this
and send SIGBUSes.  Then end users would complain that their processes
are being killed by corrected (and thus non-urgent) errors.

Thanks,
Naoya Horiguchi

Patch

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 37b230b8cfe7..1d986580522d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -78,6 +78,15 @@  EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
 
+extern bool take_page_off_buddy(struct page *page);
+
+static void page_handle_poison(struct page *page)
+{
+	SetPageHWPoison(page);
+	page_ref_inc(page);
+	num_poisoned_pages_inc();
+}
+
 static int hwpoison_filter_dev(struct page *p)
 {
 	struct address_space *mapping;
@@ -1830,14 +1839,13 @@  static int soft_offline_in_use_page(struct page *page)
 
 static int soft_offline_free_page(struct page *page)
 {
-	int rc = dissolve_free_huge_page(page);
+	int rc = -EBUSY;
 
-	if (!rc) {
-		if (set_hwpoison_free_buddy_page(page))
-			num_poisoned_pages_inc();
-		else
-			rc = -EBUSY;
+	if (!dissolve_free_huge_page(page) && take_page_off_buddy(page)) {
+		page_handle_poison(page);
+		rc = 0;
 	}
+
 	return rc;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cd1dd0712624..255df0c76a40 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8632,6 +8632,74 @@  bool is_free_buddy_page(struct page *page)
 
 #ifdef CONFIG_MEMORY_FAILURE
 /*
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
+ */
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+				   struct page *target, int low, int high,
+				   struct free_area *area, int migratetype)
+{
+	unsigned long size = 1 << high;
+	struct page *current_buddy, *next_page;
+
+	while (high > low) {
+		area--;
+		high--;
+		size >>= 1;
+
+		if (target >= &page[size]) {
+			next_page = page + size;
+			current_buddy = page;
+		} else {
+			next_page = page;
+			current_buddy = page + size;
+		}
+
+		if (set_page_guard(zone, current_buddy, high, migratetype))
+			continue;
+
+		if (current_buddy != target) {
+			add_to_free_area(current_buddy, area, migratetype);
+			set_page_order(current_buddy, high);
+			page = next_page;
+		}
+	}
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
+ {
+	struct zone *zone = page_zone(page);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	unsigned int order;
+	bool ret = false;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (order = 0; order < MAX_ORDER; order++) {
+		struct page *page_head = page - (pfn & ((1 << order) - 1));
+		int buddy_order = page_order(page_head);
+		struct free_area *area = &(zone->free_area[buddy_order]);
+
+		if (PageBuddy(page_head) && buddy_order >= order) {
+			unsigned long pfn_head = page_to_pfn(page_head);
+			int migratetype = get_pfnblock_migratetype(page_head,
+		                                                   pfn_head);
+
+			del_page_from_free_area(page_head, area);
+			break_down_buddy_pages(zone, page_head, page, 0,
+		                               buddy_order, area, migratetype);
+			ret = true;
+		        break;
+		 }
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
+ }
+
+/*
  * Set PG_hwpoison flag if a given page is confirmed to be a free page.  This
  * test is performed under the zone lock to prevent a race against page
  * allocation.