[v7,3/6] mce: fix set_mce_nospec to always unmap the whole page

Message ID 20220405194747.2386619-4-jane.chu@oracle.com (mailing list archive)
State New, archived
Series DAX poison recovery

Commit Message

Jane Chu April 5, 2022, 7:47 p.m. UTC
The set_memory_uc() approach doesn't work well in all cases.
For example, when "The VMM unmapped the bad page from guest
physical space and passed the machine check to the guest."
"The guest gets virtual #MC on an access to that page.
 When the guest tries to do set_memory_uc() and instructs
 cpa_flush() to do clean caches that results in taking another
 fault / exception perhaps because the VMM unmapped the page
 from the guest."

Since the driver has special knowledge to handle NP or UC,
let's mark the poisoned page with NP and let driver handle it
when it comes down to repair.

Please refer to discussions here for more details.
https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/

Now since poisoned page is marked as not-present, in order to
avoid writing to a 'np' page and trigger kernel Oops, also fix
pmem_do_write().

Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 arch/x86/kernel/cpu/mce/core.c |  6 +++---
 arch/x86/mm/pat/set_memory.c   | 18 ++++++------------
 drivers/nvdimm/pmem.c          | 31 +++++++------------------------
 include/linux/set_memory.h     |  4 ++--
 4 files changed, 18 insertions(+), 41 deletions(-)

Comments

Christoph Hellwig April 6, 2022, 5:02 a.m. UTC | #1
Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
Dan Williams April 11, 2022, 11:27 p.m. UTC | #2
On Tue, Apr 5, 2022 at 12:48 PM Jane Chu <jane.chu@oracle.com> wrote:
>
> The set_memory_uc() approach doesn't work well in all cases.
> For example, when "The VMM unmapped the bad page from guest
> physical space and passed the machine check to the guest."
> "The guest gets virtual #MC on an access to that page.
>  When the guest tries to do set_memory_uc() and instructs
>  cpa_flush() to do clean caches that results in taking another
>  fault / exception perhaps because the VMM unmapped the page
>  from the guest."
>
> Since the driver has special knowledge to handle NP or UC,

I think a patch is needed before this one to make this statement true? I.e.:

diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index ee8d9973f60b..11641f55025a 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -32,6 +32,7 @@ static int nfit_handle_mce(struct notifier_block
*nb, unsigned long val,
         */
        mutex_lock(&acpi_desc_lock);
        list_for_each_entry(acpi_desc, &acpi_descs, list) {
+               unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc);
                struct device *dev = acpi_desc->dev;
                int found_match = 0;

@@ -63,8 +64,7 @@ static int nfit_handle_mce(struct notifier_block
*nb, unsigned long val,

                /* If this fails due to an -ENOMEM, there is little we can do */
                nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
-                               ALIGN(mce->addr, L1_CACHE_BYTES),
-                               L1_CACHE_BYTES);
+                                       ALIGN(mce->addr, align), align);
                nvdimm_region_notify(nfit_spa->nd_region,
                                NVDIMM_REVALIDATE_POISON);
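
(For context, a side note that is not part of the posted patch: MCI_MISC_ADDR_LSB()
extracts the low bits of MCi_MISC, which architecturally report the position of the
least significant valid bit of the recorded address, so the shift in the hunk above
recovers the granularity of the reported error:)

	unsigned int lsb = MCI_MISC_ADDR_LSB(mce->misc); /* e.g. 6 or 12 */
	unsigned long align = 1UL << lsb;                /* 0x40 or 0x1000 */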


> let's mark the poisoned page with NP and let driver handle it
> when it comes down to repair.
>
> Please refer to discussions here for more details.
> https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/
>
> Now since poisoned page is marked as not-present, in order to
> avoid writing to a 'np' page and trigger kernel Oops, also fix
> pmem_do_write().
>
> Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>  arch/x86/kernel/cpu/mce/core.c |  6 +++---
>  arch/x86/mm/pat/set_memory.c   | 18 ++++++------------
>  drivers/nvdimm/pmem.c          | 31 +++++++------------------------
>  include/linux/set_memory.h     |  4 ++--
>  4 files changed, 18 insertions(+), 41 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 981496e6bc0e..fa67bb9d1afe 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
>
>         pfn = mce->addr >> PAGE_SHIFT;
>         if (!memory_failure(pfn, 0)) {
> -               set_mce_nospec(pfn, whole_page(mce));
> +               set_mce_nospec(pfn);
>                 mce->kflags |= MCE_HANDLED_UC;
>         }
>
> @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb)
>
>         ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
>         if (!ret) {
> -               set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
> +               set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>                 sync_core();
>                 return;
>         }
> @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb)
>         p->mce_count = 0;
>         pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
>         if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
> -               set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
> +               set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>  }
>
>  static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 93dde949f224..404ffcb3f2cb 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -1926,13 +1926,8 @@ int set_memory_wb(unsigned long addr, int numpages)
>  EXPORT_SYMBOL(set_memory_wb);
>
>  #ifdef CONFIG_X86_64
> -/*
> - * Prevent speculative access to the page by either unmapping
> - * it (if we do not require access to any part of the page) or
> - * marking it uncacheable (if we want to try to retrieve data
> - * from non-poisoned lines in the page).
> - */
> -int set_mce_nospec(unsigned long pfn, bool unmap)
> +/* Prevent speculative access to a page by marking it not-present */
> +int set_mce_nospec(unsigned long pfn)
>  {
>         unsigned long decoy_addr;
>         int rc;
> @@ -1954,10 +1949,7 @@ int set_mce_nospec(unsigned long pfn, bool unmap)
>          */
>         decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
>
> -       if (unmap)
> -               rc = set_memory_np(decoy_addr, 1);
> -       else
> -               rc = set_memory_uc(decoy_addr, 1);
> +       rc = set_memory_np(decoy_addr, 1);
>         if (rc)
>                 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
>         return rc;
> @@ -1966,7 +1958,9 @@ int set_mce_nospec(unsigned long pfn, bool unmap)
>  /* Restore full speculative operation to the pfn. */
>  int clear_mce_nospec(unsigned long pfn)
>  {
> -       return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
> +       unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
> +
> +       return change_page_attr_set(&addr, 1, __pgprot(_PAGE_PRESENT), 0);

This probably warrants a set_memory_present() helper.
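
(For illustration only, a minimal sketch of such a helper, assuming it lives next
to the other set_memory_*() routines in set_memory.c and wraps the same
change_page_attr_set() call used above; this is not part of the posted patch:)

	static int set_memory_present(unsigned long *addr, int numpages)
	{
		return change_page_attr_set(addr, numpages,
					    __pgprot(_PAGE_PRESENT), 0);
	}

	/* clear_mce_nospec() would then become: */
	int clear_mce_nospec(unsigned long pfn)
	{
		unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);

		return set_memory_present(&addr, 1);
	}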

>  }
>  EXPORT_SYMBOL_GPL(clear_mce_nospec);
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 58d95242a836..30c71a68175b 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -158,36 +158,19 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem,
>                         struct page *page, unsigned int page_off,
>                         sector_t sector, unsigned int len)
>  {
> -       blk_status_t rc = BLK_STS_OK;
> -       bool bad_pmem = false;
>         phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
>         void *pmem_addr = pmem->virt_addr + pmem_off;
>
> -       if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
> -               bad_pmem = true;
> +       if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
> +               blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len);
>
> -       /*
> -        * Note that we write the data both before and after
> -        * clearing poison.  The write before clear poison
> -        * handles situations where the latest written data is
> -        * preserved and the clear poison operation simply marks
> -        * the address range as valid without changing the data.
> -        * In this case application software can assume that an
> -        * interrupted write will either return the new good
> -        * data or an error.
> -        *
> -        * However, if pmem_clear_poison() leaves the data in an
> -        * indeterminate state we need to perform the write
> -        * after clear poison.
> -        */
> +               if (rc != BLK_STS_OK)
> +                       pr_warn_ratelimited("%s: failed to clear poison\n", __func__);

This should be either "dev_warn_ratelimited(to_dev(pmem), ...", or a
trace point similar to trace_block_rq_complete() that tells userspace
about adverse I/O completion results.

However, that's probably a discussion for another patch, so I would
just drop this new addition for now and we can discuss the logging in
a follow-on patch.
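
(A hedged sketch of the dev_warn_ratelimited() variant mentioned above, reusing
pmem's existing to_dev() helper; the message text is illustrative only and, per
the comment above, would belong in a follow-on patch rather than this one:)

	if (rc != BLK_STS_OK) {
		dev_warn_ratelimited(to_dev(pmem),
				     "write: failed to clear poison\n");
		return rc;
	}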


> +                       return rc;
> +       }
>         flush_dcache_page(page);
>         write_pmem(pmem_addr, page, page_off, len);
> -       if (unlikely(bad_pmem)) {
> -               rc = pmem_clear_poison(pmem, pmem_off, len);
> -               write_pmem(pmem_addr, page, page_off, len);
> -       }
> -
> -       return rc;
> +       return BLK_STS_OK;
>  }
>
>  static void pmem_submit_bio(struct bio *bio)
> diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
> index d6263d7afb55..cde2d8687a7b 100644
> --- a/include/linux/set_memory.h
> +++ b/include/linux/set_memory.h
> @@ -43,10 +43,10 @@ static inline bool can_set_direct_map(void)
>  #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
>
>  #ifdef CONFIG_X86_64
> -int set_mce_nospec(unsigned long pfn, bool unmap);
> +int set_mce_nospec(unsigned long pfn);
>  int clear_mce_nospec(unsigned long pfn);
>  #else
> -static inline int set_mce_nospec(unsigned long pfn, bool unmap)
> +static inline int set_mce_nospec(unsigned long pfn)

Looks like after this change the "whole_page()" helper can be deleted
and the ->mce_whole_page flag in the task_struct can also go, right?
In a follow-on of course.

Other than those small issues, this looks good!
Borislav Petkov April 12, 2022, 10:07 a.m. UTC | #3
On Tue, Apr 05, 2022 at 01:47:44PM -0600, Jane Chu wrote:
> The set_memory_uc() approach doesn't work well in all cases.
> For example, when "The VMM unmapped the bad page from guest
> physical space and passed the machine check to the guest."
> "The guest gets virtual #MC on an access to that page.
>  When the guest tries to do set_memory_uc() and instructs
>  cpa_flush() to do clean caches that results in taking another
>  fault / exception perhaps because the VMM unmapped the page
>  from the guest."

I presume this is quoting someone...

> Since the driver has special knowledge to handle NP or UC,
> let's mark the poisoned page with NP and let driver handle it

s/let's mark/mark/

> when it comes down to repair.
> 
> Please refer to discussions here for more details.
> https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/
> 
> Now since poisoned page is marked as not-present, in order to
> avoid writing to a 'np' page and trigger kernel Oops, also fix

You can write it out: "non-present page..."

> pmem_do_write().
> 
> Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>  arch/x86/kernel/cpu/mce/core.c |  6 +++---
>  arch/x86/mm/pat/set_memory.c   | 18 ++++++------------
>  drivers/nvdimm/pmem.c          | 31 +++++++------------------------
>  include/linux/set_memory.h     |  4 ++--
>  4 files changed, 18 insertions(+), 41 deletions(-)

For such mixed subsystem patches we probably should talk about who picks
them up, eventually...

> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 981496e6bc0e..fa67bb9d1afe 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
>  
>  	pfn = mce->addr >> PAGE_SHIFT;
>  	if (!memory_failure(pfn, 0)) {
> -		set_mce_nospec(pfn, whole_page(mce));
> +		set_mce_nospec(pfn);
>  		mce->kflags |= MCE_HANDLED_UC;
>  	}
>  
> @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb)
>  
>  	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
>  	if (!ret) {
> -		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
> +		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>  		sync_core();
>  		return;
>  	}
> @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb)
>  	p->mce_count = 0;
>  	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
>  	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
> -		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
> +		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>  }

Both that ->mce_whole_page and whole_page() look unused after this.
Jane Chu April 13, 2022, 11:36 p.m. UTC | #4
On 4/11/2022 4:27 PM, Dan Williams wrote:
> On Tue, Apr 5, 2022 at 12:48 PM Jane Chu <jane.chu@oracle.com> wrote:
>>
>> The set_memory_uc() approach doesn't work well in all cases.
>> For example, when "The VMM unmapped the bad page from guest
>> physical space and passed the machine check to the guest."
>> "The guest gets virtual #MC on an access to that page.
>>   When the guest tries to do set_memory_uc() and instructs
>>   cpa_flush() to do clean caches that results in taking another
>>   fault / exception perhaps because the VMM unmapped the page
>>   from the guest."
>>
>> Since the driver has special knowledge to handle NP or UC,
> 
> I think a patch is needed before this one to make this statement true? I.e.:
> 
> diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
> index ee8d9973f60b..11641f55025a 100644
> --- a/drivers/acpi/nfit/mce.c
> +++ b/drivers/acpi/nfit/mce.c
> @@ -32,6 +32,7 @@ static int nfit_handle_mce(struct notifier_block
> *nb, unsigned long val,
>           */
>          mutex_lock(&acpi_desc_lock);
>          list_for_each_entry(acpi_desc, &acpi_descs, list) {
> +               unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc);
>                  struct device *dev = acpi_desc->dev;
>                  int found_match = 0;
> 
> @@ -63,8 +64,7 @@ static int nfit_handle_mce(struct notifier_block
> *nb, unsigned long val,
> 
>                  /* If this fails due to an -ENOMEM, there is little we can do */
>                  nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
> -                               ALIGN(mce->addr, L1_CACHE_BYTES),
> -                               L1_CACHE_BYTES);
> +                                       ALIGN(mce->addr, align), align);
>                  nvdimm_region_notify(nfit_spa->nd_region,
>                                  NVDIMM_REVALIDATE_POISON);
> 

Dan, I tried the above change, and this is what I got after injecting 8 
back-to-back poisons, then read them and received  SIGBUS/BUS_MCEERR_AR, 
then repair via the v7 patch which works until this change is added.

[ 6240.955331] nfit ACPI0012:00: XXX, align = 100
[ 6240.960300] nfit ACPI0012:00: XXX, ALIGN(mce->addr, 
L1_CACHE_BYTES)=1851600400, L1_CACHE_BYTES=40, ALIGN(mce->addr, 
align)=1851600400
[..]
[ 6242.052277] nfit ACPI0012:00: XXX, align = 100
[ 6242.057243] nfit ACPI0012:00: XXX, ALIGN(mce->addr, 
L1_CACHE_BYTES)=1851601000, L1_CACHE_BYTES=40, ALIGN(mce->addr, 
align)=1851601000
[..]
[ 6244.917198] nfit ACPI0012:00: XXX, align = 1000
[ 6244.922258] nfit ACPI0012:00: XXX, ALIGN(mce->addr, 
L1_CACHE_BYTES)=1851601200, L1_CACHE_BYTES=40, ALIGN(mce->addr, 
align)=1851602000
[..]

All 8 poisons remain uncleared.

Without further investigation, I don't know why the failure.
Could we mark this change to a follow-on task?
The driver knows a lot about how to clear poisons besides hardcoding 
poison alignment to 0x40 bytes.

> 
>> let's mark the poisoned page with NP and let driver handle it
>> when it comes down to repair.
>>
>> Please refer to discussions here for more details.
>> https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/
>>
>> Now since poisoned page is marked as not-present, in order to
>> avoid writing to a 'np' page and trigger kernel Oops, also fix
>> pmem_do_write().
>>
>> Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>> ---
>>   arch/x86/kernel/cpu/mce/core.c |  6 +++---
>>   arch/x86/mm/pat/set_memory.c   | 18 ++++++------------
>>   drivers/nvdimm/pmem.c          | 31 +++++++------------------------
>>   include/linux/set_memory.h     |  4 ++--
>>   4 files changed, 18 insertions(+), 41 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>> index 981496e6bc0e..fa67bb9d1afe 100644
>> --- a/arch/x86/kernel/cpu/mce/core.c
>> +++ b/arch/x86/kernel/cpu/mce/core.c
>> @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
>>
>>          pfn = mce->addr >> PAGE_SHIFT;
>>          if (!memory_failure(pfn, 0)) {
>> -               set_mce_nospec(pfn, whole_page(mce));
>> +               set_mce_nospec(pfn);
>>                  mce->kflags |= MCE_HANDLED_UC;
>>          }
>>
>> @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb)
>>
>>          ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
>>          if (!ret) {
>> -               set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
>> +               set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>>                  sync_core();
>>                  return;
>>          }
>> @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb)
>>          p->mce_count = 0;
>>          pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
>>          if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
>> -               set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
>> +               set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>>   }
>>
>>   static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
>> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
>> index 93dde949f224..404ffcb3f2cb 100644
>> --- a/arch/x86/mm/pat/set_memory.c
>> +++ b/arch/x86/mm/pat/set_memory.c
>> @@ -1926,13 +1926,8 @@ int set_memory_wb(unsigned long addr, int numpages)
>>   EXPORT_SYMBOL(set_memory_wb);
>>
>>   #ifdef CONFIG_X86_64
>> -/*
>> - * Prevent speculative access to the page by either unmapping
>> - * it (if we do not require access to any part of the page) or
>> - * marking it uncacheable (if we want to try to retrieve data
>> - * from non-poisoned lines in the page).
>> - */
>> -int set_mce_nospec(unsigned long pfn, bool unmap)
>> +/* Prevent speculative access to a page by marking it not-present */
>> +int set_mce_nospec(unsigned long pfn)
>>   {
>>          unsigned long decoy_addr;
>>          int rc;
>> @@ -1954,10 +1949,7 @@ int set_mce_nospec(unsigned long pfn, bool unmap)
>>           */
>>          decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
>>
>> -       if (unmap)
>> -               rc = set_memory_np(decoy_addr, 1);
>> -       else
>> -               rc = set_memory_uc(decoy_addr, 1);
>> +       rc = set_memory_np(decoy_addr, 1);
>>          if (rc)
>>                  pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
>>          return rc;
>> @@ -1966,7 +1958,9 @@ int set_mce_nospec(unsigned long pfn, bool unmap)
>>   /* Restore full speculative operation to the pfn. */
>>   int clear_mce_nospec(unsigned long pfn)
>>   {
>> -       return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
>> +       unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
>> +
>> +       return change_page_attr_set(&addr, 1, __pgprot(_PAGE_PRESENT), 0);
> 
> This probably warrants a set_memory_present() helper.

I had a set_memory_present() helper in an earlier version, but there is 
no other caller, and, if I'm not mis-remembering, Christoph had suggested 
simplifying the code, so I agreed.  If you feel strongly about adding the 
helper back, I can do that to improve readability.

> 
>>   }
>>   EXPORT_SYMBOL_GPL(clear_mce_nospec);
>>
>> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
>> index 58d95242a836..30c71a68175b 100644
>> --- a/drivers/nvdimm/pmem.c
>> +++ b/drivers/nvdimm/pmem.c
>> @@ -158,36 +158,19 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem,
>>                          struct page *page, unsigned int page_off,
>>                          sector_t sector, unsigned int len)
>>   {
>> -       blk_status_t rc = BLK_STS_OK;
>> -       bool bad_pmem = false;
>>          phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
>>          void *pmem_addr = pmem->virt_addr + pmem_off;
>>
>> -       if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
>> -               bad_pmem = true;
>> +       if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
>> +               blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len);
>>
>> -       /*
>> -        * Note that we write the data both before and after
>> -        * clearing poison.  The write before clear poison
>> -        * handles situations where the latest written data is
>> -        * preserved and the clear poison operation simply marks
>> -        * the address range as valid without changing the data.
>> -        * In this case application software can assume that an
>> -        * interrupted write will either return the new good
>> -        * data or an error.
>> -        *
>> -        * However, if pmem_clear_poison() leaves the data in an
>> -        * indeterminate state we need to perform the write
>> -        * after clear poison.
>> -        */
>> +               if (rc != BLK_STS_OK)
>> +                       pr_warn_ratelimited("%s: failed to clear poison\n", __func__);
> 
> This should be either "dev_warn_ratelimited(to_dev(pmem), ...", or a
> trace point similar to trace_block_rq_complete() that tells userspace
> about adverse I/O completion results.
> 
> However, that's probably a discussion for another patch, so I would
> just drop this new addition for now and we can discuss the logging in
> a follow-on patch.

Okay, I'll drop the warning message.  If the unexpected pathological 
scenario happens, the user will see an I/O failure.

> 
> 
>> +                       return rc;
>> +       }
>>          flush_dcache_page(page);
>>          write_pmem(pmem_addr, page, page_off, len);
>> -       if (unlikely(bad_pmem)) {
>> -               rc = pmem_clear_poison(pmem, pmem_off, len);
>> -               write_pmem(pmem_addr, page, page_off, len);
>> -       }
>> -
>> -       return rc;
>> +       return BLK_STS_OK;
>>   }
>>
>>   static void pmem_submit_bio(struct bio *bio)
>> diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
>> index d6263d7afb55..cde2d8687a7b 100644
>> --- a/include/linux/set_memory.h
>> +++ b/include/linux/set_memory.h
>> @@ -43,10 +43,10 @@ static inline bool can_set_direct_map(void)
>>   #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
>>
>>   #ifdef CONFIG_X86_64
>> -int set_mce_nospec(unsigned long pfn, bool unmap);
>> +int set_mce_nospec(unsigned long pfn);
>>   int clear_mce_nospec(unsigned long pfn);
>>   #else
>> -static inline int set_mce_nospec(unsigned long pfn, bool unmap)
>> +static inline int set_mce_nospec(unsigned long pfn)
> 
> Looks like after this change the "whole_page()" helper can be deleted
> and the ->mce_whole_page flag in the task_struct can also go, right?
> In a follow-on of course.

Okay, will do in a follow-on patch.

> 
> Other than those small issues, this looks good!

thanks!
-jane
Jane Chu April 13, 2022, 11:41 p.m. UTC | #5
On 4/12/2022 3:07 AM, Borislav Petkov wrote:
> On Tue, Apr 05, 2022 at 01:47:44PM -0600, Jane Chu wrote:
>> The set_memory_uc() approach doesn't work well in all cases.
>> For example, when "The VMM unmapped the bad page from guest
>> physical space and passed the machine check to the guest."
>> "The guest gets virtual #MC on an access to that page.
>>   When the guest tries to do set_memory_uc() and instructs
>>   cpa_flush() to do clean caches that results in taking another
>>   fault / exception perhaps because the VMM unmapped the page
>>   from the guest."
> 
> I presume this is quoting someone...

Yes, will mention Dan's name next time.

> 
>> Since the driver has special knowledge to handle NP or UC,
>> let's mark the poisoned page with NP and let driver handle it
> 
> s/let's mark/mark/
> 

Okay.

>> when it comes down to repair.
>>
>> Please refer to discussions here for more details.
>> https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/
>>
>> Now since poisoned page is marked as not-present, in order to
>> avoid writing to a 'np' page and trigger kernel Oops, also fix
> 
> You can write it out: "non-present page..."

Okay.

> 
>> pmem_do_write().
>>
>> Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>> ---
>>   arch/x86/kernel/cpu/mce/core.c |  6 +++---
>>   arch/x86/mm/pat/set_memory.c   | 18 ++++++------------
>>   drivers/nvdimm/pmem.c          | 31 +++++++------------------------
>>   include/linux/set_memory.h     |  4 ++--
>>   4 files changed, 18 insertions(+), 41 deletions(-)
> 
> For such mixed subsystem patches we probably should talk about who picks
> them up, eventually...

Hmm, maintainers' action item?

> 
>> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>> index 981496e6bc0e..fa67bb9d1afe 100644
>> --- a/arch/x86/kernel/cpu/mce/core.c
>> +++ b/arch/x86/kernel/cpu/mce/core.c
>> @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
>>   
>>   	pfn = mce->addr >> PAGE_SHIFT;
>>   	if (!memory_failure(pfn, 0)) {
>> -		set_mce_nospec(pfn, whole_page(mce));
>> +		set_mce_nospec(pfn);
>>   		mce->kflags |= MCE_HANDLED_UC;
>>   	}
>>   
>> @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb)
>>   
>>   	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
>>   	if (!ret) {
>> -		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
>> +		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>>   		sync_core();
>>   		return;
>>   	}
>> @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb)
>>   	p->mce_count = 0;
>>   	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
>>   	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
>> -		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
>> +		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
>>   }
> 
> Both that ->mce_whole_page and whole_page() look unused after this.
> 

Yes, indeed, Dan also noticed; will remove whole_page() in a follow-on 
patch.

thanks,
-jane
Dan Williams April 14, 2022, 2:32 a.m. UTC | #6
On Wed, Apr 13, 2022 at 4:36 PM Jane Chu <jane.chu@oracle.com> wrote:
>
> On 4/11/2022 4:27 PM, Dan Williams wrote:
> > On Tue, Apr 5, 2022 at 12:48 PM Jane Chu <jane.chu@oracle.com> wrote:
> >>
> >> The set_memory_uc() approach doesn't work well in all cases.
> >> For example, when "The VMM unmapped the bad page from guest
> >> physical space and passed the machine check to the guest."
> >> "The guest gets virtual #MC on an access to that page.
> >>   When the guest tries to do set_memory_uc() and instructs
> >>   cpa_flush() to do clean caches that results in taking another
> >>   fault / exception perhaps because the VMM unmapped the page
> >>   from the guest."
> >>
> >> Since the driver has special knowledge to handle NP or UC,
> >
> > I think a patch is needed before this one to make this statement true? I.e.:
> >
> > diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
> > index ee8d9973f60b..11641f55025a 100644
> > --- a/drivers/acpi/nfit/mce.c
> > +++ b/drivers/acpi/nfit/mce.c
> > @@ -32,6 +32,7 @@ static int nfit_handle_mce(struct notifier_block
> > *nb, unsigned long val,
> >           */
> >          mutex_lock(&acpi_desc_lock);
> >          list_for_each_entry(acpi_desc, &acpi_descs, list) {
> > +               unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc);
> >                  struct device *dev = acpi_desc->dev;
> >                  int found_match = 0;
> >
> > @@ -63,8 +64,7 @@ static int nfit_handle_mce(struct notifier_block
> > *nb, unsigned long val,
> >
> >                  /* If this fails due to an -ENOMEM, there is little we can do */
> >                  nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
> > -                               ALIGN(mce->addr, L1_CACHE_BYTES),
> > -                               L1_CACHE_BYTES);
> > +                                       ALIGN(mce->addr, align), align);
> >                  nvdimm_region_notify(nfit_spa->nd_region,
> >                                  NVDIMM_REVALIDATE_POISON);
> >
>
> Dan, I tried the above change, and this is what I got after injecting 8
> back-to-back poisons, then read them and received  SIGBUS/BUS_MCEERR_AR,
> then repair via the v7 patch which works until this change is added.
>
> [ 6240.955331] nfit ACPI0012:00: XXX, align = 100
> [ 6240.960300] nfit ACPI0012:00: XXX, ALIGN(mce->addr,
> L1_CACHE_BYTES)=1851600400, L1_CACHE_BYTES=40, ALIGN(mce->addr,
> align)=1851600400
> [..]
> [ 6242.052277] nfit ACPI0012:00: XXX, align = 100
> [ 6242.057243] nfit ACPI0012:00: XXX, ALIGN(mce->addr,
> L1_CACHE_BYTES)=1851601000, L1_CACHE_BYTES=40, ALIGN(mce->addr,
> align)=1851601000
> [..]
> [ 6244.917198] nfit ACPI0012:00: XXX, align = 1000
> [ 6244.922258] nfit ACPI0012:00: XXX, ALIGN(mce->addr,
> L1_CACHE_BYTES)=1851601200, L1_CACHE_BYTES=40, ALIGN(mce->addr,
> align)=1851602000
> [..]
>
> All 8 poisons remain uncleared.
>
> Without further investigation, I don't know why the failure.
> Could we mark this change to a follow-on task?

Perhaps a bit more debug before kicking this can down the road...

I'm worried that this means that the driver is not accurately tracking
poison data. For example, in that last case the hardware is indicating a
full page clobber, but the old code would only track poison from
1851601200 to 1851601400 (i.e. the first 512 bytes from the base error
address).

Oh... wait, I think there is a second bug here, that ALIGN should be
ALIGN_DOWN(). Does this restore the result you expect?

diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index ee8d9973f60b..d7a52238a741 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -63,8 +63,7 @@ static int nfit_handle_mce(struct notifier_block
*nb, unsigned long val,

                /* If this fails due to an -ENOMEM, there is little we can do */
                nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
-                               ALIGN(mce->addr, L1_CACHE_BYTES),
-                               L1_CACHE_BYTES);
+                                       ALIGN_DOWN(mce->addr, align), align);
                nvdimm_region_notify(nfit_spa->nd_region,
                                NVDIMM_REVALIDATE_POISON);
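
(A worked example of the difference, using the third log entry quoted above and
assuming the logged addresses are printed in hex: with mce->addr = 0x1851601200
and align = 0x1000, ALIGN() rounds up past the faulting page while ALIGN_DOWN()
lands on its base:)

	ALIGN(0x1851601200, 0x1000)      == 0x1851602000  /* next page */
	ALIGN_DOWN(0x1851601200, 0x1000) == 0x1851601000  /* page base */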


> The driver knows a lot about how to clear poisons besides hardcoding
> poison alignment to 0x40 bytes.

It does, but the badblocks tracking should still be reliable, and if
it's not reliable I expect there are cases where recovery_write() will
not be triggered because the driver will not fail the
dax_direct_access() attempt.
Jane Chu April 15, 2022, 4:18 p.m. UTC | #7
On 4/13/2022 7:32 PM, Dan Williams wrote:
> On Wed, Apr 13, 2022 at 4:36 PM Jane Chu <jane.chu@oracle.com> wrote:
>>
>> On 4/11/2022 4:27 PM, Dan Williams wrote:
>>> On Tue, Apr 5, 2022 at 12:48 PM Jane Chu <jane.chu@oracle.com> wrote:
>>>>
>>>> The set_memory_uc() approach doesn't work well in all cases.
>>>> For example, when "The VMM unmapped the bad page from guest
>>>> physical space and passed the machine check to the guest."
>>>> "The guest gets virtual #MC on an access to that page.
>>>>    When the guest tries to do set_memory_uc() and instructs
>>>>    cpa_flush() to do clean caches that results in taking another
>>>>    fault / exception perhaps because the VMM unmapped the page
>>>>    from the guest."
>>>>
>>>> Since the driver has special knowledge to handle NP or UC,
>>>
>>> I think a patch is needed before this one to make this statement true? I.e.:
>>>
>>> diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
>>> index ee8d9973f60b..11641f55025a 100644
>>> --- a/drivers/acpi/nfit/mce.c
>>> +++ b/drivers/acpi/nfit/mce.c
>>> @@ -32,6 +32,7 @@ static int nfit_handle_mce(struct notifier_block
>>> *nb, unsigned long val,
>>>            */
>>>           mutex_lock(&acpi_desc_lock);
>>>           list_for_each_entry(acpi_desc, &acpi_descs, list) {
>>> +               unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc);
>>>                   struct device *dev = acpi_desc->dev;
>>>                   int found_match = 0;
>>>
>>> @@ -63,8 +64,7 @@ static int nfit_handle_mce(struct notifier_block
>>> *nb, unsigned long val,
>>>
>>>                   /* If this fails due to an -ENOMEM, there is little we can do */
>>>                   nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
>>> -                               ALIGN(mce->addr, L1_CACHE_BYTES),
>>> -                               L1_CACHE_BYTES);
>>> +                                       ALIGN(mce->addr, align), align);
>>>                   nvdimm_region_notify(nfit_spa->nd_region,
>>>                                   NVDIMM_REVALIDATE_POISON);
>>>
>>
>> Dan, I tried the above change, and this is what I got after injecting 8
>> back-to-back poisons, then read them and received  SIGBUS/BUS_MCEERR_AR,
>> then repair via the v7 patch which works until this change is added.
>>
>> [ 6240.955331] nfit ACPI0012:00: XXX, align = 100
>> [ 6240.960300] nfit ACPI0012:00: XXX, ALIGN(mce->addr,
>> L1_CACHE_BYTES)=1851600400, L1_CACHE_BYTES=40, ALIGN(mce->addr,
>> align)=1851600400
>> [..]
>> [ 6242.052277] nfit ACPI0012:00: XXX, align = 100
>> [ 6242.057243] nfit ACPI0012:00: XXX, ALIGN(mce->addr,
>> L1_CACHE_BYTES)=1851601000, L1_CACHE_BYTES=40, ALIGN(mce->addr,
>> align)=1851601000
>> [..]
>> [ 6244.917198] nfit ACPI0012:00: XXX, align = 1000
>> [ 6244.922258] nfit ACPI0012:00: XXX, ALIGN(mce->addr,
>> L1_CACHE_BYTES)=1851601200, L1_CACHE_BYTES=40, ALIGN(mce->addr,
>> align)=1851602000
>> [..]
>>
>> All 8 poisons remain uncleared.
>>
>> Without further investigation, I don't know why the failure.
>> Could we mark this change to a follow-on task?
> 
> Perhaps a bit more debug before kicking this can down the road...
> 
> I'm worried that this means that the driver is not accurately tracking
> poison data. For example, in that last case the hardware is indicating a
> full page clobber, but the old code would only track poison from
> 1851601200 to 1851601400 (i.e. the first 512 bytes from the base error
> address).

It would appear so, but the old code's tracking seems to be working 
correctly. For example, after injecting 8 back-to-back poisons, the
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region0/badblocks
captures all 8 of them, and that spans a 2K range, right?  I never had an 
issue with missing poison in my tests.

> 
> Oh... wait, I think there is a second bug here, that ALIGN should be
> ALIGN_DOWN(). Does this restore the result you expect?

That's it, ALIGN_DOWN fixed the issue, thanks!!
I'll add a patch with your Suggested-by.

> 
> diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
> index ee8d9973f60b..d7a52238a741 100644
> --- a/drivers/acpi/nfit/mce.c
> +++ b/drivers/acpi/nfit/mce.c
> @@ -63,8 +63,7 @@ static int nfit_handle_mce(struct notifier_block
> *nb, unsigned long val,
> 
>                  /* If this fails due to an -ENOMEM, there is little we can do */
>                  nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
> -                               ALIGN(mce->addr, L1_CACHE_BYTES),
> -                               L1_CACHE_BYTES);
> +                                       ALIGN_DOWN(mce->addr, align), align);
>                  nvdimm_region_notify(nfit_spa->nd_region,
>                                  NVDIMM_REVALIDATE_POISON);
> 
> 
>> The driver knows a lot about how to clear poisons besides hardcoding
>> poison alignment to 0x40 bytes.
> 
> It does, but the badblocks tracking should still be reliable, and if
> it's not reliable I expect there are cases where recovery_write() will
> not be triggered because the driver will not fail the
> dax_direct_access() attempt.

thanks!
-jane

Patch

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 981496e6bc0e..fa67bb9d1afe 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -579,7 +579,7 @@  static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
 
 	pfn = mce->addr >> PAGE_SHIFT;
 	if (!memory_failure(pfn, 0)) {
-		set_mce_nospec(pfn, whole_page(mce));
+		set_mce_nospec(pfn);
 		mce->kflags |= MCE_HANDLED_UC;
 	}
 
@@ -1316,7 +1316,7 @@  static void kill_me_maybe(struct callback_head *cb)
 
 	ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
 	if (!ret) {
-		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
 		sync_core();
 		return;
 	}
@@ -1342,7 +1342,7 @@  static void kill_me_never(struct callback_head *cb)
 	p->mce_count = 0;
 	pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
 	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
-		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
 }
 
 static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 93dde949f224..404ffcb3f2cb 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1926,13 +1926,8 @@  int set_memory_wb(unsigned long addr, int numpages)
 EXPORT_SYMBOL(set_memory_wb);
 
 #ifdef CONFIG_X86_64
-/*
- * Prevent speculative access to the page by either unmapping
- * it (if we do not require access to any part of the page) or
- * marking it uncacheable (if we want to try to retrieve data
- * from non-poisoned lines in the page).
- */
-int set_mce_nospec(unsigned long pfn, bool unmap)
+/* Prevent speculative access to a page by marking it not-present */
+int set_mce_nospec(unsigned long pfn)
 {
 	unsigned long decoy_addr;
 	int rc;
@@ -1954,10 +1949,7 @@  int set_mce_nospec(unsigned long pfn, bool unmap)
 	 */
 	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
 
-	if (unmap)
-		rc = set_memory_np(decoy_addr, 1);
-	else
-		rc = set_memory_uc(decoy_addr, 1);
+	rc = set_memory_np(decoy_addr, 1);
 	if (rc)
 		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
 	return rc;
@@ -1966,7 +1958,9 @@  int set_mce_nospec(unsigned long pfn, bool unmap)
 /* Restore full speculative operation to the pfn. */
 int clear_mce_nospec(unsigned long pfn)
 {
-	return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
+	unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
+
+	return change_page_attr_set(&addr, 1, __pgprot(_PAGE_PRESENT), 0);
 }
 EXPORT_SYMBOL_GPL(clear_mce_nospec);
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 58d95242a836..30c71a68175b 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -158,36 +158,19 @@  static blk_status_t pmem_do_write(struct pmem_device *pmem,
 			struct page *page, unsigned int page_off,
 			sector_t sector, unsigned int len)
 {
-	blk_status_t rc = BLK_STS_OK;
-	bool bad_pmem = false;
 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
 	void *pmem_addr = pmem->virt_addr + pmem_off;
 
-	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
-		bad_pmem = true;
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
+		blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len);
 
-	/*
-	 * Note that we write the data both before and after
-	 * clearing poison.  The write before clear poison
-	 * handles situations where the latest written data is
-	 * preserved and the clear poison operation simply marks
-	 * the address range as valid without changing the data.
-	 * In this case application software can assume that an
-	 * interrupted write will either return the new good
-	 * data or an error.
-	 *
-	 * However, if pmem_clear_poison() leaves the data in an
-	 * indeterminate state we need to perform the write
-	 * after clear poison.
-	 */
+		if (rc != BLK_STS_OK)
+			pr_warn_ratelimited("%s: failed to clear poison\n", __func__);
+			return rc;
+	}
 	flush_dcache_page(page);
 	write_pmem(pmem_addr, page, page_off, len);
-	if (unlikely(bad_pmem)) {
-		rc = pmem_clear_poison(pmem, pmem_off, len);
-		write_pmem(pmem_addr, page, page_off, len);
-	}
-
-	return rc;
+	return BLK_STS_OK;
 }
 
 static void pmem_submit_bio(struct bio *bio)
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index d6263d7afb55..cde2d8687a7b 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -43,10 +43,10 @@  static inline bool can_set_direct_map(void)
 #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
 
 #ifdef CONFIG_X86_64
-int set_mce_nospec(unsigned long pfn, bool unmap);
+int set_mce_nospec(unsigned long pfn);
 int clear_mce_nospec(unsigned long pfn);
 #else
-static inline int set_mce_nospec(unsigned long pfn, bool unmap)
+static inline int set_mce_nospec(unsigned long pfn)
 {
 	return 0;
 }