[1/2] x86/mm: avoid phys_to_nid() calls for invalid addresses

Message ID d503a684-1689-ef60-23e8-5eb6b33ab5c8@suse.com (mailing list archive)
State New, archived
Series NUMA: phys_to_nid() related adjustments

Commit Message

Jan Beulich Dec. 13, 2022, 11:36 a.m. UTC
With phys_to_nid() now actively checking that a valid node ID is on
record, the two uses in paging_init() can actually trigger at least the
2nd of the assertions there. They're used to calculate allocation flags,
but the calculated flags wouldn't be used when dealing with an invalid
(unpopulated) address range. Defer the calculations such that they can
be done with a validated MFN in hand. This also does away with the
artificial calculations of an address to pass to phys_to_nid().
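
For reference, the change boils down to the following pattern (condensed
from the diff below; "mfn" is the MFN variable the patch relies on):

    /* Before: node derived from an artificially constructed address,
     * even for ranges that turn out to be holes. */
    memflags = MEMF_node(phys_to_nid(i <<
        (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));

    /* After: node derived only once an MFN has passed mfn_valid(). */
    for ( n = 0; n < CNT; ++n)
    {
        mfn = _mfn(MFN(i) + n * PDX_GROUP_COUNT);
        if ( mfn_valid(mfn) )
        {
            memflags = MEMF_node(phys_to_nid(mfn_to_maddr(mfn)));
            break;
        }
    }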

Note that while the variable is provably written before use, at least
some compiler versions can't actually verify that. Hence the variable
also needs to gain a (dead) initializer.

Fixes: e9c72d524fbd ("xen/x86: Use ASSERT instead of VIRTUAL_BUG_ON for phys_to_nid")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
RFC: With a small enough NUMA hash shift it would still be possible to
     hit an SRAT hole, despite mfn_valid() passing. Hence, as was the
     original plan, it may still be necessary to relax the checking in
     phys_to_nid() (or its designated replacements). At which point the
     value of this change here would shrink to merely reducing the
     chance of unintentionally doing NUMA_NO_NODE allocations.
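
To illustrate the residual concern: phys_to_nid() is, in essence, a table
lookup keyed by the address shifted right by the NUMA hash shift. The
sketch below only approximates that lookup (the names memnodemap,
memnode_shift and memnodemapsize follow Xen's NUMA code, but details may
differ from the exact implementation):

    /* Rough shape of the lookup, not the exact Xen code. */
    unsigned long idx = paddr_to_pdx(addr) >> memnode_shift;
    ASSERT(idx < memnodemapsize);      /* 1st assertion */
    nid = memnodemap[idx];
    ASSERT(nid < MAX_NUMNODES);        /* 2nd assertion */

mfn_valid() only guarantees that the frame table covers the MFN, at
PDX_GROUP_COUNT granularity. With a small (fine-grained) hash shift, a
memnodemap[] slot can cover nothing but an SRAT hole, so an MFN passing
mfn_valid() can still index a slot with no node on record and trip the
2nd assertion above.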

Comments

Wei Chen Dec. 14, 2022, 3:28 a.m. UTC | #1
Hi Jan,

On 2022/12/13 19:36, Jan Beulich wrote:
> With phys_to_nid() now actively checking that a valid node ID is on
> record, the two uses in paging_init() can actually trigger at least the
> 2nd of the assertions there. They're used to calculate allocation flags,
> but the calculated flags wouldn't be used when dealing with an invalid
> (unpopulated) address range. Defer the calculations such that they can
> be done with a validated MFN in hands. This also does away with the
> artificial calculations of an address to pass to phys_to_nid().
> 
> Note that while the variable is provably written before use, at least
> some compiler versions can't actually verify that. Hence the variable
> also needs to gain a (dead) initializer.
> 
> Fixes: e9c72d524fbd ("xen/x86: Use ASSERT instead of VIRTUAL_BUG_ON for phys_to_nid")
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> RFC: With small enough a NUMA hash shift it would still be possible to
>       hit an SRAT hole, despite mfn_valid() passing. Hence, like was the
>       original plan, it may still be necessary to relax the checking in
>       phys_to_nid() (or its designated replacements). At which point the
>       value of this change here would shrink to merely reducing the
>       chance of unintentionally doing NUMA_NO_NODE allocations.
> 

I think it's better to move the last sentence, or the whole RFC note,
into the commit log. Without the RFC content, when I look at this commit
again after a while, I will be confused about what problem it solved,
because just looking at the changes, as you said in the RFC, it doesn't
completely solve the problem.

Cheers,
Wei Chen

Jan Beulich Dec. 14, 2022, 7:44 a.m. UTC | #2
On 14.12.2022 04:28, Wei Chen wrote:
> Hi Jan,
> 
> On 2022/12/13 19:36, Jan Beulich wrote:
>> With phys_to_nid() now actively checking that a valid node ID is on
>> record, the two uses in paging_init() can actually trigger at least the
>> 2nd of the assertions there. They're used to calculate allocation flags,
>> but the calculated flags wouldn't be used when dealing with an invalid
>> (unpopulated) address range. Defer the calculations such that they can
>> be done with a validated MFN in hands. This also does away with the
>> artificial calculations of an address to pass to phys_to_nid().
>>
>> Note that while the variable is provably written before use, at least
>> some compiler versions can't actually verify that. Hence the variable
>> also needs to gain a (dead) initializer.
>>
>> Fixes: e9c72d524fbd ("xen/x86: Use ASSERT instead of VIRTUAL_BUG_ON for phys_to_nid")
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> RFC: With small enough a NUMA hash shift it would still be possible to
>>       hit an SRAT hole, despite mfn_valid() passing. Hence, like was the
>>       original plan, it may still be necessary to relax the checking in
>>       phys_to_nid() (or its designated replacements). At which point the
>>       value of this change here would shrink to merely reducing the
>>       chance of unintentionally doing NUMA_NO_NODE allocations.
>>
> 
> I think it's better to place the last sentence or the whole RFC to the
> commit log. Without the RFC content, after a while, when I check this 
> commit again, I will be confused about what problem this commit solved. 
> Because just looking at the changes, as your said in RFC, it doesn't 
> completely solve the problem.

Moving some or all of this into the commit message is one way to
resolve this RFC, yes. But the other option, flipping the order of the
two patches and making mfn_to_nid() check less strictly than
page_to_nid(), would require rewriting the commit message here anyway.
IOW the primary question is which route to take.

Jan
Andrew Cooper Dec. 16, 2022, 7:24 p.m. UTC | #3
On 13/12/2022 11:36 am, Jan Beulich wrote:
> With phys_to_nid() now actively checking that a valid node ID is on
> record, the two uses in paging_init() can actually trigger at least the
> 2nd of the assertions there. They're used to calculate allocation flags,
> but the calculated flags wouldn't be used when dealing with an invalid
> (unpopulated) address range. Defer the calculations such that they can
> be done with a validated MFN in hands. This also does away with the
> artificial calculations of an address to pass to phys_to_nid().
>
> Note that while the variable is provably written before use, at least
> some compiler versions can't actually verify that. Hence the variable
> also needs to gain a (dead) initializer.

I'm not surprised in the slightest that GCC can't prove that it is
always initialised.  I suspect a lot of humans would struggle too.

> Fixes: e9c72d524fbd ("xen/x86: Use ASSERT instead of VIRTUAL_BUG_ON for phys_to_nid")
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

This does appear to fix things.  (Testing hasn't finished yet, but all
systems have installed, and they didn't get that far previously).

> ---
> RFC: With small enough a NUMA hash shift it would still be possible to
>      hit an SRAT hole, despite mfn_valid() passing. Hence, like was the
>      original plan, it may still be necessary to relax the checking in
>      phys_to_nid() (or its designated replacements). At which point the
>      value of this change here would shrink to merely reducing the
>      chance of unintentionally doing NUMA_NO_NODE allocations.

Why does the NUMA shift matter?  Can't this occur for badly constructed
SRAT tables too?


Nevertheless, this is a clear improvement over what's currently in tree,
so I'm going to commit it to try and unblock OSSTest.  The tree has been
blocked for too long.  Further adjustments can come in due course.

~Andrew
Jan Beulich Dec. 19, 2022, 7:14 a.m. UTC | #4
On 16.12.2022 20:24, Andrew Cooper wrote:
> On 13/12/2022 11:36 am, Jan Beulich wrote:
>> RFC: With small enough a NUMA hash shift it would still be possible to
>>      hit an SRAT hole, despite mfn_valid() passing. Hence, like was the
>>      original plan, it may still be necessary to relax the checking in
>>      phys_to_nid() (or its designated replacements). At which point the
>>      value of this change here would shrink to merely reducing the
>>      chance of unintentionally doing NUMA_NO_NODE allocations.
> 
> Why does the NUMA shift matter?  Can't this occur for badly constructed
> SRAT tables too?

Well, the NUMA hash shift is computed from the SRAT table entries, so
often "badly constructed" => "too small shift".

> Nevertheless, this is a clear improvement over what's currently in tree,
> so I'm going to commit it to try and unblock OSSTest.  The tree has been
> blocked for too long.  Further adjustments can come in due course.

Thanks. And I see it has unblocked the tree.

Jan

Patch

--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -498,7 +498,7 @@  error:
 void __init paging_init(void)
 {
     unsigned long i, mpt_size, va;
-    unsigned int n, memflags;
+    unsigned int n, memflags = 0;
     l3_pgentry_t *l3_ro_mpt;
     l2_pgentry_t *pl2e = NULL, *l2_ro_mpt = NULL;
     struct page_info *l1_pg;
@@ -547,8 +547,6 @@  void __init paging_init(void)
     {
         BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
         va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
-        memflags = MEMF_node(phys_to_nid(i <<
-            (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
 
         if ( cpu_has_page1gb &&
              !((unsigned long)pl2e & ~PAGE_MASK) &&
@@ -559,10 +557,15 @@  void __init paging_init(void)
             for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
             {
                 for ( n = 0; n < CNT; ++n)
-                    if ( mfn_valid(_mfn(MFN(i + k) + n * PDX_GROUP_COUNT)) )
+                {
+                    mfn = _mfn(MFN(i + k) + n * PDX_GROUP_COUNT);
+                    if ( mfn_valid(mfn) )
                         break;
+                }
                 if ( n == CNT )
                     ++holes;
+                else if ( k == holes )
+                    memflags = MEMF_node(phys_to_nid(mfn_to_maddr(mfn)));
             }
             if ( k == holes )
             {
@@ -593,8 +596,14 @@  void __init paging_init(void)
         }
 
         for ( n = 0; n < CNT; ++n)
-            if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
+        {
+            mfn = _mfn(MFN(i) + n * PDX_GROUP_COUNT);
+            if ( mfn_valid(mfn) )
+            {
+                memflags = MEMF_node(phys_to_nid(mfn_to_maddr(mfn)));
                 break;
+            }
+        }
         if ( n == CNT )
             l1_pg = NULL;
         else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
@@ -663,15 +672,19 @@  void __init paging_init(void)
                  sizeof(*compat_machine_to_phys_mapping));
     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, pl2e++ )
     {
-        memflags = MEMF_node(phys_to_nid(i <<
-            (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
         for ( n = 0; n < CNT; ++n)
-            if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
+        {
+            mfn = _mfn(MFN(i) + n * PDX_GROUP_COUNT);
+            if ( mfn_valid(mfn) )
+            {
+                memflags = MEMF_node(phys_to_nid(mfn_to_maddr(mfn)));
                 break;
+            }
+        }
         if ( n == CNT )
             continue;
         if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
-                                               memflags)) == NULL )
+                                          memflags)) == NULL )
             goto nomem;
         map_pages_to_xen(
             RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),