
[v5,4/8] mm: Scrub memory from idle loop

Message ID 1498157830-21845-5-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky June 22, 2017, 6:57 p.m. UTC
Instead of scrubbing pages during guest destruction (from
free_heap_pages()), do this opportunistically from the idle loop.

We might come to scrub_free_pages() from the idle loop while another CPU
uses a mapcache override, resulting in a fault while trying to do
__map_domain_page() in scrub_one_page(). To avoid this, make the mapcache
vcpu override a per-cpu variable.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
CC: Dario Faggioli <dario.faggioli@citrix.com>
---
Changes in v5:
* Added an explanation in the commit message for making the mapcache VCPU
  override a per-cpu variable
* Fixed loop counting in scrub_free_pages()
* Fixed the off-by-one error in setting first_dirty in scrub_free_pages().
* Various style fixes
* Added a comment in node_to_scrub() explaining why it should be OK to
  prevent another CPU from scrubbing a node that the current CPU temporarily
  claimed. (I decided against using locks there)


 xen/arch/arm/domain.c      |   2 +-
 xen/arch/x86/domain.c      |   2 +-
 xen/arch/x86/domain_page.c |   6 +--
 xen/common/page_alloc.c    | 118 ++++++++++++++++++++++++++++++++++++++++-----
 xen/include/xen/mm.h       |   1 +
 5 files changed, 111 insertions(+), 18 deletions(-)

Comments

Dario Faggioli June 23, 2017, 8:36 a.m. UTC | #1
On Thu, 2017-06-22 at 14:57 -0400, Boris Ostrovsky wrote:
> Instead of scrubbing pages during guest destruction (from
> free_heap_pages()) do this opportunistically, from the idle loop.
> 
> We might come to scrub_free_pages() from the idle loop while another CPU
> uses a mapcache override, resulting in a fault while trying to do
> __map_domain_page() in scrub_one_page(). To avoid this, make the mapcache
> vcpu override a per-cpu variable.
> 
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>
Reviewed-by: Dario Faggioli <dario.faggioli@citrix.com>

Regards,
Dario
Jan Beulich June 27, 2017, 6:01 p.m. UTC | #2
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:56 PM >>>
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -1019,15 +1019,85 @@ static int reserve_offlined_page(struct page_info *head)
>      return count;
>  }
>  
> -static void scrub_free_pages(unsigned int node)
> +static nodemask_t node_scrubbing;
> +
> +/*
> + * If get_node is true this will return the closest node that needs to be scrubbed,
> + * with the appropriate bit in node_scrubbing set.
> + * If get_node is not set, this will return *a* node that needs to be scrubbed.
> + * The node_scrubbing bitmask will not be updated.
> + * If no node needs scrubbing then NUMA_NO_NODE is returned.
> + */
> +static unsigned int node_to_scrub(bool get_node)
>  {
> -    struct page_info *pg;
> -    unsigned int zone;
> +    nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
> +    nodeid_t closest = NUMA_NO_NODE;
> +    u8 dist, shortest = 0xff;
>  
> -    ASSERT(spin_is_locked(&heap_lock));
> +    if ( node == NUMA_NO_NODE )
> +        node = 0;
>  
> -    if ( !node_need_scrub[node] )
> -        return;
> +    if ( node_need_scrub[node] &&
> +         (!get_node || !node_test_and_set(node, node_scrubbing)) )
> +        return node;
> +
> +    /*
> +     * See if there are memory-only nodes that need scrubbing and choose
> +     * the closest one.
> +     */
> +    local_node = node;
> +    for ( ; ; )
> +    {
> +        do {
> +            node = cycle_node(node, node_online_map);
> +        } while ( !cpumask_empty(&node_to_cpumask(node)) &&
> +                  (node != local_node) );
> +
> +        if ( node == local_node )
> +            break;
> +
> +        /*
> +         * Grab the node right away. If we find a closer node later we will
> +         * release this one. While there is a chance that another CPU will
> +         * not be able to scrub that node when it is searching for scrub work
> + * at the same time, it will be able to do so the next time it wakes up.
> +         * The alternative would be to perform this search under a lock but
> +         * then we'd need to take this lock every time we come in here.
> +         */

I'm fine with your choice of simply explaining your decision, but ...

> +        if ( node_need_scrub[node] )
> +        {
> +            if ( !get_node )
> +                return node;
> +
> +            dist = __node_distance(local_node, node);
> +            if ( (dist < shortest || closest == NUMA_NO_NODE) &&
> +                 !node_test_and_set(node, node_scrubbing) )

... wouldn't the comment better be placed ahead of this if()?

> @@ -1050,17 +1120,42 @@ static void scrub_free_pages(unsigned int node)
>                          scrub_one_page(&pg[i]);
>                          pg[i].count_info &= ~PGC_need_scrub;
>                          node_need_scrub[node]--;
> +                        cnt += 100; /* scrubbed pages add heavier weight. */
> +                    }
> +                    else
> +                        cnt++;
> +
> +                    /*
> +                     * Scrub a few (8) pages before becoming eligible for
> +                     * preemption. But also count non-scrubbing loop iterations
> +                     * so that we don't get stuck here with an almost clean
> +                     * heap.
> +                     */
> +                    if ( cnt > 800 && softirq_pending(cpu) )
> +                    {
> +                        preempt = true;
> +                        break;
>                      }
>                  }
>  
> -                page_list_del(pg, &heap(node, zone, order));
> -                page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
> +                if ( i >= (1U << order) - 1 )
> +                {
> +                    page_list_del(pg, &heap(node, zone, order));
> +                    page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
> +                }
> +                else
> +                    pg->u.free.first_dirty = i + 1;
>  
> -                if ( node_need_scrub[node] == 0 )
> -                    return;
> +                if ( preempt || (node_need_scrub[node] == 0) )
> +                    goto out;
>              }
>          } while ( order-- != 0 );
>      }
> +
> + out:
> +    spin_unlock(&heap_lock);
> +    node_clear(node, node_scrubbing);
> +    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);

While I can see why you use it here, the softirq_pending() looks sort of
misplaced: While invoking it twice in the caller will look a little odd too,
I still think that's where the check belongs.

Jan
Boris Ostrovsky July 23, 2017, 2:14 a.m. UTC | #3
> 
>> @@ -1050,17 +1120,42 @@ static void scrub_free_pages(unsigned int node)
>>                           scrub_one_page(&pg[i]);
>>                           pg[i].count_info &= ~PGC_need_scrub;
>>                           node_need_scrub[node]--;
>> +                        cnt += 100; /* scrubbed pages add heavier weight. */
>> +                    }
>> +                    else
>> +                        cnt++;
>> +
>> +                    /*
>> +                     * Scrub a few (8) pages before becoming eligible for
>> +                     * preemption. But also count non-scrubbing loop iterations
>> +                     * so that we don't get stuck here with an almost clean
>> +                     * heap.
>> +                     */
>> +                    if ( cnt > 800 && softirq_pending(cpu) )
>> +                    {
>> +                        preempt = true;
>> +                        break;
>>                       }
>>                   }
>>   
>> -                page_list_del(pg, &heap(node, zone, order));
>> -                page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
>> +                if ( i >= (1U << order) - 1 )
>> +                {
>> +                    page_list_del(pg, &heap(node, zone, order));
>> +                    page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
>> +                }
>> +                else
>> +                    pg->u.free.first_dirty = i + 1;
>>   
>> -                if ( node_need_scrub[node] == 0 )
>> -                    return;
>> +                if ( preempt || (node_need_scrub[node] == 0) )
>> +                    goto out;
>>               }
>>           } while ( order-- != 0 );
>>       }
>> +
>> + out:
>> +    spin_unlock(&heap_lock);
>> +    node_clear(node, node_scrubbing);
>> +    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);
> 
> While I can see why you use it here, the softirq_pending() looks sort of
> misplaced: While invoking it twice in the caller will look a little odd too,
> I still think that's where the check belongs.


scrub_free_pages is called from idle loop as

	else if ( !softirq_pending(cpu) && !scrub_free_pages() )
             pm_idle();

so softirq_pending() is unnecessary here.

(Not sure why you are saying it would be invoked twice)

-boris
Jan Beulich July 31, 2017, 3:20 p.m. UTC | #4
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:14 AM >>>
>>> @@ -1050,17 +1120,42 @@ static void scrub_free_pages(unsigned int node)
>>> -                if ( node_need_scrub[node] == 0 )
>>> -                    return;
>>> +                if ( preempt || (node_need_scrub[node] == 0) )
>>> +                    goto out;
>>>               }
>>>           } while ( order-- != 0 );
>>>       }
>>> +
>>> + out:
>>> +    spin_unlock(&heap_lock);
>>> +    node_clear(node, node_scrubbing);
>>> +    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);
>> 
>> While I can see why you use it here, the softirq_pending() looks sort of
>> misplaced: While invoking it twice in the caller will look a little odd too,
>> I still think that's where the check belongs.
>
>
>scrub_free_pages is called from idle loop as
>
>else if ( !softirq_pending(cpu) && !scrub_free_pages() )
>pm_idle();
>
>so softirq_pending() is unnecessary here.
>
>(Not sure why you are saying it would be invoked twice)

That was sort of implicit - the caller would want to become


    else if ( !softirq_pending(cpu) && !scrub_free_pages() && !softirq_pending(cpu) )
    pm_idle();

to account for the fact that a softirq may become pending while scrubbing.

Jan
Boris Ostrovsky July 31, 2017, 4:15 p.m. UTC | #5
On 07/31/2017 11:20 AM, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:14 AM >>>
>>>> @@ -1050,17 +1120,42 @@ static void scrub_free_pages(unsigned int node)
>>>> -                if ( node_need_scrub[node] == 0 )
>>>> -                    return;
>>>> +                if ( preempt || (node_need_scrub[node] == 0) )
>>>> +                    goto out;
>>>>               }
>>>>           } while ( order-- != 0 );
>>>>       }
>>>> +
>>>> + out:
>>>> +    spin_unlock(&heap_lock);
>>>> +    node_clear(node, node_scrubbing);
>>>> +    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);
>>> While I can see why you use it here, the softirq_pending() looks sort of
>>> misplaced: While invoking it twice in the caller will look a little odd too,
>>> I still think that's where the check belongs.
>>
>> scrub_free_pages is called from idle loop as
>>
>> else if ( !softirq_pending(cpu) && !scrub_free_pages() )
>> pm_idle();
>>
>> so softirq_pending() is unnecessary here.
>>
>> (Not sure why you are saying it would be invoked twice)
> That was sort of implicit - the caller would want to become
>
>
>     else if ( !softirq_pending(cpu) && !scrub_free_pages() && !softirq_pending(cpu) )
>     pm_idle();
>
> to account for the fact that a softirq may become pending while scrubbing.

That would look really odd IMO.

Would

else if ( !softirq_pending(cpu) )
    if ( !scrub_free_pages() && !softirq_pending(cpu) )
       pm_idle();

or 

else if ( !softirq_pending(cpu) && !scrub_free_pages() )
    if ( !softirq_pending(cpu) )
        pm_idle();

 
be better? (I'd prefer the first)

-boris
Jan Beulich Aug. 2, 2017, 9:27 a.m. UTC | #6
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/31/17 6:16 PM >>>
>On 07/31/2017 11:20 AM, Jan Beulich wrote:
>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:14 AM >>>
>>>>> @@ -1050,17 +1120,42 @@ static void scrub_free_pages(unsigned int node)
>>>>> -                if ( node_need_scrub[node] == 0 )
>>>>> -                    return;
>>>>> +                if ( preempt || (node_need_scrub[node] == 0) )
>>>>> +                    goto out;
>>>>>               }
>>>>>           } while ( order-- != 0 );
>>>>>       }
>>>>> +
>>>>> + out:
>>>>> +    spin_unlock(&heap_lock);
>>>>> +    node_clear(node, node_scrubbing);
>>>>> +    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);
>>>> While I can see why you use it here, the softirq_pending() looks sort of
>>>> misplaced: While invoking it twice in the caller will look a little odd too,
>>>> I still think that's where the check belongs.
>>>
>>> scrub_free_pages is called from idle loop as
>>>
>>> else if ( !softirq_pending(cpu) && !scrub_free_pages() )
>>> pm_idle();
>>>
>>> so softirq_pending() is unnecessary here.
>>>
>>> (Not sure why you are saying it would be invoked twice)
>> That was sort of implicit - the caller would want to become
>>
>>
>>     else if ( !softirq_pending(cpu) && !scrub_free_pages() && !softirq_pending(cpu) )
>>     pm_idle();
>>
>> to account for the fact that a softirq may become pending while scrubbing.
>
>That would look really odd IMO.
>
>Would
>
>else if ( !softirq_pending(cpu) )
>if ( !scrub_free_pages() && !softirq_pending(cpu) )
>pm_idle();
>
>or 
>
>else if ( !softirq_pending(cpu) && !scrub_free_pages() )
>if ( !softirq_pending(cpu) )
>pm_idle();
>
>
>be better? (I'd prefer the first)

I dislike both (as we always ask people to fold such chained if()s), and hence
would prefer my variant plus a suitable comment explaining the oddity.

Jan
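
(Editor's illustration, not part of the series: a minimal sketch of how the
x86 idle_loop() caller might read with Jan's preferred folded condition plus
an explanatory comment, assuming the structure of the v5 hunk shown in the
patch below.)

        /* Are we here for running vcpu context tasklets, or for idling? */
        if ( unlikely(tasklet_work_to_do(cpu)) )
            do_tasklet();
        /*
         * Re-check softirq_pending() after scrub_free_pages(): a softirq may
         * have become pending while we were scrubbing, in which case the CPU
         * should not enter a sleep state.
         */
        else if ( !softirq_pending(cpu) && !scrub_free_pages() &&
                  !softirq_pending(cpu) )
            pm_idle();
        do_softirq();

The condition is the one Jan spelled out in #4; the comment is the one he
asks for here.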

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 2dc8b0a..d282cd8 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -51,7 +51,7 @@  void idle_loop(void)
         /* Are we here for running vcpu context tasklets, or for idling? */
         if ( unlikely(tasklet_work_to_do(cpu)) )
             do_tasklet();
-        else
+        else if ( !softirq_pending(cpu) && !scrub_free_pages() )
         {
             local_irq_disable();
             if ( cpu_is_haltable(cpu) )
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f7873da..71f1ef4 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -122,7 +122,7 @@  static void idle_loop(void)
         /* Are we here for running vcpu context tasklets, or for idling? */
         if ( unlikely(tasklet_work_to_do(cpu)) )
             do_tasklet();
-        else
+        else if ( !softirq_pending(cpu) && !scrub_free_pages() )
             pm_idle();
         do_softirq();
         /*
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 71baede..0783c1e 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -18,12 +18,12 @@ 
 #include <asm/hardirq.h>
 #include <asm/setup.h>
 
-static struct vcpu *__read_mostly override;
+static DEFINE_PER_CPU(struct vcpu *, override);
 
 static inline struct vcpu *mapcache_current_vcpu(void)
 {
     /* In the common case we use the mapcache of the running VCPU. */
-    struct vcpu *v = override ?: current;
+    struct vcpu *v = this_cpu(override) ?: current;
 
     /*
      * When current isn't properly set up yet, this is equivalent to
@@ -59,7 +59,7 @@  static inline struct vcpu *mapcache_current_vcpu(void)
 
 void __init mapcache_override_current(struct vcpu *v)
 {
-    override = v;
+    this_cpu(override) = v;
 }
 
 #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 9aac196..4e2775f 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1019,15 +1019,85 @@  static int reserve_offlined_page(struct page_info *head)
     return count;
 }
 
-static void scrub_free_pages(unsigned int node)
+static nodemask_t node_scrubbing;
+
+/*
+ * If get_node is true this will return the closest node that needs to be scrubbed,
+ * with the appropriate bit in node_scrubbing set.
+ * If get_node is not set, this will return *a* node that needs to be scrubbed.
+ * The node_scrubbing bitmask will not be updated.
+ * If no node needs scrubbing then NUMA_NO_NODE is returned.
+ */
+static unsigned int node_to_scrub(bool get_node)
 {
-    struct page_info *pg;
-    unsigned int zone;
+    nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
+    nodeid_t closest = NUMA_NO_NODE;
+    u8 dist, shortest = 0xff;
 
-    ASSERT(spin_is_locked(&heap_lock));
+    if ( node == NUMA_NO_NODE )
+        node = 0;
 
-    if ( !node_need_scrub[node] )
-        return;
+    if ( node_need_scrub[node] &&
+         (!get_node || !node_test_and_set(node, node_scrubbing)) )
+        return node;
+
+    /*
+     * See if there are memory-only nodes that need scrubbing and choose
+     * the closest one.
+     */
+    local_node = node;
+    for ( ; ; )
+    {
+        do {
+            node = cycle_node(node, node_online_map);
+        } while ( !cpumask_empty(&node_to_cpumask(node)) &&
+                  (node != local_node) );
+
+        if ( node == local_node )
+            break;
+
+        /*
+         * Grab the node right away. If we find a closer node later we will
+         * release this one. While there is a chance that another CPU will
+         * not be able to scrub that node when it is searching for scrub work
+ * at the same time, it will be able to do so the next time it wakes up.
+         * The alternative would be to perform this search under a lock but
+         * then we'd need to take this lock every time we come in here.
+         */
+        if ( node_need_scrub[node] )
+        {
+            if ( !get_node )
+                return node;
+
+            dist = __node_distance(local_node, node);
+            if ( (dist < shortest || closest == NUMA_NO_NODE) &&
+                 !node_test_and_set(node, node_scrubbing) )
+            {
+                if ( closest != NUMA_NO_NODE )
+                    node_clear(closest, node_scrubbing);
+                shortest = dist;
+                closest = node;
+            }
+        }
+    }
+
+    return closest;
+}
+
+bool scrub_free_pages(void)
+{
+    struct page_info *pg;
+    unsigned int zone;
+    unsigned int cpu = smp_processor_id();
+    bool preempt = false;
+    nodeid_t node;
+    unsigned int cnt = 0;
+  
+    node = node_to_scrub(true);
+    if ( node == NUMA_NO_NODE )
+        return false;
+ 
+    spin_lock(&heap_lock);
 
     for ( zone = 0; zone < NR_ZONES; zone++ )
     {
@@ -1050,17 +1120,42 @@  static void scrub_free_pages(unsigned int node)
                         scrub_one_page(&pg[i]);
                         pg[i].count_info &= ~PGC_need_scrub;
                         node_need_scrub[node]--;
+                        cnt += 100; /* scrubbed pages add heavier weight. */
+                    }
+                    else
+                        cnt++;
+
+                    /*
+                     * Scrub a few (8) pages before becoming eligible for
+                     * preemption. But also count non-scrubbing loop iterations
+                     * so that we don't get stuck here with an almost clean
+                     * heap.
+                     */
+                    if ( cnt > 800 && softirq_pending(cpu) )
+                    {
+                        preempt = true;
+                        break;
                     }
                 }
 
-                page_list_del(pg, &heap(node, zone, order));
-                page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
+                if ( i >= (1U << order) - 1 )
+                {
+                    page_list_del(pg, &heap(node, zone, order));
+                    page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
+                }
+                else
+                    pg->u.free.first_dirty = i + 1;
 
-                if ( node_need_scrub[node] == 0 )
-                    return;
+                if ( preempt || (node_need_scrub[node] == 0) )
+                    goto out;
             }
         } while ( order-- != 0 );
     }
+
+ out:
+    spin_unlock(&heap_lock);
+    node_clear(node, node_scrubbing);
+    return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);
 }
 
 /* Free 2^@order set of pages. */
@@ -1175,9 +1270,6 @@  static void free_heap_pages(
     if ( tainted )
         reserve_offlined_page(pg);
 
-    if ( need_scrub )
-        scrub_free_pages(node);
-
     spin_unlock(&heap_lock);
 }
 
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 5f3d84a..a9829c2 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -150,6 +150,7 @@  void init_xenheap_pages(paddr_t ps, paddr_t pe);
 void xenheap_max_mfn(unsigned long mfn);
 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags);
 void free_xenheap_pages(void *v, unsigned int order);
+bool scrub_free_pages(void);
 #define alloc_xenheap_page() (alloc_xenheap_pages(0,0))
 #define free_xenheap_page(v) (free_xenheap_pages(v,0))
 /* Map machine page range in Xen virtual address space. */
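
Editor's illustration (not Xen code): the preemption budget used in
scrub_free_pages() above charges 100 units per scrubbed page and 1 unit per
already-clean page, and only consults softirq_pending() once the budget
exceeds 800, i.e. after roughly eight scrubbed pages or many clean ones. The
standalone program below models just that accounting; every identifier in it
is made up for the example.

    #include <stdbool.h>
    #include <stdio.h>

    #define SCRUB_WEIGHT   100  /* cost charged for scrubbing a dirty page */
    #define CLEAN_WEIGHT     1  /* cost charged for skipping a clean page */
    #define PREEMPT_LIMIT  800  /* only look for pending work past this */

    /* Stand-in for softirq_pending(cpu); here other work is always pending. */
    static bool work_pending(void) { return true; }

    int main(void)
    {
        unsigned int cnt = 0, scrubbed = 0, skipped = 0;
        unsigned int i;

        /* Walk a mock page range in which every other page is dirty. */
        for ( i = 0; i < 10000; i++ )
        {
            if ( i % 2 )            /* "dirty" page: scrub it */
            {
                scrubbed++;
                cnt += SCRUB_WEIGHT;
            }
            else                    /* "clean" page: only count the walk */
            {
                skipped++;
                cnt += CLEAN_WEIGHT;
            }

            if ( cnt > PREEMPT_LIMIT && work_pending() )
                break;              /* now eligible for preemption */
        }

        printf("preempted after %u scrubbed + %u clean pages (budget %u)\n",
               scrubbed, skipped, cnt);
        return 0;
    }

With these weights the loop yields after 8 scrubbed plus 8 clean pages
(budget 808), while a completely clean range would be walked for roughly 800
iterations before the pending-work check fires, matching the intent of the
comment in the patch.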