
[v5,6/8] mm: Keep heap accessible to others while scrubbing

Message ID 1498157830-21845-7-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky June 22, 2017, 6:57 p.m. UTC
Instead of scrubbing pages while holding the heap lock we can mark the
buddy's head as being scrubbed and drop the lock temporarily. If someone
(most likely alloc_heap_pages()) tries to access this chunk it will
signal the scrubber to abort the scrub by setting the head's scrub_state
to BUDDY_SCRUB_ABORT. The scrubber checks this state after processing
each page and stops its work as soon as it sees it.
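
A minimal sketch of the handshake (simplified from the patch below; the
real code additionally uses spin_lock_kick(), reads scrub_state through
ACCESS_ONCE() on a union, and adjusts the node_need_scrub accounting):

    /* Scrubber side (scrub_free_pages()), simplified: */
    pg->u.free.scrub_state = BUDDY_SCRUBBING;
    spin_unlock(&heap_lock);
    for ( i = pg->u.free.first_dirty; i < (1U << order); i++ )
    {
        scrub_one_page(&pg[i]);
        if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
        {
            /* Someone wants this buddy: publish progress and back off. */
            pg->u.free.first_dirty = (i == (1U << order) - 1) ?
                INVALID_DIRTY_IDX : i + 1;
            smp_wmb();
            pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
            return;
        }
    }

    /* Allocator side (check_and_stop_scrub()), heap lock held: */
    if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
    {
        head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
        while ( head->u.free.scrub_state == BUDDY_SCRUB_ABORT )
            cpu_relax();    /* the scrubber resets this when backing off */
    }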

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
Changes in v5:
* Fixed off-by-one error in setting first_dirty
* Changed struct page_info.u.free to a union to permit use of ACCESS_ONCE in
  check_and_stop_scrub()
* Renamed PAGE_SCRUBBING etc. macros to BUDDY_SCRUBBING etc.

 xen/common/page_alloc.c  | 105 +++++++++++++++++++++++++++++++++++++++++++++--
 xen/include/asm-arm/mm.h |  28 ++++++++-----
 xen/include/asm-x86/mm.h |  29 ++++++++-----
 3 files changed, 138 insertions(+), 24 deletions(-)

Comments

Jan Beulich June 27, 2017, 7:28 p.m. UTC | #1
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:56 PM >>>
> Changes in v5:
> * Fixed off-by-one error in setting first_dirty
> * Changed struct page_info.u.free to a union to permit use of ACCESS_ONCE in
>   check_and_stop_scrub()

I don't see the need for this:

> +static void check_and_stop_scrub(struct page_info *head)
> +{
> +    if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
> +    {
> +        struct page_info pg;
> +
> +        head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
> +        spin_lock_kick();
> +        for ( ; ; )
> +        {
> +            /* Can't ACCESS_ONCE() a bitfield. */
> +            pg.u.free.val = ACCESS_ONCE(head->u.free.val);

Something like ACCESS_ONCE(head->u.free).val ought to work (or read_atomic(),
due to the questionable scalar type check in ACCESS_ONCE()).

> @@ -1106,25 +1155,53 @@ bool scrub_free_pages(void)
>          do {
>              while ( !page_list_empty(&heap(node, zone, order)) )
>              {
> -                unsigned int i;
> +                unsigned int i, dirty_cnt;
> +                struct scrub_wait_state st;
>  
>                  /* Unscrubbed pages are always at the end of the list. */
>                  pg = page_list_last(&heap(node, zone, order));
>                  if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
>                      break;
>  
> +                ASSERT(!pg->u.free.scrub_state);

Please use BUDDY_NOT_SCRUBBING here.

> @@ -1138,6 +1215,17 @@ bool scrub_free_pages(void)
>                      }
>                  }
>  
> +                st.pg = pg;
> +                st.first_dirty = (i >= (1UL << order) - 1) ?
> +                    INVALID_DIRTY_IDX : i + 1;

Would you mind explaining to me (again?) why you can't set pg's first_dirty
directly here? In case I'm not mistaken and this has been asked before, maybe
this is a hint that a comment might be warranted.

Jan
Jan Beulich June 27, 2017, 7:31 p.m. UTC | #2
>>> "Jan Beulich" <jbeulich@suse.com> 06/27/17 9:29 PM >>>
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:56 PM >>>
>> +            /* Can't ACCESS_ONCE() a bitfield. */
>> +            pg.u.free.val = ACCESS_ONCE(head->u.free.val);
>
>Something like ACCESS_ONCE(head->u.free).val ought to work (or read_atomic(),
>due to the questionable scalar type check in ACCESS_ONCE()).

And (of course) I did mean .scrub_state here instead of .val.

Jan
Boris Ostrovsky July 23, 2017, 2:28 a.m. UTC | #3
On 06/27/2017 03:28 PM, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:56 PM >>>
>> Changes in v5:
>> * Fixed off-by-one error in setting first_dirty
>> * Changed struct page_info.u.free to a union to permit use of ACCESS_ONCE in
>>    check_and_stop_scrub()
> 
> I don't see the need for this:
> 
>> +static void check_and_stop_scrub(struct page_info *head)
>> +{
>> +    if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
>> +    {
>> +        struct page_info pg;
>> +
>> +        head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
>> +        spin_lock_kick();
>> +        for ( ; ; )
>> +        {
>> +            /* Can't ACCESS_ONCE() a bitfield. */
>> +            pg.u.free.val = ACCESS_ONCE(head->u.free.val);
> 
> Something like ACCESS_ONCE(head->u.free).val ought to work (or read_atomic(),
> due to the questionable scalar type check in ACCESS_ONCE()).

Hmm... I couldn't get this to work with either suggestion.

page_alloc.c:751:13: error: conversion to non-scalar type requested
              pg.u.free = read_atomic(&head->u.free);

page_alloc.c:753:6: error: conversion to non-scalar type requested
       if ( ACCESS_ONCE(head->u.free).scrub_state != BUDDY_SCRUB_ABORT )
> 
>> @@ -1106,25 +1155,53 @@ bool scrub_free_pages(void)
>>           do {
>>               while ( !page_list_empty(&heap(node, zone, order)) )
>>               {
>> -                unsigned int i;
>> +                unsigned int i, dirty_cnt;
>> +                struct scrub_wait_state st;
>>   
>>                   /* Unscrubbed pages are always at the end of the list. */
>>                   pg = page_list_last(&heap(node, zone, order));
>>                   if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
>>                       break;
>>   
>> +                ASSERT(!pg->u.free.scrub_state);
> 
> Please use BUDDY_NOT_SCRUBBING here.
> 
>> @@ -1138,6 +1215,17 @@ bool scrub_free_pages(void)
>>                       }
>>                   }
>>   
>> +                st.pg = pg;
>> +                st.first_dirty = (i >= (1UL << order) - 1) ?
>> +                    INVALID_DIRTY_IDX : i + 1;
> 
> Would you mind explaining to me (again?) why you can't set pg's first_dirty
> directly here? In case I'm not mistaken and this has been asked before, maybe
> this is a hint that a comment might be warranted.


In get_free_buddy() (formerly part of alloc_heap_pages()) I have

            /* Find smallest order which can satisfy the request. */
            for ( j = order; j <= MAX_ORDER; j++ )
            {
                if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
                {
                    if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
                        return pg;
                    /*
                     * We grab single pages (order=0) even if they are
                     * unscrubbed. Given that scrubbing one page is fairly
                     * quick it is not worth breaking higher orders.
                     */
                    if ( (order == 0) || use_unscrubbed )
                    {
                        check_and_stop_scrub(pg);
                        return pg;
                    }


If first_dirty gets assigned INVALID_DIRTY_IDX then get_free_buddy() 
will return pg right away without telling the scrubber that the buddy 
has been taken for use. The scrubber will then put the buddy back on the 
heap.


-boris
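
One possible form for the in-code comment Jan asked about, hung off the
st.first_dirty assignment in scrub_free_pages() (a sketch only, not part
of the submitted patch):

                st.pg = pg;
                /*
                 * Don't update pg->u.free.first_dirty yet; scrub_continue()
                 * does that once the heap lock is re-taken, and only if a
                 * waiter showed up.  If it were set to INVALID_DIRTY_IDX
                 * here, a concurrent get_free_buddy() could treat the buddy
                 * as fully scrubbed and return it without calling
                 * check_and_stop_scrub(), and the scrubber would afterwards
                 * put the now-allocated buddy back on the heap.
                 */
                st.first_dirty = (i >= (1UL << order) - 1) ?
                    INVALID_DIRTY_IDX : i + 1;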
Jan Beulich Aug. 2, 2017, 8:34 a.m. UTC | #4
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:28 AM >>>
>On 06/27/2017 03:28 PM, Jan Beulich wrote:
>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:56 PM >>>
>>> +static void check_and_stop_scrub(struct page_info *head)
>>> +{
>>> +    if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
>>> +    {
>>> +        struct page_info pg;
>>> +
>>> +        head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
>>> +        spin_lock_kick();
>>> +        for ( ; ; )
>>> +        {
>>> +            /* Can't ACCESS_ONCE() a bitfield. */
>>> +            pg.u.free.val = ACCESS_ONCE(head->u.free.val);
>> 
>> Something like ACCESS_ONCE(head->u.free).val ought to work (or read_atomic(),
>> due to the questionable scalar type check in ACCESS_ONCE()).
>
>Hmm... I couldn't get this to work with either suggestion.
>
>page_alloc.c:751:13: error: conversion to non-scalar type requested
>pg.u.free = read_atomic(&head->u.free);
>
>page_alloc.c:753:6: error: conversion to non-scalar type requested
>if ( ACCESS_ONCE(head->u.free).scrub_state != BUDDY_SCRUB_ABORT )

Oh, indeed. That's rather unfortunate.

Jan
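
(The "conversion to non-scalar type requested" errors arise because both
forms hand the whole struct to a macro that expects a scalar operand.
Hence the pattern check_and_stop_scrub() in the patch below falls back to:
take a scalar snapshot through the union's val member and inspect the
bitfield in the local copy.)

    struct page_info pg;

    /* One scalar read of the whole container word... */
    pg.u.free.val = ACCESS_ONCE(head->u.free.val);
    /* ...then examine the bitfield in the stable local copy. */
    if ( pg.u.free.scrub_state != BUDDY_SCRUB_ABORT )
        break;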

Patch

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 4e2775f..f0e5399 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -687,6 +687,7 @@  static void page_list_add_scrub(struct page_info *pg, unsigned int node,
 {
     PFN_ORDER(pg) = order;
     pg->u.free.first_dirty = first_dirty;
+    pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
 
     if ( first_dirty != INVALID_DIRTY_IDX )
         page_list_add_tail(pg, &heap(node, zone, order));
@@ -694,6 +695,25 @@  static void page_list_add_scrub(struct page_info *pg, unsigned int node,
         page_list_add(pg, &heap(node, zone, order));
 }
 
+static void check_and_stop_scrub(struct page_info *head)
+{
+    if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
+    {
+        struct page_info pg;
+
+        head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
+        spin_lock_kick();
+        for ( ; ; )
+        {
+            /* Can't ACCESS_ONCE() a bitfield. */
+            pg.u.free.val = ACCESS_ONCE(head->u.free.val);
+            if ( pg.u.free.scrub_state != BUDDY_SCRUB_ABORT )
+                break;
+            cpu_relax();
+        }
+    }
+}
+
 static struct page_info *get_free_buddy(unsigned int zone_lo,
                                         unsigned int zone_hi,
                                         unsigned int order, unsigned int memflags,
@@ -738,14 +758,19 @@  static struct page_info *get_free_buddy(unsigned int zone_lo,
             {
                 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
                 {
+                    if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
+                        return pg;
                     /*
                      * We grab single pages (order=0) even if they are
                      * unscrubbed. Given that scrubbing one page is fairly quick
                      * it is not worth breaking higher orders.
                      */
-                    if ( (order == 0) || use_unscrubbed ||
-                         pg->u.free.first_dirty == INVALID_DIRTY_IDX)
+                    if ( (order == 0) || use_unscrubbed )
+                    {
+                        check_and_stop_scrub(pg);
                         return pg;
+                    }
+
                     page_list_add_tail(pg, &heap(node, zone, j));
                 }
             }
@@ -928,6 +953,7 @@  static int reserve_offlined_page(struct page_info *head)
 
     cur_head = head;
 
+    check_and_stop_scrub(head);
     /*
      * We may break the buddy so let's mark the head as clean. Then, when
      * merging chunks back into the heap, we will see whether the chunk has
@@ -1084,6 +1110,29 @@  static unsigned int node_to_scrub(bool get_node)
     return closest;
 }
 
+struct scrub_wait_state {
+    struct page_info *pg;
+    unsigned int first_dirty;
+    bool drop;
+};
+
+static void scrub_continue(void *data)
+{
+    struct scrub_wait_state *st = data;
+
+    if ( st->drop )
+        return;
+
+    if ( st->pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
+    {
+        /* There is a waiter for this buddy. Release it. */
+        st->drop = true;
+        st->pg->u.free.first_dirty = st->first_dirty;
+        smp_wmb();
+        st->pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
+    }
+}
+
 bool scrub_free_pages(void)
 {
     struct page_info *pg;
@@ -1106,25 +1155,53 @@  bool scrub_free_pages(void)
         do {
             while ( !page_list_empty(&heap(node, zone, order)) )
             {
-                unsigned int i;
+                unsigned int i, dirty_cnt;
+                struct scrub_wait_state st;
 
                 /* Unscrubbed pages are always at the end of the list. */
                 pg = page_list_last(&heap(node, zone, order));
                 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
                     break;
 
+                ASSERT(!pg->u.free.scrub_state);
+                pg->u.free.scrub_state = BUDDY_SCRUBBING;
+
+                spin_unlock(&heap_lock);
+
+                dirty_cnt = 0;
+
                 for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
                 {
                     if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
                     {
                         scrub_one_page(&pg[i]);
+                        /*
+                         * We can modify count_info without holding heap
+                         * lock since we effectively locked this buddy by
+                         * setting its scrub_state.
+                         */
                         pg[i].count_info &= ~PGC_need_scrub;
-                        node_need_scrub[node]--;
+                        dirty_cnt++;
                         cnt += 100; /* scrubbed pages add heavier weight. */
                     }
                     else
                         cnt++;
 
+                    if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
+                    {
+                        /* Someone wants this chunk. Drop everything. */
+
+                        pg->u.free.first_dirty = (i == (1U << order) - 1) ?
+                            INVALID_DIRTY_IDX : i + 1; 
+                        smp_wmb();
+                        pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
+
+                        spin_lock(&heap_lock);
+                        node_need_scrub[node] -= dirty_cnt;
+                        spin_unlock(&heap_lock);
+                        goto out_nolock;
+                    }
+
                     /*
                      * Scrub a few (8) pages before becoming eligible for
                      * preemption. But also count non-scrubbing loop iterations
@@ -1138,6 +1215,17 @@  bool scrub_free_pages(void)
                     }
                 }
 
+                st.pg = pg;
+                st.first_dirty = (i >= (1UL << order) - 1) ?
+                    INVALID_DIRTY_IDX : i + 1;
+                st.drop = false;
+                spin_lock_cb(&heap_lock, scrub_continue, &st);
+
+                node_need_scrub[node] -= dirty_cnt;
+
+                if ( st.drop )
+                    goto out;
+
                 if ( i >= (1U << order) - 1 )
                 {
                     page_list_del(pg, &heap(node, zone, order));
@@ -1146,6 +1234,8 @@  bool scrub_free_pages(void)
                 else
                     pg->u.free.first_dirty = i + 1;
 
+                pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
+
                 if ( preempt || (node_need_scrub[node] == 0) )
                     goto out;
             }
@@ -1154,6 +1244,8 @@  bool scrub_free_pages(void)
 
  out:
     spin_unlock(&heap_lock);
+
+ out_nolock:
     node_clear(node, node_scrubbing);
     return softirq_pending(cpu) || (node_to_scrub(false) != NUMA_NO_NODE);
 }
@@ -1235,6 +1327,8 @@  static void free_heap_pages(
                  (phys_to_nid(page_to_maddr(predecessor)) != node) )
                 break;
 
+            check_and_stop_scrub(predecessor);
+
             page_list_del(predecessor, &heap(node, zone, order));
 
             if ( predecessor->u.free.first_dirty != INVALID_DIRTY_IDX )
@@ -1256,6 +1350,9 @@  static void free_heap_pages(
                  (PFN_ORDER(successor) != order) ||
                  (phys_to_nid(page_to_maddr(successor)) != node) )
                 break;
+
+            check_and_stop_scrub(successor);
+
             page_list_del(successor, &heap(node, zone, order));
 
             if ( successor->u.free.first_dirty != INVALID_DIRTY_IDX )
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index 889a85e..625aa16 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -42,18 +42,26 @@  struct page_info
             unsigned long type_info;
         } inuse;
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
-        struct {
-            /* Do TLBs need flushing for safety before next page use? */
-            unsigned long need_tlbflush:1;
-
-            /*
-             * Index of the first *possibly* unscrubbed page in the buddy.
-             * One more than maximum possible order (MAX_ORDER+1) to
-             * accommodate INVALID_DIRTY_IDX.
-             */
+        union {
+            struct {
+                /* Do TLBs need flushing for safety before next page use? */
+                unsigned long need_tlbflush:1;
+
+                /*
+                 * Index of the first *possibly* unscrubbed page in the buddy.
+                 * One more than maximum possible order (MAX_ORDER+1) to
+                 * accommodate INVALID_DIRTY_IDX.
+                 */
 #define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
-            unsigned long first_dirty:MAX_ORDER + 2;
+                unsigned long first_dirty:MAX_ORDER + 2;
+
+#define BUDDY_NOT_SCRUBBING   0
+#define BUDDY_SCRUBBING       1
+#define BUDDY_SCRUB_ABORT     2
+                unsigned long scrub_state:2;
+            };
 
+           unsigned long val;
         } free;
 
     } u;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index cd00bef..db6f3a5 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -86,17 +86,26 @@  struct page_info
         } sh;
 
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
-        struct {
-            /* Do TLBs need flushing for safety before next page use? */
-            unsigned long need_tlbflush:1;
-
-            /*
-             * Index of the first *possibly* unscrubbed page in the buddy.
-             * One more than maximum possible order (MAX_ORDER+1) to
-             * accommodate INVALID_DIRTY_IDX.
-             */
+        union {
+	    struct {
+		/* Do TLBs need flushing for safety before next page use? */
+		unsigned long need_tlbflush:1;
+
+		/*
+		 * Index of the first *possibly* unscrubbed page in the buddy.
+		 * One more than maximum possible order (MAX_ORDER+1) to
+		 * accommodate INVALID_DIRTY_IDX.
+		 */
 #define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
-            unsigned long first_dirty:MAX_ORDER + 2;
+		unsigned long first_dirty:MAX_ORDER + 2;
+
+#define BUDDY_NOT_SCRUBBING   0
+#define BUDDY_SCRUBBING       1
+#define BUDDY_SCRUB_ABORT     2
+		unsigned long scrub_state:2;
+	    };
+
+	    unsigned long val;
         } free;
 
     } u;