
[PATCH,v8,1/8] mm: Place unscrubbed pages at the end of pagelist

Message ID 1502908394-9760-2-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky Aug. 16, 2017, 6:33 p.m. UTC
.. so that it's easy to find pages that need to be scrubbed (those pages are
now marked with the _PGC_need_scrub bit).

We keep track of the first unscrubbed page in a page buddy using the
first_dirty field. For now it can have two values, 0 (the whole buddy needs
scrubbing) or INVALID_DIRTY_IDX (the buddy does not need to be scrubbed).
Subsequent patches will allow scrubbing to be interrupted, resulting in
first_dirty taking any value.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
Changes in v8:
* Changed x86's definition of page_info.u.free from using bitfields to natural
  datatypes
* Swapped order of bitfields in page_info.u.free for ARM
* Added BUILD_BUG_ON to check page_info.u.free.first_dirty size on x86, moved
  previously defined BUILD_BUG_ON from init_heap_pages() to init_boot_pages()
  (to avoid introducing extra '#ifdef x86' and to keep both together)

 xen/common/page_alloc.c  | 159 ++++++++++++++++++++++++++++++++++++++++-------
 xen/include/asm-arm/mm.h |  17 ++++-
 xen/include/asm-x86/mm.h |  15 +++++
 3 files changed, 167 insertions(+), 24 deletions(-)
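
[Editorial sketch, not part of the patch: the first_dirty arithmetic described
above, as used when alloc_heap_pages() halves a buddy. The split-off chunk
keeps first_dirty only if the dirty index falls inside it, otherwise it is
marked clean with INVALID_DIRTY_IDX, and the index kept for the remainder is
shifted down. The MAX_ORDER value and helper name below are illustrative only.]

    #include <stdio.h>

    #define MAX_ORDER          18                    /* illustrative value */
    #define INVALID_DIRTY_IDX  ((1U << (MAX_ORDER + 1)) - 1)

    /* first_dirty to record for a split-off chunk of 2^j pages. */
    static unsigned int chunk_first_dirty(unsigned int first_dirty,
                                          unsigned int j)
    {
        return (1U << j) > first_dirty ? first_dirty : INVALID_DIRTY_IDX;
    }

    int main(void)
    {
        unsigned int first_dirty = 0;   /* 0: whole buddy needs scrubbing */
        unsigned int j = 4;             /* split off a chunk of 2^4 pages */

        printf("chunk first_dirty: %u\n", chunk_first_dirty(first_dirty, j));

        /* Adjust the index for the rest of the buddy, as the patch does. */
        if ( first_dirty != INVALID_DIRTY_IDX )
            first_dirty = first_dirty >= (1U << j) ? first_dirty - (1U << j)
                                                   : 0;

        printf("remaining first_dirty: %u\n", first_dirty);
        return 0;
    }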

Comments

Julien Grall Aug. 17, 2017, 10:30 a.m. UTC | #1
Hi Boris,

On 16/08/17 19:33, Boris Ostrovsky wrote:
> .. so that it's easy to find pages that need to be scrubbed (those pages are
> now marked with _PGC_need_scrub bit).
>
> We keep track of the first unscrubbed page in a page buddy using first_dirty
> field. For now it can have two values, 0 (whole buddy needs scrubbing) or
> INVALID_DIRTY_IDX (the buddy does not need to be scrubbed). Subsequent patches
> will allow scrubbing to be interrupted, resulting in first_dirty taking any
> value.
>
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

For the ARM bits:

Acked-by: Julien Grall <julien.grall@arm.com>

Cheers,

Jan Beulich Aug. 18, 2017, 9:11 a.m. UTC | #2
>>> On 16.08.17 at 20:33, <boris.ostrovsky@oracle.com> wrote:
> .. so that it's easy to find pages that need to be scrubbed (those pages are
> now marked with _PGC_need_scrub bit).
> 
> We keep track of the first unscrubbed page in a page buddy using first_dirty
> field. For now it can have two values, 0 (whole buddy needs scrubbing) or
> INVALID_DIRTY_IDX (the buddy does not need to be scrubbed). Subsequent patches
> will allow scrubbing to be interrupted, resulting in first_dirty taking any
> value.
> 
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>
with one remark:

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -261,7 +261,11 @@ void __init init_boot_pages(paddr_t ps, paddr_t pe)
>  #ifdef CONFIG_X86
>      const unsigned long *badpage = NULL;
>      unsigned int i, array_size;
> +
> +    BUILD_BUG_ON(8 * sizeof(((struct page_info *)0)->u.free.first_dirty) <
> +                 MAX_ORDER + 1);
>  #endif
> +    BUILD_BUG_ON(sizeof(((struct page_info *)0)->u) != sizeof(unsigned long));

As I'm generally opposed to casts whenever one can get away
without, I dislike these as well. In the case here, short of a local
variable of suitable type, I'd suggest using frame_table instead
of the open-coded cast. If you're fine with that, this can easily
be done while committing.

Jan
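
[Editorial sketch, not part of the thread: frame_table is the global
struct page_info array, so the suggestion above would presumably turn the
x86-only first_dirty size check (where first_dirty is a plain unsigned int)
and the union size check into something like the following. Whether this
matches the text eventually committed is an assumption.]

    /* Assumed form of the suggestion; frame_table replaces the cast. */
    BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) <
                 MAX_ORDER + 1);
    BUILD_BUG_ON(sizeof(frame_table->u) != sizeof(unsigned long));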
Boris Ostrovsky Aug. 18, 2017, 1:11 p.m. UTC | #3
On 08/18/2017 05:11 AM, Jan Beulich wrote:
>>>> On 16.08.17 at 20:33, <boris.ostrovsky@oracle.com> wrote:
>> .. so that it's easy to find pages that need to be scrubbed (those pages are
>> now marked with _PGC_need_scrub bit).
>>
>> We keep track of the first unscrubbed page in a page buddy using first_dirty
>> field. For now it can have two values, 0 (whole buddy needs scrubbing) or
>> INVALID_DIRTY_IDX (the buddy does not need to be scrubbed). Subsequent patches
>> will allow scrubbing to be interrupted, resulting in first_dirty taking any
>> value.
>>
>> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> with one remark:
>
>> --- a/xen/common/page_alloc.c
>> +++ b/xen/common/page_alloc.c
>> @@ -261,7 +261,11 @@ void __init init_boot_pages(paddr_t ps, paddr_t pe)
>>  #ifdef CONFIG_X86
>>      const unsigned long *badpage = NULL;
>>      unsigned int i, array_size;
>> +
>> +    BUILD_BUG_ON(8 * sizeof(((struct page_info *)0)->u.free.first_dirty) <
>> +                 MAX_ORDER + 1);
>>  #endif
>> +    BUILD_BUG_ON(sizeof(((struct page_info *)0)->u) != sizeof(unsigned long));
> As I'm generally opposed to casts whenever one can get away
> without, I dislike these as well. In the case here, short of a local
> variable of suitable type, I'd suggest using frame_table instead
> of the open-coded cast. If you're fine with that, this can easily
> be done while committing.

Sure.


-boris
Jan Beulich Aug. 21, 2017, 1:49 p.m. UTC | #4
>>> On 17.08.17 at 12:30, <julien.grall@arm.com> wrote:
> On 16/08/17 19:33, Boris Ostrovsky wrote:
>> .. so that it's easy to find pages that need to be scrubbed (those pages are
>> now marked with _PGC_need_scrub bit).
>>
>> We keep track of the first unscrubbed page in a page buddy using first_dirty
>> field. For now it can have two values, 0 (whole buddy needs scrubbing) or
>> INVALID_DIRTY_IDX (the buddy does not need to be scrubbed). Subsequent patches
>> will allow scrubbing to be interrupted, resulting in first_dirty taking any
>> value.
>>
>> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> 
> For the ARM bits:
> 
> Acked-by: Julien Grall <julien.grall@arm.com>

I had started committing the series when I noticed that patches 4, 5, and 6
are still lacking ARM-side acks.

Jan
Julien Grall Aug. 21, 2017, 5 p.m. UTC | #5
Hi Jan,

On 21/08/17 14:49, Jan Beulich wrote:
>>>> On 17.08.17 at 12:30, <julien.grall@arm.com> wrote:
>> On 16/08/17 19:33, Boris Ostrovsky wrote:
>>> .. so that it's easy to find pages that need to be scrubbed (those pages are
>>> now marked with _PGC_need_scrub bit).
>>>
>>> We keep track of the first unscrubbed page in a page buddy using first_dirty
>>> field. For now it can have two values, 0 (whole buddy needs scrubbing) or
>>> INVALID_DIRTY_IDX (the buddy does not need to be scrubbed). Subsequent patches
>>> will allow scrubbing to be interrupted, resulting in first_dirty taking any
>>> value.
>>>
>>> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>>
>> For the ARM bits:
>>
>> Acked-by: Julien Grall <julien.grall@arm.com>
>
> I've started committing the series when I noticed patches 4, 5, and 6
> are still lacking ARM side acks.

Whoops, thank you for the reminder. You can add my ack on patches 4, 5, and 6:

Acked-by: Julien Grall <julien.grall@arm.com>

Cheers,

Patch

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 444ecf3..a39fd81 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -261,7 +261,11 @@  void __init init_boot_pages(paddr_t ps, paddr_t pe)
 #ifdef CONFIG_X86
     const unsigned long *badpage = NULL;
     unsigned int i, array_size;
+
+    BUILD_BUG_ON(8 * sizeof(((struct page_info *)0)->u.free.first_dirty) <
+                 MAX_ORDER + 1);
 #endif
+    BUILD_BUG_ON(sizeof(((struct page_info *)0)->u) != sizeof(unsigned long));
 
     ps = round_pgup(ps);
     pe = round_pgdown(pe);
@@ -375,6 +379,8 @@  typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
 #define heap(node, zone, order) ((*_heap[node])[zone][order])
 
+static unsigned long node_need_scrub[MAX_NUMNODES];
+
 static unsigned long *avail[MAX_NUMNODES];
 static long total_avail_pages;
 
@@ -670,13 +676,30 @@  static void check_low_mem_virq(void)
     }
 }
 
+/* Pages that need a scrub are added to tail, otherwise to head. */
+static void page_list_add_scrub(struct page_info *pg, unsigned int node,
+                                unsigned int zone, unsigned int order,
+                                unsigned int first_dirty)
+{
+    PFN_ORDER(pg) = order;
+    pg->u.free.first_dirty = first_dirty;
+
+    if ( first_dirty != INVALID_DIRTY_IDX )
+    {
+        ASSERT(first_dirty < (1U << order));
+        page_list_add_tail(pg, &heap(node, zone, order));
+    }
+    else
+        page_list_add(pg, &heap(node, zone, order));
+}
+
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
     unsigned int order, unsigned int memflags,
     struct domain *d)
 {
-    unsigned int i, j, zone = 0, nodemask_retry = 0;
+    unsigned int i, j, zone = 0, nodemask_retry = 0, first_dirty;
     nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
     unsigned long request = 1UL << order;
     struct page_info *pg;
@@ -790,12 +813,26 @@  static struct page_info *alloc_heap_pages(
     return NULL;
 
  found: 
+
+    first_dirty = pg->u.free.first_dirty;
+
     /* We may have to halve the chunk a number of times. */
     while ( j != order )
     {
-        PFN_ORDER(pg) = --j;
-        page_list_add_tail(pg, &heap(node, zone, j));
-        pg += 1 << j;
+        j--;
+        page_list_add_scrub(pg, node, zone, j,
+                            (1U << j) > first_dirty ?
+                            first_dirty : INVALID_DIRTY_IDX);
+        pg += 1U << j;
+
+        if ( first_dirty != INVALID_DIRTY_IDX )
+        {
+            /* Adjust first_dirty */
+            if ( first_dirty >= 1U << j )
+                first_dirty -= 1U << j;
+            else
+                first_dirty = 0; /* We've moved past original first_dirty */
+        }
     }
 
     ASSERT(avail[node][zone] >= request);
@@ -842,12 +879,20 @@  static int reserve_offlined_page(struct page_info *head)
     unsigned int node = phys_to_nid(page_to_maddr(head));
     int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
     struct page_info *cur_head;
-    int cur_order;
+    unsigned int cur_order, first_dirty;
 
     ASSERT(spin_is_locked(&heap_lock));
 
     cur_head = head;
 
+    /*
+     * We may break the buddy so let's mark the head as clean. Then, when
+     * merging chunks back into the heap, we will see whether the chunk has
+     * unscrubbed pages and set its first_dirty properly.
+     */
+    first_dirty = head->u.free.first_dirty;
+    head->u.free.first_dirty = INVALID_DIRTY_IDX;
+
     page_list_del(head, &heap(node, zone, head_order));
 
     while ( cur_head < (head + (1 << head_order)) )
@@ -858,6 +903,8 @@  static int reserve_offlined_page(struct page_info *head)
         if ( page_state_is(cur_head, offlined) )
         {
             cur_head++;
+            if ( first_dirty != INVALID_DIRTY_IDX && first_dirty )
+                first_dirty--;
             continue;
         }
 
@@ -884,9 +931,20 @@  static int reserve_offlined_page(struct page_info *head)
             {
             merge:
                 /* We don't consider merging outside the head_order. */
-                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
-                PFN_ORDER(cur_head) = cur_order;
+                page_list_add_scrub(cur_head, node, zone, cur_order,
+                                    (1U << cur_order) > first_dirty ?
+                                    first_dirty : INVALID_DIRTY_IDX);
                 cur_head += (1 << cur_order);
+
+                /* Adjust first_dirty if needed. */
+                if ( first_dirty != INVALID_DIRTY_IDX )
+                {
+                    if ( first_dirty >=  1U << cur_order )
+                        first_dirty -= 1U << cur_order;
+                    else
+                        first_dirty = 0;
+                }
+
                 break;
             }
         }
@@ -911,9 +969,53 @@  static int reserve_offlined_page(struct page_info *head)
     return count;
 }
 
+static void scrub_free_pages(unsigned int node)
+{
+    struct page_info *pg;
+    unsigned int zone;
+
+    ASSERT(spin_is_locked(&heap_lock));
+
+    if ( !node_need_scrub[node] )
+        return;
+
+    for ( zone = 0; zone < NR_ZONES; zone++ )
+    {
+        unsigned int order = MAX_ORDER;
+
+        do {
+            while ( !page_list_empty(&heap(node, zone, order)) )
+            {
+                unsigned int i;
+
+                /* Unscrubbed pages are always at the end of the list. */
+                pg = page_list_last(&heap(node, zone, order));
+                if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
+                    break;
+
+                for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
+                {
+                    if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
+                    {
+                        scrub_one_page(&pg[i]);
+                        pg[i].count_info &= ~PGC_need_scrub;
+                        node_need_scrub[node]--;
+                    }
+                }
+
+                page_list_del(pg, &heap(node, zone, order));
+                page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
+
+                if ( node_need_scrub[node] == 0 )
+                    return;
+            }
+        } while ( order-- != 0 );
+    }
+}
+
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
-    struct page_info *pg, unsigned int order)
+    struct page_info *pg, unsigned int order, bool need_scrub)
 {
     unsigned long mask, mfn = page_to_mfn(pg);
     unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
@@ -953,10 +1055,20 @@  static void free_heap_pages(
         /* This page is not a guest frame any more. */
         page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
         set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
+
+        if ( need_scrub )
+            pg[i].count_info |= PGC_need_scrub;
     }
 
     avail[node][zone] += 1 << order;
     total_avail_pages += 1 << order;
+    if ( need_scrub )
+    {
+        node_need_scrub[node] += 1 << order;
+        pg->u.free.first_dirty = 0;
+    }
+    else
+        pg->u.free.first_dirty = INVALID_DIRTY_IDX;
 
     if ( tmem_enabled() )
         midsize_alloc_zone_pages = max(
@@ -980,6 +1092,12 @@  static void free_heap_pages(
 
             page_list_del(predecessor, &heap(node, zone, order));
 
+            /* Keep predecessor's first_dirty if it is already set. */
+            if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
+                 pg->u.free.first_dirty != INVALID_DIRTY_IDX )
+                predecessor->u.free.first_dirty = (1U << order) +
+                                                  pg->u.free.first_dirty;
+
             pg = predecessor;
         }
         else
@@ -999,12 +1117,14 @@  static void free_heap_pages(
         order++;
     }
 
-    PFN_ORDER(pg) = order;
-    page_list_add_tail(pg, &heap(node, zone, order));
+    page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty);
 
     if ( tainted )
         reserve_offlined_page(pg);
 
+    if ( need_scrub )
+        scrub_free_pages(node);
+
     spin_unlock(&heap_lock);
 }
 
@@ -1225,7 +1345,7 @@  unsigned int online_page(unsigned long mfn, uint32_t *status)
     spin_unlock(&heap_lock);
 
     if ( (y & PGC_state) == PGC_state_offlined )
-        free_heap_pages(pg, 0);
+        free_heap_pages(pg, 0, false);
 
     return ret;
 }
@@ -1294,7 +1414,7 @@  static void init_heap_pages(
             nr_pages -= n;
         }
 
-        free_heap_pages(pg+i, 0);
+        free_heap_pages(pg + i, 0, false);
     }
 }
 
@@ -1621,7 +1741,7 @@  void free_xenheap_pages(void *v, unsigned int order)
 
     memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
 
-    free_heap_pages(virt_to_page(v), order);
+    free_heap_pages(virt_to_page(v), order, false);
 }
 
 #else
@@ -1675,12 +1795,9 @@  void free_xenheap_pages(void *v, unsigned int order)
     pg = virt_to_page(v);
 
     for ( i = 0; i < (1u << order); i++ )
-    {
-        scrub_one_page(&pg[i]);
         pg[i].count_info &= ~PGC_xen_heap;
-    }
 
-    free_heap_pages(pg, order);
+    free_heap_pages(pg, order, true);
 }
 
 #endif
@@ -1789,7 +1906,7 @@  struct page_info *alloc_domheap_pages(
     if ( d && !(memflags & MEMF_no_owner) &&
          assign_pages(d, pg, order, memflags) )
     {
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, false);
         return NULL;
     }
     
@@ -1857,11 +1974,7 @@  void free_domheap_pages(struct page_info *pg, unsigned int order)
             scrub = 1;
         }
 
-        if ( unlikely(scrub) )
-            for ( i = 0; i < (1 << order); i++ )
-                scrub_one_page(&pg[i]);
-
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, scrub);
     }
 
     if ( drop_dom_ref )
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index ef84b72..3b3d38f 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -43,8 +43,16 @@  struct page_info
         } inuse;
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
         struct {
+            /*
+             * Index of the first *possibly* unscrubbed page in the buddy.
+             * One more bit than maximum possible order to accommodate
+             * INVALID_DIRTY_IDX.
+             */
+#define INVALID_DIRTY_IDX ((1UL << (MAX_ORDER + 1)) - 1)
+            unsigned long first_dirty:MAX_ORDER + 1;
+
             /* Do TLBs need flushing for safety before next page use? */
-            bool_t need_tlbflush;
+            bool need_tlbflush:1;
         } free;
 
     } u;
@@ -107,6 +115,13 @@  struct page_info
 #define PGC_count_width   PG_shift(9)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
+/*
+ * Page needs to be scrubbed. Since this bit can only be set on a page that is
+ * free (i.e. in PGC_state_free) we can reuse PGC_allocated bit.
+ */
+#define _PGC_need_scrub   _PGC_allocated
+#define PGC_need_scrub    PGC_allocated
+
 extern mfn_t xenheap_mfn_start, xenheap_mfn_end;
 extern vaddr_t xenheap_virt_end;
 #ifdef CONFIG_ARM_64
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 2bf3f33..86b1723 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -87,6 +87,14 @@  struct page_info
 
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
         struct {
+            /*
+             * Index of the first *possibly* unscrubbed page in the buddy.
+             * One more bit than maximum possible order to accommodate
+             * INVALID_DIRTY_IDX.
+             */
+#define INVALID_DIRTY_IDX ((1UL << (MAX_ORDER + 1)) - 1)
+            unsigned int first_dirty;
+
             /* Do TLBs need flushing for safety before next page use? */
             bool_t need_tlbflush;
         } free;
@@ -233,6 +241,13 @@  struct page_info
 #define PGC_count_width   PG_shift(9)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
+/*
+ * Page needs to be scrubbed. Since this bit can only be set on a page that is
+ * free (i.e. in PGC_state_free) we can reuse PGC_allocated bit.
+ */
+#define _PGC_need_scrub   _PGC_allocated
+#define PGC_need_scrub    PGC_allocated
+
 #define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap)
 #define is_xen_heap_mfn(mfn) \
     (__mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn)))