diff mbox series

[RFC,1/3] mm: support hugetlb free page reporting

Message ID 20201222074656.GA30035@open-light-1.localdomain (mailing list archive)
State New, archived
Headers show
Series add support for free hugepage reporting | expand

Commit Message

Liang Li Dec. 22, 2020, 7:46 a.m. UTC
Free page reporting only supports buddy pages, it can't report the
free pages reserved for hugetlbfs case. On the other hand, hugetlbfs
is a good choice for a system with a huge amount of RAM, because it
can help to reduce the memory management overhead and improve system
performance.
This patch adds support for reporting hugepages in the free list
of hugetlb; it can be used by the virtio_balloon driver for memory
overcommit and to pre-zero free pages to speed up memory population.

Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Hildenbrand <david@redhat.com>  
Cc: Michal Hocko <mhocko@suse.com> 
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Liang Li <liliang324@gmail.com>
Signed-off-by: Liang Li <liliangleo@didiglobal.com>
---
 include/linux/hugetlb.h        |   3 +
 include/linux/page_reporting.h |   5 +
 mm/hugetlb.c                   |  29 ++++
 mm/page_reporting.c            | 287 +++++++++++++++++++++++++++++++++
 mm/page_reporting.h            |  34 ++++
 5 files changed, 358 insertions(+)

Comments

Alexander Duyck Dec. 22, 2020, 7:59 p.m. UTC | #1
On Mon, Dec 21, 2020 at 11:47 PM Liang Li <liliang.opensource@gmail.com> wrote:
>
> Free page reporting only supports buddy pages, it can't report the
> free pages reserved for hugetlbfs case. On the other hand, hugetlbfs
> is a good choice for a system with a huge amount of RAM, because it
> can help to reduce the memory management overhead and improve system
> performance.
> This patch add the support for reporting hugepages in the free list
> of hugetlb, it canbe used by virtio_balloon driver for memory
> overcommit and pre zero out free pages for speeding up memory population.
>
> Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Liang Li <liliang324@gmail.com>
> Signed-off-by: Liang Li <liliangleo@didiglobal.com>
> ---
>  include/linux/hugetlb.h        |   3 +
>  include/linux/page_reporting.h |   5 +
>  mm/hugetlb.c                   |  29 ++++
>  mm/page_reporting.c            | 287 +++++++++++++++++++++++++++++++++
>  mm/page_reporting.h            |  34 ++++
>  5 files changed, 358 insertions(+)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index ebca2ef02212..a72ad25501d3 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -11,6 +11,7 @@
>  #include <linux/kref.h>
>  #include <linux/pgtable.h>
>  #include <linux/gfp.h>
> +#include <linux/page_reporting.h>
>
>  struct ctl_table;
>  struct user_struct;
> @@ -114,6 +115,8 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
>  int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
>                 loff_t *);
>
> +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid);
> +void putback_isolate_huge_page(struct hstate *h, struct page *page);
>  int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
>  long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>                          struct page **, struct vm_area_struct **,
> diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
> index 63e1e9fbcaa2..0da3d1a6f0cc 100644
> --- a/include/linux/page_reporting.h
> +++ b/include/linux/page_reporting.h
> @@ -7,6 +7,7 @@
>
>  /* This value should always be a power of 2, see page_reporting_cycle() */
>  #define PAGE_REPORTING_CAPACITY                32
> +#define HUGEPAGE_REPORTING_CAPACITY    1
>
>  struct page_reporting_dev_info {
>         /* function that alters pages to make them "reported" */
> @@ -26,4 +27,8 @@ struct page_reporting_dev_info {
>  /* Tear-down and bring-up for page reporting devices */
>  void page_reporting_unregister(struct page_reporting_dev_info *prdev);
>  int page_reporting_register(struct page_reporting_dev_info *prdev);
> +
> +/* Tear-down and bring-up for hugepage reporting devices */
> +void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev);
> +int hugepage_reporting_register(struct page_reporting_dev_info *prdev);
>  #endif /*_LINUX_PAGE_REPORTING_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index cbf32d2824fd..de6ce147dfe2 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -41,6 +41,7 @@
>  #include <linux/node.h>
>  #include <linux/userfaultfd_k.h>
>  #include <linux/page_owner.h>
> +#include "page_reporting.h"
>  #include "internal.h"
>
>  int hugetlb_max_hstate __read_mostly;
> @@ -1028,6 +1029,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
>         list_move(&page->lru, &h->hugepage_freelists[nid]);
>         h->free_huge_pages++;
>         h->free_huge_pages_node[nid]++;
> +       if (hugepage_reported(page)) {
> +               __ClearPageReported(page);
> +               pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages);
> +       }
> +       hugepage_reporting_notify_free(h->order);
>  }
>
>  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> @@ -5531,6 +5537,29 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
>         return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
>  }
>
> +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid)
> +{
> +       bool ret = true;
> +
> +       VM_BUG_ON_PAGE(!PageHead(page), page);
> +
> +       list_move(&page->lru, &h->hugepage_activelist);
> +       set_page_refcounted(page);
> +       h->free_huge_pages--;
> +       h->free_huge_pages_node[nid]--;
> +
> +       return ret;
> +}
> +
> +void putback_isolate_huge_page(struct hstate *h, struct page *page)
> +{
> +       int nid = page_to_nid(page);
> +       pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages);
> +       list_move(&page->lru, &h->hugepage_freelists[nid]);
> +       h->free_huge_pages++;
> +       h->free_huge_pages_node[nid]++;
> +}
> +
>  bool isolate_huge_page(struct page *page, struct list_head *list)
>  {
>         bool ret = true;
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 20ec3fb1afc4..15d4b5372df8 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -7,6 +7,7 @@
>  #include <linux/delay.h>
>  #include <linux/scatterlist.h>
>  #include <linux/sched.h>
> +#include <linux/hugetlb.h>
>
>  #include "page_reporting.h"
>  #include "internal.h"
> @@ -16,6 +17,10 @@ static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>  int page_report_mini_order = pageblock_order;
>  unsigned long page_report_batch_size = 32 * 1024 * 1024;
>
> +static struct page_reporting_dev_info __rcu *hgpr_dev_info __read_mostly;
> +int hugepage_report_mini_order = pageblock_order;
> +unsigned long hugepage_report_batch_size = 64 * 1024 * 1024;
> +
>  enum {
>         PAGE_REPORTING_IDLE = 0,
>         PAGE_REPORTING_REQUESTED,
> @@ -67,6 +72,24 @@ void __page_reporting_notify(void)
>         rcu_read_unlock();
>  }
>
> +/* notify prdev of free hugepage reporting request */
> +void __hugepage_reporting_notify(void)
> +{
> +       struct page_reporting_dev_info *prdev;
> +
> +       /*
> +        * We use RCU to protect the pr_dev_info pointer. In almost all
> +        * cases this should be present, however in the unlikely case of
> +        * a shutdown this will be NULL and we should exit.
> +        */
> +       rcu_read_lock();
> +       prdev = rcu_dereference(hgpr_dev_info);
> +       if (likely(prdev))
> +               __page_reporting_request(prdev);
> +
> +       rcu_read_unlock();
> +}
> +
>  static void
>  page_reporting_drain(struct page_reporting_dev_info *prdev,
>                      struct scatterlist *sgl, unsigned int nents, bool reported)
> @@ -103,6 +126,213 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
>         sg_init_table(sgl, nents);
>  }
>
> +static void
> +hugepage_reporting_drain(struct page_reporting_dev_info *prdev,
> +                        struct hstate *h, struct scatterlist *sgl,
> +                        unsigned int nents, bool reported)
> +{
> +       struct scatterlist *sg = sgl;
> +
> +       /*
> +        * Drain the now reported pages back into their respective
> +        * free lists/areas. We assume at least one page is populated.
> +        */
> +       do {
> +               struct page *page = sg_page(sg);
> +
> +               putback_isolate_huge_page(h, page);
> +
> +               /* If the pages were not reported due to error skip flagging */
> +               if (!reported)
> +                       continue;
> +
> +               __SetPageReported(page);
> +       } while ((sg = sg_next(sg)));
> +
> +       /* reinitialize scatterlist now that it is empty */
> +       sg_init_table(sgl, nents);
> +}
> +
> +/*
> + * The page reporting cycle consists of 4 stages, fill, report, drain, and
> + * idle. We will cycle through the first 3 stages until we cannot obtain a
> + * full scatterlist of pages, in that case we will switch to idle.
> + */
> +static int
> +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> +                        struct hstate *h, unsigned int nid,
> +                        struct scatterlist *sgl, unsigned int *offset)
> +{
> +       struct list_head *list = &h->hugepage_freelists[nid];
> +       unsigned int page_len = PAGE_SIZE << h->order;
> +       struct page *page, *next;
> +       long budget;
> +       int ret = 0, scan_cnt = 0;
> +
> +       /*
> +        * Perform early check, if free area is empty there is
> +        * nothing to process so we can skip this free_list.
> +        */
> +       if (list_empty(list))
> +               return ret;
> +
> +       spin_lock_irq(&hugetlb_lock);
> +
> +       if (huge_page_order(h) > MAX_ORDER)
> +               budget = HUGEPAGE_REPORTING_CAPACITY;
> +       else
> +               budget = HUGEPAGE_REPORTING_CAPACITY * 32;

Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
don't even really need budget since this should probably be pulling
out no more than one hugepage at a time.


> +       /* loop through free list adding unreported pages to sg list */
> +       list_for_each_entry_safe(page, next, list, lru) {
> +               /* We are going to skip over the reported pages. */
> +               if (PageReported(page)) {
> +                       if (++scan_cnt >= MAX_SCAN_NUM) {
> +                               ret = scan_cnt;
> +                               break;
> +                       }
> +                       continue;
> +               }
> +

It would probably have been better to place this set before your new
set. I don't see your new set necessarily being the best use for page
reporting.

> +               /*
> +                * If we fully consumed our budget then update our
> +                * state to indicate that we are requesting additional
> +                * processing and exit this list.
> +                */
> +               if (budget < 0) {
> +                       atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
> +                       next = page;
> +                       break;
> +               }
> +

If budget is only ever going to be 1 then we probably could just look
at making this the default case for any time we find a non-reported
page.

> +               /* Attempt to pull page from list and place in scatterlist */
> +               if (*offset) {
> +                       isolate_free_huge_page(page, h, nid);
> +                       /* Add page to scatter list */
> +                       --(*offset);
> +                       sg_set_page(&sgl[*offset], page, page_len, 0);
> +
> +                       continue;
> +               }
> +

There is no point in the continue case if we only have a budget of 1.
We should probably just tighten up the loop so that all it does is
search until it finds the 1 page it can pull, pull it, and then return
it. The scatterlist doesn't serve much purpose and could be reduced to
just a single entry.

> +               /*
> +                * Make the first non-processed page in the free list
> +                * the new head of the free list before we release the
> +                * zone lock.
> +                */
> +               if (&page->lru != list && !list_is_first(&page->lru, list))
> +                       list_rotate_to_front(&page->lru, list);
> +
> +               /* release lock before waiting on report processing */
> +               spin_unlock_irq(&hugetlb_lock);
> +
> +               /* begin processing pages in local list */
> +               ret = prdev->report(prdev, sgl, HUGEPAGE_REPORTING_CAPACITY);
> +
> +               /* reset offset since the full list was reported */
> +               *offset = HUGEPAGE_REPORTING_CAPACITY;
> +
> +               /* update budget to reflect call to report function */
> +               budget--;
> +
> +               /* reacquire zone lock and resume processing */
> +               spin_lock_irq(&hugetlb_lock);
> +
> +               /* flush reported pages from the sg list */
> +               hugepage_reporting_drain(prdev, h, sgl,
> +                                        HUGEPAGE_REPORTING_CAPACITY, !ret);
> +
> +               /*
> +                * Reset next to first entry, the old next isn't valid
> +                * since we dropped the lock to report the pages
> +                */
> +               next = list_first_entry(list, struct page, lru);
> +
> +               /* exit on error */
> +               if (ret)
> +                       break;
> +       }
> +
> +       /* Rotate any leftover pages to the head of the freelist */
> +       if (&next->lru != list && !list_is_first(&next->lru, list))
> +               list_rotate_to_front(&next->lru, list);
> +
> +       spin_unlock_irq(&hugetlb_lock);
> +
> +       return ret;
> +}
> +
> +static int
> +hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev,
> +                           struct scatterlist *sgl, struct hstate *h)
> +{
> +       unsigned int leftover, offset = HUGEPAGE_REPORTING_CAPACITY;
> +       int ret = 0, nid;
> +
> +       for (nid = 0; nid < MAX_NUMNODES; nid++) {
> +               ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset);
> +
> +               if (ret < 0)
> +                       return ret;
> +       }
> +
> +       /* report the leftover pages before going idle */
> +       leftover = HUGEPAGE_REPORTING_CAPACITY - offset;
> +       if (leftover) {
> +               sgl = &sgl[offset];
> +               ret = prdev->report(prdev, sgl, leftover);
> +
> +               /* flush any remaining pages out from the last report */
> +               spin_lock_irq(&hugetlb_lock);
> +               hugepage_reporting_drain(prdev, h, sgl, leftover, !ret);
> +               spin_unlock_irq(&hugetlb_lock);
> +       }
> +
> +       return ret;
> +}
> +

If HUGEPAGE_REPORTING_CAPACITY is 1, it would make more sense to
rewrite this code to optimize for a find-one-page-and-process-it
approach rather than trying to batch pages.

> +static void hugepage_reporting_process(struct work_struct *work)
> +{
> +       struct delayed_work *d_work = to_delayed_work(work);
> +       struct page_reporting_dev_info *prdev = container_of(d_work,
> +                                       struct page_reporting_dev_info, work);
> +       int err = 0, state = PAGE_REPORTING_ACTIVE;
> +       struct scatterlist *sgl;
> +       struct hstate *h;
> +
> +       /*
> +        * Change the state to "Active" so that we can track if there is
> +        * anyone requests page reporting after we complete our pass. If
> +        * the state is not altered by the end of the pass we will switch
> +        * to idle and quit scheduling reporting runs.
> +        */
> +       atomic_set(&prdev->state, state);
> +
> +       /* allocate scatterlist to store pages being reported on */
> +       sgl = kmalloc_array(HUGEPAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
> +       if (!sgl)
> +               goto err_out;
> +
> +       sg_init_table(sgl, HUGEPAGE_REPORTING_CAPACITY);
> +
> +       for_each_hstate(h) {
> +               err = hugepage_reporting_process_hstate(prdev, sgl, h);
> +               if (err)
> +                       break;
> +       }
> +
> +       kfree(sgl);
> +err_out:
> +       /*
> +        * If the state has reverted back to requested then there may be
> +        * additional pages to be processed. We will defer for 2s to allow
> +        * more pages to accumulate.
> +        */
> +       state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
> +       if (state == PAGE_REPORTING_REQUESTED)
> +               schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
> +}
> +
>  /*
>   * The page reporting cycle consists of 4 stages, fill, report, drain, and
>   * idle. We will cycle through the first 3 stages until we cannot obtain a
> @@ -341,6 +571,9 @@ static void page_reporting_process(struct work_struct *work)
>  static DEFINE_MUTEX(page_reporting_mutex);
>  DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
>
> +static DEFINE_MUTEX(hugepage_reporting_mutex);
> +DEFINE_STATIC_KEY_FALSE(hugepage_reporting_enabled);
> +
>  int page_reporting_register(struct page_reporting_dev_info *prdev)
>  {
>         int err = 0;
> @@ -395,3 +628,57 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
>         mutex_unlock(&page_reporting_mutex);
>  }
>  EXPORT_SYMBOL_GPL(page_reporting_unregister);
> +
> +int hugepage_reporting_register(struct page_reporting_dev_info *prdev)
> +{
> +       int err = 0;
> +
> +       mutex_lock(&hugepage_reporting_mutex);
> +
> +       /* nothing to do if already in use */
> +       if (rcu_access_pointer(hgpr_dev_info)) {
> +               err = -EBUSY;
> +               goto err_out;
> +       }
> +
> +       /* initialize state and work structures */
> +       atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
> +       INIT_DELAYED_WORK(&prdev->work, &hugepage_reporting_process);
> +
> +       /* Begin initial flush of zones */
> +       __page_reporting_request(prdev);
> +
> +       /* Assign device to allow notifications */
> +       rcu_assign_pointer(hgpr_dev_info, prdev);
> +
> +       hugepage_report_mini_order = prdev->mini_order;
> +       hugepage_report_batch_size = prdev->batch_size;
> +
> +       /* enable hugepage reporting notification */
> +       if (!static_key_enabled(&hugepage_reporting_enabled)) {
> +               static_branch_enable(&hugepage_reporting_enabled);
> +               pr_info("Free hugepage reporting enabled\n");
> +       }
> +err_out:
> +       mutex_unlock(&hugepage_reporting_mutex);
> +
> +       return err;
> +}
> +EXPORT_SYMBOL_GPL(hugepage_reporting_register);
> +
> +void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev)
> +{
> +       mutex_lock(&hugepage_reporting_mutex);
> +
> +       if (rcu_access_pointer(hgpr_dev_info) == prdev) {
> +               /* Disable page reporting notification */
> +               RCU_INIT_POINTER(hgpr_dev_info, NULL);
> +               synchronize_rcu();
> +
> +               /* Flush any existing work, and lock it out */
> +               cancel_delayed_work_sync(&prdev->work);
> +       }
> +
> +       mutex_unlock(&hugepage_reporting_mutex);
> +}
> +EXPORT_SYMBOL_GPL(hugepage_reporting_unregister);
> diff --git a/mm/page_reporting.h b/mm/page_reporting.h
> index 86ac6ffad970..271c64c3c3cb 100644
> --- a/mm/page_reporting.h
> +++ b/mm/page_reporting.h
> @@ -18,12 +18,24 @@ extern unsigned long page_report_batch_size;
>  DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
>  void __page_reporting_notify(void);
>
> +extern int hugepage_report_mini_order;
> +extern unsigned long hugepage_report_batch_size;
> +
> +DECLARE_STATIC_KEY_FALSE(hugepage_reporting_enabled);
> +void __hugepage_reporting_notify(void);
> +
>  static inline bool page_reported(struct page *page)
>  {
>         return static_branch_unlikely(&page_reporting_enabled) &&
>                PageReported(page);
>  }
>
> +static inline bool hugepage_reported(struct page *page)
> +{
> +       return static_branch_unlikely(&hugepage_reporting_enabled) &&
> +              PageReported(page);
> +}
> +
>  /**
>   * page_reporting_notify_free - Free page notification to start page processing
>   *
> @@ -52,11 +64,33 @@ static inline void page_reporting_notify_free(unsigned int order)
>                 __page_reporting_notify();
>         }
>  }
> +
> +static inline void hugepage_reporting_notify_free(unsigned int order)
> +{
> +       static long batch_size = 0;
> +
> +       if (!static_branch_unlikely(&hugepage_reporting_enabled))
> +               return;
> +
> +       /* Determine if we have crossed reporting threshold */
> +       if (order < hugepage_report_mini_order)
> +               return;
> +
> +       batch_size += (1 << order) << PAGE_SHIFT;
> +       if (batch_size >= hugepage_report_batch_size) {
> +               batch_size = 0;
> +               __hugepage_reporting_notify();
> +       }
> +}
>  #else /* CONFIG_PAGE_REPORTING */
>  #define page_reported(_page)   false
>
>  static inline void page_reporting_notify_free(unsigned int order)
>  {
>  }
> +
> +static inline void hugepage_reporting_notify_free(unsigned int order)
> +{
> +}
>  #endif /* CONFIG_PAGE_REPORTING */
>  #endif /*_MM_PAGE_REPORTING_H */
> --
> 2.18.2
>
>
Mike Kravetz Dec. 22, 2020, 10:30 p.m. UTC | #2
On 12/21/20 11:46 PM, Liang Li wrote:
> Free page reporting only supports buddy pages, it can't report the
> free pages reserved for hugetlbfs case. On the other hand, hugetlbfs
> is a good choice for a system with a huge amount of RAM, because it
> can help to reduce the memory management overhead and improve system
> performance.
> This patch add the support for reporting hugepages in the free list
> of hugetlb, it canbe used by virtio_balloon driver for memory
> overcommit and pre zero out free pages for speeding up memory population.

My apologies, as I do not follow the virtio_balloon driver.  My comments
are from the hugetlb perspective.

> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -41,6 +41,7 @@
>  #include <linux/node.h>
>  #include <linux/userfaultfd_k.h>
>  #include <linux/page_owner.h>
> +#include "page_reporting.h"
>  #include "internal.h"
>  
>  int hugetlb_max_hstate __read_mostly;
> @@ -1028,6 +1029,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
>  	list_move(&page->lru, &h->hugepage_freelists[nid]);
>  	h->free_huge_pages++;
>  	h->free_huge_pages_node[nid]++;
> +	if (hugepage_reported(page)) {
> +		__ClearPageReported(page);
> +		pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages);
> +	}
> +	hugepage_reporting_notify_free(h->order);
>  }
>  
>  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> @@ -5531,6 +5537,29 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
>  	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
>  }
>  
> +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid)

Looks like this always returns true.  Should it be type void?

> +{
> +	bool ret = true;
> +
> +	VM_BUG_ON_PAGE(!PageHead(page), page);
> +
> +	list_move(&page->lru, &h->hugepage_activelist);
> +	set_page_refcounted(page);
> +	h->free_huge_pages--;
> +	h->free_huge_pages_node[nid]--;
> +
> +	return ret;
> +}
> +

...

> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 20ec3fb1afc4..15d4b5372df8 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -7,6 +7,7 @@
>  #include <linux/delay.h>
>  #include <linux/scatterlist.h>
>  #include <linux/sched.h>
> +#include <linux/hugetlb.h>
>  
>  #include "page_reporting.h"
>  #include "internal.h"
> @@ -16,6 +17,10 @@ static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>  int page_report_mini_order = pageblock_order;
>  unsigned long page_report_batch_size = 32 * 1024 * 1024;
>  
> +static struct page_reporting_dev_info __rcu *hgpr_dev_info __read_mostly;
> +int hugepage_report_mini_order = pageblock_order;
> +unsigned long hugepage_report_batch_size = 64 * 1024 * 1024;
> +
>  enum {
>  	PAGE_REPORTING_IDLE = 0,
>  	PAGE_REPORTING_REQUESTED,
> @@ -67,6 +72,24 @@ void __page_reporting_notify(void)
>  	rcu_read_unlock();
>  }
>  
> +/* notify prdev of free hugepage reporting request */
> +void __hugepage_reporting_notify(void)
> +{
> +	struct page_reporting_dev_info *prdev;
> +
> +	/*
> +	 * We use RCU to protect the pr_dev_info pointer. In almost all
> +	 * cases this should be present, however in the unlikely case of
> +	 * a shutdown this will be NULL and we should exit.
> +	 */
> +	rcu_read_lock();
> +	prdev = rcu_dereference(hgpr_dev_info);
> +	if (likely(prdev))
> +		__page_reporting_request(prdev);
> +
> +	rcu_read_unlock();
> +}
> +
>  static void
>  page_reporting_drain(struct page_reporting_dev_info *prdev,
>  		     struct scatterlist *sgl, unsigned int nents, bool reported)
> @@ -103,6 +126,213 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
>  	sg_init_table(sgl, nents);
>  }
>  
> +static void
> +hugepage_reporting_drain(struct page_reporting_dev_info *prdev,
> +			 struct hstate *h, struct scatterlist *sgl,
> +			 unsigned int nents, bool reported)
> +{
> +	struct scatterlist *sg = sgl;
> +
> +	/*
> +	 * Drain the now reported pages back into their respective
> +	 * free lists/areas. We assume at least one page is populated.
> +	 */
> +	do {
> +		struct page *page = sg_page(sg);
> +
> +		putback_isolate_huge_page(h, page);
> +
> +		/* If the pages were not reported due to error skip flagging */
> +		if (!reported)
> +			continue;
> +
> +		__SetPageReported(page);
> +	} while ((sg = sg_next(sg)));
> +
> +	/* reinitialize scatterlist now that it is empty */
> +	sg_init_table(sgl, nents);
> +}
> +
> +/*
> + * The page reporting cycle consists of 4 stages, fill, report, drain, and
> + * idle. We will cycle through the first 3 stages until we cannot obtain a
> + * full scatterlist of pages, in that case we will switch to idle.
> + */

As mentioned, I am not familiar with virtio_balloon and the overall design.
So, some of this does not make sense to me.

> +static int
> +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> +			 struct hstate *h, unsigned int nid,
> +			 struct scatterlist *sgl, unsigned int *offset)
> +{
> +	struct list_head *list = &h->hugepage_freelists[nid];
> +	unsigned int page_len = PAGE_SIZE << h->order;
> +	struct page *page, *next;
> +	long budget;
> +	int ret = 0, scan_cnt = 0;
> +
> +	/*
> +	 * Perform early check, if free area is empty there is
> +	 * nothing to process so we can skip this free_list.
> +	 */
> +	if (list_empty(list))
> +		return ret;

Do note that not all entries on the hugetlb free lists are free.  Reserved
entries are also on the free list.  The actual number of free entries is
'h->free_huge_pages - h->resv_huge_pages'.
Is the intention to process reserved pages as well as free pages?

> +
> +	spin_lock_irq(&hugetlb_lock);
> +
> +	if (huge_page_order(h) > MAX_ORDER)
> +		budget = HUGEPAGE_REPORTING_CAPACITY;
> +	else
> +		budget = HUGEPAGE_REPORTING_CAPACITY * 32;
> +
> +	/* loop through free list adding unreported pages to sg list */
> +	list_for_each_entry_safe(page, next, list, lru) {
> +		/* We are going to skip over the reported pages. */
> +		if (PageReported(page)) {
> +			if (++scan_cnt >= MAX_SCAN_NUM) {
> +				ret = scan_cnt;
> +				break;
> +			}
> +			continue;
> +		}
> +
> +		/*
> +		 * If we fully consumed our budget then update our
> +		 * state to indicate that we are requesting additional
> +		 * processing and exit this list.
> +		 */
> +		if (budget < 0) {
> +			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
> +			next = page;
> +			break;
> +		}
> +
> +		/* Attempt to pull page from list and place in scatterlist */
> +		if (*offset) {
> +			isolate_free_huge_page(page, h, nid);

Once a hugetlb page is isolated, it can not be used and applications that
depend on hugetlb pages can start to fail.
I assume that is acceptable/expected behavior.  Correct?
On some systems, hugetlb pages are a precious resource and the sysadmin
carefully configures the number needed by applications.  Removing a hugetlb
page (even for a very short period of time) could cause serious application
failure.

My apologies if that is a stupid question.  I really have no knowledge of
this area.

> +			/* Add page to scatter list */
> +			--(*offset);
> +			sg_set_page(&sgl[*offset], page, page_len, 0);
> +
> +			continue;
> +		}
> +
> +		/*
> +		 * Make the first non-processed page in the free list
> +		 * the new head of the free list before we release the
> +		 * zone lock.
> +		 */
> +		if (&page->lru != list && !list_is_first(&page->lru, list))
> +			list_rotate_to_front(&page->lru, list);
> +
> +		/* release lock before waiting on report processing */
> +		spin_unlock_irq(&hugetlb_lock);
> +
> +		/* begin processing pages in local list */
> +		ret = prdev->report(prdev, sgl, HUGEPAGE_REPORTING_CAPACITY);
> +
> +		/* reset offset since the full list was reported */
> +		*offset = HUGEPAGE_REPORTING_CAPACITY;
> +
> +		/* update budget to reflect call to report function */
> +		budget--;
> +
> +		/* reacquire zone lock and resume processing */
> +		spin_lock_irq(&hugetlb_lock);
> +
> +		/* flush reported pages from the sg list */
> +		hugepage_reporting_drain(prdev, h, sgl,
> +					 HUGEPAGE_REPORTING_CAPACITY, !ret);
> +
> +		/*
> +		 * Reset next to first entry, the old next isn't valid
> +		 * since we dropped the lock to report the pages
> +		 */
> +		next = list_first_entry(list, struct page, lru);
> +
> +		/* exit on error */
> +		if (ret)
> +			break;
> +	}
> +
> +	/* Rotate any leftover pages to the head of the freelist */
> +	if (&next->lru != list && !list_is_first(&next->lru, list))
> +		list_rotate_to_front(&next->lru, list);
> +
> +	spin_unlock_irq(&hugetlb_lock);
> +
> +	return ret;
> +}
Mike Kravetz Dec. 22, 2020, 10:55 p.m. UTC | #3
On 12/22/20 11:59 AM, Alexander Duyck wrote:
> On Mon, Dec 21, 2020 at 11:47 PM Liang Li <liliang.opensource@gmail.com> wrote:
>> +
>> +       if (huge_page_order(h) > MAX_ORDER)
>> +               budget = HUGEPAGE_REPORTING_CAPACITY;
>> +       else
>> +               budget = HUGEPAGE_REPORTING_CAPACITY * 32;
> 
> Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
> don't even really need budget since this should probably be pulling
> out no more than one hugepage at a time.

On standard x86_64 configs, 2MB huge pages are of order 9 < MAX_ORDER (11).
What is important for hugetlb is the largest order that can be allocated
from buddy.  Anything bigger is considered a gigantic page and has to be
allocated differently.

If the code above is trying to distinguish between huge and gigantic pages,
it is off by 1.  The largest order that can be allocated from the buddy is
(MAX_ORDER - 1).  So, the check should be '>='.
Liang Li Dec. 23, 2020, 3:38 a.m. UTC | #4
> > +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> > +                        struct hstate *h, unsigned int nid,
> > +                        struct scatterlist *sgl, unsigned int *offset)
> > +{
> > +       struct list_head *list = &h->hugepage_freelists[nid];
> > +       unsigned int page_len = PAGE_SIZE << h->order;
> > +       struct page *page, *next;
> > +       long budget;
> > +       int ret = 0, scan_cnt = 0;
> > +
> > +       /*
> > +        * Perform early check, if free area is empty there is
> > +        * nothing to process so we can skip this free_list.
> > +        */
> > +       if (list_empty(list))
> > +               return ret;
> > +
> > +       spin_lock_irq(&hugetlb_lock);
> > +
> > +       if (huge_page_order(h) > MAX_ORDER)
> > +               budget = HUGEPAGE_REPORTING_CAPACITY;
> > +       else
> > +               budget = HUGEPAGE_REPORTING_CAPACITY * 32;
>
> Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
> don't even really need budget since this should probably be pulling
> out no more than one hugepage at a time.

I want to distinguish between a 2M page and a 1GB page here. The order of a 1GB page is greater
than MAX_ORDER while a 2M page's order is less than MAX_ORDER.

>
> > +       /* loop through free list adding unreported pages to sg list */
> > +       list_for_each_entry_safe(page, next, list, lru) {
> > +               /* We are going to skip over the reported pages. */
> > +               if (PageReported(page)) {
> > +                       if (++scan_cnt >= MAX_SCAN_NUM) {
> > +                               ret = scan_cnt;
> > +                               break;
> > +                       }
> > +                       continue;
> > +               }
> > +
>
> It would probably have been better to place this set before your new
> set. I don't see your new set necessarily being the best use for page
> reporting.

I haven't really latched on to what you mean, could you explain it again?

>
> > +               /*
> > +                * If we fully consumed our budget then update our
> > +                * state to indicate that we are requesting additional
> > +                * processing and exit this list.
> > +                */
> > +               if (budget < 0) {
> > +                       atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
> > +                       next = page;
> > +                       break;
> > +               }
> > +
>
> If budget is only ever going to be 1 then we probably could just look
> at making this the default case for any time we find a non-reported
> page.

and here again.

> > +               /* Attempt to pull page from list and place in scatterlist */
> > +               if (*offset) {
> > +                       isolate_free_huge_page(page, h, nid);
> > +                       /* Add page to scatter list */
> > +                       --(*offset);
> > +                       sg_set_page(&sgl[*offset], page, page_len, 0);
> > +
> > +                       continue;
> > +               }
> > +
>
> There is no point in the continue case if we only have a budget of 1.
> We should probably just tighten up the loop so that all it does is
> search until it finds the 1 page it can pull, pull it, and then return
> it. The scatterlist doesn't serve much purpose and could be reduced to
> just a single entry.

I will think about it more.

> > +static int
> > +hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev,
> > +                           struct scatterlist *sgl, struct hstate *h)
> > +{
> > +       unsigned int leftover, offset = HUGEPAGE_REPORTING_CAPACITY;
> > +       int ret = 0, nid;
> > +
> > +       for (nid = 0; nid < MAX_NUMNODES; nid++) {
> > +               ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset);
> > +
> > +               if (ret < 0)
> > +                       return ret;
> > +       }
> > +
> > +       /* report the leftover pages before going idle */
> > +       leftover = HUGEPAGE_REPORTING_CAPACITY - offset;
> > +       if (leftover) {
> > +               sgl = &sgl[offset];
> > +               ret = prdev->report(prdev, sgl, leftover);
> > +
> > +               /* flush any remaining pages out from the last report */
> > +               spin_lock_irq(&hugetlb_lock);
> > +               hugepage_reporting_drain(prdev, h, sgl, leftover, !ret);
> > +               spin_unlock_irq(&hugetlb_lock);
> > +       }
> > +
> > +       return ret;
> > +}
> > +
>
> If HUGEPAGE_REPORTING_CAPACITY is 1 it would make more sense to
> rewrite this code to just optimize for a find and process a page
> approach rather than trying to batch pages.

Yes, I will make a change. Thanks for your comments!

Liang
Liang Li Dec. 23, 2020, 3:42 a.m. UTC | #5
> On 12/22/20 11:59 AM, Alexander Duyck wrote:
> > On Mon, Dec 21, 2020 at 11:47 PM Liang Li <liliang.opensource@gmail.com> wrote:
> >> +
> >> +       if (huge_page_order(h) > MAX_ORDER)
> >> +               budget = HUGEPAGE_REPORTING_CAPACITY;
> >> +       else
> >> +               budget = HUGEPAGE_REPORTING_CAPACITY * 32;
> >
> > Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
> > don't even really need budget since this should probably be pulling
> > out no more than one hugepage at a time.
>
> On standard x86_64 configs, 2MB huge pages are of order 9 < MAX_ORDER (11).
> What is important for hugetlb is the largest order that can be allocated
> from buddy.  Anything bigger is considered a gigantic page and has to be
> allocated differently.
>
> If the code above is trying to distinguish between huge and gigantic pages,
> it is off by 1.  The largest order that can be allocated from the buddy is
> (MAX_ORDER - 1).  So, the check should be '>='.
>
> --
> Mike Kravetz

Yes, you're right!  thanks

Liang
Liang Li Dec. 23, 2020, 3:57 a.m. UTC | #6
> On 12/21/20 11:46 PM, Liang Li wrote:
> > Free page reporting only supports buddy pages, it can't report the
> > free pages reserved for hugetlbfs case. On the other hand, hugetlbfs
> > is a good choice for a system with a huge amount of RAM, because it
> > can help to reduce the memory management overhead and improve system
> > performance.
> > This patch add the support for reporting hugepages in the free list
> > of hugetlb, it canbe used by virtio_balloon driver for memory
> > overcommit and pre zero out free pages for speeding up memory population.
>
> My apologies as I do not follow virtio_balloon driver.  Comments from
> the hugetlb perspective.

Any comments are welcome.


> >  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> > @@ -5531,6 +5537,29 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
> >       return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
> >  }
> >
> > +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid)
>
> Looks like this always returns true.  Should it be type void?

will change in the next revision.

> > +{
> > +     bool ret = true;
> > +
> > +     VM_BUG_ON_PAGE(!PageHead(page), page);
> > +
> > +     list_move(&page->lru, &h->hugepage_activelist);
> > +     set_page_refcounted(page);
> > +     h->free_huge_pages--;
> > +     h->free_huge_pages_node[nid]--;
> > +
> > +     return ret;
> > +}
> > +
>
> ...

> > +static void
> > +hugepage_reporting_drain(struct page_reporting_dev_info *prdev,
> > +                      struct hstate *h, struct scatterlist *sgl,
> > +                      unsigned int nents, bool reported)
> > +{
> > +     struct scatterlist *sg = sgl;
> > +
> > +     /*
> > +      * Drain the now reported pages back into their respective
> > +      * free lists/areas. We assume at least one page is populated.
> > +      */
> > +     do {
> > +             struct page *page = sg_page(sg);
> > +
> > +             putback_isolate_huge_page(h, page);
> > +
> > +             /* If the pages were not reported due to error skip flagging */
> > +             if (!reported)
> > +                     continue;
> > +
> > +             __SetPageReported(page);
> > +     } while ((sg = sg_next(sg)));
> > +
> > +     /* reinitialize scatterlist now that it is empty */
> > +     sg_init_table(sgl, nents);
> > +}
> > +
> > +/*
> > + * The page reporting cycle consists of 4 stages, fill, report, drain, and
> > + * idle. We will cycle through the first 3 stages until we cannot obtain a
> > + * full scatterlist of pages, in that case we will switch to idle.
> > + */
>
> As mentioned, I am not familiar with virtio_balloon and the overall design.
> So, some of this does not make sense to me.
>
> > +static int
> > +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> > +                      struct hstate *h, unsigned int nid,
> > +                      struct scatterlist *sgl, unsigned int *offset)
> > +{
> > +     struct list_head *list = &h->hugepage_freelists[nid];
> > +     unsigned int page_len = PAGE_SIZE << h->order;
> > +     struct page *page, *next;
> > +     long budget;
> > +     int ret = 0, scan_cnt = 0;
> > +
> > +     /*
> > +      * Perform early check, if free area is empty there is
> > +      * nothing to process so we can skip this free_list.
> > +      */
> > +     if (list_empty(list))
> > +             return ret;
>
> Do note that not all entries on the hugetlb free lists are free.  Reserved
> entries are also on the free list.  The actual number of free entries is
> 'h->free_huge_pages - h->resv_huge_pages'.
> Is the intention to process reserved pages as well as free pages?

Yes, reserved pages were treated as 'free pages'.

> > +
> > +     spin_lock_irq(&hugetlb_lock);
> > +
> > +     if (huge_page_order(h) > MAX_ORDER)
> > +             budget = HUGEPAGE_REPORTING_CAPACITY;
> > +     else
> > +             budget = HUGEPAGE_REPORTING_CAPACITY * 32;
> > +
> > +     /* loop through free list adding unreported pages to sg list */
> > +     list_for_each_entry_safe(page, next, list, lru) {
> > +             /* We are going to skip over the reported pages. */
> > +             if (PageReported(page)) {
> > +                     if (++scan_cnt >= MAX_SCAN_NUM) {
> > +                             ret = scan_cnt;
> > +                             break;
> > +                     }
> > +                     continue;
> > +             }
> > +
> > +             /*
> > +              * If we fully consumed our budget then update our
> > +              * state to indicate that we are requesting additional
> > +              * processing and exit this list.
> > +              */
> > +             if (budget < 0) {
> > +                     atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
> > +                     next = page;
> > +                     break;
> > +             }
> > +
> > +             /* Attempt to pull page from list and place in scatterlist */
> > +             if (*offset) {
> > +                     isolate_free_huge_page(page, h, nid);
>
> Once a hugetlb page is isolated, it can not be used and applications that
> depend on hugetlb pages can start to fail.
> I assume that is acceptable/expected behavior.  Correct?
> On some systems, hugetlb pages are a precious resource and the sysadmin
> carefully configures the number needed by applications.  Removing a hugetlb
> page (even for a very short period of time) could cause serious application
> failure.

That's true, especially for 1G pages. Any suggestions?
Should the hugepage allocator be made aware of this situation and retry?

> My apologies if that is a stupid question.  I really have no knowledge of
> this area.
>
> Mike Kravetz

Thanks for your comments, Mike

Liang
Alexander Duyck Dec. 23, 2020, 4:39 p.m. UTC | #7
On Tue, Dec 22, 2020 at 7:39 PM Liang Li <liliang324@gmail.com> wrote:
>
> > > +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> > > +                        struct hstate *h, unsigned int nid,
> > > +                        struct scatterlist *sgl, unsigned int *offset)
> > > +{
> > > +       struct list_head *list = &h->hugepage_freelists[nid];
> > > +       unsigned int page_len = PAGE_SIZE << h->order;
> > > +       struct page *page, *next;
> > > +       long budget;
> > > +       int ret = 0, scan_cnt = 0;
> > > +
> > > +       /*
> > > +        * Perform early check, if free area is empty there is
> > > +        * nothing to process so we can skip this free_list.
> > > +        */
> > > +       if (list_empty(list))
> > > +               return ret;
> > > +
> > > +       spin_lock_irq(&hugetlb_lock);
> > > +
> > > +       if (huge_page_order(h) > MAX_ORDER)
> > > +               budget = HUGEPAGE_REPORTING_CAPACITY;
> > > +       else
> > > +               budget = HUGEPAGE_REPORTING_CAPACITY * 32;
> >
> > Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
> > don't even really need budget since this should probably be pulling
> > out no more than one hugepage at a time.
>
> I want to disting a 2M page and 1GB page here. The order of 1GB page is greater
> than MAX_ORDER while 2M page's order is less than MAX_ORDER.

The budget here is broken. When I put the budget in page reporting it
was so that we wouldn't try to report all of the memory in a given
region. It is meant to hold us to no more than one pass through 1/16
of the free memory. So essentially we will be slowly processing all of
memory and it will take 16 calls (32 seconds) for us to process a
system that is sitting completely idle. It is meant to pace us so we
don't spend a ton of time doing work that will be undone, not to
prevent us from burying a CPU which is what seems to be implied here.

Using HUGEPAGE_REPORTING_CAPACITY makes no sense here. I was using it
in the original definition because it was how many pages we could
scoop out at a time and then I was aiming for a 16th of that. Here you
are arbitrarily squaring HUGEPAGE_REPORTING_CAPACITY in terms of the
amount of work you will do since you are using it as a multiple
instead of a divisor.

> >
> > > +       /* loop through free list adding unreported pages to sg list */
> > > +       list_for_each_entry_safe(page, next, list, lru) {
> > > +               /* We are going to skip over the reported pages. */
> > > +               if (PageReported(page)) {
> > > +                       if (++scan_cnt >= MAX_SCAN_NUM) {
> > > +                               ret = scan_cnt;
> > > +                               break;
> > > +                       }
> > > +                       continue;
> > > +               }
> > > +
> >
> > It would probably have been better to place this set before your new
> > set. I don't see your new set necessarily being the best use for page
> > reporting.
>
> I haven't really latched on to what you mean, could you explain it again?

It would be better for you to spend time understanding how this patch
set works before you go about expanding it to do other things.
Mistakes like the budget one above kind of point out the fact that you
don't understand how this code was supposed to work and just kind of
shoehorned your page zeroing code onto it.

It would be better to look at trying to understand this code first
before you extend it to support your zeroing use case. So adding huge
pages first might make more sense than trying to zero and push the
order down. The fact is the page reporting extension should be minimal
for huge pages since they are just passed as a scatterlist so you
should only need to add a small bit to page_reporting.c to extend it
to support this use case.

> >
> > > +               /*
> > > +                * If we fully consumed our budget then update our
> > > +                * state to indicate that we are requesting additional
> > > +                * processing and exit this list.
> > > +                */
> > > +               if (budget < 0) {
> > > +                       atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
> > > +                       next = page;
> > > +                       break;
> > > +               }
> > > +
> >
> > If budget is only ever going to be 1 then we probably could just look
> > at making this the default case for any time we find a non-reported
> > page.
>
> and here again.

It comes down to the fact that the changes you made have a significant
impact on how this is supposed to function. Reducing the scatterlist
to a size of one makes the whole point of doing batching kind of
pointless. Basically the code should be rewritten with the assumption
that if you find a page you report it.

The old code would batch things up because there is significant
overhead to be addressed when going to the hypervisor to report said
memory. Your code doesn't seem to really take anything like that into
account and instead is using an arbitrary budget value based on the
page size.

> > > +               /* Attempt to pull page from list and place in scatterlist */
> > > +               if (*offset) {
> > > +                       isolate_free_huge_page(page, h, nid);
> > > +                       /* Add page to scatter list */
> > > +                       --(*offset);
> > > +                       sg_set_page(&sgl[*offset], page, page_len, 0);
> > > +
> > > +                       continue;
> > > +               }
> > > +
> >
> > There is no point in the continue case if we only have a budget of 1.
> > We should probably just tighten up the loop so that all it does is
> > search until it finds the 1 page it can pull, pull it, and then return
> > it. The scatterlist doesn't serve much purpose and could be reduced to
> > just a single entry.
>
> I will think about it more.
>
> > > +static int
> > > +hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev,
> > > +                           struct scatterlist *sgl, struct hstate *h)
> > > +{
> > > +       unsigned int leftover, offset = HUGEPAGE_REPORTING_CAPACITY;
> > > +       int ret = 0, nid;
> > > +
> > > +       for (nid = 0; nid < MAX_NUMNODES; nid++) {
> > > +               ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset);
> > > +
> > > +               if (ret < 0)
> > > +                       return ret;
> > > +       }
> > > +
> > > +       /* report the leftover pages before going idle */
> > > +       leftover = HUGEPAGE_REPORTING_CAPACITY - offset;
> > > +       if (leftover) {
> > > +               sgl = &sgl[offset];
> > > +               ret = prdev->report(prdev, sgl, leftover);
> > > +
> > > +               /* flush any remaining pages out from the last report */
> > > +               spin_lock_irq(&hugetlb_lock);
> > > +               hugepage_reporting_drain(prdev, h, sgl, leftover, !ret);
> > > +               spin_unlock_irq(&hugetlb_lock);
> > > +       }
> > > +
> > > +       return ret;
> > > +}
> > > +
> >
> > If HUGEPAGE_REPORTING_CAPACITY is 1 it would make more sense to
> > rewrite this code to just optimize for a find and process a page
> > approach rather than trying to batch pages.
>
> Yes, I will make a change. Thanks for your comments!

Lastly I would recommend setting up and testing page reporting with
the virtio-balloon driver. I worry that your patch set would have a
significant negative impact on the performance of it. As I mentioned
before it was designed to be more of a leaky bucket solution to
reporting memory and was supposed to take about 30 seconds for it to
flush all of the memory in a guest. Your changes seem to be trying to
do a much more aggressive task and I worry that what you are going to
find is that it will easily push CPUs to 100% on an active system
since it will be aggressively trying to zero memory as soon as it is
freed rather than taking it at a slower pace.
Mike Kravetz Dec. 23, 2020, 6:47 p.m. UTC | #8
On 12/22/20 7:57 PM, Liang Li wrote:
>> On 12/21/20 11:46 PM, Liang Li wrote:
>>> +static int
>>> +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
>>> +                      struct hstate *h, unsigned int nid,
>>> +                      struct scatterlist *sgl, unsigned int *offset)
>>> +{
>>> +     struct list_head *list = &h->hugepage_freelists[nid];
>>> +     unsigned int page_len = PAGE_SIZE << h->order;
>>> +     struct page *page, *next;
>>> +     long budget;
>>> +     int ret = 0, scan_cnt = 0;
>>> +
>>> +     /*
>>> +      * Perform early check, if free area is empty there is
>>> +      * nothing to process so we can skip this free_list.
>>> +      */
>>> +     if (list_empty(list))
>>> +             return ret;
>>
>> Do note that not all entries on the hugetlb free lists are free.  Reserved
>> entries are also on the free list.  The actual number of free entries is
>> 'h->free_huge_pages - h->resv_huge_pages'.
>> Is the intention to process reserved pages as well as free pages?
> 
> Yes, Reserved pages was treated as 'free pages'

If that is true, then this code breaks hugetlb.  hugetlb code assumes that
h->free_huge_pages is ALWAYS >= h->resv_huge_pages.  This code would break
that assumption.  If you really want to add support for hugetlb pages, then
you will need to take reserved pages into account.

P.S. There might be some confusion about 'reservations' based on the
commit message.  My comments are directed at hugetlb reservations described
in Documentation/vm/hugetlbfs_reserv.rst.

>>> +
>>> +     spin_lock_irq(&hugetlb_lock);
>>> +
>>> +     if (huge_page_order(h) > MAX_ORDER)
>>> +             budget = HUGEPAGE_REPORTING_CAPACITY;
>>> +     else
>>> +             budget = HUGEPAGE_REPORTING_CAPACITY * 32;
>>> +
>>> +     /* loop through free list adding unreported pages to sg list */
>>> +     list_for_each_entry_safe(page, next, list, lru) {
>>> +             /* We are going to skip over the reported pages. */
>>> +             if (PageReported(page)) {
>>> +                     if (++scan_cnt >= MAX_SCAN_NUM) {
>>> +                             ret = scan_cnt;
>>> +                             break;
>>> +                     }
>>> +                     continue;
>>> +             }
>>> +
>>> +             /*
>>> +              * If we fully consumed our budget then update our
>>> +              * state to indicate that we are requesting additional
>>> +              * processing and exit this list.
>>> +              */
>>> +             if (budget < 0) {
>>> +                     atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
>>> +                     next = page;
>>> +                     break;
>>> +             }
>>> +
>>> +             /* Attempt to pull page from list and place in scatterlist */
>>> +             if (*offset) {
>>> +                     isolate_free_huge_page(page, h, nid);
>>
>> Once a hugetlb page is isolated, it can not be used and applications that
>> depend on hugetlb pages can start to fail.
>> I assume that is acceptable/expected behavior.  Correct?
>> On some systems, hugetlb pages are a precious resource and the sysadmin
>> carefully configures the number needed by applications.  Removing a hugetlb
>> page (even for a very short period of time) could cause serious application
>> failure.
> 
> That' true, especially for 1G pages. Any suggestions?
> Let the hugepage allocator be aware of this situation and retry ?

I would hate to add that complexity to the allocator.

This question is likely based on my lack of understanding of virtio-balloon
usage and this reporting mechanism.  But, why do the hugetlb pages have to
be 'temporarily' allocated for reporting purposes?
Liang Li Dec. 24, 2020, 12:45 a.m. UTC | #9
> > > > +       spin_lock_irq(&hugetlb_lock);
> > > > +
> > > > +       if (huge_page_order(h) > MAX_ORDER)
> > > > +               budget = HUGEPAGE_REPORTING_CAPACITY;
> > > > +       else
> > > > +               budget = HUGEPAGE_REPORTING_CAPACITY * 32;
> > >
> > > Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
> > > don't even really need budget since this should probably be pulling
> > > out no more than one hugepage at a time.
> >
> > I want to disting a 2M page and 1GB page here. The order of 1GB page is greater
> > than MAX_ORDER while 2M page's order is less than MAX_ORDER.
>
> The budget here is broken. When I put the budget in page reporting it
> was so that we wouldn't try to report all of the memory in a given
> region. It is meant to hold us to no more than one pass through 1/16
> of the free memory. So essentially we will be slowly processing all of
> memory and it will take 16 calls (32 seconds) for us to process a
> system that is sitting completely idle. It is meant to pace us so we
> don't spend a ton of time doing work that will be undone, not to
> prevent us from burying a CPU which is what seems to be implied here.
>
> Using HUGEPAGE_REPORTING_CAPACITY makes no sense here. I was using it
> in the original definition because it was how many pages we could
> scoop out at a time and then I was aiming for a 16th of that. Here you
> are arbitrarily squaring HUGEPAGE_REPORTING_CAPACITY in terms of the
> amount of work you will doo since you are using it as a multiple
> instead of a divisor.
>
> > >
> > > > +       /* loop through free list adding unreported pages to sg list */
> > > > +       list_for_each_entry_safe(page, next, list, lru) {
> > > > +               /* We are going to skip over the reported pages. */
> > > > +               if (PageReported(page)) {
> > > > +                       if (++scan_cnt >= MAX_SCAN_NUM) {
> > > > +                               ret = scan_cnt;
> > > > +                               break;
> > > > +                       }
> > > > +                       continue;
> > > > +               }
> > > > +
> > >
> > > It would probably have been better to place this set before your new
> > > set. I don't see your new set necessarily being the best use for page
> > > reporting.
> >
> > I haven't really latched on to what you mean, could you explain it again?
>
> It would be better for you to spend time understanding how this patch
> set works before you go about expanding it to do other things.
> Mistakes like the budget one above kind of point out the fact that you
> don't understand how this code was supposed to work and just kind of
> shoehorned you page zeroing code onto it.
>
> It would be better to look at trying to understand this code first
> before you extend it to support your zeroing use case. So adding huge
> pages first might make more sense than trying to zero and push the
> order down. The fact is the page reporting extension should be minimal
> for huge pages since they are just passed as a scatterlist so you
> should only need to add a small bit to page_reporting.c to extend it
> to support this use case.
>
> > >
> > > > +               /*
> > > > +                * If we fully consumed our budget then update our
> > > > +                * state to indicate that we are requesting additional
> > > > +                * processing and exit this list.
> > > > +                */
> > > > +               if (budget < 0) {
> > > > +                       atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
> > > > +                       next = page;
> > > > +                       break;
> > > > +               }
> > > > +
> > >
> > > If budget is only ever going to be 1 then we probably could just look
> > > at making this the default case for any time we find a non-reported
> > > page.
> >
> > and here again.
>
> It comes down to the fact that the changes you made have a significant
> impact on how this is supposed to function. Reducing the scatterlist
> to a size of one makes the whole point of doing batching kind of
> pointless. Basically the code should be rewritten with the assumption
> that if you find a page you report it.
>
> The old code would batch things up because there is significant
> overhead to be addressed when going to the hypervisor to report said
> memory. Your code doesn't seem to really take anything like that into
> account and instead is using an arbitrary budget value based on the
> page size.
>
> > > > +               /* Attempt to pull page from list and place in scatterlist */
> > > > +               if (*offset) {
> > > > +                       isolate_free_huge_page(page, h, nid);
> > > > +                       /* Add page to scatter list */
> > > > +                       --(*offset);
> > > > +                       sg_set_page(&sgl[*offset], page, page_len, 0);
> > > > +
> > > > +                       continue;
> > > > +               }
> > > > +
> > >
> > > There is no point in the continue case if we only have a budget of 1.
> > > We should probably just tighten up the loop so that all it does is
> > > search until it finds the 1 page it can pull, pull it, and then return
> > > it. The scatterlist doesn't serve much purpose and could be reduced to
> > > just a single entry.
> >
> > I will think about it more.
> >
> > > > +static int
> > > > +hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev,
> > > > +                           struct scatterlist *sgl, struct hstate *h)
> > > > +{
> > > > +       unsigned int leftover, offset = HUGEPAGE_REPORTING_CAPACITY;
> > > > +       int ret = 0, nid;
> > > > +
> > > > +       for (nid = 0; nid < MAX_NUMNODES; nid++) {
> > > > +               ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset);
> > > > +
> > > > +               if (ret < 0)
> > > > +                       return ret;
> > > > +       }
> > > > +
> > > > +       /* report the leftover pages before going idle */
> > > > +       leftover = HUGEPAGE_REPORTING_CAPACITY - offset;
> > > > +       if (leftover) {
> > > > +               sgl = &sgl[offset];
> > > > +               ret = prdev->report(prdev, sgl, leftover);
> > > > +
> > > > +               /* flush any remaining pages out from the last report */
> > > > +               spin_lock_irq(&hugetlb_lock);
> > > > +               hugepage_reporting_drain(prdev, h, sgl, leftover, !ret);
> > > > +               spin_unlock_irq(&hugetlb_lock);
> > > > +       }
> > > > +
> > > > +       return ret;
> > > > +}
> > > > +
> > >
> > > If HUGEPAGE_REPORTING_CAPACITY is 1 it would make more sense to
> > > rewrite this code to just optimize for a find and process a page
> > > approach rather than trying to batch pages.
> >
> > Yes, I will make a change. Thanks for your comments!
>
> Lastly I would recommend setting up and testing page reporting with
> the virtio-balloon driver. I worry that your patch set would have a
> significant negative impact on the performance of it. As I mentioned
> before it was designed to be more of a leaky bucket solution to
> reporting memory and was supposed to take about 30 seconds for it to
> flush all of the memory in a guest. Your changes seem to be trying to
> do a much more aggressive task and I worry that what you are going to
> find is that it will easily push CPUs to 100% on an active system
> since it will be aggressively trying to zero memory as soon as it is
> freed rather than taking it at a slower pace.

Thanks for your explanation, I understand what you mean now. In this RFC
I just tried to make it work; there is a lot of room for code refinement. I will
take your advice and pay more attention to the points you mentioned.

Liang
Liang Li Dec. 24, 2020, 1:31 a.m. UTC | #10
> >>> +static int
> >>> +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> >>> +                      struct hstate *h, unsigned int nid,
> >>> +                      struct scatterlist *sgl, unsigned int *offset)
> >>> +{
> >>> +     struct list_head *list = &h->hugepage_freelists[nid];
> >>> +     unsigned int page_len = PAGE_SIZE << h->order;
> >>> +     struct page *page, *next;
> >>> +     long budget;
> >>> +     int ret = 0, scan_cnt = 0;
> >>> +
> >>> +     /*
> >>> +      * Perform early check, if free area is empty there is
> >>> +      * nothing to process so we can skip this free_list.
> >>> +      */
> >>> +     if (list_empty(list))
> >>> +             return ret;
> >>
> >> Do note that not all entries on the hugetlb free lists are free.  Reserved
> >> entries are also on the free list.  The actual number of free entries is
> >> 'h->free_huge_pages - h->resv_huge_pages'.
> >> Is the intention to process reserved pages as well as free pages?
> >
> > Yes, Reserved pages was treated as 'free pages'
>
> If that is true, then this code breaks hugetlb.  hugetlb code assumes that
> h->free_huge_pages is ALWAYS >= h->resv_huge_pages.  This code would break
> that assumption.  If you really want to add support for hugetlb pages, then
> you will need to take reserved pages into account.

I didn't know that. thanks!

> P.S. There might be some confusion about 'reservations' based on the
> commit message.  My comments are directed at hugetlb reservations described
> in Documentation/vm/hugetlbfs_reserv.rst.
>
> >>> +             /* Attempt to pull page from list and place in scatterlist */
> >>> +             if (*offset) {
> >>> +                     isolate_free_huge_page(page, h, nid);
> >>
> >> Once a hugetlb page is isolated, it can not be used and applications that
> >> depend on hugetlb pages can start to fail.
> >> I assume that is acceptable/expected behavior.  Correct?
> >> On some systems, hugetlb pages are a precious resource and the sysadmin
> >> carefully configures the number needed by applications.  Removing a hugetlb
> >> page (even for a very short period of time) could cause serious application
> >> failure.
> >
> > That' true, especially for 1G pages. Any suggestions?
> > Let the hugepage allocator be aware of this situation and retry ?
>
> I would hate to add that complexity to the allocator.
>
> This question is likely based on my lack of understanding of virtio-balloon
> usage and this reporting mechanism.  But, why do the hugetlb pages have to
> be 'temporarily' allocated for reporting purposes?

The link here will give you more detail about how page reporting
works: https://www.kernel.org/doc/html/latest//vm/free_page_reporting.html
The virtio-balloon driver is based on this framework and will report the
free page information to QEMU & KVM; the host can unmap the memory
region corresponding to the reported free pages and reclaim the memory
for other use, which is useful for memory overcommit.
Allocating the pages 'temporarily' before reporting is necessary; it makes
sure the guest will not use the pages while the host side unmaps the region,
otherwise it would break the guest.

Now I realize we should solve this issue first; it seems adding a lock
will help.

Thanks
diff mbox series

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ebca2ef02212..a72ad25501d3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -11,6 +11,7 @@ 
 #include <linux/kref.h>
 #include <linux/pgtable.h>
 #include <linux/gfp.h>
+#include <linux/page_reporting.h>
 
 struct ctl_table;
 struct user_struct;
@@ -114,6 +115,8 @@  int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 
+bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid);
+void putback_isolate_huge_page(struct hstate *h, struct page *page);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 struct page **, struct vm_area_struct **,
diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
index 63e1e9fbcaa2..0da3d1a6f0cc 100644
--- a/include/linux/page_reporting.h
+++ b/include/linux/page_reporting.h
@@ -7,6 +7,7 @@ 
 
 /* This value should always be a power of 2, see page_reporting_cycle() */
 #define PAGE_REPORTING_CAPACITY		32
+#define HUGEPAGE_REPORTING_CAPACITY	1
 
 struct page_reporting_dev_info {
 	/* function that alters pages to make them "reported" */
@@ -26,4 +27,8 @@  struct page_reporting_dev_info {
 /* Tear-down and bring-up for page reporting devices */
 void page_reporting_unregister(struct page_reporting_dev_info *prdev);
 int page_reporting_register(struct page_reporting_dev_info *prdev);
+
+/* Tear-down and bring-up for hugepage reporting devices */
+void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev);
+int hugepage_reporting_register(struct page_reporting_dev_info *prdev);
 #endif /*_LINUX_PAGE_REPORTING_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cbf32d2824fd..de6ce147dfe2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -41,6 +41,7 @@ 
 #include <linux/node.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
+#include "page_reporting.h"
 #include "internal.h"
 
 int hugetlb_max_hstate __read_mostly;
@@ -1028,6 +1029,11 @@  static void enqueue_huge_page(struct hstate *h, struct page *page)
 	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
+	if (hugepage_reported(page)) {
+		__ClearPageReported(page);
+		pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages);
+	}
+	hugepage_reporting_notify_free(h->order);
 }
 
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -5531,6 +5537,29 @@  follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
 	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
+/*
+ * Take a free huge page away from the allocator so that it can be reported.
+ *
+ * Moves @page onto @h's active list, gives it a reference and decrements
+ * the free counters of @h for node @nid. The callers in mm/page_reporting.c
+ * invoke this with hugetlb_lock held. Cannot fail; always returns true.
+ */
+bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid)
+{
+	VM_BUG_ON_PAGE(!PageHead(page), page);
+
+	list_move(&page->lru, &h->hugepage_activelist);
+	set_page_refcounted(page);
+	h->free_huge_pages--;
+	h->free_huge_pages_node[nid]--;
+
+	return true;
+}
+
+/*
+ * Undo isolate_free_huge_page(): drop the isolation reference, move @page
+ * back onto the free list of its node and restore the free counters of @h.
+ * The callers in mm/page_reporting.c invoke this with hugetlb_lock held.
+ */
+void putback_isolate_huge_page(struct hstate *h, struct page *page)
+{
+	int nid = page_to_nid(page);
+
+	pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages);
+
+	/*
+	 * Pages on the free list must not hold a reference: the next
+	 * set_page_refcounted() on this page expects a zero count. Drop the
+	 * reference taken at isolation time before requeueing the page.
+	 */
+	put_page_testzero(page);
+	VM_BUG_ON_PAGE(page_count(page), page);
+
+	list_move(&page->lru, &h->hugepage_freelists[nid]);
+	h->free_huge_pages++;
+	h->free_huge_pages_node[nid]++;
+}
+
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
 	bool ret = true;
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 20ec3fb1afc4..15d4b5372df8 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -7,6 +7,7 @@ 
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/sched.h>
+#include <linux/hugetlb.h>
 
 #include "page_reporting.h"
 #include "internal.h"
@@ -16,6 +17,10 @@  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
 int page_report_mini_order = pageblock_order;
 unsigned long page_report_batch_size = 32 * 1024 * 1024;
 
+static struct page_reporting_dev_info __rcu *hgpr_dev_info __read_mostly;
+int hugepage_report_mini_order = pageblock_order;
+unsigned long hugepage_report_batch_size = 64 * 1024 * 1024;
+
 enum {
 	PAGE_REPORTING_IDLE = 0,
 	PAGE_REPORTING_REQUESTED,
@@ -67,6 +72,24 @@  void __page_reporting_notify(void)
 	rcu_read_unlock();
 }
 
+/* Pass a free hugepage reporting request to the registered device, if any */
+void __hugepage_reporting_notify(void)
+{
+	struct page_reporting_dev_info *prdev;
+
+	/*
+	 * We use RCU to protect the hgpr_dev_info pointer. In almost all
+	 * cases this should be present, however once the device has been
+	 * unregistered this will be NULL and we should exit.
+	 */
+	rcu_read_lock();
+	prdev = rcu_dereference(hgpr_dev_info);
+	if (likely(prdev))
+		__page_reporting_request(prdev);
+
+	rcu_read_unlock();
+}
+
 static void
 page_reporting_drain(struct page_reporting_dev_info *prdev,
 		     struct scatterlist *sgl, unsigned int nents, bool reported)
@@ -103,6 +126,213 @@  page_reporting_drain(struct page_reporting_dev_info *prdev,
 	sg_init_table(sgl, nents);
 }
 
+static void
+hugepage_reporting_drain(struct page_reporting_dev_info *prdev,
+			 struct hstate *h, struct scatterlist *sgl,
+			 unsigned int nents, bool reported)
+{
+	struct scatterlist *sg = sgl;
+
+	/*
+	 * Put every page held in @sgl back on the free list of @h; both
+	 * callers hold hugetlb_lock. We assume at least one page is populated.
+	 */
+	do {
+		struct page *page = sg_page(sg);
+
+		putback_isolate_huge_page(h, page);
+
+		/* On a failed report leave the page unflagged for a later pass */
+		if (!reported)
+			continue;
+
+		__SetPageReported(page);
+	} while ((sg = sg_next(sg)));
+
+	/* reinitialize scatterlist now that it is empty */
+	sg_init_table(sgl, nents);
+}
+
+/*
+ * Walk the free list of hstate @h on node @nid under hugetlb_lock: fill @sgl
+ * with unreported pages, report, drain, and repeat until the list, the
+ * report budget or the MAX_SCAN_NUM skip limit runs out, or report() fails.
+ */
+static int
+hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
+			 struct hstate *h, unsigned int nid,
+			 struct scatterlist *sgl, unsigned int *offset)
+{
+	struct list_head *list = &h->hugepage_freelists[nid];
+	unsigned int page_len = PAGE_SIZE << h->order;
+	struct page *page, *next;
+	long budget;
+	int ret = 0, scan_cnt = 0;
+
+	/*
+	 * Perform an unlocked early check, if the free list is empty there
+	 * is nothing to process so we can skip this free_list.
+	 */
+	if (list_empty(list))
+		return ret;
+
+	spin_lock_irq(&hugetlb_lock);
+
+	if (huge_page_order(h) > MAX_ORDER)
+		budget = HUGEPAGE_REPORTING_CAPACITY;
+	else
+		budget = HUGEPAGE_REPORTING_CAPACITY * 32;
+
+	/* loop through free list adding unreported pages to sg list */
+	list_for_each_entry_safe(page, next, list, lru) {
+		/* Skip reported pages; give up after MAX_SCAN_NUM of them. */
+		if (PageReported(page)) {
+			if (++scan_cnt >= MAX_SCAN_NUM) {
+				ret = scan_cnt;
+				break;
+			}
+			continue;
+		}
+
+		/*
+		 * If we fully consumed our budget then update our
+		 * state to indicate that we are requesting additional
+		 * processing and exit this list.
+		 */
+		if (budget < 0) {
+			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
+			next = page;
+			break;
+		}
+
+		/* Attempt to pull page from list and place in scatterlist */
+		if (*offset) {
+			isolate_free_huge_page(page, h, nid);
+			/* Add page to scatter list */
+			--(*offset);
+			sg_set_page(&sgl[*offset], page, page_len, 0);
+
+			continue;
+		}
+
+		/*
+		 * Make the first non-processed page in the free list
+		 * the new head of the free list before we release
+		 * hugetlb_lock.
+		 */
+		if (&page->lru != list && !list_is_first(&page->lru, list))
+			list_rotate_to_front(&page->lru, list);
+
+		/* release lock before waiting on report processing */
+		spin_unlock_irq(&hugetlb_lock);
+
+		/* hand the filled scatterlist to the device */
+		ret = prdev->report(prdev, sgl, HUGEPAGE_REPORTING_CAPACITY);
+
+		/* reset offset since the full list was reported */
+		*offset = HUGEPAGE_REPORTING_CAPACITY;
+
+		/* update budget to reflect call to report function */
+		budget--;
+
+		/* reacquire hugetlb_lock and resume processing */
+		spin_lock_irq(&hugetlb_lock);
+
+		/* flush reported pages from the sg list */
+		hugepage_reporting_drain(prdev, h, sgl,
+					 HUGEPAGE_REPORTING_CAPACITY, !ret);
+
+		/*
+		 * Reset next to first entry, the old next isn't valid
+		 * since we dropped the lock to report the pages
+		 */
+		next = list_first_entry(list, struct page, lru);
+
+		/* exit on error */
+		if (ret)
+			break;
+	}
+
+	/* Rotate any leftover pages to the head of the freelist */
+	if (&next->lru != list && !list_is_first(&next->lru, list))
+		list_rotate_to_front(&next->lru, list);
+
+	spin_unlock_irq(&hugetlb_lock);
+
+	return ret;
+}
+
+static int
+hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev,
+			    struct scatterlist *sgl, struct hstate *h)
+{
+	unsigned int leftover, offset = HUGEPAGE_REPORTING_CAPACITY;
+	int ret = 0, nid;
+
+	for (nid = 0; nid < MAX_NUMNODES; nid++) {
+		ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset);
+
+		if (ret < 0)
+			return ret;
+	}
+
+	/* report the partially filled tail of the sg list before going idle */
+	leftover = HUGEPAGE_REPORTING_CAPACITY - offset;
+	if (leftover) {
+		sgl = &sgl[offset];
+		ret = prdev->report(prdev, sgl, leftover);
+
+		/* put the remaining pages back, under hugetlb_lock */
+		spin_lock_irq(&hugetlb_lock);
+		hugepage_reporting_drain(prdev, h, sgl, leftover, !ret);
+		spin_unlock_irq(&hugetlb_lock);
+	}
+
+	return ret;
+}
+
+static void hugepage_reporting_process(struct work_struct *work)
+{
+	struct delayed_work *d_work = to_delayed_work(work);
+	struct page_reporting_dev_info *prdev = container_of(d_work,
+					struct page_reporting_dev_info, work);
+	int err = 0, state = PAGE_REPORTING_ACTIVE;
+	struct scatterlist *sgl;
+	struct hstate *h;
+
+	/*
+	 * Change the state to "Active" so that we can track whether
+	 * anyone requests page reporting before we complete our pass. If
+	 * the state is not altered by the end of the pass we will switch
+	 * to idle and quit scheduling reporting runs.
+	 */
+	atomic_set(&prdev->state, state);
+
+	/* allocate scatterlist to store pages being reported on */
+	sgl = kmalloc_array(HUGEPAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
+	if (!sgl)
+		goto err_out;
+
+	sg_init_table(sgl, HUGEPAGE_REPORTING_CAPACITY);
+
+	for_each_hstate(h) {
+		err = hugepage_reporting_process_hstate(prdev, sgl, h);
+		if (err)
+			break;
+	}
+
+	kfree(sgl);
+err_out:
+	/*
+	 * If the state has reverted back to requested then there may be
+	 * additional pages to be processed. We will defer for
+	 * prdev->delay_jiffies to allow more pages to accumulate.
+	 */
+	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
+	if (state == PAGE_REPORTING_REQUESTED)
+		schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
+}
+
 /*
  * The page reporting cycle consists of 4 stages, fill, report, drain, and
  * idle. We will cycle through the first 3 stages until we cannot obtain a
@@ -341,6 +571,9 @@  static void page_reporting_process(struct work_struct *work)
 static DEFINE_MUTEX(page_reporting_mutex);
 DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
 
+static DEFINE_MUTEX(hugepage_reporting_mutex);
+DEFINE_STATIC_KEY_FALSE(hugepage_reporting_enabled);
+
 int page_reporting_register(struct page_reporting_dev_info *prdev)
 {
 	int err = 0;
@@ -395,3 +628,57 @@  void page_reporting_unregister(struct page_reporting_dev_info *prdev)
 	mutex_unlock(&page_reporting_mutex);
 }
 EXPORT_SYMBOL_GPL(page_reporting_unregister);
+
+int hugepage_reporting_register(struct page_reporting_dev_info *prdev)
+{
+	int err = 0;
+
+	mutex_lock(&hugepage_reporting_mutex);
+
+	/* only one device may be registered at a time */
+	if (rcu_access_pointer(hgpr_dev_info)) {
+		err = -EBUSY;
+		goto err_out;
+	}
+
+	/* initialize state and work structures */
+	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
+	INIT_DELAYED_WORK(&prdev->work, &hugepage_reporting_process);
+
+	/* Begin initial flush of free hugepages */
+	__page_reporting_request(prdev);
+
+	/* Assign device to allow notifications */
+	rcu_assign_pointer(hgpr_dev_info, prdev);
+
+	hugepage_report_mini_order = prdev->mini_order;
+	hugepage_report_batch_size = prdev->batch_size;
+
+	/* enable hugepage reporting notification */
+	if (!static_key_enabled(&hugepage_reporting_enabled)) {
+		static_branch_enable(&hugepage_reporting_enabled);
+		pr_info("Free hugepage reporting enabled\n");
+	}
+err_out:
+	mutex_unlock(&hugepage_reporting_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(hugepage_reporting_register);
+
+void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev)
+{
+	mutex_lock(&hugepage_reporting_mutex);
+
+	if (rcu_access_pointer(hgpr_dev_info) == prdev) {
+		/* Disable hugepage reporting notification */
+		RCU_INIT_POINTER(hgpr_dev_info, NULL);
+		synchronize_rcu();
+
+		/* Flush any existing work, and lock it out */
+		cancel_delayed_work_sync(&prdev->work);
+	}
+
+	mutex_unlock(&hugepage_reporting_mutex);
+}
+EXPORT_SYMBOL_GPL(hugepage_reporting_unregister);
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index 86ac6ffad970..271c64c3c3cb 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -18,12 +18,24 @@  extern unsigned long page_report_batch_size;
 DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
 void __page_reporting_notify(void);
 
+extern int hugepage_report_mini_order;
+extern unsigned long hugepage_report_batch_size;
+
+DECLARE_STATIC_KEY_FALSE(hugepage_reporting_enabled);
+void __hugepage_reporting_notify(void);
+
 static inline bool page_reported(struct page *page)
 {
 	return static_branch_unlikely(&page_reporting_enabled) &&
 	       PageReported(page);
 }
 
+/* True only if hugepage reporting is enabled and @page has the Reported flag */
+static inline bool hugepage_reported(struct page *page)
+{
+	if (!static_branch_unlikely(&hugepage_reporting_enabled))
+		return false;
+
+	return PageReported(page);
+}
+
 /**
  * page_reporting_notify_free - Free page notification to start page processing
  *
@@ -52,11 +64,33 @@  static inline void page_reporting_notify_free(unsigned int order)
 		__page_reporting_notify();
 	}
 }
+
+/**
+ * hugepage_reporting_notify_free - Free hugepage notification to start reporting
+ * @order: order of the hugepage that was just placed on a free list
+ *
+ * Accumulates the size in bytes of freed hugepages whose order is at least
+ * hugepage_report_mini_order. Once hugepage_report_batch_size bytes have been
+ * collected the counter is reset and the reporting device is notified.
+ */
+static inline void hugepage_reporting_notify_free(unsigned int order)
+{
+	/* bytes freed since the last notification; same type as the threshold */
+	static unsigned long batch_size;
+
+	if (!static_branch_unlikely(&hugepage_reporting_enabled))
+		return;
+
+	/* Determine if we have crossed reporting threshold */
+	if (order < hugepage_report_mini_order)
+		return;
+
+	/*
+	 * Shift an unsigned long: (1 << order) << PAGE_SHIFT is evaluated in
+	 * int and overflows once order + PAGE_SHIFT reaches 31.
+	 */
+	batch_size += 1UL << (order + PAGE_SHIFT);
+	if (batch_size >= hugepage_report_batch_size) {
+		batch_size = 0;
+		__hugepage_reporting_notify();
+	}
+}
 #else /* CONFIG_PAGE_REPORTING */
 #define page_reported(_page)	false
 
 static inline void page_reporting_notify_free(unsigned int order)
 {
 }
+
+/* enqueue_huge_page() calls this unconditionally; stub it like page_reported() */
+#define hugepage_reported(_page)	false
+
+/* No-op when page reporting is not built in */
+static inline void hugepage_reporting_notify_free(unsigned int order)
+{
+}
 #endif /* CONFIG_PAGE_REPORTING */
 #endif /*_MM_PAGE_REPORTING_H */