diff mbox series

[2/6] mm: let user decide page reporting option

Message ID 20210106034806.GA1146@open-light-1.localdomain (mailing list archive)
State New, archived
Headers show
Series hugetlbfs: support free page reporting | expand

Commit Message

Liang Li Jan. 6, 2021, 3:48 a.m. UTC
Some key parameters for page reporting are currently hard-coded, but
different users of the framework may have their own requirements. Make
these parameters configurable and let the user decide them.

Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Liang Li <liliang324@gmail.com>
Signed-off-by: Liang Li <liliangleo@didiglobal.com>
---
 drivers/virtio/virtio_balloon.c |  3 +++
 include/linux/page_reporting.h  |  3 +++
 mm/page_reporting.c             | 13 +++++++++----
 mm/page_reporting.h             |  6 +++---
 4 files changed, 18 insertions(+), 7 deletions(-)

Comments

Alexander Duyck Jan. 6, 2021, 6:42 p.m. UTC | #1
On Tue, Jan 5, 2021 at 7:48 PM Liang Li <liliang324@gmail.com> wrote:
>
> Some key parameters for page reporting are now hard coded, different
> users of the framework may have their special requirements, make
> these parameter configrable and let the user decide them.
>
> Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> Cc: Mel Gorman <mgorman@techsingularity.net>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Liang Li <liliang324@gmail.com>
> Signed-off-by: Liang Li <liliangleo@didiglobal.com>
> ---
>  drivers/virtio/virtio_balloon.c |  3 +++
>  include/linux/page_reporting.h  |  3 +++
>  mm/page_reporting.c             | 13 +++++++++----
>  mm/page_reporting.h             |  6 +++---
>  4 files changed, 18 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 8985fc2cea86..684bcc39ef5a 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -993,6 +993,9 @@ static int virtballoon_probe(struct virtio_device *vdev)
>                         goto out_unregister_oom;
>                 }
>
> +               vb->pr_dev_info.mini_order = pageblock_order;
> +               vb->pr_dev_info.batch_size = 16 * 1024 * 1024; /* 16M */
> +               vb->pr_dev_info.delay_jiffies = 2 * HZ; /* 2 seconds */
>                 err = page_reporting_register(&vb->pr_dev_info);
>                 if (err)
>                         goto out_unregister_oom;
> diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
> index 3b99e0ec24f2..63e1e9fbcaa2 100644
> --- a/include/linux/page_reporting.h
> +++ b/include/linux/page_reporting.h
> @@ -13,6 +13,9 @@ struct page_reporting_dev_info {
>         int (*report)(struct page_reporting_dev_info *prdev,
>                       struct scatterlist *sg, unsigned int nents);
>
> +       unsigned long batch_size;
> +       unsigned long delay_jiffies;
> +       int mini_order;
>         /* work struct for processing reports */
>         struct delayed_work work;
>
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 694df981ddd2..39bc6a9d7b73 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -13,6 +13,7 @@
>  #define PAGE_REPORTING_DELAY   (2 * HZ)
>  static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>  unsigned long page_report_batch_size  __read_mostly = 16 * 1024 * 1024UL;
> +int page_report_mini_order = pageblock_order;
>
>  enum {
>         PAGE_REPORTING_IDLE = 0,
> @@ -44,7 +45,7 @@ __page_reporting_request(struct page_reporting_dev_info *prdev)
>          * now we are limiting this to running no more than once every
>          * couple of seconds.
>          */
> -       schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
> +       schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
>  }
>

So this ends up being the reason why you needed to add the batch size
value. However I don't really see it working as expected since you
could essentially have 1 page freed 4M times that could trigger your
page zeroing logic. So for example if a NIC is processing frames and
ends up freeing and then reallocating some small batch of pages this
code would be running often even though there aren't really all that
many pages that needed zeroing.

>  /* notify prdev of free page reporting request */
> @@ -230,7 +231,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
>
>         /* Generate minimum watermark to be able to guarantee progress */
>         watermark = low_wmark_pages(zone) +
> -                   (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
> +                   (PAGE_REPORTING_CAPACITY << prdev->mini_order);
>
>         /*
>          * Cancel request if insufficient free memory or if we failed

With the page order being able to be greatly reduced this could have a
significant impact on whether this code really has any value. Previously we
were able to guarantee a pretty significant number of higher order
pages free. With this we might only be guaranteeing something like 32
4K pages which is pretty small compared to what can end up being
pulled out at the higher end.

> @@ -240,7 +241,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
>                 return err;
>
>         /* Process each free list starting from lowest order/mt */
> -       for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
> +       for (order = prdev->mini_order; order < MAX_ORDER; order++) {
>                 for (mt = 0; mt < MIGRATE_TYPES; mt++) {
>                         /* We do not pull pages from the isolate free list */
>                         if (is_migrate_isolate(mt))
> @@ -307,7 +308,7 @@ static void page_reporting_process(struct work_struct *work)
>          */
>         state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
>         if (state == PAGE_REPORTING_REQUESTED)
> -               schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
> +               schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
>  }
>
>  static DEFINE_MUTEX(page_reporting_mutex);
> @@ -335,6 +336,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
>         /* Assign device to allow notifications */
>         rcu_assign_pointer(pr_dev_info, prdev);
>
> +       page_report_mini_order = prdev->mini_order;
> +       page_report_batch_size = prdev->batch_size;
>         /* enable page reporting notification */
>         if (!static_key_enabled(&page_reporting_enabled)) {
>                 static_branch_enable(&page_reporting_enabled);
> @@ -352,6 +355,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
>         mutex_lock(&page_reporting_mutex);
>
>         if (rcu_access_pointer(pr_dev_info) == prdev) {
> +               if (static_key_enabled(&page_reporting_enabled))
> +                       static_branch_disable(&page_reporting_enabled);
>                 /* Disable page reporting notification */
>                 RCU_INIT_POINTER(pr_dev_info, NULL);
>                 synchronize_rcu();

If we are going to use this we are using it. Once we NULL out the
prdev that should stop page reporting from running. We shouldn't be
relying on the static key.

> diff --git a/mm/page_reporting.h b/mm/page_reporting.h
> index b8fb3bbb345f..86ac6ffad970 100644
> --- a/mm/page_reporting.h
> +++ b/mm/page_reporting.h
> @@ -9,9 +9,9 @@
>  #include <linux/slab.h>
>  #include <linux/pgtable.h>
>  #include <linux/scatterlist.h>
> +#include <linux/page_reporting.h>
>
> -#define PAGE_REPORTING_MIN_ORDER       pageblock_order
> -
> +extern int page_report_mini_order;
>  extern unsigned long page_report_batch_size;
>
>  #ifdef CONFIG_PAGE_REPORTING
> @@ -42,7 +42,7 @@ static inline void page_reporting_notify_free(unsigned int order)
>                 return;
>
>         /* Determine if we have crossed reporting threshold */
> -       if (order < PAGE_REPORTING_MIN_ORDER)
> +       if (order < page_report_mini_order)
>                 return;

This is another case where it might make sense to look at placing the
code in __page_reporting_notify if we are going to allow this to be a
dynamically configurable value.

>         batch_size += (1 << order) << PAGE_SHIFT;
> --
> 2.18.2
>
>
Liang Li Jan. 7, 2021, 3:29 a.m. UTC | #2
> >  enum {
> >         PAGE_REPORTING_IDLE = 0,
> > @@ -44,7 +45,7 @@ __page_reporting_request(struct page_reporting_dev_info *prdev)
> >          * now we are limiting this to running no more than once every
> >          * couple of seconds.
> >          */
> > -       schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
> > +       schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
> >  }
> >
>
> So this ends up being the reason why you needed to add the batch size
> value. However I don't really see it working as expected since you
> could essentially have 1 page freed 4M times that could trigger your
> page zeroing logic. So for example if a NIC is processing frames and
> ends up freeing and then reallocating some small batch of pages this
> could would be running often even though there isn't really all that
> many pages that needed zeroing.

Good catch, it does not work the way the name "batch size" implies.

> >  /* notify prdev of free page reporting request */
> > @@ -230,7 +231,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
> >
> >         /* Generate minimum watermark to be able to guarantee progress */
> >         watermark = low_wmark_pages(zone) +
> > -                   (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
> > +                   (PAGE_REPORTING_CAPACITY << prdev->mini_order);
> >
> >         /*
> >          * Cancel request if insufficient free memory or if we failed
>
> With the page order being able to be greatly reduced this could have a
> significant impact on if this code really has any value. Previously we
> were able to guarantee a pretty significant number of higher order
> pages free. With this we might only be guaranteeing something like 32
> 4K pages which is pretty small compared to what can end up being
> pulled out at the higher end.

I have dropped the 'buddy free page pre zero' patch, so the mini order will
no longer be changed to a small value.

> > @@ -240,7 +241,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev,
> >                 return err;
> >
> >         /* Process each free list starting from lowest order/mt */
> > -       for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
> > +       for (order = prdev->mini_order; order < MAX_ORDER; order++) {
> >                 for (mt = 0; mt < MIGRATE_TYPES; mt++) {
> >                         /* We do not pull pages from the isolate free list */
> >                         if (is_migrate_isolate(mt))
> > @@ -307,7 +308,7 @@ static void page_reporting_process(struct work_struct *work)
> >          */
> >         state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
> >         if (state == PAGE_REPORTING_REQUESTED)
> > -               schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
> > +               schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
> >  }
> >
> >  static DEFINE_MUTEX(page_reporting_mutex);
> > @@ -335,6 +336,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
> >         /* Assign device to allow notifications */
> >         rcu_assign_pointer(pr_dev_info, prdev);
> >
> > +       page_report_mini_order = prdev->mini_order;
> > +       page_report_batch_size = prdev->batch_size;
> >         /* enable page reporting notification */
> >         if (!static_key_enabled(&page_reporting_enabled)) {
> >                 static_branch_enable(&page_reporting_enabled);
> > @@ -352,6 +355,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
> >         mutex_lock(&page_reporting_mutex);
> >
> >         if (rcu_access_pointer(pr_dev_info) == prdev) {
> > +               if (static_key_enabled(&page_reporting_enabled))
> > +                       static_branch_disable(&page_reporting_enabled);
> >                 /* Disable page reporting notification */
> >                 RCU_INIT_POINTER(pr_dev_info, NULL);
> >                 synchronize_rcu();
>
> If we are going to use this we are using it. Once we NULL out the
> prdev that should stop page reporting from running. We shouldn't be
> relying on the static key.

The benefit of this is that the call to '__page_reporting_notify' in
'page_reporting_notify_free' can be skipped, which helps to save some
cycles.

> > diff --git a/mm/page_reporting.h b/mm/page_reporting.h
> > index b8fb3bbb345f..86ac6ffad970 100644
> > --- a/mm/page_reporting.h
> > +++ b/mm/page_reporting.h
> > @@ -9,9 +9,9 @@
> >  #include <linux/slab.h>
> >  #include <linux/pgtable.h>
> >  #include <linux/scatterlist.h>
> > +#include <linux/page_reporting.h>
> >
> > -#define PAGE_REPORTING_MIN_ORDER       pageblock_order
> > -
> > +extern int page_report_mini_order;
> >  extern unsigned long page_report_batch_size;
> >
> >  #ifdef CONFIG_PAGE_REPORTING
> > @@ -42,7 +42,7 @@ static inline void page_reporting_notify_free(unsigned int order)
> >                 return;
> >
> >         /* Determine if we have crossed reporting threshold */
> > -       if (order < PAGE_REPORTING_MIN_ORDER)
> > +       if (order < page_report_mini_order)
> >                 return;
>
> This is another case where it might make sense to look at placing the
> code in __page_reporting_notify if we are going to allow this to be a
> dynamically configurable value.

I will change it in the next revision. Thanks!

Liang
diff mbox series

Patch

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8985fc2cea86..684bcc39ef5a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -993,6 +993,9 @@  static int virtballoon_probe(struct virtio_device *vdev)
 			goto out_unregister_oom;
 		}
 
+		vb->pr_dev_info.mini_order = pageblock_order;
+		vb->pr_dev_info.batch_size = 16 * 1024 * 1024; /* 16M */
+		vb->pr_dev_info.delay_jiffies = 2 * HZ; /* 2 seconds */
 		err = page_reporting_register(&vb->pr_dev_info);
 		if (err)
 			goto out_unregister_oom;
diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
index 3b99e0ec24f2..63e1e9fbcaa2 100644
--- a/include/linux/page_reporting.h
+++ b/include/linux/page_reporting.h
@@ -13,6 +13,9 @@  struct page_reporting_dev_info {
 	int (*report)(struct page_reporting_dev_info *prdev,
 		      struct scatterlist *sg, unsigned int nents);
 
+	unsigned long batch_size;
+	unsigned long delay_jiffies;
+	int mini_order;
 	/* work struct for processing reports */
 	struct delayed_work work;
 
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 694df981ddd2..39bc6a9d7b73 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -13,6 +13,7 @@ 
 #define PAGE_REPORTING_DELAY	(2 * HZ)
 static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
 unsigned long page_report_batch_size  __read_mostly = 16 * 1024 * 1024UL;
+int page_report_mini_order = pageblock_order;
 
 enum {
 	PAGE_REPORTING_IDLE = 0,
@@ -44,7 +45,7 @@  __page_reporting_request(struct page_reporting_dev_info *prdev)
 	 * now we are limiting this to running no more than once every
 	 * couple of seconds.
 	 */
-	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+	schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
 }
 
 /* notify prdev of free page reporting request */
@@ -230,7 +231,7 @@  page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 
 	/* Generate minimum watermark to be able to guarantee progress */
 	watermark = low_wmark_pages(zone) +
-		    (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER);
+		    (PAGE_REPORTING_CAPACITY << prdev->mini_order);
 
 	/*
 	 * Cancel request if insufficient free memory or if we failed
@@ -240,7 +241,7 @@  page_reporting_process_zone(struct page_reporting_dev_info *prdev,
 		return err;
 
 	/* Process each free list starting from lowest order/mt */
-	for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) {
+	for (order = prdev->mini_order; order < MAX_ORDER; order++) {
 		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
 			/* We do not pull pages from the isolate free list */
 			if (is_migrate_isolate(mt))
@@ -307,7 +308,7 @@  static void page_reporting_process(struct work_struct *work)
 	 */
 	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
 	if (state == PAGE_REPORTING_REQUESTED)
-		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
+		schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
 }
 
 static DEFINE_MUTEX(page_reporting_mutex);
@@ -335,6 +336,8 @@  int page_reporting_register(struct page_reporting_dev_info *prdev)
 	/* Assign device to allow notifications */
 	rcu_assign_pointer(pr_dev_info, prdev);
 
+	page_report_mini_order = prdev->mini_order;
+	page_report_batch_size = prdev->batch_size;
 	/* enable page reporting notification */
 	if (!static_key_enabled(&page_reporting_enabled)) {
 		static_branch_enable(&page_reporting_enabled);
@@ -352,6 +355,8 @@  void page_reporting_unregister(struct page_reporting_dev_info *prdev)
 	mutex_lock(&page_reporting_mutex);
 
 	if (rcu_access_pointer(pr_dev_info) == prdev) {
+		if (static_key_enabled(&page_reporting_enabled))
+			static_branch_disable(&page_reporting_enabled);
 		/* Disable page reporting notification */
 		RCU_INIT_POINTER(pr_dev_info, NULL);
 		synchronize_rcu();
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index b8fb3bbb345f..86ac6ffad970 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -9,9 +9,9 @@ 
 #include <linux/slab.h>
 #include <linux/pgtable.h>
 #include <linux/scatterlist.h>
+#include <linux/page_reporting.h>
 
-#define PAGE_REPORTING_MIN_ORDER	pageblock_order
-
+extern int page_report_mini_order;
 extern unsigned long page_report_batch_size;
 
 #ifdef CONFIG_PAGE_REPORTING
@@ -42,7 +42,7 @@  static inline void page_reporting_notify_free(unsigned int order)
 		return;
 
 	/* Determine if we have crossed reporting threshold */
-	if (order < PAGE_REPORTING_MIN_ORDER)
+	if (order < page_report_mini_order)
 		return;
 
 	batch_size += (1 << order) << PAGE_SHIFT;