
[2/2] mm,memory_hotplug: {READ,WRITE}_ONCE unsynchronized zone data

Message ID 20240521-mm-hotplug-sync-v1-2-6d53706c1ba8@google.com (mailing list archive)
State New
Series Clean up hotplug zone data synchronization

Commit Message

Brendan Jackman May 21, 2024, 12:57 p.m. UTC
These fields are written by memory hotplug under mem_hotplug_lock but
read without any lock. It seems like reader code is robust against the
value being stale or "from the future", but we also need to account
for:

1. Load/store tearing (according to Linus[1], this really happens,
   even when everything is aligned as you would hope).

2. Invented loads[2] - the compiler can spill and re-read these fields
   ([2] calls this "invented loads") and assume that they have not
   changed.

Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
for write, but we still need WRITE_ONCE to prevent store-tearing.
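
For illustration, the intended pattern as a stand-alone sketch (toy READ_ONCE/WRITE_ONCE
macros and a made-up toy_zone, not the kernel definitions):

/* Stand-alone sketch only: toy macros and types, not kernel code. */
#include <stdio.h>

#define READ_ONCE(x)     (*(const volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&(x) = (v))

struct toy_zone {
        unsigned long spanned_pages;    /* written only under mem_hotplug_lock */
        unsigned long present_pages;    /* written only under mem_hotplug_lock */
};

/* Writer side: caller holds mem_hotplug_lock for write. */
static void toy_grow_zone(struct toy_zone *zone, unsigned long nr_pages)
{
        /*
         * The loads may be plain - no other writer can race with us - but
         * the stores must not be torn, because lockless readers exist.
         */
        WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
        WRITE_ONCE(zone->spanned_pages, zone->spanned_pages + nr_pages);
}

/* Reader side: no lock held, so mark the loads. */
static int toy_populated_zone(struct toy_zone *zone)
{
        return READ_ONCE(zone->present_pages) != 0;
}

int main(void)
{
        struct toy_zone z = { 0 };

        toy_grow_zone(&z, 512);
        printf("populated: %d\n", toy_populated_zone(&z));
        return 0;
}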

[1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
    As discovered via the original big-bad article[2]
[2] https://lwn.net/Articles/793253/

Signed-off-by: Brendan Jackman <jackmanb@google.com>
---
 include/linux/mmzone.h | 14 ++++++++++----
 mm/compaction.c        |  2 +-
 mm/memory_hotplug.c    | 20 ++++++++++++--------
 mm/mm_init.c           |  2 +-
 mm/page_alloc.c        |  2 +-
 mm/show_mem.c          |  8 ++++----
 mm/vmstat.c            |  4 ++--
 7 files changed, 31 insertions(+), 21 deletions(-)

Comments

Lance Yang May 22, 2024, 4:25 a.m. UTC | #1
Hi Brendan,

On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
>
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
>
> 1. Load/store tearing (according to Linus[1], this really happens,
>    even when everything is aligned as you would hope).
>
> 2. Invented loads[2] - the compiler can spill and re-read these fields
>    ([2] calls this "invented loads") and assume that they have not
>    changed.
>
> Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
> for write, but we still need WRITE_ONCE to prevent store-tearing.
>
> [1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
>     As discovered via the original big-bad article[2]
> [2] https://lwn.net/Articles/793253/
>
> Signed-off-by: Brendan Jackman <jackmanb@google.com>
> ---
>  include/linux/mmzone.h | 14 ++++++++++----
>  mm/compaction.c        |  2 +-
>  mm/memory_hotplug.c    | 20 ++++++++++++--------
>  mm/mm_init.c           |  2 +-
>  mm/page_alloc.c        |  2 +-
>  mm/show_mem.c          |  8 ++++----
>  mm/vmstat.c            |  4 ++--
>  7 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 194ef7fed9d6..bdb3be76d10c 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
>  #endif
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline unsigned long zone_end_pfn(const struct zone *zone)
>  {
> -       return zone->zone_start_pfn + zone->spanned_pages;
> +       return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
>  {
>         return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> @@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
>         return zone->initialized;
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline bool zone_is_empty(struct zone *zone)
>  {
> -       return zone->spanned_pages == 0;
> +       return READ_ONCE(zone->spanned_pages) == 0;
>  }
>
>  #ifndef BUILD_VDSO32_64
> @@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
>         return zone_managed_pages(zone);
>  }
>
> -/* Returns true if a zone has memory */
> +/*
> + * Returns true if a zone has memory.
> + * This is unstable unless you hold mem_hotplug_lock.
> + */
>  static inline bool populated_zone(struct zone *zone)
>  {
> -       return zone->present_pages;
> +       return READ_ONCE(zone->present_pages);
>  }
>
>  #ifdef CONFIG_NUMA
> diff --git a/mm/compaction.c b/mm/compaction.c
> index e731d45befc7..b8066d1fdcf5 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
>  {
>         unsigned long score;
>
> -       score = zone->present_pages * fragmentation_score_zone(zone);
> +       score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
>         return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
>  }
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 431b1f6753c0..71b5e3d314a2 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>         int nid = zone_to_nid(zone);
>
>         if (zone->zone_start_pfn == start_pfn) {
> +               unsigned long old_end_pfn = zone_end_pfn(zone);
> +
>                 /*
>                  * If the section is smallest section in the zone, it need
>                  * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
> @@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>                  * for shrinking zone.
>                  */
>                 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
> -                                               zone_end_pfn(zone));
> +                                               old_end_pfn);
>                 if (pfn) {
> -                       zone->spanned_pages = zone_end_pfn(zone) - pfn;
> +                       WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
>                         zone->zone_start_pfn = pfn;
>                 } else {
>                         zone->zone_start_pfn = 0;
> -                       zone->spanned_pages = 0;
> +                       WRITE_ONCE(zone->spanned_pages, 0);
>                 }
>         } else if (zone_end_pfn(zone) == end_pfn) {
>                 /*
> @@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>                 pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
>                                                start_pfn);
>                 if (pfn)
> -                       zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
> +                       WRITE_ONCE(zone->spanned_pages,
> +                                  pfn - zone->zone_start_pfn + 1);
>                 else {
>                         zone->zone_start_pfn = 0;
> -                       zone->spanned_pages = 0;
> +                       WRITE_ONCE(zone->spanned_pages, 0);
>                 }
>         }
>  }
> @@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
>         if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
>                 zone->zone_start_pfn = start_pfn;
>
> -       zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
> +       WRITE_ONCE(zone->spanned_pages,
> +                  max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
>  }
>
>  static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
> @@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
>                                             struct zone *zone)
>  {
>         if (zone_idx(zone) == ZONE_MOVABLE) {
> -               stats->movable_pages += zone->present_pages;
> +               stats->movable_pages += READ_ONCE(zone->present_pages);
>         } else {
>                 stats->kernel_early_pages += zone->present_early_pages;
>  #ifdef CONFIG_CMA
> @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
>          */
>         if (early_section(__pfn_to_section(page_to_pfn(page))))
>                 zone->present_early_pages += nr_pages;
> -       zone->present_pages += nr_pages;
> +       WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);

I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
on 'zone->present_pages', but it's probably just me overthinking it :)

Thanks,
Lance

>         zone->zone_pgdat->node_present_pages += nr_pages;
>
>         if (group && movable)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index c725618aeb58..ec66f2eadb95 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
>         for (z = 0; z < MAX_NR_ZONES; z++) {
>                 struct zone *zone = pgdat->node_zones + z;
>
> -               zone->present_pages = 0;
> +               WRITE_ONCE(zone->present_pages, 0);
>                 zone_init_internals(zone, z, nid, 0);
>         }
>  }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5116a2b9ea6e..1eb9000ec7d7 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)
>
>         if (populated_zone(zone))
>                 pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
> -                        zone->present_pages, zone_batchsize(zone));
> +                        READ_ONCE(zone->present_pages), zone_batchsize(zone));
>  }
>
>  void adjust_managed_page_count(struct page *page, long count)
> diff --git a/mm/show_mem.c b/mm/show_mem.c
> index bdb439551eef..667680a6107b 100644
> --- a/mm/show_mem.c
> +++ b/mm/show_mem.c
> @@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
>                         K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
>                         K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
>                         K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
> -                       K(zone->present_pages),
> +                       K(READ_ONCE(zone->present_pages)),
>                         K(zone_managed_pages(zone)),
>                         K(zone_page_state(zone, NR_MLOCK)),
>                         K(zone_page_state(zone, NR_BOUNCE)),
> @@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>
>         for_each_populated_zone(zone) {
>
> -               total += zone->present_pages;
> -               reserved += zone->present_pages - zone_managed_pages(zone);
> +               total += READ_ONCE(zone->present_pages);
> +               reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
>
>                 if (is_highmem(zone))
> -                       highmem += zone->present_pages;
> +                       highmem += READ_ONCE(zone->present_pages);
>         }
>
>         printk("%lu pages RAM\n", total);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 8507c497218b..5a9c4b5768e5 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>                    min_wmark_pages(zone),
>                    low_wmark_pages(zone),
>                    high_wmark_pages(zone),
> -                  zone->spanned_pages,
> -                  zone->present_pages,
> +                  READ_ONCE(zone->spanned_pages),
> +                  READ_ONCE(zone->present_pages),
>                    zone_managed_pages(zone),
>                    zone_cma_pages(zone));
>
>
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
>
Brendan Jackman May 22, 2024, 8:38 a.m. UTC | #2
Hi Lance, thanks for taking a look.

On Wed, May 22, 2024 at 12:25:30PM +0800, Lance Yang wrote:
> Hi Brendan,
> 
> On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
> > @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
> >          */
> >         if (early_section(__pfn_to_section(page_to_pfn(page))))
> >                 zone->present_early_pages += nr_pages;
> > -       zone->present_pages += nr_pages;
> > +       WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
> 
> I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
> on 'zone->present_pages', but it's probably just me overthinking it :)

Hmm.. this isn't for load-tearing, it's for store-tearing. I have a
feeling I might be missing your point here though, can you elaborate?

I have just noticed that the original "big bad optimizing compiler"
article[1] only says store-tearing has been observed in the wild when
the value being stored can be split into immediates (i.e. is
constant). But it doesn't really seem wise to rely on that. From what
I can tell from tools/memory-model/Documentation you are really out in
the wild with unmarked accesses.

[1] https://lwn.net/Articles/793253
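
To spell out what that kind of store-tearing would look like, here is a sketch;
the pseudo-assembly in the comments is illustrative, not real compiler output,
and the WRITE_ONCE is a toy definition so the snippet stands alone:

/* Sketch of the hazard; toy WRITE_ONCE so this compiles outside the tree. */
#define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&(x) = (v))

unsigned long shared;

void plain_store(void)
{
        /*
         * Unmarked store of a 64-bit constant: the compiler is allowed to
         * emit it as two 32-bit immediate stores, e.g. (illustrative)
         *
         *      movl $0x2, shared
         *      movl $0x1, shared+4
         *
         * so a concurrent reader can observe a half-updated value.
         */
        shared = 0x0000000100000002UL;
}

void marked_store(void)
{
        /*
         * The volatile access stops the compiler splitting the store; an
         * aligned word-sized access is then performed as a single store.
         */
        WRITE_ONCE(shared, 0x0000000100000002UL);
}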
Brendan Jackman May 22, 2024, 8:42 a.m. UTC | #3
On Tue, May 21, 2024 at 12:57:19PM +0000, Brendan Jackman wrote:
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
> 
> 1. Load/store tearing (according to Linus[1], this really happens,
>    even when everything is aligned as you would hope).
> 
> 2. Invented loads[2] - the compiler can spill and re-read these fields
>    ([2] calls this "invented loads") and assume that they have not
>    changed.
> 
> Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
> for write, but we still need WRITE_ONCE to prevent store-tearing.
> 
> [1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
>     As discovered via the original big-bad article[2]
> [2] https://lwn.net/Articles/793253/
> 
> Signed-off-by: Brendan Jackman <jackmanb@google.com>

Oh, from a quick look it seems cma_pages would need this too.
present_early_pages seems fine.

I'll wait a few days in case anyone points out this whole thing is
garbage, then check more carefully and send a v2.
Lance Yang May 22, 2024, 9:20 a.m. UTC | #4
On Wed, May 22, 2024 at 4:38 PM Brendan Jackman <jackmanb@google.com> wrote:
>
> Hi Lance, thanks for taking a look.
>
> On Wed, May 22, 2024 at 12:25:30PM +0800, Lance Yang wrote:
> > Hi Brendan,
> >
> > On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
> > > @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
> > >          */
> > >         if (early_section(__pfn_to_section(page_to_pfn(page))))
> > >                 zone->present_early_pages += nr_pages;
> > > -       zone->present_pages += nr_pages;
> > > +       WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
> >
> > I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
> > on 'zone->present_pages', but it's probably just me overthinking it :)
>
> Hmm.. this isn't for load-tearing, it's for store-tearing. I have a
> feeling I might be missing your point here though, can you elaborate?

Sorry, my explanation wasn't clear :(

I'm a bit confused about whether 'WRITE_ONCE(zone->present_pages,
zone->present_pages + nr_pages);'
is equivalent to the following:

1 a = zone->present_pages + nr_pages;
2 WRITE_ONCE(zone->present_pages, a);

If so, is there any possibility of load tearing on
'zone->present_pages' in line 1?

>
> I have just noticed that the original "big bad optimizing compiler"
> article[1] only says store-tearing has been observed in the wild when
> the value being stored can be split into immediates (i.e. is
> constant). But it doesn't really seem wise to rely on that. From what
> I can tell from tools/memory-model/Documentation you are really out in
> the wild with unmarked accesses.
>
> [1] https://lwn.net/Articles/793253

Thanks for clarifying!
Lance
Brendan Jackman May 22, 2024, 10:10 a.m. UTC | #5
On Wed, May 22, 2024 at 05:20:08PM +0800, Lance Yang wrote:
> On Wed, May 22, 2024 at 4:38 PM Brendan Jackman <jackmanb@google.com> wrote:
> >
> > Hi Lance, thanks for taking a look.
> >
> > On Wed, May 22, 2024 at 12:25:30PM +0800, Lance Yang wrote:
> > > Hi Brendan,
> > >
> > > On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
> > > > @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
> > > >          */
> > > >         if (early_section(__pfn_to_section(page_to_pfn(page))))
> > > >                 zone->present_early_pages += nr_pages;
> > > > -       zone->present_pages += nr_pages;
> > > > +       WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
> > >
> > > I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
> > > on 'zone->present_pages', but it's probably just me overthinking it :)
> >
> > Hmm.. this isn't for load-tearing, it's for store-tearing. I have a
> > feeling I might be missing your point here though, can you elaborate?
> 
> Sorry, my explanation wasn't clear :(
> 
> I'm a bit confused about whether 'WRITE_ONCE(zone->present_pages,
> zone->present_pages + nr_pages);'
> is equivalent to the following:
> 
> 1 a = zone->present_pages + nr_pages;
> 2 WRITE_ONCE(zone->present_pages, a);
> 
> If so, is there any possibility of load tearing on
> 'zone->present_pages' in line 1?

Ah gotcha, thanks for clarifying. Loads are protected by
mem_hotplug_lock here, so it's fine for them to get split up (because
the value can't change between loads). This is what I was referring to
in the bit of the commit message about not needing READ_ONCE.
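
Restating that in code: the statement decomposes roughly as below, and the plain
load is only OK because every writer of present_pages serializes on
mem_hotplug_lock. (Hypothetical helper name; a kernel-context sketch that would
need linux/mmzone.h and linux/compiler.h, not part of the patch.)

static void sketch_adjust_present_pages(struct zone *zone, long nr_pages)
{
        unsigned long tmp;

        /*
         * Plain load: the compiler may tear or re-issue it, which is
         * harmless because all writers hold mem_hotplug_lock for write,
         * so the value cannot change under us.
         */
        tmp = zone->present_pages;

        /*
         * Marked store: lockless readers must never observe a torn or
         * half-written value.
         */
        WRITE_ONCE(zone->present_pages, tmp + nr_pages);
}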
Lance Yang May 22, 2024, 11:23 a.m. UTC | #6
On Wed, May 22, 2024 at 6:10 PM Brendan Jackman <jackmanb@google.com> wrote:
>
> On Wed, May 22, 2024 at 05:20:08PM +0800, Lance Yang wrote:
> > On Wed, May 22, 2024 at 4:38 PM Brendan Jackman <jackmanb@google.com> wrote:
> > >
> > > Hi Lance, thanks for taking a look.
> > >
> > > On Wed, May 22, 2024 at 12:25:30PM +0800, Lance Yang wrote:
> > > > Hi Brendan,
> > > >
> > > > On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@google.com> wrote:
> > > > > @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
> > > > >          */
> > > > >         if (early_section(__pfn_to_section(page_to_pfn(page))))
> > > > >                 zone->present_early_pages += nr_pages;
> > > > > -       zone->present_pages += nr_pages;
> > > > > +       WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
> > > >
> > > > I'm not sure that using the WRITE_ONCE() wrapper would prevent load tearing
> > > > on 'zone->present_pages', but it's probably just me overthinking it :)
> > >
> > > Hmm.. this isn't for load-tearing, it's for store-tearing. I have a
> > > feeling I might be missing your point here though, can you elaborate?
> >
> > Sorry, my explanation wasn't clear :(
> >
> > I'm a bit confused about whether 'WRITE_ONCE(zone->present_pages,
> > zone->present_pages + nr_pages);'
> > is equivalent to the following:
> >
> > 1 a = zone->present_pages + nr_pages;
> > 2 WRITE_ONCE(zone->present_pages, a);
> >
> > If so, is there any possibility of load tearing on
> > 'zone->present_pages' in line 1?
>
> Ah gotcha, thanks for clarifying. Loads are protected by
> mem_hotplug_lock here, so it's fine for them to get split up (because
> the value can't change between loads). This is what I was referring to
> in the bit of the commit message about not needing READ_ONCE.

I see, thanks again for clarifying!
Lance
David Hildenbrand May 22, 2024, 2:05 p.m. UTC | #7
On 21.05.24 14:57, Brendan Jackman wrote:
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
> 
> 1. Load/store tearing (according to Linus[1], this really happens,
>     even when everything is aligned as you would hope).
> 
> 2. Invented loads[2] - the compiler can spill and re-read these fields
>     ([2] calls this "invented loads") and assume that they have not
>     changed.
> 
> Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
> for write, but we still need WRITE_ONCE to prevent store-tearing.
> 
> [1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@mail.gmail.com/T/#u
>      As discovered via the original big-bad article[2]
> [2] https://lwn.net/Articles/793253/
> 
> Signed-off-by: Brendan Jackman <jackmanb@google.com>
> ---
>   include/linux/mmzone.h | 14 ++++++++++----
>   mm/compaction.c        |  2 +-
>   mm/memory_hotplug.c    | 20 ++++++++++++--------
>   mm/mm_init.c           |  2 +-
>   mm/page_alloc.c        |  2 +-
>   mm/show_mem.c          |  8 ++++----
>   mm/vmstat.c            |  4 ++--
>   7 files changed, 31 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 194ef7fed9d6..bdb3be76d10c 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
>   #endif
>   }
>   
> +/* This is unstable unless you hold mem_hotplug_lock. */
>   static inline unsigned long zone_end_pfn(const struct zone *zone)
>   {
> -	return zone->zone_start_pfn + zone->spanned_pages;
> +	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);

It's weird to apply that logic only to spanned_pages, whereas 
zone_start_pfn can (and will) similarly change when onlining/offlining 
memory.
Brendan Jackman May 22, 2024, 2:11 p.m. UTC | #8
On Wed, May 22, 2024 at 04:05:12PM +0200, David Hildenbrand wrote:
> On 21.05.24 14:57, Brendan Jackman wrote:
> > +	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
> 
> It's weird to apply that logic only to spanned_pages, whereas zone_start_pfn
> can (and will) similarly change when onlining/offlining memory.
> 
Oh, yep. For some reason I had decided that zone_start_pfn was fixed
but that is (actually very obviously) not true!

Will take a closer look and extend v2 to cover that too, unless
someone finds a reason this whole patch is nonsense.

Thanks for the review.
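
For the record, a guess at the shape that part of a v2 could take (sketch only,
not the actual v2): mark zone_start_pfn the same way. The two loads are still
not atomic with respect to each other, so a reader can combine an old start
with a new span; that is only acceptable because, per the commit message,
readers tolerate stale or inconsistent values.

/* Sketch of extending the markings to zone_start_pfn; not the real v2. */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        /* Unstable unless mem_hotplug_lock is held. */
        return READ_ONCE(zone->zone_start_pfn) +
               READ_ONCE(zone->spanned_pages);
}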
Brendan Jackman May 31, 2024, 4:41 p.m. UTC | #9
On Wed, May 22, 2024 at 02:11:16PM +0000, Brendan Jackman wrote:
> On Wed, May 22, 2024 at 04:05:12PM +0200, David Hildenbrand wrote:
> > On 21.05.24 14:57, Brendan Jackman wrote:
> > > +	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
> > 
> > It's weird to apply that logic only to spanned_pages, whereas zone_start_pfn
> > can (and will) similarly change when onlining/offlining memory.
> > 
> Oh, yep. For some reason I had decided that zone_start_pfn was fixed
> but that is (actually very obviously) not true!
> 
> Will take a closer look and extend v2 to cover that too, unless
> someone finds a reason this whole patch is nonsense.
> 
> Thanks for the review.

Hmm so while poking around during spare moments this week I learned
that compaction.c also stores a bunch of data in struct zone that is
unsynchronized.

It seems pretty unlikely that you can corrupt any memory there (unless
there's some race possible with pfn_to_online_page, which is an
orthogonal question), but it does seem like if the compiler gets smart
with us we could maybe have a compaction run that takes quasi-forever
or something weird like that.

It seems easy enough to just spam READ_ONCE/WRITE_ONCE everywhere
there too; this would remove that risk, make KCSAN happy and serve as
a kinda "this is unsynchronized, take care" comment. (There's also at
least one place where we could put data_race()).

On the other hand it's a bit verbose & visually ugly. Personally I
think it's a pretty minor downside, but anyone feel differently?
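
For example, something along these lines (hypothetical helper names; a
kernel-context sketch of the idea, assuming CONFIG_COMPACTION, rather than a
proposed patch):

static unsigned long sketch_cached_free_pfn(struct zone *zone)
{
        /*
         * Heuristic-only value: data_race() documents the intentional race
         * and silences KCSAN without forcing a volatile access.
         */
        return data_race(zone->compact_cached_free_pfn);
}

static unsigned long sketch_zone_end_pfn(struct zone *zone)
{
        /*
         * Value whose tearing or invented reload could change behaviour:
         * keep the stronger READ_ONCE marking.
         */
        return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
}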

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 194ef7fed9d6..bdb3be76d10c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1018,11 +1018,13 @@  static inline unsigned long zone_cma_pages(struct zone *zone)
 #endif
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline unsigned long zone_end_pfn(const struct zone *zone)
 {
-	return zone->zone_start_pfn + zone->spanned_pages;
+	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 {
 	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
@@ -1033,9 +1035,10 @@  static inline bool zone_is_initialized(struct zone *zone)
 	return zone->initialized;
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_is_empty(struct zone *zone)
 {
-	return zone->spanned_pages == 0;
+	return READ_ONCE(zone->spanned_pages) == 0;
 }
 
 #ifndef BUILD_VDSO32_64
@@ -1485,10 +1488,13 @@  static inline bool managed_zone(struct zone *zone)
 	return zone_managed_pages(zone);
 }
 
-/* Returns true if a zone has memory */
+/*
+ * Returns true if a zone has memory.
+ * This is unstable unless you hold mem_hotplug_lock.
+ */
 static inline bool populated_zone(struct zone *zone)
 {
-	return zone->present_pages;
+	return READ_ONCE(zone->present_pages);
 }
 
 #ifdef CONFIG_NUMA
diff --git a/mm/compaction.c b/mm/compaction.c
index e731d45befc7..b8066d1fdcf5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2239,7 +2239,7 @@  static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
 {
 	unsigned long score;
 
-	score = zone->present_pages * fragmentation_score_zone(zone);
+	score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
 	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431b1f6753c0..71b5e3d314a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -463,6 +463,8 @@  static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 	int nid = zone_to_nid(zone);
 
 	if (zone->zone_start_pfn == start_pfn) {
+		unsigned long old_end_pfn = zone_end_pfn(zone);
+
 		/*
 		 * If the section is smallest section in the zone, it need
 		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -470,13 +472,13 @@  static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		 * for shrinking zone.
 		 */
 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
-						zone_end_pfn(zone));
+						old_end_pfn);
 		if (pfn) {
-			zone->spanned_pages = zone_end_pfn(zone) - pfn;
+			WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
 			zone->zone_start_pfn = pfn;
 		} else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	} else if (zone_end_pfn(zone) == end_pfn) {
 		/*
@@ -488,10 +490,11 @@  static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
 					       start_pfn);
 		if (pfn)
-			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+			WRITE_ONCE(zone->spanned_pages,
+				   pfn - zone->zone_start_pfn + 1);
 		else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	}
 }
@@ -710,7 +713,8 @@  static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
 	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
 		zone->zone_start_pfn = start_pfn;
 
-	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+	WRITE_ONCE(zone->spanned_pages,
+		   max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
 }
 
 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
@@ -795,7 +799,7 @@  static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
 					    struct zone *zone)
 {
 	if (zone_idx(zone) == ZONE_MOVABLE) {
-		stats->movable_pages += zone->present_pages;
+		stats->movable_pages += READ_ONCE(zone->present_pages);
 	} else {
 		stats->kernel_early_pages += zone->present_early_pages;
 #ifdef CONFIG_CMA
@@ -1077,7 +1081,7 @@  void adjust_present_page_count(struct page *page, struct memory_group *group,
 	 */
 	if (early_section(__pfn_to_section(page_to_pfn(page))))
 		zone->present_early_pages += nr_pages;
-	zone->present_pages += nr_pages;
+	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
 	zone->zone_pgdat->node_present_pages += nr_pages;
 
 	if (group && movable)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c725618aeb58..ec66f2eadb95 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1540,7 +1540,7 @@  void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
 	for (z = 0; z < MAX_NR_ZONES; z++) {
 		struct zone *zone = pgdat->node_zones + z;
 
-		zone->present_pages = 0;
+		WRITE_ONCE(zone->present_pages, 0);
 		zone_init_internals(zone, z, nid, 0);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5116a2b9ea6e..1eb9000ec7d7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5728,7 +5728,7 @@  __meminit void zone_pcp_init(struct zone *zone)
 
 	if (populated_zone(zone))
 		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
-			 zone->present_pages, zone_batchsize(zone));
+			 READ_ONCE(zone->present_pages), zone_batchsize(zone));
 }
 
 void adjust_managed_page_count(struct page *page, long count)
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..667680a6107b 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -337,7 +337,7 @@  static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
 			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
 			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
-			K(zone->present_pages),
+			K(READ_ONCE(zone->present_pages)),
 			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_BOUNCE)),
@@ -407,11 +407,11 @@  void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 
 	for_each_populated_zone(zone) {
 
-		total += zone->present_pages;
-		reserved += zone->present_pages - zone_managed_pages(zone);
+		total += READ_ONCE(zone->present_pages);
+		reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
 
 		if (is_highmem(zone))
-			highmem += zone->present_pages;
+			highmem += READ_ONCE(zone->present_pages);
 	}
 
 	printk("%lu pages RAM\n", total);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8507c497218b..5a9c4b5768e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1708,8 +1708,8 @@  static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone->spanned_pages,
-		   zone->present_pages,
+		   READ_ONCE(zone->spanned_pages),
+		   READ_ONCE(zone->present_pages),
 		   zone_managed_pages(zone),
 		   zone_cma_pages(zone));