[6/7] mm: parallelize deferred_init_memmap()

Message ID 20200430201125.532129-7-daniel.m.jordan@oracle.com (mailing list archive)
State RFC
Delegated to: Herbert Xu
Series padata: parallelize deferred page init

Commit Message

Daniel Jordan April 30, 2020, 8:11 p.m. UTC
Deferred struct page init currently runs with one thread per node,
which is a significant bottleneck at boot on big machines, where it is
often the largest single component of boot time. Parallelize the work
within each node to reduce system downtime.

The maximum number of threads is capped at the number of CPUs on the
node: on every system tested, speedup kept improving with additional
threads all the way up to the node's CPU count, and at this phase of
boot the system is otherwise idle, waiting on page init to finish.

Helper threads operate on MAX_ORDER_NR_PAGES-aligned ranges to avoid
accessing uninitialized buddy pages, so set the job's alignment
accordingly.

The minimum chunk size is also MAX_ORDER_NR_PAGES because multiple
threads showed a benefit even on relatively small-memory (1G) systems.
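
For reference, padata splits a job like this roughly as follows (a
simplified sketch based on the multithreading support added earlier in
this series, not the exact padata code; see padata_do_multithreaded()
for the real logic):

    unsigned long nworks, chunk_size;

    /* No more threads than there are min_chunk-sized units of work. */
    nworks = max(job->size / job->min_chunk, 1ul);
    nworks = min(nworks, (unsigned long)job->max_threads);

    /* Give each thread several chunks to smooth out uneven finishes. */
    chunk_size = job->size / (nworks * 4);
    chunk_size = max(chunk_size, job->min_chunk);
    chunk_size = roundup(chunk_size, job->align);

With align and min_chunk both MAX_ORDER_NR_PAGES, every chunk boundary
lands on a max-order boundary, and even a 1G zone still splits into
multiple chunks.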

    Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
      2 nodes * 26 cores * 2 threads = 104 CPUs
      384G/node = 768G memory

                   kernel boot                 deferred init
                   ------------------------    ------------------------
                   speedup  time_ms (stdev)    speedup  time_ms (stdev)
         base           --   4056.7 (  5.5)         --   1763.3 (  4.2)
         test        39.9%   2436.7 (  2.1)      91.8%    144.3 (  5.9)

    Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, bare metal)
      1 node * 16 cores * 2 threads = 32 CPUs
      192G/node = 192G memory

                   kernel boot                 deferred init
                   ------------------------    ------------------------
                   speedup  time_ms (stdev)    speedup  time_ms (stdev)
         base           --   1957.3 ( 14.0)         --   1093.7 ( 12.9)
         test        49.1%    996.0 (  7.2)      88.4%    127.3 (  5.1)

    Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
      2 nodes * 18 cores * 2 threads = 72 CPUs
      128G/node = 256G memory

                   kernel boot                 deferred init
                   ------------------------    ------------------------
                   speedup  time_ms (stdev)    speedup  time_ms (stdev)
         base           --   1666.0 (  3.5)         --    618.0 (  3.5)
         test        31.3%   1145.3 (  1.5)      85.6%     89.0 (  1.7)

    AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
      1 node * 8 cores * 2 threads = 16 CPUs
      64G/node = 64G memory

                   kernel boot                 deferred init
                   ------------------------    ------------------------
                   speedup  time_ms (stdev)    speedup  time_ms (stdev)
         base           --   1029.7 ( 42.3)         --    253.7 (  3.1)
         test        23.3%    789.3 ( 15.0)      76.3%     60.0 (  5.6)

Server-oriented distros that enable deferred page init sometimes run in
small VMs, and they still benefit even though the fraction of boot time
saved is smaller:

    AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
      1 node * 2 cores * 2 threads = 4 CPUs
      16G/node = 16G memory

                   kernel boot                 deferred init
                   ------------------------    ------------------------
                   speedup  time_ms (stdev)    speedup  time_ms (stdev)
         base           --    757.7 ( 17.1)         --     57.0 (  0.0)
         test         6.2%    710.3 ( 15.0)      63.2%     21.0 (  0.0)

    Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
      1 node * 2 cores * 2 threads = 4 CPUs
      14G/node = 14G memory

                   kernel boot                 deferred init
                   ------------------------    ------------------------
                   speedup  time_ms (stdev)    speedup  time_ms (stdev)
         base           --    656.3 (  7.1)         --     57.3 (  1.5)
         test         8.6%    599.7 (  5.9)      62.8%     21.3 (  1.2)

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 mm/Kconfig      |  6 +++---
 mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 11 deletions(-)

Comments

Alexander Duyck May 4, 2020, 10:33 p.m. UTC | #1
On Thu, Apr 30, 2020 at 1:12 PM Daniel Jordan
<daniel.m.jordan@oracle.com> wrote:
>
> Deferred struct page init currently runs with one thread per node,
> which is a significant bottleneck at boot on big machines, where it
> is often the largest single component of boot time. Parallelize the
> work within each node to reduce system downtime.
>
> The maximum number of threads is capped at the number of CPUs on the
> node: on every system tested, speedup kept improving with additional
> threads all the way up to the node's CPU count, and at this phase of
> boot the system is otherwise idle, waiting on page init to finish.
>
> Helper threads operate on MAX_ORDER_NR_PAGES-aligned ranges to avoid
> accessing uninitialized buddy pages, so set the job's alignment
> accordingly.
>
> The minimum chunk size is also MAX_ORDER_NR_PAGES because multiple
> threads showed a benefit even on relatively small-memory (1G)
> systems.
>
>     Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
>       2 nodes * 26 cores * 2 threads = 104 CPUs
>       384G/node = 768G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>                    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>          base           --   4056.7 (  5.5)         --   1763.3 (  4.2)
>          test        39.9%   2436.7 (  2.1)      91.8%    144.3 (  5.9)
>
>     Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, bare metal)
>       1 node * 16 cores * 2 threads = 32 CPUs
>       192G/node = 192G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>                    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>          base           --   1957.3 ( 14.0)         --   1093.7 ( 12.9)
>          test        49.1%    996.0 (  7.2)      88.4%    127.3 (  5.1)
>
>     Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
>       2 nodes * 18 cores * 2 threads = 72 CPUs
>       128G/node = 256G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>                    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>          base           --   1666.0 (  3.5)         --    618.0 (  3.5)
>          test        31.3%   1145.3 (  1.5)      85.6%     89.0 (  1.7)
>
>     AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
>       1 node * 8 cores * 2 threads = 16 CPUs
>       64G/node = 64G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>                    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>          base           --   1029.7 ( 42.3)         --    253.7 (  3.1)
>          test        23.3%    789.3 ( 15.0)      76.3%     60.0 (  5.6)
>
> Server-oriented distros that enable deferred page init sometimes run in
> small VMs, and they still benefit even though the fraction of boot time
> saved is smaller:
>
>     AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
>       1 node * 2 cores * 2 threads = 4 CPUs
>       16G/node = 16G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>                    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>          base           --    757.7 ( 17.1)         --     57.0 (  0.0)
>          test         6.2%    710.3 ( 15.0)      63.2%     21.0 (  0.0)
>
>     Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
>       1 node * 2 cores * 2 threads = 4 CPUs
>       14G/node = 14G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>                    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>          base           --    656.3 (  7.1)         --     57.3 (  1.5)
>          test         8.6%    599.7 (  5.9)      62.8%     21.3 (  1.2)
>
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> ---
>  mm/Kconfig      |  6 +++---
>  mm/page_alloc.c | 46 ++++++++++++++++++++++++++++++++++++++--------
>  2 files changed, 41 insertions(+), 11 deletions(-)
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index ab80933be65ff..e5007206c7601 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -622,13 +622,13 @@ config DEFERRED_STRUCT_PAGE_INIT
>         depends on SPARSEMEM
>         depends on !NEED_PER_CPU_KM
>         depends on 64BIT
> +       select PADATA
>         help
>           Ordinarily all struct pages are initialised during early boot in a
>           single thread. On very large machines this can take a considerable
>           amount of time. If this option is set, large machines will bring up
> -         a subset of memmap at boot and then initialise the rest in parallel
> -         by starting one-off "pgdatinitX" kernel thread for each node X. This
> -         has a potential performance impact on processes running early in the
> +         a subset of memmap at boot and then initialise the rest in parallel.
> +         This has a potential performance impact on tasks running early in the
>           lifetime of the system until these kthreads finish the
>           initialisation.
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 990514d8f0d94..96d6d0d920c27 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -68,6 +68,7 @@
>  #include <linux/lockdep.h>
>  #include <linux/nmi.h>
>  #include <linux/psi.h>
> +#include <linux/padata.h>
>
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
> @@ -1729,6 +1730,25 @@ deferred_init_maxorder(struct zone *zone, unsigned long *start_pfn,
>         return nr_pages;
>  }
>
> +struct def_init_args {
> +       struct zone *zone;
> +       atomic_long_t nr_pages;
> +};
> +
> +static void __init deferred_init_memmap_chunk(unsigned long spfn,
> +                                             unsigned long epfn, void *arg)
> +{
> +       struct def_init_args *args = arg;
> +       unsigned long nr_pages = 0;
> +
> +       while (spfn < epfn) {
> +               nr_pages += deferred_init_maxorder(args->zone, &spfn, epfn);
> +               cond_resched();
> +       }
> +
> +       atomic_long_add(nr_pages, &args->nr_pages);
> +}
> +
>  /* Initialise remaining memory on a node */
>  static int __init deferred_init_memmap(void *data)
>  {
> @@ -1738,7 +1758,7 @@ static int __init deferred_init_memmap(void *data)
>         unsigned long first_init_pfn, flags;
>         unsigned long start = jiffies;
>         struct zone *zone;
> -       int zid;
> +       int zid, max_threads;
>         u64 i;
>
>         /* Bind memory initialisation thread to a local node if possible */
> @@ -1778,15 +1798,25 @@ static int __init deferred_init_memmap(void *data)
>                 goto zone_empty;
>
>         /*
> -        * Initialize and free pages in MAX_ORDER sized increments so
> -        * that we can avoid introducing any issues with the buddy
> -        * allocator.
> +        * More CPUs always led to greater speedups on tested systems, up to
> +        * all the nodes' CPUs.  Use all since the system is otherwise idle now.
>          */

I would be curious about your data. That isn't what I have seen in the
past. Typically only up to about 8 or 10 CPUs gives you any benefit;
beyond that I was usually cache/memory bandwidth bound.

> +       max_threads = max(cpumask_weight(cpumask), 1u);
> +

We will need to gather data on whether having a ton of threads works
for all architectures. For x86 I think we are freeing back pages in
pageblock_order sized chunks, so we only have to touch them once to
initialize and then free the two pageblock_order chunks into the buddy
allocator.
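
For reference, the x86 fast path being described is, as best I recall
from page_alloc.c, roughly this (check deferred_free_range() for the
exact form):

    /* Free a large naturally-aligned chunk if possible */
    if (nr_pages == pageblock_nr_pages &&
        (pfn & (pageblock_nr_pages - 1)) == 0) {
            set_pageblock_migratetype(page, MIGRATE_MOVABLE);
            __free_pages_core(page, pageblock_order);
            return;
    }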

>         for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
> -               while (spfn < epfn) {
> -                       nr_pages += deferred_init_maxorder(zone, &spfn, epfn);
> -                       cond_resched();
> -               }
> +               struct def_init_args args = { zone, ATOMIC_LONG_INIT(0) };
> +               struct padata_mt_job job = {
> +                       .thread_fn   = deferred_init_memmap_chunk,
> +                       .fn_arg      = &args,
> +                       .start       = spfn,
> +                       .size        = epfn - spfn,
> +                       .align       = MAX_ORDER_NR_PAGES,
> +                       .min_chunk   = MAX_ORDER_NR_PAGES,
> +                       .max_threads = max_threads,
> +               };
> +
> +               padata_do_multithreaded(&job);
> +               nr_pages += atomic_long_read(&args.nr_pages);
>         }
>  zone_empty:
>         /* Sanity check that the next zone really is unpopulated */

Okay, so looking at this I can see why you wanted to structure the
other patch the way you did. However, I am not sure that is the best
way to go about doing it. It might make more sense to go through and
accumulate sections. If you hit the end of a range and the start of
the next range is in another section, then you split it off as a new
job; otherwise I would just accumulate it into the current job. You
could then section-align the work and be more or less guaranteed that
each worker thread is generating finished work products, and not
incomplete max-order pages.

That solution would work with the existing code as well, since you
could basically just compare the start pfn coming out of
deferred_init_maxorder() against the end of the chunk to determine
whether you should exit or not.
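
Concretely, the section-aligned variant might configure the job like
this (a sketch of the idea only, not tested code; PAGES_PER_SECTION
takes the place of MAX_ORDER_NR_PAGES for both the alignment and the
minimum chunk):

    struct padata_mt_job job = {
            .thread_fn   = deferred_init_memmap_chunk,
            .fn_arg      = &args,
            .start       = spfn,
            .size        = epfn - spfn,
            .align       = PAGES_PER_SECTION,
            .min_chunk   = PAGES_PER_SECTION,
            .max_threads = max_threads,
    };

Each worker would then hand back whole, finished sections rather than
potentially incomplete max-order ranges.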
Josh Triplett May 4, 2020, 11:38 p.m. UTC | #2
On May 4, 2020 3:33:58 PM PDT, Alexander Duyck <alexander.duyck@gmail.com> wrote:
>On Thu, Apr 30, 2020 at 1:12 PM Daniel Jordan
><daniel.m.jordan@oracle.com> wrote:
>>         /*
>> -        * Initialize and free pages in MAX_ORDER sized increments so
>> -        * that we can avoid introducing any issues with the buddy
>> -        * allocator.
>> +        * More CPUs always led to greater speedups on tested
>systems, up to
>> +        * all the nodes' CPUs.  Use all since the system is
>otherwise idle now.
>>          */
>
>I would be curious about your data. That isn't what I have seen in the
>past. Typically only up to about 8 or 10 CPUs gives you any benefit,
>beyond that I was usually cache/memory bandwidth bound.

I've found pretty much linear performance up to memory bandwidth, and on the systems I was testing, I didn't saturate memory bandwidth until about the full number of physical cores. From number of cores up to number of threads, the performance stayed about flat; it didn't get any better or worse.

- Josh
Alexander Duyck May 5, 2020, 12:40 a.m. UTC | #3
On Mon, May 4, 2020 at 4:44 PM Josh Triplett <josh@joshtriplett.org> wrote:
>
> On May 4, 2020 3:33:58 PM PDT, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> >On Thu, Apr 30, 2020 at 1:12 PM Daniel Jordan
> ><daniel.m.jordan@oracle.com> wrote:
> >>         /*
> >> -        * Initialize and free pages in MAX_ORDER sized increments so
> >> -        * that we can avoid introducing any issues with the buddy
> >> -        * allocator.
> >> +        * More CPUs always led to greater speedups on tested
> >systems, up to
> >> +        * all the nodes' CPUs.  Use all since the system is
> >otherwise idle now.
> >>          */
> >
> >I would be curious about your data. That isn't what I have seen in the
> >past. Typically only up to about 8 or 10 CPUs gives you any benefit,
> >beyond that I was usually cache/memory bandwidth bound.
>
> I've found pretty much linear performance up to memory bandwidth, and on the systems I was testing, I didn't saturate memory bandwidth until about the full number of physical cores. From number of cores up to number of threads, the performance stayed about flat; it didn't get any better or worse.

That doesn't sound right though based on the numbers you provided. The
system you had was 192GB spread over 2 nodes with 48 threads/24 cores
per node, correct? Your numbers went from ~290ms to ~28ms, a 10x
decrease; that doesn't sound linear when you spread the work over 24
cores to get there. I agree that the numbers largely stay flat once
you hit the peak; I saw similar behavior when I was working on the
deferred init code previously. One concern I have, though, is that we
may end up seeing better performance with a subset of cores instead of
running all of the cores/threads, especially if features such as turbo
come into play. In addition, we are talking x86 only so far. I would
be interested in seeing whether this has benefits for other
architectures.

Also, what is the penalty being paid to break up the work beforehand
and set it up for the parallel workers? I would be interested in
seeing what the cost is on a system with fewer cores per node, maybe
even down to 1. That would tell us how much additional overhead is
being added to set things up to run in parallel. If I get a chance
tomorrow I might try applying the patches and doing some testing
myself.

Thanks.

- Alex
Daniel Jordan May 5, 2020, 1:26 a.m. UTC | #4
On Mon, May 04, 2020 at 03:33:58PM -0700, Alexander Duyck wrote:
> On Thu, Apr 30, 2020 at 1:12 PM Daniel Jordan
> > @@ -1778,15 +1798,25 @@ static int __init deferred_init_memmap(void *data)
> >                 goto zone_empty;
> >
> >         /*
> > -        * Initialize and free pages in MAX_ORDER sized increments so
> > -        * that we can avoid introducing any issues with the buddy
> > -        * allocator.
> > +        * More CPUs always led to greater speedups on tested systems, up to
> > +        * all the nodes' CPUs.  Use all since the system is otherwise idle now.
> >          */
> 
> I would be curious about your data. That isn't what I have seen in the
> past. Typically only up to about 8 or 10 CPUs gives you any benefit,
> beyond that I was usually cache/memory bandwidth bound.

I was surprised too!  For most of its development, this set had an
interface to get the number of cores, on the theory that this was
about where the bandwidth got saturated, but the data showed otherwise.

There were diminishing returns, but they were more apparent on Haswell
than on Skylake, for instance.  I'll post some more data later in the
thread where you guys are talking about it.

> 
> > +       max_threads = max(cpumask_weight(cpumask), 1u);
> > +
> 
> We will need to gather data on if having a ton of threads works for
> all architectures.

Agreed.  I'll rope in some of the arch lists in the next version and include
the debugging knob to vary the thread count.

> For x86 I think we are freeing back pages in
> pageblock_order sized chunks so we only have to touch them once in
> initialize and then free the two pageblock_order chunks into the buddy
> allocator.
> 
> >         for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
> > -               while (spfn < epfn) {
> > -                       nr_pages += deferred_init_maxorder(zone, &spfn, epfn);
> > -                       cond_resched();
> > -               }
> > +               struct def_init_args args = { zone, ATOMIC_LONG_INIT(0) };
> > +               struct padata_mt_job job = {
> > +                       .thread_fn   = deferred_init_memmap_chunk,
> > +                       .fn_arg      = &args,
> > +                       .start       = spfn,
> > +                       .size        = epfn - spfn,
> > +                       .align       = MAX_ORDER_NR_PAGES,
> > +                       .min_chunk   = MAX_ORDER_NR_PAGES,
> > +                       .max_threads = max_threads,
> > +               };
> > +
> > +               padata_do_multithreaded(&job);
> > +               nr_pages += atomic_long_read(&args.nr_pages);
> >         }
> >  zone_empty:
> >         /* Sanity check that the next zone really is unpopulated */
> 
> Okay so looking at this I can see why you wanted to structure the
> other patch the way you did. However I am not sure that is the best
> way to go about doing it. It might make more sense to go through and
> accumulate sections. If you hit the end of a range and the start of
> the next range is in another section, then you split it as a new job,
> otherwise I would just accumulate it into the current job. You then
> could section align the work and be more or less guaranteed that each
> worker thread should be generating finished work products, and not
> incomplete max order pages.

This guarantee holds now with the max-order alignment passed to padata, so I
don't see what more doing it on section boundaries buys us.
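
To make that concrete with illustrative numbers: on x86_64 with 4K
pages, MAX_ORDER_NR_PAGES is 1024 pfns, and padata rounds every chunk
boundary up to a multiple of the job's alignment, so a job covering
pfns [0, 10000) with 1024-pfn chunks splits as

    [0, 1024), [1024, 2048), ..., [9216, 10000)

and never at an arbitrary offset.  Each max-order block belongs to
exactly one worker, which is what keeps deferred_init_maxorder() from
seeing a neighbor's partially initialized pages.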
Daniel Jordan May 5, 2020, 1:48 a.m. UTC | #5
On Mon, May 04, 2020 at 05:40:19PM -0700, Alexander Duyck wrote:
> On Mon, May 4, 2020 at 4:44 PM Josh Triplett <josh@joshtriplett.org> wrote:
> >
> > On May 4, 2020 3:33:58 PM PDT, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > >On Thu, Apr 30, 2020 at 1:12 PM Daniel Jordan
> > ><daniel.m.jordan@oracle.com> wrote:
> > >>         /*
> > >> -        * Initialize and free pages in MAX_ORDER sized increments so
> > >> -        * that we can avoid introducing any issues with the buddy
> > >> -        * allocator.
> > >> +        * More CPUs always led to greater speedups on tested
> > >systems, up to
> > >> +        * all the nodes' CPUs.  Use all since the system is
> > >otherwise idle now.
> > >>          */
> > >
> > >I would be curious about your data. That isn't what I have seen in the
> > >past. Typically only up to about 8 or 10 CPUs gives you any benefit,
> > >beyond that I was usually cache/memory bandwidth bound.

On Skylake it took more than 8 or 10 CPUs, though on other machines the benefit
of using all versus half or 3/4 of the CPUs is less significant.

Given that the rest of the system is idle at this point, my main concern is
whether other archs regress past a certain thread count.


    Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
      2 nodes * 26 cores * 2 threads = 104 CPUs
      384G/node = 768G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   4056.7 (  5.5)         --   1763.3 (  4.2)
          (  1)      -2.3%   4153.3 (  2.5)      -5.3%   1861.7 (  5.5)
      12% (  6)      53.8%   2637.7 ( 38.7)     408.7%    346.7 ( 37.5)
      25% ( 13)      62.4%   2497.3 ( 38.5)     739.7%    210.0 ( 41.8)
      37% ( 19)      63.8%   2477.0 ( 19.0)     851.4%    185.3 ( 21.5)
      50% ( 26)      64.1%   2471.7 ( 21.4)     881.4%    179.7 ( 25.8)
      75% ( 39)      65.2%   2455.7 ( 33.2)     990.7%    161.7 ( 29.3)
     100% ( 52)      66.5%   2436.7 (  2.1)    1121.7%    144.3 (  5.9)


    Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, bare metal)
      1 node * 16 cores * 2 threads = 32 CPUs
      192G/node = 192G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   1957.3 ( 14.0)         --   1093.7 ( 12.9)
          (  1)       1.4%   1930.7 ( 10.0)       3.8%   1053.3 (  7.6)
      12% (  4)      70.0%   1151.7 (  9.0)     292.5%    278.7 (  0.6)
      25% (  8)      86.2%   1051.0 (  7.8)     514.4%    178.0 (  2.6)
      37% ( 12)      95.1%   1003.3 (  7.6)     672.0%    141.7 (  3.8)
      50% ( 16)      93.0%   1014.3 ( 20.0)     720.2%    133.3 (  3.2)
      75% ( 24)      97.8%    989.3 (  6.7)     765.7%    126.3 (  1.5)
     100% ( 32)      96.5%    996.0 (  7.2)     758.9%    127.3 (  5.1)
    

    Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
      2 nodes * 18 cores * 2 threads = 72 CPUs
      128G/node = 256G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   1666.0 (  3.5)         --    618.0 (  3.5)
          (  1)       1.0%   1649.7 (  1.5)       3.0%    600.0 (  1.0)
      12% (  4)      34.9%   1234.7 ( 21.4)     237.7%    183.0 ( 22.5)
      25% (  9)      42.0%   1173.0 ( 10.0)     417.9%    119.3 (  9.6)
      37% ( 13)      44.4%   1153.7 ( 17.0)     524.2%     99.0 ( 15.6)
      50% ( 18)      44.8%   1150.3 ( 15.5)     534.9%     97.3 ( 16.2)
      75% ( 27)      44.8%   1150.3 (  2.5)     550.5%     95.0 (  5.6)
     100% ( 36)      45.5%   1145.3 (  1.5)     594.4%     89.0 (  1.7)
    

    AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
      1 node * 8 cores * 2 threads = 16 CPUs
      64G/node = 64G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   1029.7 ( 42.3)         --    253.7 (  3.1)
          (  1)       3.4%    995.3 ( 21.4)       4.5%    242.7 (  5.5)
      12% (  2)      16.3%    885.7 ( 24.4)      86.5%    136.0 (  5.2)
      25% (  4)      23.3%    835.0 ( 21.5)     195.0%     86.0 (  1.7)
      37% (  6)      28.0%    804.7 ( 15.7)     249.1%     72.7 (  2.1)
      50% (  8)      26.3%    815.3 ( 11.7)     290.3%     65.0 (  3.5)
      75% ( 12)      30.7%    787.7 (  2.1)     284.3%     66.0 (  3.6)
     100% ( 16)      30.4%    789.3 ( 15.0)     322.8%     60.0 (  5.6)
    
    
    AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
      1 node * 2 cores * 2 threads = 4 CPUs
      16G/node = 16G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --    757.7 ( 17.1)         --     57.0 (  0.0)
      25% (  1)      -1.0%    765.3 (  5.5)       3.6%     55.0 (  0.0)
      50% (  2)       4.9%    722.3 ( 21.5)      74.5%     32.7 (  4.6)
      75% (  3)       3.8%    729.7 (  4.9)     119.2%     26.0 (  0.0)
     100% (  4)       6.7%    710.3 ( 15.0)     171.4%     21.0 (  0.0)
    

    Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
      1 node * 2 cores * 2 threads = 4 CPUs
      14G/node = 14G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --    656.3 (  7.1)         --     57.3 (  1.5)
      25% (  1)       1.8%    644.7 (  3.1)       0.6%     57.0 (  0.0)
      50% (  2)       7.0%    613.7 (  5.1)      68.6%     34.0 (  5.3)
      75% (  3)       7.4%    611.3 (  6.7)     135.6%     24.3 (  0.6)
     100% (  4)       9.4%    599.7 (  5.9)     168.8%     21.3 (  1.2)


> > I've found pretty much linear performance up to memory bandwidth, and on the systems I was testing, I didn't saturate memory bandwidth until about the full number of physical cores. From number of cores up to number of threads, the performance stayed about flat; it didn't get any better or worse.
> 
> That doesn't sound right though based on the numbers you provided. The
> system you had was 192GB spread over 2 nodes with 48thread/24core per
> node, correct? Your numbers went from ~290ms to ~28ms so a 10x
> decrease, that doesn't sound linear when you spread the work over 24
> cores to get there. I agree that the numbers largely stay flat once
> you hit the peak, I have seen similar behavior when I was working on
> the deferred init code previously. One concern I have though is that
> we may end up seeing better performance with a subset of cores instead
> of running all of the cores/threads, especially if features such as
> turbo come into play. In addition we are talking x86 only so far. I
> would be interested in seeing if this has benefits or not for other
> architectures.
> 
> Also what is the penalty that is being paid in order to break up the
> work before-hand and set it up for the parallel work? I would be
> interested in seeing what the cost is on a system with fewer cores per
> node, maybe even down to 1. That would tell us how much additional
> overhead is being added to set things up to run in parallel.

The numbers above have the 1-thread case.  It seems close to the noise.

> If I get
> a chance tomorrow I might try applying the patches and doing some
> testing myself.

If you end up doing that, you might find this helpful:
    https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=patch;h=afc72bf8478b95a1d6d174c269ff3693c60630e0
    
and maybe this:
    https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=patch;h=dff6537eab281e5a9917682c4adf9059c0574223

Thanks for looking this over.

[ By the way, I'm going to be out Tuesday but back the rest of the week. ]
Daniel Jordan May 5, 2020, 2:09 a.m. UTC | #6
On Mon, May 04, 2020 at 09:48:44PM -0400, Daniel Jordan wrote:
> On Mon, May 04, 2020 at 05:40:19PM -0700, Alexander Duyck wrote:
> > On Mon, May 4, 2020 at 4:44 PM Josh Triplett <josh@joshtriplett.org> wrote:
> > >
> > > On May 4, 2020 3:33:58 PM PDT, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > > >On Thu, Apr 30, 2020 at 1:12 PM Daniel Jordan
> > > ><daniel.m.jordan@oracle.com> wrote:
> > > >>         /*
> > > >> -        * Initialize and free pages in MAX_ORDER sized increments so
> > > >> -        * that we can avoid introducing any issues with the buddy
> > > >> -        * allocator.
> > > >> +        * More CPUs always led to greater speedups on tested
> > > >systems, up to
> > > >> +        * all the nodes' CPUs.  Use all since the system is
> > > >otherwise idle now.
> > > >>          */
> > > >
> > > >I would be curious about your data. That isn't what I have seen in the
> > > >past. Typically only up to about 8 or 10 CPUs gives you any benefit,
> > > >beyond that I was usually cache/memory bandwidth bound.
> 
> On Skylake it took more than 8 or 10 CPUs, though on other machines the benefit
> of using all versus half or 3/4 of the CPUs is less significant.
> 
> Given that the rest of the system is idle at this point, my main concern is
> whether other archs regress past a certain thread count.

Reposting the data to be consistent with the way the percentages are reported
in the changelog.


    Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
      2 nodes * 26 cores * 2 threads = 104 CPUs
      384G/node = 768G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   4056.7 (  5.5)         --   1763.3 (  4.2)
       2% (  1)      -2.4%   4153.3 (  2.5)      -5.6%   1861.7 (  5.5)
      12% (  6)      35.0%   2637.7 ( 38.7)      80.3%    346.7 ( 37.5)
      25% ( 13)      38.4%   2497.3 ( 38.5)      88.1%    210.0 ( 41.8)
      37% ( 19)      38.9%   2477.0 ( 19.0)      89.5%    185.3 ( 21.5)
      50% ( 26)      39.1%   2471.7 ( 21.4)      89.8%    179.7 ( 25.8)
      75% ( 39)      39.5%   2455.7 ( 33.2)      90.8%    161.7 ( 29.3)
     100% ( 52)      39.9%   2436.7 (  2.1)      91.8%    144.3 (  5.9)
    
    
    Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, bare metal)
      1 node * 16 cores * 2 threads = 32 CPUs
      192G/node = 192G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   1957.3 ( 14.0)         --   1093.7 ( 12.9)
       3% (  1)       1.4%   1930.7 ( 10.0)       3.7%   1053.3 (  7.6)
      12% (  4)      41.2%   1151.7 (  9.0)      74.5%    278.7 (  0.6)
      25% (  8)      46.3%   1051.0 (  7.8)      83.7%    178.0 (  2.6)
      38% ( 12)      48.7%   1003.3 (  7.6)      87.0%    141.7 (  3.8)
      50% ( 16)      48.2%   1014.3 ( 20.0)      87.8%    133.3 (  3.2)
      75% ( 24)      49.5%    989.3 (  6.7)      88.4%    126.3 (  1.5)
     100% ( 32)      49.1%    996.0 (  7.2)      88.4%    127.3 (  5.1)
    
    
    Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
      2 nodes * 18 cores * 2 threads = 72 CPUs
      128G/node = 256G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   1666.0 (  3.5)         --    618.0 (  3.5)
       3% (  1)       1.0%   1649.7 (  1.5)       2.9%    600.0 (  1.0)
      11% (  4)      25.9%   1234.7 ( 21.4)      70.4%    183.0 ( 22.5)
      25% (  9)      29.6%   1173.0 ( 10.0)      80.7%    119.3 (  9.6)
      36% ( 13)      30.8%   1153.7 ( 17.0)      84.0%     99.0 ( 15.6)
      50% ( 18)      31.0%   1150.3 ( 15.5)      84.3%     97.3 ( 16.2)
      75% ( 27)      31.0%   1150.3 (  2.5)      84.6%     95.0 (  5.6)
     100% ( 36)      31.3%   1145.3 (  1.5)      85.6%     89.0 (  1.7)
    
    
    AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
      1 node * 8 cores * 2 threads = 16 CPUs
      64G/node = 64G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --   1029.7 ( 42.3)         --    253.7 (  3.1)
       6% (  1)       3.3%    995.3 ( 21.4)       4.3%    242.7 (  5.5)
      12% (  2)      14.0%    885.7 ( 24.4)      46.4%    136.0 (  5.2)
      25% (  4)      18.9%    835.0 ( 21.5)      66.1%     86.0 (  1.7)
      38% (  6)      21.9%    804.7 ( 15.7)      71.4%     72.7 (  2.1)
      50% (  8)      20.8%    815.3 ( 11.7)      74.4%     65.0 (  3.5)
      75% ( 12)      23.5%    787.7 (  2.1)      74.0%     66.0 (  3.6)
     100% ( 16)      23.3%    789.3 ( 15.0)      76.3%     60.0 (  5.6)
    
    
    AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
      1 node * 2 cores * 2 threads = 4 CPUs
      16G/node = 16G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --    757.7 ( 17.1)         --     57.0 (  0.0)
      25% (  1)      -1.0%    765.3 (  5.5)       3.5%     55.0 (  0.0)
      50% (  2)       4.7%    722.3 ( 21.5)      42.7%     32.7 (  4.6)
      75% (  3)       3.7%    729.7 (  4.9)      54.4%     26.0 (  0.0)
     100% (  4)       6.2%    710.3 ( 15.0)      63.2%     21.0 (  0.0)
    
    
    Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
      1 node * 2 cores * 2 threads = 4 CPUs
      14G/node = 14G memory
    
                   kernel boot                 deferred init
                   ------------------------    ------------------------
    node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
          (  0)         --    656.3 (  7.1)         --     57.3 (  1.5)
      25% (  1)       1.8%    644.7 (  3.1)       0.6%     57.0 (  0.0)
      50% (  2)       6.5%    613.7 (  5.1)      40.7%     34.0 (  5.3)
      75% (  3)       6.9%    611.3 (  6.7)      57.6%     24.3 (  0.6)
     100% (  4)       8.6%    599.7 (  5.9)      62.8%     21.3 (  1.2)


> > > I've found pretty much linear performance up to memory bandwidth, and on the systems I was testing, I didn't saturate memory bandwidth until about the full number of physical cores. From number of cores up to number of threads, the performance stayed about flat; it didn't get any better or worse.
> > 
> > That doesn't sound right though based on the numbers you provided. The
> > system you had was 192GB spread over 2 nodes with 48thread/24core per
> > node, correct? Your numbers went from ~290ms to ~28ms so a 10x
> > decrease, that doesn't sound linear when you spread the work over 24
> > cores to get there. I agree that the numbers largely stay flat once
> > you hit the peak, I have seen similar behavior when I was working on
> > the deferred init code previously. One concern I have though is that
> > we may end up seeing better performance with a subset of cores instead
> > of running all of the cores/threads, especially if features such as
> > turbo come into play. In addition we are talking x86 only so far. I
> > would be interested in seeing if this has benefits or not for other
> > architectures.
> > 
> > Also what is the penalty that is being paid in order to break up the
> > work before-hand and set it up for the parallel work? I would be
> > interested in seeing what the cost is on a system with fewer cores per
> > node, maybe even down to 1. That would tell us how much additional
> > overhead is being added to set things up to run in parallel.
> 
> The numbers above have the 1-thread case.  It seems close to the noise.
> 
> > If I get
> > a chance tomorrow I might try applying the patches and doing some
> > testing myself.
> 
> If you end up doing that, you might find this helpful:
>     https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=patch;h=afc72bf8478b95a1d6d174c269ff3693c60630e0
>     
> and maybe this:
>     https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=patch;h=dff6537eab281e5a9917682c4adf9059c0574223
> 
> Thanks for looking this over.
> 
> [ By the way, I'm going to be out Tuesday but back the rest of the week. ]
>
Alexander Duyck May 5, 2020, 2:55 p.m. UTC | #7
On Mon, May 4, 2020 at 7:11 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
>
> On Mon, May 04, 2020 at 09:48:44PM -0400, Daniel Jordan wrote:
> > On Mon, May 04, 2020 at 05:40:19PM -0700, Alexander Duyck wrote:
> > > On Mon, May 4, 2020 at 4:44 PM Josh Triplett <josh@joshtriplett.org> wrote:
> > > >
> > > > On May 4, 2020 3:33:58 PM PDT, Alexander Duyck <alexander.duyck@gmail.com> wrote:
> > > > >On Thu, Apr 30, 2020 at 1:12 PM Daniel Jordan
> > > > ><daniel.m.jordan@oracle.com> wrote:
> > > > >>         /*
> > > > >> -        * Initialize and free pages in MAX_ORDER sized increments so
> > > > >> -        * that we can avoid introducing any issues with the buddy
> > > > >> -        * allocator.
> > > > >> +        * More CPUs always led to greater speedups on tested
> > > > >systems, up to
> > > > >> +        * all the nodes' CPUs.  Use all since the system is
> > > > >otherwise idle now.
> > > > >>          */
> > > > >
> > > > >I would be curious about your data. That isn't what I have seen in the
> > > > >past. Typically only up to about 8 or 10 CPUs gives you any benefit,
> > > > >beyond that I was usually cache/memory bandwidth bound.
> >
> > On Skylake it took more than 8 or 10 CPUs, though on other machines the benefit
> > of using all versus half or 3/4 of the CPUs is less significant.
> >
> > Given that the rest of the system is idle at this point, my main concern is
> > whether other archs regress past a certain thread count.
>
> Reposting the data to be consistent with the way the percentages are reported
> in the changelog.
>
>
>     Intel(R) Xeon(R) Platinum 8167M CPU @ 2.00GHz (Skylake, bare metal)
>       2 nodes * 26 cores * 2 threads = 104 CPUs
>       384G/node = 768G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>     node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>           (  0)         --   4056.7 (  5.5)         --   1763.3 (  4.2)
>        2% (  1)      -2.4%   4153.3 (  2.5)      -5.6%   1861.7 (  5.5)
>       12% (  6)      35.0%   2637.7 ( 38.7)      80.3%    346.7 ( 37.5)
>       25% ( 13)      38.4%   2497.3 ( 38.5)      88.1%    210.0 ( 41.8)
>       37% ( 19)      38.9%   2477.0 ( 19.0)      89.5%    185.3 ( 21.5)
>       50% ( 26)      39.1%   2471.7 ( 21.4)      89.8%    179.7 ( 25.8)
>       75% ( 39)      39.5%   2455.7 ( 33.2)      90.8%    161.7 ( 29.3)
>      100% ( 52)      39.9%   2436.7 (  2.1)      91.8%    144.3 (  5.9)
>
>
>     Intel(R) Xeon(R) CPU E5-2699C v4 @ 2.20GHz (Broadwell, bare metal)
>       1 node * 16 cores * 2 threads = 32 CPUs
>       192G/node = 192G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>     node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>           (  0)         --   1957.3 ( 14.0)         --   1093.7 ( 12.9)
>        3% (  1)       1.4%   1930.7 ( 10.0)       3.7%   1053.3 (  7.6)
>       12% (  4)      41.2%   1151.7 (  9.0)      74.5%    278.7 (  0.6)
>       25% (  8)      46.3%   1051.0 (  7.8)      83.7%    178.0 (  2.6)
>       38% ( 12)      48.7%   1003.3 (  7.6)      87.0%    141.7 (  3.8)
>       50% ( 16)      48.2%   1014.3 ( 20.0)      87.8%    133.3 (  3.2)
>       75% ( 24)      49.5%    989.3 (  6.7)      88.4%    126.3 (  1.5)
>      100% ( 32)      49.1%    996.0 (  7.2)      88.4%    127.3 (  5.1)
>
>
>     Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, bare metal)
>       2 nodes * 18 cores * 2 threads = 72 CPUs
>       128G/node = 256G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>     node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>           (  0)         --   1666.0 (  3.5)         --    618.0 (  3.5)
>        3% (  1)       1.0%   1649.7 (  1.5)       2.9%    600.0 (  1.0)
>       11% (  4)      25.9%   1234.7 ( 21.4)      70.4%    183.0 ( 22.5)
>       25% (  9)      29.6%   1173.0 ( 10.0)      80.7%    119.3 (  9.6)
>       36% ( 13)      30.8%   1153.7 ( 17.0)      84.0%     99.0 ( 15.6)
>       50% ( 18)      31.0%   1150.3 ( 15.5)      84.3%     97.3 ( 16.2)
>       75% ( 27)      31.0%   1150.3 (  2.5)      84.6%     95.0 (  5.6)
>      100% ( 36)      31.3%   1145.3 (  1.5)      85.6%     89.0 (  1.7)
>
>
>     AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
>       1 node * 8 cores * 2 threads = 16 CPUs
>       64G/node = 64G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>     node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>           (  0)         --   1029.7 ( 42.3)         --    253.7 (  3.1)
>        6% (  1)       3.3%    995.3 ( 21.4)       4.3%    242.7 (  5.5)
>       12% (  2)      14.0%    885.7 ( 24.4)      46.4%    136.0 (  5.2)
>       25% (  4)      18.9%    835.0 ( 21.5)      66.1%     86.0 (  1.7)
>       38% (  6)      21.9%    804.7 ( 15.7)      71.4%     72.7 (  2.1)
>       50% (  8)      20.8%    815.3 ( 11.7)      74.4%     65.0 (  3.5)
>       75% ( 12)      23.5%    787.7 (  2.1)      74.0%     66.0 (  3.6)
>      100% ( 16)      23.3%    789.3 ( 15.0)      76.3%     60.0 (  5.6)
>
>
>     AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
>       1 node * 2 cores * 2 threads = 4 CPUs
>       16G/node = 16G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>     node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>           (  0)         --    757.7 ( 17.1)         --     57.0 (  0.0)
>       25% (  1)      -1.0%    765.3 (  5.5)       3.5%     55.0 (  0.0)
>       50% (  2)       4.7%    722.3 ( 21.5)      42.7%     32.7 (  4.6)
>       75% (  3)       3.7%    729.7 (  4.9)      54.4%     26.0 (  0.0)
>      100% (  4)       6.2%    710.3 ( 15.0)      63.2%     21.0 (  0.0)
>
>
>     Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
>       1 node * 2 cores * 2 threads = 4 CPUs
>       14G/node = 14G memory
>
>                    kernel boot                 deferred init
>                    ------------------------    ------------------------
>     node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
>           (  0)         --    656.3 (  7.1)         --     57.3 (  1.5)
>       25% (  1)       1.8%    644.7 (  3.1)       0.6%     57.0 (  0.0)
>       50% (  2)       6.5%    613.7 (  5.1)      40.7%     34.0 (  5.3)
>       75% (  3)       6.9%    611.3 (  6.7)      57.6%     24.3 (  0.6)
>      100% (  4)       8.6%    599.7 (  5.9)      62.8%     21.3 (  1.2)

One question about this data: what is the power management
configuration on the systems when you are running these tests? I'm
just curious whether CPU frequency scaling, C-states, and turbo are
enabled. I ask because that is what I have usually seen make the
difference in this kind of workload: throughput starts dropping off as
the core frequency lowers and more cores become active.
Daniel Jordan May 6, 2020, 10:21 p.m. UTC | #8
On Tue, May 05, 2020 at 07:55:43AM -0700, Alexander Duyck wrote:
> One question about this data. What is the power management
> configuration on the systems when you are running these tests? I'm
> just curious if CPU frequency scaling, C states, and turbo are
> enabled?

Yes, intel_pstate is loaded in active mode without hwp and with turbo enabled
(those power management docs are great by the way!) and intel_idle is in use
too.

> I ask because that is what I have seen usually make the
> difference in these kind of workloads as the throughput starts
> dropping off as you start seeing the core frequency lower and more
> cores become active.

If I follow, you're saying there's a chance performance would improve with the
above disabled, but how often would a system be configured that way?  Even if
it were faster, the machine is configured how it's configured, or am I missing
your point?
Alexander Duyck May 6, 2020, 10:36 p.m. UTC | #9
On Wed, May 6, 2020 at 3:21 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
>
> On Tue, May 05, 2020 at 07:55:43AM -0700, Alexander Duyck wrote:
> > One question about this data. What is the power management
> > configuration on the systems when you are running these tests? I'm
> > just curious if CPU frequency scaling, C states, and turbo are
> > enabled?
>
> Yes, intel_pstate is loaded in active mode without hwp and with turbo enabled
> (those power management docs are great by the way!) and intel_idle is in use
> too.
>
> > I ask because that is what I have seen usually make the
> > difference in these kind of workloads as the throughput starts
> > dropping off as you start seeing the core frequency lower and more
> > cores become active.
>
> If I follow, you're saying there's a chance performance would improve with the
> above disabled, but how often would a system be configured that way?  Even if
> it were faster, the machine is configured how it's configured, or am I missing
> your point?

I think you might be missing my point. What I was getting at is that
for performance testing, C-states and P-states sometimes get disabled
in order to get consistent results between runs; it sounds like you
have them enabled, though. I was just wondering whether you had
disabled them or not. If they were disabled then you wouldn't get the
benefits of turbo, and as such adding more cores wouldn't come at a
penalty, while with it enabled the first few cores should start to
slow down as they fall out of turbo mode. So that may be part of the
reason why you are only hitting about 10x at full core count.

As it stands, I think your code may speed up a bit if you split the
work up based on section instead of max order. That would get rid of
any cache bouncing you may be doing on the pageblock flags and reduce
the overhead of splitting the work up into individual pieces, since
each piece will be bigger.
Daniel Jordan May 6, 2020, 10:43 p.m. UTC | #10
On Wed, May 06, 2020 at 03:36:54PM -0700, Alexander Duyck wrote:
> On Wed, May 6, 2020 at 3:21 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> >
> > On Tue, May 05, 2020 at 07:55:43AM -0700, Alexander Duyck wrote:
> > > One question about this data. What is the power management
> > > configuration on the systems when you are running these tests? I'm
> > > just curious if CPU frequency scaling, C states, and turbo are
> > > enabled?
> >
> > Yes, intel_pstate is loaded in active mode without hwp and with turbo enabled
> > (those power management docs are great by the way!) and intel_idle is in use
> > too.
> >
> > > I ask because that is what I have seen usually make the
> > > difference in these kind of workloads as the throughput starts
> > > dropping off as you start seeing the core frequency lower and more
> > > cores become active.
> >
> > If I follow, you're saying there's a chance performance would improve with the
> > above disabled, but how often would a system be configured that way?  Even if
> > it were faster, the machine is configured how it's configured, or am I missing
> > your point?
> 
> I think you might be missing my point. What I was getting at is that I
> know for performance testing sometimes C states and P states get
> disabled in order to get consistent results between runs, it sounds
> like you have them enabled though. I was just wondering if you had
> disabled them or not. If they were disabled then you wouldn't get the
> benefits of turbo and as such adding more cores wouldn't come at a
> penalty, while with it enabled the first few cores should start to
> slow down as they fell out of turbo mode. So it may be part of the
> reason why you are only hitting about 10x at full core count.

All right, that makes way more sense.

> As it stands I think your code may speed up a bit if you split the
> work up based on section instead of max order. That would get rid of
> any cache bouncing you may be doing on the pageblock flags and reduce
> the overhead for splitting the work up into individual pieces since
> each piece will be bigger.

See my other mail.
Daniel Jordan May 6, 2020, 11:01 p.m. UTC | #11
On Wed, May 06, 2020 at 06:43:35PM -0400, Daniel Jordan wrote:
> On Wed, May 06, 2020 at 03:36:54PM -0700, Alexander Duyck wrote:
> > On Wed, May 6, 2020 at 3:21 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> > >
> > > On Tue, May 05, 2020 at 07:55:43AM -0700, Alexander Duyck wrote:
> > > > One question about this data. What is the power management
> > > > configuration on the systems when you are running these tests? I'm
> > > > just curious if CPU frequency scaling, C states, and turbo are
> > > > enabled?
> > >
> > > Yes, intel_pstate is loaded in active mode without hwp and with turbo enabled
> > > (those power management docs are great by the way!) and intel_idle is in use
> > > too.
> > >
> > > > I ask because that is what I have seen usually make the
> > > > difference in these kind of workloads as the throughput starts
> > > > dropping off as you start seeing the core frequency lower and more
> > > > cores become active.
> > >
> > > If I follow, you're saying there's a chance performance would improve with the
> > > above disabled, but how often would a system be configured that way?  Even if
> > > it were faster, the machine is configured how it's configured, or am I missing
> > > your point?
> > 
> > I think you might be missing my point. What I was getting at is that I
> > know for performance testing sometimes C states and P states get
> > disabled in order to get consistent results between runs, it sounds
> > like you have them enabled though. I was just wondering if you had
> > disabled them or not. If they were disabled then you wouldn't get the
> > benefits of turbo and as such adding more cores wouldn't come at a
> > penalty, while with it enabled the first few cores should start to
> > slow down as they fell out of turbo mode. So it may be part of the
> > reason why you are only hitting about 10x at full core count.

I checked the memory bandwidth of the biggest system, the Skylake. I
couldn't find official specs for it; all I could quickly find were
STREAM results from a blog post of ours that quoted a range of about
123-145 GB/s over both nodes when compiled with gcc. That's with all
CPUs.

Again using all CPUs, multithreaded page init is doing 41 GiB/s per
node if we assume it's touching only the 64 bytes of each struct page.
Since there's surely more memory traffic than just struct page, it
seems another part of the reason for only 10x is that we're
bottlenecked on memory bandwidth.
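
For reference, the arithmetic behind that figure, assuming 4K base
pages, 64-byte struct pages, and the 144.3 ms all-CPU deferred init
time from the Skylake numbers above:

    384 GiB/node / 4 KiB/page       = 100,663,296 pages/node
    100,663,296 pages * 64 B/page   = 6 GiB of struct pages per node
    6 GiB / 0.1443 s                ≈ 41.6 GiB/s per node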

Patch

diff --git a/mm/Kconfig b/mm/Kconfig
index ab80933be65ff..e5007206c7601 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -622,13 +622,13 @@  config DEFERRED_STRUCT_PAGE_INIT
 	depends on SPARSEMEM
 	depends on !NEED_PER_CPU_KM
 	depends on 64BIT
+	select PADATA
 	help
 	  Ordinarily all struct pages are initialised during early boot in a
 	  single thread. On very large machines this can take a considerable
 	  amount of time. If this option is set, large machines will bring up
-	  a subset of memmap at boot and then initialise the rest in parallel
-	  by starting one-off "pgdatinitX" kernel thread for each node X. This
-	  has a potential performance impact on processes running early in the
+	  a subset of memmap at boot and then initialise the rest in parallel.
+	  This has a potential performance impact on tasks running early in the
 	  lifetime of the system until these kthreads finish the
 	  initialisation.
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 990514d8f0d94..96d6d0d920c27 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@ 
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
 #include <linux/psi.h>
+#include <linux/padata.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -1729,6 +1730,25 @@  deferred_init_maxorder(struct zone *zone, unsigned long *start_pfn,
 	return nr_pages;
 }
 
+struct def_init_args {
+	struct zone *zone;
+	atomic_long_t nr_pages;
+};
+
+static void __init deferred_init_memmap_chunk(unsigned long spfn,
+					      unsigned long epfn, void *arg)
+{
+	struct def_init_args *args = arg;
+	unsigned long nr_pages = 0;
+
+	while (spfn < epfn) {
+		nr_pages += deferred_init_maxorder(args->zone, &spfn, epfn);
+		cond_resched();
+	}
+
+	atomic_long_add(nr_pages, &args->nr_pages);
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
@@ -1738,7 +1758,7 @@  static int __init deferred_init_memmap(void *data)
 	unsigned long first_init_pfn, flags;
 	unsigned long start = jiffies;
 	struct zone *zone;
-	int zid;
+	int zid, max_threads;
 	u64 i;
 
 	/* Bind memory initialisation thread to a local node if possible */
@@ -1778,15 +1798,25 @@  static int __init deferred_init_memmap(void *data)
 		goto zone_empty;
 
 	/*
-	 * Initialize and free pages in MAX_ORDER sized increments so
-	 * that we can avoid introducing any issues with the buddy
-	 * allocator.
+	 * More CPUs always led to greater speedups on tested systems, up to
+	 * all the nodes' CPUs.  Use all since the system is otherwise idle now.
 	 */
+	max_threads = max(cpumask_weight(cpumask), 1u);
+
 	for_each_free_mem_pfn_range_in_zone_from(i, zone, &spfn, &epfn) {
-		while (spfn < epfn) {
-			nr_pages += deferred_init_maxorder(zone, &spfn, epfn);
-			cond_resched();
-		}
+		struct def_init_args args = { zone, ATOMIC_LONG_INIT(0) };
+		struct padata_mt_job job = {
+			.thread_fn   = deferred_init_memmap_chunk,
+			.fn_arg      = &args,
+			.start       = spfn,
+			.size	     = epfn - spfn,
+			.align	     = MAX_ORDER_NR_PAGES,
+			.min_chunk   = MAX_ORDER_NR_PAGES,
+			.max_threads = max_threads,
+		};
+
+		padata_do_multithreaded(&job);
+		nr_pages += atomic_long_read(&args.nr_pages);
 	}
 zone_empty:
 	/* Sanity check that the next zone really is unpopulated */