[v8,74/74] cputlb: queue async flush jobs without the BQL

Message ID 20200326193156.4322-75-robert.foley@linaro.org (mailing list archive)
State New, archived
Series per-CPU locks

Commit Message

Robert Foley March 26, 2020, 7:31 p.m. UTC
From: "Emilio G. Cota" <cota@braap.org>

This yields sizable scalability improvements, as the below results show.

Host: Two Intel E5-2683 v3 14-core CPUs at 2.00 GHz (Haswell)

Workload: Ubuntu 18.04 ppc64 compiling the linux kernel with
"make -j N", where N is the number of cores in the guest.

                      Speedup vs a single thread (higher is better):

         14 +---------------------------------------------------------------+
            |       +    +       +      +       +      +      $$$$$$  +     |
            |                                            $$$$$              |
            |                                      $$$$$$                   |
         12 |-+                                $A$$                       +-|
            |                                $$                             |
            |                             $$$                               |
         10 |-+                         $$    ##D#####################D   +-|
            |                        $$$ #####**B****************           |
            |                      $$####*****                   *****      |
            |                    A$#*****                             B     |
          8 |-+                $$B**                                      +-|
            |                $$**                                           |
            |               $**                                             |
          6 |-+           $$*                                             +-|
            |            A**                                                |
            |           $B                                                  |
            |           $                                                   |
          4 |-+        $*                                                 +-|
            |          $                                                    |
            |         $                                                     |
          2 |-+      $                                                    +-|
            |        $                                 +cputlb-no-bql $$A$$ |
            |       A                                   +per-cpu-lock ##D## |
            |       +    +       +      +       +      +     baseline **B** |
          0 +---------------------------------------------------------------+
                    1    4       8      12      16     20      24     28
                                       Guest vCPUs
  png: https://imgur.com/zZRvS7q

Some notes:
- baseline corresponds to the commit before this series

- per-cpu-lock is the commit that converts the CPU loop to per-cpu locks.

- cputlb-no-bql is this commit.

- I'm using taskset to assign cores to threads, favouring locality whenever
  possible but not using SMT. When N=1, I'm using a single host core, which
  leads to superlinear speedups (since with more cores the I/O thread can execute
  while vCPU threads sleep). In the future I might use N+1 host cores for N
  guest cores to avoid this, or perhaps pin guest threads to cores one-by-one.

Single-threaded performance is affected very lightly. Results
below for debian aarch64 bootup+test for the entire series
on an Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz host:

- Before:

 Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs):

       7269.033478      task-clock (msec)         #    0.998 CPUs utilized            ( +-  0.06% )
    30,659,870,302      cycles                    #    4.218 GHz                      ( +-  0.06% )
    54,790,540,051      instructions              #    1.79  insns per cycle          ( +-  0.05% )
     9,796,441,380      branches                  # 1347.695 M/sec                    ( +-  0.05% )
       165,132,201      branch-misses             #    1.69% of all branches          ( +-  0.12% )

       7.287011656 seconds time elapsed                                          ( +-  0.10% )

- After:

       7375.924053      task-clock (msec)         #    0.998 CPUs utilized            ( +-  0.13% )
    31,107,548,846      cycles                    #    4.217 GHz                      ( +-  0.12% )
    55,355,668,947      instructions              #    1.78  insns per cycle          ( +-  0.05% )
     9,929,917,664      branches                  # 1346.261 M/sec                    ( +-  0.04% )
       166,547,442      branch-misses             #    1.68% of all branches          ( +-  0.09% )

       7.389068145 seconds time elapsed                                          ( +-  0.13% )

That is, a 1.37% slowdown.
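For reference, figures in this format come from perf's repeat mode (the
header above shows "10 runs"). A minimal sketch of the kind of invocation
that produces them is below; the explicit event list is an assumption, since
a default "perf stat" run also prints a few counters not shown above:

  perf stat -r 10 \
      -e task-clock,cycles,instructions,branches,branch-misses \
      taskset -c 0 ../img/aarch64/die.sh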

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Robert Foley <robert.foley@linaro.org>
---
 accel/tcg/cputlb.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

Comments

Alex Bennée May 12, 2020, 4:27 p.m. UTC | #1
Robert Foley <robert.foley@linaro.org> writes:

> From: "Emilio G. Cota" <cota@braap.org>
>
> This yields sizable scalability improvements, as the below results show.
>
> Host: Two Intel E5-2683 v3 14-core CPUs at 2.00 GHz (Haswell)
>
> Workload: Ubuntu 18.04 ppc64 compiling the linux kernel with
> "make -j N", where N is the number of cores in the guest.
>
>                       Speedup vs a single thread (higher is better):
>
>          14 +---------------------------------------------------------------+
>             |       +    +       +      +       +      +      $$$$$$  +     |
>             |                                            $$$$$              |
>             |                                      $$$$$$                   |
>          12 |-+                                $A$$                       +-|
>             |                                $$                             |
>             |                             $$$                               |
>          10 |-+                         $$    ##D#####################D   +-|
>             |                        $$$ #####**B****************           |
>             |                      $$####*****                   *****      |
>             |                    A$#*****                             B     |
>           8 |-+                $$B**                                      +-|
>             |                $$**                                           |
>             |               $**                                             |
>           6 |-+           $$*                                             +-|
>             |            A**                                                |
>             |           $B                                                  |
>             |           $                                                   |
>           4 |-+        $*                                                 +-|
>             |          $                                                    |
>             |         $                                                     |
>           2 |-+      $                                                    +-|
>             |        $                                 +cputlb-no-bql $$A$$ |
>             |       A                                   +per-cpu-lock ##D## |
>             |       +    +       +      +       +      +     baseline **B** |
>           0 +---------------------------------------------------------------+
>                     1    4       8      12      16     20      24     28
>                                        Guest vCPUs
>   png: https://imgur.com/zZRvS7q

Can we re-run these numbers on the re-based series?
Alex Bennée May 12, 2020, 6:38 p.m. UTC | #2
Robert Foley <robert.foley@linaro.org> writes:

> From: "Emilio G. Cota" <cota@braap.org>
>
> This yields sizable scalability improvements, as the below results show.
>
> Host: Two Intel E5-2683 v3 14-core CPUs at 2.00 GHz (Haswell)
>
> Workload: Ubuntu 18.04 ppc64 compiling the linux kernel with
> "make -j N", where N is the number of cores in the guest.
>
<snip>

For my numbers, running an aarch64 guest running pigz on an x86_64 host,
the gains start to tail off past -smp 12 but still seem to show
some improvement up to -smp 16 (the host has 24 cores):

  ./aarch64-softmmu/qemu-system-aarch64 -machine virt,graphics=on,gic-version=3,virtualization=on -cpu cortex-a53 -serial mon:stdio -nic user,model=virtio-net-pci,hostfwd=tcp::2222-:22 -device virtio-scsi-pci -device scsi-hd,drive=hd0 -blockdev driver=raw,node-name=hd0,discard=unmap,file.driver=host_device,file.filename=/dev/zvol/hackpool-0/debian-buster-arm64 -kernel ../../../linux.git/builds/arm64.nopreempt/arch/arm64/boot/Image -append "console=ttyAMA0 root=/dev/sda2 systemd.unit=benchmark-pigz.service" -display none -m 4096 -snapshot -smp $SMP

  | Command                     | Mean [s]         |    Min...Max [s] |
  |-----------------------------+------------------+------------------|
  | =$QEMU $QEMU_ARGS  -smp 4=  | 146.738 ± 62.272 | 43.861...246.139 |
  | =$QEMU $QEMU_ARGS  -smp 5=  | 33.984 ± 13.370  |  29.501...72.032 |
  | =$QEMU $QEMU_ARGS  -smp 6=  | 26.128 ± 0.189   |  25.837...26.475 |
  | =$QEMU $QEMU_ARGS  -smp 7=  | 23.459 ± 0.090   |  23.252...23.560 |
  | =$QEMU $QEMU_ARGS  -smp 8=  | 21.579 ± 0.117   |  21.418...21.764 |
  | =$QEMU $QEMU_ARGS  -smp 9=  | 20.163 ± 0.142   |  19.938...20.387 |
  | =$QEMU $QEMU_ARGS  -smp 10= | 19.028 ± 0.106   |  18.877...19.183 |
  | =$QEMU $QEMU_ARGS  -smp 11= | 18.166 ± 0.093   |  18.081...18.386 |
  | =$QEMU $QEMU_ARGS  -smp 12= | 17.464 ± 0.067   |  17.383...17.614 |
  | =$QEMU $QEMU_ARGS  -smp 13= | 16.928 ± 0.104   |  16.754...17.158 |
  | =$QEMU $QEMU_ARGS  -smp 14= | 16.615 ± 0.188   |  16.486...17.105 |
  | =$QEMU $QEMU_ARGS  -smp 15= | 16.344 ± 0.176   |  16.094...16.680 |
  | =$QEMU $QEMU_ARGS  -smp 16= | 16.085 ± 0.215   |  15.869...16.623 |
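Below is a sketch of one way such an -smp sweep could be driven. This is an
assumption about the harness rather than a record of how the table above was
produced; it presumes QEMU/QEMU_ARGS are set to the command line shown above
and that the guest's benchmark-pigz.service powers the VM off when it
finishes, so each run exits on its own:

  for SMP in $(seq 4 16); do
      for run in 1 2 3; do
          /usr/bin/time -f "-smp $SMP run $run: %e s" -a -o pigz-sweep.txt \
              $QEMU $QEMU_ARGS -smp $SMP
      done
  done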
Robert Foley May 12, 2020, 7:26 p.m. UTC | #3
On Tue, 12 May 2020 at 12:27, Alex Bennée <alex.bennee@linaro.org> wrote:
> Robert Foley <robert.foley@linaro.org> writes:
>
> > From: "Emilio G. Cota" <cota@braap.org>
> >
> > This yields sizable scalability improvements, as the below results show.
> >
> > Host: Two Intel E5-2683 v3 14-core CPUs at 2.00 GHz (Haswell)
> >
> > Workload: Ubuntu 18.04 ppc64 compiling the linux kernel with
> > "make -j N", where N is the number of cores in the guest.
> >
> >                       Speedup vs a single thread (higher is better):
snip
> >   png: https://imgur.com/zZRvS7q
>
> Can we re-run these numbers on the re-based series?

Sure, we will re-run the numbers.

Regards,
-Rob
Robert Foley May 18, 2020, 1:46 p.m. UTC | #4
We re-ran the numbers with the latest re-based series.

We used an aarch64 Ubuntu VM image with a host CPU:
Intel(R) Xeon(R) Silver 4114 CPU @ 2.20GHz, 2 CPUs, 10 cores/CPU,
20 threads/CPU; 40 hardware threads total.

For the bare hardware and KVM tests (first chart) the host CPU was:
HiSilicon 1620 CPU at 2600 MHz, 2 CPUs, 64 cores per CPU, 128 CPUs total.

First, we ran a test of building the kernel in the VM.
We did not see any major improvements or regressions.
The chart below shows the speedup of building the kernel
on bare hardware, under KVM, and under QEMU (both the baseline and the cpu-locks series).


                   Speedup vs a single thread for kernel build

  40 +----------------------------------------------------------------------+
     |         +         +         +          +         +         +  **     |
     |                                                bare hardwar********* |
     |                                                          kvm ####### |
  35 |-+                                                   baseline $$$$$$$-|
     |                                                    *cpu lock %%%%%%% |
     |                                                 ***                  |
     |                                               **                     |
  30 |-+                                          ***                     +-|
     |                                         ***                          |
     |                                      ***                             |
     |                                    **                                |
  25 |-+                               ***                                +-|
     |                              ***                                     |
     |                            **                                        |
     |                          **                                          |
  20 |-+                      **                                          +-|
     |                      **                                #########     |
     |                    **                  ################              |
     |                  **          ##########                              |
     |                **         ###                                        |
  15 |-+             *       ####                                         +-|
     |             **     ###                                               |
     |            *    ###                                                  |
     |           *  ###                                                     |
  10 |-+       **###                                                      +-|
     |        *##                                                           |
     |       ##  $$$$$$$$$$$$$$$$                                           |
     |     #$$$$$%%%%%%%%%%%%%%%%%%%%                                       |
   5 |-+  $%%%%%%                    %%%$%$%$%$%$%$%$%$%$%$%$%$%$%$%$%    +-|
     |   %%                                                           %     |
     | %%                                                                   |
     |%        +         +         +          +         +         +         |
   0 +----------------------------------------------------------------------+
     0         10        20        30         40        50        60        70
                                   Guest vCPUs


After seeing these results and the scaling limits inherent in the build itself,
we decided to run a test which might show the scaling improvements more clearly.
So we chose UnixBench.
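For reference, a sketch of how UnixBench can be run inside the guest for a
given vCPU count; the exact invocation used for the chart below is not
spelled out here, so treat this as an assumption:

  git clone https://github.com/kdlucas/byte-unixbench
  cd byte-unixbench/UnixBench
  ./Run -c "$(nproc)"    # standard index run, one copy per vCPU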

               UnixBench result (higher is better) vs. number of vCPUs.

  3000 +--------------------------------------------------------------------+
       |      +      +      +      +      +     +      +      +      +      |
       |                                                   baseline ******* |
       |             #                                     cpu lock ####### |
       |           ##*#                                                     |
  2500 |-+        #** *#                                                  +-|
       |          #    *#                                                   |
       |         #*    *#                                                   |
       |         #      *#                                                  |
       |        #*       #                                                  |
       |        #        *#                                                 |
  2000 |-+     #*         #                                               +-|
       |       #          *#                                                |
       |      #*           *#                                               |
       |      #             *####                                           |
       |     #*             *    ###                                        |
  1500 |-+   #               ***    ##                                    +-|
       |     #                  *     ##                                    |
       |    #                    *      ###                                 |
       |    #                     **       ##                               |
       |    #                       *        ###                            |
       |   #                         *          ##                          |
  1000 |-+ #                          **          #                       +-|
       |  #                             *          ###                      |
       |  #                              **           #                     |
       |  #                                *           #                    |
       | #*                                 *           ##                  |
   500 |-#                                   **           #         #     +-|
       | #                                     *           #      ## #      |
       |#*                                      *           ##   #    #     |
       |#*                                       **            ##      #    |
       |*                                                     #         #   |
       |*     +      +      +      +      +     +  **********************#  |
     0 +--------------------------------------------------------------------+
       0      10     20     30     40     50    60     70     80     90    100
                                    Guest vCPUs

We also ran tests to compare boot times.  This test showed the largest
improvement over the baseline.
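How the boot was timed is not spelled out above. One simple way to get a
comparable number (an assumption about methodology, not necessarily what was
used for the chart below) is to query systemd inside the guest once it is
up, e.g. over a forwarded ssh port like the hostfwd=tcp::2222-:22 setup
shown earlier in the thread:

  ssh -p 2222 ubuntu@localhost systemd-analyze time   # user/port are placeholders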

              Boot time in seconds (lower is better) vs. number of vCPUs.

  550 +---------------------------------------------------------------------+
      |      +      +      +      +      +      +      +      +      +   *  |
      |                                                    baseline ******* |
  500 |-+                                                  cpu lock #######-|
      |                                                              *      |
      |                                                             *       |
      |                                                            *        |
  450 |-+                                                        **      #+-|
      |                                                         *       #   |
      |                                            **          *      ##    |
  400 |-+                                         *  **      **      #    +-|
      |                                           *    *   **       #       |
      |                                          *       **       ##        |
  350 |-+                                       *       *        #        +-|
      |                                         *              ##           |
      |                                        *              #             |
  300 |-+                                     *             ##            +-|
      |                                       *            #                |
      |                                      *           ##                 |
      |                                     *           #                   |
  250 |-+                                 **           #                  +-|
      |                                  *           ##                     |
      |                                **           #                       |
  200 |-+                           ***           ##                      +-|
      |                           **           ###                          |
      |                          *         ####                             |
  150 |-+                       *    ######                               +-|
      |                     ****  ###                                       |
      |*                   *    ##                                          |
      |#*                #######                                            |
  100 |-#          ***###                                                 +-|
      | #*     #######                                                      |
      |  ######     +      +      +      +      +      +      +      +      |
   50 +---------------------------------------------------------------------+
      0      10     20     30     40     50     60     70     80     90    100
                                    Guest vCPUs

Pictures are also here:
https://drive.google.com/file/d/1ASg5XyP9hNfN9VysXC3qe5s9QSJlwFAt/view?usp=sharing

We plan to update this commit in the series with the final two results
(UnixBench and boot times).

Regards,
-Rob


On Tue, 12 May 2020 at 15:26, Robert Foley <robert.foley@linaro.org> wrote:
>
> On Tue, 12 May 2020 at 12:27, Alex Bennée <alex.bennee@linaro.org> wrote:
> > Robert Foley <robert.foley@linaro.org> writes:
> >
> > > From: "Emilio G. Cota" <cota@braap.org>
> > >
> > > This yields sizable scalability improvements, as the below results show.
> > >
> > > Host: Two Intel E5-2683 v3 14-core CPUs at 2.00 GHz (Haswell)
> > >
> > > Workload: Ubuntu 18.04 ppc64 compiling the linux kernel with
> > > "make -j N", where N is the number of cores in the guest.
> > >
> > >                       Speedup vs a single thread (higher is better):
> snip
> > >   png: https://imgur.com/zZRvS7q
> >
> > Can we re-run these numbers on the re-based series?
>
> Sure, we will re-run the numbers.
>
> Regards,
> -Rob
Emilio Cota May 20, 2020, 4:46 a.m. UTC | #5
On Mon, May 18, 2020 at 09:46:36 -0400, Robert Foley wrote:
> We re-ran the numbers with the latest re-based series.
> 
> <snip>
> 
> After seeing these results and the scaling limits inherent in the build itself,
> we decided to run a test which might show the scaling improvements more clearly.

Thanks for doing these tests. I know from experience that benchmarking
is hard and incredibly time consuming, so please do not be discouraged by
my comments below.

A couple of points:

1. I am not familiar with aarch64 KVM but I'd expect it to scale almost
like the native run. Are you assigning enough RAM to the guest? Also,
it can help to run the kernel build in a ramfs in the guest.

2. The build itself does not seem to impose a scaling limit, since
it scales very well when run natively (per thread, I presume aarch64 TCG is
still slower than native, even though TCG is run on a faster x86 machine).
The limit here is probably aarch64 TCG. In particular, last time I
checked, aarch64 TCG had room for improvement, scalability-wise, in handling
interrupts and some TLB operations; this likely explains why we
see no benefit from per-CPU locks, i.e. the bottleneck is elsewhere.
This can be confirmed with the sync profiler.
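For what it's worth, a sketch of doing that from the monitor (assuming a
build where the HMP sync-profile commands are available; with -serial
mon:stdio you can switch to the monitor with C-a c):

  (qemu) sync-profile on
      ... let the workload run for a while ...
  (qemu) sync-profile off
  (qemu) info sync-profile

The report lists mutexes and condvars by call site, sorted by total wait
time, which should show whether the vCPU threads are blocking on the BQL or
somewhere else.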

IIRC I originally used ppc64 for this test because ppc64 TCG does not
have any other big bottlenecks scalability-wise. I just checked but
unfortunately I can't find the ppc64 image I used :( What I can offer
is the script I used to run these benchmarks; see the appended.

Thanks,
		Emilio

---
#!/bin/bash

set -eu

# path to host files
MYHOME=/local/home/cota/src

# guest image
QEMU_INST_PATH=$MYHOME/qemu-inst
IMG=$MYHOME/qemu/img/ppc64/ubuntu.qcow2

ARCH=ppc64
COMMON_ARGS="-M pseries -nodefaults \
		-hda $IMG -nographic -serial stdio \
		-net nic -net user,hostfwd=tcp::2222-:22 \
		-m 48G"

# path to this script's directory, where .txt output will be copied
# from the guest.
QELT=$MYHOME/qelt
HOST_PATH=$QELT/fig/kcomp

# The guest must be able to SSH to the HOST without entering a password.
# The way I set this up is to have a passwordless SSH key in the guest's
# root user, and then copy that key's public key to the host.
# I used the root user because the guest runs on bootup (as root) a
# script that scp's run-guest.sh (see below) from the host, then executes it.
# This is done via a tiny script in the guest invoked from systemd once
# boot-up has completed.
HOST=foo@bar.edu

# This is a script in the host to use an appropriate cpumask to
# use cores in the same socket if possible.
# See https://github.com/cota/cputopology-perl
CPUTOPO=$MYHOME/cputopology-perl

# For each run we create this file that then the guest will SCP
# and execute. It is a quick and dirty way of passing arguments to the guest.
create_file () {
    TAG=$1
    CORES=$2
    NAME=$ARCH.$TAG-$CORES.txt

    echo '#!/bin/bash' > run-guest.sh
    echo 'cp -r /home/cota/linux-4.18-rc7 /tmp2/linux' >> run-guest.sh
    echo "cd /tmp2/linux" >> run-guest.sh
    echo "{ time make -j $CORES vmlinux >/dev/null; } 2>>/home/cota/$NAME" >> run-guest.sh
    # Output with execution time is then scp'ed to the host.
    echo "ssh $HOST 'cat >> $HOST_PATH/$NAME' < /home/cota/$NAME" >> run-guest.sh
    echo "poweroff" >> run-guest.sh
}

# Change here THREADS and also the TAGS that point to different QEMU installations.
for THREADS in 64 32 16; do
    for TAG in cpu-exclusive-work cputlb-no-bql per-cpu-lock cpu-has-work baseline; do
	QEMU=$QEMU_INST_PATH/$TAG/bin/qemu-system-$ARCH
	CPUMASK=$($CPUTOPO/list.pl --policy=compact-smt $THREADS)

	create_file $TAG $THREADS
	time taskset -c $CPUMASK $QEMU $COMMON_ARGS -smp $THREADS
    done
done
Robert Foley May 20, 2020, 3:01 p.m. UTC | #6
On Wed, 20 May 2020 at 00:46, Emilio G. Cota <cota@braap.org> wrote:
>
> On Mon, May 18, 2020 at 09:46:36 -0400, Robert Foley wrote:
>
> Thanks for doing these tests. I know from experience that benchmarking
> is hard and incredibly time consuming, so please do not be discouraged by
> my comments below.
>

Hi,
Thanks for all the comments, and for including the script!
These are all very helpful.

We will work to replicate these results using a PPC VM,
and will re-post them here.

Thanks & Regards,
-Rob

> <snip>
Robert Foley May 21, 2020, 2:17 p.m. UTC | #7
We re-ran the numbers for a ppc64 VM, using the additional configuration
details.
This seems to show the scalability gains much more clearly.

                   Speedup vs a single thread for kernel build

  7 +-----------------------------------------------------------------------+
    |         +          +         +         +         +          +         |
    |                                    ###########       baseline ******* |
    |                               #####           ####   cpu lock ####### |
    |                             ##                    ####                |
  6 |-+                         ##                          ##            +-|
    |                         ##                              ####          |
    |                       ##                                    ###       |
    |                     ##        *****                            #      |
    |                   ##      ****     ***                          #     |
    |                 ##     ***            *                               |
  5 |-+             ##    ***                ****                         +-|
    |              #  ****                       **                         |
    |             # **                             **                       |
    |             #*                                 **                     |
    |          #*                                          **               |
    |         #*                                             *              |
    |         #                                               ******        |
    |        #                                                      **      |
    |       #                                                         *     |
  3 |-+     #                                                             +-|
    |      #                                                                |
    |      #                                                                |
    |     #                                                                 |
    |     #                                                                 |
  2 |-+  #                                                                +-|
    |    #                                                                  |
    |   #                                                                   |
    |   #                                                                   |
    |  #                                                                    |
    |  #      +          +         +         +         +          +         |
  1 +-----------------------------------------------------------------------+
    0         5          10        15        20        25         30        35
                                   Guest vCPUs

https://drive.google.com/file/d/1ASg5XyP9hNfN9VysXC3qe5s9QSJlwFAt/view?usp=sharing

Thanks & Regards,
-Rob

On Wed, 20 May 2020 at 11:01, Robert Foley <robert.foley@linaro.org> wrote:
> <snip>

Patch

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index e3b5750c3b..d13feaf3a3 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -284,7 +284,7 @@  static void flush_all_helper(CPUState *src, run_on_cpu_func fn,
 
     CPU_FOREACH(cpu) {
         if (cpu != src) {
-            async_run_on_cpu(cpu, fn, d);
+            async_run_on_cpu_no_bql(cpu, fn, d);
         }
     }
 }
@@ -352,8 +352,8 @@  void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
     tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
 
     if (cpu->created && !qemu_cpu_is_self(cpu)) {
-        async_run_on_cpu(cpu, tlb_flush_by_mmuidx_async_work,
-                         RUN_ON_CPU_HOST_INT(idxmap));
+        async_run_on_cpu_no_bql(cpu, tlb_flush_by_mmuidx_async_work,
+                                RUN_ON_CPU_HOST_INT(idxmap));
     } else {
         tlb_flush_by_mmuidx_async_work(cpu, RUN_ON_CPU_HOST_INT(idxmap));
     }
@@ -547,7 +547,7 @@  void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
          * we can stuff idxmap into the low TARGET_PAGE_BITS, avoid
          * allocating memory for this operation.
          */
-        async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_1,
+        async_run_on_cpu_no_bql(cpu, tlb_flush_page_by_mmuidx_async_1,
                          RUN_ON_CPU_TARGET_PTR(addr | idxmap));
     } else {
         TLBFlushPageByMMUIdxData *d = g_new(TLBFlushPageByMMUIdxData, 1);
@@ -555,7 +555,7 @@  void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
         /* Otherwise allocate a structure, freed by the worker.  */
         d->addr = addr;
         d->idxmap = idxmap;
-        async_run_on_cpu(cpu, tlb_flush_page_by_mmuidx_async_2,
+        async_run_on_cpu_no_bql(cpu, tlb_flush_page_by_mmuidx_async_2,
                          RUN_ON_CPU_HOST_PTR(d));
     }
 }