
[3/4] KVM: selftests: Wait for all vCPUs to be created before entering guest mode

Message ID 20211111001257.1446428-4-dmatlack@google.com (mailing list archive)
State New, archived
Series KVM: selftests: Avoid mmap_sem contention during memory population

Commit Message

David Matlack Nov. 11, 2021, 12:12 a.m. UTC
Thread creation requires taking the mmap_sem in write mode, which causes
vCPU threads running in guest mode to block while they are populating
memory. Fix this by waiting for all vCPU threads to be created and start
running before entering guest mode on any one vCPU thread.

This substantially improves the "Populate memory time" when using 1GiB
pages since it allows all vCPUs to zero pages in parallel rather than
blocking because a writer is waiting (which is waiting for another vCPU
that is busy zeroing a 1GiB page).

Before:

  $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
  ...
  Populate memory time: 52.811184013s

After:

  $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
  ...
  Populate memory time: 10.204573342s

Signed-off-by: David Matlack <dmatlack@google.com>
---
 .../selftests/kvm/lib/perf_test_util.c        | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

Comments

Ben Gardon Nov. 11, 2021, 6:17 p.m. UTC | #1
On Wed, Nov 10, 2021 at 4:13 PM David Matlack <dmatlack@google.com> wrote:
>
> Thread creation requires taking the mmap_sem in write mode, which causes
> vCPU threads running in guest mode to block while they are populating
> memory. Fix this by waiting for all vCPU threads to be created and start
> running before entering guest mode on any one vCPU thread.
>
> This substantially improves the "Populate memory time" when using 1GiB
> pages since it allows all vCPUs to zero pages in parallel rather than
> blocking because a writer is waiting (which is waiting for another vCPU
> that is busy zeroing a 1GiB page).
>
> Before:
>
>   $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
>   ...
>   Populate memory time: 52.811184013s
>
> After:
>
>   $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
>   ...
>   Populate memory time: 10.204573342s
>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  .../selftests/kvm/lib/perf_test_util.c        | 26 +++++++++++++++++++
>  1 file changed, 26 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
> index d646477ed16a..722df3a28791 100644
> --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> @@ -22,6 +22,9 @@ struct vcpu_thread {
>
>         /* The pthread backing the vCPU. */
>         pthread_t thread;
> +
> +       /* Set to true once the vCPU thread is up and running. */
> +       bool running;
>  };
>
>  /* The vCPU threads involved in this test. */
> @@ -30,6 +33,9 @@ static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
>  /* The function run by each vCPU thread, as provided by the test. */
>  static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *);
>
> +/* Set to true once all vCPU threads are up and running. */
> +static bool all_vcpu_threads_running;
> +
>  /*
>   * Continuously write to the first 8 bytes of each page in the
>   * specified region.
> @@ -196,6 +202,17 @@ static void *vcpu_thread_main(void *data)
>  {
>         struct vcpu_thread *vcpu = data;
>
> +       WRITE_ONCE(vcpu->running, true);
> +
> +       /*
> +        * Wait for all vCPU threads to be up and running before calling the test-
> +        * provided vCPU thread function. This prevents thread creation (which
> +        * requires taking the mmap_sem in write mode) from interfering with the
> +        * guest faulting in its memory.
> +        */
> +       while (!READ_ONCE(all_vcpu_threads_running))
> +               ;
> +

I can never remember the rules on this so I could be wrong, but you
may want a cpu_relax() in that loop to prevent it from being optimized
out. Maybe the READ_ONCE is sufficient though.
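
Concretely, the loop I have in mind would be something like this
(untested sketch, assuming a cpu_relax() helper is available to the
selftests):

	/* Assumes a cpu_relax() helper is (or will be made) available. */
	while (!READ_ONCE(all_vcpu_threads_running))
		cpu_relax();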

>         vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]);
>
>         return NULL;
> @@ -206,14 +223,23 @@ void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vc
>         int vcpu_id;
>
>         vcpu_thread_fn = vcpu_fn;
> +       WRITE_ONCE(all_vcpu_threads_running, false);
>
>         for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
>                 struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id];
>
>                 vcpu->vcpu_id = vcpu_id;
> +               WRITE_ONCE(vcpu->running, false);

Do these need to be WRITE_ONCE? I don't think WRITE_ONCE provides any
extra memory ordering guarantees and I don't know why the compiler
would optimize these out. If they do need to be WRITE_ONCE, they
probably merit comments.

>
>                 pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
>         }
> +
> +       for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
> +               while (!READ_ONCE(vcpu_threads[vcpu_id].running))
> +                       ;
> +       }
> +
> +       WRITE_ONCE(all_vcpu_threads_running, true);
>  }
>
>  void perf_test_join_vcpu_threads(int vcpus)
> --
> 2.34.0.rc1.387.gb447b232ab-goog
>
David Matlack Nov. 12, 2021, 12:51 a.m. UTC | #2
On Thu, Nov 11, 2021 at 10:17 AM Ben Gardon <bgardon@google.com> wrote:
>
> On Wed, Nov 10, 2021 at 4:13 PM David Matlack <dmatlack@google.com> wrote:
> >
> > Thread creation requires taking the mmap_sem in write mode, which causes
> > vCPU threads running in guest mode to block while they are populating
> > memory. Fix this by waiting for all vCPU threads to be created and start
> > running before entering guest mode on any one vCPU thread.
> >
> > This substantially improves the "Populate memory time" when using 1GiB
> > pages since it allows all vCPUs to zero pages in parallel rather than
> > blocking because a writer is waiting (which is waiting for another vCPU
> > that is busy zeroing a 1GiB page).
> >
> > Before:
> >
> >   $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
> >   ...
> >   Populate memory time: 52.811184013s
> >
> > After:
> >
> >   $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
> >   ...
> >   Populate memory time: 10.204573342s
> >
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  .../selftests/kvm/lib/perf_test_util.c        | 26 +++++++++++++++++++
> >  1 file changed, 26 insertions(+)
> >
> > diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
> > index d646477ed16a..722df3a28791 100644
> > --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> > +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> > @@ -22,6 +22,9 @@ struct vcpu_thread {
> >
> >         /* The pthread backing the vCPU. */
> >         pthread_t thread;
> > +
> > +       /* Set to true once the vCPU thread is up and running. */
> > +       bool running;
> >  };
> >
> >  /* The vCPU threads involved in this test. */
> > @@ -30,6 +33,9 @@ static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
> >  /* The function run by each vCPU thread, as provided by the test. */
> >  static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *);
> >
> > +/* Set to true once all vCPU threads are up and running. */
> > +static bool all_vcpu_threads_running;
> > +
> >  /*
> >   * Continuously write to the first 8 bytes of each page in the
> >   * specified region.
> > @@ -196,6 +202,17 @@ static void *vcpu_thread_main(void *data)
> >  {
> >         struct vcpu_thread *vcpu = data;
> >
> > +       WRITE_ONCE(vcpu->running, true);
> > +
> > +       /*
> > +        * Wait for all vCPU threads to be up and running before calling the test-
> > +        * provided vCPU thread function. This prevents thread creation (which
> > +        * requires taking the mmap_sem in write mode) from interfering with the
> > +        * guest faulting in its memory.
> > +        */
> > +       while (!READ_ONCE(all_vcpu_threads_running))
> > +               ;
> > +
>
> I can never remember the rules on this so I could be wrong, but you
> may want a cpu_relax() in that loop to prevent it from being optimized
> out. Maybe the READ_ONCE is sufficient though.

READ_ONCE is sufficient to prevent the loop from being optimized out
but cpu_relax() is nice to have to play nice with our hyperthread
buddy.

On that note there are a lot of spin waits in the KVM selftests and
none of the ones I've seen use cpu_relax().

I'll take a look at adding cpu_relax() throughout the selftests in v2.
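
For x86 that would presumably be a thin wrapper around PAUSE, along the
lines of the kernel's cpu_relax() (untested sketch, not an existing
selftests helper):

	/* Hypothetical x86 helper, mirroring the kernel's "rep; nop". */
	static inline void cpu_relax(void)
	{
		asm volatile("rep; nop" ::: "memory");
	}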

>
> >         vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]);
> >
> >         return NULL;
> > @@ -206,14 +223,23 @@ void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vc
> >         int vcpu_id;
> >
> >         vcpu_thread_fn = vcpu_fn;
> > +       WRITE_ONCE(all_vcpu_threads_running, false);
> >
> >         for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
> >                 struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id];
> >
> >                 vcpu->vcpu_id = vcpu_id;
> > +               WRITE_ONCE(vcpu->running, false);
>
> Do these need to be WRITE_ONCE? I don't think WRITE_ONCE provides any
> extra memory ordering guarantees and I don't know why the compiler
> would optimize these out. If they do need to be WRITE_ONCE, they
> probably merit comments.

To be completely honest I'm not sure. I included WRITE_ONCE out of
caution to ensure the compiler does not reorder the writes with
respect to the READ_ONCE. I'll need to do a bit more research to
confirm if it's really necessary.

>
> >
> >                 pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
> >         }
> > +
> > +       for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
> > +               while (!READ_ONCE(vcpu_threads[vcpu_id].running))
> > +                       ;
> > +       }
> > +
> > +       WRITE_ONCE(all_vcpu_threads_running, true);
> >  }
> >
> >  void perf_test_join_vcpu_threads(int vcpus)
> > --
> > 2.34.0.rc1.387.gb447b232ab-goog
> >
David Matlack Nov. 12, 2021, 1:46 a.m. UTC | #3
On Thu, Nov 11, 2021 at 4:51 PM David Matlack <dmatlack@google.com> wrote:
>
> On Thu, Nov 11, 2021 at 10:17 AM Ben Gardon <bgardon@google.com> wrote:
> >
> > On Wed, Nov 10, 2021 at 4:13 PM David Matlack <dmatlack@google.com> wrote:
> > >
> > > Thread creation requires taking the mmap_sem in write mode, which causes
> > > vCPU threads running in guest mode to block while they are populating
> > > memory. Fix this by waiting for all vCPU threads to be created and start
> > > running before entering guest mode on any one vCPU thread.
> > >
> > > This substantially improves the "Populate memory time" when using 1GiB
> > > pages since it allows all vCPUs to zero pages in parallel rather than
> > > blocking because a writer is waiting (which is waiting for another vCPU
> > > that is busy zeroing a 1GiB page).
> > >
> > > Before:
> > >
> > >   $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
> > >   ...
> > >   Populate memory time: 52.811184013s
> > >
> > > After:
> > >
> > >   $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb
> > >   ...
> > >   Populate memory time: 10.204573342s
> > >
> > > Signed-off-by: David Matlack <dmatlack@google.com>
> > > ---
> > >  .../selftests/kvm/lib/perf_test_util.c        | 26 +++++++++++++++++++
> > >  1 file changed, 26 insertions(+)
> > >
> > > diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
> > > index d646477ed16a..722df3a28791 100644
> > > --- a/tools/testing/selftests/kvm/lib/perf_test_util.c
> > > +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
> > > @@ -22,6 +22,9 @@ struct vcpu_thread {
> > >
> > >         /* The pthread backing the vCPU. */
> > >         pthread_t thread;
> > > +
> > > +       /* Set to true once the vCPU thread is up and running. */
> > > +       bool running;
> > >  };
> > >
> > >  /* The vCPU threads involved in this test. */
> > > @@ -30,6 +33,9 @@ static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
> > >  /* The function run by each vCPU thread, as provided by the test. */
> > >  static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *);
> > >
> > > +/* Set to true once all vCPU threads are up and running. */
> > > +static bool all_vcpu_threads_running;
> > > +
> > >  /*
> > >   * Continuously write to the first 8 bytes of each page in the
> > >   * specified region.
> > > @@ -196,6 +202,17 @@ static void *vcpu_thread_main(void *data)
> > >  {
> > >         struct vcpu_thread *vcpu = data;
> > >
> > > +       WRITE_ONCE(vcpu->running, true);
> > > +
> > > +       /*
> > > +        * Wait for all vCPU threads to be up and running before calling the test-
> > > +        * provided vCPU thread function. This prevents thread creation (which
> > > +        * requires taking the mmap_sem in write mode) from interfering with the
> > > +        * guest faulting in its memory.
> > > +        */
> > > +       while (!READ_ONCE(all_vcpu_threads_running))
> > > +               ;
> > > +
> >
> > I can never remember the rules on this so I could be wrong, but you
> > may want a cpu_relax() in that loop to prevent it from being optimized
> > out. Maybe the READ_ONCE is sufficient though.
>
> READ_ONCE is sufficient to prevent the loop from being optimized out
> but cpu_relax() is nice to have to play nice with our hyperthread
> buddy.
>
> On that note there are a lot of spin waits in the KVM selftests and
> none of the ones I've seen use cpu_relax().
>
> I'll take a look at adding cpu_relax() throughout the selftests in v2.
>
> >
> > >         vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]);
> > >
> > >         return NULL;
> > > @@ -206,14 +223,23 @@ void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vc
> > >         int vcpu_id;
> > >
> > >         vcpu_thread_fn = vcpu_fn;
> > > +       WRITE_ONCE(all_vcpu_threads_running, false);
> > >
> > >         for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
> > >                 struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id];
> > >
> > >                 vcpu->vcpu_id = vcpu_id;
> > > +               WRITE_ONCE(vcpu->running, false);
> >
> > Do these need to be WRITE_ONCE? I don't think WRITE_ONCE provides any
> > extra memory ordering guarantees and I don't know why the compiler
> > would optimize these out. If they do need to be WRITE_ONCE, they
> > probably merit comments.
>
> To be completely honest I'm not sure. I included WRITE_ONCE out of
> caution to ensure the compiler does not reorder the writes with
> respect to the READ_ONCE. I'll need to do a bit more research to
> confirm if it's really necessary.

FWIW removing WRITE_ONCE and bumping the optimization level up to O3
did not cause any problems. But this is no proof of course.

This quote from memory-barriers.txt makes me think it'd be prudent to
keep the WRITE_ONCE:

     You should assume that the compiler can move READ_ONCE() and
     WRITE_ONCE() past code not containing READ_ONCE(), WRITE_ONCE(),
     barrier(), or similar primitives.

So, for example, the compiler could potentially re-order the READ_ONCE
loop below to after the write to all_vcpu_threads_running if we did not
include WRITE_ONCE?
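
My mental model is that the ..._ONCE macros boil down to volatile
accesses, roughly like this (simplified sketch, not the exact
tools/include definitions):

	/* Simplified sketch of the usual volatile-cast implementation. */
	#define READ_ONCE(x)		(*(volatile typeof(x) *)&(x))
	#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

which would explain why they only constrain the compiler with respect to
other _ONCE accesses and say nothing about plain loads/stores around them.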

>
> >
> > >
> > >                 pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
> > >         }
> > > +
> > > +       for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
> > > +               while (!READ_ONCE(vcpu_threads[vcpu_id].running))
> > > +                       ;
> > > +       }
> > > +
> > > +       WRITE_ONCE(all_vcpu_threads_running, true);
> > >  }
> > >
> > >  void perf_test_join_vcpu_threads(int vcpus)
> > > --
> > > 2.34.0.rc1.387.gb447b232ab-goog
> > >
Sean Christopherson Nov. 12, 2021, 4:32 p.m. UTC | #4
On Thu, Nov 11, 2021, David Matlack wrote:
> On Thu, Nov 11, 2021 at 4:51 PM David Matlack <dmatlack@google.com> wrote:
> > > > @@ -196,6 +202,17 @@ static void *vcpu_thread_main(void *data)
> > > >  {
> > > >         struct vcpu_thread *vcpu = data;
> > > >
> > > > +       WRITE_ONCE(vcpu->running, true);
> > > > +
> > > > +       /*
> > > > +        * Wait for all vCPU threads to be up and running before calling the test-
> > > > +        * provided vCPU thread function. This prevents thread creation (which
> > > > +        * requires taking the mmap_sem in write mode) from interfering with the
> > > > +        * guest faulting in its memory.
> > > > +        */
> > > > +       while (!READ_ONCE(all_vcpu_threads_running))

This is way more convoluted than it needs to be:

	atomic_inc(&perf_test_args.nr_running_vcpus);

	while (atomic_read(&perf_test_args.nr_running_vcpus) < perf_test_args.nr_vcpus)
		cpu_relax();

diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
index df9f1a3a3ffb..ce9039ec8c18 100644
--- a/tools/testing/selftests/kvm/include/perf_test_util.h
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -30,6 +30,8 @@ struct perf_test_args {
        uint64_t host_page_size;
        uint64_t guest_page_size;
        int wr_fract;
+       int nr_vcpus;
+       atomic_t nr_running_vcpus;
 
        struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS];
 };


Alternatively it could be a countdown to zero so that only the atomic_t is needed.
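
E.g. (untested sketch, assuming atomic_set()/atomic_dec() helpers along
the lines of the kernel's atomic API are usable from the selftests):

	/* Hypothetical countdown variant; only the atomic_t is needed. */
	static atomic_t nr_booting_vcpus;

	/* In perf_test_start_vcpu_threads(), before spawning threads: */
	atomic_set(&nr_booting_vcpus, vcpus);

	/* In vcpu_thread_main(): */
	atomic_dec(&nr_booting_vcpus);
	while (atomic_read(&nr_booting_vcpus))
		cpu_relax();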


> > > I can never remember the rules on this so I could be wrong, but you
> > > may want a cpu_relax() in that loop to prevent it from being optimized
> > > out. Maybe the READ_ONCE is sufficient though.
> >
> > READ_ONCE is sufficient to prevent the loop from being optimized out
> > but cpu_relax() is nice to have to play nice with our hyperthread
> > buddy.
> >
> > On that note there are a lot of spin waits in the KVM selftests and
> > none of the ones I've seen use cpu_relax().
> >
> > I'll take a look at adding cpu_relax() throughout the selftests in v2.
> >
> > >
> > > >         vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]);
> > > >
> > > >         return NULL;
> > > > @@ -206,14 +223,23 @@ void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vc
> > > >         int vcpu_id;
> > > >
> > > >         vcpu_thread_fn = vcpu_fn;
> > > > +       WRITE_ONCE(all_vcpu_threads_running, false);
> > > >
> > > >         for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
> > > >                 struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id];
> > > >
> > > >                 vcpu->vcpu_id = vcpu_id;
> > > > +               WRITE_ONCE(vcpu->running, false);
> > >
> > > Do these need to be WRITE_ONCE? I don't think WRITE_ONCE provides any
> > > extra memory ordering guarantees and I don't know why the compiler
> > > would optimize these out. If they do need to be WRITE_ONCE, they
> > > probably merit comments.
> >
> > To be completely honest I'm not sure. I included WRITE_ONCE out of
> > caution to ensure the compiler does not reorder the writes with
> > respect to the READ_ONCE. I'll need to do a bit more research to
> > confirm if it's really necessary.
> 
> FWIW removing WRITE_ONCE and bumping the optimization level up to O3
> did not cause any problems. But this is no proof of course.
> 
> This quote from memory-barriers.txt makes me think it'd be prudent to
> keep the WRITE_ONCE:
> 
>      You should assume that the compiler can move READ_ONCE() and
>      WRITE_ONCE() past code not containing READ_ONCE(), WRITE_ONCE(),
>      barrier(), or similar primitives.
> 
> So, for example, the compiler could potentially re-order the READ_ONCE
> loop below to after the write to all_vcpu_threads_running if we did not
> include WRITE_ONCE?

Practically speaking, no.  pthread_create() undoubtedly has barriers galore, so
unless @vcpus==0, all_vcpu_threads_running won't get re-ordered.

As noted above, READ_ONCE/WRITE_ONCE do provide _some_ guarantees about memory
ordering with respect to the _compiler_.  Notably, the compiler is allowed to
reorder non-ONCE loads/stores around {READ,WRITE}_ONCE.  And emphasis on "compiler".
On x86, that's effectively the same as memory ordering in hardware, because ignoring
WC memory, x86 is strongly ordered.  But arm64 and others are weakly ordered, in
which case {READ,WRITE}_ONCE do not provide any guarantees about how loads/stores
will complete when run on the CPU.

This is a bad example because there's not really a race to be had, e.g. aside from
pthread_create(), there's also the fact that all_vcpu_threads_running=false is a
likely nop since it's zero initialized.  

Here's a contrived example:

static void *vcpu_thread_main(void *data)
{
	while (!READ_ONCE(test_stage))
		cpu_relax();

	READ_ONCE(test_fn)();
}


int main(...)
{
	for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++)
		pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);

	test_fn = do_work;

	WRITE_ONCE(test_stage, 1);
}

On any architecture, the thread could observe a NULL test_fn because the
compiler is allowed to reorder the store to test_fn around the store to
test_stage.  Making it

	WRITE_ONCE(test_fn, do_work);
	WRITE_ONCE(test_stage, 1);

would solve the problem on x86 as it would effectively force the compiler to emit:

	test_stage = 0
	barrier() // because of pthread_create()
	test_fn    = do_work
	test_stage = 1

but on arm64 and others, hardware can complete the second store to test_stage
_before_ the store to test_fn, and so the threads could observe a NULL test_fn
even though the compiler was forced to generate a specific order of operations.

To ensure something like the above works on weakly-ordered architectures, the code
should instead be something like:

static void *vcpu_thread_main(void *data)
{
        while (!READ_ONCE(test_stage))
                cpu_relax();

	smp_rmb();
        test_fn();
}

int main(...)
{
        for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++)
                pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);

        test_fn = do_work;
	smp_wmb();
        test_stage = 1;
}

Final note, the READ_ONCE() in the while loop is still necessary because without
that the compiler would be allowed to assume that test_stage can't be changed
in that loop, e.g. on x86 it could generate the following and hang the task:

	mov [test_stage], %rax
1:	test %rax, %rax
	jnz 2f
	pause
	jmp 1b
2:
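
With the READ_ONCE() in place the load is forced back inside the loop,
i.e. the compiler has to emit something more like (illustrative, not
actual compiler output):

1:	mov [test_stage], %rax
	test %rax, %rax
	jnz 2f
	pause
	jmp 1b
2: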

Patch

diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c
index d646477ed16a..722df3a28791 100644
--- a/tools/testing/selftests/kvm/lib/perf_test_util.c
+++ b/tools/testing/selftests/kvm/lib/perf_test_util.c
@@ -22,6 +22,9 @@  struct vcpu_thread {
 
 	/* The pthread backing the vCPU. */
 	pthread_t thread;
+
+	/* Set to true once the vCPU thread is up and running. */
+	bool running;
 };
 
 /* The vCPU threads involved in this test. */
@@ -30,6 +33,9 @@  static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS];
 /* The function run by each vCPU thread, as provided by the test. */
 static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *);
 
+/* Set to true once all vCPU threads are up and running. */
+static bool all_vcpu_threads_running;
+
 /*
  * Continuously write to the first 8 bytes of each page in the
  * specified region.
@@ -196,6 +202,17 @@  static void *vcpu_thread_main(void *data)
 {
 	struct vcpu_thread *vcpu = data;
 
+	WRITE_ONCE(vcpu->running, true);
+
+	/*
+	 * Wait for all vCPU threads to be up and running before calling the test-
+	 * provided vCPU thread function. This prevents thread creation (which
+	 * requires taking the mmap_sem in write mode) from interfering with the
+	 * guest faulting in its memory.
+	 */
+	while (!READ_ONCE(all_vcpu_threads_running))
+		;
+
 	vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]);
 
 	return NULL;
@@ -206,14 +223,23 @@  void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vc
 	int vcpu_id;
 
 	vcpu_thread_fn = vcpu_fn;
+	WRITE_ONCE(all_vcpu_threads_running, false);
 
 	for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
 		struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id];
 
 		vcpu->vcpu_id = vcpu_id;
+		WRITE_ONCE(vcpu->running, false);
 
 		pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu);
 	}
+
+	for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) {
+		while (!READ_ONCE(vcpu_threads[vcpu_id].running))
+			;
+	}
+
+	WRITE_ONCE(all_vcpu_threads_running, true);
 }
 
 void perf_test_join_vcpu_threads(int vcpus)