diff mbox series

[2/2] KVM: arm64: selftests: Introduce get_set_regs_perf test

Message ID 20220217034947.180935-2-reijiw@google.com (mailing list archive)
State New, archived
Headers show
Series [1/2] KVM: selftests: kvm_vm_elf_load() and elfhdr_get() should close fd | expand

Commit Message

Reiji Watanabe Feb. 17, 2022, 3:49 a.m. UTC
Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG
for registers that are returned by KVM_GET_REG_LIST. This is a pseudo
process of saving/restoring registers during live migration, and this
test quantifies the performance of the process.

Signed-off-by: Reiji Watanabe <reijiw@google.com>
---
 tools/testing/selftests/kvm/.gitignore        |   1 +
 tools/testing/selftests/kvm/Makefile          |   1 +
 .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++
 3 files changed, 458 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c

Comments

Oliver Upton Feb. 17, 2022, 4:52 a.m. UTC | #1
Hi Reiji,

First off, thanks for looking into this! Seems like a very useful thing
to test :-)

On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote:
> Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG
> for registers that are returned by KVM_GET_REG_LIST. This is a pseudo
> process of saving/restoring registers during live migration, and this
> test quantifies the performance of the process.
> 
> Signed-off-by: Reiji Watanabe <reijiw@google.com>
> ---
>  tools/testing/selftests/kvm/.gitignore        |   1 +
>  tools/testing/selftests/kvm/Makefile          |   1 +
>  .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++

Without having looked at the diff yet, I wonder if this can be done for
multiple architectures, as the whole get/set reg UAPI is used by several
other architectures.

>  3 files changed, 458 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
> 
> diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> index dce7de7755e6..b6b18aaf9322 100644
> --- a/tools/testing/selftests/kvm/.gitignore
> +++ b/tools/testing/selftests/kvm/.gitignore
> @@ -2,6 +2,7 @@
>  /aarch64/arch_timer
>  /aarch64/debug-exceptions
>  /aarch64/get-reg-list
> +/aarch64/get_set_regs_perf
>  /aarch64/psci_cpu_on_test
>  /aarch64/vgic_init
>  /aarch64/vgic_irq
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index 0e4926bc9a58..46a28ce89002 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -103,6 +103,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test
>  TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
>  TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
>  TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
> +TEST_GEN_PROGS_aarch64 += aarch64/get_set_regs_perf
>  TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
>  TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
>  TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
> diff --git a/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
> new file mode 100644
> index 000000000000..27944ccce4a1
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
> @@ -0,0 +1,456 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * get_set_regs_perf - KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test
> + *
> + * The test runs performance test of saving registers that are returned
> + * by KVM_GET_REG_LIST for vCPUs in a VM and restoring them for vCPUs
> + * in another VM (a part of behavior during blackout on live migration).
> + *
> + * Copyright (c) 2022, Google LLC.
> + */
> +
> +#define _GNU_SOURCE
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <unistd.h>
> +#include <time.h>
> +#include <pthread.h>
> +
> +#include "kvm_util.h"
> +#include "processor.h"
> +
> +#define	N2USEC(nsec)	((nsec)/1000)

How about defining:

  #define	NSEC_PER_USEC	1000L

and explicitly doing division inline? This is consistent with the kernel
as well.

> +#define	REG_STR_LEN	64
> +
> +struct reg_val {
> +	 /* Big enough buffer even for the longest vector register */
> +	uint64_t	val[32];
> +};
> +
> +struct thread_data {
> +	uint32_t	vcpuid;
> +	bool		is_set_reg;
> +	volatile bool	*start;
> +	struct kvm_vm	*vm;
> +	struct timespec	time;
> +	struct kvm_reg_list	*reg_list;
> +
> +	/*
> +	 * Pointer to an array of reg_val, which will hold a value of each
> +	 * register. The number of elements in it is indicated in reg_list.n.
> +	 */
> +	struct reg_val	*reg_vals;
> +
> +	/*
> +	 * Pointer to an array of timespec, which will hold per register
> +	 * latency of KVM_GET_ONE_REG or KVM_SET_ONE_REG. The number of
> +	 * elements in it is indicated in reg_list.n.
> +	 */
> +	struct timespec	*reg_time;
> +};
> +
> +pthread_t threads[KVM_MAX_VCPUS];
> +struct thread_data thread_args[KVM_MAX_VCPUS];
> +
> +struct sys_reg_params {
> +	u8	Op0;
> +	u8	Op1;
> +	u8	CRn;
> +	u8	CRm;
> +	u8	Op2;
> +};
> +
> +static int regid_to_params(u64 id, struct sys_reg_params *params)
> +{
> +	switch (id & KVM_REG_SIZE_MASK) {
> +	case KVM_REG_SIZE_U64:
> +		/* Any unused index bits means it's not valid. */
> +		if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK
> +			      | KVM_REG_ARM_COPROC_MASK
> +			      | KVM_REG_ARM64_SYSREG_OP0_MASK
> +			      | KVM_REG_ARM64_SYSREG_OP1_MASK
> +			      | KVM_REG_ARM64_SYSREG_CRN_MASK
> +			      | KVM_REG_ARM64_SYSREG_CRM_MASK
> +			      | KVM_REG_ARM64_SYSREG_OP2_MASK))
> +			return -1;
> +		params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK)
> +			       >> KVM_REG_ARM64_SYSREG_OP0_SHIFT);
> +		params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK)
> +			       >> KVM_REG_ARM64_SYSREG_OP1_SHIFT);
> +		params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK)
> +			       >> KVM_REG_ARM64_SYSREG_CRN_SHIFT);
> +		params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK)
> +			       >> KVM_REG_ARM64_SYSREG_CRM_SHIFT);
> +		params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK)
> +			       >> KVM_REG_ARM64_SYSREG_OP2_SHIFT);
> +		return 0;
> +	default:
> +		return -1;
> +	}
> +}
> +
> +static void reg_id_to_str(uint64_t id, char *str, int len)
> +{
> +	struct sys_reg_params p;
> +
> +	TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64,
> +		    "Not KVM_REG_ARM64 register (0x%lx)", id);
> +
> +	switch (id & KVM_REG_ARM_COPROC_MASK) {
> +	case KVM_REG_ARM_CORE:
> +		snprintf(str, len, "CORE(id:0x%lx)", id);
> +		break;
> +	case KVM_REG_ARM64_SVE:
> +		snprintf(str, len, "SVE(id:0x%lx)", id);
> +		break;
> +	case KVM_REG_ARM_DEMUX:
> +		snprintf(str, len, "DEMUX(id:0x%lx)", id);
> +		break;
> +	case KVM_REG_ARM_FW:
> +		snprintf(str, len, "FW_REG_%ld", id & 0xffff);
> +		break;
> +	case KVM_REG_ARM64_SYSREG:
> +		if (regid_to_params(id, &p) == 0)
> +			snprintf(str, len, "SYSREG(%d,%d,%d,%d,%d)",
> +				 p.Op0, p.Op1, p.CRn, p.CRm, p.Op2);
> +		else
> +			snprintf(str, len, "SYSREG(id:0x%lx)", id);
> +		break;
> +	default:
> +		snprintf(str, len, "KVM_REG_ARM64(id:0x%lx)", id);
> +	}
> +}

I see that print_reg() in aarch64/get-reg-list.c implements most of
this. Would it be possible to hoist that into a shared helper function?

> +/*
> + * Save or restore registers that were saved by get_regs_vcpu_thread with
> + * KVM_GET_ONE_REG or KVM_SET_ONE_REG. Measure time taken by this process.
> + */
> +static void *vcpu_thread(void *data)
> +{
> +	int i, ret;
> +	struct thread_data *arg = data;
> +	struct kvm_reg_list *reg_list = arg->reg_list;
> +	bool is_set_reg = arg->is_set_reg;
> +	unsigned int cmd = is_set_reg ? KVM_SET_ONE_REG : KVM_GET_ONE_REG;
> +	char *cmd_str = is_set_reg ? "SET_ONE_REG" : "GET_ONE_REG";

Why not use vcpu_{get,set}_reg() instead? They already have informative
assertions that print the name of the ioctl and the name of the errno.

> +	struct timespec ts_reg_diff, ts_reg_start;
> +	struct kvm_one_reg reg;
> +
> +	memset(&arg->time, 0, sizeof(arg->time));
> +
> +	/* Wait until start is set */
> +	while (!(*arg->start))
> +		;

Have you considered using a pthread_barrier for this?

> +	/* Warm up the cache. */
> +	reg.id = reg_list->reg[0];
> +	reg.addr = (uint64_t)&arg->reg_vals[0];
> +	ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, &reg);
> +	TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id);

I wouldn't expect VMMs to do this before starting migration. Do you
think it would be better to replicate the behavior of typical userspace?

> +	pr_debug("%s[%d] start\n", __func__, arg->vcpuid);
> +
> +	/* Run KVM_SET_ONE_REG or KVM_GET_ONE_REG for registers in reg_list */
> +	for (i = 0; i < reg_list->n; i++) {
> +		clock_gettime(CLOCK_MONOTONIC, &ts_reg_start);
> +
> +		reg.id = reg_list->reg[i];
> +		reg.addr = (uint64_t)&arg->reg_vals[i];
> +		ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, &reg);
> +
> +		ts_reg_diff = timespec_elapsed(ts_reg_start);
> +		arg->time = timespec_add(arg->time, ts_reg_diff);
> +		if (arg->reg_time)
> +			/* Need per register latency separately */
> +			arg->reg_time[i] = ts_reg_diff;
> +
> +		TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id);
> +		pr_debug("%s: id:0x%lx, val:0x%lx\n", cmd_str, (uint64_t)reg.id,
> +			 arg->reg_vals[i].val[0]);
> +	}
> +
> +	return NULL;
> +}
> +
> +void aggregate_time(uint64_t cnt, struct timespec *total, struct timespec *data)
> +{
> +	uint64_t i;
> +
> +	for (i = 0; i < cnt; i++)
> +		total[i] = timespec_add(total[i], data[i]);
> +}
> +
> +struct result_timespec {
> +	struct timespec src;
> +	struct timespec dst;
> +	struct timespec *src_per_reg;
> +	struct timespec *dst_per_reg;
> +};
> +
> +/*
> + * Run KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test.
> + * Create a VM with the given number of vCPUs, create a thread for each
> + * vCPU in the VM, and wait until all the threads complete KVM_GET_ONE_REG
> + * for all the registers KVM_GET_REG_LIST returns. Then repeat the same thing
> + * with KVM_SET_ONE_REG. Aggregate the time spent by each thread on
> + * KVM_GET_ONE_REG and KVM_SET_ONE_REG, and return the vCPU average value to
> + * the caller via @res.
> + */
> +void run_test_one(int nvcpus, struct kvm_reg_list *reg_list,
> +		  struct result_timespec *res)
> +{
> +	bool per_reg_data = res->src_per_reg ? true : false;
> +	uint64_t nregs = reg_list->n;
> +	volatile bool start = false;
> +	int i, ret;
> +	struct kvm_vm *src_vm, *dst_vm;
> +	struct thread_data *targ;
> +	struct timespec src_sum = (struct timespec){0};
> +	struct timespec dst_sum = (struct timespec){0};

Is the cast required?

> +	if (per_reg_data) {
> +		/* Reset the buffers to save the per register result */
> +		memset(res->src_per_reg, 0, nregs * sizeof(struct timespec));
> +		memset(res->dst_per_reg, 0, nregs * sizeof(struct timespec));
> +	}
> +
> +	/* Create VMs to save/restore registers */
> +	src_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL);
> +	dst_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL);
> +
> +	/* Start saving registers process for src_vm */
> +
> +	/* Create source VM's vCPU threads */
> +	for (i = 0; i < nvcpus; i++) {
> +		targ = &thread_args[i];
> +		targ->vm = src_vm;
> +		targ->vcpuid = i;
> +		targ->is_set_reg = false;
> +		targ->start = &start;
> +		targ->reg_list = reg_list;
> +		targ->reg_vals = calloc(nregs, sizeof(struct reg_val));
> +		TEST_ASSERT(targ->reg_vals, "Failed to allocate reg_vals");
> +		if (per_reg_data) {
> +			/* Per register result buffer for the vCPU */
> +			targ->reg_time = calloc(nregs, sizeof(struct timespec));
> +			TEST_ASSERT(targ->reg_time, "Failed to allocate reg_time");
> +		}
> +		ret = pthread_create(&threads[i], NULL, vcpu_thread, targ);
> +		TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret);
> +	}
> +
> +	/*
> +	 * Let threads start saving registers for vCPUs, and wait for all
> +	 * threads to complete restoring registers.
> +	 */
> +	start = true;
> +	for (i = 0; i < nvcpus; i++) {
> +		targ = &thread_args[i];
> +		ret = pthread_join(threads[i], NULL);
> +		TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret);
> +		src_sum = timespec_add(src_sum, targ->time);

I think it would make more sense to measure wall time instead of CPU
time here. It may not be immediately obvious when running this test why
the reported time scales as the number of vCPUs increases.

> +		if (per_reg_data)
> +			aggregate_time(nregs, res->src_per_reg, targ->reg_time);
> +	}
> +
> +	start = false;
> +	pr_debug("%s Saving registers completed.\n", __func__);
> +
> +	/* Start restoring registers process for dst_vm */
> +
> +	/* Create destination VM's vCPU threads */
> +	for (i = 0; i < nvcpus; i++) {
> +		targ = &thread_args[i];
> +		/* Update fields that are different from the src case */
> +		targ->vm = dst_vm;
> +		targ->is_set_reg = true;
> +		ret = pthread_create(&threads[i], NULL, vcpu_thread, targ);
> +		TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret);
> +	}
> +
> +	/*
> +	 * Let threads start saving registers for vCPUs, and wait for all
> +	 * threads to complete restoring registers.
> +	 */
> +	start = true;
> +	for (i = 0; i < nvcpus; i++) {
> +		targ = &thread_args[i];
> +		ret = pthread_join(threads[i], NULL);
> +		TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret);
> +
> +		free(targ->reg_vals);
> +		dst_sum = timespec_add(dst_sum, targ->time);
> +		if (per_reg_data) {
> +			aggregate_time(nregs, res->dst_per_reg, targ->reg_time);
> +			free(targ->reg_time);
> +		}
> +	}
> +
> +	kvm_vm_free(src_vm);
> +	kvm_vm_free(dst_vm);
> +
> +	pr_debug("%s Restoring registers completed.\n", __func__);
> +
> +	/* Calculate the vCPU average */
> +	res->src = timespec_div(src_sum, nvcpus);
> +	res->dst = timespec_div(dst_sum, nvcpus);
> +	if (!per_reg_data)
> +		return;
> +
> +	/* Calculate the vCPU average for each register */
> +	for (i = 0; i < nregs; i++) {
> +		res->src_per_reg[i] = timespec_div(res->src_per_reg[i], nvcpus);
> +		res->dst_per_reg[i] = timespec_div(res->dst_per_reg[i], nvcpus);
> +	}
> +}
> +
> +/*
> + * Run saving/restoring vCPU registers (KVM_GET_ONE_REG/KVM_SET_ONE_REG)
> + * performance test with the given number of vCPUs (@nvcpus) for the given
> + * number (@iterations) of times.
> + * Each iteration of the test returns the latency of saving/restoring all
> + * the registers that KVM_GET_REG_LIST returns (See comments for
> + * run_test_one() for more details of each iteration of the test).
> + * Print the average latency of each iteration, and if @per_reg_result is set,
> + * print the average latency for each register as well.
> + */
> +void run_test(int nvcpus, int iterations, bool per_reg_result)
> +{
> +	int i, j;
> +	uint64_t nregs;
> +	struct result_timespec res = {0};
> +	struct timespec src_sum = {0};
> +	struct timespec dst_sum = {0};
> +	struct timespec avg, src_avg, dst_avg;
> +	struct timespec *src_per_reg_sum, *dst_per_reg_sum;
> +	struct kvm_vm *vm;
> +	struct kvm_reg_list *reg_list;
> +	char reg_str[REG_STR_LEN];
> +
> +	/* Get kvm_reg_list */
> +	vm = vm_create_default(0, 0, NULL);
> +	reg_list = vcpu_get_reg_list(vm, 0);

Would it make sense to test some opt-in capabilities that expose
additional registers (PMU, SVE, etc.)?

> +	kvm_vm_free(vm);
> +	nregs = reg_list->n;
> +
> +	if (per_reg_result) {
> +		/*
> +		 * Allocate buffers to save latency of KVM_GET_ONE_REG
> +		 * and KVM_SET_ONE_REG respectively.
> +		 */
> +
> +		/* Buffers for each iteration */
> +		res.src_per_reg = calloc(nregs, sizeof(struct timespec));
> +		res.dst_per_reg = calloc(nregs, sizeof(struct timespec));
> +
> +		/* Buffers for the sum */
> +		src_per_reg_sum = calloc(nregs, sizeof(struct timespec));
> +		dst_per_reg_sum = calloc(nregs, sizeof(struct timespec));
> +
> +		TEST_ASSERT(res.src_per_reg && res.dst_per_reg &&
> +			    src_per_reg_sum && dst_per_reg_sum,
> +			    "Failed to allocate per register time buffer");
> +	}
> +
> +	pr_info("Iterations %d (vCPUs %d, regs %lld): ",
> +		iterations, nvcpus, reg_list->n);
> +
> +	for (i = 0; i < iterations; i++) {
> +		/* Run the test */
> +		run_test_one(nvcpus, reg_list, &res);
> +
> +		src_sum = timespec_add(src_sum, res.src);
> +		dst_sum = timespec_add(dst_sum, res.dst);
> +
> +		if (!per_reg_result)
> +			continue;
> +
> +		/* Aggregate per register result */
> +		for (j = 0; j < nregs; j++) {
> +			src_per_reg_sum[j] = timespec_add(src_per_reg_sum[j],
> +							  res.src_per_reg[j]);
> +			dst_per_reg_sum[j] = timespec_add(dst_per_reg_sum[j],
> +							  res.dst_per_reg[j]);
> +		}
> +	}
> +
> +	/* Calculate the iteration average */
> +	src_avg = timespec_div(src_sum, iterations);
> +	dst_avg = timespec_div(dst_sum, iterations);
> +	avg = timespec_add(src_avg, dst_avg);
> +
> +	/* Print the average */
> +	if (avg.tv_sec > 0) {
> +		pr_info("get+set %ld.%.9lds (get %ld.%.9lds, set %ld.%.9lds)\n",
> +			avg.tv_sec, avg.tv_nsec,
> +			src_avg.tv_sec, src_avg.tv_nsec,
> +			dst_avg.tv_sec, dst_avg.tv_nsec);
> +	} else {
> +		pr_info("get+set %ldus (get %ldus, set %ldus)\n",
> +			N2USEC(avg.tv_nsec), N2USEC(src_avg.tv_nsec),
> +			N2USEC(dst_avg.tv_nsec));
> +	}
> +
> +	/* Print per register result when requested */
> +	if (per_reg_result) {
> +		pr_info("Per Register Result:\n");
> +		pr_info(" %-32s %13s  %9s  %9s\n", "Register", "get+set(ns)", "get(ns)", "set(ns)");
> +		for (i = 0; i < nregs; i++) {
> +			/* Calculate the average for the register */
> +			src_avg = timespec_div(src_per_reg_sum[i], iterations);
> +			dst_avg = timespec_div(dst_per_reg_sum[i], iterations);
> +			avg = timespec_add(src_avg, dst_avg);
> +
> +			assert(avg.tv_sec == 0);
> +			reg_id_to_str(reg_list->reg[i], reg_str, REG_STR_LEN);
> +			pr_info(" %-32s %13ld  %9ld  %9ld\n", reg_str,
> +				avg.tv_nsec, src_avg.tv_nsec, dst_avg.tv_nsec);
> +		}

Do you think it would be useful to optionally dump this granular data
into a machine-parseable format (like CSV)? It could be helpful for
analysis. We have several hundred exposed registers (and always
growing!) and grokking stdout might be a bit difficult.

> +		free(res.src_per_reg);
> +		free(res.dst_per_reg);
> +		free(src_per_reg_sum);
> +		free(dst_per_reg_sum);
> +	}
> +
> +	free(reg_list);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	int opt;
> +	int nvcpus = 1;
> +	int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
> +	int iterations = 1000;
> +	bool per_reg_result = false;
> +
> +	while ((opt = getopt(argc, argv, "hri:v:")) != -1) {
> +		switch (opt) {
> +		case 'i':
> +			iterations = atoi(optarg);
> +			break;
> +		case 'v':
> +			nvcpus = atoi(optarg);
> +			TEST_ASSERT(nvcpus > 0 && nvcpus <= max_vcpus,
> +				    "Number of vcpus, must be between 1 and %d",
> +				    max_vcpus);
> +			break;
> +		case 'r':
> +			per_reg_result = true;
> +			break;
> +		case 'h':
> +		default:
> +			printf("usage: %s [-h] [-v vcpus] [-i iterations]\n",
> +			       argv[0]);
> +			exit(0);
> +		}
> +	}
> +
> +	setbuf(stdout, NULL);
> +
> +	run_test(nvcpus, iterations, per_reg_result);
> +
> +	return 0;
> +}
> -- 
> 2.35.1.473.g83b2b277ed-goog
> 

--
Best,
Oliver
Marc Zyngier Feb. 17, 2022, 9:11 a.m. UTC | #2
On Thu, 17 Feb 2022 04:52:10 +0000,
Oliver Upton <oupton@google.com> wrote:
> 
> Hi Reiji,
> 
> First off, thanks for looking into this! Seems like a very useful thing
> to test :-)
> 
> On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote:
> > Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG
> > for registers that are returned by KVM_GET_REG_LIST. This is a pseudo
> > process of saving/restoring registers during live migration, and this
> > test quantifies the performance of the process.
> > 
> > Signed-off-by: Reiji Watanabe <reijiw@google.com>
> > ---
> >  tools/testing/selftests/kvm/.gitignore        |   1 +
> >  tools/testing/selftests/kvm/Makefile          |   1 +
> >  .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++
>

[...]

> Would it make sense to test some opt-in capabilities that expose
> additional registers (PMU, SVE, etc.)?

I think this is important. System registers are usually saved/restored
in groups, and due to the way we walk the sysreg array, timings are
unlikely to be uniform. Getting a grip on that could help restructure
the walking if required (either per-group arrays, or maybe a tree
structure).

Note that all of this could equally apply to the guest trapping (the
walk is the same).

And yes, there are a lot of commonalities with get-reg-list, so
reusing some of the existing infrastructure would be a good thing.


	M.
Reiji Watanabe Feb. 19, 2022, 3:53 a.m. UTC | #3
Hi Oliver,

On Wed, Feb 16, 2022 at 8:52 PM Oliver Upton <oupton@google.com> wrote:
>
> Hi Reiji,
>
> First off, thanks for looking into this! Seems like a very useful thing
> to test :-)
>
> On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote:
> > Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG
> > for registers that are returned by KVM_GET_REG_LIST. This is a pseudo
> > process of saving/restoring registers during live migration, and this
> > test quantifies the performance of the process.
> >
> > Signed-off-by: Reiji Watanabe <reijiw@google.com>
> > ---
> >  tools/testing/selftests/kvm/.gitignore        |   1 +
> >  tools/testing/selftests/kvm/Makefile          |   1 +
> >  .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++
>
> Without having looked at the diff yet, I wonder if this can be done for
> multiple architectures, as the whole get/set reg UAPI is used by several
> other architectures.
>
> >  3 files changed, 458 insertions(+)
> >  create mode 100644 tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
> >
> > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> > index dce7de7755e6..b6b18aaf9322 100644
> > --- a/tools/testing/selftests/kvm/.gitignore
> > +++ b/tools/testing/selftests/kvm/.gitignore
> > @@ -2,6 +2,7 @@
> >  /aarch64/arch_timer
> >  /aarch64/debug-exceptions
> >  /aarch64/get-reg-list
> > +/aarch64/get_set_regs_perf
> >  /aarch64/psci_cpu_on_test
> >  /aarch64/vgic_init
> >  /aarch64/vgic_irq
> > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> > index 0e4926bc9a58..46a28ce89002 100644
> > --- a/tools/testing/selftests/kvm/Makefile
> > +++ b/tools/testing/selftests/kvm/Makefile
> > @@ -103,6 +103,7 @@ TEST_GEN_PROGS_x86_64 += system_counter_offset_test
> >  TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
> >  TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
> >  TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
> > +TEST_GEN_PROGS_aarch64 += aarch64/get_set_regs_perf
> >  TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
> >  TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
> >  TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
> > diff --git a/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
> > new file mode 100644
> > index 000000000000..27944ccce4a1
> > --- /dev/null
> > +++ b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
> > @@ -0,0 +1,456 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * get_set_regs_perf - KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test
> > + *
> > + * The test runs performance test of saving registers that are returned
> > + * by KVM_GET_REG_LIST for vCPUs in a VM and restoring them for vCPUs
> > + * in another VM (a part of behavior during blackout on live migration).
> > + *
> > + * Copyright (c) 2022, Google LLC.
> > + */
> > +
> > +#define _GNU_SOURCE
> > +
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <stdint.h>
> > +#include <unistd.h>
> > +#include <time.h>
> > +#include <pthread.h>
> > +
> > +#include "kvm_util.h"
> > +#include "processor.h"
> > +
> > +#define      N2USEC(nsec)    ((nsec)/1000)
>
> How about defining:
>
>   #define       NSEC_PER_USEC   1000L
>
> and explicitly doing division inline? This is consistent with the kernel
> as well.

Sure, that sounds good.


> > +#define      REG_STR_LEN     64
> > +
> > +struct reg_val {
> > +      /* Big enough buffer even for the longest vector register */
> > +     uint64_t        val[32];
> > +};
> > +
> > +struct thread_data {
> > +     uint32_t        vcpuid;
> > +     bool            is_set_reg;
> > +     volatile bool   *start;
> > +     struct kvm_vm   *vm;
> > +     struct timespec time;
> > +     struct kvm_reg_list     *reg_list;
> > +
> > +     /*
> > +      * Pointer to an array of reg_val, which will hold a value of each
> > +      * register. The number of elements in it is indicated in reg_list.n.
> > +      */
> > +     struct reg_val  *reg_vals;
> > +
> > +     /*
> > +      * Pointer to an array of timespec, which will hold per register
> > +      * latency of KVM_GET_ONE_REG or KVM_SET_ONE_REG. The number of
> > +      * elements in it is indicated in reg_list.n.
> > +      */
> > +     struct timespec *reg_time;
> > +};
> > +
> > +pthread_t threads[KVM_MAX_VCPUS];
> > +struct thread_data thread_args[KVM_MAX_VCPUS];
> > +
> > +struct sys_reg_params {
> > +     u8      Op0;
> > +     u8      Op1;
> > +     u8      CRn;
> > +     u8      CRm;
> > +     u8      Op2;
> > +};
> > +
> > +static int regid_to_params(u64 id, struct sys_reg_params *params)
> > +{
> > +     switch (id & KVM_REG_SIZE_MASK) {
> > +     case KVM_REG_SIZE_U64:
> > +             /* Any unused index bits means it's not valid. */
> > +             if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK
> > +                           | KVM_REG_ARM_COPROC_MASK
> > +                           | KVM_REG_ARM64_SYSREG_OP0_MASK
> > +                           | KVM_REG_ARM64_SYSREG_OP1_MASK
> > +                           | KVM_REG_ARM64_SYSREG_CRN_MASK
> > +                           | KVM_REG_ARM64_SYSREG_CRM_MASK
> > +                           | KVM_REG_ARM64_SYSREG_OP2_MASK))
> > +                     return -1;
> > +             params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK)
> > +                            >> KVM_REG_ARM64_SYSREG_OP0_SHIFT);
> > +             params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK)
> > +                            >> KVM_REG_ARM64_SYSREG_OP1_SHIFT);
> > +             params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK)
> > +                            >> KVM_REG_ARM64_SYSREG_CRN_SHIFT);
> > +             params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK)
> > +                            >> KVM_REG_ARM64_SYSREG_CRM_SHIFT);
> > +             params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK)
> > +                            >> KVM_REG_ARM64_SYSREG_OP2_SHIFT);
> > +             return 0;
> > +     default:
> > +             return -1;
> > +     }
> > +}
> > +
> > +static void reg_id_to_str(uint64_t id, char *str, int len)
> > +{
> > +     struct sys_reg_params p;
> > +
> > +     TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64,
> > +                 "Not KVM_REG_ARM64 register (0x%lx)", id);
> > +
> > +     switch (id & KVM_REG_ARM_COPROC_MASK) {
> > +     case KVM_REG_ARM_CORE:
> > +             snprintf(str, len, "CORE(id:0x%lx)", id);
> > +             break;
> > +     case KVM_REG_ARM64_SVE:
> > +             snprintf(str, len, "SVE(id:0x%lx)", id);
> > +             break;
> > +     case KVM_REG_ARM_DEMUX:
> > +             snprintf(str, len, "DEMUX(id:0x%lx)", id);
> > +             break;
> > +     case KVM_REG_ARM_FW:
> > +             snprintf(str, len, "FW_REG_%ld", id & 0xffff);
> > +             break;
> > +     case KVM_REG_ARM64_SYSREG:
> > +             if (regid_to_params(id, &p) == 0)
> > +                     snprintf(str, len, "SYSREG(%d,%d,%d,%d,%d)",
> > +                              p.Op0, p.Op1, p.CRn, p.CRm, p.Op2);
> > +             else
> > +                     snprintf(str, len, "SYSREG(id:0x%lx)", id);
> > +             break;
> > +     default:
> > +             snprintf(str, len, "KVM_REG_ARM64(id:0x%lx)", id);
> > +     }
> > +}
>
> I see that print_reg() in aarch64/get-reg-list.c implements most of
> this. Would it be possible to hoist that into a shared helper function?

Register name formats that are printed by get-reg-list.c are unnecessarily
long for some registers like below for this testing, which I don't
think is convenient
for this testing.

KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE |
KVM_REG_ARM_CORE_REG(regs.regs[0]),
KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE |
KVM_REG_ARM_CORE_REG(regs.sp),

Although I don't want to use this long format for this testing, I will
look into creating a library function on this so that it can be used
by both (or any other testings).


>
> > +/*
> > + * Save or restore registers that were saved by get_regs_vcpu_thread with
> > + * KVM_GET_ONE_REG or KVM_SET_ONE_REG. Measure time taken by this process.
> > + */
> > +static void *vcpu_thread(void *data)
> > +{
> > +     int i, ret;
> > +     struct thread_data *arg = data;
> > +     struct kvm_reg_list *reg_list = arg->reg_list;
> > +     bool is_set_reg = arg->is_set_reg;
> > +     unsigned int cmd = is_set_reg ? KVM_SET_ONE_REG : KVM_GET_ONE_REG;
> > +     char *cmd_str = is_set_reg ? "SET_ONE_REG" : "GET_ONE_REG";
>
> Why not use vcpu_{get,set}_reg() instead? They already have informative
> assertions that print the name of the ioctl and the name of the errno.

Thank you for the comment !
This comment reminded me of another problem that I wanted to fix:)
(Since TEST_ASSERT unconditionally evaluates its arguments, strerror(),
 which is internally serialized, is called from vcpu_{get,set}_reg())


> > +     struct timespec ts_reg_diff, ts_reg_start;
> > +     struct kvm_one_reg reg;
> > +
> > +     memset(&arg->time, 0, sizeof(arg->time));
> > +
> > +     /* Wait until start is set */
> > +     while (!(*arg->start))
> > +             ;
>
> Have you considered using a pthread_barrier for this?

Thank you for the proposal. I will use that.


>
> > +     /* Warm up the cache. */
> > +     reg.id = reg_list->reg[0];
> > +     reg.addr = (uint64_t)&arg->reg_vals[0];
> > +     ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, &reg);
> > +     TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id);
>
> I wouldn't expect VMMs to do this before starting migration. Do you
> think it would be better to replicate the behavior of typical userspace?

The reason why I added that was because it makes a bit of noise for
per register result (and the impact on the whole is not significant).

ytbcp26:/tmp# ./get_set_regs_perf -v8 -r| tail -n +4| sort -nk2 |tail -n 3
 SYSREG(3,0,0,1,0)                         1084        617        467
 SYSREG(3,3,14,3,2)                        2678        397       2281
 CORE(id:0x6030000000100000)               2855       1459       1396

ytbcp26:/tmp# ./get_set_regs_perf -v8 -r| tail -n +4| sort -nk2 |tail -n 3
 SYSREG(3,0,0,4,0)                          994        461        533
 SYSREG(3,0,0,1,0)                         1052        582        470
 SYSREG(3,3,14,3,2)                        2675        396       2279

So, I would prefer to keep that.

>
> > +     pr_debug("%s[%d] start\n", __func__, arg->vcpuid);
> > +
> > +     /* Run KVM_SET_ONE_REG or KVM_GET_ONE_REG for registers in reg_list */
> > +     for (i = 0; i < reg_list->n; i++) {
> > +             clock_gettime(CLOCK_MONOTONIC, &ts_reg_start);
> > +
> > +             reg.id = reg_list->reg[i];
> > +             reg.addr = (uint64_t)&arg->reg_vals[i];
> > +             ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, &reg);
> > +
> > +             ts_reg_diff = timespec_elapsed(ts_reg_start);
> > +             arg->time = timespec_add(arg->time, ts_reg_diff);
> > +             if (arg->reg_time)
> > +                     /* Need per register latency separately */
> > +                     arg->reg_time[i] = ts_reg_diff;
> > +
> > +             TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id);
> > +             pr_debug("%s: id:0x%lx, val:0x%lx\n", cmd_str, (uint64_t)reg.id,
> > +                      arg->reg_vals[i].val[0]);
> > +     }
> > +
> > +     return NULL;
> > +}
> > +
> > +void aggregate_time(uint64_t cnt, struct timespec *total, struct timespec *data)
> > +{
> > +     uint64_t i;
> > +
> > +     for (i = 0; i < cnt; i++)
> > +             total[i] = timespec_add(total[i], data[i]);
> > +}
> > +
> > +struct result_timespec {
> > +     struct timespec src;
> > +     struct timespec dst;
> > +     struct timespec *src_per_reg;
> > +     struct timespec *dst_per_reg;
> > +};
> > +
> > +/*
> > + * Run KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test.
> > + * Create a VM with the given number of vCPUs, create a thread for each
> > + * vCPU in the VM, and wait until all the threads complete KVM_GET_ONE_REG
> > + * for all the registers KVM_GET_REG_LIST returns. Then repeat the same thing
> > + * with KVM_SET_ONE_REG. Aggregate the time spent by each thread on
> > + * KVM_GET_ONE_REG and KVM_SET_ONE_REG, and return the vCPU average value to
> > + * the caller via @res.
> > + */
> > +void run_test_one(int nvcpus, struct kvm_reg_list *reg_list,
> > +               struct result_timespec *res)
> > +{
> > +     bool per_reg_data = res->src_per_reg ? true : false;
> > +     uint64_t nregs = reg_list->n;
> > +     volatile bool start = false;
> > +     int i, ret;
> > +     struct kvm_vm *src_vm, *dst_vm;
> > +     struct thread_data *targ;
> > +     struct timespec src_sum = (struct timespec){0};
> > +     struct timespec dst_sum = (struct timespec){0};
>
> Is the cast required?

No, it isn't... I will fix that.

>
> > +     if (per_reg_data) {
> > +             /* Reset the buffers to save the per register result */
> > +             memset(res->src_per_reg, 0, nregs * sizeof(struct timespec));
> > +             memset(res->dst_per_reg, 0, nregs * sizeof(struct timespec));
> > +     }
> > +
> > +     /* Create VMs to save/restore registers */
> > +     src_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL);
> > +     dst_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL);
> > +
> > +     /* Start saving registers process for src_vm */
> > +
> > +     /* Create source VM's vCPU threads */
> > +     for (i = 0; i < nvcpus; i++) {
> > +             targ = &thread_args[i];
> > +             targ->vm = src_vm;
> > +             targ->vcpuid = i;
> > +             targ->is_set_reg = false;
> > +             targ->start = &start;
> > +             targ->reg_list = reg_list;
> > +             targ->reg_vals = calloc(nregs, sizeof(struct reg_val));
> > +             TEST_ASSERT(targ->reg_vals, "Failed to allocate reg_vals");
> > +             if (per_reg_data) {
> > +                     /* Per register result buffer for the vCPU */
> > +                     targ->reg_time = calloc(nregs, sizeof(struct timespec));
> > +                     TEST_ASSERT(targ->reg_time, "Failed to allocate reg_time");
> > +             }
> > +             ret = pthread_create(&threads[i], NULL, vcpu_thread, targ);
> > +             TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret);
> > +     }
> > +
> > +     /*
> > +      * Let threads start saving registers for vCPUs, and wait for all
> > +      * threads to complete restoring registers.
> > +      */
> > +     start = true;
> > +     for (i = 0; i < nvcpus; i++) {
> > +             targ = &thread_args[i];
> > +             ret = pthread_join(threads[i], NULL);
> > +             TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret);
> > +             src_sum = timespec_add(src_sum, targ->time);
>
> I think it would make more sense to measure wall time instead of CPU
> time here. It may not be immediately obvious when running this test why
> the reported time scales as the number of vCPUs increases.

The result that is going to be shown is vCPU average (I don't want
to include the costs incurred by the test code as much as possible).
I will consider adding an optional behavior to show the worst result
among vCPUs.

>
> > +             if (per_reg_data)
> > +                     aggregate_time(nregs, res->src_per_reg, targ->reg_time);
> > +     }
> > +
> > +     start = false;
> > +     pr_debug("%s Saving registers completed.\n", __func__);
> > +
> > +     /* Start restoring registers process for dst_vm */
> > +
> > +     /* Create destination VM's vCPU threads */
> > +     for (i = 0; i < nvcpus; i++) {
> > +             targ = &thread_args[i];
> > +             /* Update fields that are different from the src case */
> > +             targ->vm = dst_vm;
> > +             targ->is_set_reg = true;
> > +             ret = pthread_create(&threads[i], NULL, vcpu_thread, targ);
> > +             TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret);
> > +     }
> > +
> > +     /*
> > +      * Let threads start saving registers for vCPUs, and wait for all
> > +      * threads to complete restoring registers.
> > +      */
> > +     start = true;
> > +     for (i = 0; i < nvcpus; i++) {
> > +             targ = &thread_args[i];
> > +             ret = pthread_join(threads[i], NULL);
> > +             TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret);
> > +
> > +             free(targ->reg_vals);
> > +             dst_sum = timespec_add(dst_sum, targ->time);
> > +             if (per_reg_data) {
> > +                     aggregate_time(nregs, res->dst_per_reg, targ->reg_time);
> > +                     free(targ->reg_time);
> > +             }
> > +     }
> > +
> > +     kvm_vm_free(src_vm);
> > +     kvm_vm_free(dst_vm);
> > +
> > +     pr_debug("%s Restoring registers completed.\n", __func__);
> > +
> > +     /* Calculate the vCPU average */
> > +     res->src = timespec_div(src_sum, nvcpus);
> > +     res->dst = timespec_div(dst_sum, nvcpus);
> > +     if (!per_reg_data)
> > +             return;
> > +
> > +     /* Calculate the vCPU average for each register */
> > +     for (i = 0; i < nregs; i++) {
> > +             res->src_per_reg[i] = timespec_div(res->src_per_reg[i], nvcpus);
> > +             res->dst_per_reg[i] = timespec_div(res->dst_per_reg[i], nvcpus);
> > +     }
> > +}
> > +
> > +/*
> > + * Run saving/restoring vCPU registers (KVM_GET_ONE_REG/KVM_SET_ONE_REG)
> > + * performance test with the given number of vCPUs (@nvcpus) for the given
> > + * number (@iterations) of times.
> > + * Each iteration of the test returns the latency of saving/restoring all
> > + * the registers that KVM_GET_REG_LIST returns (See comments for
> > + * run_test_one() for more details of each iteration of the test).
> > + * Print the average latency of each iteration, and if @per_reg_result is set,
> > + * print the average latency for each register as well.
> > + */
> > +void run_test(int nvcpus, int iterations, bool per_reg_result)
> > +{
> > +     int i, j;
> > +     uint64_t nregs;
> > +     struct result_timespec res = {0};
> > +     struct timespec src_sum = {0};
> > +     struct timespec dst_sum = {0};
> > +     struct timespec avg, src_avg, dst_avg;
> > +     struct timespec *src_per_reg_sum, *dst_per_reg_sum;
> > +     struct kvm_vm *vm;
> > +     struct kvm_reg_list *reg_list;
> > +     char reg_str[REG_STR_LEN];
> > +
> > +     /* Get kvm_reg_list */
> > +     vm = vm_create_default(0, 0, NULL);
> > +     reg_list = vcpu_get_reg_list(vm, 0);
>
> Would it make sense to test some opt-in capabilities that expose
> additional registers (PMU, SVE, etc.)?

Yes, I will implement that.

>
> > +     kvm_vm_free(vm);
> > +     nregs = reg_list->n;
> > +
> > +     if (per_reg_result) {
> > +             /*
> > +              * Allocate buffers to save latency of KVM_GET_ONE_REG
> > +              * and KVM_SET_ONE_REG respectively.
> > +              */
> > +
> > +             /* Buffers for each iteration */
> > +             res.src_per_reg = calloc(nregs, sizeof(struct timespec));
> > +             res.dst_per_reg = calloc(nregs, sizeof(struct timespec));
> > +
> > +             /* Buffers for the sum */
> > +             src_per_reg_sum = calloc(nregs, sizeof(struct timespec));
> > +             dst_per_reg_sum = calloc(nregs, sizeof(struct timespec));
> > +
> > +             TEST_ASSERT(res.src_per_reg && res.dst_per_reg &&
> > +                         src_per_reg_sum && dst_per_reg_sum,
> > +                         "Failed to allocate per register time buffer");
> > +     }
> > +
> > +     pr_info("Iterations %d (vCPUs %d, regs %lld): ",
> > +             iterations, nvcpus, reg_list->n);
> > +
> > +     for (i = 0; i < iterations; i++) {
> > +             /* Run the test */
> > +             run_test_one(nvcpus, reg_list, &res);
> > +
> > +             src_sum = timespec_add(src_sum, res.src);
> > +             dst_sum = timespec_add(dst_sum, res.dst);
> > +
> > +             if (!per_reg_result)
> > +                     continue;
> > +
> > +             /* Aggregate per register result */
> > +             for (j = 0; j < nregs; j++) {
> > +                     src_per_reg_sum[j] = timespec_add(src_per_reg_sum[j],
> > +                                                       res.src_per_reg[j]);
> > +                     dst_per_reg_sum[j] = timespec_add(dst_per_reg_sum[j],
> > +                                                       res.dst_per_reg[j]);
> > +             }
> > +     }
> > +
> > +     /* Calculate the iteration average */
> > +     src_avg = timespec_div(src_sum, iterations);
> > +     dst_avg = timespec_div(dst_sum, iterations);
> > +     avg = timespec_add(src_avg, dst_avg);
> > +
> > +     /* Print the average */
> > +     if (avg.tv_sec > 0) {
> > +             pr_info("get+set %ld.%.9lds (get %ld.%.9lds, set %ld.%.9lds)\n",
> > +                     avg.tv_sec, avg.tv_nsec,
> > +                     src_avg.tv_sec, src_avg.tv_nsec,
> > +                     dst_avg.tv_sec, dst_avg.tv_nsec);
> > +     } else {
> > +             pr_info("get+set %ldus (get %ldus, set %ldus)\n",
> > +                     N2USEC(avg.tv_nsec), N2USEC(src_avg.tv_nsec),
> > +                     N2USEC(dst_avg.tv_nsec));
> > +     }
> > +
> > +     /* Print per register result when requested */
> > +     if (per_reg_result) {
> > +             pr_info("Per Register Result:\n");
> > +             pr_info(" %-32s %13s  %9s  %9s\n", "Register", "get+set(ns)", "get(ns)", "set(ns)");
> > +             for (i = 0; i < nregs; i++) {
> > +                     /* Calculate the average for the register */
> > +                     src_avg = timespec_div(src_per_reg_sum[i], iterations);
> > +                     dst_avg = timespec_div(dst_per_reg_sum[i], iterations);
> > +                     avg = timespec_add(src_avg, dst_avg);
> > +
> > +                     assert(avg.tv_sec == 0);
> > +                     reg_id_to_str(reg_list->reg[i], reg_str, REG_STR_LEN);
> > +                     pr_info(" %-32s %13ld  %9ld  %9ld\n", reg_str,
> > +                             avg.tv_nsec, src_avg.tv_nsec, dst_avg.tv_nsec);
> > +             }
>
> Do you think it would be useful to optionally dump this granular data
> into a machine-parseable format (like CSV)? It could be helpful for
> analysis. We have several hundred exposed registers (and always
> growing!) and grokking stdout might be a bit difficult.

I am thinking of changing spaces in the output into tabs so that the
output can be parsed easily (the stdout should be redirected to the
file as needed).
Then, since I am checking the result with sort+tail/head, one output
format should be enough for both rather than adding extra options.
BTW, some register names includes ',' because they are printed
like "SYSREG(op0, op1, ...).

Thank you so much for the review!

Regards,
Reiji


>
> > +             free(res.src_per_reg);
> > +             free(res.dst_per_reg);
> > +             free(src_per_reg_sum);
> > +             free(dst_per_reg_sum);
> > +     }
> > +
> > +     free(reg_list);
> > +}
> > +
> > +int main(int argc, char *argv[])
> > +{
> > +     int opt;
> > +     int nvcpus = 1;
> > +     int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
> > +     int iterations = 1000;
> > +     bool per_reg_result = false;
> > +
> > +     while ((opt = getopt(argc, argv, "hri:v:")) != -1) {
> > +             switch (opt) {
> > +             case 'i':
> > +                     iterations = atoi(optarg);
> > +                     break;
> > +             case 'v':
> > +                     nvcpus = atoi(optarg);
> > +                     TEST_ASSERT(nvcpus > 0 && nvcpus <= max_vcpus,
> > +                                 "Number of vcpus, must be between 1 and %d",
> > +                                 max_vcpus);
> > +                     break;
> > +             case 'r':
> > +                     per_reg_result = true;
> > +                     break;
> > +             case 'h':
> > +             default:
> > +                     printf("usage: %s [-h] [-v vcpus] [-i iterations]\n",
> > +                            argv[0]);
> > +                     exit(0);
> > +             }
> > +     }
> > +
> > +     setbuf(stdout, NULL);
> > +
> > +     run_test(nvcpus, iterations, per_reg_result);
> > +
> > +     return 0;
> > +}
> > --
> > 2.35.1.473.g83b2b277ed-goog
> >
>
> --
> Best,
> Oliver
Reiji Watanabe Feb. 19, 2022, 4:50 a.m. UTC | #4
Hi Marc,

On Thu, Feb 17, 2022 at 1:12 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Thu, 17 Feb 2022 04:52:10 +0000,
> Oliver Upton <oupton@google.com> wrote:
> >
> > Hi Reiji,
> >
> > First off, thanks for looking into this! Seems like a very useful thing
> > to test :-)
> >
> > On Wed, Feb 16, 2022 at 07:49:47PM -0800, Reiji Watanabe wrote:
> > > Introduce a simple performance test of KVM_GET_ONE_REG/KVM_SET_ONE_REG
> > > for registers that are returned by KVM_GET_REG_LIST. This is a pseudo
> > > process of saving/restoring registers during live migration, and this
> > > test quantifies the performance of the process.
> > >
> > > Signed-off-by: Reiji Watanabe <reijiw@google.com>
> > > ---
> > >  tools/testing/selftests/kvm/.gitignore        |   1 +
> > >  tools/testing/selftests/kvm/Makefile          |   1 +
> > >  .../selftests/kvm/aarch64/get_set_regs_perf.c | 456 ++++++++++++++++++
> >
>
> [...]
>
> > Would it make sense to test some opt-in capabilities that expose
> > additional registers (PMU, SVE, etc.)?
>
> I think this is important. System registers are usually saved/restored

Yes, I will fix the test to include registers for opt-in features
when supported.

> in groups, and due to the way we walk the sysreg array, timings are
> unlikely to be uniform. Getting a grip on that could help restructure
> the walking if required (either per-group arrays, or maybe a tree
> structure).

The biggest system register table that I know is sys_reg_descs[],
and KVM_SET_ONE_REG/KVM_GET_ONE_REG/emulation code already uses
binary search to find the target entry.  So, the search itself
isn't that bad.  The difference between the min and the max
latency of KVM_GET_ONE_REG for the registers is always around
200nsec on Ampere Altra machine as far as I checked.


> Note that all of this could equally apply to the guest trapping (the
> walk is the same).
>
> And yes, there are a lot of commonalities with get-reg-list, so
> reusing some of the existing infrastructure would be a good thing.

I will look into that.

Thanks,
Reiji
Marc Zyngier Feb. 19, 2022, 9:42 a.m. UTC | #5
Hi Reiji,

On 2022-02-19 04:50, Reiji Watanabe wrote:
> Hi Marc,
> 
> On Thu, Feb 17, 2022 at 1:12 AM Marc Zyngier <maz@kernel.org> wrote:
>> 
>> On Thu, 17 Feb 2022 04:52:10 +0000,
>> Oliver Upton <oupton@google.com> wrote:
>> 
>> > Would it make sense to test some opt-in capabilities that expose
>> > additional registers (PMU, SVE, etc.)?
>> 
>> I think this is important. System registers are usually saved/restored
> 
> Yes, I will fix the test to include registers for opt-in features
> when supported.
> 
>> in groups, and due to the way we walk the sysreg array, timings are
>> unlikely to be uniform. Getting a grip on that could help restructure
>> the walking if required (either per-group arrays, or maybe a tree
>> structure).
> 
> The biggest system register table that I know is sys_reg_descs[],
> and KVM_SET_ONE_REG/KVM_GET_ONE_REG/emulation code already uses
> binary search to find the target entry.  So, the search itself
> isn't that bad.  The difference between the min and the max
> latency of KVM_GET_ONE_REG for the registers is always around
> 200nsec on Ampere Altra machine as far as I checked.

Even if it is OK so far, it is bound to get worse over time, as
the architecture keeps adding all sort of things that we'll
eventually have to save/restore.

I see this test as a way to monitor this trend and work out when
we need to invest in something better.

Thanks,

         M.
diff mbox series

Patch

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index dce7de7755e6..b6b18aaf9322 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -2,6 +2,7 @@ 
 /aarch64/arch_timer
 /aarch64/debug-exceptions
 /aarch64/get-reg-list
+/aarch64/get_set_regs_perf
 /aarch64/psci_cpu_on_test
 /aarch64/vgic_init
 /aarch64/vgic_irq
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 0e4926bc9a58..46a28ce89002 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -103,6 +103,7 @@  TEST_GEN_PROGS_x86_64 += system_counter_offset_test
 TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
+TEST_GEN_PROGS_aarch64 += aarch64/get_set_regs_perf
 TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_init
 TEST_GEN_PROGS_aarch64 += aarch64/vgic_irq
diff --git a/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
new file mode 100644
index 000000000000..27944ccce4a1
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/get_set_regs_perf.c
@@ -0,0 +1,456 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * get_set_regs_perf - KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test
+ *
+ * The test runs performance test of saving registers that are returned
+ * by KVM_GET_REG_LIST for vCPUs in a VM and restoring them for vCPUs
+ * in another VM (a part of behavior during blackout on live migration).
+ *
+ * Copyright (c) 2022, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define	N2USEC(nsec)	((nsec)/1000)
+#define	REG_STR_LEN	64
+
+struct reg_val {
+	 /* Big enough buffer even for the longest vector register */
+	uint64_t	val[32];
+};
+
+struct thread_data {
+	uint32_t	vcpuid;
+	bool		is_set_reg;
+	volatile bool	*start;
+	struct kvm_vm	*vm;
+	struct timespec	time;
+	struct kvm_reg_list	*reg_list;
+
+	/*
+	 * Pointer to an array of reg_val, which will hold a value of each
+	 * register. The number of elements in it is indicated in reg_list.n.
+	 */
+	struct reg_val	*reg_vals;
+
+	/*
+	 * Pointer to an array of timespec, which will hold per register
+	 * latency of KVM_GET_ONE_REG or KVM_SET_ONE_REG. The number of
+	 * elements in it is indicated in reg_list.n.
+	 */
+	struct timespec	*reg_time;
+};
+
+pthread_t threads[KVM_MAX_VCPUS];
+struct thread_data thread_args[KVM_MAX_VCPUS];
+
+struct sys_reg_params {
+	u8	Op0;
+	u8	Op1;
+	u8	CRn;
+	u8	CRm;
+	u8	Op2;
+};
+
+static int regid_to_params(u64 id, struct sys_reg_params *params)
+{
+	switch (id & KVM_REG_SIZE_MASK) {
+	case KVM_REG_SIZE_U64:
+		/* Any unused index bits means it's not valid. */
+		if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK
+			      | KVM_REG_ARM_COPROC_MASK
+			      | KVM_REG_ARM64_SYSREG_OP0_MASK
+			      | KVM_REG_ARM64_SYSREG_OP1_MASK
+			      | KVM_REG_ARM64_SYSREG_CRN_MASK
+			      | KVM_REG_ARM64_SYSREG_CRM_MASK
+			      | KVM_REG_ARM64_SYSREG_OP2_MASK))
+			return -1;
+		params->Op0 = ((id & KVM_REG_ARM64_SYSREG_OP0_MASK)
+			       >> KVM_REG_ARM64_SYSREG_OP0_SHIFT);
+		params->Op1 = ((id & KVM_REG_ARM64_SYSREG_OP1_MASK)
+			       >> KVM_REG_ARM64_SYSREG_OP1_SHIFT);
+		params->CRn = ((id & KVM_REG_ARM64_SYSREG_CRN_MASK)
+			       >> KVM_REG_ARM64_SYSREG_CRN_SHIFT);
+		params->CRm = ((id & KVM_REG_ARM64_SYSREG_CRM_MASK)
+			       >> KVM_REG_ARM64_SYSREG_CRM_SHIFT);
+		params->Op2 = ((id & KVM_REG_ARM64_SYSREG_OP2_MASK)
+			       >> KVM_REG_ARM64_SYSREG_OP2_SHIFT);
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+static void reg_id_to_str(uint64_t id, char *str, int len)
+{
+	struct sys_reg_params p;
+
+	TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64,
+		    "Not KVM_REG_ARM64 register (0x%lx)", id);
+
+	switch (id & KVM_REG_ARM_COPROC_MASK) {
+	case KVM_REG_ARM_CORE:
+		snprintf(str, len, "CORE(id:0x%lx)", id);
+		break;
+	case KVM_REG_ARM64_SVE:
+		snprintf(str, len, "SVE(id:0x%lx)", id);
+		break;
+	case KVM_REG_ARM_DEMUX:
+		snprintf(str, len, "DEMUX(id:0x%lx)", id);
+		break;
+	case KVM_REG_ARM_FW:
+		snprintf(str, len, "FW_REG_%ld", id & 0xffff);
+		break;
+	case KVM_REG_ARM64_SYSREG:
+		if (regid_to_params(id, &p) == 0)
+			snprintf(str, len, "SYSREG(%d,%d,%d,%d,%d)",
+				 p.Op0, p.Op1, p.CRn, p.CRm, p.Op2);
+		else
+			snprintf(str, len, "SYSREG(id:0x%lx)", id);
+		break;
+	default:
+		snprintf(str, len, "KVM_REG_ARM64(id:0x%lx)", id);
+	}
+}
+
+/*
+ * Save or restore registers that were saved by get_regs_vcpu_thread with
+ * KVM_GET_ONE_REG or KVM_SET_ONE_REG. Measure time taken by this process.
+ */
+static void *vcpu_thread(void *data)
+{
+	int i, ret;
+	struct thread_data *arg = data;
+	struct kvm_reg_list *reg_list = arg->reg_list;
+	bool is_set_reg = arg->is_set_reg;
+	unsigned int cmd = is_set_reg ? KVM_SET_ONE_REG : KVM_GET_ONE_REG;
+	char *cmd_str = is_set_reg ? "SET_ONE_REG" : "GET_ONE_REG";
+	struct timespec ts_reg_diff, ts_reg_start;
+	struct kvm_one_reg reg;
+
+	memset(&arg->time, 0, sizeof(arg->time));
+
+	/* Wait until start is set */
+	while (!(*arg->start))
+		;
+
+	/* Warm up the cache. */
+	reg.id = reg_list->reg[0];
+	reg.addr = (uint64_t)&arg->reg_vals[0];
+	ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, &reg);
+	TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id);
+
+	pr_debug("%s[%d] start\n", __func__, arg->vcpuid);
+
+	/* Run KVM_SET_ONE_REG or KVM_GET_ONE_REG for registers in reg_list */
+	for (i = 0; i < reg_list->n; i++) {
+		clock_gettime(CLOCK_MONOTONIC, &ts_reg_start);
+
+		reg.id = reg_list->reg[i];
+		reg.addr = (uint64_t)&arg->reg_vals[i];
+		ret = _vcpu_ioctl(arg->vm, arg->vcpuid, cmd, &reg);
+
+		ts_reg_diff = timespec_elapsed(ts_reg_start);
+		arg->time = timespec_add(arg->time, ts_reg_diff);
+		if (arg->reg_time)
+			/* Need per register latency separately */
+			arg->reg_time[i] = ts_reg_diff;
+
+		TEST_ASSERT(!ret, "%s(id:0x%llx) failed\n", cmd_str, reg.id);
+		pr_debug("%s: id:0x%lx, val:0x%lx\n", cmd_str, (uint64_t)reg.id,
+			 arg->reg_vals[i].val[0]);
+	}
+
+	return NULL;
+}
+
+void aggregate_time(uint64_t cnt, struct timespec *total, struct timespec *data)
+{
+	uint64_t i;
+
+	for (i = 0; i < cnt; i++)
+		total[i] = timespec_add(total[i], data[i]);
+}
+
+struct result_timespec {
+	struct timespec src;
+	struct timespec dst;
+	struct timespec *src_per_reg;
+	struct timespec *dst_per_reg;
+};
+
+/*
+ * Run KVM_GET_ONE_REG/KVM_SET_ONE_REG performance test.
+ * Create a VM with the given number of vCPUs, create a thread for each
+ * vCPU in the VM, and wait until all the threads complete KVM_GET_ONE_REG
+ * for all the registers KVM_GET_REG_LIST returns. Then repeat the same thing
+ * with KVM_SET_ONE_REG. Aggregate the time spent by each thread on
+ * KVM_GET_ONE_REG and KVM_SET_ONE_REG, and return the vCPU average value to
+ * the caller via @res.
+ */
+void run_test_one(int nvcpus, struct kvm_reg_list *reg_list,
+		  struct result_timespec *res)
+{
+	bool per_reg_data = res->src_per_reg ? true : false;
+	uint64_t nregs = reg_list->n;
+	volatile bool start = false;
+	int i, ret;
+	struct kvm_vm *src_vm, *dst_vm;
+	struct thread_data *targ;
+	struct timespec src_sum = (struct timespec){0};
+	struct timespec dst_sum = (struct timespec){0};
+
+	if (per_reg_data) {
+		/* Reset the buffers to save the per register result */
+		memset(res->src_per_reg, 0, nregs * sizeof(struct timespec));
+		memset(res->dst_per_reg, 0, nregs * sizeof(struct timespec));
+	}
+
+	/* Create VMs to save/restore registers */
+	src_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL);
+	dst_vm = vm_create_default_with_vcpus(nvcpus, 0, 0, NULL, NULL);
+
+	/* Start saving registers process for src_vm */
+
+	/* Create source VM's vCPU threads */
+	for (i = 0; i < nvcpus; i++) {
+		targ = &thread_args[i];
+		targ->vm = src_vm;
+		targ->vcpuid = i;
+		targ->is_set_reg = false;
+		targ->start = &start;
+		targ->reg_list = reg_list;
+		targ->reg_vals = calloc(nregs, sizeof(struct reg_val));
+		TEST_ASSERT(targ->reg_vals, "Failed to allocate reg_vals");
+		if (per_reg_data) {
+			/* Per register result buffer for the vCPU */
+			targ->reg_time = calloc(nregs, sizeof(struct timespec));
+			TEST_ASSERT(targ->reg_time, "Failed to allocate reg_time");
+		}
+		ret = pthread_create(&threads[i], NULL, vcpu_thread, targ);
+		TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret);
+	}
+
+	/*
+	 * Let threads start saving registers for vCPUs, and wait for all
+	 * threads to complete restoring registers.
+	 */
+	start = true;
+	for (i = 0; i < nvcpus; i++) {
+		targ = &thread_args[i];
+		ret = pthread_join(threads[i], NULL);
+		TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret);
+		src_sum = timespec_add(src_sum, targ->time);
+		if (per_reg_data)
+			aggregate_time(nregs, res->src_per_reg, targ->reg_time);
+	}
+
+	start = false;
+	pr_debug("%s Saving registers completed.\n", __func__);
+
+	/* Start restoring registers process for dst_vm */
+
+	/* Create destination VM's vCPU threads */
+	for (i = 0; i < nvcpus; i++) {
+		targ = &thread_args[i];
+		/* Update fields that are different from the src case */
+		targ->vm = dst_vm;
+		targ->is_set_reg = true;
+		ret = pthread_create(&threads[i], NULL, vcpu_thread, targ);
+		TEST_ASSERT(!ret, "pthread_create failed: %d\n", ret);
+	}
+
+	/*
+	 * Let threads start saving registers for vCPUs, and wait for all
+	 * threads to complete restoring registers.
+	 */
+	start = true;
+	for (i = 0; i < nvcpus; i++) {
+		targ = &thread_args[i];
+		ret = pthread_join(threads[i], NULL);
+		TEST_ASSERT(!ret, "pthread_join failed: %d\n", ret);
+
+		free(targ->reg_vals);
+		dst_sum = timespec_add(dst_sum, targ->time);
+		if (per_reg_data) {
+			aggregate_time(nregs, res->dst_per_reg, targ->reg_time);
+			free(targ->reg_time);
+		}
+	}
+
+	kvm_vm_free(src_vm);
+	kvm_vm_free(dst_vm);
+
+	pr_debug("%s Restoring registers completed.\n", __func__);
+
+	/* Calculate the vCPU average */
+	res->src = timespec_div(src_sum, nvcpus);
+	res->dst = timespec_div(dst_sum, nvcpus);
+	if (!per_reg_data)
+		return;
+
+	/* Calculate the vCPU average for each register */
+	for (i = 0; i < nregs; i++) {
+		res->src_per_reg[i] = timespec_div(res->src_per_reg[i], nvcpus);
+		res->dst_per_reg[i] = timespec_div(res->dst_per_reg[i], nvcpus);
+	}
+}
+
+/*
+ * Run saving/restoring vCPU registers (KVM_GET_ONE_REG/KVM_SET_ONE_REG)
+ * performance test with the given number of vCPUs (@nvcpus) for the given
+ * number (@iterations) of times.
+ * Each iteration of the test returns the latency of saving/restoring all
+ * the registers that KVM_GET_REG_LIST returns (See comments for
+ * run_test_one() for more details of each iteration of the test).
+ * Print the average latency of each iteration, and if @per_reg_result is set,
+ * print the average latency for each register as well.
+ */
+void run_test(int nvcpus, int iterations, bool per_reg_result)
+{
+	int i, j;
+	uint64_t nregs;
+	struct result_timespec res = {0};
+	struct timespec src_sum = {0};
+	struct timespec dst_sum = {0};
+	struct timespec avg, src_avg, dst_avg;
+	struct timespec *src_per_reg_sum, *dst_per_reg_sum;
+	struct kvm_vm *vm;
+	struct kvm_reg_list *reg_list;
+	char reg_str[REG_STR_LEN];
+
+	/* Get kvm_reg_list */
+	vm = vm_create_default(0, 0, NULL);
+	reg_list = vcpu_get_reg_list(vm, 0);
+	kvm_vm_free(vm);
+	nregs = reg_list->n;
+
+	if (per_reg_result) {
+		/*
+		 * Allocate buffers to save latency of KVM_GET_ONE_REG
+		 * and KVM_SET_ONE_REG respectively.
+		 */
+
+		/* Buffers for each iteration */
+		res.src_per_reg = calloc(nregs, sizeof(struct timespec));
+		res.dst_per_reg = calloc(nregs, sizeof(struct timespec));
+
+		/* Buffers for the sum */
+		src_per_reg_sum = calloc(nregs, sizeof(struct timespec));
+		dst_per_reg_sum = calloc(nregs, sizeof(struct timespec));
+
+		TEST_ASSERT(res.src_per_reg && res.dst_per_reg &&
+			    src_per_reg_sum && dst_per_reg_sum,
+			    "Failed to allocate per register time buffer");
+	}
+
+	pr_info("Iterations %d (vCPUs %d, regs %lld): ",
+		iterations, nvcpus, reg_list->n);
+
+	for (i = 0; i < iterations; i++) {
+		/* Run the test */
+		run_test_one(nvcpus, reg_list, &res);
+
+		src_sum = timespec_add(src_sum, res.src);
+		dst_sum = timespec_add(dst_sum, res.dst);
+
+		if (!per_reg_result)
+			continue;
+
+		/* Aggregate per register result */
+		for (j = 0; j < nregs; j++) {
+			src_per_reg_sum[j] = timespec_add(src_per_reg_sum[j],
+							  res.src_per_reg[j]);
+			dst_per_reg_sum[j] = timespec_add(dst_per_reg_sum[j],
+							  res.dst_per_reg[j]);
+		}
+	}
+
+	/* Calculate the iteration average */
+	src_avg = timespec_div(src_sum, iterations);
+	dst_avg = timespec_div(dst_sum, iterations);
+	avg = timespec_add(src_avg, dst_avg);
+
+	/* Print the average */
+	if (avg.tv_sec > 0) {
+		pr_info("get+set %ld.%.9lds (get %ld.%.9lds, set %ld.%.9lds)\n",
+			avg.tv_sec, avg.tv_nsec,
+			src_avg.tv_sec, src_avg.tv_nsec,
+			dst_avg.tv_sec, dst_avg.tv_nsec);
+	} else {
+		pr_info("get+set %ldus (get %ldus, set %ldus)\n",
+			N2USEC(avg.tv_nsec), N2USEC(src_avg.tv_nsec),
+			N2USEC(dst_avg.tv_nsec));
+	}
+
+	/* Print per register result when requested */
+	if (per_reg_result) {
+		pr_info("Per Register Result:\n");
+		pr_info(" %-32s %13s  %9s  %9s\n", "Register", "get+set(ns)", "get(ns)", "set(ns)");
+		for (i = 0; i < nregs; i++) {
+			/* Calculate the average for the register */
+			src_avg = timespec_div(src_per_reg_sum[i], iterations);
+			dst_avg = timespec_div(dst_per_reg_sum[i], iterations);
+			avg = timespec_add(src_avg, dst_avg);
+
+			assert(avg.tv_sec == 0);
+			reg_id_to_str(reg_list->reg[i], reg_str, REG_STR_LEN);
+			pr_info(" %-32s %13ld  %9ld  %9ld\n", reg_str,
+				avg.tv_nsec, src_avg.tv_nsec, dst_avg.tv_nsec);
+		}
+
+		free(res.src_per_reg);
+		free(res.dst_per_reg);
+		free(src_per_reg_sum);
+		free(dst_per_reg_sum);
+	}
+
+	free(reg_list);
+}
+
+int main(int argc, char *argv[])
+{
+	int opt;
+	int nvcpus = 1;
+	int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+	int iterations = 1000;
+	bool per_reg_result = false;
+
+	while ((opt = getopt(argc, argv, "hri:v:")) != -1) {
+		switch (opt) {
+		case 'i':
+			iterations = atoi(optarg);
+			break;
+		case 'v':
+			nvcpus = atoi(optarg);
+			TEST_ASSERT(nvcpus > 0 && nvcpus <= max_vcpus,
+				    "Number of vcpus, must be between 1 and %d",
+				    max_vcpus);
+			break;
+		case 'r':
+			per_reg_result = true;
+			break;
+		case 'h':
+		default:
+			printf("usage: %s [-h] [-v vcpus] [-i iterations]\n",
+			       argv[0]);
+			exit(0);
+		}
+	}
+
+	setbuf(stdout, NULL);
+
+	run_test(nvcpus, iterations, per_reg_result);
+
+	return 0;
+}