@@ -20,6 +20,7 @@ OBJS += trace-plugin.o
ifeq ($(VSOCK_DEFINED), 1)
OBJS += trace-timesync.o
OBJS += trace-timesync-ptp.o
+OBJS += trace-timesync-kvm.o
endif
# Additional util objects
@@ -423,6 +423,7 @@ struct tracecmd_time_sync {
int loop_interval;
pthread_mutex_t lock;
pthread_cond_t cond;
+ pthread_barrier_t first_sync;
char *clock_str;
struct tracecmd_msg_handle *msg_handle;
void *context;
@@ -439,7 +440,7 @@ const char *tracecmd_tsync_proto_select(struct tracecmd_tsync_protos *protos, ch
enum tracecmd_time_sync_role role);
bool tsync_proto_is_supported(const char *proto_name);
void tracecmd_tsync_with_host(struct tracecmd_time_sync *tsync);
-void tracecmd_tsync_with_guest(struct tracecmd_time_sync *tsync);
+int tracecmd_tsync_with_guest(struct tracecmd_time_sync *tsync);
int tracecmd_tsync_get_offsets(struct tracecmd_time_sync *tsync, int cpu,
int *count, long long **ts,
long long **offsets, long long **scalings);
@@ -49,5 +49,6 @@ int tracecmd_tsync_proto_register(const char *proto_name, int accuracy, int role
int tracecmd_tsync_proto_unregister(char *proto_name);
int ptp_clock_sync_register(void);
+int kvm_clock_sync_register(void);
#endif /* _TRACE_TSYNC_LOCAL_H */
new file mode 100644
@@ -0,0 +1,460 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2020, VMware, Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
+ *
+ */
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <dirent.h>
+
+#include "trace-cmd.h"
+#include "trace-cmd-private.h"
+#include "tracefs.h"
+#include "trace-tsync-local.h"
+
+#define KVM_DEBUG_FS "/sys/kernel/debug/kvm"
+#define KVM_DEBUG_OFFSET_FILE "tsc-offset"
+#define KVM_DEBUG_SCALING_FILE "tsc-scaling-ratio"
+#define KVM_DEBUG_VCPU_DIR "vcpu"
+
+#define KVM_SYNC_PKT_REQUEST 1
+#define KVM_SYNC_PKT_RESPONSE 2
+
+typedef __s64 s64;
+
+// Offset files have the form /sys/kernel/debug/kvm/<VM ID>/vcpu0/tsc-offset
+#define KVM_ACCURACY 0
+#define KVM_NAME "kvm"
+
+struct kvm_clock_sync {
+ int vcpu_count;
+ char **vcpu_offsets;
+ char **vcpu_scalings;
+ int marker_fd;
+ struct tep_handle *tep;
+ int raw_id;
+ unsigned long long ts;
+};
+
+struct kvm_clock_offset_msg {
+ s64 ts;
+ s64 offset;
+ s64 scaling;
+};
+
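+/* Guests can always sync; the host additionally needs the KVM debugfs. */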
+static bool kvm_support_check(bool guest)
+{
+ struct stat st;
+ int ret;
+
+ if (guest)
+ return true;
+
+ ret = stat(KVM_DEBUG_FS, &st);
+ if (ret < 0)
+ return false;
+
+ if (!S_ISDIR(st.st_mode))
+ return false;
+ return true;
+}
+
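+/* Record the tsc-offset and optional tsc-scaling-ratio paths of one vCPU. */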
+static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int cpu, char *dir_str)
+{
+ struct dirent *entry;
+ char path[PATH_MAX];
+ DIR *dir;
+
+ dir = opendir(dir_str);
+ if (!dir)
+ goto error;
+ while ((entry = readdir(dir))) {
+ if (entry->d_type != DT_DIR) {
+ if (!strncmp(entry->d_name, KVM_DEBUG_OFFSET_FILE,
+ strlen(KVM_DEBUG_OFFSET_FILE))) {
+ snprintf(path, sizeof(path), "%s/%s",
+ dir_str, entry->d_name);
+ kvm->vcpu_offsets[cpu] = strdup(path);
+ }
+ if (!strncmp(entry->d_name, KVM_DEBUG_SCALING_FILE,
+ strlen(KVM_DEBUG_SCALING_FILE))) {
+ snprintf(path, sizeof(path), "%s/%s",
+ dir_str, entry->d_name);
+ kvm->vcpu_scalings[cpu] = strdup(path);
+ }
+ }
+ }
+ if (!kvm->vcpu_offsets[cpu])
+ goto error;
+ closedir(dir);
+ return 0;
+
+error:
+ if (dir)
+ closedir(dir);
+ free(kvm->vcpu_offsets[cpu]);
+ kvm->vcpu_offsets[cpu] = NULL;
+ free(kvm->vcpu_scalings[cpu]);
+ kvm->vcpu_scalings[cpu] = NULL;
+ return -1;
+}
+
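+/*
+ * Find the guest's VM directory under the KVM debugfs (its name starts
+ * with "<pid>-") and resolve the debug file paths of every vCPU.
+ */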
+static int kvm_open_debug_files(struct kvm_clock_sync *kvm, int pid)
+{
+ char *vm_dir_str = NULL;
+ struct dirent *entry;
+ char *pid_str = NULL;
+ char path[PATH_MAX];
+ long vcpu;
+ DIR *dir;
+ int i;
+
+ dir = opendir(KVM_DEBUG_FS);
+ if (!dir)
+ goto error;
+ if (asprintf(&pid_str, "%d-", pid) <= 0)
+ goto error;
+ while ((entry = readdir(dir))) {
+ if (!(entry->d_type == DT_DIR &&
+ !strncmp(entry->d_name, pid_str, strlen(pid_str))))
+ continue;
+ asprintf(&vm_dir_str, "%s/%s", KVM_DEBUG_FS, entry->d_name);
+ break;
+ }
+ closedir(dir);
+ dir = NULL;
+ if (!vm_dir_str)
+ goto error;
+ dir = opendir(vm_dir_str);
+ if (!dir)
+ goto error;
+ while ((entry = readdir(dir))) {
+ if (!(entry->d_type == DT_DIR &&
+ !strncmp(entry->d_name, KVM_DEBUG_VCPU_DIR, strlen(KVM_DEBUG_VCPU_DIR))))
+ continue;
+ vcpu = strtol(entry->d_name + strlen(KVM_DEBUG_VCPU_DIR), NULL, 10);
+ if (vcpu < 0 || vcpu >= kvm->vcpu_count)
+ continue;
+ snprintf(path, sizeof(path), "%s/%s", vm_dir_str, entry->d_name);
+ if (kvm_open_vcpu_dir(kvm, vcpu, path) < 0)
+ goto error;
+ }
+ for (i = 0; i < kvm->vcpu_count; i++) {
+ if (!kvm->vcpu_offsets[i])
+ goto error;
+ }
+ closedir(dir);
+ free(pid_str);
+ free(vm_dir_str);
+ return 0;
+error:
+ free(pid_str);
+ free(vm_dir_str);
+ if (dir)
+ closedir(dir);
+ return -1;
+}
+
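+/* Host init: allocate per-vCPU path arrays and locate the KVM debug files. */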
+static int kvm_clock_sync_init_host(struct tracecmd_time_sync *tsync,
+ struct kvm_clock_sync *kvm)
+{
+ kvm->vcpu_count = tsync->vcpu_count;
+ kvm->vcpu_offsets = calloc(kvm->vcpu_count, sizeof(char *));
+ kvm->vcpu_scalings = calloc(kvm->vcpu_count, sizeof(char *));
+ if (!kvm->vcpu_offsets || !kvm->vcpu_scalings)
+ goto error;
+ if (kvm_open_debug_files(kvm, tsync->guest_pid) < 0)
+ goto error;
+ return 0;
+
+error:
+ free(kvm->vcpu_offsets);
+ free(kvm->vcpu_scalings);
+ return -1;
+}
+
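+/*
+ * Guest init: load the ftrace event formats, look up the raw_data event
+ * and open trace_marker_raw for writing the sync marker.
+ */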
+static int kvm_clock_sync_init_guest(struct tracecmd_time_sync *tsync,
+ struct kvm_clock_sync *kvm)
+{
+ const char *systems[] = {"ftrace", NULL};
+ struct clock_sync_context *clock_context;
+ struct tep_event *raw;
+ char *path;
+
+ clock_context = (struct clock_sync_context *)tsync->context;
+ path = tracefs_instance_get_dir(clock_context->instance);
+ if (!path)
+ goto error;
+ kvm->tep = tracefs_local_events_system(path, systems);
+ tracefs_put_tracing_file(path);
+ if (!kvm->tep)
+ goto error;
+ raw = tep_find_event_by_name(kvm->tep, "ftrace", "raw_data");
+ if (!raw)
+ goto error;
+
+ kvm->raw_id = raw->id;
+ tep_set_file_bigendian(kvm->tep, tracecmd_host_bigendian());
+ tep_set_local_bigendian(kvm->tep, tracecmd_host_bigendian());
+
+ path = tracefs_instance_get_file(clock_context->instance, "trace_marker_raw");
+ if (!path)
+ goto error;
+ kvm->marker_fd = open(path, O_WRONLY);
+ tracefs_put_tracing_file(path);
+
+ return 0;
+
+error:
+ if (kvm->tep)
+ tep_free(kvm->tep);
+ if (kvm->marker_fd >= 0)
+ close(kvm->marker_fd);
+
+ return -1;
+}
+
+static int kvm_clock_sync_init(struct tracecmd_time_sync *tsync)
+{
+ struct clock_sync_context *clock_context;
+ struct kvm_clock_sync *kvm;
+ int ret;
+
+ if (!tsync || !tsync->context)
+ return -1;
+ clock_context = (struct clock_sync_context *)tsync->context;
+
+ if (!kvm_support_check(clock_context->is_guest))
+ return -1;
+ kvm = calloc(1, sizeof(struct kvm_clock_sync));
+ if (!kvm)
+ return -1;
+ kvm->marker_fd = -1;
+ if (clock_context->is_guest)
+ ret = kvm_clock_sync_init_guest(tsync, kvm);
+ else
+ ret = kvm_clock_sync_init_host(tsync, kvm);
+ if (ret < 0)
+ goto error;
+
+ clock_context->proto_data = kvm;
+ return 0;
+
+error:
+ free(kvm);
+ return -1;
+}
+
+static int kvm_clock_sync_free(struct tracecmd_time_sync *tsync)
+{
+ struct clock_sync_context *clock_context;
+ struct kvm_clock_sync *kvm = NULL;
+ int i;
+
+ clock_context = (struct clock_sync_context *)tsync->context;
+ if (clock_context)
+ kvm = (struct kvm_clock_sync *)clock_context->proto_data;
+ if (kvm) {
+ for (i = 0; i < kvm->vcpu_count; i++) {
+ free(kvm->vcpu_offsets[i]);
+ kvm->vcpu_offsets[i] = NULL;
+ free(kvm->vcpu_scalings[i]);
+ kvm->vcpu_scalings[i] = NULL;
+ }
+ if (kvm->tep)
+ tep_free(kvm->tep);
+ if (kvm->marker_fd >= 0)
+ close(kvm->marker_fd);
+ free(kvm);
+ }
+ return -1;
+}
+
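+/* Read a decimal long long value from a debugfs file. */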
+static int read_ll_from_file(char *file, long long *res)
+{
+ char buf[32];
+ int ret;
+ int fd;
+
+ if (!file)
+ return -1;
+ fd = open(file, O_RDONLY | O_NONBLOCK);
+ if (fd < 0)
+ return -1;
+ ret = read(fd, buf, 32);
+ close(fd);
+ if (ret <= 0)
+ return -1;
+
+ *res = strtoll(buf, NULL, 10);
+
+ return 0;
+}
+
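+/*
+ * Host side of one sync point: read the vCPU's TSC offset and scaling
+ * ratio from debugfs, wait for the guest's request and reply with the
+ * negated offset.
+ */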
+static int kvm_clock_host(struct tracecmd_time_sync *tsync,
+ long long *offset, long long *scaling,
+ long long *timestamp, unsigned int cpu)
+{
+ char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH];
+ struct clock_sync_context *clock_context;
+ struct kvm_clock_offset_msg packet;
+ struct kvm_clock_sync *kvm = NULL;
+ long long kvm_scaling = 1;
+ unsigned int sync_msg;
+ long long kvm_offset;
+ unsigned int size;
+ char *msg;
+ int ret;
+
+ clock_context = (struct clock_sync_context *)tsync->context;
+ if (clock_context)
+ kvm = (struct kvm_clock_sync *)clock_context->proto_data;
+ if (!kvm || !kvm->vcpu_offsets || !kvm->vcpu_offsets[0])
+ return -1;
+ if (cpu >= kvm->vcpu_count)
+ return -1;
+ ret = read_ll_from_file(kvm->vcpu_offsets[cpu], &kvm_offset);
+ if (ret < 0)
+ return -1;
+ if (kvm->vcpu_scalings && kvm->vcpu_scalings[cpu])
+ read_ll_from_file(kvm->vcpu_scalings[cpu], &kvm_scaling);
+ msg = (char *)&packet;
+ size = sizeof(packet);
+ ret = tracecmd_msg_recv_time_sync(tsync->msg_handle,
+ sync_proto, &sync_msg,
+ &size, &msg);
+ if (ret || strncmp(sync_proto, KVM_NAME, TRACECMD_TSYNC_PNAME_LENGTH) ||
+ sync_msg != KVM_SYNC_PKT_REQUEST)
+ return -1;
+
+ packet.offset = -kvm_offset;
+ packet.scaling = kvm_scaling;
+ ret = tracecmd_msg_send_time_sync(tsync->msg_handle, KVM_NAME,
+ KVM_SYNC_PKT_RESPONSE, sizeof(packet),
+ (char *)&packet);
+ if (ret)
+ return -1;
+
+ *scaling = packet.scaling;
+ *offset = packet.offset;
+ *timestamp = packet.ts;
+
+ return 0;
+}
+
+#define KVM_EVENT_MARKER "kvm sync event"
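+/*
+ * Callback for tracefs_iterate_raw_events(): store the timestamp of the
+ * raw_data event that carries the sync marker.
+ */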
+static int kvm_marker_find(struct tep_event *event, struct tep_record *record,
+ int cpu, void *context)
+{
+ struct kvm_clock_sync *kvm = (struct kvm_clock_sync *)context;
+ struct tep_format_field *field;
+ struct tep_format_field *id;
+ char *marker;
+
+ /* Make sure this is our event */
+ if (event->id != kvm->raw_id)
+ return 0;
+ id = tep_find_field(event, "id");
+ field = tep_find_field(event, "buf");
+ if (field && id &&
+ record->size >= (id->offset + strlen(KVM_EVENT_MARKER) + 1)) {
+ marker = (char *)(record->data + id->offset);
+ if (!strcmp(marker, KVM_EVENT_MARKER)) {
+ kvm->ts = record->ts;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+
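+/*
+ * Guest side of one sync point: clear the trace buffer, write a raw
+ * marker, find its local timestamp and exchange it for the host's
+ * offset and scaling.
+ */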
+static int kvm_clock_guest(struct tracecmd_time_sync *tsync,
+ long long *offset,
+ long long *scaling,
+ long long *timestamp)
+{
+ char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH];
+ struct clock_sync_context *clock_context;
+ struct kvm_clock_offset_msg packet;
+ struct kvm_clock_sync *kvm = NULL;
+ unsigned int sync_msg;
+ unsigned int size;
+ char *msg;
+ int ret;
+
+ clock_context = (struct clock_sync_context *)tsync->context;
+ if (clock_context)
+ kvm = (struct kvm_clock_sync *)clock_context->proto_data;
+ if (!kvm)
+ return -1;
+ kvm->ts = 0;
+ memset(&packet, 0, sizeof(packet));
+ tracefs_instance_file_write(clock_context->instance, "trace", "\0");
+ write(kvm->marker_fd, KVM_EVENT_MARKER, strlen(KVM_EVENT_MARKER) + 1);
+ kvm->ts = 0;
+ tracefs_iterate_raw_events(kvm->tep, clock_context->instance,
+ NULL, 0, kvm_marker_find, kvm);
+ packet.ts = kvm->ts;
+ ret = tracecmd_msg_send_time_sync(tsync->msg_handle, KVM_NAME,
+ KVM_SYNC_PKT_REQUEST, sizeof(packet),
+ (char *)&packet);
+ if (ret)
+ return -1;
+ msg = (char *)&packet;
+ size = sizeof(packet);
+ ret = tracecmd_msg_recv_time_sync(tsync->msg_handle,
+ sync_proto, &sync_msg,
+ &size, &msg);
+ if (ret || strncmp(sync_proto, KVM_NAME, TRACECMD_TSYNC_PNAME_LENGTH) ||
+ sync_msg != KVM_SYNC_PKT_RESPONSE)
+ return -1;
+
+ *scaling = packet.scaling;
+ *offset = packet.offset;
+ *timestamp = packet.ts;
+ return 0;
+}
+
+static int kvm_clock_sync_calc(struct tracecmd_time_sync *tsync,
+ long long *offset, long long *scaling,
+ long long *timestamp, unsigned int cpu)
+{
+ struct clock_sync_context *clock_context;
+ int ret;
+
+ if (!tsync || !tsync->context)
+ return -1;
+
+ clock_context = (struct clock_sync_context *)tsync->context;
+
+ if (clock_context->is_guest)
+ ret = kvm_clock_guest(tsync, offset, scaling, timestamp);
+ else
+ ret = kvm_clock_host(tsync, offset, scaling, timestamp, cpu);
+ return ret;
+}
+
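+/*
+ * Always offer the guest role; advertise the host role and the x86-tsc
+ * clock only when the KVM debugfs is available.
+ */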
+int kvm_clock_sync_register(void)
+{
+ int role = TRACECMD_TIME_SYNC_ROLE_GUEST;
+ int clock = 0;
+
+ if (kvm_support_check(false)) {
+ role |= TRACECMD_TIME_SYNC_ROLE_HOST;
+ clock = TRACECMD_CLOCK_X86_TSC;
+ }
+ return tracecmd_tsync_proto_register(KVM_NAME, KVM_ACCURACY,
+ role, clock, 0,
+ kvm_clock_sync_init,
+ kvm_clock_sync_free,
+ kvm_clock_sync_calc);
+}
+
+int kvm_clock_sync_unregister(void)
+{
+ return tracecmd_tsync_proto_unregister(KVM_NAME);
+}
@@ -63,6 +63,7 @@ static struct tsync_proto *tsync_proto_find(const char *proto_name)
void tracecmd_tsync_init(void)
{
ptp_clock_sync_register();
+ kvm_clock_sync_register();
}
int tracecmd_tsync_proto_register(const char *proto_name, int accuracy, int roles,
@@ -433,6 +434,7 @@ void tracecmd_tsync_free(struct tracecmd_time_sync *tsync)
}
pthread_mutex_destroy(&tsync->lock);
pthread_cond_destroy(&tsync->cond);
+ pthread_barrier_destroy(&tsync->first_sync);
free(tsync->clock_str);
}
@@ -630,23 +632,24 @@ static inline void get_ts_loop_delay(struct timespec *timeout, int delay_ms)
* It loops infinite, until the timesync semaphore is released
*
*/
-void tracecmd_tsync_with_guest(struct tracecmd_time_sync *tsync)
+int tracecmd_tsync_with_guest(struct tracecmd_time_sync *tsync)
{
struct tsync_probe_request_msg probe;
int ts_array_size = CLOCK_TS_ARRAY;
struct tsync_proto *proto;
struct timespec timeout;
+ bool first = true;
bool end = false;
int ret;
int i;
proto = tsync_proto_find(tsync->proto_name);
if (!proto || !proto->clock_sync_calc)
- return;
+ return -1;
clock_context_init(tsync, false);
if (!tsync->context)
- return;
+ return -1;
if (tsync->loop_interval > 0 &&
tsync->loop_interval < (CLOCK_TS_ARRAY * 1000))
@@ -664,6 +667,10 @@ void tracecmd_tsync_with_guest(struct tracecmd_time_sync *tsync)
if (ret)
break;
}
+ if (first) {
+ first = false;
+ pthread_barrier_wait(&tsync->first_sync);
+ }
if (end || i < tsync->vcpu_count)
break;
if (tsync->loop_interval > 0) {
@@ -685,4 +692,5 @@ void tracecmd_tsync_with_guest(struct tracecmd_time_sync *tsync)
TRACECMD_TSYNC_PROTO_NONE,
TRACECMD_TIME_SYNC_CMD_STOP,
0, NULL);
+ return 0;
}
@@ -61,14 +61,19 @@ error:
static void *tsync_host_thread(void *data)
{
struct tracecmd_time_sync *tsync = NULL;
+ int ret;
tsync = (struct tracecmd_time_sync *)data;
- tracecmd_tsync_with_guest(tsync);
+ ret = tracecmd_tsync_with_guest(tsync);
tracecmd_msg_handle_close(tsync->msg_handle);
tsync->msg_handle = NULL;
+ /* tsync with guest failed, release the barrier */
+ if (ret)
+ pthread_barrier_wait(&tsync->first_sync);
+
pthread_exit(0);
}
@@ -106,6 +111,7 @@ int tracecmd_host_tsync(struct buffer_instance *instance,
instance->tsync.clock_str = strdup(top_instance.clock);
pthread_mutex_init(&instance->tsync.lock, NULL);
pthread_cond_init(&instance->tsync.cond, NULL);
+ pthread_barrier_init(&instance->tsync.first_sync, NULL, 2);
pthread_attr_init(&attrib);
pthread_attr_setdetachstate(&attrib, PTHREAD_CREATE_JOINABLE);
@@ -117,6 +123,7 @@ int tracecmd_host_tsync(struct buffer_instance *instance,
if (!get_first_cpu(&pin_mask, &mask_size))
pthread_setaffinity_np(instance->tsync_thread, mask_size, pin_mask);
instance->tsync_thread_running = true;
+ pthread_barrier_wait(&instance->tsync.first_sync);
}
if (pin_mask)
Added new timestamp synchronization plugin for KVM hosts. It reads the clock offsets directly from the KVM debug filesystem, if available. The plugin works only with x86-tsc ftrace clock. Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@gmail.com> --- lib/trace-cmd/Makefile | 1 + .../include/private/trace-cmd-private.h | 3 +- lib/trace-cmd/include/trace-tsync-local.h | 1 + lib/trace-cmd/trace-timesync-kvm.c | 460 ++++++++++++++++++ lib/trace-cmd/trace-timesync.c | 14 +- tracecmd/trace-tsync.c | 9 +- 6 files changed, 483 insertions(+), 5 deletions(-) create mode 100644 lib/trace-cmd/trace-timesync-kvm.c