diff mbox series

[v2,24/26] perf callchain: Use pthread keys for tls callchain_cursor

Message ID 20230608232823.4027869-25-irogers@google.com (mailing list archive)
State New, archived
Headers show
Series Fix memory leaks (was reference count checking for thread) | expand

Commit Message

Ian Rogers June 8, 2023, 11:28 p.m. UTC
Pthread keys are more portable than __thread and allow the association
of a destructor with the key. Use the destructor to clean up TLS
callchain cursors to aid understanding memory leaks.

Signed-off-by: Ian Rogers <irogers@google.com>
---
 tools/perf/builtin-c2c.c                      |  4 +-
 tools/perf/builtin-script.c                   | 24 ++++++-----
 tools/perf/util/callchain.c                   | 40 ++++++++++++++++++-
 tools/perf/util/callchain.h                   |  4 +-
 tools/perf/util/db-export.c                   | 10 +++--
 tools/perf/util/hist.c                        | 29 ++++++++------
 .../scripting-engines/trace-event-python.c    | 10 +++--
 7 files changed, 86 insertions(+), 35 deletions(-)

Comments

Arnaldo Carvalho de Melo June 9, 2023, 7:49 p.m. UTC | #1
Em Thu, Jun 08, 2023 at 04:28:21PM -0700, Ian Rogers escreveu:
> Pthread keys are more portable than __thread and allow the association
> of a destructor with the key. Use the destructor to clean up TLS
> callchain cursors to aid understanding memory leaks.
> 
> Signed-off-by: Ian Rogers <irogers@google.com>

Had to add this, and added this to the cset commit log:

    Committer notes:

    Had to fixup a series of unconverted places and also check for the
    return of get_tls_callchain_cursor() as it may fail and return NULL.

    In that unlikely case we now either print something to a file, if the
    caller was expecting to print a callchain, or return an error code to
    state that resolving the callchain isn't possible.

    In some cases this was made easier because thread__resolve_callchain()
    already can fail for other reasons, so this new one (cursor == NULL) can
    be added and the callers don't have to explicitely check for this new
    condition.

Now building it with the containers, will continue reviewing, more eyes
on this would be most welcome...

- Arnaldo

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 96a6611e4e53f448..9714327fd0eadd1b 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -399,6 +399,7 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample)
 	struct addr_location al;
 	struct machine *machine = &kmem_session->machines.host;
 	struct callchain_cursor_node *node;
+	struct callchain_cursor *cursor;
 	u64 result = sample->ip;
 
 	addr_location__init(&al);
@@ -408,14 +409,19 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample)
 	}
 
 	al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
-	sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16);
 
-	callchain_cursor_commit(&callchain_cursor);
+	cursor = get_tls_callchain_cursor();
+	if (cursor == NULL)
+		goto out;
+
+	sample__resolve_callchain(sample, cursor, NULL, evsel, &al, 16);
+
+	callchain_cursor_commit(cursor);
 	while (true) {
 		struct alloc_func key, *caller;
 		u64 addr;
 
-		node = callchain_cursor_current(&callchain_cursor);
+		node = callchain_cursor_current(cursor);
 		if (node == NULL)
 			break;
 
@@ -434,7 +440,7 @@ static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample)
 		} else
 			pr_debug3("skipping alloc function: %s\n", caller->name);
 
-		callchain_cursor_advance(&callchain_cursor);
+		callchain_cursor_advance(cursor);
 	}
 
 	pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip);
diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c
index 2d80aef4ecccece0..14bf7a8429e76f89 100644
--- a/tools/perf/builtin-kwork.c
+++ b/tools/perf/builtin-kwork.c
@@ -589,7 +589,7 @@ static void timehist_save_callchain(struct perf_kwork *kwork,
 	struct symbol *sym;
 	struct thread *thread;
 	struct callchain_cursor_node *node;
-	struct callchain_cursor *cursor = &callchain_cursor;
+	struct callchain_cursor *cursor;
 
 	if (!kwork->show_callchain || sample->callchain == NULL)
 		return;
@@ -601,6 +601,8 @@ static void timehist_save_callchain(struct perf_kwork *kwork,
 		return;
 	}
 
+	cursor = get_tls_callchain_cursor();
+
 	if (thread__resolve_callchain(thread, cursor, evsel, sample,
 				      NULL, NULL, kwork->max_stack + 2) != 0) {
 		pr_debug("Failed to resolve callchain, skipping\n");
@@ -686,12 +688,18 @@ static void timehist_print_event(struct perf_kwork *kwork,
 	 * callchain
 	 */
 	if (kwork->show_callchain) {
+		struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+		if (cursor == NULL)
+			return;
+
 		printf(" ");
+
 		sample__fprintf_sym(sample, al, 0,
 				    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
 				    EVSEL__PRINT_CALLCHAIN_ARROW |
 				    EVSEL__PRINT_SKIP_IGNORED,
-				    &callchain_cursor, symbol_conf.bt_stop_list,
+				    cursor, symbol_conf.bt_stop_list,
 				    stdout);
 	}
 
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index fc8356bd6e3a1d99..8b505e1e5002a680 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -911,7 +911,7 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl
 				  char *buf, int size)
 {
 	struct thread *thread;
-	struct callchain_cursor *cursor = &callchain_cursor;
+	struct callchain_cursor *cursor;
 	struct machine *machine = &session->machines.host;
 	struct symbol *sym;
 	int skip = 0;
@@ -925,6 +925,8 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl
 	if (thread == NULL)
 		return -1;
 
+	cursor = get_tls_callchain_cursor();
+
 	/* use caller function name from the callchain */
 	ret = thread__resolve_callchain(thread, cursor, evsel, sample,
 					NULL, NULL, max_stack_depth);
@@ -962,7 +964,7 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl
 
 static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample)
 {
-	struct callchain_cursor *cursor = &callchain_cursor;
+	struct callchain_cursor *cursor;
 	struct machine *machine = &session->machines.host;
 	struct thread *thread;
 	u64 hash = 0;
@@ -973,6 +975,7 @@ static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample)
 	if (thread == NULL)
 		return -1;
 
+	cursor = get_tls_callchain_cursor();
 	/* use caller function name from the callchain */
 	ret = thread__resolve_callchain(thread, cursor, evsel, sample,
 					NULL, NULL, max_stack_depth);
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index cd79068200e5653e..c9ddf73689cd6c07 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -2111,7 +2111,7 @@ static void timehist_print_sample(struct perf_sched *sched,
 			    EVSEL__PRINT_SYM | EVSEL__PRINT_ONELINE |
 			    EVSEL__PRINT_CALLCHAIN_ARROW |
 			    EVSEL__PRINT_SKIP_IGNORED,
-			    &callchain_cursor, symbol_conf.bt_stop_list,  stdout);
+			    get_tls_callchain_cursor(), symbol_conf.bt_stop_list,  stdout);
 
 out:
 	printf("\n");
@@ -2196,7 +2196,7 @@ static void save_task_callchain(struct perf_sched *sched,
 				struct evsel *evsel,
 				struct machine *machine)
 {
-	struct callchain_cursor *cursor = &callchain_cursor;
+	struct callchain_cursor *cursor;
 	struct thread *thread;
 
 	/* want main thread for process - has maps */
@@ -2209,6 +2209,8 @@ static void save_task_callchain(struct perf_sched *sched,
 	if (!sched->show_callchain || sample->callchain == NULL)
 		return;
 
+	cursor = get_tls_callchain_cursor();
+
 	if (thread__resolve_callchain(thread, cursor, evsel, sample,
 				      NULL, NULL, sched->max_stack + 2) != 0) {
 		if (verbose > 0)
@@ -2338,10 +2340,16 @@ static void save_idle_callchain(struct perf_sched *sched,
 				struct idle_thread_runtime *itr,
 				struct perf_sample *sample)
 {
+	struct callchain_cursor *cursor;
+
 	if (!sched->show_callchain || sample->callchain == NULL)
 		return;
 
-	callchain_cursor__copy(&itr->cursor, &callchain_cursor);
+	cursor = get_tls_callchain_cursor();
+	if (cursor == NULL)
+		return;
+
+	callchain_cursor__copy(&itr->cursor, cursor);
 }
 
 static struct thread *timehist_get_thread(struct perf_sched *sched,
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 6a1e75f06832bf35..66d4eb185ca7428a 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2411,8 +2411,7 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel,
 }
 
 static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel,
-				    struct perf_sample *sample,
-				    struct callchain_cursor *cursor)
+				    struct perf_sample *sample, struct callchain_cursor *cursor)
 {
 	struct addr_location al;
 	int max_stack = evsel->core.attr.sample_max_stack ?
@@ -2437,7 +2436,7 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sam
 				        EVSEL__PRINT_DSO |
 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
 
-	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output);
+	return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output);
 }
 
 static const char *errno_to_name(struct evsel *evsel, int err)
@@ -2491,9 +2490,11 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
 		goto out;
 
 	if (sample->callchain) {
-		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+		struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
 		if (callchain_ret == 0) {
-			if (callchain_cursor.nr < trace->min_stack)
+			if (cursor->nr < trace->min_stack)
 				goto out;
 			callchain_ret = 1;
 		}
@@ -2795,9 +2796,11 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 
 	if (sample->callchain) {
-		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+		struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
 		if (callchain_ret == 0) {
-			if (callchain_cursor.nr < trace->min_stack)
+			if (cursor->nr < trace->min_stack)
 				goto out;
 			callchain_ret = 1;
 		}
@@ -2899,9 +2902,11 @@ static int trace__pgfault(struct trace *trace,
 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
 
 	if (sample->callchain) {
-		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
+		struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+		callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor);
 		if (callchain_ret == 0) {
-			if (callchain_cursor.nr < trace->min_stack)
+			if (cursor->nr < trace->min_stack)
 				goto out_put;
 			callchain_ret = 1;
 		}
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 7e9bd3b6be9f2655..aee937d14fbbf101 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -987,6 +987,9 @@ int callchain_append(struct callchain_root *root,
 		     struct callchain_cursor *cursor,
 		     u64 period)
 {
+	if (cursor == NULL)
+		return -1;
+
 	if (!cursor->nr)
 		return 0;
 
@@ -1601,6 +1604,8 @@ struct callchain_cursor *get_tls_callchain_cursor(void)
 	cursor = pthread_getspecific(callchain_cursor);
 	if (!cursor) {
 		cursor = zalloc(sizeof(*cursor));
+		if (!cursor)
+			pr_debug3("%s: not enough memory\n", __func__);
 		pthread_setspecific(callchain_cursor, cursor);
 	}
 	return cursor;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index ce9410018cf7a1d5..d2618a47deca8f27 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -209,6 +209,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
 /* Close a cursor writing session. Initialize for the reader */
 static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
 {
+	if (cursor == NULL)
+		return;
 	cursor->curr = cursor->first;
 	cursor->pos = 0;
 }
@@ -217,7 +219,7 @@ static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
 static inline struct callchain_cursor_node *
 callchain_cursor_current(struct callchain_cursor *cursor)
 {
-	if (cursor->pos == cursor->nr)
+	if (cursor == NULL || cursor->pos == cursor->nr)
 		return NULL;
 
 	return cursor->curr;
diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c
index cf45ca0e768fb882..8719b3cb5646614d 100644
--- a/tools/perf/util/evsel_fprintf.c
+++ b/tools/perf/util/evsel_fprintf.c
@@ -127,6 +127,9 @@ int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment,
 	char s = print_oneline ? ' ' : '\t';
 	bool first = true;
 
+	if (cursor == NULL)
+		return fprintf(fp, "<not enough memory for the callchain cursor>%s", print_oneline ? "" : "\n");
+
 	if (sample->callchain) {
 		callchain_cursor_commit(cursor);
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 4004c0915e4f471a..efaf7ac784fc8483 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1031,6 +1031,9 @@ iter_prepare_cumulative_entry(struct hist_entry_iter *iter,
 	struct hist_entry **he_cache;
 	struct callchain_cursor *cursor = get_tls_callchain_cursor();
 
+	if (cursor == NULL)
+		return -ENOMEM;
+
 	callchain_cursor_commit(cursor);
 
 	/*
@@ -1135,6 +1138,9 @@ iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 	struct callchain_cursor cursor, *tls_cursor = get_tls_callchain_cursor();
 	bool fast = hists__has(he_tmp.hists, sym);
 
+	if (tls_cursor == NULL)
+		return -ENOMEM;
+
 	callchain_cursor_snapshot(&cursor, tls_cursor);
 
 	callchain_cursor_advance(tls_cursor);
@@ -1571,6 +1577,9 @@ static int hists__hierarchy_insert_entry(struct hists *hists,
 		    symbol_conf.use_callchain) {
 			struct callchain_cursor *cursor = get_tls_callchain_cursor();
 
+			if (cursor == NULL)
+				return -1;
+
 			callchain_cursor_reset(cursor);
 			if (callchain_merge(cursor,
 					    new_he->callchain,
@@ -1615,11 +1624,13 @@ static int hists__collapse_insert_entry(struct hists *hists,
 			if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) {
 				struct callchain_cursor *cursor = get_tls_callchain_cursor();
 
-				callchain_cursor_reset(cursor);
-				if (callchain_merge(cursor,
-						    iter->callchain,
-						    he->callchain) < 0)
-					ret = -1;
+				if (cursor != NULL) {
+					callchain_cursor_reset(cursor);
+					if (callchain_merge(cursor, iter->callchain, he->callchain) < 0)
+						ret = -1;
+				} else {
+					ret = 0;
+				}
 			}
 			hist_entry__delete(he);
 			return ret;
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index bdad4b8bf77deb4e..4e62843d51b7dbf9 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -3180,6 +3180,9 @@ int thread__resolve_callchain(struct thread *thread,
 {
 	int ret = 0;
 
+	if (cursor == NULL)
+		return -ENOMEM;
+
 	callchain_cursor_reset(cursor);
 
 	if (callchain_param.order == ORDER_CALLEE) {
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index 65b761d83a1f8c3c..603091317bed9be4 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -260,6 +260,7 @@ static SV *perl_process_callchain(struct perf_sample *sample,
 				  struct evsel *evsel,
 				  struct addr_location *al)
 {
+	struct callchain_cursor *cursor;
 	AV *list;
 
 	list = newAV();
@@ -269,18 +270,20 @@ static SV *perl_process_callchain(struct perf_sample *sample,
 	if (!symbol_conf.use_callchain || !sample->callchain)
 		goto exit;
 
-	if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
+	cursor = get_tls_callchain_cursor();
+
+	if (thread__resolve_callchain(al->thread, cursor, evsel,
 				      sample, NULL, NULL, scripting_max_stack) != 0) {
 		pr_err("Failed to resolve callchain. Skipping\n");
 		goto exit;
 	}
-	callchain_cursor_commit(&callchain_cursor);
+	callchain_cursor_commit(cursor);
 
 
 	while (1) {
 		HV *elem;
 		struct callchain_cursor_node *node;
-		node = callchain_cursor_current(&callchain_cursor);
+		node = callchain_cursor_current(cursor);
 		if (!node)
 			break;
 
@@ -328,7 +331,7 @@ static SV *perl_process_callchain(struct perf_sample *sample,
 			}
 		}
 
-		callchain_cursor_advance(&callchain_cursor);
+		callchain_cursor_advance(cursor);
 		av_push(list, newRV_noinc((SV*)elem));
 	}
diff mbox series

Patch

diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index 530a44a59f41..a4cf9de7a7b5 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -284,6 +284,7 @@  static int process_sample_event(struct perf_tool *tool __maybe_unused,
 	struct hist_entry *he;
 	struct addr_location al;
 	struct mem_info *mi, *mi_dup;
+	struct callchain_cursor *cursor;
 	int ret;
 
 	addr_location__init(&al);
@@ -297,7 +298,8 @@  static int process_sample_event(struct perf_tool *tool __maybe_unused,
 	if (c2c.stitch_lbr)
 		thread__set_lbr_stitch_enable(al.thread, true);
 
-	ret = sample__resolve_callchain(sample, &callchain_cursor, NULL,
+	cursor = get_tls_callchain_cursor();
+	ret = sample__resolve_callchain(sample, cursor, NULL,
 					evsel, &al, sysctl_perf_event_max_stack);
 	if (ret)
 		goto out;
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 784d478c2e05..e3f435e6a7d0 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -1557,11 +1557,13 @@  static int perf_sample__fprintf_bts(struct perf_sample *sample,
 		unsigned int print_opts = output[type].print_ip_opts;
 		struct callchain_cursor *cursor = NULL;
 
-		if (symbol_conf.use_callchain && sample->callchain &&
-		    thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
-					      sample, NULL, NULL, scripting_max_stack) == 0)
-			cursor = &callchain_cursor;
-
+		if (symbol_conf.use_callchain && sample->callchain) {
+			cursor = get_tls_callchain_cursor();
+			if (thread__resolve_callchain(al->thread, cursor, evsel,
+						      sample, NULL, NULL,
+						      scripting_max_stack))
+				cursor = NULL;
+		}
 		if (cursor == NULL) {
 			printed += fprintf(fp, " ");
 			if (print_opts & EVSEL__PRINT_SRCLINE) {
@@ -2203,11 +2205,13 @@  static void process_event(struct perf_script *script,
 		if (script->stitch_lbr)
 			thread__set_lbr_stitch_enable(al->thread, true);
 
-		if (symbol_conf.use_callchain && sample->callchain &&
-		    thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
-					      sample, NULL, NULL, scripting_max_stack) == 0)
-			cursor = &callchain_cursor;
-
+		if (symbol_conf.use_callchain && sample->callchain) {
+			cursor = get_tls_callchain_cursor();
+			if (thread__resolve_callchain(al->thread, cursor, evsel,
+						      sample, NULL, NULL,
+						      scripting_max_stack))
+				cursor = NULL;
+		}
 		fputc(cursor ? '\n' : ' ', fp);
 		sample__fprintf_sym(sample, al, 0, output[type].print_ip_opts, cursor,
 				    symbol_conf.bt_stop_list, fp);
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 909f62b3b266..7e9bd3b6be9f 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -58,7 +58,8 @@  struct callchain_param callchain_param_default = {
 	CALLCHAIN_PARAM_DEFAULT
 };
 
-__thread struct callchain_cursor callchain_cursor;
+/* Used for thread-local struct callchain_cursor. */
+static pthread_key_t callchain_cursor;
 
 int parse_callchain_record_opt(const char *arg, struct callchain_param *param)
 {
@@ -1116,7 +1117,7 @@  int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *samp
 	if ((!symbol_conf.use_callchain || sample->callchain == NULL) &&
 		!symbol_conf.show_branchflag_count)
 		return 0;
-	return callchain_append(he->callchain, &callchain_cursor, sample->period);
+	return callchain_append(he->callchain, get_tls_callchain_cursor(), sample->period);
 }
 
 int fill_callchain_info(struct addr_location *al, struct callchain_cursor_node *node,
@@ -1570,6 +1571,41 @@  int callchain_node__make_parent_list(struct callchain_node *node)
 	return -ENOMEM;
 }
 
+static void callchain_cursor__delete(void *vcursor)
+{
+	struct callchain_cursor *cursor = vcursor;
+	struct callchain_cursor_node *node, *next;
+
+	callchain_cursor_reset(cursor);
+	for (node = cursor->first; node != NULL; node = next) {
+		next = node->next;
+		free(node);
+	}
+	free(cursor);
+}
+
+static void init_callchain_cursor_key(void)
+{
+	if (pthread_key_create(&callchain_cursor, callchain_cursor__delete)) {
+		pr_err("callchain cursor creation failed");
+		abort();
+	}
+}
+
+struct callchain_cursor *get_tls_callchain_cursor(void)
+{
+	static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+	struct callchain_cursor *cursor;
+
+	pthread_once(&once_control, init_callchain_cursor_key);
+	cursor = pthread_getspecific(callchain_cursor);
+	if (!cursor) {
+		cursor = zalloc(sizeof(*cursor));
+		pthread_setspecific(callchain_cursor, cursor);
+	}
+	return cursor;
+}
+
 int callchain_cursor__copy(struct callchain_cursor *dst,
 			   struct callchain_cursor *src)
 {
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index d95615daed73..ce9410018cf7 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -168,8 +168,6 @@  struct callchain_cursor {
 	struct callchain_cursor_node	*curr;
 };
 
-extern __thread struct callchain_cursor callchain_cursor;
-
 static inline void callchain_init(struct callchain_root *root)
 {
 	INIT_LIST_HEAD(&root->node.val);
@@ -231,6 +229,8 @@  static inline void callchain_cursor_advance(struct callchain_cursor *cursor)
 	cursor->pos++;
 }
 
+struct callchain_cursor *get_tls_callchain_cursor(void);
+
 int callchain_cursor__copy(struct callchain_cursor *dst,
 			   struct callchain_cursor *src);
 
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index 6184696dc266..b9fb71ab7a73 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -215,6 +215,7 @@  static struct call_path *call_path_from_sample(struct db_export *dbe,
 	u64 kernel_start = machine__kernel_start(machine);
 	struct call_path *current = &dbe->cpr->call_path;
 	enum chain_order saved_order = callchain_param.order;
+	struct callchain_cursor *cursor;
 	int err;
 
 	if (!symbol_conf.use_callchain || !sample->callchain)
@@ -226,13 +227,14 @@  static struct call_path *call_path_from_sample(struct db_export *dbe,
 	 * the callchain starting with the root node and ending with the leaf.
 	 */
 	callchain_param.order = ORDER_CALLER;
-	err = thread__resolve_callchain(thread, &callchain_cursor, evsel,
+	cursor = get_tls_callchain_cursor();
+	err = thread__resolve_callchain(thread, cursor, evsel,
 					sample, NULL, NULL, PERF_MAX_STACK_DEPTH);
 	if (err) {
 		callchain_param.order = saved_order;
 		return NULL;
 	}
-	callchain_cursor_commit(&callchain_cursor);
+	callchain_cursor_commit(cursor);
 
 	while (1) {
 		struct callchain_cursor_node *node;
@@ -240,7 +242,7 @@  static struct call_path *call_path_from_sample(struct db_export *dbe,
 		u64 dso_db_id = 0, sym_db_id = 0, offset = 0;
 
 
-		node = callchain_cursor_current(&callchain_cursor);
+		node = callchain_cursor_current(cursor);
 		if (!node)
 			break;
 
@@ -265,7 +267,7 @@  static struct call_path *call_path_from_sample(struct db_export *dbe,
 					     al.sym, node->ip,
 					     kernel_start);
 
-		callchain_cursor_advance(&callchain_cursor);
+		callchain_cursor_advance(cursor);
 		addr_location__exit(&al);
 	}
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index fb218b3e8a7c..4004c0915e4f 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1029,15 +1029,16 @@  iter_prepare_cumulative_entry(struct hist_entry_iter *iter,
 			      struct addr_location *al __maybe_unused)
 {
 	struct hist_entry **he_cache;
+	struct callchain_cursor *cursor = get_tls_callchain_cursor();
 
-	callchain_cursor_commit(&callchain_cursor);
+	callchain_cursor_commit(cursor);
 
 	/*
 	 * This is for detecting cycles or recursions so that they're
 	 * cumulated only one time to prevent entries more than 100%
 	 * overhead.
 	 */
-	he_cache = malloc(sizeof(*he_cache) * (callchain_cursor.nr + 1));
+	he_cache = malloc(sizeof(*he_cache) * (cursor->nr + 1));
 	if (he_cache == NULL)
 		return -ENOMEM;
 
@@ -1072,7 +1073,7 @@  iter_add_single_cumulative_entry(struct hist_entry_iter *iter,
 	 * We need to re-initialize the cursor since callchain_append()
 	 * advanced the cursor to the end.
 	 */
-	callchain_cursor_commit(&callchain_cursor);
+	callchain_cursor_commit(get_tls_callchain_cursor());
 
 	hists__inc_nr_samples(hists, he->filtered);
 
@@ -1085,7 +1086,7 @@  iter_next_cumulative_entry(struct hist_entry_iter *iter,
 {
 	struct callchain_cursor_node *node;
 
-	node = callchain_cursor_current(&callchain_cursor);
+	node = callchain_cursor_current(get_tls_callchain_cursor());
 	if (node == NULL)
 		return 0;
 
@@ -1131,12 +1132,12 @@  iter_add_next_cumulative_entry(struct hist_entry_iter *iter,
 		.raw_size = sample->raw_size,
 	};
 	int i;
-	struct callchain_cursor cursor;
+	struct callchain_cursor cursor, *tls_cursor = get_tls_callchain_cursor();
 	bool fast = hists__has(he_tmp.hists, sym);
 
-	callchain_cursor_snapshot(&cursor, &callchain_cursor);
+	callchain_cursor_snapshot(&cursor, tls_cursor);
 
-	callchain_cursor_advance(&callchain_cursor);
+	callchain_cursor_advance(tls_cursor);
 
 	/*
 	 * Check if there's duplicate entries in the callchain.
@@ -1222,7 +1223,7 @@  int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al,
 	if (al)
 		alm = map__get(al->map);
 
-	err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent,
+	err = sample__resolve_callchain(iter->sample, get_tls_callchain_cursor(), &iter->parent,
 					iter->evsel, al, max_stack_depth);
 	if (err) {
 		map__put(alm);
@@ -1568,8 +1569,10 @@  static int hists__hierarchy_insert_entry(struct hists *hists,
 
 		if (hist_entry__has_callchains(new_he) &&
 		    symbol_conf.use_callchain) {
-			callchain_cursor_reset(&callchain_cursor);
-			if (callchain_merge(&callchain_cursor,
+			struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+			callchain_cursor_reset(cursor);
+			if (callchain_merge(cursor,
 					    new_he->callchain,
 					    he->callchain) < 0)
 				ret = -1;
@@ -1610,8 +1613,10 @@  static int hists__collapse_insert_entry(struct hists *hists,
 				he_stat__add_stat(iter->stat_acc, he->stat_acc);
 
 			if (hist_entry__has_callchains(he) && symbol_conf.use_callchain) {
-				callchain_cursor_reset(&callchain_cursor);
-				if (callchain_merge(&callchain_cursor,
+				struct callchain_cursor *cursor = get_tls_callchain_cursor();
+
+				callchain_cursor_reset(cursor);
+				if (callchain_merge(cursor,
 						    iter->callchain,
 						    he->callchain) < 0)
 					ret = -1;
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index d96e5c0fef45..59063ec98619 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -417,6 +417,7 @@  static PyObject *python_process_callchain(struct perf_sample *sample,
 					 struct addr_location *al)
 {
 	PyObject *pylist;
+	struct callchain_cursor *cursor;
 
 	pylist = PyList_New(0);
 	if (!pylist)
@@ -425,19 +426,20 @@  static PyObject *python_process_callchain(struct perf_sample *sample,
 	if (!symbol_conf.use_callchain || !sample->callchain)
 		goto exit;
 
-	if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel,
+	cursor = get_tls_callchain_cursor();
+	if (thread__resolve_callchain(al->thread, cursor, evsel,
 				      sample, NULL, NULL,
 				      scripting_max_stack) != 0) {
 		pr_err("Failed to resolve callchain. Skipping\n");
 		goto exit;
 	}
-	callchain_cursor_commit(&callchain_cursor);
+	callchain_cursor_commit(cursor);
 
 
 	while (1) {
 		PyObject *pyelem;
 		struct callchain_cursor_node *node;
-		node = callchain_cursor_current(&callchain_cursor);
+		node = callchain_cursor_current(cursor);
 		if (!node)
 			break;
 
@@ -493,7 +495,7 @@  static PyObject *python_process_callchain(struct perf_sample *sample,
 					_PyUnicode_FromString(dsoname));
 		}
 
-		callchain_cursor_advance(&callchain_cursor);
+		callchain_cursor_advance(cursor);
 		PyList_Append(pylist, pyelem);
 		Py_DECREF(pyelem);
 	}