diff mbox series

[dwarves,v3,3/4] pahole: Use per-thread btf instances to avoid mutex locking.

Message ID 20220126040509.1862767-4-kuifeng@fb.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Parallelize BTF type info generating of pahole | expand

Checks

Context Check Description
bpf/vmtest-bpf-PR fail merge-conflict
netdev/tree_selection success Not a local patch

Commit Message

Kui-Feng Lee Jan. 26, 2022, 4:05 a.m. UTC
Create an instance of btf for each worker thread, and add type info to
the local btf instance in the steal-function of pahole without mutex
acquiring.  Once finished with all worker threads, merge all
per-thread btf instances to the primary btf instance.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
---
 btf_encoder.c |   5 +++
 btf_encoder.h |   2 +
 pahole.c      | 121 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 120 insertions(+), 8 deletions(-)

Comments

Andrii Nakryiko Jan. 26, 2022, 6:07 a.m. UTC | #1
On Tue, Jan 25, 2022 at 8:07 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> Create an instance of btf for each worker thread, and add type info to
> the local btf instance in the steal-function of pahole without mutex
> acquiring.  Once finished with all worker threads, merge all
> per-thread btf instances to the primary btf instance.
>
> Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> ---
>  btf_encoder.c |   5 +++
>  btf_encoder.h |   2 +
>  pahole.c      | 121 ++++++++++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 120 insertions(+), 8 deletions(-)
>
> diff --git a/btf_encoder.c b/btf_encoder.c
> index 9d015f304e92..56a76f5d7275 100644
> --- a/btf_encoder.c
> +++ b/btf_encoder.c
> @@ -1529,3 +1529,8 @@ int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu)
>  out:
>         return err;
>  }
> +
> +struct btf *btf_encoder__btf(struct btf_encoder *encoder)
> +{
> +       return encoder->btf;
> +}
> diff --git a/btf_encoder.h b/btf_encoder.h
> index f133b0d7674d..0f0eee84df74 100644
> --- a/btf_encoder.h
> +++ b/btf_encoder.h
> @@ -29,4 +29,6 @@ struct btf_encoder *btf_encoders__first(struct list_head *encoders);
>
>  struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
>
> +struct btf *btf_encoder__btf(struct btf_encoder *encoder);
> +
>  #endif /* _BTF_ENCODER_H_ */
> diff --git a/pahole.c b/pahole.c
> index f3eeaaca4cdf..525eb4d90b08 100644
> --- a/pahole.c
> +++ b/pahole.c
> @@ -29,6 +29,7 @@
>  #include "btf_encoder.h"
>
>  static struct btf_encoder *btf_encoder;
> +static pthread_mutex_t btf_encoder_lock = PTHREAD_MUTEX_INITIALIZER;
>  static char *detached_btf_filename;
>  static bool btf_encode;
>  static bool btf_gen_floats;
> @@ -2798,6 +2799,65 @@ out:
>
>  static struct type_instance *header;
>
> +struct thread_data {
> +       struct btf *btf;
> +       struct btf_encoder *encoder;
> +};
> +
> +static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data)
> +{
> +       int i;
> +       struct thread_data *threads = calloc(sizeof(struct thread_data), nr_threads);
> +
> +       for (i = 0; i < nr_threads; i++)
> +               thr_data[i] = threads + i;
> +
> +       return 0;
> +}
> +
> +static int pahole_thread_exit(struct conf_load *conf, void *thr_data)
> +{
> +       struct thread_data *thread = thr_data;
> +
> +       if (thread == NULL)
> +               return 0;
> +
> +       /*
> +        * Here we will call btf__dedup() here once we extend
> +        * btf__dedup().
> +        */
> +
> +       if (thread->encoder == btf_encoder) {
> +               /* Release the lock acuqired when created btf_encoder */

typo: acquired

> +               pthread_mutex_unlock(&btf_encoder_lock);

Splitting pthread_mutex lock/unlock like this is extremely dangerous
and error prone. If that's the price for reusing global btf_encoder
for first worker, then I'd rather not reuse btf_encoder or revert back
to doing btf__add_btf() and doing btf_encoder__delete() in the main
thread.

Please question and push back the approach and code review feedback if
something doesn't feel natural or is causing more problems than
solves.

I think the cleanest solution would be to not reuse global btf_encoder
for the first worker. I suspect the time difference isn't big, so I'd
optimize for simplicity and clean separation. But if it is causing
unnecessary slowdown, then as I said, let's just revert back to your
previous approach with doing btf__add_btf() in the main thread.

> +               return 0;
> +       }
> +
> +       pthread_mutex_lock(&btf_encoder_lock);
> +       btf__add_btf(btf_encoder__btf(btf_encoder), thread->btf);
> +       pthread_mutex_unlock(&btf_encoder_lock);
> +
> +       btf_encoder__delete(thread->encoder);
> +       thread->encoder = NULL;
> +
> +       return 0;
> +}
> +
> +static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void **thr_data,
> +                                 int error)
> +{
> +       struct thread_data **threads = (struct thread_data **)thr_data;
> +       int i;
> +
> +       for (i = 0; i < nr_threads; i++) {
> +               if (threads[i]->encoder && threads[i]->encoder != btf_encoder)
> +                       btf_encoder__delete(threads[i]->encoder);
> +       }
> +       free(threads[0]);
> +
> +       return 0;
> +}
> +
>  static enum load_steal_kind pahole_stealer(struct cu *cu,
>                                            struct conf_load *conf_load,
>                                            void *thr_data)
> @@ -2819,30 +2879,72 @@ static enum load_steal_kind pahole_stealer(struct cu *cu,
>
>         if (btf_encode) {
>                 static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER;
> +               struct btf_encoder *encoder;
>
> -               pthread_mutex_lock(&btf_lock);
>                 /*
>                  * FIXME:
>                  *
>                  * This should be really done at main(), but since in the current codebase only at this
>                  * point we'll have cu->elf setup...
>                  */
> +               pthread_mutex_lock(&btf_lock);
>                 if (!btf_encoder) {
> -                       btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, skip_encoding_btf_vars,
> -                                                      btf_encode_force, btf_gen_floats, global_verbose);
> -                       if (btf_encoder == NULL) {
> -                               ret = LSK__STOP_LOADING;
> -                               goto out_btf;
> +                       /*
> +                        * btf_encoder is the primary encoder.
> +                        * And, it is used by the thread
> +                        * create it.
> +                        */
> +                       btf_encoder = btf_encoder__new(cu, detached_btf_filename,
> +                                                      conf_load->base_btf,
> +                                                      skip_encoding_btf_vars,
> +                                                      btf_encode_force,
> +                                                      btf_gen_floats, global_verbose);
> +                       if (btf_encoder && thr_data) {
> +                               struct thread_data *thread = (struct thread_data *)thr_data;

nit: no need for the cast

> +
> +                               thread->encoder = btf_encoder;
> +                               thread->btf = btf_encoder__btf(btf_encoder);
> +                               /* Will be relased by pahole_thread_exit() */

typo: released

> +                               pthread_mutex_lock(&btf_encoder_lock);
>                         }
>                 }
> +               pthread_mutex_unlock(&btf_lock);
> +

[...]

> @@ -3207,6 +3309,9 @@ int main(int argc, char *argv[])
>         memset(tab, ' ', sizeof(tab) - 1);
>
>         conf_load.steal = pahole_stealer;
> +       conf_load.thread_exit = pahole_thread_exit;
> +       conf_load.threads_prepare = pahole_threads_prepare;
> +       conf_load.threads_collect = pahole_threads_collect;
>
>         // Make 'pahole --header type < file' a shorter form of 'pahole -C type --count 1 < file'
>         if (conf.header_type && !class_name && prettify_input) {
> --
> 2.30.2
>
Kui-Feng Lee Jan. 26, 2022, 6:39 p.m. UTC | #2
On Tue, 2022-01-25 at 22:07 -0800, Andrii Nakryiko wrote:
> On Tue, Jan 25, 2022 at 8:07 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
> > 
> > Create an instance of btf for each worker thread, and add type info
> > to
> > the local btf instance in the steal-function of pahole without
> > mutex
> > acquiring.  Once finished with all worker threads, merge all
> > per-thread btf instances to the primary btf instance.
> > 
> > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > 
............ cut ...........
> > +       struct thread_data *thread = thr_data;
> > +
> > +       if (thread == NULL)
> > +               return 0;
> > +
> > +       /*
> > +        * Here we will call btf__dedup() here once we extend
> > +        * btf__dedup().
> > +        */
> > +
> > +       if (thread->encoder == btf_encoder) {
> > +               /* Release the lock acuqired when created
> > btf_encoder */
> 
> typo: acquired
> 
> > +               pthread_mutex_unlock(&btf_encoder_lock);
> 
> Splitting pthread_mutex lock/unlock like this is extremely dangerous
> and error prone. If that's the price for reusing global btf_encoder
> for first worker, then I'd rather not reuse btf_encoder or revert
> back
> to doing btf__add_btf() and doing btf_encoder__delete() in the main
> thread.
> 
> Please question and push back the approach and code review feedback
> if
> something doesn't feel natural or is causing more problems than
> solves.
> 
> I think the cleanest solution would be to not reuse global btf_encoder
> for the first worker. I suspect the time difference isn't big, so I'd
> optimize for simplicity and clean separation. But if it is causing
> unnecessary slowdown, then as I said, let's just revert back to your
> previous approach with doing btf__add_btf() in the main thread.
> 

Your concerns make sense.
I tried the solutions w/ & w/o reusing btf_encoder.  The differencies
are obviously.  So, I will rollback to calling btf__add_btf() at the
main thread.

w/o reusing: AVG 5.78467 P50 5.88 P70 6.03 P90 6.10
w/  reusing: AVG 5.304 P50 5.12 P70 5.17 P90 5.46
Andrii Nakryiko Jan. 26, 2022, 7:54 p.m. UTC | #3
On Wed, Jan 26, 2022 at 10:39 AM Kui-Feng Lee <kuifeng@fb.com> wrote:
>
> On Tue, 2022-01-25 at 22:07 -0800, Andrii Nakryiko wrote:
> > On Tue, Jan 25, 2022 at 8:07 PM Kui-Feng Lee <kuifeng@fb.com> wrote:
> > >
> > > Create an instance of btf for each worker thread, and add type info
> > > to
> > > the local btf instance in the steal-function of pahole without
> > > mutex
> > > acquiring.  Once finished with all worker threads, merge all
> > > per-thread btf instances to the primary btf instance.
> > >
> > > Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
> > >
> ............ cut ...........
> > > +       struct thread_data *thread = thr_data;
> > > +
> > > +       if (thread == NULL)
> > > +               return 0;
> > > +
> > > +       /*
> > > +        * Here we will call btf__dedup() here once we extend
> > > +        * btf__dedup().
> > > +        */
> > > +
> > > +       if (thread->encoder == btf_encoder) {
> > > +               /* Release the lock acuqired when created
> > > btf_encoder */
> >
> > typo: acquired
> >
> > > +               pthread_mutex_unlock(&btf_encoder_lock);
> >
> > Splitting pthread_mutex lock/unlock like this is extremely dangerous
> > and error prone. If that's the price for reusing global btf_encoder
> > for first worker, then I'd rather not reuse btf_encoder or revert
> > back
> > to doing btf__add_btf() and doing btf_encoder__delete() in the main
> > thread.
> >
> > Please question and push back the approach and code review feedback
> > if
> > something doesn't feel natural or is causing more problems than
> > solves.
> >
> > I think the cleanest solution would be to not reuse global btf_encoder
> > for the first worker. I suspect the time difference isn't big, so I'd
> > optimize for simplicity and clean separation. But if it is causing
> > unnecessary slowdown, then as I said, let's just revert back to your
> > previous approach with doing btf__add_btf() in the main thread.
> >
>
> Your concerns make sense.
> I tried the solutions w/ & w/o reusing btf_encoder.  The differencies
> are obviously.  So, I will rollback to calling btf__add_btf() at the
> main thread.
>
> w/o reusing: AVG 5.78467 P50 5.88 P70 6.03 P90 6.10
> w/  reusing: AVG 5.304 P50 5.12 P70 5.17 P90 5.46
>

Half a second, wow. Ok, yeah, makes sense.

>
>
diff mbox series

Patch

diff --git a/btf_encoder.c b/btf_encoder.c
index 9d015f304e92..56a76f5d7275 100644
--- a/btf_encoder.c
+++ b/btf_encoder.c
@@ -1529,3 +1529,8 @@  int btf_encoder__encode_cu(struct btf_encoder *encoder, struct cu *cu)
 out:
 	return err;
 }
+
+struct btf *btf_encoder__btf(struct btf_encoder *encoder)
+{
+	return encoder->btf;
+}
diff --git a/btf_encoder.h b/btf_encoder.h
index f133b0d7674d..0f0eee84df74 100644
--- a/btf_encoder.h
+++ b/btf_encoder.h
@@ -29,4 +29,6 @@  struct btf_encoder *btf_encoders__first(struct list_head *encoders);
 
 struct btf_encoder *btf_encoders__next(struct btf_encoder *encoder);
 
+struct btf *btf_encoder__btf(struct btf_encoder *encoder);
+
 #endif /* _BTF_ENCODER_H_ */
diff --git a/pahole.c b/pahole.c
index f3eeaaca4cdf..525eb4d90b08 100644
--- a/pahole.c
+++ b/pahole.c
@@ -29,6 +29,7 @@ 
 #include "btf_encoder.h"
 
 static struct btf_encoder *btf_encoder;
+static pthread_mutex_t btf_encoder_lock = PTHREAD_MUTEX_INITIALIZER;
 static char *detached_btf_filename;
 static bool btf_encode;
 static bool btf_gen_floats;
@@ -2798,6 +2799,65 @@  out:
 
 static struct type_instance *header;
 
+struct thread_data {
+	struct btf *btf;
+	struct btf_encoder *encoder;
+};
+
+static int pahole_threads_prepare(struct conf_load *conf, int nr_threads, void **thr_data)
+{
+	int i;
+	struct thread_data *threads = calloc(sizeof(struct thread_data), nr_threads);
+
+	for (i = 0; i < nr_threads; i++)
+		thr_data[i] = threads + i;
+
+	return 0;
+}
+
+static int pahole_thread_exit(struct conf_load *conf, void *thr_data)
+{
+	struct thread_data *thread = thr_data;
+
+	if (thread == NULL)
+		return 0;
+
+	/*
+	 * Here we will call btf__dedup() here once we extend
+	 * btf__dedup().
+	 */
+
+	if (thread->encoder == btf_encoder) {
+		/* Release the lock acuqired when created btf_encoder */
+		pthread_mutex_unlock(&btf_encoder_lock);
+		return 0;
+	}
+
+	pthread_mutex_lock(&btf_encoder_lock);
+	btf__add_btf(btf_encoder__btf(btf_encoder), thread->btf);
+	pthread_mutex_unlock(&btf_encoder_lock);
+
+	btf_encoder__delete(thread->encoder);
+	thread->encoder = NULL;
+
+	return 0;
+}
+
+static int pahole_threads_collect(struct conf_load *conf, int nr_threads, void **thr_data,
+				  int error)
+{
+	struct thread_data **threads = (struct thread_data **)thr_data;
+	int i;
+
+	for (i = 0; i < nr_threads; i++) {
+		if (threads[i]->encoder && threads[i]->encoder != btf_encoder)
+			btf_encoder__delete(threads[i]->encoder);
+	}
+	free(threads[0]);
+
+	return 0;
+}
+
 static enum load_steal_kind pahole_stealer(struct cu *cu,
 					   struct conf_load *conf_load,
 					   void *thr_data)
@@ -2819,30 +2879,72 @@  static enum load_steal_kind pahole_stealer(struct cu *cu,
 
 	if (btf_encode) {
 		static pthread_mutex_t btf_lock = PTHREAD_MUTEX_INITIALIZER;
+		struct btf_encoder *encoder;
 
-		pthread_mutex_lock(&btf_lock);
 		/*
 		 * FIXME:
 		 *
 		 * This should be really done at main(), but since in the current codebase only at this
 		 * point we'll have cu->elf setup...
 		 */
+		pthread_mutex_lock(&btf_lock);
 		if (!btf_encoder) {
-			btf_encoder = btf_encoder__new(cu, detached_btf_filename, conf_load->base_btf, skip_encoding_btf_vars,
-						       btf_encode_force, btf_gen_floats, global_verbose);
-			if (btf_encoder == NULL) {
-				ret = LSK__STOP_LOADING;
-				goto out_btf;
+			/*
+			 * btf_encoder is the primary encoder.
+			 * And, it is used by the thread
+			 * create it.
+			 */
+			btf_encoder = btf_encoder__new(cu, detached_btf_filename,
+						       conf_load->base_btf,
+						       skip_encoding_btf_vars,
+						       btf_encode_force,
+						       btf_gen_floats, global_verbose);
+			if (btf_encoder && thr_data) {
+				struct thread_data *thread = (struct thread_data *)thr_data;
+
+				thread->encoder = btf_encoder;
+				thread->btf = btf_encoder__btf(btf_encoder);
+				/* Will be relased by pahole_thread_exit() */
+				pthread_mutex_lock(&btf_encoder_lock);
 			}
 		}
+		pthread_mutex_unlock(&btf_lock);
+
+		if (btf_encoder == NULL) {
+			ret = LSK__STOP_LOADING;
+			goto out_btf;
+		}
 
-		if (btf_encoder__encode_cu(btf_encoder, cu)) {
+		/*
+		 * thr_data keeps per-thread data for worker threads.  Each worker thread
+		 * has an encoder.  The main thread will merge the data collected by all
+		 * these encoders to btf_encoder.  However, the first thread reaching this
+		 * function creates btf_encoder and reuses it as its local encoder.  It
+		 * avoids copying the data collected by the first thread.
+		 */
+		if (thr_data) {
+			struct thread_data *thread = (struct thread_data *)thr_data;
+
+			if (thread->encoder == NULL) {
+				thread->encoder =
+					btf_encoder__new(cu, detached_btf_filename,
+							 NULL,
+							 skip_encoding_btf_vars,
+							 btf_encode_force,
+							 btf_gen_floats,
+							 global_verbose);
+				thread->btf = btf_encoder__btf(thread->encoder);
+			}
+			encoder = thread->encoder;
+		} else
+			encoder = btf_encoder;
+
+		if (btf_encoder__encode_cu(encoder, cu)) {
 			fprintf(stderr, "Encountered error while encoding BTF.\n");
 			exit(1);
 		}
 		ret = LSK__DELETE;
 out_btf:
-		pthread_mutex_unlock(&btf_lock);
 		return ret;
 	}
 #if 0
@@ -3207,6 +3309,9 @@  int main(int argc, char *argv[])
 	memset(tab, ' ', sizeof(tab) - 1);
 
 	conf_load.steal = pahole_stealer;
+	conf_load.thread_exit = pahole_thread_exit;
+	conf_load.threads_prepare = pahole_threads_prepare;
+	conf_load.threads_collect = pahole_threads_collect;
 
 	// Make 'pahole --header type < file' a shorter form of 'pahole -C type --count 1 < file'
 	if (conf.header_type && !class_name && prettify_input) {