diff mbox series

[11/12] pahole: Encode BTF serially in a reproducible build

Message ID 20240402193945.17327-12-acme@kernel.org (mailing list archive)
State Not Applicable
Delegated to: BPF
Headers show
Series pahole: Reproducible parallel DWARF loading/serial BTF encoding | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Arnaldo Carvalho de Melo April 2, 2024, 7:39 p.m. UTC
From: Arnaldo Carvalho de Melo <acme@redhat.com>

Now we will ask the cus instance for the next processable CU, i.e. one
that is loaded and is in the same CU order as in the original DWARF
file, under the BTF lock.

With this we can go on loading the DWARF file in parallel and only
serialize the BTF encoding, keeping that order, with this the BTF ids
end up the same both for a serial encoding:

And here are some numbers with a Release build:

  $ cat buildcmd.sh
  mkdir build
  cd build
  cmake -DCMAKE_BUILD_TYPE=Release ..
  cd ..
  make -j $(getconf _NPROCESSORS_ONLN) -C build
  $ rm -rf build
  $ ./buildcmd.sh

Its an Intel Hybrid system, and migrates to/from efficiency/perfomance
cores:

  $ getconf _NPROCESSORS_ONLN
  28
  $ grep -m1 'model name' /proc/cpuinfo
  model name	: Intel(R) Core(TM) i7-14700K
  $

8 performance cores (16 threads), 12 efficiency cores.

Serial encoding:

  $ time perf stat -e cycles -r5 pahole --btf_encode_detached=vmlinux.btf.serial vmlinux

   Performance counter stats for 'pahole --btf_encode_detached=vmlinux.btf.serial vmlinux' (5 runs):

      13,313,169,305      cpu_atom/cycles:u/      ( +- 30.61% )  (0.00%)
      27,985,776,096      cpu_core/cycles:u/      ( +-  0.17% )  (100.00%)

             5.18276 +- 0.00952 seconds time elapsed  ( +-  0.18% )

  real	0m25.937s
  user	0m25.337s
  sys	0m0.533s
  $

Parallel, but non-reproducible:

  $ time perf stat -e cycles -r5 pahole -j --btf_encode_detached=vmlinux.btf.parallel vmlinux

   Performance counter stats for 'pahole -j --btf_encode_detached=vmlinux.btf.parallel vmlinux' (5 runs):

      65,781,092,442      cpu_atom/cycles:u/      ( +-  0.99% )  (42.99%)
      88,578,827,055      cpu_core/cycles:u/      ( +-  0.90% )  (60.93%)

              1.8529 +- 0.0159 seconds time elapsed  ( +-  0.86% )

  real	0m9.293s
  user	1m21.599s
  sys	0m11.348s
  $

Now what we want, a reproducible build done using parallel DWARF loading
+ CUs-ordered-as-in-vmlinux serial BTF encoding:

  $ time perf stat -e cycles -r5 pahole -j --reproducible_build --btf_encode_detached=vmlinux.btf.parallel.reproducible_build vmlinux

   Performance counter stats for 'pahole -j --reproducible_build --btf_encode_detached=vmlinux.btf.parallel.reproducible_build vmlinux' (5 runs):

      21,255,687,225      cpu_atom/cycles:u/      ( +-  0.76% )  (35.06%)
      33,852,263,760      cpu_core/cycles:u/      ( +-  0.24% )  (72.70%)

              2.3632 +- 0.0164 seconds time elapsed  ( +-  0.69% )

  real	0m11.840s
  user	0m35.952s
  sys	0m1.534s
  $

Fastest is off course the unreproducible, fully parallel DWARF loading/
BTF encoding at 1.8529 +- 0.0159 seconds, but doing a reproducible build
in 2.3632 +- 0.0164 seconds is better than completely disabling -j/full
serial at 5.18276 +- 0.00952 seconds.

Comparing the BTF generated:

  $ bpftool btf dump file vmlinux.btf.serial > output.vmlinux.btf.serial
  $ bpftool btf dump file vmlinux.btf.parallel > output.vmlinux.btf.parallel
  $ bpftool btf dump file vmlinux.btf.parallel.reproducible > output.vmlinux.btf.parallel.reproducible

  $ wc -l output.vmlinux.btf.serial output.vmlinux.btf.parallel output.vmlinux.btf.parallel.reproducible
    313404 output.vmlinux.btf.serial
    314345 output.vmlinux.btf.parallel
    313404 output.vmlinux.btf.parallel.reproducible
    941153 total
  $

Non reproducible parallel BTF encoding:

  $ diff -u output.vmlinux.btf.serial output.vmlinux.btf.parallel | head
  --- output.vmlinux.btf.serial	2024-04-02 11:11:56.665027947 -0300
  +++ output.vmlinux.btf.parallel	2024-04-02 11:12:38.490895460 -0300
  @@ -1,1708 +1,2553 @@
   [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none)
  -[2] CONST '(anon)' type_id=1
  -[3] VOLATILE '(anon)' type_id=2
  -[4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2
  -[5] PTR '(anon)' type_id=8
  -[6] CONST '(anon)' type_id=5
  -[7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=(none)
  $

Reproducible:

  $ diff -u output.vmlinux.btf.serial output.vmlinux.btf.parallel.reproducible
  $

And using a test script that I'll add to a nascent repository of
regression tests:

  $ time tests/reproducible_build.sh vmlinux
  Parallel reproducible DWARF Loading/Serial BTF encoding: Ok

  real	1m13.844s
  user	3m3.601s
  sys	0m9.049s
  $

If the number of threads started by pahole is different than what was
requests via its -j command line option, it will fail as well as if the
output of 'bpftool btf dump' differs from the BTF encoded totally
serially to one of the detached BTF encoded using reproducible DWARF
loading/BTF encoding.

In verbose mode:

  $ time VERBOSE=1 tests/reproducible_build.sh vmlinux
  Parallel reproducible DWARF Loading/Serial BTF encoding:
  serial encoding...
  1 threads encoding
  1 threads started
  diff from serial encoding:
  -----------------------------
  2 threads encoding
  2 threads started
  diff from serial encoding:
  -----------------------------
  3 threads encoding
  3 threads started
  diff from serial encoding:
  -----------------------------
  4 threads encoding
  4 threads started
  diff from serial encoding:
  -----------------------------
  5 threads encoding
  5 threads started
  diff from serial encoding:
  -----------------------------
  6 threads encoding
  6 threads started
   diff from serial encoding:
  -----------------------------
  7 threads encoding
  7 threads started
  diff from serial encoding:
  -----------------------------
  8 threads encoding
  8 threads started
  diff from serial encoding:
  -----------------------------
  9 threads encoding
  9 threads started
  diff from serial encoding:
  -----------------------------
  10 threads encoding
  10 threads started
  diff from serial encoding:
  -----------------------------
  11 threads encoding
  11 threads started
  diff from serial encoding:
  -----------------------------
  12 threads encoding
  12 threads started
  diff from serial encoding:
  -----------------------------
  13 threads encoding
  13 threads started
  diff from serial encoding:
  -----------------------------
  14 threads encoding
  14 threads started
  diff from serial encoding:
  -----------------------------
  15 threads encoding
  15 threads started
  diff from serial encoding:
  -----------------------------
  16 threads encoding
  16 threads started
  diff from serial encoding:
  -----------------------------
  17 threads encoding
  17 threads started
  diff from serial encoding:
  -----------------------------
  18 threads encoding
  18 threads started
  diff from serial encoding:
  -----------------------------
  19 threads encoding
  19 threads started
  diff from serial encoding:
  -----------------------------
  20 threads encoding
  20 threads started
  diff from serial encoding:
  -----------------------------
  21 threads encoding
  21 threads started
  diff from serial encoding:
  -----------------------------
  22 threads encoding
  22 threads started
  diff from serial encoding:
  -----------------------------
  23 threads encoding
  23 threads started
  diff from serial encoding:
  -----------------------------
  24 threads encoding
  24 threads started
  diff from serial encoding:
  -----------------------------
  25 threads encoding
  25 threads started
  diff from serial encoding:
  -----------------------------
  26 threads encoding
  26 threads started
  diff from serial encoding:
  -----------------------------
  27 threads encoding
  27 threads started
  diff from serial encoding:
  -----------------------------
  28 threads encoding
  28 threads started
  diff from serial encoding:
  -----------------------------
  Ok

  real	1m14.800s
  user	3m4.315s
  sys	0m8.977s
  $

Cc: Alan Maguire <alan.maguire@oracle.com>
Cc: Kui-Feng Lee <kuifeng@fb.com>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 pahole.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/pahole.c b/pahole.c
index fcb4360f11debeb9..6c7e73835b3e9139 100644
--- a/pahole.c
+++ b/pahole.c
@@ -31,6 +31,7 @@ 
 
 static struct btf_encoder *btf_encoder;
 static char *detached_btf_filename;
+struct cus *cus;
 static bool btf_encode;
 static bool ctf_encode;
 static bool sort_output;
@@ -3324,11 +3325,32 @@  static enum load_steal_kind pahole_stealer(struct cu *cu,
 			encoder = btf_encoder;
 		}
 
+		// Since we don't have yet a way to parallelize the BTF encoding, we
+		// need to ask the loader for the next CU that we can process, one
+		// that is loaded and is in order, if the next one isn't yet loaded,
+		// then return to let the DWARF loader thread to load the next one,
+		// eventually all will get processed, even if when all DWARF loading
+		// threads finish.
+		if (conf_load->reproducible_build) {
+			ret = LSK__KEEPIT; // we're not processing the cu passed to this
+					  // function, so keep it.
+			cu = cus__get_next_processable_cu(cus);
+			if (cu == NULL)
+				goto out_btf;
+		}
+
 		ret = btf_encoder__encode_cu(encoder, cu, conf_load);
 		if (ret < 0) {
 			fprintf(stderr, "Encountered error while encoding BTF.\n");
 			exit(1);
 		}
+
+		if (conf_load->reproducible_build) {
+			ret = LSK__KEEPIT; // we're not processing the cu passed to this function, so keep it.
+			// Equivalent to LSK__DELETE since we processed this
+			cus__remove(cus, cu);
+			cu__delete(cu);
+		}
 out_btf:
 		if (!thr_data) // See comment about reproducibe_build above
 			pthread_mutex_unlock(&btf_lock);
@@ -3632,6 +3654,27 @@  out_free:
 	return ret;
 }
 
+static int cus__flush_reproducible_build(struct cus *cus, struct btf_encoder *encoder, struct conf_load *conf_load)
+{
+	int err = 0;
+
+	while (true) {
+		struct cu *cu = cus__get_next_processable_cu(cus);
+
+		if (cu == NULL)
+			break;
+
+		err = btf_encoder__encode_cu(encoder, cu, conf_load);
+		if (err < 0)
+			break;
+
+		cus__remove(cus, cu);
+		cu__delete(cu);
+	}
+
+	return err;
+}
+
 int main(int argc, char *argv[])
 {
 	int err, remaining, rc = EXIT_FAILURE;
@@ -3692,7 +3735,7 @@  int main(int argc, char *argv[])
 		}
 	}
 
-	struct cus *cus = cus__new();
+	cus = cus__new();
 	if (cus == NULL) {
 		fputs("pahole: insufficient memory\n", stderr);
 		goto out_dwarves_exit;
@@ -3797,6 +3840,12 @@  try_sole_arg_as_class_names:
 	header = NULL;
 
 	if (btf_encode && btf_encoder) { // maybe all CUs were filtered out and thus we don't have an encoder?
+		if (conf_load.reproducible_build &&
+		    cus__flush_reproducible_build(cus, btf_encoder, &conf_load) < 0) {
+			fprintf(stderr, "Encountered error while encoding BTF.\n");
+			exit(1);
+		}
+
 		err = btf_encoder__encode(btf_encoder);
 		if (err) {
 			fputs("Failed to encode BTF\n", stderr);