diff mbox series

[26/34] commit-graph: deduplicate modified path Bloom filters

Message ID 20200529085038.26008-27-szeder.dev@gmail.com (mailing list archive)
State New, archived
Headers show
Series An alternative modified path Bloom filters implementation | expand

Commit Message

SZEDER Gábor May 29, 2020, 8:50 a.m. UTC
Some commits modify the same set of paths, and, consequently, have
identical modified path Bloom filters.

Use a hashmap to find these identical Bloom filters, and omit all
duplicates from the Modified Path Bloom Filters chunk, reducing its
size.

                   MPBF chunk size
                 without       with             Time spent
                    deduplication                deduping
  --------------------------------------------------------
  android-base   8620198     2618764    -69.6%    0.089s
  cmssw         10347018     9094468    -12.1%    0.056s
  cpython         526381      371629    -29.4%    0.013s
  elasticsearch  4733274     3607106    -23.8%    0.029s
  gcc            3724544     3394521     -8.9%    0.072s
  gecko-dev     20591010    17042732    -17.2%    0.235s
  git             160718      139546    -13.2%    0.004s
  glibc           759132      699623     -7.8%    0.009s
  go              458375      421394     -8.1%    0.008s
  jdk           13213208    12158533     -8.0%    0.034s
  linux         26575278    25029700     -5.8%    0.169s
  llvm-project   3988133     3269438    -18.0%    0.085s
  rails           958691      614528    -35.9%    0.015s
  rust           1985782     1682919    -15.3%    0.023s
  tensorflow     4173570     3789768     -9.2%    0.022s
  webkit         5891751     5394507     -8.4%    0.096s

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
---
 commit-graph.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 71 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/commit-graph.c b/commit-graph.c
index 56b8f33d10..178bbf0113 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -57,6 +57,7 @@ 
 
 struct modified_path_bloom_filter_info {
 	struct bloom_filter filter;
+	struct modified_path_bloom_filter_info *duplicate_of;
 	/*
 	 * The offset relative to the start of the Modified Path Bloom
 	 * Filters chunk where this Bloom filter has been written,
@@ -1099,6 +1100,9 @@  struct write_commit_graph_context {
 
 		/* Excluding embedded modified path Bloom filters */
 		uint64_t total_filter_size;
+
+		/* Used to find identical modified path Bloom filters */
+		struct hashmap dedup_hashmap;
 	} mpbfctx;
 };
 
@@ -1315,7 +1319,11 @@  static int write_graph_chunk_modified_path_bloom_filters(struct hashfile *f,
 		bfi = modified_path_bloom_filters_peek(
 				&modified_path_bloom_filters, commit);
 
-		if (!bfi || !bfi->filter.nr_bits)
+		if (!bfi)
+			continue;
+		if (bfi->duplicate_of)
+			continue;
+		if (!bfi->filter.nr_bits)
 			continue;
 		if (bfi->filter.nr_bits == GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS)
 			continue;
@@ -1364,6 +1372,9 @@  static int write_graph_chunk_modified_path_bloom_index(struct hashfile *f,
 			 */
 			filterdata[0] |= 1 << 7;
 			hashwrite(f, filterdata, sizeof(filterdata));
+		} else if (bfi->duplicate_of) {
+			uint64_t offset = htonll(bfi->duplicate_of->offset);
+			hashwrite(f, &offset, sizeof(offset));
 		} else if (bfi->offset != -1) {
 			uint64_t offset = htonll(bfi->offset);
 			hashwrite(f, &offset, sizeof(offset));
@@ -1515,6 +1526,53 @@  static void file_change_cb(struct diff_options *options,
 	handle_modified_file(options->change_fn_data, fullpath);
 }
 
+struct modified_path_bloom_filter_dedup_entry {
+	struct hashmap_entry entry;
+	struct modified_path_bloom_filter_info *bfi;
+};
+
+static int modified_path_bloom_filter_dedup_cmp(const void *unused_cmp_data,
+		const struct hashmap_entry *he1,
+		const struct hashmap_entry *he2,
+		const void *unused_keydata)
+{
+	const struct modified_path_bloom_filter_dedup_entry *a, *b;
+
+	a = container_of(he1, const struct modified_path_bloom_filter_dedup_entry, entry);
+	b = container_of(he2, const struct modified_path_bloom_filter_dedup_entry, entry);
+
+	if (a->bfi->filter.nr_bits != b->bfi->filter.nr_bits)
+		return 1;
+
+	return memcmp(a->bfi->filter.bits, b->bfi->filter.bits,
+		      bloom_filter_bytes(&a->bfi->filter));
+}
+
+static int handle_duplicate_modified_path_bloom_filter(
+		struct modified_path_bloom_filter_context *mpbfctx,
+		struct modified_path_bloom_filter_info *bfi)
+{
+	struct modified_path_bloom_filter_dedup_entry *e, stack_entry;
+	unsigned int h;
+
+	h = memhash(bfi->filter.bits, bloom_filter_bytes(&bfi->filter));
+	hashmap_entry_init(&stack_entry.entry, h);
+	stack_entry.bfi = bfi;
+
+	e = hashmap_get_entry(&mpbfctx->dedup_hashmap, &stack_entry, entry,
+			      NULL);
+	if (e) {
+		bfi->duplicate_of = e->bfi;
+		return 1;
+	} else {
+		e = xmalloc(sizeof(*e));
+		hashmap_entry_init(&e->entry, h);
+		e->bfi = bfi;
+		hashmap_add(&mpbfctx->dedup_hashmap, &e->entry);
+		return 0;
+	}
+}
+
 static void create_modified_path_bloom_filter(
 		struct modified_path_bloom_filter_context *mpbfctx,
 		struct commit *commit)
@@ -1547,17 +1605,20 @@  static void create_modified_path_bloom_filter(
 	bfi = modified_path_bloom_filters_at(&modified_path_bloom_filters,
 					     commit);
 	bfi->offset = -1;
-	if (path_component_count > mpbfctx->embedded_limit) {
+	if (path_component_count > mpbfctx->embedded_limit)
 		bloom_filter_init(&bfi->filter, mpbfctx->num_hashes,
 				  path_component_count);
-		mpbfctx->total_filter_size += sizeof(uint32_t) +
-					      bloom_filter_bytes(&bfi->filter);
-	} else
+	else
 		bloom_filter_init_with_size(&bfi->filter,
 				GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS);
 
 	bloom_filter_set_bits(&bfi->filter, mpbfctx->hashes,
 			      mpbfctx->hashes_nr);
+
+	if (path_component_count > mpbfctx->embedded_limit &&
+	    !handle_duplicate_modified_path_bloom_filter(mpbfctx, bfi))
+		mpbfctx->total_filter_size += sizeof(uint32_t) +
+					      bloom_filter_bytes(&bfi->filter);
 }
 
 static void add_missing_parents(struct write_commit_graph_context *ctx, struct commit *commit)
@@ -2396,6 +2457,8 @@  int write_commit_graph(const char *obj_dir,
 		diff_setup_done(&ctx->mpbfctx.diffopt);
 
 		strbuf_init(&ctx->mpbfctx.prev_path, PATH_MAX);
+		hashmap_init(&ctx->mpbfctx.dedup_hashmap,
+			     modified_path_bloom_filter_dedup_cmp, NULL, 0);
 	}
 
 	if (ctx->split) {
@@ -2525,6 +2588,9 @@  int write_commit_graph(const char *obj_dir,
 		strbuf_release(&ctx->mpbfctx.prev_path);
 		free(ctx->mpbfctx.hashed_prefix_lengths);
 		free(ctx->mpbfctx.hashes);
+		hashmap_free_entries(&ctx->mpbfctx.dedup_hashmap,
+				struct modified_path_bloom_filter_dedup_entry,
+				entry);
 	}
 
 	free(ctx);