@@ -26,6 +26,7 @@
#define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
#define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
#define GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_INDEX 0x4d504249 /* "MPBI" */
+#define GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_EXCLUDES 0x4d504258 /* "MPBX" */
#define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
@@ -308,6 +309,23 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
chunk_repeated = 1;
else
graph->chunk_base_graphs = data + chunk_offset;
+ break;
+
+ case GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_INDEX:
+ if (graph->chunk_mpbf_index)
+ chunk_repeated = 1;
+ else {
+ graph->chunk_mpbf_index = data + chunk_offset;
+ graph->chunk_mpbf_index_size = next_chunk_offset - chunk_offset;
+ }
+ break;
+
+ case GRAPH_CHUNKID_MODIFIED_PATH_BLOOM_FILTER_EXCLUDES:
+ if (graph->chunk_mpbf_excludes)
+ chunk_repeated = 1;
+ else
+ graph->chunk_mpbf_excludes = data + chunk_offset;
+ break;
}
if (chunk_repeated) {
@@ -324,6 +342,29 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
return NULL;
}
+ if (graph->chunk_mpbf_index) {
+ int res = 0;
+ git_config_get_bool("core.modifiedPathBloomFilters", &res);
+ if (res) {
+ uint64_t expected_size = sizeof(uint8_t) +
+ graph->num_commits * sizeof(uint64_t);
+ if (graph->chunk_mpbf_index_size != expected_size)
+ BUG(_("for %u commits the Modified Path Bloom Filter Index chunk should be %ld bytes, but it's %ld"),
+ graph->num_commits, expected_size,
+ graph->chunk_mpbf_index_size);
+ graph->num_modified_path_bloom_hashes = *graph->chunk_mpbf_index;
+ if (!graph->num_modified_path_bloom_hashes)
+ BUG(_("Modified Path Bloom Filter Index chunk using 0 hashes"));
+ if (graph->chunk_mpbf_excludes)
+ /*
+ * Modified Path Bloom Filter Excludes are
+ * not supported yet.
+ */
+ graph->chunk_mpbf_index = NULL;
+ } else
+ graph->chunk_mpbf_index = NULL;
+ }
+
return graph;
}
@@ -777,6 +818,68 @@ struct tree *get_commit_tree_in_graph(struct repository *r, const struct commit
return get_commit_tree_in_graph_one(r, r->objects->commit_graph, c);
}
+static int load_modified_path_bloom_filter_from_graph(
+ struct commit_graph *graph, struct commit *commit,
+ struct commit *parent, struct bloom_filter *bf)
+{
+ const uint8_t *bloom_index;
+ int first_parent = 0;
+
+ if (commit->graph_pos == COMMIT_NOT_FROM_GRAPH)
+ return 0;
+ if (!graph)
+ return 0;
+ if (!graph->chunk_mpbf_index)
+ return 0;
+
+ if (!commit->parents || commit->parents->item == parent)
+ first_parent = 1;
+
+ bloom_index = graph->chunk_mpbf_index + sizeof(uint8_t) +
+ sizeof(uint64_t) * commit->graph_pos;
+
+ if (bloom_index[0] & (1 << 7)) {
+ uint64_t v;
+ if (!first_parent)
+ return 0;
+ memcpy(&v, bloom_index, sizeof(v));
+ if (v == GRAPH_MODIFIED_PATH_BLOOM_FILTER_NONE)
+ return 0;
+
+ /* embedded modified path Bloom filter */
+ bf->nr_bits = GRAPH_MODIFIED_PATH_BLOOM_FILTER_EMBEDDED_NR_BITS;
+ bf->bits = (uint8_t*) bloom_index;
+ return 1;
+ }
+ /* support for non-embedded Bloom filters is not implemented yet. */
+ return 0;
+}
+
+enum bloom_result check_modified_path_bloom_filter(struct repository *r,
+ struct pathspec *pathspec, struct commit *parent,
+ struct commit *commit)
+{
+ struct bloom_filter bf;
+ int i;
+
+ if (!pathspec->can_use_modified_path_bloom_filters)
+ return BLOOM_CANT_TELL;
+ if (!load_modified_path_bloom_filter_from_graph(r->objects->commit_graph,
+ commit, parent, &bf))
+ return BLOOM_CANT_TELL;
+
+ for (i = 0; i < pathspec->nr; i++) {
+ struct pathspec_item *pi = &pathspec->items[i];
+
+ if (bloom_filter_check_bits(&bf,
+ pi->modified_path_bloom_hashes,
+ pi->modified_path_bloom_hashes_nr))
+ return BLOOM_POSSIBLY_YES;
+ }
+
+ return BLOOM_DEFINITELY_NOT;
+}
+
static uint32_t modified_path_bloom_seeds[] = {
0xe83c5163U,
0x3b376b0cU,
@@ -807,6 +910,49 @@ static void compute_modified_path_bloom_hashes_for_path(const char *path,
}
}
+void init_pathspec_bloom_fields(struct repository *r,
+ struct pathspec *pathspec)
+{
+ const unsigned bloom_compatible_magic = PATHSPEC_LITERAL;
+ struct commit_graph *graph = r->objects->commit_graph;
+ int i;
+
+ if (!graph)
+ return;
+ if (!graph->chunk_mpbf_index)
+ return;
+ if (!pathspec->nr)
+ return;
+ if (pathspec->has_wildcard)
+ return;
+ if (pathspec->magic & ~bloom_compatible_magic)
+ return;
+
+ for (i = 0; i < pathspec->nr; i++) {
+ struct pathspec_item *pi = &pathspec->items[i];
+ const char *path = pi->match;
+ size_t len = pi->len;
+
+ /*
+ * Pathspec parsing has normalized away any consecutive
+ * slashes, but a trailing slash might still be present,
+ * "remove" it.
+ */
+ if (path[len - 1] == '/')
+ len--;
+
+ pi->modified_path_bloom_hashes_nr = graph->num_modified_path_bloom_hashes;
+ ALLOC_ARRAY(pi->modified_path_bloom_hashes,
+ pi->modified_path_bloom_hashes_nr);
+
+ compute_modified_path_bloom_hashes_for_path(path, len,
+ graph->num_modified_path_bloom_hashes,
+ pi->modified_path_bloom_hashes);
+ }
+
+ pathspec->can_use_modified_path_bloom_filters = 1;
+}
+
struct packed_commit_list {
struct commit **list;
int nr;
@@ -11,6 +11,8 @@ struct commit;
struct repository;
struct raw_object_store;
struct string_list;
+struct bloom_filter;
+struct pathspec;
char *get_commit_graph_filename(const char *obj_dir);
int open_commit_graph(const char *graph_file, int *fd, struct stat *st);
@@ -38,6 +40,12 @@ void load_commit_graph_info(struct repository *r, struct commit *item);
struct tree *get_commit_tree_in_graph(struct repository *r,
const struct commit *c);
+enum bloom_result check_modified_path_bloom_filter(struct repository *r,
+ struct pathspec *pathspec, struct commit *parent,
+ struct commit *commit);
+void init_pathspec_bloom_fields(struct repository *r,
+ struct pathspec *pathspec);
+
struct commit_graph {
int graph_fd;
@@ -59,6 +67,11 @@ struct commit_graph {
const unsigned char *chunk_commit_data;
const unsigned char *chunk_extra_edges;
const unsigned char *chunk_base_graphs;
+ const unsigned char *chunk_mpbf_index;
+ uint64_t chunk_mpbf_index_size;
+ const unsigned char *chunk_mpbf_excludes;
+
+ uint8_t num_modified_path_bloom_hashes;
};
struct commit_graph *load_commit_graph_one_fd_st(int fd, struct stat *st);
@@ -406,6 +406,8 @@ static void init_pathspec_item(struct pathspec_item *item, unsigned flags,
item->attr_check = NULL;
item->attr_match = NULL;
item->attr_match_nr = 0;
+ item->modified_path_bloom_hashes_nr = 0;
+ item->modified_path_bloom_hashes = NULL;
/* PATHSPEC_LITERAL_PATH ignores magic */
if (flags & PATHSPEC_LITERAL_PATH) {
@@ -674,6 +676,12 @@ void copy_pathspec(struct pathspec *dst, const struct pathspec *src)
}
d->attr_check = attr_check_dup(s->attr_check);
+
+ ALLOC_ARRAY(d->modified_path_bloom_hashes,
+ d->modified_path_bloom_hashes_nr);
+ COPY_ARRAY(d->modified_path_bloom_hashes,
+ s->modified_path_bloom_hashes,
+ d->modified_path_bloom_hashes_nr);
}
}
@@ -684,6 +692,7 @@ void clear_pathspec(struct pathspec *pathspec)
for (i = 0; i < pathspec->nr; i++) {
free(pathspec->items[i].match);
free(pathspec->items[i].original);
+ free(pathspec->items[i].modified_path_bloom_hashes);
for (j = 0; j < pathspec->items[i].attr_match_nr; j++)
free(pathspec->items[i].attr_match[j].value);
@@ -32,6 +32,7 @@ struct pathspec {
unsigned int has_wildcard:1;
unsigned int recursive:1;
unsigned int recurse_submodules:1;
+ unsigned int can_use_modified_path_bloom_filters:1;
unsigned magic;
int max_depth;
struct pathspec_item {
@@ -52,6 +53,17 @@ struct pathspec {
} match_mode;
} *attr_match;
struct attr_check *attr_check;
+
+ /*
+ * These fields are only relevant during pathspec-limited
+ * revision walks, init_pathspec_item() leaves them
+ * zero-initialized, but copy_pathspec() copies them,
+ * and clear_pathspec() releases the allocated memory.
+ * IOW: they are only valid if 'struct pathspec's
+ * 'can_use_modified_path_bloom_filters' bit is set.
+ */
+ uint32_t modified_path_bloom_hashes_nr;
+ uint32_t *modified_path_bloom_hashes;
} *items;
};
@@ -29,6 +29,7 @@
#include "prio-queue.h"
#include "hashmap.h"
#include "utf8.h"
+#include "bloom-filter.h"
volatile show_early_output_fn_t show_early_output;
@@ -629,6 +630,7 @@ static int rev_compare_tree(struct rev_info *revs,
{
struct tree *t1 = get_commit_tree(parent);
struct tree *t2 = get_commit_tree(commit);
+ enum bloom_result bloom_ret;
if (!t1)
return REV_TREE_NEW;
@@ -653,6 +655,12 @@ static int rev_compare_tree(struct rev_info *revs,
return REV_TREE_SAME;
}
+ bloom_ret = check_modified_path_bloom_filter(revs->repo,
+ &revs->pruning.pathspec,
+ parent, commit);
+ if (bloom_ret == BLOOM_DEFINITELY_NOT)
+ return REV_TREE_SAME;
+
tree_difference = REV_TREE_SAME;
revs->pruning.flags.has_changes = 0;
diff_tree_oid(&t1->object.oid, &t2->object.oid, "", &revs->pruning);
@@ -662,10 +670,17 @@ static int rev_compare_tree(struct rev_info *revs,
static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit)
{
struct tree *t1 = get_commit_tree(commit);
+ enum bloom_result bloom_ret;
if (!t1)
return 0;
+ bloom_ret = check_modified_path_bloom_filter(revs->repo,
+ &revs->pruning.pathspec,
+ NULL, commit);
+ if (bloom_ret == BLOOM_DEFINITELY_NOT)
+ return 1;
+
tree_difference = REV_TREE_SAME;
revs->pruning.flags.has_changes = 0;
diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning);
@@ -3376,6 +3391,9 @@ int prepare_revision_walk(struct rev_info *revs)
simplify_merges(revs);
if (revs->children.name)
set_children(revs);
+
+ init_pathspec_bloom_fields(revs->repo, &revs->pruning.pathspec);
+
return 0;
}
Load the Modified Path Bloom Filter Index (MPBI) chunk and check the embedded modified path Bloom filters within to speed up pathspec-limited revison walks. Extend 'struct pathspec_item' to hold the hashes of each path for modified path Bloom filters. Pathspecs are used by many Git commands, and a lot of them doesn't do any revision walk at all, so don't initialize those fields in parse_pathspec(), but only when we are about to start a pathspec-limited revision walk, i.e. in prepare_revision_walk(). Initialize these fields only in the revs->pruning.pathspec instance, because that's the one used to limit the revision walk. Note that some revision walks re-parse the pathspecs, notably 'git log --follow' and the line-level log, but they don't use this pathspec instance. The table below shows the average runtime and speedup of git rev-list HEAD -- "$path" for 5000+ randomly selected paths from each repository with and without modified path Bloom filters: Average runtime without with MPBI MPBI -------------------------------------------- android-base 0.8780s n/a cmssw 0.3143s 0.2104s -33.1% cpython 0.7453s 0.3410s -54.2% gcc 7.1852s 3.3633s -53.2% gecko-dev 4.6113s n/a git 0.6180s 0.2184s -64.7% glibc 0.5618s 0.3245s -42.2% jdk 0.0482s 0.0395s -18.0% linux 0.7043s 0.4810s -31.6% llvm-project 2.6844s n/a rails 0.2784s 0.1792s -35.6% rust 0.7757s 0.6073s -21.7% webkit 1.9137s 1.4832s -22.5% The results in the homebrew-core repository can be spectacular, though that repository looks like as if it was specifically designed to show off the benefits of this patch: over 98% of its almost 164k commits in its linear history modify one file within the 'Formula' directory, and that directory contains almost 5000 files. Consequently, almost all commits will have a 63 bit "embedded" modified path Bloom filter containing only 2 paths, resulting in a false positive rate less than 0.1%, and they can spare a lot of expensive tree entry scanning to the end of that directory. So looking at the history of the first and last files in that directory: Runtime Max RSS without with without with MPBI MPBI Speedup MPBI MPBI ----------------------------------------------------------------- Formula/a2ps.rb 8.390s 0.218s 38.5x 422884k 129212k Formula/zzz.rb 80.495s 0.176s 450.4x 423744k 68364k ('Formula/a2ps.rb' is modified 32 times and has two false positives, while 'zzz.rb' is modified only twice and happens to have only a single false positive; that's why querying the history of the former is slower, even though it's at the beginning of the directory). Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com> --- commit-graph.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++ commit-graph.h | 13 +++++ pathspec.c | 9 +++ pathspec.h | 12 ++++ revision.c | 18 ++++++ 5 files changed, 198 insertions(+)