diff mbox series

[PoC -- do not apply 2/3] test-tree-bitmap: add "dump" mode

Message ID 20181009231405.GB23730@sigill.intra.peff.net (mailing list archive)
State New, archived
Headers show
Series [PoC -- do not apply 1/3] initial tree-bitmap proof of concept | expand

Commit Message

Jeff King Oct. 9, 2018, 11:14 p.m. UTC
This teaches "gen" mode (formerly the only mode) to include
the list of paths, and to prefix each bitmap with its
matching oid.

The "dump" mode can then read that back in and generate the
list of changed paths. This should be almost identical to:

  git rev-list --all |
  git diff-tree --stdin --name-only -t

The one difference is the sort order: git's diff output is
in tree-sort order, so a subtree "foo" sorts like "foo/",
which is after "foo.bar". Whereas the bitmap path list has a
true byte sort, which puts "foo.bar" after "foo".

Signed-off-by: Jeff King <peff@peff.net>
---
 t/helper/test-tree-bitmap.c | 104 +++++++++++++++++++++++++++++++++++-
 1 file changed, 102 insertions(+), 2 deletions(-)

Comments

Junio C Hamano Oct. 10, 2018, 12:48 a.m. UTC | #1
Jeff King <peff@peff.net> writes:

> The one difference is the sort order: git's diff output is
> in tree-sort order, so a subtree "foo" sorts like "foo/",
> which is after "foo.bar". Whereas the bitmap path list has a
> true byte sort, which puts "foo.bar" after "foo".

If we truly cared, it is easy enough to fix by having a custom
comparison function in 1/3 used in collect_paths() phase.

> +	/* dump it while we have the sorted order in memory */
> +	for (i = 0; i < n; i++) {
> +		printf("%s", sorted[i]->path);
> +		putchar('\0');
> +	}

With printf("%s%c", sorted[i]->path, '\0'); you can lose the braces.

> +	putchar('\0');
> +
>  	free(sorted);
>  }
>  
> @@ -142,6 +150,8 @@ static void generate_bitmap(struct diff_queue_struct *q,
>  
>  	ewah = bitmap_to_ewah(bitmap);
>  	ewah_serialize_strbuf(ewah, &out);
> +
> +	fwrite(data->commit->object.oid.hash, 1, GIT_SHA1_RAWSZ, stdout);
>  	fwrite(out.buf, 1, out.len, stdout);

OK, so per commit, we have ewah bitmap that records the "changed
paths" after the commit object name.  Makes sense.

And the list of paths is based on the "one" side of the filepair.
When we do an equivalent of "git show X", we see "diff-tree X~1 X"
and by collecting the "one" side (i.e. subset of paths in the tree
of X~1 that were modified when going to X) we say "commit X changed
these paths".  Makes sense, too.

> -int cmd_main(int argc, const char **argv)
> +static void do_gen(void)
>  {
>  	struct hashmap paths;
> -

Let's not lose this blank line.

>  	setup_git_directory();
>  	collect_paths(&paths);
>  
>  	walk_paths(generate_bitmap, &paths);
> +}
> +
> +static void do_dump(void)
> +{
> +	struct strbuf in = STRBUF_INIT;
> +	const char *cur;
> +	size_t remain;
> +
> +	const char **paths = NULL;
> +	size_t alloc_paths = 0, nr_paths = 0;
> +
> +	/* slurp stdin; in the real world we'd mmap all this */
> +	strbuf_read(&in, 0, 0);
> +	cur = in.buf;
> +	remain = in.len;
> +
> +	/* read path for each bit; in the real world this would be separate */
> +	while (remain) {
> +		const char *end = memchr(cur, '\0', remain);
> +		if (!end) {
> +			error("truncated input while reading path");
> +			goto out;
> +		}
> +		if (end == cur) {
> +			/* empty field signals end of paths */
> +			cur++;
> +			remain--;
> +			break;
> +		}
> +
> +		ALLOC_GROW(paths, nr_paths + 1, alloc_paths);
> +		paths[nr_paths++] = cur;
> +
> +		remain -= end - cur + 1;
> +		cur = end + 1;
> +	}
> +

OK.

> +	while (remain) {
> +		struct object_id oid;
> +		struct ewah_bitmap *ewah;
> +		ssize_t len;
> +
> +		if (remain < GIT_SHA1_RAWSZ) {
> +			error("truncated input reading oid");
> +			goto out;
> +		}
> +		hashcpy(oid.hash, (const unsigned char *)cur);
> +		cur += GIT_SHA1_RAWSZ;
> +		remain -= GIT_SHA1_RAWSZ;
> +
> +		ewah = ewah_new();
> +		len = ewah_read_mmap(ewah, cur, remain);
> +		if (len < 0) {
> +			ewah_free(ewah);
> +			goto out;
> +		}
> +
> +		printf("%s\n", oid_to_hex(&oid));
> +		ewah_each_bit(ewah, show_path, paths);
> +
> +		ewah_free(ewah);
> +		cur += len;
> +		remain -= len;
> +	}

Makes perfect sense.

> +out:
> +	free(paths);
> +	strbuf_release(&in);
> +}
> +
> +int cmd_main(int argc, const char **argv)
> +{
> +	const char *usage_msg = "test-tree-bitmap <gen|dump>";
> +
> +	if (!argv[1])
> +		usage(usage_msg);
> +	else if (!strcmp(argv[1], "gen"))
> +		do_gen();
> +	else if (!strcmp(argv[1], "dump"))
> +		do_dump();
> +	else
> +		usage(usage_msg);
>  
>  	return 0;
>  }
Jeff King Oct. 11, 2018, 3:13 a.m. UTC | #2
On Wed, Oct 10, 2018 at 09:48:53AM +0900, Junio C Hamano wrote:

> Jeff King <peff@peff.net> writes:
> 
> > The one difference is the sort order: git's diff output is
> > in tree-sort order, so a subtree "foo" sorts like "foo/",
> > which is after "foo.bar". Whereas the bitmap path list has a
> > true byte sort, which puts "foo.bar" after "foo".
> 
> If we truly cared, it is easy enough to fix by having a custom
> comparison function in 1/3 used in collect_paths() phase.

Yep. I thought about doing it just so I could drop this "one difference"
note, but I got lazy.

Running this on linux.git, I do see a few other differences. It looks
like my code does actually compute lists of touched paths for some
merges (presumably using "-c"). That wasn't intended, and it would
actually make my timings worse, but my goal was just to get a rough
idea of the size here (but see below).

> > +	/* dump it while we have the sorted order in memory */
> > +	for (i = 0; i < n; i++) {
> > +		printf("%s", sorted[i]->path);
> > +		putchar('\0');
> > +	}
> 
> With printf("%s%c", sorted[i]->path, '\0'); you can lose the braces.

Heh, I didn't really expect review at that level. I'm not even sure this
is a good direction to go versus something like the bloom filters (or
even a more full --raw cache). But if it is, this code is mostly
throw-away anyway, as we'd want to integrate it with the actual diff
code.

My original goal had mostly been to get an idea of the size, and the
"dump" half was there to verify that the results were roughly sane. But
it actually works for rough timing, too. I can generate roughly the same
results as "rev-list --all | diff-tree --stdin -t --name-only" in about
300ms, as opposed to 33s. So that's good.

But it's also a slight cheat, since I'm not actually traversing the
commits, but rather just opening up the bitmaps in the order we wrote
them. ;)

Actually walking the commits (and not looking at the trees) takes ~7s,
so it would at least be more like 33s versus 7.3s. With core.commitgraph,
it's more like 1.1s, so imagine 27s versus 1.4s, I guess.

That's also neglecting any load/lookup time for actual random-access to
the bitmaps. I doubt that's more than a few hundred ms, but that's just
a made-up number.

So I think the rough timings are favorable, but the real proof would
actually be using it from a revision walk, which I haven't written.

> > +	putchar('\0');
> > +
> >  	free(sorted);
> >  }
> >  
> > @@ -142,6 +150,8 @@ static void generate_bitmap(struct diff_queue_struct *q,
> >  
> >  	ewah = bitmap_to_ewah(bitmap);
> >  	ewah_serialize_strbuf(ewah, &out);
> > +
> > +	fwrite(data->commit->object.oid.hash, 1, GIT_SHA1_RAWSZ, stdout);
> >  	fwrite(out.buf, 1, out.len, stdout);
> 
> OK, so per commit, we have ewah bitmap that records the "changed
> paths" after the commit object name.  Makes sense.

Yeah. This format, btw, is garbage. It was just the smallest and
simplest thing I could think of that would work for my case. We'd want
random-access to the bitmaps for each commit, probably via an index
block in the commit-graph file.

> And the list of paths is based on the "one" side of the filepair.
> When we do an equivalent of "git show X", we see "diff-tree X~1 X"
> and by collecting the "one" side (i.e. subset of paths in the tree
> of X~1 that were modified when going to X) we say "commit X changed
> these paths".  Makes sense, too.

I didn't think too hard about whether we'd ever need to look at the "two"
side. I turned off renames, so we'd see deletions via the "one" side. I
feel like we'd miss additions in that case, but from my results, we do
not seem to.

-Peff
diff mbox series

Patch

diff --git a/t/helper/test-tree-bitmap.c b/t/helper/test-tree-bitmap.c
index bc5cf0e514..6f8833344a 100644
--- a/t/helper/test-tree-bitmap.c
+++ b/t/helper/test-tree-bitmap.c
@@ -112,6 +112,14 @@  static void collect_paths(struct hashmap *paths)
 	QSORT(sorted, i, pathmap_entry_strcmp);
 	for (i = 0; i < n; i++)
 		sorted[i]->pos = i;
+
+	/* dump it while we have the sorted order in memory */
+	for (i = 0; i < n; i++) {
+		printf("%s", sorted[i]->path);
+		putchar('\0');
+	}
+	putchar('\0');
+
 	free(sorted);
 }
 
@@ -142,6 +150,8 @@  static void generate_bitmap(struct diff_queue_struct *q,
 
 	ewah = bitmap_to_ewah(bitmap);
 	ewah_serialize_strbuf(ewah, &out);
+
+	fwrite(data->commit->object.oid.hash, 1, GIT_SHA1_RAWSZ, stdout);
 	fwrite(out.buf, 1, out.len, stdout);
 
 	trace_printf("bitmap %s %u %u",
@@ -154,14 +164,104 @@  static void generate_bitmap(struct diff_queue_struct *q,
 	bitmap_free(bitmap);
 }
 
-int cmd_main(int argc, const char **argv)
+static void do_gen(void)
 {
 	struct hashmap paths;
-
 	setup_git_directory();
 	collect_paths(&paths);
 
 	walk_paths(generate_bitmap, &paths);
+}
+
+static void show_path(size_t pos, void *data)
+{
+	const char **paths = data;
+
+	/* assert(pos < nr_paths), but we didn't pass the latter in */
+	printf("%s\n", paths[pos]);
+}
+
+static void do_dump(void)
+{
+	struct strbuf in = STRBUF_INIT;
+	const char *cur;
+	size_t remain;
+
+	const char **paths = NULL;
+	size_t alloc_paths = 0, nr_paths = 0;
+
+	/* slurp stdin; in the real world we'd mmap all this */
+	strbuf_read(&in, 0, 0);
+	cur = in.buf;
+	remain = in.len;
+
+	/* read path for each bit; in the real world this would be separate */
+	while (remain) {
+		const char *end = memchr(cur, '\0', remain);
+		if (!end) {
+			error("truncated input while reading path");
+			goto out;
+		}
+		if (end == cur) {
+			/* empty field signals end of paths */
+			cur++;
+			remain--;
+			break;
+		}
+
+		ALLOC_GROW(paths, nr_paths + 1, alloc_paths);
+		paths[nr_paths++] = cur;
+
+		remain -= end - cur + 1;
+		cur = end + 1;
+	}
+
+	/* read the bitmap for each commit */
+	while (remain) {
+		struct object_id oid;
+		struct ewah_bitmap *ewah;
+		ssize_t len;
+
+		if (remain < GIT_SHA1_RAWSZ) {
+			error("truncated input reading oid");
+			goto out;
+		}
+		hashcpy(oid.hash, (const unsigned char *)cur);
+		cur += GIT_SHA1_RAWSZ;
+		remain -= GIT_SHA1_RAWSZ;
+
+		ewah = ewah_new();
+		len = ewah_read_mmap(ewah, cur, remain);
+		if (len < 0) {
+			ewah_free(ewah);
+			goto out;
+		}
+
+		printf("%s\n", oid_to_hex(&oid));
+		ewah_each_bit(ewah, show_path, paths);
+
+		ewah_free(ewah);
+		cur += len;
+		remain -= len;
+	}
+
+out:
+	free(paths);
+	strbuf_release(&in);
+}
+
+int cmd_main(int argc, const char **argv)
+{
+	const char *usage_msg = "test-tree-bitmap <gen|dump>";
+
+	if (!argv[1])
+		usage(usage_msg);
+	else if (!strcmp(argv[1], "gen"))
+		do_gen();
+	else if (!strcmp(argv[1], "dump"))
+		do_dump();
+	else
+		usage(usage_msg);
 
 	return 0;
 }