diff mbox series

[v2,7/8] pack-bitmap: implement combined filter

Message ID fac3477d979058da0430b974a34f7c7f866bf456.1615813673.git.ps@pks.im (mailing list archive)
State Superseded
Headers show
Series rev-parse: implement object type filter | expand

Commit Message

Patrick Steinhardt March 15, 2021, 1:14 p.m. UTC
When the user has multiple objects filters specified, then this is
internally represented by having a "combined" filter. These combined
filters aren't yet supported by bitmap indices and can thus not be
accelerated.

Fix this by implementing support for these combined filters. The
implementation is quite trivial: when there's a combined filter, we
simply recurse into `filter_bitmap()` for all of the sub-filters.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 pack-bitmap.c                      | 40 +++++++++++++++++++++++++++---
 t/t6113-rev-list-bitmap-filters.sh |  7 ++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

Comments

Jeff King April 6, 2021, 5:54 p.m. UTC | #1
On Mon, Mar 15, 2021 at 02:14:59PM +0100, Patrick Steinhardt wrote:

> When the user has multiple objects filters specified, then this is
> internally represented by having a "combined" filter. These combined
> filters aren't yet supported by bitmap indices and can thus not be
> accelerated.
> 
> Fix this by implementing support for these combined filters. The
> implementation is quite trivial: when there's a combined filter, we
> simply recurse into `filter_bitmap()` for all of the sub-filters.

The goal makes sense.

Before this patch, I think your test:

> +test_expect_success 'combine filter' '
> +	git rev-list --objects --filter=blob:limit=1000 --filter=object:type=blob tag >expect &&
> +	git rev-list --use-bitmap-index \
> +		     --objects --filter=blob:limit=1000 --filter=object:type=blob tag >actual &&
> +	test_bitmap_traversal expect actual
> +'

would pass anyway, because we'd just skip using bitmaps. Is there a way
we can tell that the bitmap code actually kicked in? Maybe a perf test
would make it clear (those aren't always run, but hopefully we'd
eventually notice a regression there).

> +static int filter_supported(struct list_objects_filter_options *filter)
> +{
> +	int i;
> +
> +	switch (filter->choice) {
> +	case LOFC_BLOB_NONE:
> +	case LOFC_BLOB_LIMIT:
> +	case LOFC_OBJECT_TYPE:
> +		return 1;
> +	case LOFC_TREE_DEPTH:
> +		if (filter->tree_exclude_depth == 0)
> +			return 1;
> +		return 0;
> +	case LOFC_COMBINE:
> +		for (i = 0; i < filter->sub_nr; i++)
> +			if (!filter_supported(&filter->sub[i]))
> +				return 0;
> +		return 1;
> +	default:
> +		return 0;
> +	}
> +}

Hmm. This is essentially reproducing the list in filter_bitmap() of
what's OK for bitmaps. So when adding a new filter, it would have to be
added in both places.

Can we preserve that property of the original code? I'd think that just
adding LOFC_COMBINE to filter_bitmap() would be sufficient. I.e., this
hunk:

> +	if (filter->choice == LOFC_COMBINE) {
> +		int i;
> +		for (i = 0; i < filter->sub_nr; i++) {
> +			filter_bitmap(bitmap_git, tip_objects, to_filter,
> +				      &filter->sub[i]);
> +		}
> +		return 0;
> +	}

...except that we need to see if filter_bitmap() returns "-1" for any of
the recursive calls. Which we probably should be doing anyway to
propagate any errors (though I think the only "errors" we'd return are
"not supported", at least for now).

-Peff
Patrick Steinhardt April 9, 2021, 10:31 a.m. UTC | #2
On Tue, Apr 06, 2021 at 01:54:31PM -0400, Jeff King wrote:
> On Mon, Mar 15, 2021 at 02:14:59PM +0100, Patrick Steinhardt wrote:
> 
> > When the user has multiple objects filters specified, then this is
> > internally represented by having a "combined" filter. These combined
> > filters aren't yet supported by bitmap indices and can thus not be
> > accelerated.
> > 
> > Fix this by implementing support for these combined filters. The
> > implementation is quite trivial: when there's a combined filter, we
> > simply recurse into `filter_bitmap()` for all of the sub-filters.
> 
> The goal makes sense.
> 
> Before this patch, I think your test:
> 
> > +test_expect_success 'combine filter' '
> > +	git rev-list --objects --filter=blob:limit=1000 --filter=object:type=blob tag >expect &&
> > +	git rev-list --use-bitmap-index \
> > +		     --objects --filter=blob:limit=1000 --filter=object:type=blob tag >actual &&
> > +	test_bitmap_traversal expect actual
> > +'
> 
> would pass anyway, because we'd just skip using bitmaps. Is there a way
> we can tell that the bitmap code actually kicked in? Maybe a perf test
> would make it clear (those aren't always run, but hopefully we'd
> eventually notice a regression there).
> 
> > +static int filter_supported(struct list_objects_filter_options *filter)
> > +{
> > +	int i;
> > +
> > +	switch (filter->choice) {
> > +	case LOFC_BLOB_NONE:
> > +	case LOFC_BLOB_LIMIT:
> > +	case LOFC_OBJECT_TYPE:
> > +		return 1;
> > +	case LOFC_TREE_DEPTH:
> > +		if (filter->tree_exclude_depth == 0)
> > +			return 1;
> > +		return 0;
> > +	case LOFC_COMBINE:
> > +		for (i = 0; i < filter->sub_nr; i++)
> > +			if (!filter_supported(&filter->sub[i]))
> > +				return 0;
> > +		return 1;
> > +	default:
> > +		return 0;
> > +	}
> > +}
> 
> Hmm. This is essentially reproducing the list in filter_bitmap() of
> what's OK for bitmaps. So when adding a new filter, it would have to be
> added in both places.
> 
> Can we preserve that property of the original code? I'd think that just
> adding LOFC_COMBINE to filter_bitmap() would be sufficient. I.e., this
> hunk:
> 
> > +	if (filter->choice == LOFC_COMBINE) {
> > +		int i;
> > +		for (i = 0; i < filter->sub_nr; i++) {
> > +			filter_bitmap(bitmap_git, tip_objects, to_filter,
> > +				      &filter->sub[i]);
> > +		}
> > +		return 0;
> > +	}
> 
> ...except that we need to see if filter_bitmap() returns "-1" for any of
> the recursive calls. Which we probably should be doing anyway to
> propagate any errors (though I think the only "errors" we'd return are
> "not supported", at least for now).
> 
> -Peff

But wouldn't that mean that we're now needlessly filtering via bitmaps
all the way down the combined filters only to realize at the end that it
cannot work because we've got a tree filter with non-zero tree depth?
Granted, this will not be the common case. But it still feels like we're
doing needless work for cases where we know that bitmaps cannot answer
the query.

Patrick
Patrick Steinhardt April 9, 2021, 11:17 a.m. UTC | #3
On Tue, Apr 06, 2021 at 01:54:31PM -0400, Jeff King wrote:
> On Mon, Mar 15, 2021 at 02:14:59PM +0100, Patrick Steinhardt wrote:
> 
> > When the user has multiple objects filters specified, then this is
> > internally represented by having a "combined" filter. These combined
> > filters aren't yet supported by bitmap indices and can thus not be
> > accelerated.
> > 
> > Fix this by implementing support for these combined filters. The
> > implementation is quite trivial: when there's a combined filter, we
> > simply recurse into `filter_bitmap()` for all of the sub-filters.
> 
> The goal makes sense.
> 
> Before this patch, I think your test:
> 
> > +test_expect_success 'combine filter' '
> > +	git rev-list --objects --filter=blob:limit=1000 --filter=object:type=blob tag >expect &&
> > +	git rev-list --use-bitmap-index \
> > +		     --objects --filter=blob:limit=1000 --filter=object:type=blob tag >actual &&
> > +	test_bitmap_traversal expect actual
> > +'
> 
> would pass anyway, because we'd just skip using bitmaps. Is there a way
> we can tell that the bitmap code actually kicked in? Maybe a perf test
> would make it clear (those aren't always run, but hopefully we'd
> eventually notice a regression there).

I think that's not actually true. Note that we're using
`test_bitmap_traversal`:

    test_bitmap_traversal () {
        if test "$1" = "--no-confirm-bitmaps"
        then
            shift
        elif cmp "$1" "$2"
        then
            echo >&2 "identical raw outputs; are you sure bitmaps were used?"
            return 1
        fi &&
        cut -d' ' -f1 "$1" | sort >"$1.normalized" &&
        sort "$2" >"$2.normalized" &&
        test_cmp "$1.normalized" "$2.normalized" &&
        rm -f "$1.normalized" "$2.normalized"
    }

The output is different when using bitmap indices, which is why the
function knows to fail in case output is the same in both cases. So we
know that it cannot be the same here and thus we also know that the
bitmap case kicked in.

Patrick
Jeff King April 9, 2021, 3:53 p.m. UTC | #4
On Fri, Apr 09, 2021 at 12:31:42PM +0200, Patrick Steinhardt wrote:

> > Hmm. This is essentially reproducing the list in filter_bitmap() of
> > what's OK for bitmaps. So when adding a new filter, it would have to be
> > added in both places.
> > 
> > Can we preserve that property of the original code? I'd think that just
> > adding LOFC_COMBINE to filter_bitmap() would be sufficient. I.e., this
> > hunk:
> > 
> > > +	if (filter->choice == LOFC_COMBINE) {
> > > +		int i;
> > > +		for (i = 0; i < filter->sub_nr; i++) {
> > > +			filter_bitmap(bitmap_git, tip_objects, to_filter,
> > > +				      &filter->sub[i]);
> > > +		}
> > > +		return 0;
> > > +	}
> > 
> > ...except that we need to see if filter_bitmap() returns "-1" for any of
> > the recursive calls. Which we probably should be doing anyway to
> > propagate any errors (though I think the only "errors" we'd return are
> > "not supported", at least for now).
> > 
> > -Peff
> 
> But wouldn't that mean that we're now needlessly filtering via bitmaps
> all the way down the combined filters only to realize at the end that it
> cannot work because we've got a tree filter with non-zero tree depth?
> Granted, this will not be the common case. But it still feels like we're
> doing needless work for cases where we know that bitmaps cannot answer
> the query.

I don't think so. We first call can_filter_bitmap(filter), which passes
NULL for bitmap_git. And then in filter_bitmap(), we only do actual work
if bitmap_git is non-NULL.

This is the same thing that saves us from even loading the bitmaps
(which is itself a non-trivial amount of work) if the filter cannot be
satisfied by bitmaps.

-Peff
Jeff King April 9, 2021, 3:55 p.m. UTC | #5
On Fri, Apr 09, 2021 at 01:17:59PM +0200, Patrick Steinhardt wrote:

> > Before this patch, I think your test:
> > 
> > > +test_expect_success 'combine filter' '
> > > +	git rev-list --objects --filter=blob:limit=1000 --filter=object:type=blob tag >expect &&
> > > +	git rev-list --use-bitmap-index \
> > > +		     --objects --filter=blob:limit=1000 --filter=object:type=blob tag >actual &&
> > > +	test_bitmap_traversal expect actual
> > > +'
> > 
> > would pass anyway, because we'd just skip using bitmaps. Is there a way
> > we can tell that the bitmap code actually kicked in? Maybe a perf test
> > would make it clear (those aren't always run, but hopefully we'd
> > eventually notice a regression there).
> 
> I think that's not actually true. Note that we're using
> `test_bitmap_traversal`:

Ah, right. I forgot about the hackery in test_bitmap_traversal() to let
us tell the difference (even though I was the one who wrote it, I still
consider it hackery ;) ).

So yes, this is a good test that we are allowing the combine filter.

-Peff
diff mbox series

Patch

diff --git a/pack-bitmap.c b/pack-bitmap.c
index 196d38c91d..e33805e076 100644
--- a/pack-bitmap.c
+++ b/pack-bitmap.c
@@ -925,6 +925,29 @@  static void filter_bitmap_object_type(struct bitmap_index *bitmap_git,
 	}
 }
 
+static int filter_supported(struct list_objects_filter_options *filter)
+{
+	int i;
+
+	switch (filter->choice) {
+	case LOFC_BLOB_NONE:
+	case LOFC_BLOB_LIMIT:
+	case LOFC_OBJECT_TYPE:
+		return 1;
+	case LOFC_TREE_DEPTH:
+		if (filter->tree_exclude_depth == 0)
+			return 1;
+		return 0;
+	case LOFC_COMBINE:
+		for (i = 0; i < filter->sub_nr; i++)
+			if (!filter_supported(&filter->sub[i]))
+				return 0;
+		return 1;
+	default:
+		return 0;
+	}
+}
+
 static int filter_bitmap(struct bitmap_index *bitmap_git,
 			 struct object_list *tip_objects,
 			 struct bitmap *to_filter,
@@ -932,6 +955,8 @@  static int filter_bitmap(struct bitmap_index *bitmap_git,
 {
 	if (!filter || filter->choice == LOFC_DISABLED)
 		return 0;
+	if (!filter_supported(filter))
+		return -1;
 
 	if (filter->choice == LOFC_BLOB_NONE) {
 		if (bitmap_git)
@@ -948,8 +973,7 @@  static int filter_bitmap(struct bitmap_index *bitmap_git,
 		return 0;
 	}
 
-	if (filter->choice == LOFC_TREE_DEPTH &&
-	    filter->tree_exclude_depth == 0) {
+	if (filter->choice == LOFC_TREE_DEPTH) {
 		if (bitmap_git)
 			filter_bitmap_tree_depth(bitmap_git, tip_objects,
 						 to_filter,
@@ -965,8 +989,16 @@  static int filter_bitmap(struct bitmap_index *bitmap_git,
 		return 0;
 	}
 
-	/* filter choice not handled */
-	return -1;
+	if (filter->choice == LOFC_COMBINE) {
+		int i;
+		for (i = 0; i < filter->sub_nr; i++) {
+			filter_bitmap(bitmap_git, tip_objects, to_filter,
+				      &filter->sub[i]);
+		}
+		return 0;
+	}
+
+	BUG("unsupported filter choice");
 }
 
 static int can_filter_bitmap(struct list_objects_filter_options *filter)
diff --git a/t/t6113-rev-list-bitmap-filters.sh b/t/t6113-rev-list-bitmap-filters.sh
index fb66735ac8..cb9db7df6f 100755
--- a/t/t6113-rev-list-bitmap-filters.sh
+++ b/t/t6113-rev-list-bitmap-filters.sh
@@ -98,4 +98,11 @@  test_expect_success 'object:type filter' '
 	test_bitmap_traversal expect actual
 '
 
+test_expect_success 'combine filter' '
+	git rev-list --objects --filter=blob:limit=1000 --filter=object:type=blob tag >expect &&
+	git rev-list --use-bitmap-index \
+		     --objects --filter=blob:limit=1000 --filter=object:type=blob tag >actual &&
+	test_bitmap_traversal expect actual
+'
+
 test_done