diff mbox series

[08/11] bundle: parse filter capability

Message ID e7dbb46e6acb5c22a0b456135f3af42b974e0268.1645638911.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Partial bundles | expand

Commit Message

Derrick Stolee Feb. 23, 2022, 5:55 p.m. UTC
From: Derrick Stolee <derrickstolee@github.com>

The v3 bundle format has capabilities, allowing newer versions of Git to
create bundles with newer features. Older versions that do not
understand these new capabilities will fail with a helpful warning.

Create a new capability allowing Git to understand that the contained
pack-file is filtered according to some object filter. Typically, this
filter will be "blob:none" for a blobless partial clone.

This change teaches Git to parse this capability, place its value in the
bundle header, and demonstrate this understanding by adding a message to
'git bundle verify'.

Signed-off-by: Derrick Stolee <derrickstolee@github.com>
---
 bundle.c                      | 17 ++++++++++++++++-
 bundle.h                      |  3 +++
 list-objects-filter-options.c |  2 +-
 list-objects-filter-options.h |  5 +++++
 4 files changed, 25 insertions(+), 2 deletions(-)

Comments

Ævar Arnfjörð Bjarmason March 7, 2022, 3:38 p.m. UTC | #1
On Wed, Feb 23 2022, Derrick Stolee via GitGitGadget wrote:

> From: Derrick Stolee <derrickstolee@github.com>
>
> The v3 bundle format has capabilities, allowing newer versions of Git to
> create bundles with newer features. Older versions that do not
> understand these new capabilities will fail with a helpful warning.
>
> Create a new capability allowing Git to understand that the contained
> pack-file is filtered according to some object filter. Typically, this
> filter will be "blob:none" for a blobless partial clone.
>
> This change teaches Git to parse this capability, place its value in the
> bundle header, and demonstrate this understanding by adding a message to
> 'git bundle verify'.
>
> Signed-off-by: Derrick Stolee <derrickstolee@github.com>
> ---
>  bundle.c                      | 17 ++++++++++++++++-
>  bundle.h                      |  3 +++
>  list-objects-filter-options.c |  2 +-
>  list-objects-filter-options.h |  5 +++++
>  4 files changed, 25 insertions(+), 2 deletions(-)
>
> diff --git a/bundle.c b/bundle.c
> index dc56db9a50a..2afced4d991 100644
> --- a/bundle.c
> +++ b/bundle.c
> @@ -11,7 +11,7 @@
>  #include "run-command.h"
>  #include "refs.h"
>  #include "strvec.h"
> -
> +#include "list-objects-filter-options.h"
>  
>  static const char v2_bundle_signature[] = "# v2 git bundle\n";
>  static const char v3_bundle_signature[] = "# v3 git bundle\n";
> @@ -33,6 +33,8 @@ void bundle_header_release(struct bundle_header *header)
>  {
>  	string_list_clear(&header->prerequisites, 1);
>  	string_list_clear(&header->references, 1);
> +	list_objects_filter_release(header->filter);
> +	free(header->filter);
>  }
>  
>  static int parse_capability(struct bundle_header *header, const char *capability)
> @@ -45,6 +47,11 @@ static int parse_capability(struct bundle_header *header, const char *capability
>  		header->hash_algo = &hash_algos[algo];
>  		return 0;
>  	}
> +	if (skip_prefix(capability, "filter=", &arg)) {
> +		CALLOC_ARRAY(header->filter, 1);
> +		parse_list_objects_filter(header->filter, arg);
> +		return 0;
> +	}
>  	return error(_("unknown capability '%s'"), capability);
>  }
>  
> @@ -220,6 +227,8 @@ int verify_bundle(struct repository *r,
>  	req_nr = revs.pending.nr;
>  	setup_revisions(2, argv, &revs, NULL);
>  
> +	revs.filter = header->filter;
> +
>  	if (prepare_revision_walk(&revs))
>  		die(_("revision walk setup failed"));
>  
> @@ -259,6 +268,12 @@ int verify_bundle(struct repository *r,
>  			     r->nr),
>  			  r->nr);
>  		list_refs(r, 0, NULL);
> +
> +		if (header->filter) {
> +			printf_ln("The bundle uses this filter: %s",
> +				  list_objects_filter_spec(header->filter));
> +		}
> +
>  		r = &header->prerequisites;
>  		if (!r->nr) {
>  			printf_ln(_("The bundle records a complete history."));
> diff --git a/bundle.h b/bundle.h
> index 06009fe6b1f..eb026153d56 100644
> --- a/bundle.h
> +++ b/bundle.h
> @@ -5,11 +5,14 @@
>  #include "cache.h"
>  #include "string-list.h"
>  
> +struct list_objects_filter_options;
> +

For the other ones we include the relevant header, do the same here (or
if there's a need to not do it, do we need it for the rest too?)

>  struct bundle_header {
>  	unsigned version;
>  	struct string_list prerequisites;
>  	struct string_list references;
>  	const struct git_hash_algo *hash_algo;
> +	struct list_objects_filter_options *filter;
>  };

I haven't tried, but any reason this needs to be a *filter
v.s. embedding it in the struct?

Then we'd just need list_objects_filter_release() and not the free() as
well.

Is it because you're piggy-backing on "if (header->filter)" as "do we
have it" state, better to check .nr?

> @@ -55,7 +55,7 @@ const char *list_object_filter_config_name(enum list_objects_filter_choice c)
>   * expand_list_objects_filter_spec() first).  We also "intern" the arg for the
>   * convenience of the current command.
>   */

These API docs....

> -static int gently_parse_list_objects_filter(
> +int gently_parse_list_objects_filter(
>  	struct list_objects_filter_options *filter_options,
>  	const char *arg,
>  	struct strbuf *errbuf)
> diff --git a/list-objects-filter-options.h b/list-objects-filter-options.h
> index da5b6737e27..347a99c28cf 100644
> --- a/list-objects-filter-options.h
> +++ b/list-objects-filter-options.h
> @@ -72,6 +72,11 @@ struct list_objects_filter_options {
>  /* Normalized command line arguments */
>  #define CL_ARG__FILTER "filter"

...should be moved here, presumably.

> +int gently_parse_list_objects_filter(
> +	struct list_objects_filter_options *filter_options,
> +	const char *arg,
> +	struct strbuf *errbuf);
> +
Ævar Arnfjörð Bjarmason March 7, 2022, 3:55 p.m. UTC | #2
On Wed, Feb 23 2022, Derrick Stolee via GitGitGadget wrote:

> From: Derrick Stolee <derrickstolee@github.com>
> [...]
>  static int parse_capability(struct bundle_header *header, const char *capability)
> @@ -45,6 +47,11 @@ static int parse_capability(struct bundle_header *header, const char *capability
>  		header->hash_algo = &hash_algos[algo];
>  		return 0;
>  	}
> +	if (skip_prefix(capability, "filter=", &arg)) {
> +		CALLOC_ARRAY(header->filter, 1);
> +		parse_list_objects_filter(header->filter, arg);
> +		return 0;
> +	}
>  	return error(_("unknown capability '%s'"), capability);
>  }

[Something I should have noted in the other reply, but missed].

Before this series we just had the object-format capability, and now we
have a 2nd one.

As before we'll return errors un unknown capabilities.

I think it's worthwhile to stop here & think how we think about
cross-version compatibility between git versions. I.e. we're not
changing the *format version* here (nor is it needed), but just adding a
new capability that older gits won't know about.

I don't know if this is a case where older versions could limp along in
some cases and still unbundle these, probably that's never happening?

So probably nothing needs to change here, I was just wondering *if* we
had capabilities that were optional in some cases whether we shouldn't
while-we're-at-it give those some prefix indicating that, and have older
versions just issue a warning().

Then have them just try to call index-pack & see if that worked.

But yeah, all of that probably isn't applicable here at all :)
Derrick Stolee March 7, 2022, 4:14 p.m. UTC | #3
On 3/7/2022 10:38 AM, Ævar Arnfjörð Bjarmason wrote:
> 
> On Wed, Feb 23 2022, Derrick Stolee via GitGitGadget wrote:
> 
>> From: Derrick Stolee <derrickstolee@github.com>
...
>> diff --git a/bundle.h b/bundle.h
>> index 06009fe6b1f..eb026153d56 100644
>> --- a/bundle.h
>> +++ b/bundle.h
>> @@ -5,11 +5,14 @@
>>  #include "cache.h"
>>  #include "string-list.h"
>>  
>> +struct list_objects_filter_options;
>> +
> 
> For the other ones we include the relevant header, do the same here (or
> if there's a need to not do it, do we need it for the rest too?)

The others are .c files that require looking into the struct. This
declaration is all that's required for this header file.

>>  struct bundle_header {
>>  	unsigned version;
>>  	struct string_list prerequisites;
>>  	struct string_list references;
>>  	const struct git_hash_algo *hash_algo;
>> +	struct list_objects_filter_options *filter;
>>  };
> 
> I haven't tried, but any reason this needs to be a *filter
> v.s. embedding it in the struct?
> 
> Then we'd just need list_objects_filter_release() and not the free() as
> well.
> 
> Is it because you're piggy-backing on "if (header->filter)" as "do we
> have it" state, better to check .nr?

Yes. I replied to Junio before that there is some assumption in
the filtering code that the .nr == 0 case is listed as a BUG()
so we would possibly be breaking expectations in a different
way doing the embedded version.

>> @@ -55,7 +55,7 @@ const char *list_object_filter_config_name(enum list_objects_filter_choice c)
>>   * expand_list_objects_filter_spec() first).  We also "intern" the arg for the
>>   * convenience of the current command.
>>   */
> 
> These API docs....
> 
>> -static int gently_parse_list_objects_filter(
>> +int gently_parse_list_objects_filter(
>>  	struct list_objects_filter_options *filter_options,
>>  	const char *arg,
>>  	struct strbuf *errbuf)
>> diff --git a/list-objects-filter-options.h b/list-objects-filter-options.h
>> index da5b6737e27..347a99c28cf 100644
>> --- a/list-objects-filter-options.h
>> +++ b/list-objects-filter-options.h
>> @@ -72,6 +72,11 @@ struct list_objects_filter_options {
>>  /* Normalized command line arguments */
>>  #define CL_ARG__FILTER "filter"
> 
> ...should be moved here, presumably.

Yes. Thanks!
-Stolee
Ævar Arnfjörð Bjarmason March 7, 2022, 4:22 p.m. UTC | #4
On Mon, Mar 07 2022, Derrick Stolee wrote:

> On 3/7/2022 10:38 AM, Ævar Arnfjörð Bjarmason wrote:
>> 
>> On Wed, Feb 23 2022, Derrick Stolee via GitGitGadget wrote:
>> 
>>> From: Derrick Stolee <derrickstolee@github.com>
> ...
>>> diff --git a/bundle.h b/bundle.h
>>> index 06009fe6b1f..eb026153d56 100644
>>> --- a/bundle.h
>>> +++ b/bundle.h
>>> @@ -5,11 +5,14 @@
>>>  #include "cache.h"
>>>  #include "string-list.h"
>>>  
>>> +struct list_objects_filter_options;
>>> +
>> 
>> For the other ones we include the relevant header, do the same here (or
>> if there's a need to not do it, do we need it for the rest too?)
>
> The others are .c files that require looking into the struct. This
> declaration is all that's required for this header file.
>
>>>  struct bundle_header {
>>>  	unsigned version;
>>>  	struct string_list prerequisites;
>>>  	struct string_list references;
>>>  	const struct git_hash_algo *hash_algo;
>>> +	struct list_objects_filter_options *filter;
>>>  };
>> 
>> I haven't tried, but any reason this needs to be a *filter
>> v.s. embedding it in the struct?
>> 
>> Then we'd just need list_objects_filter_release() and not the free() as
>> well.
>> 
>> Is it because you're piggy-backing on "if (header->filter)" as "do we
>> have it" state, better to check .nr?
>
> Yes. I replied to Junio before that there is some assumption in
> the filtering code that the .nr == 0 case is listed as a BUG()
> so we would possibly be breaking expectations in a different
> way doing the embedded version.

Having an "unsigned int using_filter:1" or whatever IMO makes that much
clearer than needing to carefully eyeball code that's already using
embedded structs & see why the one exception that's malloc'd is because
of that or some other reason...
Derrick Stolee March 7, 2022, 4:29 p.m. UTC | #5
On 3/7/2022 11:22 AM, Ævar Arnfjörð Bjarmason wrote:
> 
> On Mon, Mar 07 2022, Derrick Stolee wrote:
> 
>> On 3/7/2022 10:38 AM, Ævar Arnfjörð Bjarmason wrote:
>>>
>>> On Wed, Feb 23 2022, Derrick Stolee via GitGitGadget wrote:
>>>
>>>> From: Derrick Stolee <derrickstolee@github.com>
>> ...
>>>> diff --git a/bundle.h b/bundle.h
>>>> index 06009fe6b1f..eb026153d56 100644
>>>> --- a/bundle.h
>>>> +++ b/bundle.h
>>>> @@ -5,11 +5,14 @@
>>>>  #include "cache.h"
>>>>  #include "string-list.h"
>>>>  
>>>> +struct list_objects_filter_options;
>>>> +
>>>
>>> For the other ones we include the relevant header, do the same here (or
>>> if there's a need to not do it, do we need it for the rest too?)
>>
>> The others are .c files that require looking into the struct. This
>> declaration is all that's required for this header file.
>>
>>>>  struct bundle_header {
>>>>  	unsigned version;
>>>>  	struct string_list prerequisites;
>>>>  	struct string_list references;
>>>>  	const struct git_hash_algo *hash_algo;
>>>> +	struct list_objects_filter_options *filter;
>>>>  };
>>>
>>> I haven't tried, but any reason this needs to be a *filter
>>> v.s. embedding it in the struct?
>>>
>>> Then we'd just need list_objects_filter_release() and not the free() as
>>> well.
>>>
>>> Is it because you're piggy-backing on "if (header->filter)" as "do we
>>> have it" state, better to check .nr?
>>
>> Yes. I replied to Junio before that there is some assumption in
>> the filtering code that the .nr == 0 case is listed as a BUG()
>> so we would possibly be breaking expectations in a different
>> way doing the embedded version.
> 
> Having an "unsigned int using_filter:1" or whatever IMO makes that much
> clearer than needing to carefully eyeball code that's already using
> embedded structs & see why the one exception that's malloc'd is because
> of that or some other reason...

I think your recommended "using_filter" is messy. Having this
struct be a pointer instead of embedded self-documents that it
is optional (and can be NULL) but that if it is non-NULL, then
it should be considered and valid.

Here, I'm focusing on not allowing a non-sensical state, such
as using_filter = 0 but filter is actually populated with a
valid filter. The possibility of this state means there is a
higher chance of introducing a bug over time by not keeping
these values coupled.

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/bundle.c b/bundle.c
index dc56db9a50a..2afced4d991 100644
--- a/bundle.c
+++ b/bundle.c
@@ -11,7 +11,7 @@ 
 #include "run-command.h"
 #include "refs.h"
 #include "strvec.h"
-
+#include "list-objects-filter-options.h"
 
 static const char v2_bundle_signature[] = "# v2 git bundle\n";
 static const char v3_bundle_signature[] = "# v3 git bundle\n";
@@ -33,6 +33,8 @@  void bundle_header_release(struct bundle_header *header)
 {
 	string_list_clear(&header->prerequisites, 1);
 	string_list_clear(&header->references, 1);
+	list_objects_filter_release(header->filter);
+	free(header->filter);
 }
 
 static int parse_capability(struct bundle_header *header, const char *capability)
@@ -45,6 +47,11 @@  static int parse_capability(struct bundle_header *header, const char *capability
 		header->hash_algo = &hash_algos[algo];
 		return 0;
 	}
+	if (skip_prefix(capability, "filter=", &arg)) {
+		CALLOC_ARRAY(header->filter, 1);
+		parse_list_objects_filter(header->filter, arg);
+		return 0;
+	}
 	return error(_("unknown capability '%s'"), capability);
 }
 
@@ -220,6 +227,8 @@  int verify_bundle(struct repository *r,
 	req_nr = revs.pending.nr;
 	setup_revisions(2, argv, &revs, NULL);
 
+	revs.filter = header->filter;
+
 	if (prepare_revision_walk(&revs))
 		die(_("revision walk setup failed"));
 
@@ -259,6 +268,12 @@  int verify_bundle(struct repository *r,
 			     r->nr),
 			  r->nr);
 		list_refs(r, 0, NULL);
+
+		if (header->filter) {
+			printf_ln("The bundle uses this filter: %s",
+				  list_objects_filter_spec(header->filter));
+		}
+
 		r = &header->prerequisites;
 		if (!r->nr) {
 			printf_ln(_("The bundle records a complete history."));
diff --git a/bundle.h b/bundle.h
index 06009fe6b1f..eb026153d56 100644
--- a/bundle.h
+++ b/bundle.h
@@ -5,11 +5,14 @@ 
 #include "cache.h"
 #include "string-list.h"
 
+struct list_objects_filter_options;
+
 struct bundle_header {
 	unsigned version;
 	struct string_list prerequisites;
 	struct string_list references;
 	const struct git_hash_algo *hash_algo;
+	struct list_objects_filter_options *filter;
 };
 
 #define BUNDLE_HEADER_INIT \
diff --git a/list-objects-filter-options.c b/list-objects-filter-options.c
index fd8d59f653a..b9d10770e4f 100644
--- a/list-objects-filter-options.c
+++ b/list-objects-filter-options.c
@@ -55,7 +55,7 @@  const char *list_object_filter_config_name(enum list_objects_filter_choice c)
  * expand_list_objects_filter_spec() first).  We also "intern" the arg for the
  * convenience of the current command.
  */
-static int gently_parse_list_objects_filter(
+int gently_parse_list_objects_filter(
 	struct list_objects_filter_options *filter_options,
 	const char *arg,
 	struct strbuf *errbuf)
diff --git a/list-objects-filter-options.h b/list-objects-filter-options.h
index da5b6737e27..347a99c28cf 100644
--- a/list-objects-filter-options.h
+++ b/list-objects-filter-options.h
@@ -72,6 +72,11 @@  struct list_objects_filter_options {
 /* Normalized command line arguments */
 #define CL_ARG__FILTER "filter"
 
+int gently_parse_list_objects_filter(
+	struct list_objects_filter_options *filter_options,
+	const char *arg,
+	struct strbuf *errbuf);
+
 void list_objects_filter_die_if_populated(
 	struct list_objects_filter_options *filter_options);