diff mbox series

[04/10] evolve: add support for parsing metacommits

Message ID 2e9a4a9bd819785404e8a5343385f4fb2bc06109.1663959325.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Add the Git Change command | expand

Commit Message

Stefan Xenos Sept. 23, 2022, 6:55 p.m. UTC
From: Stefan Xenos <sxenos@google.com>

This patch adds the get_metacommit_content method, which can classify
commits as either metacommits or normal commits, determine whether they
are abandoned, and extract the content commit's object id from the
metacommit.

Signed-off-by: Stefan Xenos <sxenos@google.com>
Signed-off-by: Chris Poucet <poucet@google.com>
---
 Makefile            |   1 +
 metacommit-parser.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
 metacommit-parser.h |  19 ++++++++
 3 files changed, 130 insertions(+)
 create mode 100644 metacommit-parser.c
 create mode 100644 metacommit-parser.h

Comments

Phillip Wood Sept. 26, 2022, 1:27 p.m. UTC | #1
Hi Chris

On 23/09/2022 19:55, Stefan Xenos via GitGitGadget wrote:
> From: Stefan Xenos <sxenos@google.com>
> 
> This patch adds the get_metacommit_content method, which can classify
> commits as either metacommits or normal commits, determine whether they
> are abandoned, and extract the content commit's object id from the
> metacommit.
> 
> Signed-off-by: Stefan Xenos <sxenos@google.com>
> Signed-off-by: Chris Poucet <poucet@google.com>
> ---
>   Makefile            |   1 +
>   metacommit-parser.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
>   metacommit-parser.h |  19 ++++++++
>   3 files changed, 130 insertions(+)
>   create mode 100644 metacommit-parser.c
>   create mode 100644 metacommit-parser.h
> 
> diff --git a/Makefile b/Makefile
> index cac3452edb9..b2bcc00c289 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -999,6 +999,7 @@ LIB_OBJS += merge-ort.o
>   LIB_OBJS += merge-ort-wrappers.o
>   LIB_OBJS += merge-recursive.o
>   LIB_OBJS += merge.o
> +LIB_OBJS += metacommit-parser.o

There seems to be a problem with the indent here

>   LIB_OBJS += midx.o
>   LIB_OBJS += name-hash.o
>   LIB_OBJS += negotiator/default.o

 > diff --git a/metacommit-parser.h b/metacommit-parser.h
 > new file mode 100644
 > index 00000000000..1c74bd6d699
 > --- /dev/null
 > +++ b/metacommit-parser.h
 > @@ -0,0 +1,19 @@
 > +#ifndef METACOMMIT_PARSER_H
 > +#define METACOMMIT_PARSER_H
 > +
 > +#include "commit.h"
 > +#include "hash.h"
 > +
 > +/* Indicates a normal commit (non-metacommit) */
 > +#define METACOMMIT_TYPE_NONE 0
 > +/* Indicates a metacommit with normal content (non-abandoned) */
 > +#define METACOMMIT_TYPE_NORMAL 1
 > +/* Indicates a metacommit with abandoned content */
 > +#define METACOMMIT_TYPE_ABANDONED 2

Is it possible to define these as an enum? It would make the signature 
of get_metacommit_content() nicer.

 > +struct commit;

What's this for? We're including commit.h above.

 > +extern int get_metacommit_content(
 > +	struct commit *commit, struct object_id *content);

> diff --git a/metacommit-parser.c b/metacommit-parser.c
> new file mode 100644
> index 00000000000..70c1428bfc6
> --- /dev/null
> +++ b/metacommit-parser.c
> @@ -0,0 +1,110 @@
> +#include "cache.h"
> +#include "metacommit-parser.h"
> +#include "commit.h"
> +
> +/*
> + * Search the commit buffer for a line starting with the given key. Unlike
> + * find_commit_header, this also searches the commit message body.
> + */

There is no explanation in the code or commit message as to why this 
function is needed. The documentation added in the first commit says 
that the "parent-type" header is a commit header. I think the answer is 
that this series does not implement that header but uses the commit 
message instead. That's perfectly fine for a proof of concept but it is 
precisely the sort of detail that should be described in the commit 
message and probably flagged up in the cover letter.

> +static const char *find_key(const char *msg, const char *key, size_t *out_len)
> +{
> +	int key_len = strlen(key);
> +	const char *line = msg;
> +
> +	while (line) {
> +		const char *eol = strchrnul(line, '\n');
> +
> +		if (eol - line > key_len && !memcmp(line, key, key_len) &&
> +		    line[key_len] == ' ') {
> +			*out_len = eol - line - key_len - 1;
> +			return line + key_len + 1;
> +		}
> +		line = *eol ? eol + 1 : NULL;
> +	}
> +	return NULL;
> +}
> +
> +static struct commit *get_commit_by_index(struct commit_list *to_search, int index)
> +{
> +	while (to_search && index) {
> +		to_search = to_search->next;
> +		index--;
> +	}
> +
> +	if (!to_search)
> +		return NULL;
> +
> +	return to_search->item;
> +}

This function is a useful utility for struct commit_list and should live 
in commit.c. It could be used to simplify object-name.c:get_parent() for 
example.

> +/*
> + * Writes the index of the content parent to "result". Returns the metacommit
> + * type. See the METACOMMIT_TYPE_* constants.
> + */
> +static int index_of_content_commit(const char *buffer, int *result)

I found the signature confusing as it is returning an int but that is 
not the index. Switching to an enum for the metacommit types would 
clarify that.

> +{
> +	int index = 0;
> +	int ret = METACOMMIT_TYPE_NONE;
> +	size_t parent_types_size;
> +	const char *parent_types = find_key(buffer, "parent-type",
> +		&parent_types_size);
> +	const char *end;
> +	const char *enum_start = parent_types;
> +	int enum_length = 0;
> +
> +	if (!parent_types)
> +		return METACOMMIT_TYPE_NONE;
> +
> +	end = &parent_types[parent_types_size];
> +
> +	while (1) {
> +		char next = *parent_types;
> +		if (next == ' ' || parent_types >= end) {
> +			if (enum_length == 1) {

if enum_length != 1 then there is an error in the parent-type header and 
we should probably bail out.

> +				char first_char_in_enum = *enum_start;

It's not just the first character, it's the only character, do we really 
need such a long variable name? (how about just calling it "type")

I'll try and take a look at the next couple of patches later in the week.

Best Wishes

Phillip
Chris P Oct. 4, 2022, 11:21 a.m. UTC | #2
> > This patch adds the get_metacommit_content method, which can classify
> > commits as either metacommits or normal commits, determine whether they
> > are abandoned, and extract the content commit's object id from the
> > metacommit.
> > diff --git a/Makefile b/Makefile
> > index cac3452edb9..b2bcc00c289 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -999,6 +999,7 @@ LIB_OBJS += merge-ort.o
> >   LIB_OBJS += merge-ort-wrappers.o
> >   LIB_OBJS += merge-recursive.o
> >   LIB_OBJS += merge.o
> > +LIB_OBJS += metacommit-parser.o
>
> There seems to be a problem with the indent here

I'm not sure I follow, there's no indentation on that line?
>
> >   LIB_OBJS += midx.o
> >   LIB_OBJS += name-hash.o
> >   LIB_OBJS += negotiator/default.o
>
>  > diff --git a/metacommit-parser.h b/metacommit-parser.h
>  > new file mode 100644
>  > index 00000000000..1c74bd6d699
>  > --- /dev/null
>  > +++ b/metacommit-parser.h
>  > @@ -0,0 +1,19 @@
>  > +#ifndef METACOMMIT_PARSER_H
>  > +#define METACOMMIT_PARSER_H
>  > +
>  > +#include "commit.h"
>  > +#include "hash.h"
>  > +
>  > +/* Indicates a normal commit (non-metacommit) */
>  > +#define METACOMMIT_TYPE_NONE 0
>  > +/* Indicates a metacommit with normal content (non-abandoned) */
>  > +#define METACOMMIT_TYPE_NORMAL 1
>  > +/* Indicates a metacommit with abandoned content */
>  > +#define METACOMMIT_TYPE_ABANDONED 2
>
> Is it possible to define these as an enum? It would make the signature
> of get_meta_commit_content() nicer.
>
>  > +struct commit;
>
> What's this for? We're including commit.h above.

Forgot to remove this as I added the include commit.h later.

>
>  > +extern int get_metacommit_content(
>  > +    struct commit *commit, struct object_id *content);
>
> > diff --git a/metacommit-parser.c b/metacommit-parser.c
> > new file mode 100644
> > index 00000000000..70c1428bfc6
> > --- /dev/null
> > +++ b/metacommit-parser.c
> > @@ -0,0 +1,110 @@
> > +#include "cache.h"
> > +#include "metacommit-parser.h"
> > +#include "commit.h"
> > +
> > +/*
> > + * Search the commit buffer for a line starting with the given key. Unlike
> > + * find_commit_header, this also searches the commit message body.
> > + */
>
> There is no explanation in the code or commit message as to why this
> function is needed. The documentation added in the first commit says
> that "parent-type" header is a commit header. I think the answer is that
> this series does not implement that header but uses the commit message
> instead. That's perfectly fine for a proof of concept but it is
> precisely the sort of detail that should be described it the commit
> message and probably flagged up in the cover letter.

I admit I thought this was part of the header because it shows up
before the blank line before the commit title.

How do I make this a commit header?

>
> > +static const char *find_key(const char *msg, const char *key, size_t *out_len)
> > +{
> > +     int key_len = strlen(key);
> > +     const char *line = msg;
> > +
> > +     while (line) {
> > +             const char *eol = strchrnul(line, '\n');
> > +
> > +             if (eol - line > key_len && !memcmp(line, key, key_len) &&
> > +                 line[key_len] == ' ') {
> > +                     *out_len = eol - line - key_len - 1;
> > +                     return line + key_len + 1;
> > +             }
> > +             line = *eol ? eol + 1 : NULL;
> > +     }
> > +     return NULL;
> > +}
> > +
> > +static struct commit *get_commit_by_index(struct commit_list *to_search, int index)
> > +{
> > +     while (to_search && index) {
> > +             to_search = to_search->next;
> > +             index--;
> > +     }
> > +
> > +     if (!to_search)
> > +             return NULL;
> > +
> > +     return to_search->item;
> > +}
>
> This function is a useful utility for struct commit_list and should live
> in commit.c. It could be used to simplify object-name.c:get_parent() for
> example.

Done.  I'll defer cleaning up get_parent to a potentially later change to avoid
muddying up this change too much.

>
> > +/*
> > + * Writes the index of the content parent to "result". Returns the metacommit
> > + * type. See the METACOMMIT_TYPE_* constants.
> > + */
> > +static int index_of_content_commit(const char *buffer, int *result)
>
> I found the signature confusing as it is returning an int but that is
> not the index. Switching to an enum for the metacommit types would
> clarify that.

Done.

>
> > +{
> > +     int index = 0;
> > +     int ret = METACOMMIT_TYPE_NONE;
> > +     size_t parent_types_size;
> > +     const char *parent_types = find_key(buffer, "parent-type",
> > +             &parent_types_size);
> > +     const char *end;
> > +     const char *enum_start = parent_types;
> > +     int enum_length = 0;
> > +
> > +     if (!parent_types)
> > +             return METACOMMIT_TYPE_NONE;
> > +
> > +     end = &parent_types[parent_types_size];
> > +
> > +     while (1) {
> > +             char next = *parent_types;
> > +             if (next == ' ' || parent_types >= end) {
> > +                     if (enum_length == 1) {
>
> if enum_length != 1 then there is an error in the parent-type header and
> we should probably bail out.
>
> > +                             char first_char_in_enum = *enum_start;
>
> It's not just the first character, it's the only character, do we really
> need such a long variable name? (how about just calling it "type")

Done.

> I'll try and take at look at the next couple of patches later in the week.

Thank you for all the reviews!

-- simply chris
Phillip Wood Oct. 4, 2022, 2:10 p.m. UTC | #3
Hi Chris

On 04/10/2022 12:21, Chris P wrote:
>>> This patch adds the get_metacommit_content method, which can classify
>>> commits as either metacommits or normal commits, determine whether they
>>> are abandoned, and extract the content commit's object id from the
>>> metacommit.
>>> diff --git a/Makefile b/Makefile
>>> index cac3452edb9..b2bcc00c289 100644
>>> --- a/Makefile
>>> +++ b/Makefile
>>> @@ -999,6 +999,7 @@ LIB_OBJS += merge-ort.o
>>>    LIB_OBJS += merge-ort-wrappers.o
>>>    LIB_OBJS += merge-recursive.o
>>>    LIB_OBJS += merge.o
>>> +LIB_OBJS += metacommit-parser.o
>>
>> There seems to be a problem with the indent here
> 
> I'm not sure I follow, there's not indentation on that line?

For some reason LIB_OBJS on that line does not line up with the lines 
either side of it in my mailer, but looking at the patch on 
lore.kernel.org it seems fine so I think the problem was at my end.

>>> diff --git a/metacommit-parser.c b/metacommit-parser.c
>>> new file mode 100644
>>> index 00000000000..70c1428bfc6
>>> --- /dev/null
>>> +++ b/metacommit-parser.c
>>> @@ -0,0 +1,110 @@
>>> +#include "cache.h"
>>> +#include "metacommit-parser.h"
>>> +#include "commit.h"
>>> +
>>> +/*
>>> + * Search the commit buffer for a line starting with the given key. Unlike
>>> + * find_commit_header, this also searches the commit message body.
>>> + */
>>
>> There is no explanation in the code or commit message as to why this
>> function is needed. The documentation added in the first commit says
>> that "parent-type" header is a commit header. I think the answer is that
>> this series does not implement that header but uses the commit message
>> instead. That's perfectly fine for a proof of concept but it is
>> precisely the sort of detail that should be described it the commit
>> message and probably flagged up in the cover letter.
> 
> I admit I thought I thought this was part of the header because it
> shows up before
> the blank line before the commit title.

If I create a meta-commit and then run "git cat-file commit" on it I see

tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904
parent fd7e455287603d5bb2e3623dc442b592411cbfe9
parent d79ce1670bdcb76e6d1da2ae095e890ccb326ae9
author A U Thor <author@example.com> 1112912113 -0700
committer C O Mitter <committer@example.com> 1112912113 -0700

parent-type c r

i.e. the parent-type comes after the blank line that separates the 
headers from the message

> How do I make this a commit header?

I've left some comments on the patch that creates the meta-commits. 
Since I wrote the above, Junio has commented[1] that he prefers the 
commit message approach to adding a new header, so I'd leave the creation 
as it is for now and change find_key() just to look at the commit 
message. (I do prefer the idea of a new header as it provides an 
unambiguous way to distinguish meta-commits from normal commits, but 
let's see how using the commit message pans out.)

[1] https://lore.kernel.org/git/xmqqsfkbqjgz.fsf@gitster.g/

>>> +static const char *find_key(const char *msg, const char *key, size_t *out_len)
>>> +{
>>> +     int key_len = strlen(key);
>>> +     const char *line = msg;
>>> +
>>> +     while (line) {
>>> +             const char *eol = strchrnul(line, '\n');
>>> +
>>> +             if (eol - line > key_len && !memcmp(line, key, key_len) &&
>>> +                 line[key_len] == ' ') {
>>> +                     *out_len = eol - line - key_len - 1;
>>> +                     return line + key_len + 1;
>>> +             }
>>> +             line = *eol ? eol + 1 : NULL;
>>> +     }
>>> +     return NULL;
>>> +}
>>> +
>>> +static struct commit *get_commit_by_index(struct commit_list *to_search, int index)
>>> +{
>>> +     while (to_search && index) {
>>> +             to_search = to_search->next;
>>> +             index--;
>>> +     }
>>> +
>>> +     if (!to_search)
>>> +             return NULL;
>>> +
>>> +     return to_search->item;
>>> +}
>>
>> This function is a useful utility for struct commit_list and should live
>> in commit.c. It could be used to simplify object-name.c:get_parent() for
>> example.
> 
> Done.  I'll defer cleaning up get_parent to a potentially later change to avoid
> muddying up this change too much.

Sure, get_parent() was meant as an example of why the function is useful 
outside of this work. While you're very welcome to clean it up, please 
don't feel that you are obliged to.

>> I'll try and take at look at the next couple of patches later in the week.
> 
> Thank you for all the reviews!

You're welcome, I'm excited to see evolve getting some attention again.

Phillip


> -- simply chris
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index cac3452edb9..b2bcc00c289 100644
--- a/Makefile
+++ b/Makefile
@@ -999,6 +999,7 @@  LIB_OBJS += merge-ort.o
 LIB_OBJS += merge-ort-wrappers.o
 LIB_OBJS += merge-recursive.o
 LIB_OBJS += merge.o
+LIB_OBJS += metacommit-parser.o
 LIB_OBJS += midx.o
 LIB_OBJS += name-hash.o
 LIB_OBJS += negotiator/default.o
diff --git a/metacommit-parser.c b/metacommit-parser.c
new file mode 100644
index 00000000000..70c1428bfc6
--- /dev/null
+++ b/metacommit-parser.c
@@ -0,0 +1,110 @@ 
+#include "cache.h"
+#include "metacommit-parser.h"
+#include "commit.h"
+
+/*
+ * Search the commit buffer for a line starting with the given key. Unlike
+ * find_commit_header, this also searches the commit message body.
+ */
+static const char *find_key(const char *msg, const char *key, size_t *out_len)
+{
+	int key_len = strlen(key);
+	const char *line = msg;
+
+	while (line) {
+		const char *eol = strchrnul(line, '\n');
+
+		if (eol - line > key_len && !memcmp(line, key, key_len) &&
+		    line[key_len] == ' ') {
+			*out_len = eol - line - key_len - 1;
+			return line + key_len + 1;
+		}
+		line = *eol ? eol + 1 : NULL;
+	}
+	return NULL;
+}
+
+static struct commit *get_commit_by_index(struct commit_list *to_search, int index)
+{
+	while (to_search && index) {
+		to_search = to_search->next;
+		index--;
+	}
+
+	if (!to_search)
+		return NULL;
+
+	return to_search->item;
+}
+
+/*
+ * Writes the index of the content parent to "result". Returns the metacommit
+ * type. See the METACOMMIT_TYPE_* constants.
+ */
+static int index_of_content_commit(const char *buffer, int *result)
+{
+	int index = 0;
+	int ret = METACOMMIT_TYPE_NONE;
+	size_t parent_types_size;
+	const char *parent_types = find_key(buffer, "parent-type",
+		&parent_types_size);
+	const char *end;
+	const char *enum_start = parent_types;
+	int enum_length = 0;
+
+	if (!parent_types)
+		return METACOMMIT_TYPE_NONE;
+
+	end = &parent_types[parent_types_size];
+
+	while (1) {
+		char next = *parent_types;
+		if (next == ' ' || parent_types >= end) {
+			if (enum_length == 1) {
+				char first_char_in_enum = *enum_start;
+				if (first_char_in_enum == 'c') {
+					ret = METACOMMIT_TYPE_NORMAL;
+					break;
+				}
+				if (first_char_in_enum == 'a') {
+					ret = METACOMMIT_TYPE_ABANDONED;
+					break;
+				}
+			}
+			if (parent_types >= end)
+				return METACOMMIT_TYPE_NONE;
+			enum_start = parent_types + 1;
+			enum_length = 0;
+			index++;
+		} else {
+			enum_length++;
+		}
+		parent_types++;
+	}
+
+	*result = index;
+	return ret;
+}
+
+/*
+ * Writes the content parent's object id to "content".
+ * Returns the metacommit type. See the METACOMMIT_TYPE_* constants.
+ */
+int get_metacommit_content(struct commit *commit, struct object_id *content)
+{
+	const char *buffer = get_commit_buffer(commit, NULL);
+	int index = 0;
+	int ret = index_of_content_commit(buffer, &index);
+	struct commit *content_parent;
+
+	if (ret == METACOMMIT_TYPE_NONE)
+		return ret;
+
+	content_parent = get_commit_by_index(commit->parents, index);
+
+	if (!content_parent)
+		return METACOMMIT_TYPE_NONE;
+
+	oidcpy(content, &(content_parent->object.oid));
+	return ret;
+}
diff --git a/metacommit-parser.h b/metacommit-parser.h
new file mode 100644
index 00000000000..1c74bd6d699
--- /dev/null
+++ b/metacommit-parser.h
@@ -0,0 +1,19 @@ 
+#ifndef METACOMMIT_PARSER_H
+#define METACOMMIT_PARSER_H
+
+#include "commit.h"
+#include "hash.h"
+
+/* Indicates a normal commit (non-metacommit) */
+#define METACOMMIT_TYPE_NONE 0
+/* Indicates a metacommit with normal content (non-abandoned) */
+#define METACOMMIT_TYPE_NORMAL 1
+/* Indicates a metacommit with abandoned content */
+#define METACOMMIT_TYPE_ABANDONED 2
+
+struct commit;
+
+extern int get_metacommit_content(
+	struct commit *commit, struct object_id *content);
+
+#endif