diff mbox series

[v2,06/20] merge-ort: implement a very basic collect_merge_info()

Message ID 20201102204344.342633-7-newren@gmail.com (mailing list archive)
State New, archived
Headers show
Series fundamentals of merge-ort implementation | expand

Commit Message

Elijah Newren Nov. 2, 2020, 8:43 p.m. UTC
This does not actually collect any necessary info other than the
pathnames involved, since it just allocates an all-zero conflict_info
and stuffs that into paths.  However, it invokes the traverse_trees()
machinery to walk over all the paths and sets up the basic
infrastructure we need.

I have left out a few obvious optimizations to try to make this patch as
short and obvious as possible.  A subsequent patch will add some of
those back in with some more useful data fields before we introduce a
patch that actually sets up the conflict_info fields.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 merge-ort.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 119 insertions(+), 2 deletions(-)

Comments

Jonathan Tan Nov. 6, 2020, 10:19 p.m. UTC | #1
> diff --git a/merge-ort.c b/merge-ort.c
> index 537da9f6df..626eb9713e 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -77,13 +77,130 @@ static int err(struct merge_options *opt, const char *err, ...)
>  	return -1;
>  }
>  
> +static int collect_merge_info_callback(int n,
> +				       unsigned long mask,
> +				       unsigned long dirmask,
> +				       struct name_entry *names,
> +				       struct traverse_info *info)
> +{

[snip]

> +	unsigned mbase_null = !(mask & 1);
> +	unsigned side1_null = !(mask & 2);
> +	unsigned side2_null = !(mask & 4);

Should these be "int"?

> +	/*
> +	 * A bunch of sanity checks verifying that traverse_trees() calls
> +	 * us the way I expect.  Could just remove these at some point,
> +	 * though maybe they are helpful to future code readers.
> +	 */
> +	assert(mbase_null == is_null_oid(&names[0].oid));
> +	assert(side1_null == is_null_oid(&names[1].oid));
> +	assert(side2_null == is_null_oid(&names[2].oid));
> +	assert(!mbase_null || !side1_null || !side2_null);
> +	assert(mask > 0 && mask < 8);

These were helpful to me.

> +	/* Other invariant checks, mostly for documentation purposes. */
> +	assert(mask == (dirmask | filemask));

But not this - filemask was computed in this function, so I need not
look elsewhere to see that this is correct.

> +	/*
> +	 * TODO: record information about the path other than all zeros,
> +	 * so we can resolve later in process_entries.
> +	 */
> +	ci = xcalloc(1, sizeof(struct conflict_info));
> +	strmap_put(&opti->paths, fullpath, ci);

OK - so each entry is a full-size conflict_info to store all relevant
information. Presumably some of these will be converted later into what
is effectively a struct merged_info (so, the extra struct conflict_info
fields are unused but memory is still occupied).

I do see that in patch 10, there is an optimization that directly
allocates the smaller struct merged_info when it is known at this point
that there is no conflict.

[snip rest of function]

>  static int collect_merge_info(struct merge_options *opt,
>  			      struct tree *merge_base,
>  			      struct tree *side1,
>  			      struct tree *side2)
>  {
> -	/* TODO: Implement this using traverse_trees() */
> -	die("Not yet implemented.");
> +	int ret;
> +	struct tree_desc t[3];
> +	struct traverse_info info;
> +	char *toplevel_dir_placeholder = "";
> +
> +	opt->priv->current_dir_name = toplevel_dir_placeholder;
> +	setup_traverse_info(&info, toplevel_dir_placeholder);

I thought that this was written like this (instead of inlining the 2
double-quotes) to ensure that the string-equality-is-pointer-equality
characteristic holds, but I see that that characteristic is for
directory_name in struct merged_info, not current_dir_name in struct
merge_options_internal. Any reason for not inlining ""?

[snip rest of function]
Elijah Newren Nov. 6, 2020, 11:10 p.m. UTC | #2
On Fri, Nov 6, 2020 at 2:19 PM Jonathan Tan <jonathantanmy@google.com> wrote:
>
> > diff --git a/merge-ort.c b/merge-ort.c
> > index 537da9f6df..626eb9713e 100644
> > --- a/merge-ort.c
> > +++ b/merge-ort.c
> > @@ -77,13 +77,130 @@ static int err(struct merge_options *opt, const char *err, ...)
> >       return -1;
> >  }
> >
> > +static int collect_merge_info_callback(int n,
> > +                                    unsigned long mask,
> > +                                    unsigned long dirmask,
> > +                                    struct name_entry *names,
> > +                                    struct traverse_info *info)
> > +{
>
> [snip]
>
> > +     unsigned mbase_null = !(mask & 1);
> > +     unsigned side1_null = !(mask & 2);
> > +     unsigned side2_null = !(mask & 4);
>
> Should these be "int"?

Does the type matter, particularly since "boolean" isn't available?

> > +     /*
> > +      * A bunch of sanity checks verifying that traverse_trees() calls
> > +      * us the way I expect.  Could just remove these at some point,
> > +      * though maybe they are helpful to future code readers.
> > +      */
> > +     assert(mbase_null == is_null_oid(&names[0].oid));
> > +     assert(side1_null == is_null_oid(&names[1].oid));
> > +     assert(side2_null == is_null_oid(&names[2].oid));
> > +     assert(!mbase_null || !side1_null || !side2_null);
> > +     assert(mask > 0 && mask < 8);
>
> These were helpful to me.
>
> > +     /* Other invariant checks, mostly for documentation purposes. */
> > +     assert(mask == (dirmask | filemask));
>
> But not this - filemask was computed in this function, so I need not
> look elsewhere to see that this is correct.
>
> > +     /*
> > +      * TODO: record information about the path other than all zeros,
> > +      * so we can resolve later in process_entries.
> > +      */
> > +     ci = xcalloc(1, sizeof(struct conflict_info));
> > +     strmap_put(&opti->paths, fullpath, ci);
>
> OK - so each entry is a full-size conflict_info to store all relevant
> information. Presumably some of these will be converted later into what
> is effectively a struct merged_info (so, the extra struct conflict_info
> fields are unused but memory is still occupied).
>
> I do see that in patch 10, there is an optimization that directly
> allocates the smaller struct merged_info when it is known at this point
> that there is no conflict.

Yep.  :-)

> [snip rest of function]
>
> >  static int collect_merge_info(struct merge_options *opt,
> >                             struct tree *merge_base,
> >                             struct tree *side1,
> >                             struct tree *side2)
> >  {
> > -     /* TODO: Implement this using traverse_trees() */
> > -     die("Not yet implemented.");
> > +     int ret;
> > +     struct tree_desc t[3];
> > +     struct traverse_info info;
> > +     char *toplevel_dir_placeholder = "";
> > +
> > +     opt->priv->current_dir_name = toplevel_dir_placeholder;
> > +     setup_traverse_info(&info, toplevel_dir_placeholder);
>
> I thought that this was written like this (instead of inlining the 2
> double-quotes) to ensure that the string-equality-is-pointer-equality
> characteristic holds, but I see that that characteristic is for
> directory_name in struct merged_info, not current_dir_name in struct
> merge_options_internal. Any reason for not inlining ""?

You're really digging in; I love it.  From setup_path_info(), the
directory_name is set from the current_dir_name:
        path_info->merged.directory_name = current_dir_name;
(and if you follow where the current_dir_name parameter gets its value
from, you find that it came indirectly from
opt->priv->current_dir_name), so current_dir_name must meet all the
requirements on merge_info's directory_name field.

Perhaps there's still some kind of additional simplification possible
here, but directory rename detection is an area that has to take some
special care around this requirement.  I simplified the code a little
bit in this area as I was trying to break off a good first 20 patches
to submit, but even if we can simplify it more, the structure is just
going to come back later.
Jonathan Tan Nov. 9, 2020, 8:59 p.m. UTC | #3
> > > +     unsigned mbase_null = !(mask & 1);
> > > +     unsigned side1_null = !(mask & 2);
> > > +     unsigned side2_null = !(mask & 4);
> >
> > Should these be "int"?
> 
> Does the type matter, particularly since "boolean" isn't available?

It doesn't, which is why I would expect the most generic type - if I see
something else, I would be led to think that there was a specific reason
for choosing that. But if I'm in the minority, that's fine.

> > I thought that this was written like this (instead of inlining the 2
> > double-quotes) to ensure that the string-equality-is-pointer-equality
> > characteristic holds, but I see that that characteristic is for
> > directory_name in struct merged_info, not current_dir_name in struct
> > merge_options_internal. Any reason for not inlining ""?
> 
> You're really digging in; I love it.  From setup_path_info(), the
> directory_name is set from the current_dir_name:
>         path_info->merged.directory_name = current_dir_name;
> (and if you follow where the current_dir_name parameter gets its value
> from, you find that it came indirectly from
> opt->priv->current_dir_name), so current_dir_name must meet all the
> requirements on merge_info's directory_name field.
> 
> Perhaps there's still some kind of additional simplification possible
> here, but directory rename detection is an area that has to take some
> special care around this requirement.  I simplified the code a little
> bit in this area as I was trying to break off a good first 20 patches
> to submit, but even if we can simplify it more, the structure is just
> going to come back later.

Ah, I see.
Derrick Stolee Nov. 11, 2020, 2:38 p.m. UTC | #4
On 11/2/2020 3:43 PM, Elijah Newren wrote:
> +	/* +1 in both of the following lines to include the NUL byte */
> +	fullpath = xmalloc(len+1);
> +	make_traverse_path(fullpath, len+1, info, p->path, p->pathlen);

nit: s/len+1/len + 1/g

> +		void *buf[3] = {NULL,};

This "{NULL,}" seems odd to me. I suppose there is a reason why it
isn't "{ NULL, NULL, NULL }"?

> +		const char *original_dir_name;
> +		int i, ret;
> +
> +		ci->match_mask &= filemask;
> +		newinfo = *info;
> +		newinfo.prev = info;
> +		newinfo.name = p->path;
> +		newinfo.namelen = p->pathlen;
> +		newinfo.pathlen = st_add3(newinfo.pathlen, p->pathlen, 1);
> +
> +		for (i = 0; i < 3; i++, dirmask >>= 1) {

This multi-action iterator borders on "too clever". It seems like
placing "dirmask >>= 1;" or "dirmask = dirmask >> 1;" at the end
of the block would be equivalent and less jarring to a reader.

I was thinking it doesn't really matter, except that dirmask is not
in the initializer or sentinel of the for(), so having it here does
not immediately make sense.

(This has been too much writing for such an inconsequential line
of code. Sorry.)

> +			const struct object_id *oid = NULL;
> +			if (dirmask & 1)
> +				oid = &names[i].oid;
> +			buf[i] = fill_tree_descriptor(opt->repo, t + i, oid);
> +		}


>  static int collect_merge_info(struct merge_options *opt,
>  			      struct tree *merge_base,
>  			      struct tree *side1,
>  			      struct tree *side2)
>  {
> -	/* TODO: Implement this using traverse_trees() */
> -	die("Not yet implemented.");
> +	int ret;
> +	struct tree_desc t[3];
> +	struct traverse_info info;
> +	char *toplevel_dir_placeholder = "";

It seems like this should be "const char *"

> +	init_tree_desc(t+0, merge_base->buffer, merge_base->size);
> +	init_tree_desc(t+1, side1->buffer, side1->size);
> +	init_tree_desc(t+2, side2->buffer, side2->size);

More space issues: s/t+/t + /g

I'm only really able to engage in this at a surface level, it
seems, but maybe I'll have more to say as the implementation
grows.

Thanks,
-Stolee
Elijah Newren Nov. 11, 2020, 5:02 p.m. UTC | #5
On Wed, Nov 11, 2020 at 6:38 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 11/2/2020 3:43 PM, Elijah Newren wrote:
> > +     /* +1 in both of the following lines to include the NUL byte */
> > +     fullpath = xmalloc(len+1);
> > +     make_traverse_path(fullpath, len+1, info, p->path, p->pathlen);
>
> nit: s/len+1/len + 1/g
>
> > +             void *buf[3] = {NULL,};
>
> This "{NULL,}" seems odd to me. I suppose there is a reason why it
> isn't "{ NULL, NULL, NULL }"?

Probably because I was copying from unpack-trees.c, which deals with a
variable number of trees instead of always exactly 3.  But yeah, it'd
probably be more straightforward as { NULL, NULL, NULL }.

> > +             const char *original_dir_name;
> > +             int i, ret;
> > +
> > +             ci->match_mask &= filemask;
> > +             newinfo = *info;
> > +             newinfo.prev = info;
> > +             newinfo.name = p->path;
> > +             newinfo.namelen = p->pathlen;
> > +             newinfo.pathlen = st_add3(newinfo.pathlen, p->pathlen, 1);
> > +
> > +             for (i = 0; i < 3; i++, dirmask >>= 1) {
>
> This multi-action iterator borders on "too clever". It seems like
> placing "dirmask >>= 1;" or "dirmask = dirmask >> 1;" at the end
> of the block would be equivalent and less jarring to a reader.
>
> I was thinking it doesn't really matter, except that dirmask is not
> in the initializer or sentinel of the for(), so having it here does
> not immediately make sense.
>
> (This has been too much writing for such an inconsequential line
> of code. Sorry.)

Yeah, copied from unpack-trees.c:traverse_trees_recursive().  The
newinfo variable name and a bunch of the surrounding lines were copied
from there too.  I can switch it, though, if it makes it easier.

> > +                     const struct object_id *oid = NULL;
> > +                     if (dirmask & 1)
> > +                             oid = &names[i].oid;
> > +                     buf[i] = fill_tree_descriptor(opt->repo, t + i, oid);
> > +             }
>
>
> >  static int collect_merge_info(struct merge_options *opt,
> >                             struct tree *merge_base,
> >                             struct tree *side1,
> >                             struct tree *side2)
> >  {
> > -     /* TODO: Implement this using traverse_trees() */
> > -     die("Not yet implemented.");
> > +     int ret;
> > +     struct tree_desc t[3];
> > +     struct traverse_info info;
> > +     char *toplevel_dir_placeholder = "";
>
> It seems like this should be "const char *"
>
> > +     init_tree_desc(t+0, merge_base->buffer, merge_base->size);
> > +     init_tree_desc(t+1, side1->buffer, side1->size);
> > +     init_tree_desc(t+2, side2->buffer, side2->size);
>
> More space issues: s/t+/t + /g

In my defense:

$ git grep init_tree_desc.*t.*\+ | grep -v merge-ort
builtin/merge.c: init_tree_desc(t+i, trees[i]->buffer, trees[i]->size);
builtin/read-tree.c: init_tree_desc(t+i, tree->buffer, tree->size);
merge-recursive.c: init_tree_desc_from_tree(t+0, common);
merge-recursive.c: init_tree_desc_from_tree(t+1, head);
merge-recursive.c: init_tree_desc_from_tree(t+2, merge);
merge.c: init_tree_desc(t+i, trees[i]->buffer, trees[i]->size);

None of which blames to me.  :-)

I can fix it up, though...at least the merge-ort one.  Someone else
can go through existing code if they so desire.

> I'm only really able to engage in this at a surface level, it
> seems, but maybe I'll have more to say as the implementation
> grows.

It _might_ be helpful to compare to unpack-trees.c's unpack_callback()
and traverse_trees_recursive(), but there's so much unrelated stuff
there that it's possible that just gets in the way more than it helps.
Regardless, thanks for taking a look and spotting little fixes; every
bit helps.
diff mbox series

Patch

diff --git a/merge-ort.c b/merge-ort.c
index 537da9f6df..626eb9713e 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -77,13 +77,130 @@  static int err(struct merge_options *opt, const char *err, ...)
 	return -1;
 }
 
+static int collect_merge_info_callback(int n,
+				       unsigned long mask,
+				       unsigned long dirmask,
+				       struct name_entry *names,
+				       struct traverse_info *info)
+{
+	/*
+	 * n is 3.  Always.
+	 * common ancestor (mbase) has mask 1, and stored in index 0 of names
+	 * head of side 1  (side1) has mask 2, and stored in index 1 of names
+	 * head of side 2  (side2) has mask 4, and stored in index 2 of names
+	 */
+	struct merge_options *opt = info->data;
+	struct merge_options_internal *opti = opt->priv;
+	struct conflict_info *ci;
+	struct name_entry *p;
+	size_t len;
+	char *fullpath;
+	unsigned filemask = mask & ~dirmask;
+	unsigned mbase_null = !(mask & 1);
+	unsigned side1_null = !(mask & 2);
+	unsigned side2_null = !(mask & 4);
+
+	/* n = 3 is a fundamental assumption. */
+	if (n != 3)
+		BUG("Called collect_merge_info_callback wrong");
+
+	/*
+	 * A bunch of sanity checks verifying that traverse_trees() calls
+	 * us the way I expect.  Could just remove these at some point,
+	 * though maybe they are helpful to future code readers.
+	 */
+	assert(mbase_null == is_null_oid(&names[0].oid));
+	assert(side1_null == is_null_oid(&names[1].oid));
+	assert(side2_null == is_null_oid(&names[2].oid));
+	assert(!mbase_null || !side1_null || !side2_null);
+	assert(mask > 0 && mask < 8);
+
+	/* Other invariant checks, mostly for documentation purposes. */
+	assert(mask == (dirmask | filemask));
+
+	/*
+	 * Get the name of the relevant filepath, which we'll pass to
+	 * setup_path_info() for tracking.
+	 */
+	p = names;
+	while (!p->mode)
+		p++;
+	len = traverse_path_len(info, p->pathlen);
+
+	/* +1 in both of the following lines to include the NUL byte */
+	fullpath = xmalloc(len+1);
+	make_traverse_path(fullpath, len+1, info, p->path, p->pathlen);
+
+	/*
+	 * TODO: record information about the path other than all zeros,
+	 * so we can resolve later in process_entries.
+	 */
+	ci = xcalloc(1, sizeof(struct conflict_info));
+	strmap_put(&opti->paths, fullpath, ci);
+
+	/* If dirmask, recurse into subdirectories */
+	if (dirmask) {
+		struct traverse_info newinfo;
+		struct tree_desc t[3];
+		void *buf[3] = {NULL,};
+		const char *original_dir_name;
+		int i, ret;
+
+		ci->match_mask &= filemask;
+		newinfo = *info;
+		newinfo.prev = info;
+		newinfo.name = p->path;
+		newinfo.namelen = p->pathlen;
+		newinfo.pathlen = st_add3(newinfo.pathlen, p->pathlen, 1);
+
+		for (i = 0; i < 3; i++, dirmask >>= 1) {
+			const struct object_id *oid = NULL;
+			if (dirmask & 1)
+				oid = &names[i].oid;
+			buf[i] = fill_tree_descriptor(opt->repo, t + i, oid);
+		}
+
+		original_dir_name = opti->current_dir_name;
+		opti->current_dir_name = fullpath;
+		ret = traverse_trees(NULL, 3, t, &newinfo);
+		opti->current_dir_name = original_dir_name;
+
+		for (i = 0; i < 3; i++)
+			free(buf[i]);
+
+		if (ret < 0)
+			return -1;
+	}
+
+	return mask;
+}
+
 static int collect_merge_info(struct merge_options *opt,
 			      struct tree *merge_base,
 			      struct tree *side1,
 			      struct tree *side2)
 {
-	/* TODO: Implement this using traverse_trees() */
-	die("Not yet implemented.");
+	int ret;
+	struct tree_desc t[3];
+	struct traverse_info info;
+	char *toplevel_dir_placeholder = "";
+
+	opt->priv->current_dir_name = toplevel_dir_placeholder;
+	setup_traverse_info(&info, toplevel_dir_placeholder);
+	info.fn = collect_merge_info_callback;
+	info.data = opt;
+	info.show_all_errors = 1;
+
+	parse_tree(merge_base);
+	parse_tree(side1);
+	parse_tree(side2);
+	init_tree_desc(t+0, merge_base->buffer, merge_base->size);
+	init_tree_desc(t+1, side1->buffer, side1->size);
+	init_tree_desc(t+2, side2->buffer, side2->size);
+
+	ret = traverse_trees(NULL, 3, t, &info);
+
+	return ret;
 }
 
 static int detect_and_process_renames(struct merge_options *opt,