diff mbox series

[02/11] merge-ort: add initial outline for basic rename detection

Message ID b9e0e1a60b92a6a220193bf9fe80796df91126a7.1607542887.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series merge-ort: add basic rename detection | expand

Commit Message

Elijah Newren Dec. 9, 2020, 7:41 p.m. UTC
From: Elijah Newren <newren@gmail.com>

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 merge-ort.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 8 deletions(-)

Comments

Derrick Stolee Dec. 11, 2020, 2:39 a.m. UTC | #1
On 12/9/2020 2:41 PM, Elijah Newren via GitGitGadget wrote:
> From: Elijah Newren <newren@gmail.com>
> 
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
>  merge-ort.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 60 insertions(+), 8 deletions(-)
> 
> diff --git a/merge-ort.c b/merge-ort.c
> index 90baedac407..92b765dd3f0 100644
> --- a/merge-ort.c
> +++ b/merge-ort.c
> @@ -617,20 +617,72 @@ static int handle_content_merge(struct merge_options *opt,
>  
>  /*** Function Grouping: functions related to regular rename detection ***/
>  
> +static int process_renames(struct merge_options *opt,
> +			   struct diff_queue_struct *renames)
> +static int compare_pairs(const void *a_, const void *b_)
> +/* Call diffcore_rename() to compute which files have changed on given side */
> +static void detect_regular_renames(struct merge_options *opt,
> +				   struct tree *merge_base,
> +				   struct tree *side,
> +				   unsigned side_index)
> +static int collect_renames(struct merge_options *opt,
> +			   struct diff_queue_struct *result,
> +			   unsigned side_index)

standard "I promise this will follow soon!" strategy, OK.

>  static int detect_and_process_renames(struct merge_options *opt,
>  				      struct tree *merge_base,
>  				      struct tree *side1,
>  				      struct tree *side2)
>  {
> -	int clean = 1;
> +	struct diff_queue_struct combined;
> +	struct rename_info *renames = opt->priv->renames;

(Re: my concerns that we don't need 'renames' to be a pointer,
this could easily be "renames = &opt->priv.renames;")

> +	int s, clean = 1;
> +
> +	memset(&combined, 0, sizeof(combined));
> +
> +	detect_regular_renames(opt, merge_base, side1, 1);
> +	detect_regular_renames(opt, merge_base, side2, 2);

Find the renames in each side's diff.

I think the use of "1" and "2" here might be better situated
for an enum. Perhaps:

enum merge_side {
	MERGE_SIDE1 = 0,
	MERGE_SIDE2 = 1,
};

(Note, I shift these values to 0 and 1, respectively, allowing
us to truncate the pairs array to two entries while still
being mentally clear.)

> +
> +	ALLOC_GROW(combined.queue,
> +		   renames->pairs[1].nr + renames->pairs[2].nr,
> +		   combined.alloc);
> +	clean &= collect_renames(opt, &combined, 1);
> +	clean &= collect_renames(opt, &combined, 2);

Magic numbers again.

> +	QSORT(combined.queue, combined.nr, compare_pairs);
> +
> +	clean &= process_renames(opt, &combined);

I need to mentally remember that "clean" is a return state,
but _not_ a fail/success result. Even though we are using
"&=" here, it shouldn't be "&&=" or even "if (method()) return 1;"

Looking at how "clean" is used in struct merge_result, I
wonder if there is a reason to use an "int" over a simple
"unsigned" or even "unsigned clean:1;" You use -1 in places
as well as a case of "mi->clean = !!resolved;"

If there is more meaning to values other than "clean" or
"!clean", then an enum might be valuable.

> +	/* Free memory for renames->pairs[] and combined */
> +	for (s = 1; s <= 2; s++) {
> +		free(renames->pairs[s].queue);
> +		DIFF_QUEUE_CLEAR(&renames->pairs[s]);
> +	}

This loop is particularly unusual. Perhaps it would be
better to do this instead:

	free(renames->pairs[MERGE_SIDE1].queue);
	free(renames->pairs[MERGE_SIDE2].queue);
	DIFF_QUEUE_CLEAR(&renames->pairs[MERGE_SIDE1]);
	DIFF_QUEUE_CLEAR(&renames->pairs[MERGE_SIDE2]);

> +	if (combined.nr) {
> +		int i;
> +		for (i = 0; i < combined.nr; i++)
> +			diff_free_filepair(combined.queue[i]);
> +		free(combined.queue);
> +	}
>  
> -	/*
> -	 * Rename detection works by detecting file similarity.  Here we use
> -	 * a really easy-to-implement scheme: files are similar IFF they have
> -	 * the same filename.  Therefore, by this scheme, there are no renames.
> -	 *
> -	 * TODO: Actually implement a real rename detection scheme.
> -	 */
>  	return clean;

I notice that this change causes detect_and_process_renames() to
change from an "unhelpful result, but success" to "die() always".

I wonder if there is value in swapping the order of the patches
to implement the static methods first. Of course, you hit the
"unreferenced static method" problem, so maybe your strategy is
better after all.

Thanks,
-Stolee
Elijah Newren Dec. 11, 2020, 9:40 a.m. UTC | #2
On Thu, Dec 10, 2020 at 6:39 PM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 12/9/2020 2:41 PM, Elijah Newren via GitGitGadget wrote:
> > From: Elijah Newren <newren@gmail.com>
> >
> > Signed-off-by: Elijah Newren <newren@gmail.com>
> > ---
> >  merge-ort.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 60 insertions(+), 8 deletions(-)
> >
> > diff --git a/merge-ort.c b/merge-ort.c
> > index 90baedac407..92b765dd3f0 100644
> > --- a/merge-ort.c
> > +++ b/merge-ort.c
> > @@ -617,20 +617,72 @@ static int handle_content_merge(struct merge_options *opt,
> >
> >  /*** Function Grouping: functions related to regular rename detection ***/
> >
> > +static int process_renames(struct merge_options *opt,
> > +                        struct diff_queue_struct *renames)
> > +static int compare_pairs(const void *a_, const void *b_)
> > +/* Call diffcore_rename() to compute which files have changed on given side */
> > +static void detect_regular_renames(struct merge_options *opt,
> > +                                struct tree *merge_base,
> > +                                struct tree *side,
> > +                                unsigned side_index)
> > +static int collect_renames(struct merge_options *opt,
> > +                        struct diff_queue_struct *result,
> > +                        unsigned side_index)
>
> standard "I promise this will follow soon!" strategy, OK.
>
> >  static int detect_and_process_renames(struct merge_options *opt,
> >                                     struct tree *merge_base,
> >                                     struct tree *side1,
> >                                     struct tree *side2)
> >  {
> > -     int clean = 1;
> > +     struct diff_queue_struct combined;
> > +     struct rename_info *renames = opt->priv->renames;
>
> (Re: my concerns that we don't need 'renames' to be a pointer,
> this could easily be "renames = &opt->priv.renames;")

Yeah, there'll be a lot of these...

>
> > +     int s, clean = 1;
> > +
> > +     memset(&combined, 0, sizeof(combined));
> > +
> > +     detect_regular_renames(opt, merge_base, side1, 1);
> > +     detect_regular_renames(opt, merge_base, side2, 2);
>
> Find the renames in each side's diff.
>
> I think the use of "1" and "2" here might be better situated
> for an enum. Perhaps:

I can see where you're coming from...but that's a monumentally huge
shift.  What about all my "loops over the sides"?  Sure, the ones that
are only two lines long could be just turned into code without a loop,
but when the loop is 140 lines long, that doesn't make much sense.
Loop over enum ranges?

Are all the variables that track an index still okay?    Do I need to
also introduce an enum for all the filemask/dirmask/match_mask/etc.
variables:

enum merge_mask {
     MERGE_JUST_BASE = 1,
     MERGE_JUST_SIDE1 = 2,
     MERGE_BASE_AND_SIDE1 = 3,
     MERGE_JUST_SIDE2 = 4,
     MERGE_BASE_AND_SIDE2 = 5,
     MERGE_SIDE1_AND_SIDE2 = 6,
     MERGE_BASE_AND_SIDE1_AND_SIDE2 = 7
}

?  That seems like a pretty big pain to use.

Also, what about the code that uses a side index to get the other side
index?  Or my conversions from side indices to masks (or vice versa)?
I tend to put comments by these, but there's a _lot_ of them.


I suspect this one would take a week to change, and I'd miss several
locations, and....some cases would certainly look cleaner but I
suspect some would be far uglier and end up being unchanged and then
leave us with a mess of trying to understand both.

What if I added a big comment near the top of the file that we've got
dozens of variables that are arrays of size 3 which are meant to be
indexed by the "side" of the three-way merge that it is tracking
information for:
    0: merge_base
    1: side1
    2: side2
(though several of the variables might have index 0 unused since it
doesn't track anything specifically for the merge base), and further
that masks are used in certain variables which try to track which
sides are present or match, with 2<<SIDE being the bit to track that a
given side is present/relevant.

I mean, this stuff is all over the place throughout the 4500 line
merge-ort.c file.  And it is in lots of diffcore-rename.c (which will
grow by about 1K lines as well).

> enum merge_side {
>         MERGE_SIDE1 = 0,
>         MERGE_SIDE2 = 1,
> };
>
> (Note, I shift these values to 0 and 1, respectively, allowing
> us to truncate the pairs array to two entries while still
> being mentally clear.)

As mentioned in the previous patch, the shift of the indices would
cause me at least a large amount of mental confusion and I suspect it
would for others too.  Both conflict_info.stages[] and
conflict_info.pathnames[] are meant to be indexed by the side of the
merge (or the base), but using conflict_info.stages[MERGE_SIDE1] or
conflict_info.pathnames[MERGE_SIDE2] as you have them defined here
would provide the wrong answer.  Since there's a conflict_info per
file and it is used all over the code, this would just be ripe for
off-by-one errors.

Since both conflict_info.stages[] and renames->pairs[] are meant to be
indexed by the merge side, this kind of conflict is inevitable.  The
only clean solution is making both be arrays of size three, and just
skipping index 0 in the variables that don't need to track something
for the merge_base.

> > +
> > +     ALLOC_GROW(combined.queue,
> > +                renames->pairs[1].nr + renames->pairs[2].nr,
> > +                combined.alloc);
> > +     clean &= collect_renames(opt, &combined, 1);
> > +     clean &= collect_renames(opt, &combined, 2);
>
> Magic numbers again.
>
> > +     QSORT(combined.queue, combined.nr, compare_pairs);
> > +
> > +     clean &= process_renames(opt, &combined);
>
> I need to mentally remember that "clean" is a return state,
> but _not_ a fail/success result. Even though we are using
> "&=" here, it shouldn't be "&&=" or even "if (method()) return 1;"
>
> Looking at how "clean" is used in struct merge_result, I
> wonder if there is a reason to use an "int" over a simple
> "unsigned" or even "unsigned clean:1;" You use -1 in places
> as well as a case of "mi->clean = !!resolved;"

Heh, when I used an unsigned for a boolean, Jonathan Tan asked why I
didn't use an int.  When I use an int for a boolean, you ask why I
don't use an unsigned.  I think my stock answer should just be that
the other reviewer suggested it.  ;-)

> If there is more meaning to values other than "clean" or
> "!clean", then an enum might be valuable.

Yeah, this came from unpack-trees.c and merge-recursive.c, where the
value is usually 1 (clean) or 0 (not clean), but the special value of
-1 signals something went wrong enough that we need to stop further
processing and return up the call chain for any necessary cleanup
(e.g. removal of lock files).  The value of -1 is only used for things
like "disk-is-full, can't write any more files to the working
directory", or "failed to read one of the trees involved in the merge
from the git object store".

-1, though, is the return value from unpack_trees(), traverse_trees(),
and perhaps other places in the code, so I'd be worried about
attempting to use a different special value for fear that I'd miss
converting the return value I got from one of those to the new special
value.

merge-ort has far fewer locations where -1 appears (in part because
the checkout() code is an external function rather than being
sprinkled everywhere), and it tends to cause the code to return
immediately, so most all call sites can assume a simple boolean value
of either 0 or 1.

> > +     /* Free memory for renames->pairs[] and combined */
> > +     for (s = 1; s <= 2; s++) {
> > +             free(renames->pairs[s].queue);
> > +             DIFF_QUEUE_CLEAR(&renames->pairs[s]);
> > +     }
>
> This loop is particularly unusual. Perhaps it would be
> better to do this instead:
>
>         free(renames->pairs[MERGE_SIDE1].queue);
>         free(renames->pairs[MERGE_SIDE2].queue);
>         DIFF_QUEUE_CLEAR(&renames->pairs[MERGE_SIDE1]);
>         DIFF_QUEUE_CLEAR(&renames->pairs[MERGE_SIDE2]);

If this were the only one, then I'd absolutely agree.  There's 7 such
loops in the version of merge-ort.c in the 'ort' branch.  I can't get
rid of all of them, because even though some are short, some of those
are very long for-loops.  (The long ones use "side" for the loop
counter instead of "s" -- maybe I should use "side" even on the short
ones?)

There's also another 6 that loop over the sides including the
merge-base (thus including index 0).  If those count, it's closer to
13.

> > +     if (combined.nr) {
> > +             int i;
> > +             for (i = 0; i < combined.nr; i++)
> > +                     diff_free_filepair(combined.queue[i]);
> > +             free(combined.queue);
> > +     }
> >
> > -     /*
> > -      * Rename detection works by detecting file similarity.  Here we use
> > -      * a really easy-to-implement scheme: files are similar IFF they have
> > -      * the same filename.  Therefore, by this scheme, there are no renames.
> > -      *
> > -      * TODO: Actually implement a real rename detection scheme.
> > -      */
> >       return clean;
>
> I notice that this change causes detect_and_process_renames() to
> change from an "unhelpful result, but success" to "die() always".
>
> I wonder if there is value in swapping the order of the patches
> to implement the static methods first. Of course, you hit the
> "unreferenced static method" problem, so maybe your strategy is
> better after all.

I used to do that kind of thing, but the unreferenced static method
problem is really annoying.  It means the code doesn't even compile,
which is a bad state to submit patches in.  I can work around that by
adding "(void)unused_funcname;" expressions somewhere in the code, but
reviewers tend to be even more surprised by those.
Elijah Newren Dec. 13, 2020, 7:47 a.m. UTC | #3
Hi,

Sorry for two different email responses to the same email...

Addressing the comments on this patchset mean re-submitting
en/merge-ort-impl, and causing conflicts in en/merge-ort-2 and this
series en/merge-ort-3.  Since gitgitgadget will not allow me to submit
patches against a series that isn't published by Junio, I'll need to
ask Junio to temporarily drop both of these series, then later
resubmit en/merge-ort-2 after he publishes my updates to
en/merge-ort-impl.  Then when he publishes my updates to
en/merge-ort-2, I'll be able to submit my already-rebased patches for
en/merge-ort-3.

A couple extra comments below...

On Thu, Dec 10, 2020 at 6:39 PM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 12/9/2020 2:41 PM, Elijah Newren via GitGitGadget wrote:
> > From: Elijah Newren <newren@gmail.com>
> >
> > Signed-off-by: Elijah Newren <newren@gmail.com>
> > ---
> >  merge-ort.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 60 insertions(+), 8 deletions(-)
> >
> > diff --git a/merge-ort.c b/merge-ort.c
> > index 90baedac407..92b765dd3f0 100644
> > --- a/merge-ort.c
> > +++ b/merge-ort.c
> > @@ -617,20 +617,72 @@ static int handle_content_merge(struct merge_options *opt,
> >
> >  /*** Function Grouping: functions related to regular rename detection ***/
> >
> > +static int process_renames(struct merge_options *opt,
> > +                        struct diff_queue_struct *renames)
> > +static int compare_pairs(const void *a_, const void *b_)
> > +/* Call diffcore_rename() to compute which files have changed on given side */
> > +static void detect_regular_renames(struct merge_options *opt,
> > +                                struct tree *merge_base,
> > +                                struct tree *side,
> > +                                unsigned side_index)
> > +static int collect_renames(struct merge_options *opt,
> > +                        struct diff_queue_struct *result,
> > +                        unsigned side_index)
>
> standard "I promise this will follow soon!" strategy, OK.
>
> >  static int detect_and_process_renames(struct merge_options *opt,
> >                                     struct tree *merge_base,
> >                                     struct tree *side1,
> >                                     struct tree *side2)
> >  {
> > -     int clean = 1;
> > +     struct diff_queue_struct combined;
> > +     struct rename_info *renames = opt->priv->renames;
>
> (Re: my concerns that we don't need 'renames' to be a pointer,
> this could easily be "renames = &opt->priv.renames;")
>
> > +     int s, clean = 1;
> > +
> > +     memset(&combined, 0, sizeof(combined));
> > +
> > +     detect_regular_renames(opt, merge_base, side1, 1);
> > +     detect_regular_renames(opt, merge_base, side2, 2);
>
> Find the renames in each side's diff.
>
> I think the use of "1" and "2" here might be better situated
> for an enum. Perhaps:
>
> enum merge_side {
>         MERGE_SIDE1 = 0,
>         MERGE_SIDE2 = 1,
> };
>
> (Note, I shift these values to 0 and 1, respectively, allowing
> us to truncate the pairs array to two entries while still
> being mentally clear.)

So, after mulling it over for a while, I created a

enum merge_side {
    MERGE_BASE = 0,
    MERGE_SIDE1 = 1,
    MERGE_SIDE2 = 2
};

and I made use of it in several places.  I just avoided going to an
extreme with it (e.g. adding another enum for masks or changing all
possibly relevant variables from ints to enum merge_side), and used it
more as a document-when-values-are-meant-to-refer-to-sides-of-the-merge
kind of thing.  Of course, this affects two previous patchsets and not
just this one, so I'll have to post a _lot_ of new patches...   :-)

> > +
> > +     ALLOC_GROW(combined.queue,
> > +                renames->pairs[1].nr + renames->pairs[2].nr,
> > +                combined.alloc);
> > +     clean &= collect_renames(opt, &combined, 1);
> > +     clean &= collect_renames(opt, &combined, 2);
>
> Magic numbers again.
>
> > +     QSORT(combined.queue, combined.nr, compare_pairs);
> > +
> > +     clean &= process_renames(opt, &combined);
>
> I need to mentally remember that "clean" is a return state,
> but _not_ a fail/success result. Even though we are using
> "&=" here, it shouldn't be "&&=" or even "if (method()) return 1;"
>
> Looking at how "clean" is used in struct merge_result, I
> wonder if there is a reason to use an "int" over a simple
> "unsigned" or even "unsigned clean:1;" You use -1 in places
> as well as a case of "mi->clean = !!resolved;"

Something I missed in my reply yesterday...

Note that mi->clean is NOT from struct merge_result.  It is from
struct merged_info, and in that struct it IS defined as "unsigned
clean:1", i.e. it is a true boolean.  The merged_info.clean field is
used to determine whether a specific path merged cleanly.

"clean" from struct merge_result is whether the entirety of the merge
was clean or not.  It's almost a boolean, but allows for a
"catastrophic problem encountered" value.  I added the following
comment:
/*
* Whether the merge is clean; possible values:
*    1: clean
*    0: not clean (merge conflicts)
*   <0: operation aborted prematurely.  (object database
*       unreadable, disk full, etc.)  Worktree may be left in an
*       inconsistent state if operation failed near the end.
*/

This also means that I either abort and return a negative value, or I
can continue treating merge_result's "clean" field as a boolean.

But again, this isn't new to this patchset; it affects the patchset
before the patchset before this one.

> If there is more meaning to values other than "clean" or
> "!clean", then an enum might be valuable.
>
> > +     /* Free memory for renames->pairs[] and combined */
> > +     for (s = 1; s <= 2; s++) {
> > +             free(renames->pairs[s].queue);
> > +             DIFF_QUEUE_CLEAR(&renames->pairs[s]);
> > +     }
>
> This loop is particularly unusual. Perhaps it would be
> better to do this instead:
>
>         free(renames->pairs[MERGE_SIDE1].queue);
>         free(renames->pairs[MERGE_SIDE2].queue);
>         DIFF_QUEUE_CLEAR(&renames->pairs[MERGE_SIDE1]);
>         DIFF_QUEUE_CLEAR(&renames->pairs[MERGE_SIDE2]);
>
> > +     if (combined.nr) {
> > +             int i;
> > +             for (i = 0; i < combined.nr; i++)
> > +                     diff_free_filepair(combined.queue[i]);
> > +             free(combined.queue);
> > +     }
> >
> > -     /*
> > -      * Rename detection works by detecting file similarity.  Here we use
> > -      * a really easy-to-implement scheme: files are similar IFF they have
> > -      * the same filename.  Therefore, by this scheme, there are no renames.
> > -      *
> > -      * TODO: Actually implement a real rename detection scheme.
> > -      */
> >       return clean;
>
> I notice that this change causes detect_and_process_renames() to
> change from an "unhelpful result, but success" to "die() always".
>
> I wonder if there is value in swapping the order of the patches
> to implement the static methods first. Of course, you hit the
> "unreferenced static method" problem, so maybe your strategy is
> better after all.
>
> Thanks,
> -Stolee
Derrick Stolee Dec. 14, 2020, 2:33 p.m. UTC | #4
On 12/13/2020 2:47 AM, Elijah Newren wrote:
> Hi,
> 
> Sorry for two different email responses to the same email...
> 
> Addressing the comments on this patchset mean re-submitting
> en/merge-ort-impl, and causing conflicts in en/merge-ort-2 and this
> series en/merge-ort-3.  Since gitgitgadget will not allow me to submit
> patches against a series that isn't published by Junio, I'll need to
> ask Junio to temporarily drop both of these series, then later
> resubmit en/merge-ort-2 after he publishes my updates to
> en/merge-ort-impl.  Then when he publishes my updates to
> en/merge-ort-2, I'll be able to submit my already-rebased patches for
> en/merge-ort-3.

Let's chat privately about perhaps creatin
 
> A couple extra comments below...


>>> +     int s, clean = 1;
>>> +
>>> +     memset(&combined, 0, sizeof(combined));
>>> +
>>> +     detect_regular_renames(opt, merge_base, side1, 1);
>>> +     detect_regular_renames(opt, merge_base, side2, 2);
>>
>> Find the renames in each side's diff.
>>
>> I think the use of "1" and "2" here might be better situated
>> for an enum. Perhaps:
>>
>> enum merge_side {
>>         MERGE_SIDE1 = 0,
>>         MERGE_SIDE2 = 1,
>> };
>>
>> (Note, I shift these values to 0 and 1, respectively, allowing
>> us to truncate the pairs array to two entries while still
>> being mentally clear.)
> 
> So, after mulling it over for a while, I created a
> 
> enum merge_side {
>     MERGE_BASE = 0,
>     MERGE_SIDE1 = 1,
>     MERGE_SIDE2 = 2
> };
> 
> and I made use of it in several places.  I just avoided going to an
> extreme with it (e.g. adding another enum for masks or changing all
> possibly relevant variables from ints to enum merge_side), and used it
> more as a document-when-values-are-meant-to-refer-to-sides-of-the-merge
> kind of thing.  Of course, this affects two previous patchsets and not
> just this one, so I'll have to post a _lot_ of new patches...   :-)

I appreciate using names for the meaning behind a numerical constant.
You mentioned in the other thread that this will eventually expand to
a list of 10 entries, which is particularly frightening if we don't
get some control over it now.

I generally prefer using types to convey meaning as well, but I'm
willing to relax on this because I believe C won't complain if you
pass a literal int into an enum-typed parameter, so the compiler
doesn't help enough in that sense.

> Something I missed in my reply yesterday...
> 
> Note that mi->clean is NOT from struct merge_result.  It is from
> struct merged_info, and in that struct it IS defined as "unsigned
> clean:1", i.e. it is a true boolean.  The merged_info.clean field is
> used to determine whether a specific path merged cleanly.
> 
> "clean" from struct merge_result is whether the entirety of the merge
> was clean or not.  It's almost a boolean, but allows for a
> "catastrophic problem encountered" value.  I added the following
> comment:
> /*
> * Whether the merge is clean; possible values:
> *    1: clean
> *    0: not clean (merge conflicts)
> *   <0: operation aborted prematurely.  (object database
> *       unreadable, disk full, etc.)  Worktree may be left in an
> *       inconsistent state if operation failed near the end.
> */
> 
> This also means that I either abort and return a negative value, or I
> can continue treating merge_result's "clean" field as a boolean.

Having this comment helps a lot!
 
> But again, this isn't new to this patchset; it affects the patchset
> before the patchset before this one.

Right, when I had the current change checked out, I don't see the
patch that introduced the 'clean' member (though, I _could_ have
blamed to find out). Instead, I just got confused and thought it
worth a question. Your comment prevents this question in the future.

Thanks,
-Stolee
Johannes Schindelin Dec. 14, 2020, 3:42 p.m. UTC | #5
Hi Elijah & Stolee,

On Mon, 14 Dec 2020, Derrick Stolee wrote:

> On 12/13/2020 2:47 AM, Elijah Newren wrote:
> >
> > Sorry for two different email responses to the same email...
> >
> > Addressing the comments on this patchset mean re-submitting
> > en/merge-ort-impl, and causing conflicts in en/merge-ort-2 and this
> > series en/merge-ort-3.  Since gitgitgadget will not allow me to submit
> > patches against a series that isn't published by Junio, I'll need to
> > ask Junio to temporarily drop both of these series, then later
> > resubmit en/merge-ort-2 after he publishes my updates to
> > en/merge-ort-impl.  Then when he publishes my updates to
> > en/merge-ort-2, I'll be able to submit my already-rebased patches for
> > en/merge-ort-3.
>
> Let's chat privately about perhaps creatin

Yes, I am totally willing to push up temporary branches if that helps you,
or even giving you push permissions to do that.

Ciao,
Dscho
Elijah Newren Dec. 14, 2020, 4:11 p.m. UTC | #6
Hi,

On Mon, Dec 14, 2020 at 7:42 AM Johannes Schindelin
<Johannes.Schindelin@gmx.de> wrote:
>
> Hi Elijah & Stolee,
>
> On Mon, 14 Dec 2020, Derrick Stolee wrote:
>
> > On 12/13/2020 2:47 AM, Elijah Newren wrote:
> > >
> > > Sorry for two different email responses to the same email...
> > >
> > > Addressing the comments on this patchset mean re-submitting
> > > en/merge-ort-impl, and causing conflicts in en/merge-ort-2 and this
> > > series en/merge-ort-3.  Since gitgitgadget will not allow me to submit
> > > patches against a series that isn't published by Junio, I'll need to
> > > ask Junio to temporarily drop both of these series, then later
> > > resubmit en/merge-ort-2 after he publishes my updates to
> > > en/merge-ort-impl.  Then when he publishes my updates to
> > > en/merge-ort-2, I'll be able to submit my already-rebased patches for
> > > en/merge-ort-3.
> >
> > Let's chat privately about perhaps creatin
>
> Yes, I am totally willing to push up temporary branches if that helps you,
> or even giving you push permissions to do that.
>
> Ciao,
> Dscho

Given the amount of changes left to push up, I suspect there'll be
more cases where it'd be useful.  If I could get push permissions, and
a suggested namespace to use for such temporary branches, that'd help.

In this particular case, though, one of my two fears was already
realized -- Junio jumped in and did the work of rebasing and conflict
resolving for en/merge-ort-2 and en/merge-ort-3.  I didn't want to
burden him with that extra work, but what he pushed up for
en/merge-ort-2 is identical to what I have.  So, all I have to do is
push en/merge-ort-3 with the extra changes I have in it.  So this
particular time is taken care of.

Thanks!
Johannes Schindelin Dec. 14, 2020, 4:50 p.m. UTC | #7
Hi Elijah,

On Mon, 14 Dec 2020, Elijah Newren wrote:

> On Mon, Dec 14, 2020 at 7:42 AM Johannes Schindelin
> <Johannes.Schindelin@gmx.de> wrote:
> >
> > Hi Elijah & Stolee,
> >
> > On Mon, 14 Dec 2020, Derrick Stolee wrote:
> >
> > > On 12/13/2020 2:47 AM, Elijah Newren wrote:
> > > >
> > > > Sorry for two different email responses to the same email...
> > > >
> > > > Addressing the comments on this patchset mean re-submitting
> > > > en/merge-ort-impl, and causing conflicts in en/merge-ort-2 and this
> > > > series en/merge-ort-3.  Since gitgitgadget will not allow me to submit
> > > > patches against a series that isn't published by Junio, I'll need to
> > > > ask Junio to temporarily drop both of these series, then later
> > > > resubmit en/merge-ort-2 after he publishes my updates to
> > > > en/merge-ort-impl.  Then when he publishes my updates to
> > > > en/merge-ort-2, I'll be able to submit my already-rebased patches for
> > > > en/merge-ort-3.
> > >
> > > Let's chat privately about perhaps creatin
> >
> > Yes, I am totally willing to push up temporary branches if that helps you,
> > or even giving you push permissions to do that.
> >
> > Ciao,
> > Dscho
>
> Given the amount of changes left to push up, I suspect there'll be
> more cases where it'd be useful.  If I could get push permissions, and
> a suggested namespace to use for such temporary branches, that'd help.

I invited you into the GitGitGadget organization.

Recently, I pushed up `hanwen/libreftable` to have a proper base branch
for myself, but I think that was probably not a good idea. I should have
opened a namespace like `temp/` or some such.

> In this particular case, though, one of my two fears was already
> realized -- Junio jumped in and did the work of rebasing and conflict
> resolving for en/merge-ort-2 and en/merge-ort-3.  I didn't want to
> burden him with that extra work, but what he pushed up for
> en/merge-ort-2 is identical to what I have.  So, all I have to do is
> push en/merge-ort-3 with the extra changes I have in it.  So this
> particular time is taken care of.

Understood.

Thanks,
Dscho
Elijah Newren Dec. 14, 2020, 5:35 p.m. UTC | #8
On Mon, Dec 14, 2020 at 6:33 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 12/13/2020 2:47 AM, Elijah Newren wrote:
> > Hi,
> >
> > Sorry for two different email responses to the same email...
> >
> > Addressing the comments on this patchset mean re-submitting
> > en/merge-ort-impl, and causing conflicts in en/merge-ort-2 and this
> > series en/merge-ort-3.  Since gitgitgadget will not allow me to submit
> > patches against a series that isn't published by Junio, I'll need to
> > ask Junio to temporarily drop both of these series, then later
> > resubmit en/merge-ort-2 after he publishes my updates to
> > en/merge-ort-impl.  Then when he publishes my updates to
> > en/merge-ort-2, I'll be able to submit my already-rebased patches for
> > en/merge-ort-3.
>
> Let's chat privately about perhaps creatin
>
> > A couple extra comments below...
>
>
> >>> +     int s, clean = 1;
> >>> +
> >>> +     memset(&combined, 0, sizeof(combined));
> >>> +
> >>> +     detect_regular_renames(opt, merge_base, side1, 1);
> >>> +     detect_regular_renames(opt, merge_base, side2, 2);
> >>
> >> Find the renames in each side's diff.
> >>
> >> I think the use of "1" and "2" here might be better situated
> >> for an enum. Perhaps:
> >>
> >> enum merge_side {
> >>         MERGE_SIDE1 = 0,
> >>         MERGE_SIDE2 = 1,
> >> };
> >>
> >> (Note, I shift these values to 0 and 1, respectively, allowing
> >> us to truncate the pairs array to two entries while still
> >> being mentally clear.)
> >
> > So, after mulling it over for a while, I created a
> >
> > enum merge_side {
> >     MERGE_BASE = 0,
> >     MERGE_SIDE1 = 1,
> >     MERGE_SIDE2 = 2
> > };
> >
> > and I made use of it in several places.  I just avoided going to an
> > extreme with it (e.g. adding another enum for masks or changing all
> > possibly relevant variables from ints to enum merge_side), and used it
> > more as a document-when-values-are-meant-to-refer-to-sides-of-the-merge
> > kind of thing.  Of course, this affects two previous patchsets and not
> > just this one, so I'll have to post a _lot_ of new patches...   :-)
>
> I appreciate using names for the meaning behind a numerical constant.
> You mentioned in the other thread that this will eventually expand to
> a list of 10 entries, which is particularly frightening if we don't
> get some control over it now.
>
> I generally prefer using types to convey meaning as well, but I'm
> willing to relax on this because I believe C won't complain if you
> pass a literal int into an enum-typed parameter, so the compiler
> doesn't help enough in that sense.

Yeah, I went through my 'ort' branch with all 10 entries and did a
regex search for \b[12]\b throughout merge-ort.c, then considered each
one in turn, updating to the new enum where it made sense.  Then
backported the changes across en/merge-ort-impl and en/merge-ort-3
(and I just /submit-ted the en/merge-ort-3 updates to the list).  Took
quite a while, of course, but I feel it's in good shape.

So, take a look at the new sets of series and let me know what you think.

> > Something I missed in my reply yesterday...
> >
> > Note that mi->clean is NOT from struct merge_result.  It is from
> > struct merged_info, and in that struct it IS defined as "unsigned
> > clean:1", i.e. it is a true boolean.  The merged_info.clean field is
> > used to determine whether a specific path merged cleanly.
> >
> > "clean" from struct merge_result is whether the entirety of the merge
> > was clean or not.  It's almost a boolean, but allows for a
> > "catastrophic problem encountered" value.  I added the following
> > comment:
> > /*
> > * Whether the merge is clean; possible values:
> > *    1: clean
> > *    0: not clean (merge conflicts)
> > *   <0: operation aborted prematurely.  (object database
> > *       unreadable, disk full, etc.)  Worktree may be left in an
> > *       inconsistent state if operation failed near the end.
> > */
> >
> > This also means that I either abort and return a negative value, or I
> > can continue treating merge_result's "clean" field as a boolean.
>
> Having this comment helps a lot!
>
> > But again, this isn't new to this patchset; it affects the patchset
> > before the patchset before this one.
>
> Right, when I had the current change checked out, I don't see the
> patch that introduced the 'clean' member (though, I _could_ have
> blamed to find out). Instead, I just got confused and thought it
> worth a question. Your comment prevents this question in the future.

Yeah, definitely worth the question.  I've been buried in
merge-recursive.c & related areas so long that I've forgotten that
certain things are weird or surprising on first look.  The more of
those we can flag and document, the better.
diff mbox series

Patch

diff --git a/merge-ort.c b/merge-ort.c
index 90baedac407..92b765dd3f0 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -617,20 +617,72 @@  static int handle_content_merge(struct merge_options *opt,
 
 /*** Function Grouping: functions related to regular rename detection ***/
 
+static int process_renames(struct merge_options *opt,
+			   struct diff_queue_struct *renames)
+{
+	die("Not yet implemented.");
+}
+
+static int compare_pairs(const void *a_, const void *b_)
+{
+	die("Not yet implemented.");
+}
+
+/* Call diffcore_rename() to compute which files have changed on given side */
+static void detect_regular_renames(struct merge_options *opt,
+				   struct tree *merge_base,
+				   struct tree *side,
+				   unsigned side_index)
+{
+	die("Not yet implemented.");
+}
+
+/*
+ * Get information of all renames which occurred in 'side_pairs', discarding
+ * non-renames.
+ */
+static int collect_renames(struct merge_options *opt,
+			   struct diff_queue_struct *result,
+			   unsigned side_index)
+{
+	die("Not yet implemented.");
+}
+
 static int detect_and_process_renames(struct merge_options *opt,
 				      struct tree *merge_base,
 				      struct tree *side1,
 				      struct tree *side2)
 {
-	int clean = 1;
+	struct diff_queue_struct combined;
+	struct rename_info *renames = opt->priv->renames;
+	int s, clean = 1;
+
+	memset(&combined, 0, sizeof(combined));
+
+	detect_regular_renames(opt, merge_base, side1, 1);
+	detect_regular_renames(opt, merge_base, side2, 2);
+
+	ALLOC_GROW(combined.queue,
+		   renames->pairs[1].nr + renames->pairs[2].nr,
+		   combined.alloc);
+	clean &= collect_renames(opt, &combined, 1);
+	clean &= collect_renames(opt, &combined, 2);
+	QSORT(combined.queue, combined.nr, compare_pairs);
+
+	clean &= process_renames(opt, &combined);
+
+	/* Free memory for renames->pairs[] and combined */
+	for (s = 1; s <= 2; s++) {
+		free(renames->pairs[s].queue);
+		DIFF_QUEUE_CLEAR(&renames->pairs[s]);
+	}
+	if (combined.nr) {
+		int i;
+		for (i = 0; i < combined.nr; i++)
+			diff_free_filepair(combined.queue[i]);
+		free(combined.queue);
+	}
 
-	/*
-	 * Rename detection works by detecting file similarity.  Here we use
-	 * a really easy-to-implement scheme: files are similar IFF they have
-	 * the same filename.  Therefore, by this scheme, there are no renames.
-	 *
-	 * TODO: Actually implement a real rename detection scheme.
-	 */
 	return clean;
 }