diff mbox series

[v2,7/8] diff: add ability to insert additional headers for paths

Message ID e94706513038adc85e818e8736ad713b2e6b6be4.1640419160.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Add a new --remerge-diff capability to show & log | expand

Commit Message

Elijah Newren Dec. 25, 2021, 7:59 a.m. UTC
From: Elijah Newren <newren@gmail.com>

When additional headers are provided, we need to
  * add diff_filepairs to diff_queued_diff for each paths in the
    additional headers map which, unless that path is part of
    another diff_filepair already found in diff_queued_diff
  * format the headers (colorization, line_prefix for --graph)
  * make sure the various codepaths that attempt to return early
    if there are "no changes" take into account the headers that
    need to be shown.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 diff.c     | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 diff.h     |   3 +-
 log-tree.c |   2 +-
 3 files changed, 115 insertions(+), 6 deletions(-)

Comments

Johannes Altmanninger Dec. 28, 2021, 10:57 a.m. UTC | #1
On Sat, Dec 25, 2021 at 07:59:18AM +0000, Elijah Newren via GitGitGadget wrote:
> From: Elijah Newren <newren@gmail.com>
> 
> When additional headers are provided, we need to
>   * add diff_filepairs to diff_queued_diff for each paths in the
>     additional headers map which, unless that path is part of
>     another diff_filepair already found in diff_queued_diff
>   * format the headers (colorization, line_prefix for --graph)
>   * make sure the various codepaths that attempt to return early
>     if there are "no changes" take into account the headers that
>     need to be shown.
> 
> Signed-off-by: Elijah Newren <newren@gmail.com>
> ---
>  diff.c     | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>  diff.h     |   3 +-
>  log-tree.c |   2 +-
>  3 files changed, 115 insertions(+), 6 deletions(-)
> 
> diff --git a/diff.c b/diff.c
> index 861282db1c3..aaa6a19f158 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -27,6 +27,7 @@
>  #include "help.h"
>  #include "promisor-remote.h"
>  #include "dir.h"
> +#include "strmap.h"
>  
>  #ifdef NO_FAST_WORKING_DIRECTORY
>  #define FAST_WORKING_DIRECTORY 0
> @@ -3406,6 +3407,31 @@ struct userdiff_driver *get_textconv(struct repository *r,
>  	return userdiff_get_textconv(r, one->driver);
>  }
>  
> +static struct strbuf *additional_headers(struct diff_options *o,
> +					 const char *path)
> +{
> +	if (!o->additional_path_headers)
> +		return NULL;
> +	return strmap_get(o->additional_path_headers, path);
> +}
> +
> +static void add_formatted_headers(struct strbuf *msg,
> +				  struct strbuf *more_headers,
> +				  const char *line_prefix,
> +				  const char *meta,
> +				  const char *reset)
> +{
> +	char *next, *newline;
> +
> +	for (next = more_headers->buf; *next; next = newline) {
> +		newline = strchrnul(next, '\n');
> +		strbuf_addf(msg, "%s%s%.*s%s\n", line_prefix, meta,
> +			    (int)(newline - next), next, reset);
> +		if (*newline)
> +			newline++;
> +	}
> +}
> +
>  static void builtin_diff(const char *name_a,
>  			 const char *name_b,
>  			 struct diff_filespec *one,
> @@ -3464,6 +3490,17 @@ static void builtin_diff(const char *name_a,
>  	b_two = quote_two(b_prefix, name_b + (*name_b == '/'));
>  	lbl[0] = DIFF_FILE_VALID(one) ? a_one : "/dev/null";
>  	lbl[1] = DIFF_FILE_VALID(two) ? b_two : "/dev/null";
> +	if (!DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two)) {
> +		/*
> +		 * We should only reach this point for pairs from
> +		 * create_filepairs_for_header_only_notifications().  For
> +		 * these, we should avoid the "/dev/null" special casing
> +		 * above, meaning we avoid showing such pairs as either
> +		 * "new file" or "deleted file" below.
> +		 */
> +		lbl[0] = a_one;
> +		lbl[1] = b_two;
> +	}

not so familiar with this logic, but I saw that without this change, the
rename/rename conflict test fails. Is this because we add a file pair under
the original name (that's been renamed on both sides). I wonder if we
can sketch such a case in the comment.

>  	strbuf_addf(&header, "%s%sdiff --git %s %s%s\n", line_prefix, meta, a_one, b_two, reset);
>  	if (lbl[0][0] == '/') {
>  		/* /dev/null */
> @@ -4328,6 +4365,7 @@ static void fill_metainfo(struct strbuf *msg,
>  	const char *set = diff_get_color(use_color, DIFF_METAINFO);
>  	const char *reset = diff_get_color(use_color, DIFF_RESET);
>  	const char *line_prefix = diff_line_prefix(o);
> +	struct strbuf *more_headers = NULL;
>  
>  	*must_show_header = 1;
>  	strbuf_init(msg, PATH_MAX * 2 + 300);
> @@ -4364,6 +4402,11 @@ static void fill_metainfo(struct strbuf *msg,
>  	default:
>  		*must_show_header = 0;
>  	}
> +	if ((more_headers = additional_headers(o, name))) {
> +		add_formatted_headers(msg, more_headers,
> +				      line_prefix, set, reset);
> +		*must_show_header = 1;
> +	}
>  	if (one && two && !oideq(&one->oid, &two->oid)) {
>  		const unsigned hexsz = the_hash_algo->hexsz;
>  		int abbrev = o->abbrev ? o->abbrev : DEFAULT_ABBREV;
> @@ -5852,12 +5895,22 @@ int diff_unmodified_pair(struct diff_filepair *p)
>  
>  static void diff_flush_patch(struct diff_filepair *p, struct diff_options *o)
>  {
> -	if (diff_unmodified_pair(p))
> +	/*
> +	 * Check if we can return early without showing a diff.  Note that
> +	 * diff_filepair only stores {oid, path, mode, is_valid}
> +	 * information for each path, and thus diff_unmodified_pair() only
> +	 * considers those bits of info.  However, we do not want pairs
> +	 * created by create_filepairs_for_header_only_notifications() to
> +	 * be ignored, so return early if both p is unmodified AND
> +	 * p->one->path is not in additional headers.
> +	 */
> +	if (diff_unmodified_pair(p) && !additional_headers(o, p->one->path))
>  		return;
>  
> +	/* Actually, we can also return early to avoid showing tree diffs */
>  	if ((DIFF_FILE_VALID(p->one) && S_ISDIR(p->one->mode)) ||
>  	    (DIFF_FILE_VALID(p->two) && S_ISDIR(p->two->mode)))
> -		return; /* no tree diffs in patch format */
> +		return;
>  
>  	run_diff(p, o);
>  }
> @@ -5888,10 +5941,14 @@ static void diff_flush_checkdiff(struct diff_filepair *p,
>  	run_checkdiff(p, o);
>  }
>  
> -int diff_queue_is_empty(void)
> +int diff_queue_is_empty(struct diff_options *o)
>  {
>  	struct diff_queue_struct *q = &diff_queued_diff;
>  	int i;
> +
> +	if (o->additional_path_headers &&
> +	    !strmap_empty(o->additional_path_headers))
> +		return 0;
>  	for (i = 0; i < q->nr; i++)
>  		if (!diff_unmodified_pair(q->queue[i]))
>  			return 0;
> @@ -6325,6 +6382,54 @@ void diff_warn_rename_limit(const char *varname, int needed, int degraded_cc)
>  		warning(_(rename_limit_advice), varname, needed);
>  }
>  
> +static void create_filepairs_for_header_only_notifications(struct diff_options *o)
> +{
> +	struct strset present;
> +	struct diff_queue_struct *q = &diff_queued_diff;
> +	struct hashmap_iter iter;
> +	struct strmap_entry *e;
> +	int i;
> +
> +	strset_init_with_options(&present, /*pool*/ NULL, /*strdup*/ 0);
> +
> +	/*
> +	 * Find out which paths exist in diff_queued_diff, preferring
> +	 * one->path for any pair that has multiple paths.

Why do we prefer one->path?

> +	 */
> +	for (i = 0; i < q->nr; i++) {
> +		struct diff_filepair *p = q->queue[i];
> +		char *path = p->one->path ? p->one->path : p->two->path;
> +
> +		if (strmap_contains(o->additional_path_headers, path))
> +			strset_add(&present, path);
> +	}
> +
> +	/*
> +	 * Loop over paths in additional_path_headers; for each NOT already
> +	 * in diff_queued_diff, create a synthetic filepair and insert that
> +	 * into diff_queued_diff.
> +	 */
> +	strmap_for_each_entry(o->additional_path_headers, &iter, e) {
> +		if (!strset_contains(&present, e->key)) {
> +			struct diff_filespec *one, *two;
> +			struct diff_filepair *p;
> +
> +			one = alloc_filespec(e->key);
> +			two = alloc_filespec(e->key);
> +			fill_filespec(one, null_oid(), 0, 0);
> +			fill_filespec(two, null_oid(), 0, 0);
> +			p = diff_queue(q, one, two);
> +			p->status = DIFF_STATUS_MODIFIED;
> +		}
> +	}

All these string hash-maps are not really typical for a C program. I'm sure
they are the best choice for an advanced merge algorithm but they are not
really necessary for computing/printing a diff. It feels like this is an
implementation detail from merge-ort that's leaking into other components.

What we want to do is

	for file_pair in additional_headers:
		if not already_queued(file_pair):
			queue(file_pair)

to do that, you use a temporary has-set ("present") that records everything
that's already queued (already_queued() is a lookup in that set).

Let's assume both the queue and additional_headers are sorted arrays.
Then we could efficiently merge them (like a merge-sort algorithm)
without ever allocating a temporary hash map.

I haven't checked if this is practical (better wait for feedback).
We'd probably need to convert the strmap additional_path_headers into an
array and sort it (I guess our hash map does not guarantee any ordering?)

> +
> +	/* Re-sort the filepairs */
> +	diffcore_fix_diff_index();
> +
> +	/* Cleanup */
> +	strset_clear(&present);

Not a strong opinion, but I'd probably drop this comment

> +}
> +
>  static void diff_flush_patch_all_file_pairs(struct diff_options *o)
>  {
>  	int i;
> @@ -6337,6 +6442,9 @@ static void diff_flush_patch_all_file_pairs(struct diff_options *o)
>  	if (o->color_moved)
>  		o->emitted_symbols = &esm;
>  
> +	if (o->additional_path_headers)
> +		create_filepairs_for_header_only_notifications(o);
> +
>  	for (i = 0; i < q->nr; i++) {
>  		struct diff_filepair *p = q->queue[i];
>  		if (check_pair_status(p))
> @@ -6413,7 +6521,7 @@ void diff_flush(struct diff_options *options)
>  	 * Order: raw, stat, summary, patch
>  	 * or:    name/name-status/checkdiff (other bits clear)
>  	 */
> -	if (!q->nr)
> +	if (!q->nr && !options->additional_path_headers)
>  		goto free_queue;
>  
>  	if (output_format & (DIFF_FORMAT_RAW |
> diff --git a/diff.h b/diff.h
> index 8ba85c5e605..06a0a67afda 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -395,6 +395,7 @@ struct diff_options {
>  
>  	struct repository *repo;
>  	struct option *parseopts;
> +	struct strmap *additional_path_headers;
>  
>  	int no_free;
>  };
> @@ -593,7 +594,7 @@ void diffcore_fix_diff_index(void);
>  "                show all files diff when -S is used and hit is found.\n" \
>  "  -a  --text    treat all files as text.\n"
>  
> -int diff_queue_is_empty(void);
> +int diff_queue_is_empty(struct diff_options*);
>  void diff_flush(struct diff_options*);
>  void diff_free(struct diff_options*);
>  void diff_warn_rename_limit(const char *varname, int needed, int degraded_cc);
> diff --git a/log-tree.c b/log-tree.c
> index d4655b63d75..33c28f537a6 100644
> --- a/log-tree.c
> +++ b/log-tree.c
> @@ -850,7 +850,7 @@ int log_tree_diff_flush(struct rev_info *opt)
>  	opt->shown_dashes = 0;
>  	diffcore_std(&opt->diffopt);
>  
> -	if (diff_queue_is_empty()) {
> +	if (diff_queue_is_empty(&opt->diffopt)) {
>  		int saved_fmt = opt->diffopt.output_format;
>  		opt->diffopt.output_format = DIFF_FORMAT_NO_OUTPUT;
>  		diff_flush(&opt->diffopt);
> -- 
> gitgitgadget
>
Elijah Newren Dec. 28, 2021, 9:09 p.m. UTC | #2
On Tue, Dec 28, 2021 at 2:57 AM Johannes Altmanninger <aclopte@gmail.com> wrote:
>
> On Sat, Dec 25, 2021 at 07:59:18AM +0000, Elijah Newren via GitGitGadget wrote:
> > From: Elijah Newren <newren@gmail.com>
> >
> > When additional headers are provided, we need to
> >   * add diff_filepairs to diff_queued_diff for each paths in the
> >     additional headers map which, unless that path is part of
> >     another diff_filepair already found in diff_queued_diff
> >   * format the headers (colorization, line_prefix for --graph)
> >   * make sure the various codepaths that attempt to return early
> >     if there are "no changes" take into account the headers that
> >     need to be shown.
> >
> > Signed-off-by: Elijah Newren <newren@gmail.com>
> > ---
> >  diff.c     | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> >  diff.h     |   3 +-
> >  log-tree.c |   2 +-
> >  3 files changed, 115 insertions(+), 6 deletions(-)
> >
> > diff --git a/diff.c b/diff.c
> > index 861282db1c3..aaa6a19f158 100644
> > --- a/diff.c
> > +++ b/diff.c
> > @@ -27,6 +27,7 @@
> >  #include "help.h"
> >  #include "promisor-remote.h"
> >  #include "dir.h"
> > +#include "strmap.h"
> >
> >  #ifdef NO_FAST_WORKING_DIRECTORY
> >  #define FAST_WORKING_DIRECTORY 0
> > @@ -3406,6 +3407,31 @@ struct userdiff_driver *get_textconv(struct repository *r,
> >       return userdiff_get_textconv(r, one->driver);
> >  }
> >
> > +static struct strbuf *additional_headers(struct diff_options *o,
> > +                                      const char *path)
> > +{
> > +     if (!o->additional_path_headers)
> > +             return NULL;
> > +     return strmap_get(o->additional_path_headers, path);
> > +}
> > +
> > +static void add_formatted_headers(struct strbuf *msg,
> > +                               struct strbuf *more_headers,
> > +                               const char *line_prefix,
> > +                               const char *meta,
> > +                               const char *reset)
> > +{
> > +     char *next, *newline;
> > +
> > +     for (next = more_headers->buf; *next; next = newline) {
> > +             newline = strchrnul(next, '\n');
> > +             strbuf_addf(msg, "%s%s%.*s%s\n", line_prefix, meta,
> > +                         (int)(newline - next), next, reset);
> > +             if (*newline)
> > +                     newline++;
> > +     }
> > +}
> > +
> >  static void builtin_diff(const char *name_a,
> >                        const char *name_b,
> >                        struct diff_filespec *one,
> > @@ -3464,6 +3490,17 @@ static void builtin_diff(const char *name_a,
> >       b_two = quote_two(b_prefix, name_b + (*name_b == '/'));
> >       lbl[0] = DIFF_FILE_VALID(one) ? a_one : "/dev/null";
> >       lbl[1] = DIFF_FILE_VALID(two) ? b_two : "/dev/null";
> > +     if (!DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two)) {
> > +             /*
> > +              * We should only reach this point for pairs from
> > +              * create_filepairs_for_header_only_notifications().  For
> > +              * these, we should avoid the "/dev/null" special casing
> > +              * above, meaning we avoid showing such pairs as either
> > +              * "new file" or "deleted file" below.
> > +              */
> > +             lbl[0] = a_one;
> > +             lbl[1] = b_two;
> > +     }
>
> not so familiar with this logic, but I saw that without this change, the
> rename/rename conflict test fails. Is this because we add a file pair under
> the original name (that's been renamed on both sides). I wonder if we
> can sketch such a case in the comment.

That may be the only current test in the testsuite that fails without
this bit of logic, but I don't want the comment to be specific to the
rename/rename case.  Whenever we have a conflict/warning/whatever
message from the merge machinery tied to a path which doesn't show up
in either the automatic merge or the recorded merge commit, we will
hit this situation.  Even if I were to give a complete listing of all
the current cases, more could be added in the future.

> > +static void create_filepairs_for_header_only_notifications(struct diff_options *o)
> > +{
> > +     struct strset present;
> > +     struct diff_queue_struct *q = &diff_queued_diff;
> > +     struct hashmap_iter iter;
> > +     struct strmap_entry *e;
> > +     int i;
> > +
> > +     strset_init_with_options(&present, /*pool*/ NULL, /*strdup*/ 0);
> > +
> > +     /*
> > +      * Find out which paths exist in diff_queued_diff, preferring
> > +      * one->path for any pair that has multiple paths.
>
> Why do we prefer one->path?

run_diff() sets name = one->path, passes it along to run_diff_cmd(),
and from there it goes to fill_metainfo() and either
run_external_diff() or builtin_diff().

I'm wondering if I should just ignore two->path entirely and only use
one->path; I think I partially looked at both because of various
places in diff.c that already do but give preferential treatment to
one->path (diffnamecmp(), the calls to show_submodule*diff*(), what is
passed to write_name_quoted() in diff_flush_raw()).

> > +      */
> > +     for (i = 0; i < q->nr; i++) {
> > +             struct diff_filepair *p = q->queue[i];
> > +             char *path = p->one->path ? p->one->path : p->two->path;
> > +
> > +             if (strmap_contains(o->additional_path_headers, path))
> > +                     strset_add(&present, path);
> > +     }
> > +
> > +     /*
> > +      * Loop over paths in additional_path_headers; for each NOT already
> > +      * in diff_queued_diff, create a synthetic filepair and insert that
> > +      * into diff_queued_diff.
> > +      */
> > +     strmap_for_each_entry(o->additional_path_headers, &iter, e) {
> > +             if (!strset_contains(&present, e->key)) {
> > +                     struct diff_filespec *one, *two;
> > +                     struct diff_filepair *p;
> > +
> > +                     one = alloc_filespec(e->key);
> > +                     two = alloc_filespec(e->key);
> > +                     fill_filespec(one, null_oid(), 0, 0);
> > +                     fill_filespec(two, null_oid(), 0, 0);
> > +                     p = diff_queue(q, one, two);
> > +                     p->status = DIFF_STATUS_MODIFIED;
> > +             }
> > +     }
>
> All these string hash-maps are not really typical for a C program. I'm sure
> they are the best choice for an advanced merge algorithm

Agreed up to here.

> but they are not
> really necessary for computing/printing a diff.

Technically agree that it _could_ be solved a different way, but the
strmaps are a much more natural solution to this problem in this
particular case; more on this below.

> It feels like this is an
> implementation detail from merge-ort that's leaking into other components.

And I disagree here, on _both_ the explicit point and the underlying
suggestion that you seem to be making that strmap should be avoided
outside of merging.  The strmap.[ch] type was originally a suggestion
from Peff for areas of git completely unrelated to merging (see the
beginning of https://lore.kernel.org/git/20200821194857.GD1165@coredump.intra.peff.net/,
and the first link in that email).  It's a new datatype for git, much
like strbuf or string_list or whatever before it, that is there to be
used when it's a natural fit for the problem at hand.  The lack of
strmap previously led folks to abuse other existing data structures
(and in a way that often led to poor performance to boot).

> What we want to do is
>
>         for file_pair in additional_headers:
>                 if not already_queued(file_pair):
>                         queue(file_pair)

Yes, precisely.

> to do that, you use a temporary has-set ("present") that records everything
> that's already queued (already_queued() is a lookup in that set).
>
> Let's assume both the queue and additional_headers are sorted arrays.

That's a bad assumption; we can't rely on *either* being sorted.  I
actually started my implementation by trying exactly what you mention
first; I too thought it'd be more natural and clearer to do this.  Of
course, before implementing it, I had to verify whether
diff_queued_diff was sorted.  So, I added some code that would check
the order and fail if the queue wasn't sorted.  7 of the test files in
the regression testsuite had one or more failing tests.

I think the queue was intended to be sorted (see
diffcore_fix_diff_index()), but in practice it's not.  And I'm worried
that if I find the current cases where it fails to be sorted and "fix"
them (though I don't actually know if this was intentional or not so I
don't know if that's really a fix or a break), that I'd end up with
additional cases in the future where they fail to be sorted anyway.
So, no matter what, relying on diff_queued_diff being sorted seems
ill-advised.

Also...

> Then we could efficiently merge them (like a merge-sort algorithm)
> without ever allocating a temporary hash map.
>
> I haven't checked if this is practical (better wait for feedback).
> We'd probably need to convert the strmap additional_path_headers into an
> array and sort it (I guess our hash map does not guarantee any ordering?)

Right, strmap has no ordering either.  I was willing to stick those
into a string_list and sort them, but making temporary copies of both
the strmap and the diff_queued_diff just to sort them so that I can
reasonably cheaply ask "are items from this thing present in this
other thing?" seems to be stretching things a bit too far.
maps/hashes provide a very nice "is this item present" lookup and are
a natural way to ask that.  Since that is exactly the question I am
asking, I think they are the better data structure here.  So, this was
not at all a leak of merge-ort datastructures, but rather a picking of
the appropriate data structures for the problem at hand.

> > +
> > +     /* Re-sort the filepairs */
> > +     diffcore_fix_diff_index();
> > +
> > +     /* Cleanup */
> > +     strset_clear(&present);
>
> Not a strong opinion, but I'd probably drop this comment
>
> > +}
> > +
> >  static void diff_flush_patch_all_file_pairs(struct diff_options *o)
> >  {
> >       int i;
> > @@ -6337,6 +6442,9 @@ static void diff_flush_patch_all_file_pairs(struct diff_options *o)
> >       if (o->color_moved)
> >               o->emitted_symbols = &esm;
> >
> > +     if (o->additional_path_headers)
> > +             create_filepairs_for_header_only_notifications(o);
> > +
> >       for (i = 0; i < q->nr; i++) {
> >               struct diff_filepair *p = q->queue[i];
> >               if (check_pair_status(p))
> > @@ -6413,7 +6521,7 @@ void diff_flush(struct diff_options *options)
> >        * Order: raw, stat, summary, patch
> >        * or:    name/name-status/checkdiff (other bits clear)
> >        */
> > -     if (!q->nr)
> > +     if (!q->nr && !options->additional_path_headers)
> >               goto free_queue;
> >
> >       if (output_format & (DIFF_FORMAT_RAW |
> > diff --git a/diff.h b/diff.h
> > index 8ba85c5e605..06a0a67afda 100644
> > --- a/diff.h
> > +++ b/diff.h
> > @@ -395,6 +395,7 @@ struct diff_options {
> >
> >       struct repository *repo;
> >       struct option *parseopts;
> > +     struct strmap *additional_path_headers;
> >
> >       int no_free;
> >  };
> > @@ -593,7 +594,7 @@ void diffcore_fix_diff_index(void);
> >  "                show all files diff when -S is used and hit is found.\n" \
> >  "  -a  --text    treat all files as text.\n"
> >
> > -int diff_queue_is_empty(void);
> > +int diff_queue_is_empty(struct diff_options*);
> >  void diff_flush(struct diff_options*);
> >  void diff_free(struct diff_options*);
> >  void diff_warn_rename_limit(const char *varname, int needed, int degraded_cc);
> > diff --git a/log-tree.c b/log-tree.c
> > index d4655b63d75..33c28f537a6 100644
> > --- a/log-tree.c
> > +++ b/log-tree.c
> > @@ -850,7 +850,7 @@ int log_tree_diff_flush(struct rev_info *opt)
> >       opt->shown_dashes = 0;
> >       diffcore_std(&opt->diffopt);
> >
> > -     if (diff_queue_is_empty()) {
> > +     if (diff_queue_is_empty(&opt->diffopt)) {
> >               int saved_fmt = opt->diffopt.output_format;
> >               opt->diffopt.output_format = DIFF_FORMAT_NO_OUTPUT;
> >               diff_flush(&opt->diffopt);
> > --
> > gitgitgadget
> >
Johannes Altmanninger Dec. 29, 2021, 12:16 a.m. UTC | #3
On Tue, Dec 28, 2021 at 01:09:57PM -0800, Elijah Newren wrote:
> On Tue, Dec 28, 2021 at 2:57 AM Johannes Altmanninger <aclopte@gmail.com> wrote:
> >
> > On Sat, Dec 25, 2021 at 07:59:18AM +0000, Elijah Newren via GitGitGadget wrote:
> > > +     for (i = 0; i < q->nr; i++) {
> > > +             struct diff_filepair *p = q->queue[i];
> > > +             char *path = p->one->path ? p->one->path : p->two->path;
> > > +
> > > +             if (strmap_contains(o->additional_path_headers, path))
> > > +                     strset_add(&present, path);
> > > +     }
> > > +
> > > +     /*
> > > +      * Loop over paths in additional_path_headers; for each NOT already
> > > +      * in diff_queued_diff, create a synthetic filepair and insert that
> > > +      * into diff_queued_diff.
> > > +      */
> > > +     strmap_for_each_entry(o->additional_path_headers, &iter, e) {
> > > +             if (!strset_contains(&present, e->key)) {
> > > +                     struct diff_filespec *one, *two;
> > > +                     struct diff_filepair *p;
> > > +
> > > +                     one = alloc_filespec(e->key);
> > > +                     two = alloc_filespec(e->key);
> > > +                     fill_filespec(one, null_oid(), 0, 0);
> > > +                     fill_filespec(two, null_oid(), 0, 0);
> > > +                     p = diff_queue(q, one, two);
> > > +                     p->status = DIFF_STATUS_MODIFIED;
> > > +             }
> > > +     }
> >
> > All these string hash-maps are not really typical for a C program. I'm sure
> > they are the best choice for an advanced merge algorithm
> 
> Agreed up to here.
> 
> > but they are not
> > really necessary for computing/printing a diff.
> 
> Technically agree that it _could_ be solved a different way, but the
> strmaps are a much more natural solution to this problem in this
> particular case; more on this below.

Oh yeah, I agree that strmaps are the more intuitive solution.

> 
> > It feels like this is an
> > implementation detail from merge-ort that's leaking into other components.
> 
> And I disagree here, on _both_ the explicit point and the underlying
> suggestion that you seem to be making that strmap should be avoided
> outside of merging.  The strmap.[ch] type was originally a suggestion
> from Peff for areas of git completely unrelated to merging (see the
> beginning of https://lore.kernel.org/git/20200821194857.GD1165@coredump.intra.peff.net/,
> and the first link in that email).  It's a new datatype for git, much
> like strbuf or string_list or whatever before it, that is there to be
> used when it's a natural fit for the problem at hand.  The lack of
> strmap previously led folks to abuse other existing data structures
> (and in a way that often led to poor performance to boot).

Right, all those rename-detection performance fixes were pretty dazzling

> 
> > What we want to do is
> >
> >         for file_pair in additional_headers:
> >                 if not already_queued(file_pair):
> >                         queue(file_pair)
> 
> Yes, precisely.
> 
> > to do that, you use a temporary has-set ("present") that records everything
> > that's already queued (already_queued() is a lookup in that set).
> >
> > Let's assume both the queue and additional_headers are sorted arrays.
> 
> That's a bad assumption; we can't rely on *either* being sorted.  I

OK, I hadn't checked if the queue is sorted

> actually started my implementation by trying exactly what you mention
> first; I too thought it'd be more natural and clearer to do this.  Of
> course, before implementing it, I had to verify whether
> diff_queued_diff was sorted.  So, I added some code that would check
> the order and fail if the queue wasn't sorted.  7 of the test files in
> the regression testsuite had one or more failing tests.
> 
> I think the queue was intended to be sorted (see
> diffcore_fix_diff_index()), but in practice it's not.  And I'm worried
> that if I find the current cases where it fails to be sorted and "fix"
> them (though I don't actually know if this was intentional or not so I
> don't know if that's really a fix or a break), that I'd end up with
> additional cases in the future where they fail to be sorted anyway.
> So, no matter what, relying on diff_queued_diff being sorted seems
> ill-advised.
> 
> Also...
> 
> > Then we could efficiently merge them (like a merge-sort algorithm)
> > without ever allocating a temporary hash map.
> >
> > I haven't checked if this is practical (better wait for feedback).
> > We'd probably need to convert the strmap additional_path_headers into an
> > array and sort it (I guess our hash map does not guarantee any ordering?)
> 
> Right, strmap has no ordering either.  I was willing to stick those
> into a string_list and sort them, but making temporary copies of both
> the strmap and the diff_queued_diff just to sort them so that I can

But you already sort diff_queued_diff at the end of
create_filepairs_for_header_only_notifications(), so sorting a bit earlier
in that function, before enqueueing the new entries won't change the final
result, and allows us to work with a sorted queue; no need for a temporary
copy (we'd only need to copy the strmap).

> reasonably cheaply ask "are items from this thing present in this
> other thing?" seems to be stretching things a bit too far.
> maps/hashes provide a very nice "is this item present" lookup and are
> a natural way to ask that.  Since that is exactly the question I am
> asking, I think they are the better data structure here.

Yeah that makes sense. In theory if we ask
"What is the union of the queued pairs and the extra pairs induced by
conflict messages?"  we could abstract away the "is this item present"
lookup but in practice that's hard.

> So, this was not at all a leak of merge-ort datastructures, but rather a
> picking of the appropriate data structures for the problem at hand.

I think we have two viable solutions to this problem
1. use a temporary strset to figure out which pairs to add
2. use a temporary array, sort it, and "merge" the two arrays

I agree that 1 is more intuitive and natural for humans, and it's probably the
way to go. But it is a bit less elegant because it adds a strmap entry for
each pair in the queue, whereas 2 only needs to add an array element for
each pair with non-content conflicts, which are much fewer. (Okay that's a
minor detail.)  With the right abstractions 2 is pretty simple as well:

	j = 0
	extra_headers = sorted((key, val) for key, val in additional_headers)
	for i in 0..len(queue):
		while j < len(extra_headers) && compare(extra_headers[j].key, queue[i]) <= 0:
			if compare(extra_headers[j].key, queue[i]) < 0:
				enqueue(file_pair_for(extra_headers[j]))
			j++

where

	def compare(key: str, pair: diff_filepair) -> int:
		other = pair.one ? pair.one.path : pair.two.path # Mimic diffnamecmp
		return strcmp(key, other)
Elijah Newren Dec. 30, 2021, 10:04 p.m. UTC | #4
On Tue, Dec 28, 2021 at 4:16 PM Johannes Altmanninger <aclopte@gmail.com> wrote:
>
> On Tue, Dec 28, 2021 at 01:09:57PM -0800, Elijah Newren wrote:
> > On Tue, Dec 28, 2021 at 2:57 AM Johannes Altmanninger <aclopte@gmail.com> wrote:
> > >
> > > On Sat, Dec 25, 2021 at 07:59:18AM +0000, Elijah Newren via GitGitGadget wrote:
....
> > > but they are not
> > > really necessary for computing/printing a diff.
> >
> > Technically agree that it _could_ be solved a different way, but the
> > strmaps are a much more natural solution to this problem in this
> > particular case; more on this below.
>
> Oh yeah, I agree that strmaps are the more intuitive solution.

Cool, sounds like we're heading towards consensus.

> > > It feels like this is an
> > > implementation detail from merge-ort that's leaking into other components.
> >
> > And I disagree here, on _both_ the explicit point and the underlying
> > suggestion that you seem to be making that strmap should be avoided
> > outside of merging.  The strmap.[ch] type was originally a suggestion
> > from Peff for areas of git completely unrelated to merging (see the
> > beginning of https://lore.kernel.org/git/20200821194857.GD1165@coredump.intra.peff.net/,
> > and the first link in that email).  It's a new datatype for git, much
> > like strbuf or string_list or whatever before it, that is there to be
> > used when it's a natural fit for the problem at hand.  The lack of
> > strmap previously led folks to abuse other existing data structures
> > (and in a way that often led to poor performance to boot).
>
> Right, all those rename-detection performance fixes were pretty dazzling

I actually wasn't talking about rename-detection or merge machinery
(though it applies there too).  The original strmap proposal was
suggested in an email entitled, "ordered string-list considered
harmful", and neither rename-detection nor merge machinery were part
of the thread at the time.  I also didn't participate in the
thread...until I implemented the suggested API with some tweaks and
submitted it about a year later.

> > > What we want to do is
> > >
> > >         for file_pair in additional_headers:
> > >                 if not already_queued(file_pair):
> > >                         queue(file_pair)
> >
> > Yes, precisely.
> >
> > > to do that, you use a temporary has-set ("present") that records everything
> > > that's already queued (already_queued() is a lookup in that set).
> > >
> > > Let's assume both the queue and additional_headers are sorted arrays.
> >
> > That's a bad assumption; we can't rely on *either* being sorted.  I
>
...
> > > Then we could efficiently merge them (like a merge-sort algorithm)
> > > without ever allocating a temporary hash map.
> > >
> > > I haven't checked if this is practical (better wait for feedback).
> > > We'd probably need to convert the strmap additional_path_headers into an
> > > array and sort it (I guess our hash map does not guarantee any ordering?)
> >
> > Right, strmap has no ordering either.  I was willing to stick those
> > into a string_list and sort them, but making temporary copies of both
> > the strmap and the diff_queued_diff just to sort them so that I can
>
> But you already sort diff_queued_diff at the end of
> create_filepairs_for_header_only_notifications(), so sorting a bit earlier
> in that function, before enqueueing the new entries won't change the final
> result, and allows us to work with a sorted queue; no need for a temporary
> copy (we'd only need to copy the strmap).

Good point, although...

> > reasonably cheaply ask "are items from this thing present in this
> > other thing?" seems to be stretching things a bit too far.
> > maps/hashes provide a very nice "is this item present" lookup and are
> > a natural way to ask that.  Since that is exactly the question I am
> > asking, I think they are the better data structure here.
>
> Yeah that makes sense. In theory if we ask
> "What is the union of the queued pairs and the extra pairs induced by
> conflict messages?"  we could abstract away the "is this item present"
> lookup but in practice that's hard.
>
> > So, this was not at all a leak of merge-ort datastructures, but rather a
> > picking of the appropriate data structures for the problem at hand.
>
> I think we have two viable solutions to this problem
> 1. use a temporary strset to figure out which pairs to add
> 2. use a temporary array, sort it, and "merge" the two arrays
>
> I agree that 1 is more intuitive and natural for humans, and it's probably the
> way to go. But it is a bit less elegant because it adds a strmap entry for
> each pair in the queue, whereas 2 only needs to add an array element for
> each pair with non-content conflicts, which are much fewer. (Okay that's a
> minor detail.)  With the right abstractions 2 is pretty simple as well:
>
>         j = 0
>         extra_headers = sorted((key, val) for key, val in additional_headers)

Right, so this is two sorts instead of one.  (Sorting both the
diff_queued_diff initially, as well as the additional_headers, before
then attempting to merge the two.)  Probably a win performance-wise,
but just noting that it makes the code slightly less simple.

>         for i in 0..len(queue):
>                 while j < len(extra_headers) && compare(extra_headers[j].key, queue[i]) <= 0:
>                         if compare(extra_headers[j].key, queue[i]) < 0:

The duplicate comparison (two calls to strcmp) probably kills any
performance gain you were aiming for with this strategy.  Fixable, but
it does make the code longer.

>                                 enqueue(file_pair_for(extra_headers[j]))

The queue is an array of sorted items, so enqueue here would be
insertion into an already sorted list.  Inserting N items into a list
of M items is quadratic (O(N*M)) -- unless you meant to just append to
the end and add a third sort at the end?

>                         j++

At the end of the for loop, there may be remaining additional headers
that sort after all those found in the queue, so you'll need an
additional loop to handle those.

> where
>
>         def compare(key: str, pair: diff_filepair) -> int:
>                 other = pair.one ? pair.one.path : pair.two.path # Mimic diffnamecmp
>                 return strcmp(key, other)


Since you clearly felt this approach might be better, I went and
implemented it (+ tested and debugged):

 diff.c | 112 +++++++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 73 insertions(+), 38 deletions(-)

diff --git a/diff.c b/diff.c
index d771406e69..0cdaa2e2ab 100644
--- a/diff.c
+++ b/diff.c
@@ -6389,52 +6389,87 @@ void diff_warn_rename_limit(const char
*varname, int needed, int degraded_cc)
                warning(_(rename_limit_advice), varname, needed);
 }

+static void diff_queue_placeholder(char *path)
+{
+       struct diff_filespec *one, *two;
+       struct diff_filepair *p;
+
+       one = alloc_filespec(path);
+       two = alloc_filespec(path);
+       fill_filespec(one, null_oid(), 0, 0);
+       fill_filespec(two, null_oid(), 0, 0);
+       p = diff_queue(&diff_queued_diff, one, two);
+       p->status = DIFF_STATUS_MODIFIED;
+}
+
 static void create_filepairs_for_header_only_notifications(struct
diff_options *o)
 {
-       struct strset present;
-       struct diff_queue_struct *q = &diff_queued_diff;
+       struct diff_queue_struct tmp_queue = { 0 };
+       struct string_list tmp_list = STRING_LIST_INIT_NODUP;
        struct hashmap_iter iter;
        struct strmap_entry *e;
+       char *queue_path = NULL, *list_path = NULL;
        int i;
+       int j;

-       strset_init_with_options(&present, /*pool*/ NULL, /*strdup*/ 0);
-
-       /*
-        * Find out which paths exist in diff_queued_diff, preferring
-        * one->path for any pair that has multiple paths.
-        */
-       for (i = 0; i < q->nr; i++) {
-               struct diff_filepair *p = q->queue[i];
-               char *path = p->one->path ? p->one->path : p->two->path;
-
-               if (strmap_contains(o->additional_path_headers, path))
-                       strset_add(&present, path);
-       }
-
-       /*
-        * Loop over paths in additional_path_headers; for each NOT already
-        * in diff_queued_diff, create a synthetic filepair and insert that
-        * into diff_queued_diff.
-        */
-       strmap_for_each_entry(o->additional_path_headers, &iter, e) {
-               if (!strset_contains(&present, e->key)) {
-                       struct diff_filespec *one, *two;
-                       struct diff_filepair *p;
-
-                       one = alloc_filespec(e->key);
-                       two = alloc_filespec(e->key);
-                       fill_filespec(one, null_oid(), 0, 0);
-                       fill_filespec(two, null_oid(), 0, 0);
-                       p = diff_queue(q, one, two);
-                       p->status = DIFF_STATUS_MODIFIED;
-               }
-       }
-
-       /* Re-sort the filepairs */
+       /* Ensure existing filepairs are sorted */
        diffcore_fix_diff_index();

-       /* Cleanup */
-       strset_clear(&present);
+       /* Get a sorted list of additional_path_headers */
+       strmap_for_each_entry(o->additional_path_headers, &iter, e) {
+               string_list_append(&tmp_list, e->key);
+       }
+       string_list_sort(&tmp_list);
+
+       /*
+        * Move everything from diff_queued_diff to tmp_queue.  We'll copy
+        * them back one-by-one, with extra entries inserted from tmp_list.
+        */
+       SWAP(tmp_queue, diff_queued_diff);
+
+       /*
+        * Add entries from tmp_queue and tmp_list to diff_queued_diff, keeping
+        * the overall list sorted.
+        */
+       j = 0;
+       for (i = 0; i < tmp_queue.nr; i++) {
+               struct diff_filepair *p = tmp_queue.queue[i];
+               queue_path = p->one->path ? p->one->path : p->two->path;
+
+               while (j < tmp_list.nr) {
+                       int cmp;
+
+                       list_path = tmp_list.items[j].string;
+                       cmp = strcmp(queue_path, list_path);
+
+                       if (cmp < 0)
+                               break;
+                       else if (cmp > 0)
+                               diff_queue_placeholder(list_path);
+                       j++;
+               }
+               diff_q(&diff_queued_diff, p);
+       }
+       /*
+        * We've got all the entries from tmp_queue now, but may have more
+        * from tmp_list to insert.  Make sure to only add new entries for
+        * strings not already in diff_queued_diff.
+        */
+       if (j < tmp_list.nr && !strcmp(queue_path, list_path))
+               j++;
+       while (j < tmp_list.nr) {
+               char *list_path = tmp_list.items[j].string;
+               diff_queue_placeholder(list_path);
+               j++;
+       }
+
+       /*
+        * We *only* free tmp_queue.queue, not the stuff it points to because
+        * that has been copied into diff_queued_diff.  Zero out tmp_queue to
+        * make it clear we don't want to free anything else.
+        */
+       free(tmp_queue.queue);
+       memset(&tmp_queue, 0, sizeof(tmp_queue));
 }

 static void diff_flush_patch_all_file_pairs(struct diff_options *o)


It's actually considerably more code as you can see from the diffstat,
and feels like we're reaching into some ugly internals with tmp_queue
(the SWAP and the special-case freeing) in order to get the desired
performance improvements.  And it was already O(NlogN) overall (due to
the sort), which doesn't change with this new algorithm.  It's really,
really hard for me to imagine a case where we have large numbers of
additional headers.  Even if someone else can imagine that we for some
reason have a huge number of conflicts in order to generate a huge
number of additional headers...how could the performance of sorting
O(N) filenames and merging these lists possibly matter in comparison
to the O(N) three-way file merges that would likely have been
performed from those conflicts?

So, I'm going to throw this code away and keep the original.

It was an interesting idea and exercise; thanks for keeping me on my toes.
Johannes Altmanninger Dec. 31, 2021, 3:07 a.m. UTC | #5
On Thu, Dec 30, 2021 at 02:04:30PM -0800, Elijah Newren wrote:
> >                                 enqueue(file_pair_for(extra_headers[j]))
> 
> The queue is an array of sorted items, so enqueue here would be
> insertion into an already sorted list.  Inserting N items into a list
> of M items is quadratic (O(N*M)) -- unless you meant to just append to
> the end and add a third sort at the end?

yeah I would have probably used a third sort

> 
> >                         j++
> 
> At the end of the for loop, there may be remaining additional headers
> that sort after all those found in the queue, so you'll need an
> additional loop to handle those.

my bad, I should have tried it

> It's actually considerably more code as you can see from the diffstat,
> and feels like we're reaching into some ugly internals with tmp_queue
> (the SWAP and the special-case freeing) in order to get the desired
> performance improvements.  And it was already O(NlogN) overall (due to
> the sort), which doesn't change with this new algorithm.  It's really,
> really hard for me to imagine a case where we have large numbers of
> additional headers.  Even if someone else can imagine that we for some
> reason have a huge number of conflicts in order to generate a huge
> number of additional headers...how could the performance of sorting
> O(N) filenames and merging these lists possibly matter in comparison
> to the O(N) three-way file merges that would likely have been
> performed from those conflicts?

Yeah, I agree with that conclusion, it's surely not worth the added complexity.
Seeing the code definitely helps, thanks.

> 
> So, I'm going to throw this code away and keep the original.
> 
> It was an interesting idea and exercise; thanks for keeping me on my toes.
diff mbox series

Patch

diff --git a/diff.c b/diff.c
index 861282db1c3..aaa6a19f158 100644
--- a/diff.c
+++ b/diff.c
@@ -27,6 +27,7 @@ 
 #include "help.h"
 #include "promisor-remote.h"
 #include "dir.h"
+#include "strmap.h"
 
 #ifdef NO_FAST_WORKING_DIRECTORY
 #define FAST_WORKING_DIRECTORY 0
@@ -3406,6 +3407,31 @@  struct userdiff_driver *get_textconv(struct repository *r,
 	return userdiff_get_textconv(r, one->driver);
 }
 
+static struct strbuf *additional_headers(struct diff_options *o,
+					 const char *path)
+{
+	if (!o->additional_path_headers)
+		return NULL;
+	return strmap_get(o->additional_path_headers, path);
+}
+
+static void add_formatted_headers(struct strbuf *msg,
+				  struct strbuf *more_headers,
+				  const char *line_prefix,
+				  const char *meta,
+				  const char *reset)
+{
+	char *next, *newline;
+
+	for (next = more_headers->buf; *next; next = newline) {
+		newline = strchrnul(next, '\n');
+		strbuf_addf(msg, "%s%s%.*s%s\n", line_prefix, meta,
+			    (int)(newline - next), next, reset);
+		if (*newline)
+			newline++;
+	}
+}
+
 static void builtin_diff(const char *name_a,
 			 const char *name_b,
 			 struct diff_filespec *one,
@@ -3464,6 +3490,17 @@  static void builtin_diff(const char *name_a,
 	b_two = quote_two(b_prefix, name_b + (*name_b == '/'));
 	lbl[0] = DIFF_FILE_VALID(one) ? a_one : "/dev/null";
 	lbl[1] = DIFF_FILE_VALID(two) ? b_two : "/dev/null";
+	if (!DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two)) {
+		/*
+		 * We should only reach this point for pairs from
+		 * create_filepairs_for_header_only_notifications().  For
+		 * these, we should avoid the "/dev/null" special casing
+		 * above, meaning we avoid showing such pairs as either
+		 * "new file" or "deleted file" below.
+		 */
+		lbl[0] = a_one;
+		lbl[1] = b_two;
+	}
 	strbuf_addf(&header, "%s%sdiff --git %s %s%s\n", line_prefix, meta, a_one, b_two, reset);
 	if (lbl[0][0] == '/') {
 		/* /dev/null */
@@ -4328,6 +4365,7 @@  static void fill_metainfo(struct strbuf *msg,
 	const char *set = diff_get_color(use_color, DIFF_METAINFO);
 	const char *reset = diff_get_color(use_color, DIFF_RESET);
 	const char *line_prefix = diff_line_prefix(o);
+	struct strbuf *more_headers = NULL;
 
 	*must_show_header = 1;
 	strbuf_init(msg, PATH_MAX * 2 + 300);
@@ -4364,6 +4402,11 @@  static void fill_metainfo(struct strbuf *msg,
 	default:
 		*must_show_header = 0;
 	}
+	if ((more_headers = additional_headers(o, name))) {
+		add_formatted_headers(msg, more_headers,
+				      line_prefix, set, reset);
+		*must_show_header = 1;
+	}
 	if (one && two && !oideq(&one->oid, &two->oid)) {
 		const unsigned hexsz = the_hash_algo->hexsz;
 		int abbrev = o->abbrev ? o->abbrev : DEFAULT_ABBREV;
@@ -5852,12 +5895,22 @@  int diff_unmodified_pair(struct diff_filepair *p)
 
 static void diff_flush_patch(struct diff_filepair *p, struct diff_options *o)
 {
-	if (diff_unmodified_pair(p))
+	/*
+	 * Check if we can return early without showing a diff.  Note that
+	 * diff_filepair only stores {oid, path, mode, is_valid}
+	 * information for each path, and thus diff_unmodified_pair() only
+	 * considers those bits of info.  However, we do not want pairs
+	 * created by create_filepairs_for_header_only_notifications() to
+	 * be ignored, so return early if both p is unmodified AND
+	 * p->one->path is not in additional headers.
+	 */
+	if (diff_unmodified_pair(p) && !additional_headers(o, p->one->path))
 		return;
 
+	/* Actually, we can also return early to avoid showing tree diffs */
 	if ((DIFF_FILE_VALID(p->one) && S_ISDIR(p->one->mode)) ||
 	    (DIFF_FILE_VALID(p->two) && S_ISDIR(p->two->mode)))
-		return; /* no tree diffs in patch format */
+		return;
 
 	run_diff(p, o);
 }
@@ -5888,10 +5941,14 @@  static void diff_flush_checkdiff(struct diff_filepair *p,
 	run_checkdiff(p, o);
 }
 
-int diff_queue_is_empty(void)
+int diff_queue_is_empty(struct diff_options *o)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
+
+	if (o->additional_path_headers &&
+	    !strmap_empty(o->additional_path_headers))
+		return 0;
 	for (i = 0; i < q->nr; i++)
 		if (!diff_unmodified_pair(q->queue[i]))
 			return 0;
@@ -6325,6 +6382,54 @@  void diff_warn_rename_limit(const char *varname, int needed, int degraded_cc)
 		warning(_(rename_limit_advice), varname, needed);
 }
 
+static void create_filepairs_for_header_only_notifications(struct diff_options *o)
+{
+	struct strset present;
+	struct diff_queue_struct *q = &diff_queued_diff;
+	struct hashmap_iter iter;
+	struct strmap_entry *e;
+	int i;
+
+	strset_init_with_options(&present, /*pool*/ NULL, /*strdup*/ 0);
+
+	/*
+	 * Find out which paths exist in diff_queued_diff, preferring
+	 * one->path for any pair that has multiple paths.
+	 */
+	for (i = 0; i < q->nr; i++) {
+		struct diff_filepair *p = q->queue[i];
+		char *path = p->one->path ? p->one->path : p->two->path;
+
+		if (strmap_contains(o->additional_path_headers, path))
+			strset_add(&present, path);
+	}
+
+	/*
+	 * Loop over paths in additional_path_headers; for each NOT already
+	 * in diff_queued_diff, create a synthetic filepair and insert that
+	 * into diff_queued_diff.
+	 */
+	strmap_for_each_entry(o->additional_path_headers, &iter, e) {
+		if (!strset_contains(&present, e->key)) {
+			struct diff_filespec *one, *two;
+			struct diff_filepair *p;
+
+			one = alloc_filespec(e->key);
+			two = alloc_filespec(e->key);
+			fill_filespec(one, null_oid(), 0, 0);
+			fill_filespec(two, null_oid(), 0, 0);
+			p = diff_queue(q, one, two);
+			p->status = DIFF_STATUS_MODIFIED;
+		}
+	}
+
+	/* Re-sort the filepairs */
+	diffcore_fix_diff_index();
+
+	/* Cleanup */
+	strset_clear(&present);
+}
+
 static void diff_flush_patch_all_file_pairs(struct diff_options *o)
 {
 	int i;
@@ -6337,6 +6442,9 @@  static void diff_flush_patch_all_file_pairs(struct diff_options *o)
 	if (o->color_moved)
 		o->emitted_symbols = &esm;
 
+	if (o->additional_path_headers)
+		create_filepairs_for_header_only_notifications(o);
+
 	for (i = 0; i < q->nr; i++) {
 		struct diff_filepair *p = q->queue[i];
 		if (check_pair_status(p))
@@ -6413,7 +6521,7 @@  void diff_flush(struct diff_options *options)
 	 * Order: raw, stat, summary, patch
 	 * or:    name/name-status/checkdiff (other bits clear)
 	 */
-	if (!q->nr)
+	if (!q->nr && !options->additional_path_headers)
 		goto free_queue;
 
 	if (output_format & (DIFF_FORMAT_RAW |
diff --git a/diff.h b/diff.h
index 8ba85c5e605..06a0a67afda 100644
--- a/diff.h
+++ b/diff.h
@@ -395,6 +395,7 @@  struct diff_options {
 
 	struct repository *repo;
 	struct option *parseopts;
+	struct strmap *additional_path_headers;
 
 	int no_free;
 };
@@ -593,7 +594,7 @@  void diffcore_fix_diff_index(void);
 "                show all files diff when -S is used and hit is found.\n" \
 "  -a  --text    treat all files as text.\n"
 
-int diff_queue_is_empty(void);
+int diff_queue_is_empty(struct diff_options*);
 void diff_flush(struct diff_options*);
 void diff_free(struct diff_options*);
 void diff_warn_rename_limit(const char *varname, int needed, int degraded_cc);
diff --git a/log-tree.c b/log-tree.c
index d4655b63d75..33c28f537a6 100644
--- a/log-tree.c
+++ b/log-tree.c
@@ -850,7 +850,7 @@  int log_tree_diff_flush(struct rev_info *opt)
 	opt->shown_dashes = 0;
 	diffcore_std(&opt->diffopt);
 
-	if (diff_queue_is_empty()) {
+	if (diff_queue_is_empty(&opt->diffopt)) {
 		int saved_fmt = opt->diffopt.output_format;
 		opt->diffopt.output_format = DIFF_FORMAT_NO_OUTPUT;
 		diff_flush(&opt->diffopt);