diff mbox series

[v2,2/3] diff: add -I<regex> that ignores matching changes

Message ID 20201012091751.19594-3-michal@isc.org (mailing list archive)
State New, archived
Headers show
Series diff: add -I<regex> that ignores matching changes | expand

Commit Message

Michał Kępień Oct. 12, 2020, 9:17 a.m. UTC
Add a new diff option that enables ignoring changes whose all lines
(changed, removed, and added) match a given regular expression.  This is
similar to the -I option in standalone diff utilities and can be used
e.g. to ignore changes which only affect code comments or to look for
unrelated changes in commits containing a large number of automatically
applied modifications (e.g. a tree-wide string replacement).  The
difference between -G/-S and the new -I option is that the latter
filters output on a per-change basis.

Use the 'ignore' field of xdchange_t for marking a change as ignored or
not.  Since the same field is used by --ignore-blank-lines, identical
hunk emitting rules apply for --ignore-blank-lines and -I.  These two
options can also be used together in the same git invocation (they are
complementary to each other).

Rename xdl_mark_ignorable() to xdl_mark_ignorable_lines(), to indicate
that it is logically a "sibling" of xdl_mark_ignorable_regex() rather
than its "parent".

Signed-off-by: Michał Kępień <michal@isc.org>
---
 Documentation/diff-options.txt |  4 +++
 diff.c                         | 23 +++++++++++++++++
 diff.h                         |  4 +++
 xdiff/xdiff.h                  |  4 +++
 xdiff/xdiffi.c                 | 47 ++++++++++++++++++++++++++++++++--
 5 files changed, 80 insertions(+), 2 deletions(-)

Comments

Johannes Schindelin Oct. 12, 2020, 11:20 a.m. UTC | #1
Hi Michał,

On Mon, 12 Oct 2020, Michał Kępień wrote:

> @@ -5203,6 +5207,22 @@ static int diff_opt_patience(const struct option *opt,
>  	return 0;
>  }
>
> +static int diff_opt_ignore_regex(const struct option *opt,
> +				 const char *arg, int unset)
> +{
> +	struct diff_options *options = opt->value;
> +	regex_t *regex;
> +
> +	BUG_ON_OPT_NEG(unset);
> +	regex = xcalloc(1, sizeof(*regex));
> +	if (regcomp(regex, arg, REG_EXTENDED | REG_NEWLINE))
> +		die("invalid regex: %s", arg);
> +	ALLOC_GROW(options->ignore_regex, options->ignore_regex_nr + 1,
> +		   options->ignore_regex_alloc);
> +	options->ignore_regex[options->ignore_regex_nr++] = regex;

A slightly more elegant way would be to have `ignore_regex` have the type
`regex_t *` and use `ALLOC_GROW_BY()` (which zeroes the newly-added
elements automagically).

> +	return 0;
> +}
> +
>  static int diff_opt_pickaxe_regex(const struct option *opt,
>  				  const char *arg, int unset)
>  {
> [...]
> @@ -5491,6 +5511,9 @@ static void prep_parse_options(struct diff_options *options)
>  		OPT_BIT_F(0, "ignore-blank-lines", &options->xdl_opts,
>  			  N_("ignore changes whose lines are all blank"),
>  			  XDF_IGNORE_BLANK_LINES, PARSE_OPT_NONEG),
> +		OPT_CALLBACK_F('I', NULL, options, N_("<regex>"),
> +			       N_("ignore changes whose all lines match <regex>"),
> +			       0, diff_opt_ignore_regex),
>  		OPT_BIT(0, "indent-heuristic", &options->xdl_opts,
>  			N_("heuristic to shift diff hunk boundaries for easy reading"),
>  			XDF_INDENT_HEURISTIC),

Are we releasing the `ignore_regex` anywhere?

Thanks,
Dscho

> diff --git a/diff.h b/diff.h
> index 11de52e9e9..80dbd3dfdc 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -234,6 +234,10 @@ struct diff_options {
>  	 */
>  	const char *pickaxe;
>
> +	/* see Documentation/diff-options.txt */
> +	regex_t **ignore_regex;
> +	size_t ignore_regex_nr, ignore_regex_alloc;
> +
>  	const char *single_follow;
>  	const char *a_prefix, *b_prefix;
>  	const char *line_prefix;
> diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h
> index 032e3a9f41..883a0d770e 100644
> --- a/xdiff/xdiff.h
> +++ b/xdiff/xdiff.h
> @@ -79,6 +79,10 @@ typedef struct s_mmbuffer {
>  typedef struct s_xpparam {
>  	unsigned long flags;
>
> +	/* See Documentation/diff-options.txt. */
> +	regex_t **ignore_regex;
> +	size_t ignore_regex_nr;
> +
>  	/* See Documentation/diff-options.txt. */
>  	char **anchors;
>  	size_t anchors_nr;
> diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c
> index bd035139f9..380eb728ed 100644
> --- a/xdiff/xdiffi.c
> +++ b/xdiff/xdiffi.c
> @@ -998,7 +998,7 @@ static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
>  	return 0;
>  }
>
> -static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
> +static void xdl_mark_ignorable_lines(xdchange_t *xscr, xdfenv_t *xe, long flags)
>  {
>  	xdchange_t *xch;
>
> @@ -1019,6 +1019,46 @@ static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
>  	}
>  }
>
> +static int record_matches_regex(xrecord_t *rec, xpparam_t const *xpp) {
> +	regmatch_t regmatch;
> +	int i;
> +
> +	for (i = 0; i < xpp->ignore_regex_nr; i++)
> +		if (!regexec_buf(xpp->ignore_regex[i], rec->ptr, rec->size, 1,
> +				 &regmatch, 0))
> +			return 1;
> +
> +	return 0;
> +}
> +
> +static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe,
> +				     xpparam_t const *xpp)
> +{
> +	xdchange_t *xch;
> +
> +	for (xch = xscr; xch; xch = xch->next) {
> +		xrecord_t **rec;
> +		int ignore = 1;
> +		long i;
> +
> +		/*
> +		 * Do not override --ignore-blank-lines.
> +		 */
> +		if (xch->ignore)
> +			continue;
> +
> +		rec = &xe->xdf1.recs[xch->i1];
> +		for (i = 0; i < xch->chg1 && ignore; i++)
> +			ignore = record_matches_regex(rec[i], xpp);
> +
> +		rec = &xe->xdf2.recs[xch->i2];
> +		for (i = 0; i < xch->chg2 && ignore; i++)
> +			ignore = record_matches_regex(rec[i], xpp);
> +
> +		xch->ignore = ignore;
> +	}
> +}
> +
>  int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
>  	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
>  	xdchange_t *xscr;
> @@ -1038,7 +1078,10 @@ int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
>  	}
>  	if (xscr) {
>  		if (xpp->flags & XDF_IGNORE_BLANK_LINES)
> -			xdl_mark_ignorable(xscr, &xe, xpp->flags);
> +			xdl_mark_ignorable_lines(xscr, &xe, xpp->flags);
> +
> +		if (xpp->ignore_regex)
> +			xdl_mark_ignorable_regex(xscr, &xe, xpp);
>
>  		if (ef(&xe, xscr, ecb, xecfg) < 0) {
>
> --
> 2.28.0
>
>
Junio C Hamano Oct. 12, 2020, 6:01 p.m. UTC | #2
Michał Kępień <michal@isc.org> writes:

> Add a new diff option that enables ignoring changes whose all lines
> (changed, removed, and added) match a given regular expression.  This is
> similar to the -I option in standalone diff utilities and can be used
> e.g. to ignore changes which only affect code comments or to look for
> unrelated changes in commits containing a large number of automatically
> applied modifications (e.g. a tree-wide string replacement).  The
> difference between -G/-S and the new -I option is that the latter
> filters output on a per-change basis.
>
> Use the 'ignore' field of xdchange_t for marking a change as ignored or
> not.  Since the same field is used by --ignore-blank-lines, identical
> hunk emitting rules apply for --ignore-blank-lines and -I.  These two
> options can also be used together in the same git invocation (they are
> complementary to each other).
>
> Rename xdl_mark_ignorable() to xdl_mark_ignorable_lines(), to indicate
> that it is logically a "sibling" of xdl_mark_ignorable_regex() rather
> than its "parent".
>
> Signed-off-by: Michał Kępień <michal@isc.org>

Well explained.

> +-I<regex>::

Since we are mimicking other folks' feature, giving also the

--ignore-matching-lines=<regex>

synonym to that their users are familiar with would be a good idea,
no?

> +	Ignore changes whose all lines match <regex>.  This option may
> +	be specified more than once.
> +
>  --inter-hunk-context=<lines>::
>  	Show the context between diff hunks, up to the specified number
>  	of lines, thereby fusing hunks that are close to each other.
> diff --git a/diff.c b/diff.c
> index 2bb2f8f57e..c5d4920fee 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -3587,6 +3587,8 @@ static void builtin_diff(const char *name_a,
>  		if (header.len && !o->flags.suppress_diff_headers)
>  			ecbdata.header = &header;
>  		xpp.flags = o->xdl_opts;
> +		xpp.ignore_regex = o->ignore_regex;
> +		xpp.ignore_regex_nr = o->ignore_regex_nr;
>  		xpp.anchors = o->anchors;
>  		xpp.anchors_nr = o->anchors_nr;
>  		xecfg.ctxlen = o->context;
> @@ -3716,6 +3718,8 @@ static void builtin_diffstat(const char *name_a, const char *name_b,
>  		memset(&xpp, 0, sizeof(xpp));
>  		memset(&xecfg, 0, sizeof(xecfg));
>  		xpp.flags = o->xdl_opts;
> +		xpp.ignore_regex = o->ignore_regex;
> +		xpp.ignore_regex_nr = o->ignore_regex_nr;
>  		xpp.anchors = o->anchors;
>  		xpp.anchors_nr = o->anchors_nr;
>  		xecfg.ctxlen = o->context;
> @@ -5203,6 +5207,22 @@ static int diff_opt_patience(const struct option *opt,
>  	return 0;
>  }
>  
> +static int diff_opt_ignore_regex(const struct option *opt,
> +				 const char *arg, int unset)
> +{
> +	struct diff_options *options = opt->value;
> +	regex_t *regex;
> +
> +	BUG_ON_OPT_NEG(unset);
> +	regex = xcalloc(1, sizeof(*regex));
> +	if (regcomp(regex, arg, REG_EXTENDED | REG_NEWLINE))
> +		die("invalid regex: %s", arg);
> +	ALLOC_GROW(options->ignore_regex, options->ignore_regex_nr + 1,
> +		   options->ignore_regex_alloc);
> +	options->ignore_regex[options->ignore_regex_nr++] = regex;
> +	return 0;
> +}

Don't these parse-options callback functions have a way to tell the
caller die instead of them themselves dying like this?  Would it
work better to "return error(... message ...)", or would that give
two error messages?  In anycase, this is end-user facing, so we'd
want it to be localizable, e.g.

	die(_("invalid regexp given to -I: '%s'", arg));

probably.

>  static int diff_opt_pickaxe_regex(const struct option *opt,
>  				  const char *arg, int unset)
>  {

Other than that, nicely done.

Thanks.
Junio C Hamano Oct. 12, 2020, 8 p.m. UTC | #3
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Hi Michał,
>
> On Mon, 12 Oct 2020, Michał Kępień wrote:
>
>> @@ -5203,6 +5207,22 @@ static int diff_opt_patience(const struct option *opt,
>>  	return 0;
>>  }
>>
>> +static int diff_opt_ignore_regex(const struct option *opt,
>> +				 const char *arg, int unset)
>> +{
>> +	struct diff_options *options = opt->value;
>> +	regex_t *regex;
>> +
>> +	BUG_ON_OPT_NEG(unset);
>> +	regex = xcalloc(1, sizeof(*regex));
>> +	if (regcomp(regex, arg, REG_EXTENDED | REG_NEWLINE))
>> +		die("invalid regex: %s", arg);
>> +	ALLOC_GROW(options->ignore_regex, options->ignore_regex_nr + 1,
>> +		   options->ignore_regex_alloc);
>> +	options->ignore_regex[options->ignore_regex_nr++] = regex;
>
> A slightly more elegant way would be to have `ignore_regex` have the type
> `regex_t *` and use `ALLOC_GROW_BY()` (which zeroes the newly-added
> elements automagically).

It may be "elegant", but we we know if it is "correct" on
everybody's implementation of regex_t?

A struct like

	struct foo {
		char *str;
		char in_place_buffer[10];
	};

where str points at in_place_buffer[] only when it needs to point at
a very short string, and points at an allocated memory on heap, can
not be safely copied and used, and an array of such a struct breaks
when ALLOC_GROW_BY() needs to call realloc().  Keeping an array of
pointers to regex_t and growing it would not break even for such a
structure type.

Since we cannot rely on the implementation detail of regex_t on a
single platform, I'd rather not to see anybody suggest such an
"elegant" approach.

Thanks.
Junio C Hamano Oct. 12, 2020, 8:04 p.m. UTC | #4
Michał Kępień <michal@isc.org> writes:

> +	/* see Documentation/diff-options.txt */

This comment adds negative value.

If it were

	/* "-I<regexp>" */

the readers won't have to switch to the file only to find out that
the comment didn't tell them where in the file to look at. 

> +	regex_t **ignore_regex;
> +	size_t ignore_regex_nr, ignore_regex_alloc;
> +
>  	const char *single_follow;
>  	const char *a_prefix, *b_prefix;
>  	const char *line_prefix;
> diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h
> index 032e3a9f41..883a0d770e 100644
> --- a/xdiff/xdiff.h
> +++ b/xdiff/xdiff.h
> @@ -79,6 +79,10 @@ typedef struct s_mmbuffer {
>  typedef struct s_xpparam {
>  	unsigned long flags;
>  
> +	/* See Documentation/diff-options.txt. */

Likewise.

> +	regex_t **ignore_regex;
> +	size_t ignore_regex_nr;
> +
>  	/* See Documentation/diff-options.txt. */
>  	char **anchors;
>  	size_t anchors_nr;
Johannes Schindelin Oct. 12, 2020, 8:39 p.m. UTC | #5
Hi Junio,

On Mon, 12 Oct 2020, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
> > Hi Michał,
> >
> > On Mon, 12 Oct 2020, Michał Kępień wrote:
> >
> >> @@ -5203,6 +5207,22 @@ static int diff_opt_patience(const struct option *opt,
> >>  	return 0;
> >>  }
> >>
> >> +static int diff_opt_ignore_regex(const struct option *opt,
> >> +				 const char *arg, int unset)
> >> +{
> >> +	struct diff_options *options = opt->value;
> >> +	regex_t *regex;
> >> +
> >> +	BUG_ON_OPT_NEG(unset);
> >> +	regex = xcalloc(1, sizeof(*regex));
> >> +	if (regcomp(regex, arg, REG_EXTENDED | REG_NEWLINE))
> >> +		die("invalid regex: %s", arg);
> >> +	ALLOC_GROW(options->ignore_regex, options->ignore_regex_nr + 1,
> >> +		   options->ignore_regex_alloc);
> >> +	options->ignore_regex[options->ignore_regex_nr++] = regex;
> >
> > A slightly more elegant way would be to have `ignore_regex` have the type
> > `regex_t *` and use `ALLOC_GROW_BY()` (which zeroes the newly-added
> > elements automagically).
>
> It may be "elegant", but we we know if it is "correct" on
> everybody's implementation of regex_t?
>
> A struct like
>
> 	struct foo {
> 		char *str;
> 		char in_place_buffer[10];
> 	};
>
> where str points at in_place_buffer[] only when it needs to point at
> a very short string, and points at an allocated memory on heap, can
> not be safely copied and used, and an array of such a struct breaks
> when ALLOC_GROW_BY() needs to call realloc().  Keeping an array of
> pointers to regex_t and growing it would not break even for such a
> structure type.
>
> Since we cannot rely on the implementation detail of regex_t on a
> single platform, I'd rather not to see anybody suggest such an
> "elegant" approach.

True, such an implementation detail is totally conceivable.

Thanks for the sanity check.

Having said that, my suggestion to use `ALLOC_GROW_BY()` was triggered by
the use of `xcalloc()`, and now that I think further about it, an
`xmalloc()` should be plenty enough: `regcomp()` does not need that struct
to be initialized to all zero.

Ciao,
Dscho
Junio C Hamano Oct. 12, 2020, 9:43 p.m. UTC | #6
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> the use of `xcalloc()`, and now that I think further about it, an
> `xmalloc()` should be plenty enough: `regcomp()` does not need that struct
> to be initialized to all zero.

Yup, I think it makes sense.
Michał Kępień Oct. 13, 2020, 6:36 a.m. UTC | #7
Hi Johannes,

> > @@ -5491,6 +5511,9 @@ static void prep_parse_options(struct diff_options *options)
> >  		OPT_BIT_F(0, "ignore-blank-lines", &options->xdl_opts,
> >  			  N_("ignore changes whose lines are all blank"),
> >  			  XDF_IGNORE_BLANK_LINES, PARSE_OPT_NONEG),
> > +		OPT_CALLBACK_F('I', NULL, options, N_("<regex>"),
> > +			       N_("ignore changes whose all lines match <regex>"),
> > +			       0, diff_opt_ignore_regex),
> >  		OPT_BIT(0, "indent-heuristic", &options->xdl_opts,
> >  			N_("heuristic to shift diff hunk boundaries for easy reading"),
> >  			XDF_INDENT_HEURISTIC),
> 
> Are we releasing the `ignore_regex` anywhere?

Oops, I tried to mimic what is done for 'anchors', and I failed to
notice that apparently the elements of the options->anchors array are
only free()'d when --patience is also used and the array pointer itself
is never free()'d at all.  Given this, I believe I need to fix
diff_opt_ignore_regex() in patch 2 and also make sure that the memory
allocated in diff_opt_anchored() gets properly released - in another
preliminary clean-up patch?

At first glance, diff_flush() - specifically the part below the
'free_queue' label - looks like a sane place to free() things.  Am I
mistaken?
Michał Kępień Oct. 13, 2020, 6:37 a.m. UTC | #8
> > the use of `xcalloc()`, and now that I think further about it, an
> > `xmalloc()` should be plenty enough: `regcomp()` does not need that struct
> > to be initialized to all zero.
> 
> Yup, I think it makes sense.

Sure thing, I will switch to xmalloc() in v3.  xcalloc()'s semantics
("allocate one structure of this size") appealed to me, but there is
indeed no need to zero-initialize the allocated memory before passing it
to regcomp().
Michał Kępień Oct. 13, 2020, 6:38 a.m. UTC | #9
> > +-I<regex>::
> 
> Since we are mimicking other folks' feature, giving also the
> 
> --ignore-matching-lines=<regex>
> 
> synonym to that their users are familiar with would be a good idea,
> no?

Some diff implementations (e.g. the OpenBSD one) lack the long option
and I was aiming for the "greatest common divisor" for all diff
implementations, but sure, I do not see how adding the synonym could
hurt.  I will do that in v3.

> > @@ -5203,6 +5207,22 @@ static int diff_opt_patience(const struct option *opt,
> >  	return 0;
> >  }
> >  
> > +static int diff_opt_ignore_regex(const struct option *opt,
> > +				 const char *arg, int unset)
> > +{
> > +	struct diff_options *options = opt->value;
> > +	regex_t *regex;
> > +
> > +	BUG_ON_OPT_NEG(unset);
> > +	regex = xcalloc(1, sizeof(*regex));
> > +	if (regcomp(regex, arg, REG_EXTENDED | REG_NEWLINE))
> > +		die("invalid regex: %s", arg);
> > +	ALLOC_GROW(options->ignore_regex, options->ignore_regex_nr + 1,
> > +		   options->ignore_regex_alloc);
> > +	options->ignore_regex[options->ignore_regex_nr++] = regex;
> > +	return 0;
> > +}
> 
> Don't these parse-options callback functions have a way to tell the
> caller die instead of them themselves dying like this?  Would it
> work better to "return error(... message ...)", or would that give
> two error messages?

Yes, that would indeed look better, thanks - just one error message is
generated when "return error(...)" is used.  I just failed to notice
that it is a thing.  I will use it in v3.

> In anycase, this is end-user facing, so we'd
> want it to be localizable, e.g.
> 
> 	die(_("invalid regexp given to -I: '%s'", arg));
> 
> probably.

Absolutely, I will fix that in v3.  I tried to mimic what other similar
code in the tree does and a quick `git grep 'die.*regex'` yielded mostly
non-localized strings, so I settled for one as well.
Michał Kępień Oct. 13, 2020, 6:38 a.m. UTC | #10
> > +	/* see Documentation/diff-options.txt */
> 
> This comment adds negative value.
> 
> If it were
> 
> 	/* "-I<regexp>" */
> 
> the readers won't have to switch to the file only to find out that
> the comment didn't tell them where in the file to look at. 

Agreed, thanks, I will fix this in v3.

> > +	regex_t **ignore_regex;
> > +	size_t ignore_regex_nr, ignore_regex_alloc;
> > +
> >  	const char *single_follow;
> >  	const char *a_prefix, *b_prefix;
> >  	const char *line_prefix;
> > diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h
> > index 032e3a9f41..883a0d770e 100644
> > --- a/xdiff/xdiff.h
> > +++ b/xdiff/xdiff.h
> > @@ -79,6 +79,10 @@ typedef struct s_mmbuffer {
> >  typedef struct s_xpparam {
> >  	unsigned long flags;
> >  
> > +	/* See Documentation/diff-options.txt. */
> 
> Likewise.

I will fix this in v3.
Johannes Schindelin Oct. 13, 2020, 12:02 p.m. UTC | #11
Hi Michał,

On Tue, 13 Oct 2020, Michał Kępień wrote:

> Hi Johannes,
>
> > > @@ -5491,6 +5511,9 @@ static void prep_parse_options(struct diff_options *options)
> > >  		OPT_BIT_F(0, "ignore-blank-lines", &options->xdl_opts,
> > >  			  N_("ignore changes whose lines are all blank"),
> > >  			  XDF_IGNORE_BLANK_LINES, PARSE_OPT_NONEG),
> > > +		OPT_CALLBACK_F('I', NULL, options, N_("<regex>"),
> > > +			       N_("ignore changes whose all lines match <regex>"),
> > > +			       0, diff_opt_ignore_regex),
> > >  		OPT_BIT(0, "indent-heuristic", &options->xdl_opts,
> > >  			N_("heuristic to shift diff hunk boundaries for easy reading"),
> > >  			XDF_INDENT_HEURISTIC),
> >
> > Are we releasing the `ignore_regex` anywhere?
>
> Oops, I tried to mimic what is done for 'anchors', and I failed to
> notice that apparently the elements of the options->anchors array are
> only free()'d when --patience is also used and the array pointer itself
> is never free()'d at all.  Given this, I believe I need to fix
> diff_opt_ignore_regex() in patch 2 and also make sure that the memory
> allocated in diff_opt_anchored() gets properly released - in another
> preliminary clean-up patch?
>
> At first glance, diff_flush() - specifically the part below the
> 'free_queue' label - looks like a sane place to free() things.  Am I
> mistaken?

Oh wow, from a cursory look it seems as if the diff machinery was not
exactly careful with releasing memory. I might be mistaken, but if I am
not, then this would deserve a separate patch series, methinks.

Ciao,
Dscho
Junio C Hamano Oct. 13, 2020, 3:49 p.m. UTC | #12
Michał Kępień <michal@isc.org> writes:

> ...  xcalloc()'s semantics
> ("allocate one structure of this size") appealed to me, ...

FWIW, it appeals to me, too ;-)  But if we are going to pass it to
regcomp(), there is no point clearing it beforehand.
Junio C Hamano Oct. 13, 2020, 3:53 p.m. UTC | #13
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Oh wow, from a cursory look it seems as if the diff machinery was not
> exactly careful with releasing memory. I might be mistaken, but if I am
> not, then this would deserve a separate patch series, methinks.

I wouldn't be surprised if newer parts of it is much less careful
than the older parts of the machinery.  Most callers of the diff
machinery, including "log -p" that repeatedly generates patches,
would however run just a single setup for the entire series of diff
invocations before tearing it down AFIAR, so leaking the result of
parsing the command line option may not be such a big deal to these
callers.  But some callers added recently may not follow the access
pattern, in which case the machinery may have to be made more
leakproof.

Thanks.
Michał Kępień Oct. 13, 2020, 6:45 p.m. UTC | #14
Hi Johannes,

> Oh wow, from a cursory look it seems as if the diff machinery was not
> exactly careful with releasing memory. I might be mistaken, but if I am
> not, then this would deserve a separate patch series, methinks.

Oh, okay.  Since that sounds like a non-trivial task, I should probably
finish this patch series first, making sure it releases memory properly,
and only then move on to looking for memory leaks in diff.c.
diff mbox series

Patch

diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index 573fb9bb71..f8ccf85173 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -687,6 +687,10 @@  endif::git-format-patch[]
 --ignore-blank-lines::
 	Ignore changes whose lines are all blank.
 
+-I<regex>::
+	Ignore changes whose all lines match <regex>.  This option may
+	be specified more than once.
+
 --inter-hunk-context=<lines>::
 	Show the context between diff hunks, up to the specified number
 	of lines, thereby fusing hunks that are close to each other.
diff --git a/diff.c b/diff.c
index 2bb2f8f57e..c5d4920fee 100644
--- a/diff.c
+++ b/diff.c
@@ -3587,6 +3587,8 @@  static void builtin_diff(const char *name_a,
 		if (header.len && !o->flags.suppress_diff_headers)
 			ecbdata.header = &header;
 		xpp.flags = o->xdl_opts;
+		xpp.ignore_regex = o->ignore_regex;
+		xpp.ignore_regex_nr = o->ignore_regex_nr;
 		xpp.anchors = o->anchors;
 		xpp.anchors_nr = o->anchors_nr;
 		xecfg.ctxlen = o->context;
@@ -3716,6 +3718,8 @@  static void builtin_diffstat(const char *name_a, const char *name_b,
 		memset(&xpp, 0, sizeof(xpp));
 		memset(&xecfg, 0, sizeof(xecfg));
 		xpp.flags = o->xdl_opts;
+		xpp.ignore_regex = o->ignore_regex;
+		xpp.ignore_regex_nr = o->ignore_regex_nr;
 		xpp.anchors = o->anchors;
 		xpp.anchors_nr = o->anchors_nr;
 		xecfg.ctxlen = o->context;
@@ -5203,6 +5207,22 @@  static int diff_opt_patience(const struct option *opt,
 	return 0;
 }
 
+static int diff_opt_ignore_regex(const struct option *opt,
+				 const char *arg, int unset)
+{
+	struct diff_options *options = opt->value;
+	regex_t *regex;
+
+	BUG_ON_OPT_NEG(unset);
+	regex = xcalloc(1, sizeof(*regex));
+	if (regcomp(regex, arg, REG_EXTENDED | REG_NEWLINE))
+		die("invalid regex: %s", arg);
+	ALLOC_GROW(options->ignore_regex, options->ignore_regex_nr + 1,
+		   options->ignore_regex_alloc);
+	options->ignore_regex[options->ignore_regex_nr++] = regex;
+	return 0;
+}
+
 static int diff_opt_pickaxe_regex(const struct option *opt,
 				  const char *arg, int unset)
 {
@@ -5491,6 +5511,9 @@  static void prep_parse_options(struct diff_options *options)
 		OPT_BIT_F(0, "ignore-blank-lines", &options->xdl_opts,
 			  N_("ignore changes whose lines are all blank"),
 			  XDF_IGNORE_BLANK_LINES, PARSE_OPT_NONEG),
+		OPT_CALLBACK_F('I', NULL, options, N_("<regex>"),
+			       N_("ignore changes whose all lines match <regex>"),
+			       0, diff_opt_ignore_regex),
 		OPT_BIT(0, "indent-heuristic", &options->xdl_opts,
 			N_("heuristic to shift diff hunk boundaries for easy reading"),
 			XDF_INDENT_HEURISTIC),
diff --git a/diff.h b/diff.h
index 11de52e9e9..80dbd3dfdc 100644
--- a/diff.h
+++ b/diff.h
@@ -234,6 +234,10 @@  struct diff_options {
 	 */
 	const char *pickaxe;
 
+	/* see Documentation/diff-options.txt */
+	regex_t **ignore_regex;
+	size_t ignore_regex_nr, ignore_regex_alloc;
+
 	const char *single_follow;
 	const char *a_prefix, *b_prefix;
 	const char *line_prefix;
diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h
index 032e3a9f41..883a0d770e 100644
--- a/xdiff/xdiff.h
+++ b/xdiff/xdiff.h
@@ -79,6 +79,10 @@  typedef struct s_mmbuffer {
 typedef struct s_xpparam {
 	unsigned long flags;
 
+	/* See Documentation/diff-options.txt. */
+	regex_t **ignore_regex;
+	size_t ignore_regex_nr;
+
 	/* See Documentation/diff-options.txt. */
 	char **anchors;
 	size_t anchors_nr;
diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c
index bd035139f9..380eb728ed 100644
--- a/xdiff/xdiffi.c
+++ b/xdiff/xdiffi.c
@@ -998,7 +998,7 @@  static int xdl_call_hunk_func(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
 	return 0;
 }
 
-static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
+static void xdl_mark_ignorable_lines(xdchange_t *xscr, xdfenv_t *xe, long flags)
 {
 	xdchange_t *xch;
 
@@ -1019,6 +1019,46 @@  static void xdl_mark_ignorable(xdchange_t *xscr, xdfenv_t *xe, long flags)
 	}
 }
 
+static int record_matches_regex(xrecord_t *rec, xpparam_t const *xpp) {
+	regmatch_t regmatch;
+	int i;
+
+	for (i = 0; i < xpp->ignore_regex_nr; i++)
+		if (!regexec_buf(xpp->ignore_regex[i], rec->ptr, rec->size, 1,
+				 &regmatch, 0))
+			return 1;
+
+	return 0;
+}
+
+static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe,
+				     xpparam_t const *xpp)
+{
+	xdchange_t *xch;
+
+	for (xch = xscr; xch; xch = xch->next) {
+		xrecord_t **rec;
+		int ignore = 1;
+		long i;
+
+		/*
+		 * Do not override --ignore-blank-lines.
+		 */
+		if (xch->ignore)
+			continue;
+
+		rec = &xe->xdf1.recs[xch->i1];
+		for (i = 0; i < xch->chg1 && ignore; i++)
+			ignore = record_matches_regex(rec[i], xpp);
+
+		rec = &xe->xdf2.recs[xch->i2];
+		for (i = 0; i < xch->chg2 && ignore; i++)
+			ignore = record_matches_regex(rec[i], xpp);
+
+		xch->ignore = ignore;
+	}
+}
+
 int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
 	xdchange_t *xscr;
@@ -1038,7 +1078,10 @@  int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 	}
 	if (xscr) {
 		if (xpp->flags & XDF_IGNORE_BLANK_LINES)
-			xdl_mark_ignorable(xscr, &xe, xpp->flags);
+			xdl_mark_ignorable_lines(xscr, &xe, xpp->flags);
+
+		if (xpp->ignore_regex)
+			xdl_mark_ignorable_regex(xscr, &xe, xpp);
 
 		if (ef(&xe, xscr, ecb, xecfg) < 0) {