diff mbox series

[v2] grep: add --max-count command line option

Message ID pull.1278.v2.git.git.1655789777023.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series [v2] grep: add --max-count command line option | expand

Commit Message

Carlos López June 21, 2022, 5:36 a.m. UTC
From: =?UTF-8?q?Carlos=20L=C3=B3pez?= <00xc@protonmail.com>

This patch adds a command line option analogous to that of GNU
grep(1)'s -m / --max-count, which users might already be used to.
This makes it possible to limit the amount of matches shown in the
output while keeping the functionality of other options such as -C
(show code context) or -p (show containing function), which would be
difficult to do with a shell pipeline (e.g. head(1)).

Signed-off-by: Carlos López 00xc@protonmail.com
---
    grep: add --max-count command line option
    
    This patch adds a command line option analogous to that of GNU grep(1)'s
    -m / --max-count, which users might already be used to. This makes it
    possible to limit the amount of matches shown in the output while
    keeping the functionality of other options such as -C (show code
    context) or -p (show containing function), which would be difficult to
    do with a shell pipeline (e.g. head(1)).
    
    Signed-off-by: Carlos López 00xc@protonmail.com

Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-git-1278%2F00xc%2Fmaster-v2
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-git-1278/00xc/master-v2
Pull-Request: https://github.com/git/git/pull/1278

Range-diff vs v1:

 1:  f89c6e244aa ! 1:  ee7eb298854 grep: add --max-count command line option
     @@ grep.c: static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, i
       		bol = eol + 1;
       		if (!left)
       			break;
     -+		if (opt->max_count != (unsigned)-1 && count == opt->max_count)
     ++		if (opt->max_count != -1 && count == opt->max_count)
      +			break;
       		left--;
       		lno++;
     @@ grep.h: struct grep_opt {
       	int show_hunk_mark;
       	int file_break;
       	int heading;
     -+	unsigned max_count;
     ++	int max_count;
       	void *priv;
       
       	void (*output)(struct grep_opt *opt, const void *data, size_t size);
     @@ grep.h: struct grep_opt {
       	.relative = 1, \
       	.pathname = 1, \
       	.max_depth = -1, \
     -+	.max_count = (unsigned)-1, \
     ++	.max_count = -1, \
       	.pattern_type_option = GREP_PATTERN_TYPE_UNSPECIFIED, \
       	.colors = { \
       		[GREP_COLOR_CONTEXT] = "", \


 Documentation/git-grep.txt | 8 ++++++++
 builtin/grep.c             | 9 +++++++++
 grep.c                     | 2 ++
 grep.h                     | 2 ++
 4 files changed, 21 insertions(+)


base-commit: 5b71c59bc3b9365075e2a175aa7b6f2b0c84ce44

Comments

Junio C Hamano June 21, 2022, 4:27 p.m. UTC | #1
"Carlos L. via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: =?UTF-8?q?Carlos=20L=C3=B3pez?= <00xc@protonmail.com>
>
> This patch adds a command line option analogous to that of GNU
> grep(1)'s -m / --max-count, which users might already be used to.
> This makes it possible to limit the amount of matches shown in the
> output while keeping the functionality of other options such as -C
> (show code context) or -p (show containing function), which would be
> difficult to do with a shell pipeline (e.g. head(1)).
>
> Signed-off-by: Carlos López 00xc@protonmail.com
> ---
> ...
>  Documentation/git-grep.txt | 8 ++++++++
>  builtin/grep.c             | 9 +++++++++
>  grep.c                     | 2 ++
>  grep.h                     | 2 ++
>  4 files changed, 21 insertions(+)

Tests?

> diff --git a/Documentation/git-grep.txt b/Documentation/git-grep.txt
> index 3d393fbac1b..19b817d5e58 100644
> --- a/Documentation/git-grep.txt
> +++ b/Documentation/git-grep.txt
> @@ -23,6 +23,7 @@ SYNOPSIS
>  	   [--break] [--heading] [-p | --show-function]
>  	   [-A <post-context>] [-B <pre-context>] [-C <context>]
>  	   [-W | --function-context]
> +	   [(-m | --max-count) <num>]
>  	   [--threads <num>]
>  	   [-f <file>] [-e] <pattern>
>  	   [--and|--or|--not|(|)|-e <pattern>...]
> @@ -238,6 +239,13 @@ providing this option will cause it to die.
>  	`git diff` works out patch hunk headers (see 'Defining a
>  	custom hunk-header' in linkgit:gitattributes[5]).
>  
> +-m <num>::
> +--max-count <num>::
> +	Limit the amount of matches per file. When using the `-v` or
> +	`--invert-match` option, the search stops after the specified
> +	number of non-matches. A value of -1 will return unlimited
> +	results (the default).

Hmph ...

> +	/*
> +	 * Optimize out the case where the amount of matches is limited to zero.
> +	 * We do this to keep results consistent with GNU grep(1).
> +	 */
> +	if (opt.max_count == 0)
> +		exit(EXIT_FAILURE);
> +

OK, so "stop before seeing any match" logically leads to "we found
nothing, so exit with non-zero".

> diff --git a/grep.c b/grep.c
> index 82eb7da1022..b32ab75cb6b 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -1686,6 +1686,8 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
>  		bol = eol + 1;
>  		if (!left)
>  			break;
> +		if (opt->max_count != -1 && count == opt->max_count)
> +			break;

I would have written it "if (0 <= opt->max_count && ...)".  What
happens when a trickster asks you to do "git grep -m -2"?

I guess what I am getting at is if we are better off saying that
negative means unlimited, instead of special casing -1 like this.  I
didn't think it through so it may be perfectly possible that what
you wrote makes more sense than "anything negative is unlimited".

I dunno.

>  		left--;
>  		lno++;
>  	}

Thanks.

> diff --git a/grep.h b/grep.h
> index c722d25ed9d..bdcadce61b8 100644
> --- a/grep.h
> +++ b/grep.h
> @@ -171,6 +171,7 @@ struct grep_opt {
>  	int show_hunk_mark;
>  	int file_break;
>  	int heading;
> +	int max_count;
>  	void *priv;
>  
>  	void (*output)(struct grep_opt *opt, const void *data, size_t size);
> @@ -181,6 +182,7 @@ struct grep_opt {
>  	.relative = 1, \
>  	.pathname = 1, \
>  	.max_depth = -1, \
> +	.max_count = -1, \
>  	.pattern_type_option = GREP_PATTERN_TYPE_UNSPECIFIED, \
>  	.colors = { \
>  		[GREP_COLOR_CONTEXT] = "", \
>
> base-commit: 5b71c59bc3b9365075e2a175aa7b6f2b0c84ce44
Carlos López June 22, 2022, 6:41 a.m. UTC | #2
Hi,

Just a couple of questions.

On Tuesday, June 21st, 2022 at 18:27, Junio C Hamano <gitster@pobox.com> wrote:

> "Carlos L. via GitGitGadget" gitgitgadget@gmail.com writes:
>
> > From: =?UTF-8?q?Carlos=20L=C3=B3pez?= 00xc@protonmail.com
> >
> > This patch adds a command line option analogous to that of GNU
> > grep(1)'s -m / --max-count, which users might already be used to.
> > This makes it possible to limit the amount of matches shown in the
> > output while keeping the functionality of other options such as -C
> > (show code context) or -p (show containing function), which would be
> > difficult to do with a shell pipeline (e.g. head(1)).
> >
> > Signed-off-by: Carlos López 00xc@protonmail.com
> > ---
> > ...
> > Documentation/git-grep.txt | 8 ++++++++
> > builtin/grep.c | 9 +++++++++
> > grep.c | 2 ++
> > grep.h | 2 ++
> > 4 files changed, 21 insertions(+)
>
>
> Tests?

Right. Is it OK if I include my test(s) in t/t7810-grep.sh, or should it be a different/new file?

> > diff --git a/grep.c b/grep.c
> > index 82eb7da1022..b32ab75cb6b 100644
> > --- a/grep.c
> > +++ b/grep.c
> > @@ -1686,6 +1686,8 @@ static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
> > bol = eol + 1;
> > if (!left)
> > break;
> > + if (opt->max_count != -1 && count == opt->max_count)
> > + break;
>
>
> I would have written it "if (0 <= opt->max_count && ...)". What
> happens when a trickster asks you to do "git grep -m -2"?

Fair enough. Since it's already optimized out above, is there any reason we need to include zero (<=)?

> I guess what I am getting at is if we are better off saying that
> negative means unlimited, instead of special casing -1 like this. I
> didn't think it through so it may be perfectly possible that what
> you wrote makes more sense than "anything negative is unlimited".
>
> I dunno.

I think you're right, I'll adjust my patch.

Best,
Carlos
Junio C Hamano June 22, 2022, 6:56 a.m. UTC | #3
"Carlos L." <00xc@protonmail.com> writes:

>> Tests?
>
> Right. Is it OK if I include my test(s) in t/t7810-grep.sh, or
> should it be a different/new file?

It is preferable to add new tests to existing scripts, rather than
adding a new (and short) one.

Thanks.
Carlos López June 22, 2022, 1:23 p.m. UTC | #4
Hi,

On Wednesday, June 22nd, 2022 at 12:22, Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:

> On Tue, Jun 21 2022, Carlos L. via GitGitGadget wrote:
>
> > From: =?UTF-8?q?Carlos=20L=C3=B3pez?= 00xc@protonmail.com
> >
> > This patch adds a command line option analogous to that of GNU
> > grep(1)'s -m / --max-count, which users might already be used to.
>
>
> Thanks, this seems useful.
>
> > This makes it possible to limit the amount of matches shown in the
> > output while keeping the functionality of other options such as -C
> > (show code context) or -p (show containing function), which would be
> > difficult to do with a shell pipeline (e.g. head(1)).
>
>
> We start multi-threaded grep workers. How does this code handle races
> between them finding things, this count being incremented, and the "do
> we have sufficient results?" check?
>
> Is it guarded by the relevant mutexes?

AFAICT only a single thread runs on each file via grep_source_1(), and we check `count`, which is local to this function.

> > + /*
> > + * Optimize out the case where the amount of matches is limited to zero.
> > + * We do this to keep results consistent with GNU grep(1).
> > + */
> > + if (opt.max_count == 0)
> > + exit(EXIT_FAILURE);
>
>
> Don't use exit() in cmd_grep(), you should use "return 1".

I'll use return in my follow-up patch; this can be improved afterwards.

> But even better use usage_msg_opt() here, i.e. inform the user why this
> was bad.
>
> Or hrm, it seems GNU grep silently returns 1 here, perhaps --max-count=0
> is a feature for some?
>
> If this is intentional it's worth documenting and testing it explicitly.

I will add a sentence about this in Documentation/git-grep.txt.

> Re the comments from others about size_t or whatever, it might be better
> here to use OPT_CALLBACK and an unsigned type.
>
> Then just have a "int have_max_count:1", which IMO is more obvious than
> using integer wrap-around to test "didn't provide this flag".

FWIW, I think it's fine to use int and a negative value as a special encoding; max_depth does the same thing. These are per-file matches, so they should not go over 2 billion in reasonable use cases.
diff mbox series

Patch

diff --git a/Documentation/git-grep.txt b/Documentation/git-grep.txt
index 3d393fbac1b..19b817d5e58 100644
--- a/Documentation/git-grep.txt
+++ b/Documentation/git-grep.txt
@@ -23,6 +23,7 @@  SYNOPSIS
 	   [--break] [--heading] [-p | --show-function]
 	   [-A <post-context>] [-B <pre-context>] [-C <context>]
 	   [-W | --function-context]
+	   [(-m | --max-count) <num>]
 	   [--threads <num>]
 	   [-f <file>] [-e] <pattern>
 	   [--and|--or|--not|(|)|-e <pattern>...]
@@ -238,6 +239,13 @@  providing this option will cause it to die.
 	`git diff` works out patch hunk headers (see 'Defining a
 	custom hunk-header' in linkgit:gitattributes[5]).
 
+-m <num>::
+--max-count <num>::
+	Limit the amount of matches per file. When using the `-v` or
+	`--invert-match` option, the search stops after the specified
+	number of non-matches. A value of -1 will return unlimited
+	results (the default).
+
 --threads <num>::
 	Number of grep worker threads to use.
 	See `grep.threads` in 'CONFIGURATION' for more information.
diff --git a/builtin/grep.c b/builtin/grep.c
index bcb07ea7f75..4ab28995da0 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -961,6 +961,8 @@  int cmd_grep(int argc, const char **argv, const char *prefix)
 		OPT_BOOL_F(0, "ext-grep", &external_grep_allowed__ignored,
 			   N_("allow calling of grep(1) (ignored by this build)"),
 			   PARSE_OPT_NOCOMPLETE),
+		OPT_INTEGER('m', "max-count", &opt.max_count,
+			N_("maximum number of results per file")),
 		OPT_END()
 	};
 	grep_prefix = prefix;
@@ -1101,6 +1103,13 @@  int cmd_grep(int argc, const char **argv, const char *prefix)
 	if (recurse_submodules && untracked)
 		die(_("--untracked not supported with --recurse-submodules"));
 
+	/*
+	 * Optimize out the case where the amount of matches is limited to zero.
+	 * We do this to keep results consistent with GNU grep(1).
+	 */
+	if (opt.max_count == 0)
+		exit(EXIT_FAILURE);
+
 	if (show_in_pager) {
 		if (num_threads > 1)
 			warning(_("invalid option combination, ignoring --threads"));
diff --git a/grep.c b/grep.c
index 82eb7da1022..b32ab75cb6b 100644
--- a/grep.c
+++ b/grep.c
@@ -1686,6 +1686,8 @@  static int grep_source_1(struct grep_opt *opt, struct grep_source *gs, int colle
 		bol = eol + 1;
 		if (!left)
 			break;
+		if (opt->max_count != -1 && count == opt->max_count)
+			break;
 		left--;
 		lno++;
 	}
diff --git a/grep.h b/grep.h
index c722d25ed9d..bdcadce61b8 100644
--- a/grep.h
+++ b/grep.h
@@ -171,6 +171,7 @@  struct grep_opt {
 	int show_hunk_mark;
 	int file_break;
 	int heading;
+	int max_count;
 	void *priv;
 
 	void (*output)(struct grep_opt *opt, const void *data, size_t size);
@@ -181,6 +182,7 @@  struct grep_opt {
 	.relative = 1, \
 	.pathname = 1, \
 	.max_depth = -1, \
+	.max_count = -1, \
 	.pattern_type_option = GREP_PATTERN_TYPE_UNSPECIFIED, \
 	.colors = { \
 		[GREP_COLOR_CONTEXT] = "", \