diff mbox series

[v2,6/8] grep: add repository to OID grep sources

Message ID 50c69a988b2afca68f06e36949e4a8cc3a93940a.1628888668.git.jonathantanmy@google.com (mailing list archive)
State Superseded
Headers show
Series In grep, no adding submodule ODB as alternates | expand

Commit Message

Jonathan Tan Aug. 13, 2021, 9:05 p.m. UTC
Record the repository whenever an OID grep source is created, and teach
the worker threads to explicitly provide the repository when accessing
objects.

Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 builtin/grep.c | 15 ++++++---------
 grep.c         |  7 +++++--
 grep.h         | 13 ++++++++++++-
 3 files changed, 23 insertions(+), 12 deletions(-)

Comments

Matheus Tavares Aug. 16, 2021, 2:48 p.m. UTC | #1
On Fri, Aug 13, 2021 at 6:05 PM Jonathan Tan <jonathantanmy@google.com> wrote:
>
> Record the repository whenever an OID grep source is created, and teach
> the worker threads to explicitly provide the repository when accessing
> objects.
>
> Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
> ---
>  builtin/grep.c | 15 ++++++---------
>  grep.c         |  7 +++++--
>  grep.h         | 13 ++++++++++++-
>  3 files changed, 23 insertions(+), 12 deletions(-)
>
> diff --git a/builtin/grep.c b/builtin/grep.c
> index 69d8ea0808..d27e95e092 100644
> --- a/builtin/grep.c
> +++ b/builtin/grep.c
> @@ -349,7 +349,7 @@ static int grep_oid(struct grep_opt *opt, const struct object_id *oid,
>         struct grep_source gs;
>
>         grep_source_name(opt, filename, tree_name_len, &pathbuf);
> -       grep_source_init_oid(&gs, pathbuf.buf, path, oid);
> +       grep_source_init_oid(&gs, pathbuf.buf, path, oid, opt->repo);
>         strbuf_release(&pathbuf);
>
>         if (num_threads > 1) {
> @@ -462,14 +462,11 @@ static int grep_submodule(struct grep_opt *opt,
>         repo_read_gitmodules(subrepo, 0);
>
>         /*
> -        * NEEDSWORK: This adds the submodule's object directory to the list of
> -        * alternates for the single in-memory object store.  This has some bad
> -        * consequences for memory (processed objects will never be freed) and
> -        * performance (this increases the number of pack files git has to pay
> -        * attention to, to the sum of the number of pack files in all the
> -        * repositories processed so far).  This can be removed once the object
> -        * store is no longer global and instead is a member of the repository
> -        * object.
> +        * All code paths tested by test code no longer need submodule ODBs to
> +        * be added as alternates, but add it to the list just in case.
> +        * Submodule ODBs added through add_submodule_odb_by_path() will be
> +        * lazily registered as alternates when needed (and except in an
> +        * unexpected code interaction, it won't be needed).

Nice.

>          */
>         add_submodule_odb_by_path(subrepo->objects->odb->path);
>         obj_read_unlock();
> diff --git a/grep.c b/grep.c
> index 8a8105c2eb..79598f245f 100644
> --- a/grep.c
> +++ b/grep.c
> @@ -1863,7 +1863,8 @@ void grep_source_init_file(struct grep_source *gs, const char *name,
>  }
>
>  void grep_source_init_oid(struct grep_source *gs, const char *name,
> -                         const char *path, const struct object_id *oid)
> +                         const char *path, const struct object_id *oid,
> +                         struct repository *repo)
>  {
>         gs->type = GREP_SOURCE_OID;
>         gs->name = xstrdup_or_null(name);
> @@ -1872,6 +1873,7 @@ void grep_source_init_oid(struct grep_source *gs, const char *name,
>         gs->size = 0;
>         gs->driver = NULL;
>         gs->identifier = oiddup(oid);
> +       gs->repo = repo;
>  }
>
>  void grep_source_clear(struct grep_source *gs)
> @@ -1900,7 +1902,8 @@ static int grep_source_load_oid(struct grep_source *gs)
>  {
>         enum object_type type;
>
> -       gs->buf = read_object_file(gs->identifier, &type, &gs->size);
> +       gs->buf = repo_read_object_file(gs->repo, gs->identifier, &type,
> +                                       &gs->size);
>         if (!gs->buf)
>                 return error(_("'%s': unable to read %s"),
>                              gs->name,
> diff --git a/grep.h b/grep.h
> index 480b3f5bba..24c6b64cea 100644
> --- a/grep.h
> +++ b/grep.h
> @@ -120,7 +120,16 @@ struct grep_opt {
>         struct grep_pat *header_list;
>         struct grep_pat **header_tail;
>         struct grep_expr *pattern_expression;
> +
> +       /*
> +        * NEEDSWORK: See if we can remove this field, because the repository
> +        * should probably be per-source, not per-repo.

Hmm, I think the "not per-repo" part is a bit confusing, as it refers
to "the repository" ("the repository should not be per-repo"?) Could
we remove that part?

Maybe we could also be a bit more specific regarding the suggested
conversion:  "See if we can remove this field, because the repository
should probably be per-source. That is, grep.c functions using
`grep_opt.repo` should probably start using `grep_source.repo`
instead." (But that's nitpicking from my part, feel free to ignore
it.)

>             [...] This is potentially the
> +        * cause of at least one bug - "git grep" ignoring the textconv
> +        * attributes from submodules. See [1] for more information.
> +        * [1] https://lore.kernel.org/git/CAHd-oW5iEQarYVxEXoTG-ua2zdoybTrSjCBKtO0YT292fm0NQQ@mail.gmail.com/
> +        */
>         struct repository *repo;
> +
>         const char *prefix;
>         int prefix_length;
>         regex_t regexp;
> @@ -187,6 +196,7 @@ struct grep_source {
>                 GREP_SOURCE_BUF,
>         } type;
>         void *identifier;
> +       struct repository *repo; /* if GREP_SOURCE_OID */
>
>         char *buf;
>         unsigned long size;
> @@ -198,7 +208,8 @@ struct grep_source {
>  void grep_source_init_file(struct grep_source *gs, const char *name,
>                            const char *path);
>  void grep_source_init_oid(struct grep_source *gs, const char *name,
> -                         const char *path, const struct object_id *oid);
> +                         const char *path, const struct object_id *oid,
> +                         struct repository *repo);
>  void grep_source_clear_data(struct grep_source *gs);
>  void grep_source_clear(struct grep_source *gs);
>  void grep_source_load_driver(struct grep_source *gs,
> --
> 2.33.0.rc1.237.g0d66db33f3-goog
>
Jonathan Tan Aug. 16, 2021, 7:44 p.m. UTC | #2
> > @@ -120,7 +120,16 @@ struct grep_opt {
> >         struct grep_pat *header_list;
> >         struct grep_pat **header_tail;
> >         struct grep_expr *pattern_expression;
> > +
> > +       /*
> > +        * NEEDSWORK: See if we can remove this field, because the repository
> > +        * should probably be per-source, not per-repo.
> 
> Hmm, I think the "not per-repo" part is a bit confusing, as it refers
> to "the repository" ("the repository should not be per-repo"?) Could
> we remove that part?
> 
> Maybe we could also be a bit more specific regarding the suggested
> conversion:  "See if we can remove this field, because the repository
> should probably be per-source. That is, grep.c functions using
> `grep_opt.repo` should probably start using `grep_source.repo`
> instead." (But that's nitpicking from my part, feel free to ignore
> it.)

Thanks - your suggestion is much clearer, so I'll use it.
diff mbox series

Patch

diff --git a/builtin/grep.c b/builtin/grep.c
index 69d8ea0808..d27e95e092 100644
--- a/builtin/grep.c
+++ b/builtin/grep.c
@@ -349,7 +349,7 @@  static int grep_oid(struct grep_opt *opt, const struct object_id *oid,
 	struct grep_source gs;
 
 	grep_source_name(opt, filename, tree_name_len, &pathbuf);
-	grep_source_init_oid(&gs, pathbuf.buf, path, oid);
+	grep_source_init_oid(&gs, pathbuf.buf, path, oid, opt->repo);
 	strbuf_release(&pathbuf);
 
 	if (num_threads > 1) {
@@ -462,14 +462,11 @@  static int grep_submodule(struct grep_opt *opt,
 	repo_read_gitmodules(subrepo, 0);
 
 	/*
-	 * NEEDSWORK: This adds the submodule's object directory to the list of
-	 * alternates for the single in-memory object store.  This has some bad
-	 * consequences for memory (processed objects will never be freed) and
-	 * performance (this increases the number of pack files git has to pay
-	 * attention to, to the sum of the number of pack files in all the
-	 * repositories processed so far).  This can be removed once the object
-	 * store is no longer global and instead is a member of the repository
-	 * object.
+	 * All code paths tested by test code no longer need submodule ODBs to
+	 * be added as alternates, but add it to the list just in case.
+	 * Submodule ODBs added through add_submodule_odb_by_path() will be
+	 * lazily registered as alternates when needed (and except in an
+	 * unexpected code interaction, it won't be needed).
 	 */
 	add_submodule_odb_by_path(subrepo->objects->odb->path);
 	obj_read_unlock();
diff --git a/grep.c b/grep.c
index 8a8105c2eb..79598f245f 100644
--- a/grep.c
+++ b/grep.c
@@ -1863,7 +1863,8 @@  void grep_source_init_file(struct grep_source *gs, const char *name,
 }
 
 void grep_source_init_oid(struct grep_source *gs, const char *name,
-			  const char *path, const struct object_id *oid)
+			  const char *path, const struct object_id *oid,
+			  struct repository *repo)
 {
 	gs->type = GREP_SOURCE_OID;
 	gs->name = xstrdup_or_null(name);
@@ -1872,6 +1873,7 @@  void grep_source_init_oid(struct grep_source *gs, const char *name,
 	gs->size = 0;
 	gs->driver = NULL;
 	gs->identifier = oiddup(oid);
+	gs->repo = repo;
 }
 
 void grep_source_clear(struct grep_source *gs)
@@ -1900,7 +1902,8 @@  static int grep_source_load_oid(struct grep_source *gs)
 {
 	enum object_type type;
 
-	gs->buf = read_object_file(gs->identifier, &type, &gs->size);
+	gs->buf = repo_read_object_file(gs->repo, gs->identifier, &type,
+					&gs->size);
 	if (!gs->buf)
 		return error(_("'%s': unable to read %s"),
 			     gs->name,
diff --git a/grep.h b/grep.h
index 480b3f5bba..24c6b64cea 100644
--- a/grep.h
+++ b/grep.h
@@ -120,7 +120,16 @@  struct grep_opt {
 	struct grep_pat *header_list;
 	struct grep_pat **header_tail;
 	struct grep_expr *pattern_expression;
+
+	/*
+	 * NEEDSWORK: See if we can remove this field, because the repository
+	 * should probably be per-source, not per-repo. This is potentially the
+	 * cause of at least one bug - "git grep" ignoring the textconv
+	 * attributes from submodules. See [1] for more information.
+	 * [1] https://lore.kernel.org/git/CAHd-oW5iEQarYVxEXoTG-ua2zdoybTrSjCBKtO0YT292fm0NQQ@mail.gmail.com/
+	 */
 	struct repository *repo;
+
 	const char *prefix;
 	int prefix_length;
 	regex_t regexp;
@@ -187,6 +196,7 @@  struct grep_source {
 		GREP_SOURCE_BUF,
 	} type;
 	void *identifier;
+	struct repository *repo; /* if GREP_SOURCE_OID */
 
 	char *buf;
 	unsigned long size;
@@ -198,7 +208,8 @@  struct grep_source {
 void grep_source_init_file(struct grep_source *gs, const char *name,
 			   const char *path);
 void grep_source_init_oid(struct grep_source *gs, const char *name,
-			  const char *path, const struct object_id *oid);
+			  const char *path, const struct object_id *oid,
+			  struct repository *repo);
 void grep_source_clear_data(struct grep_source *gs);
 void grep_source_clear(struct grep_source *gs);
 void grep_source_load_driver(struct grep_source *gs,