diff mbox series

[v3,4/9] tree: handle submodule case for read_tree_at properly

Message ID d3d1738e670d5dbf1378fc5c3209b2e98234a771.1665973401.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series archive: Add --recurse-submodules to git-archive command | expand

Commit Message

Heather Lapointe Oct. 17, 2022, 2:23 a.m. UTC
From: Heather Lapointe <alpha@alphaservcomputing.solutions>

This supports traversal into an actual submodule for read_tree_at.
The logic is blocked on pathspec->recurse_submodules now,
but previously hadn't been executed due to all fn() cases
returning early for submodules.

Signed-off-by: Heather Lapointe <alpha@alphaservcomputing.solutions>
---
 tree.c | 88 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 61 insertions(+), 27 deletions(-)

Comments

Phillip Wood Oct. 17, 2022, 1:48 p.m. UTC | #1
Hi Heather

On 17/10/2022 03:23, Heather Lapointe via GitGitGadget wrote:
> From: Heather Lapointe <alpha@alphaservcomputing.solutions>
> 
> This supports traversal into an actual submodule for read_tree_at.
> The logic is blocked on pathspec->recurse_submodules now,

I'm struggling to understand what this is saying.

> but previously hadn't been executed due to all fn() cases
> returning early for submodules.
> 
> Signed-off-by: Heather Lapointe <alpha@alphaservcomputing.solutions>
> ---
>   tree.c | 88 ++++++++++++++++++++++++++++++++++++++++------------------
>   1 file changed, 61 insertions(+), 27 deletions(-)
> 
> diff --git a/tree.c b/tree.c
> index 13f9173d45e..2a087c010f9 100644
> --- a/tree.c
> +++ b/tree.c
> @@ -8,6 +8,7 @@
>   #include "alloc.h"
>   #include "tree-walk.h"
>   #include "repository.h"
> +#include "pathspec.h"
>   
>   const char *tree_type = "tree";
>   
> @@ -47,40 +48,73 @@ int read_tree_at(struct repository *r,
>   			return -1;
>   		}
>   
> -		if (S_ISDIR(entry.mode))
> +		if (S_ISDIR(entry.mode)) {
>   			oidcpy(&oid, &entry.oid);
> -		else if (S_ISGITLINK(entry.mode)) {
> -			struct commit *commit;
>   
> -			commit = lookup_commit(r, &entry.oid);
> +			len = tree_entry_len(&entry);
> +			strbuf_add(base, entry.path, len);
> +			strbuf_addch(base, '/');
> +			retval = read_tree_at(r, lookup_tree(r, &oid),
> +						base, pathspec,
> +						fn, context);
> +			strbuf_setlen(base, oldlen);
> +			if (retval)
> +				return -1;
> +		} else if (pathspec->recurse_submodules && S_ISGITLINK(entry.mode)) {
> +			struct commit *commit;
> +			struct repository subrepo;
> +			struct repository* subrepo_p = &subrepo;

Normally we'd just use &subrepo wherever we want a pointer rather than 
defining an alias like this. For example it is common to see

	struct strbuf buf = STRBUF_INIT;

	strbuf_add(&buf, "hello world");

we don't define a buf_p variable

> +			struct tree* submodule_tree;
> +			char *submodule_rel_path;
> +			int name_base_len = 0;
> +
> +			len = tree_entry_len(&entry);
> +			strbuf_add(base, entry.path, len);
> +			submodule_rel_path = base->buf;
> +			// repo_submodule_init expects a path relative to submodule_prefix

I found the comments in this section code code helpful, but single line 
comments should be formatted as
	/* single line comment */

> +			if (r->submodule_prefix) {
> +				name_base_len = strlen(r->submodule_prefix);
> +				// we should always expect to start with submodule_prefix
> +				assert(!strncmp(submodule_rel_path, r->submodule_prefix, name_base_len));

Rather than using assert() we tend to use BUG() as that then provides a 
grep-able message. It also means that we wont have an out of bounds 
access if the invariant is violated when compiling with NDEBUG. So we 
could drop the comment and write

	if (strncmp(submodule_rel_path, r->submodule_prefix, name_base_len)
		BUG("missing submodule path prefix");

> +				// strip the prefix
> +				submodule_rel_path += name_base_len;
> +				// if submodule_prefix doesn't end with a /, we want to get rid of that too

I think there is a typo here - if the prefix does end with a / then 
we're dropping it.

> +				if (is_dir_sep(submodule_rel_path[0])) {
> +					submodule_rel_path++;
> +				}
> +			}
> +
> +			if (repo_submodule_init(subrepo_p, r, submodule_rel_path, null_oid()))
> +				die("couldn't init submodule %s", base->buf);
> +
> +			if (repo_read_index(subrepo_p) < 0)
> +				die("index file corrupt");
> +
> +			commit = lookup_commit(subrepo_p, &entry.oid);
>   			if (!commit)
> -				die("Commit %s in submodule path %s%s not found",
> +				die("Commit %s in submodule path %s not found",
>   				    oid_to_hex(&entry.oid),
> -				    base->buf, entry.path);
> -
> -			// FIXME: This is the wrong repo instance (it refers to the superproject)
> -			// it will always fail as is (will fix in later patch)
> -			// This current codepath isn't executed by any existing callbacks
> -			// so it wouldn't show up as an issue at this time.
> -			if (repo_parse_commit(r, commit))

Style comment for the patch that added this code. Multi-line comments 
should be formatted as
	/*
	 * Multi-line
	 * comment
	 */

Best Wishes

Phillip

> -				die("Invalid commit %s in submodule path %s%s",
> +				    base->buf);
> +
> +			if (repo_parse_commit(subrepo_p, commit))
> +				die("Invalid commit %s in submodule path %s",
>   				    oid_to_hex(&entry.oid),
> -				    base->buf, entry.path);
> +				    base->buf);
>   
> -			oidcpy(&oid, get_commit_tree_oid(commit));
> -		}
> -		else
> -			continue;
> +			submodule_tree = repo_get_commit_tree(subrepo_p, commit);
> +			oidcpy(&oid, submodule_tree ? &submodule_tree->object.oid : NULL);
>   
> -		len = tree_entry_len(&entry);
> -		strbuf_add(base, entry.path, len);
> -		strbuf_addch(base, '/');
> -		retval = read_tree_at(r, lookup_tree(r, &oid),
> -				      base, pathspec,
> -				      fn, context);
> -		strbuf_setlen(base, oldlen);
> -		if (retval)
> -			return -1;
> +			strbuf_addch(base, '/');
> +
> +			retval = read_tree_at(subrepo_p, lookup_tree(subrepo_p, &oid),
> +						base, pathspec,
> +						fn, context);
> +			if (retval)
> +			    die("failed to read tree for %s", base->buf);
> +			strbuf_setlen(base, oldlen);
> +			repo_clear(subrepo_p);
> +		}
> +		// else, this is a file (or a submodule, but no pathspec->recurse_submodules)
>   	}
>   	return 0;
>   }
Junio C Hamano Oct. 17, 2022, 1:56 p.m. UTC | #2
"Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: Heather Lapointe <alpha@alphaservcomputing.solutions>
>
> This supports traversal into an actual submodule for read_tree_at.
> The logic is blocked on pathspec->recurse_submodules now,
> but previously hadn't been executed due to all fn() cases
> returning early for submodules.
>
> Signed-off-by: Heather Lapointe <alpha@alphaservcomputing.solutions>
> ---
>  tree.c | 88 ++++++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 61 insertions(+), 27 deletions(-)
>
> diff --git a/tree.c b/tree.c
> index 13f9173d45e..2a087c010f9 100644
> --- a/tree.c
> +++ b/tree.c
> @@ -8,6 +8,7 @@
>  #include "alloc.h"
>  #include "tree-walk.h"
>  #include "repository.h"
> +#include "pathspec.h"
>  
>  const char *tree_type = "tree";
>  
> @@ -47,40 +48,73 @@ int read_tree_at(struct repository *r,
>  			return -1;
>  		}
>  
> -		if (S_ISDIR(entry.mode))
> +		if (S_ISDIR(entry.mode)) {
>  			oidcpy(&oid, &entry.oid);
> +			len = tree_entry_len(&entry);
> +			strbuf_add(base, entry.path, len);
> +			strbuf_addch(base, '/');
> +			retval = read_tree_at(r, lookup_tree(r, &oid),
> +						base, pathspec,
> +						fn, context);
> +			strbuf_setlen(base, oldlen);
> +			if (retval)
> +				return -1;

The diff output makes it appear as if we are now adding many extra
processing to a normal directory case, but it actually folds the
code that was originally outside the if/else if/ cascade here.  So
I think this is not breaking the normal directory case.

> +		} else if (pathspec->recurse_submodules && S_ISGITLINK(entry.mode)) {
> +			struct commit *commit;
> +			struct repository subrepo;
> +			struct repository* subrepo_p = &subrepo;
> +			struct tree* submodule_tree;

In our codebase, star/asterisk for a pointer declaration sticks to
the variable, not the type.

cf. Documentation/CodingGuidelines

> +			char *submodule_rel_path;

Funny that the new code sometimes gets it right ;-)

> +			int name_base_len = 0;
> +
> +			len = tree_entry_len(&entry);
> +			strbuf_add(base, entry.path, len);
> +			submodule_rel_path = base->buf;
> +			// repo_submodule_init expects a path relative to submodule_prefix

We avoid // comments.

> +			if (r->submodule_prefix) {
> +				name_base_len = strlen(r->submodule_prefix);
> +				// we should always expect to start with submodule_prefix
> +				assert(!strncmp(submodule_rel_path, r->submodule_prefix, name_base_len));
> +				// strip the prefix
> +				submodule_rel_path += name_base_len;
> +				// if submodule_prefix doesn't end with a /, we want to get rid of that too
> +				if (is_dir_sep(submodule_rel_path[0])) {
> +					submodule_rel_path++;
> +				}
> +			}
> +
> +			if (repo_submodule_init(subrepo_p, r, submodule_rel_path, null_oid()))
> +				die("couldn't init submodule %s", base->buf);
> +
> +			if (repo_read_index(subrepo_p) < 0)
> +				die("index file corrupt");

Why?  You are going to ask the object store of the submodule
repository, and to do so you do not need to have its index read into
the core.

> +			commit = lookup_commit(subrepo_p, &entry.oid);
>  			if (!commit)
> -				die("Commit %s in submodule path %s%s not found",
> +				die("Commit %s in submodule path %s not found",
>  				    oid_to_hex(&entry.oid),
> -				    base->buf, entry.path);
> -
> -			// FIXME: This is the wrong repo instance (it refers to the superproject)
> -			// it will always fail as is (will fix in later patch)
> -			// This current codepath isn't executed by any existing callbacks
> -			// so it wouldn't show up as an issue at this time.
> -			if (repo_parse_commit(r, commit))
> -				die("Invalid commit %s in submodule path %s%s",
> +				    base->buf);
> +
> +			if (repo_parse_commit(subrepo_p, commit))
> +				die("Invalid commit %s in submodule path %s",
>  				    oid_to_hex(&entry.oid),
> -				    base->buf, entry.path);
> +				    base->buf);
>  
> -			oidcpy(&oid, get_commit_tree_oid(commit));
> -		}
> -		else
> -			continue;
> +			submodule_tree = repo_get_commit_tree(subrepo_p, commit);
> +			oidcpy(&oid, submodule_tree ? &submodule_tree->object.oid : NULL);
>  
> -		len = tree_entry_len(&entry);
> -		strbuf_add(base, entry.path, len);
> -		strbuf_addch(base, '/');
> -		retval = read_tree_at(r, lookup_tree(r, &oid),
> -				      base, pathspec,
> -				      fn, context);
> -		strbuf_setlen(base, oldlen);
> -		if (retval)
> -			return -1;
> +			strbuf_addch(base, '/');
> +
> +			retval = read_tree_at(subrepo_p, lookup_tree(subrepo_p, &oid),
> +						base, pathspec,
> +						fn, context);
> +			if (retval)
> +			    die("failed to read tree for %s", base->buf);
> +			strbuf_setlen(base, oldlen);
> +			repo_clear(subrepo_p);

This is a lot of new code, which must be done correctly.  An easier
way out to use the add_submodule_odb() trick that the original code
assumed becomes somewhat tempting (I guess we would do that in fn()
that would tell us to recurse into this codepath upon seeing a gitlink
entry).  Then we wouldn't have had to touch any tree() calls that were
taught to take "struct repository *" in earlier steps in this series.

But at some point, we would need to bite the bullet and plumb the
repository pointer through the callchain of more APIs, and this may
be that point.  I dunno.

> +		}
> +		// else, this is a file (or a submodule, but no pathspec->recurse_submodules)
>  	}
>  	return 0;
>  }
Glen Choo Oct. 26, 2022, 10:48 p.m. UTC | #3
"Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:

> @@ -47,40 +48,73 @@ int read_tree_at(struct repository *r,
>  			return -1;
>  		}
>  
> -		if (S_ISDIR(entry.mode))
> +		if (S_ISDIR(entry.mode)) {
>  			oidcpy(&oid, &entry.oid);
> -		else if (S_ISGITLINK(entry.mode)) {
> -			struct commit *commit;
>  
> -			commit = lookup_commit(r, &entry.oid);
> +			len = tree_entry_len(&entry);
> +			strbuf_add(base, entry.path, len);
> +			strbuf_addch(base, '/');
> +			retval = read_tree_at(r, lookup_tree(r, &oid),
> +						base, pathspec,
> +						fn, context);
> +			strbuf_setlen(base, oldlen);
> +			if (retval)
> +				return -1;
> +		} else if (pathspec->recurse_submodules && S_ISGITLINK(entry.mode)) {
> +			struct commit *commit;
> +			struct repository subrepo;
> +			struct repository* subrepo_p = &subrepo;
> +			struct tree* submodule_tree;
> +			char *submodule_rel_path;
> +			int name_base_len = 0;
> +
> +			len = tree_entry_len(&entry);
> +			strbuf_add(base, entry.path, len);
> +			submodule_rel_path = base->buf;
> +			// repo_submodule_init expects a path relative to submodule_prefix
> +			if (r->submodule_prefix) {
> +				name_base_len = strlen(r->submodule_prefix);
> +				// we should always expect to start with submodule_prefix
> +				assert(!strncmp(submodule_rel_path, r->submodule_prefix, name_base_len));
> +				// strip the prefix
> +				submodule_rel_path += name_base_len;
> +				// if submodule_prefix doesn't end with a /, we want to get rid of that too
> +				if (is_dir_sep(submodule_rel_path[0])) {
> +					submodule_rel_path++;
> +				}
> +			}
> +
> +			if (repo_submodule_init(subrepo_p, r, submodule_rel_path, null_oid()))
> +				die("couldn't init submodule %s", base->buf);
> +
> +			if (repo_read_index(subrepo_p) < 0)
> +				die("index file corrupt");
> +
> +			commit = lookup_commit(subrepo_p, &entry.oid);
>  			if (!commit)
> -				die("Commit %s in submodule path %s%s not found",
> +				die("Commit %s in submodule path %s not found",
>  				    oid_to_hex(&entry.oid),
> -				    base->buf, entry.path);
> -
> -			// FIXME: This is the wrong repo instance (it refers to the superproject)
> -			// it will always fail as is (will fix in later patch)
> -			// This current codepath isn't executed by any existing callbacks
> -			// so it wouldn't show up as an issue at this time.
> -			if (repo_parse_commit(r, commit))
> -				die("Invalid commit %s in submodule path %s%s",
> +				    base->buf);
> +
> +			if (repo_parse_commit(subrepo_p, commit))
> +				die("Invalid commit %s in submodule path %s",
>  				    oid_to_hex(&entry.oid),
> -				    base->buf, entry.path);
> +				    base->buf);
>  
> -			oidcpy(&oid, get_commit_tree_oid(commit));
> -		}
> -		else
> -			continue;
> +			submodule_tree = repo_get_commit_tree(subrepo_p, commit);
> +			oidcpy(&oid, submodule_tree ? &submodule_tree->object.oid : NULL);
>  
> -		len = tree_entry_len(&entry);
> -		strbuf_add(base, entry.path, len);
> -		strbuf_addch(base, '/');
> -		retval = read_tree_at(r, lookup_tree(r, &oid),
> -				      base, pathspec,
> -				      fn, context);
> -		strbuf_setlen(base, oldlen);
> -		if (retval)
> -			return -1;
> +			strbuf_addch(base, '/');
> +
> +			retval = read_tree_at(subrepo_p, lookup_tree(subrepo_p, &oid),
> +						base, pathspec,
> +						fn, context);
> +			if (retval)
> +			    die("failed to read tree for %s", base->buf);
> +			strbuf_setlen(base, oldlen);
> +			repo_clear(subrepo_p);
> +		}
> +		// else, this is a file (or a submodule, but no pathspec->recurse_submodules)

In this patch, we say that we can ignore a submodule when
pathspec->recurse_submodules is 0, but unless I'm missing something, I
don't think that's the case. The preimage is:

		else if (S_ISGITLINK(entry.mode)) {
			struct commit *commit;

			commit = lookup_commit(r, &entry.oid);
			if (!commit)
				die("Commit %s in submodule path %s%s not found",
				    oid_to_hex(&entry.oid),
				    base->buf, entry.path);

      /* ... */
			if (repo_parse_commit(r, commit))
				die("Invalid commit %s in submodule path %s%s",
				    oid_to_hex(&entry.oid),
				    base->buf, entry.path);

			oidcpy(&oid, get_commit_tree_oid(commit));
		}
		else
			continue;

		len = tree_entry_len(&entry);
		strbuf_add(base, entry.path, len);
		strbuf_addch(base, '/');
		retval = read_tree_at(r, lookup_tree(r, &oid),
				      base, pathspec,
				      fn, context);

which isn't a no-op since we actually do recurse into the gitlink. I
don't know whether the subsequent call actually succeeds though (e.g.
maybe it always failed and it was just a de facto no-op?), but that's
much harder to prove. Since this function has callers outside of "git
archive", it would be better to be conservative and keep the original
behavior in the S_ISGITLINK(entry.mode) && !pathspec->recurse_submodules
case.

>  	}
>  	return 0;
>  }
> -- 
> gitgitgadget
Jonathan Tan Oct. 27, 2022, 6:43 p.m. UTC | #4
"Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:
> From: Heather Lapointe <alpha@alphaservcomputing.solutions>
> 
> This supports traversal into an actual submodule for read_tree_at.
> The logic is blocked on pathspec->recurse_submodules now,

What do you mean by "blocked"? Do you mean only that 
pathspec->recurse_submodules needs to be specified, or do you also mean 
that no caller specifies pathspec->recurse_submodules now, so this code 
path is never executed? 

> but previously hadn't been executed due to all fn() cases
> returning early for submodules.

What do you mean by "previously hadn't been executed due to..."? At a 
glance, I would think that this new logic is introduced in this patch, 
so of course it would never have been previously executed. 

> +			if (repo_submodule_init(subrepo_p, r, submodule_rel_path, null_oid()))

I don't think this can be null_oid() here - it has to match the tree 
from which you constructed submodule_rel_path. (That tree may not be 
the tree at HEAD.)
diff mbox series

Patch

diff --git a/tree.c b/tree.c
index 13f9173d45e..2a087c010f9 100644
--- a/tree.c
+++ b/tree.c
@@ -8,6 +8,7 @@ 
 #include "alloc.h"
 #include "tree-walk.h"
 #include "repository.h"
+#include "pathspec.h"
 
 const char *tree_type = "tree";
 
@@ -47,40 +48,73 @@  int read_tree_at(struct repository *r,
 			return -1;
 		}
 
-		if (S_ISDIR(entry.mode))
+		if (S_ISDIR(entry.mode)) {
 			oidcpy(&oid, &entry.oid);
-		else if (S_ISGITLINK(entry.mode)) {
-			struct commit *commit;
 
-			commit = lookup_commit(r, &entry.oid);
+			len = tree_entry_len(&entry);
+			strbuf_add(base, entry.path, len);
+			strbuf_addch(base, '/');
+			retval = read_tree_at(r, lookup_tree(r, &oid),
+						base, pathspec,
+						fn, context);
+			strbuf_setlen(base, oldlen);
+			if (retval)
+				return -1;
+		} else if (pathspec->recurse_submodules && S_ISGITLINK(entry.mode)) {
+			struct commit *commit;
+			struct repository subrepo;
+			struct repository* subrepo_p = &subrepo;
+			struct tree* submodule_tree;
+			char *submodule_rel_path;
+			int name_base_len = 0;
+
+			len = tree_entry_len(&entry);
+			strbuf_add(base, entry.path, len);
+			submodule_rel_path = base->buf;
+			// repo_submodule_init expects a path relative to submodule_prefix
+			if (r->submodule_prefix) {
+				name_base_len = strlen(r->submodule_prefix);
+				// we should always expect to start with submodule_prefix
+				assert(!strncmp(submodule_rel_path, r->submodule_prefix, name_base_len));
+				// strip the prefix
+				submodule_rel_path += name_base_len;
+				// if submodule_prefix doesn't end with a /, we want to get rid of that too
+				if (is_dir_sep(submodule_rel_path[0])) {
+					submodule_rel_path++;
+				}
+			}
+
+			if (repo_submodule_init(subrepo_p, r, submodule_rel_path, null_oid()))
+				die("couldn't init submodule %s", base->buf);
+
+			if (repo_read_index(subrepo_p) < 0)
+				die("index file corrupt");
+
+			commit = lookup_commit(subrepo_p, &entry.oid);
 			if (!commit)
-				die("Commit %s in submodule path %s%s not found",
+				die("Commit %s in submodule path %s not found",
 				    oid_to_hex(&entry.oid),
-				    base->buf, entry.path);
-
-			// FIXME: This is the wrong repo instance (it refers to the superproject)
-			// it will always fail as is (will fix in later patch)
-			// This current codepath isn't executed by any existing callbacks
-			// so it wouldn't show up as an issue at this time.
-			if (repo_parse_commit(r, commit))
-				die("Invalid commit %s in submodule path %s%s",
+				    base->buf);
+
+			if (repo_parse_commit(subrepo_p, commit))
+				die("Invalid commit %s in submodule path %s",
 				    oid_to_hex(&entry.oid),
-				    base->buf, entry.path);
+				    base->buf);
 
-			oidcpy(&oid, get_commit_tree_oid(commit));
-		}
-		else
-			continue;
+			submodule_tree = repo_get_commit_tree(subrepo_p, commit);
+			oidcpy(&oid, submodule_tree ? &submodule_tree->object.oid : NULL);
 
-		len = tree_entry_len(&entry);
-		strbuf_add(base, entry.path, len);
-		strbuf_addch(base, '/');
-		retval = read_tree_at(r, lookup_tree(r, &oid),
-				      base, pathspec,
-				      fn, context);
-		strbuf_setlen(base, oldlen);
-		if (retval)
-			return -1;
+			strbuf_addch(base, '/');
+
+			retval = read_tree_at(subrepo_p, lookup_tree(subrepo_p, &oid),
+						base, pathspec,
+						fn, context);
+			if (retval)
+			    die("failed to read tree for %s", base->buf);
+			strbuf_setlen(base, oldlen);
+			repo_clear(subrepo_p);
+		}
+		// else, this is a file (or a submodule, but no pathspec->recurse_submodules)
 	}
 	return 0;
 }