diff mbox series

[v3,8/9] archive: add --recurse-submodules to git-archive command

Message ID 4672e3d958625cd76eb8056ab434e9a37f52661e.1665973401.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series archive: Add --recurse-submodules to git-archive command | expand

Commit Message

Heather Lapointe Oct. 17, 2022, 2:23 a.m. UTC
From: Heather Lapointe <alpha@alphaservcomputing.solutions>

This makes it possible to include submodule contents in an archive command.

The default behavior remains the same: submodule contents are not
written to the resulting archive.

Signed-off-by: Heather Lapointe <alpha@alphaservcomputing.solutions>
---
 Documentation/git-archive.txt |  6 +++++-
 archive.c                     | 36 +++++++++++++++++++++++++++++++++--
 archive.h                     |  1 +
 3 files changed, 40 insertions(+), 3 deletions(-)

Comments

Glen Choo Oct. 26, 2022, 11:34 p.m. UTC | #1
"Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:

> index 34549d849f1..f81ef741487 100644
> --- a/archive.c
> +++ b/archive.c
> @@ -213,6 +214,25 @@ static void queue_directory(const struct object_id *oid,
>  	oidcpy(&d->oid, oid);
>  }
>  
> +static void queue_submodule(
> +		struct repository *superproject,
> +		const struct object_id *oid,
> +		struct strbuf *base, const char *filename,
> +		unsigned mode, struct archiver_context *c)
> +{
> +	struct repository subrepo;
> +
> +	if (repo_submodule_init(&subrepo, superproject, filename, null_oid()))
> +		return;
> +
> +	if (repo_read_index(&subrepo) < 0)
> +		die("index file corrupt");
> +
> +    queue_directory(oid, base, filename, mode, c);
> +
> +	repo_clear(&subrepo);
> +}
> +

This bit is puzzling to me because we init the submodule, read its
index, and then don't read objects from it at all. How does this work
when we aren't reading objects from the submodule we init here? My guess
is that read_tree() is already doing the heavy lifting of recursing into
submodules, so we don't need to worry any more about init-ing submodules
in archive.c, which is great.

So in effect, this is just checking whether we can read the submodule
and its index. We can drop this check since we already do that check in
read_tree().

What's much more surprising is that you can delete the entire function
body (even queue_directory()!) and the tests still pass! The tests are
definitely testing what they say they are (I've also checked the
tarballs), so I'm not sure what's going on.

I commented out queue_directory() in the S_ISDIR case, and the only test
failures I saw were:

- t5000.68, which uses a glob in its pathspec. I tried using a glob
  in the archive submodule tests, but I couldn't reproduce the failure.
- t5004.11, which is a really big test case that I didn't bother looking
  deeply into.

So I'm at a loss as to what queue_directory() actually does. My best
guess at a reproduction would be to make a subdirectory in t5000.68 a
submodule. If we do find such a reproducing case, we should add it to
the test suite.

>  static int write_directory(
>  		struct repository *repo,
>  		struct archiver_context *c)
> @@ -228,9 +248,11 @@ static int write_directory(
>  		write_directory(repo, c) ||
>  		write_archive_entry(repo, &d->oid, d->path, d->baselen,
>  				    d->path + d->baselen, d->mode,
> -				    c) != READ_TREE_RECURSIVE;
> +				    c);
>  	free(d);
> -	return ret ? -1 : 0;
> +	if (ret == READ_TREE_RECURSIVE)
> +		return 0;
> +	return ret;
>  }
>  
>  static int queue_or_write_archive_entry(
> @@ -263,6 +285,11 @@ static int queue_or_write_archive_entry(
>  			return 0;
>  		queue_directory(oid, base, filename, mode, c);
>  		return READ_TREE_RECURSIVE;
> +	} else if (c->args->recurse_submodules && S_ISGITLINK(mode)) {
> +		if (is_submodule_active(r, filename)) {
> +			queue_submodule(r, oid, base, filename, mode, c);
> +			return READ_TREE_RECURSIVE;
> +		}

If we are omitting inactive submodules from the archive, we should test
this behavior.

>  	}
>  
>  	if (write_directory(r, c))
> @@ -446,6 +473,7 @@ static void parse_pathspec_arg(
>  		       PATHSPEC_PREFER_FULL,
>  		       "", pathspec);
>  	ar_args->pathspec.recursive = 1;
> +	ar_args->pathspec.recurse_submodules = ar_args->recurse_submodules;
>  	if (pathspec) {
>  		while (*pathspec) {
>  			if (**pathspec && !path_exists(repo, ar_args, *pathspec))
> @@ -609,6 +637,7 @@ static int parse_archive_args(int argc, const char **argv,
>  	int verbose = 0;
>  	int i;
>  	int list = 0;
> +	int recurse_submodules = 0;
>  	int worktree_attributes = 0;
>  	struct option opts[] = {
>  		OPT_GROUP(""),
> @@ -623,6 +652,8 @@ static int parse_archive_args(int argc, const char **argv,
>  		  add_file_cb, (intptr_t)&base },
>  		OPT_STRING('o', "output", &output, N_("file"),
>  			N_("write the archive to this file")),
> +		OPT_BOOL(0, "recurse-submodules", &recurse_submodules,
> +			N_("include submodules in archive")),
>  		OPT_BOOL(0, "worktree-attributes", &worktree_attributes,
>  			N_("read .gitattributes in working directory")),
>  		OPT__VERBOSE(&verbose, N_("report archived files on stderr")),
> @@ -686,6 +717,7 @@ static int parse_archive_args(int argc, const char **argv,
>  	args->verbose = verbose;
>  	args->base = base;
>  	args->baselen = strlen(base);
> +	args->recurse_submodules = recurse_submodules;
>  	args->worktree_attributes = worktree_attributes;
>  
>  	return argc;
> diff --git a/archive.h b/archive.h
> index 540a3b12130..1b21484dda6 100644
> --- a/archive.h
> +++ b/archive.h
> @@ -18,6 +18,7 @@ struct archiver_args {
>  	timestamp_t time;
>  	struct pathspec pathspec;
>  	unsigned int verbose : 1;
> +	unsigned int recurse_submodules : 1;
>  	unsigned int worktree_attributes : 1;
>  	unsigned int convert : 1;
>  	int compression_level;
> -- 
> gitgitgadget
René Scharfe Oct. 27, 2022, 7:09 a.m. UTC | #2
Am 27.10.22 um 01:34 schrieb Glen Choo:
> "Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:
>
>> index 34549d849f1..f81ef741487 100644
>> --- a/archive.c
>> +++ b/archive.c
>> @@ -213,6 +214,25 @@ static void queue_directory(const struct object_id *oid,
>>  	oidcpy(&d->oid, oid);
>>  }
>>
>> +static void queue_submodule(
>> +		struct repository *superproject,
>> +		const struct object_id *oid,
>> +		struct strbuf *base, const char *filename,
>> +		unsigned mode, struct archiver_context *c)
>> +{
>> +	struct repository subrepo;
>> +
>> +	if (repo_submodule_init(&subrepo, superproject, filename, null_oid()))
>> +		return;
>> +
>> +	if (repo_read_index(&subrepo) < 0)
>> +		die("index file corrupt");
>> +
>> +    queue_directory(oid, base, filename, mode, c);
>> +
>> +	repo_clear(&subrepo);
>> +}
>> +

> What's much more surprising is that you can delete the entire function
> body (even queue_directory()!) and the tests still pass! The tests are
> definitely testing what they say they are (I've also checked the
> tarballs), so I'm not sure what's going on.
>
> I commented out queue_directory() in the S_ISDIR case, and the only test
> failures I saw were:
>
> - t5000.68, which uses a glob in its pathspec. I tried using a glob for
>   in the archive submodule tests, but I couldn't reproduce the failure.
> - t5004.11, which is a really big test case that I didn't bother looking
>   deeply into.
>
> So I'm at a loss as to what queue_directory() actually does.
An archive doesn't strictly need directory entries.  If it contains a
file with a deeply nested path then extractors will create the parent
directory hierarchy regardless.  diff(1) won't notice any difference.
Directory entries are mainly included to specify the permission bits.

t5000.68 checks for the directory entries in the output given by the
option --verbose of git archive.  t5004.11 checks the number of archive
entries (including directories) using "zipinfo -h".

René
Glen Choo Oct. 27, 2022, 5:29 p.m. UTC | #3
René Scharfe <l.s.r@web.de> writes:

> Am 27.10.22 um 01:34 schrieb Glen Choo:
>> "Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:
>>
>>> index 34549d849f1..f81ef741487 100644
>>> --- a/archive.c
>>> +++ b/archive.c
>>> @@ -213,6 +214,25 @@ static void queue_directory(const struct object_id *oid,
>>>  	oidcpy(&d->oid, oid);
>>>  }
>>>
>>> +static void queue_submodule(
>>> +		struct repository *superproject,
>>> +		const struct object_id *oid,
>>> +		struct strbuf *base, const char *filename,
>>> +		unsigned mode, struct archiver_context *c)
>>> +{
>>> +	struct repository subrepo;
>>> +
>>> +	if (repo_submodule_init(&subrepo, superproject, filename, null_oid()))
>>> +		return;
>>> +
>>> +	if (repo_read_index(&subrepo) < 0)
>>> +		die("index file corrupt");
>>> +
>>> +    queue_directory(oid, base, filename, mode, c);
>>> +
>>> +	repo_clear(&subrepo);
>>> +}
>>> +
>
>> What's much more surprising is that you can delete the entire function
>> body (even queue_directory()!) and the tests still pass! The tests are
>> definitely testing what they say they are (I've also checked the
>> tarballs), so I'm not sure what's going on.
>>
>> I commented out queue_directory() in the S_ISDIR case, and the only test
>> failures I saw were:
>>
>> - t5000.68, which uses a glob in its pathspec. I tried using a glob for
>>   in the archive submodule tests, but I couldn't reproduce the failure.
>> - t5004.11, which is a really big test case that I didn't bother looking
>>   deeply into.
>>
>> So I'm at a loss as to what queue_directory() actually does.
> An archive doesn't strictly need directory entries.  If it contains a
> file with a deeply nested path then extractors will create the parent
> directory hierarchy regardless.  diff(1) won't notice any difference.
> Directory entries are mainly included to specify the permission bits.

Thanks. In that case, we should probably also test the case where there
are empty directories (e.g. when a file is excluded by a pathspec), and
we should also check the permission bits.

>
> t5000.68 checks for the directory entries in the output given by the
> option --verbose of git archive.  t5004.11 checks the number of archive
> entries (including directories) using "zipinfo -h".
>
> René
Glen Choo Oct. 27, 2022, 5:30 p.m. UTC | #4
René Scharfe <l.s.r@web.de> writes:

> Am 27.10.22 um 01:34 schrieb Glen Choo:
>> "Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:
>>
>>> index 34549d849f1..f81ef741487 100644
>>> --- a/archive.c
>>> +++ b/archive.c
>>> @@ -213,6 +214,25 @@ static void queue_directory(const struct object_id *oid,
>>>  	oidcpy(&d->oid, oid);
>>>  }
>>>
>>> +static void queue_submodule(
>>> +		struct repository *superproject,
>>> +		const struct object_id *oid,
>>> +		struct strbuf *base, const char *filename,
>>> +		unsigned mode, struct archiver_context *c)
>>> +{
>>> +	struct repository subrepo;
>>> +
>>> +	if (repo_submodule_init(&subrepo, superproject, filename, null_oid()))
>>> +		return;
>>> +
>>> +	if (repo_read_index(&subrepo) < 0)
>>> +		die("index file corrupt");
>>> +
>>> +    queue_directory(oid, base, filename, mode, c);
>>> +
>>> +	repo_clear(&subrepo);
>>> +}
>>> +
>
>> What's much more surprising is that you can delete the entire function
>> body (even queue_directory()!) and the tests still pass! The tests are
>> definitely testing what they say they are (I've also checked the
>> tarballs), so I'm not sure what's going on.
>>
>> I commented out queue_directory() in the S_ISDIR case, and the only test
>> failures I saw were:
>>
>> - t5000.68, which uses a glob in its pathspec. I tried using a glob for
>>   in the archive submodule tests, but I couldn't reproduce the failure.
>> - t5004.11, which is a really big test case that I didn't bother looking
>>   deeply into.
>>
>> So I'm at a loss as to what queue_directory() actually does.
> An archive doesn't strictly need directory entries.  If it contains a
> file with a deeply nested path then extractors will create the parent
> directory hierarchy regardless.  diff(1) won't notice any difference.
> Directory entries are mainly included to specify the permission bits.

Thanks. In that case, we should probably also test the case where there
are empty directories (e.g. when a file is excluded by a pathspec), and
we should also check the permission bits.

>
> t5000.68 checks for the directory entries in the output given by the
> option --verbose of git archive.  t5004.11 checks the number of archive
> entries (including directories) using "zipinfo -h".
>
> René
Glen Choo Oct. 27, 2022, 5:33 p.m. UTC | #5
René Scharfe <l.s.r@web.de> writes:

> Am 27.10.22 um 01:34 schrieb Glen Choo:
>> "Heather Lapointe via GitGitGadget" <gitgitgadget@gmail.com> writes:
>>
>>> index 34549d849f1..f81ef741487 100644
>>> --- a/archive.c
>>> +++ b/archive.c
>>> @@ -213,6 +214,25 @@ static void queue_directory(const struct object_id *oid,
>>>  	oidcpy(&d->oid, oid);
>>>  }
>>>
>>> +static void queue_submodule(
>>> +		struct repository *superproject,
>>> +		const struct object_id *oid,
>>> +		struct strbuf *base, const char *filename,
>>> +		unsigned mode, struct archiver_context *c)
>>> +{
>>> +	struct repository subrepo;
>>> +
>>> +	if (repo_submodule_init(&subrepo, superproject, filename, null_oid()))
>>> +		return;
>>> +
>>> +	if (repo_read_index(&subrepo) < 0)
>>> +		die("index file corrupt");
>>> +
>>> +    queue_directory(oid, base, filename, mode, c);
>>> +
>>> +	repo_clear(&subrepo);
>>> +}
>>> +
>
>> What's much more surprising is that you can delete the entire function
>> body (even queue_directory()!) and the tests still pass! The tests are
>> definitely testing what they say they are (I've also checked the
>> tarballs), so I'm not sure what's going on.
>>
>> I commented out queue_directory() in the S_ISDIR case, and the only test
>> failures I saw were:
>>
>> - t5000.68, which uses a glob in its pathspec. I tried using a glob for
>>   in the archive submodule tests, but I couldn't reproduce the failure.
>> - t5004.11, which is a really big test case that I didn't bother looking
>>   deeply into.
>>
>> So I'm at a loss as to what queue_directory() actually does.
> An archive doesn't strictly need directory entries.  If it contains a
> file with a deeply nested path then extractors will create the parent
> directory hierarchy regardless.  diff(1) won't notice any difference.
> Directory entries are mainly included to specify the permission bits.

Thanks. In that case, we should probably also test the case where there
are empty directories (e.g. when a file is excluded by a pathspec), and
we should also check the permission bits.

>
> t5000.68 checks for the directory entries in the output given by the
> option --verbose of git archive.  t5004.11 checks the number of archive
> entries (including directories) using "zipinfo -h".
>
> René
diff mbox series

Patch

diff --git a/Documentation/git-archive.txt b/Documentation/git-archive.txt
index 60c040988bb..22f54428b98 100644
--- a/Documentation/git-archive.txt
+++ b/Documentation/git-archive.txt
@@ -10,7 +10,8 @@  SYNOPSIS
 --------
 [verse]
 'git archive' [--format=<fmt>] [--list] [--prefix=<prefix>/] [<extra>]
-	      [-o <file> | --output=<file>] [--worktree-attributes]
+	      [-o <file> | --output=<file>]
+	      [--recurse-submodules] [--worktree-attributes]
 	      [--remote=<repo> [--exec=<git-upload-archive>]] <tree-ish>
 	      [<path>...]
 
@@ -82,6 +83,9 @@  The file mode is limited to a regular file, and the option may be
 subject to platform-dependent command-line limits. For non-trivial
 cases, write an untracked file and use `--add-file` instead.
 
+--recurse-submodules::
+	Recursively include the contents of submodules in the archive.
+
 --worktree-attributes::
 	Look for attributes in .gitattributes files in the working tree
 	as well (see <<ATTRIBUTES>>).
diff --git a/archive.c b/archive.c
index 34549d849f1..f81ef741487 100644
--- a/archive.c
+++ b/archive.c
@@ -10,6 +10,7 @@ 
 #include "unpack-trees.h"
 #include "dir.h"
 #include "quote.h"
+#include "submodule.h"
 
 static char const * const archive_usage[] = {
 	N_("git archive [<options>] <tree-ish> [<path>...]"),
@@ -213,6 +214,25 @@  static void queue_directory(const struct object_id *oid,
 	oidcpy(&d->oid, oid);
 }
 
+static void queue_submodule(
+		struct repository *superproject,
+		const struct object_id *oid,
+		struct strbuf *base, const char *filename,
+		unsigned mode, struct archiver_context *c)
+{
+	struct repository subrepo;
+
+	if (repo_submodule_init(&subrepo, superproject, filename, null_oid()))
+		return;
+
+	if (repo_read_index(&subrepo) < 0)
+		die("index file corrupt");
+
+    queue_directory(oid, base, filename, mode, c);
+
+	repo_clear(&subrepo);
+}
+
 static int write_directory(
 		struct repository *repo,
 		struct archiver_context *c)
@@ -228,9 +248,11 @@  static int write_directory(
 		write_directory(repo, c) ||
 		write_archive_entry(repo, &d->oid, d->path, d->baselen,
 				    d->path + d->baselen, d->mode,
-				    c) != READ_TREE_RECURSIVE;
+				    c);
 	free(d);
-	return ret ? -1 : 0;
+	if (ret == READ_TREE_RECURSIVE)
+		return 0;
+	return ret;
 }
 
 static int queue_or_write_archive_entry(
@@ -263,6 +285,11 @@  static int queue_or_write_archive_entry(
 			return 0;
 		queue_directory(oid, base, filename, mode, c);
 		return READ_TREE_RECURSIVE;
+	} else if (c->args->recurse_submodules && S_ISGITLINK(mode)) {
+		if (is_submodule_active(r, filename)) {
+			queue_submodule(r, oid, base, filename, mode, c);
+			return READ_TREE_RECURSIVE;
+		}
 	}
 
 	if (write_directory(r, c))
@@ -446,6 +473,7 @@  static void parse_pathspec_arg(
 		       PATHSPEC_PREFER_FULL,
 		       "", pathspec);
 	ar_args->pathspec.recursive = 1;
+	ar_args->pathspec.recurse_submodules = ar_args->recurse_submodules;
 	if (pathspec) {
 		while (*pathspec) {
 			if (**pathspec && !path_exists(repo, ar_args, *pathspec))
@@ -609,6 +637,7 @@  static int parse_archive_args(int argc, const char **argv,
 	int verbose = 0;
 	int i;
 	int list = 0;
+	int recurse_submodules = 0;
 	int worktree_attributes = 0;
 	struct option opts[] = {
 		OPT_GROUP(""),
@@ -623,6 +652,8 @@  static int parse_archive_args(int argc, const char **argv,
 		  add_file_cb, (intptr_t)&base },
 		OPT_STRING('o', "output", &output, N_("file"),
 			N_("write the archive to this file")),
+		OPT_BOOL(0, "recurse-submodules", &recurse_submodules,
+			N_("include submodules in archive")),
 		OPT_BOOL(0, "worktree-attributes", &worktree_attributes,
 			N_("read .gitattributes in working directory")),
 		OPT__VERBOSE(&verbose, N_("report archived files on stderr")),
@@ -686,6 +717,7 @@  static int parse_archive_args(int argc, const char **argv,
 	args->verbose = verbose;
 	args->base = base;
 	args->baselen = strlen(base);
+	args->recurse_submodules = recurse_submodules;
 	args->worktree_attributes = worktree_attributes;
 
 	return argc;
diff --git a/archive.h b/archive.h
index 540a3b12130..1b21484dda6 100644
--- a/archive.h
+++ b/archive.h
@@ -18,6 +18,7 @@  struct archiver_args {
 	timestamp_t time;
 	struct pathspec pathspec;
 	unsigned int verbose : 1;
+	unsigned int recurse_submodules : 1;
 	unsigned int worktree_attributes : 1;
 	unsigned int convert : 1;
 	int compression_level;