diff mbox series

[3/9] clone: add --sparse mode

Message ID 4ccd36b3966b62934cfbb0ec65e2851b50dfa1e3.1566313865.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series New sparse-checkout builtin and "cone" mode | expand

Commit Message

Derrick Stolee via GitGitGadget Aug. 20, 2019, 3:11 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

When someone wants to clone a large repository, but plans to work
using a sparse-checkout file, they either need to do a full
checkout first and then reduce the patterns they included, or
clone with --no-checkout, set up their patterns, and then run
a checkout manually. This requires knowing a lot about the repo
shape and how sparse-checkout works.

Add a new '--sparse' option to 'git clone' that initializes the
sparse-checkout file to include the following patterns:

	/*
	!/*/*

These patterns include every file in the root directory, but
no directories. This allows a repo to include files like a
README or a bootstrapping script to grow enlistments from that
point.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 Documentation/git-clone.txt        |  8 +++++++-
 builtin/clone.c                    | 27 +++++++++++++++++++++++++++
 t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
 3 files changed, 47 insertions(+), 1 deletion(-)

Comments

Elijah Newren Aug. 23, 2019, 11:17 p.m. UTC | #1
On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> When someone wants to clone a large repository, but plans to work
> using a sparse-checkout file, they either need to do a full
> checkout first and then reduce the patterns they included, or
> clone with --no-checkout, set up their patterns, and then run
> a checkout manually. This requires knowing a lot about the repo
> shape and how sparse-checkout works.
>
> Add a new '--sparse' option to 'git clone' that initializes the
> sparse-checkout file to include the following patterns:
>
>         /*
>         !/*/*
>
> These patterns include every file in the root directory, but
> no directories. This allows a repo to include files like a
> README or a bootstrapping script to grow enlistments from that
> point.

Nice.

>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  Documentation/git-clone.txt        |  8 +++++++-
>  builtin/clone.c                    | 27 +++++++++++++++++++++++++++
>  t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
>  3 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
> index 34011c2940..0fe91d2f04 100644
> --- a/Documentation/git-clone.txt
> +++ b/Documentation/git-clone.txt
> @@ -15,7 +15,7 @@ SYNOPSIS
>           [--dissociate] [--separate-git-dir <git dir>]
>           [--depth <depth>] [--[no-]single-branch] [--no-tags]
>           [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
> -         [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
> +         [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
>           [<directory>]
>
>  DESCRIPTION
> @@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
>         used, neither remote-tracking branches nor the related
>         configuration variables are created.
>
> +--sparse::
> +       Initialize the sparse-checkout file so the working
> +       directory starts with only the files in the root
> +       of the repository. The sparse-checkout file can be
> +       modified to grow the working directory as needed.
> +
>  --mirror::
>         Set up a mirror of the source repository.  This implies `--bare`.
>         Compared to `--bare`, `--mirror` not only maps local branches of the
> diff --git a/builtin/clone.c b/builtin/clone.c
> index f665b28ccc..d6d49a73ff 100644
> --- a/builtin/clone.c
> +++ b/builtin/clone.c
> @@ -60,6 +60,7 @@ static const char *real_git_dir;
>  static char *option_upload_pack = "git-upload-pack";
>  static int option_verbosity;
>  static int option_progress = -1;
> +static int option_sparse_checkout;
>  static enum transport_family family;
>  static struct string_list option_config = STRING_LIST_INIT_NODUP;
>  static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
> @@ -147,6 +148,8 @@ static struct option builtin_clone_options[] = {
>         OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
>         OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
>                     N_("any cloned submodules will use their remote-tracking branch")),
> +       OPT_BOOL(0, "sparse", &option_sparse_checkout,
> +                   N_("initialize sparse-checkout file to include only files at root")),
>         OPT_END()
>  };
>
> @@ -734,6 +737,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
>         }
>  }
>
> +static int git_sparse_checkout_init(const char *repo)
> +{
> +       struct argv_array argv = ARGV_ARRAY_INIT;
> +       int result = 0;
> +       argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
> +
> +       /*
> +        * We must apply the setting in the current process
> +        * for the later checkout to use the sparse-checkout file.
> +        */
> +       core_apply_sparse_checkout = 1;
> +
> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
> +               error(_("failed to initialize sparse-checkout"));
> +               result = 1;
> +       }

Sigh...so much forking of additional processes.  I'd really rather
that we were reducing how much of this we are doing in the codebase
instead of adding more.  Every fork makes following stuff in a
debugger harder.

> +
> +       argv_array_clear(&argv);
> +       return result;
> +}
> +
>  static int checkout(int submodule_progress)
>  {
>         struct object_id oid;
> @@ -1107,6 +1131,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
>         if (option_required_reference.nr || option_optional_reference.nr)
>                 setup_reference();
>
> +       if (option_sparse_checkout && git_sparse_checkout_init(repo))
> +               return 1;
> +
>         remote = remote_get(option_origin);
>
>         strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
> index 35ab84aabd..b7d5f15830 100755
> --- a/t/t1091-sparse-checkout-builtin.sh
> +++ b/t/t1091-sparse-checkout-builtin.sh
> @@ -87,4 +87,17 @@ test_expect_success 'init with existing sparse-checkout' '
>         test_cmp expect dir
>  '
>
> +test_expect_success 'clone --sparse' '
> +       git clone --sparse repo clone &&
> +       git -C clone sparse-checkout list >actual &&
> +       cat >expect <<-EOF &&
> +               /*
> +               !/*/*
> +       EOF
> +       test_cmp expect actual &&
> +       ls clone >dir &&
> +       echo a >expect &&
> +       test_cmp expect dir

Checking that a toplevel entry is present, but not checking that an
entry from a subdir is missing as expected?

Derrick Stolee Sept. 18, 2019, 1:51 p.m. UTC | #2
On 8/23/2019 7:17 PM, Elijah Newren wrote:
> On Tue, Aug 20, 2019 at 8:12 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> When someone wants to clone a large repository, but plans to work
>> using a sparse-checkout file, they either need to do a full
>> checkout first and then reduce the patterns they included, or
>> clone with --no-checkout, set up their patterns, and then run
>> a checkout manually. This requires knowing a lot about the repo
>> shape and how sparse-checkout works.
>>
>> Add a new '--sparse' option to 'git clone' that initializes the
>> sparse-checkout file to include the following patterns:
>>
>>         /*
>>         !/*/*
>>
>> These patterns include every file in the root directory, but
>> no directories. This allows a repo to include files like a
>> README or a bootstrapping script to grow enlistments from that
>> point.
> 
> Nice.
> 
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  Documentation/git-clone.txt        |  8 +++++++-
>>  builtin/clone.c                    | 27 +++++++++++++++++++++++++++
>>  t/t1091-sparse-checkout-builtin.sh | 13 +++++++++++++
>>  3 files changed, 47 insertions(+), 1 deletion(-)
>>
>> diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
>> index 34011c2940..0fe91d2f04 100644
>> --- a/Documentation/git-clone.txt
>> +++ b/Documentation/git-clone.txt
>> @@ -15,7 +15,7 @@ SYNOPSIS
>>           [--dissociate] [--separate-git-dir <git dir>]
>>           [--depth <depth>] [--[no-]single-branch] [--no-tags]
>>           [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
>> -         [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
>> +         [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
>>           [<directory>]
>>
>>  DESCRIPTION
>> @@ -156,6 +156,12 @@ objects from the source repository into a pack in the cloned repository.
>>         used, neither remote-tracking branches nor the related
>>         configuration variables are created.
>>
>> +--sparse::
>> +       Initialize the sparse-checkout file so the working
>> +       directory starts with only the files in the root
>> +       of the repository. The sparse-checkout file can be
>> +       modified to grow the working directory as needed.
>> +
>>  --mirror::
>>         Set up a mirror of the source repository.  This implies `--bare`.
>>         Compared to `--bare`, `--mirror` not only maps local branches of the
>> diff --git a/builtin/clone.c b/builtin/clone.c
>> index f665b28ccc..d6d49a73ff 100644
>> --- a/builtin/clone.c
>> +++ b/builtin/clone.c
>> @@ -60,6 +60,7 @@ static const char *real_git_dir;
>>  static char *option_upload_pack = "git-upload-pack";
>>  static int option_verbosity;
>>  static int option_progress = -1;
>> +static int option_sparse_checkout;
>>  static enum transport_family family;
>>  static struct string_list option_config = STRING_LIST_INIT_NODUP;
>>  static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
>> @@ -147,6 +148,8 @@ static struct option builtin_clone_options[] = {
>>         OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
>>         OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
>>                     N_("any cloned submodules will use their remote-tracking branch")),
>> +       OPT_BOOL(0, "sparse", &option_sparse_checkout,
>> +                   N_("initialize sparse-checkout file to include only files at root")),
>>         OPT_END()
>>  };
>>
>> @@ -734,6 +737,27 @@ static void update_head(const struct ref *our, const struct ref *remote,
>>         }
>>  }
>>
>> +static int git_sparse_checkout_init(const char *repo)
>> +{
>> +       struct argv_array argv = ARGV_ARRAY_INIT;
>> +       int result = 0;
>> +       argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
>> +
>> +       /*
>> +        * We must apply the setting in the current process
>> +        * for the later checkout to use the sparse-checkout file.
>> +        */
>> +       core_apply_sparse_checkout = 1;
>> +
>> +       if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
>> +               error(_("failed to initialize sparse-checkout"));
>> +               result = 1;
>> +       }
> 
> Sigh...so much forking of additional processes.  I'd really rather
> that we were reducing how much of this we are doing in the codebase
> instead of adding more.  Every fork makes following stuff in a
> debugger harder.

At the moment, this is the simplest way to do this interaction. The
init subcommand is doing multiple things, and we can consider moving
this to be a library method instead of builtin-specific code later.

This is not a huge performance hit, as "clone" is called only once
per repo.

>> +
>> +       argv_array_clear(&argv);
>> +       return result;
>> +}
>> +
>>  static int checkout(int submodule_progress)
>>  {
>>         struct object_id oid;
>> @@ -1107,6 +1131,9 @@ int cmd_clone(int argc, const char **argv, const char *prefix)
>>         if (option_required_reference.nr || option_optional_reference.nr)
>>                 setup_reference();
>>
>> +       if (option_sparse_checkout && git_sparse_checkout_init(repo))
>> +               return 1;
>> +
>>         remote = remote_get(option_origin);
>>
>>         strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
>> diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
>> index 35ab84aabd..b7d5f15830 100755
>> --- a/t/t1091-sparse-checkout-builtin.sh
>> +++ b/t/t1091-sparse-checkout-builtin.sh
>> @@ -87,4 +87,17 @@ test_expect_success 'init with existing sparse-checkout' '
>>         test_cmp expect dir
>>  '
>>
>> +test_expect_success 'clone --sparse' '
>> +       git clone --sparse repo clone &&
>> +       git -C clone sparse-checkout list >actual &&
>> +       cat >expect <<-EOF &&
>> +               /*
>> +               !/*/*
>> +       EOF
>> +       test_cmp expect actual &&
>> +       ls clone >dir &&
>> +       echo a >expect &&
>> +       test_cmp expect dir
> 
> Checking that a toplevel entry is present, but not checking that an
> entry from a subdir is missing as expected?

This test is checking that the file "a" is the _only_ entry in the root
of the repo. The directories "folder1" and "folder2" are not present, since
we are comparing the ls output to "expect".

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/Documentation/git-clone.txt b/Documentation/git-clone.txt
index 34011c2940..0fe91d2f04 100644
--- a/Documentation/git-clone.txt
+++ b/Documentation/git-clone.txt
@@ -15,7 +15,7 @@  SYNOPSIS
 	  [--dissociate] [--separate-git-dir <git dir>]
 	  [--depth <depth>] [--[no-]single-branch] [--no-tags]
 	  [--recurse-submodules[=<pathspec>]] [--[no-]shallow-submodules]
-	  [--[no-]remote-submodules] [--jobs <n>] [--] <repository>
+	  [--[no-]remote-submodules] [--jobs <n>] [--sparse] [--] <repository>
 	  [<directory>]
 
 DESCRIPTION
@@ -156,6 +156,12 @@  objects from the source repository into a pack in the cloned repository.
 	used, neither remote-tracking branches nor the related
 	configuration variables are created.
 
+--sparse::
+	Initialize the sparse-checkout file so the working
+	directory starts with only the files in the root
+	of the repository. The sparse-checkout file can be
+	modified to grow the working directory as needed.
+
 --mirror::
 	Set up a mirror of the source repository.  This implies `--bare`.
 	Compared to `--bare`, `--mirror` not only maps local branches of the
diff --git a/builtin/clone.c b/builtin/clone.c
index f665b28ccc..d6d49a73ff 100644
--- a/builtin/clone.c
+++ b/builtin/clone.c
@@ -60,6 +60,7 @@  static const char *real_git_dir;
 static char *option_upload_pack = "git-upload-pack";
 static int option_verbosity;
 static int option_progress = -1;
+static int option_sparse_checkout;
 static enum transport_family family;
 static struct string_list option_config = STRING_LIST_INIT_NODUP;
 static struct string_list option_required_reference = STRING_LIST_INIT_NODUP;
@@ -147,6 +148,8 @@  static struct option builtin_clone_options[] = {
 	OPT_PARSE_LIST_OBJECTS_FILTER(&filter_options),
 	OPT_BOOL(0, "remote-submodules", &option_remote_submodules,
 		    N_("any cloned submodules will use their remote-tracking branch")),
+	OPT_BOOL(0, "sparse", &option_sparse_checkout,
+		    N_("initialize sparse-checkout file to include only files at root")),
 	OPT_END()
 };
 
@@ -734,6 +737,27 @@  static void update_head(const struct ref *our, const struct ref *remote,
 	}
 }
 
+static int git_sparse_checkout_init(const char *repo)
+{
+	struct argv_array argv = ARGV_ARRAY_INIT;
+	int result = 0;
+	argv_array_pushl(&argv, "-C", repo, "sparse-checkout", "init", NULL);
+
+	/*
+	 * We must apply the setting in the current process
+	 * for the later checkout to use the sparse-checkout file.
+	 */
+	core_apply_sparse_checkout = 1;
+
+	if (run_command_v_opt(argv.argv, RUN_GIT_CMD)) {
+		error(_("failed to initialize sparse-checkout"));
+		result = 1;
+	}
+
+	argv_array_clear(&argv);
+	return result;
+}
+
 static int checkout(int submodule_progress)
 {
 	struct object_id oid;
@@ -1107,6 +1131,9 @@  int cmd_clone(int argc, const char **argv, const char *prefix)
 	if (option_required_reference.nr || option_optional_reference.nr)
 		setup_reference();
 
+	if (option_sparse_checkout && git_sparse_checkout_init(repo))
+		return 1;
+
 	remote = remote_get(option_origin);
 
 	strbuf_addf(&default_refspec, "+%s*:%s*", src_ref_prefix,
diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh
index 35ab84aabd..b7d5f15830 100755
--- a/t/t1091-sparse-checkout-builtin.sh
+++ b/t/t1091-sparse-checkout-builtin.sh
@@ -87,4 +87,17 @@  test_expect_success 'init with existing sparse-checkout' '
 	test_cmp expect dir
 '
 
+test_expect_success 'clone --sparse' '
+	git clone --sparse repo clone &&
+	git -C clone sparse-checkout list >actual &&
+	cat >expect <<-EOF &&
+		/*
+		!/*/*
+	EOF
+	test_cmp expect actual &&
+	ls clone >dir &&
+	echo a >expect &&
+	test_cmp expect dir
+'
+
 test_done
\ No newline at end of file