diff mbox series

[v3,11/12] wt-status: expand added sparse directory entries

Message ID 3b42783d4a86473420480b2789d61d8103e6e7d4.1621017072.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Sparse-index: integrate with status | expand

Commit Message

Derrick Stolee May 14, 2021, 6:31 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

It is difficult, but possible, to get into a state where we intend to
add a directory that is outside of the sparse-checkout definition. Add a
test to t1092-sparse-checkout-compatibility.sh that demonstrates this
using a combination of 'git reset --mixed' and 'git checkout --orphan'.

This test failed before because the output of 'git status
--porcelain=v2' would not match on the lines for folder1/:

* The sparse-checkout repo (with a full index) would output each path
  name that is intended to be added.

* The sparse-index repo would only output that "folder1/" is staged for
  addition.

The status should report the full list of files to be added, and so this
sparse-directory entry should be expanded to a full list when reaching
it inside the wt_status_collect_changes_initial() method. Use
read_tree_at() to assist.

Somehow, this loop over the cache entries was not guarded by
ensure_full_index() as intended.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 t/t1092-sparse-checkout-compatibility.sh | 28 +++++++++++++
 wt-status.c                              | 50 ++++++++++++++++++++++++
 2 files changed, 78 insertions(+)

Comments

Elijah Newren May 18, 2021, 2:27 a.m. UTC | #1
On Fri, May 14, 2021 at 11:31 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> It is difficult, but possible, to get into a state where we intend to
> add a directory that is outside of the sparse-checkout definition. Add a

Then we need to fix that; allowing things to be added outside the
sparse-checkout definition is a bug[1][2].  That's an invariant I
believe we should maintain everywhere; things get really confusing to
users somewhere later down the road if we don't.  Matheus worked to
fix that with 'git add'; if there are other commands that need fixing
too, then we should also fix them.

[1] https://lore.kernel.org/git/CABPp-BFhyFiKSXdLM5q5t=ZKzr6V0pY7dbheierRaOHFbMEdkg@mail.gmail.com/
[2] https://lore.kernel.org/git/CABPp-BF0ZhbSs42R3Bw_r-hbhQ71qtbXSBqXdq0djyaan=8p=A@mail.gmail.com/

> test to t1092-sparse-checkout-compatibility.sh that demonstrates this
> using a combination of 'git reset --mixed' and 'git checkout --orphan'.

I think `git checkout --orphan` should just throw an error if
sparse-checkout is in use.  Allowing adding paths outside the
sparse-checkout set causes too much collateral and deferred confusion
for users.

> This test failed before because the output of 'git status
> --porcelain=v2' would not match on the lines for folder1/:
>
> * The sparse-checkout repo (with a full index) would output each path
>   name that is intended to be added.
>
> * The sparse-index repo would only output that "folder1/" is staged for
>   addition.
>
> The status should report the full list of files to be added, and so this
> sparse-directory entry should be expanded to a full list when reaching
> it inside the wt_status_collect_changes_initial() method. Use
> read_tree_at() to assist.

Having a sparse directory entry whose object_id in the index does not
match HEAD should be an error.  Have a CE_SKIP_WORKTREE non-directory
whose object_id in the index does not match HEAD should also be an
error.  I don't think we should complicate the code to try to handle
violations of those assumptions.  I do think we should add checks to
enforce that constraint (or BUG() if it's violated).

And yeah, that also means 'git sparse-checkout add/set' would need to
error out if paths are requested to be sparsified despite being
different from HEAD.

> Somehow, this loop over the cache entries was not guarded by
> ensure_full_index() as intended.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  t/t1092-sparse-checkout-compatibility.sh | 28 +++++++++++++
>  wt-status.c                              | 50 ++++++++++++++++++++++++
>  2 files changed, 78 insertions(+)
>
> diff --git a/t/t1092-sparse-checkout-compatibility.sh b/t/t1092-sparse-checkout-compatibility.sh
> index 59faf7381093..cd3669d36b53 100755
> --- a/t/t1092-sparse-checkout-compatibility.sh
> +++ b/t/t1092-sparse-checkout-compatibility.sh
> @@ -492,4 +492,32 @@ test_expect_success 'sparse-index is not expanded' '
>         test_region ! index ensure_full_index trace2.txt
>  '
>
> +test_expect_success 'reset mixed and checkout orphan' '
> +       init_repos &&
> +
> +       test_all_match git checkout rename-out-to-in &&
> +       test_all_match git reset --mixed HEAD~1 &&
> +       test_sparse_match test-tool read-cache --table --expand &&
> +       test_all_match git status --porcelain=v2 &&
> +       test_all_match git status --porcelain=v2 &&
> +
> +       # At this point, sparse-checkouts behave differently
> +       # from the full-checkout.
> +       test_sparse_match git checkout --orphan new-branch &&
> +       test_sparse_match test-tool read-cache --table --expand &&
> +       test_sparse_match git status --porcelain=v2 &&
> +       test_sparse_match git status --porcelain=v2
> +'
> +
> +test_expect_success 'add everything with deep new file' '
> +       init_repos &&
> +
> +       run_on_sparse git sparse-checkout set deep/deeper1/deepest &&
> +
> +       run_on_all touch deep/deeper1/x &&
> +       test_all_match git add . &&
> +       test_all_match git status --porcelain=v2 &&
> +       test_all_match git status --porcelain=v2
> +'
> +
>  test_done
> diff --git a/wt-status.c b/wt-status.c
> index 0425169c1895..90db8bd659fa 100644
> --- a/wt-status.c
> +++ b/wt-status.c
> @@ -654,6 +654,34 @@ static void wt_status_collect_changes_index(struct wt_status *s)
>         run_diff_index(&rev, 1);
>  }
>
> +static int add_file_to_list(const struct object_id *oid,
> +                           struct strbuf *base, const char *path,
> +                           unsigned int mode, void *context)
> +{
> +       struct string_list_item *it;
> +       struct wt_status_change_data *d;
> +       struct wt_status *s = context;
> +       char *full_name;
> +
> +       if (S_ISDIR(mode))
> +               return READ_TREE_RECURSIVE;
> +
> +       full_name = xstrfmt("%s%s", base->buf, path);
> +       it = string_list_insert(&s->change, full_name);
> +       d = it->util;
> +       if (!d) {
> +               CALLOC_ARRAY(d, 1);
> +               it->util = d;
> +       }
> +
> +       d->index_status = DIFF_STATUS_ADDED;
> +       /* Leave {mode,oid}_head zero for adds. */
> +       d->mode_index = mode;
> +       oidcpy(&d->oid_index, oid);
> +       s->committable = 1;
> +       return 0;
> +}
> +
>  static void wt_status_collect_changes_initial(struct wt_status *s)
>  {
>         struct index_state *istate = s->repo->index;
> @@ -668,6 +696,28 @@ static void wt_status_collect_changes_initial(struct wt_status *s)
>                         continue;
>                 if (ce_intent_to_add(ce))
>                         continue;
> +               if (S_ISSPARSEDIR(ce->ce_mode)) {
> +                       /*
> +                        * This is a sparse directory entry, so we want to collect all
> +                        * of the added files within the tree. This requires recursively
> +                        * expanding the trees to find the elements that are new in this
> +                        * tree and marking them with DIFF_STATUS_ADDED.
> +                        */
> +                       struct strbuf base = STRBUF_INIT;
> +                       struct pathspec ps;
> +                       struct tree *tree = lookup_tree(istate->repo, &ce->oid);
> +
> +                       memset(&ps, 0, sizeof(ps));
> +                       ps.recursive = 1;
> +                       ps.has_wildcard = 1;
> +                       ps.max_depth = -1;
> +
> +                       strbuf_add(&base, ce->name, ce->ce_namelen);
> +                       read_tree_at(istate->repo, tree, &base, &ps,
> +                                    add_file_to_list, s);
> +                       continue;
> +               }
> +
>                 it = string_list_insert(&s->change, ce->name);
>                 d = it->util;
>                 if (!d) {
> --
> gitgitgadget

This was a really nice catch that you got this particular testcase.
While I disagree with the fix, I do have to say nice work on the catch
and the implementation otherwise.
Derrick Stolee May 18, 2021, 6:26 p.m. UTC | #2
On 5/17/2021 10:27 PM, Elijah Newren wrote:
> On Fri, May 14, 2021 at 11:31 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> It is difficult, but possible, to get into a state where we intend to
>> add a directory that is outside of the sparse-checkout definition. Add a
> 
> Then we need to fix that; allowing things to be added outside the
> sparse-checkout definition is a bug[1][2].  That's an invariant I
> believe we should maintain everywhere; things get really confusing to
> users somewhere later down the road if we don't.  Matheus worked to
> fix that with 'git add'; if there are other commands that need fixing
> too, then we should also fix them.
> 
> [1] https://lore.kernel.org/git/CABPp-BFhyFiKSXdLM5q5t=ZKzr6V0pY7dbheierRaOHFbMEdkg@mail.gmail.com/
> [2] https://lore.kernel.org/git/CABPp-BF0ZhbSs42R3Bw_r-hbhQ71qtbXSBqXdq0djyaan=8p=A@mail.gmail.com/
> 
>> test to t1092-sparse-checkout-compatibility.sh that demonstrates this
>> using a combination of 'git reset --mixed' and 'git checkout --orphan'.
> 
> I think `git checkout --orphan` should just throw an error if
> sparse-checkout is in use.  Allowing adding paths outside the
> sparse-checkout set causes too much collateral and deferred confusion
> for users.

I've been trying to strike an interesting balance of creating
performance improvements without changing behavior, trying to
defer those behavior changes to an isolated instance. I think
that approach is unavoidable with the 'git add' work that I
pulled out of this series and will return to soon.

However, here I think it would be too much to start throwing
an error in this case. I think that change is a bit too much.

The thing I can try to do, instead of the current approach, is
to not allow sparse directory entries to differ between the
index and HEAD. That will satisfy this case, but also a lot of
other painful cases.

I have no idea how to actually accomplish that, but I'll start
digging.

>> This test failed before because the output of 'git status
>> --porcelain=v2' would not match on the lines for folder1/:
>>
>> * The sparse-checkout repo (with a full index) would output each path
>>   name that is intended to be added.
>>
>> * The sparse-index repo would only output that "folder1/" is staged for
>>   addition.
>>
>> The status should report the full list of files to be added, and so this
>> sparse-directory entry should be expanded to a full list when reaching
>> it inside the wt_status_collect_changes_initial() method. Use
>> read_tree_at() to assist.
> 
> Having a sparse directory entry whose object_id in the index does not
> match HEAD should be an error.

I can get behind this understanding.

>  Have a CE_SKIP_WORKTREE non-directory
> whose object_id in the index does not match HEAD should also be an
> error.

I'm less convinced here. At minimum, I'm not willing to stake
a firm claim and change the behavior around this statement in
the current series.

>  I don't think we should complicate the code to try to handle
> violations of those assumptions.  I do think we should add checks to
> enforce that constraint (or BUG() if it's violated).

A BUG() is likely too strict, because existing Git clients can
get users into this state, and then they upgrade and are suddenly
in a BUG() state. We should perhaps do our best effort to avoid
this case and handle it as appropriately as possible.

> And yeah, that also means 'git sparse-checkout add/set' would need to
> error out if paths are requested to be sparsified despite being
> different from HEAD.

This would be a reasonable thing, assuming the established
behavior is changed.

>> Somehow, this loop over the cache entries was not guarded by
>> ensure_full_index() as intended.
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  t/t1092-sparse-checkout-compatibility.sh | 28 +++++++++++++
>>  wt-status.c                              | 50 ++++++++++++++++++++++++
>>  2 files changed, 78 insertions(+)
>>
>> diff --git a/t/t1092-sparse-checkout-compatibility.sh b/t/t1092-sparse-checkout-compatibility.sh
>> index 59faf7381093..cd3669d36b53 100755
>> --- a/t/t1092-sparse-checkout-compatibility.sh
>> +++ b/t/t1092-sparse-checkout-compatibility.sh
>> @@ -492,4 +492,32 @@ test_expect_success 'sparse-index is not expanded' '
>>         test_region ! index ensure_full_index trace2.txt
>>  '
>>
>> +test_expect_success 'reset mixed and checkout orphan' '
>> +       init_repos &&
>> +
>> +       test_all_match git checkout rename-out-to-in &&
>> +       test_all_match git reset --mixed HEAD~1 &&
>> +       test_sparse_match test-tool read-cache --table --expand &&
>> +       test_all_match git status --porcelain=v2 &&
>> +       test_all_match git status --porcelain=v2 &&
>> +
>> +       # At this point, sparse-checkouts behave differently
>> +       # from the full-checkout.
>> +       test_sparse_match git checkout --orphan new-branch &&
>> +       test_sparse_match test-tool read-cache --table --expand &&
>> +       test_sparse_match git status --porcelain=v2 &&
>> +       test_sparse_match git status --porcelain=v2
>> +'
>> +
>> +test_expect_success 'add everything with deep new file' '
>> +       init_repos &&
>> +
>> +       run_on_sparse git sparse-checkout set deep/deeper1/deepest &&
>> +
>> +       run_on_all touch deep/deeper1/x &&
>> +       test_all_match git add . &&
>> +       test_all_match git status --porcelain=v2 &&
>> +       test_all_match git status --porcelain=v2
>> +'>
> This was a really nice catch that you got this particular testcase.
> While I disagree with the fix, I do have to say nice work on the catch
> and the implementation otherwise.

This test exists almost verbatim in the Scalar and VFS For Git
functional tests. I have no idea what context caused it to be
necessary.

I can understand your aversion to the solution I presented here.
Preventing sparse directory entries that differ from the tree at
HEAD for that path should be more robust to future integrations.

Thanks,
-Stolee
Derrick Stolee May 18, 2021, 7:04 p.m. UTC | #3
On 5/18/2021 2:26 PM, Derrick Stolee wrote:
> On 5/17/2021 10:27 PM, Elijah Newren wrote:
>> On Fri, May 14, 2021 at 11:31 AM Derrick Stolee via GitGitGadget
>> <gitgitgadget@gmail.com> wrote:
>>>
>>> From: Derrick Stolee <dstolee@microsoft.com>
>>>
>>> It is difficult, but possible, to get into a state where we intend to
>>> add a directory that is outside of the sparse-checkout definition. Add a
>>
>> Then we need to fix that; allowing things to be added outside the
>> sparse-checkout definition is a bug[1][2].  That's an invariant I
>> believe we should maintain everywhere; things get really confusing to
>> users somewhere later down the road if we don't.  Matheus worked to
>> fix that with 'git add'; if there are other commands that need fixing
>> too, then we should also fix them.
>>
>> [1] https://lore.kernel.org/git/CABPp-BFhyFiKSXdLM5q5t=ZKzr6V0pY7dbheierRaOHFbMEdkg@mail.gmail.com/
>> [2] https://lore.kernel.org/git/CABPp-BF0ZhbSs42R3Bw_r-hbhQ71qtbXSBqXdq0djyaan=8p=A@mail.gmail.com/
>>
>>> test to t1092-sparse-checkout-compatibility.sh that demonstrates this
>>> using a combination of 'git reset --mixed' and 'git checkout --orphan'.
>>
>> I think `git checkout --orphan` should just throw an error if
>> sparse-checkout is in use.  Allowing adding paths outside the
>> sparse-checkout set causes too much collateral and deferred confusion
>> for users.
> 
> I've been trying to strike an interesting balance of creating
> performance improvements without changing behavior, trying to
> defer those behavior changes to an isolated instance. I think
> that approach is unavoidable with the 'git add' work that I
> pulled out of this series and will return to soon.
> 
> However, here I think it would be too much to start throwing
> an error in this case. I think that change is a bit too much.
> 
> The thing I can try to do, instead of the current approach, is
> to not allow sparse directory entries to differ between the
> index and HEAD. That will satisfy this case, but also a lot of
> other painful cases.
> 
> I have no idea how to actually accomplish that, but I'll start
> digging.

It didn't take much digging to discover that this is likely
impossible, or rather it would be a drastic change to make this
happen.

The immediate issue is trying to prevent sparse directory entries
from existing when the contained paths don't match what exists at
HEAD. However, in the 'git checkout --orphan' case, we are using
a full index for the unpack_trees() that updates the in-memory
index according to the paths at HEAD, then updates HEAD to point
to a non-existing ref. The sparse directories are only created as
part of convert_to_sparse() within do_write_index(). At that
point, there is no HEAD provided. Trying to load it from scratch
violates the fact that HEAD is being staged to change _after_ the
index updates in a command like 'git checkout'.

So, the drastic change to make this work would be to update the
index API to require a root tree to be provided whenever writing
the index. However, that doesn't make sense, either! What do we
do when in a conflicted state?

What if a user modifies HEAD manually to point to a new ref?

Such a change would couple the index to the concept of HEAD in
an unproductive way, I think. The index data structure exists
as a separate entity that is frequently _compared_ to HEAD, and
the solution presented in this patch presents a way to keep the
comparison of a sparse-index and HEAD to be the same as if we
had a full index.

So, after looking into it, I'm back in favor of this change and
forever allowing sparse cache entries to differ from HEAD,
because there is no way to avoid it.

Thanks,
-Stolee
Elijah Newren May 19, 2021, 8:38 a.m. UTC | #4
On Tue, May 18, 2021 at 12:05 PM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 5/18/2021 2:26 PM, Derrick Stolee wrote:
> > On 5/17/2021 10:27 PM, Elijah Newren wrote:
> >> On Fri, May 14, 2021 at 11:31 AM Derrick Stolee via GitGitGadget
> >> <gitgitgadget@gmail.com> wrote:
> >>>
> >>> From: Derrick Stolee <dstolee@microsoft.com>
> >>>
> >>> It is difficult, but possible, to get into a state where we intend to
> >>> add a directory that is outside of the sparse-checkout definition. Add a
> >>
> >> Then we need to fix that; allowing things to be added outside the
> >> sparse-checkout definition is a bug[1][2].  That's an invariant I
> >> believe we should maintain everywhere; things get really confusing to
> >> users somewhere later down the road if we don't.  Matheus worked to
> >> fix that with 'git add'; if there are other commands that need fixing
> >> too, then we should also fix them.
> >>
> >> [1] https://lore.kernel.org/git/CABPp-BFhyFiKSXdLM5q5t=ZKzr6V0pY7dbheierRaOHFbMEdkg@mail.gmail.com/
> >> [2] https://lore.kernel.org/git/CABPp-BF0ZhbSs42R3Bw_r-hbhQ71qtbXSBqXdq0djyaan=8p=A@mail.gmail.com/
> >>
> >>> test to t1092-sparse-checkout-compatibility.sh that demonstrates this
> >>> using a combination of 'git reset --mixed' and 'git checkout --orphan'.
> >>
> >> I think `git checkout --orphan` should just throw an error if
> >> sparse-checkout is in use.  Allowing adding paths outside the
> >> sparse-checkout set causes too much collateral and deferred confusion
> >> for users.
> >
> > I've been trying to strike an interesting balance of creating
> > performance improvements without changing behavior, trying to
> > defer those behavior changes to an isolated instance. I think
> > that approach is unavoidable with the 'git add' work that I
> > pulled out of this series and will return to soon.
> >
> > However, here I think it would be too much to start throwing
> > an error in this case. I think that change is a bit too much.
> >
> > The thing I can try to do, instead of the current approach, is
> > to not allow sparse directory entries to differ between the
> > index and HEAD. That will satisfy this case, but also a lot of
> > other painful cases.
> >
> > I have no idea how to actually accomplish that, but I'll start
> > digging.
>
> It didn't take much digging to discover that this is likely
> impossible, or rather it would be a drastic change to make this
> happen.
>
> The immediate issue is trying to prevent sparse directory entries
> from existing when the contained paths don't match what exists at
> HEAD. However, in the 'git checkout --orphan' case, we are using
> a full index for the unpack_trees() that updates the in-memory
> index according to the paths at HEAD, then updates HEAD to point
> to a non-existing ref. The sparse directories are only created as
> part of convert_to_sparse() within do_write_index(). At that
> point, there is no HEAD provided. Trying to load it from scratch
> violates the fact that HEAD is being staged to change _after_ the
> index updates in a command like 'git checkout'.
>
> So, the drastic change to make this work would be to update the
> index API to require a root tree to be provided whenever writing
> the index. However, that doesn't make sense, either! What do we
> do when in a conflicted state?
>
> What if a user modifies HEAD manually to point to a new ref?
>
> Such a change would couple the index to the concept of HEAD in
> an unproductive way, I think. The index data structure exists
> as a separate entity that is frequently _compared_ to HEAD, and
> the solution presented in this patch presents a way to keep the
> comparison of a sparse-index and HEAD to be the same as if we
> had a full index.
>
> So, after looking into it, I'm back in favor of this change and
> forever allowing sparse cache entries to differ from HEAD,
> because there is no way to avoid it.

Doh, thanks for digging in and entertaining the idea.  I'm worried
we'll get lots of confused users over the years from not being able to
do this, but you do make some good points.

I still think `git checkout --orphan` should be an error when in a
sparse checkout -- the point of a sparse checkout is that you only
care about a subset of files, whereas checkout --orphan fundamentally
says you are throwing away history but care about each and every file
since you are staging "changes" from all of them to include in some
new commit soon.  They just seem in strong opposition to me, and it
seems likely to result in surprises for some of the users when despite
the --orphan request and them fixing up the working directory how they
like, they get some new commit that contains files that aren't in
their working tree.  (In contrast, `git switch --orphan` would
probably be fine in a sparse checkout, precisely because it really
does empty everything).  However, I do agree with you that such a
change belongs in a separate series.  So, yes, your patch is good, and
I'll raise the behavioral change later.

(Sorry for being slow to respond and still not getting to all your
good reviews of my series; I'm a bit limited in my time for git right
now...)
diff mbox series

Patch

diff --git a/t/t1092-sparse-checkout-compatibility.sh b/t/t1092-sparse-checkout-compatibility.sh
index 59faf7381093..cd3669d36b53 100755
--- a/t/t1092-sparse-checkout-compatibility.sh
+++ b/t/t1092-sparse-checkout-compatibility.sh
@@ -492,4 +492,32 @@  test_expect_success 'sparse-index is not expanded' '
 	test_region ! index ensure_full_index trace2.txt
 '
 
+test_expect_success 'reset mixed and checkout orphan' '
+	init_repos &&
+
+	test_all_match git checkout rename-out-to-in &&
+	test_all_match git reset --mixed HEAD~1 &&
+	test_sparse_match test-tool read-cache --table --expand &&
+	test_all_match git status --porcelain=v2 &&
+	test_all_match git status --porcelain=v2 &&
+
+	# At this point, sparse-checkouts behave differently
+	# from the full-checkout.
+	test_sparse_match git checkout --orphan new-branch &&
+	test_sparse_match test-tool read-cache --table --expand &&
+	test_sparse_match git status --porcelain=v2 &&
+	test_sparse_match git status --porcelain=v2
+'
+
+test_expect_success 'add everything with deep new file' '
+	init_repos &&
+
+	run_on_sparse git sparse-checkout set deep/deeper1/deepest &&
+
+	run_on_all touch deep/deeper1/x &&
+	test_all_match git add . &&
+	test_all_match git status --porcelain=v2 &&
+	test_all_match git status --porcelain=v2
+'
+
 test_done
diff --git a/wt-status.c b/wt-status.c
index 0425169c1895..90db8bd659fa 100644
--- a/wt-status.c
+++ b/wt-status.c
@@ -654,6 +654,34 @@  static void wt_status_collect_changes_index(struct wt_status *s)
 	run_diff_index(&rev, 1);
 }
 
+static int add_file_to_list(const struct object_id *oid,
+			    struct strbuf *base, const char *path,
+			    unsigned int mode, void *context)
+{
+	struct string_list_item *it;
+	struct wt_status_change_data *d;
+	struct wt_status *s = context;
+	char *full_name;
+
+	if (S_ISDIR(mode))
+		return READ_TREE_RECURSIVE;
+
+	full_name = xstrfmt("%s%s", base->buf, path);
+	it = string_list_insert(&s->change, full_name);
+	d = it->util;
+	if (!d) {
+		CALLOC_ARRAY(d, 1);
+		it->util = d;
+	}
+
+	d->index_status = DIFF_STATUS_ADDED;
+	/* Leave {mode,oid}_head zero for adds. */
+	d->mode_index = mode;
+	oidcpy(&d->oid_index, oid);
+	s->committable = 1;
+	return 0;
+}
+
 static void wt_status_collect_changes_initial(struct wt_status *s)
 {
 	struct index_state *istate = s->repo->index;
@@ -668,6 +696,28 @@  static void wt_status_collect_changes_initial(struct wt_status *s)
 			continue;
 		if (ce_intent_to_add(ce))
 			continue;
+		if (S_ISSPARSEDIR(ce->ce_mode)) {
+			/*
+			 * This is a sparse directory entry, so we want to collect all
+			 * of the added files within the tree. This requires recursively
+			 * expanding the trees to find the elements that are new in this
+			 * tree and marking them with DIFF_STATUS_ADDED.
+			 */
+			struct strbuf base = STRBUF_INIT;
+			struct pathspec ps;
+			struct tree *tree = lookup_tree(istate->repo, &ce->oid);
+
+			memset(&ps, 0, sizeof(ps));
+			ps.recursive = 1;
+			ps.has_wildcard = 1;
+			ps.max_depth = -1;
+
+			strbuf_add(&base, ce->name, ce->ce_namelen);
+			read_tree_at(istate->repo, tree, &base, &ps,
+				     add_file_to_list, s);
+			continue;
+		}
+
 		it = string_list_insert(&s->change, ce->name);
 		d = it->util;
 		if (!d) {