diff mbox series

[02/10] unpack-trees: make sparse aware

Message ID 0a3892d2ec9e4acd4cba1c1d0390acc60dc6e50f.1618322497.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Sparse-index: integrate with status and add | expand

Commit Message

Derrick Stolee April 13, 2021, 2:01 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

As a first step to integrate 'git status' and 'git add' with the sparse
index, we must start integrating unpack_trees() with sparse directory
entries. These changes are currently impossible to trigger because
unpack_trees() calls ensure_full_index() if command_requires_full_index
is true. This is the case for all commands at the moment. As we expand
more commands to be sparse-aware, we might find that more changes are
required to unpack_trees(). The current changes will suffice for
'status' and 'add'.

unpack_trees() calls the traverse_trees() API using unpack_callback()
to decide if we should recurse into a subtree. We must add new abilities
to skip a subtree if it corresponds to a sparse directory entry.

It is important to be careful about the trailing directory separator
that exists in the sparse directory entries but not in the subtree
paths.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.h           |  2 +-
 preload-index.c |  2 ++
 read-cache.c    |  3 +++
 unpack-trees.c  | 24 ++++++++++++++++++++++--
 4 files changed, 28 insertions(+), 3 deletions(-)

Comments

Elijah Newren April 20, 2021, 11 p.m. UTC | #1
On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> As a first step to integrate 'git status' and 'git add' with the sparse
> index, we must start integrating unpack_trees() with sparse directory
> entries. These changes are currently impossible to trigger because
> unpack_trees() calls ensure_full_index() if command_requires_full_index
> is true. This is the case for all commands at the moment. As we expand
> more commands to be sparse-aware, we might find that more changes are
> required to unpack_trees(). The current changes will suffice for
> 'status' and 'add'.
>
> unpack_trees() calls the traverse_trees() API using unpack_callback()
> to decide if we should recurse into a subtree. We must add new abilities
> to skip a subtree if it corresponds to a sparse directory entry.
>
> It is important to be careful about the trailing directory separator
> that exists in the sparse directory entries but not in the subtree
> paths.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  dir.h           |  2 +-
>  preload-index.c |  2 ++
>  read-cache.c    |  3 +++
>  unpack-trees.c  | 24 ++++++++++++++++++++++--
>  4 files changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/dir.h b/dir.h
> index 51cb0e217247..9d6666f520f3 100644
> --- a/dir.h
> +++ b/dir.h
> @@ -503,7 +503,7 @@ static inline int ce_path_match(struct index_state *istate,
>                                 char *seen)
>  {
>         return match_pathspec(istate, pathspec, ce->name, ce_namelen(ce), 0, seen,
> -                             S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
> +                             S_ISSPARSEDIR(ce->ce_mode) || S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));

I'm confused why this change would be needed, or why it'd semantically
be meaningful here either.  Doesn't S_ISSPARSEDIR() being true imply
S_ISDIR() is true (and perhaps even vice versa?).

By chance, was this a leftover from your early RFC changes from a few
series ago when you had an entirely different mode for sparse
directory entries?

>  }
>
>  static inline int dir_path_match(struct index_state *istate,
> diff --git a/preload-index.c b/preload-index.c
> index e5529a586366..35e67057ca9b 100644
> --- a/preload-index.c
> +++ b/preload-index.c
> @@ -55,6 +55,8 @@ static void *preload_thread(void *_data)
>                         continue;
>                 if (S_ISGITLINK(ce->ce_mode))
>                         continue;
> +               if (S_ISSPARSEDIR(ce->ce_mode))
> +                       continue;
>                 if (ce_uptodate(ce))
>                         continue;
>                 if (ce_skip_worktree(ce))

Don't we have S_ISSPARSEDIR(ce->ce_mode) implies ce_skip_worktree(ce)?
 Is this a duplicate check?  If so, is it still desirable for
future-proofing or code clarity, or is it strictly redundant?

> diff --git a/read-cache.c b/read-cache.c
> index 29ffa9ac5db9..6308234b4838 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
>                         continue;
>
> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
> +                       continue;
> +

I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
!ignore_skip_worktree and why it'd be desirable to refresh
skip-worktree entries.  However, this is tangential to your patch and
has apparently been around since 2009 (in particular, from 56cac48c35
("ie_match_stat(): do not ignore skip-worktree bit with
CE_MATCH_IGNORE_VALID", 2009-12-14)).

>                 if (pathspec && !ce_path_match(istate, ce, pathspec, seen))
>                         filtered = 1;
>
> diff --git a/unpack-trees.c b/unpack-trees.c
> index dddf106d5bd4..9a62e823928a 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -586,6 +586,13 @@ static void mark_ce_used(struct cache_entry *ce, struct unpack_trees_options *o)
>  {
>         ce->ce_flags |= CE_UNPACKED;
>
> +       /*
> +        * If this is a sparse directory, don't advance cache_bottom.
> +        * That will be advanced later using the cache-tree data.
> +        */
> +       if (S_ISSPARSEDIR(ce->ce_mode))
> +               return;
> +

I don't understand cache_bottom stuff; we might want to get Junio to
look over it.  Or maybe I just need to dig a bit further and attempt
to understand it.

>         if (o->cache_bottom < o->src_index->cache_nr &&
>             o->src_index->cache[o->cache_bottom] == ce) {
>                 int bottom = o->cache_bottom;
> @@ -984,6 +991,9 @@ static int do_compare_entry(const struct cache_entry *ce,
>         ce_len -= pathlen;
>         ce_name = ce->name + pathlen;
>
> +       /* remove directory separator if a sparse directory entry */
> +       if (S_ISSPARSEDIR(ce->ce_mode))
> +               ce_len--;
>         return df_name_compare(ce_name, ce_len, S_IFREG, name, namelen, mode);

Shouldn't we be passing ce->ce_mode instead of S_IFREG here as well?

Note the following sort order:
   foo
   foo.txt
   foo/
   foo/bar

You've trimmed off the '/', so 'foo/' would be ordered where 'foo' is,
but df_name_compare() exists to make "foo" sort exactly where "foo/"
would when "foo" is a directory.  Will your df_name_compare() call
here result in foo.txt being placed after all the "foo/<subpath>"
entries in the index and perhaps cause other problems down the line?
(Are there issues, e.g. with cache-trees getting wrong ordering from
this, or even writing out indexes or tree objects with the wrong
ordering?  I've written out trees to disk with wrong ordering before
and git usually survives but gets really confused with diffs.)

Since at least one caller of compare_entry() takes the return result
and does a "if (cmp < 0)", this order is going to matter in some
cases.  Perhaps we need some testcases where there is a sparse
directory entry named "foo/" and a file recorded in some relevant tree
with the name "foo.txt" to be able to trigger these lines of code?

>  }
>
> @@ -993,6 +1003,10 @@ static int compare_entry(const struct cache_entry *ce, const struct traverse_inf
>         if (cmp)
>                 return cmp;
>
> +       /* If ce is a sparse directory, then allow equality here. */
> +       if (S_ISSPARSEDIR(ce->ce_mode))
> +               return 0;
> +

Um...so a sparse directory compares equal to _anything_ at all?  I'm
really confused why this would be desirable.  Am I missing something
here?

>         /*
>          * Even if the beginning compared identically, the ce should
>          * compare as bigger than a directory leading up to it!
> @@ -1243,6 +1257,7 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>         struct cache_entry *src[MAX_UNPACK_TREES + 1] = { NULL, };
>         struct unpack_trees_options *o = info->data;
>         const struct name_entry *p = names;
> +       unsigned recurse = 1;

"recurse" sent my mind off into questions about safety checks, base
cases, etc., instead of just the simple "we don't want to read in
directories corresponding to sparse entries".  I think this would be
clearer either if the variable had the sparsity concept embedded in
its name somewhere (e.g. "unsigned sparse_entry = 0", and check for
(!sparse_entry) instead of (recurse) below), or with a comment about
why there are cases where you want to avoid recursion.

>
>         /* Find first entry with a real name (we could use "mask" too) */
>         while (!p->mode)
> @@ -1284,12 +1299,16 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>                                         }
>                                 }
>                                 src[0] = ce;
> +
> +                               if (S_ISSPARSEDIR(ce->ce_mode))
> +                                       recurse = 0;

Ah, the context here doesn't show it but this is in the "if (!cmp)"
block, i.e. if we found a match for the sparse directory.  This makes
sense, to me, _if_ we ignore the above question about sparse
directories matching equal to anything and everything.

>                         }
>                         break;
>                 }
>         }
>
> -       if (unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
> +       if (recurse &&
> +           unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
>                 return -1;
>
>         if (o->merge && src[0]) {
> @@ -1319,7 +1338,8 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>                         }
>                 }
>
> -               if (traverse_trees_recursive(n, dirmask, mask & ~dirmask,
> +               if (recurse &&
> +                   traverse_trees_recursive(n, dirmask, mask & ~dirmask,
>                                              names, info) < 0)
>                         return -1;
>                 return mask;

Nice.  :-)


I think your patch was mostly about the recurse stuff, which other
than the name or a comment about it look good to me.  However, all the
other preparatory small tweaks brought up a lot of questions or
confusion for me.  I'm worried there might be a bug or two, though I
may have just misunderstood some of the code bits.
Derrick Stolee April 21, 2021, 1:41 p.m. UTC | #2
On 4/20/2021 7:00 PM, Elijah Newren wrote:
> On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>> diff --git a/dir.h b/dir.h
>> index 51cb0e217247..9d6666f520f3 100644
>> --- a/dir.h
>> +++ b/dir.h
>> @@ -503,7 +503,7 @@ static inline int ce_path_match(struct index_state *istate,
>>                                 char *seen)
>>  {
>>         return match_pathspec(istate, pathspec, ce->name, ce_namelen(ce), 0, seen,
>> -                             S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
>> +                             S_ISSPARSEDIR(ce->ce_mode) || S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
> 
> I'm confused why this change would be needed, or why it'd semantically
> be meaningful here either.  Doesn't S_ISSPARSEDIR() being true imply
> S_ISDIR() is true (and perhaps even vice versa?).
> 
> By chance, was this a leftover from your early RFC changes from a few
> series ago when you had an entirely different mode for sparse
> directory entries?

I will double-check on this with additional testing and debugging.
Your comments below make it clear that this patch would benefit from
some additional splitting.

>>  }
>>
>>  static inline int dir_path_match(struct index_state *istate,
>> diff --git a/preload-index.c b/preload-index.c
>> index e5529a586366..35e67057ca9b 100644
>> --- a/preload-index.c
>> +++ b/preload-index.c
>> @@ -55,6 +55,8 @@ static void *preload_thread(void *_data)
>>                         continue;
>>                 if (S_ISGITLINK(ce->ce_mode))
>>                         continue;
>> +               if (S_ISSPARSEDIR(ce->ce_mode))
>> +                       continue;
>>                 if (ce_uptodate(ce))
>>                         continue;
>>                 if (ce_skip_worktree(ce))
> 
> Don't we have S_ISSPARSEDIR(ce->ce_mode) implies ce_skip_worktree(ce)?
>  Is this a duplicate check?  If so, is it still desirable for
> future-proofing or code clarity, or is it strictly redundant?

You're right, we could skip this one because the ce_skip_worktree(ce)
is enough to cover this case. I think I created this one because I was
auditing uses of S_ISGITLINK().

>> diff --git a/read-cache.c b/read-cache.c
>> index 29ffa9ac5db9..6308234b4838 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
>>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
>>                         continue;
>>
>> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
>> +                       continue;
>> +
> 
> I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> !ignore_skip_worktree and why it'd be desirable to refresh
> skip-worktree entries.  However, this is tangential to your patch and
> has apparently been around since 2009 (in particular, from 56cac48c35
> ("ie_match_stat(): do not ignore skip-worktree bit with
> CE_MATCH_IGNORE_VALID", 2009-12-14)).

This is probably better served with a statement like this earlier in
the method:

	if (ignore_skip_worktree)
		ensure_full_index(istate);

It seems like ignoring the skip worktree bits is a rare occasion and
it will be worth expanding the index for that case.

>>                 if (pathspec && !ce_path_match(istate, ce, pathspec, seen))
>>                         filtered = 1;
>>
>> diff --git a/unpack-trees.c b/unpack-trees.c
>> index dddf106d5bd4..9a62e823928a 100644
>> --- a/unpack-trees.c
>> +++ b/unpack-trees.c
>> @@ -586,6 +586,13 @@ static void mark_ce_used(struct cache_entry *ce, struct unpack_trees_options *o)
>>  {
>>         ce->ce_flags |= CE_UNPACKED;
>>
>> +       /*
>> +        * If this is a sparse directory, don't advance cache_bottom.
>> +        * That will be advanced later using the cache-tree data.
>> +        */
>> +       if (S_ISSPARSEDIR(ce->ce_mode))
>> +               return;
>> +
> 
> I don't understand cache_bottom stuff; we might want to get Junio to
> look over it.  Or maybe I just need to dig a bit further and attempt
> to understand it.

I remember looking very careful at this when I created this (and found
it worth a comment) but I don't recall enough off the top of my head.
This is worth splitting out with a careful message, which will force me
to reexamine the cache_bottom member.

>>         if (o->cache_bottom < o->src_index->cache_nr &&
>>             o->src_index->cache[o->cache_bottom] == ce) {
>>                 int bottom = o->cache_bottom;
>> @@ -984,6 +991,9 @@ static int do_compare_entry(const struct cache_entry *ce,
>>         ce_len -= pathlen;
>>         ce_name = ce->name + pathlen;
>>
>> +       /* remove directory separator if a sparse directory entry */
>> +       if (S_ISSPARSEDIR(ce->ce_mode))
>> +               ce_len--;
>>         return df_name_compare(ce_name, ce_len, S_IFREG, name, namelen, mode);
> 
> Shouldn't we be passing ce->ce_mode instead of S_IFREG here as well?
> 
> Note the following sort order:
>    foo
>    foo.txt
>    foo/
>    foo/bar
> 
> You've trimmed off the '/', so 'foo/' would be ordered where 'foo' is,
> but df_name_compare() exists to make "foo" sort exactly where "foo/"
> would when "foo" is a directory.  Will your df_name_compare() call
> here result in foo.txt being placed after all the "foo/<subpath>"
> entries in the index and perhaps cause other problems down the line?
> (Are there issues, e.g. with cache-trees getting wrong ordering from
> this, or even writing out indexes or tree objects with the wrong
> ordering?  I've written out trees to disk with wrong ordering before
> and git usually survives but gets really confused with diffs.)
> 
> Since at least one caller of compare_entry() takes the return result
> and does a "if (cmp < 0)", this order is going to matter in some
> cases.  Perhaps we need some testcases where there is a sparse
> directory entry named "foo/" and a file recorded in some relevant tree
> with the name "foo.txt" to be able to trigger these lines of code?

I will do some testing to find out why removing the separator here was
necessary or valuable.

>>  }
>>
>> @@ -993,6 +1003,10 @@ static int compare_entry(const struct cache_entry *ce, const struct traverse_inf
>>         if (cmp)
>>                 return cmp;
>>
>> +       /* If ce is a sparse directory, then allow equality here. */
>> +       if (S_ISSPARSEDIR(ce->ce_mode))
>> +               return 0;
>> +
> 
> Um...so a sparse directory compares equal to _anything_ at all?  I'm
> really confused why this would be desirable.  Am I missing something
> here?

The context is that is removed from the patch is that "cmp" is the
response from do_compare_entry(), which does a length-limited comparison.
If cmp is non-zero, then we've already returned the difference.

The rest of the method is checking if the 'info' input is actually a
parent directory of the _path_ given at this cache entry.

>>         /*
>>          * Even if the beginning compared identically, the ce should
>>          * compare as bigger than a directory leading up to it!

The line after this is:

	return ce_namelen(ce) > traverse_path_len(info, tree_entry_len(n));

This comparison is saying "these paths match up to the directory specified
by info and n, but we need 'ce' to be a file within that directory." But
in the case of a sparse directory entry, we can skip this comparison.

>> @@ -1243,6 +1257,7 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>>         struct cache_entry *src[MAX_UNPACK_TREES + 1] = { NULL, };
>>         struct unpack_trees_options *o = info->data;
>>         const struct name_entry *p = names;
>> +       unsigned recurse = 1;
> 
> "recurse" sent my mind off into questions about safety checks, base
> cases, etc., instead of just the simple "we don't want to read in
> directories corresponding to sparse entries".  I think this would be
> clearer either if the variable had the sparsity concept embedded in
> its name somewhere (e.g. "unsigned sparse_entry = 0", and check for
> (!sparse_entry) instead of (recurse) below), or with a comment about
> why there are cases where you want to avoid recursion.

I can understand that. This callback is confusing because it _does_
recurse, but through a sequence of methods instead of actually calling
itself.

It would be better to say something like "unpack_subdirectories = 1"
and disabling it when we are in a sparse directory.

>>
>>         /* Find first entry with a real name (we could use "mask" too) */
>>         while (!p->mode)
>> @@ -1284,12 +1299,16 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>>                                         }
>>                                 }
>>                                 src[0] = ce;
>> +
>> +                               if (S_ISSPARSEDIR(ce->ce_mode))
>> +                                       recurse = 0;
> 
> Ah, the context here doesn't show it but this is in the "if (!cmp)"
> block, i.e. if we found a match for the sparse directory.  This makes
> sense, to me, _if_ we ignore the above question about sparse
> directories matching equal to anything and everything.

I believe that "anything and everything" concern has been resolved.

>> @@ -1319,7 +1338,8 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>>                         }
>>                 }
>>
>> -               if (traverse_trees_recursive(n, dirmask, mask & ~dirmask,
>> +               if (recurse &&
>> +                   traverse_trees_recursive(n, dirmask, mask & ~dirmask,
>>                                              names, info) < 0)
>>                         return -1;
>>                 return mask;
> 
> Nice.  :-)
> 
> 
> I think your patch was mostly about the recurse stuff, which other
> than the name or a comment about it look good to me.  However, all the
> other preparatory small tweaks brought up a lot of questions or
> confusion for me.  I'm worried there might be a bug or two, though I
> may have just misunderstood some of the code bits.
 
This patch could probably be split up a little to make these things
clearer. Thanks for bringing up the tricky bits.

-Stolee
Elijah Newren April 21, 2021, 4:11 p.m. UTC | #3
// Adding Matheus to cc due to the ignore_skip_worktree bit, given his
experience and expertise with the checkout and unpack-trees code.

On Wed, Apr 21, 2021 at 6:41 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 4/20/2021 7:00 PM, Elijah Newren wrote:
> > On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> > <gitgitgadget@gmail.com> wrote:
> >> diff --git a/dir.h b/dir.h
> >> index 51cb0e217247..9d6666f520f3 100644
> >> --- a/dir.h
> >> +++ b/dir.h
> >> @@ -503,7 +503,7 @@ static inline int ce_path_match(struct index_state *istate,
> >>                                 char *seen)
> >>  {
> >>         return match_pathspec(istate, pathspec, ce->name, ce_namelen(ce), 0, seen,
> >> -                             S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
> >> +                             S_ISSPARSEDIR(ce->ce_mode) || S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
> >
> > I'm confused why this change would be needed, or why it'd semantically
> > be meaningful here either.  Doesn't S_ISSPARSEDIR() being true imply
> > S_ISDIR() is true (and perhaps even vice versa?).
> >
> > By chance, was this a leftover from your early RFC changes from a few
> > series ago when you had an entirely different mode for sparse
> > directory entries?
>
> I will double-check on this with additional testing and debugging.
> Your comments below make it clear that this patch would benefit from
> some additional splitting.
>
> >>  }
> >>
> >>  static inline int dir_path_match(struct index_state *istate,
> >> diff --git a/preload-index.c b/preload-index.c
> >> index e5529a586366..35e67057ca9b 100644
> >> --- a/preload-index.c
> >> +++ b/preload-index.c
> >> @@ -55,6 +55,8 @@ static void *preload_thread(void *_data)
> >>                         continue;
> >>                 if (S_ISGITLINK(ce->ce_mode))
> >>                         continue;
> >> +               if (S_ISSPARSEDIR(ce->ce_mode))
> >> +                       continue;
> >>                 if (ce_uptodate(ce))
> >>                         continue;
> >>                 if (ce_skip_worktree(ce))
> >
> > Don't we have S_ISSPARSEDIR(ce->ce_mode) implies ce_skip_worktree(ce)?
> >  Is this a duplicate check?  If so, is it still desirable for
> > future-proofing or code clarity, or is it strictly redundant?
>
> You're right, we could skip this one because the ce_skip_worktree(ce)
> is enough to cover this case. I think I created this one because I was
> auditing uses of S_ISGITLINK().
>
> >> diff --git a/read-cache.c b/read-cache.c
> >> index 29ffa9ac5db9..6308234b4838 100644
> >> --- a/read-cache.c
> >> +++ b/read-cache.c
> >> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
> >>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
> >>                         continue;
> >>
> >> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
> >> +                       continue;
> >> +
> >
> > I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> > !ignore_skip_worktree and why it'd be desirable to refresh
> > skip-worktree entries.  However, this is tangential to your patch and
> > has apparently been around since 2009 (in particular, from 56cac48c35
> > ("ie_match_stat(): do not ignore skip-worktree bit with
> > CE_MATCH_IGNORE_VALID", 2009-12-14)).
>
> This is probably better served with a statement like this earlier in
> the method:
>
>         if (ignore_skip_worktree)
>                 ensure_full_index(istate);
>
> It seems like ignoring the skip worktree bits is a rare occasion and
> it will be worth expanding the index for that case.

Maybe...I read the commit message that introduced the behavior and
it's not very convincing to me that SKIP_WORKTREE should be ignored
(it's also not that clear to me what the conditions are; is it just
update-index --really-refresh?); it may be worth double checking on
that assumption first, especially given how many other bugs existed
with skip_worktree stuff for years.  If it's necessary, then I agree
that your extra if-check makes sense.

In particular, I think it'd be really dumb for "update-index
--really-refresh" to read in and populate a huge subdirectory just to
stat files that don't exist because they are in directories that don't
exist.  And I think there's a pretty good argument to not update stat
information for skip_worktree entries in non-sparse-index cases even
in the presence of that flag, especially given Matheus' other recent
changes in this area (the emails just before we got to the point of
discussing SKIP_WORKTREE and racy clean entries...speaking of which,
it might be worthwhile pinging Matheus' for opinions on this issue
too.)

> >>                 if (pathspec && !ce_path_match(istate, ce, pathspec, seen))
> >>                         filtered = 1;
> >>
> >> diff --git a/unpack-trees.c b/unpack-trees.c
> >> index dddf106d5bd4..9a62e823928a 100644
> >> --- a/unpack-trees.c
> >> +++ b/unpack-trees.c
> >> @@ -586,6 +586,13 @@ static void mark_ce_used(struct cache_entry *ce, struct unpack_trees_options *o)
> >>  {
> >>         ce->ce_flags |= CE_UNPACKED;
> >>
> >> +       /*
> >> +        * If this is a sparse directory, don't advance cache_bottom.
> >> +        * That will be advanced later using the cache-tree data.
> >> +        */
> >> +       if (S_ISSPARSEDIR(ce->ce_mode))
> >> +               return;
> >> +
> >
> > I don't understand cache_bottom stuff; we might want to get Junio to
> > look over it.  Or maybe I just need to dig a bit further and attempt
> > to understand it.
>
> I remember looking very careful at this when I created this (and found
> it worth a comment) but I don't recall enough off the top of my head.
> This is worth splitting out with a careful message, which will force me
> to reexamine the cache_bottom member.
>
> >>         if (o->cache_bottom < o->src_index->cache_nr &&
> >>             o->src_index->cache[o->cache_bottom] == ce) {
> >>                 int bottom = o->cache_bottom;
> >> @@ -984,6 +991,9 @@ static int do_compare_entry(const struct cache_entry *ce,
> >>         ce_len -= pathlen;
> >>         ce_name = ce->name + pathlen;
> >>
> >> +       /* remove directory separator if a sparse directory entry */
> >> +       if (S_ISSPARSEDIR(ce->ce_mode))
> >> +               ce_len--;
> >>         return df_name_compare(ce_name, ce_len, S_IFREG, name, namelen, mode);
> >
> > Shouldn't we be passing ce->ce_mode instead of S_IFREG here as well?
> >
> > Note the following sort order:
> >    foo
> >    foo.txt
> >    foo/
> >    foo/bar
> >
> > You've trimmed off the '/', so 'foo/' would be ordered where 'foo' is,
> > but df_name_compare() exists to make "foo" sort exactly where "foo/"
> > would when "foo" is a directory.  Will your df_name_compare() call
> > here result in foo.txt being placed after all the "foo/<subpath>"
> > entries in the index and perhaps cause other problems down the line?
> > (Are there issues, e.g. with cache-trees getting wrong ordering from
> > this, or even writing out indexes or tree objects with the wrong
> > ordering?  I've written out trees to disk with wrong ordering before
> > and git usually survives but gets really confused with diffs.)
> >
> > Since at least one caller of compare_entry() takes the return result
> > and does a "if (cmp < 0)", this order is going to matter in some
> > cases.  Perhaps we need some testcases where there is a sparse
> > directory entry named "foo/" and a file recorded in some relevant tree
> > with the name "foo.txt" to be able to trigger these lines of code?
>
> I will do some testing to find out why removing the separator here was
> necessary or valuable.

I think you removed the separator because df_name_compare() assumes it
gets a regular filename (i.e. no trailing '/') and manually adds one
based on mode for directories.  You were probably worried about what
amounts to a non-sensical double '/', but df_name_compare() wouldn't
actually get to that point unless someone somehow recorded a path
within a git tree object that ended with a trailing '/'.  I'd rather
not have to worry about the double '/' and explain why it isn't
possible (or wonder about whether git trees with trailing '/'
characters could be recorded on some OS), so I think the trimming of
the separator as you did makes sense.

What doesn't make sense to me is that the code just below had a
hardcoded S_IFREG that it passed to df_name_compare, based on "this is
a cache entry, and index entries are _always_ regular files".  You
didn't change that, even though it's now a false assumption.
symlinks, and regular files should be passed as S_IFREG there, I'm not
sure what should be passed for submodules (though the fact that it's
been using S_IFREG for years suggests maybe that is the mode we want
for it, so we can't use ce->ce_mode), and I'm pretty sure sparse
directory entries should be passed as S_IFDIR in order to get the
sorting right unless you stop stripping the trailing '/' character.
I'm not exactly sure where the sorting for do_compare_entry() affects
the code later, but I tried to trace it out a little in my comments
above in order to guide some testing.

> >>  }
> >>
> >> @@ -993,6 +1003,10 @@ static int compare_entry(const struct cache_entry *ce, const struct traverse_inf
> >>         if (cmp)
> >>                 return cmp;
> >>
> >> +       /* If ce is a sparse directory, then allow equality here. */
> >> +       if (S_ISSPARSEDIR(ce->ce_mode))
> >> +               return 0;
> >> +
> >
> > Um...so a sparse directory compares equal to _anything_ at all?  I'm
> > really confused why this would be desirable.  Am I missing something
> > here?
>
> The context is that is removed from the patch is that "cmp" is the
> response from do_compare_entry(), which does a length-limited comparison.
> If cmp is non-zero, then we've already returned the difference.
>
> The rest of the method is checking if the 'info' input is actually a
> parent directory of the _path_ given at this cache entry.

Ah, thanks for the explanation.  So the only way we get here with
cmp==0 when we're dealing with a sparse directory entry is if we found
a directory by the same name....

> >>         /*
> >>          * Even if the beginning compared identically, the ce should
> >>          * compare as bigger than a directory leading up to it!
>
> The line after this is:
>
>         return ce_namelen(ce) > traverse_path_len(info, tree_entry_len(n));
>
> This comparison is saying "these paths match up to the directory specified
> by info and n, but we need 'ce' to be a file within that directory." But
> in the case of a sparse directory entry, we can skip this comparison.

Isn't this "must skip" rather than "can skip"?  If we're considering
the ce path "foo/bar/", then the traverse_path would be "foo/bar" and
we'd have:
    ce_namelen(ce) == 1 + traverse_path_len(info, tree_entry_len(n))
so this would return 1 for the comparison making them be treated as
non-equal even though they are what we consider equal entries.

In any event, it seems like this new check could use a better comment
than "then allow equality here".

> >> @@ -1243,6 +1257,7 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
> >>         struct cache_entry *src[MAX_UNPACK_TREES + 1] = { NULL, };
> >>         struct unpack_trees_options *o = info->data;
> >>         const struct name_entry *p = names;
> >> +       unsigned recurse = 1;
> >
> > "recurse" sent my mind off into questions about safety checks, base
> > cases, etc., instead of just the simple "we don't want to read in
> > directories corresponding to sparse entries".  I think this would be
> > clearer either if the variable had the sparsity concept embedded in
> > its name somewhere (e.g. "unsigned sparse_entry = 0", and check for
> > (!sparse_entry) instead of (recurse) below), or with a comment about
> > why there are cases where you want to avoid recursion.
>
> I can understand that. This callback is confusing because it _does_
> recurse, but through a sequence of methods instead of actually calling
> itself.
>
> It would be better to say something like "unpack_subdirectories = 1"
> and disabling it when we are in a sparse directory.

I like that name.

>
> >>
> >>         /* Find first entry with a real name (we could use "mask" too) */
> >>         while (!p->mode)
> >> @@ -1284,12 +1299,16 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
> >>                                         }
> >>                                 }
> >>                                 src[0] = ce;
> >> +
> >> +                               if (S_ISSPARSEDIR(ce->ce_mode))
> >> +                                       recurse = 0;
> >
> > Ah, the context here doesn't show it but this is in the "if (!cmp)"
> > block, i.e. if we found a match for the sparse directory.  This makes
> > sense, to me, _if_ we ignore the above question about sparse
> > directories matching equal to anything and everything.
>
> I believe that "anything and everything" concern has been resolved.

Yes, if we just improve the "then allow equality here" comment.

> >> @@ -1319,7 +1338,8 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
> >>                         }
> >>                 }
> >>
> >> -               if (traverse_trees_recursive(n, dirmask, mask & ~dirmask,
> >> +               if (recurse &&
> >> +                   traverse_trees_recursive(n, dirmask, mask & ~dirmask,
> >>                                              names, info) < 0)
> >>                         return -1;
> >>                 return mask;
> >
> > Nice.  :-)
> >
> >
> > I think your patch was mostly about the recurse stuff, which other
> > than the name or a comment about it look good to me.  However, all the
> > other preparatory small tweaks brought up a lot of questions or
> > confusion for me.  I'm worried there might be a bug or two, though I
> > may have just misunderstood some of the code bits.
>
> This patch could probably be split up a little to make these things
> clearer. Thanks for bringing up the tricky bits.
>
> -Stolee
Derrick Stolee April 21, 2021, 5:27 p.m. UTC | #4
On 4/20/2021 7:00 PM, Elijah Newren wrote:
> On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:

>> diff --git a/read-cache.c b/read-cache.c
>> index 29ffa9ac5db9..6308234b4838 100644
>> --- a/read-cache.c
>> +++ b/read-cache.c
>> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
>>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
>>                         continue;
>>
>> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
>> +                       continue;
>> +
> 
> I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> !ignore_skip_worktree and why it'd be desirable to refresh
> skip-worktree entries.  However, this is tangential to your patch and
> has apparently been around since 2009 (in particular, from 56cac48c35
> ("ie_match_stat(): do not ignore skip-worktree bit with
> CE_MATCH_IGNORE_VALID", 2009-12-14)).

I did some more digging on this part here. There has been movement in
this space!

The thing that triggers this ignore_skip_worktree variable inside
refresh_index() is now the REFRESH_IGNORE_SKIP_WORKTREE flag which was
introduced recently and is set only by builtin/add.c:refresh(), by
Matheus: a20f704 (add: warn when asked to update SKIP_WORKTREE entries,
2021-04-08).

This means that we can (for now) keep the behavior the same by adding

	if (ignore_skip_worktree)
		ensure_full_index(istate);

before the loop. This prevents the expansion during 'git status', but
requires modification before we are ready for 'git add' to work
correctly. Specifically, 'git add' currently warns only when adding
something that exactly matches a tracked file with SKIP_WORKTREE. It
does _not_ warn when adding something that is untracked but would have
the SKIP_WORKTREE bit if it was tracked. We will need to add that
extra warning if we want to avoid expanding during 'git add'.

Alternatively, we can decide to change the behavior here and send an
error() and return failure if they try to add something that would
live within a sparse-directory entry. I will think more on this and
have a good answer before v2 is ready.

Thanks,
-Stolee
Matheus Tavares April 21, 2021, 6:55 p.m. UTC | #5
On Wed, Apr 21, 2021 at 2:27 PM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 4/20/2021 7:00 PM, Elijah Newren wrote:
> > On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> > <gitgitgadget@gmail.com> wrote:
>
> >> diff --git a/read-cache.c b/read-cache.c
> >> index 29ffa9ac5db9..6308234b4838 100644
> >> --- a/read-cache.c
> >> +++ b/read-cache.c
> >> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
> >>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
> >>                         continue;
> >>
> >> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
> >> +                       continue;
> >> +
> >
> > I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> > !ignore_skip_worktree and why it'd be desirable to refresh
> > skip-worktree entries.  However, this is tangential to your patch and
> > has apparently been around since 2009 (in particular, from 56cac48c35
> > ("ie_match_stat(): do not ignore skip-worktree bit with
> > CE_MATCH_IGNORE_VALID", 2009-12-14)).
>
> I did some more digging on this part here. There has been movement in
> this space!
>
> The thing that triggers this ignore_skip_worktree variable inside
> refresh_index() is now the REFRESH_IGNORE_SKIP_WORKTREE flag which was
> introduced recently and is set only by builtin/add.c:refresh(), by
> Matheus: a20f704 (add: warn when asked to update SKIP_WORKTREE entries,
> 2021-04-08).
>
> This means that we can (for now) keep the behavior the same by adding
>
>         if (ignore_skip_worktree)
>                 ensure_full_index(istate);
>
> before the loop.

Hmm, I don't think we need to expand the index here.
ignore_skip_worktree makes the loop below ignore entries with the
skip_worktree bit set. Since sparse dirs also have this bit set, we
will already get the behavior we want :)

However, I think we will need to expand the index at
`find_pathspecs_matching_against_index()` in order to find and warn
about the pathspecs that have matches among skip_worktree entries...

> This prevents the expansion during 'git status', but
> requires modification before we are ready for 'git add' to work
> correctly. Specifically, 'git add' currently warns only when adding
> something that exactly matches a tracked file with SKIP_WORKTREE. It
> does _not_ warn when adding something that is untracked but would have
> the SKIP_WORKTREE bit if it was tracked. We will need to add that
> extra warning if we want to avoid expanding during 'git add'.

Hmm, I see :( I was trying to think if it would be possible to do the
pathspec matching (for the warning) without having to expand the
index, but then there are the untracked files... If the user gives
"a/*/c" and we have "a/b/" as a sparse dir, we don't know if "a/b/c"
is a skip_worktree entry or an untracked file without expanding the
index...

> Alternatively, we can decide to change the behavior here and send an
> error() and return failure if they try to add something that would
> live within a sparse-directory entry.

I think this behavior would be tricky to replicate on non-sparse-index
sparse-checkouts, if we were to do that. We would have to pathspec
match each untracked file against the sparsity patterns, perhaps?
Elijah Newren April 21, 2021, 6:56 p.m. UTC | #6
On Wed, Apr 21, 2021 at 10:27 AM Derrick Stolee <stolee@gmail.com> wrote:
>
> On 4/20/2021 7:00 PM, Elijah Newren wrote:
> > On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> > <gitgitgadget@gmail.com> wrote:
>
> >> diff --git a/read-cache.c b/read-cache.c
> >> index 29ffa9ac5db9..6308234b4838 100644
> >> --- a/read-cache.c
> >> +++ b/read-cache.c
> >> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
> >>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
> >>                         continue;
> >>
> >> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
> >> +                       continue;
> >> +
> >
> > I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> > !ignore_skip_worktree and why it'd be desirable to refresh
> > skip-worktree entries.  However, this is tangential to your patch and
> > has apparently been around since 2009 (in particular, from 56cac48c35
> > ("ie_match_stat(): do not ignore skip-worktree bit with
> > CE_MATCH_IGNORE_VALID", 2009-12-14)).
>
> I did some more digging on this part here. There has been movement in
> this space!
>
> The thing that triggers this ignore_skip_worktree variable inside
> refresh_index() is now the REFRESH_IGNORE_SKIP_WORKTREE flag which was
> introduced recently and is set only by builtin/add.c:refresh(), by
> Matheus: a20f704 (add: warn when asked to update SKIP_WORKTREE entries,
> 2021-04-08).
>
> This means that we can (for now) keep the behavior the same by adding
>
>         if (ignore_skip_worktree)
>                 ensure_full_index(istate);
>
> before the loop. This prevents the expansion during 'git status', but
> requires modification before we are ready for 'git add' to work
> correctly. Specifically, 'git add' currently warns only when adding
> something that exactly matches a tracked file with SKIP_WORKTREE. It
> does _not_ warn when adding something that is untracked but would have
> the SKIP_WORKTREE bit if it was tracked. We will need to add that
> extra warning if we want to avoid expanding during 'git add'.
>
> Alternatively, we can decide to change the behavior here and send an
> error() and return failure if they try to add something that would
> live within a sparse-directory entry. I will think more on this and
> have a good answer before v2 is ready.

See my comments on 01/10; users are already getting surprised by "git
add" today and has been going on for months (though not super
frequently).  When they try to "git add" an untracked path that would
not match any path specifications in $GIT_DIR/info/sparse-checkout,
the fact that "git add" doesn't error out (or at the very least give a
warning) causes _subsequent_ commands to surprise the user with their
behavior; the fact that it is some later command that does weird stuff
(removing the file from the working tree) makes it harder for them to
try to understand and make sense of.  So, I'd say we do want to change
the behavior here...and not just for sparse-indexes but
sparse-checkouts in general.

As for how this affects the code, I think I'm behind both you and
Matheus on understanding here, but I'm starting to think it was a good
idea for me to spout my offhand comment on what looked like a funny
code smell that I thought was unrelated to your patch.  Sounds like it
is causing some good digging...I'll try to read up more on the results
when you send v2.  :-)
Elijah Newren April 21, 2021, 7:10 p.m. UTC | #7
On Wed, Apr 21, 2021 at 11:55 AM Matheus Tavares Bernardino
<matheus.bernardino@usp.br> wrote:
>
> On Wed, Apr 21, 2021 at 2:27 PM Derrick Stolee <stolee@gmail.com> wrote:
> >
> > On 4/20/2021 7:00 PM, Elijah Newren wrote:
> > > On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> > > <gitgitgadget@gmail.com> wrote:
> >
> > >> diff --git a/read-cache.c b/read-cache.c
> > >> index 29ffa9ac5db9..6308234b4838 100644
> > >> --- a/read-cache.c
> > >> +++ b/read-cache.c
> > >> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
> > >>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
> > >>                         continue;
> > >>
> > >> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
> > >> +                       continue;
> > >> +
> > >
> > > I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> > > !ignore_skip_worktree and why it'd be desirable to refresh
> > > skip-worktree entries.  However, this is tangential to your patch and
> > > has apparently been around since 2009 (in particular, from 56cac48c35
> > > ("ie_match_stat(): do not ignore skip-worktree bit with
> > > CE_MATCH_IGNORE_VALID", 2009-12-14)).
> >
> > I did some more digging on this part here. There has been movement in
> > this space!
> >
> > The thing that triggers this ignore_skip_worktree variable inside
> > refresh_index() is now the REFRESH_IGNORE_SKIP_WORKTREE flag which was
> > introduced recently and is set only by builtin/add.c:refresh(), by
> > Matheus: a20f704 (add: warn when asked to update SKIP_WORKTREE entries,
> > 2021-04-08).
> >
> > This means that we can (for now) keep the behavior the same by adding
> >
> >         if (ignore_skip_worktree)
> >                 ensure_full_index(istate);
> >
> > before the loop.
>
> Hmm, I don't think we need to expand the index here.
> ignore_skip_worktree makes the loop below ignore entries with the
> skip_worktree bit set. Since sparse dirs also have this bit set, we
> will already get the behavior we want :)
>
> However, I think we will need to expand the index at
> `find_pathspecs_matching_against_index()` in order to find and warn
> about the pathspecs that have matches among skip_worktree entries...
>
> > This prevents the expansion during 'git status', but
> > requires modification before we are ready for 'git add' to work
> > correctly. Specifically, 'git add' currently warns only when adding
> > something that exactly matches a tracked file with SKIP_WORKTREE. It
> > does _not_ warn when adding something that is untracked but would have
> > the SKIP_WORKTREE bit if it was tracked. We will need to add that
> > extra warning if we want to avoid expanding during 'git add'.
>
> Hmm, I see :( I was trying to think if it would be possible to do the
> pathspec matching (for the warning) without having to expand the
> index, but then there are the untracked files... If the user gives
> "a/*/c" and we have "a/b/" as a sparse dir, we don't know if "a/b/c"
> is a skip_worktree entry or an untracked file without expanding the
> index...

I thought Stolee's series added something that could allow us to check
that e.g. "a/b/c" corresponded to an entry under the sparse directory
"a/b/" and thus is a would-be-sparse entry.  Can we use that?

> > Alternatively, we can decide to change the behavior here and send an
> > error() and return failure if they try to add something that would
> > live within a sparse-directory entry.
>
> I think this behavior would be tricky to replicate on non-sparse-index
> sparse-checkouts, if we were to do that. We would have to pathspec
> match each untracked file against the sparsity patterns, perhaps?

By way of analogy, don't we have to pay the cost of pathspec matching
each tree entry against the sparsity patterns when doing a checkout
before putting those entries into the index?  Since "git add" is
trying to put new entries into the index, doesn't it make sense for it
to pay the same cost for the untracked paths it is about to place
there?

Sure, that can be expensive for non-cone mode, but that's the price
users pay for using sparse-checkouts and not using cone mode, and they
pay it every time they try to update the index with some new checkout.
I think "git add" should be treated similarly as another way to update
the index -- especially since users will get confused (and have gotten
confused) by subsequent commands if we don't do those checks.
Matheus Tavares April 21, 2021, 7:51 p.m. UTC | #8
On Wed, Apr 21, 2021 at 4:11 PM Elijah Newren <newren@gmail.com> wrote:
>
> On Wed, Apr 21, 2021 at 11:55 AM Matheus Tavares Bernardino
> <matheus.bernardino@usp.br> wrote:
> >
> > On Wed, Apr 21, 2021 at 2:27 PM Derrick Stolee <stolee@gmail.com> wrote:
> > >
> > > On 4/20/2021 7:00 PM, Elijah Newren wrote:
> > > > On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> > > > <gitgitgadget@gmail.com> wrote:
> > >
> > > >> diff --git a/read-cache.c b/read-cache.c
> > > >> index 29ffa9ac5db9..6308234b4838 100644
> > > >> --- a/read-cache.c
> > > >> +++ b/read-cache.c
> > > >> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
> > > >>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
> > > >>                         continue;
> > > >>
> > > >> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
> > > >> +                       continue;
> > > >> +
> > > >
> > > > I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> > > > !ignore_skip_worktree and why it'd be desirable to refresh
> > > > skip-worktree entries.  However, this is tangential to your patch and
> > > > has apparently been around since 2009 (in particular, from 56cac48c35
> > > > ("ie_match_stat(): do not ignore skip-worktree bit with
> > > > CE_MATCH_IGNORE_VALID", 2009-12-14)).
> > >
> > > I did some more digging on this part here. There has been movement in
> > > this space!
> > >
> > > The thing that triggers this ignore_skip_worktree variable inside
> > > refresh_index() is now the REFRESH_IGNORE_SKIP_WORKTREE flag which was
> > > introduced recently and is set only by builtin/add.c:refresh(), by
> > > Matheus: a20f704 (add: warn when asked to update SKIP_WORKTREE entries,
> > > 2021-04-08).
> > >
> > > This means that we can (for now) keep the behavior the same by adding
> > >
> > >         if (ignore_skip_worktree)
> > >                 ensure_full_index(istate);
> > >
> > > before the loop.
> >
> > Hmm, I don't think we need to expand the index here.
> > ignore_skip_worktree makes the loop below ignore entries with the
> > skip_worktree bit set. Since sparse dirs also have this bit set, we
> > will already get the behavior we want :)
> >
> > However, I think we will need to expand the index at
> > `find_pathspecs_matching_against_index()` in order to find and warn
> > about the pathspecs that have matches among skip_worktree entries...
> >
> > > This prevents the expansion during 'git status', but
> > > requires modification before we are ready for 'git add' to work
> > > correctly. Specifically, 'git add' currently warns only when adding
> > > something that exactly matches a tracked file with SKIP_WORKTREE. It
> > > does _not_ warn when adding something that is untracked but would have
> > > the SKIP_WORKTREE bit if it was tracked. We will need to add that
> > > extra warning if we want to avoid expanding during 'git add'.
> >
> > Hmm, I see :( I was trying to think if it would be possible to do the
> > pathspec matching (for the warning) without having to expand the
> > index, but then there are the untracked files... If the user gives
> > "a/*/c" and we have "a/b/" as a sparse dir, we don't know if "a/b/c"
> > is a skip_worktree entry or an untracked file without expanding the
> > index...
>
> I thought Stolee's series added something that could allow us to check
> that e.g. "a/b/c" corresponded to an entry under the sparse directory
> "a/b/" and thus is a would-be-sparse entry.  Can we use that?

Yes, you mean for the warning on untracked paths that would become
sparse entries, right? The problem I was considering there was the
warning on tracked entries only, in which case I'm not sure if it
would help.

> > > Alternatively, we can decide to change the behavior here and send an
> > > error() and return failure if they try to add something that would
> > > live within a sparse-directory entry.
> >
> > I think this behavior would be tricky to replicate on non-sparse-index
> > sparse-checkouts, if we were to do that. We would have to pathspec
> > match each untracked file against the sparsity patterns, perhaps?
>
> By way of analogy, don't we have to pay the cost of pathspec matching
> each tree entry against the sparsity patterns when doing a checkout
> before putting those entries into the index?  Since "git add" is
> trying to put new entries into the index, doesn't it make sense for it
> to pay the same cost for the untracked paths it is about to place
> there?
>
> Sure, that can be expensive for non-cone mode, but that's the price
> users pay for using sparse-checkouts and not using cone mode, and they
> pay it every time they try to update the index with some new checkout.
> I think "git add" should be treated similarly as another way to update
> the index -- especially since users will get confused (and have gotten
> confused) by subsequent commands if we don't do those checks.

Good point. Yeah, that all makes sense :)
Matheus Tavares April 22, 2021, 2:24 a.m. UTC | #9
On Wed, Apr 21, 2021 at 1:11 PM Elijah Newren <newren@gmail.com> wrote:
>
> // Adding Matheus to cc due to the ignore_skip_worktree bit, given his
> experience and expertise with the checkout and unpack-trees code.
>
> On Wed, Apr 21, 2021 at 6:41 AM Derrick Stolee <stolee@gmail.com> wrote:
> >
> > On 4/20/2021 7:00 PM, Elijah Newren wrote:
> > > On Tue, Apr 13, 2021 at 7:01 AM Derrick Stolee via GitGitGadget
> > > <gitgitgadget@gmail.com> wrote:
> > >>
> > >> diff --git a/read-cache.c b/read-cache.c
> > >> index 29ffa9ac5db9..6308234b4838 100644
> > >> --- a/read-cache.c
> > >> +++ b/read-cache.c
> > >> @@ -1594,6 +1594,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
> > >>                 if (ignore_skip_worktree && ce_skip_worktree(ce))
> > >>                         continue;
> > >>
> > >> +               if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
> > >> +                       continue;
> > >> +
> > >
> > > I'm a bit confused about what could trigger ce_skip_worktree(ce) &&
> > > !ignore_skip_worktree and why it'd be desirable to refresh
> > > skip-worktree entries.

The skip-worktree entries are not really refreshed in refresh_index(),
even when !ignore_skip_worktree (which is the default case; i.e.
without the REFRESH_IGNORE_SKIP_WORKTREE flag).

This flag (which is currently only used by `git add --refresh`s code
at `builtin/add.c:refresh()`), just makes refresh_index() skip the
following operations on skip-worktree entries: pathspec matching,
marking the matches on `seen`, checking/warning if unmerged, and
marking the entry as up-to-date (i.e. with the in-memory CE_UPTODATE
bit).

I added this flag in mt/add-rm-in-sparse-checkout and changed
`builtin/add.c:refresh()` to use it mainly because we needed a `seen`
array with only matches from non-skip-worktree entries so that we
could later decide when to emit the warning. (In fact, the original
implementation of the flag only controlled whether sparse matches
would be marked on `seen` or not [1])

[1]: https://lore.kernel.org/git/d65b214dd1d83a2e8710a9bbf98477c1929f0d5e.1614138107.git.matheus.bernardino@usp.br/

Perhaps we could alternatively make refresh_index() skip the
previously mentioned operations on all skip-worktrees entries
*unconditionally*. I.e. having, early in the loop:

if (ce_skip_worktree(ce))
        continue;

But I'm not familiar enough with CE_UPTODATE and how it's used in
different parts of the code base, so I didn't want to risk introducing
any bugs at refresh_index() callers that might want/expect the
function to set the CE_UPTODATE bit on the skip-worktree entries. The
case of `git add --refresh` was much narrower and easier to analyze,
and that's what we were interested in for the warning. That's why I
only changed the behavior there :)

> > > However, this is tangential to your patch and
> > > has apparently been around since 2009 (in particular, from 56cac48c35
> > > ("ie_match_stat(): do not ignore skip-worktree bit with
> > > CE_MATCH_IGNORE_VALID", 2009-12-14)).

Note that the `CE_MATCH_IGNORE_SKIP_WORKTREE` added in this patch does
control if refresh_cache_ent() will refresh skip-worktree entries, but
refresh_index() allways calls this function *without* this flag.
Derrick Stolee April 23, 2021, 8:16 p.m. UTC | #10
On 4/21/2021 2:56 PM, Elijah Newren wrote:
> On Wed, Apr 21, 2021 at 10:27 AM Derrick Stolee <stolee@gmail.com> wrote:
>> Alternatively, we can decide to change the behavior here and send an
>> error() and return failure if they try to add something that would
>> live within a sparse-directory entry. I will think more on this and
>> have a good answer before v2 is ready.
> 
> See my comments on 01/10; users are already getting surprised by "git
> add" today and has been going on for months (though not super
> frequently).  When they try to "git add" an untracked path that would
> not match any path specifications in $GIT_DIR/info/sparse-checkout,
> the fact that "git add" doesn't error out (or at the very least give a
> warning) causes _subsequent_ commands to surprise the user with their
> behavior; the fact that it is some later command that does weird stuff
> (removing the file from the working tree) makes it harder for them to
> try to understand and make sense of.  So, I'd say we do want to change
> the behavior here...and not just for sparse-indexes but
> sparse-checkouts in general.
> 
> As for how this affects the code, I think I'm behind both you and
> Matheus on understanding here, but I'm starting to think it was a good
> idea for me to spout my offhand comment on what looked like a funny
> code smell that I thought was unrelated to your patch.  Sounds like it
> is causing some good digging...I'll try to read up more on the results
> when you send v2.  :-)

I think there are enough strange thing happening with 'git add' that I
want to take some time to figure out the right approach here. In v2, I
will delete the changes to builtin/add.c and instead focus on making
'git status' faster with a sparse-index. The 'git add' improvements
will follow in another series after I take enough time to understand
all of these special modes.

I think this split is especially important if we decide that changing
the behavior is the best thing to do here.

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/dir.h b/dir.h
index 51cb0e217247..9d6666f520f3 100644
--- a/dir.h
+++ b/dir.h
@@ -503,7 +503,7 @@  static inline int ce_path_match(struct index_state *istate,
 				char *seen)
 {
 	return match_pathspec(istate, pathspec, ce->name, ce_namelen(ce), 0, seen,
-			      S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
+			      S_ISSPARSEDIR(ce->ce_mode) || S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
 }
 
 static inline int dir_path_match(struct index_state *istate,
diff --git a/preload-index.c b/preload-index.c
index e5529a586366..35e67057ca9b 100644
--- a/preload-index.c
+++ b/preload-index.c
@@ -55,6 +55,8 @@  static void *preload_thread(void *_data)
 			continue;
 		if (S_ISGITLINK(ce->ce_mode))
 			continue;
+		if (S_ISSPARSEDIR(ce->ce_mode))
+			continue;
 		if (ce_uptodate(ce))
 			continue;
 		if (ce_skip_worktree(ce))
diff --git a/read-cache.c b/read-cache.c
index 29ffa9ac5db9..6308234b4838 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1594,6 +1594,9 @@  int refresh_index(struct index_state *istate, unsigned int flags,
 		if (ignore_skip_worktree && ce_skip_worktree(ce))
 			continue;
 
+		if (istate->sparse_index && S_ISSPARSEDIR(ce->ce_mode))
+			continue;
+
 		if (pathspec && !ce_path_match(istate, ce, pathspec, seen))
 			filtered = 1;
 
diff --git a/unpack-trees.c b/unpack-trees.c
index dddf106d5bd4..9a62e823928a 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -586,6 +586,13 @@  static void mark_ce_used(struct cache_entry *ce, struct unpack_trees_options *o)
 {
 	ce->ce_flags |= CE_UNPACKED;
 
+	/*
+	 * If this is a sparse directory, don't advance cache_bottom.
+	 * That will be advanced later using the cache-tree data.
+	 */
+	if (S_ISSPARSEDIR(ce->ce_mode))
+		return;
+
 	if (o->cache_bottom < o->src_index->cache_nr &&
 	    o->src_index->cache[o->cache_bottom] == ce) {
 		int bottom = o->cache_bottom;
@@ -984,6 +991,9 @@  static int do_compare_entry(const struct cache_entry *ce,
 	ce_len -= pathlen;
 	ce_name = ce->name + pathlen;
 
+	/* remove directory separator if a sparse directory entry */
+	if (S_ISSPARSEDIR(ce->ce_mode))
+		ce_len--;
 	return df_name_compare(ce_name, ce_len, S_IFREG, name, namelen, mode);
 }
 
@@ -993,6 +1003,10 @@  static int compare_entry(const struct cache_entry *ce, const struct traverse_inf
 	if (cmp)
 		return cmp;
 
+	/* If ce is a sparse directory, then allow equality here. */
+	if (S_ISSPARSEDIR(ce->ce_mode))
+		return 0;
+
 	/*
 	 * Even if the beginning compared identically, the ce should
 	 * compare as bigger than a directory leading up to it!
@@ -1243,6 +1257,7 @@  static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
 	struct cache_entry *src[MAX_UNPACK_TREES + 1] = { NULL, };
 	struct unpack_trees_options *o = info->data;
 	const struct name_entry *p = names;
+	unsigned recurse = 1;
 
 	/* Find first entry with a real name (we could use "mask" too) */
 	while (!p->mode)
@@ -1284,12 +1299,16 @@  static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
 					}
 				}
 				src[0] = ce;
+
+				if (S_ISSPARSEDIR(ce->ce_mode))
+					recurse = 0;
 			}
 			break;
 		}
 	}
 
-	if (unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
+	if (recurse &&
+	    unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
 		return -1;
 
 	if (o->merge && src[0]) {
@@ -1319,7 +1338,8 @@  static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
 			}
 		}
 
-		if (traverse_trees_recursive(n, dirmask, mask & ~dirmask,
+		if (recurse &&
+		    traverse_trees_recursive(n, dirmask, mask & ~dirmask,
 					     names, info) < 0)
 			return -1;
 		return mask;