diff mbox series

[16/27] unpack-trees: make sparse aware

Message ID 45cf57c9c40bebb7383b8aab19c82fc4e41d2cd3.1611596534.git.gitgitgadget@gmail.com (mailing list archive)
State New
Headers show
Series Sparse Index | expand

Commit Message

Derrick Stolee Jan. 25, 2021, 5:42 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

As a first step to integrate 'git status' and 'git add' with the sparse
index, we must start integrating unpack_trees() with sparse directory
entries. These changes are currently impossible to trigger because
unpack_trees() calls ensure_full_index() if command_requires_full_index
is true. This is the case for all commands at the moment. As we expand
more commands to be sparse-aware, we might find that more changes are
required to unpack_trees(). The current changes will suffice for
'status' and 'add'.

unpack_trees() calls the traverse_trees() API using unpack_callback()
to decide if we should recurse into a subtree. We must add new abilities
to skip a subtree if it corresponds to a sparse directory entry.

It is important to be careful about the trailing directory separator
that exists in the sparse directory entries but not in the subtree
paths.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 dir.h           |  2 +-
 preload-index.c |  2 ++
 read-cache.c    |  3 +++
 unpack-trees.c  | 24 ++++++++++++++++++++++--
 4 files changed, 28 insertions(+), 3 deletions(-)

Comments

Elijah Newren Feb. 1, 2021, 8:50 p.m. UTC | #1
On Mon, Jan 25, 2021 at 9:42 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> As a first step to integrate 'git status' and 'git add' with the sparse
> index, we must start integrating unpack_trees() with sparse directory
> entries. These changes are currently impossible to trigger because
> unpack_trees() calls ensure_full_index() if command_requires_full_index
> is true. This is the case for all commands at the moment. As we expand
> more commands to be sparse-aware, we might find that more changes are
> required to unpack_trees(). The current changes will suffice for
> 'status' and 'add'.
>
> unpack_trees() calls the traverse_trees() API using unpack_callback()
> to decide if we should recurse into a subtree. We must add new abilities
> to skip a subtree if it corresponds to a sparse directory entry.

Makes sense.

> It is important to be careful about the trailing directory separator
> that exists in the sparse directory entries but not in the subtree
> paths.

The comment makes me wonder if leaving the trailing directory
separator out would be better, as it'd allow direct comparisons.  Of
course, you have a better idea of what is easier or harder based on
this decision.  Is there any chance you have a quick list of the
places that the code was simplified by this decision and a list of
places like this one that were made slightly harder?

> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  dir.h           |  2 +-
>  preload-index.c |  2 ++
>  read-cache.c    |  3 +++
>  unpack-trees.c  | 24 ++++++++++++++++++++++--
>  4 files changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/dir.h b/dir.h
> index facfae47402..300305ec335 100644
> --- a/dir.h
> +++ b/dir.h
> @@ -503,7 +503,7 @@ static inline int ce_path_match(const struct index_state *istate,
>                                 char *seen)
>  {
>         return match_pathspec(istate, pathspec, ce->name, ce_namelen(ce), 0, seen,
> -                             S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
> +                             S_ISSPARSEDIR(ce) || S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));

I think this hunk becomes unnecessary if you use ce_mode = 040000 for
sparse directory entries.

>  }
>
>  static inline int dir_path_match(const struct index_state *istate,
> diff --git a/preload-index.c b/preload-index.c
> index ed6eaa47388..323fc8c5100 100644
> --- a/preload-index.c
> +++ b/preload-index.c
> @@ -54,6 +54,8 @@ static void *preload_thread(void *_data)
>                         continue;
>                 if (S_ISGITLINK(ce->ce_mode))
>                         continue;
> +               if (S_ISSPARSEDIR(ce))
> +                       continue;
>                 if (ce_uptodate(ce))
>                         continue;
>                 if (ce_skip_worktree(ce))
> diff --git a/read-cache.c b/read-cache.c
> index 65679d70d7c..ab0c2b86ec0 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -1572,6 +1572,9 @@ int refresh_index(struct index_state *istate, unsigned int flags,
>                 if (ignore_submodules && S_ISGITLINK(ce->ce_mode))
>                         continue;
>
> +               if (istate->sparse_index && S_ISSPARSEDIR(ce))
> +                       continue;
> +
>                 if (pathspec && !ce_path_match(istate, ce, pathspec, seen))
>                         filtered = 1;
>
> diff --git a/unpack-trees.c b/unpack-trees.c
> index b324eec2a5d..90644856a80 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -583,6 +583,13 @@ static void mark_ce_used(struct cache_entry *ce, struct unpack_trees_options *o)
>  {
>         ce->ce_flags |= CE_UNPACKED;
>
> +       /*
> +        * If this is a sparse directory, don't advance cache_bottom.
> +        * That will be advanced later using the cache-tree data.
> +        */
> +       if (S_ISSPARSEDIR(ce))
> +               return;

I don't grok the cache_bottom stuff -- in general, nothing specific
about your patch.  But since I don't grok that stuff, it means I don't
understand how your comment here relates; you may want to ping another
reviewer about this portion of the patch.

> +
>         if (o->cache_bottom < o->src_index->cache_nr &&
>             o->src_index->cache[o->cache_bottom] == ce) {
>                 int bottom = o->cache_bottom;
> @@ -980,6 +987,9 @@ static int do_compare_entry(const struct cache_entry *ce,
>         ce_len -= pathlen;
>         ce_name = ce->name + pathlen;
>
> +       /* remove directory separator if a sparse directory entry */
> +       if (S_ISSPARSEDIR(ce))
> +               ce_len--;

Here's where your comment about trailing separator comes in; makes sense.

>         return df_name_compare(ce_name, ce_len, S_IFREG, name, namelen, mode);
>  }
>
> @@ -989,6 +999,10 @@ static int compare_entry(const struct cache_entry *ce, const struct traverse_inf
>         if (cmp)
>                 return cmp;
>
> +       /* If ce is a sparse directory, then allow equality here. */
> +       if (S_ISSPARSEDIR(ce))
> +               return 0;
> +

This seems surprising to me.  Is there a chance you are comparing
sparse directory A with sparse directory B and you return with
equality?  Or sparse_directory A with regular file B?  Do the callers
still do the right thing?  If your code change here is right, it seems
like it deserves an extra comment either in the code or the commit
message.

>         /*
>          * Even if the beginning compared identically, the ce should
>          * compare as bigger than a directory leading up to it!
> @@ -1239,6 +1253,7 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>         struct cache_entry *src[MAX_UNPACK_TREES + 1] = { NULL, };
>         struct unpack_trees_options *o = info->data;
>         const struct name_entry *p = names;
> +       unsigned recurse = 1;
>
>         /* Find first entry with a real name (we could use "mask" too) */
>         while (!p->mode)
> @@ -1280,12 +1295,16 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>                                         }
>                                 }
>                                 src[0] = ce;
> +
> +                               if (S_ISSPARSEDIR(ce))
> +                                       recurse = 0;
>                         }
>                         break;
>                 }
>         }
>
> -       if (unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
> +       if (recurse &&
> +           unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
>                 return -1;
>
>         if (o->merge && src[0]) {
> @@ -1315,7 +1334,8 @@ static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
>                         }
>                 }
>
> -               if (traverse_trees_recursive(n, dirmask, mask & ~dirmask,
> +               if (recurse &&
> +                   traverse_trees_recursive(n, dirmask, mask & ~dirmask,
>                                              names, info) < 0)
>                         return -1;
>                 return mask;

The unpack_callback() code has some comparison to a cache-tree, but
I'd assume that you'd need to update cache-tree.c somewhat to take
advantage of these sparse directory entries.  Am I wrong, and you just
get cache-tree.c working with sparse directory entries for free?  Or
is this something coming in a later patch?
Derrick Stolee Feb. 9, 2021, 5:23 p.m. UTC | #2
On 2/1/2021 3:50 PM, Elijah Newren wrote:
> On Mon, Jan 25, 2021 at 9:42 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>> It is important to be careful about the trailing directory separator
>> that exists in the sparse directory entries but not in the subtree
>> paths.
> 
> The comment makes me wonder if leaving the trailing directory
> separator out would be better, as it'd allow direct comparisons.  Of
> course, you have a better idea of what is easier or harder based on
> this decision.  Is there any chance you have a quick list of the
> places that the code was simplified by this decision and a list of
> places like this one that were made slightly harder?

I'm going through all of your comments and making notes about areas
to fix and clean up before starting a new series for full review.

This question of the trailing slash is important, and I will take
particular care about answering it as I rework the series. However,
the questions in this patch poke at the right places...

>> +       /* remove directory separator if a sparse directory entry */
>> +       if (S_ISSPARSEDIR(ce))
>> +               ce_len--;
> 
> Here's where your comment about trailing separator comes in; makes sense.
> 
>>         return df_name_compare(ce_name, ce_len, S_IFREG, name, namelen, mode);
>>  }
>>
>> @@ -989,6 +999,10 @@ static int compare_entry(const struct cache_entry *ce, const struct traverse_inf
>>         if (cmp)
>>                 return cmp;
>>
>> +       /* If ce is a sparse directory, then allow equality here. */
>> +       if (S_ISSPARSEDIR(ce))
>> +               return 0;
>> +
> 
> This seems surprising to me.  Is there a chance you are comparing
> sparse directory A with sparse directory B and you return with
> equality?  Or sparse_directory A with regular file B?  Do the callers
> still do the right thing?  If your code change here is right, it seems
> like it deserves an extra comment either in the code or the commit
> message.

Sometimes a caller is asking for the first index entry corresponding
to a directory. In these cases, the input could be "A/B/C/". We want
to ensure that a sparse directory entry corresponding exactly to that
directory is correctly matched. If we place "A/B/C" in the index instead,
this search becomes more complicated (I think; I will justify this more
after thinking about it). 

At this point in time, we are just saying "We found the entry with
equal path value!" and not failing with the check in the rest of the
method:

	/*
	 * Even if the beginning compared identically, the ce should
	 * compare as bigger than a directory leading up to it!
	 */
	return ce_namelen(ce) > traverse_path_len(info, tree_entry_len(n));

>> -               if (traverse_trees_recursive(n, dirmask, mask & ~dirmask,
>> +               if (recurse &&
>> +                   traverse_trees_recursive(n, dirmask, mask & ~dirmask,
>>                                              names, info) < 0)
>>                         return -1;
>>                 return mask;
> 
> The unpack_callback() code has some comparison to a cache-tree, but
> I'd assume that you'd need to update cache-tree.c somewhat to take
> advantage of these sparse directory entries.  Am I wrong, and you just
> get cache-tree.c working with sparse directory entries for free?  Or
> is this something coming in a later patch?

In the RFC, I integrate the cache-tree with the sparse-index at the
very end. I will move that integration to be much earlier in the next
submission, so it becomes part of the format discussion.

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/dir.h b/dir.h
index facfae47402..300305ec335 100644
--- a/dir.h
+++ b/dir.h
@@ -503,7 +503,7 @@  static inline int ce_path_match(const struct index_state *istate,
 				char *seen)
 {
 	return match_pathspec(istate, pathspec, ce->name, ce_namelen(ce), 0, seen,
-			      S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
+			      S_ISSPARSEDIR(ce) || S_ISDIR(ce->ce_mode) || S_ISGITLINK(ce->ce_mode));
 }
 
 static inline int dir_path_match(const struct index_state *istate,
diff --git a/preload-index.c b/preload-index.c
index ed6eaa47388..323fc8c5100 100644
--- a/preload-index.c
+++ b/preload-index.c
@@ -54,6 +54,8 @@  static void *preload_thread(void *_data)
 			continue;
 		if (S_ISGITLINK(ce->ce_mode))
 			continue;
+		if (S_ISSPARSEDIR(ce))
+			continue;
 		if (ce_uptodate(ce))
 			continue;
 		if (ce_skip_worktree(ce))
diff --git a/read-cache.c b/read-cache.c
index 65679d70d7c..ab0c2b86ec0 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -1572,6 +1572,9 @@  int refresh_index(struct index_state *istate, unsigned int flags,
 		if (ignore_submodules && S_ISGITLINK(ce->ce_mode))
 			continue;
 
+		if (istate->sparse_index && S_ISSPARSEDIR(ce))
+			continue;
+
 		if (pathspec && !ce_path_match(istate, ce, pathspec, seen))
 			filtered = 1;
 
diff --git a/unpack-trees.c b/unpack-trees.c
index b324eec2a5d..90644856a80 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -583,6 +583,13 @@  static void mark_ce_used(struct cache_entry *ce, struct unpack_trees_options *o)
 {
 	ce->ce_flags |= CE_UNPACKED;
 
+	/*
+	 * If this is a sparse directory, don't advance cache_bottom.
+	 * That will be advanced later using the cache-tree data.
+	 */
+	if (S_ISSPARSEDIR(ce))
+		return;
+
 	if (o->cache_bottom < o->src_index->cache_nr &&
 	    o->src_index->cache[o->cache_bottom] == ce) {
 		int bottom = o->cache_bottom;
@@ -980,6 +987,9 @@  static int do_compare_entry(const struct cache_entry *ce,
 	ce_len -= pathlen;
 	ce_name = ce->name + pathlen;
 
+	/* remove directory separator if a sparse directory entry */
+	if (S_ISSPARSEDIR(ce))
+		ce_len--;
 	return df_name_compare(ce_name, ce_len, S_IFREG, name, namelen, mode);
 }
 
@@ -989,6 +999,10 @@  static int compare_entry(const struct cache_entry *ce, const struct traverse_inf
 	if (cmp)
 		return cmp;
 
+	/* If ce is a sparse directory, then allow equality here. */
+	if (S_ISSPARSEDIR(ce))
+		return 0;
+
 	/*
 	 * Even if the beginning compared identically, the ce should
 	 * compare as bigger than a directory leading up to it!
@@ -1239,6 +1253,7 @@  static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
 	struct cache_entry *src[MAX_UNPACK_TREES + 1] = { NULL, };
 	struct unpack_trees_options *o = info->data;
 	const struct name_entry *p = names;
+	unsigned recurse = 1;
 
 	/* Find first entry with a real name (we could use "mask" too) */
 	while (!p->mode)
@@ -1280,12 +1295,16 @@  static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
 					}
 				}
 				src[0] = ce;
+
+				if (S_ISSPARSEDIR(ce))
+					recurse = 0;
 			}
 			break;
 		}
 	}
 
-	if (unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
+	if (recurse &&
+	    unpack_nondirectories(n, mask, dirmask, src, names, info) < 0)
 		return -1;
 
 	if (o->merge && src[0]) {
@@ -1315,7 +1334,8 @@  static int unpack_callback(int n, unsigned long mask, unsigned long dirmask, str
 			}
 		}
 
-		if (traverse_trees_recursive(n, dirmask, mask & ~dirmask,
+		if (recurse &&
+		    traverse_trees_recursive(n, dirmask, mask & ~dirmask,
 					     names, info) < 0)
 			return -1;
 		return mask;