Message ID | 330e0c0977480d0506801854fcaa6c9f2b014569.1633641339.git.gitgitgadget@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Sparse Index: integrate with reset | expand |
Hi Victoria On 07/10/2021 22:15, Victoria Dye via GitGitGadget wrote: > From: Victoria Dye <vdye@github.com> > > Remove `ensure_full_index` guard on `prime_cache_tree` and update > `prime_cache_tree_rec` to correctly reconstruct sparse directory entries in > the cache tree. While processing a tree's entries, `prime_cache_tree_rec` > must determine whether a directory entry is sparse or not by searching for > it in the index (*without* expanding the index). If a matching sparse > directory index entry is found, no subtrees are added to the cache tree > entry and the entry count is set to 1 (representing the sparse directory > itself). Otherwise, the tree is assumed to not be sparse and its subtrees > are recursively added to the cache tree. I was looking at the callers to prime_cache_tree() this morning and would like to suggest an alternative approach - just delete prime_cache_tree() and all of its callers! As far as I can see it is only ever called after a successful call to unpack_trees() and since 52fca2184d ("unpack-trees: populate cache-tree on successful merge", 2015-07-28) unpack_trees() updates the cache tree for the caller. All the call sites are pretty obvious apart from the one in t/help/test-fast-rebase.c where unpack_trees() is called by merge_switch_to_result() Best Wishes Phillip > Helped-by: Elijah Newren <newren@gmail.com> > Signed-off-by: Victoria Dye <vdye@github.com> > --- > cache-tree.c | 47 ++++++++++++++++++++++-- > cache.h | 10 +++++ > read-cache.c | 27 ++++++++++---- > t/t1092-sparse-checkout-compatibility.sh | 15 +++++++- > 4 files changed, 86 insertions(+), 13 deletions(-) > > diff --git a/cache-tree.c b/cache-tree.c > index 9be19c85b66..2866101052c 100644 > --- a/cache-tree.c > +++ b/cache-tree.c > @@ -740,15 +740,26 @@ out: > return ret; > } > > +static void prime_cache_tree_sparse_dir(struct cache_tree *it, > + struct tree *tree) > +{ > + > + oidcpy(&it->oid, &tree->object.oid); > + it->entry_count = 1; > +} > + > static void prime_cache_tree_rec(struct repository *r, > struct cache_tree *it, > - struct tree *tree) > + struct tree *tree, > + struct strbuf *tree_path) > { > struct tree_desc desc; > struct name_entry entry; > int cnt; > + int base_path_len = tree_path->len; > > oidcpy(&it->oid, &tree->object.oid); > + > init_tree_desc(&desc, tree->buffer, tree->size); > cnt = 0; > while (tree_entry(&desc, &entry)) { > @@ -757,14 +768,40 @@ static void prime_cache_tree_rec(struct repository *r, > else { > struct cache_tree_sub *sub; > struct tree *subtree = lookup_tree(r, &entry.oid); > + > if (!subtree->object.parsed) > parse_tree(subtree); > sub = cache_tree_sub(it, entry.path); > sub->cache_tree = cache_tree(); > - prime_cache_tree_rec(r, sub->cache_tree, subtree); > + > + /* > + * Recursively-constructed subtree path is only needed when working > + * in a sparse index (where it's used to determine whether the > + * subtree is a sparse directory in the index). > + */ > + if (r->index->sparse_index) { > + strbuf_setlen(tree_path, base_path_len); > + strbuf_grow(tree_path, base_path_len + entry.pathlen + 1); > + strbuf_add(tree_path, entry.path, entry.pathlen); > + strbuf_addch(tree_path, '/'); > + } > + > + /* > + * If a sparse index is in use, the directory being processed may be > + * sparse. To confirm that, we can check whether an entry with that > + * exact name exists in the index. If it does, the created subtree > + * should be sparse. Otherwise, cache tree expansion should continue > + * as normal. > + */ > + if (r->index->sparse_index && > + index_entry_exists(r->index, tree_path->buf, tree_path->len)) > + prime_cache_tree_sparse_dir(sub->cache_tree, subtree); > + else > + prime_cache_tree_rec(r, sub->cache_tree, subtree, tree_path); > cnt += sub->cache_tree->entry_count; > } > } > + > it->entry_count = cnt; > } > > @@ -772,12 +809,14 @@ void prime_cache_tree(struct repository *r, > struct index_state *istate, > struct tree *tree) > { > + struct strbuf tree_path = STRBUF_INIT; > + > trace2_region_enter("cache-tree", "prime_cache_tree", the_repository); > cache_tree_free(&istate->cache_tree); > istate->cache_tree = cache_tree(); > > - ensure_full_index(istate); > - prime_cache_tree_rec(r, istate->cache_tree, tree); > + prime_cache_tree_rec(r, istate->cache_tree, tree, &tree_path); > + strbuf_release(&tree_path); > istate->cache_changed |= CACHE_TREE_CHANGED; > trace2_region_leave("cache-tree", "prime_cache_tree", the_repository); > } > diff --git a/cache.h b/cache.h > index f6295f3b048..1d3e4665562 100644 > --- a/cache.h > +++ b/cache.h > @@ -816,6 +816,16 @@ struct cache_entry *index_file_exists(struct index_state *istate, const char *na > */ > int index_name_pos(struct index_state *, const char *name, int namelen); > > +/* > + * Determines whether an entry with the given name exists within the > + * given index. The return value is 1 if an exact match is found, otherwise > + * it is 0. Note that, unlike index_name_pos, this function does not expand > + * the index if it is sparse. If an item exists within the full index but it > + * is contained within a sparse directory (and not in the sparse index), 0 is > + * returned. > + */ > +int index_entry_exists(struct index_state *, const char *name, int namelen); > + > /* > * Some functions return the negative complement of an insert position when a > * precise match was not found but a position was found where the entry would > diff --git a/read-cache.c b/read-cache.c > index f5d4385c408..c079ece981a 100644 > --- a/read-cache.c > +++ b/read-cache.c > @@ -68,6 +68,11 @@ > */ > #define CACHE_ENTRY_PATH_LENGTH 80 > > +enum index_search_mode { > + NO_EXPAND_SPARSE = 0, > + EXPAND_SPARSE = 1 > +}; > + > static inline struct cache_entry *mem_pool__ce_alloc(struct mem_pool *mem_pool, size_t len) > { > struct cache_entry *ce; > @@ -551,7 +556,10 @@ int cache_name_stage_compare(const char *name1, int len1, int stage1, const char > return 0; > } > > -static int index_name_stage_pos(struct index_state *istate, const char *name, int namelen, int stage) > +static int index_name_stage_pos(struct index_state *istate, > + const char *name, int namelen, > + int stage, > + enum index_search_mode search_mode) > { > int first, last; > > @@ -570,7 +578,7 @@ static int index_name_stage_pos(struct index_state *istate, const char *name, in > first = next+1; > } > > - if (istate->sparse_index && > + if (search_mode == EXPAND_SPARSE && istate->sparse_index && > first > 0) { > /* Note: first <= istate->cache_nr */ > struct cache_entry *ce = istate->cache[first - 1]; > @@ -586,7 +594,7 @@ static int index_name_stage_pos(struct index_state *istate, const char *name, in > ce_namelen(ce) < namelen && > !strncmp(name, ce->name, ce_namelen(ce))) { > ensure_full_index(istate); > - return index_name_stage_pos(istate, name, namelen, stage); > + return index_name_stage_pos(istate, name, namelen, stage, search_mode); > } > } > > @@ -595,7 +603,12 @@ static int index_name_stage_pos(struct index_state *istate, const char *name, in > > int index_name_pos(struct index_state *istate, const char *name, int namelen) > { > - return index_name_stage_pos(istate, name, namelen, 0); > + return index_name_stage_pos(istate, name, namelen, 0, EXPAND_SPARSE); > +} > + > +int index_entry_exists(struct index_state *istate, const char *name, int namelen) > +{ > + return index_name_stage_pos(istate, name, namelen, 0, NO_EXPAND_SPARSE) >= 0; > } > > int remove_index_entry_at(struct index_state *istate, int pos) > @@ -1222,7 +1235,7 @@ static int has_dir_name(struct index_state *istate, > */ > } > > - pos = index_name_stage_pos(istate, name, len, stage); > + pos = index_name_stage_pos(istate, name, len, stage, EXPAND_SPARSE); > if (pos >= 0) { > /* > * Found one, but not so fast. This could > @@ -1322,7 +1335,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e > strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) > pos = index_pos_to_insert_pos(istate->cache_nr); > else > - pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); > + pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce), EXPAND_SPARSE); > > /* existing match? Just replace it. */ > if (pos >= 0) { > @@ -1357,7 +1370,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e > if (!ok_to_replace) > return error(_("'%s' appears as both a file and as a directory"), > ce->name); > - pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); > + pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce), EXPAND_SPARSE); > pos = -pos-1; > } > return pos + 1; > diff --git a/t/t1092-sparse-checkout-compatibility.sh b/t/t1092-sparse-checkout-compatibility.sh > index 875cdcb0495..4ac93874cb2 100755 > --- a/t/t1092-sparse-checkout-compatibility.sh > +++ b/t/t1092-sparse-checkout-compatibility.sh > @@ -756,9 +756,9 @@ test_expect_success 'sparse-index is not expanded' ' > ensure_not_expanded checkout - && > ensure_not_expanded switch rename-out-to-out && > ensure_not_expanded switch - && > - git -C sparse-index reset --hard && > + ensure_not_expanded reset --hard && > ensure_not_expanded checkout rename-out-to-out -- deep/deeper1 && > - git -C sparse-index reset --hard && > + ensure_not_expanded reset --hard && > ensure_not_expanded restore -s rename-out-to-out -- deep/deeper1 && > > echo >>sparse-index/README.md && > @@ -768,6 +768,17 @@ test_expect_success 'sparse-index is not expanded' ' > echo >>sparse-index/untracked.txt && > ensure_not_expanded add . && > > + for ref in update-deep update-folder1 update-folder2 update-deep > + do > + echo >>sparse-index/README.md && > + ensure_not_expanded reset --hard $ref || return 1 > + done && > + > + ensure_not_expanded reset --hard update-deep && > + ensure_not_expanded reset --keep base && > + ensure_not_expanded reset --merge update-deep && > + ensure_not_expanded reset --hard && > + > ensure_not_expanded checkout -f update-deep && > test_config -C sparse-index pull.twohead ort && > ( >
Phillip Wood wrote: > Hi Victoria > > On 07/10/2021 22:15, Victoria Dye via GitGitGadget wrote: >> From: Victoria Dye <vdye@github.com> >> >> Remove `ensure_full_index` guard on `prime_cache_tree` and update >> `prime_cache_tree_rec` to correctly reconstruct sparse directory entries in >> the cache tree. While processing a tree's entries, `prime_cache_tree_rec` >> must determine whether a directory entry is sparse or not by searching for >> it in the index (*without* expanding the index). If a matching sparse >> directory index entry is found, no subtrees are added to the cache tree >> entry and the entry count is set to 1 (representing the sparse directory >> itself). Otherwise, the tree is assumed to not be sparse and its subtrees >> are recursively added to the cache tree. > > I was looking at the callers to prime_cache_tree() this morning and would like to suggest an alternative approach - just delete prime_cache_tree() and all of its callers! As far as I can see it is only ever called after a successful call to unpack_trees() and since 52fca2184d ("unpack-trees: populate cache-tree on successful merge", 2015-07-28) unpack_trees() updates the cache tree for the caller. All the call sites are pretty obvious apart from the one in t/help/test-fast-rebase.c where unpack_trees() is called by merge_switch_to_result() > It looks like `prime_cache_tree` can be removed mostly without issue, but it causes the two last tests in `t4058-diff-duplicates.sh` to fail. Those tests document failure cases when dealing with duplicate tree entries [1], and it looks like `prime_cache_tree` was creating the appearance of a fully-reset index but was still leaving it in a state where subsequent operations could fail. I'm inclined to say the solution here would be to update the tests to document the "new" failure behavior and proceed with removing `prime_cache_tree`, because: * the test using `git reset --hard` disables `GIT_TEST_CHECK_CACHE_TREE`, indicating that `prime_cache_tree` already wasn't behaving correctly * attempting to fix the overarching issues with duplicate tree entries will substantially delay this patch series * a duplicate entry fix is largely unrelated to the intended scope of the series Another option would be to leave `prime_cache_tree` as it is, but with it being apparently useless outside of mostly-broken use cases in `t4058`, it seems like a waste to keep it around. [1] ac14de13b2 (t4058: explore duplicate tree entry handling in a bit more detail, 2020-12-11)
Victoria Dye <vdye@github.com> writes: > Phillip Wood wrote: >> I was looking at the callers to prime_cache_tree() this morning >> and would like to suggest an alternative approach - just delete >> prime_cache_tree() and all of its callers! Do you mean the calls added by new patches without understanding what they are doing, or all calls to it? Every time you update a path in the index from the working tree (e.g. "git add") and other sources, the directory in the cache-tree that includes the path is invalidated, and the surviving subtrees of cache-tree is used to speed up writing the index as a tree object, doing "diff-index --cached" (hence "git status"), etc. So over time, the cache-tree "degrades" as you muck with the index entries. When you write out the index as a tree, we by definition have to know the object names of all the tree objects that correspond to each directory in the index. A fully valid cache-tree is saved when it happens, so the above process can start over. There are cases other than "git write-tree" that we can cheaply learn the object names of all the tree objects that correspond to each directory in the index. When we read the index from an existing tree object, we know which tree (and its subtrees) we populated the index from, so we can salvage a degraded cache-tree. "reset --hard" and "reset --mixed" may be good opportunities, so is "checkout <branch>" that starts from a clean index. And cache tree priming is a mechanism to take advantage of such an opportunity. The cache-tree does not have to be primed and all you lose is performance, so priming can be removed mostly "without an issue", if you are not paying attention to cache-tree degradation. Priming with incorrect data, however, would leave permanent damage by writing a wrong tree via "git write-tree" (hence "git commit") and showing a wrong diff via "git diff-index [--cached]" (hence "git status" and probably "git add -- <pathspec>"), so not priming is safer than priming incorrectly. HTH.
On 08/10/2021 19:31, Junio C Hamano wrote: > Victoria Dye <vdye@github.com> writes: > >> Phillip Wood wrote: > >>> I was looking at the callers to prime_cache_tree() this morning >>> and would like to suggest an alternative approach - just delete >>> prime_cache_tree() and all of its callers! > > Do you mean the calls added by new patches without understanding > what they are doing, or all calls to it? I mean all calls to prime_cache_tree() after having understood (or at least thinking that I understand) what they are doing. As I tried to explain in the part of my message that you have cut (a) a successful call to unpack_trees() updates the cache tree (b) all the existing calls to prime_cache_tree() follow a successful call to unpack_trees() and nothing touches in index in between the call to unpack_trees() and prime_cache_tree(). Maybe I've misunderstood something but that leads me believe those calls can be removed without degrading performance. Best Wishes Phillip > Every time you update a path in the index from the working tree > (e.g. "git add") and other sources, the directory in the cache-tree > that includes the path is invalidated, and the surviving subtrees of > cache-tree is used to speed up writing the index as a tree object, > doing "diff-index --cached" (hence "git status"), etc. So over > time, the cache-tree "degrades" as you muck with the index entries. > > When you write out the index as a tree, we by definition have to > know the object names of all the tree objects that correspond to > each directory in the index. A fully valid cache-tree is saved when > it happens, so the above process can start over. > > There are cases other than "git write-tree" that we can cheaply > learn the object names of all the tree objects that correspond to > each directory in the index. When we read the index from an > existing tree object, we know which tree (and its subtrees) we > populated the index from, so we can salvage a degraded cache-tree. > > "reset --hard" and "reset --mixed" may be good opportunities, so is > "checkout <branch>" that starts from a clean index. And cache tree > priming is a mechanism to take advantage of such an opportunity. > > The cache-tree does not have to be primed and all you lose is > performance, so priming can be removed mostly "without an issue", if > you are not paying attention to cache-tree degradation. Priming > with incorrect data, however, would leave permanent damage by > writing a wrong tree via "git write-tree" (hence "git commit") and > showing a wrong diff via "git diff-index [--cached]" (hence "git > status" and probably "git add -- <pathspec>"), so not priming is > safer than priming incorrectly. > > HTH. >
Phillip Wood <phillip.wood123@gmail.com> writes: > On 08/10/2021 19:31, Junio C Hamano wrote: >> Victoria Dye <vdye@github.com> writes: >> >>> Phillip Wood wrote: >> >>>> I was looking at the callers to prime_cache_tree() this morning >>>> and would like to suggest an alternative approach - just delete >>>> prime_cache_tree() and all of its callers! >> Do you mean the calls added by new patches without understanding >> what they are doing, or all calls to it? > > I mean all calls to prime_cache_tree() after having understood (or at > least thinking that I understand) what they are doing. Sorry, my statement was confusingly written. I meant "calls added by new patches, written by those who do not understand what prime_cache_tree() calls are doing", but after re-reading it, I think it could be taken to be referring to "you may be commenting without understanding what prime_cache_tree() calls are doing", which wasn't my intention. > (a) a successful call to unpack_trees() updates the cache tree > > (b) all the existing calls to prime_cache_tree() follow a successful > call to unpack_trees() and nothing touches in index in between the > call to unpack_trees() and prime_cache_tree(). Ahh, OK. I think we originally avoided calling cache_tree_update() lightly (because it is essentially a "write-tree", a fairly heavy-weight operation, without I/O) and instead relied on prime_cache_tree() to get degraded cache-tree back into freshness. What I forgot was that 52fca218 (unpack-trees: populate cache-tree on successful merge, 2015-07-28) added cache_tree_update() there at the end of unpack_trees(). The commit covers quite a wide range of operations---the log message says "merge", but in fact anything that uses unpack_trees() including branch switching and the resetting of the index are affected, and they cause a full reconstruction of the cache tree by calling cache_tree_update(). For most callers of prime_cache_tree(), like the ones in "git read-tree" and "git reset", it is immediately obvious that we just read from the same tree, and we should have everything from the tree and nothing else in the resulting index, so it is clear that the prime_cache_tree() call is recreating the same cache-tree information that we already should have computed ourselves, and these calls can go (or if "prime" is still cheaper than "update", these callers can pass an option to tell unpack_trees() to skip the cache_tree_update() call, because they will call "prime" immediately after). For other callers it is not immediately obvious, but I trust you are correctly reading the code ;-) Thanks.
Junio C Hamano wrote: > For most callers of prime_cache_tree(), like the ones in "git > read-tree" and "git reset", it is immediately obvious that we just > read from the same tree, and we should have everything from the tree > and nothing else in the resulting index, so it is clear that the > prime_cache_tree() call is recreating the same cache-tree > information that we already should have computed ourselves, and > these calls can go (or if "prime" is still cheaper than "update", > these callers can pass an option to tell unpack_trees() to skip the > cache_tree_update() call, because they will call "prime" immediately > after). > After some basic performance testing of `git reset [--hard]`, it's not clear whether `cache_tree_update` is definitively faster or slower than `prime_cache_tree`; more conclusive results would indicate which of the two could be skipped. I'd like to defer this to a future patch (tracking it with an internal issue so I don't forget) where I can perform a more thorough analysis across all of the commands currently using `prime_cache_tree` and update its usage accordingly.
Victoria Dye <vdye@github.com> writes: > After some basic performance testing of `git reset [--hard]`, it's not clear > whether `cache_tree_update` is definitively faster or slower than > `prime_cache_tree`; more conclusive results would indicate which of the two > could be skipped. I'd like to defer this to a future patch (tracking it with > an internal issue so I don't forget) where I can perform a more thorough > analysis across all of the commands currently using `prime_cache_tree` and > update its usage accordingly. Yup. That sounds sensible. Concentrating on correctness first is a good direction to go.
On 10/10/2021 23:03, Junio C Hamano wrote: > Phillip Wood <phillip.wood123@gmail.com> writes: > >> On 08/10/2021 19:31, Junio C Hamano wrote: >>> Victoria Dye <vdye@github.com> writes: >>> >>>> Phillip Wood wrote: >>> >>>>> I was looking at the callers to prime_cache_tree() this morning >>>>> and would like to suggest an alternative approach - just delete >>>>> prime_cache_tree() and all of its callers! >>> Do you mean the calls added by new patches without understanding >>> what they are doing, or all calls to it? >> >> I mean all calls to prime_cache_tree() after having understood (or at >> least thinking that I understand) what they are doing. > > Sorry, my statement was confusingly written. I meant "calls added > by new patches, written by those who do not understand what > prime_cache_tree() calls are doing", but after re-reading it, I > think it could be taken to be referring to "you may be commenting > without understanding what prime_cache_tree() calls are doing", > which wasn't my intention. Thanks for clarifying that, I had misunderstood what you had written. >> (a) a successful call to unpack_trees() updates the cache tree >> >> (b) all the existing calls to prime_cache_tree() follow a successful >> call to unpack_trees() and nothing touches in index in between the >> call to unpack_trees() and prime_cache_tree(). > > Ahh, OK. > > I think we originally avoided calling cache_tree_update() lightly > (because it is essentially a "write-tree", a fairly heavy-weight > operation, without I/O) and instead relied on prime_cache_tree() to > get degraded cache-tree back into freshness. > > What I forgot was that 52fca218 (unpack-trees: populate cache-tree > on successful merge, 2015-07-28) added cache_tree_update() there at > the end of unpack_trees(). The commit covers quite a wide range of > operations---the log message says "merge", but in fact anything that > uses unpack_trees() including branch switching and the resetting of > the index are affected, and they cause a full reconstruction of the > cache tree by calling cache_tree_update(). > > For most callers of prime_cache_tree(), like the ones in "git > read-tree" and "git reset", it is immediately obvious that we just > read from the same tree, and we should have everything from the tree > and nothing else in the resulting index, so it is clear that the > prime_cache_tree() call is recreating the same cache-tree > information that we already should have computed ourselves, and > these calls can go (or if "prime" is still cheaper than "update", > these callers can pass an option to tell unpack_trees() to skip the > cache_tree_update() call, because they will call "prime" immediately > after). I haven't really thought this through but could we teach unpack_trees() to call prime_cache_tree() rather than cache_tree_update() when that would be safe? For callers that use oneway_merge() merge it should always be safe I think and it might be possible to modify twoway_merge() to signal if the final tree in the index matches the second one passed to it. We could have a more general mechanism for the callback to signal if it is safe to prime the tree but I suspect the callers that are using custom callbacks are not updating the whole tree. Best Wishes Phillip > For other callers it is not immediately obvious, but I trust you are > correctly reading the code ;-) > > Thanks. > > >
On 08/10/2021 18:14, Victoria Dye wrote: > Phillip Wood wrote: >> Hi Victoria >> >> On 07/10/2021 22:15, Victoria Dye via GitGitGadget wrote: >>> From: Victoria Dye <vdye@github.com> >>> >>> Remove `ensure_full_index` guard on `prime_cache_tree` and update >>> `prime_cache_tree_rec` to correctly reconstruct sparse directory entries in >>> the cache tree. While processing a tree's entries, `prime_cache_tree_rec` >>> must determine whether a directory entry is sparse or not by searching for >>> it in the index (*without* expanding the index). If a matching sparse >>> directory index entry is found, no subtrees are added to the cache tree >>> entry and the entry count is set to 1 (representing the sparse directory >>> itself). Otherwise, the tree is assumed to not be sparse and its subtrees >>> are recursively added to the cache tree. >> >> I was looking at the callers to prime_cache_tree() this morning and would like to suggest an alternative approach - just delete prime_cache_tree() and all of its callers! As far as I can see it is only ever called after a successful call to unpack_trees() and since 52fca2184d ("unpack-trees: populate cache-tree on successful merge", 2015-07-28) unpack_trees() updates the cache tree for the caller. All the call sites are pretty obvious apart from the one in t/help/test-fast-rebase.c where unpack_trees() is called by merge_switch_to_result() >> > > It looks like `prime_cache_tree` can be removed mostly without issue, but > it causes the two last tests in `t4058-diff-duplicates.sh` to fail. Those > tests document failure cases when dealing with duplicate tree entries [1], > and it looks like `prime_cache_tree` was creating the appearance of a > fully-reset index but was still leaving it in a state where subsequent > operations could fail. > > I'm inclined to say the solution here would be to update the tests to > document the "new" failure behavior and proceed with removing > `prime_cache_tree`, because: > > * the test using `git reset --hard` disables `GIT_TEST_CHECK_CACHE_TREE`, > indicating that `prime_cache_tree` already wasn't behaving correctly > * attempting to fix the overarching issues with duplicate tree entries will > substantially delay this patch series > * a duplicate entry fix is largely unrelated to the intended scope of the > series That sounds like a good way forward Best Wishes Phillip > Another option would be to leave `prime_cache_tree` as it is, but with it > being apparently useless outside of mostly-broken use cases in `t4058`, it > seems like a waste to keep it around. > > [1] ac14de13b2 (t4058: explore duplicate tree entry handling in a bit more detail, 2020-12-11) >
Phillip Wood <phillip.wood123@gmail.com> writes: > I haven't really thought this through but could we teach > unpack_trees() to call prime_cache_tree() rather than > cache_tree_update() when that would be safe? For callers that use > oneway_merge() merge it should always be safe I think and it might be > possible to modify twoway_merge() to signal if the final tree in the > index matches the second one passed to it. We could have a more > general mechanism for the callback to signal if it is safe to prime > the tree but I suspect the callers that are using custom callbacks are > not updating the whole tree. Before going in any direction, other than doing nothing ;-), we'd need to see how expensive "prime" and "update" are. Having said that. * Your idea is quite beneficial for callers of unpack_trees() as they no longer have to decide whether they want to make a separate call to "prime". * Right now we do not seem to have a codepath that (1) populates the index entries from existing trees (not necessarily making the index in complete sync with the trees) without unpack_trees() and (2) does "prime" to fix the cache tree but such a codepath may want to do either "prime" or "update", or neither. When it knows that it damages cache-tree so badly, and that it is often expected that the user would make many other changes to the index before writing it out as a tree, it may choose not to do either. Thanks.
diff --git a/cache-tree.c b/cache-tree.c index 9be19c85b66..2866101052c 100644 --- a/cache-tree.c +++ b/cache-tree.c @@ -740,15 +740,26 @@ out: return ret; } +static void prime_cache_tree_sparse_dir(struct cache_tree *it, + struct tree *tree) +{ + + oidcpy(&it->oid, &tree->object.oid); + it->entry_count = 1; +} + static void prime_cache_tree_rec(struct repository *r, struct cache_tree *it, - struct tree *tree) + struct tree *tree, + struct strbuf *tree_path) { struct tree_desc desc; struct name_entry entry; int cnt; + int base_path_len = tree_path->len; oidcpy(&it->oid, &tree->object.oid); + init_tree_desc(&desc, tree->buffer, tree->size); cnt = 0; while (tree_entry(&desc, &entry)) { @@ -757,14 +768,40 @@ static void prime_cache_tree_rec(struct repository *r, else { struct cache_tree_sub *sub; struct tree *subtree = lookup_tree(r, &entry.oid); + if (!subtree->object.parsed) parse_tree(subtree); sub = cache_tree_sub(it, entry.path); sub->cache_tree = cache_tree(); - prime_cache_tree_rec(r, sub->cache_tree, subtree); + + /* + * Recursively-constructed subtree path is only needed when working + * in a sparse index (where it's used to determine whether the + * subtree is a sparse directory in the index). + */ + if (r->index->sparse_index) { + strbuf_setlen(tree_path, base_path_len); + strbuf_grow(tree_path, base_path_len + entry.pathlen + 1); + strbuf_add(tree_path, entry.path, entry.pathlen); + strbuf_addch(tree_path, '/'); + } + + /* + * If a sparse index is in use, the directory being processed may be + * sparse. To confirm that, we can check whether an entry with that + * exact name exists in the index. If it does, the created subtree + * should be sparse. Otherwise, cache tree expansion should continue + * as normal. + */ + if (r->index->sparse_index && + index_entry_exists(r->index, tree_path->buf, tree_path->len)) + prime_cache_tree_sparse_dir(sub->cache_tree, subtree); + else + prime_cache_tree_rec(r, sub->cache_tree, subtree, tree_path); cnt += sub->cache_tree->entry_count; } } + it->entry_count = cnt; } @@ -772,12 +809,14 @@ void prime_cache_tree(struct repository *r, struct index_state *istate, struct tree *tree) { + struct strbuf tree_path = STRBUF_INIT; + trace2_region_enter("cache-tree", "prime_cache_tree", the_repository); cache_tree_free(&istate->cache_tree); istate->cache_tree = cache_tree(); - ensure_full_index(istate); - prime_cache_tree_rec(r, istate->cache_tree, tree); + prime_cache_tree_rec(r, istate->cache_tree, tree, &tree_path); + strbuf_release(&tree_path); istate->cache_changed |= CACHE_TREE_CHANGED; trace2_region_leave("cache-tree", "prime_cache_tree", the_repository); } diff --git a/cache.h b/cache.h index f6295f3b048..1d3e4665562 100644 --- a/cache.h +++ b/cache.h @@ -816,6 +816,16 @@ struct cache_entry *index_file_exists(struct index_state *istate, const char *na */ int index_name_pos(struct index_state *, const char *name, int namelen); +/* + * Determines whether an entry with the given name exists within the + * given index. The return value is 1 if an exact match is found, otherwise + * it is 0. Note that, unlike index_name_pos, this function does not expand + * the index if it is sparse. If an item exists within the full index but it + * is contained within a sparse directory (and not in the sparse index), 0 is + * returned. + */ +int index_entry_exists(struct index_state *, const char *name, int namelen); + /* * Some functions return the negative complement of an insert position when a * precise match was not found but a position was found where the entry would diff --git a/read-cache.c b/read-cache.c index f5d4385c408..c079ece981a 100644 --- a/read-cache.c +++ b/read-cache.c @@ -68,6 +68,11 @@ */ #define CACHE_ENTRY_PATH_LENGTH 80 +enum index_search_mode { + NO_EXPAND_SPARSE = 0, + EXPAND_SPARSE = 1 +}; + static inline struct cache_entry *mem_pool__ce_alloc(struct mem_pool *mem_pool, size_t len) { struct cache_entry *ce; @@ -551,7 +556,10 @@ int cache_name_stage_compare(const char *name1, int len1, int stage1, const char return 0; } -static int index_name_stage_pos(struct index_state *istate, const char *name, int namelen, int stage) +static int index_name_stage_pos(struct index_state *istate, + const char *name, int namelen, + int stage, + enum index_search_mode search_mode) { int first, last; @@ -570,7 +578,7 @@ static int index_name_stage_pos(struct index_state *istate, const char *name, in first = next+1; } - if (istate->sparse_index && + if (search_mode == EXPAND_SPARSE && istate->sparse_index && first > 0) { /* Note: first <= istate->cache_nr */ struct cache_entry *ce = istate->cache[first - 1]; @@ -586,7 +594,7 @@ static int index_name_stage_pos(struct index_state *istate, const char *name, in ce_namelen(ce) < namelen && !strncmp(name, ce->name, ce_namelen(ce))) { ensure_full_index(istate); - return index_name_stage_pos(istate, name, namelen, stage); + return index_name_stage_pos(istate, name, namelen, stage, search_mode); } } @@ -595,7 +603,12 @@ static int index_name_stage_pos(struct index_state *istate, const char *name, in int index_name_pos(struct index_state *istate, const char *name, int namelen) { - return index_name_stage_pos(istate, name, namelen, 0); + return index_name_stage_pos(istate, name, namelen, 0, EXPAND_SPARSE); +} + +int index_entry_exists(struct index_state *istate, const char *name, int namelen) +{ + return index_name_stage_pos(istate, name, namelen, 0, NO_EXPAND_SPARSE) >= 0; } int remove_index_entry_at(struct index_state *istate, int pos) @@ -1222,7 +1235,7 @@ static int has_dir_name(struct index_state *istate, */ } - pos = index_name_stage_pos(istate, name, len, stage); + pos = index_name_stage_pos(istate, name, len, stage, EXPAND_SPARSE); if (pos >= 0) { /* * Found one, but not so fast. This could @@ -1322,7 +1335,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0) pos = index_pos_to_insert_pos(istate->cache_nr); else - pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); + pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce), EXPAND_SPARSE); /* existing match? Just replace it. */ if (pos >= 0) { @@ -1357,7 +1370,7 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e if (!ok_to_replace) return error(_("'%s' appears as both a file and as a directory"), ce->name); - pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce)); + pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce), EXPAND_SPARSE); pos = -pos-1; } return pos + 1; diff --git a/t/t1092-sparse-checkout-compatibility.sh b/t/t1092-sparse-checkout-compatibility.sh index 875cdcb0495..4ac93874cb2 100755 --- a/t/t1092-sparse-checkout-compatibility.sh +++ b/t/t1092-sparse-checkout-compatibility.sh @@ -756,9 +756,9 @@ test_expect_success 'sparse-index is not expanded' ' ensure_not_expanded checkout - && ensure_not_expanded switch rename-out-to-out && ensure_not_expanded switch - && - git -C sparse-index reset --hard && + ensure_not_expanded reset --hard && ensure_not_expanded checkout rename-out-to-out -- deep/deeper1 && - git -C sparse-index reset --hard && + ensure_not_expanded reset --hard && ensure_not_expanded restore -s rename-out-to-out -- deep/deeper1 && echo >>sparse-index/README.md && @@ -768,6 +768,17 @@ test_expect_success 'sparse-index is not expanded' ' echo >>sparse-index/untracked.txt && ensure_not_expanded add . && + for ref in update-deep update-folder1 update-folder2 update-deep + do + echo >>sparse-index/README.md && + ensure_not_expanded reset --hard $ref || return 1 + done && + + ensure_not_expanded reset --hard update-deep && + ensure_not_expanded reset --keep base && + ensure_not_expanded reset --merge update-deep && + ensure_not_expanded reset --hard && + ensure_not_expanded checkout -f update-deep && test_config -C sparse-index pull.twohead ort && (