diff mbox series

[v2,1/5] sparse-checkout: refactor skip worktree retry logic

Message ID 93d0baed0b0f435e5656cef04cf103b5e2e0f41a.1719412192.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series sparse-index: improve clear_skip_worktree_from_present_files() | expand

Commit Message

Derrick Stolee June 26, 2024, 2:29 p.m. UTC
From: Derrick Stolee <stolee@gmail.com>

The clear_skip_worktree_from_present_files() method was introduced in
af6a51875a (repo_read_index: clear SKIP_WORKTREE bit from files present
in worktree, 2022-01-14) to help cases where sparse-checkout is enabled
but some paths outside of the sparse-checkout also exist on disk.  This
operation can be slow as it needs to check path existence in a way not
stored in the index, so caching was introduced in d79d299352 (Accelerate
clear_skip_worktree_from_present_files() by caching, 2022-01-14).

If users are having trouble with the performance of this operation and
don't care about paths outside of the sparse-checkout, they can disable
them using the sparse.expectFilesOutsideOfPatterns config option
introduced in ecc7c8841d (repo_read_index: add config to expect files
outside sparse patterns, 2022-02-25).

This check is particularly confusing in the presence of a sparse index,
as a sparse tree entry corresponding to an existing directory must first
be expanded to a full index before examining the paths within. This is
currently implemented using a 'goto' and a boolean variable to ensure we
restart only once.

Even with that caching, it was noticed that this could take a long time
to execute. 89aaab11a3 (index: add trace2 region for clear skip
worktree, 2022-11-03) introduced trace2 regions to measure this time.
Further, the way the loop repeats itself was slightly confusing and
prone to breakage, so a BUG() statement was added in 8c7abdc596 (index:
raise a bug if the index is materialised more than once, 2022-11-03) to
be sure that the second run of the loop does not hit any sparse trees.

One thing that can be confusing about the current setup is that the
trace2 regions nest and it is not clear that a second loop is running
after a sparse index is expanded. Here is an example of what the regions
look like in a typical case:

| region_enter | ... | label:clear_skip_worktree_from_present_files
| region_enter | ... | ..label:update
| region_leave | ... | ..label:update
| region_enter | ... | ..label:ensure_full_index
| region_enter | ... | ....label:update
| region_leave | ... | ....label:update
| region_leave | ... | ..label:ensure_full_index
| data         | ... | ..sparse_path_count:1
| data         | ... | ..sparse_path_count_full:269538
| region_leave | ... | label:clear_skip_worktree_from_present_files

One thing that is particularly difficult to understand about these
regions is that most of the time is spent between the close of the
ensure_full_index region and the reporting of the end data. This is
because of the restart of the loop being within the same region as the
first iteration of the loop.

This change refactors the method into two separate methods that are
traced separately. This will be more important later when we change
other features of the methods, but for now the only functional change is
the difference in the structure of the trace regions.

After this change, the same telemetry section is split into three
distinct chunks:

| region_enter | ... | label:clear_skip_worktree_from_present_files_sparse
| data         | ... | ..sparse_path_count:1
| region_leave | ... | label:clear_skip_worktree_from_present_files_sparse
| region_enter | ... | label:update
| region_leave | ... | label:update
| region_enter | ... | label:ensure_full_index
| region_enter | ... | ..label:update
| region_leave | ... | ..label:update
| region_leave | ... | label:ensure_full_index
| region_enter | ... | label:clear_skip_worktree_from_present_files_full
| data         | ... | ..full_path_count:269538
| region_leave | ... | label:clear_skip_worktree_from_present_files_full

Here, we see the sparse loop terminating early with its first sparse
path being a sparse directory containing a file. Then, that loop's
region terminates before ensure_full_index begins (in this case, the
cache-tree must also be computed). Then, _after_ the index is expanded,
the full loop begins with its own region.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 sparse-index.c | 77 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 24 deletions(-)

Comments

Junio C Hamano June 27, 2024, 8:59 p.m. UTC | #1
"Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:

> -void clear_skip_worktree_from_present_files(struct index_state *istate)
> +static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate)
>  {
>  	const char *last_dirname = NULL;
>  	size_t dir_len = 0;
>  	int dir_found = 1;
>  
> -	int i;
> -	int path_count[2] = {0, 0};
> -	int restarted = 0;
> +	int path_count = 0;
> +	int to_restart = 0;
>  
> -	if (!core_apply_sparse_checkout ||
> -	    sparse_expect_files_outside_of_patterns)
> -		return;
> -
> -	trace2_region_enter("index", "clear_skip_worktree_from_present_files",
> +	trace2_region_enter("index", "clear_skip_worktree_from_present_files_sparse",
>  			    istate->repo);
> -restart:
> -	for (i = 0; i < istate->cache_nr; i++) {
> +	for (int i = 0; i < istate->cache_nr; i++) {
>  		struct cache_entry *ce = istate->cache[i];
>  
>  		if (ce_skip_worktree(ce)) {
> -			path_count[restarted]++;
> +			path_count++;
>  			if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) {
>  				if (S_ISSPARSEDIR(ce->ce_mode)) {
> -					if (restarted)
> -						BUG("ensure-full-index did not fully flatten?");
> -					ensure_full_index(istate);
> -					restarted = 1;
> -					goto restart;
> +					to_restart = 1;
> +					break;
>  				}
>  				ce->ce_flags &= ~CE_SKIP_WORKTREE;
>  			}
>  		}
>  	}

Both original and the rewritten code shares one trait, which is that
it goes from the beginning to check all paths that are marked with
SKIP_WORKTREE bit to clear CE_SKIP_WORKTREE bits from them, until
they find a S_ISSPARSEDIR entry and lazily call ensure_full_index(),
but then when retrying after calling ensure_full_index(), it
restarts from the beginning.  I wonder if it would help, especially
if the S_ISSPARSEDIR entry comes very late in the index (e.g., by
returning "here is where we have already checked during the first
run, until we realized that we first need to do ensure_full_index()"
to the caller from here, and then the caller tells the second phase
to restart from there), to reduce the number of calls to path_found()?
Elijah Newren June 28, 2024, 12:31 a.m. UTC | #2
On Wed, Jun 26, 2024 at 7:29 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <stolee@gmail.com>
>
[...]
> If users are having trouble with the performance of this operation and
> don't care about paths outside of the sparse-checkout, they can disable
> them using the sparse.expectFilesOutsideOfPatterns config option
> introduced in ecc7c8841d (repo_read_index: add config to expect files
> outside sparse patterns, 2022-02-25).

So, I had some heartburn with this paragraph when reading v1, but
decided to focus on other stuff.  But it still really bugs me while
reading v2.  So...

The purpose for the sparse.expectFilesOutsideOfPatterns option is very
specifically for the virtual-filesystem usecase (Google, specifically)
where sparse files aren't meant to stay sparse but be brought to life
the instant they are accessed (via a specialized filesystem and kernel
modules and whatnot).  Using it for any other reason, such as a
workaround for performance issues here, seems risky to me and I don't
like seeing it suggested.  The "if users...don't care about paths
outside of the sparse-checkout" really doesn't do it justice.  See the
"User-facing issues" section of
https://lore.kernel.org/git/b263cc75b7d4f426fb051d2cae5ae0fabeebb9c9.1642092230.git.gitgitgadget@gmail.com/;
we're foisting all those bugs back on users if they don't have such a
specialized filesystem and turn this knob on.  And then we'll get many
bug reports that are close to impossible to track down, and virtually
impossible to fix even if we do track it down.  I had for a while
suggested numerous fixes we needed in order to make
SKIP_WORKTREE-but-actually-present files work better, which required
fixes all over the codebase and which was serving as an impediment
e.g. to Victoria's desired sparse-index changes.  We gave up on those
other efforts long ago in favor of this much cleaner and simpler
solution of just clearing the SKIP_WORKTREE bit if the file wasn't
missing.

Further, I've seen people read git.git commit messages on Stack
Overflow and make suggestions based off them, so I'm a bit worried
this paragraph as worded will end up causing active harm, when I think
it's original intent was merely to provide full context around the
history of the clear_skip_worktree_from_present_files_sparse()
function.  While this bit of information is part of the history of
this function, I'm not sure it's really all that relevant to your
series.

Could we omit this paragraph, or if you want to keep it, reword it in
some way that doesn't make it look like some tradeoff that isn't that
big of a deal for non-specialized-vfs users?
Elijah Newren June 28, 2024, 12:51 a.m. UTC | #3
On Thu, Jun 27, 2024 at 1:59 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com> writes:
>
> > -void clear_skip_worktree_from_present_files(struct index_state *istate)
> > +static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate)
> >  {
> >       const char *last_dirname = NULL;
> >       size_t dir_len = 0;
> >       int dir_found = 1;
> >
> > -     int i;
> > -     int path_count[2] = {0, 0};
> > -     int restarted = 0;
> > +     int path_count = 0;
> > +     int to_restart = 0;
> >
> > -     if (!core_apply_sparse_checkout ||
> > -         sparse_expect_files_outside_of_patterns)
> > -             return;
> > -
> > -     trace2_region_enter("index", "clear_skip_worktree_from_present_files",
> > +     trace2_region_enter("index", "clear_skip_worktree_from_present_files_sparse",
> >                           istate->repo);
> > -restart:
> > -     for (i = 0; i < istate->cache_nr; i++) {
> > +     for (int i = 0; i < istate->cache_nr; i++) {
> >               struct cache_entry *ce = istate->cache[i];
> >
> >               if (ce_skip_worktree(ce)) {
> > -                     path_count[restarted]++;
> > +                     path_count++;
> >                       if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) {
> >                               if (S_ISSPARSEDIR(ce->ce_mode)) {
> > -                                     if (restarted)
> > -                                             BUG("ensure-full-index did not fully flatten?");
> > -                                     ensure_full_index(istate);
> > -                                     restarted = 1;
> > -                                     goto restart;
> > +                                     to_restart = 1;
> > +                                     break;
> >                               }
> >                               ce->ce_flags &= ~CE_SKIP_WORKTREE;
> >                       }
> >               }
> >       }
>
> Both original and the rewritten code shares one trait, which is that
> it goes from the beginning to check all paths that are marked with
> SKIP_WORKTREE bit to clear CE_SKIP_WORKTREE bits from them, until
> they find a S_ISSPARSEDIR entry and lazily call ensure_full_index(),
> but then when retrying after calling ensure_full_index(), it
> restarts from the beginning.  I wonder if it would help, especially
> if the S_ISSPARSEDIR entry comes very late in the index (e.g., by

"the S_ISSPARSEDIR entry"?  Are you assuming there is only one?

> returning "here is where we have already checked during the first
> run, until we realized that we first need to do ensure_full_index()"
> to the caller from here, and then the caller tells the second phase
> to restart from there), to reduce the number of calls to path_found()?

My first assumption about how to "restart from there", was to pass the
current index, i, to the more involved check.  That might fail to be
the correct index when there are multiple S_ISSPARSEDIR entries and
the first one (or more) are actually missing from the working copy as
expected.  However, if "restart from there" is just passing the
name of the relevant directory and we do some kind of binary search to
find where the earliest entry under that directory would exist in the
expanded index, then that seems like a reasonable additional
optimization.
Derrick Stolee June 28, 2024, 1:49 a.m. UTC | #4
On 6/27/24 8:51 PM, Elijah Newren wrote:
> On Thu, Jun 27, 2024 at 1:59 PM Junio C Hamano <gitster@pobox.com> wrote:
>>
>> Both original and the rewritten code shares one trait, which is that
>> it goes from the beginning to check all paths that are marked with
>> SKIP_WORKTREE bit to clear CE_SKIP_WORKTREE bits from them, until
>> they find a S_ISSPARSEDIR entry and lazily call ensure_full_index(),
>> but then when retrying after calling ensure_full_index(), it
>> restarts from the beginning.  I wonder if it would help, especially
>> if the S_ISSPARSEDIR entry comes very late in the index (e.g., by
> 
> "the S_ISSPARSEDIR entry"?  Are you assuming there is only one?

In fact, the situation is more drastic when there are many sparse
directories and this is one late in the list. The ensure_full_index()
call could cause many new entries that should be no-ops earlier in
the lex order.

>> returning "here is where we have already checked during the first
>> run, until we realized that we first need to do ensure_full_index()"
>> to the caller from here, and then the caller tells the second phase
>> to restart from there), to reduce the number of calls to path_found()?
> 
> My first assumption about how to "restart from there", was to pass the
> current index, i, to the more involved check.  That might fail to be
> the correct index when there are multiple S_ISSPARSEDIR entries and
> the first one (or more) are actually missing from the working copy as
> expected.  However, if "restart from there" is just passing the
> name of the relevant directory and we do some kind of binary search to
> find where the earliest entry under that directory would exist in the
> expanded index, then that seems like a reasonable additional
> optimization.

Yes, you're on the right track. Requires replacing the response from
the sparse loop to be the existing cache entry position or path
instead of only a boolean "should restart" indicator.

Another option would be to explore the whole index for the sparse
directories that exist, then expand the index only to those paths.
That should be fastest, but would require using the partially-
expanded state that has so far only happened inside of the
sparse-checkout builtin.

All this to say: there are some interesting directions to go from
here. I'm willing to take some time to explore the options. Should
we consider merging this improvement now and then consider further
improvements in a follow-up series (if they are fruitful)?

Thanks,
-Stolee
Derrick Stolee June 28, 2024, 1:56 a.m. UTC | #5
On 6/27/24 8:31 PM, Elijah Newren wrote:
> On Wed, Jun 26, 2024 at 7:29 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <stolee@gmail.com>
>>
> [...]
>> If users are having trouble with the performance of this operation and
>> don't care about paths outside of the sparse-checkout, they can disable
>> them using the sparse.expectFilesOutsideOfPatterns config option
>> introduced in ecc7c8841d (repo_read_index: add config to expect files
>> outside sparse patterns, 2022-02-25).
> 
> So, I had some heartburn with this paragraph when reading v1, but
> decided to focus on other stuff.  But it still really bugs me while
> reading v2.  So...

Thanks for bringing this up with the details I've omitted from the
context of my response.

> Could we omit this paragraph, or if you want to keep it, reword it in
> some way that doesn't make it look like some tradeoff that isn't that
> big of a deal for non-specialized-vfs users?

Makes sense to delete the paragraph. I could ship a v3, or is this
a commit-message surgery you'd be willing to do on my behalf, Junio?

Thanks,
-Stolee
Junio C Hamano June 28, 2024, 5:50 a.m. UTC | #6
Elijah Newren <newren@gmail.com> writes:

>> restarts from the beginning.  I wonder if it would help, especially
>> if the S_ISSPARSEDIR entry comes very late in the index (e.g., by
>
> "the S_ISSPARSEDIR entry"?  Are you assuming there is only one?

No, but I was referring to the first one we encounter in the first
attempt loop that causes us to run the ensure_full() thing.  If the
first SPARSEDIR comes very late, there are many index entries before
that one that are already dealt with in the first attempt loop, and
for these early entries that are already handled, the second attempt
loop will do nothing but just skip.

> expected.  However, if "restart from there" is just passing the
> name of the relevant directory and we do some kind of binary search to
> find where the earliest entry under that directory would exist in the
> expanded index, then that seems like a reasonable additional
> optimization.

Perhaps.  I do not think it is known if it is worth doing, and I do
think it is better to finish this current round and then explore
further optimization opportunities on top after the dust settles.

Thanks.
diff mbox series

Patch

diff --git a/sparse-index.c b/sparse-index.c
index e48e40cae71..e0457c87fff 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -486,49 +486,78 @@  static int path_found(const char *path, const char **dirname, size_t *dir_len,
 	return 0;
 }
 
-void clear_skip_worktree_from_present_files(struct index_state *istate)
+static int clear_skip_worktree_from_present_files_sparse(struct index_state *istate)
 {
 	const char *last_dirname = NULL;
 	size_t dir_len = 0;
 	int dir_found = 1;
 
-	int i;
-	int path_count[2] = {0, 0};
-	int restarted = 0;
+	int path_count = 0;
+	int to_restart = 0;
 
-	if (!core_apply_sparse_checkout ||
-	    sparse_expect_files_outside_of_patterns)
-		return;
-
-	trace2_region_enter("index", "clear_skip_worktree_from_present_files",
+	trace2_region_enter("index", "clear_skip_worktree_from_present_files_sparse",
 			    istate->repo);
-restart:
-	for (i = 0; i < istate->cache_nr; i++) {
+	for (int i = 0; i < istate->cache_nr; i++) {
 		struct cache_entry *ce = istate->cache[i];
 
 		if (ce_skip_worktree(ce)) {
-			path_count[restarted]++;
+			path_count++;
 			if (path_found(ce->name, &last_dirname, &dir_len, &dir_found)) {
 				if (S_ISSPARSEDIR(ce->ce_mode)) {
-					if (restarted)
-						BUG("ensure-full-index did not fully flatten?");
-					ensure_full_index(istate);
-					restarted = 1;
-					goto restart;
+					to_restart = 1;
+					break;
 				}
 				ce->ce_flags &= ~CE_SKIP_WORKTREE;
 			}
 		}
 	}
 
-	if (path_count[0])
-		trace2_data_intmax("index", istate->repo,
-				   "sparse_path_count", path_count[0]);
-	if (restarted)
-		trace2_data_intmax("index", istate->repo,
-				   "sparse_path_count_full", path_count[1]);
-	trace2_region_leave("index", "clear_skip_worktree_from_present_files",
+	trace2_data_intmax("index", istate->repo,
+			   "sparse_path_count", path_count);
+	trace2_region_leave("index", "clear_skip_worktree_from_present_files_sparse",
+			    istate->repo);
+	return to_restart;
+}
+
+static void clear_skip_worktree_from_present_files_full(struct index_state *istate)
+{
+	const char *last_dirname = NULL;
+	size_t dir_len = 0;
+	int dir_found = 1;
+
+	int path_count = 0;
+
+	trace2_region_enter("index", "clear_skip_worktree_from_present_files_full",
 			    istate->repo);
+	for (int i = 0; i < istate->cache_nr; i++) {
+		struct cache_entry *ce = istate->cache[i];
+
+		if (S_ISSPARSEDIR(ce->ce_mode))
+			BUG("ensure-full-index did not fully flatten?");
+
+		if (ce_skip_worktree(ce)) {
+			path_count++;
+			if (path_found(ce->name, &last_dirname, &dir_len, &dir_found))
+				ce->ce_flags &= ~CE_SKIP_WORKTREE;
+		}
+	}
+
+	trace2_data_intmax("index", istate->repo,
+			   "full_path_count", path_count);
+	trace2_region_leave("index", "clear_skip_worktree_from_present_files_full",
+			    istate->repo);
+}
+
+void clear_skip_worktree_from_present_files(struct index_state *istate)
+{
+	if (!core_apply_sparse_checkout ||
+	    sparse_expect_files_outside_of_patterns)
+		return;
+
+	if (clear_skip_worktree_from_present_files_sparse(istate)) {
+		ensure_full_index(istate);
+		clear_skip_worktree_from_present_files_full(istate);
+	}
 }
 
 /*