diff mbox series

[v6,20/28] fsmonitor: optimize processing of directory events

Message ID 48af0813deccab924d3591b4df025b17bf309260.1650662994.git.gitgitgadget@gmail.com (mailing list archive)
State Accepted
Commit 95a4e78a746be2b87972c03c82e8b1fe8fe0bea1
Headers show
Series Builtin FSMonitor Part 3 | expand

Commit Message

Jeff Hostetler April 22, 2022, 9:29 p.m. UTC
From: Jeff Hostetler <jeffhost@microsoft.com>

Teach Git to perform binary search over the cache-entries for a directory
notification and then linearly scan forward to find the immediate children.

Previously, when the FSMonitor reported a modified directory Git would
perform a linear search on the entire cache-entry array for all
entries matching that directory prefix and invalidate them.  Since the
cache-entry array is already sorted, we can use a binary search to
find the first matching entry and then only linearly walk forward and
invalidate entries until the prefix changes.

Also, the original code would invalidate anything having the same
directory prefix.  Since a directory event should only be received for
items that are immediately within the directory (and not within
sub-directories of it), only invalidate those entries and not the
whole subtree.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 fsmonitor.c | 71 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 17 deletions(-)

Comments

Johannes Schindelin May 12, 2022, 3:08 p.m. UTC | #1
Hi Jeff,

On Fri, 22 Apr 2022, Jeff Hostetler via GitGitGadget wrote:

> From: Jeff Hostetler <jeffhost@microsoft.com>
>
> Teach Git to perform binary search over the cache-entries for a directory
> notification and then linearly scan forward to find the immediate children.
>
> Previously, when the FSMonitor reported a modified directory Git would
> perform a linear search on the entire cache-entry array for all
> entries matching that directory prefix and invalidate them.  Since the
> cache-entry array is already sorted, we can use a binary search to
> find the first matching entry and then only linearly walk forward and
> invalidate entries until the prefix changes.
>
> Also, the original code would invalidate anything having the same
> directory prefix.  Since a directory event should only be received for
> items that are immediately within the directory (and not within
> sub-directories of it), only invalidate those entries and not the
> whole subtree.
>
> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> ---
>  fsmonitor.c | 71 ++++++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 54 insertions(+), 17 deletions(-)
>
> diff --git a/fsmonitor.c b/fsmonitor.c
> index 292a6742b4f..e1229c289cf 100644
> --- a/fsmonitor.c
> +++ b/fsmonitor.c
> @@ -184,30 +184,68 @@ static int query_fsmonitor_hook(struct repository *r,
>  static void fsmonitor_refresh_callback(struct index_state *istate, char *name)
>  {
>  	int i, len = strlen(name);
> -	if (name[len - 1] == '/') {
> +	int pos = index_name_pos(istate, name, len);
> +
> +	trace_printf_key(&trace_fsmonitor,
> +			 "fsmonitor_refresh_callback '%s' (pos %d)",
> +			 name, pos);
>
> +	if (name[len - 1] == '/') {
>  		/*
> -		 * TODO We should binary search to find the first path with
> -		 * TODO this directory prefix.  Then linearly update entries
> -		 * TODO while the prefix matches.  Taking care to search without
> -		 * TODO the trailing slash -- because '/' sorts after a few
> -		 * TODO interesting special chars, like '.' and ' '.
> +		 * The daemon can decorate directory events, such as
> +		 * moves or renames, with a trailing slash if the OS
> +		 * FS Event contains sufficient information, such as
> +		 * MacOS.
> +		 *
> +		 * Use this to invalidate the entire cone under that
> +		 * directory.
> +		 *
> +		 * We do not expect an exact match because the index
> +		 * does not normally contain directory entries, so we
> +		 * start at the insertion point and scan.
>  		 */
> +		if (pos < 0)
> +			pos = -pos - 1;
>
>  		/* Mark all entries for the folder invalid */
> -		for (i = 0; i < istate->cache_nr; i++) {
> -			if (istate->cache[i]->ce_flags & CE_FSMONITOR_VALID &&
> -			    starts_with(istate->cache[i]->name, name))
> -				istate->cache[i]->ce_flags &= ~CE_FSMONITOR_VALID;
> +		for (i = pos; i < istate->cache_nr; i++) {
> +			if (!starts_with(istate->cache[i]->name, name))
> +				break;
> +			istate->cache[i]->ce_flags &= ~CE_FSMONITOR_VALID;
>  		}
> -		/* Need to remove the / from the path for the untracked cache */
> +
> +		/*
> +		 * We need to remove the traling "/" from the path
> +		 * for the untracked cache.
> +		 */
>  		name[len - 1] = '\0';
> +	} else if (pos >= 0) {
> +		/*
> +		 * We have an exact match for this path and can just
> +		 * invalidate it.
> +		 */
> +		istate->cache[pos]->ce_flags &= ~CE_FSMONITOR_VALID;
>  	} else {
> -		int pos = index_name_pos(istate, name, strlen(name));
> -
> -		if (pos >= 0) {
> -			struct cache_entry *ce = istate->cache[pos];
> -			ce->ce_flags &= ~CE_FSMONITOR_VALID;
> +		/*
> +		 * The path is not a tracked file -or- it is a
> +		 * directory event on a platform that cannot
> +		 * distinguish between file and directory events in
> +		 * the event handler, such as Windows.
> +		 *
> +		 * Scan as if it is a directory and invalidate the
> +		 * cone under it.  (But remember to ignore items
> +		 * between "name" and "name/", such as "name-" and
> +		 * "name.".
> +		 */
> +		pos = -pos - 1;
> +
> +		for (i = pos; i < istate->cache_nr; i++) {
> +			if (!starts_with(istate->cache[i]->name, name))
> +				break;
> +			if ((unsigned char)istate->cache[i]->name[len] > '/')

Nice attention to detail casting `istate->cache[i]->name[len]` to
`(unsigned char)` before comparing to '/'!

> +				break;
> +			if (istate->cache[i]->name[len] == '/')
> +				istate->cache[i]->ce_flags &= ~CE_FSMONITOR_VALID;
>  		}
>  	}
>
> @@ -215,7 +253,6 @@ static void fsmonitor_refresh_callback(struct index_state *istate, char *name)
>  	 * Mark the untracked cache dirty even if it wasn't found in the index
>  	 * as it could be a new untracked file.
>  	 */
> -	trace_printf_key(&trace_fsmonitor, "fsmonitor_refresh_callback '%s'", name);

Did you mean to remove this statement in this patch? Not a big issue, but
I wonder what the rationale for it is, and since I have an inquisitive
mind, I figured I'd just ask.

Thanks,
Dscho

>  	untracked_cache_invalidate_path(istate, name, 0);
>  }
>
> --
> gitgitgadget
>
>
Jeff Hostetler May 17, 2022, 8:17 p.m. UTC | #2
On 5/12/22 11:08 AM, Johannes Schindelin wrote:
> Hi Jeff,
> 
> On Fri, 22 Apr 2022, Jeff Hostetler via GitGitGadget wrote:
> 
>> From: Jeff Hostetler <jeffhost@microsoft.com>
>>
>> Teach Git to perform binary search over the cache-entries for a directory
>> notification and then linearly scan forward to find the immediate children.
>>
[...]

>>   static void fsmonitor_refresh_callback(struct index_state *istate, char *name)
>>   {
>>   	int i, len = strlen(name);
>> -	if (name[len - 1] == '/') {
>> +	int pos = index_name_pos(istate, name, len);
>> +
>> +	trace_printf_key(&trace_fsmonitor,
>> +			 "fsmonitor_refresh_callback '%s' (pos %d)",
>> +			 name, pos);
>>
[...]

>> +	if (name[len - 1] == '/') {
[...]
>>   	}

>> @@ -215,7 +253,6 @@ static void fsmonitor_refresh_callback(struct index_state *istate, char *name)
>>   	 * Mark the untracked cache dirty even if it wasn't found in the index
>>   	 * as it could be a new untracked file.
>>   	 */
>> -	trace_printf_key(&trace_fsmonitor, "fsmonitor_refresh_callback '%s'", name);
> 
> Did you mean to remove this statement in this patch? Not a big issue, but
> I wonder what the rationale for it is, and since I have an inquisitive
> mind, I figured I'd just ask.

I just moved it to the top of the function.  That lets me see `name`
before it is modified in one of the else arms (it was helpful to see
whether the daemon sent a trailing slash or not).  And I also wanted
to see the computed value of `pos` (before the "-pos - 1" tricks).

Jeff
Johannes Schindelin May 24, 2022, 11:53 a.m. UTC | #3
Hi Jeff,

On Tue, 17 May 2022, Jeff Hostetler wrote:

> On 5/12/22 11:08 AM, Johannes Schindelin wrote:
> >
> > On Fri, 22 Apr 2022, Jeff Hostetler via GitGitGadget wrote:
> >
> > > From: Jeff Hostetler <jeffhost@microsoft.com>
> > >
> > > Teach Git to perform binary search over the cache-entries for a directory
> > > notification and then linearly scan forward to find the immediate
> > > children.
> > >
> [...]
>
> > >   static void fsmonitor_refresh_callback(struct index_state *istate, char
> > >   *name)
> > >   {
> > >   	int i, len = strlen(name);
> > > -	if (name[len - 1] == '/') {
> > > +	int pos = index_name_pos(istate, name, len);
> > > +
> > > +	trace_printf_key(&trace_fsmonitor,
> > > +			 "fsmonitor_refresh_callback '%s' (pos %d)",
> > > +			 name, pos);
> > >
> [...]
>
> > > +	if (name[len - 1] == '/') {
> [...]
> > >    }
>
> > > @@ -215,7 +253,6 @@ static void fsmonitor_refresh_callback(struct
> > > index_state *istate, char *name)
> > >     * Mark the untracked cache dirty even if it wasn't found in the index
> > >     * as it could be a new untracked file.
> > >     */
> > > -	trace_printf_key(&trace_fsmonitor, "fsmonitor_refresh_callback '%s'",
> > > name);
> >
> > Did you mean to remove this statement in this patch? Not a big issue, but
> > I wonder what the rationale for it is, and since I have an inquisitive
> > mind, I figured I'd just ask.
>
> I just moved it to the top of the function.  That lets me see `name`
> before it is modified in one of the else arms (it was helpful to see
> whether the daemon sent a trailing slash or not).  And I also wanted
> to see the computed value of `pos` (before the "-pos - 1" tricks).

Oh, I missed that it was moved rather than removed. Sorry for that!

Thank you,
Dscho
diff mbox series

Patch

diff --git a/fsmonitor.c b/fsmonitor.c
index 292a6742b4f..e1229c289cf 100644
--- a/fsmonitor.c
+++ b/fsmonitor.c
@@ -184,30 +184,68 @@  static int query_fsmonitor_hook(struct repository *r,
 static void fsmonitor_refresh_callback(struct index_state *istate, char *name)
 {
 	int i, len = strlen(name);
-	if (name[len - 1] == '/') {
+	int pos = index_name_pos(istate, name, len);
+
+	trace_printf_key(&trace_fsmonitor,
+			 "fsmonitor_refresh_callback '%s' (pos %d)",
+			 name, pos);
 
+	if (name[len - 1] == '/') {
 		/*
-		 * TODO We should binary search to find the first path with
-		 * TODO this directory prefix.  Then linearly update entries
-		 * TODO while the prefix matches.  Taking care to search without
-		 * TODO the trailing slash -- because '/' sorts after a few
-		 * TODO interesting special chars, like '.' and ' '.
+		 * The daemon can decorate directory events, such as
+		 * moves or renames, with a trailing slash if the OS
+		 * FS Event contains sufficient information, such as
+		 * MacOS.
+		 *
+		 * Use this to invalidate the entire cone under that
+		 * directory.
+		 *
+		 * We do not expect an exact match because the index
+		 * does not normally contain directory entries, so we
+		 * start at the insertion point and scan.
 		 */
+		if (pos < 0)
+			pos = -pos - 1;
 
 		/* Mark all entries for the folder invalid */
-		for (i = 0; i < istate->cache_nr; i++) {
-			if (istate->cache[i]->ce_flags & CE_FSMONITOR_VALID &&
-			    starts_with(istate->cache[i]->name, name))
-				istate->cache[i]->ce_flags &= ~CE_FSMONITOR_VALID;
+		for (i = pos; i < istate->cache_nr; i++) {
+			if (!starts_with(istate->cache[i]->name, name))
+				break;
+			istate->cache[i]->ce_flags &= ~CE_FSMONITOR_VALID;
 		}
-		/* Need to remove the / from the path for the untracked cache */
+
+		/*
+		 * We need to remove the traling "/" from the path
+		 * for the untracked cache.
+		 */
 		name[len - 1] = '\0';
+	} else if (pos >= 0) {
+		/*
+		 * We have an exact match for this path and can just
+		 * invalidate it.
+		 */
+		istate->cache[pos]->ce_flags &= ~CE_FSMONITOR_VALID;
 	} else {
-		int pos = index_name_pos(istate, name, strlen(name));
-
-		if (pos >= 0) {
-			struct cache_entry *ce = istate->cache[pos];
-			ce->ce_flags &= ~CE_FSMONITOR_VALID;
+		/*
+		 * The path is not a tracked file -or- it is a
+		 * directory event on a platform that cannot
+		 * distinguish between file and directory events in
+		 * the event handler, such as Windows.
+		 *
+		 * Scan as if it is a directory and invalidate the
+		 * cone under it.  (But remember to ignore items
+		 * between "name" and "name/", such as "name-" and
+		 * "name.".
+		 */
+		pos = -pos - 1;
+
+		for (i = pos; i < istate->cache_nr; i++) {
+			if (!starts_with(istate->cache[i]->name, name))
+				break;
+			if ((unsigned char)istate->cache[i]->name[len] > '/')
+				break;
+			if (istate->cache[i]->name[len] == '/')
+				istate->cache[i]->ce_flags &= ~CE_FSMONITOR_VALID;
 		}
 	}
 
@@ -215,7 +253,6 @@  static void fsmonitor_refresh_callback(struct index_state *istate, char *name)
 	 * Mark the untracked cache dirty even if it wasn't found in the index
 	 * as it could be a new untracked file.
 	 */
-	trace_printf_key(&trace_fsmonitor, "fsmonitor_refresh_callback '%s'", name);
 	untracked_cache_invalidate_path(istate, name, 0);
 }