diff mbox series

[1/8] tree-walk: report recursion counts

Message ID f727880add6e0380248279e1ad79f80762868a6c.1609356413.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Cleanups around index operations | expand

Commit Message

Derrick Stolee Dec. 30, 2020, 7:26 p.m. UTC
From: Derrick Stolee <dstolee@microsoft.com>

The traverse_trees() method recusively walks through trees, but also
prunes the tree-walk based on a callback. Some callers, such as
unpack_trees(), are quite complicated and can have wildly different
performance between two different commands.

Create constants that count these values and then report the results at
the end of a process. These counts are cumulative across multiple "root"
instances of traverse_trees(), but they provide reproducible values for
demonstrating improvements to the pruning algorithm when possible.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
---
 tree-walk.c    | 33 +++++++++++++++++++++++++++++++++
 unpack-trees.c |  1 -
 2 files changed, 33 insertions(+), 1 deletion(-)

Comments

Elijah Newren Dec. 30, 2020, 7:42 p.m. UTC | #1
On Wed, Dec 30, 2020 at 11:26 AM Derrick Stolee via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: Derrick Stolee <dstolee@microsoft.com>
>
> The traverse_trees() method recusively walks through trees, but also

recursively -- you're missing the second 'r'.

> prunes the tree-walk based on a callback. Some callers, such as
> unpack_trees(), are quite complicated and can have wildly different
> performance between two different commands.

Not sure it belongs in the commit message, but you do have me curious
what you're digging in to...

> Create constants that count these values and then report the results at
> the end of a process. These counts are cumulative across multiple "root"
> instances of traverse_trees(), but they provide reproducible values for
> demonstrating improvements to the pruning algorithm when possible.
>
> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
> ---
>  tree-walk.c    | 33 +++++++++++++++++++++++++++++++++
>  unpack-trees.c |  1 -
>  2 files changed, 33 insertions(+), 1 deletion(-)

From the subject, you are changing tree-walk.  unpack-trees depends on
tree-walk, but why is something exposed to it with this kind of
change?  Maybe I'll see when I get to it.

>
> diff --git a/tree-walk.c b/tree-walk.c
> index 0160294712b..2d6226d5f18 100644
> --- a/tree-walk.c
> +++ b/tree-walk.c
> @@ -4,6 +4,7 @@
>  #include "object-store.h"
>  #include "tree.h"
>  #include "pathspec.h"
> +#include "json-writer.h"
>
>  static const char *get_mode(const char *str, unsigned int *modep)
>  {
> @@ -167,6 +168,25 @@ int tree_entry_gently(struct tree_desc *desc, struct name_entry *entry)
>         return 1;
>  }
>
> +static int traverse_trees_atexit_registered;
> +static int traverse_trees_count;
> +static int traverse_trees_cur_depth;
> +static int traverse_trees_max_depth;
> +
> +static void trace2_traverse_trees_statistics_atexit(void)
> +{
> +       struct json_writer jw = JSON_WRITER_INIT;
> +
> +       jw_object_begin(&jw, 0);
> +       jw_object_intmax(&jw, "traverse_trees_count", traverse_trees_count);
> +       jw_object_intmax(&jw, "traverse_trees_max_depth", traverse_trees_max_depth);
> +       jw_end(&jw);
> +
> +       trace2_data_json("traverse_trees", the_repository, "statistics", &jw);
> +
> +       jw_release(&jw);
> +}

Yeah, I don't know the json_writer or trace2 stuff; might be nice to
cc Josh Steadmon or someone to review this patch.  (Or perhaps he
already reviewed internally?)

> +
>  void setup_traverse_info(struct traverse_info *info, const char *base)
>  {
>         size_t pathlen = strlen(base);
> @@ -180,6 +200,11 @@ void setup_traverse_info(struct traverse_info *info, const char *base)
>         info->namelen = pathlen;
>         if (pathlen)
>                 info->prev = &dummy;
> +
> +       if (trace2_is_enabled() && !traverse_trees_atexit_registered) {
> +               atexit(trace2_traverse_trees_statistics_atexit);
> +               traverse_trees_atexit_registered = 1;
> +       }
>  }
>
>  char *make_traverse_path(char *path, size_t pathlen,
> @@ -416,6 +441,12 @@ int traverse_trees(struct index_state *istate,
>         int interesting = 1;
>         char *traverse_path;
>
> +       traverse_trees_count++;
> +       traverse_trees_cur_depth++;
> +
> +       if (traverse_trees_cur_depth > traverse_trees_max_depth)
> +               traverse_trees_max_depth = traverse_trees_cur_depth;
> +
>         if (n >= ARRAY_SIZE(entry))
>                 BUG("traverse_trees() called with too many trees (%d)", n);
>
> @@ -515,6 +546,8 @@ int traverse_trees(struct index_state *istate,
>         free(traverse_path);
>         info->traverse_path = NULL;
>         strbuf_release(&base);
> +
> +       traverse_trees_cur_depth--;

I double-checked to see if there were any other return sites in this
function.  There aren't, which is nice and keeps this clean.

>         return error;
>  }
>
> diff --git a/unpack-trees.c b/unpack-trees.c
> index 323280dd48b..02f484604ac 100644
> --- a/unpack-trees.c
> +++ b/unpack-trees.c
> @@ -1559,7 +1559,6 @@ static void populate_from_existing_patterns(struct unpack_trees_options *o,
>         free(sparse);
>  }
>
> -

Did you mean to combine this cleanup with some other patch?  If not,
could it be put into its own patch?

>  static int verify_absent(const struct cache_entry *,
>                          enum unpack_trees_error_types,
>                          struct unpack_trees_options *);
> --
> gitgitgadget

Seems like a good change other than a few small nits.  I don't know
the json_writer/trace2 stuff, so you might want another reviewer, but
it's only a few lines and seems relatively straightforward.
Derrick Stolee Dec. 30, 2020, 7:51 p.m. UTC | #2
On 12/30/2020 2:42 PM, Elijah Newren wrote:
> On Wed, Dec 30, 2020 at 11:26 AM Derrick Stolee via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: Derrick Stolee <dstolee@microsoft.com>
>>
>> The traverse_trees() method recusively walks through trees, but also
> 
> recursively -- you're missing the second 'r'.

Thanks.

>> prunes the tree-walk based on a callback. Some callers, such as
>> unpack_trees(), are quite complicated and can have wildly different
>> performance between two different commands.
> 
> Not sure it belongs in the commit message, but you do have me curious
> what you're digging in to...

I'm still testing whether my idea will work out. Hopefully soon. ;)
 
>> Create constants that count these values and then report the results at
>> the end of a process. These counts are cumulative across multiple "root"
>> instances of traverse_trees(), but they provide reproducible values for
>> demonstrating improvements to the pruning algorithm when possible.
>>
>> Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
>> ---
>>  tree-walk.c    | 33 +++++++++++++++++++++++++++++++++
>>  unpack-trees.c |  1 -
>>  2 files changed, 33 insertions(+), 1 deletion(-)
> 
> From the subject, you are changing tree-walk.  unpack-trees depends on
> tree-walk, but why is something exposed to it with this kind of
> change?  Maybe I'll see when I get to it.

Oops. I originally reported the stats at the end of unpack_trees(), but
it seems I didn't completely back out that change.

>> diff --git a/tree-walk.c b/tree-walk.c
>> index 0160294712b..2d6226d5f18 100644
>> --- a/tree-walk.c
>> +++ b/tree-walk.c
>> @@ -4,6 +4,7 @@
>>  #include "object-store.h"
>>  #include "tree.h"
>>  #include "pathspec.h"
>> +#include "json-writer.h"
>>
>>  static const char *get_mode(const char *str, unsigned int *modep)
>>  {
>> @@ -167,6 +168,25 @@ int tree_entry_gently(struct tree_desc *desc, struct name_entry *entry)
>>         return 1;
>>  }
>>
>> +static int traverse_trees_atexit_registered;
>> +static int traverse_trees_count;
>> +static int traverse_trees_cur_depth;
>> +static int traverse_trees_max_depth;
>> +
>> +static void trace2_traverse_trees_statistics_atexit(void)
>> +{
>> +       struct json_writer jw = JSON_WRITER_INIT;
>> +
>> +       jw_object_begin(&jw, 0);
>> +       jw_object_intmax(&jw, "traverse_trees_count", traverse_trees_count);
>> +       jw_object_intmax(&jw, "traverse_trees_max_depth", traverse_trees_max_depth);
>> +       jw_end(&jw);
>> +
>> +       trace2_data_json("traverse_trees", the_repository, "statistics", &jw);
>> +
>> +       jw_release(&jw);
>> +}
> 
> Yeah, I don't know the json_writer or trace2 stuff; might be nice to
> cc Josh Steadmon or someone to review this patch.  (Or perhaps he
> already reviewed internally?)

I guess I could have pointed out that this change is modeled after
a similar statistics reporting in 42e50e78 (revision.c: add trace2
stats around Bloom filter usage, 2020-04-06).

>> +
>>  void setup_traverse_info(struct traverse_info *info, const char *base)
>>  {
>>         size_t pathlen = strlen(base);
>> @@ -180,6 +200,11 @@ void setup_traverse_info(struct traverse_info *info, const char *base)
>>         info->namelen = pathlen;
>>         if (pathlen)
>>                 info->prev = &dummy;
>> +
>> +       if (trace2_is_enabled() && !traverse_trees_atexit_registered) {
>> +               atexit(trace2_traverse_trees_statistics_atexit);
>> +               traverse_trees_atexit_registered = 1;
>> +       }
>>  }
>>
>>  char *make_traverse_path(char *path, size_t pathlen,
>> @@ -416,6 +441,12 @@ int traverse_trees(struct index_state *istate,
>>         int interesting = 1;
>>         char *traverse_path;
>>
>> +       traverse_trees_count++;
>> +       traverse_trees_cur_depth++;
>> +
>> +       if (traverse_trees_cur_depth > traverse_trees_max_depth)
>> +               traverse_trees_max_depth = traverse_trees_cur_depth;
>> +
>>         if (n >= ARRAY_SIZE(entry))
>>                 BUG("traverse_trees() called with too many trees (%d)", n);
>>
>> @@ -515,6 +546,8 @@ int traverse_trees(struct index_state *istate,
>>         free(traverse_path);
>>         info->traverse_path = NULL;
>>         strbuf_release(&base);
>> +
>> +       traverse_trees_cur_depth--;
> 
> I double-checked to see if there were any other return sites in this
> function.  There aren't, which is nice and keeps this clean.
> 
>>         return error;
>>  }
>>
>> diff --git a/unpack-trees.c b/unpack-trees.c
>> index 323280dd48b..02f484604ac 100644
>> --- a/unpack-trees.c
>> +++ b/unpack-trees.c
>> @@ -1559,7 +1559,6 @@ static void populate_from_existing_patterns(struct unpack_trees_options *o,
>>         free(sparse);
>>  }
>>
>> -
> 
> Did you mean to combine this cleanup with some other patch?  If not,
> could it be put into its own patch?

It should be dropped! Thanks.

>>  static int verify_absent(const struct cache_entry *,
>>                          enum unpack_trees_error_types,
>>                          struct unpack_trees_options *);
>> --
>> gitgitgadget
> 
> Seems like a good change other than a few small nits.  I don't know
> the json_writer/trace2 stuff, so you might want another reviewer, but
> it's only a few lines and seems relatively straightforward.

I should point out that an easy way to test this locally is
to run

  GIT_TRACE2_PERF=1 git read-tree -mu HEAD

to trigger a full recursive walk. The output JSON payload looks
like this:

  statistics:{"traverse_trees_count":203,"traverse_trees_max_depth":7}

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/tree-walk.c b/tree-walk.c
index 0160294712b..2d6226d5f18 100644
--- a/tree-walk.c
+++ b/tree-walk.c
@@ -4,6 +4,7 @@ 
 #include "object-store.h"
 #include "tree.h"
 #include "pathspec.h"
+#include "json-writer.h"
 
 static const char *get_mode(const char *str, unsigned int *modep)
 {
@@ -167,6 +168,25 @@  int tree_entry_gently(struct tree_desc *desc, struct name_entry *entry)
 	return 1;
 }
 
+static int traverse_trees_atexit_registered;
+static int traverse_trees_count;
+static int traverse_trees_cur_depth;
+static int traverse_trees_max_depth;
+
+static void trace2_traverse_trees_statistics_atexit(void)
+{
+	struct json_writer jw = JSON_WRITER_INIT;
+
+	jw_object_begin(&jw, 0);
+	jw_object_intmax(&jw, "traverse_trees_count", traverse_trees_count);
+	jw_object_intmax(&jw, "traverse_trees_max_depth", traverse_trees_max_depth);
+	jw_end(&jw);
+
+	trace2_data_json("traverse_trees", the_repository, "statistics", &jw);
+
+	jw_release(&jw);
+}
+
 void setup_traverse_info(struct traverse_info *info, const char *base)
 {
 	size_t pathlen = strlen(base);
@@ -180,6 +200,11 @@  void setup_traverse_info(struct traverse_info *info, const char *base)
 	info->namelen = pathlen;
 	if (pathlen)
 		info->prev = &dummy;
+
+	if (trace2_is_enabled() && !traverse_trees_atexit_registered) {
+		atexit(trace2_traverse_trees_statistics_atexit);
+		traverse_trees_atexit_registered = 1;
+	}
 }
 
 char *make_traverse_path(char *path, size_t pathlen,
@@ -416,6 +441,12 @@  int traverse_trees(struct index_state *istate,
 	int interesting = 1;
 	char *traverse_path;
 
+	traverse_trees_count++;
+	traverse_trees_cur_depth++;
+
+	if (traverse_trees_cur_depth > traverse_trees_max_depth)
+		traverse_trees_max_depth = traverse_trees_cur_depth;
+
 	if (n >= ARRAY_SIZE(entry))
 		BUG("traverse_trees() called with too many trees (%d)", n);
 
@@ -515,6 +546,8 @@  int traverse_trees(struct index_state *istate,
 	free(traverse_path);
 	info->traverse_path = NULL;
 	strbuf_release(&base);
+
+	traverse_trees_cur_depth--;
 	return error;
 }
 
diff --git a/unpack-trees.c b/unpack-trees.c
index 323280dd48b..02f484604ac 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -1559,7 +1559,6 @@  static void populate_from_existing_patterns(struct unpack_trees_options *o,
 	free(sparse);
 }
 
-
 static int verify_absent(const struct cache_entry *,
 			 enum unpack_trees_error_types,
 			 struct unpack_trees_options *);