diff mbox series

[v3,6/9] bugreport: count loose objects

Message ID 20191025025129.250049-7-emilyshaffer@google.com (mailing list archive)
State New, archived
Headers show
Series add git-bugreport tool | expand

Commit Message

Emily Shaffer Oct. 25, 2019, 2:51 a.m. UTC
The number of unpacked objects in a user's repository may help us
understand the root of the problem they're seeing, especially if a
command is running unusually slowly.

Rather than directly invoking 'git-count-objects', which may sometimes
fail unexpectedly on Git for Windows, manually count the contents of
.git/objects. Additionally, since we may wish to inspect other
directories' contents for bugreport in the future, put the directory
listing into a helper function.

Signed-off-by: Emily Shaffer <emilyshaffer@google.com>
---
 bugreport.c         | 72 +++++++++++++++++++++++++++++++++++++++++++++
 bugreport.h         |  6 ++++
 builtin/bugreport.c |  4 +++
 3 files changed, 82 insertions(+)

Comments

Johannes Schindelin Oct. 28, 2019, 3:07 p.m. UTC | #1
Hi Emily,

On Thu, 24 Oct 2019, Emily Shaffer wrote:

> The number of unpacked objects in a user's repository may help us
> understand the root of the problem they're seeing, especially if a
> command is running unusually slowly.
>
> Rather than directly invoking 'git-count-objects', which may sometimes
> fail unexpectedly on Git for Windows, manually count the contents of
> .git/objects. Additionally, since we may wish to inspect other
> directories' contents for bugreport in the future, put the directory
> listing into a helper function.

Thank you, much appreciated!

I guess the next step is to count the number of packs, and the number of
submodules ;-)

>
> Signed-off-by: Emily Shaffer <emilyshaffer@google.com>
> ---
>  bugreport.c         | 72 +++++++++++++++++++++++++++++++++++++++++++++
>  bugreport.h         |  6 ++++
>  builtin/bugreport.c |  4 +++
>  3 files changed, 82 insertions(+)
>
> diff --git a/bugreport.c b/bugreport.c
> index 9d7f44ff28..54e1d47103 100644
> --- a/bugreport.c
> +++ b/bugreport.c
> @@ -5,8 +5,11 @@
>  #include "exec-cmd.h"
>  #include "help.h"
>  #include "run-command.h"
> +#include "strbuf.h"

Why not append this to the end of the `#include` list, as is common in
Git's commit history?

>  #include "version.h"
>
> +#include "dirent.h"

This header (although with pointy brackets instead of double quotes) is
already included in `git-compat-util.h`

> +
>  /**
>   * A sorted list of config options which we will add to the bugreport. Managed
>   * by 'gather_whitelist(...)'.
> @@ -147,3 +150,72 @@ void get_populated_hooks(struct strbuf *hook_info)
>  		}
>  	}
>  }
> +
> +/**
> + * Fill 'contents' with the contents of the dir at 'dirpath'.

Since you start this comment in JavaDoc style, there should be an almost
empty line after this one ("almost" because it still contains the
asterisk, of course).

> + * If 'filter' is nonzero, the contents are filtered on d_type as 'type' - see
> + * 'man readdir'. opendir() doesn't take string length as an arg, so don't
> + * bother passing it in.
> + */
> +void list_contents_of_dir(struct string_list *contents, struct strbuf *dirpath,

Shouldn't this be `static`?

> +			  int filter, unsigned char type)
> +{
> +	struct dirent *dir = NULL;
> +	DIR *dh = NULL;
> +
> +	dh = opendir(dirpath->buf);
> +	while (dh && (dir = readdir(dh))) {
> +		if (!filter || type == dir->d_type) {
> +			string_list_append(contents, dir->d_name);
> +		}
> +	}
> +}
> +
> +
> +void get_object_counts(struct strbuf *obj_info)

Oops. This function is no longer used.

> +{
> +	struct child_process cp = CHILD_PROCESS_INIT;
> +	struct strbuf std_out = STRBUF_INIT;
> +
> +	argv_array_push(&cp.args, "count-objects");
> +	argv_array_push(&cp.args, "-vH");
> +	cp.git_cmd = 1;
> +	capture_command(&cp, &std_out, 0);
> +
> +	strbuf_reset(obj_info);
> +	strbuf_addstr(obj_info, "git-count-objects -vH:\n");
> +	strbuf_addbuf(obj_info, &std_out);
> +}
> +
> +void get_loose_object_summary(struct strbuf *obj_info)
> +{
> +	struct strbuf dirpath = STRBUF_INIT;
> +	struct string_list subdirs = STRING_LIST_INIT_DUP;
> +	struct string_list_item *subdir;
> +
> +	strbuf_reset(obj_info);
> +
> +	strbuf_addstr(&dirpath, get_object_directory());
> +	strbuf_complete(&dirpath, '/');
> +
> +	list_contents_of_dir(&subdirs, &dirpath, 1, DT_DIR);
> +
> +	for_each_string_list_item(subdir, &subdirs)
> +	{
> +		struct strbuf subdir_buf = STRBUF_INIT;
> +		struct string_list objects = STRING_LIST_INIT_DUP;
> +
> +		/*
> +		 * Only interested in loose objects - so dirs named with the
> +		 * first byte of the object ID
> +		 */
> +		if (strlen(subdir->string) != 2 || !strcmp(subdir->string, ".."))
> +			continue;
> +
> +		strbuf_addbuf(&subdir_buf, &dirpath);
> +		strbuf_addstr(&subdir_buf, subdir->string);
> +		list_contents_of_dir(&objects, &subdir_buf, 0, 0);
> +		strbuf_addf(obj_info, "%s: %d objects\n", subdir->string,
> +			    objects.nr);

Hmm. Not only does this leak `objects`, it also throws away the contents
that we so painfully constructed.

Wouldn't it make more sense to do something like this instead?

/*
 * Report whether the first 'count' bytes of 'string' are all hex digits.
 *
 * Returns 1 when hexval() accepts every one of them (trivially so when
 * 'count' is 0) and 0 as soon as one is rejected.
 *
 * NOTE(review): the "< 0" test assumes hexval() yields a negative value for
 * a non-hex byte -- confirm that its return type is signed.
 */
static int is_hex(const char *string, size_t count)
{
	size_t i;

	for (i = 0; i < count; i++) {
		if (hexval(string[i]) < 0)
			return 0;
	}
	return 1;
}

/*
 * Count the loose objects stored below the object directory 'objects_path'.
 *
 * An entry is counted when it is a regular file that lives in a fan-out
 * directory named with two hex digits and whose own name consists of exactly
 * the remaining hex digits of an object ID.
 *
 * 'objects_path' is used as scratch space while descending. If the object
 * directory cannot be opened it is left untouched; otherwise it holds the
 * original path completed with a trailing '/' on return.
 *
 * Returns the number of loose objects found, or -1 if the object directory
 * itself cannot be opened. Fan-out directories that cannot be opened are
 * silently skipped.
 *
 * NOTE(review): this relies on readdir() filling in d_type; entries reported
 * as DT_UNKNOWN are ignored -- confirm that is acceptable on all platforms.
 */
static ssize_t count_loose_objects(struct strbuf *objects_path)
{
	/* The first two hex digits of an object ID name its fan-out directory. */
	const size_t fanout_len = 2;
	/* The file inside that directory is named with the remaining digits. */
	const size_t name_len = the_repository->hash_algo->hexsz - fanout_len;
	ssize_t ret = 0;
	size_t len;
	struct dirent *d, *e;
	DIR *dir, *subdir;

	dir = opendir(objects_path->buf);
	if (!dir)
		return -1;

	strbuf_complete(objects_path, '/');
	len = objects_path->len;
	while ((d = readdir(dir))) {
		if (d->d_type != DT_DIR)
			continue;
		/*
		 * Decide before descending: this skips ".", ".." and any
		 * other directory that is not a two-hex-digit fan-out.
		 */
		if (strlen(d->d_name) != fanout_len ||
		    !is_hex(d->d_name, fanout_len))
			continue;
		/* Drop the previous subdirectory name before appending. */
		strbuf_setlen(objects_path, len);
		strbuf_addstr(objects_path, d->d_name);
		subdir = opendir(objects_path->buf);
		if (!subdir)
			continue;
		/* Use a separate cursor so the outer entry is not clobbered. */
		while ((e = readdir(subdir)))
			if (e->d_type == DT_REG &&
			    strlen(e->d_name) == name_len &&
			    is_hex(e->d_name, name_len))
				ret++;
		closedir(subdir);
	}
	closedir(dir);
	/* Hand the buffer back without the last subdirectory name. */
	strbuf_setlen(objects_path, len);
	return ret;
}

Ciao,
Dscho

> +	}
> +}
> diff --git a/bugreport.h b/bugreport.h
> index 942a5436e3..09ad0c2599 100644
> --- a/bugreport.h
> +++ b/bugreport.h
> @@ -18,3 +18,9 @@ void get_whitelisted_config(struct strbuf *sys_info);
>   * contents of hook_info will be discarded.
>   */
>  void get_populated_hooks(struct strbuf *hook_info);
> +
> +/**
> + * Adds the output of `git count-object -vH`. The previous contents of hook_info
> + * will be discarded.
> + */
> +void get_loose_object_summary(struct strbuf *obj_info);
> diff --git a/builtin/bugreport.c b/builtin/bugreport.c
> index a0eefba498..b2ab194207 100644
> --- a/builtin/bugreport.c
> +++ b/builtin/bugreport.c
> @@ -64,6 +64,10 @@ int cmd_bugreport(int argc, const char **argv, const char *prefix)
>  	get_populated_hooks(&buffer);
>  	strbuf_write(&buffer, report);
>
> +	add_header(report, "Object Counts");
> +	get_loose_object_summary(&buffer);
> +	strbuf_write(&buffer, report);
> +
>  	fclose(report);
>
>  	launch_editor(report_path.buf, NULL, NULL);
> --
> 2.24.0.rc0.303.g954a862665-goog
>
>
Josh Steadmon Oct. 29, 2019, 9:18 p.m. UTC | #2
On 2019.10.24 19:51, Emily Shaffer wrote:
[snip]
> diff --git a/bugreport.h b/bugreport.h
> index 942a5436e3..09ad0c2599 100644
> --- a/bugreport.h
> +++ b/bugreport.h
> @@ -18,3 +18,9 @@ void get_whitelisted_config(struct strbuf *sys_info);
>   * contents of hook_info will be discarded.
>   */
>  void get_populated_hooks(struct strbuf *hook_info);
> +
> +/**
> + * Adds the output of `git count-object -vH`. The previous contents of hook_info
> + * will be discarded.
> + */
> +void get_loose_object_summary(struct strbuf *obj_info);

Looks like a copy/paste typo here, shouldn't "hook_info" be "obj_info"
in the comment?
Emily Shaffer Dec. 10, 2019, 10:34 p.m. UTC | #3
On Mon, Oct 28, 2019 at 04:07:40PM +0100, Johannes Schindelin wrote:
> Hi Emily,
> 
> On Thu, 24 Oct 2019, Emily Shaffer wrote:
> 
> > The number of unpacked objects in a user's repository may help us
> > understand the root of the problem they're seeing, especially if a
> > command is running unusually slowly.
> >
> > Rather than directly invoking 'git-count-objects', which may sometimes
> > fail unexpectedly on Git for Windows, manually count the contents of
> > .git/objects. Additionally, since we may wish to inspect other
> > directories' contents for bugreport in the future, put the directory
> > listing into a helper function.
> 
> Thank you, much appreciated!
> 
> I guess the next step is to count the number of packs, and the number of
> submodules ;-)
> 
> >
> > Signed-off-by: Emily Shaffer <emilyshaffer@google.com>
> > ---
> >  bugreport.c         | 72 +++++++++++++++++++++++++++++++++++++++++++++
> >  bugreport.h         |  6 ++++
> >  builtin/bugreport.c |  4 +++
> >  3 files changed, 82 insertions(+)
> >
> > diff --git a/bugreport.c b/bugreport.c
> > index 9d7f44ff28..54e1d47103 100644
> > --- a/bugreport.c
> > +++ b/bugreport.c
> > @@ -5,8 +5,11 @@
> >  #include "exec-cmd.h"
> >  #include "help.h"
> >  #include "run-command.h"
> > +#include "strbuf.h"
> 
> Why not append this to the end of the `#include` list, as is common in
> Git's commit history?
> 
> >  #include "version.h"
> >
> > +#include "dirent.h"
> 
> This header (although with pointy brackets instead of double quotes) is
> already included in `git-compat-util.h`
> 
> > +
> >  /**
> >   * A sorted list of config options which we will add to the bugreport. Managed
> >   * by 'gather_whitelist(...)'.
> > @@ -147,3 +150,72 @@ void get_populated_hooks(struct strbuf *hook_info)
> >  		}
> >  	}
> >  }
> > +
> > +/**
> > + * Fill 'contents' with the contents of the dir at 'dirpath'.
> 
> Since you start this comment in JavaDoc style, there should be an almost
> empty line after this one ("almost" because it still contains the
> asterisk, of course).
> 
> > + * If 'filter' is nonzero, the contents are filtered on d_type as 'type' - see
> > + * 'man readdir'. opendir() doesn't take string length as an arg, so don't
> > + * bother passing it in.
> > + */
> > +void list_contents_of_dir(struct string_list *contents, struct strbuf *dirpath,
> 
> Shouldn't this be `static`?
> 
> > +			  int filter, unsigned char type)
> > +{
> > +	struct dirent *dir = NULL;
> > +	DIR *dh = NULL;
> > +
> > +	dh = opendir(dirpath->buf);
> > +	while (dh && (dir = readdir(dh))) {
> > +		if (!filter || type == dir->d_type) {
> > +			string_list_append(contents, dir->d_name);
> > +		}
> > +	}
> > +}
> > +
> > +
> > +void get_object_counts(struct strbuf *obj_info)
> 
> Oops. This function is no longer used.
> 
> > +{
> > +	struct child_process cp = CHILD_PROCESS_INIT;
> > +	struct strbuf std_out = STRBUF_INIT;
> > +
> > +	argv_array_push(&cp.args, "count-objects");
> > +	argv_array_push(&cp.args, "-vH");
> > +	cp.git_cmd = 1;
> > +	capture_command(&cp, &std_out, 0);
> > +
> > +	strbuf_reset(obj_info);
> > +	strbuf_addstr(obj_info, "git-count-objects -vH:\n");
> > +	strbuf_addbuf(obj_info, &std_out);
> > +}
> > +
> > +void get_loose_object_summary(struct strbuf *obj_info)
> > +{
> > +	struct strbuf dirpath = STRBUF_INIT;
> > +	struct string_list subdirs = STRING_LIST_INIT_DUP;
> > +	struct string_list_item *subdir;
> > +
> > +	strbuf_reset(obj_info);
> > +
> > +	strbuf_addstr(&dirpath, get_object_directory());
> > +	strbuf_complete(&dirpath, '/');
> > +
> > +	list_contents_of_dir(&subdirs, &dirpath, 1, DT_DIR);
> > +
> > +	for_each_string_list_item(subdir, &subdirs)
> > +	{
> > +		struct strbuf subdir_buf = STRBUF_INIT;
> > +		struct string_list objects = STRING_LIST_INIT_DUP;
> > +
> > +		/*
> > +		 * Only interested in loose objects - so dirs named with the
> > +		 * first byte of the object ID
> > +		 */
> > +		if (strlen(subdir->string) != 2 || !strcmp(subdir->string, ".."))
> > +			continue;
> > +
> > +		strbuf_addbuf(&subdir_buf, &dirpath);
> > +		strbuf_addstr(&subdir_buf, subdir->string);
> > +		list_contents_of_dir(&objects, &subdir_buf, 0, 0);
> > +		strbuf_addf(obj_info, "%s: %d objects\n", subdir->string,
> > +			    objects.nr);
> 
> Hmm. Not only does this leak `objects`, it also throws away the contents
> that we so painfully constructed.
> 
> Wouldn't it make more sense to do something like this instead?
> 
> static int is_hex(const char *string, size_t count)
> {
> 	for (; count; string++, count--)
> 		if (hexval(*string) < 0)
> 			return 0;
> 	return 1;
> }

True if the string matches [0-9a-fA-F]*, false otherwise.

> 
> static ssize_t count_loose_objects(struct strbuf *objects_path)
> {
> 	ssize_t ret = 0;
> 	size_t len;
> 	struct dirent *d;
> 	DIR *dir, *subdir;
> 
> 	dir = opendir(objects_path->buf);
> 	if (!dir)
> 		return -1;
> 
> 	strbuf_complete(objects_path, '/');

starting with .git/objects/...

> 	len = objects_path->len;
> 	while ((d = readdir(dir))) {
For all contents of dir...
> 		if (d->d_type != DT_DIR)
> 			continue;
..which are directories...
> 		strbuf_setlen(objects_path, len);
> 		strbuf_addstr(objects_path, d->d_name);
> 		subdir = opendir(objects_path->buf);
show all the contents of that subdirectory.
> 		if (!subdir)
> 			continue;
> 		while ((d = readdir(subdir)))
for each regular file there,
> 			if (d->dt_type == DT_REG &&
if it's a regular file,
> 			    is_hex(dir->d_name, the_repository->hash_algo->hexsz))
and the directory is named like an object prefix,
> 				ret++;
increase the total count of loose objects.
> 		closedir(subdir);
> 	}
> 	closedir(dir);
> 	strbuf_reset(objects_path, len);
> 	return ret;
> }

The foremost difference here is that the loose object count previously
was not given in total - instead, it was divided up by object prefix. I
can't speak to whether that's actually very important to know about, but
the original request from stolee was to have the summary by dirname.
That's certainly still possible with a light modification to this code.

(Suggestion from Stolee, some months ago, earlier in this thread:)

  As mentioned before, I've sometimes found it helpful to know the data shape for the object
  store. Having a few extra steps such as the following could be nice:
  
          echo "[Loose Objects]"
          for objdir in $(find "$GIT_DIR/objects/??" -type d)
          do
                  echo "$objdir: $(ls $objdir | wc -l)"
          done
          echo
  ...

Checking that the directory we're about to inspect is hexval rather than
that it's only two characters long is also an interesting point. I'd
probably rather do both, though, since I think we both missed
futureproofing by a little bit.

Dropping the many string_lists is fine by me - call it object-oriented
habits dying hard.

I worry somewhat about delving into every directory and only afterwards
checking whether the directory is one we care about, but that's also an
easy modification to your basic suggestion ("don't use a string list for
that, silly").

Thanks. I'll make the changes with the modifications I mentioned and add
your Helped-by line on this commit.

 - Emily

> 
> Ciao,
> Dscho
> 
> > +	}
> > +}
> > diff --git a/bugreport.h b/bugreport.h
> > index 942a5436e3..09ad0c2599 100644
> > --- a/bugreport.h
> > +++ b/bugreport.h
> > @@ -18,3 +18,9 @@ void get_whitelisted_config(struct strbuf *sys_info);
> >   * contents of hook_info will be discarded.
> >   */
> >  void get_populated_hooks(struct strbuf *hook_info);
> > +
> > +/**
> > + * Adds the output of `git count-object -vH`. The previous contents of hook_info
> > + * will be discarded.
> > + */
> > +void get_loose_object_summary(struct strbuf *obj_info);
> > diff --git a/builtin/bugreport.c b/builtin/bugreport.c
> > index a0eefba498..b2ab194207 100644
> > --- a/builtin/bugreport.c
> > +++ b/builtin/bugreport.c
> > @@ -64,6 +64,10 @@ int cmd_bugreport(int argc, const char **argv, const char *prefix)
> >  	get_populated_hooks(&buffer);
> >  	strbuf_write(&buffer, report);
> >
> > +	add_header(report, "Object Counts");
> > +	get_loose_object_summary(&buffer);
> > +	strbuf_write(&buffer, report);
> > +
> >  	fclose(report);
> >
> >  	launch_editor(report_path.buf, NULL, NULL);
> > --
> > 2.24.0.rc0.303.g954a862665-goog
> >
> >
diff mbox series

Patch

diff --git a/bugreport.c b/bugreport.c
index 9d7f44ff28..54e1d47103 100644
--- a/bugreport.c
+++ b/bugreport.c
@@ -5,8 +5,11 @@ 
 #include "exec-cmd.h"
 #include "help.h"
 #include "run-command.h"
+#include "strbuf.h"
 #include "version.h"
 
+#include "dirent.h"
+
 /**
  * A sorted list of config options which we will add to the bugreport. Managed
  * by 'gather_whitelist(...)'.
@@ -147,3 +150,72 @@  void get_populated_hooks(struct strbuf *hook_info)
 		}
 	}
 }
+
+/**
+ * Fill 'contents' with the contents of the dir at 'dirpath'.
+ * If 'filter' is nonzero, the contents are filtered on d_type as 'type' - see
+ * 'man readdir'. opendir() doesn't take string length as an arg, so don't
+ * bother passing it in.
+ */
+void list_contents_of_dir(struct string_list *contents, struct strbuf *dirpath,
+			  int filter, unsigned char type)
+{
+	struct dirent *dir = NULL;
+	DIR *dh = NULL;
+
+	dh = opendir(dirpath->buf);
+	while (dh && (dir = readdir(dh))) {
+		if (!filter || type == dir->d_type) {
+			string_list_append(contents, dir->d_name);
+		}
+	}
+}
+
+
+void get_object_counts(struct strbuf *obj_info)
+{
+	struct child_process cp = CHILD_PROCESS_INIT;
+	struct strbuf std_out = STRBUF_INIT;
+
+	argv_array_push(&cp.args, "count-objects");
+	argv_array_push(&cp.args, "-vH");
+	cp.git_cmd = 1;
+	capture_command(&cp, &std_out, 0);
+
+	strbuf_reset(obj_info);
+	strbuf_addstr(obj_info, "git-count-objects -vH:\n");
+	strbuf_addbuf(obj_info, &std_out);
+}
+
+void get_loose_object_summary(struct strbuf *obj_info)
+{
+	struct strbuf dirpath = STRBUF_INIT;
+	struct string_list subdirs = STRING_LIST_INIT_DUP;
+	struct string_list_item *subdir;
+
+	strbuf_reset(obj_info);
+
+	strbuf_addstr(&dirpath, get_object_directory());
+	strbuf_complete(&dirpath, '/');
+
+	list_contents_of_dir(&subdirs, &dirpath, 1, DT_DIR);
+
+	for_each_string_list_item(subdir, &subdirs)
+	{
+		struct strbuf subdir_buf = STRBUF_INIT;
+		struct string_list objects = STRING_LIST_INIT_DUP;
+
+		/*
+		 * Only interested in loose objects - so dirs named with the
+		 * first byte of the object ID
+		 */
+		if (strlen(subdir->string) != 2 || !strcmp(subdir->string, ".."))
+			continue;
+
+		strbuf_addbuf(&subdir_buf, &dirpath);
+		strbuf_addstr(&subdir_buf, subdir->string);
+		list_contents_of_dir(&objects, &subdir_buf, 0, 0);
+		strbuf_addf(obj_info, "%s: %d objects\n", subdir->string,
+			    objects.nr);
+	}
+}
diff --git a/bugreport.h b/bugreport.h
index 942a5436e3..09ad0c2599 100644
--- a/bugreport.h
+++ b/bugreport.h
@@ -18,3 +18,9 @@  void get_whitelisted_config(struct strbuf *sys_info);
  * contents of hook_info will be discarded.
  */
 void get_populated_hooks(struct strbuf *hook_info);
+
+/**
+ * Adds a per-directory count of the loose objects found in the object
+ * directory. The previous contents of obj_info will be discarded.
+ */
+void get_loose_object_summary(struct strbuf *obj_info);
diff --git a/builtin/bugreport.c b/builtin/bugreport.c
index a0eefba498..b2ab194207 100644
--- a/builtin/bugreport.c
+++ b/builtin/bugreport.c
@@ -64,6 +64,10 @@  int cmd_bugreport(int argc, const char **argv, const char *prefix)
 	get_populated_hooks(&buffer);
 	strbuf_write(&buffer, report);
 
+	add_header(report, "Object Counts");
+	get_loose_object_summary(&buffer);
+	strbuf_write(&buffer, report);
+
 	fclose(report);
 
 	launch_editor(report_path.buf, NULL, NULL);