diff mbox series

[3/5] scalar: teach `diagnose` to gather packfile info

Message ID 330b36de799f82425c22bec50e6e42f0e495cab8.1643186507.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series scalar: implement the subcommand "diagnose" | expand

Commit Message

Matthew John Cheetham Jan. 26, 2022, 8:41 a.m. UTC
From: Matthew John Cheetham <mjcheetham@outlook.com>

Teach the `scalar diagnose` command to gather file size information
about pack files.

Signed-off-by: Matthew John Cheetham <mjcheetham@outlook.com>
---
 contrib/scalar/scalar.c          | 39 ++++++++++++++++++++++++++++++++
 contrib/scalar/t/t9099-scalar.sh |  2 ++
 2 files changed, 41 insertions(+)

Comments

Taylor Blau Jan. 26, 2022, 10:43 p.m. UTC | #1
On Wed, Jan 26, 2022 at 08:41:45AM +0000, Matthew John Cheetham via GitGitGadget wrote:
> From: Matthew John Cheetham <mjcheetham@outlook.com>
>
> Teach the `scalar diagnose` command to gather file size information
> about pack files.
>
> Signed-off-by: Matthew John Cheetham <mjcheetham@outlook.com>
> ---
>  contrib/scalar/scalar.c          | 39 ++++++++++++++++++++++++++++++++
>  contrib/scalar/t/t9099-scalar.sh |  2 ++
>  2 files changed, 41 insertions(+)
>
> diff --git a/contrib/scalar/scalar.c b/contrib/scalar/scalar.c
> index e26fb2fc018..690933ffdf3 100644
> --- a/contrib/scalar/scalar.c
> +++ b/contrib/scalar/scalar.c
> @@ -653,6 +653,39 @@ cleanup:
>  	return res;
>  }
>
> +static void dir_file_stats(struct strbuf *buf, const char *path)
> +{
> +	DIR *dir = opendir(path);
> +	struct dirent *e;
> +	struct stat e_stat;
> +	struct strbuf file_path = STRBUF_INIT;
> +	size_t base_path_len;
> +
> +	if (!dir)
> +		return;
> +
> +	strbuf_addstr(buf, "Contents of ");
> +	strbuf_add_absolute_path(buf, path);
> +	strbuf_addstr(buf, ":\n");
> +
> +	strbuf_add_absolute_path(&file_path, path);
> +	strbuf_addch(&file_path, '/');
> +	base_path_len = file_path.len;
> +
> +	while ((e = readdir(dir)) != NULL)

Hmm. Is there a reason that this couldn't use
for_each_file_in_pack_dir() with a callback that just does the stat()
and buffer manipulation?

I don't think it's critical either way, but it would eliminate some of
the boilerplate that is shared between this implementation and the one
that already exists in for_each_file_in_pack_dir().

> +		if (!is_dot_or_dotdot(e->d_name) && e->d_type == DT_REG) {
> +			strbuf_setlen(&file_path, base_path_len);
> +			strbuf_addstr(&file_path, e->d_name);

For what it's worth, I think the callback would start here:

> +			if (!stat(file_path.buf, &e_stat))
> +				strbuf_addf(buf, "%-70s %16"PRIuMAX"\n",
> +					    e->d_name,
> +					    (uintmax_t)e_stat.st_size);

...and end here.

Thanks,
Taylor
Derrick Stolee Jan. 27, 2022, 3:14 p.m. UTC | #2
On 1/26/2022 5:43 PM, Taylor Blau wrote:
> On Wed, Jan 26, 2022 at 08:41:45AM +0000, Matthew John Cheetham via GitGitGadget wrote:
>> From: Matthew John Cheetham <mjcheetham@outlook.com>
>>
>> Teach the `scalar diagnose` command to gather file size information
>> about pack files.
>>
>> Signed-off-by: Matthew John Cheetham <mjcheetham@outlook.com>
>> ---
>>  contrib/scalar/scalar.c          | 39 ++++++++++++++++++++++++++++++++
>>  contrib/scalar/t/t9099-scalar.sh |  2 ++
>>  2 files changed, 41 insertions(+)
>>
>> diff --git a/contrib/scalar/scalar.c b/contrib/scalar/scalar.c
>> index e26fb2fc018..690933ffdf3 100644
>> --- a/contrib/scalar/scalar.c
>> +++ b/contrib/scalar/scalar.c
>> @@ -653,6 +653,39 @@ cleanup:
>>  	return res;
>>  }
>>
>> +static void dir_file_stats(struct strbuf *buf, const char *path)
>> +{
>> +	DIR *dir = opendir(path);
>> +	struct dirent *e;
>> +	struct stat e_stat;
>> +	struct strbuf file_path = STRBUF_INIT;
>> +	size_t base_path_len;
>> +
>> +	if (!dir)
>> +		return;
>> +
>> +	strbuf_addstr(buf, "Contents of ");
>> +	strbuf_add_absolute_path(buf, path);
>> +	strbuf_addstr(buf, ":\n");
>> +
>> +	strbuf_add_absolute_path(&file_path, path);
>> +	strbuf_addch(&file_path, '/');
>> +	base_path_len = file_path.len;
>> +
>> +	while ((e = readdir(dir)) != NULL)
> 
> Hmm. Is there a reason that this couldn't use
> for_each_file_in_pack_dir() with a callback that just does the stat()
> and buffer manipulation?
> 
> I don't think it's critical either way, but it would eliminate some of
> the boilerplate that is shared between this implementation and the one
> that already exists in for_each_file_in_pack_dir().

It's helpful to see if there are other crud files in the pack
directory. This method is also extended in microsoft/git to
scan the alternates directory (which we expect to exist as the
"shared objects cache).

We might want to modify the implementation in this series to
run dir_file_stats() on each odb in the_repository. This would
give us the data for the shared object cache for free while
being more general to other Git repos. (It would require us to
do some reaction work in microsoft/git and be a change of
behavior, but we are the only ones who have looked at these
diagnose files before, so that change will be easy to manage.)

Thanks,
-Stolee
Johannes Schindelin Feb. 6, 2022, 9:38 p.m. UTC | #3
Hi Stolee & Taylor,

On Thu, 27 Jan 2022, Derrick Stolee wrote:

> On 1/26/2022 5:43 PM, Taylor Blau wrote:
> > On Wed, Jan 26, 2022 at 08:41:45AM +0000, Matthew John Cheetham via GitGitGadget wrote:
> >> From: Matthew John Cheetham <mjcheetham@outlook.com>
> >>
> >> Teach the `scalar diagnose` command to gather file size information
> >> about pack files.
> >>
> >> Signed-off-by: Matthew John Cheetham <mjcheetham@outlook.com>
> >> ---
> >>  contrib/scalar/scalar.c          | 39 ++++++++++++++++++++++++++++++++
> >>  contrib/scalar/t/t9099-scalar.sh |  2 ++
> >>  2 files changed, 41 insertions(+)
> >>
> >> diff --git a/contrib/scalar/scalar.c b/contrib/scalar/scalar.c
> >> index e26fb2fc018..690933ffdf3 100644
> >> --- a/contrib/scalar/scalar.c
> >> +++ b/contrib/scalar/scalar.c
> >> @@ -653,6 +653,39 @@ cleanup:
> >>  	return res;
> >>  }
> >>
> >> +static void dir_file_stats(struct strbuf *buf, const char *path)
> >> +{
> >> +	DIR *dir = opendir(path);
> >> +	struct dirent *e;
> >> +	struct stat e_stat;
> >> +	struct strbuf file_path = STRBUF_INIT;
> >> +	size_t base_path_len;
> >> +
> >> +	if (!dir)
> >> +		return;
> >> +
> >> +	strbuf_addstr(buf, "Contents of ");
> >> +	strbuf_add_absolute_path(buf, path);
> >> +	strbuf_addstr(buf, ":\n");
> >> +
> >> +	strbuf_add_absolute_path(&file_path, path);
> >> +	strbuf_addch(&file_path, '/');
> >> +	base_path_len = file_path.len;
> >> +
> >> +	while ((e = readdir(dir)) != NULL)
> >
> > Hmm. Is there a reason that this couldn't use
> > for_each_file_in_pack_dir() with a callback that just does the stat()
> > and buffer manipulation?
> >
> > I don't think it's critical either way, but it would eliminate some of
> > the boilerplate that is shared between this implementation and the one
> > that already exists in for_each_file_in_pack_dir().
>
> It's helpful to see if there are other crud files in the pack
> directory. This method is also extended in microsoft/git to
> scan the alternates directory (which we expect to exist as the
> "shared objects cache).
>
> We might want to modify the implementation in this series to
> run dir_file_stats() on each odb in the_repository. This would
> give us the data for the shared object cache for free while
> being more general to other Git repos. (It would require us to
> do some reaction work in microsoft/git and be a change of
> behavior, but we are the only ones who have looked at these
> diagnose files before, so that change will be easy to manage.)

Good points all around. I went with the `for_each_file_in_pack_dir()`
approach, and threw in the now very simple change to also enumerate the
alternates, if there are any.

And yes, that will require some reaction work in microsoft/git, but for an
obvious improvement like this one, I don't grumble about the extra burden.

Ciao,
Dscho
diff mbox series

Patch

diff --git a/contrib/scalar/scalar.c b/contrib/scalar/scalar.c
index e26fb2fc018..690933ffdf3 100644
--- a/contrib/scalar/scalar.c
+++ b/contrib/scalar/scalar.c
@@ -653,6 +653,39 @@  cleanup:
 	return res;
 }
 
+static void dir_file_stats(struct strbuf *buf, const char *path)
+{
+	DIR *dir = opendir(path);
+	struct dirent *e;
+	struct stat e_stat;
+	struct strbuf file_path = STRBUF_INIT;
+	size_t base_path_len;
+
+	if (!dir)
+		return;
+
+	strbuf_addstr(buf, "Contents of ");
+	strbuf_add_absolute_path(buf, path);
+	strbuf_addstr(buf, ":\n");
+
+	strbuf_add_absolute_path(&file_path, path);
+	strbuf_addch(&file_path, '/');
+	base_path_len = file_path.len;
+
+	while ((e = readdir(dir)) != NULL)
+		if (!is_dot_or_dotdot(e->d_name) && e->d_type == DT_REG) {
+			strbuf_setlen(&file_path, base_path_len);
+			strbuf_addstr(&file_path, e->d_name);
+			if (!stat(file_path.buf, &e_stat))
+				strbuf_addf(buf, "%-70s %16"PRIuMAX"\n",
+					    e->d_name,
+					    (uintmax_t)e_stat.st_size);
+		}
+
+	strbuf_release(&file_path);
+	closedir(dir);
+}
+
 static int cmd_diagnose(int argc, const char **argv)
 {
 	struct option options[] = {
@@ -695,6 +728,12 @@  static int cmd_diagnose(int argc, const char **argv)
 	if ((res = stage(tmp_dir.buf, &buf, "diagnostics.log")))
 		goto diagnose_cleanup;
 
+	strbuf_reset(&buf);
+	dir_file_stats(&buf, ".git/objects/pack");
+
+	if ((res = stage(tmp_dir.buf, &buf, "packs-local.txt")))
+		goto diagnose_cleanup;
+
 	if ((res = stage_directory(tmp_dir.buf, ".git", 0)) ||
 	    (res = stage_directory(tmp_dir.buf, ".git/hooks", 0)) ||
 	    (res = stage_directory(tmp_dir.buf, ".git/info", 0)) ||
diff --git a/contrib/scalar/t/t9099-scalar.sh b/contrib/scalar/t/t9099-scalar.sh
index ecd06e207c2..b1745851e31 100755
--- a/contrib/scalar/t/t9099-scalar.sh
+++ b/contrib/scalar/t/t9099-scalar.sh
@@ -75,6 +75,8 @@  test_expect_success UNZIP 'scalar diagnose' '
 	folder=${zip_path%.zip} &&
 	test_path_is_missing "$folder" &&
 	unzip -p "$zip_path" diagnostics.log >out &&
+	test_file_not_empty out &&
+	unzip -p "$zip_path" packs-local.txt >out &&
 	test_file_not_empty out
 '