diff mbox series

prune: recursively prune objects directory

Message ID 20221122000927.M873500@dcvr (mailing list archive)
State New, archived
Headers show
Series prune: recursively prune objects directory | expand

Commit Message

Eric Wong Nov. 22, 2022, 12:09 a.m. UTC
Junio C Hamano <gitster@pobox.com> wrote:
> Junio C Hamano <gitster@pobox.com> writes:
> 
> >>  	prune_packed_objects(show_only ? PRUNE_PACKED_DRY_RUN : 0);
> >> -	remove_temporary_files(get_object_directory());
> >>  	s = mkpathdup("%s/pack", get_object_directory());
> >>  	remove_temporary_files(s);
> >>  	free(s);
> >
> > I actually was hinting at making the remove_temporary_files()
> > recurse, so that you do not need the separate invocation in pack/
> > subdirectory.
> >
> > Or make 256 calls for each of the fan-out subdirectory, in which
> > case the ENOENT silencing you did would really matter and shine.
> 
> But of course, neither is any part of this topic.  They are possible
> follow-on works.
> 
> Thanks and sorry for making a confusing statement that could be
> mistaken as "let's do this too", which wasn't what I meant.

Oh, no worries.  I already wrote this earlier and got distracted
with something else while waiting for tests :x.  Anyways, the
below supersedes my original patch and I think it's better in
every way.

I am unsure about duplicating ishex() from name-rev.c, however...

------8<-----
Subject: [PATCH] prune: recursively prune objects directory

$GIT_DIR/objects/pack may be removed to save inodes in shared
repositories, so avoid scanning it if it does not exist.  Loose
object directories ($GIT_DIR/objects/??) may have old temporary
files, so we now prune those, too.

Recursion is limited to a single level since git doesn't use
deeper levels.  This avoids the risk of stack overflows via
infinite recursion when pruning untrusted repos.

We'll also emit the system error in case a directory cannot be
opened to help users diagnose permissions problems or resource
constraints.

Signed-off-by: Eric Wong <e@80x24.org>
---
 builtin/prune.c  | 28 ++++++++++++++++++++--------
 t/t5304-prune.sh | 16 ++++++++++++++++
 2 files changed, 36 insertions(+), 8 deletions(-)

Comments

Junio C Hamano Nov. 22, 2022, 1:28 a.m. UTC | #1
Eric Wong <e@80x24.org> writes:

> I am unsure about duplicating ishex() from name-rev.c, however...

Yeah, I wonder why name-rev.c does not use isxdigit() in the first
place.

> ------8<-----
> Subject: [PATCH] prune: recursively prune objects directory
>
> $GIT_DIR/objects/pack may be removed to save inodes in shared
> repositories, so avoid scanning it if it does not exist.  Loose
> object directories ($GIT_DIR/objects/??) may have old temporary
> files, so we now prune those, too.
>
> Recursion is limited to a single level since git doesn't use
> deeper levels.  This avoids the risk of stack overflows via
> infinite recursion when pruning untrusted repos.
>
> We'll also emit the system error in case a directory cannot be
> opened to help users diagnose permissions problems or resource
> constraints.
>
> Signed-off-by: Eric Wong <e@80x24.org>
> ---
>  builtin/prune.c  | 28 ++++++++++++++++++++--------
>  t/t5304-prune.sh | 16 ++++++++++++++++
>  2 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/builtin/prune.c b/builtin/prune.c
> index df376b2ed1..0f6a33690a 100644
> --- a/builtin/prune.c
> +++ b/builtin/prune.c
> @@ -114,25 +114,41 @@ static int prune_subdir(unsigned int nr, const char *path, void *data)
>  	return 0;
>  }
>  
> +/*
> + * XXX ishex is duplicated in builtin/name-rev.c, perhaps git-compat-util.h
> + * is a better home for it
> + */
> +#define ishex(x) (isdigit((x)) || ((x) >= 'a' && (x) <= 'f'))
> +static int is_loose_prefix(const char *d_name)
> +{
> +	return strlen(d_name) == 2 && ishex(d_name[0]) && ishex(d_name[1]);
> +}
> +
>  /*
>   * Write errors (particularly out of space) can result in
>   * failed temporary packs (and more rarely indexes and other
>   * files beginning with "tmp_") accumulating in the object
>   * and the pack directories.
>   */
> -static void remove_temporary_files(const char *path)
> +static void remove_temporary_files(const char *path, int recurse)
>  {
>  	DIR *dir;
>  	struct dirent *de;
>  
>  	dir = opendir(path);
>  	if (!dir) {
> -		fprintf(stderr, "Unable to open directory %s\n", path);
> +		warning_errno(_("unable to open directory %s"), path);
>  		return;
>  	}
>  	while ((de = readdir(dir)) != NULL)
> -		if (starts_with(de->d_name, "tmp_"))
> +		if (starts_with(de->d_name, "tmp_")) {
>  			prune_tmp_file(mkpath("%s/%s", path, de->d_name));
> +		} else if (recurse && (strcmp(de->d_name, "packs") == 0 ||
> +					is_loose_prefix(de->d_name))) {

OK, the intent is to be careful and deal only with the fan-out
directories objects/[0-9a-f]{2}/ and objects/pack/ and leave cruft
in objects/info and any other unknown subdirectories, which makes
sense.

Two nits are:

 - "packs" wants to be "pack".
 - "strcmp() == 0" wants to be "!strcmp()".

> +			char *s = mkpathdup("%s/%s", path, de->d_name);
> +			remove_temporary_files(s, 0);
> +			free(s);
> +		}
>  	closedir(dir);
>  }
>  
> @@ -150,7 +166,6 @@ int cmd_prune(int argc, const char **argv, const char *prefix)
>  			 N_("limit traversal to objects outside promisor packfiles")),
>  		OPT_END()
>  	};
> -	char *s;
>  
>  	expire = TIME_MAX;
>  	save_commit_buffer = 0;
> @@ -186,10 +201,7 @@ int cmd_prune(int argc, const char **argv, const char *prefix)
>  				      prune_cruft, prune_subdir, &revs);
>  
>  	prune_packed_objects(show_only ? PRUNE_PACKED_DRY_RUN : 0);
> -	remove_temporary_files(get_object_directory());
> -	s = mkpathdup("%s/pack", get_object_directory());
> -	remove_temporary_files(s);
> -	free(s);
> +	remove_temporary_files(get_object_directory(), 1);
>  
>  	if (is_repository_shallow(the_repository)) {
>  		perform_reachability_traversal(&revs);
> diff --git a/t/t5304-prune.sh b/t/t5304-prune.sh
> index 8ae314af58..8c2278035e 100755
> --- a/t/t5304-prune.sh
> +++ b/t/t5304-prune.sh
> @@ -29,6 +29,22 @@ test_expect_success setup '
>  	git gc
>  '
>  
> +test_expect_success 'prune stale loose objects' '
> +	mkdir .git/objects/aa &&
> +	>.git/objects/aa/tmp_foo &&
> +	test-tool chmtime =-86501 .git/objects/aa/tmp_foo &&
> +	git prune --expire 1.day &&
> +	test_path_is_missing .git/objects/aa/tmp_foo
> +'
> +
> +test_expect_success 'bare repo prune is quiet without $GIT_DIR/objects/pack' '
> +	git clone -q --shared --template= --bare . bare.git &&
> +	rmdir bare.git/objects/pack &&
> +	git --git-dir=bare.git prune --no-progress 2>prune.err &&
> +	test_must_be_empty prune.err &&
> +	rm -r bare.git prune.err
> +'

Is the last "clean-up" step necessary?

> +
>  test_expect_success 'prune stale packs' '
>  	orig_pack=$(echo .git/objects/pack/*.pack) &&
>  	>.git/objects/tmp_1.pack &&

Other than that, looks like a good idea.

Thanks.
Eric Wong Nov. 22, 2022, 9:59 a.m. UTC | #2
Junio C Hamano <gitster@pobox.com> wrote:
> Eric Wong <e@80x24.org> writes:
> 
> > I am unsure about duplicating ishex() from name-rev.c, however...
> 
> Yeah, I wonder why name-rev.c does not use isxdigit() in the first
> place.

isxdigit includes uppercase [A-F].  I think being strict is
better, here.  I don't want to open up a can of worms if we
become tolerant of 3rd-party git implementations developed on
case-insensitive FSes.

> > -static void remove_temporary_files(const char *path)
> > +static void remove_temporary_files(const char *path, int recurse)
> >  {
> >  	DIR *dir;
> >  	struct dirent *de;
> >  
> >  	dir = opendir(path);
> >  	if (!dir) {
> > -		fprintf(stderr, "Unable to open directory %s\n", path);
> > +		warning_errno(_("unable to open directory %s"), path);
> >  		return;
> >  	}
> >  	while ((de = readdir(dir)) != NULL)
> > -		if (starts_with(de->d_name, "tmp_"))
> > +		if (starts_with(de->d_name, "tmp_")) {
> >  			prune_tmp_file(mkpath("%s/%s", path, de->d_name));
> > +		} else if (recurse && (strcmp(de->d_name, "packs") == 0 ||
> > +					is_loose_prefix(de->d_name))) {
> 
> OK, the intent is to be careful and deal only with the fan-out
> directories objects/[0-9a-f]{2}/ and objects/pack/ and leave cruft
> in objects/info and any other unknown subdirectories, which makes
> sense.
> 
> Two nits are:
> 
>  - "packs" wants to be "pack".

OK, fixed.  Along with existing test cases, since packs handling
wasn't being tested properly.

>  - "strcmp() == 0" wants to be "!strcmp()".

OK

> > diff --git a/t/t5304-prune.sh b/t/t5304-prune.sh
> > index 8ae314af58..8c2278035e 100755
> > --- a/t/t5304-prune.sh
> > +++ b/t/t5304-prune.sh
> > @@ -29,6 +29,22 @@ test_expect_success setup '
> >  	git gc
> >  '
> >  
> > +test_expect_success 'prune stale loose objects' '
> > +	mkdir .git/objects/aa &&
> > +	>.git/objects/aa/tmp_foo &&
> > +	test-tool chmtime =-86501 .git/objects/aa/tmp_foo &&
> > +	git prune --expire 1.day &&
> > +	test_path_is_missing .git/objects/aa/tmp_foo
> > +'
> > +
> > +test_expect_success 'bare repo prune is quiet without $GIT_DIR/objects/pack' '
> > +	git clone -q --shared --template= --bare . bare.git &&
> > +	rmdir bare.git/objects/pack &&
> > +	git --git-dir=bare.git prune --no-progress 2>prune.err &&
> > +	test_must_be_empty prune.err &&
> > +	rm -r bare.git prune.err
> > +'
> 
> Is the last "clean-up" step necessary?

Guess not, removed in v2 below.

> > +
> >  test_expect_success 'prune stale packs' '
> >  	orig_pack=$(echo .git/objects/pack/*.pack) &&
> >  	>.git/objects/tmp_1.pack &&
> 
> Other than that, looks like a good idea.

'prune stale packs' was actually insufficient for catching
the extraneous `s' in `pack'.  I've kept existing checks against
objects/tmp_*, but added extra checks for objects/pack/tmp_*

v2 fixes:
* `pack' directory fixed, tests added
* !strcmp
* remove needless cleanup step in test

-----8<-----
Subject: [PATCH] prune: recursively prune objects directory

$GIT_DIR/objects/pack may be removed to save inodes in shared
repositories, so avoid scanning it if it does not exist.  Loose
object directories ($GIT_DIR/objects/??) may have old temporary
files, so we now prune those, too.

Recursion is limited to a single level since git doesn't use
deeper levels.  This avoids the risk of stack overflows via
infinite recursion when pruning untrusted repos.

We'll also emit the system error in case a directory cannot be
opened to help users diagnose permissions problems or resource
constraints.

Signed-off-by: Eric Wong <e@80x24.org>
---
Interdiff:
  diff --git a/builtin/prune.c b/builtin/prune.c
  index 0f6a33690a..a05f1a2704 100644
  --- a/builtin/prune.c
  +++ b/builtin/prune.c
  @@ -143,7 +143,7 @@ static void remove_temporary_files(const char *path, int recurse)
   	while ((de = readdir(dir)) != NULL)
   		if (starts_with(de->d_name, "tmp_")) {
   			prune_tmp_file(mkpath("%s/%s", path, de->d_name));
  -		} else if (recurse && (strcmp(de->d_name, "packs") == 0 ||
  +		} else if (recurse && (!strcmp(de->d_name, "pack") ||
   					is_loose_prefix(de->d_name))) {
   			char *s = mkpathdup("%s/%s", path, de->d_name);
   			remove_temporary_files(s, 0);
  diff --git a/t/t5304-prune.sh b/t/t5304-prune.sh
  index 8c2278035e..64d5f4e5b3 100755
  --- a/t/t5304-prune.sh
  +++ b/t/t5304-prune.sh
  @@ -41,19 +41,23 @@ test_expect_success 'bare repo prune is quiet without $GIT_DIR/objects/pack' '
   	git clone -q --shared --template= --bare . bare.git &&
   	rmdir bare.git/objects/pack &&
   	git --git-dir=bare.git prune --no-progress 2>prune.err &&
  -	test_must_be_empty prune.err &&
  -	rm -r bare.git prune.err
  +	test_must_be_empty prune.err
   '
   
   test_expect_success 'prune stale packs' '
   	orig_pack=$(echo .git/objects/pack/*.pack) &&
   	>.git/objects/tmp_1.pack &&
   	>.git/objects/tmp_2.pack &&
  -	test-tool chmtime =-86501 .git/objects/tmp_1.pack &&
  +	>.git/objects/pack/tmp_3.pack &&
  +	>.git/objects/pack/tmp_4.pack &&
  +	test-tool chmtime =-86501 .git/objects/tmp_1.pack \
  +		.git/objects/pack/tmp_3.pack &&
   	git prune --expire 1.day &&
   	test_path_is_file $orig_pack &&
   	test_path_is_file .git/objects/tmp_2.pack &&
  -	test_path_is_missing .git/objects/tmp_1.pack
  +	test_path_is_file .git/objects/pack/tmp_4.pack &&
  +	test_path_is_missing .git/objects/tmp_1.pack &&
  +	test_path_is_missing .git/objects/pack/tmp_3.pack
   '
   
   test_expect_success 'prune --expire' '

 builtin/prune.c  | 28 ++++++++++++++++++++--------
 t/t5304-prune.sh | 24 ++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/builtin/prune.c b/builtin/prune.c
index df376b2ed1..a05f1a2704 100644
--- a/builtin/prune.c
+++ b/builtin/prune.c
@@ -114,25 +114,41 @@ static int prune_subdir(unsigned int nr, const char *path, void *data)
 	return 0;
 }
 
+/*
+ * XXX ishex is duplicated in builtin/name-rev.c, perhaps git-compat-util.h
+ * is a better home for it
+ */
+#define ishex(x) (isdigit((x)) || ((x) >= 'a' && (x) <= 'f'))
+static int is_loose_prefix(const char *d_name)
+{
+	return strlen(d_name) == 2 && ishex(d_name[0]) && ishex(d_name[1]);
+}
+
 /*
  * Write errors (particularly out of space) can result in
  * failed temporary packs (and more rarely indexes and other
  * files beginning with "tmp_") accumulating in the object
  * and the pack directories.
  */
-static void remove_temporary_files(const char *path)
+static void remove_temporary_files(const char *path, int recurse)
 {
 	DIR *dir;
 	struct dirent *de;
 
 	dir = opendir(path);
 	if (!dir) {
-		fprintf(stderr, "Unable to open directory %s\n", path);
+		warning_errno(_("unable to open directory %s"), path);
 		return;
 	}
 	while ((de = readdir(dir)) != NULL)
-		if (starts_with(de->d_name, "tmp_"))
+		if (starts_with(de->d_name, "tmp_")) {
 			prune_tmp_file(mkpath("%s/%s", path, de->d_name));
+		} else if (recurse && (!strcmp(de->d_name, "pack") ||
+					is_loose_prefix(de->d_name))) {
+			char *s = mkpathdup("%s/%s", path, de->d_name);
+			remove_temporary_files(s, 0);
+			free(s);
+		}
 	closedir(dir);
 }
 
@@ -150,7 +166,6 @@ int cmd_prune(int argc, const char **argv, const char *prefix)
 			 N_("limit traversal to objects outside promisor packfiles")),
 		OPT_END()
 	};
-	char *s;
 
 	expire = TIME_MAX;
 	save_commit_buffer = 0;
@@ -186,10 +201,7 @@ int cmd_prune(int argc, const char **argv, const char *prefix)
 				      prune_cruft, prune_subdir, &revs);
 
 	prune_packed_objects(show_only ? PRUNE_PACKED_DRY_RUN : 0);
-	remove_temporary_files(get_object_directory());
-	s = mkpathdup("%s/pack", get_object_directory());
-	remove_temporary_files(s);
-	free(s);
+	remove_temporary_files(get_object_directory(), 1);
 
 	if (is_repository_shallow(the_repository)) {
 		perform_reachability_traversal(&revs);
diff --git a/t/t5304-prune.sh b/t/t5304-prune.sh
index 8ae314af58..64d5f4e5b3 100755
--- a/t/t5304-prune.sh
+++ b/t/t5304-prune.sh
@@ -29,15 +29,35 @@ test_expect_success setup '
 	git gc
 '
 
+test_expect_success 'prune stale loose objects' '
+	mkdir .git/objects/aa &&
+	>.git/objects/aa/tmp_foo &&
+	test-tool chmtime =-86501 .git/objects/aa/tmp_foo &&
+	git prune --expire 1.day &&
+	test_path_is_missing .git/objects/aa/tmp_foo
+'
+
+test_expect_success 'bare repo prune is quiet without $GIT_DIR/objects/pack' '
+	git clone -q --shared --template= --bare . bare.git &&
+	rmdir bare.git/objects/pack &&
+	git --git-dir=bare.git prune --no-progress 2>prune.err &&
+	test_must_be_empty prune.err
+'
+
 test_expect_success 'prune stale packs' '
 	orig_pack=$(echo .git/objects/pack/*.pack) &&
 	>.git/objects/tmp_1.pack &&
 	>.git/objects/tmp_2.pack &&
-	test-tool chmtime =-86501 .git/objects/tmp_1.pack &&
+	>.git/objects/pack/tmp_3.pack &&
+	>.git/objects/pack/tmp_4.pack &&
+	test-tool chmtime =-86501 .git/objects/tmp_1.pack \
+		.git/objects/pack/tmp_3.pack &&
 	git prune --expire 1.day &&
 	test_path_is_file $orig_pack &&
 	test_path_is_file .git/objects/tmp_2.pack &&
-	test_path_is_missing .git/objects/tmp_1.pack
+	test_path_is_file .git/objects/pack/tmp_4.pack &&
+	test_path_is_missing .git/objects/tmp_1.pack &&
+	test_path_is_missing .git/objects/pack/tmp_3.pack
 '
 
 test_expect_success 'prune --expire' '
Junio C Hamano Nov. 22, 2022, 11:16 p.m. UTC | #3
Eric Wong <e@80x24.org> writes:

> Junio C Hamano <gitster@pobox.com> wrote:
>> Eric Wong <e@80x24.org> writes:
>> 
>> > I am unsure about duplicating ishex() from name-rev.c, however...
>> 
>> Yeah, I wonder why name-rev.c does not use isxdigit() in the first
>> place.
>
> isxdigit includes uppercase [A-F].  I think being strict is
> better, here.  I don't want to open up a can of worms if we
> become tolerant of 3rd-party git implementations developed on
> case-insensitive FSes.

OK, we do not recurse into .git/objects/AA/ for the same reason why
we do not recurse into .git/objects/info/.  We do expect [0-9a-f]{2}
and pack to be directories, so we go silent if they are missing, but
we do complain if somebody creates a regular file .git/objects/aa
for fun.

I agree that isxdigit() is not a good match.  I also agree with what
you said about it belonging in git-compat-util.h, but let's leave it for
a future clean-up patch to remove both copies of ishex(), introduce
islxdigit() in git-compat-util.h and use it as its replacement.
diff mbox series

Patch

diff --git a/builtin/prune.c b/builtin/prune.c
index df376b2ed1..0f6a33690a 100644
--- a/builtin/prune.c
+++ b/builtin/prune.c
@@ -114,25 +114,41 @@  static int prune_subdir(unsigned int nr, const char *path, void *data)
 	return 0;
 }
 
+/*
+ * XXX ishex is duplicated in builtin/name-rev.c, perhaps git-compat-util.h
+ * is a better home for it
+ */
+#define ishex(x) (isdigit((x)) || ((x) >= 'a' && (x) <= 'f'))
+static int is_loose_prefix(const char *d_name)
+{
+	return strlen(d_name) == 2 && ishex(d_name[0]) && ishex(d_name[1]);
+}
+
 /*
  * Write errors (particularly out of space) can result in
  * failed temporary packs (and more rarely indexes and other
  * files beginning with "tmp_") accumulating in the object
  * and the pack directories.
  */
-static void remove_temporary_files(const char *path)
+static void remove_temporary_files(const char *path, int recurse)
 {
 	DIR *dir;
 	struct dirent *de;
 
 	dir = opendir(path);
 	if (!dir) {
-		fprintf(stderr, "Unable to open directory %s\n", path);
+		warning_errno(_("unable to open directory %s"), path);
 		return;
 	}
 	while ((de = readdir(dir)) != NULL)
-		if (starts_with(de->d_name, "tmp_"))
+		if (starts_with(de->d_name, "tmp_")) {
 			prune_tmp_file(mkpath("%s/%s", path, de->d_name));
+		} else if (recurse && (strcmp(de->d_name, "packs") == 0 ||
+					is_loose_prefix(de->d_name))) {
+			char *s = mkpathdup("%s/%s", path, de->d_name);
+			remove_temporary_files(s, 0);
+			free(s);
+		}
 	closedir(dir);
 }
 
@@ -150,7 +166,6 @@  int cmd_prune(int argc, const char **argv, const char *prefix)
 			 N_("limit traversal to objects outside promisor packfiles")),
 		OPT_END()
 	};
-	char *s;
 
 	expire = TIME_MAX;
 	save_commit_buffer = 0;
@@ -186,10 +201,7 @@  int cmd_prune(int argc, const char **argv, const char *prefix)
 				      prune_cruft, prune_subdir, &revs);
 
 	prune_packed_objects(show_only ? PRUNE_PACKED_DRY_RUN : 0);
-	remove_temporary_files(get_object_directory());
-	s = mkpathdup("%s/pack", get_object_directory());
-	remove_temporary_files(s);
-	free(s);
+	remove_temporary_files(get_object_directory(), 1);
 
 	if (is_repository_shallow(the_repository)) {
 		perform_reachability_traversal(&revs);
diff --git a/t/t5304-prune.sh b/t/t5304-prune.sh
index 8ae314af58..8c2278035e 100755
--- a/t/t5304-prune.sh
+++ b/t/t5304-prune.sh
@@ -29,6 +29,22 @@  test_expect_success setup '
 	git gc
 '
 
+test_expect_success 'prune stale loose objects' '
+	mkdir .git/objects/aa &&
+	>.git/objects/aa/tmp_foo &&
+	test-tool chmtime =-86501 .git/objects/aa/tmp_foo &&
+	git prune --expire 1.day &&
+	test_path_is_missing .git/objects/aa/tmp_foo
+'
+
+test_expect_success 'bare repo prune is quiet without $GIT_DIR/objects/pack' '
+	git clone -q --shared --template= --bare . bare.git &&
+	rmdir bare.git/objects/pack &&
+	git --git-dir=bare.git prune --no-progress 2>prune.err &&
+	test_must_be_empty prune.err &&
+	rm -r bare.git prune.err
+'
+
 test_expect_success 'prune stale packs' '
 	orig_pack=$(echo .git/objects/pack/*.pack) &&
 	>.git/objects/tmp_1.pack &&