diff mbox series

[GSoC,v13,06/10] git refs: add verify subcommand

Message ID ZqeYsNtl90N1fVDy@ArchLinux (mailing list archive)
State Superseded
Headers show
Series ref consistency check infra setup | expand

Commit Message

shejialuo July 29, 2024, 1:27 p.m. UTC
Introduce a new subcommand "verify" in git-refs(1) to allow the user to
check the reference database consistency and also this subcommand will
be used as the entry point of checking refs for "git-fsck(1)". Last, add
"verbose" field into "fsck_options" to indicate whether we should print
verbose messages when checking refs and objects consistency.

Mentored-by: Patrick Steinhardt <ps@pks.im>
Mentored-by: Karthik Nayak <karthik.188@gmail.com>
Signed-off-by: shejialuo <shejialuo@gmail.com>
---
 Documentation/git-refs.txt | 13 +++++++++++
 builtin/refs.c             | 44 ++++++++++++++++++++++++++++++++++++++
 fsck.h                     |  1 +
 3 files changed, 58 insertions(+)

Comments

Patrick Steinhardt July 30, 2024, 8:31 a.m. UTC | #1
On Mon, Jul 29, 2024 at 09:27:12PM +0800, shejialuo wrote:

The subject should probably start with "builtin/refs", not "git refs".

> Introduce a new subcommand "verify" in git-refs(1) to allow the user to
> check the reference database consistency and also this subcommand will
> be used as the entry point of checking refs for "git-fsck(1)". Last, add
> "verbose" field into "fsck_options" to indicate whether we should print
> verbose messages when checking refs and objects consistency.

Nice. I very much like that we now have a common home for such low-level
ref-related commands. Also, "verify" is neatly in line with e.g. `git
commit-graph verify`.

> @@ -39,6 +43,15 @@ include::ref-storage-format.txt[]
>  	can be used to double check that the migration works as expected before
>  	performing the actual migration.
>  
> +The following options are specific to 'git refs verify':
> +
> +--strict::
> +	Enable more strict checking, every WARN severity for the `Fsck Messages`
> +	be seen as ERROR. See linkgit:git-fsck[1].

How about:

    "Enable stricter error checking. This will cause warnings to be
    reported as errors. See linkgit:git-fsck[1]."

> +--verbose::
> +	When verifying the reference database consistency, be chatty.

I wonder whether this really helps all that much. It doesn't really say
what it adds on top of the default mode. So unless we document what
exactly this changes, I rather think we can just leave it out, as
basically everyone knows what a "--verbose" flag does.

>  KNOWN LIMITATIONS
>  -----------------
>  
> diff --git a/builtin/refs.c b/builtin/refs.c
> index 46dcd150d4..4831c9e28e 100644
> --- a/builtin/refs.c
> +++ b/builtin/refs.c
> @@ -1,4 +1,6 @@
>  #include "builtin.h"
> +#include "config.h"
> +#include "fsck.h"
>  #include "parse-options.h"
>  #include "refs.h"
>  #include "repository.h"
> @@ -7,6 +9,9 @@
>  #define REFS_MIGRATE_USAGE \
>  	N_("git refs migrate --ref-format=<format> [--dry-run]")
>  
> +#define REFS_VERIFY_USAGE \
> +	N_("git refs verify [--strict] [--verbose]")
> +
>  static int cmd_refs_migrate(int argc, const char **argv, const char *prefix)
>  {
>  	const char * const migrate_usage[] = {
> @@ -58,15 +63,54 @@ static int cmd_refs_migrate(int argc, const char **argv, const char *prefix)
>  	return err;
>  }
>  
> +static int cmd_refs_verify(int argc, const char **argv, const char *prefix)
> +{
> +	struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;

So we don't ever end up using `FSCK_REFS_OPTIONS_STRICT`? If so, I think
we should just drop that declaration in the preceding patch.

> +	const char * const verify_usage[] = {
> +		REFS_VERIFY_USAGE,
> +		NULL,
> +	};
> +	unsigned int verbose = 0, strict = 0;
> +	struct option options[] = {
> +		OPT__VERBOSE(&verbose, N_("be verbose")),
> +		OPT_BOOL(0, "strict", &strict, N_("enable strict checking")),
> +		OPT_END(),
> +	};
> +	int ret;
> +
> +	argc = parse_options(argc, argv, prefix, options, verify_usage, 0);
> +	if (argc)
> +		usage(_("'git refs verify' takes no arguments"));
> +
> +	if (verbose)
> +		fsck_refs_options.verbose = 1;
> +	if (strict)
> +		fsck_refs_options.strict = 1;

Instead of manually setting those variables, we can pass pointers to
those member variables in the `struct option`s directly.

> +	git_config(git_fsck_config, &fsck_refs_options);
> +	prepare_repo_settings(the_repository);
> +
> +	ret = refs_fsck(get_main_ref_store(the_repository), &fsck_refs_options);
> +
> +	/*
> +	 * Explicitly free the allocated array and "skip_oids" set
> +	 */
> +	free(fsck_refs_options.msg_type);
> +	oidset_clear(&fsck_refs_options.skip_oids);

Should we provide a `fsck_options_clear()` function that does this for
us? Otherwise we'll have to adapt callsites of `refs_fsck` whenever
internal implementation details of the subsystem add newly allocated
members.

> +	return ret;
> +}
> +
>  int cmd_refs(int argc, const char **argv, const char *prefix)
>  {
>  	const char * const refs_usage[] = {
>  		REFS_MIGRATE_USAGE,
> +		REFS_VERIFY_USAGE,
>  		NULL,
>  	};
>  	parse_opt_subcommand_fn *fn = NULL;
>  	struct option opts[] = {
>  		OPT_SUBCOMMAND("migrate", &fn, cmd_refs_migrate),
> +		OPT_SUBCOMMAND("verify", &fn, cmd_refs_verify),
>  		OPT_END(),
>  	};
>  
> diff --git a/fsck.h b/fsck.h
> index a4a4ba88ee..b03dba442e 100644
> --- a/fsck.h
> +++ b/fsck.h
> @@ -155,6 +155,7 @@ struct fsck_options {
>  	fsck_walk_func walk;
>  	fsck_error error_func;
>  	unsigned strict:1;
> +	unsigned verbose:1;

Okay. Let's see whether this field will be used in a subsequent patch.
If not, we should drop it and get rid of the option altogether, I guess.

Patrick
shejialuo July 30, 2024, 3:59 p.m. UTC | #2
On Tue, Jul 30, 2024 at 10:31:37AM +0200, Patrick Steinhardt wrote:
> On Mon, Jul 29, 2024 at 09:27:12PM +0800, shejialuo wrote:
> 
> The subject should probably start with "builtin/refs", not "git refs".
> 

Yes, I will improve this in the next version.

> > @@ -39,6 +43,15 @@ include::ref-storage-format.txt[]
> >  	can be used to double check that the migration works as expected before
> >  	performing the actual migration.
> >  
> > +The following options are specific to 'git refs verify':
> > +
> > +--strict::
> > +	Enable more strict checking, every WARN severity for the `Fsck Messages`
> > +	be seen as ERROR. See linkgit:git-fsck[1].
> 
> How about:
> 
>     "Enable stricter error checking. This will cause warnings to be
>     reported as errors. See linkgit:git-fsck[1]."
> 

Yes, that is much clearer. Actually, I find it really hard to write good
documentation.

> > +--verbose::
> > +	When verifying the reference database consistency, be chatty.
> 
> I wonder whether this really helps all that much. It doesn't really say
> what it adds on top of the default mode. So unless we document what
> exactly this changes, I rather think we can just leave it aways as
> basically everyone knows what a "--verbose" flag does.
> 

Yes, I think so. `--verbose` is a common flag. However, we have already
added this, so we may just leave it here. It's not bad to add more
information.

> > +static int cmd_refs_verify(int argc, const char **argv, const char *prefix)
> > +{
> > +	struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;
> 
> So we don't ever end up using `FSCK_REFS_OPTIONS_STRICT`? If so, I think
> we should just drop that declaration in the preceding patch.
> 

I agree here. I will delete `FSCK_REFS_OPTIONS_STRICT`.

> > +	const char * const verify_usage[] = {
> > +		REFS_VERIFY_USAGE,
> > +		NULL,
> > +	};
> > +	unsigned int verbose = 0, strict = 0;
> > +	struct option options[] = {
> > +		OPT__VERBOSE(&verbose, N_("be verbose")),
> > +		OPT_BOOL(0, "strict", &strict, N_("enable strict checking")),
> > +		OPT_END(),
> > +	};
> > +	int ret;
> > +
> > +	argc = parse_options(argc, argv, prefix, options, verify_usage, 0);
> > +	if (argc)
> > +		usage(_("'git refs verify' takes no arguments"));
> > +
> > +	if (verbose)
> > +		fsck_refs_options.verbose = 1;
> > +	if (strict)
> > +		fsck_refs_options.strict = 1;
> 
> Instead of manually setting those variables, we can pass pointers to
> those member variables in the `struct option`s directly.
> 

Yes. I tried that, but found that the types do not match. I will
find a way to do this.

> > +	git_config(git_fsck_config, &fsck_refs_options);
> > +	prepare_repo_settings(the_repository);
> > +
> > +	ret = refs_fsck(get_main_ref_store(the_repository), &fsck_refs_options);
> > +
> > +	/*
> > +	 * Explicitly free the allocated array and "skip_oids" set
> > +	 */
> > +	free(fsck_refs_options.msg_type);
> > +	oidset_clear(&fsck_refs_options.skip_oids);
> 
> Should we provide a `fsck_options_clear()` function that does this for
> us? Otherwise we'll have to adapt callsites of `refs_fsck` whenever
> internal implementation details of the subsystem add newly allocated
> members.
> 

Yes, I agree with this. I want to say a bit more about it. At first, I
did not call `oidset_clear`, and the CI tests failed. That confused me:
since we never use "skip_oids" in the ref check, why did the tests say
that "fsck_refs_options.skip_oids" was not freed?

This is because when executing the command "git -c fsck.skipList=.. fsck",
in the subprocess `git refs verify`, the code will still setup the
"skip_oids" by the config. So we should explicitly free the "skip_oids".

But how does "fsck.c" free "skip_oids"? Actually, "fsck.c" never frees
"skip_oids". This is because "git-fsck(1)" defines the following:

  static struct fsck_options fsck_walk_options = FSCK_OPTIONS_DEFAULT;
  static struct fsck_options fsck_obj_options = FSCK_OPTIONS_DEFAULT;

Because these two options are "static", there is no memory leak. We
leave it to the operating system. So maybe a simpler way is just to
add the "static" keyword in "cmd_refs_verify", which means:

  - struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;
  + static struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;

But I don't think we should use `static`, because Eric has told me that
making a variable "static" will make the code harder to "libify". So
let's use a "fsck_options_clear" function instead.
Eric Sunshine July 30, 2024, 5:56 p.m. UTC | #3
On Tue, Jul 30, 2024 at 11:59 AM shejialuo <shejialuo@gmail.com> wrote:
> On Tue, Jul 30, 2024 at 10:31:37AM +0200, Patrick Steinhardt wrote:
> > On Mon, Jul 29, 2024 at 09:27:12PM +0800, shejialuo wrote:
> > > +   /*
> > > +    * Explicitly free the allocated array and "skip_oids" set
> > > +    */
> > > +   free(fsck_refs_options.msg_type);
> > > +   oidset_clear(&fsck_refs_options.skip_oids);
> >
> > Should we provide a `fsck_options_clear()` function that does this for
> > us? Otherwise we'll have to adapt callsites of `refs_fsck` whenever
> > internal implementation details of the subsystem add newly allocated
> > members.
> [...]
> But how does "fsck.c" free "skip_oids", actually "fsck.c" never frees
> "skip_oids". This is because "git-fsck(1)" defines the following:
>
>   static struct fsck_options fsck_walk_options = FSCK_OPTIONS_DEFAULT;
>   static struct fsck_options fsck_obj_options = FSCK_OPTIONS_DEFAULT;
>
> Because these two options are "static", so there is no memory leak. We
> leave it to the operating system. So maybe a more simple way is just to
> add "static" identifier in "cmd_refs_verify" which means:
>
>   - struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;
>   + static struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;
>
> But I don't think we should use `static`, because Eric has told me that
> making a variable "static" will make the code harder to "libfy". So
> let's use "fsck_options_clear" function instead.

I haven't been following this topic closely and I'm not familiar with
this code (and don't have much time now to dig into it), but I suspect
the context here is rather different from the one[*] in which I was
highly skeptical of the use of `static`. The `static` in that earlier
case was suspicious/questionable for two reasons. First, it was a case
of premature optimization (which, by definition, is frowned upon).
Second, it was in a "library" function (namely, top-level
fsck.c:fsck_refs_error_function()) which may someday become a linkable
library which other programs (aside from `git` itself) may utilize.
Having a static strbuf in the library function makes the function
non-reentrant and takes memory management out of the hands of the
client.

In the case under discussion here (namely `builtin/fsck.c`), it is a
Git-specific command, not library code. As such, "libification" is
much less of an issue since Git-specific command code is less likely
to be reused by some other project. (However, that's not to say that
we shouldn't worry about unnecessary use of `static` even in builtin
commands; code from those commands does periodically migrate from
`builtin/*.c` to top-level library oriented `*.c`.)

So, considering that the variable under discussion:

    struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;

is part of a builtin command rather than library code, we don't have
to worry about "libification" so much, thus making it `static` would
be a workable approach. However, doing so merely to avoid complaint by
the leak-checker does not seem like good justification. Hence, keeping
this variable non-static and freeing it explicitly seems a better idea
(which is what this code does presently).

I do agree with Patrick that adding fsck_options_clear() to top-level
`fsck.h` would be sensible since it frees callers from having to know
implementation details of `fsck_options`.

By the way, regarding the static `fsck_walk_options` and
`fsck_obj_options`, those are probably global static for convenience
rather than out of necessity. It might very well be possible to make
those local variables in builtin/fsck.c:cmd_fsck() and then plumb them
through to called functions so that they don't have to be static, and
then they would be freed manually by cmd_fsck(), as well. However,
that sort of change is well outside the scope of this topic.

[*] https://lore.kernel.org/git/CAPig+cR=RgMeaAy1PRGgHu6_Ak+7=_-5tGvBZRekKRxi7GtdHw@mail.gmail.com/
diff mbox series

Patch

diff --git a/Documentation/git-refs.txt b/Documentation/git-refs.txt
index 5b99e04385..1244a85b64 100644
--- a/Documentation/git-refs.txt
+++ b/Documentation/git-refs.txt
@@ -10,6 +10,7 @@  SYNOPSIS
 --------
 [verse]
 'git refs migrate' --ref-format=<format> [--dry-run]
+'git refs verify' [--strict] [--verbose]
 
 DESCRIPTION
 -----------
@@ -22,6 +23,9 @@  COMMANDS
 migrate::
 	Migrate ref store between different formats.
 
+verify::
+	Verify reference database consistency.
+
 OPTIONS
 -------
 
@@ -39,6 +43,15 @@  include::ref-storage-format.txt[]
 	can be used to double check that the migration works as expected before
 	performing the actual migration.
 
+The following options are specific to 'git refs verify':
+
+--strict::
+	Enable more strict checking, every WARN severity for the `Fsck Messages`
+	be seen as ERROR. See linkgit:git-fsck[1].
+
+--verbose::
+	When verifying the reference database consistency, be chatty.
+
 KNOWN LIMITATIONS
 -----------------
 
diff --git a/builtin/refs.c b/builtin/refs.c
index 46dcd150d4..4831c9e28e 100644
--- a/builtin/refs.c
+++ b/builtin/refs.c
@@ -1,4 +1,6 @@ 
 #include "builtin.h"
+#include "config.h"
+#include "fsck.h"
 #include "parse-options.h"
 #include "refs.h"
 #include "repository.h"
@@ -7,6 +9,9 @@ 
 #define REFS_MIGRATE_USAGE \
 	N_("git refs migrate --ref-format=<format> [--dry-run]")
 
+#define REFS_VERIFY_USAGE \
+	N_("git refs verify [--strict] [--verbose]")
+
 static int cmd_refs_migrate(int argc, const char **argv, const char *prefix)
 {
 	const char * const migrate_usage[] = {
@@ -58,15 +63,54 @@  static int cmd_refs_migrate(int argc, const char **argv, const char *prefix)
 	return err;
 }
 
+static int cmd_refs_verify(int argc, const char **argv, const char *prefix)
+{
+	struct fsck_options fsck_refs_options = FSCK_REFS_OPTIONS_DEFAULT;
+	const char * const verify_usage[] = {
+		REFS_VERIFY_USAGE,
+		NULL,
+	};
+	unsigned int verbose = 0, strict = 0;
+	struct option options[] = {
+		OPT__VERBOSE(&verbose, N_("be verbose")),
+		OPT_BOOL(0, "strict", &strict, N_("enable strict checking")),
+		OPT_END(),
+	};
+	int ret;
+
+	argc = parse_options(argc, argv, prefix, options, verify_usage, 0);
+	if (argc)
+		usage(_("'git refs verify' takes no arguments"));
+
+	if (verbose)
+		fsck_refs_options.verbose = 1;
+	if (strict)
+		fsck_refs_options.strict = 1;
+
+	git_config(git_fsck_config, &fsck_refs_options);
+	prepare_repo_settings(the_repository);
+
+	ret = refs_fsck(get_main_ref_store(the_repository), &fsck_refs_options);
+
+	/*
+	 * Explicitly free the allocated array and "skip_oids" set
+	 */
+	free(fsck_refs_options.msg_type);
+	oidset_clear(&fsck_refs_options.skip_oids);
+	return ret;
+}
+
 int cmd_refs(int argc, const char **argv, const char *prefix)
 {
 	const char * const refs_usage[] = {
 		REFS_MIGRATE_USAGE,
+		REFS_VERIFY_USAGE,
 		NULL,
 	};
 	parse_opt_subcommand_fn *fn = NULL;
 	struct option opts[] = {
 		OPT_SUBCOMMAND("migrate", &fn, cmd_refs_migrate),
+		OPT_SUBCOMMAND("verify", &fn, cmd_refs_verify),
 		OPT_END(),
 	};
 
diff --git a/fsck.h b/fsck.h
index a4a4ba88ee..b03dba442e 100644
--- a/fsck.h
+++ b/fsck.h
@@ -155,6 +155,7 @@  struct fsck_options {
 	fsck_walk_func walk;
 	fsck_error error_func;
 	unsigned strict:1;
+	unsigned verbose:1;
 	enum fsck_msg_type *msg_type;
 	struct oidset skip_oids;
 	struct oidset gitmodules_found;