diff mbox series

[v2,2/4] SANITIZE tests: fix memory leaks in t13*config*, add to whitelist

Message ID patch-2.4-867e8e9a6c-20210714T172251Z-avarab@gmail.com (mailing list archive)
State New, archived
Headers show
Series add a test mode for SANITIZE=leak, run it in CI | expand

Commit Message

Ævar Arnfjörð Bjarmason July 14, 2021, 5:23 p.m. UTC
Fix a couple of trivial memory leaks introduced in 3efd0bedc6 (config:
add conditional include, 2017-03-01) and my own 867ad08a26 (hooks:
allow customizing where the hook directory is, 2016-05-04).

In the latter case the "fix" is UNLEAK() on the global variable. This
allows us to run all t13*config* tests under SANITIZE=leak.

With this change we can now run almost the whole set of config.c
tests (t13*config) under SANITIZE=leak, so let's do so, with a few
exceptions:

 * The test added in ce81b1da23 (config: add new way to pass config
   via `--config-env`, 2021-01-12), it fails in GitHub CI, but passes
   for me locally. Let's just skip it for now.

 * Ditto the split_cmdline and "aliases of builtins" tests, the former
   required splitting up an existing test, there an issue with the test
   that would have also been revealed by skipping it.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 config.c          | 17 ++++++++++++-----
 t/t1300-config.sh | 16 ++++++++++------
 t/test-lib.sh     |  1 +
 3 files changed, 23 insertions(+), 11 deletions(-)

Comments

Andrzej Hunt July 14, 2021, 6:57 p.m. UTC | #1
On 14/07/2021 19:23, Ævar Arnfjörð Bjarmason wrote:
> Fix a couple of trivial memory leaks introduced in 3efd0bedc6 (config:
> add conditional include, 2017-03-01) and my own 867ad08a26 (hooks:
> allow customizing where the hook directory is, 2016-05-04).
> 
> In the latter case the "fix" is UNLEAK() on the global variable. This
> allows us to run all t13*config* tests under SANITIZE=leak.
> 
> With this change we can now run almost the whole set of config.c
> tests (t13*config) under SANITIZE=leak, so let's do so, with a few
> exceptions:
> 
>   * The test added in ce81b1da23 (config: add new way to pass config
>     via `--config-env`, 2021-01-12), it fails in GitHub CI, but passes
>     for me locally. Let's just skip it for now.
> 
>   * Ditto the split_cmdline and "aliases of builtins" tests, the former
>     required splitting up an existing test, there an issue with the test
>     that would have also been revealed by skipping it.
> 
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>   config.c          | 17 ++++++++++++-----
>   t/t1300-config.sh | 16 ++++++++++------
>   t/test-lib.sh     |  1 +
>   3 files changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/config.c b/config.c
> index f9c400ad30..38e132c0e2 100644
> --- a/config.c
> +++ b/config.c
> @@ -138,8 +138,10 @@ static int handle_path_include(const char *path, struct config_include_data *inc
>   		return config_error_nonbool("include.path");
>   
>   	expanded = expand_user_path(path, 0);
> -	if (!expanded)
> -		return error(_("could not expand include path '%s'"), path);
> +	if (!expanded) {
> +		ret = error(_("could not expand include path '%s'"), path);
> +		goto cleanup;
> +	}
>   	path = expanded;
>   
>   	/*
> @@ -149,8 +151,10 @@ static int handle_path_include(const char *path, struct config_include_data *inc
>   	if (!is_absolute_path(path)) {
>   		char *slash;
>   
> -		if (!cf || !cf->path)
> -			return error(_("relative config includes must come from files"));
> +		if (!cf || !cf->path) {
> +			ret = error(_("relative config includes must come from files"));
> +			goto cleanup;
> +		}
>   
>   		slash = find_last_dir_sep(cf->path);
>   		if (slash)
> @@ -168,6 +172,7 @@ static int handle_path_include(const char *path, struct config_include_data *inc
>   		ret = git_config_from_file(git_config_include, path, inc);
>   		inc->depth--;
>   	}
> +cleanup:
>   	strbuf_release(&buf);
>   	free(expanded);
>   	return ret;
> @@ -1331,8 +1336,10 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>   	if (!strcmp(var, "core.attributesfile"))
>   		return git_config_pathname(&git_attributes_file, var, value);
>   
> -	if (!strcmp(var, "core.hookspath"))
> +	if (!strcmp(var, "core.hookspath")) {
> +		UNLEAK(git_hooks_path);
>   		return git_config_pathname(&git_hooks_path, var, value);
> +	}

Why is the UNLEAK necessary here? We generally want to limit use of 
UNLEAK to cmd_* functions or direct helpers. git_default_core_config() 
seems generic enough that it could be called from anywhere, and using 
UNLEAK here means we're potentially masking a real leak?

IIUC the leak here happens because:
- git_hooks_path is a global variable - hence it's unlikely we'd ever
   bother cleaning it up.
- git_default_core_config() gets called a first time with
   core.hookspath, and we end up allocating new memory into
   git_hooks_path.
- git_default_core_config() gets called again with core.hookspath,
   and we overwrite git_hooks_path with a new string which leaks
   the string that git_hooks_path used to point to.

So I think the real fix is to free(git_hooks_path) instead of an UNLEAK? 
(Looking at the surrounding code, it looks like the same pattern of leak 
might be repeated for other similar globals - is it worth auditing those 
while we're here?)

>   
>   	if (!strcmp(var, "core.bare")) {
>   		is_bare_repository_cfg = git_config_bool(var, value);
> diff --git a/t/t1300-config.sh b/t/t1300-config.sh
> index 9ff46f3b04..93ad0f4887 100755
> --- a/t/t1300-config.sh
> +++ b/t/t1300-config.sh
> @@ -1050,12 +1050,16 @@ test_expect_success SYMLINKS 'symlink to nonexistent configuration' '
>   	test_must_fail git config --file=linktolinktonada --list
>   '
>   
> -test_expect_success 'check split_cmdline return' "
> -	git config alias.split-cmdline-fix 'echo \"' &&
> -	test_must_fail git split-cmdline-fix &&
> +test_expect_success 'setup check split_cmdline return' "
>   	echo foo > foo &&
>   	git add foo &&
> -	git commit -m 'initial commit' &&
> +	git commit -m 'initial commit'
> +"
> +
> +test_expect_success !SANITIZE_LEAK 'check split_cmdline return' "
> +	git config alias.split-cmdline-fix 'echo \"' &&
> +	test_must_fail git split-cmdline-fix &&
> +
>   	git config branch.main.mergeoptions 'echo \"' &&
>   	test_must_fail git merge main
>   "
> @@ -1101,7 +1105,7 @@ test_expect_success 'key sanity-checking' '
>   	git config foo."ba =z".bar false
>   '
>   
> -test_expect_success 'git -c works with aliases of builtins' '
> +test_expect_success !SANITIZE_LEAK 'git -c works with aliases of builtins' '
>   	git config alias.checkconfig "-c foo.check=bar config foo.check" &&
>   	echo bar >expect &&
>   	git checkconfig >actual &&
> @@ -1397,7 +1401,7 @@ test_expect_success 'git --config-env with missing value' '
>   	grep "invalid config format: config" error
>   '
>   
> -test_expect_success 'git --config-env fails with invalid parameters' '
> +test_expect_success !SANITIZE_LEAK 'git --config-env fails with invalid parameters' '
>   	test_must_fail git --config-env=foo.flag config --bool foo.flag 2>error &&
>   	test_i18ngrep "invalid config format: foo.flag" error &&
>   	test_must_fail git --config-env=foo.flag= config --bool foo.flag 2>error &&
> diff --git a/t/test-lib.sh b/t/test-lib.sh
> index 9201510e16..98e20950c3 100644
> --- a/t/test-lib.sh
> +++ b/t/test-lib.sh
> @@ -1370,6 +1370,7 @@ maybe_skip_all_sanitize_leak () {
>   	add_sanitize_leak_true 't000*'
>   	add_sanitize_leak_true 't001*'
>   	add_sanitize_leak_true 't006*'
> +	add_sanitize_leak_true 't13*config*'
>   
>   	# Blacklist patterns (overrides whitelist)
>   	add_sanitize_leak_false 't000[469]*'
>
Ævar Arnfjörð Bjarmason July 14, 2021, 10:56 p.m. UTC | #2
On Wed, Jul 14 2021, Andrzej Hunt wrote:

> On 14/07/2021 19:23, Ævar Arnfjörð Bjarmason wrote:
>> Fix a couple of trivial memory leaks introduced in 3efd0bedc6 (config:
>> add conditional include, 2017-03-01) and my own 867ad08a26 (hooks:
>> allow customizing where the hook directory is, 2016-05-04).
>> In the latter case the "fix" is UNLEAK() on the global
>> variable. This
>> allows us to run all t13*config* tests under SANITIZE=leak.
>> With this change we can now run almost the whole set of config.c
>> tests (t13*config) under SANITIZE=leak, so let's do so, with a few
>> exceptions:
>>   * The test added in ce81b1da23 (config: add new way to pass config
>>     via `--config-env`, 2021-01-12), it fails in GitHub CI, but passes
>>     for me locally. Let's just skip it for now.
>>   * Ditto the split_cmdline and "aliases of builtins" tests, the
>> former
>>     required splitting up an existing test, there an issue with the test
>>     that would have also been revealed by skipping it.
>> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
>> ---
>>   config.c          | 17 ++++++++++++-----
>>   t/t1300-config.sh | 16 ++++++++++------
>>   t/test-lib.sh     |  1 +
>>   3 files changed, 23 insertions(+), 11 deletions(-)
>> diff --git a/config.c b/config.c
>> index f9c400ad30..38e132c0e2 100644
>> --- a/config.c
>> +++ b/config.c
>> @@ -138,8 +138,10 @@ static int handle_path_include(const char *path, struct config_include_data *inc
>>   		return config_error_nonbool("include.path");
>>     	expanded = expand_user_path(path, 0);
>> -	if (!expanded)
>> -		return error(_("could not expand include path '%s'"), path);
>> +	if (!expanded) {
>> +		ret = error(_("could not expand include path '%s'"), path);
>> +		goto cleanup;
>> +	}
>>   	path = expanded;
>>     	/*
>> @@ -149,8 +151,10 @@ static int handle_path_include(const char *path, struct config_include_data *inc
>>   	if (!is_absolute_path(path)) {
>>   		char *slash;
>>   -		if (!cf || !cf->path)
>> -			return error(_("relative config includes must come from files"));
>> +		if (!cf || !cf->path) {
>> +			ret = error(_("relative config includes must come from files"));
>> +			goto cleanup;
>> +		}
>>     		slash = find_last_dir_sep(cf->path);
>>   		if (slash)
>> @@ -168,6 +172,7 @@ static int handle_path_include(const char *path, struct config_include_data *inc
>>   		ret = git_config_from_file(git_config_include, path, inc);
>>   		inc->depth--;
>>   	}
>> +cleanup:
>>   	strbuf_release(&buf);
>>   	free(expanded);
>>   	return ret;
>> @@ -1331,8 +1336,10 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>>   	if (!strcmp(var, "core.attributesfile"))
>>   		return git_config_pathname(&git_attributes_file, var, value);
>>   -	if (!strcmp(var, "core.hookspath"))
>> +	if (!strcmp(var, "core.hookspath")) {
>> +		UNLEAK(git_hooks_path);
>>   		return git_config_pathname(&git_hooks_path, var, value);
>> +	}
>
> Why is the UNLEAK necessary here? We generally want to limit use of
> UNLEAK to cmd_* functions or direct helpers. git_default_core_config() 
> seems generic enough that it could be called from anywhere, and using
> UNLEAK here means we're potentially masking a real leak?
>
> IIUC the leak here happens because:
> - git_hooks_path is a global variable - hence it's unlikely we'd ever
>   bother cleaning it up.
> - git_default_core_config() gets called a first time with
>   core.hookspath, and we end up allocating new memory into
>   git_hooks_path.
> - git_default_core_config() gets called again with core.hookspath,
>   and we overwrite git_hooks_path with a new string which leaks
>   the string that git_hooks_path used to point to.
>
> So I think the real fix is to free(git_hooks_path) instead of an
> UNLEAK? (Looking at the surrounding code, it looks like the same
> pattern of leak might be repeated for other similar globals - is it
> worth auditing those while we're here?)

Good point, I'll fix that.

I was doing this rather blindly to see if I could get this larg batch of
tests to pass with some a minimal fixes/whitelisting of some "known
bad".

>>     	if (!strcmp(var, "core.bare")) {
>>   		is_bare_repository_cfg = git_config_bool(var, value);
>> diff --git a/t/t1300-config.sh b/t/t1300-config.sh
>> index 9ff46f3b04..93ad0f4887 100755
>> --- a/t/t1300-config.sh
>> +++ b/t/t1300-config.sh
>> @@ -1050,12 +1050,16 @@ test_expect_success SYMLINKS 'symlink to nonexistent configuration' '
>>   	test_must_fail git config --file=linktolinktonada --list
>>   '
>>   -test_expect_success 'check split_cmdline return' "
>> -	git config alias.split-cmdline-fix 'echo \"' &&
>> -	test_must_fail git split-cmdline-fix &&
>> +test_expect_success 'setup check split_cmdline return' "
>>   	echo foo > foo &&
>>   	git add foo &&
>> -	git commit -m 'initial commit' &&
>> +	git commit -m 'initial commit'
>> +"
>> +
>> +test_expect_success !SANITIZE_LEAK 'check split_cmdline return' "
>> +	git config alias.split-cmdline-fix 'echo \"' &&
>> +	test_must_fail git split-cmdline-fix &&
>> +
>>   	git config branch.main.mergeoptions 'echo \"' &&
>>   	test_must_fail git merge main
>>   "
>> @@ -1101,7 +1105,7 @@ test_expect_success 'key sanity-checking' '
>>   	git config foo."ba =z".bar false
>>   '
>>   -test_expect_success 'git -c works with aliases of builtins' '
>> +test_expect_success !SANITIZE_LEAK 'git -c works with aliases of builtins' '
>>   	git config alias.checkconfig "-c foo.check=bar config foo.check" &&
>>   	echo bar >expect &&
>>   	git checkconfig >actual &&
>> @@ -1397,7 +1401,7 @@ test_expect_success 'git --config-env with missing value' '
>>   	grep "invalid config format: config" error
>>   '
>>   -test_expect_success 'git --config-env fails with invalid
>> parameters' '
>> +test_expect_success !SANITIZE_LEAK 'git --config-env fails with invalid parameters' '
>>   	test_must_fail git --config-env=foo.flag config --bool foo.flag 2>error &&
>>   	test_i18ngrep "invalid config format: foo.flag" error &&
>>   	test_must_fail git --config-env=foo.flag= config --bool foo.flag 2>error &&
>> diff --git a/t/test-lib.sh b/t/test-lib.sh
>> index 9201510e16..98e20950c3 100644
>> --- a/t/test-lib.sh
>> +++ b/t/test-lib.sh
>> @@ -1370,6 +1370,7 @@ maybe_skip_all_sanitize_leak () {
>>   	add_sanitize_leak_true 't000*'
>>   	add_sanitize_leak_true 't001*'
>>   	add_sanitize_leak_true 't006*'
>> +	add_sanitize_leak_true 't13*config*'
>>     	# Blacklist patterns (overrides whitelist)
>>   	add_sanitize_leak_false 't000[469]*'
>>
Jeff King July 15, 2021, 9:42 p.m. UTC | #3
On Wed, Jul 14, 2021 at 08:57:37PM +0200, Andrzej Hunt wrote:

> > @@ -1331,8 +1336,10 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
> >   	if (!strcmp(var, "core.attributesfile"))
> >   		return git_config_pathname(&git_attributes_file, var, value);
> > -	if (!strcmp(var, "core.hookspath"))
> > +	if (!strcmp(var, "core.hookspath")) {
> > +		UNLEAK(git_hooks_path);
> >   		return git_config_pathname(&git_hooks_path, var, value);
> > +	}
> 
> Why is the UNLEAK necessary here? We generally want to limit use of UNLEAK
> to cmd_* functions or direct helpers. git_default_core_config() seems
> generic enough that it could be called from anywhere, and using UNLEAK here
> means we're potentially masking a real leak?
> 
> IIUC the leak here happens because:
> - git_hooks_path is a global variable - hence it's unlikely we'd ever
>   bother cleaning it up.
> - git_default_core_config() gets called a first time with
>   core.hookspath, and we end up allocating new memory into
>   git_hooks_path.
> - git_default_core_config() gets called again with core.hookspath,
>   and we overwrite git_hooks_path with a new string which leaks
>   the string that git_hooks_path used to point to.
> 
> So I think the real fix is to free(git_hooks_path) instead of an UNLEAK?
> (Looking at the surrounding code, it looks like the same pattern of leak
> might be repeated for other similar globals - is it worth auditing those
> while we're here?)

This is a common leak pattern in Git. We do something like:

  static const char *foo = "default";
  ...
  int config_cb(const char *var, const char *value, void *)
  {
          if (!strcmp(var, "core.foo"))
	          foo = xstrdup(value);
  }

So we leak if the variable appears twice. But we can't just call
"free(foo)" here. In the first call, it's pointing to a string literal!

In the case of git_hooks_path, it defaults to NULL, so this works out
OK. But it's setting up a trap for somebody later on, who assigns it a
default value (and the compiler won't help; it's a "const char *", so
the assignment is fine, and the free() would already be casting away the
constness).

I see a few possible solutions:

  - instead of strdup'ing long-lived config values, strintern() them.
    This is really leaking them, but in a way that we hold on to the old
    values. This is actually more or less what UNLEAK() is doing under
    the hood (saving a reference to the old buffer, even the variable is
    overwritten).

  - find a way to tell when a string comes from the heap versus a
    literal. I don't think you can do this portably without keeping your
    own separate flag. We could abstract away some of the pain with a
    struct like:

       struct def_string {
               /* might point to heap memory; const because you must
                * check flag before modifying */
               const char *value;
               int from_heap;
       }

       /* regular static initialization is OK if you don't want a default */
       #define DEF_STRING_INIT(str) { .value = str }

       static void def_string_set(struct def_string *ds, const char *value)
       {
               if (ds->from_heap)
                       free(ds->value);
               ds->value = xstrdup(value);
               ds->from_heap = 1;
       }

    The annoying thing is all of the users need to refer to
    git_hook_path.value instead of just git_hook_path. If you don't mind
    a little macro hackery, we could get around that by declaring pairs
    of variables. Like:

      #define DEF_STRING_DECLARE(name, value) \
      const char *name = value; \
      int name##_from_heap

      #define DEF_STRING_SET(name, value) do { \
              if (name##_from_heap) \
                      free(name); \
              name = xstrdup(value); \
              name##_from_heap = 1; \
      } while(0)

I can't say I _love_ any of that, but I think it would work (and
probably we'd adapt our helpers like git_config_pathname() to take a
def_string. Or I guess just have a def_string_free() which can be called
before writing into them).

But maybe there's a better solution I'm missing.

-Peff
Andrzej Hunt July 16, 2021, 5:18 a.m. UTC | #4
On 15/07/2021 23:42, Jeff King wrote:
> On Wed, Jul 14, 2021 at 08:57:37PM +0200, Andrzej Hunt wrote:
> 
>>> @@ -1331,8 +1336,10 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>>>    	if (!strcmp(var, "core.attributesfile"))
>>>    		return git_config_pathname(&git_attributes_file, var, value);
>>> -	if (!strcmp(var, "core.hookspath"))
>>> +	if (!strcmp(var, "core.hookspath")) {
>>> +		UNLEAK(git_hooks_path);
>>>    		return git_config_pathname(&git_hooks_path, var, value);
>>> +	}
>>
>> Why is the UNLEAK necessary here? We generally want to limit use of UNLEAK
>> to cmd_* functions or direct helpers. git_default_core_config() seems
>> generic enough that it could be called from anywhere, and using UNLEAK here
>> means we're potentially masking a real leak?
>>
>> IIUC the leak here happens because:
>> - git_hooks_path is a global variable - hence it's unlikely we'd ever
>>    bother cleaning it up.
>> - git_default_core_config() gets called a first time with
>>    core.hookspath, and we end up allocating new memory into
>>    git_hooks_path.
>> - git_default_core_config() gets called again with core.hookspath,
>>    and we overwrite git_hooks_path with a new string which leaks
>>    the string that git_hooks_path used to point to.
>>
>> So I think the real fix is to free(git_hooks_path) instead of an UNLEAK?
>> (Looking at the surrounding code, it looks like the same pattern of leak
>> might be repeated for other similar globals - is it worth auditing those
>> while we're here?)
> 
> This is a common leak pattern in Git. We do something like:
> 
>    static const char *foo = "default";
>    ...
>    int config_cb(const char *var, const char *value, void *)
>    {
>            if (!strcmp(var, "core.foo"))
> 	          foo = xstrdup(value);
>    }
> 
> So we leak if the variable appears twice. But we can't just call
> "free(foo)" here. In the first call, it's pointing to a string literal!
> 
> In the case of git_hooks_path, it defaults to NULL, so this works out
> OK. But it's setting up a trap for somebody later on, who assigns it a
> default value (and the compiler won't help; it's a "const char *", so
> the assignment is fine, and the free() would already be casting away the
> constness).

Ah, right. I didn't think about the risk of future breakages.

> 
> I see a few possible solutions:
>  [...]
> I can't say I _love_ any of that, but I think it would work (and
> probably we'd adapt our helpers like git_config_pathname() to take a
> def_string. Or I guess just have a def_string_free() which can be called
> before writing into them).

Is it worth sidestepping the whole globals issue by migrating 
core.hookspath (and other string config values) to be fetched via 
git_config_get_pathname() and equivalents at the point of use instead?

I looked at the commit below which introduced git_config_get* which 
suggests that these methods were indeed intended to be an improvement 
over the callback based API, and IIUC switching over should have a bunch 
of advantages:
  - Removes some potential bugs that can happen if git_config() was never
    called with the right callback.
  - Potentially reduces the number of times we have to iterate over the
    config in the first place (assuming we migrate *all* config access
    and not just strings).
  - Fewer globals - which reduces potential for such leaks (and probably
    makes it easier to read the code in the first place).
OTOH I'm not familiar enough with this code to know what the 
disadvantages of such a migration might be (it's definitely going to be 
a lot of work... but that's going to apply to any of the approaches we 
can choose to fix these leaks).

git_config_get* were introduced in:
   3c8687a73e (add `config_set` API for caching config-like files, 
2014-07-28)
Ævar Arnfjörð Bjarmason July 16, 2021, 7:46 a.m. UTC | #5
On Thu, Jul 15 2021, Jeff King wrote:

> On Wed, Jul 14, 2021 at 08:57:37PM +0200, Andrzej Hunt wrote:
>
>> > @@ -1331,8 +1336,10 @@ static int git_default_core_config(const char *var, const char *value, void *cb)
>> >   	if (!strcmp(var, "core.attributesfile"))
>> >   		return git_config_pathname(&git_attributes_file, var, value);
>> > -	if (!strcmp(var, "core.hookspath"))
>> > +	if (!strcmp(var, "core.hookspath")) {
>> > +		UNLEAK(git_hooks_path);
>> >   		return git_config_pathname(&git_hooks_path, var, value);
>> > +	}
>> 
>> Why is the UNLEAK necessary here? We generally want to limit use of UNLEAK
>> to cmd_* functions or direct helpers. git_default_core_config() seems
>> generic enough that it could be called from anywhere, and using UNLEAK here
>> means we're potentially masking a real leak?
>> 
>> IIUC the leak here happens because:
>> - git_hooks_path is a global variable - hence it's unlikely we'd ever
>>   bother cleaning it up.
>> - git_default_core_config() gets called a first time with
>>   core.hookspath, and we end up allocating new memory into
>>   git_hooks_path.
>> - git_default_core_config() gets called again with core.hookspath,
>>   and we overwrite git_hooks_path with a new string which leaks
>>   the string that git_hooks_path used to point to.
>> 
>> So I think the real fix is to free(git_hooks_path) instead of an UNLEAK?
>> (Looking at the surrounding code, it looks like the same pattern of leak
>> might be repeated for other similar globals - is it worth auditing those
>> while we're here?)
>
> This is a common leak pattern in Git. We do something like:
>
>   static const char *foo = "default";
>   ...
>   int config_cb(const char *var, const char *value, void *)
>   {
>           if (!strcmp(var, "core.foo"))
> 	          foo = xstrdup(value);
>   }
>
> So we leak if the variable appears twice. But we can't just call
> "free(foo)" here. In the first call, it's pointing to a string literal!
>
> In the case of git_hooks_path, it defaults to NULL, so this works out
> OK. But it's setting up a trap for somebody later on, who assigns it a
> default value (and the compiler won't help; it's a "const char *", so
> the assignment is fine, and the free() would already be casting away the
> constness).
>
> I see a few possible solutions:
>
>   - instead of strdup'ing long-lived config values, strintern() them.
>     This is really leaking them, but in a way that we hold on to the old
>     values. This is actually more or less what UNLEAK() is doing under
>     the hood (saving a reference to the old buffer, even the variable is
>     overwritten).
>
>   - find a way to tell when a string comes from the heap versus a
>     literal. I don't think you can do this portably without keeping your
>     own separate flag. We could abstract away some of the pain with a
>     struct like:
>
>        struct def_string {
>                /* might point to heap memory; const because you must
>                 * check flag before modifying */
>                const char *value;
>                int from_heap;
>        }
>
>        /* regular static initialization is OK if you don't want a default */
>        #define DEF_STRING_INIT(str) { .value = str }
>
>        static void def_string_set(struct def_string *ds, const char *value)
>        {
>                if (ds->from_heap)
>                        free(ds->value);
>                ds->value = xstrdup(value);
>                ds->from_heap = 1;
>        }
>
>     The annoying thing is all of the users need to refer to
>     git_hook_path.value instead of just git_hook_path. If you don't mind
>     a little macro hackery, we could get around that by declaring pairs
>     of variables. Like:
>
>       #define DEF_STRING_DECLARE(name, value) \
>       const char *name = value; \
>       int name##_from_heap
>
>       #define DEF_STRING_SET(name, value) do { \
>               if (name##_from_heap) \
>                       free(name); \
>               name = xstrdup(value); \
>               name##_from_heap = 1; \
>       } while(0)
>
> I can't say I _love_ any of that, but I think it would work (and
> probably we'd adapt our helpers like git_config_pathname() to take a
> def_string. Or I guess just have a def_string_free() which can be called
> before writing into them).
>
> But maybe there's a better solution I'm missing.

Instead of: "int from_heap" in your "def_string" I think we should just
use "struct string_list_item". I.e. you want a void* here. Why?

<Digression>

I have an unsent series for handling some more common cases in the
string-list API. I started writing it due to a very related problem,
i.e. that we conflate "string init dup/nodup" with "do we want to
free?".

We (ab)use the "strdup_strings" in a few places to free that sort of
thing at the end if we have heap-allocated strings, but ones we did not
strdup ourselves, e.g. this in merge-ort.c (not picking on Elijah (CC'd)
here, it's common in lots of places, and this one was pretty much lifted
from merge-recursive).

        opti->paths_to_free.strdup_strings = 1;
        string_list_clear(&opti->paths_to_free, 0);
        opti->paths_to_free.strdup_strings = 0;

So I improved the string-list and strmap free functions so you can
instead do:

    string_list_clear_strings((&opti->paths_to_free, 0);

And that along with some other changes allows you to clear (or not) any
combination of the string, util, or have a callback function of your own
run (but be ensured to run all of those before we get to any of the
other freeing).

</Digression>

You must be thinking what any of this has to do with heap strings in C,
well one common case you've not discussed is that we sometimes do the
equivalent of, with string-list.h or not (somewhat pseudocode);

	void add_to_list(struct string_list *list, char *on_heap_now_we_own_it)
	{
		char *ptr = on_heap_now_we_own_it;
		char *mydup = xstrdup("foo");

	        ptr++; /* skip first byte */
		string_list_append(list, ptr);
		string_list_append(list, mydup);
	}

And:

        struct string_list list = STRING_LIST_INIT_NODUP;
        /* other stuff here, we get strings from somewhere etc. */
        add_to_list(list, some_string);

So now you're left with needing to free both at the end, but we since we
did ptr++ there we can't free() that (we'd need to free(ptr - 1), but
how to keep track of that?).

Well, tying this back to my clear() improvements for string-list.h I
thought a really neat solution to this was:

    string_list_append(list, ptr)->util = on_heap_now_we_own_it;
    string_list_append(list, mydup)->util = mydup;

I.e. by convention we store the pointer we need to free (if any) in the
"util" field.

And then if you get a string not from the heap you just leave the "util"
as NULL, and at the end you just free() all your "util" fields, and it
just so happens that some of them are the same as the "string" field.

We're not in the habit of passing loose "string_list_item" around now,
but I don't see why we wouldn't (possibly with a change to extract that
bit out, so we could use it in other places).

The neat thing about doing this is also that you're not left with every
API boundary needing to deal with your new "def_string", a lot of them
use string_list already, and hardly need to change anything, to the
extent that we do need to change anything having a "void *util" is a lot
more generally usable. You end up getting memory management for free as
you gain a feature to pass arbitrary data along with your items.
Jeff King July 16, 2021, 9:16 p.m. UTC | #6
On Fri, Jul 16, 2021 at 09:46:33AM +0200, Ævar Arnfjörð Bjarmason wrote:

> > I can't say I _love_ any of that, but I think it would work (and
> > probably we'd adapt our helpers like git_config_pathname() to take a
> > def_string. Or I guess just have a def_string_free() which can be called
> > before writing into them).
> >
> > But maybe there's a better solution I'm missing.
> 
> Instead of: "int from_heap" in your "def_string" I think we should just
> use "struct string_list_item". I.e. you want a void* here. Why?

Yes, an equivalent way to write it is with a separate to_free buffer.
But why would we want it to be void? And why would we want to use a
string_list_item, which is otherwise unrelated?

> Well, tying this back to my clear() improvements for string-list.h I
> thought a really neat solution to this was:
> 
>     string_list_append(list, ptr)->util = on_heap_now_we_own_it;
>     string_list_append(list, mydup)->util = mydup;
> 
> I.e. by convention we store the pointer we need to free (if any) in the
> "util" field.

That works, but now "util" is not available for all the _other_ uses for
which it was intended. And if we're not using it for those other uses,
then why does it need to exist at all? If we are only using it to hold
the allocated string pointer, then shouldn't it be "char *to_free"?

> We're not in the habit of passing loose "string_list_item" around now,
> but I don't see why we wouldn't (possibly with a change to extract that
> bit out, so we could use it in other places).

It seems unnecessarily confusing to me. It sounds like you have a struct
which just _happens_ to have a "void *" in it you can re-use, so you
start using it in lots of other places that are not in fact string lists
at all. That is confusing to me on the face, but what happens when
string_list needs a feature which requires adding more fields to it?

If the point is to have a maybe-allocated string, why not make that a
type itself? And then if we want string_list to use it, it can.

> The neat thing about doing this is also that you're not left with every
> API boundary needing to deal with your new "def_string", a lot of them
> use string_list already, and hardly need to change anything, to the
> extent that we do need to change anything having a "void *util" is a lot
> more generally usable. You end up getting memory management for free as
> you gain a feature to pass arbitrary data along with your items.

I don't think most interfaces take a string_list_item now, so wouldn't
they similarly need to be changed? Though the point is that all of these
degrade to a regular C-string, so when you are just passing the value
(and not ownership), you would just dereference at that point.

-Peff
Jeff King July 16, 2021, 9:20 p.m. UTC | #7
On Fri, Jul 16, 2021 at 07:18:59AM +0200, Andrzej Hunt wrote:

> > I see a few possible solutions:
> >  [...]
> > I can't say I _love_ any of that, but I think it would work (and
> > probably we'd adapt our helpers like git_config_pathname() to take a
> > def_string. Or I guess just have a def_string_free() which can be called
> > before writing into them).
> 
> Is it worth sidestepping the whole globals issue by migrating core.hookspath
> (and other string config values) to be fetched via git_config_get_pathname()
> and equivalents at the point of use instead?

Yeah, I almost suggested that. It probably does work OK in this
instance, but it's a much bigger change to convert all cases.

I'm also not sure if you'd run into tricky details. For instance,
calling git_config_* is doing an expensive-ish lookup in the cached
config (well, expensive to accessing a single pointer). So in cases
where we're going to access the string many times (say, in a loop), we'd
want our own variable. That might end up being easy by calling it once
outside the loop. Or it might not be, for cases where the variable is
used under the hood by a helper function.

-Peff
Ævar Arnfjörð Bjarmason Aug. 31, 2021, 12:47 p.m. UTC | #8
On Fri, Jul 16 2021, Jeff King wrote:

[Very late reply, just getting back to this thread]

> On Fri, Jul 16, 2021 at 09:46:33AM +0200, Ævar Arnfjörð Bjarmason wrote:
>
>> > I can't say I _love_ any of that, but I think it would work (and
>> > probably we'd adapt our helpers like git_config_pathname() to take a
>> > def_string. Or I guess just have a def_string_free() which can be called
>> > before writing into them).
>> >
>> > But maybe there's a better solution I'm missing.
>> 
>> Instead of: "int from_heap" in your "def_string" I think we should just
>> use "struct string_list_item". I.e. you want a void* here. Why?
>
> Yes, an equivalent way to write it is with a separate to_free buffer.
> But why would we want it to be void? And why would we want to use a
> string_list_item, which is otherwise unrelated?

We could factor "string_list_item" out into a "string_pair" or
whatever.

Sorry, I didn't mean to get into the naming/code location aspect of the
discussion, just the sensibility of using a "char */void * pair" for
these common memory management cases, v.s. your suggestion of having a
"char *, int from_heap" pair.

>> Well, tying this back to my clear() improvements for string-list.h I
>> thought a really neat solution to this was:
>> 
>>     string_list_append(list, ptr)->util = on_heap_now_we_own_it;
>>     string_list_append(list, mydup)->util = mydup;
>> 
>> I.e. by convention we store the pointer we need to free (if any) in the
>> "util" field.
>
> That works, but now "util" is not available for all the _other_ uses for
> which it was intended. And if we're not using it for those other uses,
> then why does it need to exist at all? If we are only using it to hold
> the allocated string pointer, then shouldn't it be "char *to_free"?

Because having it be "char *" doesn't cover the common case of
e.g. getting an already allocated "struct something *" which contains
your string, setting the "string" in "struct string_list_item" to some
string in that struct, and the "util" to the struct itself, as we now
own it and want to free() it later in its entirety.

That and the even more common case I mentioned upthread of wanting to
ferry around the truncated version of some char *, but still wanting to
account for the original for an eventual free().

But yes, if you want to account for freeing that data *and* have util
set to something else you'll need to have e.g. your own wrapper struct
and your own string_list_clear_func() callback.

I'm not suggesting that this handles every possible scenario, just that
having look at a lot of the code involved recently this seemed like a
neat solution for the common cases.

>> We're not in the habit of passing loose "string_list_item" around now,
>> but I don't see why we wouldn't (possibly with a change to extract that
>> bit out, so we could use it in other places).
>
> It seems unnecessarily confusing to me. It sounds like you have a struct
> which just _happens_ to have a "void *" in it you can re-use, so you
> start using it in lots of other places that are not in fact string lists
> at all. That is confusing to me on the face, but what happens when
> string_list needs a feature which requires adding more fields to it?
>
> If the point is to have a maybe-allocated string, why not make that a
> type itself? And then if we want string_list to use it, it can.

*nod*, covered above. My examples were unnecessarily confusing...

>> The neat thing about doing this is also that you're not left with every
>> API boundary needing to deal with your new "def_string", a lot of them
>> use string_list already, and hardly need to change anything, to the
>> extent that we do need to change anything having a "void *util" is a lot
>> more generally usable. You end up getting memory management for free as
>> you gain a feature to pass arbitrary data along with your items.
>
> I don't think most interfaces take a string_list_item now, so wouldn't
> they similarly need to be changed? Though the point is that all of these
> degrade to a regular C-string, so when you are just passing the value
> (and not ownership), you would just dereference at that point.

Sure, just like things would need to be changed to handle your proposed
"struct def_string".

By piggy-backing on an already used struct in our codebase we can get a
lot of that memory management pretty much for free without much
churn.

If you squint and pretend that "struct string_list_item" isn't called
something to do with that particular collections API (but it would make
use of it) then we've already set up most of the scaffolding and
management for this.
Jeff King Sept. 1, 2021, 7:53 a.m. UTC | #9
On Tue, Aug 31, 2021 at 02:47:01PM +0200, Ævar Arnfjörð Bjarmason wrote:

> > That works, but now "util" is not available for all the _other_ uses for
> > which it was intended. And if we're not using it for those other uses,
> > then why does it need to exist at all? If we are only using it to hold
> > the allocated string pointer, then shouldn't it be "char *to_free"?
> 
> Because having it be "char *" doesn't cover the common case of
> e.g. getting an already allocated "struct something *" which contains
> your string, setting the "string" in "struct string_list_item" to some
> string in that struct, and the "util" to the struct itself, as we now
> own it and want to free() it later in its entirety.

OK. I buy that storing a void pointer makes it more flexible. I'm not
altogether convinced this pattern is especially common, but it's not any
harder to work with than a "need_to_free" flag, so there's no reason not
to do that (and to be fair, I didn't look around for possible uses of
the pattern; it's just not one I think of as common off the top of my
head).

> That and the even more common case I mentioned upthread of wanting to
> ferry around the truncated version of some char *, but still wanting to
> account for the original for an eventual free().
> 
> But yes, if you want to account for freeing that data *and* have util
> set to something else you'll need to have e.g. your own wrapper struct
> and your own string_list_clear_func() callback.

But stuffing it into the util field of string_list really feels like a
stretch, and something that would make existing string_list use painful.
There are tons of cases where util points to some totally unrelated (in
terms of memory ownership) item. I'd venture to say most cases where
string_list_clear() is called without free_util would count here.

> > I don't think most interfaces take a string_list_item now, so wouldn't
> > they similarly need to be changed? Though the point is that all of these
> > degrade to a regular C-string, so when you are just passing the value
> > (and not ownership), you would just dereference at that point.
> 
> Sure, just like things would need to be changed to handle your proposed
> "struct def_string".
> 
> By piggy-backing on an already used struct in our codebase we can get a
> lot of that memory management pretty much for free without much
> churn.
> 
> If you squint and pretend that "struct string_list_item" isn't called
> something to do with that particular collections API (but it would make
> use of it) then we've already set up most of the scaffolding and
> management for this.

It's that squinting that bothers me. Sure, it's _kinda_ similar. And I
don't have any problem with some kind of struct that says "this is a
string, and when you are done with it, this is how you free it". And I
don't have any problem with building the "dup" version of string_list
with that struct as a primitive. But it seems to me to be orthogonal
from the "util" pointer of a string_list, which is about creating a
mapping from the string to some other thing (which may or may not
contain the string, and may or may not be owned).

TBH, I have always found the "util" field of string_list a bit ugly (and
really most of string_list). I think most cases would be better off with
a different data structure (a set or a hash table), but we didn't have
convenient versions of those for a long time. I don't mind seeing
conversions of string_list to other data structures. But that seems to
be working against using string_list's string struct in more places.

-Peff
Ævar Arnfjörð Bjarmason Sept. 1, 2021, 11:45 a.m. UTC | #10
On Wed, Sep 01 2021, Jeff King wrote:

> On Tue, Aug 31, 2021 at 02:47:01PM +0200, Ævar Arnfjörð Bjarmason wrote:
>
>> > That works, but now "util" is not available for all the _other_ uses for
>> > which it was intended. And if we're not using it for those other uses,
>> > then why does it need to exist at all? If we are only using it to hold
>> > the allocated string pointer, then shouldn't it be "char *to_free"?
>> 
>> Because having it be "char *" doesn't cover the common case of
>> e.g. getting an already allocated "struct something *" which contains
>> your string, setting the "string" in "struct string_list_item" to some
>> string in that struct, and the "util" to the struct itself, as we now
>> own it and want to free() it later in its entirety.
>
> OK. I buy that storing a void pointer makes it more flexible. I'm not
> altogether convinced this pattern is especially common, but it's not any
> harder to work with than a "need_to_free" flag, so there's no reason not
> to do that (and to be fair, I didn't look around for possible uses of
> the pattern; it's just not one I think of as common off the top of my
> head).
>
>> That and the even more common case I mentioned upthread of wanting to
>> ferry around the truncated version of some char *, but still wanting to
>> account for the original for an eventual free().
>> 
>> But yes, if you want to account for freeing that data *and* have util
>> set to something else you'll need to have e.g. your own wrapper struct
>> and your own string_list_clear_func() callback.
>
> But stuffing it into the util field of string_list really feels like a
> stretch, and something that would make existing string_list use painful.
> There are tons of cases where util points to some totally unrelated (in
> terms of memory ownership) item. I'd venture to say most cases where
> string_list_clear() is called without free_util would count here.

For what it's worth I've got some WIP code that's part of my daily build
where I did end up going through all those callers, as part of general
string_list_clear() improvements mentioned offhand in
https://lore.kernel.org/git/87bl6kq631.fsf@evledraar.gmail.com/

This is just from fuzzy memory & I can't recall the specifics (and
haven't combed through that WIP code now), but it's something like that
in the ~100 uses of string_list in our codebase 60-70% are the simple
case where the "strdup_strings" and string_list_clear() is enough, maybe
another 10-20% have a "util" field they manage or not, 5%-ish have a
simple string_list_clear_func().

It was just 2-3 cases that leaked memory due to skipping a prefix and
sticking it in the list, and maybe another 1-2 where the void* to a
struct containing the string stuck into the string slot was something we
could use.

So it's not "common" in the sense of absolute numbers, but I did run
into a handful of them, and having them handled by having the
string_list take an arbitrary "util" was something I found neat.

I should probably have said "well known" (as in "well known technique"),
"idiomatic" or something...

>> > I don't think most interfaces take a string_list_item now, so wouldn't
>> > they similarly need to be changed? Though the point is that all of these
>> > degrade to a regular C-string, so when you are just passing the value
>> > (and not ownership), you would just dereference at that point.
>> 
>> Sure, just like things would need to be changed to handle your proposed
>> "struct def_string".
>> 
>> By piggy-backing on an already used struct in our codebase we can get a
>> lot of that memory management pretty much for free without much
>> churn.
>> 
>> If you squint and pretend that "struct string_list_item" isn't called
>> something to do with that particular collections API (but it would make
>> use of it) then we've already set up most of the scaffolding and
>> management for this.
>
> It's that squinting that bothers me. Sure, it's _kinda_ similar. And I
> don't have any problem with some kind of struct that says "this is a
> string, and when you are done with it, this is how you free it". And I
> don't have any problem with building the "dup" version of string_list
> with that struct as a primitive. But it seems to me to be orthogonal
> from the "util" pointer of a string_list, which is about creating a
> mapping from the string to some other thing (which may or may not
> contain the string, and may or may not be owned).

The "util" is whatever the user makes it. We could add a
"pointer_to_free" to every container type to solve this more
cleanly/generally at the API level, but just handing the problem to the
user seems better to me. I.e. an API like string_list has convenience
functions for freeing all the "util", if you only need it for memory
tracking use it as-is, if you need a "real util" *and* such tracking
just create a 2-member wrapper struct yourself & use that.

> TBH, I have always found the "util" field of string_list a bit ugly (and
> really most of string_list). I think most cases would be better off with
> a different data structure (a set or a hash table), but we didn't have
> convenient versions of those for a long time. I don't mind seeing
> conversions of string_list to other data structures. But that seems to
> be working against using string_list's string struct in more places.

If we followed my idle musings we'd be using string_list_item in more
places, not necessarily string_list, and would rename
s/string_list_item/string_and_util/ or something.

One way to look at this problem is that we're pretty close to just
re-inventing the sort of generalized refcounted container type that some
programming languages carry around. E.g. Perl has a "struct SV*" that a
$string maps to, but also hash and array values etc.

Those languages usually have a "refcount" or whatever, but since we're
using this in native C and it's usually (or at least should be) clear
who owns the memory just having something to point free() at will do.

I'm just saying that if we're going halfway there it would be
unfortunate if we'd end up with a "struct def_string" which wouldn't
handle this "borrowing a string from a struct" case.

Or maybe we should just use "struct strbuf" and do copying in even more
places...
diff mbox series

Patch

diff --git a/config.c b/config.c
index f9c400ad30..38e132c0e2 100644
--- a/config.c
+++ b/config.c
@@ -138,8 +138,10 @@  static int handle_path_include(const char *path, struct config_include_data *inc
 		return config_error_nonbool("include.path");
 
 	expanded = expand_user_path(path, 0);
-	if (!expanded)
-		return error(_("could not expand include path '%s'"), path);
+	if (!expanded) {
+		ret = error(_("could not expand include path '%s'"), path);
+		goto cleanup;
+	}
 	path = expanded;
 
 	/*
@@ -149,8 +151,10 @@  static int handle_path_include(const char *path, struct config_include_data *inc
 	if (!is_absolute_path(path)) {
 		char *slash;
 
-		if (!cf || !cf->path)
-			return error(_("relative config includes must come from files"));
+		if (!cf || !cf->path) {
+			ret = error(_("relative config includes must come from files"));
+			goto cleanup;
+		}
 
 		slash = find_last_dir_sep(cf->path);
 		if (slash)
@@ -168,6 +172,7 @@  static int handle_path_include(const char *path, struct config_include_data *inc
 		ret = git_config_from_file(git_config_include, path, inc);
 		inc->depth--;
 	}
+cleanup:
 	strbuf_release(&buf);
 	free(expanded);
 	return ret;
@@ -1331,8 +1336,10 @@  static int git_default_core_config(const char *var, const char *value, void *cb)
 	if (!strcmp(var, "core.attributesfile"))
 		return git_config_pathname(&git_attributes_file, var, value);
 
-	if (!strcmp(var, "core.hookspath"))
+	if (!strcmp(var, "core.hookspath")) {
+		UNLEAK(git_hooks_path);
 		return git_config_pathname(&git_hooks_path, var, value);
+	}
 
 	if (!strcmp(var, "core.bare")) {
 		is_bare_repository_cfg = git_config_bool(var, value);
diff --git a/t/t1300-config.sh b/t/t1300-config.sh
index 9ff46f3b04..93ad0f4887 100755
--- a/t/t1300-config.sh
+++ b/t/t1300-config.sh
@@ -1050,12 +1050,16 @@  test_expect_success SYMLINKS 'symlink to nonexistent configuration' '
 	test_must_fail git config --file=linktolinktonada --list
 '
 
-test_expect_success 'check split_cmdline return' "
-	git config alias.split-cmdline-fix 'echo \"' &&
-	test_must_fail git split-cmdline-fix &&
+test_expect_success 'setup check split_cmdline return' "
 	echo foo > foo &&
 	git add foo &&
-	git commit -m 'initial commit' &&
+	git commit -m 'initial commit'
+"
+
+test_expect_success !SANITIZE_LEAK 'check split_cmdline return' "
+	git config alias.split-cmdline-fix 'echo \"' &&
+	test_must_fail git split-cmdline-fix &&
+
 	git config branch.main.mergeoptions 'echo \"' &&
 	test_must_fail git merge main
 "
@@ -1101,7 +1105,7 @@  test_expect_success 'key sanity-checking' '
 	git config foo."ba =z".bar false
 '
 
-test_expect_success 'git -c works with aliases of builtins' '
+test_expect_success !SANITIZE_LEAK 'git -c works with aliases of builtins' '
 	git config alias.checkconfig "-c foo.check=bar config foo.check" &&
 	echo bar >expect &&
 	git checkconfig >actual &&
@@ -1397,7 +1401,7 @@  test_expect_success 'git --config-env with missing value' '
 	grep "invalid config format: config" error
 '
 
-test_expect_success 'git --config-env fails with invalid parameters' '
+test_expect_success !SANITIZE_LEAK 'git --config-env fails with invalid parameters' '
 	test_must_fail git --config-env=foo.flag config --bool foo.flag 2>error &&
 	test_i18ngrep "invalid config format: foo.flag" error &&
 	test_must_fail git --config-env=foo.flag= config --bool foo.flag 2>error &&
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 9201510e16..98e20950c3 100644
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -1370,6 +1370,7 @@  maybe_skip_all_sanitize_leak () {
 	add_sanitize_leak_true 't000*'
 	add_sanitize_leak_true 't001*'
 	add_sanitize_leak_true 't006*'
+	add_sanitize_leak_true 't13*config*'
 
 	# Blacklist patterns (overrides whitelist)
 	add_sanitize_leak_false 't000[469]*'