diff mbox series

[2/5] t/helper/test-hashmap.c: avoid using `strtok()`

Message ID 0f199468a3b8375dbec0f56fdc831c3ac298eb4e.1681428696.git.me@ttaylorr.com (mailing list archive)
State Superseded
Headers show
Series banned: mark `strok()`, `strtok_r()` as banned | expand

Commit Message

Taylor Blau April 13, 2023, 11:31 p.m. UTC
Avoid using the non-reentrant `strtok()` to separate the parts of each
incoming command. Instead of replacing it with `strtok_r()`, let's
instead use the more friendly `string_list_split_in_place_multi()`.

Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 t/helper/test-hashmap.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

Comments

Jeff King April 18, 2023, 10:23 a.m. UTC | #1
On Thu, Apr 13, 2023 at 07:31:46PM -0400, Taylor Blau wrote:

> Avoid using the non-reentrant `strtok()` to separate the parts of each
> incoming command. Instead of replacing it with `strtok_r()`, let's
> instead use the more friendly `string_list_split_in_place_multi()`.

Junio mentioned this offhand in his response, but I wanted to highlight
one difference in before/after behavior here.

  [before]
  $ printf 'add foo    bar\niterate\n' | t/helper/test-tool hashmap
  foo bar

  [after]
  $ printf 'add foo    bar\niterate\n' | t/helper/test-tool hashmap
  foo    bar

I think that's fine for this test script, but it may be an indication
that we want string-list's split to support different semantics.

> @@ -159,21 +161,34 @@ int cmd__hashmap(int argc, const char **argv)
>  
>  	/* process commands from stdin */
>  	while (strbuf_getline(&line, stdin) != EOF) {
> -		char *cmd, *p1 = NULL, *p2 = NULL;
> +		char *cmd, *p1, *p2;
>  		unsigned int hash = 0;
>  		struct test_entry *entry;
>  
> +		/*
> +		 * Because we memdup() the arguments out of the
> +		 * string_list before inserting them into the hashmap,
> +		 * it's OK to set its length back to zero to avoid
> +		 * re-allocating the items array once per line.
> +		 *
> +		 * By doing so, we'll instead overwrite the existing
> +		 * entries and avoid re-allocating.
> +		 */
> +		parts.nr = 0;

I think this is OK, but I wonder if we should wrap it in a function that
makes sure the strings aren't owned by the strdup (and thus aren't being
leaked). Something like:

  void string_list_setlen(struct string_list *sl, size_t nr)
  {
	/* alternatively, I guess we could actually free nr..sl->nr */
	if (sl->strdup_strings)
		BUG("you can't setlen on a string-list which owns its strings");
	if (nr > sl->nr)
		BUG("you can't grow a string-list with setlen");
	sl->nr = nr;
  }

That is probably overkill for these two test helpers, but I wonder if
the pattern might escape into "real" code (especially if we suggest
using string-list instead of strtok).

-Peff
Taylor Blau April 18, 2023, 6:06 p.m. UTC | #2
On Tue, Apr 18, 2023 at 06:23:20AM -0400, Jeff King wrote:
> On Thu, Apr 13, 2023 at 07:31:46PM -0400, Taylor Blau wrote:
>
> > Avoid using the non-reentrant `strtok()` to separate the parts of each
> > incoming command. Instead of replacing it with `strtok_r()`, let's
> > instead use the more friendly `string_list_split_in_place_multi()`.
>
> Junio mentioned this offhand in his response, but I wanted to highlight
> one difference in before/after behavior here.
>
>   [before]
>   $ printf 'add foo    bar\niterate\n' | t/helper/test-tool hashmap
>   foo bar
>
>   [after]
>   $ printf 'add foo    bar\niterate\n' | t/helper/test-tool hashmap
>   foo    bar
>
> I think that's fine for this test script, but it may be an indication
> that we want string-list's split to support different semantics.

I agree that it's OK for the test scripts, but probably isn't the right
thing for all cases. In the reroll I'll send shortly, I reimplemented
this in a way that (a) doesn't change the behavior of
`string_list_split_in_place()`, and (b) allows us to easily change the
semantics for `string_list_split_in_place_multi()`.

> > @@ -159,21 +161,34 @@ int cmd__hashmap(int argc, const char **argv)
> >
> >  	/* process commands from stdin */
> >  	while (strbuf_getline(&line, stdin) != EOF) {
> > -		char *cmd, *p1 = NULL, *p2 = NULL;
> > +		char *cmd, *p1, *p2;
> >  		unsigned int hash = 0;
> >  		struct test_entry *entry;
> >
> > +		/*
> > +		 * Because we memdup() the arguments out of the
> > +		 * string_list before inserting them into the hashmap,
> > +		 * it's OK to set its length back to zero to avoid
> > +		 * re-allocating the items array once per line.
> > +		 *
> > +		 * By doing so, we'll instead overwrite the existing
> > +		 * entries and avoid re-allocating.
> > +		 */
> > +		parts.nr = 0;
>
> I think this is OK, but I wonder if we should wrap it in a function that
> makes sure the strings aren't owned by the strdup (and thus aren't being
> leaked). Something like:
>
>   void string_list_setlen(struct string_list *sl, size_t nr)
>   {
> 	/* alternatively, I guess we could actually free nr..sl->nr */
> 	if (sl->strdup_strings)
> 		BUG("you can't setlen on a string-list which owns its strings");
> 	if (nr > sl->nr)
> 		BUG("you can't grow a string-list with setlen");
> 	sl->nr = nr;
>   }
>
> That is probably overkill for these two test helpers, but I wonder if
> the pattern might escape into "real" code (especially if we suggest
> using string-list instead of strtok).

I like this suggestion. I added a new patch (for which you are listed
under Co-authored-by) which adds this function as-is.

Thanks,
Taylor
diff mbox series

Patch

diff --git a/t/helper/test-hashmap.c b/t/helper/test-hashmap.c
index 36ff07bd4be..902ceb55113 100644
--- a/t/helper/test-hashmap.c
+++ b/t/helper/test-hashmap.c
@@ -2,6 +2,7 @@ 
 #include "git-compat-util.h"
 #include "hashmap.h"
 #include "strbuf.h"
+#include "string-list.h"
 
 struct test_entry
 {
@@ -150,6 +151,7 @@  static void perf_hashmap(unsigned int method, unsigned int rounds)
  */
 int cmd__hashmap(int argc, const char **argv)
 {
+	struct string_list parts = STRING_LIST_INIT_NODUP;
 	struct strbuf line = STRBUF_INIT;
 	int icase;
 	struct hashmap map = HASHMAP_INIT(test_entry_cmp, &icase);
@@ -159,21 +161,34 @@  int cmd__hashmap(int argc, const char **argv)
 
 	/* process commands from stdin */
 	while (strbuf_getline(&line, stdin) != EOF) {
-		char *cmd, *p1 = NULL, *p2 = NULL;
+		char *cmd, *p1, *p2;
 		unsigned int hash = 0;
 		struct test_entry *entry;
 
+		/*
+		 * Because we memdup() the arguments out of the
+		 * string_list before inserting them into the hashmap,
+		 * it's OK to set its length back to zero to avoid
+		 * re-allocating the items array once per line.
+		 *
+		 * By doing so, we'll instead overwrite the existing
+		 * entries and avoid re-allocating.
+		 */
+		parts.nr = 0;
 		/* break line into command and up to two parameters */
-		cmd = strtok(line.buf, DELIM);
+		string_list_split_in_place_multi(&parts, line.buf, DELIM, 2);
+
 		/* ignore empty lines */
-		if (!cmd || *cmd == '#')
+		if (!parts.nr)
+			continue;
+		if (!*parts.items[0].string || *parts.items[0].string == '#')
 			continue;
 
-		p1 = strtok(NULL, DELIM);
-		if (p1) {
+		cmd = parts.items[0].string;
+		p1 = parts.nr >= 1 ? parts.items[1].string : NULL;
+		p2 = parts.nr >= 2 ? parts.items[2].string : NULL;
+		if (p1)
 			hash = icase ? strihash(p1) : strhash(p1);
-			p2 = strtok(NULL, DELIM);
-		}
 
 		if (!strcmp("add", cmd) && p1 && p2) {
 
@@ -260,6 +275,7 @@  int cmd__hashmap(int argc, const char **argv)
 		}
 	}
 
+	string_list_clear(&parts, 0);
 	strbuf_release(&line);
 	hashmap_clear_and_free(&map, struct test_entry, ent);
 	return 0;