diff mbox series

[v4,1/2] refs.c: refactor check_refname_component()

Message ID 20190305120834.7284-2-pclouds@gmail.com (mailing list archive)
State New, archived
Headers show
Series worktree add: sanitize worktree names | expand

Commit Message

Duy Nguyen March 5, 2019, 12:08 p.m. UTC
The core logic of this function is factored out to provide more
information when the refname is invalid: what part that is and exact
what is wrong with it. This will be useful for a coming function that
has to turn a string into a valid refname component.

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
 refs.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 62 insertions(+), 16 deletions(-)

Comments

Jeff King March 6, 2019, 9:49 p.m. UTC | #1
On Tue, Mar 05, 2019 at 07:08:33PM +0700, Nguyễn Thái Ngọc Duy wrote:

> @@ -71,11 +82,15 @@ static unsigned char refname_disposition[256] = {
>   * - it ends with a "/", or
>   * - it ends with ".lock", or
>   * - it contains a "@{" portion
> + *
> + * in which case cp_out points to the beginning of the illegal part.
>   */
> -static int check_refname_component(const char *refname, int *flags)
> +static enum refname_check_code do_check_refname_component(
> +	const char *refname, int *flags, const char **cp_out)

Hmm, OK, so we get to know what type of problem, but also the exact
character where we found it. And then we just keep mutating that char
until we have something that passes.

I can't think of any reason that wouldn't work. As you note, it's
possibly quadratic, though that might be OK for our purposes.

I had envisioned just sanitizing each character into an output buffer as
we did the checks. It does introduce some complexities, though, because
now the checking function is doing the replacement (so it has to know
the right sanitizing rule for each case).

The patch below is a rough cut at that, just for discussion.  You can
ignore the check-ref-format bits; they were just to make poking at it
easier, though perhaps we'd want something like that in the long run.

I suspect check_refname_component() could be made a bit more readable by
reordering a few bits. E.g., why do we check for a leading "." at the
_end_, after having parsed the entire rest of the component for errors?

I dunno. I think I can live with what you've got in your series, but I
figured I'd share this for the sake of completeness. If you really love
it, feel free to adapt it.

diff --git a/builtin/check-ref-format.c b/builtin/check-ref-format.c
index bc67d3f0a8..41b5434be2 100644
--- a/builtin/check-ref-format.c
+++ b/builtin/check-ref-format.c
@@ -56,6 +56,7 @@ int cmd_check_ref_format(int argc, const char **argv, const char *prefix)
 	int i;
 	int normalize = 0;
 	int flags = 0;
+	int sanitize = 0;
 	const char *refname;
 
 	if (argc == 2 && !strcmp(argv[1], "-h"))
@@ -73,13 +74,22 @@ int cmd_check_ref_format(int argc, const char **argv, const char *prefix)
 			flags &= ~REFNAME_ALLOW_ONELEVEL;
 		else if (!strcmp(argv[i], "--refspec-pattern"))
 			flags |= REFNAME_REFSPEC_PATTERN;
+		else if (!strcmp(argv[i], "--sanitize"))
+			sanitize = 1;
 		else
 			usage(builtin_check_ref_format_usage);
 	}
 	if (! (i == argc - 1))
 		usage(builtin_check_ref_format_usage);
 
 	refname = argv[i];
+	if (sanitize) {
+		struct strbuf out = STRBUF_INIT;
+		sanitize_refname(refname, &out);
+		printf("%s\n", out.buf);
+		strbuf_release(&out);
+		return 0;
+	}
 	if (normalize)
 		refname = collapse_slashes(refname);
 	if (check_refname_format(refname, flags))
diff --git a/refs.c b/refs.c
index 142888a40a..2a0c0c6338 100644
--- a/refs.c
+++ b/refs.c
@@ -72,30 +72,58 @@ static unsigned char refname_disposition[256] = {
  * - it ends with ".lock", or
  * - it contains a "@{" portion
  */
-static int check_refname_component(const char *refname, int *flags)
+static int check_refname_component(const char *refname, int *flags,
+				   struct strbuf *sanitized)
 {
 	const char *cp;
 	char last = '\0';
+	size_t component_start;
+
+	if (sanitized)
+		component_start = sanitized->len;
 
 	for (cp = refname; ; cp++) {
 		int ch = *cp & 255;
 		unsigned char disp = refname_disposition[ch];
+
+		if (sanitized && disp != 1)
+			strbuf_addch(sanitized, ch);
+
 		switch (disp) {
 		case 1:
 			goto out;
 		case 2:
-			if (last == '.')
-				return -1; /* Refname contains "..". */
+			if (last == '.') {
+				/* Refname contains "..". */
+				if (sanitized)
+					sanitized->len--; /* collapse ".." to single "." */
+				else
+					return -1;
+			}
 			break;
 		case 3:
-			if (last == '@')
-				return -1; /* Refname contains "@{". */
+			if (last == '@') {
+				/* Refname contains "@{". */
+				if (sanitized)
+					sanitized->buf[sanitized->len-1] = '-';
+				else
+					return -1;
+			}
 			break;
 		case 4:
-			return -1;
+			/* forbidden char */
+			if (sanitized)
+				sanitized->buf[sanitized->len-1] = '-';
+			else
+				return -1;
+			break;
 		case 5:
-			if (!(*flags & REFNAME_REFSPEC_PATTERN))
-				return -1; /* refspec can't be a pattern */
+			if (!(*flags & REFNAME_REFSPEC_PATTERN)) {
+				if (sanitized)
+					sanitized->buf[sanitized->len-1] = '-';
+				else
+					return -1; /* refspec can't be a pattern */
+			}
 
 			/*
 			 * Unset the pattern flag so that we only accept
@@ -109,26 +137,48 @@ static int check_refname_component(const char *refname, int *flags)
 out:
 	if (cp == refname)
 		return 0; /* Component has zero length. */
-	if (refname[0] == '.')
-		return -1; /* Component starts with '.'. */
+
+	if (refname[0] == '.') {
+		/* Component starts with '.'. */
+		if (sanitized)
+			sanitized->buf[component_start] = '-';
+		else
+			return -1;
+	}
 	if (cp - refname >= LOCK_SUFFIX_LEN &&
-	    !memcmp(cp - LOCK_SUFFIX_LEN, LOCK_SUFFIX, LOCK_SUFFIX_LEN))
-		return -1; /* Refname ends with ".lock". */
+	    !memcmp(cp - LOCK_SUFFIX_LEN, LOCK_SUFFIX, LOCK_SUFFIX_LEN)) {
+		/* Refname ends with ".lock". */
+		if (sanitized)
+			strbuf_strip_suffix(sanitized, LOCK_SUFFIX);
+		else
+			return -1;
+	}
 	return cp - refname;
 }
 
-int check_refname_format(const char *refname, int flags)
+static int check_or_sanitize_refname(const char *refname, int flags,
+				     struct strbuf *sanitized)
 {
 	int component_len, component_count = 0;
 
-	if (!strcmp(refname, "@"))
+	if (!strcmp(refname, "@")) {
 		/* Refname is a single character '@'. */
-		return -1;
+		if (sanitized)
+			strbuf_addch(sanitized, '-');
+		else
+			return -1;
+	}
 
 	while (1) {
+		if (sanitized && sanitized->len)
+			strbuf_complete(sanitized, '/');
+
 		/* We are at the start of a path component. */
-		component_len = check_refname_component(refname, &flags);
-		if (component_len <= 0)
+		component_len = check_refname_component(refname, &flags,
+							sanitized);
+		if (sanitized && component_len == 0)
+			; /* OK, omit empty component */
+		else if (component_len <= 0)
 			return -1;
 
 		component_count++;
@@ -138,13 +188,29 @@ int check_refname_format(const char *refname, int flags)
 		refname += component_len + 1;
 	}
 
-	if (refname[component_len - 1] == '.')
-		return -1; /* Refname ends with '.'. */
+	if (refname[component_len - 1] == '.') {
+		/* Refname ends with '.'. */
+		if (sanitized)
+			; /* omit ending dot */
+		else
+			return -1;
+	}
 	if (!(flags & REFNAME_ALLOW_ONELEVEL) && component_count < 2)
 		return -1; /* Refname has only one component. */
 	return 0;
 }
 
+int check_refname_format(const char *refname, int flags)
+{
+	return check_or_sanitize_refname(refname, flags, NULL);
+}
+
+void sanitize_refname(const char *refname, struct strbuf *out)
+{
+	if (check_or_sanitize_refname(refname, 0, out))
+		BUG("sanitizing refname check returned error");
+}
+
 int refname_is_safe(const char *refname)
 {
 	const char *rest;
diff --git a/refs.h b/refs.h
index 308fa1f03b..b99c309dd9 100644
--- a/refs.h
+++ b/refs.h
@@ -460,6 +460,12 @@ int for_each_reflog(each_ref_fn fn, void *cb_data);
  */
 int check_refname_format(const char *refname, int flags);
 
+/*
+ * Apply the rules from check_refname_format, but mutate the result until it
+ * is acceptable, and place the result in "out".
+ */
+void sanitize_refname(const char *refname, struct strbuf *out);
+
 const char *prettify_refname(const char *refname);
 
 char *shorten_unambiguous_ref(const char *refname, int strict);
Eric Sunshine March 7, 2019, 11:24 p.m. UTC | #2
On Wed, Mar 6, 2019 at 4:49 PM Jeff King <peff@peff.net> wrote:
> On Tue, Mar 05, 2019 at 07:08:33PM +0700, Nguyễn Thái Ngọc Duy wrote:
> I had envisioned just sanitizing each character into an output buffer as
> we did the checks. It does introduce some complexities, though, because
> now the checking function is doing the replacement (so it has to know
> the right sanitizing rule for each case).
>
> The patch below is a rough cut at that, just for discussion.  You can
> ignore the check-ref-format bits; they were just to make poking at it
> easier, though perhaps we'd want something like that in the long run.

This is more along the lines of what I had envisioned, as well, after
looking over the implementation of check_refname_component(). It's a
bit noisy and loud but easy to follow, and doesn't give rise to
concerns about quadratic behavior, etc.
diff mbox series

Patch

diff --git a/refs.c b/refs.c
index 142888a40a..70c55ea1b6 100644
--- a/refs.c
+++ b/refs.c
@@ -57,10 +57,21 @@  static unsigned char refname_disposition[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 4, 4
 };
 
+enum refname_check_code {
+	 refname_ok = 0,
+	 refname_contains_dotdot,
+	 refname_contains_atopen,
+	 refname_has_badchar,
+	 refname_contains_wildcard,
+	 refname_starts_with_dot,
+	 refname_ends_with_dotlock,
+	 refname_component_has_zero_length
+};
+
 /*
  * Try to read one refname component from the front of refname.
- * Return the length of the component found, or -1 if the component is
- * not legal.  It is legal if it is something reasonable to have under
+ * If the component is legal, return the end of the component in cp_out.
+ * It is legal if it is something reasonable to have under
  * ".git/refs/"; We do not like it if:
  *
  * - any path component of it begins with ".", or
@@ -71,11 +82,15 @@  static unsigned char refname_disposition[256] = {
  * - it ends with a "/", or
  * - it ends with ".lock", or
  * - it contains a "@{" portion
+ *
+ * in which case cp_out points to the beginning of the illegal part.
  */
-static int check_refname_component(const char *refname, int *flags)
+static enum refname_check_code do_check_refname_component(
+	const char *refname, int *flags, const char **cp_out)
 {
 	const char *cp;
 	char last = '\0';
+	enum refname_check_code ret = refname_ok;
 
 	for (cp = refname; ; cp++) {
 		int ch = *cp & 255;
@@ -84,18 +99,28 @@  static int check_refname_component(const char *refname, int *flags)
 		case 1:
 			goto out;
 		case 2:
-			if (last == '.')
-				return -1; /* Refname contains "..". */
+			if (last == '.') {
+				cp--;
+				ret = refname_contains_dotdot;
+				goto done;
+			}
 			break;
 		case 3:
-			if (last == '@')
-				return -1; /* Refname contains "@{". */
+			if (last == '@') {
+				cp--;
+				ret = refname_contains_atopen; /* @{ */
+				goto done;
+			}
 			break;
 		case 4:
-			return -1;
+			ret = refname_has_badchar;
+			goto done;
 		case 5:
-			if (!(*flags & REFNAME_REFSPEC_PATTERN))
-				return -1; /* refspec can't be a pattern */
+			if (!(*flags & REFNAME_REFSPEC_PATTERN)) {
+				/* refspec can't be a pattern */
+				ret = refname_contains_wildcard;
+				goto done;
+			}
 
 			/*
 			 * Unset the pattern flag so that we only accept
@@ -107,13 +132,34 @@  static int check_refname_component(const char *refname, int *flags)
 		last = ch;
 	}
 out:
-	if (cp == refname)
-		return 0; /* Component has zero length. */
-	if (refname[0] == '.')
-		return -1; /* Component starts with '.'. */
+	if (cp == refname) {
+		ret = refname_component_has_zero_length;
+		goto done;
+	}
+	if (refname[0] == '.') {
+		cp = refname;
+		ret = refname_starts_with_dot;
+		goto done;
+	}
 	if (cp - refname >= LOCK_SUFFIX_LEN &&
-	    !memcmp(cp - LOCK_SUFFIX_LEN, LOCK_SUFFIX, LOCK_SUFFIX_LEN))
-		return -1; /* Refname ends with ".lock". */
+	    !memcmp(cp - LOCK_SUFFIX_LEN, LOCK_SUFFIX, LOCK_SUFFIX_LEN)) {
+		cp -= LOCK_SUFFIX_LEN;
+		ret = refname_ends_with_dotlock;
+	}
+done:
+	*cp_out = cp;
+	return ret;
+}
+
+/* Return the length of the component if it's legal otherwise -1 */
+static int check_refname_component(const char *refname, int *flags)
+{
+	const char *cp;
+	enum refname_check_code ret;
+
+	ret = do_check_refname_component(refname, flags, &cp);
+	if (ret)
+		return -1;
 	return cp - refname;
 }