diff mbox series

[02/13] urlmatch: define url_parse function

Message ID 13b81b8aa06cfd63a5fd9d1acbaf21a8b388ff47.1714343461.git.gitgitgadget@gmail.com (mailing list archive)
State New
Headers show
Series builtin: implement, document and test url-parse | expand

Commit Message

Matheus Afonso Martins Moreira April 28, 2024, 10:30 p.m. UTC
From: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com>

Define general parsing function that supports all Git URLs
including scp style URLs such as hostname:~user/repo.
Has the same interface as the URL normalization function
and uses the same data structures, facilitating its use.
It's adapted from the algorithm used to process URLs in connect.c,
so it should support the same inputs.

Signed-off-by: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com>
---
 urlmatch.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 urlmatch.h |  1 +
 2 files changed, 91 insertions(+)

Comments

Ghanshyam Thakkar May 1, 2024, 10:18 p.m. UTC | #1
On Sun, 28 Apr 2024, Matheus Afonso Martins Moreira via GitGitGadget <gitgitgadget@gmail.com> wrote:
> From: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com>
> 
> Define general parsing function that supports all Git URLs
> including scp style URLs such as hostname:~user/repo.
> Has the same interface as the URL normalization function
> and uses the same data structures, facilitating its use.
> It's adapted from the algorithm used to process URLs in connect.c,
> so it should support the same inputs.
> 
> Signed-off-by: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com>
> ---
>  urlmatch.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  urlmatch.h |  1 +
>  2 files changed, 91 insertions(+)
> 
> diff --git a/urlmatch.c b/urlmatch.c
> index 1d0254abacb..5a442e31fa2 100644
> --- a/urlmatch.c
> +++ b/urlmatch.c
> @@ -3,6 +3,7 @@
>  #include "hex-ll.h"
>  #include "strbuf.h"
>  #include "urlmatch.h"
> +#include "url.h"
>  
>  #define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
>  #define URL_DIGIT "0123456789"
> @@ -438,6 +439,95 @@ char *url_normalize(const char *url, struct url_info *out_info)
>  	return url_normalize_1(url, out_info, 0);
>  }
>  
> +enum protocol {
> +	PROTO_UNKNOWN = 0,
> +	PROTO_LOCAL,
> +	PROTO_FILE,
> +	PROTO_SSH,
> +	PROTO_GIT,
> +};
> +
> +static enum protocol url_get_protocol(const char *name, size_t n)
> +{
> +	if (!strncmp(name, "ssh", n))
> +		return PROTO_SSH;
> +	if (!strncmp(name, "git", n))
> +		return PROTO_GIT;
> +	if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */
> +		return PROTO_SSH;
> +	if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */
> +		return PROTO_SSH;
> +	if (!strncmp(name, "file", n))
> +		return PROTO_FILE;
> +	return PROTO_UNKNOWN;
> +}
> +
> +char *url_parse(const char *url_orig, struct url_info *out_info)
> +{
> +	struct strbuf url;
> +	char *host, *separator;
> +	char *detached, *normalized;
> +	enum protocol protocol = PROTO_LOCAL;
> +	struct url_info local_info;
> +	struct url_info *info = out_info? out_info : &local_info;
> +	bool scp_syntax = false;
> +
> +	if (is_url(url_orig)) {
> +		url_orig = url_decode(url_orig);
> +	} else {
> +		url_orig = xstrdup(url_orig);
> +	}
> +
> +	strbuf_init(&url, strlen(url_orig) + sizeof("ssh://"));
> +	strbuf_addstr(&url, url_orig);
> +
> +	host = strstr(url.buf, "://");
> +	if (host) {
> +		protocol = url_get_protocol(url.buf, host - url.buf);
> +		host += 3;
> +	} else {
> +		if (!url_is_local_not_ssh(url.buf)) {
> +			scp_syntax = true;
> +			protocol = PROTO_SSH;
> +			strbuf_insertstr(&url, 0, "ssh://");
> +			host = url.buf + 6;
> +		}
> +	}
Interesting. 

    `
    $ ./git url-parse -c protocol file:/test/test
    ssh
    `

seems like only having a single slash after the 'protocol:' prints
'ssh' always (I think this may not even be a valid url). After this 'else'
block, the url turns into 'ssh://file/test/test'. Will examine the details
later. Not that it's your code's doing, and rather the result of
url_is_local_not_ssh(). But just wanted to point this out and ask if this
should error out or is this an intended behavior that I can't figure out. 

Thanks.
Torsten Bögershausen May 2, 2024, 4:02 a.m. UTC | #2
[]
> Interesting.
>
>     `
>     $ ./git url-parse -c protocol file:/test/test
>     ssh
>     `
>
> seems like only having a single slash after the 'protocol:' prints
> 'ssh' always (I think this may not even be a valid url). After this 'else'
> block, the url turns into 'ssh://file/test/test'. Will examine the details
> later. Not that it's your code's doing, and rather the result of
> url_is_local_not_ssh(). But just wanted to point this out and ask if this
> should error out or is this an intended behavior that I can't figure out.

ssh is the correct answer, try something like

`git clone localhost:/home/myself/project/git.git`

It is the scp syntax, supported by Git as well.
From `man scp`

    scp copies files between hosts on a network.
    []
    The source and target may be specified as a local pathname,
    a remote host with optional path in the form
    [user@]host:[path],
    or a URI in the form scp://[user@]host[:port][/path].
    Local file names can be made explicit using absolute or relative pathnames
    to avoid scp treating file names containing ‘:’ as host specifiers.

So yes, they share similar problems
with the ':' that could mean different things when using the short form.
diff mbox series

Patch

diff --git a/urlmatch.c b/urlmatch.c
index 1d0254abacb..5a442e31fa2 100644
--- a/urlmatch.c
+++ b/urlmatch.c
@@ -3,6 +3,7 @@ 
 #include "hex-ll.h"
 #include "strbuf.h"
 #include "urlmatch.h"
+#include "url.h"
 
 #define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 #define URL_DIGIT "0123456789"
@@ -438,6 +439,95 @@  char *url_normalize(const char *url, struct url_info *out_info)
 	return url_normalize_1(url, out_info, 0);
 }
 
+enum protocol {
+	PROTO_UNKNOWN = 0,
+	PROTO_LOCAL,
+	PROTO_FILE,
+	PROTO_SSH,
+	PROTO_GIT,
+};
+
+static enum protocol url_get_protocol(const char *name, size_t n)
+{
+	if (!strncmp(name, "ssh", n))
+		return PROTO_SSH;
+	if (!strncmp(name, "git", n))
+		return PROTO_GIT;
+	if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */
+		return PROTO_SSH;
+	if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */
+		return PROTO_SSH;
+	if (!strncmp(name, "file", n))
+		return PROTO_FILE;
+	return PROTO_UNKNOWN;
+}
+
+char *url_parse(const char *url_orig, struct url_info *out_info)
+{
+	struct strbuf url;
+	char *host, *separator;
+	char *detached, *normalized;
+	enum protocol protocol = PROTO_LOCAL;
+	struct url_info local_info;
+	struct url_info *info = out_info? out_info : &local_info;
+	bool scp_syntax = false;
+
+	if (is_url(url_orig)) {
+		url_orig = url_decode(url_orig);
+	} else {
+		url_orig = xstrdup(url_orig);
+	}
+
+	strbuf_init(&url, strlen(url_orig) + sizeof("ssh://"));
+	strbuf_addstr(&url, url_orig);
+
+	host = strstr(url.buf, "://");
+	if (host) {
+		protocol = url_get_protocol(url.buf, host - url.buf);
+		host += 3;
+	} else {
+		if (!url_is_local_not_ssh(url.buf)) {
+			scp_syntax = true;
+			protocol = PROTO_SSH;
+			strbuf_insertstr(&url, 0, "ssh://");
+			host = url.buf + 6;
+		}
+	}
+
+	/* path starts after ':' in scp style SSH URLs */
+	if (scp_syntax) {
+		separator = strchr(host, ':');
+		if (separator) {
+			if (separator[1] == '/')
+				strbuf_remove(&url, separator - url.buf, 1);
+			else
+				*separator = '/';
+		}
+	}
+
+	detached = strbuf_detach(&url, NULL);
+	normalized = url_normalize(detached, info);
+	free(detached);
+
+	if (!normalized) {
+		return NULL;
+	}
+
+	/* point path to ~ for URL's like this:
+	 *
+	 *     ssh://host.xz/~user/repo
+	 *     git://host.xz/~user/repo
+	 *     host.xz:~user/repo
+	 *
+	 */
+	if (protocol == PROTO_GIT || protocol == PROTO_SSH) {
+		if (normalized[info->path_off + 1] == '~')
+			info->path_off++;
+	}
+
+	return normalized;
+}
+
 static size_t url_match_prefix(const char *url,
 			       const char *url_prefix,
 			       size_t url_prefix_len)
diff --git a/urlmatch.h b/urlmatch.h
index 5ba85cea139..6b3ce428582 100644
--- a/urlmatch.h
+++ b/urlmatch.h
@@ -35,6 +35,7 @@  struct url_info {
 };
 
 char *url_normalize(const char *, struct url_info *);
+char *url_parse(const char *, struct url_info *);
 
 struct urlmatch_item {
 	size_t hostmatch_len;