diff mbox series

[v4,5/8] test-http-server: add HTTP request parsing

Message ID 5c4e36e23eecbb7841078939a982b7150e2f4ab8.1670880984.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Enhance credential helper protocol to include auth headers | expand

Commit Message

Matthew John Cheetham Dec. 12, 2022, 9:36 p.m. UTC
From: Matthew John Cheetham <mjcheetham@outlook.com>

Add ability to parse HTTP requests to the test-http-server test helper.

Signed-off-by: Matthew John Cheetham <mjcheetham@outlook.com>
---
 t/helper/test-http-server.c | 176 +++++++++++++++++++++++++++++++++++-
 1 file changed, 174 insertions(+), 2 deletions(-)

Comments

Victoria Dye Dec. 14, 2022, 11:18 p.m. UTC | #1
Matthew John Cheetham via GitGitGadget wrote:
> +/*
> + * Read the HTTP request up to the start of the optional message-body.
> + * We do this byte-by-byte because we have keep-alive turned on and
> + * cannot rely on an EOF.
> + *
> + * https://tools.ietf.org/html/rfc7230
> + *
> + * We cannot call die() here because our caller needs to properly
> + * respond to the client and/or close the socket before this
> + * child exits so that the client doesn't get a connection reset
> + * by peer error.
> + */
> +static enum worker_result req__read(struct req *req, int fd)
> +{
> +	struct strbuf h = STRBUF_INIT;
> +	struct string_list start_line_fields = STRING_LIST_INIT_DUP;
> +	int nr_start_line_fields;
> +	const char *uri_target;
> +	const char *query;
> +	char *hp;
> +	const char *hv;
> +
> +	enum worker_result result = WR_OK;
> +
> +	/*
> +	 * Read line 0 of the request and split it into component parts:
> +	 *
> +	 *    <method> SP <uri-target> SP <HTTP-version> CRLF
> +	 *
> +	 */
> +	if (strbuf_getwholeline_fd(&req->start_line, fd, '\n') == EOF) {
> +		result = WR_OK | WR_HANGUP;
> +		goto done;
> +	}
> +
> +	strbuf_trim_trailing_newline(&req->start_line);
> +
> +	nr_start_line_fields = string_list_split(&start_line_fields,
> +						 req->start_line.buf,
> +						 ' ', -1);
> +	if (nr_start_line_fields != 3) {
> +		logerror("could not parse request start-line '%s'",
> +			 req->start_line.buf);
> +		result = WR_IO_ERROR;
> +		goto done;
> +	}
> +
> +	req->method = xstrdup(start_line_fields.items[0].string);
> +	req->http_version = xstrdup(start_line_fields.items[2].string);
> +
> +	uri_target = start_line_fields.items[1].string;
> +
> +	if (strcmp(req->http_version, "HTTP/1.1")) {
> +		logerror("unsupported version '%s' (expecting HTTP/1.1)",
> +			 req->http_version);
> +		result = WR_IO_ERROR;
> +		goto done;
> +	}
> +
> +	query = strchr(uri_target, '?');
> +
> +	if (query) {
> +		strbuf_add(&req->uri_path, uri_target, (query - uri_target));
> +		strbuf_trim_trailing_dir_sep(&req->uri_path);
> +		strbuf_addstr(&req->query_args, query + 1);
> +	} else {
> +		strbuf_addstr(&req->uri_path, uri_target);
> +		strbuf_trim_trailing_dir_sep(&req->uri_path);
> +	}

This "line 0" parsing looks good, and aligns with the RFC you linked
(specifically section 3.1.1 [1]).

[1] https://www.rfc-editor.org/rfc/rfc7230#section-3.1.1

> +
> +	/*
> +	 * Read the set of HTTP headers into a string-list.
> +	 */
> +	while (1) {
> +		if (strbuf_getwholeline_fd(&h, fd, '\n') == EOF)
> +			goto done;
> +		strbuf_trim_trailing_newline(&h);
> +
> +		if (!h.len)
> +			goto done; /* a blank line ends the header */
> +
> +		hp = strbuf_detach(&h, NULL);
> +		string_list_append(&req->header_list, hp);
> +
> +		/* store common request headers separately */
> +		if (skip_prefix(hp, "Content-Type: ", &hv)) {
> +			req->content_type = hv;
> +		} else if (skip_prefix(hp, "Content-Length: ", &hv)) {
> +			req->content_length = strtol(hv, &hp, 10);
> +		}

The "separately" is somewhat confusing - you unconditionally add 'hp' to
'req->header_list', so the "Content-Type" and "Content-Length" headers are
included there as well. If that's the desired behavior, a comment like "Also
store common headers as 'req' fields" might be clearer.

> +	}
> +
> +	/*
> +	 * We do not attempt to read the <message-body>, if it exists.
> +	 * We let our caller read/chunk it in as appropriate.
> +	 */
> +
> +done:
> +	string_list_clear(&start_line_fields, 0);
> +
> +	/*
> +	 * This is useful for debugging the request, but very noisy.
> +	 */
> +	if (trace2_is_enabled()) {

'trace2_printf()' is gated internally by 'trace2_enabled' anyway, so I don't
think this 'if()' is necessary. You could add a 'DEBUG_HTTP_SERVER'
preprocessor directive (like 'DEBUG_CACHE_TREE' in 'cache-tree.c') if you
wanted to prevent these printouts unless a developer sets it to '1'.

> +		struct string_list_item *item;
> +		trace2_printf("%s: %s", TR2_CAT, req->start_line.buf);
> +		trace2_printf("%s: hver: %s", TR2_CAT, req->http_version);
> +		trace2_printf("%s: hmth: %s", TR2_CAT, req->method);
> +		trace2_printf("%s: path: %s", TR2_CAT, req->uri_path.buf);
> +		trace2_printf("%s: qury: %s", TR2_CAT, req->query_args.buf);
> +		if (req->content_length >= 0)
> +			trace2_printf("%s: clen: %d", TR2_CAT, req->content_length);
> +		if (req->content_type)
> +			trace2_printf("%s: ctyp: %s", TR2_CAT, req->content_type);
> +		for_each_string_list_item(item, &req->header_list)
> +			trace2_printf("%s: hdrs: %s", TR2_CAT, item->string);
> +	}
> +
> +	return result;
> +}
> +
> +static enum worker_result dispatch(struct req *req)
> +{
> +	return send_http_error(1, 501, "Not Implemented", -1, NULL,
> +			       WR_OK | WR_HANGUP);

Although the request is now being read & parsed, the response creation code
is still a hardcoded "Not Implemented". This means that the now-parsed 'req'
will be temporarily unused, but I think that's reasonable (since it allows for
breaking up the implementation of 'test-http-server' into multiple, less
overwhelming patches).

> +}
> +
>  static enum worker_result worker(void)
>  {
> +	struct req req = REQ__INIT;
>  	char *client_addr = getenv("REMOTE_ADDR");
>  	char *client_port = getenv("REMOTE_PORT");
>  	enum worker_result wr = WR_OK;
> @@ -160,8 +324,16 @@ static enum worker_result worker(void)
>  	set_keep_alive(0);
>  
>  	while (1) {
> -		wr = send_http_error(1, 501, "Not Implemented", -1, NULL,
> -			WR_OK | WR_HANGUP);
> +		req__release(&req);
> +
> +		alarm(init_timeout ? init_timeout : timeout);
> +		wr = req__read(&req, 0);
> +		alarm(0);

I know 'init_timeout' and 'timeout' were pulled from 'daemon.c', but what's
the difference between them/why do they both exist? It looks like
'init_timeout' just acts as a permanent override to the value of 'timeout'.

> +
> +		if (wr & WR_STOP_THE_MUSIC)
> +			break;
> +
> +		wr = dispatch(&req);
>  		if (wr & WR_STOP_THE_MUSIC)
>  			break;
>  	}
Matthew John Cheetham Jan. 11, 2023, 9:39 p.m. UTC | #2
On 2022-12-14 15:18, Victoria Dye wrote:

> Matthew John Cheetham via GitGitGadget wrote:
>> +/*
>> + * Read the HTTP request up to the start of the optional message-body.
>> + * We do this byte-by-byte because we have keep-alive turned on and
>> + * cannot rely on an EOF.
>> + *
>> + * https://tools.ietf.org/html/rfc7230
>> + *
>> + * We cannot call die() here because our caller needs to properly
>> + * respond to the client and/or close the socket before this
>> + * child exits so that the client doesn't get a connection reset
>> + * by peer error.
>> + */
>> +static enum worker_result req__read(struct req *req, int fd)
>> +{
>> +	struct strbuf h = STRBUF_INIT;
>> +	struct string_list start_line_fields = STRING_LIST_INIT_DUP;
>> +	int nr_start_line_fields;
>> +	const char *uri_target;
>> +	const char *query;
>> +	char *hp;
>> +	const char *hv;
>> +
>> +	enum worker_result result = WR_OK;
>> +
>> +	/*
>> +	 * Read line 0 of the request and split it into component parts:
>> +	 *
>> +	 *    <method> SP <uri-target> SP <HTTP-version> CRLF
>> +	 *
>> +	 */
>> +	if (strbuf_getwholeline_fd(&req->start_line, fd, '\n') == EOF) {
>> +		result = WR_OK | WR_HANGUP;
>> +		goto done;
>> +	}
>> +
>> +	strbuf_trim_trailing_newline(&req->start_line);
>> +
>> +	nr_start_line_fields = string_list_split(&start_line_fields,
>> +						 req->start_line.buf,
>> +						 ' ', -1);
>> +	if (nr_start_line_fields != 3) {
>> +		logerror("could not parse request start-line '%s'",
>> +			 req->start_line.buf);
>> +		result = WR_IO_ERROR;
>> +		goto done;
>> +	}
>> +
>> +	req->method = xstrdup(start_line_fields.items[0].string);
>> +	req->http_version = xstrdup(start_line_fields.items[2].string);
>> +
>> +	uri_target = start_line_fields.items[1].string;
>> +
>> +	if (strcmp(req->http_version, "HTTP/1.1")) {
>> +		logerror("unsupported version '%s' (expecting HTTP/1.1)",
>> +			 req->http_version);
>> +		result = WR_IO_ERROR;
>> +		goto done;
>> +	}
>> +
>> +	query = strchr(uri_target, '?');
>> +
>> +	if (query) {
>> +		strbuf_add(&req->uri_path, uri_target, (query - uri_target));
>> +		strbuf_trim_trailing_dir_sep(&req->uri_path);
>> +		strbuf_addstr(&req->query_args, query + 1);
>> +	} else {
>> +		strbuf_addstr(&req->uri_path, uri_target);
>> +		strbuf_trim_trailing_dir_sep(&req->uri_path);
>> +	}
> 
> This "line 0" parsing looks good, and aligns with the RFC you linked
> (specifically section 3.1.1 [1]).
> 
> [1] https://www.rfc-editor.org/rfc/rfc7230#section-3.1.1
> 
>> +
>> +	/*
>> +	 * Read the set of HTTP headers into a string-list.
>> +	 */
>> +	while (1) {
>> +		if (strbuf_getwholeline_fd(&h, fd, '\n') == EOF)
>> +			goto done;
>> +		strbuf_trim_trailing_newline(&h);
>> +
>> +		if (!h.len)
>> +			goto done; /* a blank line ends the header */
>> +
>> +		hp = strbuf_detach(&h, NULL);
>> +		string_list_append(&req->header_list, hp);
>> +
>> +		/* store common request headers separately */
>> +		if (skip_prefix(hp, "Content-Type: ", &hv)) {
>> +			req->content_type = hv;
>> +		} else if (skip_prefix(hp, "Content-Length: ", &hv)) {
>> +			req->content_length = strtol(hv, &hp, 10);
>> +		}
> 
> The "separately" is somewhat confusing - you unconditionally add 'hp' to
> 'req->header_list', so the "Content-Type" and "Content-Length" headers are
> included there as well. If that's the desired behavior, a comment like "Also
> store common headers as 'req' fields" might be clearer.

Will clarify this comment in next roll. You are correct, we *also* store these
common headers on `struct req`.

>> +	}
>> +
>> +	/*
>> +	 * We do not attempt to read the <message-body>, if it exists.
>> +	 * We let our caller read/chunk it in as appropriate.
>> +	 */
>> +
>> +done:
>> +	string_list_clear(&start_line_fields, 0);
>> +
>> +	/*
>> +	 * This is useful for debugging the request, but very noisy.
>> +	 */
>> +	if (trace2_is_enabled()) {
> 
> 'trace2_printf()' is gated internally by 'trace2_enabled' anyway, so I don't
> think this 'if()' is necessary. You could add a 'DEBUG_HTTP_SERVER'
> preprocessor directive (like 'DEBUG_CACHE_TREE' in 'cache-tree.c') if you
> wanted to prevent these printouts unless a developer sets it to '1'.

The enclosing `trace2_is_enabled()` check is there to avoid repeating that
evaluation inside each `trace2_printf()` call, of which there may be many (one
per request header).

>> +		struct string_list_item *item;
>> +		trace2_printf("%s: %s", TR2_CAT, req->start_line.buf);
>> +		trace2_printf("%s: hver: %s", TR2_CAT, req->http_version);
>> +		trace2_printf("%s: hmth: %s", TR2_CAT, req->method);
>> +		trace2_printf("%s: path: %s", TR2_CAT, req->uri_path.buf);
>> +		trace2_printf("%s: qury: %s", TR2_CAT, req->query_args.buf);
>> +		if (req->content_length >= 0)
>> +			trace2_printf("%s: clen: %d", TR2_CAT, req->content_length);
>> +		if (req->content_type)
>> +			trace2_printf("%s: ctyp: %s", TR2_CAT, req->content_type);
>> +		for_each_string_list_item(item, &req->header_list)
>> +			trace2_printf("%s: hdrs: %s", TR2_CAT, item->string);
>> +	}
>> +
>> +	return result;
>> +}
>> +
>> +static enum worker_result dispatch(struct req *req)
>> +{
>> +	return send_http_error(1, 501, "Not Implemented", -1, NULL,
>> +			       WR_OK | WR_HANGUP);
> 
> Although the request is now being read & parsed, the response creation code
> is still a hardcoded "Not Implemented". This means that the now-parsed 'req'
> will be temporarily unused, but I think that's reasonable (since it allows for
> breaking up the implementation of 'test-http-server' into multiple, less
> overwhelming patches).
> 
>> +}
>> +
>>  static enum worker_result worker(void)
>>  {
>> +	struct req req = REQ__INIT;
>>  	char *client_addr = getenv("REMOTE_ADDR");
>>  	char *client_port = getenv("REMOTE_PORT");
>>  	enum worker_result wr = WR_OK;
>> @@ -160,8 +324,16 @@ static enum worker_result worker(void)
>>  	set_keep_alive(0);
>>  
>>  	while (1) {
>> -		wr = send_http_error(1, 501, "Not Implemented", -1, NULL,
>> -			WR_OK | WR_HANGUP);
>> +		req__release(&req);
>> +
>> +		alarm(init_timeout ? init_timeout : timeout);
>> +		wr = req__read(&req, 0);
>> +		alarm(0);
> 
> I know 'init_timeout' and 'timeout' were pulled from 'daemon.c', but what's
> the difference between them/why do they both exist? It looks like
> 'init_timeout' just acts as a permanent override to the value of 'timeout'.

Good catch. This split made sense in daemon.c, where the `--timeout` arg would
be passed to the `git-upload-pack` command, and `--init-timeout` is used as the
timeout value for the daemon server itself.

In the test HTTP server we don't need the differentiation so I'll just use the
simpler `--timeout` arg.

>> +
>> +		if (wr & WR_STOP_THE_MUSIC)
>> +			break;
>> +
>> +		wr = dispatch(&req);
>>  		if (wr & WR_STOP_THE_MUSIC)
>>  			break;
>>  	}
> 

Thanks,
Matthew
diff mbox series

Patch

diff --git a/t/helper/test-http-server.c b/t/helper/test-http-server.c
index 53508639714..7bde678e264 100644
--- a/t/helper/test-http-server.c
+++ b/t/helper/test-http-server.c
@@ -97,6 +97,42 @@  enum worker_result {
 	WR_STOP_THE_MUSIC = (WR_IO_ERROR | WR_HANGUP),
 };
 
+/*
+ * Fields from a parsed HTTP request.
+ */
+struct req {
+	struct strbuf start_line;
+
+	const char *method;
+	const char *http_version;
+
+	struct strbuf uri_path;
+	struct strbuf query_args;
+
+	struct string_list header_list;
+	const char *content_type;
+	ssize_t content_length;
+};
+
+#define REQ__INIT { \
+	.start_line = STRBUF_INIT, \
+	.uri_path = STRBUF_INIT, \
+	.query_args = STRBUF_INIT, \
+	.header_list = STRING_LIST_INIT_NODUP, \
+	.content_type = NULL, \
+	.content_length = -1 \
+	}
+
+static void req__release(struct req *req)
+{
+	strbuf_release(&req->start_line);
+
+	strbuf_release(&req->uri_path);
+	strbuf_release(&req->query_args);
+
+	string_list_clear(&req->header_list, 0);
+}
+
 static enum worker_result send_http_error(
 	int fd,
 	int http_code, const char *http_code_name,
@@ -148,8 +184,136 @@  done:
 	return wr;
 }
 
+/*
+ * Read the HTTP request up to the start of the optional message-body.
+ * We do this byte-by-byte because we have keep-alive turned on and
+ * cannot rely on an EOF.
+ *
+ * https://tools.ietf.org/html/rfc7230
+ *
+ * We cannot call die() here because our caller needs to properly
+ * respond to the client and/or close the socket before this
+ * child exits so that the client doesn't get a connection reset
+ * by peer error.
+ */
+static enum worker_result req__read(struct req *req, int fd)
+{
+	struct strbuf h = STRBUF_INIT;
+	struct string_list start_line_fields = STRING_LIST_INIT_DUP;
+	int nr_start_line_fields;
+	const char *uri_target;
+	const char *query;
+	char *hp;
+	const char *hv;
+
+	enum worker_result result = WR_OK;
+
+	/*
+	 * Read line 0 of the request and split it into component parts:
+	 *
+	 *    <method> SP <uri-target> SP <HTTP-version> CRLF
+	 *
+	 */
+	if (strbuf_getwholeline_fd(&req->start_line, fd, '\n') == EOF) {
+		result = WR_OK | WR_HANGUP;
+		goto done;
+	}
+
+	strbuf_trim_trailing_newline(&req->start_line);
+
+	nr_start_line_fields = string_list_split(&start_line_fields,
+						 req->start_line.buf,
+						 ' ', -1);
+	if (nr_start_line_fields != 3) {
+		logerror("could not parse request start-line '%s'",
+			 req->start_line.buf);
+		result = WR_IO_ERROR;
+		goto done;
+	}
+
+	req->method = xstrdup(start_line_fields.items[0].string);
+	req->http_version = xstrdup(start_line_fields.items[2].string);
+
+	uri_target = start_line_fields.items[1].string;
+
+	if (strcmp(req->http_version, "HTTP/1.1")) {
+		logerror("unsupported version '%s' (expecting HTTP/1.1)",
+			 req->http_version);
+		result = WR_IO_ERROR;
+		goto done;
+	}
+
+	query = strchr(uri_target, '?');
+
+	if (query) {
+		strbuf_add(&req->uri_path, uri_target, (query - uri_target));
+		strbuf_trim_trailing_dir_sep(&req->uri_path);
+		strbuf_addstr(&req->query_args, query + 1);
+	} else {
+		strbuf_addstr(&req->uri_path, uri_target);
+		strbuf_trim_trailing_dir_sep(&req->uri_path);
+	}
+
+	/*
+	 * Read the set of HTTP headers into a string-list.
+	 */
+	while (1) {
+		if (strbuf_getwholeline_fd(&h, fd, '\n') == EOF)
+			goto done;
+		strbuf_trim_trailing_newline(&h);
+
+		if (!h.len)
+			goto done; /* a blank line ends the header */
+
+		hp = strbuf_detach(&h, NULL);
+		string_list_append(&req->header_list, hp);
+
+		/* store common request headers separately */
+		if (skip_prefix(hp, "Content-Type: ", &hv)) {
+			req->content_type = hv;
+		} else if (skip_prefix(hp, "Content-Length: ", &hv)) {
+			req->content_length = strtol(hv, &hp, 10);
+		}
+	}
+
+	/*
+	 * We do not attempt to read the <message-body>, if it exists.
+	 * We let our caller read/chunk it in as appropriate.
+	 */
+
+done:
+	string_list_clear(&start_line_fields, 0);
+
+	/*
+	 * This is useful for debugging the request, but very noisy.
+	 */
+	if (trace2_is_enabled()) {
+		struct string_list_item *item;
+		trace2_printf("%s: %s", TR2_CAT, req->start_line.buf);
+		trace2_printf("%s: hver: %s", TR2_CAT, req->http_version);
+		trace2_printf("%s: hmth: %s", TR2_CAT, req->method);
+		trace2_printf("%s: path: %s", TR2_CAT, req->uri_path.buf);
+		trace2_printf("%s: qury: %s", TR2_CAT, req->query_args.buf);
+		if (req->content_length >= 0)
+			trace2_printf("%s: clen: %d", TR2_CAT, req->content_length);
+		if (req->content_type)
+			trace2_printf("%s: ctyp: %s", TR2_CAT, req->content_type);
+		for_each_string_list_item(item, &req->header_list)
+			trace2_printf("%s: hdrs: %s", TR2_CAT, item->string);
+	}
+
+	return result;
+}
+
+static enum worker_result dispatch(struct req *req)
+{
+	return send_http_error(1, 501, "Not Implemented", -1, NULL,
+			       WR_OK | WR_HANGUP);
+}
+
 static enum worker_result worker(void)
 {
+	struct req req = REQ__INIT;
 	char *client_addr = getenv("REMOTE_ADDR");
 	char *client_port = getenv("REMOTE_PORT");
 	enum worker_result wr = WR_OK;
@@ -160,8 +324,16 @@  static enum worker_result worker(void)
 	set_keep_alive(0);
 
 	while (1) {
-		wr = send_http_error(1, 501, "Not Implemented", -1, NULL,
-			WR_OK | WR_HANGUP);
+		req__release(&req);
+
+		alarm(init_timeout ? init_timeout : timeout);
+		wr = req__read(&req, 0);
+		alarm(0);
+
+		if (wr & WR_STOP_THE_MUSIC)
+			break;
+
+		wr = dispatch(&req);
 		if (wr & WR_STOP_THE_MUSIC)
 			break;
 	}