diff mbox series

[v4,5/8] transport: add client side capability to request object-info

Message ID 20220502170904.2770649-6-calvinwan@google.com (mailing list archive)
State New, archived
Headers show
Series cat-file: add --batch-command remote-object-info command | expand

Commit Message

Calvin Wan May 2, 2022, 5:09 p.m. UTC
Sometimes it is useful to get information about an object without having
to download it completely. The server logic has already been implemented
as “a2ba162cda (object-info: support for retrieving object info,
2021-04-20)”. This patch implements the client option for it. Currently,
only 'size' is supported and the server must be v2, however future
patches can implement additional options.

The question of version mismatch often comes up with client/server
relationships. There are two cases to consider here (assuming either
client or server functionality for object-info changes between the
different versions):

 1. client version > server version
 	- client can request additional attributes from server
 2. server version > client version
 	- server can return additional info to the client

The second case is a non-issue since the client would never be able to
request additional info from the server.  In order to solve the first
case, recall an earlier patch where the server sends back the attributes
even if no object ids are sent by the client. This allows the client to
first check whether the server can accept the requested attributes
before sending the entire request.

---
 fetch-pack.c       | 23 +++++++++++++
 fetch-pack.h       |  8 +++++
 transport-helper.c |  7 ++--
 transport.c        | 81 +++++++++++++++++++++++++++++++++++++++++++++-
 transport.h        | 11 +++++++
 5 files changed, 127 insertions(+), 3 deletions(-)

Comments

Junio C Hamano May 3, 2022, 12:54 a.m. UTC | #1
Calvin Wan <calvinwan@google.com> writes:

> The question of version mismatch often comes up with client/server
> relationships. There are two cases to consider here (assuming either
> client or server functionality for object-info changes between the
> different versions):
>
>  1. client version > server version
>  	- client can request additional attributes from server
>  2. server version > client version
>  	- server can return additional info to the client
>
> The second case is a non-issue since the client would never be able to
> request additional info from the server.

That assumes that we can never remove attributes once we start
supporting them, so it is not a non-issue exactly.  The way we make
it a non-issue is to make sure that the server returns no more than
what the client has asked, i.e. the same way we deal with the first
case.

> In order to solve the first
> case, recall an earlier patch where the server sends back the attributes
> even if no object ids are sent by the client. This allows the client to
> first check whether the server can accept the requested attributes
> before sending the entire request.

It requires two round-trips when the server does not know something
the client has asked, one without any object names, and then a real
request with the list of object names?

I think in practice, because the way the previous step is
structured, you can send a full request and in the non-error
codepath the server side will understand everything and give full
response back without an extra round-trip.  Only in the error case,
the client will see the set of acceptable attributes and no object
data (and implicitly treat that as an indication of an error), but
then what would the client do after that happens?

 - The client will just give up; the attributes it wanted but the
   server couldn't supply are so fundamentally necessary to perform
   whatever the client wanted to do.

 - The client will retry the request by asking a known subset of
   attributes for the same set of objects.  This requires an extra
   round-trip.  If an earlier step didn't make .unknown case bypass
   the response loop, this extra round-trip would have been totally
   unnecessary.

> index 8c7752fc82..c27018d48c 100644
> --- a/fetch-pack.h
> +++ b/fetch-pack.h
> @@ -69,6 +69,12 @@ struct fetch_pack_args {
>  	unsigned connectivity_checked:1;
>  };
>  
> +struct object_info_args {
> +	const struct string_list *object_info_options;
> +	const struct string_list *server_options;
> +	const struct oid_array *oids;
> +};

These are const pointers because these lists are not owned by this
structure and are borrowed from somebody else?

> diff --git a/transport-helper.c b/transport-helper.c
> index b4dbbabb0c..093946f7fd 100644
> --- a/transport-helper.c
> +++ b/transport-helper.c
> @@ -686,13 +686,16 @@ static int fetch_refs(struct transport *transport,
>  
>  	/*
>  	 * If we reach here, then the server, the client, and/or the transport
> -	 * helper does not support protocol v2. --negotiate-only requires
> -	 * protocol v2.
> +	 * helper does not support protocol v2. --negotiate-only and --object-info
> +	 * require protocol v2.
>  	 */

OK.

>  	if (data->transport_options.acked_commits) {
>  		warning(_("--negotiate-only requires protocol v2"));
>  		return -1;
>  	}
> +	if (transport->smart_options->object_info) {
> +		die(_("--object-info requires protocol v2"));
> +	}

OK.

> diff --git a/transport.c b/transport.c
> index 3d64a43ab3..08c505e1d0 100644
> --- a/transport.c
> +++ b/transport.c
> @@ -353,6 +353,79 @@ static struct ref *handshake(struct transport *transport, int for_push,
>  	return refs;
>  }
>  
> +static int fetch_object_info(struct transport *transport, struct object_info **object_info_data)
> +{
> +	size_t i;
> +	int size_index = -1;
> +	struct git_transport_data *data = transport->data;
> +	struct object_info_args args;
> +	struct packet_reader reader;
> +	struct string_list server_attributes = STRING_LIST_INIT_DUP;
> +
> +	memset(&args, 0, sizeof(args));
> +	args.server_options = transport->server_options;
> +	args.object_info_options = transport->smart_options->object_info_options;

It is unclear who fills smart_options->object_info_options and from
what.  Are we referring to something that is not yet used, or worse
yet, code that is not yet written?

It seems that the result of applying 1-5/8 does not compile.

> +	connect_setup(transport, 0);
> +	packet_reader_init(&reader, data->fd[0], NULL, 0,
> +			PACKET_READ_CHOMP_NEWLINE |
> +			PACKET_READ_DIE_ON_ERR_PACKET);
> +	data->version = discover_version(&reader);
> +
> +	transport->hash_algo = reader.hash_algo;
> +
> +	switch (data->version) {
> +	case protocol_v2:
> +		if (!server_supports_v2("object-info", 0))
> +			return -1;
> +		/**
> +		 * Send a request with only attributes first. If server can return all
> +		 * of the requested attributes, then send request with object ids
> +		 */

Doesn't it force the most pessimistic behaviour, i.e. always do a
likely-to-be-useless (when the feature becomes widely avaiable)
probe round-trip before making a real request?

> +		send_object_info_request(data->fd[1], &args);
> +		if (packet_reader_read(&reader) != PACKET_READ_NORMAL) {
> +			check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
> +			return -1;
> +		}

All these check_stateless_delimiter() calls in this patch are on
overly long lines.

> +		string_list_split(&server_attributes, reader.line, ' ', -1);
> +		packet_reader_read(&reader);
> +		check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
> +		if (server_attributes.nr != args.object_info_options->nr)
> +			return -1;

So, I am guessing (without having the code that prepares the right
hand side) that we are comparing the number of attributes we asked
(on the right hand side) and the number of attributes the other side
said it supports in response to our request (on the left hand
side).  

This does not protect against a server that responds with attribute
names duplicated or any somewhat odd third-party servers that are
not identical to what you wrote here.  I would rather see us to loop
through object_info_options array and see if server_attributes list
says all of them are supported more explicitly.  That can be done in
the loop below, where you'd figure out the "foo_index" for each
attribute "foo" you are supporting in the code.

> +		for (i = 0; i < server_attributes.nr; i++) {
> +			if (!strcmp(server_attributes.items[i].string, "size"))
> +				size_index = i + 1;
> +		}

size_index was initialized to -1 and I was expecting we can tell the
attribute is not used by checking (size_index < 0), but this seems
to make size_index 1 based, not 0 based.  Intended?

I think this is to skip the object name that comes as the first item
in the response, but it would be more "pure" to keep foo_index
0-based and add the offset by whatever constant number of things
that come in front (currently, 1 for object name) near a lot closer
to where we parse and read the data, i.e. in the while() loop below.

> +		args.oids = transport->smart_options->object_info_oids;
> +		send_object_info_request(data->fd[1], &args);

And this is the second round-trip.

> +		break;
> +	case protocol_v1:
> +	case protocol_v0:
> +		die(_("wrong protocol version. expected v2"));
> +	case protocol_unknown_version:
> +		BUG("unknown protocol version");
> +	}
> +
> +	if (packet_reader_read(&reader) != PACKET_READ_NORMAL)
> +		die(_("error reading object info header"));
> +	i = 0;
> +	while (packet_reader_read(&reader) == PACKET_READ_NORMAL) {
> +		struct string_list object_info_values = STRING_LIST_INIT_DUP;
> +
> +		string_list_split(&object_info_values, reader.line, ' ', -1);
> +		if (size_index > 0) {
> +			if (!strcmp(object_info_values.items[size_index].string, ""))
> +				die("object-info: not our ref %s",
> +					object_info_values.items[0].string);
> +			*(*object_info_data)[i].sizep = strtoul(object_info_values.items[size_index].string, NULL, 10);
> +		}
> +		i++;
> +	}
> +	check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
> +
> +	return 0;
> +}

All of the above look written for the best case scenario (i.e. it
assumes there won't be any unexpected type of response), which makes
a good fuzz target.  A malicious server side can throw non-number in
the "size" slot and strtoul() does not check for a malformatted
number, or it can return more response records than it was
requested, and overflow object_info_data[] array the caller of this
function prepared.

smart_options->object_info_oids may know how many response records
we should expect, so at least this while() loop should be able to
protect against an object_info_data[] overflow attack using it as
the upper limit of i.

>  static struct ref *get_refs_via_connect(struct transport *transport, int for_push,
>  					struct transport_ls_refs_options *options)
>  {
> @@ -392,8 +465,14 @@ static int fetch_refs_via_pack(struct transport *transport,
>  	args.server_options = transport->server_options;
>  	args.negotiation_tips = data->options.negotiation_tips;
>  	args.reject_shallow_remote = transport->smart_options->reject_shallow;
> +	args.object_info = transport->smart_options->object_info;
>  
> -	if (!data->got_remote_heads) {
> +	if (transport->smart_options && transport->smart_options->object_info) {
> +		if (!fetch_object_info(transport, data->options.object_info_data))
> +			goto cleanup;
> +		ret = -1;
> +		goto cleanup;
> +	} else if (!data->got_remote_heads) {
>  		int i;
>  		int must_list_refs = 0;
>  		for (i = 0; i < nr_heads; i++) {
> diff --git a/transport.h b/transport.h
> index 12bc08fc33..dbf60bb710 100644
> --- a/transport.h
> +++ b/transport.h
> @@ -6,6 +6,7 @@
>  #include "remote.h"
>  #include "list-objects-filter-options.h"
>  #include "string-list.h"
> +#include "object-store.h"
>  
>  struct git_transport_options {
>  	unsigned thin : 1;
> @@ -31,6 +32,12 @@ struct git_transport_options {
>  	 */
>  	unsigned connectivity_checked:1;
>  
> +	/*
> +	 * Transport will attempt to pull only object-info. Fallbacks
> +	 * to pulling entire object if object-info is not supported
> +	 */
> +	unsigned object_info : 1;
> +
>  	int depth;
>  	const char *deepen_since;
>  	const struct string_list *deepen_not;
> @@ -54,6 +61,10 @@ struct git_transport_options {
>  	 * common commits to this oidset instead of fetching any packfiles.
>  	 */
>  	struct oidset *acked_commits;
> +
> +	struct oid_array *object_info_oids;
> +	struct object_info **object_info_data;
> +	struct string_list *object_info_options;
>  };
>  
>  enum transport_family {
Calvin Wan May 3, 2022, 6:58 p.m. UTC | #2
> It requires two round-trips when the server does not know something
> the client has asked, one without any object names, and then a real
> request with the list of object names?

> I think in practice, because the way the previous step is
> structured, you can send a full request and in the non-error
> codepath the server side will understand everything and give full
> response back without an extra round-trip.  Only in the error case,
> the client will see the set of acceptable attributes and no object
> data (and implicitly treat that as an indication of an error), but
> then what would the client do after that happens?

> - The client will just give up; the attributes it wanted but the
>   server couldn't supply are so fundamentally necessary to perform
>   whatever the client wanted to do.

> - The client will retry the request by asking a known subset of
>   attributes for the same set of objects.  This requires an extra
>   round-trip.  If an earlier step didn't make .unknown case bypass
>   the response loop, this extra round-trip would have been totally
>   unnecessary.

I do agree that the two round trips are unnecessary and I can send the
full request to begin with. The client (after patch 6) will fallback to
fetch after this initial request fails.

> These are const pointers because these lists are not owned by this
> structure and are borrowed from somebody else?

Yes git_transport_options holds them.

> It is unclear who fills smart_options->object_info_options and from
> what.  Are we referring to something that is not yet used, or worse
> yet, code that is not yet written?

The code that calls this function is in patch 8. I decided to separate
out the transport side and the new command in cat-file --batch-command
to make it easier to explain to a reviewer how the individual parts
work. Looking holistically at your comments, should I have gone about
splitting the patches differently? It seems like I was a little too
aggressive this time, creating confusion by requiring the reviewer to
look at the next patch to answer some questions.

> It seems that the result of applying 1-5/8 does not compile.

I'll check why this is the case. I should get in the habit of compiling
all of my patches individually.

> size_index was initialized to -1 and I was expecting we can tell the
> attribute is not used by checking (size_index < 0), but this seems
> to make size_index 1 based, not 0 based.  Intended?

Yes the server returns the packet as "object_id SP size" so the first
value is always the object_id.

> All of the above look written for the best case scenario (i.e. it
> assumes there won't be any unexpected type of response), which makes
> a good fuzz target.

I didn't take into account the possibility of a malicious server in my
implementation. Will go back through and protect against it.


On Mon, May 2, 2022 at 5:54 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Calvin Wan <calvinwan@google.com> writes:
>
> > The question of version mismatch often comes up with client/server
> > relationships. There are two cases to consider here (assuming either
> > client or server functionality for object-info changes between the
> > different versions):
> >
> >  1. client version > server version
> >       - client can request additional attributes from server
> >  2. server version > client version
> >       - server can return additional info to the client
> >
> > The second case is a non-issue since the client would never be able to
> > request additional info from the server.
>
> That assumes that we can never remove attributes once we start
> supporting them, so it is not a non-issue exactly.  The way we make
> it a non-issue is to make sure that the server returns no more than
> what the client has asked, i.e. the same way we deal with the first
> case.
>
> > In order to solve the first
> > case, recall an earlier patch where the server sends back the attributes
> > even if no object ids are sent by the client. This allows the client to
> > first check whether the server can accept the requested attributes
> > before sending the entire request.
>
> It requires two round-trips when the server does not know something
> the client has asked, one without any object names, and then a real
> request with the list of object names?
>
> I think in practice, because the way the previous step is
> structured, you can send a full request and in the non-error
> codepath the server side will understand everything and give full
> response back without an extra round-trip.  Only in the error case,
> the client will see the set of acceptable attributes and no object
> data (and implicitly treat that as an indication of an error), but
> then what would the client do after that happens?
>
>  - The client will just give up; the attributes it wanted but the
>    server couldn't supply are so fundamentally necessary to perform
>    whatever the client wanted to do.
>
>  - The client will retry the request by asking a known subset of
>    attributes for the same set of objects.  This requires an extra
>    round-trip.  If an earlier step didn't make .unknown case bypass
>    the response loop, this extra round-trip would have been totally
>    unnecessary.
>
> > index 8c7752fc82..c27018d48c 100644
> > --- a/fetch-pack.h
> > +++ b/fetch-pack.h
> > @@ -69,6 +69,12 @@ struct fetch_pack_args {
> >       unsigned connectivity_checked:1;
> >  };
> >
> > +struct object_info_args {
> > +     const struct string_list *object_info_options;
> > +     const struct string_list *server_options;
> > +     const struct oid_array *oids;
> > +};
>
> These are const pointers because these lists are not owned by this
> structure and are borrowed from somebody else?
>
> > diff --git a/transport-helper.c b/transport-helper.c
> > index b4dbbabb0c..093946f7fd 100644
> > --- a/transport-helper.c
> > +++ b/transport-helper.c
> > @@ -686,13 +686,16 @@ static int fetch_refs(struct transport *transport,
> >
> >       /*
> >        * If we reach here, then the server, the client, and/or the transport
> > -      * helper does not support protocol v2. --negotiate-only requires
> > -      * protocol v2.
> > +      * helper does not support protocol v2. --negotiate-only and --object-info
> > +      * require protocol v2.
> >        */
>
> OK.
>
> >       if (data->transport_options.acked_commits) {
> >               warning(_("--negotiate-only requires protocol v2"));
> >               return -1;
> >       }
> > +     if (transport->smart_options->object_info) {
> > +             die(_("--object-info requires protocol v2"));
> > +     }
>
> OK.
>
> > diff --git a/transport.c b/transport.c
> > index 3d64a43ab3..08c505e1d0 100644
> > --- a/transport.c
> > +++ b/transport.c
> > @@ -353,6 +353,79 @@ static struct ref *handshake(struct transport *transport, int for_push,
> >       return refs;
> >  }
> >
> > +static int fetch_object_info(struct transport *transport, struct object_info **object_info_data)
> > +{
> > +     size_t i;
> > +     int size_index = -1;
> > +     struct git_transport_data *data = transport->data;
> > +     struct object_info_args args;
> > +     struct packet_reader reader;
> > +     struct string_list server_attributes = STRING_LIST_INIT_DUP;
> > +
> > +     memset(&args, 0, sizeof(args));
> > +     args.server_options = transport->server_options;
> > +     args.object_info_options = transport->smart_options->object_info_options;
>
> It is unclear who fills smart_options->object_info_options and from
> what.  Are we referring to something that is not yet used, or worse
> yet, code that is not yet written?
>
> It seems that the result of applying 1-5/8 does not compile.
>
> > +     connect_setup(transport, 0);
> > +     packet_reader_init(&reader, data->fd[0], NULL, 0,
> > +                     PACKET_READ_CHOMP_NEWLINE |
> > +                     PACKET_READ_DIE_ON_ERR_PACKET);
> > +     data->version = discover_version(&reader);
> > +
> > +     transport->hash_algo = reader.hash_algo;
> > +
> > +     switch (data->version) {
> > +     case protocol_v2:
> > +             if (!server_supports_v2("object-info", 0))
> > +                     return -1;
> > +             /**
> > +              * Send a request with only attributes first. If server can return all
> > +              * of the requested attributes, then send request with object ids
> > +              */
>
> Doesn't it force the most pessimistic behaviour, i.e. always do a
> likely-to-be-useless (when the feature becomes widely avaiable)
> probe round-trip before making a real request?
>
> > +             send_object_info_request(data->fd[1], &args);
> > +             if (packet_reader_read(&reader) != PACKET_READ_NORMAL) {
> > +                     check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
> > +                     return -1;
> > +             }
>
> All these check_stateless_delimiter() calls in this patch are on
> overly long lines.
>
> > +             string_list_split(&server_attributes, reader.line, ' ', -1);
> > +             packet_reader_read(&reader);
> > +             check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
> > +             if (server_attributes.nr != args.object_info_options->nr)
> > +                     return -1;
>
> So, I am guessing (without having the code that prepares the right
> hand side) that we are comparing the number of attributes we asked
> (on the right hand side) and the number of attributes the other side
> said it supports in response to our request (on the left hand
> side).
>
> This does not protect against a server that responds with attribute
> names duplicated or any somewhat odd third-party servers that are
> not identical to what you wrote here.  I would rather see us to loop
> through object_info_options array and see if server_attributes list
> says all of them are supported more explicitly.  That can be done in
> the loop below, where you'd figure out the "foo_index" for each
> attribute "foo" you are supporting in the code.
>
> > +             for (i = 0; i < server_attributes.nr; i++) {
> > +                     if (!strcmp(server_attributes.items[i].string, "size"))
> > +                             size_index = i + 1;
> > +             }
>
> size_index was initialized to -1 and I was expecting we can tell the
> attribute is not used by checking (size_index < 0), but this seems
> to make size_index 1 based, not 0 based.  Intended?
>
> I think this is to skip the object name that comes as the first item
> in the response, but it would be more "pure" to keep foo_index
> 0-based and add the offset by whatever constant number of things
> that come in front (currently, 1 for object name) near a lot closer
> to where we parse and read the data, i.e. in the while() loop below.
>
> > +             args.oids = transport->smart_options->object_info_oids;
> > +             send_object_info_request(data->fd[1], &args);
>
> And this is the second round-trip.
>
> > +             break;
> > +     case protocol_v1:
> > +     case protocol_v0:
> > +             die(_("wrong protocol version. expected v2"));
> > +     case protocol_unknown_version:
> > +             BUG("unknown protocol version");
> > +     }
> > +
> > +     if (packet_reader_read(&reader) != PACKET_READ_NORMAL)
> > +             die(_("error reading object info header"));
> > +     i = 0;
> > +     while (packet_reader_read(&reader) == PACKET_READ_NORMAL) {
> > +             struct string_list object_info_values = STRING_LIST_INIT_DUP;
> > +
> > +             string_list_split(&object_info_values, reader.line, ' ', -1);
> > +             if (size_index > 0) {
> > +                     if (!strcmp(object_info_values.items[size_index].string, ""))
> > +                             die("object-info: not our ref %s",
> > +                                     object_info_values.items[0].string);
> > +                     *(*object_info_data)[i].sizep = strtoul(object_info_values.items[size_index].string, NULL, 10);
> > +             }
> > +             i++;
> > +     }
> > +     check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
> > +
> > +     return 0;
> > +}
>
> All of the above look written for the best case scenario (i.e. it
> assumes there won't be any unexpected type of response), which makes
> a good fuzz target.  A malicious server side can throw non-number in
> the "size" slot and strtoul() does not check for a malformatted
> number, or it can return more response records than it was
> requested, and overflow object_info_data[] array the caller of this
> function prepared.
>
> smart_options->object_info_oids may know how many response records
> we should expect, so at least this while() loop should be able to
> protect against an object_info_data[] overflow attack using it as
> the upper limit of i.
>
> >  static struct ref *get_refs_via_connect(struct transport *transport, int for_push,
> >                                       struct transport_ls_refs_options *options)
> >  {
> > @@ -392,8 +465,14 @@ static int fetch_refs_via_pack(struct transport *transport,
> >       args.server_options = transport->server_options;
> >       args.negotiation_tips = data->options.negotiation_tips;
> >       args.reject_shallow_remote = transport->smart_options->reject_shallow;
> > +     args.object_info = transport->smart_options->object_info;
> >
> > -     if (!data->got_remote_heads) {
> > +     if (transport->smart_options && transport->smart_options->object_info) {
> > +             if (!fetch_object_info(transport, data->options.object_info_data))
> > +                     goto cleanup;
> > +             ret = -1;
> > +             goto cleanup;
> > +     } else if (!data->got_remote_heads) {
> >               int i;
> >               int must_list_refs = 0;
> >               for (i = 0; i < nr_heads; i++) {
> > diff --git a/transport.h b/transport.h
> > index 12bc08fc33..dbf60bb710 100644
> > --- a/transport.h
> > +++ b/transport.h
> > @@ -6,6 +6,7 @@
> >  #include "remote.h"
> >  #include "list-objects-filter-options.h"
> >  #include "string-list.h"
> > +#include "object-store.h"
> >
> >  struct git_transport_options {
> >       unsigned thin : 1;
> > @@ -31,6 +32,12 @@ struct git_transport_options {
> >        */
> >       unsigned connectivity_checked:1;
> >
> > +     /*
> > +      * Transport will attempt to pull only object-info. Fallbacks
> > +      * to pulling entire object if object-info is not supported
> > +      */
> > +     unsigned object_info : 1;
> > +
> >       int depth;
> >       const char *deepen_since;
> >       const struct string_list *deepen_not;
> > @@ -54,6 +61,10 @@ struct git_transport_options {
> >        * common commits to this oidset instead of fetching any packfiles.
> >        */
> >       struct oidset *acked_commits;
> > +
> > +     struct oid_array *object_info_oids;
> > +     struct object_info **object_info_data;
> > +     struct string_list *object_info_options;
> >  };
> >
> >  enum transport_family {
Jonathan Tan May 3, 2022, 11:15 p.m. UTC | #3
Calvin Wan <calvinwan@google.com> writes:
> Sometimes it is useful to get information about an object without having
> to download it completely. The server logic has already been implemented
> as “a2ba162cda (object-info: support for retrieving object info,
> 2021-04-20)”. This patch implements the client option for it. Currently,
> only 'size' is supported and the server must be v2, however future
> patches can implement additional options.
> 
> The question of version mismatch often comes up with client/server
> relationships. There are two cases to consider here (assuming either
> client or server functionality for object-info changes between the
> different versions):
> 
>  1. client version > server version
>  	- client can request additional attributes from server
>  2. server version > client version
>  	- server can return additional info to the client
> 
> The second case is a non-issue since the client would never be able to
> request additional info from the server.  In order to solve the first
> case, recall an earlier patch where the server sends back the attributes
> even if no object ids are sent by the client. This allows the client to
> first check whether the server can accept the requested attributes
> before sending the entire request.

From this description, it seems like the intention is to send an
object-info request, and then if the server responds in a certain way
(here, sending back the attributes - presumably without sending any
actual information), then we know that the server doesn't support our
request and we need to fall back. As Junio says [1], this requires one
RTT more than necessary. We could just determine if the server supports
the attributes we want by using capabilities (without needing to send
the request to check).

[1] https://lore.kernel.org/git/xmqqilqnsaep.fsf@gitster.g/
Junio C Hamano May 4, 2022, 3:42 p.m. UTC | #4
Calvin Wan <calvinwan@google.com> writes:

>> It seems that the result of applying 1-5/8 does not compile.
>
> I'll check why this is the case. I should get in the habit of compiling
> all of my patches individually.

For a 8-patch series, we should make sure that all 8 states resuting
from applying patches 1-thru-N for each N (1 <= N <= 8) builds and
tests OK for bisectability.

I often do not check that due to lack of time on the receiving end,
but for this series, I didn't understand the sudden appearance of
the object_info_options member in the code that didn't even seem to
be populated anywhere, and I noticed that the series was organized
incorrectly.  Perhaps a simple rebase misordering?

>> size_index was initialized to -1 and I was expecting we can tell the
>> attribute is not used by checking (size_index < 0), but this seems
>> to make size_index 1 based, not 0 based.  Intended?
>
> Yes the server returns the packet as "object_id SP size" so the first
> value is always the object_id.

I think this is to skip the object name that comes as the first item
in the response, but it would be more "pure" to keep foo_index
0-based and add the offset by whatever constant number of things
that come in front (currently, 1 for object name) near a lot closer
to where we parse and read the data, i.e. in the while() loop below.

IOW, the code that sets size_index based on the attribute query
response should not have to know how many fixed elements will come
before these attributes on the response lines.  That knowledge
belongs to the code below:

> +     i = 0;
> +     while (packet_reader_read(&reader) == PACKET_READ_NORMAL) {
> +             struct string_list object_info_values = STRING_LIST_INIT_DUP;
> +
> +             string_list_split(&object_info_values, reader.line, ' ', -1);
> +             if (size_index > 0) {
> +                     if (!strcmp(object_info_values.items[size_index].string, ""))

And here the code that uses size_index should be like

		if (0 <= size_index &&
		    !strcmp(object_info_values.items[1 + size_index].string, ""))
			...

if we wanted the logic to be more "pure" and keep foo_index 0-based.
Junio C Hamano May 4, 2022, 3:50 p.m. UTC | #5
Jonathan Tan <jonathantanmy@google.com> writes:

> RTT more than necessary. We could just determine if the server supports
> the attributes we want by using capabilities (without needing to send
> the request to check).

Hmph.  capabilities may cut in both ways, though.

Those clients that are not interested in object-info at all (which
are the majority of case where people clone, fetch or ls-remote),
they do not even need to know what kind of object attributes the
object-info command is prepared to report, and they would appreciate
not having to spend any extra byte in the capability-advertisement.

Of course, for object-info clients, having it upfront does make
things simpler.

So...
diff mbox series

Patch

diff --git a/fetch-pack.c b/fetch-pack.c
index 45473b4602..506ca28e05 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -1278,6 +1278,29 @@  static void write_command_and_capabilities(struct strbuf *req_buf,
 	packet_buf_delim(req_buf);
 }
 
+void send_object_info_request(int fd_out, struct object_info_args *args)
+{
+	struct strbuf req_buf = STRBUF_INIT;
+	struct string_list unsorted_object_info_options = *args->object_info_options;
+	size_t i;
+
+	write_command_and_capabilities(&req_buf, args->server_options, "object-info");
+
+	if (unsorted_string_list_has_string(&unsorted_object_info_options, "size"))
+		packet_buf_write(&req_buf, "size");
+
+	if (args->oids) {
+		for (i = 0; i < args->oids->nr; i++)
+			packet_buf_write(&req_buf, "oid %s", oid_to_hex(&args->oids->oid[i]));
+	}
+
+	packet_buf_flush(&req_buf);
+	if (write_in_full(fd_out, req_buf.buf, req_buf.len) < 0)
+		die_errno(_("unable to write request to remote"));
+
+	strbuf_release(&req_buf);
+}
+
 static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 			      struct fetch_pack_args *args,
 			      const struct ref *wants, struct oidset *common,
diff --git a/fetch-pack.h b/fetch-pack.h
index 8c7752fc82..c27018d48c 100644
--- a/fetch-pack.h
+++ b/fetch-pack.h
@@ -69,6 +69,12 @@  struct fetch_pack_args {
 	unsigned connectivity_checked:1;
 };
 
+struct object_info_args {
+	const struct string_list *object_info_options;
+	const struct string_list *server_options;
+	const struct oid_array *oids;
+};
+
 /*
  * sought represents remote references that should be updated from.
  * On return, the names that were found on the remote will have been
@@ -102,4 +108,6 @@  void negotiate_using_fetch(const struct oid_array *negotiation_tips,
  */
 int report_unmatched_refs(struct ref **sought, int nr_sought);
 
+void send_object_info_request(int fd_out, struct object_info_args *args);
+
 #endif
diff --git a/transport-helper.c b/transport-helper.c
index b4dbbabb0c..093946f7fd 100644
--- a/transport-helper.c
+++ b/transport-helper.c
@@ -686,13 +686,16 @@  static int fetch_refs(struct transport *transport,
 
 	/*
 	 * If we reach here, then the server, the client, and/or the transport
-	 * helper does not support protocol v2. --negotiate-only requires
-	 * protocol v2.
+	 * helper does not support protocol v2. --negotiate-only and --object-info
+	 * require protocol v2.
 	 */
 	if (data->transport_options.acked_commits) {
 		warning(_("--negotiate-only requires protocol v2"));
 		return -1;
 	}
+	if (transport->smart_options->object_info) {
+		die(_("--object-info requires protocol v2"));
+	}
 
 	if (!data->get_refs_list_called)
 		get_refs_list_using_list(transport, 0);
diff --git a/transport.c b/transport.c
index 3d64a43ab3..08c505e1d0 100644
--- a/transport.c
+++ b/transport.c
@@ -353,6 +353,79 @@  static struct ref *handshake(struct transport *transport, int for_push,
 	return refs;
 }
 
+static int fetch_object_info(struct transport *transport, struct object_info **object_info_data)
+{
+	size_t i;
+	int size_index = -1;
+	struct git_transport_data *data = transport->data;
+	struct object_info_args args;
+	struct packet_reader reader;
+	struct string_list server_attributes = STRING_LIST_INIT_DUP;
+
+	memset(&args, 0, sizeof(args));
+	args.server_options = transport->server_options;
+	args.object_info_options = transport->smart_options->object_info_options;
+
+	connect_setup(transport, 0);
+	packet_reader_init(&reader, data->fd[0], NULL, 0,
+			PACKET_READ_CHOMP_NEWLINE |
+			PACKET_READ_DIE_ON_ERR_PACKET);
+	data->version = discover_version(&reader);
+
+	transport->hash_algo = reader.hash_algo;
+
+	switch (data->version) {
+	case protocol_v2:
+		if (!server_supports_v2("object-info", 0))
+			return -1;
+		/**
+		 * Send a request with only attributes first. If server can return all
+		 * of the requested attributes, then send request with object ids
+		 */
+		send_object_info_request(data->fd[1], &args);
+		if (packet_reader_read(&reader) != PACKET_READ_NORMAL) {
+			check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
+			return -1;
+		}
+		string_list_split(&server_attributes, reader.line, ' ', -1);
+		packet_reader_read(&reader);
+		check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
+		if (server_attributes.nr != args.object_info_options->nr)
+			return -1;
+		for (i = 0; i < server_attributes.nr; i++) {
+			if (!strcmp(server_attributes.items[i].string, "size"))
+				size_index = i + 1;
+		}
+		args.oids = transport->smart_options->object_info_oids;
+		send_object_info_request(data->fd[1], &args);
+		break;
+	case protocol_v1:
+	case protocol_v0:
+		die(_("wrong protocol version. expected v2"));
+	case protocol_unknown_version:
+		BUG("unknown protocol version");
+	}
+
+	if (packet_reader_read(&reader) != PACKET_READ_NORMAL)
+		die(_("error reading object info header"));
+	i = 0;
+	while (packet_reader_read(&reader) == PACKET_READ_NORMAL) {
+		struct string_list object_info_values = STRING_LIST_INIT_DUP;
+
+		string_list_split(&object_info_values, reader.line, ' ', -1);
+		if (size_index > 0) {
+			if (!strcmp(object_info_values.items[size_index].string, ""))
+				die("object-info: not our ref %s",
+					object_info_values.items[0].string);
+			*(*object_info_data)[i].sizep = strtoul(object_info_values.items[size_index].string, NULL, 10);
+		}
+		i++;
+	}
+	check_stateless_delimiter(transport->stateless_rpc, &reader, "stateless delimiter expected");
+
+	return 0;
+}
+
 static struct ref *get_refs_via_connect(struct transport *transport, int for_push,
 					struct transport_ls_refs_options *options)
 {
@@ -392,8 +465,14 @@  static int fetch_refs_via_pack(struct transport *transport,
 	args.server_options = transport->server_options;
 	args.negotiation_tips = data->options.negotiation_tips;
 	args.reject_shallow_remote = transport->smart_options->reject_shallow;
+	args.object_info = transport->smart_options->object_info;
 
-	if (!data->got_remote_heads) {
+	if (transport->smart_options && transport->smart_options->object_info) {
+		if (!fetch_object_info(transport, data->options.object_info_data))
+			goto cleanup;
+		ret = -1;
+		goto cleanup;
+	} else if (!data->got_remote_heads) {
 		int i;
 		int must_list_refs = 0;
 		for (i = 0; i < nr_heads; i++) {
diff --git a/transport.h b/transport.h
index 12bc08fc33..dbf60bb710 100644
--- a/transport.h
+++ b/transport.h
@@ -6,6 +6,7 @@ 
 #include "remote.h"
 #include "list-objects-filter-options.h"
 #include "string-list.h"
+#include "object-store.h"
 
 struct git_transport_options {
 	unsigned thin : 1;
@@ -31,6 +32,12 @@  struct git_transport_options {
 	 */
 	unsigned connectivity_checked:1;
 
+	/*
+	 * Transport will attempt to pull only object-info. Fallbacks
+	 * to pulling entire object if object-info is not supported
+	 */
+	unsigned object_info : 1;
+
 	int depth;
 	const char *deepen_since;
 	const struct string_list *deepen_not;
@@ -54,6 +61,10 @@  struct git_transport_options {
 	 * common commits to this oidset instead of fetching any packfiles.
 	 */
 	struct oidset *acked_commits;
+
+	struct oid_array *object_info_oids;
+	struct object_info **object_info_data;
+	struct string_list *object_info_options;
 };
 
 enum transport_family {