diff mbox series

[2/5] bundle-uri: create basic file-copy logic

Message ID 7d3159f0d9a29a04b5856982f787f4eddfb91b53.1658781277.git.gitgitgadget@gmail.com (mailing list archive)
State Superseded
Headers show
Series Bundle URIs II: git clone --bundle-uri | expand

Commit Message

Derrick Stolee July 25, 2022, 8:34 p.m. UTC
From: Derrick Stolee <derrickstolee@github.com>

Before implementing a way to fetch bundles into a repository, create the
basic logic. Assume that the URI is actually a file path. Future logic
will make this more careful to other protocols.

For now, we also only succeed if the content at the URI is a bundle
file, not a bundle list. Bundle lists will be implemented in a future
change.

Note that the discovery of a temporary filename is slightly racy because
the odb_mkstemp() relies on the temporary file not existing. With the
current implementation being limited to file copies, we could replace
the copy_file() with copy_fd(). The tricky part comes in future changes
that send the filename to 'git remote-https' and its 'get' capability.
At that point, we need the file descriptor closed _and_ the file
unlinked. If we were to keep the file descriptor open for the sake of
normal file copies, then we would pollute the rest of the code for
little benefit. This is especially the case because we expect that most
bundle URI use will be based on HTTPS instead of file copies.

Signed-off-by: Derrick Stolee <derrickstolee@github.com>
---
 Makefile     |   1 +
 bundle-uri.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++
 bundle-uri.h |  14 +++++++
 3 files changed, 119 insertions(+)
 create mode 100644 bundle-uri.c
 create mode 100644 bundle-uri.h

Comments

Josh Steadmon July 27, 2022, 10:09 p.m. UTC | #1
On 2022.07.25 20:34, Derrick Stolee via GitGitGadget wrote:
> From: Derrick Stolee <derrickstolee@github.com>
> 
> Before implementing a way to fetch bundles into a repository, create the
> basic logic. Assume that the URI is actually a file path. Future logic
> will make this more careful to other protocols.
> 
> For now, we also only succeed if the content at the URI is a bundle
> file, not a bundle list. Bundle lists will be implemented in a future
> change.
> 
> Note that the discovery of a temporary filename is slightly racy because
> the odb_mkstemp() relies on the temporary file not existing. With the
> current implementation being limited to file copies, we could replace
> the copy_file() with copy_fd(). The tricky part comes in future changes
> that send the filename to 'git remote-https' and its 'get' capability.

> At that point, we need the file descriptor closed _and_ the file
> unlinked.

Ahh, it looks like this was the point I missed in my previous review.
IIUC, we need the file unlinked because http_get_file() will eventually
call finalize_object_file() to move a tempfile to the final object name,
and that will fail if we have an empty file already in place.


> If we were to keep the file descriptor open for the sake of
> normal file copies, then we would pollute the rest of the code for
> little benefit. This is especially the case because we expect that most
> bundle URI use will be based on HTTPS instead of file copies.
> 
> Signed-off-by: Derrick Stolee <derrickstolee@github.com>
> ---
>  Makefile     |   1 +
>  bundle-uri.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  bundle-uri.h |  14 +++++++
>  3 files changed, 119 insertions(+)
>  create mode 100644 bundle-uri.c
>  create mode 100644 bundle-uri.h
> 
> diff --git a/Makefile b/Makefile
> index 1624471badc..7d5f48069ea 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -906,6 +906,7 @@ LIB_OBJS += blob.o
>  LIB_OBJS += bloom.o
>  LIB_OBJS += branch.o
>  LIB_OBJS += bulk-checkin.o
> +LIB_OBJS += bundle-uri.o
>  LIB_OBJS += bundle.o
>  LIB_OBJS += cache-tree.o
>  LIB_OBJS += cbtree.o
> diff --git a/bundle-uri.c b/bundle-uri.c
> new file mode 100644
> index 00000000000..b35babc36aa
> --- /dev/null
> +++ b/bundle-uri.c
> @@ -0,0 +1,104 @@
> +#include "cache.h"
> +#include "bundle-uri.h"
> +#include "bundle.h"
> +#include "object-store.h"
> +#include "refs.h"
> +#include "run-command.h"
> +
> +static int find_temp_filename(struct strbuf *name)
> +{
> +	int fd;
> +	/*
> +	 * Find a temporary filename that is available. This is briefly
> +	 * racy, but unlikely to collide.
> +	 */
> +	fd = odb_mkstemp(name, "bundles/tmp_uri_XXXXXX");
> +	if (fd < 0) {
> +		warning(_("failed to create temporary file"));
> +		return -1;
> +	}
> +
> +	close(fd);
> +	unlink(name->buf);
> +	return 0;
> +}
> +
> +static int copy_uri_to_file(const char *file, const char *uri)
> +{
> +	/* File-based URIs only for now. */
> +	return copy_file(file, uri, 0);
> +}
> +
> +static int unbundle_from_file(struct repository *r, const char *file)
> +{
> +	int result = 0;
> +	int bundle_fd;
> +	struct bundle_header header = BUNDLE_HEADER_INIT;
> +	struct string_list_item *refname;
> +	struct strbuf bundle_ref = STRBUF_INIT;
> +	size_t bundle_prefix_len;
> +
> +	if ((bundle_fd = read_bundle_header(file, &header)) < 0)
> +		return 1;
> +
> +	if ((result = unbundle(r, &header, bundle_fd, NULL)))
> +		return 1;
> +
> +	/*
> +	 * Convert all refs/heads/ from the bundle into refs/bundles/
> +	 * in the local repository.
> +	 */
> +	strbuf_addstr(&bundle_ref, "refs/bundles/");
> +	bundle_prefix_len = bundle_ref.len;
> +
> +	for_each_string_list_item(refname, &header.references) {
> +		struct object_id *oid = refname->util;
> +		struct object_id old_oid;
> +		const char *branch_name;
> +		int has_old;
> +
> +		if (!skip_prefix(refname->string, "refs/heads/", &branch_name))
> +			continue;
> +
> +		strbuf_setlen(&bundle_ref, bundle_prefix_len);
> +		strbuf_addstr(&bundle_ref, branch_name);
> +
> +		has_old = !read_ref(bundle_ref.buf, &old_oid);
> +		update_ref("fetched bundle", bundle_ref.buf, oid,
> +			   has_old ? &old_oid : NULL,
> +			   REF_SKIP_OID_VERIFICATION,
> +			   UPDATE_REFS_MSG_ON_ERR);
> +	}
> +
> +	bundle_header_release(&header);

We still also need to release bundle_ref here, right?


> +	return result;
> +}
> +
> +int fetch_bundle_uri(struct repository *r, const char *uri)
> +{
> +	int result = 0;
> +	struct strbuf filename = STRBUF_INIT;
> +
> +	if ((result = find_temp_filename(&filename)))
> +		goto cleanup;
> +
> +	if ((result = copy_uri_to_file(filename.buf, uri))) {
> +		warning(_("failed to download bundle from URI '%s'"), uri);
> +		goto cleanup;
> +	}
> +
> +	if ((result = !is_bundle(filename.buf, 0))) {
> +		warning(_("file at URI '%s' is not a bundle"), uri);
> +		goto cleanup;
> +	}
> +
> +	if ((result = unbundle_from_file(r, filename.buf))) {
> +		warning(_("failed to unbundle bundle from URI '%s'"), uri);
> +		goto cleanup;
> +	}
> +
> +cleanup:
> +	unlink(filename.buf);
> +	strbuf_release(&filename);
> +	return result;
> +}
> diff --git a/bundle-uri.h b/bundle-uri.h
> new file mode 100644
> index 00000000000..8a152f1ef14
> --- /dev/null
> +++ b/bundle-uri.h
> @@ -0,0 +1,14 @@
> +#ifndef BUNDLE_URI_H
> +#define BUNDLE_URI_H
> +
> +struct repository;
> +
> +/**
> + * Fetch data from the given 'uri' and unbundle the bundle data found
> + * based on that information.
> + *
> + * Returns non-zero if no bundle information is found at the given 'uri'.
> + */
> +int fetch_bundle_uri(struct repository *r, const char *uri);
> +
> +#endif
> -- 
> gitgitgadget
>
Derrick Stolee Aug. 1, 2022, 1:58 p.m. UTC | #2
On 7/27/2022 6:09 PM, Josh Steadmon wrote:
> On 2022.07.25 20:34, Derrick Stolee via GitGitGadget wrote:
>> From: Derrick Stolee <derrickstolee@github.com>
>>
>> Before implementing a way to fetch bundles into a repository, create the
>> basic logic. Assume that the URI is actually a file path. Future logic
>> will make this more careful to other protocols.
>>
>> For now, we also only succeed if the content at the URI is a bundle
>> file, not a bundle list. Bundle lists will be implemented in a future
>> change.
>>
>> Note that the discovery of a temporary filename is slightly racy because
>> the odb_mkstemp() relies on the temporary file not existing. With the
>> current implementation being limited to file copies, we could replace
>> the copy_file() with copy_fd(). The tricky part comes in future changes
>> that send the filename to 'git remote-https' and its 'get' capability.
> 
>> At that point, we need the file descriptor closed _and_ the file
>> unlinked.
> 
> Ahh, it looks like this was the point I missed in my previous review.
> IIUC, we need the file unlinked because http_get_file() will eventually
> call finalize_object_file() to move a tempfile to the final object name,
> and that will fail if we have an empty file already in place.

Yes, and I also was not sure what would happen if the empty file existed.
I tested it and thought allowing overwriting an existing file would be a
bigger problem than this choice of a filename.

We also discussed options about how it would be nice to have a predictable
filename so we could resume downloads _across Git process failures_
instead of just a network failure within a single Git process. This is
something to explore when creating that functionality.

Thanks,
-Stolee
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index 1624471badc..7d5f48069ea 100644
--- a/Makefile
+++ b/Makefile
@@ -906,6 +906,7 @@  LIB_OBJS += blob.o
 LIB_OBJS += bloom.o
 LIB_OBJS += branch.o
 LIB_OBJS += bulk-checkin.o
+LIB_OBJS += bundle-uri.o
 LIB_OBJS += bundle.o
 LIB_OBJS += cache-tree.o
 LIB_OBJS += cbtree.o
diff --git a/bundle-uri.c b/bundle-uri.c
new file mode 100644
index 00000000000..b35babc36aa
--- /dev/null
+++ b/bundle-uri.c
@@ -0,0 +1,104 @@ 
+#include "cache.h"
+#include "bundle-uri.h"
+#include "bundle.h"
+#include "object-store.h"
+#include "refs.h"
+#include "run-command.h"
+
+static int find_temp_filename(struct strbuf *name)
+{
+	int fd;
+	/*
+	 * Find a temporary filename that is available. This is briefly
+	 * racy, but unlikely to collide.
+	 */
+	fd = odb_mkstemp(name, "bundles/tmp_uri_XXXXXX");
+	if (fd < 0) {
+		warning(_("failed to create temporary file"));
+		return -1;
+	}
+
+	close(fd);
+	unlink(name->buf);
+	return 0;
+}
+
+static int copy_uri_to_file(const char *file, const char *uri)
+{
+	/* File-based URIs only for now. */
+	return copy_file(file, uri, 0);
+}
+
+static int unbundle_from_file(struct repository *r, const char *file)
+{
+	int result = 0;
+	int bundle_fd;
+	struct bundle_header header = BUNDLE_HEADER_INIT;
+	struct string_list_item *refname;
+	struct strbuf bundle_ref = STRBUF_INIT;
+	size_t bundle_prefix_len;
+
+	if ((bundle_fd = read_bundle_header(file, &header)) < 0)
+		return 1;
+
+	if ((result = unbundle(r, &header, bundle_fd, NULL)))
+		return 1;
+
+	/*
+	 * Convert all refs/heads/ from the bundle into refs/bundles/
+	 * in the local repository.
+	 */
+	strbuf_addstr(&bundle_ref, "refs/bundles/");
+	bundle_prefix_len = bundle_ref.len;
+
+	for_each_string_list_item(refname, &header.references) {
+		struct object_id *oid = refname->util;
+		struct object_id old_oid;
+		const char *branch_name;
+		int has_old;
+
+		if (!skip_prefix(refname->string, "refs/heads/", &branch_name))
+			continue;
+
+		strbuf_setlen(&bundle_ref, bundle_prefix_len);
+		strbuf_addstr(&bundle_ref, branch_name);
+
+		has_old = !read_ref(bundle_ref.buf, &old_oid);
+		update_ref("fetched bundle", bundle_ref.buf, oid,
+			   has_old ? &old_oid : NULL,
+			   REF_SKIP_OID_VERIFICATION,
+			   UPDATE_REFS_MSG_ON_ERR);
+	}
+
+	bundle_header_release(&header);
+	return result;
+}
+
+int fetch_bundle_uri(struct repository *r, const char *uri)
+{
+	int result = 0;
+	struct strbuf filename = STRBUF_INIT;
+
+	if ((result = find_temp_filename(&filename)))
+		goto cleanup;
+
+	if ((result = copy_uri_to_file(filename.buf, uri))) {
+		warning(_("failed to download bundle from URI '%s'"), uri);
+		goto cleanup;
+	}
+
+	if ((result = !is_bundle(filename.buf, 0))) {
+		warning(_("file at URI '%s' is not a bundle"), uri);
+		goto cleanup;
+	}
+
+	if ((result = unbundle_from_file(r, filename.buf))) {
+		warning(_("failed to unbundle bundle from URI '%s'"), uri);
+		goto cleanup;
+	}
+
+cleanup:
+	unlink(filename.buf);
+	strbuf_release(&filename);
+	return result;
+}
diff --git a/bundle-uri.h b/bundle-uri.h
new file mode 100644
index 00000000000..8a152f1ef14
--- /dev/null
+++ b/bundle-uri.h
@@ -0,0 +1,14 @@ 
+#ifndef BUNDLE_URI_H
+#define BUNDLE_URI_H
+
+struct repository;
+
+/**
+ * Fetch data from the given 'uri' and unbundle the bundle data found
+ * based on that information.
+ *
+ * Returns non-zero if no bundle information is found at the given 'uri'.
+ */
+int fetch_bundle_uri(struct repository *r, const char *uri);
+
+#endif