Message ID | 20211203093530.93589-3-chiyutianyi@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | unpack large objects in stream | expand |
On Fri, Dec 03 2021, Han Xin wrote: > From: Han Xin <hanxin.hx@alibaba-inc.com> > > When streaming a large blob object to "write_loose_object()", we have no > chance to run "write_object_file_prepare()" to calculate the oid in > advance. So we need to handle undetermined oid in function > "write_loose_object()". > > In the original implementation, we know the oid and we can write the > temporary file in the same directory as the final object, but for an > object with an undetermined oid, we don't know the exact directory for > the object, so we have to save the temporary file in ".git/objects/" > directory instead. > > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com> > --- > object-file.c | 30 ++++++++++++++++++++++++++++-- > 1 file changed, 28 insertions(+), 2 deletions(-) > > diff --git a/object-file.c b/object-file.c > index 82656f7428..1c41587bfb 100644 > --- a/object-file.c > +++ b/object-file.c > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > const void *buf; > unsigned long len; > > - loose_object_path(the_repository, &filename, oid); > + if (is_null_oid(oid)) { > + /* When oid is not determined, save tmp file to odb path. */ > + strbuf_reset(&filename); > + strbuf_addstr(&filename, the_repository->objects->odb->path); > + strbuf_addch(&filename, '/'); > + } else { > + loose_object_path(the_repository, &filename, oid); > + } > > fd = create_tmpfile(&tmp_file, filename.buf); > if (fd < 0) { > @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), > ret); > the_hash_algo->final_oid_fn(¶no_oid, &c); > - if (!oideq(oid, ¶no_oid)) > + if (!is_null_oid(oid) && !oideq(oid, ¶no_oid)) > die(_("confused by unstable object source data for %s"), > oid_to_hex(oid)); > > close_loose_object(fd); > > + if (is_null_oid(oid)) { > + int dirlen; > + > + oidcpy((struct object_id *)oid, ¶no_oid); > + loose_object_path(the_repository, &filename, oid); Why are we breaking the promise that "oid" is constant here? I tested locally with the below on top, and it seems to work (at least no tests broke). Isn't it preferrable to the cast & the caller having its "oid" changed? diff --git a/object-file.c b/object-file.c index 71d510614b9..d014e6942ea 100644 --- a/object-file.c +++ b/object-file.c @@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr, close_loose_object(fd); if (is_null_oid(oid)) { + struct object_id oid2; int dirlen; - oidcpy((struct object_id *)oid, ¶no_oid); - loose_object_path(the_repository, &filename, oid); + oidcpy(&oid2, ¶no_oid); + loose_object_path(the_repository, &filename, &oid2); /* We finally know the object path, and create the missing dir. */ dirlen = directory_size(filename.buf);
On Fri, Dec 03 2021, Han Xin wrote: > From: Han Xin <hanxin.hx@alibaba-inc.com> > > When streaming a large blob object to "write_loose_object()", we have no > chance to run "write_object_file_prepare()" to calculate the oid in > advance. So we need to handle undetermined oid in function > "write_loose_object()". > > In the original implementation, we know the oid and we can write the > temporary file in the same directory as the final object, but for an > object with an undetermined oid, we don't know the exact directory for > the object, so we have to save the temporary file in ".git/objects/" > directory instead. > > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com> > --- > object-file.c | 30 ++++++++++++++++++++++++++++-- > 1 file changed, 28 insertions(+), 2 deletions(-) > > diff --git a/object-file.c b/object-file.c > index 82656f7428..1c41587bfb 100644 > --- a/object-file.c > +++ b/object-file.c > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > const void *buf; > unsigned long len; > > - loose_object_path(the_repository, &filename, oid); > + if (is_null_oid(oid)) { > + /* When oid is not determined, save tmp file to odb path. */ > + strbuf_reset(&filename); Why re-use this & leak memory? An existing strbuf use in this function doesn't leak in the same way. Just release it as in the below patch on top (the ret v.s. err variable naming is a bit confused, maybe could do with a prep cleanup step.). > + strbuf_addstr(&filename, the_repository->objects->odb->path); > + strbuf_addch(&filename, '/'); And once we do that this could just become: strbuf_addf($filename, "%s/", ...) Is there's existing uses of this pattern, so mayb e not worth it, but it allows you to remove the braces on the if/else. diff --git a/object-file.c b/object-file.c index 8bd89e7b7ba..2b52f3fc1cc 100644 --- a/object-file.c +++ b/object-file.c @@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr, int hdrlen, struct input_stream *in_stream, time_t mtime, unsigned flags) { - int fd, ret; + int fd, ret, err = 0; unsigned char compressed[4096]; git_zstream stream; git_hash_ctx c; @@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr, if (is_null_oid(oid)) { /* When oid is not determined, save tmp file to odb path. */ - strbuf_reset(&filename); strbuf_addstr(&filename, the_repository->objects->odb->path); strbuf_addch(&filename, '/'); } else { @@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr, fd = create_tmpfile(&tmp_file, filename.buf); if (fd < 0) { if (flags & HASH_SILENT) - return -1; + err = -1; else if (errno == EACCES) - return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory()); + err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory()); else - return error_errno(_("unable to create temporary file")); + err = error_errno(_("unable to create temporary file")); + goto cleanup; } /* Set it up */ @@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr, struct strbuf dir = STRBUF_INIT; strbuf_add(&dir, filename.buf, dirlen - 1); if (mkdir(dir.buf, 0777) && errno != EEXIST) - return -1; - if (adjust_shared_perm(dir.buf)) - return -1; - strbuf_release(&dir); + err = -1; + else if (adjust_shared_perm(dir.buf)) + err = -1; + else + strbuf_release(&dir); + if (err < 0) + goto cleanup; } } @@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr, warning_errno(_("failed utime() on %s"), tmp_file.buf); } - return finalize_object_file(tmp_file.buf, filename.buf); + err = finalize_object_file(tmp_file.buf, filename.buf); +cleanup: + strbuf_release(&filename); + return err; } static int freshen_loose_object(const struct object_id *oid)
On Fri, Dec 3, 2021 at 9:27 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote: > > > On Fri, Dec 03 2021, Han Xin wrote: > > > From: Han Xin <hanxin.hx@alibaba-inc.com> > > > > When streaming a large blob object to "write_loose_object()", we have no > > chance to run "write_object_file_prepare()" to calculate the oid in > > advance. So we need to handle undetermined oid in function > > "write_loose_object()". > > > > In the original implementation, we know the oid and we can write the > > temporary file in the same directory as the final object, but for an > > object with an undetermined oid, we don't know the exact directory for > > the object, so we have to save the temporary file in ".git/objects/" > > directory instead. > > > > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> > > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com> > > --- > > object-file.c | 30 ++++++++++++++++++++++++++++-- > > 1 file changed, 28 insertions(+), 2 deletions(-) > > > > diff --git a/object-file.c b/object-file.c > > index 82656f7428..1c41587bfb 100644 > > --- a/object-file.c > > +++ b/object-file.c > > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > > const void *buf; > > unsigned long len; > > > > - loose_object_path(the_repository, &filename, oid); > > + if (is_null_oid(oid)) { > > + /* When oid is not determined, save tmp file to odb path. */ > > + strbuf_reset(&filename); > > + strbuf_addstr(&filename, the_repository->objects->odb->path); > > + strbuf_addch(&filename, '/'); > > + } else { > > + loose_object_path(the_repository, &filename, oid); > > + } > > > > fd = create_tmpfile(&tmp_file, filename.buf); > > if (fd < 0) { > > @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > > die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), > > ret); > > the_hash_algo->final_oid_fn(¶no_oid, &c); > > - if (!oideq(oid, ¶no_oid)) > > + if (!is_null_oid(oid) && !oideq(oid, ¶no_oid)) > > die(_("confused by unstable object source data for %s"), > > oid_to_hex(oid)); > > > > close_loose_object(fd); > > > > + if (is_null_oid(oid)) { > > + int dirlen; > > + > > + oidcpy((struct object_id *)oid, ¶no_oid); > > + loose_object_path(the_repository, &filename, oid); > > Why are we breaking the promise that "oid" is constant here? I tested > locally with the below on top, and it seems to work (at least no tests > broke). Isn't it preferrable to the cast & the caller having its "oid" > changed? > > diff --git a/object-file.c b/object-file.c > index 71d510614b9..d014e6942ea 100644 > --- a/object-file.c > +++ b/object-file.c > @@ -1958,10 +1958,11 @@ int write_loose_object(const struct object_id *oid, char *hdr, > close_loose_object(fd); > > if (is_null_oid(oid)) { > + struct object_id oid2; > int dirlen; > > - oidcpy((struct object_id *)oid, ¶no_oid); > - loose_object_path(the_repository, &filename, oid); > + oidcpy(&oid2, ¶no_oid); > + loose_object_path(the_repository, &filename, &oid2); > > /* We finally know the object path, and create the missing dir. */ > dirlen = directory_size(filename.buf); Maybe I should change the promise that "oid" is constant in "write_loose_object()". The original write_object_file_flags() defines a variable "oid", and completes the calculation of the "oid" in "write_object_file_prepare()" which will be passed to "write_loose_object()". If a null oid is maintained after calling "write_loose_object()", "--strict" will become meaningless, although it does not break existing test cases.
On Fri, Dec 3, 2021 at 9:54 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote: > > > On Fri, Dec 03 2021, Han Xin wrote: > > > From: Han Xin <hanxin.hx@alibaba-inc.com> > > > > When streaming a large blob object to "write_loose_object()", we have no > > chance to run "write_object_file_prepare()" to calculate the oid in > > advance. So we need to handle undetermined oid in function > > "write_loose_object()". > > > > In the original implementation, we know the oid and we can write the > > temporary file in the same directory as the final object, but for an > > object with an undetermined oid, we don't know the exact directory for > > the object, so we have to save the temporary file in ".git/objects/" > > directory instead. > > > > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> > > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com> > > --- > > object-file.c | 30 ++++++++++++++++++++++++++++-- > > 1 file changed, 28 insertions(+), 2 deletions(-) > > > > diff --git a/object-file.c b/object-file.c > > index 82656f7428..1c41587bfb 100644 > > --- a/object-file.c > > +++ b/object-file.c > > @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr, > > const void *buf; > > unsigned long len; > > > > - loose_object_path(the_repository, &filename, oid); > > + if (is_null_oid(oid)) { > > + /* When oid is not determined, save tmp file to odb path. */ > > + strbuf_reset(&filename); > > Why re-use this & leak memory? An existing strbuf use in this function > doesn't leak in the same way. Just release it as in the below patch on > top (the ret v.s. err variable naming is a bit confused, maybe could do > with a prep cleanup step.). > > > + strbuf_addstr(&filename, the_repository->objects->odb->path); > > + strbuf_addch(&filename, '/'); > > And once we do that this could just become: > > strbuf_addf($filename, "%s/", ...) > > Is there's existing uses of this pattern, so mayb e not worth it, but it > allows you to remove the braces on the if/else. > > diff --git a/object-file.c b/object-file.c > index 8bd89e7b7ba..2b52f3fc1cc 100644 > --- a/object-file.c > +++ b/object-file.c > @@ -1880,7 +1880,7 @@ int write_loose_object(const struct object_id *oid, char *hdr, > int hdrlen, struct input_stream *in_stream, > time_t mtime, unsigned flags) > { > - int fd, ret; > + int fd, ret, err = 0; > unsigned char compressed[4096]; > git_zstream stream; > git_hash_ctx c; > @@ -1892,7 +1892,6 @@ int write_loose_object(const struct object_id *oid, char *hdr, > > if (is_null_oid(oid)) { > /* When oid is not determined, save tmp file to odb path. */ > - strbuf_reset(&filename); > strbuf_addstr(&filename, the_repository->objects->odb->path); > strbuf_addch(&filename, '/'); > } else { > @@ -1902,11 +1901,12 @@ int write_loose_object(const struct object_id *oid, char *hdr, > fd = create_tmpfile(&tmp_file, filename.buf); > if (fd < 0) { > if (flags & HASH_SILENT) > - return -1; > + err = -1; > else if (errno == EACCES) > - return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory()); > + err = error(_("insufficient permission for adding an object to repository database %s"), get_object_directory()); > else > - return error_errno(_("unable to create temporary file")); > + err = error_errno(_("unable to create temporary file")); > + goto cleanup; > } > > /* Set it up */ > @@ -1968,10 +1968,13 @@ int write_loose_object(const struct object_id *oid, char *hdr, > struct strbuf dir = STRBUF_INIT; > strbuf_add(&dir, filename.buf, dirlen - 1); > if (mkdir(dir.buf, 0777) && errno != EEXIST) > - return -1; > - if (adjust_shared_perm(dir.buf)) > - return -1; > - strbuf_release(&dir); > + err = -1; > + else if (adjust_shared_perm(dir.buf)) > + err = -1; > + else > + strbuf_release(&dir); > + if (err < 0) > + goto cleanup; > } > } > > @@ -1984,7 +1987,10 @@ int write_loose_object(const struct object_id *oid, char *hdr, > warning_errno(_("failed utime() on %s"), tmp_file.buf); > } > > - return finalize_object_file(tmp_file.buf, filename.buf); > + err = finalize_object_file(tmp_file.buf, filename.buf); > +cleanup: > + strbuf_release(&filename); > + return err; > } > > static int freshen_loose_object(const struct object_id *oid) Yes, this will be much better. Will apply.
diff --git a/object-file.c b/object-file.c index 82656f7428..1c41587bfb 100644 --- a/object-file.c +++ b/object-file.c @@ -1892,7 +1892,14 @@ static int write_loose_object(const struct object_id *oid, char *hdr, const void *buf; unsigned long len; - loose_object_path(the_repository, &filename, oid); + if (is_null_oid(oid)) { + /* When oid is not determined, save tmp file to odb path. */ + strbuf_reset(&filename); + strbuf_addstr(&filename, the_repository->objects->odb->path); + strbuf_addch(&filename, '/'); + } else { + loose_object_path(the_repository, &filename, oid); + } fd = create_tmpfile(&tmp_file, filename.buf); if (fd < 0) { @@ -1939,12 +1946,31 @@ static int write_loose_object(const struct object_id *oid, char *hdr, die(_("deflateEnd on object %s failed (%d)"), oid_to_hex(oid), ret); the_hash_algo->final_oid_fn(¶no_oid, &c); - if (!oideq(oid, ¶no_oid)) + if (!is_null_oid(oid) && !oideq(oid, ¶no_oid)) die(_("confused by unstable object source data for %s"), oid_to_hex(oid)); close_loose_object(fd); + if (is_null_oid(oid)) { + int dirlen; + + oidcpy((struct object_id *)oid, ¶no_oid); + loose_object_path(the_repository, &filename, oid); + + /* We finally know the object path, and create the missing dir. */ + dirlen = directory_size(filename.buf); + if (dirlen) { + struct strbuf dir = STRBUF_INIT; + strbuf_add(&dir, filename.buf, dirlen - 1); + if (mkdir(dir.buf, 0777) && errno != EEXIST) + return -1; + if (adjust_shared_perm(dir.buf)) + return -1; + strbuf_release(&dir); + } + } + if (mtime) { struct utimbuf utb; utb.actime = mtime;