Message ID | 9ab631a3b29addaa54415139e7f60a79a19a6edb.1640199396.git.gitgitgadget@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Reftable coverity fixes | expand |
"Han-Wen Nienhuys via GitGitGadget" <gitgitgadget@gmail.com> writes: > diff --git a/reftable/reader.c b/reftable/reader.c > index 006709a645a..0d16b098f5e 100644 > --- a/reftable/reader.c > +++ b/reftable/reader.c > @@ -290,28 +290,34 @@ int reader_init_block_reader(struct reftable_reader *r, struct block_reader *br, > > err = reader_get_block(r, &block, next_off, guess_block_size); > if (err < 0) > - return err; > + goto done; > > block_size = extract_block_size(block.data, &block_typ, next_off, > r->version); > - if (block_size < 0) > - return block_size; > - > + if (block_size < 0) { > + err = block_size; > + goto done; > + } > if (want_typ != BLOCK_TYPE_ANY && block_typ != want_typ) { > - reftable_block_done(&block); > - return 1; > + err = 1; > + goto done; > } > > if (block_size > guess_block_size) { > reftable_block_done(&block); > err = reader_get_block(r, &block, next_off, block_size); > if (err < 0) { > - return err; > + goto done; > } > } > > - return block_reader_init(br, &block, header_off, r->block_size, > - hash_size(r->hash_id)); > + err = block_reader_init(br, &block, header_off, r->block_size, > + hash_size(r->hash_id)); > +done: > + if (err) Is the convention for reader_init() different from all other functions? It makes reader wonder why this is not if (err < 0) even though it is not wrong per-se (as long as "zero means success" is a part of the return value convention). > + reftable_block_done(&block); > + > + return err; > } This one is new in this round. All look good, other than that one check for error return. > diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c > index 5f6bcc2f775..6e88182a83a 100644 > --- a/reftable/readwrite_test.c > +++ b/reftable/readwrite_test.c > @@ -254,6 +254,71 @@ static void test_log_write_read(void) > reader_close(&rd); > } > > +static void test_log_zlib_corruption(void) > +{ > + struct reftable_write_options opts = { > + .block_size = 256, > + }; > + struct reftable_iterator it = { 0 }; > + struct reftable_reader rd = { 0 }; > + struct reftable_block_source source = { 0 }; > + struct strbuf buf = STRBUF_INIT; > + struct reftable_writer *w = > + reftable_new_writer(&strbuf_add_void, &buf, &opts); > + const struct reftable_stats *stats = NULL; > + uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 }; > + uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 }; Will this code be exercised when compiling with SHA256 support? If not, this is perfectly fine, but otherwise, this needs to be MAX, not SHA1, no? > + char message[100] = { 0 }; You're filling this to the sizeof(message)-1, so we can afford to leave it uninitialized. > + int err, i, n; > + > + struct reftable_log_record log = { > + .refname = "refname", > + .value_type = REFTABLE_LOG_UPDATE, > + .value = { > + .update = { > + .new_hash = hash1, > + .old_hash = hash2, > + .name = "My Name", > + .email = "myname@invalid", > + .message = message, > + }, > + }, > + }; > + > + for (i = 0; i < sizeof(message)-1; i++) Style: SP around "-" on both sides. > + message[i] = (uint8_t)(rand() % 64 + ' '); > + > + reftable_writer_set_limits(w, 1, 1);
On Wed, Dec 22, 2021 at 11:51 PM Junio C Hamano <gitster@pobox.com> wrote: > > + if (err) > > Is the convention for reader_init() different from all other > functions? It makes reader wonder why this is not > > if (err < 0) > > + reftable_block_done(&block); > > even though it is not wrong per-se (as long as "zero means success" > is a part of the return value convention). err > 0 is returned when we reach the end of the iteration, and this function can generate err==1. Normally, block_reader_init() transfers the block to the block_reader. If err > 0, we skip that, so we'd be leaking the block. At the same time, this means the "if (err)" is superfluous. In the success case, the block was transferred to the block_reader, so the reftable_block_done() call is a nop. > > > + > > + return err; > > } > > This one is new in this round. All look good, other than that one > check for error return. > > > diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c > > index 5f6bcc2f775..6e88182a83a 100644 > > --- a/reftable/readwrite_test.c > > +++ b/reftable/readwrite_test.c > > @@ -254,6 +254,71 @@ static void test_log_write_read(void) > > reader_close(&rd); > > } > > > > +static void test_log_zlib_corruption(void) > > +{ > > + struct reftable_write_options opts = { > > + .block_size = 256, > > + }; > > + struct reftable_iterator it = { 0 }; > > + struct reftable_reader rd = { 0 }; > > + struct reftable_block_source source = { 0 }; > > + struct strbuf buf = STRBUF_INIT; > > + struct reftable_writer *w = > > + reftable_new_writer(&strbuf_add_void, &buf, &opts); > > + const struct reftable_stats *stats = NULL; > > + uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 }; > > + uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 }; > > Will this code be exercised when compiling with SHA256 support? If > not, this is perfectly fine, but otherwise, this needs to be MAX, > not SHA1, no? The code is parameterized in terms of hash_size, so we don't have to test both flavors exhaustively. There is a test_table_read_write_seek_linear_sha256() that ensures that the basic functionality works for SHA256. > > + char message[100] = { 0 }; > > You're filling this to the sizeof(message)-1, so we can afford to > leave it uninitialized. At the same time, we can afford to initialize it :-) I'd rather not think about this, and always initialize everything. > > + for (i = 0; i < sizeof(message)-1; i++) > > Style: SP around "-" on both sides. done. (I assume I don't have to resend the whole series for these small tweaks? I'll wait if anyone else has comments, and send a reroll early January)
Han-Wen Nienhuys <hanwen@google.com> writes: > Normally, block_reader_init() transfers the block to the block_reader. > If err > 0, we skip that, so we'd be leaking the block. Thanks. I was about to say that <reftable/block.h> wants to say more than "initializes a block reader."; perhaps "returns the number of blocks" or something needs to be added. but I do not think the lack of the documentation was the source of my confusion. > At the same time, this means the "if (err)" is superfluous. In the > success case, the block was transferred to the block_reader, so the > reftable_block_done() call is a nop. I agree that having the "if (err)" there is highly misleading. "if (err < 0 || 0 < err)" would be the more faithful expression of what you explained, and if it were written that way, I wouldn't have been as confused as I was. In any case, if _done() becomes a safe and quick no-op when 0 block was transferred, losing the conditional would be the way to make the resulting code the most readable, I think. >> > + uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 }; >> > + uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 }; >> >> Will this code be exercised when compiling with SHA256 support? If >> not, this is perfectly fine, but otherwise, this needs to be MAX, >> not SHA1, no? > > The code is parameterized in terms of hash_size, so we don't have to > test both flavors exhaustively. If it is safe for this function to assume that it will be called only with hash_size of SHA1, then what you wrote is perfectly fine, and I agree that there is no particular reason why this test must be repeated for all available hashes. I just don't know if we have a mechanism to prevent clueless code churners from saying "let's test both flavors exhaustively". For example, I see nothing in this function, other than the size of these two hashes, that says that this test function will be compiled with reftable library that expects that the hash function is SHA1. How does this function protect itself from being run in a CI job that uses GIT_TEST_DEFAULT_HASH=sha256, for example? It was why I asked the question. It wasn't that I necessarily thought we need to test all combinations (but there is no need not to, if it takes more engineering time to exclude it when testing Git as a whole in sha256 mode). >> > + char message[100] = { 0 }; >> >> You're filling this to the sizeof(message)-1, so we can afford to >> leave it uninitialized. > > At the same time, we can afford to initialize it :-) > > I'd rather not think about this, and always initialize everything. I do not care too deeply in this test-only function, but as a general principle, primarily to help less experienced developers who may be reading from the sidelines to avoid copying such an attitude, I have to make a note here. If you know you will have to fill an array with, or assign to a variable, meaningful value(s), leaving the array or variable uninitialized at the declaration time is a much better thing to do than initializing them with less meaningful value(s). It will help compilers and tools to detect a lack of proper assignment and use of uninitialized variable (iow, you may know you will have to fill, but in the code to do so, your implementation may be botched). Once you initialize at the declaration with "less meaningful" value (like zero initialization), the tools won't be able to tell when the code uses that variable "uninitialized" (because the assignment was skipped by a bug), since it appears to always be initialied to them. Helping the tools help us is what those of us, who would rather not think about it, should be doing.
On Fri, Dec 24, 2021 at 5:16 AM Junio C Hamano <gitster@pobox.com> wrote: > Once you > initialize at the declaration with "less meaningful" value (like > zero initialization), the tools won't be able to tell when the code > uses that variable "uninitialized" (because the assignment was > skipped by a bug), since it appears to always be initialied to them. Which tools are these? When I add static void test_memcpy(void) { uint32_t dest; char not_init[200]; int i; memcpy(&dest, not_init, sizeof(dest)); for (i = 0 ; i < 10; i++) not_init[i] = rand() % 255 + 1; printf("%d", (int) strlen(not_init)); } to the C code, it compiles cleanly if I do "make DEVELOPER=1".
Am 12.01.22 um 12:58 schrieb Han-Wen Nienhuys: > On Fri, Dec 24, 2021 at 5:16 AM Junio C Hamano <gitster@pobox.com> wrote: >> Once you >> initialize at the declaration with "less meaningful" value (like >> zero initialization), the tools won't be able to tell when the code >> uses that variable "uninitialized" (because the assignment was >> skipped by a bug), since it appears to always be initialied to them. > > Which tools are these? When I add > > static void test_memcpy(void) > { > uint32_t dest; > char not_init[200]; > int i; > memcpy(&dest, not_init, sizeof(dest)); > > for (i = 0 ; i < 10; i++) > not_init[i] = rand() % 255 + 1; > printf("%d", (int) strlen(not_init)); > } > > to the C code, it compiles cleanly if I do "make DEVELOPER=1". https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html says about -Wuninitialized: "Note that there may be no warning about a variable that is used only to compute a value that itself is never used, because such computations may be deleted by data flow analysis before the warnings are printed." And indeed, dest is not used above. But even if we change the funtion to use dest by returning it, GCC versions 9.1 and higher don't warn about the use of the uninitialized buffer. GCC 4.7.1 to 8.5 do warn, though: https://godbolt.org/z/zYz9KvPdK René
On Wed, Jan 12 2022, Han-Wen Nienhuys wrote: > On Fri, Dec 24, 2021 at 5:16 AM Junio C Hamano <gitster@pobox.com> wrote: >> Once you >> initialize at the declaration with "less meaningful" value (like >> zero initialization), the tools won't be able to tell when the code >> uses that variable "uninitialized" (because the assignment was >> skipped by a bug), since it appears to always be initialied to them. > > Which tools are these? When I add > > static void test_memcpy(void) > { > uint32_t dest; > char not_init[200]; > int i; > memcpy(&dest, not_init, sizeof(dest)); > > for (i = 0 ; i < 10; i++) > not_init[i] = rand() % 255 + 1; > printf("%d", (int) strlen(not_init)); > } > > to the C code, it compiles cleanly if I do "make DEVELOPER=1". Aside from what René said in his follow-up, I think what Junio's pointing out here has to do with the use of this pattern in general, not the specific code being discussed here. I.e. if you get into the habit of needless initialization it may not matter right now, but once the function grows an if/else branch, or someone copies the pattern to such a function it may hide a logic error. So it's not about analyzing the control specific flow here, but that your upthread patch is separating a variable and its actual internalization by ~20 lines. So in the general case, by initializing it when it's declared it's more likely that we'll introduce a logic error where it won't be initialized at all.
On Thu, Jan 13, 2022 at 11:02 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote: > > Aside from what René said in his follow-up, I think what Junio's > pointing out here has to do with the use of this pattern in general, not > the specific code being discussed here. > > I.e. if you get into the habit of needless initialization it may not > matter right now, but once the function grows an if/else branch, or > someone copies the pattern to such a function it may hide a logic error. > > So it's not about analyzing the control specific flow here, but that > your upthread patch is separating a variable and its actual > internalization by ~20 lines. I know this is the Git project's preferred style, so I'm OK with adapting to that, but I'm also sad about it. Sure, you can introduce logic errors by refactoring things, but with initialized data, you get a reproducible logic error, rather than something whose failure mode depends on platform, compiler and surrounding function calls. This makes debugging problems across platforms easier. In particular, I don't have a functioning Windows environment, so anything that helps minimize differences across platforms is welcome to me. -- Han-Wen Nienhuys - Google Munich I work 80%. Don't expect answers from me on Fridays. -- Google Germany GmbH, Erika-Mann-Strasse 33, 80636 Munich Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Paul Manicle, Halimah DeLaine Prado -- Google Germany GmbH, Erika-Mann-Strasse 33, 80636 Munich Registergericht und -nummer: Hamburg, HRB 86891 Sitz der Gesellschaft: Hamburg Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
René Scharfe <l.s.r@web.de> writes: > Am 12.01.22 um 12:58 schrieb Han-Wen Nienhuys: >> On Fri, Dec 24, 2021 at 5:16 AM Junio C Hamano <gitster@pobox.com> wrote: >>> Once you >>> initialize at the declaration with "less meaningful" value (like >>> zero initialization), the tools won't be able to tell when the code >>> uses that variable "uninitialized" (because the assignment was >>> skipped by a bug), since it appears to always be initialied to them. >> >> Which tools are these? When I add >> >> static void test_memcpy(void) >> { >> uint32_t dest; >> char not_init[200]; >> int i; >> memcpy(&dest, not_init, sizeof(dest)); >> >> for (i = 0 ; i < 10; i++) >> not_init[i] = rand() % 255 + 1; >> printf("%d", (int) strlen(not_init)); >> } >> >> to the C code, it compiles cleanly if I do "make DEVELOPER=1". > > https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html says about > -Wuninitialized: > > "Note that there may be no warning about a variable that is used only > to compute a value that itself is never used, because such > computations may be deleted by data flow analysis before the > warnings are printed." > > And indeed, dest is not used above. But even if we change the funtion > to use dest by returning it, GCC versions 9.1 and higher don't warn > about the use of the uninitialized buffer. GCC 4.7.1 to 8.5 do warn, > though: https://godbolt.org/z/zYz9KvPdK What I meant in the original discussion was a runtime checker that notices a read from an uninitialized location (I've also written one myself way before my Git time, which was how I noticed strcpy() on old SunOS, in an attempt to optimize by loading a word at a time, sometimes read one word too much beyond the end of a page). But if a static analysis may catch itas a potential error, that is another reason to worry about it, too. The original code wanted to do char message[100] = { 0 }; for (i = 0; i < sizeof(message) - 1; i++) message[i] = something(); use_NUL_terminated_string(message); and it allowed to omit an assigmnent message[sizeof(message) - 1] = '\0'; after the loop. As I already said, I do not care too deeply in a test-only helper like this. Thanks.
diff --git a/reftable/block.c b/reftable/block.c index 855e3f5c947..6c8e8705205 100644 --- a/reftable/block.c +++ b/reftable/block.c @@ -188,13 +188,16 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block, uint32_t full_block_size = table_block_size; uint8_t typ = block->data[header_off]; uint32_t sz = get_be24(block->data + header_off + 1); - + int err = 0; uint16_t restart_count = 0; uint32_t restart_start = 0; uint8_t *restart_bytes = NULL; + uint8_t *uncompressed = NULL; - if (!reftable_is_block_type(typ)) - return REFTABLE_FORMAT_ERROR; + if (!reftable_is_block_type(typ)) { + err = REFTABLE_FORMAT_ERROR; + goto done; + } if (typ == BLOCK_TYPE_LOG) { int block_header_skip = 4 + header_off; @@ -203,7 +206,7 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block, uLongf src_len = block->len - block_header_skip; /* Log blocks specify the *uncompressed* size in their header. */ - uint8_t *uncompressed = reftable_malloc(sz); + uncompressed = reftable_malloc(sz); /* Copy over the block header verbatim. It's not compressed. */ memcpy(uncompressed, block->data, block_header_skip); @@ -212,16 +215,19 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block, if (Z_OK != uncompress2(uncompressed + block_header_skip, &dst_len, block->data + block_header_skip, &src_len)) { - reftable_free(uncompressed); - return REFTABLE_ZLIB_ERROR; + err = REFTABLE_ZLIB_ERROR; + goto done; } - if (dst_len + block_header_skip != sz) - return REFTABLE_FORMAT_ERROR; + if (dst_len + block_header_skip != sz) { + err = REFTABLE_FORMAT_ERROR; + goto done; + } /* We're done with the input data. */ reftable_block_done(block); block->data = uncompressed; + uncompressed = NULL; block->len = sz; block->source = malloc_block_source(); full_block_size = src_len + block_header_skip; @@ -251,7 +257,9 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block, br->restart_count = restart_count; br->restart_bytes = restart_bytes; - return 0; +done: + reftable_free(uncompressed); + return err; } static uint32_t block_reader_restart_offset(struct block_reader *br, int i) diff --git a/reftable/reader.c b/reftable/reader.c index 006709a645a..0d16b098f5e 100644 --- a/reftable/reader.c +++ b/reftable/reader.c @@ -290,28 +290,34 @@ int reader_init_block_reader(struct reftable_reader *r, struct block_reader *br, err = reader_get_block(r, &block, next_off, guess_block_size); if (err < 0) - return err; + goto done; block_size = extract_block_size(block.data, &block_typ, next_off, r->version); - if (block_size < 0) - return block_size; - + if (block_size < 0) { + err = block_size; + goto done; + } if (want_typ != BLOCK_TYPE_ANY && block_typ != want_typ) { - reftable_block_done(&block); - return 1; + err = 1; + goto done; } if (block_size > guess_block_size) { reftable_block_done(&block); err = reader_get_block(r, &block, next_off, block_size); if (err < 0) { - return err; + goto done; } } - return block_reader_init(br, &block, header_off, r->block_size, - hash_size(r->hash_id)); + err = block_reader_init(br, &block, header_off, r->block_size, + hash_size(r->hash_id)); +done: + if (err) + reftable_block_done(&block); + + return err; } static int table_iter_next_block(struct table_iter *dest, diff --git a/reftable/readwrite_test.c b/reftable/readwrite_test.c index 5f6bcc2f775..6e88182a83a 100644 --- a/reftable/readwrite_test.c +++ b/reftable/readwrite_test.c @@ -254,6 +254,71 @@ static void test_log_write_read(void) reader_close(&rd); } +static void test_log_zlib_corruption(void) +{ + struct reftable_write_options opts = { + .block_size = 256, + }; + struct reftable_iterator it = { 0 }; + struct reftable_reader rd = { 0 }; + struct reftable_block_source source = { 0 }; + struct strbuf buf = STRBUF_INIT; + struct reftable_writer *w = + reftable_new_writer(&strbuf_add_void, &buf, &opts); + const struct reftable_stats *stats = NULL; + uint8_t hash1[GIT_SHA1_RAWSZ] = { 1 }; + uint8_t hash2[GIT_SHA1_RAWSZ] = { 2 }; + char message[100] = { 0 }; + int err, i, n; + + struct reftable_log_record log = { + .refname = "refname", + .value_type = REFTABLE_LOG_UPDATE, + .value = { + .update = { + .new_hash = hash1, + .old_hash = hash2, + .name = "My Name", + .email = "myname@invalid", + .message = message, + }, + }, + }; + + for (i = 0; i < sizeof(message)-1; i++) + message[i] = (uint8_t)(rand() % 64 + ' '); + + reftable_writer_set_limits(w, 1, 1); + + err = reftable_writer_add_log(w, &log); + EXPECT_ERR(err); + + n = reftable_writer_close(w); + EXPECT(n == 0); + + stats = writer_stats(w); + EXPECT(stats->log_stats.blocks > 0); + reftable_writer_free(w); + w = NULL; + + /* corrupt the data. */ + buf.buf[50] ^= 0x99; + + block_source_from_strbuf(&source, &buf); + + err = init_reader(&rd, &source, "file.log"); + EXPECT_ERR(err); + + err = reftable_reader_seek_log(&rd, &it, "refname"); + EXPECT(err == REFTABLE_ZLIB_ERROR); + + reftable_iterator_destroy(&it); + + /* cleanup. */ + strbuf_release(&buf); + reader_close(&rd); +} + static void test_table_read_write_sequential(void) { char **names; @@ -633,6 +698,7 @@ static void test_corrupt_table(void) int readwrite_test_main(int argc, const char *argv[]) { + RUN_TEST(test_log_zlib_corruption); RUN_TEST(test_corrupt_table); RUN_TEST(test_corrupt_table_empty); RUN_TEST(test_log_write_read);