[v5,1/5] eoie: add End of Index Entry (EOIE) extension
diff mbox series

Message ID 20180912161832.55324-2-benpeart@microsoft.com
State New
Headers show
Series
  • [v5,1/5] eoie: add End of Index Entry (EOIE) extension
Related show

Commit Message

Ben Peart Sept. 12, 2018, 4:18 p.m. UTC
The End of Index Entry (EOIE) is used to locate the end of the variable
length index entries and the beginning of the extensions. Code can take
advantage of this to quickly locate the index extensions without having
to parse through all of the index entries.

Because it must be able to be loaded before the variable length cache
entries and other index extensions, this extension must be written last.
The signature for this extension is { 'E', 'O', 'I', 'E' }.

The extension consists of:

- 32-bit offset to the end of the index entries

- 160-bit SHA-1 over the extension types and their sizes (but not
their contents).  E.g. if we have "TREE" extension that is N-bytes
long, "REUC" extension that is M-bytes long, followed by "EOIE",
then the hash would be:

SHA-1("TREE" + <binary representation of N> +
	"REUC" + <binary representation of M>)

Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
---
 Documentation/technical/index-format.txt |  23 ++++
 read-cache.c                             | 154 +++++++++++++++++++++--
 t/README                                 |   5 +
 t/t1700-split-index.sh                   |   1 +
 4 files changed, 175 insertions(+), 8 deletions(-)

Comments

Junio C Hamano Sept. 13, 2018, 10:44 p.m. UTC | #1
Ben Peart <benpeart@microsoft.com> writes:

> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 39133bcbc8..f613dd72e3 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>  # We need total control of index splitting here
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
> +export GIT_TEST_DISABLE_EOIE=true
>  
>  test_expect_success 'enable split index' '
>  	git config splitIndex.maxPercentChange 100 &&

It is safer to squash the following in; we may want to revisit the
decision test-lint makes on this issue later, though.

-- >8 --
Subject: [PATCH] SQUASH???

http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#export

specifies how "export name[=word]" ought to work, but because
writing "name=word; export name" is not so much more cumbersome
and some older shells that do not understand the former do grok
the latter.  test-lint also recommends spelling it this way.
---
 t/t1700-split-index.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index f613dd72e3..dab97c2187 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,7 +7,7 @@ test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
-export GIT_TEST_DISABLE_EOIE=true
+GIT_TEST_DISABLE_EOIE=true; export GIT_TEST_DISABLE_EOIE
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&
Duy Nguyen Sept. 15, 2018, 10:02 a.m. UTC | #2
On Wed, Sep 12, 2018 at 6:18 PM Ben Peart <benpeart@microsoft.com> wrote:
>
> The End of Index Entry (EOIE) is used to locate the end of the variable
> length index entries and the beginning of the extensions. Code can take
> advantage of this to quickly locate the index extensions without having
> to parse through all of the index entries.
>
> Because it must be able to be loaded before the variable length cache
> entries and other index extensions, this extension must be written last.
> The signature for this extension is { 'E', 'O', 'I', 'E' }.
>
> The extension consists of:
>
> - 32-bit offset to the end of the index entries
>
> - 160-bit SHA-1 over the extension types and their sizes (but not
> their contents).  E.g. if we have "TREE" extension that is N-bytes
> long, "REUC" extension that is M-bytes long, followed by "EOIE",
> then the hash would be:
>
> SHA-1("TREE" + <binary representation of N> +
>         "REUC" + <binary representation of M>)
>
> Signed-off-by: Ben Peart <Ben.Peart@microsoft.com>
> ---
>  Documentation/technical/index-format.txt |  23 ++++
>  read-cache.c                             | 154 +++++++++++++++++++++--
>  t/README                                 |   5 +
>  t/t1700-split-index.sh                   |   1 +
>  4 files changed, 175 insertions(+), 8 deletions(-)
>
> diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
> index db3572626b..6bc2d90f7f 100644
> --- a/Documentation/technical/index-format.txt
> +++ b/Documentation/technical/index-format.txt
> @@ -314,3 +314,26 @@ The remaining data of each directory block is grouped by type:
>
>    - An ewah bitmap, the n-th bit indicates whether the n-th index entry
>      is not CE_FSMONITOR_VALID.
> +
> +== End of Index Entry
> +
> +  The End of Index Entry (EOIE) is used to locate the end of the variable
> +  length index entries and the begining of the extensions. Code can take
> +  advantage of this to quickly locate the index extensions without having
> +  to parse through all of the index entries.
> +
> +  Because it must be able to be loaded before the variable length cache
> +  entries and other index extensions, this extension must be written last.
> +  The signature for this extension is { 'E', 'O', 'I', 'E' }.
> +
> +  The extension consists of:
> +
> +  - 32-bit offset to the end of the index entries
> +
> +  - 160-bit SHA-1 over the extension types and their sizes (but not
> +       their contents).  E.g. if we have "TREE" extension that is N-bytes
> +       long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +       then the hash would be:
> +
> +       SHA-1("TREE" + <binary representation of N> +
> +               "REUC" + <binary representation of M>)
> diff --git a/read-cache.c b/read-cache.c
> index 7b1354d759..858935f123 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -43,6 +43,7 @@
>  #define CACHE_EXT_LINK 0x6c696e6b        /* "link" */
>  #define CACHE_EXT_UNTRACKED 0x554E5452   /* "UNTR" */
>  #define CACHE_EXT_FSMONITOR 0x46534D4E   /* "FSMN" */
> +#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945 /* "EOIE" */
>
>  /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
>  #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
> @@ -1693,6 +1694,9 @@ static int read_index_extension(struct index_state *istate,
>         case CACHE_EXT_FSMONITOR:
>                 read_fsmonitor_extension(istate, data, sz);
>                 break;
> +       case CACHE_EXT_ENDOFINDEXENTRIES:
> +               /* already handled in do_read_index() */
> +               break;

Perhaps catch this extension when it's not written at the end (e.g. by
some other git implementation) and warn.

>         default:
>                 if (*ext < 'A' || 'Z' < *ext)
>                         return error("index uses %.4s extension, which we do not understand",
> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>         return ondisk_size + entries * per_entry;
>  }
>
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
> +#endif

Keep functions unconditionally built as much as possible. I don't see
why this read_eoie_extension() must be built only on multithread
platforms.

> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
> +
>  /* remember to discard_cache() before reading a different cache! */
>  int do_read_index(struct index_state *istate, const char *path, int must_exist)
>  {
> @@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
>         return 0;
>  }
>
> -static int write_index_ext_header(git_hash_ctx *context, int fd,
> -                                 unsigned int ext, unsigned int sz)
> +static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
> +                                 int fd, unsigned int ext, unsigned int sz)
>  {
>         ext = htonl(ext);
>         sz = htonl(sz);
> +       if (eoie_context) {
> +               the_hash_algo->update_fn(eoie_context, &ext, 4);
> +               the_hash_algo->update_fn(eoie_context, &sz, 4);
> +       }
>         return ((ce_write(context, fd, &ext, 4) < 0) ||
>                 (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
>  }
> @@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>  {
>         uint64_t start = getnanotime();
>         int newfd = tempfile->fd;
> -       git_hash_ctx c;
> +       git_hash_ctx c, eoie_c;
>         struct cache_header hdr;
>         int i, err = 0, removed, extended, hdr_version;
>         struct cache_entry **cache = istate->cache;
> @@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>         struct ondisk_cache_entry_extended ondisk;
>         struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>         int drop_cache_tree = istate->drop_cache_tree;
> +       unsigned long offset;
>
>         for (i = removed = extended = 0; i < entries; i++) {
>                 if (cache[i]->ce_flags & CE_REMOVE)
> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 return err;
>
>         /* Write extension data here */
> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> +       the_hash_algo->init_fn(&eoie_c);

Don't write (or even calculate to write it) unless it's needed. Which
means only do this when parallel reading is enabled and the index size
large enough, or when a test variable is set so you can force writing
this extension.

I briefly wondered if we should continue writing the extension if it's
already written. This way you can manually enable it with "git
update-index". But I don't think it's worth the complexity.

>         if (!strip_extensions && istate->split_index) {
>                 struct strbuf sb = STRBUF_INIT;
>
>                 err = write_link_extension(&sb, istate) < 0 ||
> -                       write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
> +                       write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
>                                                sb.len) < 0 ||
>                         ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2535,7 +2551,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 cache_tree_write(&sb, istate->cache_tree);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
>                 if (err)
> @@ -2545,7 +2561,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 resolve_undo_write(&sb, istate->resolve_undo);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
>                                              sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2556,7 +2572,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 write_untracked_extension(&sb, istate->untracked);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
>                                              sb.len) < 0 ||
>                         ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
> @@ -2567,7 +2583,23 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>                 struct strbuf sb = STRBUF_INIT;
>
>                 write_fsmonitor_extension(&sb, istate);
> -               err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
> +               err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
> +                       || ce_write(&c, newfd, sb.buf, sb.len) < 0;
> +               strbuf_release(&sb);
> +               if (err)
> +                       return -1;
> +       }
> +
> +       /*
> +        * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
> +        * so that it can be found and processed before all the index entries are
> +        * read.
> +        */
> +       if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
> +               struct strbuf sb = STRBUF_INIT;
> +
> +               write_eoie_extension(&sb, &eoie_c, offset);
> +               err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
>                         || ce_write(&c, newfd, sb.buf, sb.len) < 0;
>                 strbuf_release(&sb);
>                 if (err)
> @@ -2978,3 +3010,109 @@ int should_validate_cache_entries(void)
>
>         return validate_index_cache_entries;
>  }
> +
> +#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
> +#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
> +
> +#ifndef NO_PTHREADS
> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
> +{
> +       /*
> +        * The end of index entries (EOIE) extension is guaranteed to be last
> +        * so that it can be found by scanning backwards from the EOF.
> +        *
> +        * "EOIE"
> +        * <4-byte length>
> +        * <4-byte offset>
> +        * <20-byte hash>
> +        */
> +       const char *mmap = mmap_;
> +       const char *index, *eoie;
> +       uint32_t extsize;
> +       unsigned long offset, src_offset;
> +       unsigned char hash[GIT_MAX_RAWSZ];
> +       git_hash_ctx c;
> +
> +       /* ensure we have an index big enough to contain an EOIE extension */
> +       if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
> +               return 0;

All these "return 0" indicates an error in EOIE extension. You
probably want to print some warning (much easier to track down why
parallel reading does not happen).

> +
> +       /* validate the extension signature */
> +       index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
> +       if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /* validate the extension size */
> +       extsize = get_be32(index);
> +       if (extsize != EOIE_SIZE)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /*
> +        * Validate the offset we're going to look for the first extension
> +        * signature is after the index header and before the eoie extension.
> +        */
> +       offset = get_be32(index);
> +       if (mmap + offset < mmap + sizeof(struct cache_header))
> +               return 0;
> +       if (mmap + offset >= eoie)
> +               return 0;
> +       index += sizeof(uint32_t);
> +
> +       /*
> +        * The hash is computed over extension types and their sizes (but not
> +        * their contents).  E.g. if we have "TREE" extension that is N-bytes
> +        * long, "REUC" extension that is M-bytes long, followed by "EOIE",
> +        * then the hash would be:
> +        *
> +        * SHA-1("TREE" + <binary representation of N> +
> +        *               "REUC" + <binary representation of M>)
> +        */
> +       src_offset = offset;
> +       the_hash_algo->init_fn(&c);
> +       while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
> +               /* After an array of active_nr index entries,
> +                * there can be arbitrary number of extended
> +                * sections, each of which is prefixed with
> +                * extension name (4-byte) and section length
> +                * in 4-byte network byte order.
> +                */
> +               uint32_t extsize;
> +               memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
> +               extsize = ntohl(extsize);
> +
> +               /* verify the extension size isn't so large it will wrap around */
> +               if (src_offset + 8 + extsize < src_offset)
> +                       return 0;
> +
> +               the_hash_algo->update_fn(&c, mmap + src_offset, 8);
> +
> +               src_offset += 8;
> +               src_offset += extsize;
> +       }
> +       the_hash_algo->final_fn(hash, &c);
> +       if (hashcmp(hash, (const unsigned char *)index))
> +               return 0;
> +
> +       /* Validate that the extension offsets returned us back to the eoie extension. */
> +       if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
> +               return 0;
> +
> +       return offset;
> +}
> +#endif
> +
> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)

We normally just put function implementations before it's used to
avoid static forward declaration. Any special reason why it's not done
here?

> +{
> +       uint32_t buffer;
> +       unsigned char hash[GIT_MAX_RAWSZ];
> +
> +       /* offset */
> +       put_be32(&buffer, offset);
> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
> +
> +       /* hash */
> +       the_hash_algo->final_fn(hash, eoie_context);
> +       strbuf_add(sb, hash, the_hash_algo->rawsz);
> +}
> diff --git a/t/README b/t/README
> index 9028b47d92..d8754dd23a 100644
> --- a/t/README
> +++ b/t/README
> @@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
>  path where deltas larger than this limit require extra memory
>  allocation for bookkeeping.
>
> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed

I have a feeling that you won't have problems if you don't write eoie
extension by default in the first place. Then this could be switched
to GIT_TEST_ENABLE_EOIE instead. We may still have problem if both
eoie and split index are forced on when running through the test
suite, but that should be an easy fix.

> +as they currently hard code SHA values for the index which are no longer
> +valid due to the addition of the EOIE extension.
> +
>  Naming Tests
>  ------------
>
> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
> index 39133bcbc8..f613dd72e3 100755
> --- a/t/t1700-split-index.sh
> +++ b/t/t1700-split-index.sh
> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>  # We need total control of index splitting here
>  sane_unset GIT_TEST_SPLIT_INDEX
>  sane_unset GIT_FSMONITOR_TEST
> +export GIT_TEST_DISABLE_EOIE=true
>
>  test_expect_success 'enable split index' '
>         git config splitIndex.maxPercentChange 100 &&
> --
> 2.18.0.windows.1
>
Ben Peart Sept. 17, 2018, 2:54 p.m. UTC | #3
On 9/15/2018 6:02 AM, Duy Nguyen wrote:

>>          default:
>>                  if (*ext < 'A' || 'Z' < *ext)
>>                          return error("index uses %.4s extension, which we do not understand",
>> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
>>          return ondisk_size + entries * per_entry;
>>   }
>>
>> +#ifndef NO_PTHREADS
>> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
>> +#endif
> 
> Keep functions unconditionally built as much as possible. I don't see
> why this read_eoie_extension() must be built only on multithread
> platforms.
> 

This is conditional to avoid generating a warning on single threaded 
platforms where the function is currently unused.  That seemed like a 
better choice than calling it and ignoring it on single threaded 
platforms just to avoid a compiler warning.

>> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
>> +
>>   /* remember to discard_cache() before reading a different cache! */
>>   int do_read_index(struct index_state *istate, const char *path, int must_exist)
>>   {
>> @@ -2198,11 +2207,15 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
>>          return 0;
>>   }
>>
>> -static int write_index_ext_header(git_hash_ctx *context, int fd,
>> -                                 unsigned int ext, unsigned int sz)
>> +static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
>> +                                 int fd, unsigned int ext, unsigned int sz)
>>   {
>>          ext = htonl(ext);
>>          sz = htonl(sz);
>> +       if (eoie_context) {
>> +               the_hash_algo->update_fn(eoie_context, &ext, 4);
>> +               the_hash_algo->update_fn(eoie_context, &sz, 4);
>> +       }
>>          return ((ce_write(context, fd, &ext, 4) < 0) ||
>>                  (ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
>>   }
>> @@ -2445,7 +2458,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>   {
>>          uint64_t start = getnanotime();
>>          int newfd = tempfile->fd;
>> -       git_hash_ctx c;
>> +       git_hash_ctx c, eoie_c;
>>          struct cache_header hdr;
>>          int i, err = 0, removed, extended, hdr_version;
>>          struct cache_entry **cache = istate->cache;
>> @@ -2454,6 +2467,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>          struct ondisk_cache_entry_extended ondisk;
>>          struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
>>          int drop_cache_tree = istate->drop_cache_tree;
>> +       unsigned long offset;
>>
>>          for (i = removed = extended = 0; i < entries; i++) {
>>                  if (cache[i]->ce_flags & CE_REMOVE)
>> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
>>                  return err;
>>
>>          /* Write extension data here */
>> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
>> +       the_hash_algo->init_fn(&eoie_c);
> 
> Don't write (or even calculate to write it) unless it's needed. Which
> means only do this when parallel reading is enabled and the index size
> large enough, or when a test variable is set so you can force writing
> this extension.

I made the logic always write the extension based on the earlier 
discussion [1] where it was suggested this should have been part of the 
original index format for extensions from the beginning.  This helps 
ensure it is available for current and future uses we haven't even 
discovered yet.

[1] 
https://public-inbox.org/git/xmqqwp2s1h1x.fsf@gitster.mtv.corp.google.com/


>> +
>> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
> 
> We normally just put function implementations before it's used to
> avoid static forward declaration. Any special reason why it's not done
> here?
> 

This was done to promote readability of the (already large) read-cache.c 
file.  I first considered moving the EOIE read/write functions into a 
separate file entirely but they need access to information only 
available within read-cache.c so I compromised and moved them to the end 
of the file instead.

>> +{
>> +       uint32_t buffer;
>> +       unsigned char hash[GIT_MAX_RAWSZ];
>> +
>> +       /* offset */
>> +       put_be32(&buffer, offset);
>> +       strbuf_add(sb, &buffer, sizeof(uint32_t));
>> +
>> +       /* hash */
>> +       the_hash_algo->final_fn(hash, eoie_context);
>> +       strbuf_add(sb, hash, the_hash_algo->rawsz);
>> +}
>> diff --git a/t/README b/t/README
>> index 9028b47d92..d8754dd23a 100644
>> --- a/t/README
>> +++ b/t/README
>> @@ -319,6 +319,11 @@ GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
>>   path where deltas larger than this limit require extra memory
>>   allocation for bookkeeping.
>>
>> +GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
>> +This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
> 
> I have a feeling that you won't have problems if you don't write eoie
> extension by default in the first place. Then this could be switched
> to GIT_TEST_ENABLE_EOIE instead. We may still have problem if both
> eoie and split index are forced on when running through the test
> suite, but that should be an easy fix.
> 
>> +as they currently hard code SHA values for the index which are no longer
>> +valid due to the addition of the EOIE extension.
>> +
>>   Naming Tests
>>   ------------
>>
>> diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
>> index 39133bcbc8..f613dd72e3 100755
>> --- a/t/t1700-split-index.sh
>> +++ b/t/t1700-split-index.sh
>> @@ -7,6 +7,7 @@ test_description='split index mode tests'
>>   # We need total control of index splitting here
>>   sane_unset GIT_TEST_SPLIT_INDEX
>>   sane_unset GIT_FSMONITOR_TEST
>> +export GIT_TEST_DISABLE_EOIE=true
>>
>>   test_expect_success 'enable split index' '
>>          git config splitIndex.maxPercentChange 100 &&
>> --
>> 2.18.0.windows.1
>>
> 
>
Duy Nguyen Sept. 17, 2018, 4:05 p.m. UTC | #4
On Mon, Sep 17, 2018 at 4:55 PM Ben Peart <peartben@gmail.com> wrote:
> On 9/15/2018 6:02 AM, Duy Nguyen wrote:
>
> >>          default:
> >>                  if (*ext < 'A' || 'Z' < *ext)
> >>                          return error("index uses %.4s extension, which we do not understand",
> >> @@ -1889,6 +1893,11 @@ static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
> >>          return ondisk_size + entries * per_entry;
> >>   }
> >>
> >> +#ifndef NO_PTHREADS
> >> +static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
> >> +#endif
> >
> > Keep functions unconditionally built as much as possible. I don't see
> > why this read_eoie_extension() must be built only on multithread
> > platforms.
> >
>
> This is conditional to avoid generating a warning on single threaded
> platforms where the function is currently unused.  That seemed like a
> better choice than calling it and ignoring it on single threaded
> platforms just to avoid a compiler warning.

The third option is ignore the compiler. I consider that warning a
helpful suggestion, not a strict rule.

Most devs don't run single thread builds (I think) so is this function
is updated in a way that breaks single thread mode, it can only be
found out when this function is used in single thread mode. At that
point the function may have changed a lot. If it's built
unconditionally, at least single thread users will yell up much sooner
and we could fix it much earlier.

> >> @@ -2520,11 +2534,13 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
> >>                  return err;
> >>
> >>          /* Write extension data here */
> >> +       offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
> >> +       the_hash_algo->init_fn(&eoie_c);
> >
> > Don't write (or even calculate to write it) unless it's needed. Which
> > means only do this when parallel reading is enabled and the index size
> > large enough, or when a test variable is set so you can force writing
> > this extension.
>
> I made the logic always write the extension based on the earlier
> discussion [1] where it was suggested this should have been part of the
> original index format for extensions from the beginning.  This helps
> ensure it is available for current and future uses we haven't even
> discovered yet.

But it _is_ available now. If you need it, you write the extension
out. If we make this part of index version 5 (and make it not an
extension anymore) then I buy that argument. As it is, it's an
optional extension.

> [1] https://public-inbox.org/git/xmqqwp2s1h1x.fsf@gitster.mtv.corp.google.com/
>
>
> >> +
> >> +static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
> >
> > We normally just put function implementations before it's used to
> > avoid static forward declaration. Any special reason why it's not done
> > here?
> >
>
> This was done to promote readability of the (already large) read-cache.c
> file.  I first considered moving the EOIE read/write functions into a
> separate file entirely but they need access to information only
> available within read-cache.c so I compromised and moved them to the end
> of the file instead.

I consider grouping extension related functions closer to
read_index_extension gives better readability, or at least better than
just putting new functions at the end in no particular order. But I
guess this is personal view.
Junio C Hamano Sept. 17, 2018, 5:31 p.m. UTC | #5
Duy Nguyen <pclouds@gmail.com> writes:

> But it _is_ available now. If you need it, you write the extension
> out.

Are you arguing for making it omitted when it is not needed (e.g.
small enough index file)?  IOW, did you mean "If you do not need it,
you do not write it out" by the above?

I do not think overhead of writing (or preparing to write) the
extension for a small index file is by definition small enough ;-).

I do not think the configuration that decides if the reader side
uses parallel reading should have any say in the decision to write
(or omit) the extension, by the way.
Duy Nguyen Sept. 17, 2018, 5:38 p.m. UTC | #6
On Mon, Sep 17, 2018 at 7:31 PM Junio C Hamano <gitster@pobox.com> wrote:
>
> Duy Nguyen <pclouds@gmail.com> writes:
>
> > But it _is_ available now. If you need it, you write the extension
> > out.
>
> Are you arguing for making it omitted when it is not needed (e.g.
> small enough index file)?  IOW, did you mean "If you do not need it,
> you do not write it out" by the above?

Yes I did.

> I do not think overhead of writing (or preparing to write) the
> extension for a small index file is by definition small enough ;-).

Good point.

I get annoyed by the "ignoring unknown extension xxx" messages while
testing though (not just this extension) and I think it will be the
same for other git implementations. But perhaps other implementations
just silently drop the extension. Most of the extensions we have added
so far (except the ancient 'TREE') are optional and are probably not
present 99% of time when a different git impl reads an index created
by C Git. This 'EIOE' may be a good test then to see if they follow
the "ignore optional extensions" rule since it will always appear in
new C Git releases.
Junio C Hamano Sept. 17, 2018, 7:08 p.m. UTC | #7
Duy Nguyen <pclouds@gmail.com> writes:

> I get annoyed by the "ignoring unknown extension xxx" messages while
> testing though (not just this extension) and I think it will be the
> same for other git implementations. But perhaps other implementations
> just silently drop the extension. Most of the extensions we have added
> so far (except the ancient 'TREE') are optional and are probably not

Most of the index extensions are optional, including TREE.  I think
"link" is the only one that the readers that do not understand it
are told to abort without causing damage.

> present 99% of time when a different git impl reads an index created
> by C Git. This 'EIOE' may be a good test then to see if they follow
> the "ignore optional extensions" rule since it will always appear in
> new C Git releases.

I think we probably should squelch "ignoring unknown" unless some
sort of GIT_TRACE/DEBUG switch is set.

Patches welcome ;-)

Thanks.

Patch
diff mbox series

diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt
index db3572626b..6bc2d90f7f 100644
--- a/Documentation/technical/index-format.txt
+++ b/Documentation/technical/index-format.txt
@@ -314,3 +314,26 @@  The remaining data of each directory block is grouped by type:
 
   - An ewah bitmap, the n-th bit indicates whether the n-th index entry
     is not CE_FSMONITOR_VALID.
+
+== End of Index Entry
+
+  The End of Index Entry (EOIE) is used to locate the end of the variable
+  length index entries and the begining of the extensions. Code can take
+  advantage of this to quickly locate the index extensions without having
+  to parse through all of the index entries.
+
+  Because it must be able to be loaded before the variable length cache
+  entries and other index extensions, this extension must be written last.
+  The signature for this extension is { 'E', 'O', 'I', 'E' }.
+
+  The extension consists of:
+
+  - 32-bit offset to the end of the index entries
+
+  - 160-bit SHA-1 over the extension types and their sizes (but not
+	their contents).  E.g. if we have "TREE" extension that is N-bytes
+	long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	then the hash would be:
+
+	SHA-1("TREE" + <binary representation of N> +
+		"REUC" + <binary representation of M>)
diff --git a/read-cache.c b/read-cache.c
index 7b1354d759..858935f123 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -43,6 +43,7 @@ 
 #define CACHE_EXT_LINK 0x6c696e6b	  /* "link" */
 #define CACHE_EXT_UNTRACKED 0x554E5452	  /* "UNTR" */
 #define CACHE_EXT_FSMONITOR 0x46534D4E	  /* "FSMN" */
+#define CACHE_EXT_ENDOFINDEXENTRIES 0x454F4945	/* "EOIE" */
 
 /* changes that can be kept in $GIT_DIR/index (basically all extensions) */
 #define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED | \
@@ -1693,6 +1694,9 @@  static int read_index_extension(struct index_state *istate,
 	case CACHE_EXT_FSMONITOR:
 		read_fsmonitor_extension(istate, data, sz);
 		break;
+	case CACHE_EXT_ENDOFINDEXENTRIES:
+		/* already handled in do_read_index() */
+		break;
 	default:
 		if (*ext < 'A' || 'Z' < *ext)
 			return error("index uses %.4s extension, which we do not understand",
@@ -1889,6 +1893,11 @@  static size_t estimate_cache_size(size_t ondisk_size, unsigned int entries)
 	return ondisk_size + entries * per_entry;
 }
 
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size);
+#endif
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset);
+
 /* remember to discard_cache() before reading a different cache! */
 int do_read_index(struct index_state *istate, const char *path, int must_exist)
 {
@@ -2198,11 +2207,15 @@  static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
-static int write_index_ext_header(git_hash_ctx *context, int fd,
-				  unsigned int ext, unsigned int sz)
+static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
+				  int fd, unsigned int ext, unsigned int sz)
 {
 	ext = htonl(ext);
 	sz = htonl(sz);
+	if (eoie_context) {
+		the_hash_algo->update_fn(eoie_context, &ext, 4);
+		the_hash_algo->update_fn(eoie_context, &sz, 4);
+	}
 	return ((ce_write(context, fd, &ext, 4) < 0) ||
 		(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
 }
@@ -2445,7 +2458,7 @@  static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 {
 	uint64_t start = getnanotime();
 	int newfd = tempfile->fd;
-	git_hash_ctx c;
+	git_hash_ctx c, eoie_c;
 	struct cache_header hdr;
 	int i, err = 0, removed, extended, hdr_version;
 	struct cache_entry **cache = istate->cache;
@@ -2454,6 +2467,7 @@  static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	struct ondisk_cache_entry_extended ondisk;
 	struct strbuf previous_name_buf = STRBUF_INIT, *previous_name;
 	int drop_cache_tree = istate->drop_cache_tree;
+	unsigned long offset;
 
 	for (i = removed = extended = 0; i < entries; i++) {
 		if (cache[i]->ce_flags & CE_REMOVE)
@@ -2520,11 +2534,13 @@  static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		return err;
 
 	/* Write extension data here */
+	offset = lseek(newfd, 0, SEEK_CUR) + write_buffer_len;
+	the_hash_algo->init_fn(&eoie_c);
 	if (!strip_extensions && istate->split_index) {
 		struct strbuf sb = STRBUF_INIT;
 
 		err = write_link_extension(&sb, istate) < 0 ||
-			write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
+			write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
 					       sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2535,7 +2551,7 @@  static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		cache_tree_write(&sb, istate->cache_tree);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_TREE, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2545,7 +2561,7 @@  static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		resolve_undo_write(&sb, istate->resolve_undo);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_RESOLVE_UNDO,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
 					     sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2556,7 +2572,7 @@  static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_untracked_extension(&sb, istate->untracked);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_UNTRACKED,
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
 					     sb.len) < 0 ||
 			ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
@@ -2567,7 +2583,23 @@  static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 		struct strbuf sb = STRBUF_INIT;
 
 		write_fsmonitor_extension(&sb, istate);
-		err = write_index_ext_header(&c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+		err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
+			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
+		strbuf_release(&sb);
+		if (err)
+			return -1;
+	}
+
+	/*
+	 * CACHE_EXT_ENDOFINDEXENTRIES must be written as the last entry before the SHA1
+	 * so that it can be found and processed before all the index entries are
+	 * read.
+	 */
+	if (!strip_extensions && offset && !git_env_bool("GIT_TEST_DISABLE_EOIE", 0)) {
+		struct strbuf sb = STRBUF_INIT;
+
+		write_eoie_extension(&sb, &eoie_c, offset);
+		err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
 			|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
 		strbuf_release(&sb);
 		if (err)
@@ -2978,3 +3010,109 @@  int should_validate_cache_entries(void)
 
 	return validate_index_cache_entries;
 }
+
+#define EOIE_SIZE (4 + GIT_SHA1_RAWSZ) /* <4-byte offset> + <20-byte hash> */
+#define EOIE_SIZE_WITH_HEADER (4 + 4 + EOIE_SIZE) /* <4-byte signature> + <4-byte length> + EOIE_SIZE */
+
+#ifndef NO_PTHREADS
+static unsigned long read_eoie_extension(void *mmap_, size_t mmap_size)
+{
+	/*
+	 * The end of index entries (EOIE) extension is guaranteed to be last
+	 * so that it can be found by scanning backwards from the EOF.
+	 *
+	 * "EOIE"
+	 * <4-byte length>
+	 * <4-byte offset>
+	 * <20-byte hash>
+	 */
+	const char *mmap = mmap_;
+	const char *index, *eoie;
+	uint32_t extsize;
+	unsigned long offset, src_offset;
+	unsigned char hash[GIT_MAX_RAWSZ];
+	git_hash_ctx c;
+
+	/* ensure we have an index big enough to contain an EOIE extension */
+	if (mmap_size < sizeof(struct cache_header) + EOIE_SIZE_WITH_HEADER + the_hash_algo->rawsz)
+		return 0;
+
+	/* validate the extension signature */
+	index = eoie = mmap + mmap_size - EOIE_SIZE_WITH_HEADER - the_hash_algo->rawsz;
+	if (CACHE_EXT(index) != CACHE_EXT_ENDOFINDEXENTRIES)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/* validate the extension size */
+	extsize = get_be32(index);
+	if (extsize != EOIE_SIZE)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * Validate the offset we're going to look for the first extension
+	 * signature is after the index header and before the eoie extension.
+	 */
+	offset = get_be32(index);
+	if (mmap + offset < mmap + sizeof(struct cache_header))
+		return 0;
+	if (mmap + offset >= eoie)
+		return 0;
+	index += sizeof(uint32_t);
+
+	/*
+	 * The hash is computed over extension types and their sizes (but not
+	 * their contents).  E.g. if we have "TREE" extension that is N-bytes
+	 * long, "REUC" extension that is M-bytes long, followed by "EOIE",
+	 * then the hash would be:
+	 *
+	 * SHA-1("TREE" + <binary representation of N> +
+	 *               "REUC" + <binary representation of M>)
+	 */
+	src_offset = offset;
+	the_hash_algo->init_fn(&c);
+	while (src_offset < mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER) {
+		/* After an array of active_nr index entries,
+		 * there can be arbitrary number of extended
+		 * sections, each of which is prefixed with
+		 * extension name (4-byte) and section length
+		 * in 4-byte network byte order.
+		 */
+		uint32_t extsize;
+		memcpy(&extsize, (char *)mmap + src_offset + 4, 4);
+		extsize = ntohl(extsize);
+
+		/* verify the extension size isn't so large it will wrap around */
+		if (src_offset + 8 + extsize < src_offset)
+			return 0;
+
+		the_hash_algo->update_fn(&c, mmap + src_offset, 8);
+
+		src_offset += 8;
+		src_offset += extsize;
+	}
+	the_hash_algo->final_fn(hash, &c);
+	if (hashcmp(hash, (const unsigned char *)index))
+		return 0;
+
+	/* Validate that the extension offsets returned us back to the eoie extension. */
+	if (src_offset != mmap_size - the_hash_algo->rawsz - EOIE_SIZE_WITH_HEADER)
+		return 0;
+
+	return offset;
+}
+#endif
+
+static void write_eoie_extension(struct strbuf *sb, git_hash_ctx *eoie_context, unsigned long offset)
+{
+	uint32_t buffer;
+	unsigned char hash[GIT_MAX_RAWSZ];
+
+	/* offset */
+	put_be32(&buffer, offset);
+	strbuf_add(sb, &buffer, sizeof(uint32_t));
+
+	/* hash */
+	the_hash_algo->final_fn(hash, eoie_context);
+	strbuf_add(sb, hash, the_hash_algo->rawsz);
+}
diff --git a/t/README b/t/README
index 9028b47d92..d8754dd23a 100644
--- a/t/README
+++ b/t/README
@@ -319,6 +319,11 @@  GIT_TEST_OE_DELTA_SIZE=<n> exercises the uncomon pack-objects code
 path where deltas larger than this limit require extra memory
 allocation for bookkeeping.
 
+GIT_TEST_DISABLE_EOIE=<boolean> disables writing the EOIE extension.
+This is used to allow tests 1, 4-9 in t1700-split-index.sh to succeed
+as they currently hard code SHA values for the index which are no longer
+valid due to the addition of the EOIE extension.
+
 Naming Tests
 ------------
 
diff --git a/t/t1700-split-index.sh b/t/t1700-split-index.sh
index 39133bcbc8..f613dd72e3 100755
--- a/t/t1700-split-index.sh
+++ b/t/t1700-split-index.sh
@@ -7,6 +7,7 @@  test_description='split index mode tests'
 # We need total control of index splitting here
 sane_unset GIT_TEST_SPLIT_INDEX
 sane_unset GIT_FSMONITOR_TEST
+export GIT_TEST_DISABLE_EOIE=true
 
 test_expect_success 'enable split index' '
 	git config splitIndex.maxPercentChange 100 &&