diff mbox series

Prevent git from rehashing 4GBi files

Message ID CY4PR16MB16552D74E064638BEC11ECB1AFC59@CY4PR16MB1655.namprd16.prod.outlook.com (mailing list archive)
State New, archived
Headers show
Series Prevent git from rehashing 4GBi files | expand

Commit Message

Jason Hatton May 6, 2022, 12:26 a.m. UTC
Git cache stores file sizes using uint32_t. This causes any file
that is a multiple of 2^32 to have a cached file size of zero.
Zero is a special value used by racily clean. This causes git to
rehash every file that is a multiple of 2^32 every time git status
or git commit is run.

This patch mitigates the problem by making all files that are a
multiple of 2^32 appear to have a size of 1<<31 instead of zero.

The value of 1<<31 is chosen to keep it as far away from zero
as possible to help prevent things getting mixed up with unpatched
versions of git.

An example would be to have a 2^32 sized file in the index of
patched git. Patched git would save the file as 2^31 in the cache.
An unpatched git would very much see the file has changed in size
and force it to rehash the file, which is safe. The file would
have to grow or shrink by exactly 2^31 and retain all of its
ctime, mtime, and other attributes for old git to not notice
the change.

This patch does not change the behavior of any file that is not
an exact multiple of 2^32.

Signed-off-by: Jason D. Hatton <jhatton@globalfinishing.com>
---
 cache.h      |  1 +
 read-cache.c | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

Comments

Torsten Bögershausen May 6, 2022, 4:37 a.m. UTC | #1
On Fri, May 06, 2022 at 12:26:53AM +0000, Jason Hatton wrote:
> Git cache stores file sizes using uint32_t. This causes any file
> that is a multiple of 2^32 to have a cached file size of zero.
> Zero is a special value used by racily clean. This causes git to
> rehash every file that is a multiple of 2^32 every time git status
> or git commit is run.
>
> This patch mitigates the problem by making all files that are a
> multiple of 2^32 appear to have a size of 1<<31 instead of zero.
>
> The value of 1<<31 is chosen to keep it as far away from zero
> as possible to help prevent things getting mixed up with unpatched
> versions of git.
>
> An example would be to have a 2^32 sized file in the index of
> patched git. Patched git would save the file as 2^31 in the cache.
> An unpatched git would very much see the file has changed in size
> and force it to rehash the file, which is safe. The file would
> have to grow or shrink by exactly 2^31 and retain all of its
> ctime, mtime, and other attributes for old git to not notice
> the change.
>
> This patch does not change the behavior of any file that is not
> an exact multiple of 2^32.
>
> Signed-off-by: Jason D. Hatton <jhatton@globalfinishing.com>
> ---
>  cache.h      |  1 +
>  read-cache.c | 16 ++++++++++++++--
>  2 files changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/cache.h b/cache.h
> index 4b666b2848..74e983227b 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -898,6 +898,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
>  #define HASH_SILENT 8
>  int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
>  int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
> +unsigned int munge_st_size(off_t st_size);
>
>  /*
>   * Record to sd the data from st that we use to check whether a file
> diff --git a/read-cache.c b/read-cache.c
> index ea6150ea28..b0a1b505db 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -163,6 +163,18 @@ void rename_index_entry_at(struct index_state *istate, int nr, const char *new_n
>  		add_index_entry(istate, new_entry, ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE);
>  }
>
> +/*
> + * Munge st_size into an unsigned int.
> + */
> +unsigned int munge_st_size(off_t st_size) {
> +	unsigned int sd_size = st_size;

Should this be written as
	unsigned int sd_size = (unsigned int)st_size;


> +
> +	if(!sd_size && st_size)
> +		return 0x80000000;
> +	else
> +		return sd_size;
> +}
> +
>  void fill_stat_data(struct stat_data *sd, struct stat *st)
>  {
>  	sd->sd_ctime.sec = (unsigned int)st->st_ctime;
> @@ -173,7 +185,7 @@ void fill_stat_data(struct stat_data *sd, struct stat *st)
>  	sd->sd_ino = st->st_ino;
>  	sd->sd_uid = st->st_uid;
>  	sd->sd_gid = st->st_gid;
> -	sd->sd_size = st->st_size;
> +	sd->sd_size = munge_st_size(st->st_size);
>  }
>
>  int match_stat_data(const struct stat_data *sd, struct stat *st)
> @@ -212,7 +224,7 @@ int match_stat_data(const struct stat_data *sd, struct stat *st)
>  			changed |= INODE_CHANGED;
>  #endif
>
> -	if (sd->sd_size != (unsigned int) st->st_size)
> +	if (sd->sd_size != munge_st_size(st->st_size))
>  		changed |= DATA_CHANGED;
>
>  	return changed;
> --
> 2.36.0
>
Philip Oakley May 6, 2022, 10:22 a.m. UTC | #2
On 06/05/2022 01:26, Jason Hatton wrote:
> Git cache stores file sizes using uint32_t. This causes any file
> that is a multiple of 2^32 to have a cached file size of zero.
> Zero is a special value used by racily clean. This causes git to
> rehash every file that is a multiple of 2^32 every time git status
> or git commit is run.
>
> This patch mitigates the problem by making all files that are a
> multiple of 2^32 appear to have a size of 1<<31 instead of zero.
>
> The value of 1<<31 is chosen to keep it as far away from zero
> as possible to help prevent things getting mixed up with unpatched
> versions of git.
>
> An example would be to have a 2^32 sized file in the index of
> patched git. Patched git would save the file as 2^31 in the cache.
> An unpatched git would very much see the file has changed in size
> and force it to rehash the file, which is safe. The file would
> have to grow or shrink by exactly 2^31 and retain all of its
> ctime, mtime, and other attributes for old git to not notice
> the change.
>
> This patch does not change the behavior of any file that is not
> an exact multiple of 2^32.
>
> Signed-off-by: Jason D. Hatton <jhatton@globalfinishing.com>
> ---
>  cache.h      |  1 +
>  read-cache.c | 16 ++++++++++++++--
>  2 files changed, 15 insertions(+), 2 deletions(-)
>
> diff --git a/cache.h b/cache.h
> index 4b666b2848..74e983227b 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -898,6 +898,7 @@ int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
>  #define HASH_SILENT 8
>  int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
>  int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
> +unsigned int munge_st_size(off_t st_size);
>  
>  /*
>   * Record to sd the data from st that we use to check whether a file
> diff --git a/read-cache.c b/read-cache.c
> index ea6150ea28..b0a1b505db 100644
> --- a/read-cache.c
> +++ b/read-cache.c
> @@ -163,6 +163,18 @@ void rename_index_entry_at(struct index_state *istate, int nr, const char *new_n
>  		add_index_entry(istate, new_entry, ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE);
>  }
>  
> +/*
> + * Munge st_size into an unsigned int.

This "Munge" above isn't telling the reader 'why'/'what' is going on.
The comment should in some way highlight that a zero size result is
special, and that we have the roll over issue when the stored in 32 bits
- the double duty of racy vs changed in the stat data heuristic.
Synonyms of 'munge' ?


> + */
> +unsigned int munge_st_size(off_t st_size) {
> +	unsigned int sd_size = st_size;
> +
> +	if(!sd_size && st_size)
> +		return 0x80000000;
> +	else
> +		return sd_size;
> +}
> +
>  void fill_stat_data(struct stat_data *sd, struct stat *st)
>  {
>  	sd->sd_ctime.sec = (unsigned int)st->st_ctime;
> @@ -173,7 +185,7 @@ void fill_stat_data(struct stat_data *sd, struct stat *st)
>  	sd->sd_ino = st->st_ino;
>  	sd->sd_uid = st->st_uid;
>  	sd->sd_gid = st->st_gid;
> -	sd->sd_size = st->st_size;
> +	sd->sd_size = munge_st_size(st->st_size);
>  }
>  
>  int match_stat_data(const struct stat_data *sd, struct stat *st)
> @@ -212,7 +224,7 @@ int match_stat_data(const struct stat_data *sd, struct stat *st)
>  			changed |= INODE_CHANGED;
>  #endif
>  
> -	if (sd->sd_size != (unsigned int) st->st_size)
> +	if (sd->sd_size != munge_st_size(st->st_size))
>  		changed |= DATA_CHANGED;
>  
>  	return changed;
Junio C Hamano May 6, 2022, 4:36 p.m. UTC | #3
Philip Oakley <philipoakley@iee.email> writes:

> This "Munge" above isn't telling the reader 'why'/'what' is going on.
> The comment should in some way highlight that a zero size result is
> special, and that we have the roll over issue when the stored in 32 bits
> - the double duty of racy vs changed in the stat data heuristic.
> Synonyms of 'munge' ?
>
>
>> + */
>> +unsigned int munge_st_size(off_t st_size) {
>> +	unsigned int sd_size = st_size;
>> +
>> +	if(!sd_size && st_size)

Style.

>> +		return 0x80000000;
>> +	else
>> +		return sd_size;
>> +}

This may treat non-zero multiple of 4GiB as "not racy", but has
anybody double checked the concern Réne brought up earlier that a
4GiB file that was added and then got rewritten to 2GiB within the
same second would suddenly start getting treated as not racy?

The patch (the firnal version of it anyway) needs to be accompanied
by a handful of test additions to tickle corner cases like that.

Thanks, all, for working on this.
Philip Oakley May 6, 2022, 9:17 p.m. UTC | #4
On 06/05/2022 17:36, Junio C Hamano wrote:
>>> + */
>>> +unsigned int munge_st_size(off_t st_size) {
>>> +	unsigned int sd_size = st_size;
>>> +
>>> +	if(!sd_size && st_size)
> Style.
Ah, the same line / braces choice (as per coding guidelines).
>>> +		return 0x80000000;
>>> +	else
>>> +		return sd_size;
>>> +}
> This may treat non-zero multiple of 4GiB as "not racy", but has
> anybody double checked the concern Réne brought up earlier that a
> 4GiB file that was added and then got rewritten to 2GiB within the
> same second would suddenly start getting treated as not racy?
This is the pre-existing problem, that ~1in 2^31 size changes might not
get noticed for size change. The 0 byte / 4GiB change is an identical
issue, as is changing from 3 bytes to 4GiB+3 bytes, etc., so that's no
worse than before (well maybe twice as 'unlikely').

>
> The patch (the firnal version of it anyway) needs to be accompanied
> by a handful of test additions to tickle corner cases like that.
They'd be protected by the EXPENSIVE prerequisite I would assume.
Any particular test t/txxx that they should be placed in (I'm not that
familiar with the test suit)
--
Philip
Junio C Hamano May 6, 2022, 9:23 p.m. UTC | #5
Philip Oakley <philipoakley@iee.email> writes:

>> This may treat non-zero multiple of 4GiB as "not racy", but has
>> anybody double checked the concern Réne brought up earlier that a
>> 4GiB file that was added and then got rewritten to 2GiB within the
>> same second would suddenly start getting treated as not racy?
> This is the pre-existing problem, that ~1in 2^31 size changes might not
> get noticed for size change. The 0 byte / 4GiB change is an identical
> issue, as is changing from 3 bytes to 4GiB+3 bytes, etc., so that's no
> worse than before (well maybe twice as 'unlikely').

OK, it added one more case to 2^32-1 existing cases, I guess.

>> The patch (the firnal version of it anyway) needs to be accompanied
>> by a handful of test additions to tickle corner cases like that.
> They'd be protected by the EXPENSIVE prerequisite I would assume.

Oh, absolutely.  Thanks for spelling that out.
diff mbox series

Patch

diff --git a/cache.h b/cache.h
index 4b666b2848..74e983227b 100644
--- a/cache.h
+++ b/cache.h
@@ -898,6 +898,7 @@  int ie_modified(struct index_state *, const struct cache_entry *, struct stat *,
 #define HASH_SILENT 8
 int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
 int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags);
+unsigned int munge_st_size(off_t st_size);
 
 /*
  * Record to sd the data from st that we use to check whether a file
diff --git a/read-cache.c b/read-cache.c
index ea6150ea28..b0a1b505db 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -163,6 +163,18 @@  void rename_index_entry_at(struct index_state *istate, int nr, const char *new_n
 		add_index_entry(istate, new_entry, ADD_CACHE_OK_TO_ADD|ADD_CACHE_OK_TO_REPLACE);
 }
 
+/*
+ * Munge st_size into an unsigned int.
+ */
+unsigned int munge_st_size(off_t st_size) {
+	unsigned int sd_size = st_size;
+
+	if(!sd_size && st_size)
+		return 0x80000000;
+	else
+		return sd_size;
+}
+
 void fill_stat_data(struct stat_data *sd, struct stat *st)
 {
 	sd->sd_ctime.sec = (unsigned int)st->st_ctime;
@@ -173,7 +185,7 @@  void fill_stat_data(struct stat_data *sd, struct stat *st)
 	sd->sd_ino = st->st_ino;
 	sd->sd_uid = st->st_uid;
 	sd->sd_gid = st->st_gid;
-	sd->sd_size = st->st_size;
+	sd->sd_size = munge_st_size(st->st_size);
 }
 
 int match_stat_data(const struct stat_data *sd, struct stat *st)
@@ -212,7 +224,7 @@  int match_stat_data(const struct stat_data *sd, struct stat *st)
 			changed |= INODE_CHANGED;
 #endif
 
-	if (sd->sd_size != (unsigned int) st->st_size)
+	if (sd->sd_size != munge_st_size(st->st_size))
 		changed |= DATA_CHANGED;
 
 	return changed;