diff mbox series

[v1,05/10] cat-file: use delta_base_cache entries directly

Message ID 20240715003519.2671385-6-e@80x24.org (mailing list archive)
State New, archived
Headers show
Series cat-file speedups | expand

Commit Message

Eric Wong July 15, 2024, 12:35 a.m. UTC
For objects already in the delta_base_cache, we can safely use
them directly to avoid the malloc+memcpy+free overhead.

While only 2-7% of objects are delta bases in repos I've looked
at, this avoids up to 96MB of duplicated memory in the worst
case with the default git config.  For a more reasonable 1MB
delta base object, this eliminates the speed penalty of
duplicating large objects into memory and speeds up those 1MB
delta base cached content retrievals by roughly 30%.

The new delta_base_cache_lock is a simple single-threaded
assertion to ensure cat-file is the exclusive user of the
delta_base_cache.

Signed-off-by: Eric Wong <e@80x24.org>
---
 builtin/cat-file.c | 15 ++++++++++++++-
 object-file.c      |  5 +++++
 object-store-ll.h  |  7 +++++++
 packfile.c         | 28 +++++++++++++++++++++++++---
 packfile.h         |  4 ++++
 5 files changed, 55 insertions(+), 4 deletions(-)

Comments

Patrick Steinhardt July 24, 2024, 8:35 a.m. UTC | #1
On Mon, Jul 15, 2024 at 12:35:14AM +0000, Eric Wong wrote:
> For objects already in the delta_base_cache, we can safely use
> them directly to avoid the malloc+memcpy+free overhead.

Same here, I feel like you need to explain a bit more in depth what the
actual idea behind your patch is to help reviewers.

> diff --git a/builtin/cat-file.c b/builtin/cat-file.c
> index bc4bb89610..769c8b48d2 100644
> --- a/builtin/cat-file.c
> +++ b/builtin/cat-file.c
> @@ -24,6 +24,7 @@
>  #include "promisor-remote.h"
>  #include "mailmap.h"
>  #include "write-or-die.h"
> +#define USE_DIRECT_CACHE 1

I'm confused by this. Why do we introduce a macro that is always defined
to a trueish value? Why don't we just remove the code guarded by this?

>  enum batch_mode {
>  	BATCH_MODE_CONTENTS,
> @@ -386,7 +387,18 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
>  
>  	if (data->content) {
>  		batch_write(opt, data->content, data->size);
> -		FREE_AND_NULL(data->content);
> +		switch (data->info.whence) {
> +		case OI_CACHED: BUG("FIXME OI_CACHED support not done");

Is this something that will get addressed in a subsequent patch? If so,
the commit message and the message here should likely mention this. If
not, we should have a comment here saying why this is fine to be kept.

> +		case OI_LOOSE:
> +		case OI_PACKED:
> +			FREE_AND_NULL(data->content);
> +			break;
> +		case OI_DBCACHED:
> +			if (USE_DIRECT_CACHE)
> +				unlock_delta_base_cache();
> +			else
> +				FREE_AND_NULL(data->content);
> +		}
>  	} else if (data->type == OBJ_BLOB) {
>  		if (opt->buffer_output)
>  			fflush(stdout);
> @@ -815,6 +827,7 @@ static int batch_objects(struct batch_options *opt)
>  			data.info.sizep = &data.size;
>  			data.info.contentp = &data.content;
>  			data.info.content_limit = big_file_threshold;
> +			data.info.direct_cache = USE_DIRECT_CACHE;
>  		}
>  	}
>  
> diff --git a/object-file.c b/object-file.c
> index 1cc29c3c58..19100e823d 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1586,6 +1586,11 @@ static int do_oid_object_info_extended(struct repository *r,
>  			oidclr(oi->delta_base_oid, the_repository->hash_algo);
>  		if (oi->type_name)
>  			strbuf_addstr(oi->type_name, type_name(co->type));
> +		/*
> +		 * Currently `blame' is the only command which creates
> +		 * OI_CACHED, and direct_cache is only used by `cat-file'.
> +		 */
> +		assert(!oi->direct_cache);

We shouldn't use asserts, but rather use `BUG()` statements in our
codebase. `assert()`s don't help users that run production builds.

>  		if (oi->contentp)
>  			*oi->contentp = xmemdupz(co->buf, co->size);
>  		oi->whence = OI_CACHED;
> diff --git a/object-store-ll.h b/object-store-ll.h
> index b71a15f590..50c5219308 100644
> --- a/object-store-ll.h
> +++ b/object-store-ll.h
> @@ -298,6 +298,13 @@ struct object_info {
>  		OI_PACKED,
>  		OI_DBCACHED
>  	} whence;
> +
> +	/*
> +	 * set if caller is able to use OI_DBCACHED entries without copying
> +	 * TODO OI_CACHED if its use goes beyond blame
> +	 */
> +	unsigned direct_cache:1;
> +

This comment looks unfinished to me.

>  	union {
>  		/*
>  		 * struct {
> diff --git a/packfile.c b/packfile.c
> index 1a409ec142..b2660e14f9 100644
> --- a/packfile.c
> +++ b/packfile.c
> @@ -1362,6 +1362,9 @@ static enum object_type packed_to_object_type(struct repository *r,
>  static struct hashmap delta_base_cache;
>  static size_t delta_base_cached;
>  
> +/* ensures oi->direct_cache is used properly */
> +static int delta_base_cache_lock;
> +

How exactly does it ensure it? What is the intent of this variable and
how would it be used correctly?

>  static LIST_HEAD(delta_base_cache_lru);
>  
>  struct delta_base_cache_key {
> @@ -1444,6 +1447,18 @@ static void detach_delta_base_cache_entry(struct delta_base_cache_entry *ent)
>  	free(ent);
>  }
>  
> +static void lock_delta_base_cache(void)
> +{
> +	delta_base_cache_lock++;
> +	assert(delta_base_cache_lock == 1);
> +}
> +
> +void unlock_delta_base_cache(void)
> +{
> +	delta_base_cache_lock--;
> +	assert(delta_base_cache_lock == 0);
> +}

Hum. So this looks like a pseudo-mutex to me? Are there any code paths
where this may be used in a threaded context? I assume not in the
current state of affairs as we only use it in git-cat-file(1).

>  static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
>  {
>  	free(ent->data);
> @@ -1453,6 +1468,7 @@ static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
>  void clear_delta_base_cache(void)
>  {
>  	struct list_head *lru, *tmp;
> +	assert(!delta_base_cache_lock);
>  	list_for_each_safe(lru, tmp, &delta_base_cache_lru) {
>  		struct delta_base_cache_entry *entry =
>  			list_entry(lru, struct delta_base_cache_entry, lru);
> @@ -1466,6 +1482,7 @@ static void add_delta_base_cache(struct packed_git *p, off_t base_offset,
>  	struct delta_base_cache_entry *ent;
>  	struct list_head *lru, *tmp;
>  
> +	assert(!delta_base_cache_lock);
>  	/*
>  	 * Check required to avoid redundant entries when more than one thread
>  	 * is unpacking the same object, in unpack_entry() (since its phases I
> @@ -1521,11 +1538,16 @@ int packed_object_info(struct repository *r, struct packed_git *p,
>  		if (oi->sizep)
>  			*oi->sizep = ent->size;
>  		if (oi->contentp) {
> -			if (!oi->content_limit ||
> -					ent->size <= oi->content_limit)
> +			/* ignore content_limit if avoiding copy from cache */
> +			if (oi->direct_cache) {
> +				lock_delta_base_cache();
> +				*oi->contentp = ent->data;
> +			} else if (!oi->content_limit ||
> +					ent->size <= oi->content_limit) {
>  				*oi->contentp = xmemdupz(ent->data, ent->size);
> -			else
> +			} else {
>  				*oi->contentp = NULL; /* caller must stream */
> +			}
>  		}
>  	} else if (oi->contentp && !oi->content_limit) {
>  		*oi->contentp = unpack_entry(r, p, obj_offset, &type,

Okay, this hunk is the gist of this patch. Instead of copying over the
delta base, we simply take its data pointer as the content pointer. All
the other infra that you're adding is mostly only added as a safeguard
to make sure that we don't discard the delta base while the object is
getting accessed.

Patrick
Eric Wong July 26, 2024, 7:42 a.m. UTC | #2
Patrick Steinhardt <ps@pks.im> wrote:
> On Mon, Jul 15, 2024 at 12:35:14AM +0000, Eric Wong wrote:
> > For objects already in the delta_base_cache, we can safely use
> > them directly to avoid the malloc+memcpy+free overhead.
> 
> Same here, I feel like you need to explain a bit more in depth what the
> actual idea behind your patch is to help reviewers.

I elaborated more on the speedup gained in the second paragraph
of the commit message:

	... this avoids up to 96MB of duplicated memory in the worst
	case with the default git config.  For a more reasonable 1MB
	delta base object, this eliminates the speed penalty of
	duplicating large objects into memory and speeds up those 1MB
	delta base cached content retrievals by roughly 30%.

> > diff --git a/builtin/cat-file.c b/builtin/cat-file.c
> > index bc4bb89610..769c8b48d2 100644
> > --- a/builtin/cat-file.c
> > +++ b/builtin/cat-file.c
> > @@ -24,6 +24,7 @@
> >  #include "promisor-remote.h"
> >  #include "mailmap.h"
> >  #include "write-or-die.h"
> > +#define USE_DIRECT_CACHE 1
> 
> I'm confused by this. Why do we introduce a macro that is always defined
> to a trueish value? Why don't we just remove the code guarded by this?

I wanted to be able to toggle the feature for comparison during
development.  I can eliminate it for v2.

> >  enum batch_mode {
> >  	BATCH_MODE_CONTENTS,
> > @@ -386,7 +387,18 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d
> >  
> >  	if (data->content) {
> >  		batch_write(opt, data->content, data->size);
> > -		FREE_AND_NULL(data->content);
> > +		switch (data->info.whence) {
> > +		case OI_CACHED: BUG("FIXME OI_CACHED support not done");
> 
> Is this something that will get addressed in a subsequent patch? If so,
> the commit message and the message here should likely mention this. If
> not, we should have a comment here saying why this is fine to be kept.

Not in this series.  I'm not sure if we'll ever need OI_CACHED
support, here.  However, I've been considering a new cache
that can be shared across multiple cat-file processes, but
that'll be a separate series.

> > diff --git a/object-file.c b/object-file.c
> > index 1cc29c3c58..19100e823d 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1586,6 +1586,11 @@ static int do_oid_object_info_extended(struct repository *r,
> >  			oidclr(oi->delta_base_oid, the_repository->hash_algo);
> >  		if (oi->type_name)
> >  			strbuf_addstr(oi->type_name, type_name(co->type));
> > +		/*
> > +		 * Currently `blame' is the only command which creates
> > +		 * OI_CACHED, and direct_cache is only used by `cat-file'.
> > +		 */
> > +		assert(!oi->direct_cache);
> 
> We shouldn't use asserts, but rather use `BUG()` statements in our
> codebase. `assert()`s don't help users that run production builds.

OK.

> >  		if (oi->contentp)
> >  			*oi->contentp = xmemdupz(co->buf, co->size);
> >  		oi->whence = OI_CACHED;
> > diff --git a/object-store-ll.h b/object-store-ll.h
> > index b71a15f590..50c5219308 100644
> > --- a/object-store-ll.h
> > +++ b/object-store-ll.h
> > @@ -298,6 +298,13 @@ struct object_info {
> >  		OI_PACKED,
> >  		OI_DBCACHED
> >  	} whence;
> > +
> > +	/*
> > +	 * set if caller is able to use OI_DBCACHED entries without copying
> > +	 * TODO OI_CACHED if its use goes beyond blame
> > +	 */
> > +	unsigned direct_cache:1;
> > +
> 
> This comment looks unfinished to me.

Yeah.  I'll elaborate that it's only intended for cat-file atm and
would break if blame (or other callers) used it.

> >  	union {
> >  		/*
> >  		 * struct {
> > diff --git a/packfile.c b/packfile.c
> > index 1a409ec142..b2660e14f9 100644
> > --- a/packfile.c
> > +++ b/packfile.c
> > @@ -1362,6 +1362,9 @@ static enum object_type packed_to_object_type(struct repository *r,
> >  static struct hashmap delta_base_cache;
> >  static size_t delta_base_cached;
> >  
> > +/* ensures oi->direct_cache is used properly */
> > +static int delta_base_cache_lock;
> > +
> 
> How exactly does it ensure it? What is the intent of this variable and
> how would it be used correctly?

It prevents multiple cache entries from being acquired at once.

> > +static void lock_delta_base_cache(void)
> > +{
> > +	delta_base_cache_lock++;
> > +	assert(delta_base_cache_lock == 1);
> > +}
> > +
> > +void unlock_delta_base_cache(void)
> > +{
> > +	delta_base_cache_lock--;
> > +	assert(delta_base_cache_lock == 0);
> > +}
> 
> Hum. So this looks like a pseudo-mutex to me? Are there any code paths
> where this may be used in a threaded context? I assume not in the
> current state of affairs as we only use it in git-cat-file(1).

No parallelism or threads at all.  It's to ensure callers can't
load multiple entries at the same time since retrieving a delta
base cache entry could invalidate an entry that's already
acquired for use.

> >  static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
> >  {
> >  	free(ent->data);
> > @@ -1453,6 +1468,7 @@ static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
> >  void clear_delta_base_cache(void)
> >  {
> >  	struct list_head *lru, *tmp;
> > +	assert(!delta_base_cache_lock);
> >  	list_for_each_safe(lru, tmp, &delta_base_cache_lru) {
> >  		struct delta_base_cache_entry *entry =
> >  			list_entry(lru, struct delta_base_cache_entry, lru);
> > @@ -1466,6 +1482,7 @@ static void add_delta_base_cache(struct packed_git *p, off_t base_offset,
> >  	struct delta_base_cache_entry *ent;
> >  	struct list_head *lru, *tmp;
> >  
> > +	assert(!delta_base_cache_lock);
> >  	/*
> >  	 * Check required to avoid redundant entries when more than one thread
> >  	 * is unpacking the same object, in unpack_entry() (since its phases I
> > @@ -1521,11 +1538,16 @@ int packed_object_info(struct repository *r, struct packed_git *p,
> >  		if (oi->sizep)
> >  			*oi->sizep = ent->size;
> >  		if (oi->contentp) {
> > -			if (!oi->content_limit ||
> > -					ent->size <= oi->content_limit)
> > +			/* ignore content_limit if avoiding copy from cache */
> > +			if (oi->direct_cache) {
> > +				lock_delta_base_cache();
> > +				*oi->contentp = ent->data;
> > +			} else if (!oi->content_limit ||
> > +					ent->size <= oi->content_limit) {
> >  				*oi->contentp = xmemdupz(ent->data, ent->size);
> > -			else
> > +			} else {
> >  				*oi->contentp = NULL; /* caller must stream */
> > +			}
> >  		}
> >  	} else if (oi->contentp && !oi->content_limit) {
> >  		*oi->contentp = unpack_entry(r, p, obj_offset, &type,
> 
> Okay, this hunk is the gist of this patch. Instead of copying over the
> delta base, we simply take its data pointer as the content pointer. All
> the other infra that you're adding is mostly only added as a safeguard
> to make sure that we don't discard the delta base while the object is
> getting accessed.

Right.  I'll switch the asserts to BUG calls for v2.
Eric Wong Aug. 18, 2024, 5:36 p.m. UTC | #3
Eric Wong <e@80x24.org> wrote:
> Patrick Steinhardt <ps@pks.im> wrote:
> > We shouldn't use asserts, but rather use `BUG()` statements in our
> > codebase. `assert()`s don't help users that run production builds.
> 
> OK.

Thinking about this more, I still favor assert() in common code
paths since it's only meant to be used during development and
later removed or neutralized (via -DNDEBUG).

IOW, I treat assert() as scaffolding that can/should later be
removed once the code is proven to work well.  We also have
plenty of existing asserts in our codebase.

Furthermore, assert() is also a well known API which reduces the
learning curve for drive-by hackers (I still consider myself
a drive-by since I do minimal C).
Junio C Hamano Aug. 19, 2024, 3:50 p.m. UTC | #4
Eric Wong <e@80x24.org> writes:

> Eric Wong <e@80x24.org> wrote:
>> Patrick Steinhardt <ps@pks.im> wrote:
>> > We shouldn't use asserts, but rather use `BUG()` statements in our
>> > codebase. `assert()`s don't help users that run production builds.
>> 
>> OK.
>
> Thinking about this more, I still favor assert() in common code
> paths since it's only meant to be used during development and
> later removed or neutralized (via -DNDEBUG).

I have mixed feelings.

I agree that assert() is only meant to be used during development,
but we only want code that is polished enough to be added to the
system, so there is no place for assert() in the code in 'master'.

The point of BUG() is that it is not easily "neutralized", so it can
help safeguarding the production code from harming the end-user data
due to doing nonsense things without noticing that the precondition
is not satisfied for it to perform correctly.  It makes sense to
have it in both during development and after deployment.
diff mbox series

Patch

diff --git a/builtin/cat-file.c b/builtin/cat-file.c
index bc4bb89610..769c8b48d2 100644
--- a/builtin/cat-file.c
+++ b/builtin/cat-file.c
@@ -24,6 +24,7 @@ 
 #include "promisor-remote.h"
 #include "mailmap.h"
 #include "write-or-die.h"
+#define USE_DIRECT_CACHE 1
 
 enum batch_mode {
 	BATCH_MODE_CONTENTS,
@@ -386,7 +387,18 @@  static void print_object_or_die(struct batch_options *opt, struct expand_data *d
 
 	if (data->content) {
 		batch_write(opt, data->content, data->size);
-		FREE_AND_NULL(data->content);
+		switch (data->info.whence) {
+		case OI_CACHED: BUG("FIXME OI_CACHED support not done");
+		case OI_LOOSE:
+		case OI_PACKED:
+			FREE_AND_NULL(data->content);
+			break;
+		case OI_DBCACHED:
+			if (USE_DIRECT_CACHE)
+				unlock_delta_base_cache();
+			else
+				FREE_AND_NULL(data->content);
+		}
 	} else if (data->type == OBJ_BLOB) {
 		if (opt->buffer_output)
 			fflush(stdout);
@@ -815,6 +827,7 @@  static int batch_objects(struct batch_options *opt)
 			data.info.sizep = &data.size;
 			data.info.contentp = &data.content;
 			data.info.content_limit = big_file_threshold;
+			data.info.direct_cache = USE_DIRECT_CACHE;
 		}
 	}
 
diff --git a/object-file.c b/object-file.c
index 1cc29c3c58..19100e823d 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1586,6 +1586,11 @@  static int do_oid_object_info_extended(struct repository *r,
 			oidclr(oi->delta_base_oid, the_repository->hash_algo);
 		if (oi->type_name)
 			strbuf_addstr(oi->type_name, type_name(co->type));
+		/*
+		 * Currently `blame' is the only command which creates
+		 * OI_CACHED, and direct_cache is only used by `cat-file'.
+		 */
+		assert(!oi->direct_cache);
 		if (oi->contentp)
 			*oi->contentp = xmemdupz(co->buf, co->size);
 		oi->whence = OI_CACHED;
diff --git a/object-store-ll.h b/object-store-ll.h
index b71a15f590..50c5219308 100644
--- a/object-store-ll.h
+++ b/object-store-ll.h
@@ -298,6 +298,13 @@  struct object_info {
 		OI_PACKED,
 		OI_DBCACHED
 	} whence;
+
+	/*
+	 * set if caller is able to use OI_DBCACHED entries without copying
+	 * TODO OI_CACHED if its use goes beyond blame
+	 */
+	unsigned direct_cache:1;
+
 	union {
 		/*
 		 * struct {
diff --git a/packfile.c b/packfile.c
index 1a409ec142..b2660e14f9 100644
--- a/packfile.c
+++ b/packfile.c
@@ -1362,6 +1362,9 @@  static enum object_type packed_to_object_type(struct repository *r,
 static struct hashmap delta_base_cache;
 static size_t delta_base_cached;
 
+/* ensures oi->direct_cache is used properly */
+static int delta_base_cache_lock;
+
 static LIST_HEAD(delta_base_cache_lru);
 
 struct delta_base_cache_key {
@@ -1444,6 +1447,18 @@  static void detach_delta_base_cache_entry(struct delta_base_cache_entry *ent)
 	free(ent);
 }
 
+static void lock_delta_base_cache(void)
+{
+	delta_base_cache_lock++;
+	assert(delta_base_cache_lock == 1);
+}
+
+void unlock_delta_base_cache(void)
+{
+	delta_base_cache_lock--;
+	assert(delta_base_cache_lock == 0);
+}
+
 static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
 {
 	free(ent->data);
@@ -1453,6 +1468,7 @@  static inline void release_delta_base_cache(struct delta_base_cache_entry *ent)
 void clear_delta_base_cache(void)
 {
 	struct list_head *lru, *tmp;
+	assert(!delta_base_cache_lock);
 	list_for_each_safe(lru, tmp, &delta_base_cache_lru) {
 		struct delta_base_cache_entry *entry =
 			list_entry(lru, struct delta_base_cache_entry, lru);
@@ -1466,6 +1482,7 @@  static void add_delta_base_cache(struct packed_git *p, off_t base_offset,
 	struct delta_base_cache_entry *ent;
 	struct list_head *lru, *tmp;
 
+	assert(!delta_base_cache_lock);
 	/*
 	 * Check required to avoid redundant entries when more than one thread
 	 * is unpacking the same object, in unpack_entry() (since its phases I
@@ -1521,11 +1538,16 @@  int packed_object_info(struct repository *r, struct packed_git *p,
 		if (oi->sizep)
 			*oi->sizep = ent->size;
 		if (oi->contentp) {
-			if (!oi->content_limit ||
-					ent->size <= oi->content_limit)
+			/* ignore content_limit if avoiding copy from cache */
+			if (oi->direct_cache) {
+				lock_delta_base_cache();
+				*oi->contentp = ent->data;
+			} else if (!oi->content_limit ||
+					ent->size <= oi->content_limit) {
 				*oi->contentp = xmemdupz(ent->data, ent->size);
-			else
+			} else {
 				*oi->contentp = NULL; /* caller must stream */
+			}
 		}
 	} else if (oi->contentp && !oi->content_limit) {
 		*oi->contentp = unpack_entry(r, p, obj_offset, &type,
diff --git a/packfile.h b/packfile.h
index eb18ec15db..94941bbe80 100644
--- a/packfile.h
+++ b/packfile.h
@@ -210,4 +210,8 @@  int is_promisor_object(const struct object_id *oid);
 int load_idx(const char *path, const unsigned int hashsz, void *idx_map,
 	     size_t idx_size, struct packed_git *p);
 
+/*
+ * release lock acquired via oi->direct_cache
+ */
+void unlock_delta_base_cache(void);
 #endif