diff mbox series

[v2,18/22] pickaxe -S: slightly optimize contains()

Message ID 20210216115801.4773-19-avarab@gmail.com (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Ævar Arnfjörð Bjarmason Feb. 16, 2021, 11:57 a.m. UTC
When the "log -S<pat>" switch counts occurrences of <pat> on the
pre-image and post-image of a change. As soon as we know we had e.g. 1
before and 2 now we can stop, we don't need to keep counting past 2.

Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
---
 diffcore-pickaxe.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

Comments

Junio C Hamano March 30, 2021, 11:58 p.m. UTC | #1
Ævar Arnfjörð Bjarmason  <avarab@gmail.com> writes:

> When the "log -S<pat>" switch counts occurrences of <pat> on the
> pre-image and post-image of a change. As soon as we know we had e.g. 1
> before and 2 now we can stop, we don't need to keep counting past 2.

Logical.

I do not know how much difference this would make in practice,
though.  The performance characteristics between "diff A B"
and "diff B A" with the same pickaxe -Sneedle would be asymmetric
with this optimization, which is a bit curious (but not incorrect).

I wonder if there is a good heuristics to decide which, between one
and two, blob to start counting.  Obviously, scanning the one that
is likely to contain fewer occurrence of the needle, before we can
employ this optimization to the other side, is what we want.

>
> Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> ---
>  diffcore-pickaxe.c | 13 ++++++++++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
> index 66e34d254f1..76c178bae2b 100644
> --- a/diffcore-pickaxe.c
> +++ b/diffcore-pickaxe.c
> @@ -68,7 +68,8 @@ static int diff_grep(mmfile_t *one, mmfile_t *two,
>  	return ecbdata.hit;
>  }
>  
> -static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
> +static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws,
> +			     unsigned int limit)
>  {
>  	unsigned int cnt = 0;
>  	unsigned long sz = mf->size;
> @@ -88,6 +89,9 @@ static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
>  				sz--;
>  			}
>  			cnt++;
> +
> +			if (limit && cnt == limit)
> +				return cnt;
>  		}
>  
>  	} else { /* Classic exact string match */
> @@ -99,6 +103,9 @@ static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
>  			sz -= offset + kwsm.size[0];
>  			data += offset + kwsm.size[0];
>  			cnt++;
> +
> +			if (limit && cnt == limit)
> +				return cnt;
>  		}
>  	}
>  	return cnt;
> @@ -108,8 +115,8 @@ static int has_changes(mmfile_t *one, mmfile_t *two,
>  		       struct diff_options *o,
>  		       regex_t *regexp, kwset_t kws)
>  {
> -	unsigned int c1 = one ? contains(one, regexp, kws) : 0;
> -	unsigned int c2 = two ? contains(two, regexp, kws) : 0;
> +	unsigned int c1 = one ? contains(one, regexp, kws, 0) : 0;
> +	unsigned int c2 = two ? contains(two, regexp, kws, c1 + 1) : 0;
>  	return c1 != c2;
>  }
diff mbox series

Patch

diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c
index 66e34d254f1..76c178bae2b 100644
--- a/diffcore-pickaxe.c
+++ b/diffcore-pickaxe.c
@@ -68,7 +68,8 @@  static int diff_grep(mmfile_t *one, mmfile_t *two,
 	return ecbdata.hit;
 }
 
-static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
+static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws,
+			     unsigned int limit)
 {
 	unsigned int cnt = 0;
 	unsigned long sz = mf->size;
@@ -88,6 +89,9 @@  static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
 				sz--;
 			}
 			cnt++;
+
+			if (limit && cnt == limit)
+				return cnt;
 		}
 
 	} else { /* Classic exact string match */
@@ -99,6 +103,9 @@  static unsigned int contains(mmfile_t *mf, regex_t *regexp, kwset_t kws)
 			sz -= offset + kwsm.size[0];
 			data += offset + kwsm.size[0];
 			cnt++;
+
+			if (limit && cnt == limit)
+				return cnt;
 		}
 	}
 	return cnt;
@@ -108,8 +115,8 @@  static int has_changes(mmfile_t *one, mmfile_t *two,
 		       struct diff_options *o,
 		       regex_t *regexp, kwset_t kws)
 {
-	unsigned int c1 = one ? contains(one, regexp, kws) : 0;
-	unsigned int c2 = two ? contains(two, regexp, kws) : 0;
+	unsigned int c1 = one ? contains(one, regexp, kws, 0) : 0;
+	unsigned int c2 = two ? contains(two, regexp, kws, c1 + 1) : 0;
 	return c1 != c2;
 }