[v4,02/15] bloom.c: add the murmur3 hash implementation
diff mbox series

Message ID a5aa3415c05ee9bc67a9471445a20c71a9834673.1586192395.git.gitgitgadget@gmail.com
State New
Headers show
Series
  • Changed Paths Bloom Filters
Related show

Commit Message

Marco Trevisan via GitGitGadget April 6, 2020, 4:59 p.m. UTC
From: Garima Singh <garima.singh@microsoft.com>

In preparation for computing changed paths Bloom filters,
implement the Murmur3 hash algorithm as described in [1].
It hashes the given data using the given seed and produces
a uniformly distributed hash value.

[1] https://en.wikipedia.org/wiki/MurmurHash#Algorithm

Helped-by: Derrick Stolee <dstolee@microsoft.com>
Helped-by: Szeder Gábor <szeder.dev@gmail.com>
Reviewed-by: Jakub Narębski <jnareb@gmail.com>
Signed-off-by: Garima Singh <garima.singh@microsoft.com>
---
 Makefile              |  2 ++
 bloom.c               | 73 +++++++++++++++++++++++++++++++++++++++++++
 bloom.h               | 13 ++++++++
 t/helper/test-bloom.c | 13 ++++++++
 t/helper/test-tool.c  |  1 +
 t/helper/test-tool.h  |  1 +
 t/t0095-bloom.sh      | 30 ++++++++++++++++++
 7 files changed, 133 insertions(+)
 create mode 100644 bloom.c
 create mode 100644 bloom.h
 create mode 100644 t/helper/test-bloom.c
 create mode 100755 t/t0095-bloom.sh

Patch
diff mbox series

diff --git a/Makefile b/Makefile
index ef1ff2228f0..491f75e68c5 100644
--- a/Makefile
+++ b/Makefile
@@ -695,6 +695,7 @@  X =
 PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS))
 
 TEST_BUILTINS_OBJS += test-advise.o
+TEST_BUILTINS_OBJS += test-bloom.o
 TEST_BUILTINS_OBJS += test-chmtime.o
 TEST_BUILTINS_OBJS += test-config.o
 TEST_BUILTINS_OBJS += test-ctype.o
@@ -840,6 +841,7 @@  LIB_OBJS += base85.o
 LIB_OBJS += bisect.o
 LIB_OBJS += blame.o
 LIB_OBJS += blob.o
+LIB_OBJS += bloom.o
 LIB_OBJS += branch.o
 LIB_OBJS += bulk-checkin.o
 LIB_OBJS += bundle.o
diff --git a/bloom.c b/bloom.c
new file mode 100644
index 00000000000..40e87632aeb
--- /dev/null
+++ b/bloom.c
@@ -0,0 +1,73 @@ 
+#include "git-compat-util.h"
+#include "bloom.h"
+
+static uint32_t rotate_left(uint32_t value, int32_t count)
+{
+	uint32_t mask = 8 * sizeof(uint32_t) - 1;
+	count &= mask;
+	return ((value << count) | (value >> ((-count) & mask)));
+}
+
+/*
+ * Calculate the murmur3 32-bit hash value for the given data
+ * using the given seed.
+ * Produces a uniformly distributed hash value.
+ * Not considered to be cryptographically secure.
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ */
+uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
+{
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+	const uint32_t r1 = 15;
+	const uint32_t r2 = 13;
+	const uint32_t m = 5;
+	const uint32_t n = 0xe6546b64;
+	int i;
+	uint32_t k1 = 0;
+	const char *tail;
+
+	int len4 = len / sizeof(uint32_t);
+
+	uint32_t k;
+	for (i = 0; i < len4; i++) {
+		uint32_t byte1 = (uint32_t)data[4*i];
+		uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
+		uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
+		uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
+		k = byte1 | byte2 | byte3 | byte4;
+		k *= c1;
+		k = rotate_left(k, r1);
+		k *= c2;
+
+		seed ^= k;
+		seed = rotate_left(seed, r2) * m + n;
+	}
+
+	tail = (data + len4 * sizeof(uint32_t));
+
+	switch (len & (sizeof(uint32_t) - 1)) {
+	case 3:
+		k1 ^= ((uint32_t)tail[2]) << 16;
+		/*-fallthrough*/
+	case 2:
+		k1 ^= ((uint32_t)tail[1]) << 8;
+		/*-fallthrough*/
+	case 1:
+		k1 ^= ((uint32_t)tail[0]) << 0;
+		k1 *= c1;
+		k1 = rotate_left(k1, r1);
+		k1 *= c2;
+		seed ^= k1;
+		break;
+	}
+
+	seed ^= (uint32_t)len;
+	seed ^= (seed >> 16);
+	seed *= 0x85ebca6b;
+	seed ^= (seed >> 13);
+	seed *= 0xc2b2ae35;
+	seed ^= (seed >> 16);
+
+	return seed;
+}
\ No newline at end of file
diff --git a/bloom.h b/bloom.h
new file mode 100644
index 00000000000..d0fcc5f0aa6
--- /dev/null
+++ b/bloom.h
@@ -0,0 +1,13 @@ 
+#ifndef BLOOM_H
+#define BLOOM_H
+
+/*
+ * Calculate the murmur3 32-bit hash value for the given data
+ * using the given seed.
+ * Produces a uniformly distributed hash value.
+ * Not considered to be cryptographically secure.
+ * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
+ */
+uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
+
+#endif
\ No newline at end of file
diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c
new file mode 100644
index 00000000000..60ee2043689
--- /dev/null
+++ b/t/helper/test-bloom.c
@@ -0,0 +1,13 @@ 
+#include "git-compat-util.h"
+#include "bloom.h"
+#include "test-tool.h"
+
+int cmd__bloom(int argc, const char **argv)
+{
+	if (!strcmp(argv[1], "get_murmur3")) {
+		uint32_t hashed = murmur3_seeded(0, argv[2], strlen(argv[2]));
+		printf("Murmur3 Hash with seed=0:0x%08x\n", hashed);
+	}
+
+	return 0;
+}
\ No newline at end of file
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 31eedcd241f..6e26bd65c97 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -15,6 +15,7 @@  struct test_cmd {
 
 static struct test_cmd cmds[] = {
 	{ "advise", cmd__advise_if_enabled },
+	{ "bloom", cmd__bloom },
 	{ "chmtime", cmd__chmtime },
 	{ "config", cmd__config },
 	{ "ctype", cmd__ctype },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 4eb5e6609e1..dceeef1d5c2 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -5,6 +5,7 @@ 
 #include "git-compat-util.h"
 
 int cmd__advise_if_enabled(int argc, const char **argv);
+int cmd__bloom(int argc, const char **argv);
 int cmd__chmtime(int argc, const char **argv);
 int cmd__config(int argc, const char **argv);
 int cmd__ctype(int argc, const char **argv);
diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh
new file mode 100755
index 00000000000..2dad8c4a94e
--- /dev/null
+++ b/t/t0095-bloom.sh
@@ -0,0 +1,30 @@ 
+#!/bin/sh
+
+test_description='Testing the various Bloom filter computations in bloom.c'
+. ./test-lib.sh
+
+test_expect_success 'compute unseeded murmur3 hash for empty string' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x00000000
+	EOF
+	test-tool bloom get_murmur3 "" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute unseeded murmur3 hash for test string 1' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x627b0c2c
+	EOF
+	test-tool bloom get_murmur3 "Hello world!" >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'compute unseeded murmur3 hash for test string 2' '
+	cat >expect <<-\EOF &&
+	Murmur3 Hash with seed=0:0x2e4ff723
+	EOF
+	test-tool bloom get_murmur3 "The quick brown fox jumps over the lazy dog" >actual &&
+	test_cmp expect actual
+'
+
+test_done
\ No newline at end of file