diff mbox series

[v6,26/28] t/lib-unicode-nfc-nfd: helper prereqs for testing unicode nfc/nfd

Message ID 8278f32c4d894d4930b9f1f70f3aa01679e2011e.1650662994.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series Builtin FSMonitor Part 3 | expand

Commit Message

Jeff Hostetler April 22, 2022, 9:29 p.m. UTC
From: Jeff Hostetler <jeffhost@microsoft.com>

Create a set of prereqs to help understand how file names
are handled by the filesystem when they contain NFC and NFD
Unicode characters.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 t/lib-unicode-nfc-nfd.sh | 167 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100755 t/lib-unicode-nfc-nfd.sh

Comments

Johannes Schindelin May 12, 2022, 3:26 p.m. UTC | #1
Hi Jeff,

On Fri, 22 Apr 2022, Jeff Hostetler via GitGitGadget wrote:

> From: Jeff Hostetler <jeffhost@microsoft.com>
>
> Create a set of prereqs to help understand how file names
> are handled by the filesystem when they contain NFC and NFD
> Unicode characters.
>
> Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
> ---
>  t/lib-unicode-nfc-nfd.sh | 167 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 167 insertions(+)
>  create mode 100755 t/lib-unicode-nfc-nfd.sh
>
> diff --git a/t/lib-unicode-nfc-nfd.sh b/t/lib-unicode-nfc-nfd.sh
> new file mode 100755
> index 00000000000..cf9c26d1e22
> --- /dev/null
> +++ b/t/lib-unicode-nfc-nfd.sh
> @@ -0,0 +1,167 @@
> +# Help detect how Unicode NFC and NFD are handled on the filesystem.
> +
> +# A simple character that has a NFD form.
> +#
> +# NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
> +# UTF8(NFC): \xc3 \xa9
> +#
> +# NFD:       U+0065 LATIN SMALL LETTER E
> +#            U+0301 COMBINING ACUTE ACCENT
> +# UTF8(NFD): \x65  +  \xcc \x81
> +#
> +utf8_nfc=$(printf "\xc3\xa9")
> +utf8_nfd=$(printf "\x65\xcc\x81")
> +
> +# Is the OS or the filesystem "Unicode composition sensitive"?
> +#
> +# That is, does the OS or the filesystem allow files to exist with
> +# both the NFC and NFD spellings?  Or, does the OS/FS lie to us and
> +# tell us that the NFC and NFD forms are equivalent.
> +#
> +# This is or may be independent of what type of filesystem we have,
> +# since it might be handled by the OS at a layer above the FS.
> +# Testing shows on MacOS using APFS, HFS+, and FAT32 reports a
> +# collision, for example.
> +#
> +# This does not tell us how the Unicode pathname will be spelled
> +# on disk, but rather only that the two spelling "collide".  We
> +# will examine the actual on disk spelling in a later prereq.
> +#
> +test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
> +	mkdir trial_${utf8_nfc} &&
> +	mkdir trial_${utf8_nfd}
> +'
> +
> +# Is the spelling of an NFC pathname preserved on disk?
> +#
> +# On MacOS with HFS+ and FAT32, NFC paths are converted into NFD
> +# and on APFS, NFC paths are preserved.  As we have established
> +# above, this is independent of "composition sensitivity".
> +#
> +# 0000000 63 5f c3 a9
> +#
> +# (/usr/bin/od output contains different amount of whitespace
> +# on different platforms, so we need the wildcards here.)
> +#
> +test_lazy_prereq UNICODE_NFC_PRESERVED '
> +	mkdir c_${utf8_nfc} &&
> +	ls | od -t x1 | grep "63 *5f *c3 *a9"

As far as I can see, this would be the first usage of `od` in the test
suite. I'd actually like to reduce our dependency on Unix-y tools, not
increase it.

One thing we could do would be to imitate t4030, and introduce a shell
function that calls Perl, something like:

	# Read all of stdin and print every byte as two hex digits plus a
	# space.  The /s modifier lets "." match newlines too, so 0x0a bytes
	# are converted instead of being passed through raw.
	bin2hex () {
		perl -e '
			$/ = undef;
			$_ = <>;
			s/./sprintf("%02x ", ord($&))/gse;
			print $_
		'
	}

But it has been a thorn in my side for quite a few years already that we
_require_ Perl, even in NO_PERL builds.

So maybe a much better idea would be to introduce a small helper in
`t/helper/` that converts binary data on stdin to hex on stdout? Something
like this:

-- snip --
From bee2a3c43c90683b3e86e1739361570cce76d382 Mon Sep 17 00:00:00 2001
From: Johannes Schindelin <johannes.schindelin@gmx.de>
Date: Thu, 12 May 2022 17:24:50 +0200
Subject: [PATCH] tests: add a helper to print a hexdump

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
---
 Makefile                |  1 +
 t/helper/test-hexdump.c | 24 ++++++++++++++++++++++++
 t/helper/test-tool.c    |  1 +
 t/helper/test-tool.h    |  1 +
 4 files changed, 27 insertions(+)
 create mode 100644 t/helper/test-hexdump.c

diff --git a/Makefile b/Makefile
index 4a23508d16f..fc262f99a1f 100644
--- a/Makefile
+++ b/Makefile
@@ -708,6 +708,7 @@ TEST_BUILTINS_OBJS += test-getcwd.o
 TEST_BUILTINS_OBJS += test-hash-speed.o
 TEST_BUILTINS_OBJS += test-hash.o
 TEST_BUILTINS_OBJS += test-hashmap.o
+TEST_BUILTINS_OBJS += test-hexdump.o
 TEST_BUILTINS_OBJS += test-index-version.o
 TEST_BUILTINS_OBJS += test-json-writer.o
 TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
diff --git a/t/helper/test-hexdump.c b/t/helper/test-hexdump.c
new file mode 100644
index 00000000000..13f154d9fa7
--- /dev/null
+++ b/t/helper/test-hexdump.c
@@ -0,0 +1,24 @@
+#include "test-tool.h"
+#include "git-compat-util.h"
+
+/*
+ * Read stdin until no more data is returned and print every byte to
+ * stdout as two lowercase hex digits plus a space.  Dies on read error.
+ */
+int cmd__hexdump(int argc, const char **argv)
+{
+	char buf[1024];
+	ssize_t i, len;
+
+	for (;;) {
+		len = xread(0, buf, sizeof(buf));
+		if (len < 0)
+			die_errno("failure reading stdin");
+		if (!len)
+			break;
+		/* plain char may be signed: 0xc3 would print as "ffffffc3" */
+		for (i = 0; i < len; i++)
+			printf("%02x ", (unsigned char)buf[i]);
+	}
+	return 0;
+}
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index 3ce5585e53a..44bd8269a07 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -35,6 +35,7 @@ static struct test_cmd cmds[] = {
 	{ "genzeros", cmd__genzeros },
 	{ "getcwd", cmd__getcwd },
 	{ "hashmap", cmd__hashmap },
+	{ "hexdump", cmd__hexdump },
 	{ "hash-speed", cmd__hash_speed },
 	{ "index-version", cmd__index_version },
 	{ "json-writer", cmd__json_writer },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index 9f0f5228508..8ec30136913 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -25,6 +25,7 @@ int cmd__genrandom(int argc, const char **argv);
 int cmd__genzeros(int argc, const char **argv);
 int cmd__getcwd(int argc, const char **argv);
 int cmd__hashmap(int argc, const char **argv);
+int cmd__hexdump(int argc, const char **argv);
 int cmd__hash_speed(int argc, const char **argv);
 int cmd__index_version(int argc, const char **argv);
 int cmd__json_writer(int argc, const char **argv);
-- snap --

Other than the `od` usage, this patch looks good to me.

Thank you very much for driving FSMonitor forward!
Dscho

> +'
> +
> +# Is the spelling of an NFD pathname preserved on disk?
> +#
> +# 0000000 64 5f 65 cc 81
> +#
> +test_lazy_prereq UNICODE_NFD_PRESERVED '
> +	mkdir d_${utf8_nfd} &&
> +	ls | od -t x1 | grep "64 *5f *65 *cc *81"
> +'
> +	mkdir c_${utf8_nfc} &&
> +	mkdir d_${utf8_nfd} &&
> +
> +# The following _DOUBLE_ forms are more for my curiosity,
> +# but there may be quirks lurking when there are multiple
> +# combining characters in non-canonical order.
> +
> +# Unicode also allows multiple combining characters
> +# that can be decomposed in pieces.
> +#
> +# NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
> +# UTF8(NFC):  \xe1 \xbd \xa7
> +#
> +# NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
> +#             U+0342 COMBINING GREEK PERISPOMENI
> +# UTF8(NFD1): \xe1 \xbd \xa1  +  \xcd \x82
> +#
> +# But U+1f61 decomposes into
> +# NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
> +#             U+0314 COMBINING REVERSED COMMA ABOVE
> +# UTF8(NFD2): \xcf \x89  +  \xcc \x94
> +#
> +# Yielding:   \xcf \x89  +  \xcc \x94  +  \xcd \x82
> +#
> +# Note that I've used the canonical ordering of the
> +# combinining characters.  It is also possible to
> +# swap them.  My testing shows that that non-standard
> +# ordering also causes a collision in mkdir.  However,
> +# the resulting names don't draw correctly on the
> +# terminal (implying that the on-disk format also has
> +# them out of order).
> +#
> +greek_nfc=$(printf "\xe1\xbd\xa7")
> +greek_nfd1=$(printf "\xe1\xbd\xa1\xcd\x82")
> +greek_nfd2=$(printf "\xcf\x89\xcc\x94\xcd\x82")
> +
> +# See if a double decomposition also collides.
> +#
> +test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
> +	mkdir trial_${greek_nfc} &&
> +	mkdir trial_${greek_nfd2}
> +'
> +
> +# See if the NFC spelling appears on the disk.
> +#
> +test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
> +	mkdir c_${greek_nfc} &&
> +	ls | od -t x1 | grep "63 *5f *e1 *bd *a7"
> +'
> +
> +# See if the NFD spelling appears on the disk.
> +#
> +test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
> +	mkdir d_${greek_nfd2} &&
> +	ls | od -t x1 | grep "64 *5f *cf *89 *cc *94 *cd *82"
> +'
> +
> +# The following is for debugging. I found it useful when
> +# trying to understand the various (OS, FS) quirks WRT
> +# Unicode and how composition/decomposition is handled.
> +# For example, when trying to understand how (macOS, APFS)
> +# and (macOS, HFS) and (macOS, FAT32) compare.
> +#
> +# It is rather noisy, so it is disabled by default.
> +#
> +if test "$unicode_debug" = "true"
> +then
> +	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
> +	then
> +		echo NFC and NFD are distinct on this OS/filesystem.
> +	else
> +		echo NFC and NFD are aliases on this OS/filesystem.
> +	fi
> +
> +	if test_have_prereq UNICODE_NFC_PRESERVED
> +	then
> +		echo NFC maintains original spelling.
> +	else
> +		echo NFC is modified.
> +	fi
> +
> +	if test_have_prereq UNICODE_NFD_PRESERVED
> +	then
> +		echo NFD maintains original spelling.
> +	else
> +		echo NFD is modified.
> +	fi
> +
> +	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
> +	then
> +		echo DOUBLE NFC and NFD are distinct on this OS/filesystem.
> +	else
> +		echo DOUBLE NFC and NFD are aliases on this OS/filesystem.
> +	fi
> +
> +	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
> +	then
> +		echo Double NFC maintains original spelling.
> +	else
> +		echo Double NFC is modified.
> +	fi
> +
> +	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
> +	then
> +		echo Double NFD maintains original spelling.
> +	else
> +		echo Double NFD is modified.
> +	fi
> +fi
> --
> gitgitgadget
>
>
Jeff Hostetler May 17, 2022, 9:14 p.m. UTC | #2
On 5/12/22 11:26 AM, Johannes Schindelin wrote:
> Hi Jeff,
> 
> On Fri, 22 Apr 2022, Jeff Hostetler via GitGitGadget wrote:
> 
>> From: Jeff Hostetler <jeffhost@microsoft.com>
>>
[...]
>> +#
>> +test_lazy_prereq UNICODE_NFC_PRESERVED '
>> +	mkdir c_${utf8_nfc} &&
>> +	ls | od -t x1 | grep "63 *5f *c3 *a9"
> 
> As far as I can see, this would be the first usage of `od` in the test
> suite. I'd actually like to reduce our dependency on Unix-y tools, not
> increase it.
> 
> One thing we could do would be to imitate t4030, and introduce a shell
> function that calls Perl, something like:
> 
> 	bin2hex () {
> 		perl -e '
> 			$/ = undef;
> 			$_ = <>;
> 			s/./sprintf("%02x ", ord($&))/ge;
> 			print $_
> 		'
> 	}
> 
> But it is a thorn in my side for quite a few years already that we
> _require_ Perl, even in NO_PERL builds.
> 
> So maybe a much better idea would be to introduce a small helper in
> `t/helper/` that converts binary data on stdin to hex on stdout? Something
> like this:
> 

Yeah, let's add the hexdump helper.

Thanks
Jeff
diff mbox series

Patch

diff --git a/t/lib-unicode-nfc-nfd.sh b/t/lib-unicode-nfc-nfd.sh
new file mode 100755
index 00000000000..cf9c26d1e22
--- /dev/null
+++ b/t/lib-unicode-nfc-nfd.sh
@@ -0,0 +1,167 @@ 
+# Help detect how Unicode NFC and NFD are handled on the filesystem.
+
+# A simple character that has an NFD form.
+#
+# NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
+# UTF8(NFC): \xc3 \xa9
+#
+# NFD:       U+0065 LATIN SMALL LETTER E
+#            U+0301 COMBINING ACUTE ACCENT
+# UTF8(NFD): \x65  +  \xcc \x81
+# (Octal escapes are used below; printf "\xHH" is not portable.)
+utf8_nfc=$(printf "\303\251")
+utf8_nfd=$(printf "\145\314\201")
+
+# Is the OS or the filesystem "Unicode composition sensitive"?
+#
+# That is, does the OS or the filesystem allow files to exist with
+# both the NFC and NFD spellings?  Or, does the OS/FS lie to us and
+# tell us that the NFC and NFD forms are equivalent?
+#
+# This is or may be independent of what type of filesystem we have,
+# since it might be handled by the OS at a layer above the FS.
+# For example, testing on MacOS shows that APFS, HFS+, and FAT32
+# all report a collision.
+#
+# This does not tell us how the Unicode pathname will be spelled
+# on disk, but rather only whether the two spellings "collide".  We
+# will examine the actual on-disk spelling in a later prereq.
+#
+test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
+	mkdir trial_${utf8_nfc} &&
+	mkdir trial_${utf8_nfd}
+'
+
+# Is the spelling of an NFC pathname preserved on disk?
+#
+# On MacOS with HFS+ and FAT32, NFC paths are converted into NFD
+# and on APFS, NFC paths are preserved.  As we have established
+# above, this is independent of "composition sensitivity".
+# The od output we look for ("c_" followed by the NFC bytes):
+# 0000000 63 5f c3 a9
+#
+# (/usr/bin/od output contains different amounts of whitespace
+# on different platforms, so we need the wildcards here.)
+#
+test_lazy_prereq UNICODE_NFC_PRESERVED '
+	mkdir c_${utf8_nfc} &&
+	ls | od -t x1 | grep "63 *5f *c3 *a9"
+'
+
+# Is the spelling of an NFD pathname preserved on disk?
+#
+# The od output we look for ("d_" followed by the NFD bytes):
+# 0000000 64 5f 65 cc 81
+#
+# (od whitespace varies by platform, hence the " *" wildcards.)
+test_lazy_prereq UNICODE_NFD_PRESERVED '
+	mkdir d_${utf8_nfd} &&
+	ls | od -t x1 | grep "64 *5f *65 *cc *81"
+'
+
+# The following _DOUBLE_ forms are more for my curiosity,
+# but there may be quirks lurking when there are multiple
+# combining characters in non-canonical order.
+
+# Unicode also allows multiple combining characters
+# that can be decomposed in pieces.
+#
+# NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
+# UTF8(NFC):  \xe1 \xbd \xa7
+#
+# NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
+#             U+0342 COMBINING GREEK PERISPOMENI
+# UTF8(NFD1): \xe1 \xbd \xa1  +  \xcd \x82
+#
+# But U+1f61 decomposes into
+# NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
+#             U+0314 COMBINING REVERSED COMMA ABOVE
+# UTF8(NFD2): \xcf \x89  +  \xcc \x94
+#
+# Yielding:   \xcf \x89  +  \xcc \x94  +  \xcd \x82
+#
+# Note that I've used the canonical ordering of the
+# combining characters.  It is also possible to
+# swap them.  My testing shows that the non-standard
+# ordering also causes a collision in mkdir.  However,
+# the resulting names don't draw correctly on the
+# terminal (implying that the on-disk format also has
+# them out of order).
+# (Octal escapes are used below; printf "\xHH" is not portable.)
+greek_nfc=$(printf "\341\275\247")
+greek_nfd1=$(printf "\341\275\241\315\202")
+greek_nfd2=$(printf "\317\211\314\224\315\202")
+
+# See if a double decomposition also collides.
+# (Creates the NFC spelling, then the fully decomposed NFD2 one.)
+test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
+	mkdir trial_${greek_nfc} &&
+	mkdir trial_${greek_nfd2}
+'
+
+# See if the NFC spelling appears on the disk.
+# (Looks for "c_" plus the 3 NFC bytes in the od dump of ls.)
+test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
+	mkdir c_${greek_nfc} &&
+	ls | od -t x1 | grep "63 *5f *e1 *bd *a7"
+'
+
+# See if the NFD spelling appears on the disk.
+# (Looks for "d_" plus the 6 NFD2 bytes in the od dump of ls.)
+test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
+	mkdir d_${greek_nfd2} &&
+	ls | od -t x1 | grep "64 *5f *cf *89 *cc *94 *cd *82"
+'
+
+# The following is for debugging. I found it useful when
+# trying to understand the various (OS, FS) quirks WRT
+# Unicode and how composition/decomposition is handled.
+# For example, when trying to understand how (macOS, APFS)
+# and (macOS, HFS) and (macOS, FAT32) compare.
+#
+# It is rather noisy, so it is disabled by default; it only
+# runs when $unicode_debug is "true" as this file is read.
+if test "$unicode_debug" = "true"
+then
+	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
+	then
+		echo NFC and NFD are distinct on this OS/filesystem.
+	else
+		echo NFC and NFD are aliases on this OS/filesystem.
+	fi
+
+	if test_have_prereq UNICODE_NFC_PRESERVED
+	then
+		echo NFC maintains original spelling.
+	else
+		echo NFC is modified.
+	fi
+
+	if test_have_prereq UNICODE_NFD_PRESERVED
+	then
+		echo NFD maintains original spelling.
+	else
+		echo NFD is modified.
+	fi
+
+	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
+	then
+		echo DOUBLE NFC and NFD are distinct on this OS/filesystem.
+	else
+		echo DOUBLE NFC and NFD are aliases on this OS/filesystem.
+	fi
+
+	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
+	then
+		echo Double NFC maintains original spelling.
+	else
+		echo Double NFC is modified.
+	fi
+
+	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
+	then
+		echo Double NFD maintains original spelling.
+	else
+		echo Double NFD is modified.
+	fi
+fi