diff mbox series

[06/13] xfs_scrub: hoist non-rendering character predicate

Message ID 172229847622.1348850.4728182139864049922.stgit@frogsfrogsfrogs (mailing list archive)
State Accepted, archived
Headers show
Series [01/13] xfs_scrub: use proper UChar string iterators | expand

Commit Message

Darrick J. Wong July 30, 2024, 1:07 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

Hoist this predicate code into its own function; we're going to use it
elsewhere later on.  While we're at it, document how we generated this
list in the first place.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 scrub/unicrash.c |   49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)
diff mbox series

Patch

diff --git a/scrub/unicrash.c b/scrub/unicrash.c
index 456caec27..1a86b5f8c 100644
--- a/scrub/unicrash.c
+++ b/scrub/unicrash.c
@@ -170,6 +170,36 @@  remove_ignorable(
 	return dest;
 }
 
+/*
+ * Return true if @uchr is a formatting hint that display systems are not
+ * supposed to render; such codepoints can hide in file names to confuse users.
+ *
+ * Download https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt and
+ * $ grep -E '(zero width|invisible|joiner|application)' -i UnicodeData.txt
+ * NOTE(review): the 0x2028/0x2029 separators do not match that grep; confirm.
+ */
+static inline bool is_nonrendering(UChar32 uchr)
+{
+	switch (uchr) {
+	case 0x034F:	/* combining grapheme joiner */
+	case 0x200B:	/* zero width space */
+	case 0x200C:	/* zero width non-joiner */
+	case 0x200D:	/* zero width joiner */
+	case 0x2028:	/* line separator */
+	case 0x2029:	/* paragraph separator */
+	case 0x2060:	/* word joiner */
+	case 0x2061:	/* function application */
+	case 0x2062:	/* invisible times (multiply) */
+	case 0x2063:	/* invisible separator (comma) */
+	case 0x2064:	/* invisible plus (addition) */
+	case 0x2D7F:	/* tifinagh consonant joiner */
+	case 0xFEFF:	/* zero width non breaking space */
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Generate normalized form and skeleton of the name.  If this fails, just
  * forget everything and return false; this is an advisory checker.
@@ -349,24 +379,9 @@  name_entry_examine(
 
 	uiter_setString(&uiter, entry->normstr, entry->normstrlen);
 	while ((uchr = uiter_next32(&uiter)) != U_SENTINEL) {
-		/* zero width character sequences */
-		switch (uchr) {
-		case 0x034F:	/* combining grapheme joiner */
-		case 0x200B:	/* zero width space */
-		case 0x200C:	/* zero width non-joiner */
-		case 0x200D:	/* zero width joiner */
-		case 0x2028:	/* line separator */
-		case 0x2029:	/* paragraph separator */
-		case 0x2060:	/* word joiner */
-		case 0x2061:	/* function application */
-		case 0x2062:	/* invisible times (multiply) */
-		case 0x2063:	/* invisible separator (comma) */
-		case 0x2064:	/* invisible plus (addition) */
-		case 0x2D7F:	/* tifinagh consonant joiner */
-		case 0xFEFF:	/* zero width non breaking space */
+		/* characters are invisible */
+		if (is_nonrendering(uchr))
 			*badflags |= UNICRASH_ZERO_WIDTH;
-			break;
-		}
 
 		/* control characters */
 		if (u_iscntrl(uchr))