diff mbox series

[v2,1/3] xfs_db: hoist name obfuscation code out of metadump.c

Message ID 20230615161206.GQ11441@frogsfrogsfrogs (mailing list archive)
State Accepted, archived
Headers show
Series [v2,1/3] xfs_db: hoist name obfuscation code out of metadump.c | expand

Commit Message

Darrick J. Wong June 15, 2023, 4:12 p.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

We want to create a debugger command that will create obfuscated names
for directory and xattr names, so hoist the name obfuscation code into a
separate file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
---
v2: rebase to deal with s/alloca/malloc change
---
 db/Makefile    |    2 
 db/metadump.c  |  388 -------------------------------------------------------
 db/obfuscate.c |  393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 db/obfuscate.h |   17 ++
 4 files changed, 412 insertions(+), 388 deletions(-)
 create mode 100644 db/obfuscate.c
 create mode 100644 db/obfuscate.h

Comments

Carlos Maiolino June 22, 2023, 11:55 a.m. UTC | #1
On Thu, Jun 15, 2023 at 09:12:06AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> We want to create a debugger command that will create obfuscated names
> for directory and xattr names, so hoist the name obfuscation code into a
> separate file.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> ---
> v2: rebase to deal with s/alloca/malloc change
> ---

Thanks!
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

>  db/Makefile    |    2
>  db/metadump.c  |  388 -------------------------------------------------------
>  db/obfuscate.c |  393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  db/obfuscate.h |   17 ++
>  4 files changed, 412 insertions(+), 388 deletions(-)
>  create mode 100644 db/obfuscate.c
>  create mode 100644 db/obfuscate.h
> 
> diff --git a/db/Makefile b/db/Makefile
> index b2e01174571..2f95f67075d 100644
> --- a/db/Makefile
> +++ b/db/Makefile
> @@ -13,7 +13,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
>  	flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
>  	io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
>  	sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
> -	fuzz.h
> +	fuzz.h obfuscate.h
>  CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
>  	timelimit.c
>  LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
> diff --git a/db/metadump.c b/db/metadump.c
> index 9ccee0b7ace..d9a616a9296 100644
> --- a/db/metadump.c
> +++ b/db/metadump.c
> @@ -19,6 +19,7 @@
>  #include "faddr.h"
>  #include "field.h"
>  #include "dir2.h"
> +#include "obfuscate.h"
> 
>  #define DEFAULT_MAX_EXT_SIZE	XFS_MAX_BMBT_EXTLEN
> 
> @@ -736,19 +737,6 @@ nametable_add(xfs_dahash_t hash, int namelen, unsigned char *name)
>  	return ent;
>  }
> 
> -#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
> -#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
> -
> -static inline unsigned char
> -random_filename_char(void)
> -{
> -	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
> -						"abcdefghijklmnopqrstuvwxyz"
> -						"0123456789-_";
> -
> -	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
> -}
> -
>  #define	ORPHANAGE	"lost+found"
>  #define	ORPHANAGE_LEN	(sizeof (ORPHANAGE) - 1)
> 
> @@ -808,380 +796,6 @@ in_lost_found(
>  	return slen == namelen && !memcmp(name, s, namelen);
>  }
> 
> -/*
> - * Given a name and its hash value, massage the name in such a way
> - * that the result is another name of equal length which shares the
> - * same hash value.
> - */
> -static void
> -obfuscate_name(
> -	xfs_dahash_t	hash,
> -	size_t		name_len,
> -	unsigned char	*name,
> -	bool		is_dirent)
> -{
> -	unsigned char	*oldname = NULL;
> -	unsigned char	*newp;
> -	int		i;
> -	xfs_dahash_t	new_hash;
> -	unsigned char	*first;
> -	unsigned char	high_bit;
> -	int		tries = 0;
> -	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
> -	int		shift;
> -
> -	/*
> -	 * Our obfuscation algorithm requires at least 5-character
> -	 * names, so don't bother if the name is too short.  We
> -	 * work backward from a hash value to determine the last
> -	 * five bytes in a name required to produce a new name
> -	 * with the same hash.
> -	 */
> -	if (name_len < 5)
> -		return;
> -
> -	if (is_ci_name) {
> -		oldname = malloc(name_len);
> -		if (!oldname)
> -			return;
> -		memcpy(oldname, name, name_len);
> -	}
> -
> -again:
> -	newp = name;
> -	new_hash = 0;
> -
> -	/*
> -	 * If we cannot generate a ci-compatible obfuscated name after 1000
> -	 * tries, don't bother obfuscating the name.
> -	 */
> -	if (tries++ > 1000) {
> -		memcpy(name, oldname, name_len);
> -		goto out_free;
> -	}
> -
> -	/*
> -	 * The beginning of the obfuscated name can be pretty much
> -	 * anything, so fill it in with random characters.
> -	 * Accumulate its new hash value as we go.
> -	 */
> -	for (i = 0; i < name_len - 5; i++) {
> -		*newp = random_filename_char();
> -		if (is_ci_name)
> -			new_hash = xfs_ascii_ci_xfrm(*newp) ^
> -							rol32(new_hash, 7);
> -		else
> -			new_hash = *newp ^ rol32(new_hash, 7);
> -		newp++;
> -	}
> -
> -	/*
> -	 * Compute which five bytes need to be used at the end of
> -	 * the name so the hash of the obfuscated name is the same
> -	 * as the hash of the original.  If any result in an invalid
> -	 * character, flip a bit and arrange for a corresponding bit
> -	 * in a neighboring byte to be flipped as well.  For the
> -	 * last byte, the "neighbor" to change is the first byte
> -	 * we're computing here.
> -	 */
> -	new_hash = rol32(new_hash, 3) ^ hash;
> -
> -	first = newp;
> -	high_bit = 0;
> -	for (shift = 28; shift >= 0; shift -= 7) {
> -		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
> -		if (is_invalid_char(*newp)) {
> -			*newp ^= 1;
> -			high_bit = 0x80;
> -		} else
> -			high_bit = 0;
> -
> -		/*
> -		 * If ascii-ci is enabled, uppercase characters are converted
> -		 * to lowercase characters while computing the name hash.  If
> -		 * any of the necessary correction bytes are uppercase, the
> -		 * hash of the new name will not match.  Try again with a
> -		 * different prefix.
> -		 */
> -		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
> -			goto again;
> -
> -		ASSERT(!is_invalid_char(*newp));
> -		newp++;
> -	}
> -
> -	/*
> -	 * If we flipped a bit on the last byte, we need to fix up
> -	 * the matching bit in the first byte.  The result will
> -	 * be a valid character, because we know that first byte
> -	 * has 0's in its upper four bits (it was produced by a
> -	 * 28-bit right-shift of a 32-bit unsigned value).
> -	 */
> -	if (high_bit) {
> -		*first ^= 0x10;
> -
> -		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
> -			goto again;
> -
> -		ASSERT(!is_invalid_char(*first));
> -	}
> -
> -out_free:
> -	free(oldname);
> -}
> -
> -/*
> - * Flip a bit in each of two bytes at the end of the given name.
> - * This is used in generating a series of alternate names to be used
> - * in the event a duplicate is found.
> - *
> - * The bits flipped are selected such that they both affect the same
> - * bit in the name's computed hash value, so flipping them both will
> - * preserve the hash.
> - *
> - * The following diagram aims to show the portion of a computed
> - * hash that a given byte of a name affects.
> - *
> - *	   31    28      24    21	     14		  8 7       3     0
> - *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> - * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
> - *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> - *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
> - *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
> - *			 |<-- last-7 --->|	     |<-- last-5 --->|
> - *	   |<-- last-8 --->|	       |<-- last-6 --->|
> - *			. . . and so on
> - *
> - * The last byte of the name directly affects the low-order byte of
> - * the hash.  The next-to-last affects bits 7-14, the next one back
> - * affects bits 14-21, and so on.  The effect wraps around when it
> - * goes beyond the top of the hash (as happens for byte last-4).
> - *
> - * Bits that are flipped together "overlap" on the hash value.  As
> - * an example of overlap, the last two bytes both affect bit 7 in
> - * the hash.  That pair of bytes (and their overlapping bits) can be
> - * used for this "flip bit" operation (it's the first pair tried,
> - * actually).
> - *
> - * A table defines overlapping pairs--the bytes involved and bits
> - * within them--that can be used this way.  The byte offset is
> - * relative to a starting point within the name, which will be set
> - * to affect the bytes at the end of the name.  The function is
> - * called with a "bitseq" value which indicates which bit flip is
> - * desired, and this translates directly into selecting which entry
> - * in the bit_to_flip[] table to apply.
> - *
> - * The function returns 1 if the operation was successful.  It
> - * returns 0 if the result produced a character that's not valid in
> - * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
> - * sequence number is beyond what is supported for a name of this
> - * length.
> - *
> - * Discussion
> - * ----------
> - * (Also see the discussion above find_alternate(), below.)
> - *
> - * In order to make this function work for any length name, the
> - * table is ordered by increasing byte offset, so that the earliest
> - * entries can apply to the shortest strings.  This way all names
> - * are done consistently.
> - *
> - * When bit flips occur, they can convert printable characters
> - * into non-printable ones.  In an effort to reduce the impact of
> - * this, the first bit flips are chosen to affect bytes the end of
> - * the name (and furthermore, toward the low bits of a byte).  Those
> - * bytes are often non-printable anyway because of the way they are
> - * initially selected by obfuscate_name()).  This is accomplished,
> - * using later table entries first.
> - *
> - * Each row in the table doubles the number of alternates that
> - * can be generated.  A two-byte name is limited to using only
> - * the first row, so it's possible to generate two alternates
> - * (the original name, plus the alternate produced by flipping
> - * the one pair of bits).  In a 5-byte name, the effect of the
> - * first byte overlaps the last by 4 its, and there are 8 bits
> - * to flip, allowing for 256 possible alternates.
> - *
> - * Short names (less than 5 bytes) are never even obfuscated, so for
> - * such names the relatively small number of alternates should never
> - * really be a problem.
> - *
> - * Long names (more than 6 bytes, say) are not likely to exhaust
> - * the number of available alternates.  In fact, the table could
> - * probably have stopped at 8 entries, on the assumption that 256
> - * alternates should be enough for most any situation.  The entries
> - * beyond those are present mostly for demonstration of how it could
> - * be populated with more entries, should it ever be necessary to do
> - * so.
> - */
> -static int
> -flip_bit(
> -	size_t		name_len,
> -	unsigned char	*name,
> -	uint32_t	bitseq)
> -{
> -	int	index;
> -	size_t	offset;
> -	unsigned char *p0, *p1;
> -	unsigned char m0, m1;
> -	struct {
> -	    int		byte;	/* Offset from start within name */
> -	    unsigned char bit;	/* Bit within that byte */
> -	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
> -	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
> -	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
> -	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
> -	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
> -	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
> -	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
> -	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
> -	    { { 3, 0 }, { 4, 7 } },
> -	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
> -	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
> -	    { { 0, 2 }, { 5, 5 } },
> -	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
> -	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
> -	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
> -		/* . . . */
> -	};
> -
> -	/* Find the first entry *not* usable for name of this length */
> -
> -	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
> -		if (bit_to_flip[index][1].byte >= name_len)
> -			break;
> -
> -	/*
> -	 * Back up to the last usable entry.  If that number is
> -	 * smaller than the bit sequence number, inform the caller
> -	 * that nothing this large (or larger) will work.
> -	 */
> -	if (bitseq > --index)
> -		return -1;
> -
> -	/*
> -	 * We will be switching bits at the end of name, with a
> -	 * preference for affecting the last bytes first.  Compute
> -	 * where in the name we'll start applying the changes.
> -	 */
> -	offset = name_len - (bit_to_flip[index][1].byte + 1);
> -	index -= bitseq;	/* Use later table entries first */
> -
> -	p0 = name + offset + bit_to_flip[index][0].byte;
> -	p1 = name + offset + bit_to_flip[index][1].byte;
> -	m0 = 1 << bit_to_flip[index][0].bit;
> -	m1 = 1 << bit_to_flip[index][1].bit;
> -
> -	/* Only change the bytes if it produces valid characters */
> -
> -	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
> -		return 0;
> -
> -	*p0 ^= m0;
> -	*p1 ^= m1;
> -
> -	return 1;
> -}
> -
> -/*
> - * This function generates a well-defined sequence of "alternate"
> - * names for a given name.  An alternate is a name having the same
> - * length and same hash value as the original name.  This is needed
> - * because the algorithm produces only one obfuscated name to use
> - * for a given original name, and it's possible that result matches
> - * a name already seen.  This function checks for this, and if it
> - * occurs, finds another suitable obfuscated name to use.
> - *
> - * Each bit in the binary representation of the sequence number is
> - * used to select one possible "bit flip" operation to perform on
> - * the name.  So for example:
> - *    seq = 0:	selects no bits to flip
> - *    seq = 1:	selects the 0th bit to flip
> - *    seq = 2:	selects the 1st bit to flip
> - *    seq = 3:	selects the 0th and 1st bit to flip
> - *    ... and so on.
> - *
> - * The flip_bit() function takes care of the details of the bit
> - * flipping within the name.  Note that the "1st bit" in this
> - * context is a bit sequence number; i.e. it doesn't necessarily
> - * mean bit 0x02 will be changed.
> - *
> - * If a valid name (one that contains no '/' or '\0' characters) is
> - * produced by this process for the given sequence number, this
> - * function returns 1.  If the result is not valid, it returns 0.
> - * Returns -1 if the sequence number is beyond the the maximum for
> - * names of the given length.
> - *
> - *
> - * Discussion
> - * ----------
> - * The number of alternates available for a given name is dependent
> - * on its length.  A "bit flip" involves inverting two bits in
> - * a name--the two bits being selected such that their values
> - * affect the name's hash value in the same way.  Alternates are
> - * thus generated by inverting the value of pairs of such
> - * "overlapping" bits in the original name.  Each byte after the
> - * first in a name adds at least one bit of overlap to work with.
> - * (See comments above flip_bit() for more discussion on this.)
> - *
> - * So the number of alternates is dependent on the number of such
> - * overlapping bits in a name.  If there are N bit overlaps, there
> - * 2^N alternates for that hash value.
> - *
> - * Here are the number of overlapping bits available for generating
> - * alternates for names of specific lengths:
> - *	1	0	(must have 2 bytes to have any overlap)
> - *	2	1	One bit overlaps--so 2 possible alternates
> - *	3	2	Two bits overlap--so 4 possible alternates
> - *	4	4	Three bits overlap, so 2^3 alternates
> - *	5	8	8 bits overlap (due to wrapping), 256 alternates
> - *	6	18	2^18 alternates
> - *	7	28	2^28 alternates
> - *	   ...
> - * It's clear that the number of alternates grows very quickly with
> - * the length of the name.  But note that the set of alternates
> - * includes invalid names.  And for certain (contrived) names, the
> - * number of valid names is a fairly small fraction of the total
> - * number of alternates.
> - *
> - * The main driver for this infrastructure for coming up with
> - * alternate names is really related to names 5 (or possibly 6)
> - * bytes in length.  5-byte obfuscated names contain no randomly-
> - * generated bytes in them, and the chance of an obfuscated name
> - * matching an already-seen name is too high to just ignore.  This
> - * methodical selection of alternates ensures we don't produce
> - * duplicate names unless we have exhausted our options.
> - */
> -static int
> -find_alternate(
> -	size_t		name_len,
> -	unsigned char	*name,
> -	uint32_t	seq)
> -{
> -	uint32_t	bitseq = 0;
> -	uint32_t	bits = seq;
> -
> -	if (!seq)
> -		return 1;	/* alternate 0 is the original name */
> -	if (name_len < 2)	/* Must have 2 bytes to flip */
> -		return -1;
> -
> -	for (bitseq = 0; bits; bitseq++) {
> -		uint32_t	mask = 1 << bitseq;
> -		int		fb;
> -
> -		if (!(bits & mask))
> -			continue;
> -
> -		fb = flip_bit(name_len, name, bitseq);
> -		if (fb < 1)
> -			return fb ? -1 : 0;
> -		bits ^= mask;
> -	}
> -
> -	return 1;
> -}
> -
>  /*
>   * Look up the given name in the name table.  If it is already
>   * present, iterate through a well-defined sequence of alternate
> diff --git a/db/obfuscate.c b/db/obfuscate.c
> new file mode 100644
> index 00000000000..cd950b44581
> --- /dev/null
> +++ b/db/obfuscate.c
> @@ -0,0 +1,393 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2007, 2011 SGI
> + * All Rights Reserved.
> + */
> +#include "libxfs.h"
> +#include "init.h"
> +#include "obfuscate.h"
> +
> +static inline unsigned char
> +random_filename_char(void)
> +{
> +	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
> +						"abcdefghijklmnopqrstuvwxyz"
> +						"0123456789-_";
> +
> +	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
> +}
> +
> +#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
> +
> +/*
> + * Given a name and its hash value, massage the name in such a way
> + * that the result is another name of equal length which shares the
> + * same hash value.
> + */
> +void
> +obfuscate_name(
> +	xfs_dahash_t	hash,
> +	size_t		name_len,
> +	unsigned char	*name,
> +	bool		is_dirent)
> +{
> +	unsigned char	*oldname = NULL;
> +	unsigned char	*newp;
> +	int		i;
> +	xfs_dahash_t	new_hash;
> +	unsigned char	*first;
> +	unsigned char	high_bit;
> +	int		tries = 0;
> +	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
> +	int		shift;
> +
> +	/*
> +	 * Our obfuscation algorithm requires at least 5-character
> +	 * names, so don't bother if the name is too short.  We
> +	 * work backward from a hash value to determine the last
> +	 * five bytes in a name required to produce a new name
> +	 * with the same hash.
> +	 */
> +	if (name_len < 5)
> +		return;
> +
> +	if (is_ci_name) {
> +		oldname = malloc(name_len);
> +		if (!oldname)
> +			return;
> +		memcpy(oldname, name, name_len);
> +	}
> +
> +again:
> +	newp = name;
> +	new_hash = 0;
> +
> +	/*
> +	 * If we cannot generate a ci-compatible obfuscated name after 1000
> +	 * tries, don't bother obfuscating the name.
> +	 */
> +	if (tries++ > 1000) {
> +		memcpy(name, oldname, name_len);
> +		goto out_free;
> +	}
> +
> +	/*
> +	 * The beginning of the obfuscated name can be pretty much
> +	 * anything, so fill it in with random characters.
> +	 * Accumulate its new hash value as we go.
> +	 */
> +	for (i = 0; i < name_len - 5; i++) {
> +		*newp = random_filename_char();
> +		if (is_ci_name)
> +			new_hash = xfs_ascii_ci_xfrm(*newp) ^
> +							rol32(new_hash, 7);
> +		else
> +			new_hash = *newp ^ rol32(new_hash, 7);
> +		newp++;
> +	}
> +
> +	/*
> +	 * Compute which five bytes need to be used at the end of
> +	 * the name so the hash of the obfuscated name is the same
> +	 * as the hash of the original.  If any result in an invalid
> +	 * character, flip a bit and arrange for a corresponding bit
> +	 * in a neighboring byte to be flipped as well.  For the
> +	 * last byte, the "neighbor" to change is the first byte
> +	 * we're computing here.
> +	 */
> +	new_hash = rol32(new_hash, 3) ^ hash;
> +
> +	first = newp;
> +	high_bit = 0;
> +	for (shift = 28; shift >= 0; shift -= 7) {
> +		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
> +		if (is_invalid_char(*newp)) {
> +			*newp ^= 1;
> +			high_bit = 0x80;
> +		} else
> +			high_bit = 0;
> +
> +		/*
> +		 * If ascii-ci is enabled, uppercase characters are converted
> +		 * to lowercase characters while computing the name hash.  If
> +		 * any of the necessary correction bytes are uppercase, the
> +		 * hash of the new name will not match.  Try again with a
> +		 * different prefix.
> +		 */
> +		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
> +			goto again;
> +
> +		ASSERT(!is_invalid_char(*newp));
> +		newp++;
> +	}
> +
> +	/*
> +	 * If we flipped a bit on the last byte, we need to fix up
> +	 * the matching bit in the first byte.  The result will
> +	 * be a valid character, because we know that first byte
> +	 * has 0's in its upper four bits (it was produced by a
> +	 * 28-bit right-shift of a 32-bit unsigned value).
> +	 */
> +	if (high_bit) {
> +		*first ^= 0x10;
> +
> +		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
> +			goto again;
> +
> +		ASSERT(!is_invalid_char(*first));
> +	}
> +out_free:
> +	free(oldname);
> +}
> +
> +/*
> + * Flip a bit in each of two bytes at the end of the given name.
> + * This is used in generating a series of alternate names to be used
> + * in the event a duplicate is found.
> + *
> + * The bits flipped are selected such that they both affect the same
> + * bit in the name's computed hash value, so flipping them both will
> + * preserve the hash.
> + *
> + * The following diagram aims to show the portion of a computed
> + * hash that a given byte of a name affects.
> + *
> + *	   31    28      24    21	     14		  8 7       3     0
> + *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> + * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
> + *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> + *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
> + *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
> + *			 |<-- last-7 --->|	     |<-- last-5 --->|
> + *	   |<-- last-8 --->|	       |<-- last-6 --->|
> + *			. . . and so on
> + *
> + * The last byte of the name directly affects the low-order byte of
> + * the hash.  The next-to-last affects bits 7-14, the next one back
> + * affects bits 14-21, and so on.  The effect wraps around when it
> + * goes beyond the top of the hash (as happens for byte last-4).
> + *
> + * Bits that are flipped together "overlap" on the hash value.  As
> + * an example of overlap, the last two bytes both affect bit 7 in
> + * the hash.  That pair of bytes (and their overlapping bits) can be
> + * used for this "flip bit" operation (it's the first pair tried,
> + * actually).
> + *
> + * A table defines overlapping pairs--the bytes involved and bits
> + * within them--that can be used this way.  The byte offset is
> + * relative to a starting point within the name, which will be set
> + * to affect the bytes at the end of the name.  The function is
> + * called with a "bitseq" value which indicates which bit flip is
> + * desired, and this translates directly into selecting which entry
> + * in the bit_to_flip[] table to apply.
> + *
> + * The function returns 1 if the operation was successful.  It
> + * returns 0 if the result produced a character that's not valid in
> + * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
> + * sequence number is beyond what is supported for a name of this
> + * length.
> + *
> + * Discussion
> + * ----------
> + * (Also see the discussion above find_alternate(), below.)
> + *
> + * In order to make this function work for any length name, the
> + * table is ordered by increasing byte offset, so that the earliest
> + * entries can apply to the shortest strings.  This way all names
> + * are done consistently.
> + *
> + * When bit flips occur, they can convert printable characters
> + * into non-printable ones.  In an effort to reduce the impact of
> + * this, the first bit flips are chosen to affect bytes the end of
> + * the name (and furthermore, toward the low bits of a byte).  Those
> + * bytes are often non-printable anyway because of the way they are
> + * initially selected by obfuscate_name()).  This is accomplished,
> + * using later table entries first.
> + *
> + * Each row in the table doubles the number of alternates that
> + * can be generated.  A two-byte name is limited to using only
> + * the first row, so it's possible to generate two alternates
> + * (the original name, plus the alternate produced by flipping
> + * the one pair of bits).  In a 5-byte name, the effect of the
> + * first byte overlaps the last by 4 its, and there are 8 bits
> + * to flip, allowing for 256 possible alternates.
> + *
> + * Short names (less than 5 bytes) are never even obfuscated, so for
> + * such names the relatively small number of alternates should never
> + * really be a problem.
> + *
> + * Long names (more than 6 bytes, say) are not likely to exhaust
> + * the number of available alternates.  In fact, the table could
> + * probably have stopped at 8 entries, on the assumption that 256
> + * alternates should be enough for most any situation.  The entries
> + * beyond those are present mostly for demonstration of how it could
> + * be populated with more entries, should it ever be necessary to do
> + * so.
> + */
> +static int
> +flip_bit(
> +	size_t		name_len,
> +	unsigned char	*name,
> +	uint32_t	bitseq)
> +{
> +	int	index;
> +	size_t	offset;
> +	unsigned char *p0, *p1;
> +	unsigned char m0, m1;
> +	struct {
> +	    int		byte;	/* Offset from start within name */
> +	    unsigned char bit;	/* Bit within that byte */
> +	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
> +	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
> +	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
> +	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
> +	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
> +	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
> +	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
> +	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
> +	    { { 3, 0 }, { 4, 7 } },
> +	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
> +	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
> +	    { { 0, 2 }, { 5, 5 } },
> +	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
> +	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
> +	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
> +		/* . . . */
> +	};
> +
> +	/* Find the first entry *not* usable for name of this length */
> +
> +	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
> +		if (bit_to_flip[index][1].byte >= name_len)
> +			break;
> +
> +	/*
> +	 * Back up to the last usable entry.  If that number is
> +	 * smaller than the bit sequence number, inform the caller
> +	 * that nothing this large (or larger) will work.
> +	 */
> +	if (bitseq > --index)
> +		return -1;
> +
> +	/*
> +	 * We will be switching bits at the end of name, with a
> +	 * preference for affecting the last bytes first.  Compute
> +	 * where in the name we'll start applying the changes.
> +	 */
> +	offset = name_len - (bit_to_flip[index][1].byte + 1);
> +	index -= bitseq;	/* Use later table entries first */
> +
> +	p0 = name + offset + bit_to_flip[index][0].byte;
> +	p1 = name + offset + bit_to_flip[index][1].byte;
> +	m0 = 1 << bit_to_flip[index][0].bit;
> +	m1 = 1 << bit_to_flip[index][1].bit;
> +
> +	/* Only change the bytes if it produces valid characters */
> +
> +	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
> +		return 0;
> +
> +	*p0 ^= m0;
> +	*p1 ^= m1;
> +
> +	return 1;
> +}
> +
> +/*
> + * This function generates a well-defined sequence of "alternate"
> + * names for a given name.  An alternate is a name having the same
> + * length and same hash value as the original name.  This is needed
> + * because the algorithm produces only one obfuscated name to use
> + * for a given original name, and it's possible that result matches
> + * a name already seen.  This function checks for this, and if it
> + * occurs, finds another suitable obfuscated name to use.
> + *
> + * Each bit in the binary representation of the sequence number is
> + * used to select one possible "bit flip" operation to perform on
> + * the name.  So for example:
> + *    seq = 0:	selects no bits to flip
> + *    seq = 1:	selects the 0th bit to flip
> + *    seq = 2:	selects the 1st bit to flip
> + *    seq = 3:	selects the 0th and 1st bit to flip
> + *    ... and so on.
> + *
> + * The flip_bit() function takes care of the details of the bit
> + * flipping within the name.  Note that the "1st bit" in this
> + * context is a bit sequence number; i.e. it doesn't necessarily
> + * mean bit 0x02 will be changed.
> + *
> + * If a valid name (one that contains no '/' or '\0' characters) is
> + * produced by this process for the given sequence number, this
> + * function returns 1.  If the result is not valid, it returns 0.
> + * Returns -1 if the sequence number is beyond the the maximum for
> + * names of the given length.
> + *
> + *
> + * Discussion
> + * ----------
> + * The number of alternates available for a given name is dependent
> + * on its length.  A "bit flip" involves inverting two bits in
> + * a name--the two bits being selected such that their values
> + * affect the name's hash value in the same way.  Alternates are
> + * thus generated by inverting the value of pairs of such
> + * "overlapping" bits in the original name.  Each byte after the
> + * first in a name adds at least one bit of overlap to work with.
> + * (See comments above flip_bit() for more discussion on this.)
> + *
> + * So the number of alternates is dependent on the number of such
> + * overlapping bits in a name.  If there are N bit overlaps, there
> + * 2^N alternates for that hash value.
> + *
> + * Here are the number of overlapping bits available for generating
> + * alternates for names of specific lengths:
> + *	1	0	(must have 2 bytes to have any overlap)
> + *	2	1	One bit overlaps--so 2 possible alternates
> + *	3	2	Two bits overlap--so 4 possible alternates
> + *	4	4	Three bits overlap, so 2^3 alternates
> + *	5	8	8 bits overlap (due to wrapping), 256 alternates
> + *	6	18	2^18 alternates
> + *	7	28	2^28 alternates
> + *	   ...
> + * It's clear that the number of alternates grows very quickly with
> + * the length of the name.  But note that the set of alternates
> + * includes invalid names.  And for certain (contrived) names, the
> + * number of valid names is a fairly small fraction of the total
> + * number of alternates.
> + *
> + * The main driver for this infrastructure for coming up with
> + * alternate names is really related to names 5 (or possibly 6)
> + * bytes in length.  5-byte obfuscated names contain no randomly-
> + * generated bytes in them, and the chance of an obfuscated name
> + * matching an already-seen name is too high to just ignore.  This
> + * methodical selection of alternates ensures we don't produce
> + * duplicate names unless we have exhausted our options.
> + */
> +int
> +find_alternate(
> +	size_t		name_len,
> +	unsigned char	*name,
> +	uint32_t	seq)
> +{
> +	uint32_t	bitseq = 0;
> +	uint32_t	bits = seq;
> +
> +	if (!seq)
> +		return 1;	/* alternate 0 is the original name */
> +	if (name_len < 2)	/* Must have 2 bytes to flip */
> +		return -1;
> +
> +	for (bitseq = 0; bits; bitseq++) {
> +		uint32_t	mask = 1 << bitseq;
> +		int		fb;
> +
> +		if (!(bits & mask))
> +			continue;
> +
> +		fb = flip_bit(name_len, name, bitseq);
> +		if (fb < 1)
> +			return fb ? -1 : 0;
> +		bits ^= mask;
> +	}
> +
> +	return 1;
> +}
> diff --git a/db/obfuscate.h b/db/obfuscate.h
> new file mode 100644
> index 00000000000..afaaca37154
> --- /dev/null
> +++ b/db/obfuscate.h
> @@ -0,0 +1,17 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2007, 2011 SGI
> + * All Rights Reserved.
> + */
> +#ifndef __DB_OBFUSCATE_H__
> +#define __DB_OBFUSCATE_H__
> +
> +/* Routines to obfuscate directory filenames and xattr names. */
> +
> +#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
> +
> +void obfuscate_name(xfs_dahash_t hash, size_t name_len, unsigned char *name,
> +		bool is_dirent);
> +int find_alternate(size_t name_len, unsigned char *name, uint32_t seq);
> +
> +#endif /* __DB_OBFUSCATE_H__ */
diff mbox series

Patch

diff --git a/db/Makefile b/db/Makefile
index b2e01174571..2f95f67075d 100644
--- a/db/Makefile
+++ b/db/Makefile
@@ -13,7 +13,7 @@  HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
 	flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
 	io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
 	sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
-	fuzz.h
+	fuzz.h obfuscate.h
 CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
 	timelimit.c
 LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
diff --git a/db/metadump.c b/db/metadump.c
index 9ccee0b7ace..d9a616a9296 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -19,6 +19,7 @@ 
 #include "faddr.h"
 #include "field.h"
 #include "dir2.h"
+#include "obfuscate.h"
 
 #define DEFAULT_MAX_EXT_SIZE	XFS_MAX_BMBT_EXTLEN
 
@@ -736,19 +737,6 @@  nametable_add(xfs_dahash_t hash, int namelen, unsigned char *name)
 	return ent;
 }
 
-#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
-#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
-
-static inline unsigned char
-random_filename_char(void)
-{
-	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-						"abcdefghijklmnopqrstuvwxyz"
-						"0123456789-_";
-
-	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
-}
-
 #define	ORPHANAGE	"lost+found"
 #define	ORPHANAGE_LEN	(sizeof (ORPHANAGE) - 1)
 
@@ -808,380 +796,6 @@  in_lost_found(
 	return slen == namelen && !memcmp(name, s, namelen);
 }
 
-/*
- * Given a name and its hash value, massage the name in such a way
- * that the result is another name of equal length which shares the
- * same hash value.
- */
-static void
-obfuscate_name(
-	xfs_dahash_t	hash,
-	size_t		name_len,
-	unsigned char	*name,
-	bool		is_dirent)
-{
-	unsigned char	*oldname = NULL;
-	unsigned char	*newp;
-	int		i;
-	xfs_dahash_t	new_hash;
-	unsigned char	*first;
-	unsigned char	high_bit;
-	int		tries = 0;
-	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
-	int		shift;
-
-	/*
-	 * Our obfuscation algorithm requires at least 5-character
-	 * names, so don't bother if the name is too short.  We
-	 * work backward from a hash value to determine the last
-	 * five bytes in a name required to produce a new name
-	 * with the same hash.
-	 */
-	if (name_len < 5)
-		return;
-
-	if (is_ci_name) {
-		oldname = malloc(name_len);
-		if (!oldname)
-			return;
-		memcpy(oldname, name, name_len);
-	}
-
-again:
-	newp = name;
-	new_hash = 0;
-
-	/*
-	 * If we cannot generate a ci-compatible obfuscated name after 1000
-	 * tries, don't bother obfuscating the name.
-	 */
-	if (tries++ > 1000) {
-		memcpy(name, oldname, name_len);
-		goto out_free;
-	}
-
-	/*
-	 * The beginning of the obfuscated name can be pretty much
-	 * anything, so fill it in with random characters.
-	 * Accumulate its new hash value as we go.
-	 */
-	for (i = 0; i < name_len - 5; i++) {
-		*newp = random_filename_char();
-		if (is_ci_name)
-			new_hash = xfs_ascii_ci_xfrm(*newp) ^
-							rol32(new_hash, 7);
-		else
-			new_hash = *newp ^ rol32(new_hash, 7);
-		newp++;
-	}
-
-	/*
-	 * Compute which five bytes need to be used at the end of
-	 * the name so the hash of the obfuscated name is the same
-	 * as the hash of the original.  If any result in an invalid
-	 * character, flip a bit and arrange for a corresponding bit
-	 * in a neighboring byte to be flipped as well.  For the
-	 * last byte, the "neighbor" to change is the first byte
-	 * we're computing here.
-	 */
-	new_hash = rol32(new_hash, 3) ^ hash;
-
-	first = newp;
-	high_bit = 0;
-	for (shift = 28; shift >= 0; shift -= 7) {
-		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
-		if (is_invalid_char(*newp)) {
-			*newp ^= 1;
-			high_bit = 0x80;
-		} else
-			high_bit = 0;
-
-		/*
-		 * If ascii-ci is enabled, uppercase characters are converted
-		 * to lowercase characters while computing the name hash.  If
-		 * any of the necessary correction bytes are uppercase, the
-		 * hash of the new name will not match.  Try again with a
-		 * different prefix.
-		 */
-		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
-			goto again;
-
-		ASSERT(!is_invalid_char(*newp));
-		newp++;
-	}
-
-	/*
-	 * If we flipped a bit on the last byte, we need to fix up
-	 * the matching bit in the first byte.  The result will
-	 * be a valid character, because we know that first byte
-	 * has 0's in its upper four bits (it was produced by a
-	 * 28-bit right-shift of a 32-bit unsigned value).
-	 */
-	if (high_bit) {
-		*first ^= 0x10;
-
-		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
-			goto again;
-
-		ASSERT(!is_invalid_char(*first));
-	}
-
-out_free:
-	free(oldname);
-}
-
-/*
- * Flip a bit in each of two bytes at the end of the given name.
- * This is used in generating a series of alternate names to be used
- * in the event a duplicate is found.
- *
- * The bits flipped are selected such that they both affect the same
- * bit in the name's computed hash value, so flipping them both will
- * preserve the hash.
- *
- * The following diagram aims to show the portion of a computed
- * hash that a given byte of a name affects.
- *
- *	   31    28      24    21	     14		  8 7       3     0
- *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
- *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
- *			 |<-- last-7 --->|	     |<-- last-5 --->|
- *	   |<-- last-8 --->|	       |<-- last-6 --->|
- *			. . . and so on
- *
- * The last byte of the name directly affects the low-order byte of
- * the hash.  The next-to-last affects bits 7-14, the next one back
- * affects bits 14-21, and so on.  The effect wraps around when it
- * goes beyond the top of the hash (as happens for byte last-4).
- *
- * Bits that are flipped together "overlap" on the hash value.  As
- * an example of overlap, the last two bytes both affect bit 7 in
- * the hash.  That pair of bytes (and their overlapping bits) can be
- * used for this "flip bit" operation (it's the first pair tried,
- * actually).
- *
- * A table defines overlapping pairs--the bytes involved and bits
- * within them--that can be used this way.  The byte offset is
- * relative to a starting point within the name, which will be set
- * to affect the bytes at the end of the name.  The function is
- * called with a "bitseq" value which indicates which bit flip is
- * desired, and this translates directly into selecting which entry
- * in the bit_to_flip[] table to apply.
- *
- * The function returns 1 if the operation was successful.  It
- * returns 0 if the result produced a character that's not valid in
- * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
- * sequence number is beyond what is supported for a name of this
- * length.
- *
- * Discussion
- * ----------
- * (Also see the discussion above find_alternate(), below.)
- *
- * In order to make this function work for any length name, the
- * table is ordered by increasing byte offset, so that the earliest
- * entries can apply to the shortest strings.  This way all names
- * are done consistently.
- *
- * When bit flips occur, they can convert printable characters
- * into non-printable ones.  In an effort to reduce the impact of
- * this, the first bit flips are chosen to affect bytes the end of
- * the name (and furthermore, toward the low bits of a byte).  Those
- * bytes are often non-printable anyway because of the way they are
- * initially selected by obfuscate_name()).  This is accomplished,
- * using later table entries first.
- *
- * Each row in the table doubles the number of alternates that
- * can be generated.  A two-byte name is limited to using only
- * the first row, so it's possible to generate two alternates
- * (the original name, plus the alternate produced by flipping
- * the one pair of bits).  In a 5-byte name, the effect of the
- * first byte overlaps the last by 4 its, and there are 8 bits
- * to flip, allowing for 256 possible alternates.
- *
- * Short names (less than 5 bytes) are never even obfuscated, so for
- * such names the relatively small number of alternates should never
- * really be a problem.
- *
- * Long names (more than 6 bytes, say) are not likely to exhaust
- * the number of available alternates.  In fact, the table could
- * probably have stopped at 8 entries, on the assumption that 256
- * alternates should be enough for most any situation.  The entries
- * beyond those are present mostly for demonstration of how it could
- * be populated with more entries, should it ever be necessary to do
- * so.
- */
-static int
-flip_bit(
-	size_t		name_len,
-	unsigned char	*name,
-	uint32_t	bitseq)
-{
-	int	index;
-	size_t	offset;
-	unsigned char *p0, *p1;
-	unsigned char m0, m1;
-	struct {
-	    int		byte;	/* Offset from start within name */
-	    unsigned char bit;	/* Bit within that byte */
-	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
-	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
-	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
-	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
-	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
-	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
-	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
-	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
-	    { { 3, 0 }, { 4, 7 } },
-	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
-	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
-	    { { 0, 2 }, { 5, 5 } },
-	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
-	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
-	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
-		/* . . . */
-	};
-
-	/* Find the first entry *not* usable for name of this length */
-
-	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
-		if (bit_to_flip[index][1].byte >= name_len)
-			break;
-
-	/*
-	 * Back up to the last usable entry.  If that number is
-	 * smaller than the bit sequence number, inform the caller
-	 * that nothing this large (or larger) will work.
-	 */
-	if (bitseq > --index)
-		return -1;
-
-	/*
-	 * We will be switching bits at the end of name, with a
-	 * preference for affecting the last bytes first.  Compute
-	 * where in the name we'll start applying the changes.
-	 */
-	offset = name_len - (bit_to_flip[index][1].byte + 1);
-	index -= bitseq;	/* Use later table entries first */
-
-	p0 = name + offset + bit_to_flip[index][0].byte;
-	p1 = name + offset + bit_to_flip[index][1].byte;
-	m0 = 1 << bit_to_flip[index][0].bit;
-	m1 = 1 << bit_to_flip[index][1].bit;
-
-	/* Only change the bytes if it produces valid characters */
-
-	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
-		return 0;
-
-	*p0 ^= m0;
-	*p1 ^= m1;
-
-	return 1;
-}
-
-/*
- * This function generates a well-defined sequence of "alternate"
- * names for a given name.  An alternate is a name having the same
- * length and same hash value as the original name.  This is needed
- * because the algorithm produces only one obfuscated name to use
- * for a given original name, and it's possible that result matches
- * a name already seen.  This function checks for this, and if it
- * occurs, finds another suitable obfuscated name to use.
- *
- * Each bit in the binary representation of the sequence number is
- * used to select one possible "bit flip" operation to perform on
- * the name.  So for example:
- *    seq = 0:	selects no bits to flip
- *    seq = 1:	selects the 0th bit to flip
- *    seq = 2:	selects the 1st bit to flip
- *    seq = 3:	selects the 0th and 1st bit to flip
- *    ... and so on.
- *
- * The flip_bit() function takes care of the details of the bit
- * flipping within the name.  Note that the "1st bit" in this
- * context is a bit sequence number; i.e. it doesn't necessarily
- * mean bit 0x02 will be changed.
- *
- * If a valid name (one that contains no '/' or '\0' characters) is
- * produced by this process for the given sequence number, this
- * function returns 1.  If the result is not valid, it returns 0.
- * Returns -1 if the sequence number is beyond the the maximum for
- * names of the given length.
- *
- *
- * Discussion
- * ----------
- * The number of alternates available for a given name is dependent
- * on its length.  A "bit flip" involves inverting two bits in
- * a name--the two bits being selected such that their values
- * affect the name's hash value in the same way.  Alternates are
- * thus generated by inverting the value of pairs of such
- * "overlapping" bits in the original name.  Each byte after the
- * first in a name adds at least one bit of overlap to work with.
- * (See comments above flip_bit() for more discussion on this.)
- *
- * So the number of alternates is dependent on the number of such
- * overlapping bits in a name.  If there are N bit overlaps, there
- * 2^N alternates for that hash value.
- *
- * Here are the number of overlapping bits available for generating
- * alternates for names of specific lengths:
- *	1	0	(must have 2 bytes to have any overlap)
- *	2	1	One bit overlaps--so 2 possible alternates
- *	3	2	Two bits overlap--so 4 possible alternates
- *	4	4	Three bits overlap, so 2^3 alternates
- *	5	8	8 bits overlap (due to wrapping), 256 alternates
- *	6	18	2^18 alternates
- *	7	28	2^28 alternates
- *	   ...
- * It's clear that the number of alternates grows very quickly with
- * the length of the name.  But note that the set of alternates
- * includes invalid names.  And for certain (contrived) names, the
- * number of valid names is a fairly small fraction of the total
- * number of alternates.
- *
- * The main driver for this infrastructure for coming up with
- * alternate names is really related to names 5 (or possibly 6)
- * bytes in length.  5-byte obfuscated names contain no randomly-
- * generated bytes in them, and the chance of an obfuscated name
- * matching an already-seen name is too high to just ignore.  This
- * methodical selection of alternates ensures we don't produce
- * duplicate names unless we have exhausted our options.
- */
-static int
-find_alternate(
-	size_t		name_len,
-	unsigned char	*name,
-	uint32_t	seq)
-{
-	uint32_t	bitseq = 0;
-	uint32_t	bits = seq;
-
-	if (!seq)
-		return 1;	/* alternate 0 is the original name */
-	if (name_len < 2)	/* Must have 2 bytes to flip */
-		return -1;
-
-	for (bitseq = 0; bits; bitseq++) {
-		uint32_t	mask = 1 << bitseq;
-		int		fb;
-
-		if (!(bits & mask))
-			continue;
-
-		fb = flip_bit(name_len, name, bitseq);
-		if (fb < 1)
-			return fb ? -1 : 0;
-		bits ^= mask;
-	}
-
-	return 1;
-}
-
 /*
  * Look up the given name in the name table.  If it is already
  * present, iterate through a well-defined sequence of alternate
diff --git a/db/obfuscate.c b/db/obfuscate.c
new file mode 100644
index 00000000000..cd950b44581
--- /dev/null
+++ b/db/obfuscate.c
@@ -0,0 +1,393 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#include "libxfs.h"
+#include "init.h"
+#include "obfuscate.h"
+
+static inline unsigned char
+random_filename_char(void)
+{
+	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+						"abcdefghijklmnopqrstuvwxyz"
+						"0123456789-_";
+
+	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
+}
+
+#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
+
+/*
+ * Given a name and its hash value, massage the name in such a way
+ * that the result is another name of equal length which shares the
+ * same hash value.
+ */
+void
+obfuscate_name(
+	xfs_dahash_t	hash,
+	size_t		name_len,
+	unsigned char	*name,
+	bool		is_dirent)
+{
+	unsigned char	*oldname = NULL;
+	unsigned char	*newp;
+	int		i;
+	xfs_dahash_t	new_hash;
+	unsigned char	*first;
+	unsigned char	high_bit;
+	int		tries = 0;
+	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
+	int		shift;
+
+	/*
+	 * Our obfuscation algorithm requires at least 5-character
+	 * names, so don't bother if the name is too short.  We
+	 * work backward from a hash value to determine the last
+	 * five bytes in a name required to produce a new name
+	 * with the same hash.
+	 */
+	if (name_len < 5)
+		return;
+
+	if (is_ci_name) {
+		oldname = malloc(name_len);
+		if (!oldname)
+			return;
+		memcpy(oldname, name, name_len);
+	}
+
+again:
+	newp = name;
+	new_hash = 0;
+
+	/*
+	 * If we cannot generate a ci-compatible obfuscated name after 1000
+	 * tries, don't bother obfuscating the name.
+	 */
+	if (tries++ > 1000) {
+		memcpy(name, oldname, name_len);
+		goto out_free;
+	}
+
+	/*
+	 * The beginning of the obfuscated name can be pretty much
+	 * anything, so fill it in with random characters.
+	 * Accumulate its new hash value as we go.
+	 */
+	for (i = 0; i < name_len - 5; i++) {
+		*newp = random_filename_char();
+		if (is_ci_name)
+			new_hash = xfs_ascii_ci_xfrm(*newp) ^
+							rol32(new_hash, 7);
+		else
+			new_hash = *newp ^ rol32(new_hash, 7);
+		newp++;
+	}
+
+	/*
+	 * Compute which five bytes need to be used at the end of
+	 * the name so the hash of the obfuscated name is the same
+	 * as the hash of the original.  If any result in an invalid
+	 * character, flip a bit and arrange for a corresponding bit
+	 * in a neighboring byte to be flipped as well.  For the
+	 * last byte, the "neighbor" to change is the first byte
+	 * we're computing here.
+	 */
+	new_hash = rol32(new_hash, 3) ^ hash;
+
+	first = newp;
+	high_bit = 0;
+	for (shift = 28; shift >= 0; shift -= 7) {
+		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
+		if (is_invalid_char(*newp)) {
+			*newp ^= 1;
+			high_bit = 0x80;
+		} else
+			high_bit = 0;
+
+		/*
+		 * If ascii-ci is enabled, uppercase characters are converted
+		 * to lowercase characters while computing the name hash.  If
+		 * any of the necessary correction bytes are uppercase, the
+		 * hash of the new name will not match.  Try again with a
+		 * different prefix.
+		 */
+		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
+			goto again;
+
+		ASSERT(!is_invalid_char(*newp));
+		newp++;
+	}
+
+	/*
+	 * If we flipped a bit on the last byte, we need to fix up
+	 * the matching bit in the first byte.  The result will
+	 * be a valid character, because we know that first byte
+	 * has 0's in its upper four bits (it was produced by a
+	 * 28-bit right-shift of a 32-bit unsigned value).
+	 */
+	if (high_bit) {
+		*first ^= 0x10;
+
+		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
+			goto again;
+
+		ASSERT(!is_invalid_char(*first));
+	}
+out_free:
+	free(oldname);
+}
+
+/*
+ * Flip a bit in each of two bytes at the end of the given name.
+ * This is used in generating a series of alternate names to be used
+ * in the event a duplicate is found.
+ *
+ * The bits flipped are selected such that they both affect the same
+ * bit in the name's computed hash value, so flipping them both will
+ * preserve the hash.
+ *
+ * The following diagram aims to show the portion of a computed
+ * hash that a given byte of a name affects.
+ *
+ *	   31    28      24    21	     14		  8 7       3     0
+ *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
+ *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
+ *			 |<-- last-7 --->|	     |<-- last-5 --->|
+ *	   |<-- last-8 --->|	       |<-- last-6 --->|
+ *			. . . and so on
+ *
+ * The last byte of the name directly affects the low-order byte of
+ * the hash.  The next-to-last affects bits 7-14, the next one back
+ * affects bits 14-21, and so on.  The effect wraps around when it
+ * goes beyond the top of the hash (as happens for byte last-4).
+ *
+ * Bits that are flipped together "overlap" on the hash value.  As
+ * an example of overlap, the last two bytes both affect bit 7 in
+ * the hash.  That pair of bytes (and their overlapping bits) can be
+ * used for this "flip bit" operation (it's the first pair tried,
+ * actually).
+ *
+ * A table defines overlapping pairs--the bytes involved and bits
+ * within them--that can be used this way.  The byte offset is
+ * relative to a starting point within the name, which will be set
+ * to affect the bytes at the end of the name.  The function is
+ * called with a "bitseq" value which indicates which bit flip is
+ * desired, and this translates directly into selecting which entry
+ * in the bit_to_flip[] table to apply.
+ *
+ * The function returns 1 if the operation was successful.  It
+ * returns 0 if the result produced a character that's not valid in
+ * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
+ * sequence number is beyond what is supported for a name of this
+ * length.
+ *
+ * Discussion
+ * ----------
+ * (Also see the discussion above find_alternate(), below.)
+ *
+ * In order to make this function work for any length name, the
+ * table is ordered by increasing byte offset, so that the earliest
+ * entries can apply to the shortest strings.  This way all names
+ * are done consistently.
+ *
+ * When bit flips occur, they can convert printable characters
+ * into non-printable ones.  In an effort to reduce the impact of
+ * this, the first bit flips are chosen to affect bytes the end of
+ * the name (and furthermore, toward the low bits of a byte).  Those
+ * bytes are often non-printable anyway because of the way they are
+ * initially selected by obfuscate_name()).  This is accomplished,
+ * using later table entries first.
+ *
+ * Each row in the table doubles the number of alternates that
+ * can be generated.  A two-byte name is limited to using only
+ * the first row, so it's possible to generate two alternates
+ * (the original name, plus the alternate produced by flipping
+ * the one pair of bits).  In a 5-byte name, the effect of the
+ * first byte overlaps the last by 4 its, and there are 8 bits
+ * to flip, allowing for 256 possible alternates.
+ *
+ * Short names (less than 5 bytes) are never even obfuscated, so for
+ * such names the relatively small number of alternates should never
+ * really be a problem.
+ *
+ * Long names (more than 6 bytes, say) are not likely to exhaust
+ * the number of available alternates.  In fact, the table could
+ * probably have stopped at 8 entries, on the assumption that 256
+ * alternates should be enough for most any situation.  The entries
+ * beyond those are present mostly for demonstration of how it could
+ * be populated with more entries, should it ever be necessary to do
+ * so.
+ */
+static int
+flip_bit(
+	size_t		name_len,
+	unsigned char	*name,
+	uint32_t	bitseq)
+{
+	int	index;
+	size_t	offset;
+	unsigned char *p0, *p1;
+	unsigned char m0, m1;
+	struct {
+	    int		byte;	/* Offset from start within name */
+	    unsigned char bit;	/* Bit within that byte */
+	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
+	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
+	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
+	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
+	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
+	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
+	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
+	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
+	    { { 3, 0 }, { 4, 7 } },
+	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
+	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
+	    { { 0, 2 }, { 5, 5 } },
+	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
+	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
+	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
+		/* . . . */
+	};
+
+	/* Find the first entry *not* usable for name of this length */
+
+	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
+		if (bit_to_flip[index][1].byte >= name_len)
+			break;
+
+	/*
+	 * Back up to the last usable entry.  If that number is
+	 * smaller than the bit sequence number, inform the caller
+	 * that nothing this large (or larger) will work.
+	 */
+	if (bitseq > --index)
+		return -1;
+
+	/*
+	 * We will be switching bits at the end of name, with a
+	 * preference for affecting the last bytes first.  Compute
+	 * where in the name we'll start applying the changes.
+	 */
+	offset = name_len - (bit_to_flip[index][1].byte + 1);
+	index -= bitseq;	/* Use later table entries first */
+
+	p0 = name + offset + bit_to_flip[index][0].byte;
+	p1 = name + offset + bit_to_flip[index][1].byte;
+	m0 = 1 << bit_to_flip[index][0].bit;
+	m1 = 1 << bit_to_flip[index][1].bit;
+
+	/* Only change the bytes if it produces valid characters */
+
+	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
+		return 0;
+
+	*p0 ^= m0;
+	*p1 ^= m1;
+
+	return 1;
+}
+
+/*
+ * This function generates a well-defined sequence of "alternate"
+ * names for a given name.  An alternate is a name having the same
+ * length and same hash value as the original name.  This is needed
+ * because the algorithm produces only one obfuscated name to use
+ * for a given original name, and it's possible that result matches
+ * a name already seen.  This function checks for this, and if it
+ * occurs, finds another suitable obfuscated name to use.
+ *
+ * Each bit in the binary representation of the sequence number is
+ * used to select one possible "bit flip" operation to perform on
+ * the name.  So for example:
+ *    seq = 0:	selects no bits to flip
+ *    seq = 1:	selects the 0th bit to flip
+ *    seq = 2:	selects the 1st bit to flip
+ *    seq = 3:	selects the 0th and 1st bit to flip
+ *    ... and so on.
+ *
+ * The flip_bit() function takes care of the details of the bit
+ * flipping within the name.  Note that the "1st bit" in this
+ * context is a bit sequence number; i.e. it doesn't necessarily
+ * mean bit 0x02 will be changed.
+ *
+ * If a valid name (one that contains no '/' or '\0' characters) is
+ * produced by this process for the given sequence number, this
+ * function returns 1.  If the result is not valid, it returns 0.
+ * Returns -1 if the sequence number is beyond the the maximum for
+ * names of the given length.
+ *
+ *
+ * Discussion
+ * ----------
+ * The number of alternates available for a given name is dependent
+ * on its length.  A "bit flip" involves inverting two bits in
+ * a name--the two bits being selected such that their values
+ * affect the name's hash value in the same way.  Alternates are
+ * thus generated by inverting the value of pairs of such
+ * "overlapping" bits in the original name.  Each byte after the
+ * first in a name adds at least one bit of overlap to work with.
+ * (See comments above flip_bit() for more discussion on this.)
+ *
+ * So the number of alternates is dependent on the number of such
+ * overlapping bits in a name.  If there are N bit overlaps, there
+ * 2^N alternates for that hash value.
+ *
+ * Here are the number of overlapping bits available for generating
+ * alternates for names of specific lengths:
+ *	1	0	(must have 2 bytes to have any overlap)
+ *	2	1	One bit overlaps--so 2 possible alternates
+ *	3	2	Two bits overlap--so 4 possible alternates
+ *	4	4	Three bits overlap, so 2^3 alternates
+ *	5	8	8 bits overlap (due to wrapping), 256 alternates
+ *	6	18	2^18 alternates
+ *	7	28	2^28 alternates
+ *	   ...
+ * It's clear that the number of alternates grows very quickly with
+ * the length of the name.  But note that the set of alternates
+ * includes invalid names.  And for certain (contrived) names, the
+ * number of valid names is a fairly small fraction of the total
+ * number of alternates.
+ *
+ * The main driver for this infrastructure for coming up with
+ * alternate names is really related to names 5 (or possibly 6)
+ * bytes in length.  5-byte obfuscated names contain no randomly-
+ * generated bytes in them, and the chance of an obfuscated name
+ * matching an already-seen name is too high to just ignore.  This
+ * methodical selection of alternates ensures we don't produce
+ * duplicate names unless we have exhausted our options.
+ */
+int
+find_alternate(
+	size_t		name_len,
+	unsigned char	*name,
+	uint32_t	seq)
+{
+	uint32_t	bitseq = 0;
+	uint32_t	bits = seq;
+
+	if (!seq)
+		return 1;	/* alternate 0 is the original name */
+	if (name_len < 2)	/* Must have 2 bytes to flip */
+		return -1;
+
+	for (bitseq = 0; bits; bitseq++) {
+		uint32_t	mask = 1 << bitseq;
+		int		fb;
+
+		if (!(bits & mask))
+			continue;
+
+		fb = flip_bit(name_len, name, bitseq);
+		if (fb < 1)
+			return fb ? -1 : 0;
+		bits ^= mask;
+	}
+
+	return 1;
+}
diff --git a/db/obfuscate.h b/db/obfuscate.h
new file mode 100644
index 00000000000..afaaca37154
--- /dev/null
+++ b/db/obfuscate.h
@@ -0,0 +1,17 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#ifndef __DB_OBFUSCATE_H__
+#define __DB_OBFUSCATE_H__
+
+/* Routines to obfuscate directory filenames and xattr names. */
+
+#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
+
+void obfuscate_name(xfs_dahash_t hash, size_t name_len, unsigned char *name,
+		bool is_dirent);
+int find_alternate(size_t name_len, unsigned char *name, uint32_t seq);
+
+#endif /* __DB_OBFUSCATE_H__ */