[RFC,v5,05/11] unicode: Implement higher level API for string handling

Message ID	20190128213223.31512-6-krisman@collabora.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-fsdevel-owner@kernel.org> sender: krisman) with ESMTPSA id 2635927FB7B From: Gabriel Krisman Bertazi <krisman@collabora.com> To: tytso@mit.edu Cc: linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org, sfrench@samba.org, darrick.wong@oracle.com, samba-technical@lists.samba.org, jlayton@kernel.org, bfields@fieldses.org, paulus@samba.org, Gabriel Krisman Bertazi <krisman@collabora.co.uk> Subject: [PATCH RFC v5 05/11] unicode: Implement higher level API for string handling Date: Mon, 28 Jan 2019 16:32:17 -0500 Message-Id: <20190128213223.31512-6-krisman@collabora.com> In-Reply-To: <20190128213223.31512-1-krisman@collabora.com> References: <20190128213223.31512-1-krisman@collabora.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: linux-fsdevel-owner@vger.kernel.org Precedence: bulk
Series	Ext4 Encoding and Case-insensitive support \| expand [RFC,v5,00/11] Ext4 Encoding and Case-insensitive support [RFC,v5,01/11] unicode: Add unicode character database files [RFC,v5,02/11] scripts: add trie generator for UTF-8 [RFC,v5,03/11] unicode: Introduce code for UTF-8 normalization [RFC,v5,04/11] unicode: reduce the size of utf8data[] [RFC,v5,05/11] unicode: Implement higher level API for string handling [RFC,v5,06/11] unicode: Introduce test module for normalized utf8 implementation [RFC,v5,07/11] MAINTAINERS: Add Unicode subsystem entry [RFC,v5,08/11] ext4: Include encoding information in the superblock [RFC,v5,09/11] ext4: Support encoding-aware file name lookups [RFC,v5,10/11] ext4: Implement EXT4_CASEFOLD_FL flag [RFC,v5,11/11] docs: ext4.rst: Document encoding and case-insensitive

diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 1ed10e40c30d..9a9836fcf38b 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -2,7 +2,9 @@
 
 UNICODE_VERSION=11.0.0
 
-obj-$(CONFIG_UNICODE) += utf8-norm.o
+obj-$(CONFIG_UNICODE) += unicode.o
+
+unicode-y := utf8-norm.o utf8-core.o
 
 $(obj)/utf8-norm.o: $(obj)/utf8data.h
 $(obj)/utf8data.h: $(srctree)/$(src)/ucd/*.txt $(objtree)/scripts/mkutf8data FORCE
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..39f4b06dded6
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+
+#include "utf8n.h"
+
+/**
+ * utf8_validate - check that a string can be processed with a unicode map
+ * @um: unicode map selecting the Unicode version
+ * @str: string to check
+ *
+ * Return: 0 if utf8nlen() accepts @str, -1 otherwise.
+ */
+int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+	const struct utf8data *data = utf8nfdi(um->version);
+
+	if (utf8nlen(data, str->name, str->len) < 0)
+		return -1;
+	return 0;
+}
+EXPORT_SYMBOL(utf8_validate);
+
+/*
+ * Compare the byte streams utf8byte() yields for @s1 and @s2 when both are
+ * walked with the same @data table.
+ *
+ * Returns 0 if the streams are identical, 1 if they differ and -EINVAL if
+ * either string cannot be processed.
+ */
+static int utf8_cmp_streams(const struct utf8data *data,
+			    const struct qstr *s1, const struct qstr *s2)
+{
+	struct utf8cursor cur1, cur2;
+	int c1, c2;
+
+	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+		return -EINVAL;
+
+	if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+		return -EINVAL;
+
+	do {
+		c1 = utf8byte(&cur1);
+		c2 = utf8byte(&cur2);
+
+		if (c1 < 0 || c2 < 0)
+			return -EINVAL;
+		if (c1 != c2)
+			return 1;
+	} while (c1);
+
+	return 0;
+}
+
+/**
+ * utf8_strncmp - compare the forms utf8_normalize() would produce
+ * @um: unicode map selecting the Unicode version
+ * @s1: first string
+ * @s2: second string
+ *
+ * Return: 0 if the strings match, 1 if they differ, -EINVAL on invalid input.
+ */
+int utf8_strncmp(const struct unicode_map *um,
+		 const struct qstr *s1, const struct qstr *s2)
+{
+	return utf8_cmp_streams(utf8nfdi(um->version), s1, s2);
+}
+EXPORT_SYMBOL(utf8_strncmp);
+
+/**
+ * utf8_strncasecmp - compare the forms utf8_casefold() would produce
+ * @um: unicode map selecting the Unicode version
+ * @s1: first string
+ * @s2: second string
+ *
+ * Return: 0 if the strings match, 1 if they differ, -EINVAL on invalid input.
+ */
+int utf8_strncasecmp(const struct unicode_map *um,
+		     const struct qstr *s1, const struct qstr *s2)
+{
+	return utf8_cmp_streams(utf8nfdicf(um->version), s1, s2);
+}
+EXPORT_SYMBOL(utf8_strncasecmp);
+
+/*
+ * Store the byte stream utf8byte() yields for @str under @data in @dest,
+ * which has room for @dlen bytes.
+ *
+ * Returns the number of bytes stored before the terminating NUL (the NUL is
+ * stored as well), or -EINVAL if @str cannot be processed or the result,
+ * NUL included, does not fit in @dlen bytes.
+ */
+static int utf8_fill_buffer(const struct utf8data *data,
+			    const struct qstr *str,
+			    unsigned char *dest, size_t dlen)
+{
+	struct utf8cursor cur;
+	size_t nlen;
+
+	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+		return -EINVAL;
+
+	for (nlen = 0; nlen < dlen; nlen++) {
+		/*
+		 * Keep the result in an int: utf8byte() signals errors with a
+		 * negative value, which can no longer be detected once it has
+		 * been truncated to an unsigned char.
+		 */
+		int c = utf8byte(&cur);
+
+		if (c < 0)
+			return -EINVAL;
+		dest[nlen] = c;
+		if (!c)
+			return nlen;
+	}
+	return -EINVAL;
+}
+
+/**
+ * utf8_casefold - convert a string using the utf8nfdicf() data
+ * @um: unicode map selecting the Unicode version
+ * @str: string to convert
+ * @dest: output buffer
+ * @dlen: size of @dest in bytes
+ *
+ * Return: length of the result, excluding the terminating NUL, or -EINVAL
+ * if @str is invalid or the result does not fit in @dest.
+ */
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+		  unsigned char *dest, size_t dlen)
+{
+	return utf8_fill_buffer(utf8nfdicf(um->version), str, dest, dlen);
+}
+EXPORT_SYMBOL(utf8_casefold);
+
+/**
+ * utf8_normalize - convert a string using the utf8nfdi() data
+ * @um: unicode map selecting the Unicode version
+ * @str: string to convert
+ * @dest: output buffer
+ * @dlen: size of @dest in bytes
+ *
+ * Return: length of the result, excluding the terminating NUL, or -EINVAL
+ * if @str is invalid or the result does not fit in @dest.
+ */
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+		   unsigned char *dest, size_t dlen)
+{
+	return utf8_fill_buffer(utf8nfdi(um->version), str, dest, dlen);
+}
+EXPORT_SYMBOL(utf8_normalize);
+
+/*
+ * Parse a "major.minor.revision" string into its three components.
+ *
+ * Returns 0 on success, or -EINVAL if @version is too long, does not match
+ * the pattern, or has a component outside the 0-255 range of a u8.
+ */
+static int utf8_parse_version(const char *version, unsigned int *maj,
+			      unsigned int *min, unsigned int *rev)
+{
+	substring_t args[3];
+	char version_string[12];
+	int val[ARRAY_SIZE(args)];
+	unsigned int i;
+	static const struct match_token token[] = {
+		{1, "%d.%d.%d"},
+		{0, NULL}
+	};
+
+	/*
+	 * The longest valid version is "255.255.255". Unlike strncpy(),
+	 * strscpy() always NUL-terminates and reports truncation.
+	 */
+	if (strscpy(version_string, version, sizeof(version_string)) < 0)
+		return -EINVAL;
+
+	if (match_token(version_string, token, args) != 1)
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(args); i++) {
+		/* utf8version_is_supported() takes u8 components. */
+		if (match_int(&args[i], &val[i]) ||
+		    val[i] < 0 || val[i] > U8_MAX)
+			return -EINVAL;
+	}
+
+	*maj = val[0];
+	*min = val[1];
+	*rev = val[2];
+
+	return 0;
+}
+
+/**
+ * utf8_load - allocate a unicode map for a given Unicode version
+ * @version: "major.minor.revision" string, or NULL for the latest version
+ *
+ * Return: a map to be released with utf8_unload(), ERR_PTR(-EINVAL) if
+ * @version is malformed or unsupported, or ERR_PTR(-ENOMEM).
+ */
+struct unicode_map *utf8_load(const char *version)
+{
+	struct unicode_map *um;
+	int unicode_version;
+
+	if (version) {
+		unsigned int maj, min, rev;
+
+		if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+			return ERR_PTR(-EINVAL);
+
+		if (!utf8version_is_supported(maj, min, rev))
+			return ERR_PTR(-EINVAL);
+
+		unicode_version = UNICODE_AGE(maj, min, rev);
+	} else {
+		unicode_version = utf8version_latest();
+		pr_warn("UTF-8 version not specified. Assuming latest supported version (%d.%d.%d).\n",
+			(unicode_version >> 16) & 0xff,
+			(unicode_version >> 8) & 0xff,
+			(unicode_version & 0xff));
+	}
+
+	um = kzalloc(sizeof(*um), GFP_KERNEL);
+	if (!um)
+		return ERR_PTR(-ENOMEM);
+
+	um->charset = "UTF-8";
+	um->version = unicode_version;
+
+	return um;
+}
+EXPORT_SYMBOL(utf8_load);
+
+/**
+ * utf8_unload - free a map returned by utf8_load()
+ * @um: map to free
+ */
+void utf8_unload(struct unicode_map *um)
+{
+	kfree(um);
+}
+EXPORT_SYMBOL(utf8_unload);
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 845c0f300370..94e066be3ea6 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -38,6 +38,13 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 }
 EXPORT_SYMBOL(utf8version_is_supported);
 
+/* Return utf8vers, the version utf8_load() assumes when none is given. */
+int utf8version_latest(void)
+{
+	return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
+
 /*
  * UTF-8 valid ranges.
  *
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index b63a9091dc39..a120638014c1 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -32,6 +32,7 @@
 
 /* Highest unicode version supported by the data tables. */
 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+extern int utf8version_latest(void);
 
 /*
  * Look for the correct const struct utf8data for a unicode version.
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
new file mode 100644
index 000000000000..aec2c6d800aa
--- /dev/null
+++ b/include/linux/unicode.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNICODE_H
+#define _LINUX_UNICODE_H
+
+#include <linux/init.h>
+#include <linux/dcache.h>
+
+/* Handle created by utf8_load() and taken by every other utf8_*() call. */
+struct unicode_map {
+	const char *charset;
+	int version;
+};
+
+/* Map lifetime: utf8_load() creates a map, utf8_unload() releases it. */
+struct unicode_map *utf8_load(const char *version);
+void utf8_unload(struct unicode_map *um);
+
+/* Check a single string against the map. */
+int utf8_validate(const struct unicode_map *um, const struct qstr *str);
+
+/* Compare two strings in the forms the conversion calls below produce. */
+int utf8_strncmp(const struct unicode_map *um,
+		 const struct qstr *s1, const struct qstr *s2);
+int utf8_strncasecmp(const struct unicode_map *um,
+		     const struct qstr *s1, const struct qstr *s2);
+
+/* Convert @str into the caller-supplied buffer @dest of @dlen bytes. */
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+		   unsigned char *dest, size_t dlen);
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+		  unsigned char *dest, size_t dlen);
+
+#endif /* _LINUX_UNICODE_H */

[RFC,v5,05/11] unicode: Implement higher level API for string handling

Commit Message

Patch