diff mbox

[linux-cifs-client,03/10] cifs: add replacement for cifs_strtoUCS_le called cifs_utf16le_to_host

Message ID 1241011759-7632-4-git-send-email-jlayton@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jeff Layton April 29, 2009, 1:29 p.m. UTC
Add a replacement function for cifs_strtoUCS_le. cifs_utf16le_to_host
takes args for the source and destination length so that we can ensure
that the function is confined within the intended buffers.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/cifs/cifs_unicode.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/cifs_unicode.h |    2 +
 2 files changed, 123 insertions(+), 0 deletions(-)

Comments

Shirish Pargaonkar April 29, 2009, 3:26 p.m. UTC | #1
On Wed, Apr 29, 2009 at 8:29 AM, Jeff Layton <jlayton@redhat.com> wrote:
> Add a replacement function for cifs_strtoUCS_le. cifs_utf16le_to_host
> takes args for the source and destination length so that we can ensure
> that the function is confined within the intended buffers.
>
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
>  fs/cifs/cifs_unicode.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/cifs/cifs_unicode.h |    2 +
>  2 files changed, 123 insertions(+), 0 deletions(-)
>
> diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
> index 7d75272..aafaf0d 100644
> --- a/fs/cifs/cifs_unicode.c
> +++ b/fs/cifs/cifs_unicode.c
> @@ -26,6 +26,127 @@
>  #include "cifs_debug.h"
>
>  /*
> + * cifs_mapchar - convert a little-endian char to proper char in codepage
> + * @target - where converted character should be copied
> + * @src_char - 2 byte little-endian source character
> + * @cp - codepage to which character should be converted
> + * @mapchar - should character be mapped according to mapchars mount option?
> + *
> + * This function handles the conversion of a single character. It is the
> + * responsibility of the caller to ensure that the target buffer is large
> + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
> + */
> +static int
> +cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
> +            bool mapchar)
> +{
> +       int len = 1;
> +
> +       if (!mapchar)
> +               goto cp_convert;
> +
> +       /*
> +        * BB: Cannot handle remapping UNI_SLASH until all the calls to
> +        *     build_path_from_dentry are modified, as they use slash as
> +        *     separator.
> +        */
> +       switch (le16_to_cpu(src_char)) {
> +       case UNI_COLON:
> +               *target = ':';
> +               break;
> +       case UNI_ASTERIK:
> +               *target = '*';
> +               break;
> +       case UNI_QUESTION:
> +               *target = '?';
> +               break;
> +       case UNI_PIPE:
> +               *target = '|';
> +               break;
> +       case UNI_GRTRTHAN:
> +               *target = '>';
> +               break;
> +       case UNI_LESSTHAN:
> +               *target = '<';
> +               break;
> +       default:
> +               goto cp_convert;
> +       }
> +
> +out:
> +       return len;
> +
> +cp_convert:
> +       len = cp->uni2char(le16_to_cpu(src_char), target,
> +                          NLS_MAX_CHARSET_SIZE);
> +       if (len <= 0) {
> +               *target = '?';
> +               len = 1;
> +       }
> +       goto out;
> +}
> +
> +/*
> + * cifs_utf16le_to_host - convert utf16le string to local charset
> + * @to - destination buffer
> + * @from - source buffer
> + * @tolen - destination buffer size (in bytes)
> + * @fromlen - source buffer size (in bytes)
> + * @codepage - codepage to which characters should be converted
> + * @mapchar - should characters be remapped according to the mapchars option?
> + *
> + * Convert a little-endian utf16le string (as sent by the server) to a string
> + * in the provided codepage. The tolen and fromlen parameters are to ensure
> + * that the code doesn't walk off of the end of the buffer (which is always
> + * a danger if the alignment of the source buffer is off). The destination
> + * string is always properly null terminated and fits in the destination
> + * buffer. Returns the length of the destination string in bytes (including
> + * null terminator).
> + */
> +int
> +cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
> +                    const struct nls_table *codepage, bool mapchar)
> +{
> +       int i, charlen, safelen;
> +       int outlen = 0;
> +       int nullsize = null_charlen(codepage);
> +       int fromwords = fromlen / 2;

I think assumption here is code values are two bytes.  I think that is
correct in case of UCS-2 encoding
but in case of UTF-16, the code values can be either two or four bytes.

> +       char tmp[NLS_MAX_CHARSET_SIZE];
> +
> +       /*
> +        * because the chars can be of varying widths, we need to take care
> +        * not to overflow the destination buffer when we get close to the
> +        * end of it. Until we get to this offset, we don't need to check
> +        * for overflow however.
> +        */
> +       safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);

Can safelen become negative?  In case of a code value byte stream
consisting of say two, two byte code values?

> +
> +       for (i = 0; i < fromwords && from[i]; i++) {
> +               /*
> +                * check to see if converting this character might make the
> +                * conversion bleed into the null terminator
> +                */
> +               if (outlen >= safelen) {
> +                       charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);

If mapchar is not set, cifs_mapchar is always going to return 1 (since
uni2char always returns 1)
in case of no error.

> +                       if (charlen <= 0)
> +                               charlen = 1;
> +                       if ((outlen + charlen) > (tolen - nullsize))
> +                               break;
> +               }
> +
> +               /* put converted char into 'to' buffer */
> +               charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
> +               outlen += charlen;
> +       }
> +
> +       /* properly null-terminate string */
> +       for (i = 0; i < nullsize; i++)
> +               to[outlen++] = 0;
> +
> +       return outlen;
> +}
> +
> +/*
>  * NAME:       cifs_strfromUCS()
>  *
>  * FUNCTION:   Convert little-endian unicode string to character string
> diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
> index 2dfae68..e23ef08 100644
> --- a/fs/cifs/cifs_unicode.h
> +++ b/fs/cifs/cifs_unicode.h
> @@ -72,6 +72,8 @@ extern struct UniCaseRange UniLowerRange[];
>  #endif                         /* UNIUPR_NOLOWER */
>
>  #ifdef __KERNEL__
> +int cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
> +                        const struct nls_table *codepage, bool mapchar);
>  int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *);
>  int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
>  #endif
> --
> 1.6.0.6
>
> _______________________________________________
> linux-cifs-client mailing list
> linux-cifs-client@lists.samba.org
> https://lists.samba.org/mailman/listinfo/linux-cifs-client
>
Jeff Layton April 29, 2009, 3:30 p.m. UTC | #2
On Wed, 29 Apr 2009 10:26:40 -0500
Shirish Pargaonkar <shirishpargaonkar@gmail.com> wrote:

> On Wed, Apr 29, 2009 at 8:29 AM, Jeff Layton <jlayton@redhat.com> wrote:
> > Add a replacement function for cifs_strtoUCS_le. cifs_utf16le_to_host
> > takes args for the source and destination length so that we can ensure
> > that the function is confined within the intended buffers.
> >
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> >  fs/cifs/cifs_unicode.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/cifs/cifs_unicode.h |    2 +
> >  2 files changed, 123 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
> > index 7d75272..aafaf0d 100644
> > --- a/fs/cifs/cifs_unicode.c
> > +++ b/fs/cifs/cifs_unicode.c
> > @@ -26,6 +26,127 @@
> >  #include "cifs_debug.h"
> >
> >  /*
> > + * cifs_mapchar - convert a little-endian char to proper char in codepage
> > + * @target - where converted character should be copied
> > + * @src_char - 2 byte little-endian source character
> > + * @cp - codepage to which character should be converted
> > + * @mapchar - should character be mapped according to mapchars mount option?
> > + *
> > + * This function handles the conversion of a single character. It is the
> > + * responsibility of the caller to ensure that the target buffer is large
> > + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
> > + */
> > +static int
> > +cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
> > +            bool mapchar)
> > +{
> > +       int len = 1;
> > +
> > +       if (!mapchar)
> > +               goto cp_convert;
> > +
> > +       /*
> > +        * BB: Cannot handle remapping UNI_SLASH until all the calls to
> > +        *     build_path_from_dentry are modified, as they use slash as
> > +        *     separator.
> > +        */
> > +       switch (le16_to_cpu(src_char)) {
> > +       case UNI_COLON:
> > +               *target = ':';
> > +               break;
> > +       case UNI_ASTERIK:
> > +               *target = '*';
> > +               break;
> > +       case UNI_QUESTION:
> > +               *target = '?';
> > +               break;
> > +       case UNI_PIPE:
> > +               *target = '|';
> > +               break;
> > +       case UNI_GRTRTHAN:
> > +               *target = '>';
> > +               break;
> > +       case UNI_LESSTHAN:
> > +               *target = '<';
> > +               break;
> > +       default:
> > +               goto cp_convert;
> > +       }
> > +
> > +out:
> > +       return len;
> > +
> > +cp_convert:
> > +       len = cp->uni2char(le16_to_cpu(src_char), target,
> > +                          NLS_MAX_CHARSET_SIZE);
> > +       if (len <= 0) {
> > +               *target = '?';
> > +               len = 1;
> > +       }
> > +       goto out;
> > +}
> > +
> > +/*
> > + * cifs_utf16le_to_host - convert utf16le string to local charset
> > + * @to - destination buffer
> > + * @from - source buffer
> > + * @tolen - destination buffer size (in bytes)
> > + * @fromlen - source buffer size (in bytes)
> > + * @codepage - codepage to which characters should be converted
> > + * @mapchar - should characters be remapped according to the mapchars option?
> > + *
> > + * Convert a little-endian utf16le string (as sent by the server) to a string
> > + * in the provided codepage. The tolen and fromlen parameters are to ensure
> > + * that the code doesn't walk off of the end of the buffer (which is always
> > + * a danger if the alignment of the source buffer is off). The destination
> > + * string is always properly null terminated and fits in the destination
> > + * buffer. Returns the length of the destination string in bytes (including
> > + * null terminator).
> > + */
> > +int
> > +cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
> > +                    const struct nls_table *codepage, bool mapchar)
> > +{
> > +       int i, charlen, safelen;
> > +       int outlen = 0;
> > +       int nullsize = null_charlen(codepage);
> > +       int fromwords = fromlen / 2;
> 
> I think assumption here is code values are two bytes.  I think that is
> correct in case of UCS-2 encoding
> but in case of UTF-16, the code values can be either two or four bytes.
> 

Can you show me a citation? I thought UTF-16 meant a fixed-length 2
byte encoding.

> > +       char tmp[NLS_MAX_CHARSET_SIZE];
> > +
> > +       /*
> > +        * because the chars can be of varying widths, we need to take care
> > +        * not to overflow the destination buffer when we get close to the
> > +        * end of it. Until we get to this offset, we don't need to check
> > +        * for overflow however.
> > +        */
> > +       safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
> 
> Can safelen become negative?  In case of a code value byte stream
> consisting of say two, two byte code values?
> 

Yes. It doesn't matter though. The math where it's checked still works.

> > +
> > +       for (i = 0; i < fromwords && from[i]; i++) {
> > +               /*
> > +                * check to see if converting this character might make the
> > +                * conversion bleed into the null terminator
> > +                */
> > +               if (outlen >= safelen) {
> > +                       charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
> 
> If mapchar is not set, cifs_mapchar is always going to return 1 (since
> uni2char always returns 1)
> in case of no error.
> 

uni2char does not always return 1. In the case of UTF-8, for instance
it returns the width of the character in bytes that it put in the
destination buffer.

> > +                       if (charlen <= 0)
> > +                               charlen = 1;
> > +                       if ((outlen + charlen) > (tolen - nullsize))
> > +                               break;
> > +               }
> > +
> > +               /* put converted char into 'to' buffer */
> > +               charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
> > +               outlen += charlen;
> > +       }
> > +
> > +       /* properly null-terminate string */
> > +       for (i = 0; i < nullsize; i++)
> > +               to[outlen++] = 0;
> > +
> > +       return outlen;
> > +}
> > +
> > +/*
> >  * NAME:       cifs_strfromUCS()
> >  *
> >  * FUNCTION:   Convert little-endian unicode string to character string
> > diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
> > index 2dfae68..e23ef08 100644
> > --- a/fs/cifs/cifs_unicode.h
> > +++ b/fs/cifs/cifs_unicode.h
> > @@ -72,6 +72,8 @@ extern struct UniCaseRange UniLowerRange[];
> >  #endif                         /* UNIUPR_NOLOWER */
> >
> >  #ifdef __KERNEL__
> > +int cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
> > +                        const struct nls_table *codepage, bool mapchar);
> >  int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *);
> >  int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
> >  #endif
> > --
> > 1.6.0.6
> >
> > _______________________________________________
> > linux-cifs-client mailing list
> > linux-cifs-client@lists.samba.org
> > https://lists.samba.org/mailman/listinfo/linux-cifs-client
> >
Shirish Pargaonkar April 29, 2009, 3:40 p.m. UTC | #3
On Wed, Apr 29, 2009 at 10:30 AM, Jeff Layton <jlayton@redhat.com> wrote:
> On Wed, 29 Apr 2009 10:26:40 -0500
> Shirish Pargaonkar <shirishpargaonkar@gmail.com> wrote:
>
>> On Wed, Apr 29, 2009 at 8:29 AM, Jeff Layton <jlayton@redhat.com> wrote:
>> > Add a replacement function for cifs_strtoUCS_le. cifs_utf16le_to_host
>> > takes args for the source and destination length so that we can ensure
>> > that the function is confined within the intended buffers.
>> >
>> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> > ---
>> >  fs/cifs/cifs_unicode.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++
>> >  fs/cifs/cifs_unicode.h |    2 +
>> >  2 files changed, 123 insertions(+), 0 deletions(-)
>> >
>> > diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
>> > index 7d75272..aafaf0d 100644
>> > --- a/fs/cifs/cifs_unicode.c
>> > +++ b/fs/cifs/cifs_unicode.c
>> > @@ -26,6 +26,127 @@
>> >  #include "cifs_debug.h"
>> >
>> >  /*
>> > + * cifs_mapchar - convert a little-endian char to proper char in codepage
>> > + * @target - where converted character should be copied
>> > + * @src_char - 2 byte little-endian source character
>> > + * @cp - codepage to which character should be converted
>> > + * @mapchar - should character be mapped according to mapchars mount option?
>> > + *
>> > + * This function handles the conversion of a single character. It is the
>> > + * responsibility of the caller to ensure that the target buffer is large
>> > + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
>> > + */
>> > +static int
>> > +cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
>> > +            bool mapchar)
>> > +{
>> > +       int len = 1;
>> > +
>> > +       if (!mapchar)
>> > +               goto cp_convert;
>> > +
>> > +       /*
>> > +        * BB: Cannot handle remapping UNI_SLASH until all the calls to
>> > +        *     build_path_from_dentry are modified, as they use slash as
>> > +        *     separator.
>> > +        */
>> > +       switch (le16_to_cpu(src_char)) {
>> > +       case UNI_COLON:
>> > +               *target = ':';
>> > +               break;
>> > +       case UNI_ASTERIK:
>> > +               *target = '*';
>> > +               break;
>> > +       case UNI_QUESTION:
>> > +               *target = '?';
>> > +               break;
>> > +       case UNI_PIPE:
>> > +               *target = '|';
>> > +               break;
>> > +       case UNI_GRTRTHAN:
>> > +               *target = '>';
>> > +               break;
>> > +       case UNI_LESSTHAN:
>> > +               *target = '<';
>> > +               break;
>> > +       default:
>> > +               goto cp_convert;
>> > +       }
>> > +
>> > +out:
>> > +       return len;
>> > +
>> > +cp_convert:
>> > +       len = cp->uni2char(le16_to_cpu(src_char), target,
>> > +                          NLS_MAX_CHARSET_SIZE);
>> > +       if (len <= 0) {
>> > +               *target = '?';
>> > +               len = 1;
>> > +       }
>> > +       goto out;
>> > +}
>> > +
>> > +/*
>> > + * cifs_utf16le_to_host - convert utf16le string to local charset
>> > + * @to - destination buffer
>> > + * @from - source buffer
>> > + * @tolen - destination buffer size (in bytes)
>> > + * @fromlen - source buffer size (in bytes)
>> > + * @codepage - codepage to which characters should be converted
>> > + * @mapchar - should characters be remapped according to the mapchars option?
>> > + *
>> > + * Convert a little-endian utf16le string (as sent by the server) to a string
>> > + * in the provided codepage. The tolen and fromlen parameters are to ensure
>> > + * that the code doesn't walk off of the end of the buffer (which is always
>> > + * a danger if the alignment of the source buffer is off). The destination
>> > + * string is always properly null terminated and fits in the destination
>> > + * buffer. Returns the length of the destination string in bytes (including
>> > + * null terminator).
>> > + */
>> > +int
>> > +cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
>> > +                    const struct nls_table *codepage, bool mapchar)
>> > +{
>> > +       int i, charlen, safelen;
>> > +       int outlen = 0;
>> > +       int nullsize = null_charlen(codepage);
>> > +       int fromwords = fromlen / 2;
>>
>> I think assumption here is code values are two bytes.  I think that is
>> correct in case of UCS-2 encoding
>> but in case of UTF-16, the code values can be either two or four bytes.
>>
>
> Can you show me a citation? I thought UTF-16 meant a fixed-length 2
> byte encoding.
>

Jeff,

http://unicode.org/faq/utf_bom.html#utf16-1

I guess we may not encounter some of those (code values) characters,
but again we may.

>> > +       char tmp[NLS_MAX_CHARSET_SIZE];
>> > +
>> > +       /*
>> > +        * because the chars can be of varying widths, we need to take care
>> > +        * not to overflow the destination buffer when we get close to the
>> > +        * end of it. Until we get to this offset, we don't need to check
>> > +        * for overflow however.
>> > +        */
>> > +       safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
>>
>> Can safelen become negative?  In case of a code value byte stream
>> consisting of say two, two byte code values?
>>
>
> Yes. It doesn't matter though. The math where it's checked still works.
>
>> > +
>> > +       for (i = 0; i < fromwords && from[i]; i++) {
>> > +               /*
>> > +                * check to see if converting this character might make the
>> > +                * conversion bleed into the null terminator
>> > +                */
>> > +               if (outlen >= safelen) {
>> > +                       charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
>>
>> If mapchar is not set, cifs_mapchar is always going to return 1 (since
>> uni2char always returns 1)
>> in case of no error.
>>
>
> uni2char does not always return 1. In the case of UTF-8, for instance
> it returns the width of the character in bytes that it put in the
> destination buffer.

Jeff, can you please point me to the file where uni2char is coded thus?
I did not find a uni2char function returning more that 1 bytes as
character length/width,
all of them return one, even linux/fs/nls_koi8-u.c  (not sure koi is a
Korean charset, I just assumed from koi).


>
>> > +                       if (charlen <= 0)
>> > +                               charlen = 1;
>> > +                       if ((outlen + charlen) > (tolen - nullsize))
>> > +                               break;
>> > +               }
>> > +
>> > +               /* put converted char into 'to' buffer */
>> > +               charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
>> > +               outlen += charlen;
>> > +       }
>> > +
>> > +       /* properly null-terminate string */
>> > +       for (i = 0; i < nullsize; i++)
>> > +               to[outlen++] = 0;
>> > +
>> > +       return outlen;
>> > +}
>> > +
>> > +/*
>> >  * NAME:       cifs_strfromUCS()
>> >  *
>> >  * FUNCTION:   Convert little-endian unicode string to character string
>> > diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
>> > index 2dfae68..e23ef08 100644
>> > --- a/fs/cifs/cifs_unicode.h
>> > +++ b/fs/cifs/cifs_unicode.h
>> > @@ -72,6 +72,8 @@ extern struct UniCaseRange UniLowerRange[];
>> >  #endif                         /* UNIUPR_NOLOWER */
>> >
>> >  #ifdef __KERNEL__
>> > +int cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
>> > +                        const struct nls_table *codepage, bool mapchar);
>> >  int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *);
>> >  int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
>> >  #endif
>> > --
>> > 1.6.0.6
>> >
>> > _______________________________________________
>> > linux-cifs-client mailing list
>> > linux-cifs-client@lists.samba.org
>> > https://lists.samba.org/mailman/listinfo/linux-cifs-client
>> >
>
>
> --
> Jeff Layton <jlayton@redhat.com>
>
Jeff Layton April 29, 2009, 3:48 p.m. UTC | #4
On Wed, 29 Apr 2009 11:30:23 -0400
Jeff Layton <jlayton@redhat.com> wrote:

> > > +/*
> > > + * cifs_utf16le_to_host - convert utf16le string to local charset
> > > + * @to - destination buffer
> > > + * @from - source buffer
> > > + * @tolen - destination buffer size (in bytes)
> > > + * @fromlen - source buffer size (in bytes)
> > > + * @codepage - codepage to which characters should be converted
> > > + * @mapchar - should characters be remapped according to the mapchars option?
> > > + *
> > > + * Convert a little-endian utf16le string (as sent by the server) to a string
> > > + * in the provided codepage. The tolen and fromlen parameters are to ensure
> > > + * that the code doesn't walk off of the end of the buffer (which is always
> > > + * a danger if the alignment of the source buffer is off). The destination
> > > + * string is always properly null terminated and fits in the destination
> > > + * buffer. Returns the length of the destination string in bytes (including
> > > + * null terminator).
> > > + */
> > > +int
> > > +cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
> > > +                    const struct nls_table *codepage, bool mapchar)
> > > +{
> > > +       int i, charlen, safelen;
> > > +       int outlen = 0;
> > > +       int nullsize = null_charlen(codepage);
> > > +       int fromwords = fromlen / 2;
> > 
> > I think assumption here is code values are two bytes.  I think that is
> > correct in case of UCS-2 encoding
> > but in case of UTF-16, the code values can be either two or four bytes.
> > 
> 
> Can you show me a citation? I thought UTF-16 meant a fixed-length 2
> byte encoding.
> 

Ahh ok, I'm wrong here. Simo straightened me out...

"characters" in UTF-16 are a multiple of 16 bytes. So we *can* have
multiword chars (similar to how UTF-8 works). The problem we have
though is that the nls code in kernel isn't set up to deal with that. So
I don't think we can really do much at this point other than to treat
each 16 bit word as a character. If it's untranslatable to the
local charset, we'll just call it a '?' and move on.

Given that, I'll probably rename these functions to have _ucs2le_
instead of _utf16le_ since that's more accurate.

The good news is that that does not materially change how the buffer
sizing works here, and that's the immediate problem that we're trying
to solve with these patches.
Jeff Layton April 29, 2009, 3:58 p.m. UTC | #5
On Wed, 29 Apr 2009 10:40:52 -0500
Shirish Pargaonkar <shirishpargaonkar@gmail.com> wrote:

> On Wed, Apr 29, 2009 at 10:30 AM, Jeff Layton <jlayton@redhat.com> wrote:
> > On Wed, 29 Apr 2009 10:26:40 -0500
> > Shirish Pargaonkar <shirishpargaonkar@gmail.com> wrote:
> >
> >> On Wed, Apr 29, 2009 at 8:29 AM, Jeff Layton <jlayton@redhat.com> wrote:
> >> > Add a replacement function for cifs_strtoUCS_le. cifs_utf16le_to_host
> >> > takes args for the source and destination length so that we can ensure
> >> > that the function is confined within the intended buffers.
> >> >
> >> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> >> > ---
> >> >  fs/cifs/cifs_unicode.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++
> >> >  fs/cifs/cifs_unicode.h |    2 +
> >> >  2 files changed, 123 insertions(+), 0 deletions(-)
> >> >
> >> > diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
> >> > index 7d75272..aafaf0d 100644
> >> > --- a/fs/cifs/cifs_unicode.c
> >> > +++ b/fs/cifs/cifs_unicode.c
> >> > @@ -26,6 +26,127 @@
> >> >  #include "cifs_debug.h"
> >> >
> >> >  /*
> >> > + * cifs_mapchar - convert a little-endian char to proper char in codepage
> >> > + * @target - where converted character should be copied
> >> > + * @src_char - 2 byte little-endian source character
> >> > + * @cp - codepage to which character should be converted
> >> > + * @mapchar - should character be mapped according to mapchars mount option?
> >> > + *
> >> > + * This function handles the conversion of a single character. It is the
> >> > + * responsibility of the caller to ensure that the target buffer is large
> >> > + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
> >> > + */
> >> > +static int
> >> > +cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
> >> > +            bool mapchar)
> >> > +{
> >> > +       int len = 1;
> >> > +
> >> > +       if (!mapchar)
> >> > +               goto cp_convert;
> >> > +
> >> > +       /*
> >> > +        * BB: Cannot handle remapping UNI_SLASH until all the calls to
> >> > +        *     build_path_from_dentry are modified, as they use slash as
> >> > +        *     separator.
> >> > +        */
> >> > +       switch (le16_to_cpu(src_char)) {
> >> > +       case UNI_COLON:
> >> > +               *target = ':';
> >> > +               break;
> >> > +       case UNI_ASTERIK:
> >> > +               *target = '*';
> >> > +               break;
> >> > +       case UNI_QUESTION:
> >> > +               *target = '?';
> >> > +               break;
> >> > +       case UNI_PIPE:
> >> > +               *target = '|';
> >> > +               break;
> >> > +       case UNI_GRTRTHAN:
> >> > +               *target = '>';
> >> > +               break;
> >> > +       case UNI_LESSTHAN:
> >> > +               *target = '<';
> >> > +               break;
> >> > +       default:
> >> > +               goto cp_convert;
> >> > +       }
> >> > +
> >> > +out:
> >> > +       return len;
> >> > +
> >> > +cp_convert:
> >> > +       len = cp->uni2char(le16_to_cpu(src_char), target,
> >> > +                          NLS_MAX_CHARSET_SIZE);
> >> > +       if (len <= 0) {
> >> > +               *target = '?';
> >> > +               len = 1;
> >> > +       }
> >> > +       goto out;
> >> > +}
> >> > +
> >> > +/*
> >> > + * cifs_utf16le_to_host - convert utf16le string to local charset
> >> > + * @to - destination buffer
> >> > + * @from - source buffer
> >> > + * @tolen - destination buffer size (in bytes)
> >> > + * @fromlen - source buffer size (in bytes)
> >> > + * @codepage - codepage to which characters should be converted
> >> > + * @mapchar - should characters be remapped according to the mapchars option?
> >> > + *
> >> > + * Convert a little-endian utf16le string (as sent by the server) to a string
> >> > + * in the provided codepage. The tolen and fromlen parameters are to ensure
> >> > + * that the code doesn't walk off of the end of the buffer (which is always
> >> > + * a danger if the alignment of the source buffer is off). The destination
> >> > + * string is always properly null terminated and fits in the destination
> >> > + * buffer. Returns the length of the destination string in bytes (including
> >> > + * null terminator).
> >> > + */
> >> > +int
> >> > +cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
> >> > +                    const struct nls_table *codepage, bool mapchar)
> >> > +{
> >> > +       int i, charlen, safelen;
> >> > +       int outlen = 0;
> >> > +       int nullsize = null_charlen(codepage);
> >> > +       int fromwords = fromlen / 2;
> >>
> >> I think assumption here is code values are two bytes.  I think that is
> >> correct in case of UCS-2 encoding
> >> but in case of UTF-16, the code values can be either two or four bytes.
> >>
> >
> > Can you show me a citation? I thought UTF-16 meant a fixed-length 2
> > byte encoding.
> >
> 
> Jeff,
> 
> http://unicode.org/faq/utf_bom.html#utf16-1
> 
> I guess we may not encounter some of those (code values) characters,
> but again we may.
> 
> >> > +       char tmp[NLS_MAX_CHARSET_SIZE];
> >> > +
> >> > +       /*
> >> > +        * because the chars can be of varying widths, we need to take care
> >> > +        * not to overflow the destination buffer when we get close to the
> >> > +        * end of it. Until we get to this offset, we don't need to check
> >> > +        * for overflow however.
> >> > +        */
> >> > +       safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
> >>
> >> Can safelen become negative?  In case of a code value byte stream
> >> consisting of say two, two byte code values?
> >>
> >
> > Yes. It doesn't matter though. The math where it's checked still works.
> >
> >> > +
> >> > +       for (i = 0; i < fromwords && from[i]; i++) {
> >> > +               /*
> >> > +                * check to see if converting this character might make the
> >> > +                * conversion bleed into the null terminator
> >> > +                */
> >> > +               if (outlen >= safelen) {
> >> > +                       charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
> >>
> >> If mapchar is not set, cifs_mapchar is always going to return 1 (since
> >> uni2char always returns 1)
> >> in case of no error.
> >>
> >
> > uni2char does not always return 1. In the case of UTF-8, for instance
> > it returns the width of the character in bytes that it put in the
> > destination buffer.
> 
> Jeff, can you please point me to the file where uni2char is coded thus?
> I did not find a uni2char function returning more that 1 bytes as
> character length/width,
> all of them return one, even linux/fs/nls_koi8-u.c  (not sure koi is a
> Korean charset, I just assumed from koi).
> 
> 

fs/nls/nls_utf8.c

uni2char returns the value returned by utf8_wctomb -- the width of the
character in bytes. If other charsets are not doing that correctly
then they are broken.
Shirish Pargaonkar April 29, 2009, 4:19 p.m. UTC | #6
On Wed, Apr 29, 2009 at 10:58 AM, Jeff Layton <jlayton@redhat.com> wrote:
> On Wed, 29 Apr 2009 10:40:52 -0500
> Shirish Pargaonkar <shirishpargaonkar@gmail.com> wrote:
>
>> On Wed, Apr 29, 2009 at 10:30 AM, Jeff Layton <jlayton@redhat.com> wrote:
>> > On Wed, 29 Apr 2009 10:26:40 -0500
>> > Shirish Pargaonkar <shirishpargaonkar@gmail.com> wrote:
>> >
>> >> On Wed, Apr 29, 2009 at 8:29 AM, Jeff Layton <jlayton@redhat.com> wrote:
>> >> > Add a replacement function for cifs_strtoUCS_le. cifs_utf16le_to_host
>> >> > takes args for the source and destination length so that we can ensure
>> >> > that the function is confined within the intended buffers.
>> >> >
>> >> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
>> >> > ---
>> >> >  fs/cifs/cifs_unicode.c |  121 ++++++++++++++++++++++++++++++++++++++++++++++++
>> >> >  fs/cifs/cifs_unicode.h |    2 +
>> >> >  2 files changed, 123 insertions(+), 0 deletions(-)
>> >> >
>> >> > diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
>> >> > index 7d75272..aafaf0d 100644
>> >> > --- a/fs/cifs/cifs_unicode.c
>> >> > +++ b/fs/cifs/cifs_unicode.c
>> >> > @@ -26,6 +26,127 @@
>> >> >  #include "cifs_debug.h"
>> >> >
>> >> >  /*
>> >> > + * cifs_mapchar - convert a little-endian char to proper char in codepage
>> >> > + * @target - where converted character should be copied
>> >> > + * @src_char - 2 byte little-endian source character
>> >> > + * @cp - codepage to which character should be converted
>> >> > + * @mapchar - should character be mapped according to mapchars mount option?
>> >> > + *
>> >> > + * This function handles the conversion of a single character. It is the
>> >> > + * responsibility of the caller to ensure that the target buffer is large
>> >> > + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
>> >> > + */
>> >> > +static int
>> >> > +cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
>> >> > +            bool mapchar)
>> >> > +{
>> >> > +       int len = 1;
>> >> > +
>> >> > +       if (!mapchar)
>> >> > +               goto cp_convert;
>> >> > +
>> >> > +       /*
>> >> > +        * BB: Cannot handle remapping UNI_SLASH until all the calls to
>> >> > +        *     build_path_from_dentry are modified, as they use slash as
>> >> > +        *     separator.
>> >> > +        */
>> >> > +       switch (le16_to_cpu(src_char)) {
>> >> > +       case UNI_COLON:
>> >> > +               *target = ':';
>> >> > +               break;
>> >> > +       case UNI_ASTERIK:
>> >> > +               *target = '*';
>> >> > +               break;
>> >> > +       case UNI_QUESTION:
>> >> > +               *target = '?';
>> >> > +               break;
>> >> > +       case UNI_PIPE:
>> >> > +               *target = '|';
>> >> > +               break;
>> >> > +       case UNI_GRTRTHAN:
>> >> > +               *target = '>';
>> >> > +               break;
>> >> > +       case UNI_LESSTHAN:
>> >> > +               *target = '<';
>> >> > +               break;
>> >> > +       default:
>> >> > +               goto cp_convert;
>> >> > +       }
>> >> > +
>> >> > +out:
>> >> > +       return len;
>> >> > +
>> >> > +cp_convert:
>> >> > +       len = cp->uni2char(le16_to_cpu(src_char), target,
>> >> > +                          NLS_MAX_CHARSET_SIZE);
>> >> > +       if (len <= 0) {
>> >> > +               *target = '?';
>> >> > +               len = 1;
>> >> > +       }
>> >> > +       goto out;
>> >> > +}
>> >> > +
>> >> > +/*
>> >> > + * cifs_utf16le_to_host - convert utf16le string to local charset
>> >> > + * @to - destination buffer
>> >> > + * @from - source buffer
>> >> > + * @tolen - destination buffer size (in bytes)
>> >> > + * @fromlen - source buffer size (in bytes)
>> >> > + * @codepage - codepage to which characters should be converted
>> >> > + * @mapchar - should characters be remapped according to the mapchars option?
>> >> > + *
>> >> > + * Convert a little-endian utf16le string (as sent by the server) to a string
>> >> > + * in the provided codepage. The tolen and fromlen parameters are to ensure
>> >> > + * that the code doesn't walk off of the end of the buffer (which is always
>> >> > + * a danger if the alignment of the source buffer is off). The destination
>> >> > + * string is always properly null terminated and fits in the destination
>> >> > + * buffer. Returns the length of the destination string in bytes (including
>> >> > + * null terminator).
>> >> > + */
>> >> > +int
>> >> > +cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
>> >> > +                    const struct nls_table *codepage, bool mapchar)
>> >> > +{
>> >> > +       int i, charlen, safelen;
>> >> > +       int outlen = 0;
>> >> > +       int nullsize = null_charlen(codepage);
>> >> > +       int fromwords = fromlen / 2;
>> >>
>> >> I think assumption here is code values are two bytes.  I think that is
>> >> correct in case of UCS-2 encoding
>> >> but in case of UTF-16, the code values can be either two or four bytes.
>> >>
>> >
>> > Can you show me a citation? I thought UTF-16 meant a fixed-length 2
>> > byte encoding.
>> >
>>
>> Jeff,
>>
>> http://unicode.org/faq/utf_bom.html#utf16-1
>>
>> I guess we may not encounter some of those (code values) characters,
>> but again we may.
>>
>> >> > +       char tmp[NLS_MAX_CHARSET_SIZE];
>> >> > +
>> >> > +       /*
>> >> > +        * because the chars can be of varying widths, we need to take care
>> >> > +        * not to overflow the destination buffer when we get close to the
>> >> > +        * end of it. Until we get to this offset, we don't need to check
>> >> > +        * for overflow however.
>> >> > +        */
>> >> > +       safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
>> >>
>> >> Can safelen become negative?  In case of a code value byte stream
>> >> consisting of say two, two byte code values?
>> >>
>> >
>> > Yes. It doesn't matter though. The math where it's checked still works.
>> >
>> >> > +
>> >> > +       for (i = 0; i < fromwords && from[i]; i++) {
>> >> > +               /*
>> >> > +                * check to see if converting this character might make the
>> >> > +                * conversion bleed into the null terminator
>> >> > +                */
>> >> > +               if (outlen >= safelen) {
>> >> > +                       charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
>> >>
>> >> If mapchar is not set, cifs_mapchar is always going to return 1 (since
>> >> uni2char always returns 1)
>> >> in case of no error.
>> >>
>> >
>> > uni2char does not always return 1. In the case of UTF-8, for instance
>> > it returns the width of the character in bytes that it put in the
>> > destination buffer.
>>
>> Jeff, can you please point me to the file where uni2char is coded thus?
>> I did not find a uni2char function returning more that 1 bytes as
>> character length/width,
>> all of them return one, even linux/fs/nls_koi8-u.c  (not sure koi is a
>> Korean charset, I just assumed from koi).
>>
>>
>
> fs/nls/nls_utf8.c
>
> uni2char returns the value returned by utf8_wctomb -- the width of the
> character in bytes. If other charsets are not doing that correctly
> then they are broken.
>
> --
> Jeff Layton <jlayton@redhat.com>
>

I think we may be OK, most of the nls_*.c charset files return 1 byte length for
uni2char and char2uni translation/mapping/encoding and that is probably
correct for that charset, do not know enough about all the charsets.
There are some charset files such as nls_cp932.c and nls_cp936.c which
return 2 bytes as character length sometimes, so looks like for a charset,
uni2char and char2uni functions return appropriate (character
width/length) values.
Steve French April 29, 2009, 4:25 p.m. UTC | #7
On Wed, Apr 29, 2009 at 10:48 AM, Jeff Layton <jlayton@redhat.com> wrote:
> On Wed, 29 Apr 2009 11:30:23 -0400
> Jeff Layton <jlayton@redhat.com> wrote:
>
>> > > +/*
>> > > + * cifs_utf16le_to_host - convert utf16le string to local charset
>> > > + * @to - destination buffer
>> > > + * @from - source buffer
>> > > + * @tolen - destination buffer size (in bytes)
>> > > + * @fromlen - source buffer size (in bytes)
>> > > + * @codepage - codepage to which characters should be converted
>> > > + * @mapchar - should characters be remapped according to the mapchars option?
>> > > + *
>> > > + * Convert a little-endian utf16le string (as sent by the server) to a string
>> > > + * in the provided codepage. The tolen and fromlen parameters are to ensure
>> > > + * that the code doesn't walk off of the end of the buffer (which is always
>> > > + * a danger if the alignment of the source buffer is off). The destination
>> > > + * string is always properly null terminated and fits in the destination
>> > > + * buffer. Returns the length of the destination string in bytes (including
>> > > + * null terminator).
>> > > + */
>> > > +int
>> > > +cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
>> > > +                    const struct nls_table *codepage, bool mapchar)
>> > > +{
>> > > +       int i, charlen, safelen;
>> > > +       int outlen = 0;
>> > > +       int nullsize = null_charlen(codepage);
>> > > +       int fromwords = fromlen / 2;
>> >
>> > I think assumption here is code values are two bytes.  I think that is
>> > correct in case of UCS-2 encoding
>> > but in case of UTF-16, the code values can be either two or four bytes.
>> >
>>
>> Can you show me a citation? I thought UTF-16 meant a fixed-length 2
>> byte encoding.
We can't translate these - they will look like a ? followed by a bogus
character, but they are extremely rare, and only a few could even
generate them (e.g. some Windows, but not most applications, and not
Java).


> Given that, I'll probably rename these functions to have _ucs2le_
> instead of _utf16le_ since that's more accurate.

Let's not worry about the "le" part of the name.  It makes it a longer
name and hard to read (it sounds like you are converting "ucs" (in
host endianness) to little endian ucs, and UCS is driven by Windows
(which is little endian so the little endian part of the name is less
interesting).  Technically you could have a UCS2 big endian but I
don't think it makes sense to name it that way (a little endian vs. a
big endian in the name).

> The good news is that that does not materially change how the buffer
> sizing works here, and that's the immediate problem that we're trying
> to solve with these patches.

right - agreed.
diff mbox

Patch

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 7d75272..aafaf0d 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -26,6 +26,127 @@ 
 #include "cifs_debug.h"
 
 /*
+ * cifs_mapchar - convert a little-endian char to proper char in codepage
+ * @target - where converted character should be copied
+ * @src_char - 2 byte little-endian source character
+ * @cp - codepage to which character should be converted
+ * @mapchar - should character be mapped according to mapchars mount option?
+ *
+ * This function handles the conversion of a single character. It is the
+ * responsibility of the caller to ensure that the target buffer is large
+ * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
+ */
+static int
+cifs_mapchar(char *target, const __le16 src_char, const struct nls_table *cp,
+	     bool mapchar)
+{
+	int len = 1;
+
+	if (!mapchar)
+		goto cp_convert;
+
+	/*
+	 * BB: Cannot handle remapping UNI_SLASH until all the calls to
+	 *     build_path_from_dentry are modified, as they use slash as
+	 *     separator.
+	 */
+	switch (le16_to_cpu(src_char)) {
+	case UNI_COLON:
+		*target = ':';
+		break;
+	case UNI_ASTERIK:
+		*target = '*';
+		break;
+	case UNI_QUESTION:
+		*target = '?';
+		break;
+	case UNI_PIPE:
+		*target = '|';
+		break;
+	case UNI_GRTRTHAN:
+		*target = '>';
+		break;
+	case UNI_LESSTHAN:
+		*target = '<';
+		break;
+	default:
+		goto cp_convert;
+	}
+
+out:
+	return len;
+
+cp_convert:
+	len = cp->uni2char(le16_to_cpu(src_char), target,
+			   NLS_MAX_CHARSET_SIZE);
+	if (len <= 0) {
+		*target = '?';
+		len = 1;
+	}
+	goto out;
+}
+
+/*
+ * cifs_utf16le_to_host - convert utf16le string to local charset
+ * @to - destination buffer
+ * @from - source buffer
+ * @tolen - destination buffer size (in bytes)
+ * @fromlen - source buffer size (in bytes)
+ * @codepage - codepage to which characters should be converted
+ * @mapchar - should characters be remapped according to the mapchars option?
+ *
+ * Convert a little-endian utf16le string (as sent by the server) to a string
+ * in the provided codepage. The tolen and fromlen parameters are to ensure
+ * that the code doesn't walk off of the end of the buffer (which is always
+ * a danger if the alignment of the source buffer is off). The destination
+ * string is always properly null terminated and fits in the destination
+ * buffer. Returns the length of the destination string in bytes (including
+ * null terminator).
+ */
+int
+cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
+		     const struct nls_table *codepage, bool mapchar)
+{
+	int i, charlen, safelen;
+	int outlen = 0;
+	int nullsize = null_charlen(codepage);
+	int fromwords = fromlen / 2;
+	char tmp[NLS_MAX_CHARSET_SIZE];
+
+	/*
+	 * because the chars can be of varying widths, we need to take care
+	 * not to overflow the destination buffer when we get close to the
+	 * end of it. Until we get to this offset, we don't need to check
+	 * for overflow however.
+	 */
+	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
+
+	for (i = 0; i < fromwords && from[i]; i++) {
+		/*
+		 * check to see if converting this character might make the
+		 * conversion bleed into the null terminator
+		 */
+		if (outlen >= safelen) {
+			charlen = cifs_mapchar(tmp, from[i], codepage, mapchar);
+			if (charlen <= 0)
+				charlen = 1;
+			if ((outlen + charlen) > (tolen - nullsize))
+				break;
+		}
+
+		/* put converted char into 'to' buffer */
+		charlen = cifs_mapchar(&to[outlen], from[i], codepage, mapchar);
+		outlen += charlen;
+	}
+
+	/* properly null-terminate string */
+	for (i = 0; i < nullsize; i++)
+		to[outlen++] = 0;
+
+	return outlen;
+}
+
+/*
  * NAME:	cifs_strfromUCS()
  *
  * FUNCTION:	Convert little-endian unicode string to character string
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 2dfae68..e23ef08 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -72,6 +72,8 @@  extern struct UniCaseRange UniLowerRange[];
 #endif				/* UNIUPR_NOLOWER */
 
 #ifdef __KERNEL__
+int cifs_utf16le_to_host(char *to, const __le16 *from, int tolen, int fromlen,
+			 const struct nls_table *codepage, bool mapchar);
 int cifs_strfromUCS_le(char *, const __le16 *, int, const struct nls_table *);
 int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
 #endif