common/smbsrv/smb_utf8.c

	smb_utf8.c revision b819cea2f73f98c5662230cc9affc8cc84f77fcf
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Multibyte/wide-char conversion routines. Wide-char encoding provides
 * a fixed size character encoding that maps to the Unicode 16-bit
 * (UCS-2) character set standard. Multibyte or UCS transformation
 * format (UTF) encoding is a variable length character encoding scheme
 * that is compatible with existing ASCII characters and guarantees that
 * the resultant strings do not contain embedded null characters. Both
 * types of encoding provide a null terminator: single byte for UTF-8
 * and a wide-char null for Unicode. See RFC 2044.
 *
 * The table below illustrates the UTF-8 encoding scheme. The letter x
 * indicates bits available for encoding the character value.
 *
 *  UCS-2           UTF-8 octet sequence (binary)
 *  0x0000-0x007F   0xxxxxxx
 *  0x0080-0x07FF   110xxxxx 10xxxxxx
 *  0x0800-0xFFFF   1110xxxx 10xxxxxx 10xxxxxx
 *
 * RFC 2044
 * UTF-8, a transformation format of UNICODE and ISO 10646
 * F. Yergeau
 * Alis Technologies
 * October 1996
 */

#if defined(_KERNEL) || defined(_FAKE_KERNEL)
#include <sys/types.h>
#include <sys/sunddi.h>
#else
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <strings.h>
#endif
#include <smbsrv/string.h>


/*
 * smb_mbstowcs
 *
 * Convert the null terminated multibyte (UTF-8) string mbstring into
 * wide characters stored at wcstring.  At most nwchars wide characters
 * are stored; when the end of mbstring is reached before that limit, a
 * null wide character is stored as well.
 *
 * Returns the number of wide characters converted, excluding any
 * terminating null wide character.  If an invalid multibyte sequence
 * is found, a null wide character is stored at the current output
 * position and (size_t)-1 is returned.
 */
size_t
smb_mbstowcs(smb_wchar_t *wcstring, const char *mbstring, size_t nwchars)
{
    smb_wchar_t *out = wcstring;
    int consumed;

    for (; nwchars != 0; nwchars--) {
        /* Decode one character straight into the output slot. */
        consumed = smb_mbtowc(out, mbstring, MTS_MB_CHAR_MAX);
        if (consumed < 0) {
            *out = 0;
            return ((size_t)-1);
        }

        /*
         * A null input byte has already been stored as the wide
         * terminator by smb_mbtowc(); it is not counted.
         */
        if (*mbstring == '\0')
            break;

        mbstring += consumed;
        out++;
    }

    return (out - wcstring);
}


/*
 * smb_mbtowc
 *
 * Decode the single multibyte (UTF-8) character at mbchar into a wide
 * character.  The result is stored through wcharp unless wcharp is
 * NULL.  Only 1, 2 and 3 byte encodings are accepted.
 *
 * The nbytes argument is accepted for interface compatibility but is
 * not consulted here: the number of bytes examined is determined by
 * the lead byte alone.
 *
 * If mbchar is NULL, zero is returned to indicate that shift states
 * are not supported.  (Shift states switch between representation
 * modes using reserved bytes; an implementation that needed them
 * would return non-zero for a NULL mbchar.)
 *
 * Otherwise returns the number of bytes consumed from mbchar, which
 * is zero when mbchar points at a null byte, or -1 if the byte
 * sequence is not a valid encoding.
 */
int /*ARGSUSED*/
smb_mbtowc(smb_wchar_t *wcharp, const char *mbchar, size_t nbytes)
{
    const unsigned char *src = (const unsigned char *)mbchar;
    unsigned char lead;
    unsigned char octet;
    smb_wchar_t value;
    int trailing;
    int i;

    if (src == NULL)
        return (0); /* no shift states */

    lead = src[0];

    /* 0xxxxxxx: plain ASCII, including the null terminator */
    if ((lead & 0x80) == 0) {
        if (wcharp != NULL)
            *wcharp = (smb_wchar_t)lead;

        return ((lead != 0) ? 1 : 0);
    }

    if ((lead & 0xe0) == 0xc0) {
        /* 110xxxxx: one continuation byte follows */
        value = lead & 0x1f;
        trailing = 1;
    } else if ((lead & 0xf0) == 0xe0) {
        /* 1110xxxx: two continuation bytes follow */
        value = lead & 0x0f;
        trailing = 2;
    } else {
        /* 10xxxxxx (stray continuation) or 1111xxxx (too long) */
        return (-1);
    }

    /* Each continuation byte must be 10xxxxxx and adds six bits. */
    for (i = 1; i <= trailing; i++) {
        octet = src[i];
        if ((octet & 0xc0) != 0x80)
            return (-1);

        value = (value << 6) | (octet & 0x3f);
    }

    if (wcharp != NULL)
        *wcharp = value;

    return (trailing + 1);
}


/*
 * smb_wctomb
 *
 * Encode the wide character wchar as a multibyte (UTF-8) character
 * and store it at mbchar.  The caller must supply room for the
 * longest encoding produced here, which is three bytes.  No null
 * terminator is appended; a null wide character is encoded as a
 * single zero byte.
 *
 * Returns the number of bytes written to mbchar (1, 2 or 3).
 */
int
smb_wctomb(char *mbchar, smb_wchar_t wchar)
{
    int width;

    /* Pick the shortest encoding able to hold every set bit. */
    if ((wchar & ~0x7f) == 0)
        width = 1;
    else if ((wchar & ~0x7ff) == 0)
        width = 2;
    else
        width = 3;

    switch (width) {
    case 1:
        /* 0xxxxxxx */
        mbchar[0] = (char)wchar;
        break;

    case 2:
        /* 110xxxxx 10xxxxxx */
        mbchar[0] = 0xc0 | (wchar >> 6);
        mbchar[1] = 0x80 | (wchar & 0x3f);
        break;

    default:
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        mbchar[0] = 0xe0 | (wchar >> 12);
        mbchar[1] = 0x80 | ((wchar >> 6) & 0x3f);
        mbchar[2] = 0x80 | (wchar & 0x3f);
        break;
    }

    return (width);
}


/*
 * smb_wcstombs
 *
 * Convert the null terminated wide character string wcstring into a
 * multibyte (UTF-8) string stored at mbstring.  At most nbytes bytes
 * are written to mbstring.  A multibyte character that would only
 * partially fit in the remaining space is not stored.  The result is
 * null terminated if there is room for the terminator.
 *
 * Returns the number of bytes stored in mbstring, not counting the
 * terminating null byte.  Returns 0 if either pointer is NULL.
 */
size_t
smb_wcstombs(char *mbstring, const smb_wchar_t *wcstring, size_t nbytes)
{
    char *start = mbstring;
    const smb_wchar_t *wcp = wcstring;
    smb_wchar_t wide_char;
    char buf[4];
    size_t len;

    if ((mbstring == NULL) || (wcstring == NULL))
        return (0);

    /*
     * Every character is encoded into a scratch buffer first so that
     * its length can be checked against the space left.  This keeps
     * one code path for large and small buffers alike, and keeps the
     * terminator out of the returned count in every case.
     */
    while (nbytes > 0) {
        wide_char = *wcp++;
        if (wide_char == 0) {
            /* End of input: terminate, but do not count it. */
            *mbstring = '\0';
            break;
        }

        len = (size_t)smb_wctomb(buf, wide_char);
        if (len > nbytes) {
            /* Never store a partial multibyte character. */
            *mbstring = '\0';
            break;
        }

        bcopy(buf, mbstring, len);
        mbstring += len;
        nbytes -= len;
    }

    /*LINTED E_PTRDIFF_OVERFLOW*/
    return (mbstring - start);
}


/*
 * smb_wcequiv_strlen
 *
 * Returns the size, in bytes, of the wide character string that the
 * multibyte string mbs would convert to, not counting the terminating
 * null wide character.  Each multibyte character accounts for
 * sizeof (smb_wchar_t) bytes.
 *
 * Returns (size_t)-1 if mbs contains an invalid multibyte character.
 */
size_t
smb_wcequiv_strlen(const char *mbs)
{
    smb_wchar_t wc;
    size_t nchars = 0;
    int consumed;

    while (*mbs != '\0') {
        consumed = smb_mbtowc(&wc, mbs, MTS_MB_CHAR_MAX);
        if (consumed == -1)
            return ((size_t)-1);

        mbs += consumed;
        nchars++;
    }

    return (nchars * sizeof (smb_wchar_t));
}


/*
 * smb_sbequiv_strlen
 *
 * Returns the size, in bytes, of the single byte character string
 * that the multibyte string mbs would convert to, not counting the
 * terminating null character.  A character whose wide value has a
 * non-zero upper byte accounts for sizeof (smb_wchar_t) bytes; any
 * other character accounts for one byte.
 *
 * Returns (size_t)-1 if mbs contains an invalid multibyte character.
 */
size_t
smb_sbequiv_strlen(const char *mbs)
{
    smb_wchar_t wc;
    size_t total = 0;
    int consumed;

    for (;;) {
        if (*mbs == '\0')
            return (total);

        consumed = smb_mbtowc(&wc, mbs, MTS_MB_CHAR_MAX);
        if (consumed == -1)
            return ((size_t)-1);

        total += ((wc & 0xFF00) != 0) ? sizeof (smb_wchar_t) : 1;
        mbs += consumed;
    }
}


/*
 * stombs
 *
 * Convert a regular null terminated string 'string' to a UTF-8 encoded
 * null terminated multi-byte string 'mbstring'. Only full converted
 * UTF-8 characters will be written 'mbstring'. If a character will not
 * fit within the remaining buffer space or 'mbstring' will overflow
 * max_mblen, the conversion process will be terminated and 'mbstring'
 * will be null terminated.
 *
 * Returns the number of bytes written to 'mbstring', excluding the
 * terminating null character.
 *
 * If either mbstring or string is a null pointer, -1 is returned.
 */
int
smb_stombs(char *mbstring, char *string, int max_mblen)
{
    char *start = mbstring;
    unsigned char *p = (unsigned char *)string;
    int space_left = max_mblen;
    int len;
    smb_wchar_t wide_char;
    char buf[4];

    if (!mbstring || !string)
        return (-1);

    while (*p && space_left > 2) {
        wide_char = *p++;
        len = smb_wctomb(mbstring, wide_char);
        mbstring += len;
        space_left -= len;
    }

    if (*p) {
        wide_char = *p;
        if ((len = smb_wctomb(buf, wide_char)) < 2) {
            *mbstring = *buf;
            mbstring += len;
            space_left -= len;
        }
    }

    *mbstring = '\0';

    /*LINTED E_PTRDIFF_OVERFLOW*/
    return (mbstring - start);
}


/*
 * smb_mbstos
 *
 * Convert the null terminated multibyte (UTF-8) string 'mbstring' to
 * a regular null terminated string 'string'.  Each multibyte
 * character is decoded to a wide character first.  If the upper byte
 * of the wide character is zero, only the lower byte is stored.
 * Otherwise the whole wide character is stored, in host byte order,
 * occupying sizeof (smb_wchar_t) bytes.
 *
 * The wide character is copied a byte at a time because 'string' can
 * be at any alignment once a single byte character has been emitted;
 * storing through a cast smb_wchar_t pointer could fault on CPUs
 * that require aligned accesses.
 *
 * NOTE(review): a wide character with a non-zero upper byte and a
 * zero lower byte still places a zero byte in the output.
 *
 * The caller must supply a 'string' buffer large enough for the
 * result; no length is checked here.
 *
 * If the input contains an invalid multibyte character, 'string' is
 * null terminated at the current position and -1 is returned.
 * Otherwise the length of 'string' in bytes, excluding the
 * terminating null character, is returned.
 *
 * If either mbstring or string is a null pointer, -1 is returned.
 */
int
smb_mbstos(char *string, const char *mbstring)
{
    smb_wchar_t wc;
    const unsigned char *wcbytes = (const unsigned char *)&wc;
    unsigned char *start = (unsigned char *)string;
    size_t i;
    int len;

    if (string == NULL || mbstring == NULL)
        return (-1);

    while (*mbstring) {
        if ((len = smb_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
            *string = 0;
            return (-1);
        }

        if (wc & 0xFF00) {
            /* Alignment-safe copy of the in-memory wide char. */
            for (i = 0; i < sizeof (wc); i++)
                *string++ = (char)wcbytes[i];
        } else {
            *string++ = (unsigned char)wc;
        }

        mbstring += len;
    }

    *string = 0;

    /*LINTED E_PTRDIFF_OVERFLOW*/
    return ((unsigned char *)string - start);
}