/* libsmbfs/common/smbfs_utf_str.c */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Unicode conversions (yet more)
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>
#include <iconv.h>
#include <libintl.h>

#include <sys/u8_textprep.h>

#include "smbfs_lib.h"
#include "smbfs_charsets.h"

/*
 * Forward declarations of the file-local iconv workers defined further
 * down.  Each takes a conversion descriptor plus the zero-terminated
 * input string and returns the converted string.
 */
static char *smbfs_convert_ucs2xx_to_utf8(iconv_t, const uint16_t *);
static uint16_t *smbfs_convert_utf8_to_ucs2xx(iconv_t, const char *);

/*
 * Count the 16-bit symbols in a zero-terminated Unicode string.
 * The 2-byte terminator itself is not counted; double the result
 * to get the storage size in bytes (again without the terminator).
 */
size_t
smbfs_unicode_strlen(const uint16_t *us)
{
    const uint16_t *end;

    /* Walk to the terminating zero unit, then measure the distance. */
    for (end = us; *end != 0; end++)
        ;
    return ((size_t)(end - us));
}

/*
 * Convert a (native) Unicode string to UTF-8.
 *
 * The "UCS-2" to "UTF-8" conversion descriptor is opened on the first
 * call and kept in a function-local static for later calls.  The real
 * work is done by smbfs_convert_ucs2xx_to_utf8(), whose result is
 * returned unchanged: allocated memory, or NULL if the descriptor
 * could not be opened or memory could not be allocated.
 */
char *
smbfs_convert_unicode_to_utf8(uint16_t *us)
{
    static iconv_t ucs2_to_utf8 = (iconv_t)-1;
    char *utf8;

    /* iconv_open() takes (to, from). */
    if (ucs2_to_utf8 == (iconv_t)-1)
        ucs2_to_utf8 = iconv_open("UTF-8", "UCS-2");

    utf8 = smbfs_convert_ucs2xx_to_utf8(ucs2_to_utf8, us);
    return (utf8);
}

/*
 * Convert a zero-terminated UCS-2 string to UTF-8 with the supplied
 * conversion descriptor.
 *
 * cd   descriptor obtained from iconv_open(); (iconv_t)-1 is reported
 *      through smbfs_error() and yields NULL.
 * us   zero-terminated UCS-2 input; NULL yields NULL.
 *
 * Returns a malloc'ed, NUL-terminated UTF-8 string that the caller
 * must free, or NULL on a bad descriptor, NULL input or allocation
 * failure.  If iconv() stops before all input is consumed the problem
 * is reported once through smbfs_error() and the part converted so
 * far is returned.
 */
static char *
smbfs_convert_ucs2xx_to_utf8(iconv_t cd, const uint16_t *us)
{
    char *obuf, *optr;
    char *iptr;
    size_t ileft, oleft, ret;
    int err;

    if (cd == (iconv_t)-1) {
        smbfs_error(dgettext(TEXT_DOMAIN,
            "iconv_open(UTF-8/UCS-2)"), -1);
        return (NULL);
    }
    if (us == NULL)
        return (NULL);

    iptr = (char *)us;
    ileft = smbfs_unicode_strlen(us) * 2;   /* symbols -> bytes */

    /*
     * One 2-byte UCS-2 unit becomes at most 3 bytes of UTF-8, so
     * twice the input size is always enough.  Two more bytes leave
     * room for the terminator.
     */
    oleft = ileft * 2;
    obuf = malloc(oleft + 2);
    if (obuf == NULL)
        return (NULL);
    optr = obuf;

    /*
     * Callers cache and reuse the descriptor, so put it back into
     * its initial state; otherwise whatever an earlier (possibly
     * failed) conversion left behind could affect this string.
     */
    (void) iconv(cd, NULL, NULL, NULL, NULL);

    ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
    err = errno;    /* save it before dgettext() can disturb it */
    *optr = '\0';

    if (ret == (size_t)-1) {
        smbfs_error(dgettext(TEXT_DOMAIN,
            "iconv(%s) failed"), err, obuf);
    } else if (ileft != 0) {
        /* No error was flagged, yet some input was left over. */
        smbfs_error(dgettext(TEXT_DOMAIN,
            "iconv(%s) failed"), -1, obuf);
    }

    /*
     * XXX: What's better?  return NULL?
     * The truncated string? << for now
     */
    return (obuf);
}

/*
 * Convert a UTF-8 string to little-endian Unicode.
 *
 * The "UTF-8" to "UCS-2LE" conversion descriptor is opened on the
 * first call and kept in a function-local static for later calls.
 * The real work is done by smbfs_convert_utf8_to_ucs2xx(), whose
 * result is returned unchanged: allocated memory, or NULL if the
 * descriptor could not be opened or memory could not be allocated.
 */
uint16_t *
smbfs_convert_utf8_to_leunicode(const char *utf8_string)
{
    static iconv_t utf8_to_ucs2le = (iconv_t)-1;
    uint16_t *ucs2;

    /* iconv_open() takes (to, from). */
    if (utf8_to_ucs2le == (iconv_t)-1)
        utf8_to_ucs2le = iconv_open("UCS-2LE", "UTF-8");

    ucs2 = smbfs_convert_utf8_to_ucs2xx(utf8_to_ucs2le, utf8_string);
    return (ucs2);
}

/*
 * Convert a NUL-terminated UTF-8 string to UCS-2 with the supplied
 * conversion descriptor.
 *
 * cd           descriptor obtained from iconv_open(); (iconv_t)-1 is
 *              reported through smbfs_error() and yields NULL.
 * utf8_string  NUL-terminated input; NULL yields NULL.
 *
 * Returns a malloc'ed UCS-2 string ending in a 2-byte zero that the
 * caller must free, or NULL on a bad descriptor, NULL input or
 * allocation failure.  If iconv() stops before all input is consumed
 * the problem is reported once through smbfs_error() and the part
 * converted so far is returned.
 */
static uint16_t *
smbfs_convert_utf8_to_ucs2xx(iconv_t cd, const char *utf8_string)
{
    uint16_t *obuf;
    char *iptr, *optr;
    size_t ileft, oleft, ret;
    int err;

    if (cd == (iconv_t)-1) {
        smbfs_error(dgettext(TEXT_DOMAIN,
            "iconv_open(UCS-2/UTF-8)"), -1);
        return (NULL);
    }
    if (utf8_string == NULL)
        return (NULL);

    iptr = (char *)utf8_string;
    ileft = strlen(utf8_string);

    /*
     * Every input byte yields at most one 2-byte UCS-2 unit (the
     * ASCII case), so twice the input size is always enough.  Two
     * more bytes leave room for the terminator.
     */
    oleft = ileft * 2;
    obuf = malloc(oleft + 2);
    if (obuf == NULL)
        return (NULL);

    /*
     * iconv() wants a char ** cursor.  Use a real char * rather
     * than casting the address of a uint16_t * variable.
     */
    optr = (char *)obuf;

    /*
     * Callers cache and reuse the descriptor, so put it back into
     * its initial state; otherwise whatever an earlier (possibly
     * failed) conversion left behind could affect this string.
     */
    (void) iconv(cd, NULL, NULL, NULL, NULL);

    ret = iconv(cd, &iptr, &ileft, &optr, &oleft);
    err = errno;    /* save it before dgettext() can disturb it */

    /* 2-byte terminator at the current output position. */
    optr[0] = '\0';
    optr[1] = '\0';

    if (ret == (size_t)-1) {
        smbfs_error(dgettext(TEXT_DOMAIN,
            "iconv(%s) failed"), err, utf8_string);
    } else if (ileft != 0) {
        /* No error was flagged, yet some input was left over. */
        smbfs_error(dgettext(TEXT_DOMAIN,
            "iconv(%s) failed"), -1, utf8_string);
    }

    /*
     * XXX: What's better?  return NULL?
     * The truncated string? << for now
     */
    return (obuf);
}


/*
 * A simple wrapper around u8_textprep_str() that returns a
 * case-converted copy of a UTF-8 string.  Borrowed from idmapd.
 *
 * s            NUL-terminated UTF-8 input; NULL yields NULL.
 * upper_lower  flag handed straight to u8_textprep_str(); the callers
 *              in this file pass U8_TEXTPREP_TOUPPER or
 *              U8_TEXTPREP_TOLOWER.
 *
 * Returns a malloc'ed, NUL-terminated string that the caller must
 * free, or NULL if memory runs out or the conversion fails.
 */
static char *
smbfs_utf8_str_to_upper_or_lower(const char *s, int upper_lower)
{
    char *res, *grown, *outs;
    size_t inbleft, outlen, outbleft, offered;
    size_t rc;
    int err;

    if (s == NULL)
        return (NULL);

    /*
     * u8_textprep_str() does not allocate memory.  The input and
     * output buffers may differ in size (though that would be more
     * likely when normalization is done).  We have to loop over it,
     * growing the buffer whenever it reports E2BIG.
     *
     * To improve the chances that we can avoid looping we add 10
     * bytes of output buffer room the first go around.  One more
     * byte, not counted in outlen, is always kept for the NUL.
     */
    inbleft = strlen(s);
    outlen = outbleft = inbleft + 10;
    if ((res = malloc(outlen + 1)) == NULL)
        return (NULL);
    outs = res;

    for (;;) {
        /* Remember what we offer so we can tell what got consumed. */
        offered = inbleft;
        rc = u8_textprep_str((char *)s, &inbleft, outs, &outbleft,
            upper_lower, U8_UNICODE_LATEST, &err);
        if (rc != (size_t)-1 || err != E2BIG)
            break;

        /*
         * Grow by the amount of input still pending.  Keep the old
         * pointer until realloc() succeeds so it is not leaked.
         */
        grown = realloc(res, outlen + inbleft + 1);
        if (grown == NULL) {
            free(res);
            return (NULL);
        }
        res = grown;

        /* adjust input/output buffer pointers */
        s += offered - inbleft;
        outs = res + (outlen - outbleft);
        /* adjust outbleft and outlen */
        outlen += inbleft;
        outbleft += inbleft;
    }

    if (rc == (size_t)-1) {
        free(res);
        return (NULL);
    }

    /* outlen - outbleft bytes were written; the spare byte holds NUL. */
    res[outlen - outbleft] = '\0';

    return (res);
}

/*
 * Return an upper-cased copy of the UTF-8 string s, produced by
 * smbfs_utf8_str_to_upper_or_lower() with U8_TEXTPREP_TOUPPER.
 * The helper's result (malloc'ed memory or NULL) is passed through.
 */
char *
smbfs_utf8_str_toupper(const char *s)
{
    char *upper;

    upper = smbfs_utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOUPPER);
    return (upper);
}

/*
 * Return a lower-cased copy of the UTF-8 string s, produced by
 * smbfs_utf8_str_to_upper_or_lower() with U8_TEXTPREP_TOLOWER.
 * The helper's result (malloc'ed memory or NULL) is passed through.
 */
char *
smbfs_utf8_str_tolower(const char *s)
{
    char *lower;

    lower = smbfs_utf8_str_to_upper_or_lower(s, U8_TEXTPREP_TOLOWER);
    return (lower);
}

/*
 * On Solaris, we will need to do some rewriting to use our iconv
 * routines for the conversions.  For now, we're effectively
 * stubbing out code, leaving the details of what happens on
 * Darwin in case it's useful as a guide later.
 */

/*
 * Value of a single hexadecimal digit.
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F', and 16 for every
 * other character, so callers can test for "> 15" to detect an
 * invalid digit.
 *
 * Plain range comparisons are used instead of the <ctype.h>
 * classifiers: those are undefined for negative char values and
 * depend on the locale, and they also accept letters beyond 'f'.
 */
static unsigned
smbfs_xtoi(char u)
{
    if (u >= '0' && u <= '9')
        return ((unsigned)(u - '0'));
    if (u >= 'a' && u <= 'f')
        return ((unsigned)(10 + (u - 'a')));
    if (u >= 'A' && u <= 'F')
        return ((unsigned)(10 + (u - 'A')));
    return (16);
}


/*
 * Removes the "%" escape sequences from a URL component.
 * See IETF RFC 2396.
 *
 * The component is rewritten in place: every "%XY" with two valid
 * hex digits is replaced by the single byte it encodes, and anything
 * else (including a '%' not followed by two hex digits) is kept as
 * is.  A decoded byte is never examined again, so "%2541" becomes
 * "%41", not "A".
 *
 * Returns its argument; a NULL component is returned untouched.
 */
char *
smbfs_unpercent(char *component)
{
    char *src, *dst;
    unsigned hi, lo;

    if (component == NULL)
        return (component);

    /*
     * Compact with separate read and write cursors.  The output never
     * gets ahead of the input, so one pass over the string suffices
     * and no overlapping tail copy is needed for each escape.
     */
    src = dst = component;
    while (*src != '\0') {
        /*
         * The && chain stops at the first invalid digit, so src[2]
         * is never read when src[1] is the terminator.
         */
        if (src[0] == '%' &&
            (hi = smbfs_xtoi(src[1])) <= 15 &&
            (lo = smbfs_xtoi(src[2])) <= 15) {
            *dst++ = hi*16 + lo;
            src += 3;
        } else {
            *dst++ = *src++;
        }
    }
    *dst = '\0';

    return (component);
}

/*
 * Convert a string in the Windows character set to UTF-8.
 * No conversion is done yet; the input is simply duplicated.
 *
 * Returns malloc'ed memory that the caller must free, or NULL if
 * the input is NULL or memory could not be allocated.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
smbfs_convert_wincs_to_utf8(const char *windows_string)
{
    /* strdup() of a null pointer is undefined; fail cleanly instead. */
    if (windows_string == NULL)
        return (NULL);

    return (strdup(windows_string));
}

/*
 * Convert a UTF-8 string to the Windows character set.
 * No conversion is done yet; the input is simply duplicated.
 *
 * Returns malloc'ed memory that the caller must free, or NULL if
 * the input is NULL or memory could not be allocated.
 *
 * XXX - NLS, or CF?  We should probably use the same routine for all
 * conversions.
 */
char *
smbfs_convert_utf8_to_wincs(const char *utf8_string)
{
    /* strdup() of a null pointer is undefined; fail cleanly instead. */
    if (utf8_string == NULL)
        return (NULL);

    return (strdup(utf8_string));
}