src/libcroco/cr-utils.c

/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */

/*
 * This file is part of The Croco Library
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2.1 of the GNU Lesser General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 * Author: Dodji Seketeli
 * See COPYRIGHTS file for copyright information.
 */

#include "cr-utils.h"
#include "cr-string.h"

/**
 *@file:
 *Some misc utility functions used
 *in the libcroco.
 *Note that troughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */

/****************************
 *Encoding transformations and
 *encoding helpers
 ****************************/

/*
 *Here is the correspondance between the ucs-4 charactere codes
 *and there matching utf-8 encoding pattern as dscribed by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */

/**
 *Given an utf8 string buffer, calculates
 *the length of this string if it was encoded
 *in ucs4.
 *@param a_in_start a pointer to the begining of
 *the input utf8 string.
 *@param a_in_end a pointre to the end of the input
 *utf8 string (points to the last byte of the buffer)
 *@param a_len out parameter the calculated length.
 *@return CR_OK upon succesfull completion, an error code
 *otherwise.
 */
enum CRStatus
cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
                               const guchar * a_in_end, gulong * a_len)
{
        guchar *byte_ptr = NULL;
        gint len = 0;

        /*
         *to store the final decoded
         *unicode char
         */
        guint c = 0;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);
        *a_len = 0;

        for (byte_ptr = (guchar *) a_in_start;
             byte_ptr <= a_in_end; byte_ptr++) {
                gint nb_bytes_2_decode = 0;

                if (*byte_ptr <= 0x7F) {
                        /*
                         *7 bits long char
                         *encoded over 1 byte:
                         * 0xxx xxxx
                         */
                        c = *byte_ptr;
                        nb_bytes_2_decode = 1;

                } else if ((*byte_ptr & 0xE0) == 0xC0) {
                        /*
                         *up to 11 bits long char.
                         *encoded over 2 bytes:
                         *110x xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 0x1F;
                        nb_bytes_2_decode = 2;

                } else if ((*byte_ptr & 0xF0) == 0xE0) {
                        /*
                         *up to 16 bit long char
                         *encoded over 3 bytes:
                         *1110 xxxx  10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 0x0F;
                        nb_bytes_2_decode = 3;

                } else if ((*byte_ptr & 0xF8) == 0xF0) {
                        /*
                         *up to 21 bits long char
                         *encoded over 4 bytes:
                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 0x7;
                        nb_bytes_2_decode = 4;

                } else if ((*byte_ptr & 0xFC) == 0xF8) {
                        /*
                         *up to 26 bits long char
                         *encoded over 5 bytes.
                         *1111 10xx  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 3;
                        nb_bytes_2_decode = 5;

                } else if ((*byte_ptr & 0xFE) == 0xFC) {
                        /*
                         *up to 31 bits long char
                         *encoded over 6 bytes:
                         *1111 110x  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 1;
                        nb_bytes_2_decode = 6;

                } else {
                        /*
                         *BAD ENCODING
                         */
                        return CR_ENCODING_ERROR;
                }

                /*
                 *Go and decode the remaining byte(s)
                 *(if any) to get the current character.
                 */
                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
                        /*decode the next byte */
                        byte_ptr++;

                        /*byte pattern must be: 10xx xxxx */
                        if ((*byte_ptr & 0xC0) != 0x80) {
                                return CR_ENCODING_ERROR;
                        }

                        c = (c << 6) | (*byte_ptr & 0x3F);
                }

                len++;
        }

        *a_len = len;

        return CR_OK;
}

/**
 *Given an ucs4 string, this function
 *returns the size (in bytes) this string
 *would have occupied if it was encoded in utf-8.
 *@param a_in_start a pointer to the beginning of the input
 *buffer.
 *@param a_in_end a pointer to the end of the input buffer.
 *@param a_len out parameter. The computed length.
 *@return CR_OK upon successfull completion, an error code otherwise.
 */
enum CRStatus
cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
                               const guint32 * a_in_end, gulong * a_len)
{
        gint len = 0;
        guint32 *char_ptr = NULL;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);

        for (char_ptr = (guint32 *) a_in_start;
             char_ptr <= a_in_end; char_ptr++) {
                if (*char_ptr <= 0x7F) {
                        /*the utf-8 char would take 1 byte */
                        len += 1;
                } else if (*char_ptr <= 0x7FF) {
                        /*the utf-8 char would take 2 bytes */
                        len += 2;
                } else if (*char_ptr <= 0xFFFF) {
                        len += 3;
                } else if (*char_ptr <= 0x1FFFFF) {
                        len += 4;
                } else if (*char_ptr <= 0x3FFFFFF) {
                        len += 5;
                } else if (*char_ptr <= 0x7FFFFFFF) {
                        len += 6;
                }
        }

        *a_len = len;
        return CR_OK;
}

/**
 *Given an ucsA string, this function
 *returns the size (in bytes) this string
 *would have occupied if it was encoded in utf-8.
 *@param a_in_start a pointer to the beginning of the input
 *buffer.
 *@param a_in_end a pointer to the end of the input buffer.
 *@param a_len out parameter. The computed length.
 *@return CR_OK upon successfull completion, an error code otherwise.
 */
enum CRStatus
cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
                               const guchar * a_in_end, gulong * a_len)
{
        gint len = 0;
        guchar *char_ptr = NULL;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);

        for (char_ptr = (guchar *) a_in_start;
             char_ptr <= a_in_end; char_ptr++) {
                if (*char_ptr <= 0x7F) {
                        /*the utf-8 char would take 1 byte */
                        len += 1;
                } else {
                        /*the utf-8 char would take 2 bytes */
                        len += 2;
                }
        }

        *a_len = len;
        return CR_OK;
}

/**
 *Converts an utf8 buffer into an ucs4 buffer.
 *
 *@param a_in the input utf8 buffer to convert.
 *@param a_in_len in/out parameter. The size of the
 *input buffer to convert. After return, this parameter contains
 *the actual number of bytes consumed.
 *@param a_out the output converted ucs4 buffer. Must be allocated by
 *the caller.
 *@param a_out_len in/out parameter. The size of the output buffer.
 *If this size is actually smaller than the real needed size, the function
 *just converts what it can and returns a success status. After return,
 *this param points to the actual number of characters decoded.
 *@return CR_OK upon successfull completion, an error code otherwise.
 */
enum CRStatus
cr_utils_utf8_to_ucs4 (const guchar * a_in,
                       gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
{
        gulong in_len = 0,
                out_len = 0,
                in_index = 0,
                out_index = 0;
        enum CRStatus status = CR_OK;

        /*
         *to store the final decoded
         *unicode char
         */
        guint c = 0;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        if (*a_in_len < 1) {
                status = CR_OK;
                goto end;
        }

        in_len = *a_in_len;
        out_len = *a_out_len;

        for (in_index = 0, out_index = 0;
             (in_index < in_len) && (out_index < out_len);
             in_index++, out_index++) {
                gint nb_bytes_2_decode = 0;

                if (a_in[in_index] <= 0x7F) {
                        /*
                         *7 bits long char
                         *encoded over 1 byte:
                         * 0xxx xxxx
                         */
                        c = a_in[in_index];
                        nb_bytes_2_decode = 1;

                } else if ((a_in[in_index] & 0xE0) == 0xC0) {
                        /*
                         *up to 11 bits long char.
                         *encoded over 2 bytes:
                         *110x xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 0x1F;
                        nb_bytes_2_decode = 2;

                } else if ((a_in[in_index] & 0xF0) == 0xE0) {
                        /*
                         *up to 16 bit long char
                         *encoded over 3 bytes:
                         *1110 xxxx  10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 0x0F;
                        nb_bytes_2_decode = 3;

                } else if ((a_in[in_index] & 0xF8) == 0xF0) {
                        /*
                         *up to 21 bits long char
                         *encoded over 4 bytes:
                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 0x7;
                        nb_bytes_2_decode = 4;

                } else if ((a_in[in_index] & 0xFC) == 0xF8) {
                        /*
                         *up to 26 bits long char
                         *encoded over 5 bytes.
                         *1111 10xx  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 3;
                        nb_bytes_2_decode = 5;

                } else if ((a_in[in_index] & 0xFE) == 0xFC) {
                        /*
                         *up to 31 bits long char
                         *encoded over 6 bytes:
                         *1111 110x  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 1;
                        nb_bytes_2_decode = 6;

                } else {
                        /*BAD ENCODING */
                        goto end;
                }

                /*
                 *Go and decode the remaining byte(s)
                 *(if any) to get the current character.
                 */
                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
                        /*decode the next byte */
                        in_index++;

                        /*byte pattern must be: 10xx xxxx */
                        if ((a_in[in_index] & 0xC0) != 0x80) {
                                goto end;
                        }

                        c = (c << 6) | (a_in[in_index] & 0x3F);
                }

                /*
                 *The decoded ucs4 char is now
                 *in c.
                 */

                /************************
                 *Some security tests
                 ***********************/

                /*be sure c is a char */
                if (c == 0xFFFF || c == 0xFFFE)
                        goto end;

                /*be sure c is inferior to the max ucs4 char value */
                if (c > 0x10FFFF)
                        goto end;

                /*
                 *c must be less than UTF16 "lower surrogate begin"
                 *or higher than UTF16 "High surrogate end"
                 */
                if (c >= 0xD800 && c <= 0xDFFF)
                        goto end;

                /*Avoid characters that equals zero */
                if (c == 0)
                        goto end;

                a_out[out_index] = c;
        }

      end:
        *a_out_len = out_index + 1;
        *a_in_len = in_index + 1;

        return status;
}

/**
 *Reads a character from an utf8 buffer.
 *Actually decode the next character code (unicode character code)
 *and returns it.
 *@param a_in the starting address of the utf8 buffer.
 *@param a_in_len the length of the utf8 buffer.
 *@param a_out output parameter. The resulting read char.
 *@param a_consumed the number of the bytes consumed to
 *decode the returned character code.
 *@return CR_OK upon successfull completion, an error code otherwise.
 */
enum CRStatus
cr_utils_read_char_from_utf8_buf (const guchar * a_in,
                                  gulong a_in_len,
                                  guint32 * a_out, gulong * a_consumed)
{
        gulong in_index = 0,
               nb_bytes_2_decode = 0;
        enum CRStatus status = CR_OK;

        /*
         *to store the final decoded
         *unicode char
         */
        guint32 c = 0;

        g_return_val_if_fail (a_in && a_out && a_out
                              && a_consumed, CR_BAD_PARAM_ERROR);

        if (a_in_len < 1) {
                status = CR_OK;
                goto end;
        }

        if (*a_in <= 0x7F) {
                /*
                 *7 bits long char
                 *encoded over 1 byte:
                 * 0xxx xxxx
                 */
                c = *a_in;
                nb_bytes_2_decode = 1;

        } else if ((*a_in & 0xE0) == 0xC0) {
                /*
                 *up to 11 bits long char.
                 *encoded over 2 bytes:
                 *110x xxxx  10xx xxxx
                 */
                c = *a_in & 0x1F;
                nb_bytes_2_decode = 2;

        } else if ((*a_in & 0xF0) == 0xE0) {
                /*
                 *up to 16 bit long char
                 *encoded over 3 bytes:
                 *1110 xxxx  10xx xxxx  10xx xxxx
                 */
                c = *a_in & 0x0F;
                nb_bytes_2_decode = 3;

        } else if ((*a_in & 0xF8) == 0xF0) {
                /*
                 *up to 21 bits long char
                 *encoded over 4 bytes:
                 *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
                 */
                c = *a_in & 0x7;
                nb_bytes_2_decode = 4;

        } else if ((*a_in & 0xFC) == 0xF8) {
                /*
                 *up to 26 bits long char
                 *encoded over 5 bytes.
                 *1111 10xx  10xx xxxx  10xx xxxx
                 *10xx xxxx  10xx xxxx
                 */
                c = *a_in & 3;
                nb_bytes_2_decode = 5;

        } else if ((*a_in & 0xFE) == 0xFC) {
                /*
                 *up to 31 bits long char
                 *encoded over 6 bytes:
                 *1111 110x  10xx xxxx  10xx xxxx
                 *10xx xxxx  10xx xxxx  10xx xxxx
                 */
                c = *a_in & 1;
                nb_bytes_2_decode = 6;

        } else {
                /*BAD ENCODING */
                goto end;
        }

        if (nb_bytes_2_decode > a_in_len) {
                status = CR_END_OF_INPUT_ERROR;
                goto end;
        }

        /*
         *Go and decode the remaining byte(s)
         *(if any) to get the current character.
         */
        for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
                /*byte pattern must be: 10xx xxxx */
                if ((a_in[in_index] & 0xC0) != 0x80) {
                        goto end;
                }

                c = (c << 6) | (a_in[in_index] & 0x3F);
        }

        /*
         *The decoded ucs4 char is now
         *in c.
         */

    /************************
     *Some security tests
     ***********************/

        /*be sure c is a char */
        if (c == 0xFFFF || c == 0xFFFE)
                goto end;

        /*be sure c is inferior to the max ucs4 char value */
        if (c > 0x10FFFF)
                goto end;

        /*
         *c must be less than UTF16 "lower surrogate begin"
         *or higher than UTF16 "High surrogate end"
         */
        if (c >= 0xD800 && c <= 0xDFFF)
                goto end;

        /*Avoid characters that equals zero */
        if (c == 0)
                goto end;

        *a_out = c;

      end:
        *a_consumed = nb_bytes_2_decode;

        return status;
}

/**
 *
 */
enum CRStatus
cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
                               const guchar * a_in_end, gulong * a_len)
{
        /*
         *Note: this function can be made shorter
         *but it considers all the cases of the utf8 encoding
         *to ease further extensions ...
         */

        guchar *byte_ptr = NULL;
        gint len = 0;

        /*
         *to store the final decoded
         *unicode char
         */
        guint c = 0;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);
        *a_len = 0;

        for (byte_ptr = (guchar *) a_in_start;
             byte_ptr <= a_in_end; byte_ptr++) {
                gint nb_bytes_2_decode = 0;

                if (*byte_ptr <= 0x7F) {
                        /*
                         *7 bits long char
                         *encoded over 1 byte:
                         * 0xxx xxxx
                         */
                        c = *byte_ptr;
                        nb_bytes_2_decode = 1;

                } else if ((*byte_ptr & 0xE0) == 0xC0) {
                        /*
                         *up to 11 bits long char.
                         *encoded over 2 bytes:
                         *110x xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 0x1F;
                        nb_bytes_2_decode = 2;

                } else if ((*byte_ptr & 0xF0) == 0xE0) {
                        /*
                         *up to 16 bit long char
                         *encoded over 3 bytes:
                         *1110 xxxx  10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 0x0F;
                        nb_bytes_2_decode = 3;

                } else if ((*byte_ptr & 0xF8) == 0xF0) {
                        /*
                         *up to 21 bits long char
                         *encoded over 4 bytes:
                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 0x7;
                        nb_bytes_2_decode = 4;

                } else if ((*byte_ptr & 0xFC) == 0xF8) {
                        /*
                         *up to 26 bits long char
                         *encoded over 5 bytes.
                         *1111 10xx  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 3;
                        nb_bytes_2_decode = 5;

                } else if ((*byte_ptr & 0xFE) == 0xFC) {
                        /*
                         *up to 31 bits long char
                         *encoded over 6 bytes:
                         *1111 110x  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = *byte_ptr & 1;
                        nb_bytes_2_decode = 6;

                } else {
                        /*
                         *BAD ENCODING
                         */
                        return CR_ENCODING_ERROR;
                }

                /*
                 *Go and decode the remaining byte(s)
                 *(if any) to get the current character.
                 */
                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
                        /*decode the next byte */
                        byte_ptr++;

                        /*byte pattern must be: 10xx xxxx */
                        if ((*byte_ptr & 0xC0) != 0x80) {
                                return CR_ENCODING_ERROR;
                        }

                        c = (c << 6) | (*byte_ptr & 0x3F);
                }

                /*
                 *The decoded ucs4 char is now
                 *in c.
                 */

                if (c <= 0xFF) { /*Add other conditions to support
                                  *other char sets (ucs2, ucs3, ucs4).
                                  */
                        len++;
                } else {
                        /*the char is too long to fit
                         *into the supposed charset len.
                         */
                        return CR_ENCODING_ERROR;
                }
        }

        *a_len = len;

        return CR_OK;
}

/**
 *Converts an utf8 string into an ucs4 string.
 *@param a_in the input string to convert.
 *@param a_in_len in/out parameter. The length of the input
 *string. After return, points to the actual number of bytes
 *consumed. This can be usefull to debug the input stream in case
 *of encoding error.
 *@param a_out out parameter. Points to the output string. It is allocated
 *by this function and must be freed by the caller.
 *@param a_out_len out parameter. The length of the output string.
 *@return CR_OK upon successfull completion, an error code otherwise.
 *
 */
enum CRStatus
cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
                           gulong * a_in_len,
                           guint32 ** a_out, gulong * a_out_len)
{
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        status = cr_utils_utf8_str_len_as_ucs4 (a_in,
                                                &a_in[*a_in_len - 1],
                                                a_out_len);

        g_return_val_if_fail (status == CR_OK, status);

        *a_out = (guint32 *) g_malloc0 (*a_out_len * sizeof (guint32));

        status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);

        return status;
}

/**
 *Converts an ucs4 buffer into an utf8 buffer.
 *
 *@param a_in the input ucs4 buffer to convert.
 *@param a_in_len in/out parameter. The size of the
 *input buffer to convert. After return, this parameter contains
 *the actual number of characters consumed.
 *@param a_out the output converted utf8 buffer. Must be allocated by
 *the caller.
 *@param a_out_len in/out parameter. The size of the output buffer.
 *If this size is actually smaller than the real needed size, the function
 *just converts what it can and returns a success status. After return,
 *this param points to the actual number of bytes in the buffer.
 *@return CR_OK upon successfull completion, an error code otherwise.
 */
enum CRStatus
cr_utils_ucs4_to_utf8 (const guint32 * a_in,
                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
{
        gulong in_len = 0,
                in_index = 0,
                out_index = 0;
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
                              CR_BAD_PARAM_ERROR);

        if (*a_in_len < 1) {
                status = CR_OK;
                goto end;
        }

        in_len = *a_in_len;

        for (in_index = 0; in_index < in_len; in_index++) {
                /*
                 *FIXME: return whenever we encounter forbidden char values.
                 */

                if (a_in[in_index] <= 0x7F) {
                        a_out[out_index] = a_in[in_index];
                        out_index++;
                } else if (a_in[in_index] <= 0x7FF) {
                        a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
                        a_out[out_index + 1] =
                                (0x80 | (a_in[in_index] & 0x3F));
                        out_index += 2;
                } else if (a_in[in_index] <= 0xFFFF) {
                        a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
                        a_out[out_index + 1] =
                                (0x80 | ((a_in[in_index] >> 6) & 0x3F));
                        a_out[out_index + 2] =
                                (0x80 | (a_in[in_index] & 0x3F));
                        out_index += 3;
                } else if (a_in[in_index] <= 0x1FFFFF) {
                        a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
                        a_out[out_index + 1]
                                = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
                        a_out[out_index + 2]
                                = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
                        a_out[out_index + 3]
                                = (0x80 | (a_in[in_index] & 0x3F));
                        out_index += 4;
                } else if (a_in[in_index] <= 0x3FFFFFF) {
                        a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
                        a_out[out_index + 1] =
                                (0x80 | (a_in[in_index] >> 18));
                        a_out[out_index + 2]
                                = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
                        a_out[out_index + 3]
                                = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
                        a_out[out_index + 4]
                                = (0x80 | (a_in[in_index] & 0x3F));
                        out_index += 5;
                } else if (a_in[in_index] <= 0x7FFFFFFF) {
                        a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
                        a_out[out_index + 1] =
                                (0x80 | (a_in[in_index] >> 24));
                        a_out[out_index + 2]
                                = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
                        a_out[out_index + 3]
                                = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
                        a_out[out_index + 4]
                                = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
                        a_out[out_index + 4]
                                = (0x80 | (a_in[in_index] & 0x3F));
                        out_index += 6;
                } else {
                        status = CR_ENCODING_ERROR;
                        goto end;
                }
        }                       /*end for */

      end:
        *a_in_len = in_index + 1;
        *a_out_len = out_index + 1;

        return status;
}

/**
 *Converts an ucs4 string into an utf8 string.
 *@param a_in the input string to convert.
 *@param a_in_len in/out parameter. The length of the input
 *string. After return, points to the actual number of characters
 *consumed. This can be usefull to debug the input string in case
 *of encoding error.
 *@param a_out out parameter. Points to the output string. It is allocated
 *by this function and must be freed by the caller.
 *@param a_out_len out parameter. The length (in bytes) of the output string.
 *@return CR_OK upon successfull completion, an error code otherwise.
 */
enum CRStatus
cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
                           gulong * a_in_len,
                           guchar ** a_out, gulong * a_out_len)
{
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len && a_out
                              && a_out_len, CR_BAD_PARAM_ERROR);

        status = cr_utils_ucs4_str_len_as_utf8 (a_in,
                                                &a_in[*a_out_len - 1],
                                                a_out_len);

        g_return_val_if_fail (status == CR_OK, status);

        status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);

        return status;
}

/**
 *Converts an ucs1 buffer into an utf8 buffer.
 *The caller must know the size of the resulting buffer and
 *allocate it prior to calling this function.
 *
 *@param a_in the input ucs1 buffer.
 *
 *@param a_in_len in/out parameter. The length of the input buffer.
 *After return, points to the number of bytes actually consumed even
 *in case of encoding error.
 *
 *@param a_out out parameter. The output utf8 converted buffer.
 *
 *@param a_out_len in/out parameter. The size of the output buffer.
 *If the output buffer size is shorter than the actual needed size,
 *this function just convert what it can.
 *
 *@return CR_OK upon successfull completion, an error code otherwise.
 *
 */
enum CRStatus
cr_utils_ucs1_to_utf8 (const guchar * a_in,
                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
{
        gulong out_index = 0,
                in_index = 0,
                in_len = 0,
                out_len = 0;
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out_len,
                              CR_BAD_PARAM_ERROR);

        if (*a_in_len == 0) {
                *a_out_len = 0 ;
                return status;
        }
        g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;

        in_len = *a_in_len;
        out_len = *a_out_len;

        for (in_index = 0, out_index = 0;
             (in_index < in_len) && (out_index < out_len); in_index++) {
                /*
                 *FIXME: return whenever we encounter forbidden char values.
                 */

                if (a_in[in_index] <= 0x7F) {
                        a_out[out_index] = a_in[in_index];
                        out_index++;
                } else {
                        a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
                        a_out[out_index + 1] =
                                (0x80 | (a_in[in_index] & 0x3F));
                        out_index += 2;
                }
        }                       /*end for */

        *a_in_len = in_index;
        *a_out_len = out_index;

        return status;
}

/**
 *Converts an ucs1 string into an utf8 string.
 *@param a_in_start the beginning of the input string to convert.
 *@param a_in_end the end of the input string to convert.
 *@param a_out out parameter. The converted string.
 *@param a_out out parameter. The length of the converted string.
 *@return CR_OK upon successfull completion, an error code otherwise.
 *
 */
enum CRStatus
cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
                           gulong * a_in_len,
                           guchar ** a_out, gulong * a_out_len)
{
        gulong out_len = 0;
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len && a_out
                              && a_out_len, CR_BAD_PARAM_ERROR);

        if (*a_in_len < 1) {
                *a_out_len = 0;
                *a_out = NULL;
                return CR_OK;
        }

        status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
                                                &out_len);

        g_return_val_if_fail (status == CR_OK, status);

        *a_out = (guchar *) g_malloc0 (out_len);

        status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);

        *a_out_len = out_len;

        return status;
}

/**
 *Converts an utf8 buffer into an ucs1 buffer.
 *The caller must know the size of the resulting
 *converted buffer, and allocated it prior to calling this
 *function.
 *
 *@param a_in the input utf8 buffer to convert.
 *
 *@param a_in_len in/out parameter. The size of the input utf8 buffer.
 *After return, points to the number of bytes consumed
 *by the function even in case of encoding error.
 *
 *@param a_out out parameter. Points to the resulting buffer.
 *Must be allocated by the caller. If the size of a_out is shorter
 *than its required size, this function converts what it can and return
 *a successfull status.
 *
 *@param a_out_len in/out parameter. The size of the output buffer.
 *After return, points to the number of bytes consumed even in case of
 *encoding error.
 *
 *@return CR_OK upon successfull completion, an error code otherwise.
 */
enum CRStatus
cr_utils_utf8_to_ucs1 (const guchar * a_in,
                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
{
        gulong in_index = 0,
                out_index = 0,
                in_len = 0,
                out_len = 0;
        enum CRStatus status = CR_OK;

        /*
         *to store the final decoded
         *unicode char
         */
        guint32 c = 0;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        if (*a_in_len < 1) {
                goto end;
        }

        in_len = *a_in_len;
        out_len = *a_out_len;

        for (in_index = 0, out_index = 0;
             (in_index < in_len) && (out_index < out_len);
             in_index++, out_index++) {
                gint nb_bytes_2_decode = 0;

                if (a_in[in_index] <= 0x7F) {
                        /*
                         *7 bits long char
                         *encoded over 1 byte:
                         * 0xxx xxxx
                         */
                        c = a_in[in_index];
                        nb_bytes_2_decode = 1;

                } else if ((a_in[in_index] & 0xE0) == 0xC0) {
                        /*
                         *up to 11 bits long char.
                         *encoded over 2 bytes:
                         *110x xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 0x1F;
                        nb_bytes_2_decode = 2;

                } else if ((a_in[in_index] & 0xF0) == 0xE0) {
                        /*
                         *up to 16 bit long char
                         *encoded over 3 bytes:
                         *1110 xxxx  10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 0x0F;
                        nb_bytes_2_decode = 3;

                } else if ((a_in[in_index] & 0xF8) == 0xF0) {
                        /*
                         *up to 21 bits long char
                         *encoded over 4 bytes:
                         *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 0x7;
                        nb_bytes_2_decode = 4;

                } else if ((a_in[in_index] & 0xFC) == 0xF8) {
                        /*
                         *up to 26 bits long char
                         *encoded over 5 bytes.
                         *1111 10xx  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 3;
                        nb_bytes_2_decode = 5;

                } else if ((a_in[in_index] & 0xFE) == 0xFC) {
                        /*
                         *up to 31 bits long char
                         *encoded over 6 bytes:
                         *1111 110x  10xx xxxx  10xx xxxx
                         *10xx xxxx  10xx xxxx  10xx xxxx
                         */
                        c = a_in[in_index] & 1;
                        nb_bytes_2_decode = 6;

                } else {
                        /*BAD ENCODING */
                        status = CR_ENCODING_ERROR;
                        goto end;
                }

                /*
                 *Go and decode the remaining byte(s)
                 *(if any) to get the current character.
                 */
                if (in_index + nb_bytes_2_decode - 1 >= in_len) {
                        goto end;
                }

                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
                        /*decode the next byte */
                        in_index++;

                        /*byte pattern must be: 10xx xxxx */
                        if ((a_in[in_index] & 0xC0) != 0x80) {
                                status = CR_ENCODING_ERROR;
                                goto end;
                        }

                        c = (c << 6) | (a_in[in_index] & 0x3F);
                }

                /*
                 *The decoded ucs4 char is now
                 *in c.
                 */

                if (c > 0xFF) {
                        status = CR_ENCODING_ERROR;
                        goto end;
                }

                a_out[out_index] = c;
        }

      end:
        *a_out_len = out_index;
        *a_in_len = in_index;

        return status;
}

/**
 *Converts an utf8 buffer into an
 *ucs1 buffer.
 *@param a_in_start the start of the input buffer.
 *@param a_in_end the end of the input buffer.
 *@param a_out out parameter. The resulting converted ucs4 buffer.
 *Must be freed by the caller.
 *@param a_out_len out parameter. The length of the converted buffer.
 *@return CR_OK upon successfull completion, an error code otherwise.
 *Note that out parameters are valid if and only if this function
 *returns CR_OK.
 */
enum CRStatus
cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
                           gulong * a_in_len,
                           guchar ** a_out, gulong * a_out_len)
{
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        if (*a_in_len < 1) {
                *a_out_len = 0;
                *a_out = NULL;
                return CR_OK;
        }

        status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
                                                a_out_len);

        g_return_val_if_fail (status == CR_OK, status);

        *a_out = (guchar *) g_malloc0 (*a_out_len * sizeof (guint32));

        status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
        return status;
}

/*****************************************
 *CSS basic types identification utilities
 *****************************************/

/**
 *Returns TRUE if a_char is a white space as
 *defined in the css spec in chap 4.1.1.
 *
 *white-space ::= ' '| \t|\r|\n|\f
 *
 *@param a_char the character to test.
 *return TRUE if is a white space, false otherwise.
 */
gboolean
cr_utils_is_white_space (guint32 a_char)
{
        switch (a_char) {
        case ' ':
        case '\t':
        case '\r':
        case '\n':
        case '\f':
                return TRUE;
                break;
        default:
                return FALSE;
        }
}

/**
 *Returns true if the character is a newline
 *as defined in the css spec in the chap 4.1.1.
 *
 *nl ::= \n|\r\n|\r|\f
 *
 *@param a_char the character to test.
 *@return TRUE if the character is a newline, FALSE otherwise.
 */
gboolean
cr_utils_is_newline (guint32 a_char)
{
        switch (a_char) {
        case '\n':
        case '\r':
        case '\f':
                return TRUE;
                break;
        default:
                return FALSE;
        }
}

/**
 *returns TRUE if the char is part of an hexa num char:
 *i.e hexa_char ::= [0-9A-F]
 */
gboolean
cr_utils_is_hexa_char (guint32 a_char)
{
        if ((a_char >= '0' && a_char <= '9')
            || (a_char >= 'A' && a_char <= 'F')) {
                return TRUE;
        }
        return FALSE;
}

/**
 *Returns true if the character is a nonascii
 *character (as defined in the css spec chap 4.1.1):
 *
 *nonascii ::= [^\0-\177]
 *
 *@param a_char the character to test.
 *@return TRUE if the character is a nonascii char,
 *FALSE otherwise.
 */
gboolean
cr_utils_is_nonascii (guint32 a_char)
{
        if (a_char <= 177) {
                return FALSE;
        }

        return TRUE;
}

/**
 *Dumps a character a_nb times on a file.
 *@param a_char the char to dump
 *@param a_fp the destination file pointer
 *@param a_nb the number of times a_char is to be dumped.
 */
void
cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
{
        glong i = 0;

        for (i = 0; i < a_nb; i++) {
                fprintf (a_fp, "%c", a_char);
        }
}

void
cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
{
        glong i = 0;

        g_return_if_fail (a_string);

        for (i = 0; i < a_nb; i++) {
                g_string_append_printf (a_string, "%c", a_char);
        }
}

/**
 *Duplicates a list of GString instances.
 *@return the duplicated list of GString instances or NULL if
 *something bad happened.
 *@param a_list_of_strings the list of strings to be duplicated.
 */
GList *
cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
{
        GList const *cur = NULL;
        GList *result = NULL;

        g_return_val_if_fail (a_list_of_strings, NULL);

        for (cur = a_list_of_strings; cur; cur = cur->next) {
                GString *str = NULL;

                str = g_string_new_len (((GString *) cur->data)->str,
                                        ((GString *) cur->data)->len);
                if (str)
                        result = g_list_append (result, str);
        }

        return result;
}

/**
 *Duplicate a GList where the GList::data is a CRString.
 *@param a_list_of_strings the list to duplicate
 *@return the duplicated list, or NULL if something bad
 *happened.
 */
GList *
cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
{
        GList const *cur = NULL;
        GList *result = NULL;

        g_return_val_if_fail (a_list_of_strings, NULL);

        for (cur = a_list_of_strings; cur; cur = cur->next) {
                CRString *str = NULL;

                str = cr_string_dup ((CRString const *) cur->data) ;
                if (str)
                        result = g_list_append (result, str);
        }

        return result;
}