/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
/*
* This file is part of The Croco Library
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2.1 of the GNU Lesser General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
* Author: Dodji Seketeli
* See COPYRIGHTS file for copyright information.
*/
#include "cr-utils.h"
#include "cr-string.h"
/**
*@file:
*Some misc utility functions used
*in the libcroco.
*Note that throughout this file I will
*refer to the CSS SPECIFICATIONS DOCUMENTATION
*written by the w3c guys. You can find that document
*at http://www.w3.org/TR/REC-CSS2/ .
*/
/****************************
*Encoding transformations and
*encoding helpers
****************************/
/*
*Here is the correspondence between the ucs-4 character codes
*and their matching utf-8 encoding patterns as described by RFC 2279:
*
*UCS-4 range (hex.) UTF-8 octet sequence (binary)
*------------------ -----------------------------
*0000 0000-0000 007F 0xxxxxxx
*0000 0080-0000 07FF 110xxxxx 10xxxxxx
*0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
*/
/**
 *Given an utf8 string buffer, calculates
 *the length of this string if it was encoded
 *in ucs4, i.e. the number of characters it holds.
 *
 *Only the shape of each utf8 sequence is validated (a lead byte
 *followed by the right number of 10xx xxxx continuation bytes).
 *The decoded values themselves are not range checked here.
 *
 *@param a_in_start a pointer to the first byte of the input utf8 string.
 *@param a_in_end a pointer to the last byte of the input
 *utf8 string (inclusive bound).
 *@param a_len out parameter. Reset to 0 on entry and set to the
 *calculated length upon successful completion.
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if a
 *parameter is NULL, CR_ENCODING_ERROR if a byte does not match the
 *utf8 encoding pattern or if a multi-byte sequence runs past a_in_end.
 */
enum CRStatus
cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
                               const guchar * a_in_end, gulong * a_len)
{
        const guchar *byte_ptr = NULL;
        gulong len = 0;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);
        *a_len = 0;

        for (byte_ptr = a_in_start; byte_ptr <= a_in_end; byte_ptr++) {
                /*total size of the sequence, lead byte included */
                gulong nb_bytes_2_decode = 0;

                if (*byte_ptr <= 0x7F) {
                        /*0xxx xxxx */
                        nb_bytes_2_decode = 1;
                } else if ((*byte_ptr & 0xE0) == 0xC0) {
                        /*110x xxxx + 1 continuation byte */
                        nb_bytes_2_decode = 2;
                } else if ((*byte_ptr & 0xF0) == 0xE0) {
                        /*1110 xxxx + 2 continuation bytes */
                        nb_bytes_2_decode = 3;
                } else if ((*byte_ptr & 0xF8) == 0xF0) {
                        /*1111 0xxx + 3 continuation bytes */
                        nb_bytes_2_decode = 4;
                } else if ((*byte_ptr & 0xFC) == 0xF8) {
                        /*1111 10xx + 4 continuation bytes */
                        nb_bytes_2_decode = 5;
                } else if ((*byte_ptr & 0xFE) == 0xFC) {
                        /*1111 110x + 5 continuation bytes */
                        nb_bytes_2_decode = 6;
                } else {
                        /*BAD ENCODING */
                        return CR_ENCODING_ERROR;
                }

                /*
                 *The whole sequence must fit in the buffer, otherwise
                 *the loop below would read bytes located after a_in_end.
                 */
                if ((gulong) (a_in_end - byte_ptr) < nb_bytes_2_decode - 1) {
                        return CR_ENCODING_ERROR;
                }

                /*Check the remaining byte(s) (if any) of the character. */
                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
                        byte_ptr++;

                        /*byte pattern must be: 10xx xxxx */
                        if ((*byte_ptr & 0xC0) != 0x80) {
                                return CR_ENCODING_ERROR;
                        }
                }

                len++;
        }

        *a_len = len;
        return CR_OK;
}
/**
 *Computes the number of bytes an ucs4 string would
 *occupy once encoded in utf-8.
 *
 *Each character contributes 1 to 6 bytes depending on its value.
 *A value greater than 0x7FFFFFFF contributes nothing to the total.
 *
 *@param a_in_start a pointer to the first character of the input buffer.
 *@param a_in_end a pointer to the last character of the input buffer
 *(inclusive bound).
 *@param a_len out parameter. The computed length, in bytes.
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if a
 *parameter is NULL.
 */
enum CRStatus
cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
                               const guint32 * a_in_end, gulong * a_len)
{
        gint nb_bytes = 0;
        const guint32 *cur = NULL;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);

        cur = a_in_start;
        while (cur <= a_in_end) {
                guint32 code = *cur;
                gint width = 0;

                /*widest encoding first, narrowest last */
                if (code > 0x7FFFFFFF) {
                        width = 0;
                } else if (code > 0x3FFFFFF) {
                        width = 6;
                } else if (code > 0x1FFFFF) {
                        width = 5;
                } else if (code > 0xFFFF) {
                        width = 4;
                } else if (code > 0x7FF) {
                        width = 3;
                } else if (code > 0x7F) {
                        width = 2;
                } else {
                        width = 1;
                }

                nb_bytes += width;
                cur++;
        }

        *a_len = nb_bytes;
        return CR_OK;
}
/**
 *Computes the number of bytes an ucs1 string would
 *occupy once encoded in utf-8.
 *
 *A byte up to 0x7F counts for one utf-8 byte, any other
 *byte counts for two.
 *
 *@param a_in_start a pointer to the first byte of the input buffer.
 *@param a_in_end a pointer to the last byte of the input buffer
 *(inclusive bound).
 *@param a_len out parameter. The computed length, in bytes.
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if a
 *parameter is NULL.
 */
enum CRStatus
cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
                               const guchar * a_in_end, gulong * a_len)
{
        gint nb_bytes = 0;
        const guchar *cur = NULL;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);

        for (cur = a_in_start; cur <= a_in_end; cur++) {
                nb_bytes += (*cur > 0x7F) ? 2 : 1;
        }

        *a_len = nb_bytes;
        return CR_OK;
}
/**
 *Converts an utf8 buffer into an ucs4 buffer.
 *
 *Conversion stops at the first of: end of input, output buffer full,
 *or an error. Only completely decoded characters are counted in the
 *values reported through a_in_len and a_out_len.
 *
 *@param a_in the input utf8 buffer to convert.
 *@param a_in_len in/out parameter. The size of the
 *input buffer to convert. After return, this parameter contains
 *the number of bytes consumed by the characters that were converted.
 *@param a_out the output converted ucs4 buffer. Must be allocated by
 *the caller.
 *@param a_out_len in/out parameter. The size (in characters) of the
 *output buffer. If this size is smaller than the real needed size,
 *the function just converts what it can and returns a success status.
 *After return, this param contains the number of characters decoded.
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if a
 *parameter is NULL, CR_END_OF_INPUT_ERROR if the input ends in the
 *middle of a multi-byte sequence, CR_ENCODING_ERROR if a byte does not
 *match the utf8 encoding pattern or if the decoded value is rejected
 *(0, 0xFFFE, 0xFFFF, an UTF16 surrogate, or a value above 0x10FFFF).
 */
enum CRStatus
cr_utils_utf8_to_ucs4 (const guchar * a_in,
                       gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
{
        gulong in_len = 0,
                out_len = 0,
                in_index = 0,
                out_index = 0;
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        in_len = *a_in_len;
        out_len = *a_out_len;

        /*
         *in_index always points to the lead byte of the next character
         *and is only advanced once that character has been stored, so
         *both indexes are exact counts whenever the loop is left.
         */
        while ((in_index < in_len) && (out_index < out_len)) {
                guchar lead = a_in[in_index];
                gulong nb_bytes_2_decode = 0,
                        i = 0;
                /*to store the final decoded unicode char */
                guint32 c = 0;

                if (lead <= 0x7F) {
                        /*7 bits long char: 0xxx xxxx */
                        c = lead;
                        nb_bytes_2_decode = 1;
                } else if ((lead & 0xE0) == 0xC0) {
                        /*up to 11 bits: 110x xxxx 10xx xxxx */
                        c = lead & 0x1F;
                        nb_bytes_2_decode = 2;
                } else if ((lead & 0xF0) == 0xE0) {
                        /*up to 16 bits, encoded over 3 bytes */
                        c = lead & 0x0F;
                        nb_bytes_2_decode = 3;
                } else if ((lead & 0xF8) == 0xF0) {
                        /*up to 21 bits, encoded over 4 bytes */
                        c = lead & 0x7;
                        nb_bytes_2_decode = 4;
                } else if ((lead & 0xFC) == 0xF8) {
                        /*up to 26 bits, encoded over 5 bytes */
                        c = lead & 3;
                        nb_bytes_2_decode = 5;
                } else if ((lead & 0xFE) == 0xFC) {
                        /*up to 31 bits, encoded over 6 bytes */
                        c = lead & 1;
                        nb_bytes_2_decode = 6;
                } else {
                        /*BAD ENCODING */
                        status = CR_ENCODING_ERROR;
                        break;
                }

                /*never read continuation bytes located after the input */
                if (nb_bytes_2_decode > in_len - in_index) {
                        status = CR_END_OF_INPUT_ERROR;
                        break;
                }

                /*
                 *Go and decode the remaining byte(s)
                 *(if any) to get the current character.
                 */
                for (i = 1; i < nb_bytes_2_decode; i++) {
                        guchar next = a_in[in_index + i];

                        /*byte pattern must be: 10xx xxxx */
                        if ((next & 0xC0) != 0x80) {
                                status = CR_ENCODING_ERROR;
                                break;
                        }
                        c = (c << 6) | (next & 0x3F);
                }
                if (status != CR_OK) {
                        break;
                }

                /************************
                 *Some security tests
                 ***********************/
                if (c == 0      /*avoid characters that equal zero */
                    || c == 0xFFFF || c == 0xFFFE       /*not a char */
                    || c > 0x10FFFF     /*above the max ucs4 char value */
                    || (c >= 0xD800 && c <= 0xDFFF)) {  /*UTF16 surrogate */
                        status = CR_ENCODING_ERROR;
                        break;
                }

                a_out[out_index] = c;
                out_index++;
                in_index += nb_bytes_2_decode;
        }

        *a_out_len = out_index;
        *a_in_len = in_index;
        return status;
}
/**
 *Reads a character from an utf8 buffer.
 *Actually decodes the next character code (unicode character code)
 *and returns it.
 *
 *@param a_in the starting address of the utf8 buffer.
 *@param a_in_len the length of the utf8 buffer.
 *@param a_out output parameter. The resulting read char. Only written
 *when a character has been successfully decoded.
 *@param a_consumed output parameter. The size, in bytes, of the
 *sequence announced by the lead byte; 0 when the buffer is empty or
 *when the lead byte is invalid.
 *@return CR_OK upon successful completion (also returned, with
 *a_consumed set to 0 and a_out untouched, when a_in_len is 0),
 *CR_BAD_PARAM_ERROR if a pointer parameter is NULL,
 *CR_END_OF_INPUT_ERROR if the buffer ends before the end of the
 *sequence, CR_ENCODING_ERROR if a byte does not match the utf8
 *encoding pattern or if the decoded value is rejected
 *(0, 0xFFFE, 0xFFFF, an UTF16 surrogate, or a value above 0x10FFFF).
 */
enum CRStatus
cr_utils_read_char_from_utf8_buf (const guchar * a_in,
                                  gulong a_in_len,
                                  guint32 * a_out, gulong * a_consumed)
{
        gulong in_index = 0,
                nb_bytes_2_decode = 0;
        enum CRStatus status = CR_OK;
        /*to store the final decoded unicode char */
        guint32 c = 0;

        g_return_val_if_fail (a_in && a_out && a_consumed,
                              CR_BAD_PARAM_ERROR);

        if (a_in_len < 1) {
                status = CR_OK;
                goto end;
        }

        if (*a_in <= 0x7F) {
                /*7 bits long char: 0xxx xxxx */
                c = *a_in;
                nb_bytes_2_decode = 1;
        } else if ((*a_in & 0xE0) == 0xC0) {
                /*up to 11 bits: 110x xxxx 10xx xxxx */
                c = *a_in & 0x1F;
                nb_bytes_2_decode = 2;
        } else if ((*a_in & 0xF0) == 0xE0) {
                /*up to 16 bits, encoded over 3 bytes */
                c = *a_in & 0x0F;
                nb_bytes_2_decode = 3;
        } else if ((*a_in & 0xF8) == 0xF0) {
                /*up to 21 bits, encoded over 4 bytes */
                c = *a_in & 0x7;
                nb_bytes_2_decode = 4;
        } else if ((*a_in & 0xFC) == 0xF8) {
                /*up to 26 bits, encoded over 5 bytes */
                c = *a_in & 3;
                nb_bytes_2_decode = 5;
        } else if ((*a_in & 0xFE) == 0xFC) {
                /*up to 31 bits, encoded over 6 bytes */
                c = *a_in & 1;
                nb_bytes_2_decode = 6;
        } else {
                /*BAD ENCODING */
                status = CR_ENCODING_ERROR;
                goto end;
        }

        if (nb_bytes_2_decode > a_in_len) {
                status = CR_END_OF_INPUT_ERROR;
                goto end;
        }

        /*
         *Go and decode the remaining byte(s)
         *(if any) to get the current character.
         */
        for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
                /*byte pattern must be: 10xx xxxx */
                if ((a_in[in_index] & 0xC0) != 0x80) {
                        status = CR_ENCODING_ERROR;
                        goto end;
                }
                c = (c << 6) | (a_in[in_index] & 0x3F);
        }

        /************************
         *Some security tests.
         *A rejected value is reported as an error: returning CR_OK
         *here would leave the caller with an unwritten *a_out.
         ***********************/
        if (c == 0              /*avoid characters that equal zero */
            || c == 0xFFFF || c == 0xFFFE       /*not a char */
            || c > 0x10FFFF     /*above the max ucs4 char value */
            || (c >= 0xD800 && c <= 0xDFFF)) {  /*UTF16 surrogate */
                status = CR_ENCODING_ERROR;
                goto end;
        }

        *a_out = c;

 end:
        *a_consumed = nb_bytes_2_decode;
        return status;
}
/**
 *Given an utf8 string buffer, calculates the length
 *of this string if it was encoded in ucs1 (one byte per character).
 *
 *@param a_in_start a pointer to the first byte of the input utf8 string.
 *@param a_in_end a pointer to the last byte of the input
 *utf8 string (inclusive bound).
 *@param a_len out parameter. Reset to 0 on entry and set to the
 *calculated length upon successful completion.
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if a
 *parameter is NULL, CR_ENCODING_ERROR if a byte does not match the
 *utf8 encoding pattern, if a multi-byte sequence runs past a_in_end,
 *or if a decoded character is greater than 0xFF.
 */
enum CRStatus
cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
                               const guchar * a_in_end, gulong * a_len)
{
        /*
         *Note: this function can be made shorter
         *but it considers all the cases of the utf8 encoding
         *to ease further extensions ...
         */
        const guchar *byte_ptr = NULL;
        gulong len = 0;
        /*to store the final decoded unicode char */
        guint c = 0;

        g_return_val_if_fail (a_in_start && a_in_end && a_len,
                              CR_BAD_PARAM_ERROR);
        *a_len = 0;

        for (byte_ptr = a_in_start; byte_ptr <= a_in_end; byte_ptr++) {
                /*total size of the sequence, lead byte included */
                gulong nb_bytes_2_decode = 0;

                if (*byte_ptr <= 0x7F) {
                        /*7 bits long char: 0xxx xxxx */
                        c = *byte_ptr;
                        nb_bytes_2_decode = 1;
                } else if ((*byte_ptr & 0xE0) == 0xC0) {
                        /*up to 11 bits: 110x xxxx 10xx xxxx */
                        c = *byte_ptr & 0x1F;
                        nb_bytes_2_decode = 2;
                } else if ((*byte_ptr & 0xF0) == 0xE0) {
                        /*up to 16 bits, encoded over 3 bytes */
                        c = *byte_ptr & 0x0F;
                        nb_bytes_2_decode = 3;
                } else if ((*byte_ptr & 0xF8) == 0xF0) {
                        /*up to 21 bits, encoded over 4 bytes */
                        c = *byte_ptr & 0x7;
                        nb_bytes_2_decode = 4;
                } else if ((*byte_ptr & 0xFC) == 0xF8) {
                        /*up to 26 bits, encoded over 5 bytes */
                        c = *byte_ptr & 3;
                        nb_bytes_2_decode = 5;
                } else if ((*byte_ptr & 0xFE) == 0xFC) {
                        /*up to 31 bits, encoded over 6 bytes */
                        c = *byte_ptr & 1;
                        nb_bytes_2_decode = 6;
                } else {
                        /*BAD ENCODING */
                        return CR_ENCODING_ERROR;
                }

                /*
                 *The whole sequence must fit in the buffer, otherwise
                 *the loop below would read bytes located after a_in_end.
                 */
                if ((gulong) (a_in_end - byte_ptr) < nb_bytes_2_decode - 1) {
                        return CR_ENCODING_ERROR;
                }

                /*
                 *Go and decode the remaining byte(s)
                 *(if any) to get the current character.
                 */
                for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
                        byte_ptr++;

                        /*byte pattern must be: 10xx xxxx */
                        if ((*byte_ptr & 0xC0) != 0x80) {
                                return CR_ENCODING_ERROR;
                        }
                        c = (c << 6) | (*byte_ptr & 0x3F);
                }

                /*
                 *The decoded char is now in c.
                 *Add other conditions here to support
                 *other char sets (ucs2, ucs3, ucs4).
                 */
                if (c > 0xFF) {
                        /*the char does not fit into one ucs1 byte */
                        return CR_ENCODING_ERROR;
                }
                len++;
        }

        *a_len = len;
        return CR_OK;
}
/**
 *Converts an utf8 string into an ucs4 string.
 *
 *@param a_in the input string to convert.
 *@param a_in_len in/out parameter. The length of the input
 *string. It is passed on to cr_utils_utf8_to_ucs4(), which updates it.
 *This can be useful to debug the input stream in case
 *of encoding error.
 *@param a_out out parameter. Points to the output string. It is allocated
 *by this function as soon as the length computation succeeds, and must
 *then be freed by the caller, even if the conversion itself returns an
 *error. Set to NULL when the input is empty.
 *@param a_out_len out parameter. The length of the output string.
 *Set to 0 when the input is empty.
 *@return CR_OK upon successful completion, an error code otherwise.
 */
enum CRStatus
cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
                           gulong * a_in_len,
                           guint32 ** a_out, gulong * a_out_len)
{
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        /*
         *Nothing to convert. This also keeps the
         *"*a_in_len - 1" below from wrapping around.
         */
        if (*a_in_len < 1) {
                *a_out_len = 0;
                *a_out = NULL;
                return CR_OK;
        }

        status = cr_utils_utf8_str_len_as_ucs4 (a_in,
                                                &a_in[*a_in_len - 1],
                                                a_out_len);
        /*
         *A malformed input is a runtime condition, not a programming
         *error: test it with a plain "if" so that the check survives
         *builds where g_return_val_if_fail() is compiled out.
         */
        if (status != CR_OK) {
                return status;
        }

        *a_out = (guint32 *) g_malloc0 (*a_out_len * sizeof (guint32));

        status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
        return status;
}
/**
 *Converts an ucs4 buffer into an utf8 buffer.
 *
 *@param a_in the input ucs4 buffer to convert.
 *@param a_in_len in/out parameter. The size of the
 *input buffer to convert. After return, this parameter contains
 *the actual number of characters consumed.
 *@param a_out the output converted utf8 buffer. Must be allocated by
 *the caller.
 *@param a_out_len in/out parameter. The size of the output buffer.
 *If this size is actually smaller than the real needed size, the function
 *just converts the characters that fit entirely and returns a success
 *status. After return, this param contains the actual number of bytes
 *written to the buffer.
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if a
 *parameter is NULL, CR_ENCODING_ERROR if a character is greater
 *than 0x7FFFFFFF.
 */
enum CRStatus
cr_utils_ucs4_to_utf8 (const guint32 * a_in,
                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
{
        gulong in_len = 0,
                out_len = 0,
                in_index = 0,
                out_index = 0;
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
                              CR_BAD_PARAM_ERROR);

        in_len = *a_in_len;
        out_len = *a_out_len;

        for (in_index = 0; in_index < in_len; in_index++) {
                /*
                 *FIXME: return whenever we encounter forbidden char values.
                 */
                guint32 c = a_in[in_index];
                gulong nb_bytes = 0,
                        i = 0;
                /*high bits announcing the size, stored in the lead byte */
                guchar lead_mark = 0;

                if (c <= 0x7F) {
                        nb_bytes = 1;
                        lead_mark = 0x00;
                } else if (c <= 0x7FF) {
                        nb_bytes = 2;
                        lead_mark = 0xC0;
                } else if (c <= 0xFFFF) {
                        nb_bytes = 3;
                        lead_mark = 0xE0;
                } else if (c <= 0x1FFFFF) {
                        nb_bytes = 4;
                        lead_mark = 0xF0;
                } else if (c <= 0x3FFFFFF) {
                        nb_bytes = 5;
                        lead_mark = 0xF8;
                } else if (c <= 0x7FFFFFFF) {
                        nb_bytes = 6;
                        lead_mark = 0xFC;
                } else {
                        status = CR_ENCODING_ERROR;
                        break;
                }

                /*not enough room left: stop before writing out of bounds */
                if (nb_bytes > out_len - out_index) {
                        break;
                }

                /*
                 *Fill the continuation bytes from the last one to the
                 *first one, 6 bits at a time (10xx xxxx). What remains
                 *of c afterwards is the payload of the lead byte.
                 */
                for (i = nb_bytes - 1; i > 0; i--) {
                        a_out[out_index + i] = (guchar) (0x80 | (c & 0x3F));
                        c >>= 6;
                }
                a_out[out_index] = (guchar) (lead_mark | c);

                out_index += nb_bytes;
        }                       /*end for */

        *a_in_len = in_index;
        *a_out_len = out_index;
        return status;
}
/**
 *Converts an ucs4 string into an utf8 string.
 *
 *@param a_in the input string to convert.
 *@param a_in_len in/out parameter. The length (in characters) of the
 *input string. It is passed on to cr_utils_ucs4_to_utf8(), which
 *updates it. This can be useful to debug the input string in case
 *of encoding error.
 *@param a_out out parameter. Points to the output string. It is allocated
 *by this function and must be freed by the caller, even if the
 *conversion returns an error. One extra zeroed byte is allocated after
 *the converted bytes. Set to NULL when the input is empty.
 *@param a_out_len out parameter. The length (in bytes) of the output
 *string. Set to 0 when the input is empty.
 *@return CR_OK upon successful completion, an error code otherwise.
 */
enum CRStatus
cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
                           gulong * a_in_len,
                           guchar ** a_out, gulong * a_out_len)
{
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len && a_out
                              && a_out_len, CR_BAD_PARAM_ERROR);

        /*
         *Nothing to convert. This also keeps the
         *"*a_in_len - 1" below from wrapping around.
         */
        if (*a_in_len < 1) {
                *a_out_len = 0;
                *a_out = NULL;
                return CR_OK;
        }

        /*
         *The end of the input is derived from the input length;
         *a_out_len is a pure output here and holds no usable value yet.
         */
        status = cr_utils_ucs4_str_len_as_utf8 (a_in,
                                                &a_in[*a_in_len - 1],
                                                a_out_len);
        if (status != CR_OK) {
                return status;
        }

        /*
         *The "+ 1" guarantees a non NULL buffer even when the
         *computed length is 0 and leaves a trailing zero byte.
         */
        *a_out = (guchar *) g_malloc0 (*a_out_len + 1);

        status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
        return status;
}
/**
 *Converts an ucs1 buffer into an utf8 buffer.
 *The caller must know the size of the resulting buffer and
 *allocate it prior to calling this function.
 *
 *@param a_in the input ucs1 buffer.
 *
 *@param a_in_len in/out parameter. The length of the input buffer.
 *After return, contains the number of bytes actually consumed.
 *
 *@param a_out out parameter. The output utf8 converted buffer.
 *May be NULL only if *a_in_len is 0.
 *
 *@param a_out_len in/out parameter. The size of the output buffer.
 *If the output buffer size is shorter than the actual needed size,
 *this function just converts the characters that fit entirely.
 *After return, contains the number of bytes written.
 *
 *@return CR_OK upon successful completion, CR_BAD_PARAM_ERROR if a
 *required parameter is NULL.
 */
enum CRStatus
cr_utils_ucs1_to_utf8 (const guchar * a_in,
                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
{
        gulong out_index = 0,
                in_index = 0,
                in_len = 0,
                out_len = 0;
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out_len,
                              CR_BAD_PARAM_ERROR);

        if (*a_in_len == 0) {
                *a_out_len = 0 ;
                return status;
        }
        g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;

        in_len = *a_in_len;
        out_len = *a_out_len;

        for (in_index = 0, out_index = 0;
             (in_index < in_len) && (out_index < out_len); in_index++) {
                /*
                 *FIXME: return whenever we encounter forbidden char values.
                 */
                if (a_in[in_index] <= 0x7F) {
                        a_out[out_index] = a_in[in_index];
                        out_index++;
                } else {
                        /*
                         *This char needs two bytes: stop here if only
                         *one is left, instead of writing past the end
                         *of the output buffer.
                         */
                        if (out_len - out_index < 2) {
                                break;
                        }
                        a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
                        a_out[out_index + 1] =
                                (0x80 | (a_in[in_index] & 0x3F));
                        out_index += 2;
                }
        }                       /*end for */

        *a_in_len = in_index;
        *a_out_len = out_index;
        return status;
}
/**
 *Converts an ucs1 string into a newly allocated utf8 string.
 *
 *@param a_in the input ucs1 string to convert.
 *@param a_in_len in/out parameter. The length of the input string.
 *It is passed on to cr_utils_ucs1_to_utf8(), which updates it.
 *@param a_out out parameter. The converted string, allocated by this
 *function. Set to NULL when the input is empty.
 *@param a_out_len out parameter. The length of the converted string.
 *Set to 0 when the input is empty.
 *@return CR_OK upon successful completion, an error code otherwise.
 */
enum CRStatus
cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
                           gulong * a_in_len,
                           guchar ** a_out, gulong * a_out_len)
{
        enum CRStatus status = CR_OK;
        gulong utf8_size = 0;
        const guchar *last_byte = NULL;

        g_return_val_if_fail (a_in && a_in_len && a_out
                              && a_out_len, CR_BAD_PARAM_ERROR);

        /*empty input: hand back an empty result right away */
        if (*a_in_len == 0) {
                *a_out_len = 0;
                *a_out = NULL;
                return CR_OK;
        }

        /*first pass: how many utf8 bytes are needed? */
        last_byte = a_in + (*a_in_len - 1);
        status = cr_utils_ucs1_str_len_as_utf8 (a_in, last_byte, &utf8_size);
        g_return_val_if_fail (status == CR_OK, status);

        /*second pass: allocate and convert */
        *a_out = (guchar *) g_malloc0 (utf8_size);
        status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &utf8_size);

        *a_out_len = utf8_size;
        return status;
}
/**
 *Converts an utf8 buffer into an ucs1 buffer.
 *The caller must know the size of the resulting
 *converted buffer, and allocate it prior to calling this
 *function.
 *
 *@param a_in the input utf8 buffer to convert.
 *
 *@param a_in_len in/out parameter. The size of the input utf8 buffer.
 *After return, contains the index reached in the input buffer,
 *even in case of encoding error.
 *
 *@param a_out out parameter. Points to the resulting buffer.
 *Must be allocated by the caller. If the size of a_out is shorter
 *than its required size, this function converts what it can and returns
 *a successful status.
 *
 *@param a_out_len in/out parameter. The size of the output buffer.
 *After return, contains the number of ucs1 bytes written, even in
 *case of encoding error.
 *
 *@return CR_OK upon successful completion (including when the input
 *ends in the middle of a multi-byte sequence, in which case the
 *conversion just stops before that sequence), CR_BAD_PARAM_ERROR if a
 *parameter is NULL, CR_ENCODING_ERROR if a byte does not match the
 *utf8 encoding pattern or if a decoded character is greater than 0xFF.
 */
enum CRStatus
cr_utils_utf8_to_ucs1 (const guchar * a_in,
                       gulong * a_in_len, guchar * a_out, gulong * a_out_len)
{
        gulong src_pos = 0,
                dst_pos = 0,
                src_size = 0,
                dst_size = 0;
        enum CRStatus status = CR_OK;
        /*the unicode char being decoded */
        guint32 code = 0;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        if (*a_in_len >= 1) {
                src_size = *a_in_len;
                dst_size = *a_out_len;

                while ((src_pos < src_size) && (dst_pos < dst_size)) {
                        guchar lead = a_in[src_pos];
                        gint seq_len = 0;
                        gint remaining = 0;

                        /*
                         *The lead byte gives the size of the sequence
                         *and the most significant bits of the char.
                         */
                        if (lead <= 0x7F) {
                                /*0xxx xxxx */
                                code = lead;
                                seq_len = 1;
                        } else if ((lead & 0xE0) == 0xC0) {
                                /*110x xxxx 10xx xxxx */
                                code = lead & 0x1F;
                                seq_len = 2;
                        } else if ((lead & 0xF0) == 0xE0) {
                                /*1110 xxxx + 2 continuation bytes */
                                code = lead & 0x0F;
                                seq_len = 3;
                        } else if ((lead & 0xF8) == 0xF0) {
                                /*1111 0xxx + 3 continuation bytes */
                                code = lead & 0x7;
                                seq_len = 4;
                        } else if ((lead & 0xFC) == 0xF8) {
                                /*1111 10xx + 4 continuation bytes */
                                code = lead & 3;
                                seq_len = 5;
                        } else if ((lead & 0xFE) == 0xFC) {
                                /*1111 110x + 5 continuation bytes */
                                code = lead & 1;
                                seq_len = 6;
                        } else {
                                /*BAD ENCODING */
                                status = CR_ENCODING_ERROR;
                                break;
                        }

                        /*sequence cut by the end of the input: just stop */
                        if (src_pos + seq_len - 1 >= src_size) {
                                break;
                        }

                        /*fold in the continuation bytes (10xx xxxx) */
                        for (remaining = seq_len - 1;
                             remaining > 0; remaining--) {
                                src_pos++;
                                if ((a_in[src_pos] & 0xC0) != 0x80) {
                                        status = CR_ENCODING_ERROR;
                                        break;
                                }
                                code = (code << 6) | (a_in[src_pos] & 0x3F);
                        }
                        if (status != CR_OK) {
                                break;
                        }

                        /*the char must fit in a single ucs1 byte */
                        if (code > 0xFF) {
                                status = CR_ENCODING_ERROR;
                                break;
                        }

                        a_out[dst_pos] = code;
                        src_pos++;
                        dst_pos++;
                }
        }

        *a_out_len = dst_pos;
        *a_in_len = src_pos;
        return status;
}
/**
 *Converts an utf8 string into a newly allocated ucs1 buffer.
 *
 *@param a_in the input utf8 string to convert.
 *@param a_in_len in/out parameter. The length of the input string.
 *It is passed on to cr_utils_utf8_to_ucs1(), which updates it.
 *@param a_out out parameter. The resulting converted ucs1 buffer.
 *Must be freed by the caller when this function returns CR_OK.
 *Set to NULL when the input is empty or when the conversion fails.
 *@param a_out_len out parameter. The length of the converted buffer.
 *@return CR_OK upon successful completion, an error code otherwise.
 *Note that out parameters are valid if and only if this function
 *returns CR_OK.
 */
enum CRStatus
cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
                           gulong * a_in_len,
                           guchar ** a_out, gulong * a_out_len)
{
        enum CRStatus status = CR_OK;

        g_return_val_if_fail (a_in && a_in_len
                              && a_out && a_out_len, CR_BAD_PARAM_ERROR);

        if (*a_in_len < 1) {
                *a_out_len = 0;
                *a_out = NULL;
                return CR_OK;
        }

        /*
         *The number of characters is also the number of ucs1 bytes
         *needed, since each converted character takes one byte.
         */
        status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
                                                a_out_len);
        /*
         *A malformed input is a runtime condition: test it with a
         *plain "if" so that the check survives builds where
         *g_return_val_if_fail() is compiled out.
         */
        if (status != CR_OK) {
                return status;
        }

        /*one byte per character, the buffer holds guchar elements */
        *a_out = (guchar *) g_malloc0 (*a_out_len);

        status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
        if (status != CR_OK) {
                /*
                 *Outputs are documented as invalid on failure, so the
                 *caller cannot be expected to free the buffer.
                 */
                g_free (*a_out);
                *a_out = NULL;
        }
        return status;
}
/*****************************************
*CSS basic types identification utilities
*****************************************/
/**
 *Tells whether a character is a white space as
 *defined in the css spec in chap 4.1.1.
 *
 *white-space ::= ' '| \t|\r|\n|\f
 *
 *@param a_char the character to test.
 *@return TRUE if a_char is a white space, FALSE otherwise.
 */
gboolean
cr_utils_is_white_space (guint32 a_char)
{
        if (a_char == ' '
            || a_char == '\t'
            || a_char == '\r'
            || a_char == '\n'
            || a_char == '\f') {
                return TRUE;
        }
        return FALSE;
}
/**
 *Tells whether a character is one of the characters that
 *make up a newline as defined in the css spec in the chap 4.1.1.
 *
 *nl ::= \n|\r\n|\r|\f
 *
 *@param a_char the character to test.
 *@return TRUE if the character is '\n', '\r' or '\f', FALSE otherwise.
 */
gboolean
cr_utils_is_newline (guint32 a_char)
{
        if (a_char == '\n' || a_char == '\r' || a_char == '\f') {
                return TRUE;
        }
        return FALSE;
}
/**
 *Returns TRUE if the char is part of an hexa num char:
 *i.e hexa_char ::= [0-9a-fA-F]
 *
 *Both cases are accepted because hexadecimal digits
 *are case-insensitive in css.
 *
 *@param a_char the character to test.
 *@return TRUE if the character is an hexadecimal digit,
 *FALSE otherwise.
 */
gboolean
cr_utils_is_hexa_char (guint32 a_char)
{
        if ((a_char >= '0' && a_char <= '9')
            || (a_char >= 'A' && a_char <= 'F')
            || (a_char >= 'a' && a_char <= 'f')) {
                return TRUE;
        }
        return FALSE;
}
/**
 *Returns true if the character is a nonascii
 *character (as defined in the css spec chap 4.1.1):
 *
 *nonascii ::= [^\0-\177]
 *
 *@param a_char the character to test.
 *@return TRUE if the character is a nonascii char,
 *FALSE otherwise.
 */
gboolean
cr_utils_is_nonascii (guint32 a_char)
{
        /*
         *\177 in the grammar above is an octal escape, i.e. 0x7F
         *(127 in decimal), not the decimal value 177.
         */
        if (a_char <= 0x7F) {
                return FALSE;
        }
        return TRUE;
}
/**
 *Writes the same character several times to a file.
 *Nothing is written when a_nb is zero or negative.
 *
 *@param a_char the char to dump
 *@param a_fp the destination file pointer. It is used as is,
 *without any check.
 *@param a_nb the number of times a_char is to be dumped.
 */
void
cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
{
        glong remaining = a_nb;

        while (remaining > 0) {
                fprintf (a_fp, "%c", a_char);
                remaining--;
        }
}
/**
 *Appends the same character several times to a GString.
 *Nothing is appended when a_nb is zero or negative.
 *
 *@param a_char the char to append
 *@param a_string the destination string. Must not be NULL.
 *@param a_nb the number of times a_char is to be appended.
 */
void
cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
{
        glong remaining = a_nb;

        g_return_if_fail (a_string);

        while (remaining > 0) {
                g_string_append_printf (a_string, "%c", a_char);
                remaining--;
        }
}
/**
 *Duplicates a list of GString instances.
 *
 *Each GString is copied with g_string_new_len(); the copies are
 *stored in a new list, in the same order as the input list.
 *Elements whose data pointer is NULL are skipped.
 *
 *@param a_list_of_strings the list of strings to be duplicated.
 *@return the duplicated list of GString instances or NULL if
 *something bad happened.
 */
GList *
cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
{
        GList const *cur = NULL;
        GList *result = NULL;

        g_return_val_if_fail (a_list_of_strings, NULL);

        for (cur = a_list_of_strings; cur; cur = cur->next) {
                GString const *src = (GString const *) cur->data;
                GString *str = NULL;

                /*no string to copy in this element */
                if (!src) {
                        continue;
                }

                str = g_string_new_len (src->str, src->len);
                if (str) {
                        /*
                         *Prepend (constant time) and reverse once at
                         *the end, rather than appending, which walks
                         *the whole list each time.
                         */
                        result = g_list_prepend (result, str);
                }
        }

        return g_list_reverse (result);
}
/**
 *Duplicates a GList where the GList::data is a CRString.
 *
 *Each element is copied with cr_string_dup(); the copies are
 *stored in a new list, in the same order as the input list.
 *Elements for which cr_string_dup() returns NULL are left out.
 *
 *@param a_list_of_strings the list to duplicate
 *@return the duplicated list, or NULL if something bad
 *happened.
 */
GList *
cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
{
        GList const *cur = NULL;
        GList *result = NULL;

        g_return_val_if_fail (a_list_of_strings, NULL);

        for (cur = a_list_of_strings; cur; cur = cur->next) {
                CRString *str = NULL;

                str = cr_string_dup ((CRString const *) cur->data) ;
                if (str) {
                        /*
                         *Prepend (constant time) and reverse once at
                         *the end, rather than appending, which walks
                         *the whole list each time.
                         */
                        result = g_list_prepend (result, str);
                }
        }

        return g_list_reverse (result);
}