/* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
/*
* This file is part of The Croco Library
*
* modify it under the terms of version 2.1 of the GNU Lesser General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
* Author: Dodji Seketeli
* See COPYRIGHTS file for copyright information.
*/
#include "cr-utils.h"
#include "cr-string.h"
/**
*@file:
*Some misc utility functions used
*in the libcroco.
*Note that throughout this file I will
*refer to the CSS SPECIFICATIONS DOCUMENTATION
*written by the w3c guys. You can find that document
*at http://www.w3.org/TR/REC-CSS2/ .
*/
/****************************
*Encoding transformations and
*encoding helpers
****************************/
/*
*Here is the correspondence between the ucs-4 character codes
*and their matching utf-8 encoding pattern as described by RFC 2279:
*
*UCS-4 range (hex.) UTF-8 octet sequence (binary)
*------------------ -----------------------------
*0000 0000-0000 007F 0xxxxxxx
*0000 0080-0000 07FF 110xxxxx 10xxxxxx
*0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
*/
/**
*Given an utf8 string buffer, calculates
*the length of this string if it was encoded
*in ucs4.
*@param a_in_start a pointer to the beginning of
*the input utf8 string.
*@param a_in_end a pointer to the end of the input
*utf8 string (points to the last byte of the buffer)
*@param a_len out parameter. The calculated length.
*@return CR_OK upon successful completion, an error code
*otherwise.
*/
enum CRStatus
{
/*
*to store the final decoded
*unicode char
*/
guint c = 0;
*a_len = 0;
if (*byte_ptr <= 0x7F) {
/*
*7 bits long char
*encoded over 1 byte:
* 0xxx xxxx
*/
c = *byte_ptr;
nb_bytes_2_decode = 1;
/*
*up to 11 bits long char.
*encoded over 2 bytes:
*110x xxxx 10xx xxxx
*/
c = *byte_ptr & 0x1F;
nb_bytes_2_decode = 2;
/*
*up to 16 bit long char
*encoded over 3 bytes:
*1110 xxxx 10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 0x0F;
nb_bytes_2_decode = 3;
/*
*up to 21 bits long char
*encoded over 4 bytes:
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 0x7;
nb_bytes_2_decode = 4;
/*
*up to 26 bits long char
*encoded over 5 bytes.
*1111 10xx 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 3;
nb_bytes_2_decode = 5;
/*
*up to 31 bits long char
*encoded over 6 bytes:
*1111 110x 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 1;
nb_bytes_2_decode = 6;
} else {
/*
*BAD ENCODING
*/
return CR_ENCODING_ERROR;
}
/*
*Go and decode the remaining byte(s)
*(if any) to get the current character.
*/
/*decode the next byte */
byte_ptr++;
/*byte pattern must be: 10xx xxxx */
return CR_ENCODING_ERROR;
}
}
len++;
}
return CR_OK;
}
/**
*Given an ucs4 string, this function
*returns the size (in bytes) this string
*would have occupied if it was encoded in utf-8.
*@param a_in_start a pointer to the beginning of the input
*buffer.
*@param a_in_end a pointer to the end of the input buffer.
*@param a_len out parameter. The computed length.
*@return CR_OK upon successful completion, an error code otherwise.
*/
enum CRStatus
{
/*
*Each branch below adds to len the number of bytes the utf-8
*encoding of the current ucs4 value (*char_ptr) would take.
*/
if (*char_ptr <= 0x7F) {
/*the utf-8 char would take 1 byte */
len += 1;
} else if (*char_ptr <= 0x7FF) {
/*the utf-8 char would take 2 bytes */
len += 2;
} else if (*char_ptr <= 0xFFFF) {
/*the utf-8 char would take 3 bytes */
len += 3;
} else if (*char_ptr <= 0x1FFFFF) {
/*the utf-8 char would take 4 bytes */
len += 4;
} else if (*char_ptr <= 0x3FFFFFF) {
/*the utf-8 char would take 5 bytes */
len += 5;
} else if (*char_ptr <= 0x7FFFFFFF) {
/*the utf-8 char would take 6 bytes */
len += 6;
}
/*
*a value above 0x7FFFFFFF matches no branch
*and leaves len unchanged.
*/
}
return CR_OK;
}
/**
*Given an ucs1 string, this function
*returns the size (in bytes) this string
*would have occupied if it was encoded in utf-8.
*@param a_in_start a pointer to the beginning of the input
*buffer.
*@param a_in_end a pointer to the end of the input buffer.
*@param a_len out parameter. The computed length.
*@return CR_OK upon successful completion, an error code otherwise.
*/
enum CRStatus
{
if (*char_ptr <= 0x7F) {
/*the utf-8 char would take 1 byte */
len += 1;
} else {
/*
*the utf-8 char would take 2 bytes:
*every value above 0x7F is counted as a
*two byte sequence, no larger size is considered here.
*/
len += 2;
}
}
return CR_OK;
}
/**
*Converts an utf8 buffer into an ucs4 buffer.
*
*@param a_in the input utf8 buffer to convert.
*@param a_in_len in/out parameter. The size of the
*input buffer to convert. After return, this parameter contains
*the actual number of bytes consumed.
*@param a_out the output converted ucs4 buffer. Must be allocated by
*the caller.
*@param a_out_len in/out parameter. The size of the output buffer.
*If this size is actually smaller than the real needed size, the function
*just converts what it can and returns a success status. After return,
*this param points to the actual number of characters decoded.
*@return CR_OK upon successful completion, an error code otherwise.
*/
enum CRStatus
{
out_len = 0,
in_index = 0,
out_index = 0;
/*
*to store the final decoded
*unicode char
*/
guint c = 0;
if (*a_in_len < 1) {
goto end;
}
/*
*7 bits long char
*encoded over 1 byte:
* 0xxx xxxx
*/
nb_bytes_2_decode = 1;
/*
*up to 11 bits long char.
*encoded over 2 bytes:
*110x xxxx 10xx xxxx
*/
nb_bytes_2_decode = 2;
/*
*up to 16 bit long char
*encoded over 3 bytes:
*1110 xxxx 10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 3;
/*
*up to 21 bits long char
*encoded over 4 bytes:
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 4;
/*
*up to 26 bits long char
*encoded over 5 bytes.
*1111 10xx 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 5;
/*
*up to 31 bits long char
*encoded over 6 bytes:
*1111 110x 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 6;
} else {
/*BAD ENCODING */
goto end;
}
/*
*Go and decode the remaining byte(s)
*(if any) to get the current character.
*/
/*decode the next byte */
in_index++;
/*byte pattern must be: 10xx xxxx */
goto end;
}
}
/*
*The decoded ucs4 char is now
*in c.
*/
/************************
*Some security tests
***********************/
/*be sure c is a char */
if (c == 0xFFFF || c == 0xFFFE)
goto end;
/*be sure c is inferior to the max ucs4 char value */
if (c > 0x10FFFF)
goto end;
/*
*c must be less than UTF16 "lower surrogate begin"
*or higher than UTF16 "High surrogate end"
*/
if (c >= 0xD800 && c <= 0xDFFF)
goto end;
/*Avoid characters that equals zero */
if (c == 0)
goto end;
}
end:
return status;
}
/**
*Reads a character from an utf8 buffer.
*Actually decodes the next character code (unicode character code)
*and returns it.
*@param a_in the starting address of the utf8 buffer.
*@param a_in_len the length of the utf8 buffer.
*@param a_out output parameter. The resulting read char.
*@param a_consumed the number of the bytes consumed to
*decode the returned character code.
*@return CR_OK upon successful completion, an error code otherwise.
*/
enum CRStatus
{
nb_bytes_2_decode = 0;
/*
*to store the final decoded
*unicode char
*/
guint32 c = 0;
&& a_consumed, CR_BAD_PARAM_ERROR);
if (a_in_len < 1) {
goto end;
}
if (*a_in <= 0x7F) {
/*
*7 bits long char
*encoded over 1 byte:
* 0xxx xxxx
*/
c = *a_in;
nb_bytes_2_decode = 1;
/*
*up to 11 bits long char.
*encoded over 2 bytes:
*110x xxxx 10xx xxxx
*/
c = *a_in & 0x1F;
nb_bytes_2_decode = 2;
/*
*up to 16 bit long char
*encoded over 3 bytes:
*1110 xxxx 10xx xxxx 10xx xxxx
*/
c = *a_in & 0x0F;
nb_bytes_2_decode = 3;
/*
*up to 21 bits long char
*encoded over 4 bytes:
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
*/
c = *a_in & 0x7;
nb_bytes_2_decode = 4;
/*
*up to 26 bits long char
*encoded over 5 bytes.
*1111 10xx 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx
*/
c = *a_in & 3;
nb_bytes_2_decode = 5;
/*
*up to 31 bits long char
*encoded over 6 bytes:
*1111 110x 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx 10xx xxxx
*/
c = *a_in & 1;
nb_bytes_2_decode = 6;
} else {
/*BAD ENCODING */
goto end;
}
if (nb_bytes_2_decode > a_in_len) {
goto end;
}
/*
*Go and decode the remaining byte(s)
*(if any) to get the current character.
*/
/*byte pattern must be: 10xx xxxx */
goto end;
}
}
/*
*The decoded ucs4 char is now
*in c.
*/
/************************
*Some security tests
***********************/
/*be sure c is a char */
if (c == 0xFFFF || c == 0xFFFE)
goto end;
/*be sure c is inferior to the max ucs4 char value */
if (c > 0x10FFFF)
goto end;
/*
*c must be less than UTF16 "lower surrogate begin"
*or higher than UTF16 "High surrogate end"
*/
if (c >= 0xD800 && c <= 0xDFFF)
goto end;
/*Avoid characters that equals zero */
if (c == 0)
goto end;
*a_out = c;
end:
return status;
}
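/**
*Given an utf8 string buffer, calculates
*the length of this string if it was encoded
*in ucs1.
*@param a_len out parameter. The calculated length.
*@return CR_OK upon successful completion, CR_ENCODING_ERROR
*if the input is badly encoded or if a decoded character is
*greater than 0xFF.
*/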
enum CRStatus
{
/*
*Note: this function can be made shorter
*but it considers all the cases of the utf8 encoding
*to ease further extensions ...
*/
/*
*to store the final decoded
*unicode char
*/
guint c = 0;
*a_len = 0;
if (*byte_ptr <= 0x7F) {
/*
*7 bits long char
*encoded over 1 byte:
* 0xxx xxxx
*/
c = *byte_ptr;
nb_bytes_2_decode = 1;
/*
*up to 11 bits long char.
*encoded over 2 bytes:
*110x xxxx 10xx xxxx
*/
c = *byte_ptr & 0x1F;
nb_bytes_2_decode = 2;
/*
*up to 16 bit long char
*encoded over 3 bytes:
*1110 xxxx 10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 0x0F;
nb_bytes_2_decode = 3;
/*
*up to 21 bits long char
*encoded over 4 bytes:
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 0x7;
nb_bytes_2_decode = 4;
/*
*up to 26 bits long char
*encoded over 5 bytes.
*1111 10xx 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 3;
nb_bytes_2_decode = 5;
/*
*up to 31 bits long char
*encoded over 6 bytes:
*1111 110x 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx 10xx xxxx
*/
c = *byte_ptr & 1;
nb_bytes_2_decode = 6;
} else {
/*
*BAD ENCODING
*/
return CR_ENCODING_ERROR;
}
/*
*Go and decode the remaining byte(s)
*(if any) to get the current character.
*/
/*decode the next byte */
byte_ptr++;
/*byte pattern must be: 10xx xxxx */
return CR_ENCODING_ERROR;
}
}
/*
*The decoded ucs4 char is now
*in c.
*/
if (c <= 0xFF) { /*Add other conditions to support
*other char sets (ucs2, ucs3, ucs4).
*/
len++;
} else {
/*the char is too long to fit
*into the supposed charset len.
*/
return CR_ENCODING_ERROR;
}
}
return CR_OK;
}
/**
*Converts an utf8 string into an ucs4 string.
*@param a_in the input string to convert.
*@param a_in_len in/out parameter. The length of the input
*string. After return, points to the actual number of bytes
*consumed. This can be useful to debug the input stream in case
*of encoding error.
*@param a_out out parameter. Points to the output string. It is allocated
*by this function and must be freed by the caller.
*@param a_out_len out parameter. The length of the output string.
*@return CR_OK upon successful completion, an error code otherwise.
*
*/
enum CRStatus
{
return status;
}
/**
*Converts an ucs4 buffer into an utf8 buffer.
*
*@param a_in the input ucs4 buffer to convert.
*@param a_in_len in/out parameter. The size of the
*input buffer to convert. After return, this parameter contains
*the actual number of characters consumed.
*@param a_out the output converted utf8 buffer. Must be allocated by
*the caller.
*@param a_out_len in/out parameter. The size of the output buffer.
*If this size is actually smaller than the real needed size, the function
*just converts what it can and returns a success status. After return,
*this param points to the actual number of bytes in the buffer.
*@return CR_OK upon successful completion, an error code otherwise.
*/
enum CRStatus
{
in_index = 0,
out_index = 0;
if (*a_in_len < 1) {
goto end;
}
/*
*FIXME: return whenever we encounter forbidden char values.
*/
out_index++;
out_index += 2;
out_index += 3;
out_index += 4;
out_index += 5;
out_index += 6;
} else {
goto end;
}
} /*end for */
end:
return status;
}
/**
*Converts an ucs4 string into an utf8 string.
*@param a_in the input string to convert.
*@param a_in_len in/out parameter. The length of the input
*string. After return, points to the actual number of characters
*consumed. This can be useful to debug the input string in case
*of encoding error.
*@param a_out out parameter. Points to the output string. It is allocated
*by this function and must be freed by the caller.
*@param a_out_len out parameter. The length (in bytes) of the output string.
*@return CR_OK upon successful completion, an error code otherwise.
*/
enum CRStatus
{
&& a_out_len, CR_BAD_PARAM_ERROR);
return status;
}
/**
*Converts an ucs1 buffer into an utf8 buffer.
*The caller must know the size of the resulting buffer and
*allocate it prior to calling this function.
*
*@param a_in the input ucs1 buffer.
*
*@param a_in_len in/out parameter. The length of the input buffer.
*After return, points to the number of bytes actually consumed even
*in case of encoding error.
*
*@param a_out out parameter. The output utf8 converted buffer.
*
*@param a_out_len in/out parameter. The size of the output buffer.
*If the output buffer size is shorter than the actual needed size,
*this function just converts what it can.
*
*@return CR_OK upon successful completion, an error code otherwise.
*
*/
enum CRStatus
{
in_index = 0,
in_len = 0,
out_len = 0;
&& a_out_len,
if (*a_in_len == 0) {
*a_out_len = 0 ;
return status;
}
/*
*FIXME: return whenever we encounter forbidden char values.
*/
out_index++;
} else {
out_index += 2;
}
} /*end for */
return status;
}
/**
*Converts an ucs1 string into an utf8 string.
*@param a_in the input ucs1 string to convert.
*@param a_in_len in/out parameter. The length of the input string.
*If it is less than 1, nothing is converted, *a_out_len is set to 0
*and CR_OK is returned.
*@param a_out out parameter. The converted string.
*@param a_out_len out parameter. The length of the converted string.
*@return CR_OK upon successful completion, an error code otherwise.
*
*/
enum CRStatus
{
&& a_out_len, CR_BAD_PARAM_ERROR);
if (*a_in_len < 1) {
*a_out_len = 0;
return CR_OK;
}
&out_len);
return status;
}
/**
*Converts an utf8 buffer into an ucs1 buffer.
*The caller must know the size of the resulting
*converted buffer, and allocate it prior to calling this
*function.
*
*@param a_in the input utf8 buffer to convert.
*
*@param a_in_len in/out parameter. The size of the input buffer.
*After return, points to the number of bytes consumed
*by the function even in case of encoding error.
*
*@param a_out out parameter. Points to the resulting buffer.
*Must be allocated by the caller. If the size of a_out is shorter
*than its required size, this function converts what it can and returns
*a successful status.
*
*@param a_out_len in/out parameter. The size of the output buffer.
*After return, points to the number of bytes consumed even in case of
*encoding error.
*
*@return CR_OK upon successful completion, an error code otherwise.
*/
enum CRStatus
{
out_index = 0,
in_len = 0,
out_len = 0;
/*
*to store the final decoded
*unicode char
*/
guint32 c = 0;
if (*a_in_len < 1) {
goto end;
}
/*
*7 bits long char
*encoded over 1 byte:
* 0xxx xxxx
*/
nb_bytes_2_decode = 1;
/*
*up to 11 bits long char.
*encoded over 2 bytes:
*110x xxxx 10xx xxxx
*/
nb_bytes_2_decode = 2;
/*
*up to 16 bit long char
*encoded over 3 bytes:
*1110 xxxx 10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 3;
/*
*up to 21 bits long char
*encoded over 4 bytes:
*1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 4;
/*
*up to 26 bits long char
*encoded over 5 bytes.
*1111 10xx 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 5;
/*
*up to 31 bits long char
*encoded over 6 bytes:
*1111 110x 10xx xxxx 10xx xxxx
*10xx xxxx 10xx xxxx 10xx xxxx
*/
nb_bytes_2_decode = 6;
} else {
/*BAD ENCODING */
goto end;
}
/*
*Go and decode the remaining byte(s)
*(if any) to get the current character.
*/
goto end;
}
/*decode the next byte */
in_index++;
/*byte pattern must be: 10xx xxxx */
goto end;
}
}
/*
*The decoded ucs4 char is now
*in c.
*/
if (c > 0xFF) {
goto end;
}
}
end:
return status;
}
/**
*Converts an utf8 buffer into an
*ucs1 buffer.
*@param a_in the input utf8 buffer to convert.
*@param a_in_len in/out parameter. The length of the input buffer.
*If it is less than 1, nothing is converted, *a_out_len is set to 0
*and CR_OK is returned.
*@param a_out out parameter. The resulting converted ucs1 buffer.
*Must be freed by the caller.
*@param a_out_len out parameter. The length of the converted buffer.
*@return CR_OK upon successful completion, an error code otherwise.
*Note that out parameters are valid if and only if this function
*returns CR_OK.
*/
enum CRStatus
{
if (*a_in_len < 1) {
*a_out_len = 0;
return CR_OK;
}
return status;
}
/*****************************************
*CSS basic types identification utilities
*****************************************/
/**
*Returns TRUE if a_char is a white space as
*defined in the css spec in chap 4.1.1.
*
*white-space ::= ' '| \t|\r|\n|\f
*
*@param a_char the character to test.
*@return TRUE if a_char is a white space, FALSE otherwise.
*/
{
        /*
         *a_char is a white space when it is one of:
         *space, horizontal tab, carriage return,
         *line feed or form feed.
         */
        if (a_char == ' '
            || a_char == '\t'
            || a_char == '\r'
            || a_char == '\n'
            || a_char == '\f') {
                return TRUE;
        }

        return FALSE;
}
/**
*Returns true if the character is a newline
*as defined in the css spec in the chap 4.1.1.
*
*nl ::= \n|\r\n|\r|\f
*
*@param a_char the character to test.
*@return TRUE if the character is a newline, FALSE otherwise.
*/
{
        /*
         *a single character is a newline when it is a
         *line feed, a carriage return or a form feed.
         */
        if (a_char == '\n' || a_char == '\r' || a_char == '\f') {
                return TRUE;
        }

        return FALSE;
}
/**
*returns TRUE if the char is part of an hexa num char:
*i.e hexa_char ::= [0-9A-F]
*/
{
return TRUE;
}
return FALSE;
}
/**
*Returns true if the character is a nonascii
*character (as defined in the css spec chap 4.1.1):
*
*nonascii ::= [^\0-\177]
*
*@param a_char the character to test.
*@return TRUE if the character is a nonascii char,
*FALSE otherwise.
*/
{
        /*
         *nonascii ::= [^\0-\177]
         *
         *The upper bound \177 in the css grammar is an octal
         *escape, i.e. 0x7F (127 in decimal), not the decimal
         *value 177. Comparing against decimal 177 wrongly
         *reported the code points 128..177 as ascii.
         */
        if (a_char <= 0x7F) {
                /*a_char lies in the ascii range [0x00-0x7F] */
                return FALSE;
        }

        /*anything above 0x7F is a nonascii character */
        return TRUE;
}
/**
*Dumps a character a_nb times on a file.
*@param a_char the char to dump
*@param a_fp the destination file pointer
*@param a_nb the number of times a_char is to be dumped.
*/
void
{
glong i = 0;
for (i = 0; i < a_nb; i++) {
}
}
void
{
glong i = 0;
for (i = 0; i < a_nb; i++) {
}
}
/**
*Duplicates a list of GString instances.
*@return the duplicated list of GString instances or NULL if
*something bad happened.
*@param a_list_of_strings the list of strings to be duplicated.
*/
GList *
{
if (str)
}
return result;
}
/**
*Duplicate a GList where the GList::data is a CRString.
*@param a_list_of_strings the list to duplicate
*@return the duplicated list, or NULL if something bad
*happened.
*/
GList *
{
if (str)
}
return result;
}