nsNativeCharsetUtils.cpp revision 677833bc953b6cb418c701facbdcf4aa18d6c44e
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2002
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Darin Fisher <darin@netscape.com>
* Brian Stell <bstell@ix.netcom.com>
* Frank Tang <ftang@netscape.com>
* Brendan Eich <brendan@mozilla.org>
* Sergei Dolgov <sergei_d@fi.fi.tartu.ee>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "xpcom-private.h"
//-----------------------------------------------------------------------------
// XP_UNIX
//-----------------------------------------------------------------------------
#if defined(XP_UNIX)
#include <stdlib.h> // mbtowc, wctomb
#include <locale.h> // setlocale
#include "nscore.h"
#include "prlock.h"
#include "nsAString.h"
#include "nsReadableUtils.h"
//
// choose a conversion library. the C library's multibyte routines
// (mbrtowc/wcrtomb) do not work for non-BMP characters whether we use
// '-fshort-wchar' or not (see bug 206811 and
// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
// iconv for all platforms where nl_types.h and langinfo.h are present
// along with iconv.
//
#define USE_ICONV 1
#else
#define USE_STDCONV 1
#endif
// Widen ISO-8859-1 bytes into UTF-16 code units, one unit per input byte.
//
// input / inputLeft   - cursor into the source bytes and the number of bytes
//                       still to be consumed; both are advanced in place.
// output / outputLeft - cursor into the destination buffer and the number of
//                       PRUnichar slots still free; both are advanced in place.
//
// Conversion stops as soon as either the input is exhausted or the output
// buffer is full, so the caller can inspect the counters to see what is left.
static void
isolatin1_to_utf16(const char **input, PRUint32 *inputLeft, PRUnichar **output, PRUint32 *outputLeft)
{
    while (*inputLeft && *outputLeft) {
        // go through unsigned char so that bytes >= 0x80 are zero-extended
        // rather than sign-extended when plain char is signed.
        **output = (unsigned char) **input;
        (*input)++;
        (*inputLeft)--;
        (*output)++;
        (*outputLeft)--;
    }
}
// Narrow UTF-16 code units into ISO-8859-1 bytes, one byte per input unit.
//
// input / inputLeft   - cursor into the source PRUnichar data and the number
//                       of units still to be consumed; advanced in place.
// output / outputLeft - cursor into the destination bytes and the number of
//                       bytes still free; advanced in place.
//
// This is a lossy, best-effort conversion: only the low 8 bits of each code
// unit are kept, so anything above U+00FF is truncated. Conversion stops as
// soon as either the input is exhausted or the output buffer is full.
static void
utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, PRUint32 *outputLeft)
{
    while (*inputLeft && *outputLeft) {
        // keep the low byte only (truncate)
        **output = (unsigned char) **input;
        (*input)++;
        (*inputLeft)--;
        (*output)++;
        (*outputLeft)--;
    }
}
//-----------------------------------------------------------------------------
// conversion using iconv
//-----------------------------------------------------------------------------
#if defined(USE_ICONV)
#include <nl_types.h> // CODESET
#include <langinfo.h> // nl_langinfo
#include <iconv.h> // iconv_open, iconv, iconv_close
#include <errno.h>
#if defined(HAVE_ICONV_WITH_CONST_INPUT)
#define ICONV_INPUT(x) (x)
#else
#define ICONV_INPUT(x) ((char **)x)
#endif
// solaris definitely needs this, but we'll enable it by default
// just in case... but we know for sure that iconv(3) in glibc
// doesn't need this.
#if !defined(__GLIBC__)
#define ENABLE_UTF8_FALLBACK_SUPPORT
#endif
static inline size_t
const char **input,
char **output,
{
// on some platforms (e.g., linux) iconv will fail with
// E2BIG if it cannot convert _all_ of its input. it'll
// still have updated the in/out parameters, so we
// can ignore this error. the assumption is that we will
// be called again to complete the conversion.
res = 0;
}
return res;
}
static inline void
{
// NOTE: the man pages on Solaris claim that you can pass NULL
// for all parameters to reset the converter, but beware the
// evil Solaris crash if you go down this route >:-)
const char *zero_char_in_ptr = NULL;
char *zero_char_out_ptr = NULL;
size_t zero_size_in = 0,
zero_size_out = 0;
}
static inline iconv_t
{
const char **from_name;
const char **to_name;
// try all possible combinations to locate a converter.
while (*to_name) {
if (**to_name) {
while (*from_name) {
if (**from_name) {
if (res != INVALID_ICONV_T)
return res;
}
from_name++;
}
}
to_name++;
}
return INVALID_ICONV_T;
}
/*
 * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
 * have to use UTF-16 with iconv(3) on platforms where it's supported.
 * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
 * and implementations of iconv(3). On Tru64, it also depends on an
 * environment variable. To avoid the trouble arising from byte-swapping,
 * the explicit-endian names (UTF-16LE/BE, UCS-2LE/BE) are listed before
 * falling back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and
 * UCS-2 without an explicit byte order use the native endianness,
 * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
 * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
 * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
 * can be done other than adding a note in the release notes. (bug 206811)
 *
 * Names are tried in the order listed. The list is terminated by a NULL
 * entry because the converter lookup walks it until it reaches a null
 * pointer.
 */
static const char *UTF_16_NAMES[] = {
#if defined(IS_LITTLE_ENDIAN)
    "UTF-16LE",
#if defined(__GLIBC__)
    "UNICODELITTLE",
#endif
    "UCS-2LE",
#else
    "UTF-16BE",
#if defined(__GLIBC__)
    "UNICODEBIG",
#endif
    "UCS-2BE",
#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
    "UCS_2",
    "ucs-2",
    "ucs2",
    "ucs_2",
    NULL
};
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// Candidate iconv names for UTF-8, tried in the order listed. The list is
// terminated by a NULL entry because the converter lookup walks it until it
// reaches a null pointer.
static const char *UTF_8_NAMES[] = {
    "UTF-8",
    "UTF8",
    "UTF_8",
    "utf-8",
    "utf8",
    "utf_8",
    NULL
};
#endif
// Candidate iconv names for ISO-8859-1, tried in the order listed. Only the
// canonical spelling is listed when building against glibc; the other
// spellings are compiled in for every other C library. The list is terminated
// by a NULL entry because the converter lookup walks it until it reaches a
// null pointer.
static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
#if !defined(__GLIBC__)
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
    "ISO8859_1",
    "iso-8859-1",
    "iso8859-1",
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
#endif
    NULL
};
// iconv-backed converter state (this definition sits in the USE_ICONV part of
// the file). Every member is static, so the descriptors below are shared
// rather than per-instance.
//
// NOTE(review): code elsewhere in this file refers to gLock, Lock() and
// Unlock(), none of which are declared in this view of the class -- confirm
// the full declaration before relying on this listing.
class nsNativeCharsetConverter
{
public:
// global setup / teardown entry points
static void GlobalInit();
static void GlobalShutdown();
private:
// iconv descriptors for direct native <-> unicode conversion. users in this
// file compare them against INVALID_ICONV_T before touching them.
static iconv_t gNativeToUnicode;
static iconv_t gUnicodeToNative;
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// descriptors for the two-step fallback through UTF-8; the conversion code
// only takes that path when both descriptors of the needed pair are valid.
static iconv_t gNativeToUTF8;
static iconv_t gUTF8ToNative;
static iconv_t gUnicodeToUTF8;
static iconv_t gUTF8ToUnicode;
#endif
// tested before LazyInit() is called, so that setup is skipped when already set
static PRBool gInitialized;
static void LazyInit();
};
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
#endif
void
{
const char **native_charset_list = blank_list;
if (native_charset == nsnull) {
NS_ERROR("native charset is unknown");
// fallback to ISO-8859-1
}
else
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
if (gNativeToUnicode == INVALID_ICONV_T) {
}
if (gUnicodeToNative == INVALID_ICONV_T) {
}
#else
#endif
/*
* On Solaris 8 (and newer?), the iconv modules converting to UCS-2
* prepend a byte order mark unicode character (BOM, u+FEFF) during
* the first use of the iconv converter. The same is the case of
* glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
* Since the explicit-endian names (e.g. 'UTF-16LE'/'UTF-16BE') are tried
* before plain 'UTF-16', we should be safe. But just in case...
*
* This dummy conversion gets rid of the BOMs and fixes bug 153562.
*/
char dummy_output[4];
if (gNativeToUnicode != INVALID_ICONV_T) {
const char *input = dummy_input;
char *output = dummy_output;
}
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
if (gUTF8ToUnicode != INVALID_ICONV_T) {
const char *input = dummy_input;
char *output = dummy_output;
}
#endif
}
void
{
gLock = PR_NewLock();
}
void
{
if (gLock) {
}
if (gNativeToUnicode != INVALID_ICONV_T) {
}
if (gUnicodeToNative != INVALID_ICONV_T) {
}
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
if (gNativeToUTF8 != INVALID_ICONV_T) {
}
if (gUTF8ToNative != INVALID_ICONV_T) {
}
if (gUnicodeToUTF8 != INVALID_ICONV_T) {
}
if (gUTF8ToUnicode != INVALID_ICONV_T) {
}
#endif
}
{
Lock();
if (!gInitialized)
LazyInit();
}
{
// reset converters for next time
if (gNativeToUnicode != INVALID_ICONV_T)
if (gUnicodeToNative != INVALID_ICONV_T)
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
if (gNativeToUTF8 != INVALID_ICONV_T)
if (gUTF8ToNative != INVALID_ICONV_T)
if (gUnicodeToUTF8 != INVALID_ICONV_T)
if (gUTF8ToUnicode != INVALID_ICONV_T)
#endif
Unlock();
}
{
if (gNativeToUnicode != INVALID_ICONV_T) {
return NS_OK;
NS_WARNING("conversion from native to utf-16 failed");
// reset converter
}
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
(gUTF8ToUnicode != INVALID_ICONV_T)) {
// convert first to UTF8, then from UTF8 to UCS2
char ubuf[1024];
// we assume we're always called with enough space in |output|,
// so convert many chars at a time...
while (inLeft) {
char *p = ubuf;
NS_ERROR("conversion from native to utf-8 failed");
break;
}
p = ubuf;
n = sizeof(ubuf) - n;
NS_ERROR("conversion from utf-8 to utf-16 failed");
break;
}
}
return NS_OK;
// reset converters
}
#endif
// fallback: zero-pad and hope for the best
// XXX This is lame and we have to do better.
return NS_OK;
}
char **output,
{
if (gUnicodeToNative != INVALID_ICONV_T) {
*outputLeft = outLeft;
return NS_OK;
}
NS_ERROR("iconv failed");
// reset converter
}
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
(gUTF8ToNative != INVALID_ICONV_T)) {
// convert one uchar at a time...
char *p = ubuf;
NS_ERROR("conversion from utf-16 to utf-8 failed");
break;
}
p = ubuf;
n = sizeof(ubuf) - n;
// not enough room for last uchar... back up and return.
res = 0;
}
else
NS_ERROR("conversion from utf-8 to native failed");
break;
}
}
*outputLeft = outLeft;
return NS_OK;
}
// reset converters
}
#endif
// fallback: truncate and hope for the best
return NS_OK;
}
#endif // USE_ICONV
//-----------------------------------------------------------------------------
// conversion using the C library's mb[r]towc/wc[r]tomb (USE_STDCONV)
//-----------------------------------------------------------------------------
#if defined(USE_STDCONV)
#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
#include <wchar.h> // mbrtowc, wcrtomb
#endif
// C-library-backed converter state (this definition sits in the USE_STDCONV
// part of the file). Every member is static.
class nsNativeCharsetConverter
{
public:
static void GlobalInit();
// nothing to release in this variant, so shutdown is an empty inline
static void GlobalShutdown() { }
private:
// whether wchar_t holds unicode values in the current locale. when it is
// false the conversion code treats the text as ISO-8859-1 instead.
static PRBool gWCharIsUnicode;
#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
// NOTE(review): nothing is declared inside this conditional in this view;
// presumably the restartable-conversion state belongs here -- confirm.
#endif
};
{
#if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
#endif
}
void
{
// verify that wchar_t for the current locale is actually unicode.
// if it is not, just fallback on zero-pad/truncation conversion.
//
// this test cannot be done at build time because the encoding of
// wchar_t may depend on the runtime locale. sad, but true!!
//
// so, if wchar_t is unicode then converting an ASCII character
// to wchar_t should not change its numeric value. we'll just
// check what happens with the ASCII 'a' character.
//
// this test is not perfect... obviously, it could yield false
// positives, but then at least ASCII text would be converted
// properly (or maybe just the 'a' character) -- oh well :(
char a = 'a';
unsigned int w = 0;
#ifndef L4ENV
#else
gWCharIsUnicode = 0;
#endif
#ifdef DEBUG
if (!gWCharIsUnicode)
NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
#endif
}
{
if (gWCharIsUnicode) {
#ifndef L4ENV
/* We don't have any wchar support built into uclibc just now */
int incr;
// cannot use wchar_t here since it may have been redefined (e.g.,
// via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
unsigned int tmp = 0;
while (*inputLeft && *outputLeft) {
#ifdef HAVE_MBRTOWC
#else
// XXX is this thread-safe?
#endif
if (incr < 0) {
NS_WARNING("mbtowc failed: possible charset mismatch");
// zero-pad and hope for the best
incr = 1;
}
(*output)++;
(*outputLeft)--;
}
#endif /* not defined L4ENV */
}
else {
// wchar_t isn't unicode, so the best we can do is treat the
// input as if it is isolatin1 :(
}
return NS_OK;
}
char **output,
{
if (gWCharIsUnicode) {
#ifndef L4ENV
/* We don't have any wchar support built into uclibc just now */
int incr;
#ifdef HAVE_WCRTOMB
#else
// XXX is this thread-safe?
#endif
if (incr < 0) {
NS_WARNING("mbtowc failed: possible charset mismatch");
incr = 1;
}
// most likely we're dead anyways if this assertion should fire
(*outputLeft) -= incr;
(*input)++;
(*inputLeft)--;
}
#endif /* not defined L4ENV */
}
else {
// wchar_t isn't unicode, so the best we can do is treat the
// input as if it is isolatin1 :(
}
return NS_OK;
}
#endif // USE_STDCONV
//-----------------------------------------------------------------------------
// API implementation
//-----------------------------------------------------------------------------
{
//
// OPTIMIZATION: preallocate space for largest possible result; convert
// directly into the result buffer to avoid intermediate buffer copy.
//
// this will generally result in a larger allocation, but that seems
// better than an extra buffer copy.
//
if (NS_SUCCEEDED(rv)) {
}
return rv;
}
{
// cannot easily avoid intermediate buffer copy.
char temp[4096];
while (bufLeft) {
char *p = temp;
}
return NS_OK;
}
void
{
//
// need to initialize the locale or else charset conversion will fail.
// better not delay this in case some other component alters the locale
// settings.
//
// XXX we assume that we are called early enough that we should
// always be the first to care about the locale's charset.
//
}
void
{
}
//-----------------------------------------------------------------------------
// XP_BEOS
//-----------------------------------------------------------------------------
#include "nsAString.h"
#include "nsReadableUtils.h"
#include "nsString.h"
{
return NS_OK;
}
{
return NS_OK;
}
void
{
}
void
{
}
//-----------------------------------------------------------------------------
// XP_WIN
//-----------------------------------------------------------------------------
#include <windows.h>
#include "nsAString.h"
{
// determine length of result
if (n > 0)
resultLen += n;
// allocate sufficient space
if (resultLen > 0) {
}
return NS_OK;
}
{
// determine length of result
if (n > 0)
resultLen += n;
// allocate sufficient space
if (resultLen > 0) {
// the default "defaultChar" is '?', which is an illegal character in
// windows file names and would make the file impossible to create.
// use '_' instead.
const char defaultChar = '_';
&defaultChar, NULL);
}
return NS_OK;
}
void
{
}
void
{
}
//-----------------------------------------------------------------------------
// XP_OS2
//-----------------------------------------------------------------------------
#define INCL_DOS
#include <os2.h>
#include <uconv.h>
#include "nsAString.h"
#include <ulserrno.h>
#include "nsNativeCharsetUtils.h"
{
// determine length of result
if (!UnicodeConverter)
if (unirc != ULS_SUCCESS) {
return NS_ERROR_FAILURE;
}
// Need to update string length to reflect how many bytes were actually
// written.
return NS_OK;
}
{
// maximum length of unicode string of length x converted to native
// codepage is x*2
if (!UnicodeConverter)
if (unirc != ULS_SUCCESS) {
return NS_ERROR_FAILURE;
}
// Need to update string length to reflect how many bytes were actually
// written.
return NS_OK;
}
void
{
if (unirc == ULS_SUCCESS) {
if (unirc == ULS_SUCCESS) {
}
}
}
void
{
}
//-----------------------------------------------------------------------------
// XP_MAC
//-----------------------------------------------------------------------------
#include <UnicodeConverter.h>
#include <TextCommon.h>
#include <Script.h>
#include <MacErrors.h>
#include "nsAString.h"
// Holder for the Mac unicode <-> text converter objects used by the file-system
// string conversion code. Every member is static.
class nsFSStringConversionMac {
public:
// teardown entry point for the cached converter objects below
static void CleanUp();
private:
static TextEncoding GetSystemEncoding();
// NOTE(review): presumably these create sEncoderInfo / sDecoderInfo on first
// use -- their definitions are not identifiable in this view; confirm.
static nsresult PrepareEncoder();
static nsresult PrepareDecoder();
// cached converter objects; code in this file tests them for null before use
static UnicodeToTextInfo sEncoderInfo;
static TextToUnicodeInfo sDecoderInfo;
};
{
char stackBuffer[512];
// for each chunk of |aIn|...
do {
sizeof(stackBuffer),
if (err == kTECUsedFallbacksStatus)
else if (err == kTECOutputBufferFullStatus) {
}
}
while (err == kTECOutputBufferFullStatus);
}
{
// for each chunk of |aIn|...
do {
sizeof(stackBuffer),
if (err == kTECUsedFallbacksStatus)
else if (err == kTECOutputBufferFullStatus) {
}
}
while (err == kTECOutputBufferFullStatus);
}
void nsFSStringConversionMac::CleanUp()
{
if (sDecoderInfo) {
}
if (sEncoderInfo) {
}
}
{
return theEncoding;
}
{
if (!sEncoderInfo) {
if (err)
}
return rv;
}
{
if (!sDecoderInfo) {
if (err)
}
return rv;
}
{
}
{
}
void
{
}
void
{
}
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
#else
#include "nsReadableUtils.h"
{
return NS_OK;
}
{
return NS_OK;
}
void
{
}
void
{
}
#endif