/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* Kernel iconv code conversion functions (PSARC/2007/173).
*
* Man pages: kiconv_open(9F), kiconv(9F), kiconv_close(9F), and kiconvstr(9F).
* Interface stability: Committed.
*/
#include <sys/sysmacros.h>
#include <sys/byteorder.h>
#include <sys/kiconv_latin1.h>
/*
* The following macros indicate ids to the correct code conversion mapping
* data tables to use. The actual tables are coming from <sys/kiconv_latin1.h>.
*/
/*
* The following tables are coming from u8_textprep.c. We use them to
* check on validity of UTF-8 characters and their bytes.
*/
extern const int8_t u8_number_of_bytes[];
extern const uint8_t u8_valid_min_2nd_byte[];
extern const uint8_t u8_valid_max_2nd_byte[];
/*
* The following four functions, open_to_1252(), open_to_88591(),
* open_to_885915(), and open_to_850(), are kiconv_open functions from
* UTF-8 to corresponding single byte codesets.
*/
static void *
{
s->id = KICONV_TBLID_1252;
s->bom_processed = 0;
return ((void *)s);
}
static void *
{
s->id = KICONV_TBLID_8859_1;
s->bom_processed = 0;
return ((void *)s);
}
static void *
{
s->id = KICONV_TBLID_8859_15;
s->bom_processed = 0;
return ((void *)s);
}
static void *
{
s->id = KICONV_TBLID_850;
s->bom_processed = 0;
return ((void *)s);
}
/*
* The following four functions, open_fr_1252(), open_fr_88591(),
* open_fr_885915(), and open_fr_850(), are kiconv_open functions from
* corresponding single byte codesets to UTF-8.
*/
static void *
{
return ((void *)KICONV_TBLID_1252);
}
static void *
{
return ((void *)KICONV_TBLID_8859_1);
}
static void *
{
return ((void *)KICONV_TBLID_8859_15);
}
static void *
{
return ((void *)KICONV_TBLID_850);
}
/*
* The following close_to_sb() function is kiconv_close function for
* the conversions from UTF-8 to single byte codesets. The close_fr_sb()
* is kiconv_close function for the conversions from single byte codesets to
* UTF-8.
*/
/*
 * kiconv_close handler for the UTF-8 to single byte direction.
 *
 * A descriptor that is either null or (void *)-1 is refused with EBADF.
 * Any other descriptor is handed to kmem_free() with the size of
 * kiconv_state_data_t, after which 0 is returned.
 */
static int
close_to_sb(void *s)
{
	int rv = EBADF;

	if (s && s != (void *)-1) {
		kmem_free(s, sizeof (kiconv_state_data_t));
		rv = 0;
	}

	return (rv);
}
static int
close_fr_sb(void *s)
{
if ((ulong_t)s > KICONV_MAX_MAPPING_TBLID)
return (EBADF);
return (0);
}
/*
* The following is the common kiconv function for conversions from UTF-8
* to single byte codesets.
*/
static size_t
{
size_t i;
size_t l;
size_t h;
/* Check on the kiconv code conversion descriptor. */
return ((size_t)-1);
}
/*
* Get the table id we are going to use for the code conversion
* and let's double check on it.
*/
if (id > KICONV_MAX_MAPPING_TBLID) {
return ((size_t)-1);
}
/* If this is a state reset request, process and return. */
return ((size_t)0);
}
ret_val = 0;
/*
* The initial high value for the binary search we will be using
* shortly is a literal constant as of today, but to be future proof,
* let's calculate it here as follows.
*/
/*
* If we haven't checked for the UTF-8 signature BOM character at
* the beginning of the conversion data stream, we check for it and, if
* we find one, we skip it since we have no use for it.
*/
ib += 3;
if (sz <= 0) {
break;
}
/*
* If there is no room to write at the output buffer,
* issue E2BIG error.
*/
break;
}
/*
* If it is a 7-bit ASCII character, we don't need to
* process further and we just copy the character over.
*
* If not, we collect the character bytes up to four bytes,
* validate the bytes, and binary search for the corresponding
* single byte codeset character byte. If we find it from
* the mapping table, we put that into the output buffer;
* otherwise, we put a replacement character instead as
* a non-identical conversion.
*/
if (sz == 1) {
continue;
}
/*
* Issue EINVAL error if input buffer has an incomplete
* character at the end of the buffer.
*/
break;
}
/*
* We collect UTF-8 character bytes and also check if
* this is a valid UTF-8 character without any bogus bytes
* based on the latest UTF-8 binary representation.
*/
for (i = 1; i < sz; i++) {
if (second) {
goto TO_SB_ILLEGAL_CHAR_ERR;
}
goto TO_SB_ILLEGAL_CHAR_ERR;
}
ib++;
}
i = l = 0;
h = init_h;
while (l <= h) {
i = (l + h) / 2;
break;
l = i + 1;
else
h = i - 1;
}
} else {
/*
* If we don't find a character in the target
* codeset, we insert an ASCII replacement character
* at the output buffer and indicate such
* "non-identical" conversion by increasing the
* return value which is the non-identical conversion
* counter if bigger than 0.
*/
ret_val++;
}
}
return (ret_val);
}
/*
* The following is the common kiconv function from single byte codesets to
* UTF-8.
*/
static size_t
{
size_t i;
size_t k;
/* Check on the kiconv code conversion descriptor validity. */
return ((size_t)-1);
}
/*
* If this is a state reset request, there is nothing to do and so
* we just return.
*/
return ((size_t)0);
ret_val = 0;
/*
* If this is a 7-bit ASCII character, we just copy over and
* that's all we need to do for this character.
*/
if (*ib < 0x80) {
break;
}
continue;
}
/*
* Otherwise, we get the corresponding UTF-8 character bytes
* from the mapping table and copy them over.
*
* We don't need to worry about if the UTF-8 character bytes
* at the mapping tables are valid or not since they are good.
*/
k = *ib - 0x80;
/*
* If sz <= 0, that means we don't have any assigned character
* at the code point, k + 0x80, of the single byte codeset
* which is the fromcode. In other words, the input buffer
* has an illegal character.
*/
if (sz <= 0) {
break;
}
break;
}
for (i = 0; i < sz; i++)
ib++;
}
return (ret_val);
}
/*
* The following is the common kiconvstr function from UTF-8 to single byte
* codesets.
*/
static size_t
{
size_t i;
size_t l;
size_t h;
/* Let's make sure that the table id is within the valid boundary. */
if (id > KICONV_MAX_MAPPING_TBLID) {
return ((size_t)-1);
}
ret_val = 0;
/* Skip any UTF-8 signature BOM character in the beginning. */
ib += 3;
/*
* Basically this is pretty much the same as kiconv_to_sb() except
* that we are now accepting two flag values and doing the processing
* accordingly.
*/
if (sz <= 0) {
if (flag & KICONV_REPLACE_INVALID) {
break;
}
ib++;
}
break;
}
break;
break;
}
if (sz == 1) {
continue;
}
if (flag & KICONV_REPLACE_INVALID) {
}
break;
}
for (i = 1; i < sz; i++) {
if (second) {
if (flag & KICONV_REPLACE_INVALID) {
}
}
if (flag & KICONV_REPLACE_INVALID) {
}
}
ib++;
}
i = l = 0;
h = init_h;
while (l <= h) {
i = (l + h) / 2;
break;
l = i + 1;
else
h = i - 1;
}
} else {
ret_val++;
}
}
return (ret_val);
}
/*
* The following four functions are entry points recorded in the conv_list[]
* defined below.
*/
static size_t
{
}
static size_t
{
}
static size_t
{
}
static size_t
{
}
/*
* The following is the common kiconvstr function for conversions from
* single byte codesets to UTF-8.
*/
static size_t
{
size_t i;
size_t k;
ret_val = 0;
break;
if (*ib < 0x80) {
break;
}
continue;
}
k = *ib - 0x80;
if (sz <= 0) {
if (flag & KICONV_REPLACE_INVALID) {
break;
}
/* Save KICONV_UTF8_REPLACEMENT_CHAR. */
*ob++ = 0xef;
*ob++ = 0xbf;
*ob++ = 0xbd;
ret_val++;
ib++;
continue;
}
break;
}
break;
}
for (i = 0; i < sz; i++)
ib++;
}
return (ret_val);
}
/*
* The following four functions are also entry points recorded in
* the conv_list[] below.
*/
static size_t
{
}
static size_t
{
}
static size_t
{
}
static size_t
{
}
/*
* The following static vector contains the normalized code names
* and their corresponding code ids. They are somewhat arbitrarily ordered
* based on marketing data available. A code id could repeat for aliases.
*
* The vector was generated by using a small utility program called
* codeidlistgen.c that you can find from PSARC/2007/173/materials/util/.
*
* The code ids must be portable, i.e., if needed, you can always generate
* the code_list[] again with different code ids. You'll also need to
* update the conv_list[] at below.
*/
{ "utf8", 0 },
{ "cp1252", 1 },
{ "1252", 1 },
{ "iso88591", 2 },
{ "iso885915", 3 },
{ "cp850", 4 },
{ "850", 4 },
{ "eucjp", 5 },
{ "eucjpms", 6 },
{ "cp932", 7 },
{ "932", 7 },
{ "shiftjis", 8 },
{ "pck", 8 },
{ "sjis", 8 },
{ "gb18030", 9 },
{ "gbk", 10 },
{ "cp936", 10 },
{ "936", 10 },
{ "euccn", 11 },
{ "euckr", 12 },
{ "unifiedhangul", 13 },
{ "cp949", 13 },
{ "949", 13 },
{ "big5", 14 },
{ "cp950", 14 },
{ "950", 14 },
{ "big5hkscs", 15 },
{ "euctw", 16 },
{ "cp950hkscs", 17 },
{ "cp1250", 18 },
{ "1250", 18 },
{ "iso88592", 19 },
{ "cp852", 20 },
{ "852", 20 },
{ "cp1251", 21 },
{ "1251", 21 },
{ "iso88595", 22 },
{ "koi8r", 23 },
{ "cp866", 24 },
{ "866", 24 },
{ "cp1253", 25 },
{ "1253", 25 },
{ "iso88597", 26 },
{ "cp737", 27 },
{ "737", 27 },
{ "cp1254", 28 },
{ "1254", 28 },
{ "iso88599", 29 },
{ "cp857", 30 },
{ "857", 30 },
{ "cp1256", 31 },
{ "1256", 31 },
{ "iso88596", 32 },
{ "cp720", 33 },
{ "720", 33 },
{ "cp1255", 34 },
{ "1255", 34 },
{ "iso88598", 35 },
{ "cp862", 36 },
{ "862", 36 },
{ "cp1257", 37 },
{ "1257", 37 },
{ "iso885913", 38 },
{ "iso885910", 39 },
{ "iso885911", 40 },
{ "tis620", 40 },
{ "iso88593", 41 },
{ "iso88594", 42 },
};
/*
* The list of code conversions supported are grouped together per
* module which will be loaded as needed.
*/
/* Embedded code conversions: */
{
1, 0, KICONV_EMBEDDED,
},
{
0, 1, KICONV_EMBEDDED,
},
{
2, 0, KICONV_EMBEDDED,
},
{
0, 2, KICONV_EMBEDDED,
},
{
3, 0, KICONV_EMBEDDED,
},
{
0, 3, KICONV_EMBEDDED,
},
{
4, 0, KICONV_EMBEDDED,
},
{
0, 4, KICONV_EMBEDDED,
},
/* kiconv_ja module conversions: */
/* kiconv_sc module conversions: */
/* kiconv_ko module conversions: */
/* kiconv_tc module conversions: */
/* kiconv_emea module conversions: */
};
/* The list of implemented and supported modules. */
"kiconv_embedded", 0,
"kiconv_ja", 0,
"kiconv_sc", 0,
"kiconv_ko", 0,
"kiconv_tc", 0,
"kiconv_emea", 0,
};
/*
* We use conv_list_lock to restrict data access of both conv_list[] and
* module_list[] as they are tightly coupled critical sections that need to be
* dealt together as a unit.
*/
void
{
}
/*
* The following is used to check on whether a kiconv module is being
* used or not at the _fini() of the module.
*/
{
int count;
return (0);
return (count);
}
/*
* This function "normalizes" a given code name, n, by not including skippable
* characters and folding uppercase letters to corresponding lowercase letters.
* We only fold 7-bit ASCII uppercase characters since the names should be in
* Portable Character Set of 7-bit ASCII.
*
* By doing this, we will be able to maximize the code name matches.
*/
static size_t
normalize_codename(const char *n)
{
/*
 * Scratch buffer for the normalized name; the extra byte holds the
 * terminating '\0' written after the copy loop.
 */
char s[KICONV_MAX_CODENAME_LEN + 1];
size_t i;
/* A NULL name cannot be normalized; report failure as (size_t)-1. */
if (n == NULL)
return ((size_t)-1);
/*
 * Walk the input up to its terminating '\0'; i counts the characters
 * kept in s so far.
 */
for (i = 0; *n; n++) {
/* Characters matched by KICONV_SKIPPABLE_CHAR() are not copied. */
if (KICONV_SKIPPABLE_CHAR(*n))
continue;
/* If unreasonably lengthy, we don't support such names. */
if (i >= KICONV_MAX_CODENAME_LEN)
return ((size_t)-1);
/* Fold 'A'..'Z' to 'a'..'z'; every other character is kept as is. */
s[i++] = (*n >= 'A' && *n <= 'Z') ? *n - 'A' + 'a' : *n;
}
s[i] = '\0';
/* With the normalized name, find the corresponding codeset id. */
/*
 * NOTE(review): as written here, this for statement has no statement of
 * its own before the trailing return, so that return is the loop body
 * and no comparison against s is visible. Presumably a lookup of s in
 * the code name table belongs here -- TODO confirm against the complete
 * file.
 */
for (i = 0; i < KICONV_MAX_CODEID_ENTRY; i++)
/*
 * In future time, we will also have a few more lines of code at below
 * that will deal with other user-created modules' fromcodes and
 * tocodes including aliases in a different vector. For now, we don't
 * support that but only the known names to this project at this time.
 */
return ((size_t)-1);
}
/*
* This function called from mod_install() registers supplied code
* conversions. At this point, it does not honor aliases and hence does not
* use nowait data field from the kiconv module info data structure.
*/
int
{
size_t i;
size_t j;
/* Validate the given kiconv module info. */
return (EINVAL);
/*
* Check if this is one of the known modules. At this point,
* we do not allow user-defined kiconv modules and that'd be for
* a future project.
*/
break;
if (mid > KICONV_MAX_MODULE_ID)
return (EINVAL);
/* Let's register the conversions supplied. */
/*
* This is very unlikely situation but by any chance we don't want to
* register a module that is already in.
*/
return (EAGAIN);
}
for (i = 0; i < info->kiconv_num_convs; i++) {
/*
* If we find anything wrong in this particular conversion,
* we skip this one and continue to the next one. This includes
* the case where there is a conversion already assigned
* in the conv_list[] somehow, i.e., a new one never kicks out
* an old one.
*/
continue;
for (j = 0; j < KICONV_MAX_CONVERSIONS; j++) {
}
break;
}
}
}
return (0);
}
/*
* The following function, called during mod_remove(), will try to unregister,
* i.e., clear up conversion function pointers, from the conv_list[] if it
* can. If any code conversions are being used, then the function will
* just return EBUSY indicating that the module cannot be unloaded.
*/
int
{
size_t i;
return (EINVAL);
break;
if (mid > KICONV_MAX_MODULE_ID)
return (EINVAL);
/*
* If any of the conversions are used, then this module cannot be
* unloaded.
*/
return (EBUSY);
}
/*
* Otherwise, we unregister all conversions from this module
* and be ready for the unloading. At this point, we only care about
* the conversions we know about with the module.
*/
for (i = 0; i < KICONV_MAX_CONVERSIONS; i++) {
}
}
return (0);
}
/*
* The following function checks whether the requested code conversion is
* available and, if necessary, loads the corresponding kiconv module that
* contains the conversion (and others).
*/
static kiconv_t
{
size_t i;
/* Normalize the given names and find the corresponding code ids. */
return ((kiconv_t)-1);
return ((kiconv_t)-1);
/*
* Search the conversion.
*
* If the conversion isn't supported, just return -1.
* If the conversion is supported but there is no corresponding
* module loaded, try to load it and if successful, return
* a kiconv conversion descriptor memory block.
*
* We maintain a reference counter of uint_t for each module.
*/
for (i = 0; i < KICONV_MAX_CONVERSIONS; i++)
break;
if (i >= KICONV_MAX_CONVERSIONS) {
return ((kiconv_t)-1);
}
return ((kiconv_t)-1);
/*
* Let's double check if something happened right after
*/
return ((kiconv_t)-1);
}
}
/*
* If we got the conversion, we will use the conversion function
* in the module and so let's increase the module's refcounter
* so that the module won't be kicked out. (To be more exact and
* specific, the "refcount" is thus the reference counter of
* the module functions being used.)
*/
return (kcd);
}
/*
* The following are the four "Committed" interfaces.
*/
{
return ((kiconv_t)-1);
/*
* If the conversion couldn't be opened for some reason,
* then, we unallocate the kcd and, more importantly, before
* that, we also decrease the module reference counter.
*/
return ((kiconv_t)-1);
}
return (kcd);
}
{
/* Do some minimum checking on the kiconv conversion descriptor. */
return ((size_t)-1);
}
}
int
{
int ret;
return (EBADF);
/*
* While we maintain a reference counter for each module, once loaded,
* we don't modunload from kiconv functions even if the counter
* goes back to zero.
*/
return (ret);
}
{
return ((size_t)-1);
}
return (ret);
}