/* Revision-control identification string; left out when `lint' is defined. */
#ifndef lint
static char *rcsid = "$Id: unicode.c,v 1.1 2003/06/04 00:26:16 marka Exp $";
#endif
/*
* Copyright (c) 2000,2001,2002 Japan Network Information Center.
* All rights reserved.
*
* By using this file, you agree to the terms and conditions set forth bellow.
*
* LICENSE TERMS AND CONDITIONS
*
* The following License Terms and Conditions apply, unless a different
* license is obtained from Japan Network Information Center ("JPNIC"),
* a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
* Chiyoda-ku, Tokyo 101-0047, Japan.
*
* 1. Use, Modification and Redistribution (including distribution of any
* modified or derived work) in source and/or binary forms is permitted
* under this License Terms and Conditions.
*
* 2. Redistribution of source code must retain the copyright notices as they
* appear in each source code file, this License Terms and Conditions.
*
* 3. Redistribution in binary form must reproduce the Copyright Notice,
* this License Terms and Conditions, in the documentation and/or other
* materials provided with the distribution. For the purposes of binary
* distribution the "Copyright Notice" refers to the following language:
* "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
*
* 4. The name of JPNIC may not be used to endorse or promote products
* derived from this Software without specific prior written approval of
* JPNIC.
*
* 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
*/
#include <config.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <idn/result.h>
#include <idn/logmacro.h>
#include <idn/assert.h>
#include <idn/unicode.h>
/* Version string used by idn__unicode_create() when the caller passes NULL. */
#define UNICODE_CURRENT "3.2.0"
/* Largest code point value accepted by the lookup functions in this file. */
#define UCS_MAX 0x10ffff
/* Flag bit set on the last element of a decomposition sequence. */
#define END_BIT 0x80000000
/*
 * Constants for algorithmic Hangul decomposition/composition.
 *
 * SBase, LBase and VBase are the first code points of the precomposed
 * syllable, leading-consonant and vowel ranges.  TBase is one below the
 * first trailing consonant: a trailing offset of 0 means "no trailing
 * consonant", so TBase itself is never emitted by the decomposition code.
 * LCount/VCount/TCount are the sizes of the respective index ranges
 * (TCount includes the "no trailing consonant" slot).
 */
#define SBase 0xac00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11a7
#define LCount 19
#define VCount 21
#define TCount 28
/* One past the last precomposed syllable; used as an exclusive bound. */
#define SLast (SBase + LCount * VCount * TCount)
/*
 * Symbol composition macro.
 *
 * compose_sym(a, b) yields the single identifier 'ab'.  The extra level
 * of indirection through compose_symX() lets arguments that are
 * themselves macros be expanded before they are pasted with '##'.
 */
#define compose_sym(a, b) compose_symX(a, b)
#define compose_symX(a, b) a ## b
/*
 * One entry of a composition list.  Each list belongs to a fixed first
 * character; an entry says that the first character followed by 'c2'
 * composes to 'comp'.  idn__unicode_compose() binary-searches a list
 * by 'c2'.
 */
struct composition {
unsigned long c2; /* 2nd character */
unsigned long comp; /* composed character */
};
/*
 * Include the data file and the lookup template with VERSION set to v320.
 * NOTE(review): unicode_template.c presumably uses VERSION to define
 * canonclass_v320(), decompose_v320() and compose_v320(), which the
 * version table in this file refers to, and one of these files presumably
 * defines DECOMP_COMPAT -- confirm against the included sources.
 */
#include "unicodedata_320.c"
#define VERSION v320
#include "unicode_template.c"
#undef VERSION
/*
 * Signatures of the per-version lookup routines kept in the version table.
 */

/* Maps a code point to its canonical class value. */
typedef int (*unicode_canonclassproc)(unsigned long code);

/*
 * Looks up the decomposition of a code point.  Stores a pointer to the
 * decomposition sequence through 'seq_out'.  A return value of 0 means
 * no decomposition exists; callers also test the DECOMP_COMPAT bit in it.
 */
typedef int (*unicode_decomposeproc)(unsigned long code,
				     const unsigned long **seq_out);

/*
 * Looks up the compositions that start with a code point.  Stores a
 * pointer to the list through 'list_out' and returns the number of
 * entries in it (0 if there are none).
 */
typedef int (*unicode_composeproc)(unsigned long first,
				   const struct composition **list_out);
/*
 * Build one table entry: the version string plus the three lookup
 * routines whose names end in 'suffix' (e.g. canonclass_v320).
 */
#define MAKE_UNICODE_HANDLE(version, suffix) \
	{ version, \
	  compose_sym(canonclass_, suffix), \
	  compose_sym(decompose_, suffix), \
	  compose_sym(compose_, suffix) }

/*
 * Table of supported Unicode versions.  Each entry bundles a version
 * string with the lookup routines for that version.  The table is
 * terminated by an entry whose 'version' is NULL; idn__unicode_create()
 * hands out pointers to these entries as version handles.
 */
static struct idn__unicode_ops {
	const char *version;	/* points at a string literal; never written */
	unicode_canonclassproc canonclass_proc;
	unicode_decomposeproc decompose_proc;
	unicode_composeproc compose_proc;
} unicode_versions[] = {
	MAKE_UNICODE_HANDLE("3.2.0", v320),
	{ NULL, NULL, NULL, NULL },	/* end-of-table sentinel */
};

#undef MAKE_UNICODE_HANDLE
/*
 * Obtain a handle for the requested Unicode version.
 *
 * version   version string to look up (e.g. "3.2.0"); NULL selects
 *           UNICODE_CURRENT.
 * versionp  receives the handle on success; must not be NULL.
 *
 * Returns idn_success if the version is in the table of supported
 * versions, idn_notfound otherwise ('*versionp' is then left untouched).
 */
idn_result_t
idn__unicode_create(const char *version,
		    idn__unicode_version_t *versionp)
{
	idn__unicode_version_t entry = unicode_versions;
	const char *wanted;

	assert(versionp != NULL);
	TRACE(("idn__unicode_create(version=%-.50s)\n",
	       version == NULL ? "<NULL>" : version));

	wanted = (version != NULL) ? version : UNICODE_CURRENT;

	/* Linear scan; the table ends at the entry with a NULL version. */
	while (entry->version != NULL) {
		if (strcmp(entry->version, wanted) == 0) {
			*versionp = entry;
			return (idn_success);
		}
		entry++;
	}

	return (idn_notfound);
}
/*
 * Release a version handle obtained from idn__unicode_create().
 *
 * Handles point into a static table, so there is nothing to free; the
 * function only checks its argument and emits a trace message.
 */
void
idn__unicode_destroy(idn__unicode_version_t version)
{
	assert(version != NULL);

	TRACE(("idn__unicode_destroy()\n"));

	/* No resources are attached to a handle. */
}
/*
 * Return the canonical class of code point 'c' according to 'version'.
 * Values above UCS_MAX are not looked up and yield 0.
 */
int
idn__unicode_canonicalclass(idn__unicode_version_t version, unsigned long c)
{
	return ((c <= UCS_MAX) ? (*version->canonclass_proc)(c) : 0);
}
/*
 * Fully (recursively) decompose the code point 'c'.
 *
 * version      version handle supplying the decomposition lookup.
 * compat       non-zero to allow compatibility decompositions; zero to
 *              accept canonical decompositions only.
 * v, vlen      output buffer and its capacity in elements.
 * c            code point to decompose.
 * decomp_lenp  receives the number of elements written on success.
 *
 * Returns idn_success when a decomposition was written, idn_notfound
 * when 'c' has no (acceptable) decomposition or is above UCS_MAX, and
 * idn_buffer_overflow when 'v' is too small.  On overflow part of the
 * buffer may already have been written.
 */
idn_result_t
idn__unicode_decompose(idn__unicode_version_t version,
		       int compat, unsigned long *v, size_t vlen,
		       unsigned long c, int *decomp_lenp)
{
	unsigned long *vorg = v;
	int seqidx;
	const unsigned long *seq;

	/* 'vlen' is a size_t, so it needs no lower-bound check. */
	assert(v != NULL && decomp_lenp != NULL);

	if (c > UCS_MAX)
		return (idn_notfound);

	/*
	 * First, check for Hangul, which is decomposed arithmetically.
	 */
	if (SBase <= c && c < SLast) {
		int idx, t_offset, v_offset, l_offset;

		idx = (int)(c - SBase);
		t_offset = idx % TCount;
		idx /= TCount;
		v_offset = idx % VCount;
		l_offset = idx / VCount;

		/* Two elements (L, V), plus T when a trailing part exists. */
		if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
			return (idn_buffer_overflow);

		*v++ = LBase + l_offset;
		*v++ = VBase + v_offset;
		if (t_offset > 0)
			*v++ = TBase + t_offset;

		*decomp_lenp = (int)(v - vorg);
		return (idn_success);
	}

	/*
	 * Look up decomposition table.  If no decomposition is defined
	 * or if it is a compatibility decomposition when canonical
	 * decomposition requested, return 'idn_notfound'.
	 */
	seqidx = (*version->decompose_proc)(c, &seq);
	if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
		return (idn_notfound);

	/*
	 * Copy the decomposed sequence.  The end of the sequence is
	 * marked with END_BIT.
	 */
	do {
		unsigned long elem;	/* current element, END_BIT removed */
		int dlen;
		idn_result_t r;

		elem = *seq & ~END_BIT;

		/* Decompose recursively. */
		r = idn__unicode_decompose(version, compat, v, vlen,
					   elem, &dlen);
		if (r == idn_success) {
			v += dlen;
			vlen -= (size_t)dlen;
		} else if (r == idn_notfound) {
			/* Not decomposable any further: copy it as is. */
			if (vlen < 1)
				return (idn_buffer_overflow);
			*v++ = elem;
			vlen--;
		} else {
			return (r);
		}
	} while ((*seq++ & END_BIT) == 0);

	*decomp_lenp = (int)(v - vorg);
	return (idn_success);
}
/*
 * Tell whether 'c' can be the first character of a composition.
 *
 * Returns 1 if 'c' is a Hangul leading consonant, a precomposed Hangul
 * syllable, or has at least one entry in the composition table of
 * 'version'; returns 0 otherwise (including when 'c' exceeds UCS_MAX).
 */
int
idn__unicode_iscompositecandidate(idn__unicode_version_t version,
				  unsigned long c)
{
	const struct composition *unused;
	int hangul_leading, hangul_syllable;

	if (c > UCS_MAX)
		return (0);

	/* Hangul is composed arithmetically, without a table lookup. */
	hangul_leading = (LBase <= c && c < LBase + LCount);
	hangul_syllable = (SBase <= c && c < SLast);
	if (hangul_leading || hangul_syllable)
		return (1);

	/*
	 * Otherwise the character is a candidate exactly when the
	 * composition table has entries that begin with it.
	 */
	return ((*version->compose_proc)(c, &unused) != 0);
}
/*
 * Compose the character pair <c1, c2> into a single character.
 *
 * version  version handle supplying the composition lookup.
 * c1, c2   first and second character of the pair.
 * compp    receives the composed character on success; must not be NULL.
 *
 * Returns idn_success when the pair composes, idn_notfound otherwise
 * (including when either character exceeds UCS_MAX).
 */
idn_result_t
idn__unicode_compose(idn__unicode_version_t version, unsigned long c1,
		     unsigned long c2, unsigned long *compp)
{
	int n;
	int lo, hi;
	const struct composition *cseq;

	assert(compp != NULL);

	if (c1 > UCS_MAX || c2 > UCS_MAX)
		return (idn_notfound);

	/*
	 * Check for Hangul.
	 */
	if (LBase <= c1 && c1 < LBase + LCount &&
	    VBase <= c2 && c2 < VBase + VCount) {
		/*
		 * Hangul L and V.
		 */
		*compp = SBase +
			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
		return (idn_success);
	} else if (SBase <= c1 && c1 < SLast &&
		   TBase < c2 && c2 < TBase + TCount &&
		   (c1 - SBase) % TCount == 0) {
		/*
		 * Hangul LV and T.
		 *
		 * The lower bound on c2 is exclusive: TBase itself stands
		 * for "no trailing consonant" and is not a T character.
		 * Accepting it would "compose" <LV, TBase> into LV and
		 * silently drop the second character.
		 */
		*compp = c1 + (c2 - TBase);
		return (idn_success);
	}

	/*
	 * Look up composition table.  The lookup returns the number of
	 * compositions that begin with 'c1' (0 if there are none) and
	 * points 'cseq' at the first of them.
	 */
	if ((n = (*version->compose_proc)(c1, &cseq)) == 0)
		return (idn_notfound);

	/*
	 * The composite sequences are sorted by the 2nd character 'c2'.
	 * So we can use binary search.
	 */
	lo = 0;
	hi = n - 1;
	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (cseq[mid].c2 < c2) {
			lo = mid + 1;
		} else if (cseq[mid].c2 > c2) {
			hi = mid - 1;
		} else {
			*compp = cseq[mid].comp;
			return (idn_success);
		}
	}
	return (idn_notfound);
}