a9da3307db733eb1739ba859952610bba3d894abnp#ifndef lint
a9da3307db733eb1739ba859952610bba3d894abnpstatic char *rcsid = "$Id: unormalize.c,v 1.1 2003/06/04 00:26:43 marka Exp $";
a9da3307db733eb1739ba859952610bba3d894abnp#endif
a9da3307db733eb1739ba859952610bba3d894abnp
a9da3307db733eb1739ba859952610bba3d894abnp/*
a9da3307db733eb1739ba859952610bba3d894abnp * Copyright (c) 2000,2001,2002 Japan Network Information Center.
a9da3307db733eb1739ba859952610bba3d894abnp * All rights reserved.
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * By using this file, you agree to the terms and conditions set forth bellow.
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * LICENSE TERMS AND CONDITIONS
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * The following License Terms and Conditions apply, unless a different
a9da3307db733eb1739ba859952610bba3d894abnp * license is obtained from Japan Network Information Center ("JPNIC"),
a9da3307db733eb1739ba859952610bba3d894abnp * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
a9da3307db733eb1739ba859952610bba3d894abnp * Chiyoda-ku, Tokyo 101-0047, Japan.
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * 1. Use, Modification and Redistribution (including distribution of any
a9da3307db733eb1739ba859952610bba3d894abnp * modified or derived work) in source and/or binary forms is permitted
a9da3307db733eb1739ba859952610bba3d894abnp * under this License Terms and Conditions.
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * 2. Redistribution of source code must retain the copyright notices as they
a9da3307db733eb1739ba859952610bba3d894abnp * appear in each source code file, this License Terms and Conditions.
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * 3. Redistribution in binary form must reproduce the Copyright Notice,
a9da3307db733eb1739ba859952610bba3d894abnp * this License Terms and Conditions, in the documentation and/or other
a9da3307db733eb1739ba859952610bba3d894abnp * materials provided with the distribution. For the purposes of binary
a9da3307db733eb1739ba859952610bba3d894abnp * distribution the "Copyright Notice" refers to the following language:
a9da3307db733eb1739ba859952610bba3d894abnp * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * 4. The name of JPNIC may not be used to endorse or promote products
a9da3307db733eb1739ba859952610bba3d894abnp * derived from this Software without specific prior written approval of
a9da3307db733eb1739ba859952610bba3d894abnp * JPNIC.
a9da3307db733eb1739ba859952610bba3d894abnp *
a9da3307db733eb1739ba859952610bba3d894abnp * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
a9da3307db733eb1739ba859952610bba3d894abnp * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
a9da3307db733eb1739ba859952610bba3d894abnp * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
a9da3307db733eb1739ba859952610bba3d894abnp * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
*/
#include <config.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <idn/result.h>
#include <idn/assert.h>
#include <idn/logmacro.h>
#include <idn/ucs4.h>
#include <idn/unicode.h>
#include <idn/unormalize.h>
#include <idn/debug.h>
/*
 * Fall back to bcopy() on platforms without memmove().  bcopy() takes
 * its arguments as (source, destination, length), hence the swapped
 * order in the expansion.
 */
#if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY)
#define memmove(a,b,c) bcopy((char *)(b),(char *)(a),(int)(c))
#endif
/* Element count of the arrays embedded in workbuf_t (initial capacity). */
#define WORKBUF_SIZE 128
/*
 * decompose() gives up with idn_nomemory once the working buffer has
 * been extended beyond this many elements.
 */
#define WORKBUF_SIZE_MAX 10000
/*
 * Working buffer used while normalizing.  UCS4 and CLASS are parallel
 * arrays: CLASS[i] is the canonical class of UCS4[i].  compose() stores
 * -1 in CLASS[i] to mark an entry as void until workbuf_removevoid()
 * purges it.  Both arrays initially point at the embedded *_buf storage
 * (see workbuf_init()).
 */
typedef struct {
idn__unicode_version_t version; /* Unicode version */
int cur; /* index of the character currently being processed */
int last; /* index just past the last stored character */
int size; /* capacity, in elements, of the UCS4 and CLASS arrays */
unsigned long *ucs4; /* UCS-4 characters */
int *class; /* canonical classes of the characters in UCS4 */
unsigned long ucs4_buf[WORKBUF_SIZE]; /* embedded initial storage for UCS4 */
int class_buf[WORKBUF_SIZE]; /* embedded initial storage for CLASS */
} workbuf_t;
/*
 * Forward declarations of the file-local helpers.
 */

/* Normalization driver and its processing steps. */
static idn_result_t normalize(idn__unicode_version_t version,
int do_composition, int compat,
const unsigned long *from,
unsigned long *to, size_t tolen);
static idn_result_t decompose(workbuf_t *wb, unsigned long c, int compat);
static void get_class(workbuf_t *wb);
static void reorder(workbuf_t *wb);
static void compose(workbuf_t *wb);
static idn_result_t flush_before_cur(workbuf_t *wb,
unsigned long **top, size_t *tolenp);
/* Working buffer management. */
static void workbuf_init(workbuf_t *wb);
static void workbuf_free(workbuf_t *wb);
static idn_result_t workbuf_extend(workbuf_t *wb);
static idn_result_t workbuf_append(workbuf_t *wb, unsigned long c);
static void workbuf_shift(workbuf_t *wb, int shift);
static void workbuf_removevoid(workbuf_t *wb);
/*
 * Normalize the zero-terminated UCS-4 string FROM into TO by running
 * normalize() with both compatibility decomposition and composition
 * enabled (the function name indicates Unicode Normalization Form KC).
 *
 * version  Unicode version handle passed through to normalize().
 * from     zero-terminated input string.
 * to       output buffer.
 * tolen    capacity of TO, in elements.
 *
 * Returns whatever normalize() returns.
 */
idn_result_t
idn__unormalize_formkc(idn__unicode_version_t version,
		       const unsigned long *from, unsigned long *to,
		       size_t tolen) {
	/* TOLEN is unsigned, so there is no lower bound worth asserting. */
	assert(version != NULL && from != NULL && to != NULL);

	/*
	 * The trace format uses %d, so TOLEN must be passed as an int:
	 * handing a size_t to a %d conversion is a type mismatch in a
	 * variadic call (TRACE is presumably printf-style -- the format
	 * string suggests so).
	 */
	TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n",
	       idn__debug_ucs4xstring(from, 50), (int)tolen));

	return (normalize(version, 1, 1, from, to, tolen));
}
/*
 * Normalize the zero-terminated UCS-4 string FROM and store the
 * zero-terminated result in TO, which has room for TOLEN elements.
 *
 * Each input character is decomposed into the working buffer (COMPAT is
 * passed through to decompose()), its canonical classes are looked up,
 * non-starters are moved into canonical order, and -- when
 * DO_COMPOSITION is non-zero -- the characters in front of each starter
 * are composed.  Whenever a starter is reached, everything before it is
 * finished and gets flushed to TO.
 *
 * Returns idn_success, idn_buffer_overflow when TO is too small, or an
 * error propagated from decompose().  The working buffer is released on
 * every path.
 */
static idn_result_t
normalize(idn__unicode_version_t version, int do_composition, int compat,
const unsigned long *from, unsigned long *to, size_t tolen) {
workbuf_t wb;
idn_result_t r = idn_success;
/*
 * Initialize the working buffer.
 */
workbuf_init(&wb);
wb.version = version;
while (*from != '\0') {
unsigned long c;
/* Everything from the previous character has been processed. */
assert(wb.cur == wb.last);
/*
 * Get one character from 'from'.
 */
c = *from++;
/*
 * Decompose it.  The result is appended at WB.LAST.
 */
if ((r = decompose(&wb, c, compat)) != idn_success)
goto ret;
/*
 * Get the canonical classes of the newly added characters.
 */
get_class(&wb);
/*
 * Reorder & compose.
 */
for (; wb.cur < wb.last; wb.cur++) {
if (wb.cur == 0) {
/* Nothing precedes the first character. */
continue;
} else if (wb.class[wb.cur] > 0) {
/*
 * This is not a starter.  Try reordering.
 * Note that the characters before it are
 * already in canonical order.
 */
reorder(&wb);
continue;
}
/*
 * This is a starter character, and there are
 * some characters before it.  Those characters
 * have been reordered properly and are
 * ready for composition.
 */
if (do_composition && wb.class[0] == 0)
compose(&wb);
/*
 * If CUR points to a starter character, the
 * characters before CUR are finished, because
 * any further reordering/composition of them
 * is blocked by the starter CUR points to.
 * (compose() may have changed CUR, so the
 * conditions are checked again here.)
 */
if (wb.cur > 0 && wb.class[wb.cur] == 0) {
/* Flush everything before CUR. */
r = flush_before_cur(&wb, &to, &tolen);
if (r != idn_success)
goto ret;
}
}
}
if (r == idn_success) {
if (do_composition && wb.cur > 0 && wb.class[0] == 0) {
/*
 * There are some characters left in WB.
 * They are ordered, but not composed yet.
 * Now CUR points just after the last character in WB,
 * and since compose() tries to compose the characters
 * between the top and CUR inclusive, we must move CUR
 * one character back for the duration of compose().
 */
wb.cur--;
compose(&wb);
wb.cur++;
}
/*
 * Flush whatever is left.  When WB.CUR == 0 this copies
 * nothing; the terminator is stored below.
 */
r = flush_before_cur(&wb, &to, &tolen);
if (r != idn_success)
goto ret;
}
/* TOLEN is unsigned, so this tests for no room left for the terminator. */
if (tolen <= 0) {
r = idn_buffer_overflow;
goto ret;
}
*to = '\0';
ret:
workbuf_free(&wb);
return (r);
}
/*
 * Append the decomposition of C to the end of the working buffer.
 *
 * idn__unicode_decompose() is asked to write directly into the free
 * space after WB->LAST.  If it reports that C has no decomposition, C
 * itself is appended.  If the free space is too small, the buffer is
 * extended and the call is retried, until the buffer size exceeds
 * WORKBUF_SIZE_MAX.
 *
 * Returns idn_success, idn_nomemory, or any other error reported by
 * idn__unicode_decompose().
 */
static idn_result_t
decompose(workbuf_t *wb, unsigned long c, int compat) {
	for (;;) {
		idn_result_t status;
		int nchars;

		status = idn__unicode_decompose(wb->version, compat,
						wb->ucs4 + wb->last,
						wb->size - wb->last,
						c, &nchars);
		if (status == idn_success) {
			wb->last += nchars;
			return (idn_success);
		}
		if (status == idn_notfound)
			return (workbuf_append(wb, c));
		if (status != idn_buffer_overflow)
			return (status);

		/* Not enough room: grow the buffer and try again. */
		status = workbuf_extend(wb);
		if (status != idn_success)
			return (status);
		if (wb->size > WORKBUF_SIZE_MAX) {
			WARNING(("idn__unormalize_form*: "
				 "working buffer too large\n"));
			return (idn_nomemory);
		}
	}
	/* NOTREACHED */
}
/*
 * Look up the canonical class of every character from WB->CUR up to
 * (but not including) WB->LAST and record it in WB->CLASS.
 */
static void
get_class(workbuf_t *wb) {
	int idx = wb->cur;

	while (idx < wb->last) {
		wb->class[idx] =
			idn__unicode_canonicalclass(wb->version,
						    wb->ucs4[idx]);
		idx++;
	}
}
/*
 * Move the character at WB->CUR towards the front of the buffer until
 * the character before it no longer has a larger canonical class
 * (a single insertion-sort step).  WB->CUR itself is not changed.
 */
static void
reorder(workbuf_t *wb) {
	unsigned long moving_char;
	int moving_class;
	int pos;

	assert(wb != NULL);

	pos = wb->cur;
	moving_char = wb->ucs4[pos];
	moving_class = wb->class[pos];

	/* Slide every larger-class predecessor one slot to the right. */
	for (; pos > 0 && wb->class[pos - 1] > moving_class; pos--) {
		wb->ucs4[pos] = wb->ucs4[pos - 1];
		wb->class[pos] = wb->class[pos - 1];
	}

	/* Drop the moved character into the gap that was opened. */
	wb->ucs4[pos] = moving_char;
	wb->class[pos] = moving_class;
}
/*
 * Try to compose the first character of the working buffer with each of
 * the characters at indices 1 .. WB->CUR (inclusive).
 *
 * A character is tried only if its class is 0 or larger than the class
 * of the most recent character that was not composed.  On success the
 * first character is replaced by the composed one and the consumed
 * character is marked void (class -1); void entries are purged at the
 * end, which also adjusts WB->CUR and WB->LAST.
 */
static void
compose(workbuf_t *wb) {
	idn__unicode_version_t ver;
	unsigned long *chars;
	int *classes;
	int end;
	int prev_class = 0;
	int voided = 0;
	int idx;

	assert(wb != NULL && wb->class[0] == 0);

	ver = wb->version;
	chars = wb->ucs4;
	classes = wb->class;
	end = wb->cur;

	/*
	 * Nothing can be composed unless some decomposition sequence
	 * begins with the first character.
	 */
	if (!idn__unicode_iscompositecandidate(ver, chars[0]))
		return;

	for (idx = 1; idx <= end; idx++) {
		unsigned long composed;
		int cl = classes[idx];
		int combinable = (cl == 0 || prev_class < cl);

		if (combinable &&
		    idn__unicode_compose(ver, chars[0], chars[idx],
					 &composed) == idn_success) {
			/* The first character absorbs this one. */
			chars[0] = composed;
			classes[0] = idn__unicode_canonicalclass(ver,
								 composed);
			classes[idx] = -1;	/* void this character */
			voided++;
			continue;
		}
		prev_class = cl;
	}

	/* Purge the void characters, if any. */
	if (voided > 0)
		workbuf_removevoid(wb);
}
/*
 * Copy the characters in front of WB->CUR to the output and drop them
 * from the working buffer.
 *
 * *TOP is advanced past the copied characters and *TOLENP is reduced
 * accordingly.  Returns idn_buffer_overflow, leaving everything
 * untouched, if the output has fewer than WB->CUR elements left.
 */
static idn_result_t
flush_before_cur(workbuf_t *wb, unsigned long **top, size_t *tolenp) {
	const int count = wb->cur;

	if (*tolenp < (size_t)count)
		return (idn_buffer_overflow);

	memcpy(*top, wb->ucs4, sizeof(**top) * count);
	*tolenp -= count;
	*top += count;

	workbuf_shift(wb, count);
	return (idn_success);
}
/*
 * Put WB into its initial, empty state: the character and class arrays
 * point at the storage embedded in the structure, which holds
 * WORKBUF_SIZE elements.  WB->VERSION is left for the caller to set.
 */
static void
workbuf_init(workbuf_t *wb) {
	/* Start out on the embedded storage. */
	wb->ucs4 = wb->ucs4_buf;
	wb->class = wb->class_buf;
	wb->size = WORKBUF_SIZE;

	/* No characters stored yet. */
	wb->last = 0;
	wb->cur = 0;
}
/*
 * Release the heap storage owned by WB, if any.  Nothing is freed while
 * the buffer still uses its embedded arrays.
 */
static void
workbuf_free(workbuf_t *wb) {
	if (wb->ucs4 == wb->ucs4_buf)
		return;		/* still on the embedded storage */

	free(wb->ucs4);
	free(wb->class);
}
/*
 * Triple the capacity of the working buffer, preserving its contents.
 *
 * On success WB->UCS4 and WB->CLASS point at heap storage holding the
 * previous contents and WB->SIZE reflects the new capacity.  On failure
 * idn_nomemory is returned and WB is left valid: its arrays still hold
 * at least WB->SIZE elements and workbuf_free() can release it safely.
 */
static idn_result_t
workbuf_extend(workbuf_t *wb) {
	int newsize = wb->size * 3;
	unsigned long *new_ucs4;
	int *new_class;

	if (wb->ucs4 == wb->ucs4_buf) {
		/*
		 * First extension: move from the embedded arrays to the
		 * heap.  Both allocations must succeed before WB is
		 * touched, because workbuf_free() decides whether to
		 * free by looking at WB->UCS4 alone.
		 */
		new_ucs4 = malloc(sizeof(wb->ucs4[0]) * newsize);
		new_class = malloc(sizeof(wb->class[0]) * newsize);
		if (new_ucs4 == NULL || new_class == NULL) {
			free(new_ucs4);
			free(new_class);
			return (idn_nomemory);
		}
		/* Carry the characters collected so far over. */
		memcpy(new_ucs4, wb->ucs4, sizeof(wb->ucs4[0]) * wb->size);
		memcpy(new_class, wb->class,
		       sizeof(wb->class[0]) * wb->size);
		wb->ucs4 = new_ucs4;
		wb->class = new_class;
	} else {
		/*
		 * Keep the old pointer until realloc() has succeeded;
		 * on failure the old block is still valid and still
		 * owned by WB.
		 */
		new_ucs4 = realloc(wb->ucs4, sizeof(wb->ucs4[0]) * newsize);
		if (new_ucs4 == NULL)
			return (idn_nomemory);
		wb->ucs4 = new_ucs4;

		new_class = realloc(wb->class,
				    sizeof(wb->class[0]) * newsize);
		if (new_class == NULL)
			return (idn_nomemory);
		wb->class = new_class;
	}

	/*
	 * Record the new capacity.  Without this, callers never see the
	 * extra room and decompose() would retry forever.
	 */
	wb->size = newsize;
	return (idn_success);
}
/*
 * Append the single character C at WB->LAST, extending the buffer first
 * if it is full.  Only the character is stored; its class entry is not
 * written here.  Returns the error from workbuf_extend() if the buffer
 * cannot be extended, idn_success otherwise.
 */
static idn_result_t
workbuf_append(workbuf_t *wb, unsigned long c) {
	if (wb->last >= wb->size) {
		idn_result_t status = workbuf_extend(wb);

		if (status != idn_success)
			return (status);
	}

	wb->ucs4[wb->last] = c;
	wb->last++;
	return (idn_success);
}
/*
 * Discard the first SHIFT entries of the working buffer by moving the
 * remaining entries to the front, and adjust WB->CUR and WB->LAST to
 * match.  SHIFT must not exceed WB->CUR.
 */
static void
workbuf_shift(workbuf_t *wb, int shift) {
	int remaining;

	assert(wb != NULL && wb->cur >= shift);

	/* Number of entries that survive the shift. */
	remaining = wb->last - shift;

	(void)memmove(wb->ucs4, wb->ucs4 + shift,
		      remaining * sizeof(wb->ucs4[0]));
	(void)memmove(wb->class, wb->class + shift,
		      remaining * sizeof(wb->class[0]));

	wb->last = remaining;
	wb->cur -= shift;
}
/*
 * Remove every void entry (negative class) from the working buffer,
 * packing the surviving entries towards the front in their original
 * order.  WB->CUR and WB->LAST are both reduced by the number of
 * entries removed.
 */
static void
workbuf_removevoid(workbuf_t *wb) {
	const int old_last = wb->last;
	int src;
	int dst = 0;

	for (src = 0; src < old_last; src++) {
		if (wb->class[src] < 0)
			continue;	/* void entry: skip it */

		/* DST never runs ahead of SRC; copy only after a gap. */
		if (dst != src) {
			wb->ucs4[dst] = wb->ucs4[src];
			wb->class[dst] = wb->class[src];
		}
		dst++;
	}

	wb->cur -= old_last - dst;
	wb->last = dst;
}