libslp/clib/slp_utf8.c

2N/A/*
2N/A * CDDL HEADER START
2N/A *
2N/A * The contents of this file are subject to the terms of the
2N/A * Common Development and Distribution License, Version 1.0 only
2N/A * (the "License").  You may not use this file except in compliance
2N/A * with the License.
2N/A *
2N/A * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
2N/A * or http://www.opensolaris.org/os/licensing.
2N/A * See the License for the specific language governing permissions
2N/A * and limitations under the License.
2N/A *
2N/A * When distributing Covered Code, include this CDDL HEADER in each
2N/A * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
2N/A * If applicable, add the following below this CDDL HEADER, with the
2N/A * fields enclosed by brackets "[]" replaced with your own identifying
2N/A * information: Portions Copyright [yyyy] [name of copyright owner]
2N/A *
2N/A * CDDL HEADER END
2N/A */
2N/A/*
2N/A * Copyright (c) 1999 by Sun Microsystems, Inc.
2N/A * All rights reserved.
2N/A */
2N/A
2N/A#pragma ident   "%Z%%M% %I% %E% SMI"
2N/A
/*
 * UTF-8 encoded Unicode parsing routines. For efficiency, we convert
 * to wide chars only when absolutely needed. The following routines
 * are defined here (slp_utf_strchr and slp_strcasecmp are exported to
 * libslp; slp_fold_space is a file-local helper):
 *
 * slp_utf_strchr:  same semantics as strchr, but handles UTF-8 strings
 * slp_fold_space:  folds white space around and in between words;
 *              handles UTF-8 strings
 * slp_strcasecmp:  same semantics as strcasecmp, but also folds white
 *              space and attempts locale-specific
 *              case-insensitive comparisons.
 */
2N/A
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <widec.h>
#include <stdlib.h>
#include <syslog.h>
#include <slp-internal.h>
2N/A
/*
 * Locates the first occurrence of c in the multibyte string s, in the
 * manner of strchr. Only complete single-byte characters are compared
 * against c, so a byte that is part of a multibyte character never
 * matches.
 *
 * Assumes that we start on a char boundary, and that c is a 7-bit
 * ASCII char. Character lengths are obtained from mblen, so the
 * result depends on the current locale.
 *
 * Returns a pointer to the matching char within s, or NULL if c does
 * not occur. Unlike strchr, searching for '\0' yields NULL, because
 * the scan stops when it reaches the terminator.
 */
char *slp_utf_strchr(const char *s, char c) {
    int len;
    char *p;

    for (p = (char *)s; *p; p += len) {
        len = mblen(p, MB_CUR_MAX);
        if (len < 1) {
            /*
             * Invalid or truncated sequence (mblen returned -1).
             * Advancing by that value would walk backwards and
             * never terminate, so reset any shift state and step
             * over the offending byte instead.
             */
            (void) mblen(NULL, 0);
            len = 1;
            continue;
        }
        if (len == 1 && *p == c)
            return (p);
    }
    return (NULL);
}
2N/A
/*
 * Folds white space around and in between words:
 * " aa    bb   " becomes "aa bb".
 *
 * Leading and trailing white space is dropped and every interior run
 * of white space is replaced by a single ' '. Only single-byte
 * characters (as reported by mblen in the current locale) are tested
 * for white space; multibyte characters are copied through unchanged.
 * A byte that mblen rejects as invalid or truncated is copied as a
 * single character so that malformed input cannot stall the scan.
 *
 * Returns a newly allocated string, or NULL if it couldn't allocate
 * memory (the failure is reported through slp_err). The caller must
 * free the result when done.
 */
static char *slp_fold_space(const char *s) {
    int len;
    char *folded, *f;

    /* the folded string is never longer than the input */
    if (!(folded = malloc(strlen(s) + 1))) {
        slp_err(LOG_CRIT, 0, "slp_fold_space", "out of memory");
        return (NULL);
    }

    f = folded;
    for (;;) {
        /* step 1: skip white space */
        for (; *s; s++) {
            len = mblen(s, MB_CUR_MAX);
            if (len != 1)
                break;
            /* ctype functions need an unsigned char value */
            if (!isspace((unsigned char)*s))
                break;
        }

        if (!*s)
            break;    /* end of string */

        /* if we are in between words, keep one space */
        if (f != folded)
            *f++ = ' ';

        /* step 2: copy into folded until we hit more white space */
        while (*s) {
            int i;

            len = mblen(s, MB_CUR_MAX);
            if (len == 1 && isspace((unsigned char)*s))
                break;

            if (len < 1) {
                /*
                 * Invalid or truncated sequence: without this
                 * the copy below would move nothing and the
                 * loop would never advance. Reset any shift
                 * state and pass the byte through as-is.
                 */
                (void) mblen(NULL, 0);
                len = 1;
            }

            for (i = 0; i < len; i++)
                *f++ = *s++;
        }
        /*
         * s now rests on white space or on the terminator; step 1
         * of the next pass consumes the former and detects the
         * latter.
         */
    }

    *f = '\0';
    return (folded);
}
2N/A
/*
 * Case-insensitive string comparison in the manner of strcasecmp,
 * which additionally folds white space before comparing and falls
 * back to a wide-character comparison so that non-ASCII case can be
 * handled. The application's locale must have been set to a UTF-8
 * locale for the wide-character step to work properly.
 *
 * The comparison is attempted in three stages, stopping at the first
 * that reports equality:
 *   1. strcasecmp on the strings as given;
 *   2. strcasecmp on the white-space-folded copies;
 *   3. wscasecmp on the folded copies converted with mbstowcs.
 *
 * Returns 0 if the strings compare equal. Otherwise returns the
 * result of the last comparison that was carried out, or -1 if the
 * folded copies could not be allocated. If the wide-character
 * conversion or its allocation fails, the non-zero result of stage 2
 * is returned.
 */
int slp_strcasecmp(const char *s1, const char *s2) {
    char *folded1 = NULL;
    char *folded2 = NULL;
    wchar_t *wide1 = NULL;
    wchar_t *wide2 = NULL;
    size_t nwide1, nwide2;
    int result = -1;

    /* stage 1: cheap check on the untouched input */
    if (strcasecmp(s1, s2) == 0)
        return (0);

    /* stage 2: compare again with white space folded */
    folded1 = slp_fold_space(s1);
    folded2 = slp_fold_space(s2);
    if (folded1 == NULL || folded2 == NULL)
        goto done;

    result = strcasecmp(folded1, folded2);
    if (result == 0)
        goto done;

    /*
     * stage 3: convert to wide chars and compare. mbstowcs with a
     * NULL destination only measures the converted length; it
     * fails if the string is not valid in the current locale.
     */
    nwide1 = mbstowcs(NULL, folded1, 0);
    if (nwide1 == (size_t)-1)
        goto done;

    wide1 = malloc((nwide1 + 1) * sizeof (*wide1));
    if (wide1 == NULL) {
        slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
        goto done;
    }

    nwide2 = mbstowcs(NULL, folded2, 0);
    if (nwide2 == (size_t)-1)
        goto done;

    wide2 = malloc((nwide2 + 1) * sizeof (*wide2));
    if (wide2 == NULL) {
        slp_err(LOG_CRIT, 0, "slp_strcasecmp", "out of memory");
        goto done;
    }

    /* only compare when both conversions went through */
    if (mbstowcs(wide1, folded1, nwide1 + 1) != (size_t)-1 &&
        mbstowcs(wide2, folded2, nwide2 + 1) != (size_t)-1)
        result = wscasecmp(wide1, wide2);

done:
    /* free(NULL) is a no-op, so unset pointers need no guard */
    free(folded1);
    free(folded2);
    free(wide1);
    free(wide2);
    return (result);
}