ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/*
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * util/support/utf8.c
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Copyright 2008 by the Massachusetts Institute of Technology.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * All Rights Reserved.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Export of this software from the United States of America may
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * require a specific license from the United States Government.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * It is the responsibility of any person or organization contemplating
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * export to obtain such a license before exporting.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * distribute this software and its documentation for any purpose and
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * without fee is hereby granted, provided that the above copyright
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * notice appear in all copies and that both that copyright notice and
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * this permission notice appear in supporting documentation, and that
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * the name of M.I.T. not be used in advertising or publicity pertaining
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * to distribution of the software without specific, written prior
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * permission. Furthermore if you modify this software you must label
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * your software as modified software and not distribute it in such a
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * fashion that it might be confused with the original M.I.T. software.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * M.I.T. makes no representations about the suitability of
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * this software for any purpose. It is provided "as is" without express
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * or implied warranty.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Copyright 1998-2008 The OpenLDAP Foundation.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * All rights reserved.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Redistribution and use in source and binary forms, with or without
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * modification, are permitted only as authorized by the OpenLDAP
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Public License.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * A copy of this license is available in the file LICENSE in the
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * top-level directory of the distribution or, alternatively, at
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * <http://www.OpenLDAP.org/license.html>.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/* Basic UTF-8 routines
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * These routines are "dumb". Though they understand UTF-8,
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * they don't grok Unicode. That is, they can push bits,
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * but don't have a clue what the bits represent. That's
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * good enough for use with the KRB5 Client SDK.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * These routines are not optimized.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry#include "k5-platform.h"
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry#include "k5-utf8.h"
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry#include "supp-int.h"
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return the length in bytes of the NUL-terminated UTF-8 string p.
 * The terminating NUL byte is not counted.
 */
size_t krb5int_utf8_bytes(const char *p)
{
    const char *cursor = p;

    /* Walk forward to the terminator; the distance covered is the length. */
    while (*cursor != '\0')
        cursor++;

    return (size_t) (cursor - p);
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return the number of characters in the NUL-terminated UTF-8 string p.
 *
 * The string is stepped through with KRB5_UTF8_INCR(); sequences are
 * not validated.
 */
size_t krb5int_utf8_chars(const char *p)
{
    size_t count = 0;

    while (*p != '\0') {
        count++;
        KRB5_UTF8_INCR(p);
    }

    return count;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return the number of characters in the counted UTF-8 string that
 * starts at p and is length bytes long.  The string does not need to
 * be NUL-terminated.
 *
 * A character is one lead byte plus, when the lead byte is not ASCII,
 * up to five following continuation bytes (10xxxxxx).  Sequences are
 * not otherwise validated.
 *
 * Scanning never looks at a byte at or beyond p + length, so a buffer
 * that ends with (or in the middle of) a multibyte character is not
 * over-read; such a trailing fragment counts as one character.
 */
size_t krb5int_utf8c_chars(const char *p, size_t length)
{
    const unsigned char *cur = (const unsigned char *) p;
    const unsigned char *end = cur + length;
    size_t chars = 0;

    while (cur < end) {
        int multibyte = (*cur & 0x80) != 0;
        int i;

        /* Consume the lead byte; it always starts a new character. */
        cur++;
        chars++;

        if (!multibyte)
            continue;

        /* Skip at most five continuation bytes, staying inside the buffer. */
        for (i = 1; i < 6 && cur < end && (*cur & 0xc0) == 0x80; i++)
            cur++;
    }

    return chars;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return the distance, in bytes, from p to the start of the character
 * that follows it, as located by KRB5_UTF8_NEXT().
 */
int krb5int_utf8_offset(const char *p)
{
    const char *following = KRB5_UTF8_NEXT(p);

    return following - p;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Sequence length indicated by a non-ASCII first byte.
 *
 * The table is indexed by (byte ^ 0x80), i.e. it covers bytes
 * 0x80..0xFF.  An entry of 0 marks a byte that cannot start a
 * character.
 */
const char krb5int_utf8_lentab[] = {
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barryint krb5int_utf8_charlen(const char *p)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (!(*p & 0x80))
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return 1;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/*
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Make sure the UTF-8 char used the shortest possible encoding
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * returns charlen if valid, 0 if not.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * The table is slightly modified from that of the RFC.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * UCS-4 range (hex) UTF-8 sequence (binary)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * 0000 0000-0000 007F 0.......
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * 0000 0080-0000 07FF 110++++. 10......
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * The '.' bits are "don't cares". When validating a UTF-8 sequence,
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * at least one of the '+' bits must be set, otherwise the character
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * should have been encoded in fewer octets. Note that in the two-octet
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * case, only the first octet needs to be validated, and this is done
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * in the krb5int_utf8_lentab[] above.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet.  Values are cast to char because 0x80 does
 * not fit in a signed char.
 */
const char krb5int_utf8_mintab[] = {
    /* 0x00-0x07 */
    (char) 0x20, (char) 0x80, (char) 0x80, (char) 0x80,
    (char) 0x80, (char) 0x80, (char) 0x80, (char) 0x80,
    /* 0x08-0x0F */
    (char) 0x80, (char) 0x80, (char) 0x80, (char) 0x80,
    (char) 0x80, (char) 0x80, (char) 0x80, (char) 0x80,
    /* 0x10-0x17 */
    (char) 0x30, (char) 0x80, (char) 0x80, (char) 0x80,
    (char) 0x80, (char) 0x80, (char) 0x80, (char) 0x80,
    /* 0x18-0x1F */
    (char) 0x38, (char) 0x80, (char) 0x80, (char) 0x80,
    (char) 0x3c, (char) 0x80, (char) 0x00, (char) 0x00
};
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barryint krb5int_utf8_charlen2(const char *p)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry int i = KRB5_UTF8_CHARLEN(p);
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (i > 2) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry i = 0;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return i;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/*
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Convert a UTF8 character to a UCS4 character. Return 0 on success,
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * -1 on failure.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barryint krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry const unsigned char *c = (const unsigned char *) p;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry krb5_ucs4 ch;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry int len, i;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry static unsigned char mask[] = {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *out = 0;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry len = KRB5_UTF8_CHARLEN2(p, len);
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (len == 0)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return -1;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry ch = c[0] & mask[len];
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for (i = 1; i < len; i++) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if ((c[i] & 0xc0) != 0x80)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return -1;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry ch <<= 6;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry ch |= c[i] & 0x3f;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *out = ch;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return 0;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barryint krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry krb5_ucs4 ch;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *out = 0;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return -1;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry *out = (krb5_ucs2) ch;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return 0;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/* conv UCS-2 to UTF-8, not used */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barrysize_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry size_t len = 0;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry unsigned char *p = (unsigned char *) buf;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry /* not a valid Unicode character */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (c < 0)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return 0;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry /* Just return length, don't convert */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (buf == NULL) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (c < 0x80) return 1;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry else if (c < 0x800) return 2;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry else if (c < 0x10000) return 3;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry else if (c < 0x200000) return 4;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry else if (c < 0x4000000) return 5;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry else return 6;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (c < 0x80) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = c;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry } else if (c < 0x800) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0xc0 | ( c >> 6 );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( c & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry } else if (c < 0x10000) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0xe0 | ( c >> 12 );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( c & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry } else if (c < 0x200000) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0xf0 | ( c >> 18 );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( c & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry } else if (c < 0x4000000) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0xf8 | ( c >> 24 );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 18) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( c & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry } else /* if( c < 0x80000000 ) */ {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0xfc | ( c >> 30 );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 24) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 18) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 12) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( (c >> 6) & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry p[len++] = 0x80 | ( c & 0x3f );
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return len;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barrysize_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf);
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Number of UTF-8 octets needed to encode the UCS value c: 0 for a
 * negative value, otherwise 1 to 6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with compound arguments.
 * The argument is evaluated more than once; do not pass an expression
 * with side effects.
 */
#define KRB5_UCS_UTF8LEN(c)                                             \
    ((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 :                 \
    ((c) < 0x10000 ? 3 : ((c) < 0x200000 ? 4 :                          \
    ((c) < 0x4000000 ? 5 : 6))))))
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Advance to the next UTF-8 character.
 *
 * The length implied by the first byte is ignored; instead the
 * continuation markers are used to find the start of the next
 * character.  This allows resyncing after invalid input, provided
 * the next character starts within the bytes examined: an ASCII byte
 * advances by one, anything else advances past the first byte and at
 * most five continuation bytes.
 */
char *krb5int_utf8_next(const char *p)
{
    const unsigned char *u = (const unsigned char *) p;
    int off = 1;

    if (KRB5_UTF8_ISASCII(u))
        return (char *) (p + 1);

    /* Stop at the first byte that is not of the form 10xxxxxx. */
    while (off < 6 && (u[off] & 0xc0) == 0x80)
        off++;

    return (char *) (p + off);
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Step back to the previous UTF-8 character.
 *
 * Character lengths are ignored; instead the bytes before p are
 * scanned backwards for the nearest one that is not a continuation
 * byte (10xxxxxx), and a pointer to it is returned.  At most five
 * bytes are inspected; if all of them are continuation bytes the
 * result is p - 6.  The caller must ensure those bytes are readable.
 */
char *krb5int_utf8_prev(const char *p)
{
    const unsigned char *u = (const unsigned char *) p;
    int back = 1;

    while (back < 6 && (u[-back] & 0xc0) == 0x80)
        back++;

    return (char *) (p - back);
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Copy one UTF-8 character from src to dst and return the number of
 * bytes copied.  No terminator is written.
 *
 * The length implied by the first byte is ignored; instead the
 * continuation markers decide where the character ends.  An ASCII
 * byte is copied alone; otherwise the first byte is copied followed
 * by at most five continuation bytes, so dst needs room for up to
 * six bytes.
 */
int krb5int_utf8_copy(char* dst, const char *src)
{
    const unsigned char *u = (const unsigned char *) src;
    int n = 1;

    dst[0] = src[0];

    if (KRB5_UTF8_ISASCII(u))
        return 1;

    /* Copy trailing bytes while they are of the form 10xxxxxx. */
    while (n < 6 && (u[n] & 0xc0) == 0x80) {
        dst[n] = src[n];
        n++;
    }

    return n;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry#ifndef UTF8_ALPHA_CTYPE
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/*
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * UTF-8 ctype routines
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * Only deals with characters < 0x80 (ie: US-ASCII)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/* Return the result of KRB5_ASCII() applied to the first byte at p. */
int krb5int_utf8_isascii(const char * p)
{
    const unsigned char *up = (const unsigned char *) p;
    unsigned ch = *up;

    return KRB5_ASCII(ch);
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return the result of KRB5_DIGIT() for the first byte at p, or 0 if
 * that byte is not ASCII.
 */
int krb5int_utf8_isdigit(const char * p)
{
    const unsigned char *up = (const unsigned char *) p;
    unsigned ch = *up;

    if (KRB5_ASCII(ch))
        return KRB5_DIGIT( ch );

    return 0;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return the result of KRB5_HEX() for the first byte at p, or 0 if
 * that byte is not ASCII.
 */
int krb5int_utf8_isxdigit(const char * p)
{
    const unsigned char *up = (const unsigned char *) p;
    unsigned ch = *up;

    if (KRB5_ASCII(ch))
        return KRB5_HEX(ch);

    return 0;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return 1 if the first byte at p is an ASCII whitespace character
 * (space, tab, newline, carriage return, vertical tab or form feed),
 * otherwise 0.
 */
int krb5int_utf8_isspace(const char * p)
{
    const unsigned char *up = (const unsigned char *) p;
    unsigned ch = *up;

    if (!KRB5_ASCII(ch))
        return 0;

    return ch == ' ' || ch == '\t' || ch == '\n' ||
        ch == '\r' || ch == '\v' || ch == '\f';
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/*
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * These are not needed by the C SDK and are
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * not "good enough" for general use.
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
/*
 * Return the result of KRB5_ALPHA() for the first byte at p, or 0 if
 * that byte is not ASCII.
 */
int krb5int_utf8_isalpha(const char * p)
{
    const unsigned char *up = (const unsigned char *) p;
    unsigned ch = *up;

    if (KRB5_ASCII(ch))
        return KRB5_ALPHA(ch);

    return 0;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Return the result of KRB5_ALNUM() for the first byte at p, or 0 if
 * that byte is not ASCII.
 */
int krb5int_utf8_isalnum(const char * p)
{
    const unsigned char *up = (const unsigned char *) p;
    unsigned ch = *up;

    if (KRB5_ASCII(ch))
        return KRB5_ALNUM(ch);

    return 0;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry#if 0
/*
 * Return the result of KRB5_LOWER() for the first byte at p, or 0 if
 * that byte is not ASCII.
 */
int krb5int_utf8_islower(const char * p)
{
    const unsigned char *up = (const unsigned char *) p;
    unsigned ch = *up;

    if (KRB5_ASCII(ch))
        return KRB5_LOWER(ch);

    return 0;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * Classify the first byte of p as upper case.
 *
 * Only the single byte at p is inspected.  If that byte does not satisfy
 * KRB5_ASCII the result is 0; otherwise the value of KRB5_UPPER for that
 * byte is returned as-is.
 */
int krb5int_utf8_isupper(const char *p)
{
    const unsigned char *bytes = (const unsigned char *)p;
    unsigned first = bytes[0];

    if (KRB5_ASCII(first))
        return KRB5_UPPER(first);

    return 0;
}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry#endif
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry#endif
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/*
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry * UTF-8 string routines
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/* like strchr() */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barrychar *krb5int_utf8_strchr(const char *str, const char *chr)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry krb5_ucs4 chs, ch;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return NULL;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return (char *)str;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return NULL;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/* like strcspn() but returns number of bytes, not characters */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barrysize_t krb5int_utf8_strcspn(const char *str, const char *set)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry const char *cstr, *cset;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry krb5_ucs4 chstr, chset;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return cstr - str;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return cstr - str;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/* like strspn() but returns number of bytes, not characters */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barrysize_t krb5int_utf8_strspn(const char *str, const char *set)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry const char *cstr, *cset;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry krb5_ucs4 chstr, chset;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for (cset = set; ; KRB5_UTF8_INCR(cset)) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (*cset == '\0')
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return cstr - str;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry break;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return cstr - str;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry/* like strpbrk(), replaces strchr() as well */
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barrychar *krb5int_utf8_strpbrk(const char *str, const char *set)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry{
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry const char *cset;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry krb5_ucs4 chstr, chset;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry if (krb5int_utf8_to_ucs4(str, &chstr) == 0
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry && krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return (char *)str;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry }
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry return NULL;
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry}
ba7b222e36bac28710a7f43739283302b617e7f5Glenn Barry
/*
 * UTF-8 counterpart of strtok_r() (re-entrant; not strtok()).
 *
 * str:  string to tokenize on the first call, NULL on continuation calls.
 * sep:  set of separator characters.
 * last: caller-owned cursor carrying the scan position between calls.
 *
 * Leading separators are skipped, the token is terminated in place by
 * overwriting the first byte of the separator that follows it with NUL,
 * and *last is advanced past that separator.  Returns a pointer to the
 * token, or NULL when last is NULL or no token remains.  When no token
 * remains *last is set to NULL; further continuation calls keep
 * returning NULL.
 */
char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
{
    char *begin;
    char *end;

    if (last == NULL)
        return NULL;

    begin = (str != NULL) ? str : *last;

    /*
     * *last is NULL after the input has been exhausted (see below).  A
     * continuation call in that state must report "no more tokens" rather
     * than hand a null pointer to krb5int_utf8_strspn().
     */
    if (begin == NULL)
        return NULL;

    /* Skip any separators in front of the next token. */
    begin += krb5int_utf8_strspn(begin, sep);

    if (*begin == '\0') {
        *last = NULL;
        return NULL;
    }

    /* The token runs up to the next separator or the end of the string. */
    end = &begin[krb5int_utf8_strcspn(begin, sep)];

    if (*end != '\0') {
        /* Find the following character before clobbering the separator. */
        char *next = KRB5_UTF8_NEXT(end);
        *end = '\0';
        end = next;
    }

    *last = end;

    return begin;
}