utf8.c revision 8f6ce71fe79d897b67157d92869db87ee2042af6
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering This file is part of systemd.
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering Copyright 2012 Lennart Poettering
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering systemd is free software; you can redistribute it and/or modify it
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering under the terms of the GNU Lesser General Public License as published by
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering the Free Software Foundation; either version 2.1 of the License, or
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering (at your option) any later version.
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering systemd is distributed in the hope that it will be useful, but
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering WITHOUT ANY WARRANTY; without even the implied warranty of
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering Lesser General Public License for more details.
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering You should have received a copy of the GNU Lesser General Public License
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering along with systemd; If not, see <http://www.gnu.org/licenses/>.
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering/* This file is based on the GLIB utf8 validation functions. The
d7b8eec7dc7fe307d3a08b32cf1a9ad4276ce6d5Lennart Poettering * original license text follows. */
4aa4d2ae9717d0f8656528a3197bbc0c256380b1Zbigniew Jędrzejewski-Szmek/* gutf8.c - Operations on UTF-8 strings.
4aa4d2ae9717d0f8656528a3197bbc0c256380b1Zbigniew Jędrzejewski-Szmek * Copyright (C) 1999 Tom Tromey
4aa4d2ae9717d0f8656528a3197bbc0c256380b1Zbigniew Jędrzejewski-Szmek * Copyright (C) 2000 Red Hat, Inc.
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering * This library is free software; you can redistribute it and/or
0a2f9085e29c855ec1aaa996ded00fc36b06210cLennart Poettering * modify it under the terms of the GNU Library General Public
0a2f9085e29c855ec1aaa996ded00fc36b06210cLennart Poettering * License as published by the Free Software Foundation; either
0a2f9085e29c855ec1aaa996ded00fc36b06210cLennart Poettering * version 2 of the License, or (at your option) any later version.
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering * This library is distributed in the hope that it will be useful,
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering * but WITHOUT ANY WARRANTY; without even the implied warranty of
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering * Library General Public License for more details.
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering * You should have received a copy of the GNU Library General Public
329c542585cd92cb905990e3bf59eda16fd88cfbLennart Poettering * License along with this library; if not, write to the Free Software
329c542585cd92cb905990e3bf59eda16fd88cfbLennart Poettering * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poetteringstatic inline bool is_unicode_valid(uint32_t ch) {
755bde375f4db393ad06e73340bfcf4d0cf91bb2Lennart Poettering if (ch >= 0x110000) /* End of unicode space */
755bde375f4db393ad06e73340bfcf4d0cf91bb2Lennart Poettering if ((ch & 0xFFFFF800) == 0xD800) /* Reserved area for UTF-16 */
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved */
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering if ((ch & 0xFFFE) == 0xFFFE) /* BOM (Byte Order Mark) */
755bde375f4db393ad06e73340bfcf4d0cf91bb2Lennart Poetteringstatic bool is_unicode_control(uint32_t ch) {
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering 0 to ' '-1 is the C0 range.
ecabcf8b6edcc856ec2fd5bd43fc675a8fe04731Lennart Poettering DEL=0x7F, and DEL+1 to 0x9F is C1 range.
7dbb1d08f66cd44b1296be3ee8e3629b989e19a8Zbigniew Jędrzejewski-Szmek '\t' is in C0 range, but more or less harmless and commonly used.
755bde375f4db393ad06e73340bfcf4d0cf91bb2Lennart Poettering return (ch < ' ' && ch != '\t' && ch != '\n') ||
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering/* count of characters used to encode one unicode char */
4aa4d2ae9717d0f8656528a3197bbc0c256380b1Zbigniew Jędrzejewski-Szmekstatic int utf8_encoded_expected_len(const char *str) {
4aa4d2ae9717d0f8656528a3197bbc0c256380b1Zbigniew Jędrzejewski-Szmek unsigned char c = (unsigned char)str[0];
8ea48dfcd33e8db0c01bf8c57c3bbcfdc3c86d4bLennart Poettering/* decode one unicode char */
755bde375f4db393ad06e73340bfcf4d0cf91bb2Lennart Poetteringstatic int utf8_encoded_to_unichar(const char *str) {
7dbb1d08f66cd44b1296be3ee8e3629b989e19a8Zbigniew Jędrzejewski-Szmek len = utf8_encoded_expected_len(str);
return unichar;
const uint8_t *p;
const uint8_t *p;
if (len < 0)
return NULL;
p += len;
return str;
for (p = str; *p; p++)
return NULL;
return (char*) str;
size_t l;
return NULL;
for (s = str, d = r; *s; s++)
const uint8_t *f;
uint8_t *t;
return NULL;
t = (uint8_t*) r;
uint16_t c;
*(t++) = (uint8_t) c;
int len;
int unichar;
if (len == 0)
for (i = 0; i < len; i++)
return len;