utf-8-case.cpp revision 14380feae039b4eb5a70e053e186000c706ff358
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * IPRT - UTF-8 Case Sensitivity and Folding.
e64031e20c39650a7bc902a3e1aba613b9415deevboxsync * Copyright (C) 2006-2009 Oracle Corporation
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * This file is part of VirtualBox Open Source Edition (OSE), as
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * available from http://www.virtualbox.org. This file is free software;
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * you can redistribute it and/or modify it under the terms of the GNU
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * General Public License (GPL) as published by the Free Software
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * Foundation, in version 2 as it comes in the "COPYING" file of the
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * The contents of this file may alternatively be used under the terms
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * of the Common Development and Distribution License Version 1.0
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * VirtualBox OSE distribution, in which case the provisions of the
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * CDDL are applicable instead of those of the GPL.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * You may elect to license modified versions of this file under the
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * terms and conditions of either the GPL or the CDDL or both.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync/*******************************************************************************
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync* Header Files *
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync*******************************************************************************/
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * Performs a case insensitive string compare between two UTF-8 strings.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * This is a simplified compare, as only the simplified lower/upper case folding
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * specified by the Unicode specs is used. It does not consider character pairs
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * as they are used in some languages, just simple upper & lower case compares.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * The result is the difference between the mismatching codepoints after they
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * both have been lower cased.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * If the string encoding is invalid the function will assert (strict builds)
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * and use RTStrCmp for the remainder of the string.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * @returns < 0 if the first string is less than the second string.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * @returns 0 if the first string is identical to the second string.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * @returns > 0 if the first string is greater than the second string.
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync * @param psz1 First UTF-8 string. Null is allowed.
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync * @param psz2 Second UTF-8 string. Null is allowed.
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsyncRTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
4e3469ac31db1401d787d60312d2179bc09757b9vboxsync /* Get the codepoints */
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync /* compare */
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync /* hit the terminator? */
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync /* Hit some bad encoding, continue in case sensitive mode. */
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * Performs a case insensitive string compare between two UTF-8 strings, given a
d1966fe9681e9a100f8c895f08e450fc32dafd48vboxsync * maximum string length.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * This is a simplified compare, as only the simplified lower/upper case folding
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * specified by the Unicode specs is used. It does not consider character pairs
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * as they are used in some languages, just simple upper & lower case compares.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * The result is the difference between the mismatching codepoints after they
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * both have been lower cased.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * If the string encoding is invalid the function will assert (strict builds)
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * and use RTStrCmp for the remainder of the string.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * @returns < 0 if the first string is less than the second string.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * @returns 0 if the first string is identical to the second string.
343a0f715a7abe21308b4a564698ab9c93473fcavboxsync * @returns > 0 if the first string is greater than the second string.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * @param psz1 First UTF-8 string. Null is allowed.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * @param psz2 Second UTF-8 string. Null is allowed.
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync * @param cchMax Maximum string length
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsyncRTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync /* Get the codepoints */
9ac9eda1e04d193f54339eaeffe1c8b2f8ea04b1vboxsync psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync /* compare */
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync iDiff = RTUniCpToUpper(cp1) != RTUniCpToUpper(cp2);
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync iDiff = RTUniCpToLower(cp1) - RTUniCpToLower(cp2); /* lower case diff last! */
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync /* hit the terminator? */
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync /* Hit some bad encoding, continue in case sensitive mode. */
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsyncRTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync /* Any NULL string means a NULL return. (In the RTStrCmp tradition.) */
4ee5a4cd660730c997785c6cbc12881a115079e8vboxsync /* The empty string matches everything. */
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync return (char *)pszHaystack;
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * The search strategy is to pick out the first char of the needle, fold it,
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * and match it against the haystack code point by code point. When encountering
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync /* Cp0 is not a case sensitive char. */
cd899b2444ca69566bd04cfac96828714d3bd1b0vboxsync /* Cp0 is case sensitive */
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync /* Cp0 is case sensitive and folds to two different chars. (paranoia) */
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * Loop the code points in the string, converting them one by one.
230bd8589bba39933ac5ec21482d6186d675e604vboxsync * ASSUMES that the code points for upper and lower case are encoded
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * with the exact same length.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync /** @todo Handle bad encodings correctly+quietly, remove assumption,
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * optimize. */
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * Loop the code points in the string, converting them one by one.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * ASSUMES that the code points for upper and lower case are encoded
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * with the exact same length.
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync /** @todo Handle bad encodings correctly+quietly, remove assumption,
b1d7d513c459787311cd09c440524044fa7ff8a9vboxsync * optimize. */