13493ab7596e827b8d0caab2c89e635dd65f78f9vboxsync * IPRT - UTF-8 Case Sensitivity and Folding, Part 1.
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync * Copyright (C) 2006-2010 Oracle Corporation
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * This file is part of VirtualBox Open Source Edition (OSE), as
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * available from http://www.virtualbox.org. This file is free software;
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * you can redistribute it and/or modify it under the terms of the GNU
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * General Public License (GPL) as published by the Free Software
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * Foundation, in version 2 as it comes in the "COPYING" file of the
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * The contents of this file may alternatively be used under the terms
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * of the Common Development and Distribution License Version 1.0
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * VirtualBox OSE distribution, in which case the provisions of the
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * CDDL are applicable instead of those of the GPL.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * You may elect to license modified versions of this file under the
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * terms and conditions of either the GPL or the CDDL or both.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync/*******************************************************************************
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync* Header Files *
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync*******************************************************************************/
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * Performs a case insensitive string compare between two UTF-8 strings.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * This is a simplified compare, as only the simplified lower/upper case folding
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * specified by the Unicode specs is used. It does not consider character pairs
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * as they are used in some languages, just simple upper & lower case compares.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * The result is the difference between the mismatching codepoints after they
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * both have been lower cased.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * If the string encoding is invalid the function will assert (strict builds)
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * and use RTStrCmp for the remainder of the string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @returns < 0 if the first string is less than the second string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @returns 0 if the first string is identical to the second string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @returns > 0 if the first string is greater than the second string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @param psz1 First UTF-8 string. Null is allowed.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @param psz2 Second UTF-8 string. Null is allowed.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsyncRTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* Get the codepoints */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* compare */
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* hit the terminator? */
14380feae039b4eb5a70e053e186000c706ff358vboxsync /* Hit some bad encoding, continue in case sensitive mode. */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * Performs a case insensitive string compare between two UTF-8 strings, given a
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * maximum string length.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * This is a simplified compare, as only the simplified lower/upper case folding
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * specified by the Unicode specs is used. It does not consider character pairs
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * as they are used in some languages, just simple upper & lower case compares.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * The result is the difference between the mismatching codepoints after they
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * both have been lower cased.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * If the string encoding is invalid the function will assert (strict builds)
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * and use RTStrCmp for the remainder of the string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @returns < 0 if the first string is less than the second string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @returns 0 if the first string is identical to the second string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @returns > 0 if the first string is greater than the second string.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @param psz1 First UTF-8 string. Null is allowed.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @param psz2 Second UTF-8 string. Null is allowed.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * @param cchMax Maximum string length
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsyncRTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* Get the codepoints */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync psz1 -= (cchMax - cchMax2 + 1); /* This can't overflow, can it? */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* compare */
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* hit the terminator? */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* Hit some bad encoding, continue in case sensitive mode. */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsyncRTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* Any NULL string means a NULL return. (In the RTStrCmp tradition.) */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* The empty string matches everything. */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync return (char *)pszHaystack;
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * The search strategy is to pick out the first char of the needle, fold it,
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * and match it against the haystack code point by code point. When encountering
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync RTStrGetCpEx(&pszNeedle, &Cp0); /* pszNeedle is advanced one code point. */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync size_t const cchNeedleCp0= pszNeedle - pszNeedleStart;
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* Cp0 is not a case sensitive char. */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* Cp0 is case sensitive */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync /* Cp0 is case sensitive and folds to two different chars. (paranoia) */
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * Loop the code points in the string, converting them one by one.
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync * ASSUMES that the folded code points have an encoding that is equal or
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync * shorter than the original (this is presently correct).
f9bca9145a0dca17ca3355cfd8d416d1241637c0vboxsync || RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync } while (uc != 0);
3183efc91c7b8252f1dc50dca3efd2d8ae627813vboxsync * Loop the code points in the string, converting them one by one.
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync * ASSUMES that the folded code points have an encoding that is equal or
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync * shorter than the original (this is presently correct).
f9bca9145a0dca17ca3355cfd8d416d1241637c0vboxsync || RTUniCpCalcUtf8Len(uc2) == RTUniCpCalcUtf8Len(uc)))
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync /* bad encoding, just copy it quietly (uc == RTUNICP_INVALID (!= 0)). */
5a4840b4edc5d07e4f3fe427e1f51d6395552f7evboxsync } while (uc != 0);