utf-8-case.cpp revision e64031e20c39650a7bc902a3e1aba613b9415dee
/* $Id$ */
/** @file
* IPRT - UTF-8 Case Sensitivity and Folding.
*/
/*
* Copyright (C) 2006-2009 Oracle Corporation
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* The contents of this file may alternatively be used under the terms
* of the Common Development and Distribution License Version 1.0
* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
* VirtualBox OSE distribution, in which case the provisions of the
* CDDL are applicable instead of those of the GPL.
*
* You may elect to license modified versions of this file under the
* terms and conditions of either the GPL or the CDDL or both.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
/**
* Performs a case insensitive string compare between two UTF-8 strings.
*
* This is a simplified compare, as only the simplified lower/upper case folding
* specified by the Unicode specs is used. It does not consider character pairs
* as they are used in some languages, just simple upper & lower case compares.
*
* The result is the difference between the mismatching codepoints after they
* both have been lower cased.
*
* If the string encoding is invalid the function will assert (strict builds)
* and use RTStrCmp for the remainder of the string.
*
* @returns < 0 if the first string less than the second string.
* @returns 0 if the first string identical to the second string.
* @returns > 0 if the first string greater than the second string.
* @param psz1 First UTF-8 string. Null is allowed.
* @param psz2 Second UTF-8 string. Null is allowed.
*/
{
return 0;
if (!psz1)
return -1;
if (!psz2)
return 1;
for (;;)
{
/* Get the codepoints */
if (RT_FAILURE(rc))
{
psz1--;
break;
}
if (RT_FAILURE(rc))
{
psz2--;
break;
}
/* compare */
if (iDiff)
{
if (iDiff)
{
if (iDiff)
return iDiff;
}
}
/* hit the terminator? */
if (!cp1)
return 0;
}
/* Hit some bad encoding, continue in case sensitive mode. */
}
/**
* Performs a case insensitive string compare between two UTF-8 strings, given a
* maximum string length.
*
* This is a simplified compare, as only the simplified lower/upper case folding
* specified by the Unicode specs is used. It does not consider character pairs
* as they are used in some languages, just simple upper & lower case compares.
*
* The result is the difference between the mismatching codepoints after they
* both have been lower cased.
*
* If the string encoding is invalid the function will assert (strict builds)
* and use RTStrCmp for the remainder of the string.
*
* @returns < 0 if the first string less than the second string.
* @returns 0 if the first string identical to the second string.
* @returns > 0 if the first string greater than the second string.
* @param psz1 First UTF-8 string. Null is allowed.
* @param psz2 Second UTF-8 string. Null is allowed.
* @param cchMax Maximum string length
*/
{
if (cchMax == 0)
return 0;
return 0;
if (!psz1)
return -1;
if (!psz2)
return 1;
for (;;)
{
/* Get the codepoints */
if (RT_FAILURE(rc))
{
psz1--;
cchMax++;
break;
}
if (RT_FAILURE(rc))
{
psz2--;
break;
}
/* compare */
if (iDiff)
{
if (iDiff)
{
if (iDiff)
return iDiff;
}
}
/* hit the terminator? */
return 0;
}
/* Hit some bad encoding, continue in case sensitive mode. */
}
{
/* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
if (!pszHaystack)
return NULL;
if (!pszNeedle)
return NULL;
/* The empty string matches everything. */
if (!*pszNeedle)
return (char *)pszHaystack;
/*
* The search strategy is to pick out the first char of the needle, fold it,
* and match it against the haystack code point by code point. When encountering
* a matching code point we use RTStrNICmp for the remainder (if any) of the needle.
*/
const char * const pszNeedleStart = pszNeedle;
{
/* Cp0 is not a case sensitive char. */
for (;;)
{
if (!Cp)
break;
return (char *)pszHaystack - cchNeedleCp0;
}
}
{
/* Cp0 is case sensitive */
for (;;)
{
if (!Cp)
break;
return (char *)pszHaystack - cchNeedleCp0;
}
}
else
{
/* Cp0 is case sensitive and folds to two different chars. (paranoia) */
for (;;)
{
if (!Cp)
break;
return (char *)pszHaystack - cchNeedleCp0;
}
}
return NULL;
}
{
/*
* Loop the code points in the string, converting them one by one.
* ASSUMES that the code points for upper and lower case are encoded
* with the exact same length.
*/
/** @todo Handle bad encodings correctly and quietly, remove the
* same-encoded-length assumption, optimize. */
while (*pszCur)
{
}
return psz;
}
{
/*
* Loop the code points in the string, converting them one by one.
* ASSUMES that the code points for upper and lower case are encoded
* with the exact same length.
*/
/** @todo Handle bad encodings correctly and quietly, remove the
* same-encoded-length assumption, optimize. */
while(*pszCur)
{
}
return psz;
}