common/string/uniread.cpp

	uniread.cpp revision c58f1213e628a545081c70e26c6b67a841cff880
/* $Id$ */
/** @file
 * IPRT - Unicode Specification Reader.
 */

/*
 * Copyright (C) 2006-2012 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
 * VirtualBox OSE distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 */

/*******************************************************************************
*   Header Files                                                               *
*******************************************************************************/
#include <iprt/types.h>
#include <iprt/stdarg.h>
#include <iprt/ctype.h>

#include <stdio.h>
#include <string.h>
#include <stdlib.h>


/*******************************************************************************
*   Global Variables                                                           *
*******************************************************************************/
/** When set, no output is produced.  Very useful when debugging this code.
 * Checked by Stream1Printf before it writes to stdout. */
static bool g_fQuiet = false;
/** The file we're currently parsing.  Set by OpenFile, reset to NULL by
 * CloseFile, and used by ParseError to prefix its message. */
static const char *g_pszCurFile;
/** The current line number.  Reset by OpenFile/CloseFile and incremented by
 * GetLineFromFile for every read attempt. */
static unsigned g_iLine;


/**
 * Reports a parse error on stderr and terminates the program.
 *
 * The message is prefixed with "parse error: <file>:<line>: " taken from
 * g_pszCurFile and g_iLine.  The function does not return; it calls exit(1).
 *
 * @param   pszFormat           printf-style format string for the message.
 * @param   ...                 Format arguments.
 */
static void ParseError(const char *pszFormat, ...)
{
    /* Location prefix first, then the caller's message. */
    fprintf(stderr, "parse error: %s:%u: ", g_pszCurFile, g_iLine);

    va_list args;
    va_start(args, pszFormat);
    vfprintf(stderr, pszFormat, args);
    va_end(args);

    exit(1);
}

/**
 * Strips a line in place.
 *
 * Leading spaces and tabs are skipped, everything from the first '#' onwards
 * is cut off, and trailing spaces, tabs, CRs and LFs are overwritten with
 * terminators.
 *
 * @returns Pointer to the first non-blank char (inside @a pszLine).
 * @param   pszLine     The line string to strip.  Modified in place.
 */
static char *StripLine(char *pszLine)
{
    /* Skip leading blanks by advancing the start pointer. */
    pszLine += strspn(pszLine, " \t");

    /* Cut at the comment marker; without one, trimming starts at the terminator. */
    char *pszEnd = strchr(pszLine, '#');
    if (pszEnd)
        *pszEnd = '\0';
    else
        pszEnd = pszLine + strlen(pszLine);

    /* Walk backwards over trailing whitespace, terminating the string earlier each step. */
    while (pszEnd > pszLine)
    {
        char const chPrev = pszEnd[-1];
        if (chPrev != ' ' && chPrev != '\t' && chPrev != '\n' && chPrev != '\r')
            break;
        *--pszEnd = '\0';
    }

    return pszLine;
}


/**
 * Checks whether a line carries no data and should be skipped.
 *
 * A line is skipped when, after any leading spaces, tabs, CRs and LFs, it is
 * either empty or starts with a '#' comment marker.
 *
 * @returns true if the line is blank or a comment, false otherwise.
 * @param   pszLine     The line to consider.  Not modified.
 */
static bool IsCommentOrBlankLine(const char *pszLine)
{
    const char *pszFirst = pszLine + strspn(pszLine, " \t\n\r");
    return *pszFirst == '\0' || *pszFirst == '#';
}


/**
 * Gets the first ';'-separated field in the string.
 *
 * The separator (if any) is overwritten with a terminator and surrounding
 * whitespace (space, tab, CR, LF) is stripped from the field in place.
 *
 * @returns Pointer to the stripped first field (inside @a pszLine).
 * @param   ppsz        Where to store the pointer to the next field.  When
 *                      there is no separator, this is set to the string
 *                      terminator, i.e. an empty string.
 * @param   pszLine     The line string. (could also be *ppsz from a previous
 *                      FirstField/NextField call)
 */
static char *FirstField(char **ppsz, char *pszLine)
{
    /* Locate the end of this field and publish where the next one starts. */
    char *pszEnd = strchr(pszLine, ';');
    if (pszEnd)
    {
        *pszEnd = '\0';
        *ppsz = pszEnd + 1;
    }
    else
    {
        pszEnd = pszLine + strlen(pszLine);
        *ppsz = pszEnd;
    }

    /* Strip leading whitespace by moving the start forward... */
    pszLine += strspn(pszLine, " \t\r\n");

    /* ...and trailing whitespace by pulling the terminator back. */
    while (pszEnd > pszLine)
    {
        char const chPrev = pszEnd[-1];
        if (chPrev != ' ' && chPrev != '\t' && chPrev != '\n' && chPrev != '\r')
            break;
        *--pszEnd = '\0';
    }
    return pszLine;
}


/**
 * Gets the next field in a field enumeration.
 *
 * Thin wrapper that feeds the current cursor back into FirstField.
 *
 * @returns Pointer to the field at the current position.
 * @param   ppsz        Where to get and store the string position.
 */
static char *NextField(char **ppsz)
{
    char *pszCursor = *ppsz;
    return FirstField(ppsz, pszCursor);
}


/**
 * Splits a decomposition field into its type and its mappings.
 *
 * The field may start with a type that is enclosed in angle brackets.
 *
 * @returns Pointer to the mapping values following the type.  For an empty
 *          field the (empty) input string is returned.
 * @param   ppszType    Pointer to the type field pointer.  On input the type
 *                      field contains the combined type and mapping string.  On
 *                      output it points to the type only, without the angle
 *                      brackets.  If the field is non-empty but has no type, it
 *                      is replaced with an empty string (const).  For an empty
 *                      field it is left untouched.
 */
static char *SplitDecompField(char **ppszType)
{
    char *pszField = *ppszType;

    /* Empty field: nothing to split. */
    if (*pszField == '\0')
        return pszField;

    /* No "<type>" prefix: the whole field is the mapping list. */
    if (*pszField != '<')
    {
        *ppszType = (char *)"";
        return pszField;
    }

    /* The type name starts right after '<' and runs up to the matching '>'. */
    char *pszTypeName = pszField + 1;
    *ppszType = pszTypeName;
    char *pszClose = strchr(pszTypeName, '>');
    if (!pszClose)
    {
        ParseError("Bad Decomposition Type/Mappings\n");
        return pszTypeName;
    }
    *pszClose = '\0';

    /* Whatever follows the '>' must be a non-empty mapping list. */
    char *pszMappings = StripLine(pszClose + 1);
    if (*pszMappings == '\0')
        ParseError("Missing decomposition mappings\n");
    return pszMappings;
}

/**
 * Converts a code point field to a number.
 *
 * The field is parsed as a hexadecimal number using strtoul.  A parse error is
 * raised (see ParseError) when the field contains no digits at all or when
 * anything is left over after the number.  Use ToNumDefault for fields that
 * are allowed to be empty.
 *
 * @returns Code point.
 * @param   psz     The field string.
 */
static RTUNICP ToNum(const char *psz)
{
    char *pszEnd = NULL;
    unsigned long ul = strtoul(psz, &pszEnd, 16);
    /* strtoul signals "nothing converted" by leaving the end pointer at the
       start of the input; without this check an empty field would silently
       be accepted as zero. */
    if (!pszEnd || pszEnd == psz || *pszEnd)
        ParseError("failed converting '%s' to a number!\n", psz);
    return (RTUNICP)ul;
}


/**
 * Same as ToNum except that an empty field yields @a Default instead.
 *
 * @returns The converted code point, or @a Default if the field is empty.
 * @param   psz         The field string.
 * @param   Default     The value to return for an empty field.
 */
static RTUNICP ToNumDefault(const char *psz, RTUNICP Default)
{
    return *psz != '\0' ? ToNum(psz) : Default;
}


/**
 * Converts a code point range to numbers.
 *
 * Accepts either a single hexadecimal code point or two of them separated by
 * one or more dots (e.g. "0041..005A").  For a single code point the last
 * code point equals the start.
 *
 * @returns The start code point.
 * @returns ~(RTUNICP)0 on failure (after reporting via ParseError).
 * @param   psz     The field string.
 * @param   pLast   Where to store the last code point in the range.  Only
 *                  written on success.
 */
static RTUNICP ToRange(const char *psz, PRTUNICP pLast)
{
    char *pszNext = NULL;
    unsigned long const ulFirst = strtoul(psz, &pszNext, 16);
    unsigned long       ulEnd   = ulFirst;

    /* Anything after the first number must be the dots and a second number. */
    if (pszNext && *pszNext)
    {
        if (*pszNext != '.')
        {
            ParseError("failed converting '%s' to a number!\n", psz);
            return ~(RTUNICP)0;
        }

        /* Skip the separator dots. */
        do
            pszNext++;
        while (*pszNext == '.');

        ulEnd = strtoul(pszNext, &pszNext, 16);
        if (pszNext && *pszNext)
        {
            ParseError("failed converting '%s' to a number!\n", psz);
            return ~(RTUNICP)0;
        }
    }

    *pLast = (RTUNICP)ulEnd;
    return (RTUNICP)ulFirst;
}

/**
 * Converts a blank-separated list of hexadecimal code points into an array.
 *
 * Used for the decomposition mappings field and similar.  The input string is
 * modified: a terminator is written after each code point.  The array grows
 * in steps of four entries; running out of memory terminates the program.
 *
 * @returns Mapping array (allocated with realloc) or NULL if none.
 * @param   psz                 The string to convert.  Can be empty.
 * @param   pcEntries           Where to store the number of entries.
 * @param   cMax                The max number of entries.
 */
static PRTUNICP ToMapping(char *psz, unsigned *pcEntries, unsigned cMax)
{
    PRTUNICP paResult   = NULL;
    unsigned cAllocated = 0;
    unsigned cUsed      = 0;

    /* No string, no mappings. */
    if (!psz)
    {
        *pcEntries = 0;
        return NULL;
    }

    for (;;)
    {
        /* Move to the start of the next code point, stopping at the end of the string. */
        while (RT_C_IS_BLANK(*psz))
            psz++;
        if (*psz == '\0')
            break;

        /* Enforce the caller's limit before storing anything. */
        if (cUsed >= cMax)
        {
            ParseError("Too many mappings.\n");
            break;
        }

        /* Grow the array when it is full. */
        if (cUsed >= cAllocated)
        {
            cAllocated += 4;
            paResult = (PRTUNICP)realloc(paResult, cAllocated * sizeof(paResult[0]));
            if (!paResult)
            {
                fprintf(stderr, "out of memory (%u)\n", (unsigned)(cAllocated * sizeof(paResult[0])));
                exit(1);
            }
        }

        /* Isolate the run of hex digits; it must end at a blank or the end of the string. */
        char *pszToken = psz;
        while (RT_C_IS_XDIGIT(*psz))
            psz++;
        if (*psz && !RT_C_IS_BLANK(*psz))
            ParseError("Malformed mappings.\n");
        if (*psz)
            *psz++ = '\0';

        /* Convert to number and add it. */
        paResult[cUsed] = ToNum(pszToken);
        cUsed++;
    }

    *pcEntries = cUsed;
    return paResult;
}


/**
 * Duplicates a string, sharing one constant for empty strings to save memory.
 *
 * Terminates the program with an "out of memory!" message if the copy cannot
 * be allocated.
 *
 * @returns Pointer to string copy.  For an empty input this is a string
 *          literal and must not be modified or freed.
 * @param   pszStr      The string to duplicate.
 */
static char *DupStr(const char *pszStr)
{
    /* Empty strings are common; hand out a shared literal instead of allocating. */
    if (*pszStr == '\0')
        return (char *)"";

    char *pszCopy = strdup(pszStr);
    if (!pszCopy)
    {
        fprintf(stderr, "out of memory!\n");
        exit(1);
    }
    return pszCopy;
}


/**
 * Array of all possible and impossible unicode code points (as of 4.1),
 * indexed by code point.
 *
 * g_aCPInfo has 0x110000 entries.  ReadUnicodeData fills in the entries listed
 * in the data file and uses NullEntry for the gaps; ApplyProperty later sets
 * the flag bits from the property files.
 */
struct CPINFO
{
    /** The code point this entry describes (same as the array index). */
    RTUNICP     CodePoint;
    /** Simple case mappings; ReadUnicodeData and NullEntry default these to the
     * code point itself when no mapping is given.
     * @{ */
    RTUNICP     SimpleUpperCaseMapping;
    RTUNICP     SimpleLowerCaseMapping;
    RTUNICP     SimpleTitleCaseMapping;
    /** @} */
    /** Field 3 of the data line, converted by ToNum. */
    unsigned    CanonicalCombiningClass;
    /** Decomposition type without the angle brackets; empty string if none. */
    const char *pszDecompositionType;
    /** Number of entries in paDecompositionMapping. */
    unsigned    cDecompositionMapping;
    /** Decomposition mapping array produced by ToMapping; NULL if none. */
    PRTUNICP    paDecompositionMapping;
    /** The character name (field 1 of the data line); empty for null entries. */
    const char *pszName;
    /** Set if this is an unused entry */
    unsigned    fNullEntry : 1;

    /* Binary properties.  Each bit is set to 1 by ApplyProperty when a property
       file lists the correspondingly named property for the code point. */
    unsigned    fAlphabetic : 1;
    unsigned    fASCIIHexDigit : 1;
    unsigned    fBidiControl : 1;
    unsigned    fCaseIgnorable : 1;
    unsigned    fCased : 1;
    unsigned    fChangesWhenCasefolded : 1;
    unsigned    fChangesWhenCasemapped : 1;
    unsigned    fChangesWhenLowercased : 1;
    unsigned    fChangesWhenTitlecased : 1;
    unsigned    fChangesWhenUppercased : 1;
    unsigned    fDash : 1;
    unsigned    fDefaultIgnorableCodePoint : 1;
    unsigned    fDeprecated : 1;
    unsigned    fDiacritic : 1;
    unsigned    fExtender : 1;
    unsigned    fGraphemeBase : 1;
    unsigned    fGraphemeExtend : 1;
    unsigned    fGraphemeLink : 1;
    unsigned    fHexDigit : 1;
    unsigned    fHyphen : 1;
    unsigned    fIDContinue : 1;
    unsigned    fIdeographic : 1;
    unsigned    fIDSBinaryOperator : 1;
    unsigned    fIDStart : 1;
    unsigned    fIDSTrinaryOperator : 1;
    unsigned    fJoinControl : 1;
    unsigned    fLogicalOrderException : 1;
    unsigned    fLowercase : 1;
    unsigned    fMath : 1;
    unsigned    fNoncharacterCodePoint : 1;
    unsigned    fOtherAlphabetic : 1;
    unsigned    fOtherDefaultIgnorableCodePoint : 1;
    unsigned    fOtherGraphemeExtend : 1;
    unsigned    fOtherIDContinue : 1;
    unsigned    fOtherIDStart : 1;
    unsigned    fOtherLowercase : 1;
    unsigned    fOtherMath : 1;
    unsigned    fOtherUppercase : 1;
    unsigned    fPatternSyntax : 1;
    unsigned    fPatternWhiteSpace : 1;
    unsigned    fQuotationMark : 1;
    unsigned    fRadical : 1;
    unsigned    fSoftDotted : 1;
    unsigned    fSTerm : 1;
    unsigned    fTerminalPunctuation : 1;
    unsigned    fUnifiedIdeograph : 1;
    unsigned    fUppercase : 1;
    unsigned    fVariationSelector : 1;
    unsigned    fWhiteSpace : 1;
    unsigned    fXIDContinue : 1;
    unsigned    fXIDStart : 1;

    /** @name DerivedNormalizationProps.txt
     * The fInv*_QC members are two bits wide and hold the inverted quick check
     * value as returned by YesNoMaybePropertyValueInv: 0 = Y, 1 = N, 2 = M.
     * @{ */
    unsigned    fFullCompositionExclusion : 1;
    unsigned    fInvNFC_QC : 2;     /**< If 1 (NFC_QC == N) then code point 100% sure not part of NFC string. */
    unsigned    fInvNFD_QC : 2;     /**< If 1 (NFD_QC == N) then code point 100% sure not part of NFD string. */
    unsigned    fInvNFKC_QC : 2;
    unsigned    fInvNFKD_QC : 2;
    unsigned    fExpandsOnNFC : 1;
    unsigned    fExpandsOnNFD : 1;
    unsigned    fExpandsOnNFKC : 1;
    unsigned    fExpandsOnNFKD : 1;
    /** @}  */

    /* unprocessed stuff, so far.  These are kept as the raw field strings
       copied by ReadUnicodeData (empty strings for null entries). */
    const char *pszGeneralCategory;
    const char *pszBidiClass;
    const char *pszNumericType;
    const char *pszNumericValueD;
    const char *pszNumericValueN;
    const char *pszBidiMirrored;
    const char *pszUnicode1Name;
    const char *pszISOComment;
} g_aCPInfo[0x110000];


/**
 * Creates a 'null' entry at i.
 * @param   i       The entry in question.
 */
static void NullEntry(unsigned i)
{
    g_aCPInfo[i].CodePoint = i;
    g_aCPInfo[i].fNullEntry = 1;
    g_aCPInfo[i].SimpleUpperCaseMapping = i;
    g_aCPInfo[i].SimpleLowerCaseMapping = i;
    g_aCPInfo[i].SimpleTitleCaseMapping = i;
    g_aCPInfo[i].pszDecompositionType = "";
    g_aCPInfo[i].cDecompositionMapping = 0;
    g_aCPInfo[i].paDecompositionMapping = NULL;
    g_aCPInfo[i].pszName = "";
    g_aCPInfo[i].pszGeneralCategory = "";
    g_aCPInfo[i].pszBidiClass = "";
    g_aCPInfo[i].pszNumericType = "";
    g_aCPInfo[i].pszNumericValueD = "";
    g_aCPInfo[i].pszNumericValueN = "";
    g_aCPInfo[i].pszBidiMirrored = "";
    g_aCPInfo[i].pszUnicode1Name = "";
    g_aCPInfo[i].pszISOComment = "";
}


/**
 * Opens a file for reading, optionally with a base path prefixed.
 *
 * The base path is only used when one is given and the file name is not
 * absolute (leading '/', or on MSC/OS2 builds a leading '\\' or a drive
 * letter).  Unless the path allocation fails, the parser position
 * (g_pszCurFile, g_iLine) is reset to the start of @a pszFilename, whether or
 * not the open succeeded.
 *
 * @returns file stream on success, NULL w/ complaint on failure.
 * @param   pszBasePath         The base path, can be NULL.
 * @param   pszFilename         The name of the file to open.
 */
static FILE *OpenFile(const char *pszBasePath, const char *pszFilename)
{
    /* Decide whether the file name is to be used exactly as given. */
    bool fUseAsIs = !pszBasePath || *pszFilename == '/';
#if defined(_MSC_VER) || defined(__OS2__)
    if (!fUseAsIs)
        fUseAsIs = *pszFilename == '\\'
                || (*pszFilename && pszFilename[1] == ':');
#endif

    /* Otherwise build "<base>/<name>" in a temporary heap buffer. */
    char       *pszJoined   = NULL;
    const char *pszOpenName = pszFilename;
    if (!fUseAsIs)
    {
        size_t const cchBase = strlen(pszBasePath);
        size_t const cchName = strlen(pszFilename);
        pszJoined = (char *)malloc(cchBase + 1 + cchName + 1);
        if (!pszJoined)
        {
            fprintf(stderr, "uniread: failed to allocate %d bytes\n", (int)(cchBase + 1 + cchName + 1));
            return NULL;
        }

        memcpy(pszJoined, pszBasePath, cchBase);
        pszJoined[cchBase] = '/';
        memcpy(&pszJoined[cchBase + 1], pszFilename, cchName + 1); /* includes the terminator */
        pszOpenName = pszJoined;
    }

    FILE *pFile = fopen(pszOpenName, "r");
    if (!pFile)
        fprintf(stderr, "uniread: failed to open '%s' for reading\n", pszOpenName);
    free(pszJoined);

    g_pszCurFile = pszFilename;
    g_iLine      = 0;
    return pFile;
}


/**
 * Wrapper around fgets that keeps track of the line number.
 *
 * g_iLine is incremented on every call, including the one that hits the end
 * of the file.
 *
 * @returns See fgets.
 * @param   pszBuf              The buffer.  See fgets for output definition.
 * @param   cbBuf               The buffer size.
 * @param   pFile               The file to read from.
 */
static char *GetLineFromFile(char *pszBuf, int cbBuf, FILE *pFile)
{
    g_iLine += 1;
    char *pszRet = fgets(pszBuf, cbBuf, pFile);
    return pszRet;
}


/**
 * Closes a file opened by OpenFile and clears the parser position.
 *
 * @param   pFile               The file to close.
 */
static void CloseFile(FILE *pFile)
{
    fclose(pFile);

    /* No file is being parsed any more. */
    g_iLine      = 0;
    g_pszCurFile = NULL;
}


/**
 * Read the UnicodeData.txt file.
 *
 * Each data line is split into its ';'-separated fields and stored in the
 * g_aCPInfo entry of the code point.  Code points must appear in strictly
 * ascending order; gaps between listed code points and everything after the
 * last one are filled with NullEntry.
 *
 * @returns 0 on success.
 * @returns !0 on failure.
 * @param   pszBasePath         The base path, can be NULL.
 * @param   pszFilename         The name of the file.
 */
static int ReadUnicodeData(const char *pszBasePath, const char *pszFilename)
{
    /*
     * Open input.
     */
    FILE *pFile = OpenFile(pszBasePath, pszFilename);
    if (!pFile)
        return 1;

    /*
     * Parse the input into g_aCPInfo.
     */
    char szLine[4096];
    RTUNICP i = 0;          /* Next g_aCPInfo entry that has not been filled in yet. */
    while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL)
    {
        if (IsCommentOrBlankLine(szLine))
            continue;

        /* Split the line; the trailing numbers are the field indexes. */
        char *pszCurField;
        char *pszCodePoint = FirstField(&pszCurField, StripLine(szLine)); /* 0 */
        char *pszName = NextField(&pszCurField);                          /* 1 */
        char *pszGeneralCategory = NextField(&pszCurField);               /* 2 */
        char *pszCanonicalCombiningClass = NextField(&pszCurField);       /* 3 */
        char *pszBidiClass = NextField(&pszCurField);                     /* 4 */
        char *pszDecompositionType = NextField(&pszCurField);             /* 5 */
        char *pszDecompositionMapping = SplitDecompField(&pszDecompositionType);
        char *pszNumericType = NextField(&pszCurField);                   /* 6 */
        char *pszNumericValueD = NextField(&pszCurField);                 /* 7 */
        char *pszNumericValueN = NextField(&pszCurField);                 /* 8 */
        char *pszBidiMirrored = NextField(&pszCurField);                  /* 9 */
        char *pszUnicode1Name = NextField(&pszCurField);                  /* 10 */
        char *pszISOComment = NextField(&pszCurField);                    /* 11 */
        char *pszSimpleUpperCaseMapping = NextField(&pszCurField);        /* 12 */
        char *pszSimpleLowerCaseMapping = NextField(&pszCurField);        /* 13 */
        char *pszSimpleTitleCaseMapping = NextField(&pszCurField);        /* 14 */

        RTUNICP CodePoint = ToNum(pszCodePoint);
        if (CodePoint >= RT_ELEMENTS(g_aCPInfo))
        {
            /* The conversion specifier previously lacked its '%', so the
               offending code point was never printed. */
            ParseError("U+%05X is out of range\n", CodePoint);
            continue;
        }

        /* catchup?  Fill the gap up to this code point with null entries. */
        while (i < CodePoint)
            NullEntry(i++);
        if (i != CodePoint)
        {
            /* i > CodePoint: the file is not in ascending order (or has a duplicate). */
            ParseError("i=%u CodePoint=%u\n", i, CodePoint);
            CloseFile(pFile);
            return 1;
        }

        /* this one */
        g_aCPInfo[i].CodePoint = i;
        g_aCPInfo[i].fNullEntry = 0;
        g_aCPInfo[i].pszName                    = DupStr(pszName);
        g_aCPInfo[i].SimpleUpperCaseMapping     = ToNumDefault(pszSimpleUpperCaseMapping, CodePoint);
        g_aCPInfo[i].SimpleLowerCaseMapping     = ToNumDefault(pszSimpleLowerCaseMapping, CodePoint);
        g_aCPInfo[i].SimpleTitleCaseMapping     = ToNumDefault(pszSimpleTitleCaseMapping, CodePoint);
        /* NOTE(review): ToNum parses base 16; UnicodeData.txt appears to give
           the combining class in decimal - confirm before relying on this value. */
        g_aCPInfo[i].CanonicalCombiningClass    = ToNum(pszCanonicalCombiningClass);
        g_aCPInfo[i].pszDecompositionType       = DupStr(pszDecompositionType);
        g_aCPInfo[i].paDecompositionMapping     = ToMapping(pszDecompositionMapping, &g_aCPInfo[i].cDecompositionMapping, 20);
        g_aCPInfo[i].pszGeneralCategory         = DupStr(pszGeneralCategory);
        g_aCPInfo[i].pszBidiClass               = DupStr(pszBidiClass);
        g_aCPInfo[i].pszNumericType             = DupStr(pszNumericType);
        g_aCPInfo[i].pszNumericValueD           = DupStr(pszNumericValueD);
        g_aCPInfo[i].pszNumericValueN           = DupStr(pszNumericValueN);
        g_aCPInfo[i].pszBidiMirrored            = DupStr(pszBidiMirrored);
        g_aCPInfo[i].pszUnicode1Name            = DupStr(pszUnicode1Name);
        g_aCPInfo[i].pszISOComment              = DupStr(pszISOComment);
        i++;
    }

    /* catchup?  Everything after the last listed code point is unused. */
    while (i < RT_ELEMENTS(g_aCPInfo))
        NullEntry(i++);
    CloseFile(pFile);

    return 0;
}


/**
 * Generates excluded data.
 *
 * @returns 0 on success, exit code on failure.
 */
static int GenerateExcludedData(void)
{
    /*
     * Hangul Syllables U+AC00 to U+D7A3.
     */
    for (RTUNICP i = 0xac00; i <= 0xd7a3; i++)
    {
        g_aCPInfo[i].fNullEntry = 0;
        g_aCPInfo[i].fInvNFD_QC = 1;
        /** @todo generate the decomposition: http://unicode.org/reports/tr15/#Hangul
         *         */
    }

    /** @todo
     * CJK Ideographs Extension A (U+3400 - U+4DB5)
     * CJK Ideographs (U+4E00 - U+9FA5)
     * CJK Ideograph Extension B (U+20000 - U+2A6D6)
     * CJK Ideograph Extension C (U+2A700 - U+2B734)
     */

    return 0;
}


/**
 * Worker for ApplyProperty that handles a yes, no, maybe property value.
 *
 * The value is the next field at the cursor and must be exactly "N", "Y" or
 * "M"; anything else, including a missing field, is a parse error.
 *
 * @returns 0 (NO), 1 (YES), 2 (MAYBE).
 * @param   ppszNextField   The field cursor, input and output.
 */
static int YesNoMaybePropertyValue(char **ppszNextField)
{
    if (**ppszNextField == '\0')
    {
        ParseError("Missing Y/N/M field\n");
        return 0;
    }

    const char *pszValue = NextField(ppszNextField);

    /* Only single character values are valid. */
    if (pszValue[0] != '\0' && pszValue[1] == '\0')
    {
        switch (pszValue[0])
        {
            case 'N':   return 0;
            case 'Y':   return 1;
            case 'M':   return 2;
            default:    break;
        }
    }

    ParseError("Unexpected Y/N/M value: '%s'\n", pszValue);
    return 0;
}


/**
 * Inverted version of YesNoMaybePropertyValue.
 *
 * YES and NO swap values; MAYBE is passed through unchanged.
 *
 * @returns 1 (NO), 0 (YES), 2 (MAYBE).
 * @param   ppszNextField   The field cursor, input and output.
 */
static int YesNoMaybePropertyValueInv(char **ppszNextField)
{
    int const iValue = YesNoMaybePropertyValue(ppszNextField);
    if (iValue == 0)
        return 1;
    if (iValue == 1)
        return 0;
    return iValue;
}


/**
 * Applies a property to a code point.
 *
 * The property name is compared against the known names one by one.  A binary
 * property sets the matching flag in the code point's g_aCPInfo entry, the
 * four *_QC properties store the inverted Y/N/M value read from the next
 * field, and a few properties are deliberately ignored.  An unknown property
 * name or an out of range code point is a parse error.
 *
 * @param   StartCP         The code point.
 * @param   pszProperty     The property name.
 * @param   pszNextField    The rest of the line following the property field.
 *                          Consumed by the *_QC properties.  For every property
 *                          that is not ignored, whatever remains must be empty
 *                          (or NULL), otherwise a parse error is raised.
 */
static void ApplyProperty(RTUNICP StartCP, const char *pszProperty, char *pszNextField)
{
    if (StartCP >= RT_ELEMENTS(g_aCPInfo))
    {
        ParseError("U+%06X is out of the g_aCPInfo range.\n", StartCP);
        return;
    }
    struct CPINFO *pCPInfo = &g_aCPInfo[StartCP];
    /* string switch */
         if (!strcmp(pszProperty, "ASCII_Hex_Digit")) pCPInfo->fASCIIHexDigit = 1;
    else if (!strcmp(pszProperty, "Alphabetic")) pCPInfo->fAlphabetic = 1;
    else if (!strcmp(pszProperty, "Bidi_Control")) pCPInfo->fBidiControl = 1;
    else if (!strcmp(pszProperty, "Case_Ignorable")) pCPInfo->fCaseIgnorable = 1;
    else if (!strcmp(pszProperty, "Cased")) pCPInfo->fCased = 1;
    else if (!strcmp(pszProperty, "Changes_When_Casefolded")) pCPInfo->fChangesWhenCasefolded = 1;
    else if (!strcmp(pszProperty, "Changes_When_Casemapped")) pCPInfo->fChangesWhenCasemapped = 1;
    else if (!strcmp(pszProperty, "Changes_When_Lowercased")) pCPInfo->fChangesWhenLowercased = 1;
    else if (!strcmp(pszProperty, "Changes_When_Titlecased")) pCPInfo->fChangesWhenTitlecased = 1;
    else if (!strcmp(pszProperty, "Changes_When_Uppercased")) pCPInfo->fChangesWhenUppercased = 1;
    else if (!strcmp(pszProperty, "Dash")) pCPInfo->fDash = 1;
    else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1;
    else if (!strcmp(pszProperty, "Deprecated")) pCPInfo->fDeprecated = 1;
    else if (!strcmp(pszProperty, "Diacritic")) pCPInfo->fDiacritic = 1;
    else if (!strcmp(pszProperty, "Extender")) pCPInfo->fExtender = 1;
    else if (!strcmp(pszProperty, "Grapheme_Base")) pCPInfo->fGraphemeBase = 1;
    else if (!strcmp(pszProperty, "Grapheme_Extend")) pCPInfo->fGraphemeExtend = 1;
    else if (!strcmp(pszProperty, "Grapheme_Link")) pCPInfo->fGraphemeLink = 1;
    else if (!strcmp(pszProperty, "Hex_Digit")) pCPInfo->fHexDigit = 1;
    else if (!strcmp(pszProperty, "Hyphen")) pCPInfo->fHyphen = 1;
    else if (!strcmp(pszProperty, "ID_Continue")) pCPInfo->fIDContinue = 1;
    else if (!strcmp(pszProperty, "ID_Start")) pCPInfo->fIDStart = 1;
    else if (!strcmp(pszProperty, "Ideographic")) pCPInfo->fIdeographic = 1;
    else if (!strcmp(pszProperty, "IDS_Binary_Operator")) pCPInfo->fIDSBinaryOperator = 1;
    else if (!strcmp(pszProperty, "IDS_Trinary_Operator")) pCPInfo->fIDSTrinaryOperator = 1;
    else if (!strcmp(pszProperty, "Join_Control")) pCPInfo->fJoinControl = 1;
    else if (!strcmp(pszProperty, "Logical_Order_Exception")) pCPInfo->fLogicalOrderException = 1;
    else if (!strcmp(pszProperty, "Lowercase")) pCPInfo->fLowercase = 1;
    else if (!strcmp(pszProperty, "Math")) pCPInfo->fMath = 1;
    else if (!strcmp(pszProperty, "Noncharacter_Code_Point")) pCPInfo->fNoncharacterCodePoint = 1;
    else if (!strcmp(pszProperty, "Other_Alphabetic")) pCPInfo->fOtherAlphabetic = 1;
    else if (!strcmp(pszProperty, "Other_Default_Ignorable_Code_Point")) pCPInfo->fOtherDefaultIgnorableCodePoint = 1;
    else if (!strcmp(pszProperty, "Other_Grapheme_Extend")) pCPInfo->fOtherGraphemeExtend = 1;
    else if (!strcmp(pszProperty, "Other_ID_Continue")) pCPInfo->fOtherIDContinue = 1;
    else if (!strcmp(pszProperty, "Other_ID_Start")) pCPInfo->fOtherIDStart = 1;
    else if (!strcmp(pszProperty, "Other_Lowercase")) pCPInfo->fOtherLowercase = 1;
    else if (!strcmp(pszProperty, "Other_Math")) pCPInfo->fOtherMath = 1;
    else if (!strcmp(pszProperty, "Other_Uppercase")) pCPInfo->fOtherUppercase = 1;
    else if (!strcmp(pszProperty, "Pattern_Syntax")) pCPInfo->fPatternSyntax = 1;
    else if (!strcmp(pszProperty, "Pattern_White_Space")) pCPInfo->fPatternWhiteSpace = 1;
    else if (!strcmp(pszProperty, "Quotation_Mark")) pCPInfo->fQuotationMark = 1;
    else if (!strcmp(pszProperty, "Radical")) pCPInfo->fRadical = 1;
    else if (!strcmp(pszProperty, "Soft_Dotted")) pCPInfo->fSoftDotted = 1;
    else if (!strcmp(pszProperty, "STerm")) pCPInfo->fSTerm = 1;
    else if (!strcmp(pszProperty, "Terminal_Punctuation")) pCPInfo->fTerminalPunctuation = 1;
    else if (!strcmp(pszProperty, "Unified_Ideograph")) pCPInfo->fUnifiedIdeograph = 1;
    else if (!strcmp(pszProperty, "Uppercase")) pCPInfo->fUppercase = 1;
    else if (!strcmp(pszProperty, "Variation_Selector")) pCPInfo->fVariationSelector = 1;
    else if (!strcmp(pszProperty, "White_Space")) pCPInfo->fWhiteSpace = 1;
    else if (!strcmp(pszProperty, "XID_Continue")) pCPInfo->fXIDContinue = 1;
    else if (!strcmp(pszProperty, "XID_Start")) pCPInfo->fXIDStart = 1;
    /* DerivedNormalizationProps: */
    else if (!strcmp(pszProperty, "FC_NFKC")) return; /* ignored */
    else if (!strcmp(pszProperty, "Full_Composition_Exclusion")) pCPInfo->fFullCompositionExclusion = 1;
    else if (!strcmp(pszProperty, "NFC_QC"))  pCPInfo->fInvNFC_QC  = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "NFD_QC"))  pCPInfo->fInvNFD_QC  = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "NFKC_QC")) pCPInfo->fInvNFKC_QC = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "NFKD_QC")) pCPInfo->fInvNFKD_QC = YesNoMaybePropertyValueInv(&pszNextField);
    else if (!strcmp(pszProperty, "Expands_On_NFC"))  pCPInfo->fExpandsOnNFC  = 1;
    else if (!strcmp(pszProperty, "Expands_On_NFD"))  pCPInfo->fExpandsOnNFD  = 1;
    else if (!strcmp(pszProperty, "Expands_On_NFKC")) pCPInfo->fExpandsOnNFKC = 1;
    else if (!strcmp(pszProperty, "Expands_On_NFKD")) pCPInfo->fExpandsOnNFKD = 1;
    else if (!strcmp(pszProperty, "NFKC_CF")) return; /*ignore */
    else if (!strcmp(pszProperty, "Changes_When_NFKC_Casefolded")) return; /*ignore */
    else
    {
        ParseError("Unknown property '%s'\n", pszProperty);
        return;
    }

    /* Ignored properties returned above; for everything else no further
       fields may remain on the line. */
    if (pszNextField && *pszNextField)
        ParseError("Unexpected next field: '%s'\n", pszNextField);
}


/**
 * Reads a property file.
 *
 * There are several property files, this code can read all
 * of those but will only make use of the properties it recognizes.
 *
 * @returns 0 on success.
 * @returns !0 on failure.
 * @param   pszBasePath         The base path, can be NULL.
 * @param   pszFilename     The name of the file.
 */
static int ReadProperties(const char *pszBasePath, const char *pszFilename)
{
    /*
     * Open input.
     */
    FILE *pFile = OpenFile(pszBasePath, pszFilename);
    if (!pFile)
        return 1;

    /*
     * Parse the input and spit out the output.
     */
    char szLine[4096];
    while (GetLineFromFile(szLine, sizeof(szLine), pFile) != NULL)
    {
        if (IsCommentOrBlankLine(szLine))
            continue;
        char *pszCurField;
        char *pszRange    = FirstField(&pszCurField, StripLine(szLine));
        char *pszProperty = NextField(&pszCurField);
        if (!*pszProperty)
        {
            ParseError("no property field.\n");
            continue;
        }

        RTUNICP LastCP;
        RTUNICP StartCP = ToRange(pszRange, &LastCP);
        if (StartCP == ~(RTUNICP)0)
            continue;

        while (StartCP <= LastCP)
            ApplyProperty(StartCP++, pszProperty, pszCurField);
    }

    CloseFile(pFile);

    return 0;
}


/**
 * Appends a flag to the string, separating it from existing content with " | ".
 *
 * The caller must make sure the buffer is large enough; no bounds checking is
 * done here.
 *
 * @returns @a psz.
 * @param   psz         The flag string buffer to append to.
 * @param   pszFlag     The flag name to append.
 */
static char *AppendFlag(char *psz, const char *pszFlag)
{
    size_t off = strlen(psz);
    if (off != 0)
    {
        /* Not the first flag: insert the OR separator (terminator follows with the flag). */
        memcpy(&psz[off], " | ", 3);
        off += 3;
    }
    strcpy(&psz[off], pszFlag);
    return psz;
}

/**
 * Calculates the RTUNI_* flag expression for a code point.
 *
 * The flags are written as a " | " separated list into @a pszFlags, or "0" if
 * none apply.  Inconsistencies between the NFD quick check value and the
 * decomposition data are reported on stderr but do not affect the result.
 *
 * @returns true if there is a flag.
 * @returns false if there isn't.
 * @param   pInfo       The code point info.
 * @param   pszFlags    Output buffer for the flag expression.
 */
static bool CalcFlags(struct CPINFO *pInfo, char *pszFlags)
{
    pszFlags[0] = '\0';

    /** @todo read the specs on this other vs standard stuff, and check out the finer points */
    if (pInfo->fAlphabetic || pInfo->fOtherAlphabetic)
        AppendFlag(pszFlags, "RTUNI_ALPHA");
    if (pInfo->fHexDigit || pInfo->fASCIIHexDigit)
        AppendFlag(pszFlags, "RTUNI_XDIGIT");
    if (strcmp(pInfo->pszGeneralCategory, "Nd") == 0)
        AppendFlag(pszFlags, "RTUNI_DDIGIT");
    if (pInfo->fWhiteSpace)
        AppendFlag(pszFlags, "RTUNI_WSPACE");
    if (pInfo->fUppercase || pInfo->fOtherUppercase)
        AppendFlag(pszFlags, "RTUNI_UPPER");
    if (pInfo->fLowercase || pInfo->fOtherLowercase)
        AppendFlag(pszFlags, "RTUNI_LOWER");
    /** @todo RTUNI_BSPACE: no source property has been picked for it yet. */

    /* Normalization quick check, plus sanity checks against the decomposition data. */
    bool const fNotNFD = pInfo->fInvNFD_QC != 0;
    if (fNotNFD || pInfo->fInvNFC_QC != 0)
    {
        AppendFlag(pszFlags, "RTUNI_QC_NFX");
        if (fNotNFD)
        {
            if (!pInfo->paDecompositionMapping)
                fprintf(stderr, "uniread: U+%05X is QC_NFD but has no mappings.\n", pInfo->CodePoint);
            else if (*pInfo->pszDecompositionType)
                fprintf(stderr, "uniread: U+%05X is QC_NFD but has no canonical mappings.\n", pInfo->CodePoint);
        }
    }
    else if (pInfo->paDecompositionMapping && !*pInfo->pszDecompositionType)
        fprintf(stderr, "uniread: U+%05X is not QC_NFX but has canonical mappings.\n", pInfo->CodePoint);

    if (*pszFlags)
        return true;

    /* No flags at all: emit a literal zero. */
    pszFlags[0] = '0';
    pszFlags[1] = '\0';
    return false;
}


/**
 * printf wrapper for the primary output stream (stdout).
 *
 * In quiet mode (g_fQuiet) nothing is written and the length of the format
 * string is returned instead.
 *
 * @returns See vfprintf.
 * @param   pszFormat           The vfprintf format string.
 * @param   ...                 The format arguments.
 */
static int Stream1Printf(const char *pszFormat, ...)
{
    va_list args;
    va_start(args, pszFormat);
    int const cchRet = g_fQuiet
                     ? (int)strlen(pszFormat)
                     : vfprintf(stdout, pszFormat, args);
    va_end(args);
    return cchRet;
}


/** The data store for stream two.  Stream2Printf appends formatted text here
 * and Stream2Flush passes it on to Stream1Printf. */
static char g_szStream2[10240];
/** Current write offset into g_szStream2, i.e. the number of chars buffered. */
static unsigned volatile g_offStream2 = 0;

/**
 * Initializes the 2nd stream, making its buffer an empty string.
 */
static void Stream2Init(void)
{
    g_szStream2[0] = '\0';  /* empty string */
    g_offStream2   = 0;     /* next write goes to the start */
}

/**
 * Flushes the 2nd stream to the primary output stream and empties it.
 *
 * @returns 0 (always).
 */
static int Stream2Flush(void)
{
    /* Terminate at the current offset before printing the buffer as a string. */
    unsigned const offEnd = g_offStream2;
    g_szStream2[offEnd] = '\0';

    Stream1Printf("%s", g_szStream2);
    Stream2Init();
    return 0;
}

/**
 * printf to the 2nd stream.
 *
 * The formatted text is appended to g_szStream2.  The write is bounded by the
 * space left in the buffer; if the text (including its terminator) does not
 * fit, or formatting fails, an error is printed and the program exits with
 * code 1.
 *
 * @returns The number of chars appended.
 * @param   pszFormat           The printf format string.
 * @param   ...                 The format arguments.
 */
static int Stream2Printf(const char *pszFormat, ...)
{
    unsigned const offStream2 = g_offStream2;
    if (offStream2 >= sizeof(g_szStream2))
    {
        fprintf(stderr, "error: stream2 overflow!\n");
        exit(1);
    }
    size_t const cbLeft = sizeof(g_szStream2) - offStream2;

    /* Format with an explicit bound so an oversized message can never write
       past the end of the buffer; the previous vsprintf call only detected
       the overflow after memory had already been overwritten. */
    va_list va;
    va_start(va, pszFormat);
    int cch = vsnprintf(&g_szStream2[offStream2], cbLeft, pszFormat, va);
    va_end(va);

    if (cch < 0)
    {
        fprintf(stderr, "error: stream2 formatting failed!\n");
        exit(1);
    }
    /* vsnprintf returns the length it wanted to write; equal to or above the
       space left means the output was truncated. */
    if ((size_t)cch >= cbLeft)
    {
        fprintf(stderr, "error: stream2 overflow!\n");
        exit(1);
    }

    g_offStream2 = offStream2 + (unsigned)cch;
    return cch;
}


/**
 * Print the unidata.cpp file header and include list.
 *
 * Writes the file comment (naming the generating program and the date/time
 * this tool was compiled, via __DATE__ and __TIME__), the license text and
 * the iprt/uni.h include to the primary output stream.
 *
 * @returns 0 (always).
 * @param   argv0               The program name to put in the
 *                              "Automatically Generated by" line.
 */
int PrintHeader(const char *argv0)
{
    Stream1Printf("/** @file\n"
                  " *\n"
                  " * IPRT - Unicode Tables.\n"
                  " *\n"
                  " * Automatically Generated by %s (" __DATE__ " " __TIME__ ")\n"
                  " */\n"
                  "\n"
                  "/*\n"
                  " * Copyright (C) 2006-2010 Oracle Corporation\n"
                  " *\n"
                  " * This file is part of VirtualBox Open Source Edition (OSE), as\n"
                  " * available from http://www.virtualbox.org. This file is free software;\n"
                  " * you can redistribute it and/or modify it under the terms of the GNU\n"
                  " * General Public License (GPL) as published by the Free Software\n"
                  " * Foundation, in version 2 as it comes in the \"COPYING\" file of the\n"
                  " * VirtualBox OSE distribution. VirtualBox OSE is distributed in the\n"
                  " * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.\n"
                  " *\n"
                  " * The contents of this file may alternatively be used under the terms\n"
                  " * of the Common Development and Distribution License Version 1.0\n"
                  " * (CDDL) only, as it comes in the \"COPYING.CDDL\" file of the\n"
                  " * VirtualBox OSE distribution, in which case the provisions of the\n"
                  " * CDDL are applicable instead of those of the GPL.\n"
                  " *\n"
                  " * You may elect to license modified versions of this file under the\n"
                  " * terms and conditions of either the GPL or the CDDL or both.\n"
                  " */\n"
                  "\n"
                  "#include <iprt/uni.h>\n"
                  "\n",
                  argv0);
    return 0;
}


/**
 * Print the flag tables.
 *
 * The per code point flag expressions are written to the primary stream as a
 * series of g_afRTUniFlags0x<start> arrays (chunks), while the
 * g_aRTUniFlagsRanges table referencing those chunks is collected in the 2nd
 * stream and flushed once all chunks have been written.
 *
 * The first 256 code points are always emitted.  Beyond that, a run of entries
 * that are null or have no flags ends the current chunk and is skipped when it
 * is longer than 4096 entries or reaches the end of g_aCPInfo.
 *
 * @returns The Stream2Flush status.
 */
int PrintFlags(void)
{
    /*
     * Print flags table.
     */
    Stream2Init();
    Stream2Printf("const RTUNIFLAGSRANGE g_aRTUniFlagsRanges[] =\n"
                  "{\n");
    RTUNICP i = 0;
    int iStart = -1;                    /* first code point of the open chunk, -1 if none is open */
    while (i < RT_ELEMENTS(g_aCPInfo))
    {
        /* figure how far off the next chunk is */
        char szFlags[256];
        unsigned iNonNull = i;
        while (   iNonNull < RT_ELEMENTS(g_aCPInfo)
               && iNonNull >= 256
               && (g_aCPInfo[iNonNull].fNullEntry || !CalcFlags(&g_aCPInfo[iNonNull], szFlags)) )
            iNonNull++;
        if (iNonNull - i > 4096 || iNonNull == RT_ELEMENTS(g_aCPInfo))
        {
            /* Big gap (or nothing more to print): close the open chunk and skip ahead. */
            if (iStart >= 0)
            {
                Stream1Printf("};\n\n");
                Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniFlags0x%06x[0] },\n", iStart, i, iStart);
                iStart = -1;
            }
            i = iNonNull;
        }
        else
        {
            /* Open a new chunk if necessary and emit the entry. */
            if (iStart < 0)
            {
                Stream1Printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n"
                              "{\n", i);
                iStart = i;
            }
            CalcFlags(&g_aCPInfo[i], szFlags);
            Stream1Printf("    %50s, /* U+%06x: %s*/\n", szFlags, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
            i++;
        }
    }

    /* The loop only closes a chunk when it hits a gap, so a chunk that runs
       right up to the last table entry is still open here; close it so the
       generated array is terminated and gets its range record. */
    if (iStart >= 0)
    {
        Stream1Printf("};\n\n");
        Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniFlags0x%06x[0] },\n", iStart, i, iStart);
    }

    Stream2Printf("    { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
                  "};\n\n\n");
    Stream1Printf("\n");
    return Stream2Flush();
}


/**
 * Prints the upper case tables.
 *
 * The simple upper case mappings are written to the primary stream as a
 * series of g_afRTUniUpper0x<start> arrays (chunks), while the
 * g_aRTUniUpperRanges table referencing those chunks is collected in the 2nd
 * stream and flushed once all chunks have been written.
 *
 * The first 256 code points are always emitted.  Beyond that, a run of entries
 * mapping to themselves ends the current chunk and is skipped when it is
 * longer than 4096/sizeof(RTUNICP) entries or reaches the end of g_aCPInfo.
 *
 * @returns The Stream2Flush status.
 */
static int PrintUpper(void)
{
    Stream2Init();
    Stream2Printf("const RTUNICASERANGE g_aRTUniUpperRanges[] =\n"
                  "{\n");
    RTUNICP i = 0;
    int iStart = -1;                    /* first code point of the open chunk, -1 if none is open */
    while (i < RT_ELEMENTS(g_aCPInfo))
    {
        /* figure how far off the next chunk is */
        unsigned iSameCase = i;
        while (     iSameCase < RT_ELEMENTS(g_aCPInfo)
               &&   g_aCPInfo[iSameCase].SimpleUpperCaseMapping == g_aCPInfo[iSameCase].CodePoint
               &&   iSameCase >= 256)
            iSameCase++;
        if (iSameCase - i > 4096/sizeof(RTUNICP) || iSameCase == RT_ELEMENTS(g_aCPInfo))
        {
            /* Big gap (or nothing more to print): close the open chunk and skip ahead. */
            if (iStart >= 0)
            {
                Stream1Printf("};\n\n");
                Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniUpper0x%06x[0] },\n", iStart, i, iStart);
                iStart = -1;
            }
            i = iSameCase;
        }
        else
        {
            /* Open a new chunk if necessary and emit the entry. */
            if (iStart < 0)
            {
                Stream1Printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n"
                              "{\n", i);
                iStart = i;
            }
            Stream1Printf("    0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
            i++;
        }
    }

    /* The loop only closes a chunk when it hits a gap, so a chunk that runs
       right up to the last table entry is still open here; close it so the
       generated array is terminated and gets its range record. */
    if (iStart >= 0)
    {
        Stream1Printf("};\n\n");
        Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniUpper0x%06x[0] },\n", iStart, i, iStart);
    }

    Stream2Printf("    { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
                  "};\n\n\n");
    Stream1Printf("\n");
    return Stream2Flush();
}


/**
 * Prints the lowercase tables.
 *
 * The simple lower case mappings are written to the primary stream as a
 * series of g_afRTUniLower0x<start> arrays (chunks), while the
 * g_aRTUniLowerRanges table referencing those chunks is collected in the 2nd
 * stream and flushed once all chunks have been written.
 *
 * The first 256 code points are always emitted.  Beyond that, a run of entries
 * mapping to themselves ends the current chunk and is skipped when it is
 * longer than 4096/sizeof(RTUNICP) entries or reaches the end of g_aCPInfo.
 *
 * @returns The Stream2Flush status.
 */
static int PrintLower(void)
{
    Stream2Init();
    Stream2Printf("const RTUNICASERANGE g_aRTUniLowerRanges[] =\n"
                  "{\n");
    RTUNICP i = 0;
    int iStart = -1;                    /* first code point of the open chunk, -1 if none is open */
    while (i < RT_ELEMENTS(g_aCPInfo))
    {
        /* figure how far off the next chunk is */
        unsigned iSameCase = i;
        while (     iSameCase < RT_ELEMENTS(g_aCPInfo)
               &&   g_aCPInfo[iSameCase].SimpleLowerCaseMapping == g_aCPInfo[iSameCase].CodePoint
               &&   iSameCase >= 256)
            iSameCase++;
        if (iSameCase - i > 4096/sizeof(RTUNICP) || iSameCase == RT_ELEMENTS(g_aCPInfo))
        {
            /* Big gap (or nothing more to print): close the open chunk and skip ahead. */
            if (iStart >= 0)
            {
                Stream1Printf("};\n\n");
                Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniLower0x%06x[0] },\n", iStart, i, iStart);
                iStart = -1;
            }
            i = iSameCase;
        }
        else
        {
            /* Open a new chunk if necessary and emit the entry. */
            if (iStart < 0)
            {
                Stream1Printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n"
                              "{\n", i);
                iStart = i;
            }
            Stream1Printf("    0x%02x, /* U+%06x: %s*/\n",
                          g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
            i++;
        }
    }

    /* The loop only closes a chunk when it hits a gap, so a chunk that runs
       right up to the last table entry is still open here; close it so the
       generated array is terminated and gets its range record. */
    if (iStart >= 0)
    {
        Stream1Printf("};\n\n");
        Stream2Printf("    { 0x%06x, 0x%06x, &g_afRTUniLower0x%06x[0] },\n", iStart, i, iStart);
    }

    Stream2Printf("    { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
                  "};\n\n\n");
    Stream1Printf("\n");
    return Stream2Flush();
}


/**
 * Program entry point: parses the arguments, reads the Unicode data files and
 * prints the generated tables to stdout.
 *
 * Arguments not starting with '-' are taken, in order, as the UnicodeData,
 * DerivedCoreProperties, PropList and DerivedNormalizationProps file names.
 * -C / --dir supplies the base directory handed to the readers and
 * -q / --quiet sets g_fQuiet.
 *
 * @returns 0 on success; 1 on syntax errors or if writing to stdout failed;
 *          otherwise the non-zero status of the failing read or print step.
 * @param   argc                The argument count.
 * @param   argv                The argument vector.
 */
int main(int argc, char **argv)
{
    /*
     * Parse args.
     */
    if (argc <= 1)
    {
        /* argv[0] is not valid when argc is zero. */
        const char *pszProg = argc > 0 ? argv[0] : "uniread";
        printf("usage: %s [-q|--quiet] [-C|--dir <UCD-dir>] [UnicodeData.txt [DerivedCoreProperties.txt [PropList.txt [DerivedNormalizationProps.txt]]]]\n",
               pszProg);
        return 1;
    }

    const char *pszBaseDir                      = NULL;
    const char *pszUnicodeData                  = "UnicodeData.txt";
    const char *pszDerivedCoreProperties        = "DerivedCoreProperties.txt";
    const char *pszPropList                     = "PropList.txt";
    const char *pszDerivedNormalizationProps    = "DerivedNormalizationProps.txt";
    int iFile = 0;
    for (int argi = 1;  argi < argc; argi++)
    {
        if (argv[argi][0] != '-')
        {
            /* Positional arguments replace the default file names in a fixed order. */
            switch (iFile++)
            {
                case 0: pszUnicodeData                  = argv[argi]; break;
                case 1: pszDerivedCoreProperties        = argv[argi]; break;
                case 2: pszPropList                     = argv[argi]; break;
                case 3: pszDerivedNormalizationProps    = argv[argi]; break;
                default:
                    fprintf(stderr, "uniread: syntax error at '%s': too many filenames\n", argv[argi]);
                    return 1;
            }
        }
        else if (   !strcmp(argv[argi], "--dir")
                 || !strcmp(argv[argi], "-C"))
        {
            if (argi + 1 >= argc)
            {
                fprintf(stderr, "uniread: syntax error: '%s' is missing the directory name.\n", argv[argi]);
                return 1;
            }
            argi++;
            pszBaseDir = argv[argi];
        }
        else if (   !strcmp(argv[argi], "-q")
                 || !strcmp(argv[argi], "--quiet"))
            g_fQuiet = true;
        else
        {
            fprintf(stderr, "uniread: syntax error at '%s': Unknown argument\n", argv[argi]);
            return 1;
        }
    }

    /*
     * Read the data.
     */
    int rc = ReadUnicodeData(pszBaseDir, pszUnicodeData);
    if (rc)
        return rc;
    rc = GenerateExcludedData();
    if (rc)
        return rc;
    rc = ReadProperties(pszBaseDir, pszPropList);
    if (rc)
        return rc;
    rc = ReadProperties(pszBaseDir, pszDerivedCoreProperties);
    if (rc)
        return rc;
    rc = ReadProperties(pszBaseDir, pszDerivedNormalizationProps);
    if (rc)
        return rc;

    /*
     * Print stuff.
     */
    rc = PrintHeader(argv[0]);
    if (rc)
        return rc;
    rc = PrintFlags();
    if (rc)
        return rc;
    rc = PrintUpper();
    if (rc)
        return rc;
    rc = PrintLower();
    if (rc)
        return rc;

    /*
     * Done.  Make sure everything really reached stdout so that a truncated
     * table is not mistaken for a successful run.
     */
    if (fflush(stdout) != 0 || ferror(stdout))
    {
        fprintf(stderr, "uniread: error: failed writing the output to stdout\n");
        return 1;
    }

    return rc;
}