uniread.cpp revision e64031e20c39650a7bc902a3e1aba613b9415dee
/* $Id$ */
/** @file
* IPRT - Unicode Specification Reader.
*/
/*
* Copyright (C) 2006-2007 Oracle Corporation
*
* This file is part of VirtualBox Open Source Edition (OSE), as
* available from http://www.virtualbox.org. This file is free software;
* you can redistribute it and/or modify it under the terms of the GNU
* General Public License (GPL) as published by the Free Software
* Foundation, in version 2 as it comes in the "COPYING" file of the
* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
*
* The contents of this file may alternatively be used under the terms
* of the Common Development and Distribution License Version 1.0
* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
* VirtualBox OSE distribution, in which case the provisions of the
* CDDL are applicable instead of those of the GPL.
*
* You may elect to license modified versions of this file under the
* terms and conditions of either the GPL or the CDDL or both.
*/
/*******************************************************************************
* Header Files *
*******************************************************************************/
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
 * Strips a line: cuts off trailing blanks and line-end characters in place
 * and returns the (advanced) start of the line.
 *
 * NOTE(review): this copy of the block has no function header, psz is never
 * declared or assigned, and the loops that the 'continue' / 'break' below
 * belong to are not visible.  The block does not compile as shown; restore
 * the missing statements from the original revision before relying on it.
 *
 * @returns pointer to first non-blank char.
 * @param   pszLine     The line string to strip.  The buffer is modified.
 */
{
/* NOTE(review): unconditional as shown; presumably the body of a loop that
   skips leading blanks - confirm. */
pszLine++;
/* If psz points at something, the line is cut off right there. */
if (psz)
*psz = '\0';
else
{
/* Overwrite one trailing blank / tab / CR / LF with a terminator and go
   around again; anything else ends the trimming. */
switch (psz[-1])
{
case ' ':
case '\t':
case '\n':
case '\r':
*--psz = '\0';
continue;
}
break;
}
return pszLine;
}
/**
 * Checks if the line is blank or a comment line and should be skipped.
 *
 * Leading blanks, tabs and line-end characters are skipped first.  The line
 * counts as skippable when nothing follows or when the first remaining
 * character is '#', which introduces comments in the Unicode data files.
 *
 * @returns true if the line carries no data, false otherwise.
 * @param   pszLine     The line to consider.  Not modified.
 */
static bool IsCommentOrBlankLine(const char *pszLine)
{
    while (   *pszLine == ' '
           || *pszLine == '\t'
           || *pszLine == '\n'
           || *pszLine == '\r')
        pszLine++;
    return *pszLine == '#' || *pszLine == '\0';
}
/**
 * Get the first field in the string.
 *
 * The field is terminated in place and trailing blanks / line-end
 * characters are trimmed off it.
 *
 * NOTE(review): this copy of the block has no function header, psz is never
 * declared or assigned, the 'if (!psz)' branch has no statement and the
 * loops around the stripping code are not visible.  The block does not
 * compile as shown; restore the missing statements from the original
 * revision.
 *
 * @returns Pointer to the first field (pszLine after stripping).
 * @param   ppsz        Where to store the pointer to the next field.
 * @param   pszLine     The line string. (could also be *ppsz from a FirstNext call)
 */
{
/* NOTE(review): presumably psz is the field separator found by a missing
   search, and both branches also update *ppsz - confirm. */
if (!psz)
else
{
/* Terminate the current field where psz points. */
*psz = '\0';
}
/* strip */
/* NOTE(review): unconditional as shown; presumably the body of a loop that
   skips leading blanks - confirm. */
pszLine++;
{
/* Overwrite one trailing blank / tab / CR / LF with a terminator and go
   around again; anything else ends the trimming. */
switch (psz[-1])
{
case ' ':
case '\t':
case '\n':
case '\r':
*--psz = '\0';
continue;
}
break;
}
return pszLine;
}
/**
 * Get the next field in a field enumeration.
 *
 * NOTE(review): both the function header and the whole body are missing in
 * this copy; only the braces remain.  Restore them from the original
 * revision.
 *
 * @returns Pointer to the next field.
 * @param   ppsz        Where to get and store the string position.
 */
{
}
/**
 * Converts a code point field to a number.
 *
 * NOTE(review): both the function header and the whole body are missing in
 * this copy; only the braces remain.  Restore them from the original
 * revision.
 *
 * @returns Code point.
 * @param   psz         The field string.
 */
{
}
/**
 * Same as ToNum except that if the field is empty the Default is returned.
 *
 * NOTE(review): the function header is missing in this copy, and so is the
 * statement that handles a non-empty field.  As shown, Default is returned
 * when the field is NOT empty, which contradicts the description above;
 * presumably a 'return ToNum(psz);' belongs between the two lines - confirm.
 */
{
if (*psz)
return Default;
}
/**
 * Converts a code point range to numbers.
 *
 * NOTE(review): this copy of the block has no function header, and the
 * statements that parse the numbers, declare pszEnd, store *pLast and return
 * the start code point are missing.  The block does not compile as shown;
 * restore them from the original revision.
 *
 * @returns The start code point.
 * @returns ~(RTUNICP)0 on failure.
 * @param   psz         The field string.
 * @param   pLast       Where to store the last code point in the range.
 */
{
{
/* A '.' after the first number means a second number follows. */
if (*pszEnd == '.')
{
/* Skip the whole run of dots separating the two numbers. */
while (*pszEnd == '.')
pszEnd++;
{
/* Failure exit; NOTE(review): the condition guarding it is not visible. */
return ~(RTUNICP)0;
}
}
else
{
/* Failure exit for anything other than a '.' at pszEnd; NOTE(review): the
   condition selecting this branch over a plain single value is not
   visible. */
return ~(RTUNICP)0;
}
}
}
/**
 * Duplicate a string, optimize certain strings to save memory.
 *
 * The optimization visible here is for empty strings: they all share one
 * static "" literal instead of getting a copy each.  The result for an
 * empty input must therefore not be written to.
 *
 * The function never returns a failure: when no copy is available the
 * process is terminated with exit code 1.
 *
 * NOTE(review): this copy of the block has no function header, and the
 * statement that declares psz and makes the copy is missing.
 *
 * @returns Pointer to string copy.
 * @param   pszStr      The string to duplicate.
 */
{
/* Empty input: hand out the shared literal. */
if (!*pszStr)
return (char*)"";
if (psz)
return psz;
/* No copy could be made - give up. */
exit(1);
}
/**
 * Array of all possible and impossible unicode code points as of 4.1
 *
 * g_aCPInfo has 0xf0000 entries and is indexed directly by code point, so it
 * covers U+000000 through U+0EFFFF.
 *
 * NOTE(review): other code in this file accesses the members CodePoint,
 * SimpleUpperCaseMapping, SimpleLowerCaseMapping and SimpleTitleCaseMapping,
 * none of which are declared in this copy of the structure.  Their
 * declarations appear to have been lost; restore them from the original
 * revision.
 */
struct CPINFO
{
/** The character name; printed as a comment in the generated tables. */
const char *pszName;
/** Set if this is an unused entry */
unsigned fNullEntry : 1;
/* Binary properties, one bit each.  The member names follow the property
   names used in the property files with the underscores dropped, e.g.
   "Default_Ignorable_Code_Point" sets fDefaultIgnorableCodePoint. */
unsigned fAlphabetic : 1;
unsigned fASCIIHexDigit : 1;
unsigned fBidiControl : 1;
unsigned fDash : 1;
unsigned fDefaultIgnorableCodePoint : 1;
unsigned fDeprecated : 1;
unsigned fDiacritic : 1;
unsigned fExtender : 1;
unsigned fGraphemeBase : 1;
unsigned fGraphemeExtend : 1;
unsigned fGraphemeLink : 1;
unsigned fHexDigit : 1;
unsigned fHyphen : 1;
unsigned fIDContinue : 1;
unsigned fIdeographic : 1;
unsigned fIDSBinaryOperator : 1;
unsigned fIDStart : 1;
unsigned fIDSTrinaryOperator : 1;
unsigned fJoinControl : 1;
unsigned fLogicalOrderException : 1;
unsigned fLowercase : 1;
unsigned fMath : 1;
unsigned fNoncharacterCodePoint : 1;
unsigned fOtherAlphabetic : 1;
unsigned fOtherDefaultIgnorableCodePoint : 1;
unsigned fOtherGraphemeExtend : 1;
unsigned fOtherIDContinue : 1;
unsigned fOtherIDStart : 1;
unsigned fOtherLowercase : 1;
unsigned fOtherMath : 1;
unsigned fOtherUppercase : 1;
unsigned fPatternSyntax : 1;
unsigned fPatternWhiteSpace : 1;
unsigned fQuotationMark : 1;
unsigned fRadical : 1;
unsigned fSoftDotted : 1;
unsigned fSTerm : 1;
unsigned fTerminalPunctuation : 1;
unsigned fUnifiedIdeograph : 1;
unsigned fUppercase : 1;
unsigned fVariationSelector : 1;
unsigned fWhiteSpace : 1;
unsigned fXIDContinue : 1;
unsigned fXIDStart : 1;
/* Unprocessed stuff, so far: kept as the raw field strings.
   NOTE(review): presumably these are the remaining UnicodeData.txt fields -
   confirm against the reader. */
const char *pszGeneralCategory;
const char *pszCanonicalCombiningClass;
const char *pszBidiClass;
const char *pszDecompositionType;
const char *pszDecompositionMapping;
const char *pszNumericType;
const char *pszNumericValue;
const char *pszBidiMirrored;
const char *pszUnicode1Name;
const char *pszISOComment;
} g_aCPInfo[0xf0000];
/**
* Creates a 'null' entry at i.
* @param i The entry in question.
*/
static void NullEntry(unsigned i)
{
g_aCPInfo[i].SimpleUpperCaseMapping = i;
g_aCPInfo[i].SimpleLowerCaseMapping = i;
g_aCPInfo[i].SimpleTitleCaseMapping = i;
}
/**
 * Read the UnicodeData.txt file.
 *
 * Fills g_aCPInfo in ascending code point order.  Code points that have no
 * record in the file, both between records and after the last one, are
 * turned into null entries with NullEntry().
 *
 * NOTE(review): this copy of the function is missing the statements that
 * open the file (pFile is never declared), the line-reading loop header,
 * the field parsing that produces CodePoint and fills the entry, and the
 * error messages.  It does not compile as shown; restore them from the
 * original revision.
 *
 * @returns 0 on success.
 * @returns !0 on failure.
 * @param   pszFilename     The name of the file.
 */
static int ReadUnicodeData(const char *pszFilename)
{
/*
* Open input.
*/
if (!pFile)
{
return 1;
}
/*
* Parse the input and spit out the output.
*/
char szLine[4096];
/* The next table index to fill; everything below it is already set up. */
RTUNICP i = 0;
{
if (IsCommentOrBlankLine(szLine))
continue;
char *pszCurField;
/* NOTE(review): the condition guarding this 'continue' is not visible. */
continue;
/* Fill the gap between the previous record and this one with null entries. */
while (i < CodePoint)
NullEntry(i++);
/* i can only differ here if CodePoint is below i, i.e. the records are not
   in strictly ascending order. */
if (i != CodePoint)
{
return 1;
}
/* this one */
g_aCPInfo[i].fNullEntry = 0;
i++;
}
/* Everything after the last record becomes null entries as well. */
while (i < RT_ELEMENTS(g_aCPInfo))
NullEntry(i++);
return 0;
}
/**
 * Applies a property to a code point.
 *
 * The property name is compared against the known names one by one and the
 * matching flag in the code point's CPINFO entry is set.
 *
 * NOTE(review): this copy of the block has no function header, the
 * condition guarding the early return is missing, only two links of the
 * if / else-if chain survive (the leading 'if' is gone) and the final
 * 'else' has no statement.  It does not compile as shown; restore the
 * missing lines from the original revision.
 *
 * @param   StartCP         The code point.
 * @param   pszProperty     The property name.
 */
{
/* NOTE(review): presumably guarded by a range check on StartCP - confirm. */
return;
/* string switch */
else if (!strcmp(pszProperty, "Other_Default_Ignorable_Code_Point")) pCPInfo->fOtherDefaultIgnorableCodePoint = 1;
else if (!strcmp(pszProperty, "Default_Ignorable_Code_Point")) pCPInfo->fDefaultIgnorableCodePoint = 1;
/* NOTE(review): the handling of unrecognized property names is missing. */
else
}
/**
 * Reads a property file.
 *
 * There are several property files, this code can read all
 * of those but will only make use of the properties it recognizes.
 *
 * NOTE(review): this copy of the function is missing the statements that
 * open the file (pFile is never declared), the line-reading loop header,
 * the parsing that produces the code point range and pszProperty, and the
 * code that applies the property.  It does not compile as shown; restore
 * them from the original revision.
 *
 * @returns 0 on success.
 * @returns !0 on failure.
 * @param   pszFilename     The name of the file.
 */
static int ReadProperties(const char *pszFilename)
{
/*
* Open input.
*/
if (!pFile)
{
return 1;
}
/*
* Parse the input and spit out the output.
*/
char szLine[4096];
{
if (IsCommentOrBlankLine(szLine))
continue;
char *pszCurField;
/* Lines with an empty property name are ignored. */
if (!*pszProperty)
continue;
/* NOTE(review): the condition guarding this 'continue' is not visible. */
continue;
}
return 0;
}
/**
 * Append a flag to the string.
 *
 * A " | " separator is written at pszEnd, so the accumulated flags form an
 * OR-expression.
 *
 * NOTE(review): this copy of the block has no function header, and the
 * statements that locate pszEnd, decide whether a separator is needed and
 * copy the flag name are missing.  It does not compile as shown; restore
 * them from the original revision.
 *
 * @returns psz.
 */
{
{
/* NOTE(review): presumably only done when the string already holds a flag -
   the guarding condition is not visible. */
*pszEnd++ = ' ';
*pszEnd++ = '|';
*pszEnd++ = ' ';
}
return psz;
}
/**
 * Calcs the flags for a code point.
 *
 * The result is built as text in pszFlags.  When no flag applies, the first
 * character is set to '0'.
 *
 * NOTE(review): this copy of the block has no function header, and the
 * statements executed for each property test are missing.  As shown, the
 * 'if (pInfo->fWhiteSpace)' ends up governing the empty-string check below
 * it, and the '0' written for the no-flag case is not followed by a visible
 * terminator.  Restore the missing lines from the original revision.
 *
 * @returns true if there is a flag.
 * @returns false if there isn't.
 */
{
/* Start out with an empty flag string. */
pszFlags[0] = '\0';
/** @todo read the specs on this other vs standard stuff, and check out the finer points */
if (pInfo->fWhiteSpace)
//if (pInfo->fNumeric)
// AppendFlag(pszFlags, "RTUNI_NUMERIC");
/* Nothing was appended: report "no flags". */
if (!*pszFlags)
{
pszFlags[0] = '0';
return false;
}
return true;
}
/** The data store for stream two. */
static char g_szStream2[10240];
/** Number of characters buffered in g_szStream2, not counting the terminator. */
static unsigned g_offStream2 = 0;

/**
 * Initializes the 2nd stream, discarding anything buffered in it.
 */
static void Stream2Init(void)
{
    g_szStream2[0] = '\0';
    g_offStream2 = 0;
}

/**
 * Flushes the 2nd stream to stdout.
 *
 * The buffer itself is left untouched; call Stream2Init() to start over.
 *
 * @returns 0 on success.
 * @returns 1 if stdout did not accept all of the buffered data.
 */
static int Stream2Flush(void)
{
    if (fwrite(g_szStream2, 1, g_offStream2, stdout) != g_offStream2)
    {
        fprintf(stderr, "error: failed to write stream2 to stdout!\n");
        return 1;
    }
    return 0;
}

/**
 * printf to the 2nd stream.
 *
 * The formatted text is appended to g_szStream2.  The write is bounded by
 * the space that is left, so the buffer is never overrun.  If the text does
 * not fit (or cannot be formatted), the process is terminated with exit
 * code 1.
 *
 * @returns The number of characters appended.
 * @param   pszFormat   printf style format string.
 * @param   ...         Format arguments.
 */
static int Stream2Printf(const char *pszFormat, ...)
{
    /* g_offStream2 is always below sizeof(g_szStream2), so this is >= 1. */
    size_t cbLeft = sizeof(g_szStream2) - g_offStream2;

    va_list va;
    va_start(va, pszFormat);
    int cch = vsnprintf(&g_szStream2[g_offStream2], cbLeft, pszFormat, va);
    va_end(va);

    /* cch >= cbLeft means the new offset would reach the end of the buffer. */
    if (cch < 0 || (size_t)cch >= cbLeft)
    {
        fprintf(stderr, "error: stream2 overflow!\n");
        exit(1);
    }

    g_offStream2 += (unsigned)cch;
    return cch;
}
/**
 * Print the unidata.cpp file header and include list.
 *
 * Writes two comments to stdout: a doxygen file comment that names the
 * generating program, and the license comment.  Both comments are closed so
 * that the tables printed afterwards are not swallowed by them.
 *
 * NOTE(review): the tables printed after this header use RTUNICP,
 * RTUNIFLAGSRANGE and RTUNICASERANGE, but no #include line is emitted here.
 * The include list mentioned above appears to be missing - confirm which
 * header the generated file needs and add it.
 *
 * @returns 0 (always).
 * @param   argv0       The name of this program; it is embedded in the
 *                      "Automatically Generated by" line.
 */
int PrintHeader(const char *argv0)
{
    /*
     * Print file header.
     */
    printf("/** @file\n"
           " *\n"
           " * IPRT - Unicode Tables\n"
           " *\n"
           " * Automatically Generated by %s\n"
           " */\n\n"
           "/*\n"
           " * Copyright (C) 2006-2008 Sun Microsystems, Inc.\n"
           " *\n"
           " * This file is part of VirtualBox Open Source Edition (OSE), as\n"
           " * available from http://www.virtualbox.org. This file is free software;\n"
           " * you can redistribute it and/or modify it under the terms of the GNU\n"
           " * General Public License as published by the Free Software Foundation,\n"
           " * in version 2 as it comes in the \"COPYING\" file of the VirtualBox OSE\n"
           " * distribution. VirtualBox OSE is distributed in the hope that it will\n"
           " * be useful, but WITHOUT ANY WARRANTY of any kind.\n"
           " */\n"
           "\n"
           "\n",
           argv0);
    return 0;
}
/**
 * Print the flag tables.
 *
 * Two outputs are produced in parallel.  The per-chunk flag arrays
 * (g_afRTUniFlags0xNNNNNN) go straight to stdout with printf, while the
 * range table g_aRTUniFlagRanges is collected in stream 2 and written out
 * at the end by Stream2Flush(), so it lands after all the arrays it refers
 * to.  The range table is closed with an all-ones sentinel row.
 *
 * NOTE(review): this copy of the function is missing the loop that scans
 * ahead with iNonNull (only the tail of its condition is left), the 'if'
 * that the 'else' below belongs to, the range-table rows and the array
 * element output.  It does not compile as shown; restore the missing lines
 * from the original revision.
 *
 * @returns The status of Stream2Flush().
 */
int PrintFlags(void)
{
/*
* Print flags table.
*/
Stream2Init();
Stream2Printf("const RTUNIFLAGSRANGE g_aRTUniFlagRanges[] =\n"
"{\n");
RTUNICP i = 0;
/* First code point of the array currently being printed, -1 if none is open. */
int iStart = -1;
while (i < RT_ELEMENTS(g_aCPInfo))
{
/* figure how far off the next chunk is */
char szFlags[256];
unsigned iNonNull = i;
&& iNonNull >= 256)
iNonNull++;
{
/* Close the array that is open, if any, and jump ahead to iNonNull. */
if (iStart >= 0)
{
printf("};\n\n");
iStart = -1;
}
i = iNonNull;
}
else
{
/* Open a new array named after its first code point if none is open. */
if (iStart < 0)
{
printf("static const uint8_t g_afRTUniFlags0x%06x[] = \n"
"{\n", i);
iStart = i;
}
i++;
}
}
/* Sentinel row terminating the range table. */
Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
"};\n\n\n");
printf("\n");
return Stream2Flush();
}
/**
 * Prints the upper case tables.
 *
 * Two outputs are produced in parallel.  The per-chunk mapping arrays
 * (g_afRTUniUpper0xNNNNNN) go straight to stdout with printf, one element
 * per code point holding its SimpleUpperCaseMapping.  The range table
 * g_aRTUniUpperRanges is collected in stream 2 and written out at the end
 * by Stream2Flush().  The range table is closed with an all-ones sentinel
 * row.
 *
 * NOTE(review): this copy of the function is missing the loop that scans
 * ahead with iSameCase (only the tail of its condition is left), the 'if'
 * that the 'else' below belongs to and the range-table rows.  It does not
 * compile as shown; restore the missing lines from the original revision.
 *
 * @returns The status of Stream2Flush().
 */
static int PrintUpper(void)
{
Stream2Init();
Stream2Printf("const RTUNICASERANGE g_aRTUniUpperRanges[] =\n"
"{\n");
RTUNICP i = 0;
/* First code point of the array currently being printed, -1 if none is open. */
int iStart = -1;
while (i < RT_ELEMENTS(g_aCPInfo))
{
/* figure how far off the next chunk is */
unsigned iSameCase = i;
&& iSameCase >= 256)
iSameCase++;
{
/* Close the array that is open, if any, and jump ahead to iSameCase. */
if (iStart >= 0)
{
printf("};\n\n");
iStart = -1;
}
i = iSameCase;
}
else
{
/* Open a new array named after its first code point if none is open. */
if (iStart < 0)
{
printf("static const RTUNICP g_afRTUniUpper0x%06x[] = \n"
"{\n", i);
iStart = i;
}
/* One element: the mapping, with the code point and name as a comment. */
printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleUpperCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
i++;
}
}
/* Sentinel row terminating the range table. */
Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
"};\n\n\n");
printf("\n");
return Stream2Flush();
}
/**
 * Prints the lowercase tables.
 *
 * Two outputs are produced in parallel.  The per-chunk mapping arrays
 * (g_afRTUniLower0xNNNNNN) go straight to stdout with printf, one element
 * per code point holding its SimpleLowerCaseMapping.  The range table
 * g_aRTUniLowerRanges is collected in stream 2 and written out at the end
 * by Stream2Flush().  The range table is closed with an all-ones sentinel
 * row.
 *
 * NOTE(review): this copy of the function is missing the loop that scans
 * ahead with iSameCase (only the tail of its condition is left), the 'if'
 * that the 'else' below belongs to and the range-table rows.  It does not
 * compile as shown; restore the missing lines from the original revision.
 *
 * @returns The status of Stream2Flush().
 */
static int PrintLower(void)
{
Stream2Init();
Stream2Printf("const RTUNICASERANGE g_aRTUniLowerRanges[] =\n"
"{\n");
RTUNICP i = 0;
/* First code point of the array currently being printed, -1 if none is open. */
int iStart = -1;
while (i < RT_ELEMENTS(g_aCPInfo))
{
/* figure how far off the next chunk is */
unsigned iSameCase = i;
&& iSameCase >= 256)
iSameCase++;
{
/* Close the array that is open, if any, and jump ahead to iSameCase. */
if (iStart >= 0)
{
printf("};\n\n");
iStart = -1;
}
i = iSameCase;
}
else
{
/* Open a new array named after its first code point if none is open. */
if (iStart < 0)
{
printf("static const RTUNICP g_afRTUniLower0x%06x[] = \n"
"{\n", i);
iStart = i;
}
/* One element: the mapping, with the code point and name as a comment. */
printf(" 0x%02x, /* U+%06x: %s*/\n", g_aCPInfo[i].SimpleLowerCaseMapping, g_aCPInfo[i].CodePoint, g_aCPInfo[i].pszName);
i++;
}
}
/* Sentinel row terminating the range table. */
Stream2Printf(" { ~(RTUNICP)0, ~(RTUNICP)0, NULL }\n"
"};\n\n\n");
printf("\n");
return Stream2Flush();
}
/*
 * Program entry point: parses the arguments, reads the input files and
 * prints the generated tables.  The first non-zero status from any step is
 * returned as the exit code.
 *
 * NOTE(review): this copy of the block has no function header (argc is used
 * but never declared), rc is never declared, the argument loop and its
 * option handling are reduced to empty braces, and the calls that assign rc
 * in the "Read the data" section and before the first check in "Print
 * stuff" are missing.  It does not compile as shown; restore the missing
 * lines from the original revision.
 */
{
/*
* Parse args.
*/
/* At least one argument is required. */
if (argc <= 1)
{
return 1;
}
/* Default input file names. */
const char *pszUnicodeData = "UnicodeData.txt";
const char *pszDerivedCoreProperties = "DerivedCoreProperties.txt";
const char *pszPropList = "PropList.txt";
/* Counts the file name arguments seen so far. */
int iFile = 0;
{
{
/* NOTE(review): the cases assigning the file names by position are missing;
   only the failure case is left. */
switch (iFile++)
{
default:
return 1;
}
}
else
{
return 1;
}
}
/*
* Read the data.
*/
/* NOTE(review): the three calls producing rc are missing. */
if (rc)
return rc;
if (rc)
return rc;
if (rc)
return rc;
/*
* Print stuff.
*/
/* NOTE(review): the call producing rc for this first check is missing. */
if (rc)
return rc;
rc = PrintFlags();
if (rc)
return rc;
rc = PrintUpper();
if (rc)
return rc;
rc = PrintLower();
if (rc)
return rc;
/* done */
return rc;
}