mktables revision 7c478bd95313f5f23a4c958a745db2134aa03244
use strict;
use Carp;
die "$0: Please run me as ./mktables to avoid unnecessary differences\n"
##
##
##
## Process any args.
##
my $Verbose = 0;
my $MakeTestScript = 0;
while (@ARGV)
{
$Verbose = 1;
$Verbose = 0;
$MakeTestScript = 1;
} else {
}
}
my $HEADER=<<"EOF";
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. UnicodeData.txt.
# Any changes made here will be lost!
##
## Given a filename and a reference to an array of lines,
## write the lines to the file only if the contents have not changed.
##
sub WriteIfChanged($\@)
{
my $file = shift;
my $lines = shift;
my $PreviousText = <IN>;
return;
}
}
}
print OUT $TextToWrite;
}
##
## The main datastructure (a "Table") represents a set of code points that
## are part of a particular quality (that are part of \pL, \p{InGreek},
## etc.). They are kept as ranges of code points (starting and ending of
## each range).
##
## For example, a range ASCII LETTERS would be represented as:
## [ [ 0x41 => 0x5A, 'UPPER' ],
## [ 0x61 => 0x7A, 'LOWER, ] ]
##
## Conceptually, these should really be folded into the 'Table' objects
my %TableInfo;
my %TableDesc;
my %FuzzyNames;
my %AliasInfo;
my %CanonicalToOrig;
##
## Turn something like
## OLD-ITALIC
## into
## OldItalic
##
sub CanonicalName($)
{
my $orig = shift;
$name =~ s/[-_\s]+//g;
$CanonicalToOrig{$name} = $orig if not $CanonicalToOrig{$name};
return $name;
}
##
## Associates a property ("Greek", "Lu", "Assigned",...) with a Table.
##
## Called like:
## New_Prop(In => 'Greek', $Table, Desc => 'Greek Block', Fuzzy => 1);
##
## Normally, these parameters are set when the Table is created (when the
## Table->New constructor is called), but there are times when it needs to
## be done after-the-fact...)
##
sub New_Prop($$$@)
{
my $Type = shift; ## "Is" or "In";
my $Name = shift;
my $Table = shift;
my %Args = @_;
## sanity check a few args
confess "$0: bad args to New_Prop"
}
if (not $TableInfo{$Type}->{$Name})
{
$TableInfo{$Type}->{$Name} = $Table;
$TableDesc{$Type}->{$Name} = $Desc;
$FuzzyNames{$Type}->{$Name} = $Name;
}
}
}
##
## Creates a new Table object.
##
## In => Name -- Name of "In" property to be associated with
## Is => Name -- Name of "Is" property to be associated with
## Fuzzy => Boolean -- True if name can be accessed "fuzzily"
## Desc => String -- Description of the property
##
## No args are required.
##
{
my $class = shift;
my %Args = @_;
{
}
}
## shouldn't have any left over
confess "$0: bad args to Table->New"
}
return $Table;
}
##
## Returns true if the Table has no code points
##
{
my $Table = shift; #self
}
##
## Returns true if the Table has code points
##
{
my $Table = shift; #self
return @$Table;
}
##
## Returns the maximum code point currently in the table.
##
{
my $Table = shift; #self
}
##
## Replaces the codepoints in the Table with those in the Table given
## as an arg. (NOTE: this is not a "deep copy").
##
{
my $Table = shift; #self
my $New = shift;
}
##
## Given a new code point, make the last range of the Table extend to
## include the new (and all intervening) code points.
##
{
my $Table = shift; #self
my $codepoint = shift;
$Table->[-1]->[RANGE_END] = $codepoint;
}
##
## Given a code point range start and end (and optional name), blindly
## append them to the list of ranges for the Table.
##
## NOTE: Code points must be added in strictly ascending numeric order.
##
sub Table::RawAppendRange
{
my $Table = shift; #self
my $start = shift;
my $end = shift;
my $name = shift;
$end, # RANGE_END
$name ]; # RANGE_NAME
}
##
## Given a code point (and optional name), add it to the Table.
##
## NOTE: Code points must be added in strictly ascending numeric order.
##
{
my $Table = shift; #self
my $codepoint = shift;
my $name = shift;
##
## If we've already got a range working, and this code point is the next
## one in line, and if the name is the same, just extend the current range.
##
$Table->Max == $codepoint - 1
$Table->[-1]->[RANGE_NAME] eq $name)
{
}
else
{
}
}
##
## Given a code point range starting value and ending value (and name),
## Add the range to teh Table.
##
## NOTE: Code points must be added in strictly ascending numeric order.
##
sub Table::AppendRange
{
my $Table = shift; #self
my $start = shift;
my $end = shift;
my $name = shift;
}
##
## Return a new Table that represents all code points not in the Table.
##
{
my $Table = shift; #self
my $max = -1;
{
my $start = $range->[RANGE_START];
}
}
}
return $New;
}
##
## Merges any number of other tables with $self, returning the new table.
## (existing tables are not modified)
##
##
## Args may be Tables, or individual code points (as integers).
##
## Can be called as either a constructor or a method.
##
{
my @Tables = @_;
## Accumulate all records from all tables
my @Records;
{
## arg is a table -- get its ranges
} else {
## arg is a codepoint, make a range
}
}
## sort by range start, with longer ranges coming first.
($a->[RANGE_START] <=> $b->[RANGE_START])
} @Records;
## Ensuring the first range is there makes the subsequent loop easier
$New->AppendRange($first->[RANGE_START],
## Fold in records so long as they add new information.
{
my $start = $set->[RANGE_START];
}
}
return $New;
}
##
## Given a filename, write a representation of the Table to a file.
## May have an optional comment as a 2nd arg.
##
{
my $Table = shift; #self
my $filename = shift;
my $comment = shift;
$comment =~ s/\s+\Z//;
$comment =~ s/^/# /gm;
}
{
my $start = $set->[RANGE_START];
my $name = $set->[RANGE_NAME];
} else {
}
}
}
## This used only for making the test script.
## helper function
sub IsUsable($)
{
my $code = shift;
return 0 if $code <= 0x0000; ## don't use null
return 0 if $code >= $LastUnicodeCodepoint; ## keep in range
return 1;
}
## Return a code point that's part of the table.
## Returns nothing if the table is empty (or covers only surrogates).
## This used only for making the test script.
{
my $Table = shift; #self
}
return ();
}
## Return a code point that's not part of the table
## Returns nothing if the table covers all code points.
## This used only for making the test script.
sub Table::InvalidCode
{
my $Table = shift; #self
{
{
}
{
return $set->[RANGE_START] - 1;
}
}
return ();
}
###########################################################################
###########################################################################
###########################################################################
##
## Called like:
## New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 1);
##
## The args must be in that order, although the Fuzzy pair may be omitted.
##
## This creates 'IsAll' as an alias for 'IsAny'
##
sub New_Alias($$$@)
{
my $Type = shift; ## "Is" or "In"
my $Alias = shift;
my $SameAs = shift; # expecting "SameAs" -- just ignored
my $Name = shift;
my %Args = @_;
## sanity check a few args
confess "$0: bad args to New_Alias"
}
if (not $TableInfo{$Type}->{$Name})
{
} else {
}
}
}
$AliasInfo{$Type}->{$Name} = $Alias;
$FuzzyNames{$Type}->{$Alias} = $Name;
}
}
## All assigned code points
Fuzzy => 0);
my %General;
my %Cat;
##
## Process UnicodeData.txt (Categories, etc.)
##
sub UnicodeData_Txt()
{
Fuzzy => 0);
my %DC;
my %Bidi;
my %Deco;
Fuzzy => 0);
Fuzzy => 0);
## Initialize Perl-generated categories
## (Categories from UnicodeData.txt are auto-initialized in gencat)
my %To;
sub gencat($$$$)
{
$cat, ## Category ("Lu", "Zp", "Nd", etc.)
$code, ## Code point (as an integer)
$op) = @_;
## add to the sub category (e.g. "Lu", "Nd", "Cf", ..)
Fuzzy => 0);
## add to the major category (e.g. "L", "N", "C", ...)
Fuzzy => 0);
# 005F: SPACING UNDERSCORE
my $isspace =
# and therefore it is not part of "space" even while it is "Zs".
|| $code == 0x0009 # 0009: HORIZONTAL TAB
|| $code == 0x000A # 000A: LINE FEED
|| $code == 0x000B # 000B: VERTICAL TAB
|| $code == 0x000C # 000C: FORM FEED
|| $code == 0x000D # 000D: CARRIAGE RETURN
|| $code == 0x0085 # 0085: NEL
;
&& $code != 0x000B; # Backward compat.
&& !($code == 0x000A ||
$code == 0x000B ||
$code == 0x000C ||
$code == 0x000D ||
$code == 0x0085 ||
}
## open ane read file.....
}
##
## For building \p{_CombAbove} and \p{_CanonDCIJ}
##
my %_Above_HexCodes; ## Hexcodes for chars with $comb == 230 ("ABOVE")
my %CodeToDeco; ## Maps code to decomp. list for chars with first
## decomp. char an "i" or "j" (for \p{_CanonDCIJ})
## This is filled in as we go....
Fuzzy => 0);
while (<IN>)
{
s/\s+$//;
$deco, ## decomposition mapping
$decimal, ## decimal digit value
$digit, ## digit value
$number, ## numeric value
$mirrored, ## mirrored
$unicode10, ## name in Unicode 1.0
$comment, ## comment field
$upper, ## uppercase mapping
$lower, ## lowercase mapping
$title, ## titlecase mapping
# Note that in Unicode 3.2 there will be names like
# LINE FEED (LF), which probably means that \N{} needs
# to cope also with LINE FEED and LF.
$_Above_HexCodes{$hexcode} = 1;
}
## Used in building \p{_CanonDCIJ}
$CodeToDeco{$code} = $deco;
}
##
## There are a few pairs of lines like:
## AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
## D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
## that define ranges.
##
{
$name = $1;
#New_Prop(In => $name, $General{$name}, Fuzzy => 1);
}
else
{
## normal (single-character) lines
# No Append() here since since several codes may map into one.
Fuzzy => 0);
if ($deco)
{
{
Fuzzy => 0);
}
else
{
}
}
}
}
##
## Tidy up a few special cases....
##
Fuzzy => 0);
## Unassigned is the same as 'Cn'
# L& is Ll, Lu, and Lt.
Fuzzy => 0);
## Any and All are all code points.
Fuzzy => 0);
##
## Build special properties for Perl's internal case-folding needs:
## \p{_CaseIgnorable}
## \p{_CanonDCIJ}
## \p{_CombAbove}
## _CombAbove was built above. Others are built here....
##
## \p{_CaseIgnorable} is [\p{Mn}\0x00AD\x2010]
0x00AD, #SOFT HYPHEN
0x2010), #HYPHEN
Fuzzy => 0);
## \p{_CanonDCIJ} is fairly complex...
Fuzzy => 0);
## It contains the ASCII 'i' and 'j'....
## ...and any character with a decomposition that starts with either of
## those code points, but only if the decomposition does not have any
## combining character with the "ABOVE" canonical combining class.
{
## Need to ensure that all decomposition characters do not have
## a %HexCodeToComb in %AboveCombClasses.
my $want = 1;
for my $deco_hexcode (split / /, $CodeToDeco{$code})
{
## one of the decmposition chars has an ABOVE combination
## class, so we're not interested in this one
$want = 0;
last;
}
}
}
}
##
## Now dump the files.
##
}
}
##
## Process LineBreak.txt
##
sub LineBreak_Txt()
{
}
my %Lbrk;
while (<IN>)
{
Fuzzy => 0);
}
}
}
##
## Process ArabicShaping.txt.
##
sub ArabicShaping_txt()
{
}
while (<IN>)
{
s/\s+$//;
}
}
##
## Process Jamo.txt.
##
sub Jamo_txt()
{
}
while (<IN>)
{
}
}
##
## Process Scripts.txt.
##
sub Scripts_txt()
{
my @ScriptInfo;
}
# Wait until all the scripts have been read since
# they are not listed in numeric order.
}
# Now append the scripts properties in their code point order.
my %Script;
{
Fuzzy => 1);
}
}
## Common is everything not explicitly assigned to a Script
##
## ***shouldn't this be intersected with \p{Assigned}? ******
##
Fuzzy => 1);
}
##
## Given a name like "Close Punctuation", return a regex (that when applied
## with /i) matches any valid form of that name (e.g. "ClosePunctuation",
## "Close-Punctuation", etc.)
##
## Accept any space, dash, or underbar where in the official name there is
## space or a dash (or underbar, but there never is).
##
##
sub NameToRegex($)
{
my $Name = shift;
return $Name;
}
##
## Process Blocks.txt.
##
sub Blocks_txt()
{
my %Blocks;
}
while (<IN>)
{
#next if not /Private Use$/;
Fuzzy => 1);
}
}
}
##
## Read in the PropList.txt. It contains extended properties not
## listed in the UnicodeData.txt, such as 'Other_Alphabetic':
## alphabetic but not of the general category L; many modifiers
## belong to this extended property category: while they are not
## alphabets, they are alphabetic in nature.
##
sub PropList_txt()
{
my @PropInfo;
}
while (<IN>)
{
# Wait until all the extended properties have been read since
# they are not listed in numeric order.
}
# Now append the extended properties in their code point order.
my %Prop;
{
Fuzzy => 1);
}
}
# Alphabetic is L and Other_Alphabetic.
Fuzzy => 1);
# Lowercase is Ll and Other_Lowercase.
Fuzzy => 1);
# Uppercase is Lu and Other_Uppercase.
Fuzzy => 1);
# Math is Sm and Other_Math.
Fuzzy => 1);
# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
Fuzzy => 1);
# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
Fuzzy => 1);
}
sub Make_GC_Aliases()
{
##
## The mapping from General Category long forms to short forms is
## currently hardwired here since no simple data file in the UCD
## seems to do that. Unicode 3.2 will assumedly correct this.
##
my %Is = (
'Letter' => 'L',
'Uppercase_Letter' => 'Lu',
'Lowercase_Letter' => 'Ll',
'Titlecase_Letter' => 'Lt',
'Modifier_Letter' => 'Lm',
'Other_Letter' => 'Lo',
'Mark' => 'M',
'Non_Spacing_Mark' => 'Mn',
'Spacing_Mark' => 'Mc',
'Enclosing_Mark' => 'Me',
'Separator' => 'Z',
'Space_Separator' => 'Zs',
'Line_Separator' => 'Zl',
'Paragraph_Separator' => 'Zp',
'Number' => 'N',
'Decimal_Number' => 'Nd',
'Letter_Number' => 'Nl',
'Other_Number' => 'No',
'Punctuation' => 'P',
'Connector_Punctuation' => 'Pc',
'Dash_Punctuation' => 'Pd',
'Open_Punctuation' => 'Ps',
'Close_Punctuation' => 'Pe',
'Initial_Punctuation' => 'Pi',
'Final_Punctuation' => 'Pf',
'Other_Punctuation' => 'Po',
'Symbol' => 'S',
'Math_Symbol' => 'Sm',
'Currency_Symbol' => 'Sc',
'Modifier_Symbol' => 'Sk',
'Other_Symbol' => 'So',
'Other' => 'C',
'Control' => 'Cc',
'Format' => 'Cf',
'Surrogate' => 'Cs',
'Private Use' => 'Co',
'Unassigned' => 'Cn',
);
## make the aliases....
}
}
##
## These are used in:
## MakePropTestScript()
## WriteAllMappings()
## for making the test script.
##
my %FuzzyNameToTest;
my %ExactNameToTest;
## This used only for making the test script
sub GenTests($$$$)
{
my $FH = shift;
my $Prop = shift;
my $MatchCode = shift;
my $FailCode = shift;
}
}
}
## This used only for making the test script
sub ExpectError($$)
{
my $FH = shift;
my $prop = shift;
}
## This used only for making the test script
my @GoodSeps = (
" ",
"-",
" \t ",
"",
"",
"_",
);
my @BadSeps = (
"--",
"__",
" _",
"/"
);
## This used only for making the test script
sub RandomlyFuzzifyName($;$)
{
my $Name = shift;
my $WantError = shift; ## if true, make an error
my @parts;
{
$WantError = 0;
} else {
}
}
} else {
}
}
} else {
}
}
return $new;
}
## This used only for making the test script
sub MakePropTestScript()
{
## this written directly -- it's huge.
}
{
}
{
my $Orig = $CanonicalToOrig{$Name};
my %Names = (
$Name => 1,
$Orig => 1,
);
}
}
}
##
## These are used only in:
## RegisterFileForName()
## WriteAllMappings()
##
my %Exact; ## will become %utf8::Exact;
my %Canonical; ## will become %utf8::Canonical;
my %CaComment; ## Comment for %Canonical entry of same key
##
## Given info about a name and a datafile that it should be associated with,
## register that assocation in %Exact and %Canonical.
sub RegisterFileForName($$$$)
{
my $Type = shift;
my $Name = shift;
my $IsFuzzy = shift;
my $filename = shift;
##
## Now in details for the mapping. $Type eq 'Is' has the
## Is removed, as it will be removed in utf8_heavy when this
## data is being checked. In keeps its "In", but a second
## sans-In record is written if it doesn't conflict with
## anything already there.
##
{
} else {
}
}
else
{
} else {
}
}
}
}
##
## Writes the info accumulated in
##
## %TableInfo;
## %FuzzyNames;
## %AliasInfo;
##
##
sub WriteAllMappings()
{
my @MAP;
my %BaseNames; ## Base names already used (for avoiding 8.3 conflicts)
## 'Is' *MUST* come first, so its names have precidence over 'In's
{
my %RawNameToFile; ## a per-$Type cache
{
## Note: $Name is already canonical
my $Table = $TableInfo{$Type}->{$Name};
my $IsFuzzy = $FuzzyNames{$Type}->{$Name};
## Need an 8.3 safe filename (which means "an 8 safe" $filename)
my $filename;
{
## 'Is' items lose 'Is' from the basename.
##
## Make sure the basename doesn't conflict with something we
## might have already written. If we have, say,
## InGreekExtended1
## InGreekExtended2
## they become
## InGreekE
## InGreek2
##
while (my $num = $BaseNames{lc $filename}++)
{
$num++; ## so basenames with numbers start with '2', which
## just looks more natural.
## Want to append $num, but if it'll make the basename longer
## than 8 characters, pre-truncate $filename so that the result
## is acceptable.
} else {
}
}
};
##
## Construct a nice comment to add to the file, and build data
## for the "./Properties" file along the way.
##
my $Comment;
{
## get list of names this table is reference by
{
}
}
my $OrigProp;
for my $N (@Supported)
{
my $IsFuzzy = $FuzzyNames{$Type}->{$N};
} else {
}
}
}
}
##
## Okay, write the file...
##
## and register it
if ($IsFuzzy)
{
$FuzzyNameToTest{$Name} = $Table if !$FuzzyNameToTest{$Name};
$FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName};
} else {
}
}
## Register aliase info
{
my $Alias = $AliasInfo{$Type}->{$Name};
my $IsFuzzy = $FuzzyNames{$Type}->{$Alias};
my $filename = $RawNameToFile{$Name};
my $Table = $TableInfo{$Type}->{$Name};
if ($IsFuzzy)
{
$FuzzyNameToTest{$Alias} = $Table if !$FuzzyNameToTest{$Alias};
$FuzzyNameToTest{$CName} = $Table if !$FuzzyNameToTest{$CName};
} else {
}
}
}
##
## Write out the property list
##
{
my @OUT = (
"##\n",
"## This file created by $0\n",
"## List of built-in \\p{...}/\\P{...} properties.\n",
"##\n",
"## '*' means name may be 'fuzzy'\n",
"##\n\n",
);
}
## Write Exact.pl
{
my @OUT = (
"##\n",
"##\n\n",
"## Mapping from name to filename in ./lib\n",
"%utf8::Exact = (\n",
);
{
}
}
## Write Canonical.pl
{
my @OUT = (
"##\n",
"##\n\n",
"## Mapping from lc(canonical name) to filename in ./lib\n",
"%utf8::Canonical = (\n",
);
{
my $File = $Canonical{$Name};
} else {
}
}
}
}
sub SpecialCasing_txt()
{
#
# Read in the special cases.
#
my %CaseInfo;
}
s/\#.*//;
s/\s+$//;
next;
}
# Wait until all the special cases have been read since
# they are not listed in numeric order.
}
# Now write out the special cases properties in their code point order.
# Prepend them to the To/{Upper,Lower,Title}.pl.
{
my @OUT =
(
"# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
);
my $tostr =
push @OUT, sprintf qq["%s" => "$tostr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $ix)));
# Remove any single-character mappings for
# the same character since we are going for
# the special casing rules.
$NormalCase =~ s/^$code\t\t\w+\n//m;
}
");\n\n",
"return <<'END';\n",
"END\n"
);
}
}
#
# Read in the case foldings.
#
# We will do full case folding, C + F + I (see CaseFolding.txt).
#
sub CaseFolding_txt()
{
}
my %Fold;
# Skip status 'S', simple case folding
# No append() since several codes may fold into one.
} else { # F: full, or I: dotted uppercase I -> dotless lowercase I
}
}
#
# Prepend the special foldings to the common foldings.
#
my @OUT =
(
"# The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n",
"%utf8::ToSpecFold =\n(\n",
);
my $foldstr =
push @OUT, sprintf qq["%s" => "$foldstr",\n], join("", map { sprintf "\\x%02X", $_ } unpack("U0C*", pack("U", $code)));
}
");\n\n",
"return <<'END';\n",
"END\n",
);
}
## Do it....
exit(0);
## TRAILING CODE IS USED BY MakePropTestScript()
use strict;
use warnings;
my $Tests = 0;
my $Fails = 0;
sub Expect($$$)
{
my $Expect = shift;
my $String = shift;
my $Regex = shift;
$Tests++;
my $RegObj;
my $result = eval {
};
$Fails++;
$Fails++;
}
}
sub Error($)
{
my $Regex = shift;
$Tests++;
$Fails++;
}
}
sub Finished()
{
exit(0);
} else {
exit(-1);
}
}