1N/Adie "$0: Please run me as ./mktables to avoid unnecessary differences\n" 1N/A## mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl) 1N/A die "usage: $0 [-v|-q] [-maketest]"; 1N/A# !!!!!!! DO NOT EDIT THIS FILE !!!!!!! 1N/A# Any changes made here will be lost! 1N/A## Given a filename and a reference to an array of lines, 1N/A## write the lines to the file only if the contents have not changed. 1N/A## The main datastructure (a "Table") represents a set of code points that 1N/A## are part of a particular quality (that are part of \pL, \p{InGreek}, 1N/A## etc.). They are kept as ranges of code points (starting and ending of 1N/A## For example, a range ASCII LETTERS would be represented as: 1N/A## [ [ 0x41 => 0x5A, 'UPPER' ], 1N/A## [ 0x61 => 0x7A, 'LOWER' ] ] 1N/A## Conceptually, these should really be folded into the 'Table' objects 1N/A## Turn something like 1N/A## Associates a property ("Greek", "Lu", "Assigned",...) with a Table. 1N/A## New_Prop(In => 'Greek', $Table, Desc => 'Greek Block', Fuzzy => 1); 1N/A## Normally, these parameters are set when the Table is created (when the 1N/A## Table->New constructor is called), but there are times when it needs to 1N/A## be done after-the-fact...) 1N/A ## sanity check a few args 1N/A## Creates a new Table object. 1N/A## In => Name -- Name of "In" property to be associated with 1N/A## Is => Name -- Name of "Is" property to be associated with 1N/A## Fuzzy => Boolean -- True if name can be accessed "fuzzily" 1N/A## Desc => String -- Description of the property 1N/A## No args are required. 1N/A ## shouldn't have any left over 1N/A## Returns true if the Table has no code points 1N/A## Returns true if the Table has code points 1N/A## Returns the maximum code point currently in the table. 1N/A## Replaces the codepoints in the Table with those in the Table given 1N/A## as an arg. (NOTE: this is not a "deep copy"). 
1N/A## Given a new code point, make the last range of the Table extend to 1N/A## include the new (and all intervening) code points. 1N/A## Given a code point range start and end (and optional name), blindly 1N/A## append them to the list of ranges for the Table. 1N/A## NOTE: Code points must be added in strictly ascending numeric order. 1N/A## Given a code point (and optional name), add it to the Table. 1N/A## NOTE: Code points must be added in strictly ascending numeric order. 1N/A ## If we've already got a range working, and this code point is the next 1N/A ## one in line, and if the name is the same, just extend the current range. 1N/A## Given a code point range starting value and ending value (and name), 1N/A## Add the range to the Table. 1N/A## NOTE: Code points must be added in strictly ascending numeric order. 1N/A## Return a new Table that represents all code points not in the Table. 1N/A## Merges any number of other tables with $self, returning the new table. 1N/A## (existing tables are not modified) 1N/A## Args may be Tables, or individual code points (as integers). 1N/A## Can be called as either a constructor or a method. 1N/A shift(@_) if not ref $_[0]; ## if called as a constructor, lose the class 1N/A ## Accumulate all records from all tables 1N/A ## arg is a table -- get its ranges 1N/A ## arg is a codepoint, make a range 1N/A ## sort by range start, with longer ranges coming first. 1N/A ## Ensuring the first range is there makes the subsequent loop easier 1N/A ## Fold in records so long as they add new information. 1N/A## Given a filename, write a representation of the Table to a file. 1N/A## May have an optional comment as a 2nd arg. 1N/A## This is used only for making the test script. 1N/A return 0 if $code <= 0x0000; ## don't use null 1N/A## Return a code point that's part of the table. 1N/A## Returns nothing if the table is empty (or covers only surrogates). 1N/A## This is used only for making the test script. 
1N/A## Return a code point that's not part of the table 1N/A## Returns nothing if the table covers all code points. 1N/A## This is used only for making the test script. 1N/A########################################################################### 1N/A########################################################################### 1N/A########################################################################### 1N/A## New_Alias(Is => 'All', SameAs => 'Any', Fuzzy => 1); 1N/A## The args must be in that order, although the Fuzzy pair may be omitted. 1N/A## This creates 'IsAll' as an alias for 'IsAny' 1N/A my $SameAs = shift; # expecting "SameAs" -- just ignored 1N/A ## sanity check a few args 1N/A## All assigned code points 1N/A Desc => "All assigned code points", 1N/A Desc => "Mirrored in bidirectional text", 1N/A Desc => 'Decomposes to multiple characters', 1N/A Desc => 'Compatible with a more-basic character', 1N/A ## Initialize Perl-generated categories 1N/A my ($name, ## Name ("LATIN CAPITAL LETTER A") 1N/A $cat, ## Category ("Lu", "Zp", "Nd", etc.) 1N/A ## add to the sub category (e.g. "Lu", "Nd", "Cf", ..) 1N/A ## add to the major category (e.g. "L", "N", "C", ...) 1N/A # 005F: SPACING UNDERSCORE 1N/A $code != 0x200B) # 200B is ZWSP which is for line break control 1N/A # and therefore it is not part of "space" even while it is "Zs". 1N/A || $code == 0x0009 # 0009: HORIZONTAL TAB 1N/A || $code == 0x000D # 000D: CARRIAGE RETURN 1N/A ## open and read file..... 1N/A ## For building \p{_CombAbove} and \p{_CanonDCIJ} 1N/A my %CodeToDeco; ## Maps code to decomp. list for chars with first 1N/A ## decomp. char an "i" or "j" (for \p{_CanonDCIJ}) 1N/A ## This is filled in as we go.... 1N/A Desc => '(for internal casefolding use)', 1N/A $name, ## character name (e.g. "LATIN CAPITAL LETTER A") 1N/A # Note that in Unicode 3.2 there will be names like 1N/A # LINE FEED (LF), which probably means that \N{} needs 1N/A # to cope also with LINE FEED and LF. 
1N/A ## Used in building \p{_CanonDCIJ} 1N/A ## There are a few pairs of lines like: 1N/A ## AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 1N/A ## D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 1N/A ## that define ranges. 1N/A #New_Prop(In => $name, $General{$name}, Fuzzy => 1); 1N/A ## normal (single-character) lines 1N/A # No Append() here since several codes may map into one. 1N/A ## Tidy up a few special cases.... 1N/A Desc => "General Category 'Cn' [not functional in Perl]", 1N/A ## Unassigned is the same as 'Cn' 1N/A # L& is Ll, Lu, and Lt. 1N/A ## Any and All are all code points. 1N/A ## Build special properties for Perl's internal case-folding needs: 1N/A ## \p{_CaseIgnorable} 1N/A ## _CombAbove was built above. Others are built here.... 1N/A ## \p{_CaseIgnorable} is [\p{Mn}\0x00AD\x2010] 1N/A 0x00AD, #SOFT HYPHEN 1N/A Desc => '(for internal casefolding use)', 1N/A ## \p{_CanonDCIJ} is fairly complex... 1N/A Desc => '(for internal casefolding use)', 1N/A ## It contains the ASCII 'i' and 'j'.... 1N/A ## ...and any character with a decomposition that starts with either of 1N/A ## those code points, but only if the decomposition does not have any 1N/A ## combining character with the "ABOVE" canonical combining class. 1N/A ## Need to ensure that all decomposition characters do not have 1N/A ## a %HexCodeToComb in %AboveCombClasses. 1N/A ## one of the decomposition chars has an ABOVE combination 1N/A ## class, so we're not interested in this one 1N/A ## Now dump the files. 1N/A next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)/; 1N/A next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; 1N/A # Wait until all the scripts have been read since 1N/A # they are not listed in numeric order. 1N/A # Now append the scripts properties in their code point order. 1N/A ## Common is everything not explicitly assigned to a Script 1N/A ## ***shouldn't this be intersected with \p{Assigned}? 
****** 1N/A Desc => 'Pseudo-Script of codepoints not in other Unicode scripts', 1N/A## Given a name like "Close Punctuation", return a regex (that when applied 1N/A## with /i) matches any valid form of that name (e.g. "ClosePunctuation", 1N/A## "Close-Punctuation", etc.) 1N/A## Accept any space, dash, or underbar where in the official name there is 1N/A## space or a dash (or underbar, but there never is). 1N/A #next if not /Private Use$/; 1N/A next if not /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.+?)\s*$/; 1N/A## alphabetic but not of the general category L; many modifiers 1N/A## belong to this extended property category: while they are not 1N/A## alphabets, they are alphabetic in nature. 1N/A next unless /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(.+?)\s*\#/; 1N/A # Wait until all the extended properties have been read since 1N/A # they are not listed in numeric order. 1N/A # Now append the extended properties in their code point order. 1N/A # Alphabetic is L and Other_Alphabetic. 1N/A Desc => '[\p{L}\p{OtherAlphabetic}]', # use canonical names here 1N/A # Lowercase is Ll and Other_Lowercase. 1N/A Desc => '[\p{Ll}\p{OtherLowercase}]', # use canonical names here 1N/A # Uppercase is Lu and Other_Uppercase. 1N/A Desc => '[\p{Lu}\p{Other_Uppercase}]', # use canonical names here 1N/A # Math is Sm and Other_Math. 1N/A Desc => '[\p{Sm}\p{OtherMath}]', # use canonical names here 1N/A # ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl. 1N/A Desc => '[\p{Ll}\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]', 1N/A # ID_Continue is ID_Start, Mn, Mc, Nd, and Pc. 1N/A Desc => '[\p{ID_Start}\p{Mn}\p{Mc}\p{Nd}\p{Pc}]', 1N/A ## The mapping from General Category long forms to short forms is 1N/A ## currently hardwired here since no simple data file in the UCD 1N/A ## seems to do that. Unicode 3.2 will assumedly correct this. 
1N/A 'Uppercase_Letter' => 'Lu', 1N/A 'Lowercase_Letter' => 'Ll', 1N/A 'Titlecase_Letter' => 'Lt', 1N/A 'Modifier_Letter' => 'Lm', 1N/A 'Other_Letter' => 'Lo', 1N/A 'Non_Spacing_Mark' => 'Mn', 1N/A 'Spacing_Mark' => 'Mc', 1N/A 'Enclosing_Mark' => 'Me', 1N/A 'Space_Separator' => 'Zs', 1N/A 'Line_Separator' => 'Zl', 1N/A 'Paragraph_Separator' => 'Zp', 1N/A 'Decimal_Number' => 'Nd', 1N/A 'Letter_Number' => 'Nl', 1N/A 'Other_Number' => 'No', 1N/A 'Punctuation' => 'P', 1N/A 'Connector_Punctuation' => 'Pc', 1N/A 'Dash_Punctuation' => 'Pd', 1N/A 'Open_Punctuation' => 'Ps', 1N/A 'Close_Punctuation' => 'Pe', 1N/A 'Initial_Punctuation' => 'Pi', 1N/A 'Final_Punctuation' => 'Pf', 1N/A 'Other_Punctuation' => 'Po', 1N/A 'Math_Symbol' => 'Sm', 1N/A 'Currency_Symbol' => 'Sc', 1N/A 'Modifier_Symbol' => 'Sk', 1N/A 'Other_Symbol' => 'So', 1N/A 'Surrogate' => 'Cs', 1N/A 'Private Use' => 'Co', 1N/A 'Unassigned' => 'Cn', 1N/A ## make the aliases.... 1N/A## These are used in: 1N/A## MakePropTestScript() 1N/A## WriteAllMappings() 1N/A## for making the test script. 1N/A## This is used only for making the test script 1N/A## This is used only for making the test script 1N/A## This is used only for making the test script 1N/A## This is used only for making the test script 1N/A## This is used only for making the test script 1N/A ## this written directly -- it's huge. 1N/A## These are used only in: 1N/A## RegisterFileForName() 1N/A## WriteAllMappings() 1N/A## Given info about a name and a datafile that it should be associated with, 1N/A## register that association in %Exact and %Canonical. 1N/A ## Now in details for the mapping. $Type eq 'Is' has the 1N/A ## Is removed, as it will be removed in utf8_heavy when this 1N/A ## data is being checked. In keeps its "In", but a second 1N/A ## sans-In record is written if it doesn't conflict with 1N/A ## anything already there. 
1N/A## Writes the info accumulated in 1N/A my %BaseNames; ## Base names already used (for avoiding 8.3 conflicts) 1N/A ## 'Is' *MUST* come first, so its names have precedence over 'In's 1N/A ## Note: $Name is already canonical 1N/A ## Need an 8.3 safe filename (which means "an 8 safe" $filename) 1N/A ## 'Is' items lose 'Is' from the basename. 1N/A ## Make sure the basename doesn't conflict with something we 1N/A ## might have already written. If we have, say, 1N/A $num++; ## so basenames with numbers start with '2', which 1N/A ## just looks more natural. 1N/A ## Want to append $num, but if it'll make the basename longer 1N/A ## than 8 characters, pre-truncate $filename so that the result 1N/A ## Construct a nice comment to add to the file, and build data 1N/A ## for the "./Properties" file along the way. 1N/A ## get list of names this table is referenced by 1N/A ## Okay, write the file... 1N/A ## Register alias info 1N/A ## Write out the property list 1N/A "## This file created by $0\n", 1N/A "## List of built-in \\p{...}/\\P{...} properties.\n", 1N/A "## '*' means name may be 'fuzzy'\n", 1N/A use Text::Tabs (); ## using this makes the files about half the size 1N/A "## Mapping from name to filename in ./lib\n", 1N/A "%utf8::Exact = (\n", 1N/A "## Mapping from lc(canonical name) to filename in ./lib\n", 1N/A "%utf8::Canonical = (\n", 1N/A my $Trail = ""; ## used just to keep the spacing pretty 1N/A # Read in the special cases. 1N/A # Wait until all the special cases have been read since 1N/A # they are not listed in numeric order. 1N/A # Now write out the special cases properties in their code point order. 1N/A # Prepend them to the To/{Upper,Lower,Title}.pl. 1N/A "# The key UTF-8 _bytes_, the value UTF-8 (speed hack)\n", 1N/A # Remove any single-character mappings for 1N/A # the same character since we are going for 1N/A # the special casing rules. 1N/A "return <<'END';\n", 1N/A# Read in the case foldings. 
1N/A # Skip status 'S', simple case folding 1N/A # No append() since several codes may fold into one. 1N/A } else { # F: full, or I: dotted uppercase I -> dotless lowercase I 1N/A # Prepend the special foldings to the common foldings. 1N/A "# The ke UTF-8 _bytes_, the value UTF-8 (speed hack)\n", 1N/A "%utf8::ToSpecFold =\n(\n", 1N/A "return <<'END';\n", 1N/A## TRAILING CODE IS USED BY MakePropTestScript()