#! /usr/local/bin/perl -w
# $Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $
#
# Copyright (c) 2000,2001 Japan Network Information Center.
# All rights reserved.
#
# By using this file, you agree to the terms and conditions set forth bellow.
#
# LICENSE TERMS AND CONDITIONS
#
# The following License Terms and Conditions apply, unless a different
# license is obtained from Japan Network Information Center ("JPNIC"),
# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
# Chiyoda-ku, Tokyo 101-0047, Japan.
#
# 1. Use, Modification and Redistribution (including distribution of any
# modified or derived work) in source and/or binary forms is permitted
# under this License Terms and Conditions.
#
# 2. Redistribution of source code must retain the copyright notices as they
# appear in each source code file, this License Terms and Conditions.
#
# 3. Redistribution in binary form must reproduce the Copyright Notice,
# this License Terms and Conditions, in the documentation and/or other
# materials provided with the distribution. For the purposes of binary
# distribution the "Copyright Notice" refers to the following language:
# "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
#
# 4. The name of JPNIC may not be used to endorse or promote products
# derived from this Software without specific prior written approval of
# JPNIC.
#
# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
#
#
# Generate lib/unicodedata.c from UnicodeData.txt,
# CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt,
# all of them available from ftp://ftp.unicode.org/Public/UNIDATA/.
#
use strict;
use lib qw(.);
use Getopt::Long;
use UCD;
use SparseMap;
use constant UCS_MAX => 0x110000;
use constant END_BIT => 0x80000000;
my $DECOMP_COMPAT_BIT = 0x8000;
my $CASEMAP_FINAL_BIT = 0x1;
my $CASEMAP_NONFINAL_BIT = 0x2;
my $CASEMAP_LAST_BIT = 0x10;
my $LETTER_BIT = 1;
my $NSPMARK_BIT = 2;
(my $myid = '$Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;
my @default_bits = (9, 7, 5);
#my @default_bits = (7, 7, 7);
my @canon_class_bits = @default_bits;
my @decomp_bits = @default_bits;
my @comp_bits = @default_bits;
my @folding_bits = @default_bits;
my @casemap_bits = @default_bits;
my @casemap_ctx_bits = @default_bits;
my $prefix = '';
my $dir = '.';
my $unicodedatafile = 'UnicodeData.txt';
my $exclusionfile = 'CompositionExclusions.txt';
my $specialcasefile = 'SpecialCasing.txt';
my $casefoldingfile = 'CaseFolding.txt';
my $verbose;
GetOptions('dir|d=s' => \$dir,
'unicodedata|u=s' => \$unicodedatafile,
'exclude|e=s' => \$exclusionfile,
'specialcase|s=s' => \$specialcasefile,
'casefold|c=s' => \$casefoldingfile,
'prefix|p=s' => \$prefix,
'verbose|v' => \$verbose,
) or usage();
foreach my $r (\$unicodedatafile, \$exclusionfile,
\$specialcasefile, \$casefoldingfile) {
$$r = "$dir/$$r" unless $$r =~ m|^/|;
}
my %exclusions;
my %lower_special;
my %upper_special;
my @decomp_data;
my @comp_data;
my @toupper_data;
my @tolower_data;
my @folding_data;
#
# Create Mapping/Bitmap objects.
#
# canonical class
my $canon_class = SparseMap::Int->new(BITS => [@canon_class_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
# canonical/compatibility decomposition
my $decomp = SparseMap::Int->new(BITS => [@decomp_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
# canonical composition
my $comp = SparseMap::Int->new(BITS => [@comp_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
# uppercase/lowercase
my $upper = SparseMap::Int->new(BITS => [@casemap_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
my $lower = SparseMap::Int->new(BITS => [@casemap_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
# final/nonfinal context
my $casemap_ctx = SparseMap::Int->new(BITS => [@casemap_ctx_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
# casefolding
my $folding = SparseMap::Int->new(BITS => [@folding_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
#
# Read datafiles.
#
read_exclusion_file();
read_specialcasing_file();
read_unicodedata_file();
read_casefolding_file();
print_header();
print_canon_class();
print_composition();
print_decomposition();
print_casemap();
print_casemap_context();
print_casefolding();
exit;
sub usage {
print STDERR <<"END";
Usage: $0 [options..]
options:
-d DIR directory where Unicode Character Data files resides [./]
-u FILE name of the UnicodeData file [UnicodeData.txt]
-e FILE name of the CompositionExclusion file [CompositionExclusions-1.txt]
-s FILE name of the SpecialCasing file [SpecialCasing.txt]
-c FILE name of the CaseFolding file [CaseFolding.txt]
END
exit 1;
}
#
# read_exclusion_file -- read CompositionExclusions-1.txt.
#
sub read_exclusion_file {
open EXCLUDE, $exclusionfile or die "cannot open $exclusionfile: $!\n";
while ($_ = UCD::CompositionExclusions::getline(\*EXCLUDE)) {
my %data = UCD::CompositionExclusions::parseline($_);
$exclusions{$data{CODE}} = 1;
}
close EXCLUDE;
}
#
# read_specialcasing_file -- read SpecialCasing.txt
#
sub read_specialcasing_file {
open SPCASE, $specialcasefile or die "cannot open $specialcasefile: $!\n";
while ($_ = UCD::SpecialCasing::getline(\*SPCASE)) {
my %data = UCD::SpecialCasing::parseline($_);
my $code = $data{CODE};
my $lower = $data{LOWER};
my $upper = $data{UPPER};
my $cond = $data{CONDITION} || '';
next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;
if (defined $cond && (@$lower > 1 || $lower->[0] != $code)
or @$lower > 1 or $lower->[0] != $code) {
$lower_special{$code} = [$lower, $cond];
}
if (defined $cond && (@$upper > 1 || $upper->[0] != $code)
or @$upper > 1 or $upper->[0] != $code) {
$upper_special{$code} = [$upper, $cond];
}
}
close SPCASE;
}
#
# read_unicodedata_file -- read UnicodeData.txt
#
sub read_unicodedata_file {
open UCD, $unicodedatafile or die "cannot open $unicodedatafile: $!\n";
@decomp_data = (0);
@toupper_data = (0);
@tolower_data = (0);
my @comp_cand; # canonical composition candidates
my %nonstarter;
while ($_ = UCD::UnicodeData::getline(\*UCD)) {
my %data = UCD::UnicodeData::parseline($_);
my $code = $data{CODE};
# combining class
if ($data{CLASS} > 0) {
$nonstarter{$code} = 1;
$canon_class->add($code, $data{CLASS});
}
# uppercasing
if (exists $upper_special{$code} or defined $data{UPPER}) {
my $offset = @toupper_data;
my @casedata;
$upper->add($code, $offset);
if (exists $upper_special{$code}) {
push @casedata, $upper_special{$code};
}
if (defined $data{UPPER}) {
push @casedata, $data{UPPER};
}
push @toupper_data, casemap_data(@casedata);
}
# lowercasing
if (exists $lower_special{$code} or defined $data{LOWER}) {
my $offset = @tolower_data;
my @casedata;
$lower->add($code, $offset);
if (exists $lower_special{$code}) {
push @casedata, $lower_special{$code};
}
if (defined $data{LOWER}) {
push @casedata, $data{LOWER};
}
push @tolower_data, casemap_data(@casedata);
}
# composition/decomposition
if ($data{DECOMP}) {
my ($tag, @decomp) = @{$data{DECOMP}};
my $offset = @decomp_data;
# composition
if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) {
# canonical composition candidate
push @comp_cand, [$code, @decomp];
}
# decomposition
if ($tag ne '') {
# compatibility decomposition
$offset |= $DECOMP_COMPAT_BIT;
}
$decomp->add($code, $offset);
push @decomp_data, @decomp;
$decomp_data[-1] |= END_BIT;
}
# final/nonfinal context
if ($data{CATEGORY} =~ /L[ult]/) {
$casemap_ctx->add($code, $LETTER_BIT);
} elsif ($data{CATEGORY} eq 'Mn') {
$casemap_ctx->add($code, $NSPMARK_BIT);
}
}
close UCD;
# Eliminate composition candidates whose decomposition starts with
# a non-starter.
@comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand;
@comp_data = ([0, 0, 0]);
my $last_code = -1;
my $last_offset = @comp_data;
for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) {
if ($r->[1] != $last_code) {
$comp->add($last_code,
($last_offset | ((@comp_data - $last_offset)<<16)))
unless $last_code == -1;
$last_code = $r->[1];
$last_offset = @comp_data;
}
push @comp_data, $r;
}
$comp->add($last_code,
($last_offset | ((@comp_data - $last_offset)<<16)));
}
sub casemap_data {
my @data = @_;
my @result = ();
while (@data > 0) {
my $r = shift @data;
my $flag = 0;
if (ref $r) {
if ($r->[1] eq 'FINAL') {
$flag |= $CASEMAP_FINAL_BIT;
} elsif ($r->[1] eq 'NON_FINAL') {
$flag |= $CASEMAP_NONFINAL_BIT;
} elsif ($r->[1] ne '') {
die "unknown condition \"", $r->[1], "\"\n";
}
}
$flag |= $CASEMAP_LAST_BIT if @data == 0;
push @result, $flag;
push @result, (ref $r) ? @{$r->[0]} : $r;
$result[-1] |= END_BIT;
}
@result;
}
#
# read_casefolding_file -- read CaseFolding.txt
#
sub read_casefolding_file {
open FOLD, $casefoldingfile or die "cannto open $casefoldingfile: $!\n";
# dummy.
@folding_data = (0);
while ($_ = UCD::CaseFolding::getline(\*FOLD)) {
my %data = UCD::CaseFolding::parseline($_);
$folding->add($data{CODE}, scalar(@folding_data));
push @folding_data, @{$data{MAP}};
$folding_data[-1] |= END_BIT;
}
close FOLD;
}
sub print_header {
print <<"END";
/* \$Id\$ */
/* $myid */
/*
* Do not edit this file!
* This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,
* SpecialCasing.txt and CaseFolding.txt.
*/
END
}
#
# print_canon_class -- generate data for canonical class
#
sub print_canon_class {
$canon_class->fix();
print STDERR "** cannon_class\n", $canon_class->stat() if $verbose;
print <<"END";
/*
* Canonical Class
*/
END
print_bits("CANON_CLASS", @canon_class_bits);
print "\n";
print $canon_class->cprog(NAME => "${prefix}canon_class");
}
#
# print_composition -- generate data for canonical composition
#
sub print_composition {
$comp->fix();
print STDERR "** composition\n", $comp->stat() if $verbose;
print <<"END";
/*
* Canonical Composition
*/
END
print_bits("CANON_COMPOSE", @comp_bits);
print "\n";
print $comp->cprog(NAME => "${prefix}compose");
print <<"END";
static const struct composition ${prefix}compose_seq[] = {
END
my $i = 0;
foreach my $r (@comp_data) {
if ($i % 2 == 0) {
print "\n" if $i != 0;
print "\t";
}
printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0];
$i++;
}
print "\n};\n\n";
}
#
# print_decomposition -- generate data for canonical/compatibility
# decomposition
#
sub print_decomposition {
$decomp->fix();
print STDERR "** decomposition\n", $decomp->stat() if $verbose;
print <<"END";
/*
* Canonical/Compatibility Decomposition
*/
END
print_bits("DECOMP", @decomp_bits);
print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";
print $decomp->cprog(NAME => "${prefix}decompose");
print "static const unsigned long ${prefix}decompose_seq[] = {\n";
print_ulseq(@decomp_data);
print "};\n\n";
}
#
# print_casemap -- generate data for case mapping
#
sub print_casemap {
$upper->fix();
$lower->fix();
print STDERR "** upper mapping\n", $upper->stat() if $verbose;
print STDERR "** lower mapping\n", $lower->stat() if $verbose;
print <<"END";
/*
* Lowercase <-> Uppercase mapping
*/
/*
* Flags for special case mapping.
*/
#define CMF_FINAL $CASEMAP_FINAL_BIT
#define CMF_NONFINAL $CASEMAP_NONFINAL_BIT
#define CMF_LAST $CASEMAP_LAST_BIT
#define CMF_CTXDEP (CMF_FINAL|CMF_NONFINAL)
END
print_bits("CASEMAP", @casemap_bits);
print "\n";
print $upper->cprog(NAME => "${prefix}toupper");
print $lower->cprog(NAME => "${prefix}tolower");
print "static const unsigned long ${prefix}toupper_seq[] = {\n";
print_ulseq(@toupper_data);
print "};\n\n";
print "static const unsigned long ${prefix}tolower_seq[] = {\n";
print_ulseq(@tolower_data);
print "};\n\n";
}
#
# print_casefolding -- generate data for case folding
#
sub print_casefolding {
$folding->fix();
print STDERR "** case folding\n", $folding->stat() if $verbose;
print <<"END";
/*
* Case Folding
*/
END
print_bits("CASE_FOLDING", @folding_bits);
print "\n";
print $folding->cprog(NAME => "${prefix}case_folding");
print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
print_ulseq(@folding_data);
print "};\n\n";
}
#
# print_casemap_context -- gerarate data for determining context
# (final/non-final)
#
sub print_casemap_context {
$casemap_ctx->fix();
print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose;
print <<"END";
/*
* Cased characters and non-spacing marks (for casemap context)
*/
END
print_bits("CASEMAP_CTX", @casemap_ctx_bits);
print <<"END";
#define CTX_CASED $LETTER_BIT
#define CTX_NSM $NSPMARK_BIT
END
print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
}
sub sprint_composition_hash {
my $i = 0;
my $s = '';
foreach my $r (@_) {
if ($i % 2 == 0) {
$s .= "\n" if $i != 0;
$s .= "\t";
}
$s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r};
$i++;
}
$s;
}
sub print_bits {
my $prefix = shift;
my $i = 0;
foreach my $bit (@_) {
print "#define ${prefix}_BITS_$i\t$bit\n";
$i++;
}
}
sub print_ulseq {
my $i = 0;
foreach my $v (@_) {
if ($i % 4 == 0) {
print "\n" if $i != 0;
print "\t";
}
printf "0x%08x, ", $v;
$i++;
}
print "\n";
}