# $Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $
#
# Copyright (c) 2000,2001 Japan Network Information Center.
# All rights reserved.
#
# By using this file, you agree to the terms and conditions set forth bellow.
#
# LICENSE TERMS AND CONDITIONS
#
# The following License Terms and Conditions apply, unless a different
# license is obtained from Japan Network Information Center ("JPNIC"),
# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
# Chiyoda-ku, Tokyo 101-0047, Japan.
#
# 1. Use, Modification and Redistribution (including distribution of any
# under this License Terms and Conditions.
#
# 2. Redistribution of source code must retain the copyright notices as they
# appear in each source code file, this License Terms and Conditions.
#
# 3. Redistribution in binary form must reproduce the Copyright Notice,
# materials provided with the distribution. For the purposes of binary
# distribution the "Copyright Notice" refers to the following language:
# "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
#
# 4. The name of JPNIC may not be used to endorse or promote products
# derived from this Software without specific prior written approval of
# JPNIC.
#
# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
#
#
# Generate lib/unicodedata.c from UnicodeData.txt, CompositionExclusions.txt,
# SpecialCasing.txt and CaseFolding.txt, all of them available from
# ftp://ftp.unicode.org/Public/UNIDATA/.
#
use strict;
use lib qw(.);
use UCD;
use SparseMap;
# Flag OR-ed into a decomposition offset to mark a compatibility mapping;
# also emitted as DECOMP_COMPAT in the generated source.
my $DECOMP_COMPAT_BIT = 1 << 15;

# Case-mapping flags, emitted as CMF_FINAL, CMF_NONFINAL and CMF_LAST.
my ($CASEMAP_FINAL_BIT, $CASEMAP_NONFINAL_BIT, $CASEMAP_LAST_BIT)
    = (0x01, 0x02, 0x10);

# Context flags, emitted as CTX_CASED and CTX_NSM.
my ($LETTER_BIT, $NSPMARK_BIT) = (1, 2);

# Identification string of this generator.  The substitution turns the
# leading/trailing '$' of the Id keyword into '$-' and '-$' (presumably so
# that revision-control keyword expansion leaves the copy printed into the
# generated file alone -- TODO confirm).
my $myid = '$Id: generate_normalize_data.pl,v 1.1 2003/06/04 00:27:55 marka Exp $';
$myid =~ s/\$([^\$]+)\$/\$-$1-\$/;

#my @default_bits = (7, 7, 7);
# NOTE(review): no active declaration of @default_bits is visible in this
# part of the file; under "use strict" the assignments below need one to
# precede them -- confirm.
my (@canon_class_bits, @decomp_bits, @comp_bits,
    @folding_bits, @casemap_bits, @casemap_ctx_bits);
@{$_} = @default_bits
    for (\@canon_class_bits, \@decomp_bits, \@comp_bits,
         \@folding_bits, \@casemap_bits, \@casemap_ctx_bits);

# $prefix is prepended to the names of the generated C arrays; $dir is
# prepended to data file names that are not absolute paths.
my ($prefix, $dir) = ('', '.');

# Default names of the Unicode data files.
my $unicodedatafile = 'UnicodeData.txt';
my $exclusionfile   = 'CompositionExclusions.txt';
my $specialcasefile = 'SpecialCasing.txt';
my $casefoldingfile = 'CaseFolding.txt';

my $verbose;
'unicodedata|u=s' => \$unicodedatafile,
'exclude|e=s' => \$exclusionfile,
'specialcase|s=s' => \$specialcasefile,
'casefold|c=s' => \$casefoldingfile,
'prefix|p=s' => \$prefix,
'verbose|v' => \$verbose,
) or usage();
# Qualify every data file name that is not an absolute path with $dir.
# The loop works on references so the four variables are updated in place.
for my $path_ref (\$unicodedatafile, \$exclusionfile,
                  \$specialcasefile, \$casefoldingfile) {
    next if ${$path_ref} =~ m|^/|;
    ${$path_ref} = "$dir/${$path_ref}";
}

# File-level working storage shared by the reader and printer subs below.
my (%exclusions, %lower_special, %upper_special);
my (@decomp_data, @comp_data, @toupper_data, @tolower_data, @folding_data);
#
#
# canonical class
MAPALL => 1,
DEFAULT => 0);
# canonical/compatibility decomposition
MAPALL => 1,
DEFAULT => 0);
# canonical composition
MAPALL => 1,
DEFAULT => 0);
MAPALL => 1,
DEFAULT => 0);
MAPALL => 1,
DEFAULT => 0);
MAPALL => 1,
DEFAULT => 0);
# casefolding
MAPALL => 1,
DEFAULT => 0);
#
# Read datafiles.
#
# Emit the banner of the generated file, then stop with a success status.
print_header();
exit 0;
sub usage {
    # Write the usage text to standard error and terminate the program
    # with exit status 1.  Never returns.
    # NOTE(review): the here-document is empty in this view, so nothing is
    # actually printed before exiting.
    my $usage_text = <<"END";
END
    print STDERR $usage_text;
    exit 1;
}
#
# read_exclusion_file -- read CompositionExclusions.txt.
#
sub read_exclusion_file {
}
close EXCLUDE;
}
#
# read_specialcasing_file -- read SpecialCasing.txt
#
sub read_specialcasing_file {
}
}
}
close SPCASE;
}
#
# read_unicodedata_file -- read UnicodeData.txt
#
sub read_unicodedata_file {
@decomp_data = (0);
@toupper_data = (0);
@tolower_data = (0);
my @comp_cand; # canonical composition candidates
my %nonstarter;
# combining class
}
# uppercasing
my $offset = @toupper_data;
my @casedata;
if (exists $upper_special{$code}) {
}
}
}
# lowercasing
my $offset = @tolower_data;
my @casedata;
if (exists $lower_special{$code}) {
}
}
}
my $offset = @decomp_data;
# composition
# canonical composition candidate
}
# decomposition
if ($tag ne '') {
# compatibility decomposition
$offset |= $DECOMP_COMPAT_BIT;
}
push @decomp_data, @decomp;
}
}
}
close UCD;
# Eliminate composition candidates whose decomposition starts with
# a non-starter.
my $last_code = -1;
my $last_offset = @comp_data;
if ($r->[1] != $last_code) {
unless $last_code == -1;
$last_code = $r->[1];
$last_offset = @comp_data;
}
push @comp_data, $r;
}
}
sub casemap_data {
    # Flatten a list of case-mapping entries into a plain list of values.
    #
    # Each argument is either a plain scalar, copied to the result as is,
    # or an array reference [ \@values, $condition ], whose @values are
    # spliced into the result.  $condition must be 'FINAL', 'NON_FINAL'
    # or the empty string; any other value is fatal.
    #
    # Returns the flattened list.
    my @flattened;

    foreach my $entry (@_) {
        unless (ref $entry) {
            push @flattened, $entry;
            next;
        }

        # Validate the condition and derive its flag bit.
        # NOTE(review): $flag is computed but never added to the result
        # in the code visible here -- confirm whether it should be.
        my $condition = $entry->[1];
        my $flag = 0;
        if ($condition eq 'FINAL') {
            $flag |= $CASEMAP_FINAL_BIT;
        }
        elsif ($condition eq 'NON_FINAL') {
            $flag |= $CASEMAP_NONFINAL_BIT;
        }
        elsif ($condition ne '') {
            die "unknown condition \"", $condition, "\"\n";
        }

        push @flattened, @{ $entry->[0] };
    }

    return @flattened;
}
#
# read_casefolding_file -- read CaseFolding.txt
#
sub read_casefolding_file {
# dummy.
@folding_data = (0);
}
close FOLD;
}
sub print_header {
    # Print the banner of the generated file to standard output: an Id
    # keyword placeholder, this generator's identification string ($myid)
    # and an empty C comment.
    #
    # The '$' signs around "Id" stay backslash-escaped so that the keyword
    # is never spelled out literally in this source file.
    my @banner = (
        "/* \$Id\$ */",
        "/* $myid */",
        "/*",
        "*/",
    );
    print join("\n", @banner, '');
}
#
# print_canon_class -- generate data for canonical class
#
sub print_canon_class {
    # Finalise the canonical-class map, then print an empty C comment
    # followed by a blank line.
    # NOTE(review): what fix() does is defined by the $canon_class object,
    # which is not visible here.
    $canon_class->fix();

    my $comment = "/*\n" . "*/\n";
    print $comment;
    print "\n";
}
#
# print_composition -- generate data for canonical composition
#
sub print_composition {
    # Print the canonical composition data: an empty C comment, a blank
    # line, then one "{ 0x..., 0x... }, " initialiser per @comp_data entry
    # (elements [2] and [0] of the entry, in that order), two per
    # tab-indented line, and finally the closing "};".
    # NOTE(review): no opening array declaration is printed by the code
    # visible here, only the closing brace -- confirm.
    print "/*\n*/\n" . "\n";

    for my $index (0 .. $#comp_data) {
        my $entry = $comp_data[$index];

        # Start a new tab-indented output line before every even entry.
        if ($index % 2 == 0) {
            print $index == 0 ? "\t" : "\n\t";
        }
        printf "{ 0x%08x, 0x%08x }, ", @{$entry}[2, 0];
    }

    print "\n};\n\n";
}
#
# print_decomposition -- generate data for canonical/compatibility
# decomposition
#
sub print_decomposition {
    # Print the decomposition section: an empty C comment, the
    # DECOMP_COMPAT macro (value of $DECOMP_COMPAT_BIT) and the
    # "<prefix>decompose_seq" array declaration.
    # NOTE(review): the array is opened and closed with nothing in
    # between in the code visible here.
    my $array_name = $prefix . 'decompose_seq';

    print "/*\n*/\n";
    print "#define DECOMP_COMPAT\t" . $DECOMP_COMPAT_BIT . "\n\n";
    print 'static const unsigned long ' . $array_name . "[] = {\n";
    print "};\n\n";
}
#
# print_casemap -- generate data for case mapping
#
sub print_casemap {
    # Print the case-mapping section: two empty C comments, the CMF_*
    # flag macros, a blank line, and the "<prefix>toupper_seq" and
    # "<prefix>tolower_seq" array declarations.
    # NOTE(review): both arrays are opened and closed with nothing in
    # between in the code visible here.
    print "/*\n*/\n" x 2;

    my @defines = (
        [ 'CMF_FINAL',    $CASEMAP_FINAL_BIT ],
        [ 'CMF_NONFINAL', $CASEMAP_NONFINAL_BIT ],
        [ 'CMF_LAST',     $CASEMAP_LAST_BIT ],
        [ 'CMF_CTXDEP',   '(CMF_FINAL|CMF_NONFINAL)' ],
    );
    foreach my $define (@defines) {
        print "#define $define->[0] $define->[1]\n";
    }
    print "\n";

    foreach my $table (qw(toupper_seq tolower_seq)) {
        print 'static const unsigned long ' . $prefix . $table . "[] = {\n";
        print "};\n\n";
    }
}
#
# print_casefolding -- generate data for case folding
#
sub print_casefolding {
    # Print the case-folding section: an empty C comment, a blank line and
    # the "<prefix>case_folding_seq" array declaration.
    # NOTE(review): the array is opened and closed with nothing in
    # between in the code visible here.
    my $array_name = $prefix . 'case_folding_seq';

    print "/*\n*/\n";
    print "\n";
    print 'static const unsigned long ' . $array_name . "[] = {\n";
    print "};\n\n";
}
#
# print_casemap_context -- generate data for determining context
#
sub print_casemap_context {
    # Finalise the case-mapping context map, then print an empty C comment
    # and the CTX_CASED / CTX_NSM macros (values of $LETTER_BIT and
    # $NSPMARK_BIT).
    # NOTE(review): what fix() does is defined by the $casemap_ctx object,
    # which is not visible here.
    $casemap_ctx->fix();

    print "/*\n*/\n";
    print "#define CTX_CASED $LETTER_BIT\n"
        . "#define CTX_NSM $NSPMARK_BIT\n";
}
sub sprint_composition_hash {
    # Format a list of array references as C initialisers.
    #
    # Every argument is rendered as "{0x%04x, 0x%04x, 0x%04x}, " from its
    # elements.  Two initialisers go on each line, every line starts with
    # a tab, and lines are separated (not terminated) by a newline.
    #
    # Returns the resulting string; the empty string for no arguments.
    my @cells = map { sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$_} } @_;

    my @lines;
    while (my @pair = splice @cells, 0, 2) {
        push @lines, "\t" . join('', @pair);
    }

    return join("\n", @lines);
}
sub print_bits {
    # Print one "#define <NAME>_BITS_<n>\t<value>" line per value to
    # standard output, with <n> counting up from 0.
    #
    # Arguments: the macro name prefix, followed by the values.
    my ($macro_prefix, @widths) = @_;

    for my $level (0 .. $#widths) {
        print "#define ${macro_prefix}_BITS_${level}\t$widths[$level]\n";
    }
}
sub print_ulseq {
    # Print the arguments to standard output as "0x%08x, " items, four
    # per tab-indented line, followed by a final newline.  With no
    # arguments only the newline is printed.
    my @values = @_;

    my @rows;
    while (my @chunk = splice @values, 0, 4) {
        push @rows, "\t" . join('', map { sprintf "0x%08x, ", $_ } @chunk);
    }

    print join("\n", @rows) . "\n";
}