unicodemap.pl revision 0ddb604f911b908085ef787455c015a91dc9c365
#!/usr/bin/env perl
#
# Collects titlecase mappings and tagged decompositions from the
# semicolon-separated records read via <> (UnicodeData.txt, according to the
# header comment this script emits). In each record field 0 is the code
# point, field 5 the decomposition and field 14 the titlecase mapping; all
# numbers are written in hexadecimal without a prefix.
use strict;
use warnings;

# Parallel key/value lists, split by whether the numbers fit in 16 bits.
my (@titlecase16_keys, @titlecase16_values);
my (@titlecase32_keys, @titlecase32_values);
my (@uni16_decomp_keys, @uni16_decomp_values);
my (@uni32_decomp_keys, @uni32_decomp_values);
# For multi-character decompositions, $multidecomp_offsets[$i] is the index
# in @multidecomp_values where the 0-terminated sequence for
# $multidecomp_keys[$i] starts.
my (@multidecomp_keys, @multidecomp_offsets, @multidecomp_values);

# Convert a bare hexadecimal field to a number.
# Dies when the text is not purely hexadecimal; hex() is used instead of a
# string eval so that input text is never executed as Perl code.
sub parse_hex_field {
    my ($text, $what) = @_;

    die "Invalid $what '$text' in input\n"
        unless $text =~ /\A[0-9A-Fa-f]+\z/;
    return hex($text);
}

# Convert one decomposition code and enforce the 16-bit limit that the
# generated value tables rely on. Dies if the limit is exceeded.
sub parse_decomp_value {
    my ($text) = @_;

    my $value = parse_hex_field($text, "decomposition code");
    die "We've assumed decomposition codes are max. 16bit\n"
        if $value > 0xffff;
    return $value;
}

# Process one input record and append whatever it defines to the lists above.
#
# - A titlecase mapping takes precedence: if field 14 is non-empty, the
#   decomposition of that character is never recorded (a mapping to the
#   character itself is ignored entirely).
# - Otherwise a decomposition of the form "<tag> CODES" is recorded, either
#   as a single-character mapping or as a 0-terminated multi-character one.
#
# Dies (non-zero exit status) on malformed numbers or when one of the 16-bit
# assumptions does not hold, so a failed run is not mistaken for success.
sub record_unicode_data_line {
    my ($line) = @_;

    # Strip LF as well as CRLF line endings; a stray "\r" would otherwise
    # end up inside the last field (the titlecase mapping).
    $line =~ s/[\r\n]+\z//;
    return if $line !~ /\S/;

    # LIMIT -1 keeps trailing empty fields; short records still yield undef
    # for the missing fields, which are treated as empty below.
    my @fields = split(/;/, $line, -1);
    my $code = parse_hex_field($fields[0], "code point");
    my $decomp = defined $fields[5] ? $fields[5] : "";
    my $titlecode = defined $fields[14] ? $fields[14] : "";

    if ($titlecode ne "") {
        # titlecase mapping
        my $value = parse_hex_field($titlecode, "titlecase mapping");
        if ($value == $code) {
            # the same character, ignore
        } elsif ($code <= 0xffff && $value <= 0xffff) {
            push @titlecase16_keys, $code;
            push @titlecase16_values, $value;
        } else {
            push @titlecase32_keys, $code;
            push @titlecase32_values, $value;
        }
    } elsif ($decomp =~ /<[^>]*> (.+)/) {
        # decompositions (only those carrying a "<tag>" prefix)
        my $decomp_codes = $1;
        if ($decomp_codes =~ /\A[0-9A-Fa-f]+\z/) {
            # unicharacter decomposition. use separate lists for this
            my $value = parse_decomp_value($decomp_codes);
            if ($code <= 0xffff) {
                push @uni16_decomp_keys, $code;
                push @uni16_decomp_values, $value;
            } else {
                push @uni32_decomp_keys, $code;
                push @uni32_decomp_values, $value;
            }
        } else {
            # multicharacter decomposition.
            die "We've assumed multi-decomposition key codes are max. 16bit\n"
                if $code > 0xffff;

            # Validate every code before touching the lists so that a
            # failure cannot leave a half-written entry behind.
            my @values = map { parse_decomp_value($_) }
                split(" ", $decomp_codes);

            push @multidecomp_keys, $code;
            push @multidecomp_offsets, scalar(@multidecomp_values);
            push @multidecomp_values, @values, 0;
        }
    }
    return;
}

while (my $line = <>) {
    record_unicode_data_line($line);
}
2670cd577aa57eb9f915a4f4220ae48c9b4fc5fbTimo Sirainen
# Print the numbers of the referenced array to the currently selected output
# handle as "0x%04x" literals. Elements are separated by ", "; after every
# eighth element the separator is ",\n\t" instead, so continuation rows start
# with a tab. Nothing is printed after the final element, and an empty list
# produces no output.
#
# Arguments: a reference to an array of numbers.
sub print_list {
    my ($values_ref) = @_;

    my @pending = map { sprintf("0x%04x", $_) } @{$values_ref};

    # Cut the formatted literals into rows of at most eight entries.
    my @rows;
    while (@pending) {
        push @rows, join(", ", splice(@pending, 0, 8));
    }

    print join(",\n\t", @rows);
}
db3b95d5a33ddce552d41136ae68d7331f8bf5feTimo Sirainen
# Emit the generated C source on standard output: a header comment followed
# by one static array definition per lookup list.
print "/* This file is automatically generated by unicodemap.pl from UnicodeData.txt\n",
      "\n",
      " NOTE: decompositions for characters having titlecase characters\n",
      " are not included, because we first translate everything to titlecase */\n";

# One entry per emitted array, in output order:
# [ C element type, C array name, reference to the Perl list holding the data ].
# uni32_decomp_values stays uint16_t: only its keys exceed 16 bits.
my @tables = (
    [ "uint16_t", "titlecase16_keys",    \@titlecase16_keys ],
    [ "uint16_t", "titlecase16_values",  \@titlecase16_values ],
    [ "uint32_t", "titlecase32_keys",    \@titlecase32_keys ],
    [ "uint32_t", "titlecase32_values",  \@titlecase32_values ],
    [ "uint16_t", "uni16_decomp_keys",   \@uni16_decomp_keys ],
    [ "uint16_t", "uni16_decomp_values", \@uni16_decomp_values ],
    [ "uint32_t", "uni32_decomp_keys",   \@uni32_decomp_keys ],
    [ "uint16_t", "uni32_decomp_values", \@uni32_decomp_values ],
    [ "uint16_t", "multidecomp_keys",    \@multidecomp_keys ],
    [ "uint16_t", "multidecomp_offsets", \@multidecomp_offsets ],
    [ "uint16_t", "multidecomp_values",  \@multidecomp_values ],
);

foreach my $table (@tables) {
    my ($ctype, $name, $list) = @{$table};

    # The pieces are passed as a list so that "[]" is never parsed as an
    # array subscript during string interpolation.
    print "static ", $ctype, " ", $name, "[] = {\n\t";
    print_list($list);
    print "\n};\n";
}
0dc72981f5286d60ca9233f6ac7c444d393d24fbTimo Sirainen