Unicode/Normalize/Normalize.pm

1N/Apackage Unicode::Normalize;
1N/A
# Compile-time sanity check: refuse to load the module when pack('U', 0x41)
# does not produce the one-character string "A".
BEGIN {
    if ("A" ne pack('U', 0x41)) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
1N/A
1N/Ause 5.006;
1N/Ause strict;
1N/Ause warnings;
1N/Ause Carp;
1N/A
1N/Ano warnings 'utf8';
1N/A
1N/Aour $VERSION = '0.28';
1N/Aour $PACKAGE = __PACKAGE__;
1N/A
1N/Arequire Exporter;
1N/Arequire DynaLoader;
1N/A
# Parent classes: Exporter and DynaLoader.
our @ISA = ('Exporter', 'DynaLoader');

# The four standard normalization forms are exported by default.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Everything below is exported on request only.
our @EXPORT_OK = (
    # generic entry point and the building blocks of the NF* forms
    qw(normalize decompose reorder compose),
    # quick checks
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    # character data accessors
    qw(getCanon getCompat getComposite getCombinClass),
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    # further per-character predicates (presumably the quick-check
    # properties; they are not described in this file's POD)
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
    # FCD / FCC support
    qw(FCD checkFCD FCC checkFCC composeContiguous),
    qw(splitOnLastStarter),
);

# Named groups usable as  use Unicode::Normalize qw(:tag);
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
    fast      => [ qw(FCD checkFCD FCC checkFCC composeContiguous) ],
);
1N/A
1N/A######
1N/A
# Load the compiled (XS) part of the module.  bootstrap() is inherited from
# DynaLoader through @ISA; it is called as an explicit class method rather
# than with indirect object syntax, which parses ambiguously and is disabled
# by newer feature bundles.
Unicode::Normalize->bootstrap($VERSION);
1N/A
1N/A######
1N/A
# pack_U(@codepoints)
# Returns the string packed from the given code points with the 'U*' template.
sub pack_U {
    my @code_points = @_;
    return pack('U*', @code_points);
}
1N/A
# unpack_U($string)
# Returns the values obtained by unpacking $string with the 'U*' template.
# The argument is first appended to the (empty) result of pack('U*') and the
# concatenation is what gets unpacked.
sub unpack_U {
    my $string   = shift;
    my $prefixed = pack('U*') . $string;
    return unpack('U*', $prefixed);
}
1N/A
1N/A
1N/A##
1N/A## normalization forms
1N/A##
1N/A
1N/Ause constant COMPAT => 1;
1N/A
# NFD($string): decompose with the canonical mapping, then reorder.
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}
# NFKD($string): decompose with the compatibility mapping (COMPAT flag),
# then reorder.
sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}
# NFC($string): decompose with the canonical mapping, reorder, then compose.
sub NFC ($) {
    my $reordered = reorder(decompose($_[0]));
    return compose($reordered);
}
# NFKC($string): decompose with the compatibility mapping (COMPAT flag),
# reorder, then compose.
sub NFKC ($) {
    my $reordered = reorder(decompose($_[0], COMPAT));
    return compose($reordered);
}
1N/A
# FCD($string): returns the argument unchanged when checkFCD() accepts it;
# otherwise returns NFD() of the argument.
sub FCD ($) {
    my ($text) = @_;
    return $text if checkFCD($text);
    return NFD($text);
}
# FCC($string): decompose with the canonical mapping, reorder, then apply
# composeContiguous().
sub FCC ($) {
    my $reordered = reorder(decompose($_[0]));
    return composeContiguous($reordered);
}
1N/A
# Dispatch table used by normalize(): form name => normalizing function.
# Each NF* form is reachable under its full name and under the short name
# without the "NF" prefix; FCD and FCC have one name each.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,
    NFD  => \&NFD,
    D    => \&NFD,
    NFKC => \&NFKC,
    KC   => \&NFKC,
    NFKD => \&NFKD,
    KD   => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
);
1N/A
# normalize($form_name, $string)
# Looks $form_name up in %formNorm and returns the result of calling the
# registered function on $string.  Croaks when the name is not registered.
sub normalize($$)
{
    my ($form, $str) = @_;
    exists $formNorm{$form}
        or croak $PACKAGE."::normalize: invalid form name: $form";
    return $formNorm{$form}->($str);
}
1N/A
1N/A
1N/A##
1N/A## quick check
1N/A##
1N/A
# Dispatch table used by check(): form name => quick-check function.
# Each NF* form is reachable under its full name and under the short name
# without the "NF" prefix; FCD and FCC have one name each.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);
1N/A
# check($form_name, $string)
# Looks $form_name up in %formCheck and returns the result of calling the
# registered quick-check function on $string.  Croaks when the name is not
# registered.
sub check($$)
{
    my ($form, $str) = @_;
    if (exists $formCheck{$form}) {
        return $formCheck{$form}->($str);
    }
    croak $PACKAGE."::check: invalid form name: $form";
}
1N/A
1N/A1;
1N/A__END__
1N/A
1N/A=head1 NAME
1N/A
1N/AUnicode::Normalize - Unicode Normalization Forms
1N/A
1N/A=head1 SYNOPSIS
1N/A
1N/A  use Unicode::Normalize;
1N/A
1N/A  $NFD_string  = NFD($string);  # Normalization Form D
1N/A  $NFC_string  = NFC($string);  # Normalization Form C
1N/A  $NFKD_string = NFKD($string); # Normalization Form KD
1N/A  $NFKC_string = NFKC($string); # Normalization Form KC
1N/A
1N/A   or
1N/A
1N/A  use Unicode::Normalize 'normalize';
1N/A
1N/A  $NFD_string  = normalize('D',  $string);  # Normalization Form D
1N/A  $NFC_string  = normalize('C',  $string);  # Normalization Form C
1N/A  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
1N/A  $NFKC_string = normalize('KC', $string);  # Normalization Form KC
1N/A
1N/A=head1 DESCRIPTION
1N/A
1N/AParameters:
1N/A
1N/AC<$string> is used as a string under character semantics
1N/A(see F<perlunicode>).
1N/A
1N/AC<$codepoint> should be an unsigned integer
1N/Arepresenting a Unicode code point.
1N/A
Note: The XS edition and the pure Perl edition of this module interpret
a C<$codepoint> given as a decimal number incompatibly.
XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating-point number or a negative sign in C<$codepoint>.
1N/A
1N/A=head2 Normalization Forms
1N/A
1N/A=over 4
1N/A
1N/A=item C<$NFD_string = NFD($string)>
1N/A
1N/Areturns the Normalization Form D (formed by canonical decomposition).
1N/A
1N/A=item C<$NFC_string = NFC($string)>
1N/A
1N/Areturns the Normalization Form C (formed by canonical decomposition
1N/Afollowed by canonical composition).
1N/A
1N/A=item C<$NFKD_string = NFKD($string)>
1N/A
1N/Areturns the Normalization Form KD (formed by compatibility decomposition).
1N/A
1N/A=item C<$NFKC_string = NFKC($string)>
1N/A
1N/Areturns the Normalization Form KC (formed by compatibility decomposition
1N/Afollowed by B<canonical> composition).
1N/A
1N/A=item C<$FCD_string = FCD($string)>
1N/A
1N/AIf the given string is in FCD ("Fast C or D" form; cf. UTN #5),
1N/Areturns it without modification; otherwise returns an FCD string.
1N/A
Note: The FCD form of a string is not always unique; several different
FCD forms may be equivalent to each other.
C<FCD()> will return one of these equivalent forms.
1N/A
1N/A=item C<$FCC_string = FCC($string)>
1N/A
1N/Areturns the FCC form ("Fast C Contiguous"; cf. UTN #5).
1N/A
Note: The FCC form is unique, as are the four normalization forms (NF*).
1N/A
1N/A=item C<$normalized_string = normalize($form_name, $string)>
1N/A
1N/AAs C<$form_name>, one of the following names must be given.
1N/A
1N/A  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
1N/A  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
1N/A  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
1N/A  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
1N/A
1N/A  'FCD'          for "Fast C or D" Form  (UTN #5)
1N/A  'FCC'          for "Fast C Contiguous" (UTN #5)
1N/A
1N/A=back
1N/A
1N/A=head2 Decomposition and Composition
1N/A
1N/A=over 4
1N/A
1N/A=item C<$decomposed_string = decompose($string)>
1N/A
1N/A=item C<$decomposed_string = decompose($string, $useCompatMapping)>
1N/A
1N/ADecomposes the specified string and returns the result.
1N/A
1N/AIf the second parameter (a boolean) is omitted or false, decomposes it
1N/Ausing the Canonical Decomposition Mapping.
1N/AIf true, decomposes it using the Compatibility Decomposition Mapping.
1N/A
1N/AThe string returned is not always in NFD/NFKD.
1N/AReordering may be required.
1N/A
1N/A    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, 1));    # eq. to NFKD()
1N/A
1N/A=item C<$reordered_string  = reorder($string)>
1N/A
Reorders the combining characters and the like into canonical order
and returns the result.
1N/A
1N/AE.g., when you have a list of NFD/NFKD strings,
1N/Ayou can get the concatenated NFD/NFKD string from them, saying
1N/A
1N/A    $concat_NFD  = reorder(join '', @NFD_strings);
1N/A    $concat_NFKD = reorder(join '', @NFKD_strings);
1N/A
1N/A=item C<$composed_string   = compose($string)>
1N/A
1N/AReturns the string where composable pairs are composed.
1N/A
1N/AE.g., when you have a NFD/NFKD string,
1N/Ayou can get its NFC/NFKC string, saying
1N/A
1N/A    $NFC_string  = compose($NFD_string);
1N/A    $NFKC_string = compose($NFKD_string);
1N/A
1N/A=back
1N/A
1N/A=head2 Quick Check
1N/A
1N/A(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
1N/A
1N/AThe following functions check whether the string is in that normalization form.
1N/A
1N/AThe result returned will be:
1N/A
1N/A    YES     The string is in that normalization form.
1N/A    NO      The string is not in that normalization form.
1N/A    MAYBE   Dubious. Maybe yes, maybe no.
1N/A
1N/A=over 4
1N/A
1N/A=item C<$result = checkNFD($string)>
1N/A
1N/Areturns C<YES> (C<1>) or C<NO> (C<empty string>).
1N/A
1N/A=item C<$result = checkNFC($string)>
1N/A
1N/Areturns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
1N/A
1N/A=item C<$result = checkNFKD($string)>
1N/A
1N/Areturns C<YES> (C<1>) or C<NO> (C<empty string>).
1N/A
1N/A=item C<$result = checkNFKC($string)>
1N/A
1N/Areturns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
1N/A
1N/A=item C<$result = checkFCD($string)>
1N/A
1N/Areturns C<YES> (C<1>) or C<NO> (C<empty string>).
1N/A
1N/A=item C<$result = checkFCC($string)>
1N/A
1N/Areturns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
1N/A
If a string is not in FCD, it cannot be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.
1N/A
1N/A=item C<$result = check($form_name, $string)>
1N/A
1N/Areturns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
1N/A
C<$form_name> takes the same values as for C<normalize()>.
1N/A
1N/A=back
1N/A
1N/AB<Note>
1N/A
1N/AIn the cases of NFD, NFKD, and FCD, the answer must be
1N/Aeither C<YES> or C<NO>. The answer C<MAYBE> may be returned
1N/Ain the cases of NFC, NFKC, and FCC.
1N/A
1N/AA C<MAYBE> string should contain at least one combining character
1N/Aor the like. For example, C<COMBINING ACUTE ACCENT> has
1N/Athe MAYBE_NFC/MAYBE_NFKC property.
1N/A
1N/ABoth C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
1N/Aand C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
1N/AC<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
1N/A(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
1N/Awhile C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
1N/A
1N/AIf you want to check exactly, compare the string with its NFC/NFKC/FCC;
1N/Ai.e.,
1N/A
    $string eq NFC($string)    # more thorough than checkNFC($string)
    $string eq NFKC($string)   # more thorough than checkNFKC($string)
    $string eq FCC($string)    # more thorough than checkFCC($string)
1N/A
1N/A=head2 Character Data
1N/A
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you do not need to
call them yourself.
1N/A
1N/A=over 4
1N/A
1N/A=item C<$canonical_decomposed = getCanon($codepoint)>
1N/A
1N/AIf the character of the specified codepoint is canonically
1N/Adecomposable (including Hangul Syllables),
1N/Areturns the B<completely decomposed> string canonically equivalent to it.
1N/A
1N/AIf it is not decomposable, returns C<undef>.
1N/A
1N/A=item C<$compatibility_decomposed = getCompat($codepoint)>
1N/A
1N/AIf the character of the specified codepoint is compatibility
1N/Adecomposable (including Hangul Syllables),
1N/Areturns the B<completely decomposed> string compatibility equivalent to it.
1N/A
1N/AIf it is not decomposable, returns C<undef>.
1N/A
1N/A=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
1N/A
1N/AIf two characters here and next (as codepoints) are composable
1N/A(including Hangul Jamo/Syllables and Composition Exclusions),
1N/Areturns the codepoint of the composite.
1N/A
1N/AIf they are not composable, returns C<undef>.
1N/A
1N/A=item C<$combining_class = getCombinClass($codepoint)>
1N/A
1N/AReturns the combining class of the character as an integer.
1N/A
1N/A=item C<$is_exclusion = isExclusion($codepoint)>
1N/A
1N/AReturns a boolean whether the character of the specified codepoint
1N/Ais a composition exclusion.
1N/A
1N/A=item C<$is_singleton = isSingleton($codepoint)>
1N/A
1N/AReturns a boolean whether the character of the specified codepoint is
1N/Aa singleton.
1N/A
1N/A=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
1N/A
1N/AReturns a boolean whether the canonical decomposition
1N/Aof the character of the specified codepoint
1N/Ais a Non-Starter Decomposition.
1N/A
1N/A=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
1N/A
1N/AReturns a boolean whether the character of the specified codepoint
1N/Amay be composed with the previous one in a certain composition
1N/A(including Hangul Compositions, but excluding
1N/AComposition Exclusions and Non-Starter Decompositions).
1N/A
1N/A=back
1N/A
1N/A=head2 EXPORT
1N/A
1N/AC<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
1N/A
C<normalize> and some other functions: on request.
1N/A
1N/A=head1 AUTHOR
1N/A
1N/ASADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
1N/A
1N/A  http://homepage1.nifty.com/nomenclator/perl/
1N/A
1N/A  Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
1N/A
1N/A  This module is free software; you can redistribute it
1N/A  and/or modify it under the same terms as Perl itself.
1N/A
1N/A=head1 SEE ALSO
1N/A
1N/A=over 4
1N/A
1N/A=item http://www.unicode.org/reports/tr15/
1N/A
1N/AUnicode Normalization Forms - UAX #15
1N/A
1N/A=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
1N/A
1N/ADerived Normalization Properties
1N/A
1N/A=item http://www.unicode.org/notes/tn5/
1N/A
1N/ACanonical Equivalence in Applications - UTN #5
1N/A
1N/A=back
1N/A
1N/A=cut
1N/A