t/pragma/utf8.t

	utf8.t revision 7c478bd95313f5f23a4c958a745db2134aa03244
#!./perl

BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
    $ENV{PERL5LIB} = '../lib';
    if ( ord("\t") != 9 ) { # skip on ebcdic platforms
        print "1..0 # Skip utf8 tests on ebcdic platform.\n";
        exit;
    }
}

print "1..90\n";

my $test = 1;

sub ok {
    my ($got,$expect) = @_;
    print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
    print "ok $test\n";
}

sub nok {
    my ($got,$expect) = @_;
    print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
    print "ok $test\n";
}

sub ok_bytes {
    use bytes;
    my ($got,$expect) = @_;
    print "# expected [$expect], got [$got]\nnot " if $got ne $expect;
    print "ok $test\n";
}

sub nok_bytes {
    use bytes;
    my ($got,$expect) = @_;
    print "# expected not [$expect], got [$got]\nnot " if $got eq $expect;
    print "ok $test\n";
}

{
    use utf8;
    $_ = ">\x{263A}<";
    s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg;
    ok $_, '>&#9786;<';
    $test++;				# 1

    $_ = ">\x{263A}<";
    my $rx = "\x{80}-\x{10ffff}";
    s/([$rx])/"&#".ord($1).";"/eg;
    ok $_, '>&#9786;<';
    $test++;				# 2

    $_ = ">\x{263A}<";
    my $rx = "\\x{80}-\\x{10ffff}";
    s/([$rx])/"&#".ord($1).";"/eg;
    ok $_, '>&#9786;<';
    $test++;				# 3

    $_ = "alpha,numeric";
    m/([[:alpha:]]+)/;
    ok $1, 'alpha';
    $test++;				# 4

    $_ = "alphaNUMERICstring";
    m/([[:^lower:]]+)/;
    ok $1, 'NUMERIC';
    $test++;				# 5

    $_ = "alphaNUMERICstring";
    m/(\p{Ll}+)/;
    ok $1, 'alpha';
    $test++;				# 6

    $_ = "alphaNUMERICstring";
    m/(\p{Lu}+)/;
    ok $1, 'NUMERIC';
    $test++;				# 7

    $_ = "alpha,numeric";
    m/([\p{IsAlpha}]+)/;
    ok $1, 'alpha';
    $test++;				# 8

    $_ = "alphaNUMERICstring";
    m/([^\p{IsLower}]+)/;
    ok $1, 'NUMERIC';
    $test++;				# 9

    $_ = "alpha123numeric456";
    m/([\p{IsDigit}]+)/;
    ok $1, '123';
    $test++;				# 10

    $_ = "alpha123numeric456";
    m/([^\p{IsDigit}]+)/;
    ok $1, 'alpha';
    $test++;				# 11

    $_ = ",123alpha,456numeric";
    m/([\p{IsAlnum}]+)/;
    ok $1, '123alpha';
    $test++;				# 12
}

{
    use utf8;

    $_ = "\x{263A}>\x{263A}\x{263A}";

    ok length, 4;
    $test++;				# 13

    ok length((m/>(.)/)[0]), 1;
    $test++;				# 14

    ok length($&), 2;
    $test++;				# 15

    ok length($'), 1;
    $test++;				# 16

    ok length($`), 1;
    $test++;				# 17

    ok length($1), 1;
    $test++;				# 18

    ok length($tmp=$&), 2;
    $test++;				# 19

    ok length($tmp=$'), 1;
    $test++;				# 20

    ok length($tmp=$`), 1;
    $test++;				# 21

    ok length($tmp=$1), 1;
    $test++;				# 22

    {
	use bytes;

	my $tmp = $&;
	ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
	$test++;				# 23

	$tmp = $';
	ok $tmp, pack("C*", 0342, 0230, 0272);
	$test++;				# 24

	$tmp = $`;
	ok $tmp, pack("C*", 0342, 0230, 0272);
	$test++;				# 25

	$tmp = $1;
	ok $tmp, pack("C*", 0342, 0230, 0272);
	$test++;				# 26
    }

    ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272);
    $test++;				# 27

    ok_bytes $', pack("C*", 0342, 0230, 0272);
    $test++;				# 28

    ok_bytes $`, pack("C*", 0342, 0230, 0272);
    $test++;				# 29

    ok_bytes $1, pack("C*", 0342, 0230, 0272);
    $test++;				# 30

    {
	use bytes;
	no utf8;

	ok length, 10;
	$test++;				# 31

    	ok length((m/>(.)/)[0]), 1;
    	$test++;				# 32

    	ok length($&), 2;
    	$test++;				# 33

    	ok length($'), 5;
    	$test++;				# 34

    	ok length($`), 3;
    	$test++;				# 35

    	ok length($1), 1;
    	$test++;				# 36

	ok $&, pack("C*", ord(">"), 0342);
	$test++;				# 37

	ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
	$test++;				# 38

	ok $`, pack("C*", 0342, 0230, 0272);
	$test++;				# 39

	ok $1, pack("C*", 0342);
	$test++;				# 40
    }

    {
	no utf8;
	$_="\342\230\272>\342\230\272\342\230\272";
    }

    ok length, 10;
    $test++;				# 41

    ok length((m/>(.)/)[0]), 1;
    $test++;				# 42

    ok length($&), 2;
    $test++;				# 43

    ok length($'), 1;
    $test++;				# 44

    ok length($`), 1;
    $test++;				# 45

    ok length($1), 1;
    $test++;				# 46

    ok length($tmp=$&), 2;
    $test++;				# 47

    ok length($tmp=$'), 1;
    $test++;				# 48

    ok length($tmp=$`), 1;
    $test++;				# 49

    ok length($tmp=$1), 1;
    $test++;				# 50

    {
	use bytes;

        my $tmp = $&;
	ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
	$test++;				# 51

        $tmp = $';
	ok $tmp, pack("C*", 0342, 0230, 0272);
	$test++;				# 52

        $tmp = $`;
	ok $tmp, pack("C*", 0342, 0230, 0272);
	$test++;				# 53

        $tmp = $1;
	ok $tmp, pack("C*", 0342, 0230, 0272);
	$test++;				# 54
    }

    {
	use bytes;
	no utf8;

	ok length, 10;
	$test++;				# 55

    	ok length((m/>(.)/)[0]), 1;
    	$test++;				# 56

    	ok length($&), 2;
    	$test++;				# 57

    	ok length($'), 5;
    	$test++;				# 58

    	ok length($`), 3;
    	$test++;				# 59

    	ok length($1), 1;
    	$test++;				# 60

	ok $&, pack("C*", ord(">"), 0342);
	$test++;				# 61

	ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
	$test++;				# 62

	ok $`, pack("C*", 0342, 0230, 0272);
	$test++;				# 63

	ok $1, pack("C*", 0342);
	$test++;				# 64
    }

    ok "\x{ab}" =~ /^\x{ab}$/, 1;
    $test++;					# 65
}

{
    use utf8;
    ok join(" ",unpack("C*",chr(128).chr(255))), "128 255";
    $test++;
}

{
    use utf8;
    my @a = map ord, split(//, join("", map chr, (1234, 123, 2345)));
    ok "@a", "1234 123 2345";
    $test++;                # 67
}

{
    use utf8;
    my $x = chr(123);
    my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345)));
    ok "@a", "1234 2345";
    $test++;                # 68
}

{
    # bug id 20001009.001

    my ($a, $b);

    { use bytes; $a = "\xc3\xa4" }
    { use utf8;  $b = "\xe4"     } # \xXX must not produce UTF-8

    print "not " if $a eq $b;
    print "ok $test\n"; $test++;

    { use utf8; print "not " if $a eq $b; }
    print "ok $test\n"; $test++;
}

{
    # bug id 20001008.001

    my @x = ("stra\337e 138","stra\337e 138");
    for (@x) {
	s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
	my($latin) = /^(.+)(?:\s+\d)/;
	print $latin eq "stra\337e" ? "ok $test\n" :
	    "#latin[$latin]\nnot ok $test\n";
	$test++;
	$latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
	use utf8;
	$latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a
    }
}

{
    # bug id 20000427.003

    use utf8;
    use warnings;
    use strict;

    my $sushi = "\x{b36c}\x{5a8c}\x{ff5b}\x{5079}\x{505b}";

    my @charlist = split //, $sushi;
    my $r = '';
    foreach my $ch (@charlist) {
	$r = $r . " " . sprintf "U+%04X", ord($ch);
    }

    print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B";
    print "ok $test\n";
    $test++;
}

{
    # bug id 20000426.003

    use utf8;

    my $s = "\x20\x40\x{80}\x{100}\x{80}\x40\x20";

    my ($a, $b, $c) = split(/\x40/, $s);
    print "not "
	unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a;
    print "ok $test\n";
    $test++;

    my ($a, $b) = split(/\x{100}/, $s);
    print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20";
    print "ok $test\n";
    $test++;

    my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s);
    print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20";
    print "ok $test\n";
    $test++;

    my ($a, $b) = split(/\x40\x{80}/, $s);
    print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20";
    print "ok $test\n";
    $test++;

    my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s);
    print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20";
    print "ok $test\n";
    $test++;
}

{
    # bug id 20000730.004

    use utf8;

    my $smiley = "\x{263a}";

    for my $s ("\x{263a}",                     #  1
	       $smiley,                        #  2

	       "" . $smiley,                   #  3
	       "" . "\x{263a}",                #  4

	       $smiley    . "",                #  5
	       "\x{263a}" . "",                #  6
	       ) {
	my $length_chars = length($s);
	my $length_bytes;
	{ use bytes; $length_bytes = length($s) }
	my @regex_chars = $s =~ m/(.)/g;
	my $regex_chars = @regex_chars;
	my @split_chars = split //, $s;
	my $split_chars = @split_chars;
	print "not "
	    unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
		   "1/1/1/3";
	print "ok $test\n";
	$test++;
    }

    for my $s ("\x{263a}" . "\x{263a}",        #  7
	       $smiley    . $smiley,           #  8

	       "\x{263a}\x{263a}",             #  9
	       "$smiley$smiley",               # 10

	       "\x{263a}" x 2,                 # 11
	       $smiley    x 2,                 # 12
	       ) {
	my $length_chars = length($s);
	my $length_bytes;
	{ use bytes; $length_bytes = length($s) }
	my @regex_chars = $s =~ m/(.)/g;
	my $regex_chars = @regex_chars;
	my @split_chars = split //, $s;
	my $split_chars = @split_chars;
	print "not "
	    unless "$length_chars/$regex_chars/$split_chars/$length_bytes" eq
		   "2/2/2/6";
	print "ok $test\n";
	$test++;
    }
}