#!./perl

# Core-test preamble: run from inside t/ when that directory exists, and
# resolve modules from the uninstalled ../lib only.
BEGIN {
    if (-d 't') {
        chdir 't';
    }
    @INC = ('../lib');
}

# Look at the first byte of the internal encoding of character 256.
# 196 (0xc4) is what UTF-8 produces; 140 is treated as the UTF-EBCDIC
# signature, in which case the whole file is skipped.  Any other value is
# only reported as a diagnostic.
{
    my $wide_char = v256;
    use bytes;
    my $lead_byte = ord $wide_char;
    printf "# under use bytes ord(v256) = 0x%02x\n", $lead_byte;
    unless ($lead_byte == 196) {
        if ($lead_byte == 140) {
            print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
            exit 0;
        }
        printf "# v256 starts with 0x%02x\n", $lead_byte;
    }
}

# The string literals in this file are byte strings, not UTF-8 source text.
no utf8;

# Test plan, followed by the running test number used for the "ok" lines.
print "1..78\n";

my $test = 1;

# Test vectors adapted from Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
# version dated 2000-09-02.
#
# Bytes in the ranges \x00-\x1f and \x7f-\xff are spelled with \x escapes
# instead of raw binary because e.g. many patch programs have issues with
# binary data.  The here-document interpolates, so each escape has already
# become a real byte when the rows are split apart.
#
# Row layout (whitespace separated), as matched by the row pattern later in
# this file:
#   id  y|n  "bytes"  code-point-hex|-  byte-length  hex:bytes  [count|- [warning text]]
# Section headings ("2.1 ...") and lines starting with '#' carry no test.

my @MK = split /\n/, <<"__EOMK__";
1 Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
2 Boundary conditions
2.1 First possible sequence of certain length
2.1.1 y "\x00" 0 1 00 1
2.1.2 y "\xc2\x80" 80 2 c2:80 1
2.1.3 y "\xe0\xa0\x80" 800 3 e0:a0:80 1
2.1.4 y "\xf0\x90\x80\x80" 10000 4 f0:90:80:80 1
2.1.5 y "\xf8\x88\x80\x80\x80" 200000 5 f8:88:80:80:80 1
2.1.6 y "\xfc\x84\x80\x80\x80\x80" 4000000 6 fc:84:80:80:80:80 1
2.2 Last possible sequence of certain length
2.2.1 y "\x7f" 7f 1 7f 1
2.2.2 y "\xdf\xbf" 7ff 2 df:bf 1
# The ffff is illegal unless UTF8_ALLOW_FFFF
2.2.3 n "\xef\xbf\xbf" ffff 3 ef:bf:bf 1 character 0xffff
2.2.4 y "\xf7\xbf\xbf\xbf" 1fffff 4 f7:bf:bf:bf 1
2.2.5 y "\xfb\xbf\xbf\xbf\xbf" 3ffffff 5 fb:bf:bf:bf:bf 1
2.2.6 y "\xfd\xbf\xbf\xbf\xbf\xbf" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
2.3 Other boundary conditions
2.3.1 y "\xed\x9f\xbf" d7ff 3 ed:9f:bf 1
2.3.2 y "\xee\x80\x80" e000 3 ee:80:80 1
2.3.3 y "\xef\xbf\xbd" fffd 3 ef:bf:bd 1
2.3.4 y "\xf4\x8f\xbf\xbf" 10ffff 4 f4:8f:bf:bf 1
2.3.5 y "\xf4\x90\x80\x80" 110000 4 f4:90:80:80 1
3 Malformed sequences
3.1 Unexpected continuation bytes
3.1.1 n "\x80" - 1 80 - unexpected continuation byte 0x80
3.1.2 n "\xbf" - 1 bf - unexpected continuation byte 0xbf
3.1.3 n "\x80\xbf" - 2 80:bf - unexpected continuation byte 0x80
3.1.4 n "\x80\xbf\x80" - 3 80:bf:80 - unexpected continuation byte 0x80
3.1.5 n "\x80\xbf\x80\xbf" - 4 80:bf:80:bf - unexpected continuation byte 0x80
3.1.6 n "\x80\xbf\x80\xbf\x80" - 5 80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.7 n "\x80\xbf\x80\xbf\x80\xbf" - 6 80:bf:80:bf:80:bf - unexpected continuation byte 0x80
3.1.8 n "\x80\xbf\x80\xbf\x80\xbf\x80" - 7 80:bf:80:bf:80:bf:80 - unexpected continuation byte 0x80
3.1.9 n "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf - unexpected continuation byte 0x80
3.2 Lonely start characters
3.2.1 n "\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 - unexpected non-continuation byte 0x20 after start byte 0xc0
3.2.2 n "\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 - unexpected non-continuation byte 0x20 after start byte 0xe0
3.2.3 n "\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 - unexpected non-continuation byte 0x20 after start byte 0xf0
3.2.4 n "\xf8 \xf9 \xfa \xfb " - 8 f8:20:f9:20:fa:20:fb:20 - unexpected non-continuation byte 0x20 after start byte 0xf8
3.2.5 n "\xfc \xfd " - 4 fc:20:fd:20 - unexpected non-continuation byte 0x20 after start byte 0xfc
3.3 Sequences with last continuation byte missing
3.3.1 n "\xc0" - 1 c0 - 1 byte, need 2
3.3.2 n "\xe0\x80" - 2 e0:80 - 2 bytes, need 3
3.3.3 n "\xf0\x80\x80" - 3 f0:80:80 - 3 bytes, need 4
3.3.4 n "\xf8\x80\x80\x80" - 4 f8:80:80:80 - 4 bytes, need 5
3.3.5 n "\xfc\x80\x80\x80\x80" - 5 fc:80:80:80:80 - 5 bytes, need 6
3.3.6 n "\xdf" - 1 df - 1 byte, need 2
3.3.7 n "\xef\xbf" - 2 ef:bf - 2 bytes, need 3
3.3.8 n "\xf7\xbf\xbf" - 3 f7:bf:bf - 3 bytes, need 4
3.3.9 n "\xfb\xbf\xbf\xbf" - 4 fb:bf:bf:bf - 4 bytes, need 5
3.3.10 n "\xfd\xbf\xbf\xbf\xbf" - 5 fd:bf:bf:bf:bf - 5 bytes, need 6
3.4 Concatenation of incomplete sequences
3.4.1 n "\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf - unexpected non-continuation byte 0xe0 after start byte 0xc0
3.5 Impossible bytes
3.5.1 n "\xfe" - 1 fe - byte 0xfe
3.5.2 n "\xff" - 1 ff - byte 0xff
3.5.3 n "\xfe\xfe\xff\xff" - 4 fe:fe:ff:ff - byte 0xfe
4 Overlong sequences
4.1 Examples of an overlong ASCII character
4.1.1 n "\xc0\xaf" - 2 c0:af - 2 bytes, need 1
4.1.2 n "\xe0\x80\xaf" - 3 e0:80:af - 3 bytes, need 1
4.1.3 n "\xf0\x80\x80\xaf" - 4 f0:80:80:af - 4 bytes, need 1
4.1.4 n "\xf8\x80\x80\x80\xaf" - 5 f8:80:80:80:af - 5 bytes, need 1
4.1.5 n "\xfc\x80\x80\x80\x80\xaf" - 6 fc:80:80:80:80:af - 6 bytes, need 1
4.2 Maximum overlong sequences
4.2.1 n "\xc1\xbf" - 2 c1:bf - 2 bytes, need 1
4.2.2 n "\xe0\x9f\xbf" - 3 e0:9f:bf - 3 bytes, need 2
4.2.3 n "\xf0\x8f\xbf\xbf" - 4 f0:8f:bf:bf - 4 bytes, need 3
4.2.4 n "\xf8\x87\xbf\xbf\xbf" - 5 f8:87:bf:bf:bf - 5 bytes, need 4
4.2.5 n "\xfc\x83\xbf\xbf\xbf\xbf" - 6 fc:83:bf:bf:bf:bf - 6 bytes, need 5
4.3 Overlong representation of the NUL character
4.3.1 n "\xc0\x80" - 2 c0:80 - 2 bytes, need 1
4.3.2 n "\xe0\x80\x80" - 3 e0:80:80 - 3 bytes, need 1
4.3.3 n "\xf0\x80\x80\x80" - 4 f0:80:80:80 - 4 bytes, need 1
4.3.4 n "\xf8\x80\x80\x80\x80" - 5 f8:80:80:80:80 - 5 bytes, need 1
4.3.5 n "\xfc\x80\x80\x80\x80\x80" - 6 fc:80:80:80:80:80 - 6 bytes, need 1
5 Illegal code positions
5.1 Single UTF-16 surrogates
5.1.1 n "\xed\xa0\x80" - 3 ed:a0:80 - UTF-16 surrogate 0xd800
5.1.2 n "\xed\xad\xbf" - 3 ed:ad:bf - UTF-16 surrogate 0xdb7f
5.1.3 n "\xed\xae\x80" - 3 ed:ae:80 - UTF-16 surrogate 0xdb80
5.1.4 n "\xed\xaf\xbf" - 3 ed:af:bf - UTF-16 surrogate 0xdbff
5.1.5 n "\xed\xb0\x80" - 3 ed:b0:80 - UTF-16 surrogate 0xdc00
5.1.6 n "\xed\xbe\x80" - 3 ed:be:80 - UTF-16 surrogate 0xdf80
5.1.7 n "\xed\xbf\xbf" - 3 ed:bf:bf - UTF-16 surrogate 0xdfff
5.2 Paired UTF-16 surrogates
5.2.1 n "\xed\xa0\x80\xed\xb0\x80" - 6 ed:a0:80:ed:b0:80 - UTF-16 surrogate 0xd800
5.2.2 n "\xed\xa0\x80\xed\xbf\xbf" - 6 ed:a0:80:ed:bf:bf - UTF-16 surrogate 0xd800
5.2.3 n "\xed\xad\xbf\xed\xb0\x80" - 6 ed:ad:bf:ed:b0:80 - UTF-16 surrogate 0xdb7f
5.2.4 n "\xed\xad\xbf\xed\xbf\xbf" - 6 ed:ad:bf:ed:bf:bf - UTF-16 surrogate 0xdb7f
5.2.5 n "\xed\xae\x80\xed\xb0\x80" - 6 ed:ae:80:ed:b0:80 - UTF-16 surrogate 0xdb80
5.2.6 n "\xed\xae\x80\xed\xbf\xbf" - 6 ed:ae:80:ed:bf:bf - UTF-16 surrogate 0xdb80
5.2.7 n "\xed\xaf\xbf\xed\xb0\x80" - 6 ed:af:bf:ed:b0:80 - UTF-16 surrogate 0xdbff
5.2.8 n "\xed\xaf\xbf\xed\xbf\xbf" - 6 ed:af:bf:ed:bf:bf - UTF-16 surrogate 0xdbff
5.3 Other illegal code positions
5.3.1 n "\xef\xbf\xbe" - 3 ef:bf:be - byte order mark 0xfffe
# The ffff is illegal unless UTF8_ALLOW_FFFF
5.3.2 n "\xef\xbf\xbf" - 3 ef:bf:bf - character 0xffff
__EOMK__

# Feed every row of @MK to unpack('U0U*') and print one TAP result per row.
# A row flagged 'y' must decode without any warning.  A row flagged 'n' must
# raise a warning, and when the row carries an expected message the warning
# text must match it.
{
    my $id;          # id of the row under test; prefixes every diagnostic
    my $warnings;    # text of all warnings raised since the last reset

    # Echo each warning as a TAP comment and remember it.  The text is
    # appended rather than overwritten: an input holding several malformed
    # sequences may warn more than once, and the expected message in the
    # table describes the first malformation, not the last.
    local $SIG{__WARN__} = sub {
        print "# $id: @_";
        $warnings .= "@_";
    };

    # Print a diagnostic about the current row, prefixed with its id.
    sub moan {
        print "$id: @_";
    }

    # Decode $_[0] with unpack('U0U*') and return the text of every warning
    # this raised, or '' when the decode was silent.  The decoded values
    # themselves are discarded.
    sub warn_unpack_U {
        $warnings = '';
        my @null = unpack('U0U*', $_[0]);
        return $warnings;
    }

    for (@MK) {
        if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
            # Section heading or comment line: nothing to test.
        } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+((?:\d+|-)(?:\s+(.+))?))?$/) {
            $id = $1;
            # $7 is the whole optional tail (count plus message); $8 is only
            # the message and is undef when the row has none.  $Unicode and
            # $charslen are not checked here; they are kept to name the
            # columns.
            my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen, $experr) =
                ($2, $3, $4, $5, $6, $7, $8);

            # Sanity-check the table itself: the hex dump and the byte
            # string must both agree with the declared byte length.
            my @hex = split(/:/, $hex);
            unless (@hex == $byteslen) {
                my $nhex = @hex;
                moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
            }
            {
                use bytes;
                my $bytesbyteslen = length($bytes);
                unless ($bytesbyteslen == $byteslen) {
                    moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
                }
            }

            my $warn = warn_unpack_U($bytes);
            if ($okay eq 'y') {
                if ($warn) {
                    moan "unpack('U0U*') false negative\n";
                    print "not ";
                }
            } elsif ($okay eq 'n') {
                # '!' is used instead of 'not' on purpose: 'not' binds more
                # loosely than '||', so "not $warn || (...)" negated the
                # whole disjunction and this branch could never fire for a
                # row that has an expected message.
                if (!$warn
                    || (defined $experr && $experr ne '' && $warn !~ /$experr/)) {
                    moan "unpack('U0U*') false positive\n";
                    print "not ";
                }
            }
            print "ok $test # $id $okay\n";
            $test++;
        } else {
            moan "unknown format\n";
        }
    }
}