1N/A/*
1N/A $Id: Unicode.xs,v 1.9 2003/12/29 02:47:16 dankogai Exp dankogai $
1N/A */
1N/A
1N/A#define PERL_NO_GET_CONTEXT
1N/A#include "EXTERN.h"
1N/A#include "perl.h"
1N/A#include "XSUB.h"
1N/A#define U8 U8
1N/A#include "../Encode/encode.h"
1N/A
1N/A#define FBCHAR 0xFFFd
1N/A#define BOM_BE 0xFeFF
1N/A#define BOM16LE 0xFFFe
1N/A#define BOM32LE 0xFFFe0000
1N/A#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
1N/A#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
1N/A#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
1N/A#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
1N/A
1N/Astatic UV
1N/Aenc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
1N/A{
1N/A U8 *s = *sp;
1N/A UV v = 0;
1N/A if (s+size > e) {
1N/A croak("Partial character %c",(char) endian);
1N/A }
1N/A switch(endian) {
1N/A case 'N':
1N/A v = *s++;
1N/A v = (v << 8) | *s++;
1N/A case 'n':
1N/A v = (v << 8) | *s++;
1N/A v = (v << 8) | *s++;
1N/A break;
1N/A case 'V':
1N/A case 'v':
1N/A v |= *s++;
1N/A v |= (*s++ << 8);
1N/A if (endian == 'v')
1N/A break;
1N/A v |= (*s++ << 16);
1N/A v |= (*s++ << 24);
1N/A break;
1N/A default:
1N/A croak("Unknown endian %c",(char) endian);
1N/A break;
1N/A }
1N/A *sp = s;
1N/A return v;
1N/A}
1N/A
1N/Avoid
1N/Aenc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
1N/A{
1N/A U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1);
1N/A switch(endian) {
1N/A case 'v':
1N/A case 'V':
1N/A d += SvCUR(result);
1N/A SvCUR_set(result,SvCUR(result)+size);
1N/A while (size--) {
1N/A *d++ = (U8)(value & 0xFF);
1N/A value >>= 8;
1N/A }
1N/A break;
1N/A case 'n':
1N/A case 'N':
1N/A SvCUR_set(result,SvCUR(result)+size);
1N/A d += SvCUR(result);
1N/A while (size--) {
1N/A *--d = (U8)(value & 0xFF);
1N/A value >>= 8;
1N/A }
1N/A break;
1N/A default:
1N/A croak("Unknown endian %c",(char) endian);
1N/A break;
1N/A }
1N/A}
1N/A
1N/AMODULE = Encode::Unicode PACKAGE = Encode::Unicode
1N/A
1N/APROTOTYPES: DISABLE
1N/A
1N/A#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
1N/A *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
1N/A
1N/Avoid
1N/Adecode_xs(obj, str, check = 0)
1N/ASV * obj
1N/ASV * str
1N/AIV check
1N/ACODE:
1N/A{
1N/A U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
1N/A int size = SvIV(attr("size", 4));
1N/A int ucs2 = SvTRUE(attr("ucs2", 4));
1N/A int clone = SvTRUE(attr("clone", 5));
1N/A SV *result = newSVpvn("",0);
1N/A STRLEN ulen;
1N/A U8 *s = (U8 *)SvPVbyte(str,ulen);
1N/A U8 *e = (U8 *)SvEND(str);
1N/A ST(0) = sv_2mortal(result);
1N/A SvUTF8_on(result);
1N/A
1N/A if (!endian && s+size <= e) {
1N/A UV bom;
1N/A endian = (size == 4) ? 'N' : 'n';
1N/A bom = enc_unpack(aTHX_ &s,e,size,endian);
1N/A if (bom != BOM_BE) {
1N/A if (bom == BOM16LE) {
1N/A endian = 'v';
1N/A }
1N/A else if (bom == BOM32LE) {
1N/A endian = 'V';
1N/A }
1N/A else {
1N/A croak("%"SVf":Unrecognised BOM %"UVxf,
1N/A *hv_fetch((HV *)SvRV(obj),"Name",4,0),
1N/A bom);
1N/A }
1N/A }
1N/A#if 1
1N/A /* Update endian for next sequence */
1N/A if (clone) {
1N/A hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
1N/A }
1N/A#endif
1N/A }
1N/A while (s < e && s+size <= e) {
1N/A UV ord = enc_unpack(aTHX_ &s,e,size,endian);
1N/A U8 *d;
1N/A if (size != 4 && invalid_ucs2(ord)) {
1N/A if (ucs2) {
1N/A if (check) {
1N/A croak("%"SVf":no surrogates allowed %"UVxf,
1N/A *hv_fetch((HV *)SvRV(obj),"Name",4,0),
1N/A ord);
1N/A }
1N/A if (s+size <= e) {
1N/A /* skip the next one as well */
1N/A enc_unpack(aTHX_ &s,e,size,endian);
1N/A }
1N/A ord = FBCHAR;
1N/A }
1N/A else {
1N/A UV lo;
1N/A if (!isHiSurrogate(ord)) {
1N/A croak("%"SVf":Malformed HI surrogate %"UVxf,
1N/A *hv_fetch((HV *)SvRV(obj),"Name",4,0),
1N/A ord);
1N/A }
1N/A if (s+size > e) {
1N/A /* Partial character */
1N/A s -= size; /* back up to 1st half */
1N/A break; /* And exit loop */
1N/A }
1N/A lo = enc_unpack(aTHX_ &s,e,size,endian);
1N/A if (!isLoSurrogate(lo)){
1N/A croak("%"SVf":Malformed LO surrogate %"UVxf,
1N/A *hv_fetch((HV *)SvRV(obj),"Name",4,0),
1N/A ord);
1N/A }
1N/A ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
1N/A }
1N/A }
1N/A d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
1N/A d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
1N/A SvCUR_set(result,d - (U8 *)SvPVX(result));
1N/A }
1N/A if (s < e) {
1N/A /* unlikely to happen because it's fixed-length -- dankogai */
1N/A if (check & ENCODE_WARN_ON_ERR){
1N/A Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
1N/A *hv_fetch((HV *)SvRV(obj),"Name",4,0));
1N/A }
1N/A }
1N/A if (check && !(check & ENCODE_LEAVE_SRC)){
1N/A if (s < e) {
1N/A Move(s,SvPVX(str),e-s,U8);
1N/A SvCUR_set(str,(e-s));
1N/A }
1N/A else {
1N/A SvCUR_set(str,0);
1N/A }
1N/A *SvEND(str) = '\0';
1N/A }
1N/A XSRETURN(1);
1N/A}
1N/A
1N/Avoid
1N/Aencode_xs(obj, utf8, check = 0)
1N/ASV * obj
1N/ASV * utf8
1N/AIV check
1N/ACODE:
1N/A{
1N/A U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
1N/A int size = SvIV(attr("size", 4));
1N/A int ucs2 = SvTRUE(attr("ucs2", 4));
1N/A int clone = SvTRUE(attr("clone", 5));
1N/A SV *result = newSVpvn("",0);
1N/A STRLEN ulen;
1N/A U8 *s = (U8 *)SvPVutf8(utf8,ulen);
1N/A U8 *e = (U8 *)SvEND(utf8);
1N/A ST(0) = sv_2mortal(result);
1N/A if (!endian) {
1N/A endian = (size == 4) ? 'N' : 'n';
1N/A enc_pack(aTHX_ result,size,endian,BOM_BE);
1N/A#if 1
1N/A /* Update endian for next sequence */
1N/A if (clone){
1N/A hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
1N/A }
1N/A#endif
1N/A }
1N/A while (s < e && s+UTF8SKIP(s) <= e) {
1N/A STRLEN len;
1N/A UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
1N/A s += len;
1N/A if (size != 4 && invalid_ucs2(ord)) {
1N/A if (!issurrogate(ord)){
1N/A if (ucs2) {
1N/A if (check) {
1N/A croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
1N/A *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
1N/A }
1N/A enc_pack(aTHX_ result,size,endian,FBCHAR);
1N/A }else{
1N/A UV hi = ((ord - 0x10000) >> 10) + 0xD800;
1N/A UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
1N/A enc_pack(aTHX_ result,size,endian,hi);
1N/A enc_pack(aTHX_ result,size,endian,lo);
1N/A }
1N/A }
1N/A else {
1N/A /* not supposed to happen */
1N/A enc_pack(aTHX_ result,size,endian,FBCHAR);
1N/A }
1N/A }
1N/A else {
1N/A enc_pack(aTHX_ result,size,endian,ord);
1N/A }
1N/A }
1N/A if (s < e) {
1N/A /* UTF-8 partial char happens often on PerlIO.
1N/A Since this is okay and normal, we do not warn.
1N/A But this is critical when you choose to LEAVE_SRC
1N/A in which case we die */
1N/A if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
1N/A Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
1N/A "when CHECK = 0x%" UVuf,
1N/A *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
1N/A }
1N/A
1N/A }
1N/A if (check && !(check & ENCODE_LEAVE_SRC)){
1N/A if (s < e) {
1N/A Move(s,SvPVX(utf8),e-s,U8);
1N/A SvCUR_set(utf8,(e-s));
1N/A }
1N/A else {
1N/A SvCUR_set(utf8,0);
1N/A }
1N/A *SvEND(utf8) = '\0';
1N/A }
1N/A XSRETURN(1);
1N/A}
1N/A