ext/Encode/encengine.c

1N/A/*
1N/AData structures for encoding transformations.
1N/A
1N/APerl works internally in either a native 'byte' encoding or
1N/Ain UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
1N/Arepresentation. When we do we can use utf8_to_uv().
1N/A
1N/AMost character encodings are either simple byte mappings or
1N/Avariable length multi-byte encodings. UTF-8 can be viewed as a
1N/Arather extreme case of the latter.
1N/A
So to solve an important part of perl's encode needs we need to solve the
"multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
case (where one of the multi-byte encodings will usually be UTF-8).
1N/A
1N/AThe other type of encoding is a shift encoding where a prefix sequence
1N/Adetermines what subsequent bytes mean. Such encodings have state.
1N/A
We also need to handle the case where a character in one encoding has to be
represented as multiple characters in the other, e.g. letter+diacritic.
1N/A
1N/AThe process can be considered as pseudo perl:
1N/A
1N/Amy $dst = '';
1N/Awhile (length($src))
1N/A {
  my $size    = src_count($src);
1N/A  my $in_seq  = substr($src,0,$size,'');
1N/A  my $out_seq = $s2d_hash{$in_seq};
1N/A  if (defined $out_seq)
1N/A   {
1N/A    $dst .= $out_seq;
1N/A   }
1N/A  else
1N/A   {
1N/A    # an error condition
1N/A   }
1N/A }
1N/Areturn $dst;
1N/A
1N/AThat has the following components:
1N/A &src_count - a "rule" for how many bytes make up the next character in the
1N/A              source.
1N/A %s2d_hash  - a mapping from input sequences to output sequences
1N/A
1N/AThe problem with that scheme is that it does not allow the output
1N/Acharacter repertoire to affect the characters considered from the
1N/Ainput.
1N/A
1N/ASo we use a "trie" representation which can also be considered
1N/Aa state machine:
1N/A
1N/Amy $dst   = '';
1N/Amy $seq   = \@s2d_seq;
1N/Amy $next  = \@s2d_next;
1N/Awhile (length($src))
1N/A {
  my $byte    = substr($src,0,1,'');
1N/A  my $out_seq = $seq->[$byte];
1N/A  if (defined $out_seq)
1N/A   {
1N/A    $dst .= $out_seq;
1N/A   }
1N/A  else
1N/A   {
1N/A    # an error condition
1N/A   }
1N/A  ($next,$seq) = @$next->[$byte] if $next;
1N/A }
1N/Areturn $dst;
1N/A
1N/AThere is now a pair of data structures to represent everything.
It is valid for the output sequence at a particular point to
be defined but zero length; that just means "don't know yet".
For the single byte case there is no 'next', so the new tables will be the same as
the original tables. For a multi-byte case a prefix byte will flip to the tables
for the next page (adding nothing to the output), then the tables for the page
will provide the actual output and set the tables back to the original base page.
1N/A
1N/AThis scheme can also handle shift encodings.
1N/A
A slight enhancement to the scheme also allows for look-ahead - if
we add a flag to re-add the removed byte to the source we could handle
  a" -> ä
  ab -> a (and take b back please)
1N/A
1N/A*/
1N/A
1N/A#include <EXTERN.h>
1N/A#include <perl.h>
1N/A#define U8 U8
1N/A#include "encode.h"
1N/A
1N/Aint
1N/Ado_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
1N/A      STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
1N/A{
1N/A    const U8 *s = src;
1N/A    const U8 *send = s + *slen;
1N/A    const U8 *last = s;
1N/A    U8 *d = dst;
1N/A    U8 *dend = d + dlen, *dlast = d;
1N/A    int code = 0;
1N/A    while (s < send) {
1N/A    encpage_t *e = enc;
1N/A    U8 byte = *s;
1N/A    while (byte > e->max)
1N/A        e++;
1N/A    if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
1N/A        const U8 *cend = s + (e->slen & 0x7f);
1N/A        if (cend <= send) {
1N/A        STRLEN n;
1N/A        if ((n = e->dlen)) {
1N/A            const U8 *out = e->seq + n * (byte - e->min);
1N/A            U8 *oend = d + n;
1N/A            if (dst) {
1N/A            if (oend <= dend) {
1N/A                while (d < oend)
1N/A                *d++ = *out++;
1N/A            }
1N/A            else {
1N/A                /* Out of space */
1N/A                code = ENCODE_NOSPACE;
1N/A                break;
1N/A            }
1N/A            }
1N/A            else
1N/A            d = oend;
1N/A        }
1N/A        enc = e->next;
1N/A        s++;
1N/A        if (s == cend) {
1N/A            if (approx && (e->slen & 0x80))
1N/A            code = ENCODE_FALLBACK;
1N/A            last = s;
1N/A            if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
1N/A              code = ENCODE_FOUND_TERM;
1N/A              break;
1N/A            }
1N/A            dlast = d;
1N/A        }
1N/A        }
1N/A        else {
1N/A        /* partial source character */
1N/A        code = ENCODE_PARTIAL;
1N/A        break;
1N/A        }
1N/A    }
1N/A    else {
1N/A        /* Cannot represent */
1N/A        code = ENCODE_NOREP;
1N/A        break;
1N/A    }
1N/A    }
1N/A    *slen = last - src;
1N/A    *dout = d - dst;
1N/A    return code;
1N/A}