cmd/spell/huff.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*  Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*    All Rights Reserved   */

#pragma ident   "%Z%%M% %I% %E% SMI"


#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>

#define BYTE 8
#define QW 1        /* width of bas-q digit in bits */

/*
 * this stuff should be local and hidden; it was made
 * accessible outside for dirty reasons: 20% faster spell
 */
#include "huff.h"
struct huff huffcode;

/*
 * Infinite Huffman code
 *
 * Let the messages be exponentially distributed with ratio r:
 *  P {message k} = r^k*(1-r),  k = 0, 1, ...
 * Let the messages be coded in base q, and suppose
 *  r^n = 1/q
 * If each decade(base q) contains n codes, then
 * the messages assigned to each decade will be q times
 * as probable as the next. Moreover the code for the tail of
 * the distribution after truncating one decade should look
 * just like the original, but longer by one leading digit q-1.
 *  q(z+n) = z + (q-1)q^w
 * where z is first code of decade, w is width of code, in shortest
 * full decade. Examples, base 2:
 *  r^1 = 1/2   r^5 = 1/2
 *  0       0110
 *  10      0111
 *  110     1000
 *  1110        1001
 *  ...     1010
 *          10110
 *  w = 1, z = 0        w = 4, z = 0110
 * Rewriting slightly
 *  (q-1)z + q*n = (q-1)q^w
 * whence z is a multiple of q and n is a multiple of q-1. Let
 *  z = cq, n = d(q-1)
 * We pick w to be the least integer such that
 *  d = n/(q-1) <= q^(w-1)
 * Then solve for c
 *  c = q^(w-1) - d
 * If c is not zero, the first decade may be preceded by
 * even shorter (w-1)-digit codes 0, 1, ..., c-1. Thus
 * the example code with r^5 = 1/2 becomes
 *  000
 *  001
 *  010
 *  0110
 *  0111
 *  1000
 *  1001
 *  1010
 *  10110
 *  ...
 *  110110
 *  ...
 * The expected number of base-q digits in a codeword is then
 *  w - 1 + r^c/(1-r^n)
 * The present routines require q to be a power of 2
 */
/*
 * There is a lot of hanky-panky with left justification against
 * sign instead of simple left justification because
 * unsigned long is not available
 */
#define L (BYTE*(sizeof (long))-1)  /* length of signless long */
#define MASK (~((unsigned long)1<<L))   /* mask out sign */

/*
 * decode the prefix of word y (which is left justified against sign)
 * place mesage number into place pointed to by kp
 * return length (in bits) of decoded prefix or 0 if code is out of
 * range
 */
int
decode(long y, long *pk)
{
    int l;
    long v;
    if (y < cs) {
        *pk = y >> (long)(L+QW-w);
        return (w-QW);
    }
    for (l = w, v = v0; y >= qcs;
        y = ((unsigned long)y << QW) & MASK, v += n)
        if ((l += QW) > L)
            return (0);
    *pk = v + (y>>(long)(L-w));
    return (l);
}

/*
 * encode message k and put result (right justified) into
 * place pointed to by py.
 * return length (in bits) of result,
 * or 0 if code is too long
 */

int
encode(long k, long *py)
{
    int l;
    long y;
    if (k < c) {
        *py = k;
        return (w-QW);
    }
    for (k -= c, y = 1, l = w; k >= n; k -= n, y <<= QW)
        if ((l += QW) > L)
            return (0);
    *py = ((y-1)<<w) + cq + k;
    return (l);
}


/*
 * Initialization code, given expected value of k
 * E(k) = r/(1-r) = a
 * and given base width b
 * return expected length of coded messages
 */
static struct qlog {
    long p;
    double u;
} z;

static struct qlog
qlog(double x, double y, long p, double u)  /* find smallest p so x^p<=y */
{

    if (u/x <= y) {
        z.p = 0;
        z.u = 1;
    } else {
        z = qlog(x, y, p+p, u*u);
        if (u*z.u/x > y) {
            z.p += p;
            z.u *= u;
        }
    }
    return (z);
}

double
huff(float a)
{
    int i, q;
    long d, j;
    double r = a/(1.0 + a);
    double rc, rq;

    for (i = 0, q = 1, rq = r; i < QW; i++, q *= 2, rq *= rq)
        continue;
    rq /= r;    /* rq = r^(q-1) */
    (void) qlog(rq, 1./q, 1L, rq);
    d = z.p;
    n = d*(q-1);
    if (n != d * (q - 1))
        abort();    /* time to make n long */
    for (w = QW, j = 1; j < d; w += QW, j *= q)
        continue;
    c = j - d;
    cq = c*q;
    cs = cq<<(L-w);
    qcs = (((long)(q-1)<<w) + cq) << (L-QW-w);
    v0 = c - cq;
    for (i = 0, rc = 1; i < c; i++, rc *= r)    /* rc = r^c */
        continue;
    return (w + QW*(rc/(1-z.u) - 1));
}

void
whuff(void)
{
    (void) fwrite((char *) & huffcode, sizeof (huffcode), 1, stdout);
}

int
rhuff(FILE *f)
{
    return (read(fileno(f), (char *)&huffcode, sizeof (huffcode)) ==
        sizeof (huffcode));
}