cmd/pzip/rectify.c

/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1998-2011 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                 Eclipse Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*          http://www.eclipse.org/org/documents/epl-v10.html           *
*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                                                                      *
***********************************************************************/
#pragma prototyped

/*
 * induce fixed length record groups from data
 */

static const char usage[] =
"[-?\n@(#)$Id: rectify (AT&T Research) 1999-03-22 $\n]"
USAGE_LICENSE
"[+NAME?rectify - induce fixed length record groups from data]"
"[+DESCRIPTION?\brectify\b induces fixed length record groups from input data"
"   by sampling and comparing character frequencies. The standard input is"
"   read if \a-\a or no files are specified.]"

"[c:context?List \acontext\a records at the beginning and end of"
"   record groups larger that 3*\acontext\a.]#[context]"
"[d:description?Specify a structured dump description file. Each line of"
"   this file describes the size and content of a contiguous portion"
"   of the input file. The description is applied separately to each"
"   input file. Comments and optional labels in the following"
"   descriptions are listed with the \b--verbose\b option. Supported"
"   descriptions are:]:[file]{"
"       [+c comment?comment]"
"       [+d size [label]]?\asize\a bytes of data with optional label]"
"       [+i size [label]]?ignore \asize\a bytes of data]"
"       [+r size count [label]]?\acount\a records of length \asize\a]"
"       [+t count?Match \acount\a records against the \bT\b record"
"           table. \acount\a=0 continues until no record type"
"           match is found.]"
"       [+z size [label]]?a string with length determined by a"
"           \asize\a byte binary integer]"
"       [+T idlen id size unit [offset]]?Defines a sized record"
"           table entry.]{"
"           [+idlen?type identifier length, must be"
"               <= 4 bytes]"
"           [+id?type identifier, starting at record offset 0]"
"           [+size?default record size]"
"           [+unit?if > 0 then the record is variable length and"
"               the size is the byte at \aoffset\a]"
"           [+offset?if \aunit\a > 0 then this byte multiplied by"
"               \aunit\a is the size of variable length data"
"               appended to the record]"
"       }"
"}"
"[f:format?Byte output \bprintf\b(3) format.]:[format:=02x]"
"[g!:group?Group output in 4's.]"
"[m:min?Minimum record length to consider.]#[min:=8]"
"[n:count?List the top \acount\a candidate record lengths.]#[count:=16]"
"[o:offset?Start description listing at \aoffset\a.]#[offset:=0]"
"[r:run?List runs at least as long as \arun\a.]#[run]"
"[v:verbose?Dump description labels with data.]"

"\n"
"\n[ file ... ]\n"
"\n"
"[+SEE ALSO?\bpin\b(1), \bpop\b(1)]"
;

#include <ast.h>
#include <error.h>
#include <tok.h>

typedef struct Item_s
{
    unsigned long   index;
    unsigned long   offset;
    unsigned long   start;
    unsigned long   count;
    unsigned long   run;
} Item_t;

typedef struct
{
    int     len;
    unsigned long   id;
    int     size;
    int     unit;
    int     offset;
} Type_t;

typedef struct
{
    Sfoff_t     offset;
    unsigned long   count;
} Loop_t;

static struct
{
    Type_t      type[4 * 1024];
    Item_t      mod[4 * 1024];
    unsigned long   hit[UCHAR_MAX + 1];
    Sfoff_t     offset;
    char*       format1;
    char*       format4;
    unsigned long   context;
    unsigned long   count;
    unsigned long   min;
    unsigned long   run;
    int     group;
    int     types;
    int     typelen;
    int     typelast;
} state;

/*
 * order items by count hi to lo
 */

static int
bycount(const void* va, const void* vb)
{
    register Item_t*    a = (Item_t*)va;
    register Item_t*    b = (Item_t*)vb;

    if (a->count < b->count)
        return 1;
    if (a->count > b->count)
        return -1;
    if (a < b)
        return 1;
    if (a > b)
        return -1;
    return 0;
}

/*
 * rectify fp open for read on file
 */

static void
rectify(register Sfio_t* fp, char* file, int verbose)
{
    register unsigned char* s;
    register Item_t*    p;
    register unsigned long* q;
    register unsigned long  offset;
    register unsigned long  i;
    unsigned long       n;
    unsigned long       cur;
    unsigned long       dif;
    unsigned long       max;

    memset(state.hit, 0, sizeof(state.hit));
    memset(state.mod, 0, sizeof(state.mod));
    for (i = 0; i < elementsof(state.mod); i++)
        state.mod[i].index = i;
    max = 0;
    offset = 0;
    while (s = sfreserve(fp, SF_UNBOUND, 0))
    {
        n = sfvalue(fp);
        for (i = 0; i < n; i++)
        {
            cur = offset + i;
            q = state.hit + s[i];
            dif = cur - *q;
            *q = cur;
            if (dif < elementsof(state.mod))
            {
                p = state.mod + dif;
                if (dif > max)
                    max = dif;
                p->count++;
                if ((cur - p->offset) <= dif)
                {
                    if (!p->run++)
                        p->start = cur;
                }
                else if (p->run)
                {
                    if (state.run && p->run >= state.run && p->index >= state.min)
                        sfprintf(sfstdout, "run %7lu %7lu %7lu\n", p->index, p->run, p->start);
                    p->run = 0;
                }
                p->offset = cur;
            }
        }
        offset += n;
    }
    qsort(state.mod, elementsof(state.mod), sizeof(state.mod[0]), bycount);
    n = 0;
    for (i = 0; i < elementsof(state.mod) && n < state.count; i++)
        if (state.mod[i].index >= state.min)
        {
            n++;
            sfprintf(sfstdout, "rec %7lu %7lu %7lu\n", state.mod[i].index, state.mod[i].count, state.mod[i].offset);
        }
}

/*
 * dump size n buffer b to op in 4 hex byte chunks
 */

static void
dump(Sfio_t* op, register unsigned char* b, size_t n)
{
    register unsigned char* e = b + n / 4 * 4;
    register unsigned char* x;

    x = state.group ? (b + n) : b;
    while (b < e)
    {
        sfprintf(op, state.format4, b[0], b[1], b[2], b[3]);
        if ((b += 4) < x)
            sfputc(op, ' ');
    }
    while (b < x)
        sfprintf(op, state.format1, *b++);
    sfputc(op, '\n');
}

/*
 * return a number from b and advance b
 */

static unsigned long
number(char** b)
{
    register char*  s;
    unsigned long   r;

    for (s = *b; *s == ' ' || *s == '\t'; s++);
    r = strtoul(s, b, 0);
    if (*b == s)
        error(3, "numeric argument expected");
    for (s = *b; *s == ' ' || *s == '\t'; s++);
    *b = s;
    return r;
}

/*
 * dump fp according to dp
 */

static void
describe(register Sfio_t* dp, char* desc, register Sfio_t* fp, char* file, int verbose)
{
    register unsigned char* p;
    unsigned char*      e;
    long            size;
    long            count;
    unsigned long       context;
    Sfoff_t         offset;
    Sfoff_t         skip;
    int         nest;
    int         op;
    char*           s;
    char*           t;
    Loop_t          loop[64];
    unsigned long       id[5];

    error_info.file = desc;
    error_info.line = 0;
    offset = 0;
    nest = -1;
    while (s = sfgetr(dp, '\n', 0))
    {
        error_info.line++;
        for (t = s + sfvalue(dp) - 1; *s == ' ' || *s == '\t'; s++);
        for (op = *s; *s != ' ' && *s != '\t' && *s != '\n'; s++);
        for (; *s == ' ' || *s == '\t'; s++);
        switch (op)
        {
        case '#':
        case '\n':
            break;
        case '{':
            if (++nest >= elementsof(loop))
                error(3, "%c: nesting too deep -- %d max", op, elementsof(loop));
            count = number(&s);
            loop[nest].offset = sfseek(dp, (Sfoff_t)0, SEEK_CUR);
            loop[nest].count = count;
            if (verbose && offset >= state.offset)
                sfprintf(sfstdout, "=== %I*d === loop %d %lu %I*d === %-.*s\n", sizeof(offset), offset, nest, loop[nest].count, sizeof(loop[nest].offset), loop[nest].offset, t - s, s);
            break;
        case '}':
            if (nest < 0)
                error(3, "%c: no matching {", op); /*balance}*/
            if (loop[nest].count-- <= 1)
                nest--;
            else if (sfseek(dp, loop[nest].offset, SEEK_SET) < 0)
                error(ERROR_SYSTEM|3, "loop seek error to %I*d", sizeof(loop[nest].offset), loop[nest].offset);
            else if (verbose && offset >= state.offset)
                sfprintf(sfstdout, "=== %I*d === loop %d %lu %I*d === %-.*s\n", sizeof(offset), offset, nest, loop[nest].count, sizeof(loop[nest].offset), loop[nest].offset, t - s, s);
            break;
        case 'c':
            if (verbose && offset >= state.offset)
                sfprintf(sfstdout, "=== %I*d === %-.*s\n", sizeof(offset), offset, t - s, s);
            break;
        case 'd':
            size = number(&s);
            if (offset >= state.offset)
            {
                if (verbose)
                    sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
                if (!(p = sfreserve(fp, size, 0)))
                    error(ERROR_SYSTEM|3, "%s: cannot read %ld bytes at %I*d", file, size, sizeof(offset), offset);
                dump(sfstdout, p, size);
            }
            else if (sfseek(fp, (Sfoff_t)size, SEEK_CUR) < 0)
                error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, size, sizeof(offset), offset);
            offset += size;
            break;
        case 'i':
            size = number(&s);
            if (verbose && offset >= state.offset)
                sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
            if (sfseek(fp, (Sfoff_t)size, SEEK_CUR) < 0)
                error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, size, sizeof(offset), offset);
            offset += size;
            break;
        case 'r':
            size = number(&s);
            count = number(&s);
            if (offset < state.offset)
            {
                skip = count * size;
                if ((offset + skip) > state.offset)
                {
                    skip = (state.offset - offset) / size;
                    count -= skip;
                    skip *= size;
                    if (sfseek(fp, skip, SEEK_CUR) < 0)
                        error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
                    offset += skip;
                }
            }
            if (offset >= state.offset)
            {
                if (verbose)
                    sfprintf(sfstdout, "=== %I*d === %ld * %ld === %-.*s\n", sizeof(offset), offset, size, count, t - s, s);
                if (state.context && count > (3 * state.context))
                {
                    skip = (count - 2 * state.context) * size;
                    count = state.context;
                    while (count-- > 0)
                    {
                        if (!(p = sfreserve(fp, size, 0)))
                            error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
                        offset += size;
                        dump(sfstdout, p, size);
                    }
                    sfprintf(sfstdout, " . . .\n");
                    if (sfseek(fp, skip, SEEK_CUR) < 0)
                        error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
                    offset += skip;
                    count = state.context;
                }
                while (count-- > 0)
                {
                    if (!(p = sfreserve(fp, size, 0)))
                        error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
                    offset += size;
                    dump(sfstdout, p, size);
                }
            }
            else
            {
                skip = count * size;
                if (sfseek(fp, skip, SEEK_CUR) < 0)
                    error(ERROR_SYSTEM|3, "%s: cannot seek %I*d bytes at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
                offset += skip;
            }
            break;
        case 't':
            if (!state.typelen)
                error(3, "no sized record types defined");
            context = 0;
            count = number(&s);
            do
            {
                if (!(p = sfreserve(fp, state.typelen, SF_LOCKR)))
                    break;
                switch (state.typelen)
                {
                case 4: id[4] = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
                case 3: id[3] = (p[0] << 16) | (p[1] << 8) | p[2];
                case 2: id[2] = (p[0] << 8) | p[1];
                case 1: id[1] = p[0];
                }
                sfread(fp, p, 0);
                if (state.type[state.typelast].id != id[state.type[state.typelast].len])
                {
                    for (state.typelast = 0; state.typelast < state.types && state.type[state.typelast].id != id[state.type[state.typelast].len]; state.typelast++);
                    if (state.typelast >= state.types)
                    {
                        if (verbose)
                            sfprintf(sfstdout, "=== %I*d === %0*x === type not found\n", sizeof(offset), offset, 2 * state.typelen, id[state.typelen]);
                        break;
                    }
                    if (verbose && offset >= state.offset)
                        sfprintf(sfstdout, "=== %I*d === %0*x === type\n", sizeof(offset), offset, 2 * state.type[state.typelast].len, id[state.type[state.typelast].len]);
                    context = 0;
                }
                size = state.type[state.typelast].size;
                if (!(p = sfreserve(fp, size, state.type[state.typelast].unit ? SF_LOCKR : 0)))
                    error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
                if (state.type[state.typelast].unit)
                {
                    size += p[state.type[state.typelast].offset] * state.type[state.typelast].unit;
                    sfread(fp, p, 0);
                    if (!(p = sfreserve(fp, size, 0)))
                        error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
                }
                if (offset >= state.offset)
                {
                    if (!state.context)
                        dump(sfstdout, p, size);
                    else if (context++ < state.context)
                        dump(sfstdout, p, size);
                    else if (context == state.context + 1)
                        sfprintf(sfstdout, " . . .\n");
                }
                offset += size;
            } while (!count || --count);
            break;
        case 'z':
            size = number(&s);
            if (!(p = sfreserve(fp, size, 0)))
                error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", size, sizeof(offset), offset);
            count = 0;
            e = p + size;
            while (p < e)
                count = (count << 8) | *p++;
            if (offset >= state.offset)
            {
                if (verbose)
                    sfprintf(sfstdout, "=== %I*d === %ld === %-.*s\n", sizeof(offset), offset, size, t - s, s);
                offset += size;
                if (!(p = sfreserve(fp, count, 0)))
                    error(ERROR_SYSTEM|3, "cannot read %ld bytes at %I*d", count, sizeof(offset), offset);
                sfprintf(sfstdout, "\"%s\"\n", fmtnesq((char*)p, "\"", count));
            }
            else
            {
                offset += 2;
                if (sfseek(fp, (Sfoff_t)count, SEEK_CUR) < 0)
                    error(ERROR_SYSTEM|3, "%s: cannot seek %ld bytes at %I*d", file, count, sizeof(offset), offset);
            }
            offset += count;
            break;
        case 'T':
            if (state.types >= elementsof(state.type))
                error(3, "too many types -- %d max", elementsof(state.type));
            if ((state.type[state.types].len = number(&s)) > state.typelen)
                state.typelen = state.type[state.types].len;
            if (state.type[state.types].len >= elementsof(id))
                error(3, "type id length must be <= %d", elementsof(id) - 1);
            state.type[state.types].id = number(&s);
            state.type[state.types].size = number(&s);
            if (state.type[state.types].unit = number(&s))
                state.type[state.types].offset = number(&s);
            state.types++;
            break;
        default:
            error(2, "%c: unknown description op", op);
            break;
        }
    }
    if (verbose && offset >= state.offset)
        sfprintf(sfstdout, "=== %I*d === EOF\n", sizeof(offset), offset);
    error_info.file = 0;
    error_info.line = 0;
    if (skip = sfseek(fp, (Sfoff_t)0, SEEK_END) - offset)
        error(1, "%s: %I*d bytes ignored at %I*d", file, sizeof(skip), skip, sizeof(offset), offset);
}

int
main(int argc, char** argv)
{
    register char*  file;
    int     n;
    Sfio_t*     fp;
    Sfio_t*     dp;

    char*       desc = 0;
    char*       format = "02x";
    int     verbose = 0;

    error_info.id = "rectify";
    state.count = 16;
    state.group = 1;
    state.min = 8;
    state.run = 0;
    for (;;)
    {
        switch (optget(argv, usage))
        {
        case 'c':
            state.context = opt_info.num;
            continue;
        case 'd':
            if (desc)
                error(2, "%s: only one description file allowed", opt_info.arg);
            else
                desc = opt_info.arg;
            continue;
        case 'f':
            format = opt_info.arg;
            continue;
        case 'g':
            state.group = opt_info.num;
            continue;
        case 'm':
            state.min = opt_info.num;
            continue;
        case 'n':
            state.count = opt_info.num;
            continue;
        case 'o':
            state.offset = opt_info.num;
            continue;
        case 'r':
            state.run = opt_info.num;
            continue;
        case 'v':
            verbose = opt_info.num;
            continue;
        case '?':
            error(ERROR_USAGE|4, "%s", opt_info.arg);
            continue;
        case ':':
            error(2, "%s", opt_info.arg);
            continue;
        }
        break;
    }
    argv += opt_info.index;
    if (error_info.errors)
        error(ERROR_USAGE|4, "%s", optusage(NiL));
    n = (strlen(format) + 1) * 4 + 1;
    if (!(state.format4 = newof(0, char, n, 0)))
        error(ERROR_SYSTEM|3, "out of space [format]");
    sfsprintf(state.format4, n, "%%%s%%%s%%%s%%%s", format, format, format, format);
    state.format1 = state.format4 + 3 * (strlen(format) + 1);
    if (desc && !(dp = sfopen(NiL, desc, "r")))
        error(ERROR_SYSTEM|3, "%s: cannot open description file", desc);
    if (file = *argv)
        argv++;
    do
    {
        if (!file || streq(file, "-"))
            fp = sfstdin;
        else if (!(fp = sfopen(NiL, file, "r")))
            error(ERROR_SYSTEM|3, "%s: cannot read", file);
        if (desc)
            describe(dp, desc, fp, file, verbose);
        else
            rectify(fp, file, verbose);
    } while (file = *argv++);
    return error_info.errors != 0;;
}