/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1998-2011 AT&T Intellectual Property *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.eclipse.org/org/documents/epl-v10.html *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* partitioned fixed ops
*/
/*
 * optget(3) usage description: command name, description, options
 * (with their argument syntax) and operands
 */
static const char usage[] =
"[-?\n@(#)$Id: pop (AT&T Research) 2003-04-05 $\n]"
USAGE_LICENSE
"[+NAME?pop - operate on partitioned fixed row and column data]"
"[+DESCRIPTION?\bpop\b operates on partitioned fixed row and column data files."
" It can cut high or low frequency partition columns, list format field"
" names for partition columns, and list the partition column frequencies."
" See \bpzip\b(1) for a detailed description of file partitions"
" and column frequencies.]"
"[c:cut?Copy selected columns from the input rows to the standard output.]"
"[e:endiff?Copy the row-by-row difference to the standard output.]"
"[f:format?Specifies the data format (schema) file. Two input styles"
" are accepted. The first style lists field names and sizes in"
" consecutive order: `\bname\b,\asize\a[,\acomment\a...]]'. The second"
" style lists the field offset range and name:"
" `\abegin\a[-\aend\a]] \bname\b'. Column offsets start at 0."
" Names are used to label partition group listings on the standard"
" output, with partition groups separated by an empty line.]:[file]"
"[h:high?List information on high frequency columns only. This is"
" the default.]"
"[i:information?List the selected column frequency information on the"
" standard output.]"
"[l:low?List information on low frequency columns only.]"
"[m:map?List the partition file with the row size equal to the number of"
" high frequency columns and the high frequency columns renumbered"
" in order from 0. This partition file can then be used on high"
" frequency data produced by the \b--cut\b option.]"
"[n:newline?Append a newline to each cut output row.]"
"[o:override?Override the column partition. Currently only fixed value"
" columns may be specified. The syntax is"
" \abegin\a[-\aend\a]]='\avalue\a' where \abegin\a is the beginning"
" column offset (starting at 0), \aend\a is the ending column offset"
" for an inclusive range, and \avalue\a is the fixed column value."
" Uncompress time is improved when high frequency columns are given"
" fixed values (see the \b--partition\b option).]:[name=value]"
"[p:partition?Specifies the data row size and the high frequency column"
" partition groups and permutation. The partition file is a sequence"
" of lines. Comments start with # and continue to the end of the line."
" The first non-comment line specifies the optional name string"
" in \"...\". The next non-comment line specifies the row size."
" The remaining lines operate on column offset ranges of the form:"
" \abegin\a[-\aend\a]] where \abegin\a is the beginning column offset"
" (starting at 0), and \aend\a is the ending column offset for an"
" inclusive range. The operators are:]:[file]{"
" [+range [...]]?places all columns in the specified \arange\a"
" list in the same high frequency partition group."
" Each high frequency partition group is processed as"
" a separate block by the underlying compressor"
" (\bgzip\b(1) by default).]"
" [+range='value'?specifies that each column in \arange\a"
" has the fixed character value \avalue\a. C-style"
" character escapes are valid for \avalue\a.]"
"}"
"[r:row?Specifies the input row size (number of byte columns). Exactly"
" one of \b--row\b or \b--partition\b must be specified.]#[row-size]"
"[u:undiff?The inverse of the \b--endiff\b difference encoding.]"
"[v:verbose?List header information on the input \apzip\a file or"
" \apartition-file\a and continue processing.]"
"[x:identify?Identify output information columns with labels from the"
" \b--format\b file.]"
"[Q:regress?Generate output for regression testing, such that identical"
" invocations with identical input files will generate the same output.]"
"[T:test?Enable implementation-specific tests and tracing.]#[test-mask]"
"[X:prefix?Uncompressed data contains a prefix that is defined by \acount\a"
" and an optional \aterminator\a. This data is not \bpzip\b compressed."
" \aterminator\a may be one of:]:[count[*terminator]]]{"
" [+\aomitted\a?\acount\a bytes.]"
" [+L?\acount\a \bnewline\b terminated records.]"
" [+'\achar\a'?\acount\a \achar\a terminated records.]"
"}"
"\n"
"\n[ file ]\n"
"\n"
"[+SEE ALSO?\bgzip\b(1), \bpin\b(1), \bpzip\b(1), \bpzip\b(3)]"
;
#include <ast.h>
#include <ctype.h>
#include <error.h>
#include <pzip.h>
#include <tok.h>
/*
 * operation flag bits accumulated in main()'s op from the command line
 * (bit 0x0080 is not assigned)
 */
#define OP_CUT 0x0001 /* -c: copy the selected columns to stdout */
#define OP_ENDIFF 0x0002 /* -e: write the row-by-row difference */
#define OP_ID 0x0004 /* -x: identify output with labels/row ids */
#define OP_INFO 0x0008 /* -i: list column frequency information */
#define OP_LO 0x0010 /* -l: select low frequency columns */
#define OP_MAP 0x0020 /* -m: print the renumbered partition */
#define OP_NL 0x0040 /* -n: append a newline to each cut row */
#define OP_UNDIFF 0x0100 /* -u: invert the -e difference encoding */
#define OP_VERBOSE 0x0200 /* -v: verbose (main() also sets PZ_VERBOSE) */
/*
 * one named field read from the --format file by label()
 */
typedef struct
{
char* name; /* field name */
int beg; /* first column offset of the field */
int end; /* last column offset (inclusive) */
} Label_t;
/*
 * per-column statistics filled in by gather()
 */
typedef struct
{
unsigned char hit[UCHAR_MAX+1]; /* values seen */
unsigned long changes; /* number of changes */
unsigned int values; /* # different values */
int prev; /* prev row value */ /* -1 before the first row */
} Info_t;
/*
 * accumulate statistics for the m columns selected by map into ip[0..m-1]
 *
 * rows of pp->row bytes are read from sp, up to pz->win bytes at a time,
 * into pz->buf; for each selected column this records which byte values
 * occurred, how often the value differed from the previous row (the first
 * row always counts as a change) and finally the number of distinct values
 *
 * returns the number of complete rows read, -1 on read error
 */
static ssize_t
gather(register Pz_t* pz, register Pzpart_t* pp, Sfio_t* sp, register Info_t* ip, size_t* map, size_t m)
{
	unsigned char*	cur;
	unsigned char*	lim;
	Info_t*		col;
	ssize_t		got;
	size_t		total;
	size_t		c;
	size_t		v;
	int		val;

	/* no previous row yet: -1 never matches a byte value */
	for (c = 0; c < m; c++)
		ip[c].prev = -1;
	total = 0;
	while ((got = sfread(sp, pz->buf, pz->win)) >= (ssize_t)pp->row)
	{
		cur = pz->buf;
		lim = cur + (got / pp->row) * pp->row;
		total += got / pp->row;
		for (; cur < lim; cur += pp->row)
			for (c = 0; c < m; c++)
			{
				col = &ip[c];
				val = cur[map[c]];
				if (col->prev == val)
					continue;
				col->prev = val;
				col->hit[val] = 1;
				col->changes++;
			}
	}
	if (got < 0)
	{
		error(ERROR_SYSTEM|2, "read error");
		return -1;
	}
	if (got > 0)
		error(1, "last record incomplete");

	/* fold the hit tables into distinct value counts */
	for (c = 0; c < m; c++)
		for (v = 0; v < elementsof(ip[c].hit); v++)
			if (ip[c].hit[v])
				ip[c].values++;
	return total;
}
/*
* cut hi (default) or lo cols from stdin to stdout
*/
static int
cut(register Pz_t* pz, register Pzpart_t* pp, int op, register size_t* map, size_t m)
{
register int i;
register int j;
register size_t n;
register ssize_t r;
register unsigned char* ib;
register unsigned char* ob;
if (op & OP_VERBOSE)
for (n = 0; n < m; n++)
error(0, "map %3d => %3d", map[n], n);
if (!(pz->wrk = vmnewof(pz->vm, 0, unsigned char, pz->win, 0)))
error(ERROR_SYSTEM|3, "out of space");
for (;;)
{
ib = pz->buf;
ob = pz->wrk;
if ((r = sfread(pz->io, ib, pz->win)) < (ssize_t)pp->row)
{
if (r > 0)
error(1, "last record incomplete");
break;
}
n = r / pp->row;
for (i = 0; i < n; i++)
{
if (op & OP_ID)
{
*ob++ = i >> 8;
*ob++ = i;
}
for (j = 0; j < m; j++)
*ob++ = ib[map[j]];
if (op & OP_NL)
*ob++ = '\n';
ib += pp->row;
}
n = ob - pz->wrk;
if (sfwrite(sfstdout, pz->wrk, n) != (ssize_t)n)
error(ERROR_SYSTEM|3, "write error");
}
return 0;
}
/*
 * label the mapped format fields
 *
 * the format file is opened with pzfind() and each field line is parsed
 * into a Label_t; blank lines and lines starting with # or " are skipped
 * a line starting with a digit is `begin-end name', otherwise `name,size,'
 * with the field placed immediately after the previous one
 * fields named variable_ascii are ignored and a field named Newline ends
 * the scan; a final Newline label is always appended and must land on the
 * last column of the row
 *
 * with OP_INFO the column statistics from gather() are listed with the
 * field name of each column, otherwise each field name is listed with the
 * mapped columns it covers; groups are separated by an empty line
 *
 * returns 0 on success, 1 if gather() fails
 */
static int
label(register Pz_t* pz, Pzpart_t* pp, int op, register size_t* map, size_t m, char* format)
{
	register char*	s;
	register int	i;
	register int	g;
	ssize_t		rows;
	Sfio_t*		sp;
	Label_t*	lv;
	Label_t*	lp;
	Label_t**	xv;
	Info_t*		ip;

	if (!(sp = pzfind(pz, format, "fmt", "r")))
		error(ERROR_SYSTEM|3, "%s: cannot read format file", format);
	if (!(lv = vmnewof(pz->vm, 0, Label_t, pp->row + 1, 0)))
		error(ERROR_SYSTEM|3, "out of space");
	if (!(xv = vmnewof(pz->vm, 0, Label_t*, pp->row, 0)))
		error(ERROR_SYSTEM|3, "out of space");
	error_info.file = format;

	/* lv[0] is a sentinel so the first field starts at column 0 */
	lv->end = -1;
	lp = ++lv;
	while ((s = sfgetr(sp, '\n', 1)))
	{
		error_info.line++;
		for (; isspace(*s); s++);
		if (!*s || *s == '#' || *s == '"')
			continue;
		if (!isdigit(*s))
		{
			/* name,size: end temporarily holds the size */
			if (tokscan(s, NiL, "%s, %d,", &lp->name, &lp->end) != 2)
				continue;
			lp->beg = (lp-1)->end + 1;
			lp->end += lp->beg - 1;
		}
		else if (tokscan(s, NiL, "%d-%d %s", &lp->beg, &lp->end, &lp->name) != 3)
			continue;
		if (streq(lp->name, "variable_ascii"))
			continue;
		if (streq(lp->name, "Newline"))
			break;
		if (lp->end >= pp->row)
			error(3, "format entry extends beyond %I*d row size", sizeof(pp->row), pp->row);

		/*
		 * after the sentinel the table holds pp->row labels and one
		 * must remain for the trailing Newline label; zero width or
		 * overlapping entries could otherwise run off the end
		 */
		if ((size_t)(lp - lv) + 1 >= pp->row)
			error(3, "too many format entries for %I*d row size", sizeof(pp->row), pp->row);
		if (!(lp->name = vmstrdup(pz->vm, lp->name)))
			error(ERROR_SYSTEM|3, "out of space");
		for (i = lp->beg; i <= lp->end; i++)
			xv[i] = lp;
		if (pz->test & 0x0010)
			error(2, "%d-%d\t%s", lp->beg, lp->end, lp->name);
		lp++;
	}
	sfclose(sp);
	lp->name = "Newline";
	lp->beg = lp->end = (lp-1)->end + 1;
	if (lp->end != (pp->row - 1))
		error(3, "format file row size %d does not match expected %I*d", lp->end + 1, sizeof(pp->row), pp->row);
	xv[lp->beg] = lp;
	error_info.file = 0;
	error_info.line = 0;

	/* NOTE(review): columns not covered by any format entry leave xv[] unset -- confirm callers cannot map such a column */
	if (op & OP_INFO)
	{
		if (!(ip = vmnewof(pz->vm, 0, Info_t, m, 0)))
			error(ERROR_SYSTEM|3, "out of space");
		if ((rows = gather(pz, pp, pz->io, ip, map, m)) < 0)
			return 1;
		sfprintf(sfstdout, "%s frequency info over %I*d rows\n\n", (op & OP_LO) ? "low" : "high", sizeof(rows), rows);
		sfprintf(sfstdout, "%33s %3s %6s %3s\n\n", "FIELD", "COL", "FREQ", "VAL");

		/*
		 * g is the next expected column (low) or the current group
		 * label (high); a change starts a new group
		 * the low listing starts at its first column so that no
		 * separator is printed before the first row
		 */
		g = ((op & OP_LO) && m) ? map[0] : 0;
		for (i = 0; i < m; i++)
		{
			if (op & OP_LO)
			{
				if (g != map[i])
					sfprintf(sfstdout, "\n");
				g = map[i] + 1;
			}
			else if (g != pp->lab[i])
			{
				g = pp->lab[i];
				sfprintf(sfstdout, "\n");
			}
			sfprintf(sfstdout, "%33s %3d %6lu %3d\n", xv[map[i]]->name, (int)map[i], ip[i].changes, (int)ip[i].values);
		}
	}
	else
		for (i = 0; i < m;)
		{
			lp = xv[map[i]];
			if (op & OP_LO)
				g = map[i] + 1;
			else
				g = pp->lab[i];
			sfprintf(sfstdout, "%33s %3d", lp->name, (int)map[i]);

			/* append the following columns of the same field and group */
			while (++i < m)
			{
				if (op & OP_LO)
				{
					if (g != map[i])
					{
						sfprintf(sfstdout, "\n");
						break;
					}
					g = map[i] + 1;
				}
				else if (g != pp->lab[i])
				{
					sfprintf(sfstdout, "\n");
					break;
				}
				if (xv[map[i]] != lp)
					break;
				sfprintf(sfstdout, " %3d", (int)map[i]);
			}
			sfprintf(sfstdout, "\n");
		}
	return 0;
}
/*
 * list info on the mapped fields
 *
 * gathers statistics for the m columns in map from pz->io and lists, for
 * each column, its offset, the number of value changes and the number of
 * distinct values; groups are separated by an empty line
 *
 * returns 0 on success, 1 if gather() fails
 */
static int
info(register Pz_t* pz, register Pzpart_t* pp, int op, register size_t* map, size_t m)
{
	register int	i;
	register int	g;
	ssize_t		rows;
	Info_t*		ip;

	if (!(ip = vmnewof(pz->vm, 0, Info_t, m, 0)))
		error(ERROR_SYSTEM|3, "out of space");
	if ((rows = gather(pz, pp, pz->io, ip, map, m)) < 0)
		return 1;
	sfprintf(sfstdout, "%s frequency info over %I*d rows\n\n", (op & OP_LO) ? "low" : "high", sizeof(rows), rows);
	sfprintf(sfstdout, "%3s %6s %3s\n\n", "COL", "FREQ", "VAL");

	/*
	 * g is the next expected column (low) or the current group label
	 * (high); a change starts a new group
	 * the low listing starts at its first column so that no separator
	 * is printed before the first row
	 */
	g = ((op & OP_LO) && m) ? map[0] : 0;
	for (i = 0; i < m; i++)
	{
		if (op & OP_LO)
		{
			if (g != map[i])
				sfprintf(sfstdout, "\n");
			g = map[i] + 1;
		}
		else if (g != pp->lab[i])
		{
			g = pp->lab[i];
			sfprintf(sfstdout, "\n");
		}
		sfprintf(sfstdout, "%3d %6lu %3d\n", (int)map[i], ip[i].changes, (int)ip[i].values);
	}
	return 0;
}
/*
* copy the row by row diff of path to sfstdout
*/
static int
diff(int op, const char* path, size_t row)
{
register int i;
register int j;
register int k;
ssize_t r;
unsigned char* buf[2];
unsigned char* dif;
Sfio_t* sp;
if (!(buf[0] = newof(0, unsigned char, row, 0)) || !(buf[1] = newof(0, unsigned char, row, 0)) || !(dif = newof(0, unsigned char, row, 0)))
{
error(ERROR_SYSTEM|2, "out of space");
return 1;
}
if (!(sp = sfopen(NiL, path, "r")))
{
error(ERROR_SYSTEM|2, "%s: cannot read", path);
return 1;
}
if (op & OP_ENDIFF)
{
for (i = 0; (r = sfread(sp, buf[i], row)) == row; i = k)
{
k = !i;
for (j = 0; j < row; j++)
dif[j] = buf[i][j] - buf[k][j];
if (sfwrite(sfstdout, dif, row) != row)
break;
}
}
else
{
for (i = 0; (r = sfread(sp, dif, row)) == row; i = k)
{
k = !i;
for (j = 0; j < row; j++)
buf[i][j] = dif[j] + buf[k][j];
if (sfwrite(sfstdout, buf[i], row) != row)
break;
}
}
sfclose(sp);
if (sfsync(sfstdout))
{
error(ERROR_SYSTEM|2, "write error");
return 1;
}
if (r < 0)
{
error(ERROR_SYSTEM|2, "%s: read error", path);
return 1;
}
if (r)
error(1, "%s: last record incomplete", path);
return 0;
}
int
main(int argc, char** argv)
{
register Pz_t* pz;
register Pzpart_t* pp;
register int i;
int m;
size_t* map;
Pzdisc_t disc;
Sfio_t* dp;
int flags = 0;
char* format = 0;
int op = 0;
size_t row = 0;
error_info.id = "pop";
memset(&disc, 0, sizeof(disc));
disc.version = PZ_VERSION;
disc.errorf = errorf;
if (!(dp = sfstropen()))
error(ERROR_SYSTEM|3, "out of space [options]");
for (;;)
{
switch (optget(argv, usage))
{
case 'c':
op |= OP_CUT;
continue;
case 'e':
op |= OP_ENDIFF;
continue;
case 'f':
format = opt_info.arg;
continue;
case 'h':
op &= ~OP_LO;
continue;
case 'i':
op |= OP_INFO;
continue;
case 'l':
op |= OP_LO;
continue;
case 'm':
op |= OP_MAP;
continue;
case 'n':
op |= OP_NL;
continue;
case 'o':
sfputr(dp, opt_info.arg, '\n');
continue;
case 'p':
disc.partition = opt_info.arg;
continue;
case 'r':
row = opt_info.num;
continue;
case 'u':
op |= OP_UNDIFF;
continue;
case 'v':
op |= OP_VERBOSE;
flags |= PZ_VERBOSE;
continue;
case 'x':
op |= OP_ID;
continue;
case 'Q':
sfprintf(dp, "regress\n");
continue;
case 'T':
sfprintf(dp, "test=%s\n", opt_info.arg);
continue;
case 'X':
sfprintf(dp, "prefix=%s\n", opt_info.arg);
continue;
case '?':
error(ERROR_USAGE|4, "%s", opt_info.arg);
continue;
case ':':
if (!opt_info.option[0])
sfputr(dp, &argv[opt_info.index - 1][2], '\n');
else
error(2, "%s", opt_info.arg);
continue;
}
break;
}
argv += opt_info.index;
if (error_info.errors || *argv && *(argv + 1))
error(ERROR_USAGE|4, "%s", optusage(NiL));
if (sfstrtell(dp) && !(disc.options = strdup(sfstruse(dp))))
error(ERROR_SYSTEM|3, "out of space [options]");
sfstrclose(dp);
if (op & (OP_ENDIFF|OP_UNDIFF))
{
if (!row)
error(3, "-r row-size required for -e");
return diff(op, *argv, row);
}
if (row)
{
if (disc.partition)
error(3, "only one of -r and -p may be specified");
if (!(disc.partition = strdup(sfprints("/%I*u/", sizeof(row), row))))
error(ERROR_SYSTEM|3, "out of space");
}
if (!disc.partition)
flags |= PZ_READ;
else if (op & OP_INFO)
flags |= PZ_WRITE;
if (!(pz = pzopen(&disc, *argv, flags)))
return 1;
pp = pz->part;
if (!disc.partition && (op & OP_INFO))
{
sfprintf(sfstdout, "row size %d\n", pp->row);
op |= OP_LO;
}
pz->win = (pz->win / pp->row) * pp->row;
if (op & OP_LO)
{
if (!(map = vmnewof(pz->vm, 0, size_t, pp->row - pp->nmap, 0)))
error(ERROR_SYSTEM|3, "out of space");
m = 0;
for (i = 0; i < pp->row; i++)
if (pp->low[i])
map[m++] = i;
}
else
{
map = pp->map;
m = pp->nmap;
}
if (op & OP_CUT)
i = cut(pz, pp, op, map, m);
else if (format)
i = label(pz, pp, op, map, m, format);
else if (op & OP_INFO)
i = info(pz, pp, op, map, m);
else if (op & OP_MAP)
{
pp->row = pp->nmap;
for (i = 0; i < pp->nmap; i++)
pp->map[i] = i;
pzpartprint(pz, pp, sfstdout);
}
pzclose(pz);
return i;
}