/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1998-2011 AT&T Intellectual Property *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* partitioned fixed ops
*/
/*
 * Command usage/option description string.
 * Each "[x:name?text]" entry describes one option letter and its long
 * name; "{ ... }" encloses a nested list of values for the preceding
 * option and must be closed with "}".
 * NOTE(review): the markup looks like AST optget(3) syntax, but the call
 * that consumes this string is not visible here -- confirm.
 */
static const char usage[] =
"[-?\n@(#)$Id: pop (AT&T Research) 2003-04-05 $\n]"
"[+NAME?pop - operate on partitioned fixed row and column data]"
"[+DESCRIPTION?\bpop\b operates on partitioned fixed row and column data files."
" It can cut high or low frequency partition columns, list format field"
" names for partition columns, and list the partition column frequencies."
" See \bpzip\b(1) for a detailed description of file partitions"
" and column frequencies.]"
"[c:cut?Copy selected columns from the input rows to the standard output.]"
"[e:endiff?Copy the row-by-row difference to the standard output.]"
"[f:format?Specifies the data format (schema) file. Two input styles"
" are accepted. The first style lists field names and sizes in"
" consecutive order: `\bname\b,\asize\a[,\acomment\a...]]'. The second"
" style lists the field offset range and name:"
" `\abegin\a[-\aend\a]] \bname\b'. Column offsets start at 0."
" Names are used to label partition group listings on the standard"
" output, with partition groups separated by an empty line.]:[file]"
"[h:high?List information on high frequency columns only. This is"
" the default.]"
"[i:information?List the selected column frequency information on the"
" standard output.]"
"[l:low?List information on low frequency columns only.]"
"[m:map?List the partition file with the row size equal to the number of"
" high frequency columns and the high frequency columns renumbered"
" in order from 0. This partition file can then be used on high"
" frequency data produced by the \b--cut\b option.]"
"[n:newline?Append a newline to each cut output row.]"
"[o:override?Override the column partition. Currently only fixed value"
" columns may be specified. The syntax is"
" \abegin\a[-\aend\a]]='\avalue\a' where \abegin\a is the beginning"
" column offset (starting at 0), \aend\a is the ending column offset"
" for an inclusive range, and \avalue\a is the fixed column value."
" Uncompress time is improved when high frequency columns are given"
" fixed values (see the \b--partition\b option).]:[name=value]"
"[p:partition?Specifies the data row size and the high frequency column"
" partition groups and permutation. The partition file is a sequence"
" of lines. Comments start with # and continue to the end of the line."
" The first non-comment line specifies the optional name string"
" in \"...\". The next non-comment line specifies the row size."
" The remaining lines operate on column offset ranges of the form:"
" \abegin\a[-\aend\a]] where \abegin\a is the beginning column offset"
" (starting at 0), and \aend\a is the ending column offset for an"
" inclusive range. The operators are:]:[file]{"
" [+range [...]]?places all columns in the specified \arange\a"
" list in the same high frequency partition group."
" Each high frequency partition group is processed as"
" a separate block by the underlying compressor"
" (\bgzip\b(1) by default).]"
" [+range='value'?specifies that each column in \arange\a"
" has the fixed character value \avalue\a. C-style"
" character escapes are valid for \avalue\a.]"
"}"
"[r:row?Specifies the input row size (number of byte columns). Exactly"
" one of \b--row\b or \b--partition\b must be specified.]#[row-size]"
"[u:undiff?The inverse of the \b--endiff\b difference encoding.]"
"[v:verbose?List header information on the input \apzip\a file or"
" \apartition-file\a and continue processing.]"
"[x:identify?Identify output information columns with labels from the"
" \b--format\b file.]"
"[Q:regress?Generate output for regression testing, such that identical"
" invocations with identical input files will generate the same output.]"
"[T:test?Enable implementation-specific tests and tracing.]#[test-mask]"
"[X:prefix?Uncompressed data contains a prefix that is defined by \acount\a"
" and an optional \aterminator\a. This data is not \bpzip\b compressed."
" \aterminator\a may be one of:]:[count[*terminator]]]{"
" [+\aomitted\a?\acount\a bytes.]"
" [+L?\acount\a \bnewline\b terminated records.]"
" [+'\achar\a'?\acount\a \achar\a terminated records.]"
/* close the --prefix terminator list, matching the --partition list above */
"}"
"\n"
"\n[ file ]\n"
"\n"
"[+SEE ALSO?\bgzip\b(1), \bpin\b(1), \bpzip\b(1), \bpzip\b(3)]"
;
#include <ast.h>
#include <ctype.h>
#include <error.h>
#include <pzip.h>
#include <tok.h>
/*
 * One labeled format field: a name and the column range it covers.
 * The format-file row size check compares end + 1 against the partition
 * row size, so end appears to be an inclusive column offset.
 */
typedef struct
{
	char	*name;	/* field label text */
	int	beg;	/* first column of the field */
	int	end;	/* last column of the field */
} Label_t;
/*
 * Statistics record type; gather() receives a pointer to it as ip.
 * NOTE(review): no members are declared here. A struct with no members
 * is not valid ISO C (it is accepted only as a compiler extension) --
 * confirm that the field list has not been lost.
 */
typedef struct
{
} Info_t;
/*
 * gather stats from sp into ip
 *
 * Returns the value of rows at the end of the function, or -1 as soon
 * as r is found to be negative inside the main loop.
 *
 * NOTE(review): rows is assigned and returned but is not declared in
 * this function or in its parameter list -- confirm where it lives.
 * NOTE(review): r is tested below, but no assignment to r is visible
 * in this function.
 * NOTE(review): i is a signed int and m is a size_t, so every i < m
 * comparison converts i to an unsigned type first.
 */
static ssize_t
gather(register Pz_t* pz, register Pzpart_t* pp, Sfio_t* sp, register Info_t* ip, size_t* map, size_t m)
{
register int i;
register int j;
register unsigned char* buf;
register size_t n;
register ssize_t r;
/* NOTE(review): as written, "rows = 0" is the body of this loop */
for (i = 0; i < m; i++)
rows = 0;
for (;;)
{
{
/* a negative r is reported to the caller as -1 */
if (r < 0)
{
return -1;
}
/* a positive r leaves the for (;;) loop */
if (r > 0)
break;
}
/* one pass over the m mapped columns; the loop body is empty here */
for (i = 0; i < m; i++)
{
}
}
/* NOTE(review): as written, the return below is the body of this loop */
for (i = 0; i < m; i++)
return rows;
}
/*
 * cut hi (default) or lo cols from stdin to stdout
 *
 * The only visible return statement returns 0.
 *
 * NOTE(review): the function name and parameter list are missing
 * between "static int" and the opening brace; op and m are used below
 * without a visible declaration.
 * NOTE(review): r is tested and ob is written through, but neither is
 * assigned anywhere in the visible body.
 */
static int
{
register int i;
register int j;
register size_t n;
register ssize_t r;
register unsigned char* ib;
register unsigned char* ob;
/* NOTE(review): as written, the verbose test guards the loop nest below */
if (op & OP_VERBOSE)
for (n = 0; n < m; n++)
for (;;)
{
{
/* a positive r leaves the for (;;) loop */
if (r > 0)
break;
}
for (i = 0; i < n; i++)
{
{
/* store the loop index i as two bytes, high byte first; only the low 16 bits of i are kept */
*ob++ = i >> 8;
*ob++ = i;
}
/* NOTE(review): as written, the newline store is the body of this loop */
for (j = 0; j < m; j++)
*ob++ = '\n';
}
}
return 0;
}
/*
 * label the mapped format fields
 *
 * Visible return statements yield 1 from inside the listing block and
 * 0 at the end of the function.
 *
 * NOTE(review): the function name and parameter list are missing
 * between "static int" and the opening brace; lp, pp, op, rows, map
 * and m are used without a visible declaration.
 * NOTE(review): several "else" and "continue" lines below have no
 * visible matching "if" or enclosing loop header -- the controlling
 * statements appear to be missing.
 */
static int
{
register char* s;
register int i;
register int g;
{
/* count the current line in error_info (presumably so diagnostics can cite it) */
error_info.line++;
/* skip leading white space */
/* NOTE(review): *s is a plain char; <ctype.h> calls expect an unsigned char value or EOF */
for (; isspace(*s); s++);
/* ignore empty lines and lines starting with '#' or '"' */
if (!*s || *s == '#' || *s == '"')
continue;
/* a line that does not start with a digit is handled separately */
if (!isdigit(*s))
{
continue;
}
continue;
continue;
break;
/* advance to the next label slot */
lp++;
}
/* the last label's end + 1 is reported against the partition row size; level 3 is presumably fatal -- confirm */
error(3, "format file row size %d does not match expected %I*d", lp->end + 1, sizeof(pp->row), pp->row);
/* clear the file/line context recorded in error_info */
error_info.file = 0;
error_info.line = 0;
{
return 1;
/* heading: "low" when OP_LO is set in op, "high" otherwise; %I*d takes the size of rows followed by rows */
sfprintf(sfstdout, "%s frequency info over %I*d rows\n\n", (op & OP_LO) ? "low" : "high", sizeof(rows), rows);
g = map[0];
else
g = 0;
/* walk the m mapped columns; g is reset to 0 by the loop initializer */
for (i = g = 0; i < m; i++)
{
{
/* when map[i] differs from g, g becomes map[i] + 1 */
if (g != map[i])
g = map[i] + 1;
}
{
}
}
}
else
/* i is advanced inside the body (see the ++i in the while below) */
for (i = 0; i < m;)
{
g = map[i] + 1;
else
while (++i < m)
{
{
if (g != map[i])
{
break;
}
g = map[i] + 1;
}
{
break;
}
break;
}
}
return 0;
}
/*
 * list info on the mapped fields
 *
 * Visible return statements yield 1 near the top and 0 at the end.
 *
 * NOTE(review): the function name and parameter list are missing
 * between "static int" and the opening brace; op, rows, map and m are
 * used without a visible declaration.
 * NOTE(review): as written, "return 1" is unconditional and the "else"
 * below has no visible matching "if" -- the controlling statements
 * appear to be missing.
 */
static int
{
register int i;
register int g;
return 1;
/* heading: "low" when OP_LO is set in op, "high" otherwise; %I*d takes the size of rows followed by rows */
sfprintf(sfstdout, "%s frequency info over %I*d rows\n\n", (op & OP_LO) ? "low" : "high", sizeof(rows), rows);
g = map[0];
else
g = 0;
/* walk the m mapped columns; g is reset to 0 by the loop initializer */
for (i = g = 0; i < m; i++)
{
{
/* when map[i] differs from g, g becomes map[i] + 1 */
if (g != map[i])
g = map[i] + 1;
}
{
}
}
return 0;
}
/*
 * copy the row by row diff of path to sfstdout
 *
 * Returns 1 on each visible failure path (allocation failure, r < 0 and
 * two further paths whose conditions are not visible) and 0 from the
 * final "if (r)" statement.
 *
 * NOTE(review): the function name and parameter list are missing
 * between "static int" and the opening brace; buf and row are used
 * without a visible declaration.
 * NOTE(review): as written, "return 0" is the body of "if (r)", so the
 * function falls off its end without a value when r is 0.
 */
static int
{
register int i;
register int j;
register int k;
ssize_t r;
unsigned char* dif;
/* allocate two row buffers and one difference buffer of row bytes each; any failure returns 1 */
if (!(buf[0] = newof(0, unsigned char, row, 0)) || !(buf[1] = newof(0, unsigned char, row, 0)) || !(dif = newof(0, unsigned char, row, 0)))
{
return 1;
}
{
return 1;
}
{
{
/* k is 1 when i is 0 and 0 otherwise -- presumably the index of the other buf[] entry; confirm */
k = !i;
/* NOTE(review): as written, "break" is the body of this per-column loop */
for (j = 0; j < row; j++)
break;
}
}
else
{
{
/* same buffer toggling as in the branch above */
k = !i;
for (j = 0; j < row; j++)
break;
}
}
{
return 1;
}
/* a negative r is reported to the caller as 1 */
if (r < 0)
{
return 1;
}
if (r)
return 0;
}
/*
 * Command entry point (presumably main -- the name and parameter list
 * are missing between "int" and the opening brace).
 *
 * The first loop dispatches on option letters; the case labels match
 * the option letters declared in the usage string. Only the 'v' case
 * has a visible body. The visible return statements yield 1 and i.
 *
 * NOTE(review): the "switch" header for the case labels is not
 * visible, and dp, row and map are used without a visible declaration.
 * NOTE(review): several "else" lines below have no visible matching
 * "if" -- the controlling statements appear to be missing.
 */
int
{
register int i;
int m;
int flags = 0;
char* format = 0;
int op = 0;
/* option loop: every case continues with the next option; falling out of the case block breaks the loop */
for (;;)
{
{
case 'c':
continue;
case 'e':
continue;
case 'f':
continue;
case 'h':
continue;
case 'i':
continue;
case 'l':
continue;
case 'm':
continue;
case 'n':
continue;
case 'o':
continue;
case 'p':
continue;
case 'r':
continue;
case 'u':
continue;
case 'v':
/* verbose sets OP_VERBOSE in op and PZ_VERBOSE in flags */
op |= OP_VERBOSE;
flags |= PZ_VERBOSE;
continue;
case 'x':
continue;
case 'Q':
continue;
case 'T':
continue;
case 'X':
continue;
case '?':
continue;
case ':':
else
continue;
}
break;
}
/* close the dp string stream once option parsing is done */
sfstrclose(dp);
{
if (!row)
}
if (row)
{
}
return 1;
{
}
{
/* restart the map count at 0 and record i as the next map entry */
m = 0;
map[m++] = i;
}
else
{
}
else if (format)
{
}
return i;
}