pin.c revision 3f54fd611f536639ec30dd53c48e5ec1897cc7d9
/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1998-2011 AT&T Intellectual Property *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* Glenn Fowler
* Adam Buchsbaum
* AT&T Research
*
* induce a column partition on fixed row data
*/
static const char usage[] =
"[-?\n@(#)$Id: pin (AT&T Research) 2003-07-17 $\n]"
"[+NAME?pin - induce a pzip partition on fixed record data]"
"[+DESCRIPTION?\bpin\b induces a \bpzip\b(1) column partition on data files"
" of fixed length rows (records) and columns (fields). If a partition"
" file is specified then that partition is refined. A partition file,"
" suitable for use by \bpin\b, \bpzip\b(1) or \bpop\b(1) is listed"
" on the standard output. The input \afile\a is referred to as"
" \atraining data\a. \b--size\b allows more than one \afile\a"
" argument, otherwise exactly one file must be specified.]"
"[+?Partitions are induced in a three step process:]{"
" [+(1)?If \b--partition\b is not specified then a"
" subset of columns are filtered from the training data for"
" partitioning. This filtering usually gathers high frequency columns"
" from the data. A high frequency column has data with a high rate of"
" change between rows. High frequency cutoff rates, specified by"
" \b--high\b, typically range"
" from 10% to 50%, depending on the data. \bpzip\b(1) compresses"
" low frequency columns much more efficiently than \bgzip\b(1).]"
" [+(2)?A heuristic search determines an initial ordering of high"
" frequency columns to present to step (3). An optimal solution for"
" both ordering and partitioning is NP-complete.]"
" [+(3)?The optimal partition for the ordering from step (2) is"
" determined by a dynamic program that computes the compressed size"
" for all partitions that preserve order."
" Alternative greedy methods are also available that work more"
" quickly but do not guarantee optimality.]"
"}"
"[+?See \bpzip\b(1) for a detailed description of file partitions and"
" column frequencies. \bpin\b can run for a long time on some data"
" (e.g., 10 minutes on a 400Mhz Pentium with \b--high 80 --window 4M\b)."
" Use \b--verbose\b possibly with \b--test=010\b to monitor progress.]"
"[b:bzip?Use \bbzip\b(1) compression instead of the default \bgzip\b(1)."
" \abzip\a is not fully supported by \apzip\a, pending further"
" investigation.]"
"[c:cache?Generate some information on \afile\a that can be reused"
" on another invocation; this information is saved in \bpin-\b\abase\a,"
" where \abase\a is the base name (no directory) of \afile\a. Saved"
" information includes column frequencies and singleton and pairwise"
" column \agzip\a rates.]"
"[g:group?Sets the maximum number of columns in any partition group. Lower"
" values speed up the the dynamic program but also may produce"
" sub-optimal solutions.]#[columns:=row-size]"
"[h:high?Select this number of columns with the highest frequencies for the"
" columns in the partition. If \acolumns\a is followed by `%' then"
" columns with frequencies larger than this percentage are"
" selected.]:[columns:=10%]"
"[l:level?Sets the \agzip\a compression level to \alevel\a. Levels range"
" from 1 (fastest, worst compression) to 9 (slowest, best compression).]#"
" [level:=6]"
"[m:maxhigh?Exit with exit code 3 if the number of high frequency columns"
" exceeds \bmaxhigh\b. If \bmaxhigh\b is followed by `%' then the limit"
" is \bmaxhigh\b percent of the total number of columns.]:[maxhigh:=40%]"
"[o:sort?Sort the window data by row before inducing the partition.]"
"[p:partition?Specifies the data row size and the high frequency column"
" partition groups and permutation. The partition file is a sequence"
" of lines. Comments start with # and continue to the end of the line."
" The first non-comment line specifies the optional name string"
" in \"...\". The next non-comment line specifies the row size."
" The remaining lines operate on column offset ranges of the form:"
" \abegin\a[-\aend\a]] where \abegin\a is the beginning column offset"
" (starting at 0), and \aend\a is the ending column offset for an"
" inclusive range. The operators are:]:[file]{"
" [+range [...]]?places all columns in the specified \arange\a"
" list in the same high frequency partition group."
" Each high frequency partition group is processed as"
" a separate block by the underlying compressor"
" (\bgzip\b(1) by default).]"
" [+range='value'?specifies that each column in \arange\a"
" has the fixed character value \avalue\a. C-style"
" character escapes are valid for \avalue\a.]"
"}"
"[r:row?Specifies the input row size (number of byte columns). The row size"
" is determined by sampling the input if not specified.]#[row-size]"
"[v:verbose?List partition search details on the standard error.]"
"[w:window?Limit the number of training data rows used to induce the"
" partition. The window size may be decreased to accomodate an"
" integral number of complete rows.]#[window-size:=4M]"
"[O:optimize?Choose the optimization (partitioning) method for step (3)"
" above. The methods"
" are:]:[method:=\foptimize_default\f]{\foptimize_methods\f}"
"[Q:regress?Generate output for regression testing, such that identical"
" invocations with identical input files will generate the same output.]"
"[R:reorder?Choose the reordering method for step (2) above. The methods"
" are:]:[method:=\freorder_default\f]{\freorder_methods\f}"
"[S:size?Ignore \b--row\b, determine the fixed record size based on a window"
" of sampled data, print it on the standard output, and exit. If more"
" than one \afile\a is specified then the record size and name are"
" printed for each file. If the sample is insufficient, or if"
" \b--verify\b is specified, then all of the data read to determine"
" the row size. A \b0\b size means the record size could not be"
" determined.]"
"[T:test?Enable implementation-specific tests and tracing.]#[test-mask]{"
" [+0x0010?Enable reorder keep trace.]"
" [+0x0040?Enable reorder permutation trace.]"
" [+0x0080?Enable reorder level 2 merge prune.]"
" [+0x0100?Disable reorder merge prune.]"
" [+0x0200?Partition using initial tsp cycles.]"
"}"
"[V:verify?Verify \b--size\b by reading all data instead of the window sample.]"
"[X:prefix?Uncompressed data contains a prefix that is defined by \acount\a"
" and an optional \aterminator\a. This data is not \bpzip\b compressed."
" \aterminator\a may be one of:]:[count[*terminator]]]{"
" [+\aomitted\a?\acount\a bytes.]"
" [+L?\acount\a \bnewline\b terminated records.]"
" [+'\achar\a'?\acount\a \achar\a terminated records.]"
"}"
"\n"
"\nfile ...\n"
"\n"
"[+SEE ALSO?\bbzip\b(1), \bgzip\b(1), \bpop\b(1), \bpzip\b(1), \bpzip\b(3)]"
;
#include <ast.h>
#include <ctype.h>
#include <error.h>
#include <ls.h>
#include <pzip.h>
#include <zlib.h>
#include <bzlib.h>
#if _hdr_tsp && _lib_tspopen
#include <tsp.h>
#endif
#define OP_optimize 0x01
#define OP_reorder 0x02
#define OP_size 0x04
#define OP_verify 0x10
#define minof(x,y) ((x)<(y)?(x):(y))
typedef struct
{
} Part_t;
typedef struct
{
unsigned long frequency;
int prev;
int column;
} Stats_t;
struct Optimize_method_s;
typedef void (*Optimize_f)(struct Optimize_method_s*, unsigned char*, unsigned char*, size_t**, size_t*, size_t*, size_t, size_t, size_t);
typedef struct Optimize_method_s
{
const char* name;
const char* description;
struct Reorder_method_s;
typedef void (*Reorder_f)(struct Reorder_method_s*, unsigned char*, unsigned char*, int*, size_t, size_t);
typedef struct Reorder_method_s
{
const char* name;
const char* description;
static struct
{
char* input;
char* cachefile;
int bzip;
int cache;
int level;
int pairs;
int sort;
int test;
int window;
int verbose;
} state;
/*
* allocate an i-length vector of size_t
*/
static size_t*
vector(int i)
{
register size_t* p;
return p;
}
/*
* allocate an i X i matrix of size_t
*/
static size_t**
matrix(int i)
{
register size_t** p;
register size_t* v;
register int k;
size_t n;
n = i * i;
v = (size_t*)(p + i);
for (k = 0; k < i; k++)
{
p[k] = v;
v += i;
}
return p;
}
/*
* allocate an i X i partition workspace
*/
static Part_t*
partition(int i)
{
register Part_t* p;
register size_t* v;
register int k;
size_t n;
n = i * i;
v = (size_t*)(p + n);
for (k = 0; k < n; k++)
{
p[k].member = v;
v += i;
}
return p;
}
/*
* order frequencies hi to lo
*/
static int
{
return 1;
return -1;
return -1;
return 1;
return 0;
}
/*
* order column lo to hi
*/
static int
{
return -1;
return 1;
return 0;
}
/*
* order partition rate lo to hi
*/
static int
{
return -1;
return 1;
return -1;
return 1;
return 0;
}
/*
* order row lo to hi
*/
static int
{
}
/*
* dump one partition line with label to sp
*/
static void
{
register int i;
}
/*
* return the compressed size of buffer b of size bytes
*/
static size_t
{
unsigned long used;
unsigned int dest;
static unsigned char* buf;
static int bufsize;
{
}
{
}
else
{
}
return used;
}
/*
* copy col i through col j from s into t
* and return the compressed size
*/
static size_t
{
register unsigned char* b;
register unsigned char* e;
register int n;
b = t;
e = s + tot;
n = j - i + 1;
for (s += i; s < e; s += row)
{
memcpy(b, s, n);
b += n;
}
return zip(t, b - t);
}
/*
* copy col i and col j from s into t
* and return the compressed size
*/
static size_t
{
register unsigned char* b;
register unsigned char* e;
b = t;
e = s + tot;
j -= i;
for (s += i; s < e; s += row)
{
*b++ = *s;
*b++ = *(s + j);
}
return zip(t, b - t);
}
/*
* return the compressed size for partition pp
*/
static size_t
{
register unsigned char* b;
register unsigned char* e;
register int i;
b = t;
e = s + tot;
for (; s < e; s += row)
return zip(t, b - t);
}
/*
* merge col i with part pp and keep the best compression in np
* (np+1) is used as tmp workspace
* return 1 if the best merge is better than i and pp separately
*/
static int
merge(unsigned char* t, unsigned char* s, int i, Part_t* pp, register Part_t* np, size_t** siz, size_t row, size_t tot)
{
register int j;
register int k;
size_t x;
size_t z;
k = 0;
bp = 0;
for (i = 0;;)
{
if (z < x)
{
for (j = 0; j < k; j++)
}
if (++i >= k)
break;
}
if (bp)
{
for (j = 0; j < k; j++)
return 1;
}
return 0;
}
/*
* filter out the high frequency columns from dat
*/
static size_t
filter(Sfio_t* ip, unsigned char** bufp, unsigned char** datp, Pz_t* pz, int high, int maxhigh, size_t row, size_t tot)
{
register int i;
register int j;
char* q;
unsigned char* s;
unsigned char* t;
unsigned char* dat;
unsigned char* buf;
int c;
size_t n;
ssize_t r;
unsigned long freq;
{
if (pz)
{
}
else
{
{
int column;
unsigned long frequency;
error_info.line = 0;
{
error_info.line++;
if (*q == 'f')
{
if (sfsscanf(q + 1, "%d %d %lu", &c, &column, &frequency) != 3 || c < 0 || c >= row || column < 0 || column >= row)
}
}
for (j = 0; j < row; j++)
error_info.file = 0;
error_info.line = 0;
}
else
{
{
if (high < 0)
else if (high > 0)
}
s = dat;
for (i = 0; i < rows; i++)
for (j = 0; j < row; j++)
{
}
n = rows;
{
r /= row;
for (i = 0; i < r; i++)
for (j = 0; j < row; j++)
{
}
if ((n += r) > 10000000L)
{
/*
* that'll do pig
*/
r = 0;
break;
}
}
if (r < 0)
else if (r)
for (j = 0; j < row; j++)
{
error_info.line = 0;
else
{
for (j = 0; j < row; j++)
{
error_info.line++;
}
}
error_info.file = 0;
error_info.line = 0;
}
}
if (high < 0)
{
for (j = 0; j < row; j++)
break;
high = j;
}
if (maxhigh < 0)
{
}
else if (maxhigh > 0)
{
}
/*
* cut the original data layout some slack
* by using the original column order
*/
for (j = 0; j < high; j++)
for (j = 0; j < row; j++)
for (j = 0; j < high; j++)
}
s = dat;
t = buf;
for (i = 0; i < rows; i++)
{
for (j = 0; j < high; j++)
s += row;
}
}
else
{
for (i = 0; i < row; i++)
}
return row;
}
/*
* permute state.map and dat according to a new order
*/
static void
{
register int i;
unsigned char* end;
for (i = 0; i < row; i++)
for (i = 0; i < row; i++)
{
for (i = 0; i < row; i++)
}
}
/*
* stuff the partion group labels in lab
*/
static int
{
int i;
if (i >= beg)
return label;
}
/*
* list the final partition on sp
*/
static void
{
register int i;
register int j;
register int g;
g = -1;
for (i = 0; i < row; i++)
{
if (g != lab[i])
{
g = lab[i];
}
else
if (j > (i + 2))
{
i = j - 1;
}
}
}
/*
* search for a good initial ordering
*/
static void
reorder_heuristic(Reorder_method_t* method, unsigned char* buf, unsigned char* dat, int* lab, size_t row, size_t tot)
{
register int i;
register int j;
register int k;
char* s;
int ii;
int jj;
size_t y;
size_t z;
/*
* fill in the pair compression size matrix
* and the initial partition candidate list
*/
k = 2;
/*
* check the cache
*/
{
error_info.line = 0;
{
error_info.line++;
if (*s == 'p')
{
if (sfsscanf(s + 1, "%d %d %I*u", &ii, &jj, sizeof(z), &z) != 3 || ii < 0 || jj < 0 || state.pairs && (ii >= state.pairs || jj >= state.pairs))
{
}
continue;
else
{
cp++;
}
}
}
error_info.file = 0;
error_info.line = 0;
}
sp = 0;
for (i = 0; i < row; i++)
if (!hit[i])
{
hit[i] = 1;
{
error_info.line = 0;
{
error_info.line++;
}
}
if (sp)
{
error_info.line++;
}
for (j = 0; j < i; j++)
{
if (z <= y)
{
ii = i;
jj = j;
}
else
{
z = y;
ii = j;
jj = i;
}
{
cp++;
if (sp)
{
error_info.line++;
}
}
}
}
error_info.file = 0;
error_info.line = 0;
/*
* coalesce the partitions
*/
for (;;)
{
if (k == 2)
{
for (i = 0; i < row; i++)
{
if (!(z = siz[i][j]))
z = siz[j][i];
if (z)
{
}
}
}
else
{
ii = 0;
if (i != j)
}
{
bp = 0;
for (i = 0; i < row; i++)
{
if (hit[i] > k)
continue;
z = 0;
jj = 0;
z = 2;
else
{
z = 2;
z = 1;
}
if (z != 1)
continue;
{
}
{
}
}
if (bp)
{
np++;
}
else
}
{
{
fp++;
}
}
break;
k++;
}
/*
* collect the order in ord and the partition labels in lab
*/
j = 0;
jj = 0;
k = 0;
{
if (!jj)
{
jj = 1;
j++;
}
}
for (i = 0; i < row; i++)
if (hit[i])
{
ord[k++] = i;
lab[i] = ++j;
}
/*
* permute state.map and dat according to the new order
*/
/*
* clean up
*/
}
/*
* tsp ordering
*/
static void
reorder_tsp(Reorder_method_t* method, unsigned char* buf, unsigned char* dat, int* lab, size_t row, size_t tot)
{
#if TSP_VERSION
size_t i;
size_t j;
Tsp_cost_t** cost;
Tsp_cost_t* v;
for (i = 0; i < row; i++)
{
cost[i] = v;
v += row;
}
/*
* compute the Tsp_cost_t matrix
*/
for (i = 0; i < row; i++)
for (i = 0; i < row; i++)
for (j = 0; j < row; j++)
if (i != j)
{
}
/*
* generate a tour
*/
/*
* break tour at most expensive link; put order into self
*/
for (i = 0; i < end; i++)
{
breakat = i;
}
j = 0;
for (i = 0; i <= breakat; i++)
/*
* permute state.map and dat according to the new order
*/
/*
* partition
*/
{
/*
* partition according to the initial tsp cycles
*/
for (i = 0; i < row; i++)
}
else
{
/*
* partition according to dependence along tour
*/
for (i = j = 0; i < end; i++)
{
lab[i] = j;
j++;
}
lab[i] = j;
}
/*
* clean up
*/
#else
#endif
}
/*
* dynamic program to find optimal partition based on zip sizes in val
*/
static void
optimize_dynamic(Optimize_method_t* method, unsigned char* buf, unsigned char* dat, size_t** val, size_t* cst, size_t* sol, size_t row, size_t tot, size_t grp)
{
int i;
int j;
int k;
/*
* fill in the field compress value matrix
*/
for (i = 0; i < row; i++)
{
if (grp <= 0)
k = row;
k = row;
for (j = i; j < k; j++)
for (; j < row; j++)
val[i][j] = ~0;
}
/*
* now run the dynamic program
*/
for (i = 0; i < row; i++)
{
sol[i] = -1;
for (j = 0; j < i; j++)
{
{
sol[i] = j;
}
}
}
}
/*
* greedy algorithm to approximate optimal partition based on zip sizes in val
*/
static void
optimize_greedy(Optimize_method_t* method, unsigned char* buf, unsigned char* dat, size_t** val, size_t* cst, size_t* sol, size_t row, size_t tot, size_t grp)
{
int i;
sol[0] = -1;
for (i = 1; i < row; i++)
{
sol[i] = i-1;
}
else {
}
}
}
/*
* transitive greedy algorithm to approximate optimal partition
* based on zip sizes in val
*/
static void
optimize_transitive(Optimize_method_t* method, unsigned char* buf, unsigned char* dat, size_t** val, size_t* cst, size_t* sol, size_t row, size_t tot, size_t grp)
{
int i;
sol[0] = -1;
for (i = 1; i < row; i++)
{
{
}
else
{
sol[i] = i-1;
}
}
}
/*
* optimization (partitioning) method table
* the first entry is the default
*/
static Optimize_method_t optimize_methods[] =
{
{
"dynamic",
"dynamic program optimal partition",
},
{
"greedy",
"greedy approximation partition",
},
{
"none",
"no partition",
0
},
{
"transitive",
"transitive greedy approximation partition",
},
};
/*
* reorder method table
* the first entry is the default
*/
static Reorder_method_t reorder_methods[] =
{
{
"heuristic",
"heuristic reorder",
},
{
"none",
"no reordering",
0
},
{
"tsp",
"tsp reordering",
},
};
/*
* optget() info discipline function
*/
static int
{
register int i;
register int n;
n = 0;
if (streq(s, "optimize_default"))
else if (streq(s, "optimize_methods"))
for (i = 0; i < elementsof(optimize_methods); i++)
else if (streq(s, "reorder_default"))
else if (streq(s, "reorder_methods"))
for (i = 0; i < elementsof(reorder_methods); i++)
return n;
}
int
{
unsigned char* dat;
unsigned char* buf;
char* s;
int i;
ssize_t n;
ssize_t m;
int* lab;
int op = 0;
int high = -10;
int maxhigh = -40;
int maxgrp = 0;
char* partition = 0;
for (;;)
{
{
case 'b':
continue;
case 'c':
continue;
case 'g':
continue;
case 'h':
if (*s == '%')
{
s++;
}
if (*s)
continue;
case 'l':
continue;
case 'm':
if (*s == '%')
{
s++;
}
if (*s)
continue;
case 'o':
continue;
case 'p':
continue;
case 'r':
continue;
case 'v':
continue;
case 'w':
continue;
case 'O':
for (i = 0;; i++)
{
if (i >= elementsof(optimize_methods))
{
break;
}
{
optimize_method = &optimize_methods[i];
break;
}
}
continue;
case 'Q':
continue;
case 'R':
for (i = 0;; i++)
{
if (i >= elementsof(reorder_methods))
{
break;
}
{
reorder_method = &reorder_methods[i];
break;
}
}
continue;
case 'S':
continue;
case 'T':
continue;
case 'V':
continue;
case 'X':
continue;
case '?':
continue;
case ':':
else
continue;
}
break;
}
/*
* set up the workspace
*/
if (optimize_method->fun)
op |= OP_optimize;
if (reorder_method->fun)
op |= OP_reorder;
{
row = 0;
dat = 0;
buf = 0;
}
for (;;)
{
ip = 0;
{
if (partition)
{
}
else
{
if (!row)
{
if (m > 0)
{
row = m;
}
}
pz = 0;
}
}
return 1;
break;
{
m = 0;
row = 0;
}
if (!n)
{
return 0;
}
if (pz)
else if (ip)
return 0;
row = 0;
}
{
}
/*
* set up the cache file
*/
s++;
else
{
}
/*
* filter out the high frequency columns
*/
{
}
else
/*
* determine a better ordering
*/
if (op & OP_reorder)
/*
* generate a partition based on the ordering
*/
if (op & OP_optimize)
{
/*
* allocate the tmp workspace
*/
/*
* partition
*/
/*
* gather the optimal partition group labels in lab
*/
/*
* clean up
*/
}
/*
* finished -- list the partition
*/
if (!(op & OP_reorder))
if (maxgrp)
if (high)
else
return 0;
}