/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1985-2010 AT&T Intellectual Property *
* and is licensed under the *
* Common Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.opensource.org/licenses/cpl1.0.txt *
* (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* David Korn <dgk@research.att.com> *
* Phong Vo <kpv@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* determine record format by sampling data in <buf,size>
* total is the total file size, <=0 if not available
* return r:
* -1 could not determine
* RECTYPE(r)==REC_fixed fixed length REC_F_SIZE(r)
* RECTYPE(r)==REC_delimited variable length delimiter=REC_D_DELIMITER(r)
* RECTYPE(r)==REC_variable variable length
*/
#include <recfmt.h>
/*
 * scratch tables used by recfmt() for the fixed length record check
 */

typedef struct Sample_s
{
	/* rep[d]: number of times a byte recurred d positions after its previous occurrence */
	unsigned int	rep[4 * 1024];
	/* hit[c]: sample index at which byte value c was last seen */
	unsigned int	hit[UCHAR_MAX + 1];
} Sample_t;
/*
 * guess the record format of the data sample <buf,size>
 *
 *	buf	sample data
 *	size	number of bytes in buf
 *	total	total file size, <=0 if not available
 *
 * three checks are tried in order and the first match is returned:
 *
 *	(1) a chain of 4 byte headers, each a 2 byte big-endian length
 *	    followed by 2 zero bytes		=> REC_V_TYPE(4,0,2,0,1)
 *	(2) one of the terminator bytes recurring at a constant period
 *	    through the whole sample		=> REC_D_TYPE(terminator)
 *	(3) the repetition distance with the best byte-for-byte match
 *	    between consecutive slices		=> REC_F_TYPE(distance)
 *
 * REC_N_TYPE() is returned if no check matches or if the sample
 * tables cannot be allocated
 */

Recfmt_t
recfmt(const void* buf, size_t size, off_t total)
{
	unsigned char*		s;
	unsigned char*		t;
	Sample_t*		q;
	unsigned int*		h;
	unsigned int		i;
	unsigned int		j;
	unsigned int		k;
	unsigned int		n;
	unsigned int		m;
	unsigned int		x;
	unsigned long		f;
	unsigned long		g;

	/* NOTE(review): 0x15 and 0x25 are presumably the EBCDIC newline/linefeed codes -- confirm */
	static const unsigned char	terminators[] = { '\n', 0x15, 0x25 };

	/*
	 * check for V format
	 *
	 * hop from header to header for as long as bytes 2 and 3 of the
	 * header are zero and the length stays within the sample;
	 * k is the number of sample bytes not yet consumed
	 */

	s = (unsigned char*)buf;
	t = s + size;
	while ((k = (t - s)) >= 4 && !s[2] && !s[3])
	{
		i = (s[0]<<8)|s[1];

		/*
		 * a zero length would leave s unchanged and spin here
		 * forever (e.g. a sample that starts with 4 zero bytes)
		 */

		if (!i || i > k)
			break;
		s += i;
	}

	/*
	 * V format if the chain consumed the whole sample or more than
	 * half of it; an empty sample (k==0) is also reported as V format
	 */

	if (!k || size > 2 * k)
		return REC_V_TYPE(4, 0, 2, 0, 1);
	s = (unsigned char*)buf;

	/*
	 * check for terminated records
	 *
	 * the first terminator found in the first half of the sample
	 * gives a candidate period n; n must divide total (when known)
	 * and the same terminator must end every n byte slice
	 */

	for (i = 0; i < elementsof(terminators); i++)
	{
		k = terminators[i];
		if (!(t = (unsigned char*)memchr(s, k, size / 2)))
			continue;
		n = t - s + 1;
		if (n <= 1 || (total > 0 && (total % n)))
			continue;
		for (j = n - 1; j < size; j += n)
			if (s[j] != k)
			{
				n = 0;
				break;
			}
		if (n)
			return REC_D_TYPE(terminators[i]);
	}

	/*
	 * check fixed length record frequencies
	 *
	 * newof() is expected to return zeroed tables -- the counts
	 * below rely on it
	 */

	if (!(q = newof(0, Sample_t, 1, 0)))
		return REC_N_TYPE();

	/*
	 * for each byte record the distance m back to the previous
	 * occurrence of the same value; distances that fit in rep[]
	 * are counted and x tracks the largest one counted
	 */

	x = 0;
	for (i = 0; i < size; i++)
	{
		h = q->hit + s[i];
		m = i - *h;
		*h = i;
		if (m < elementsof(q->rep))
		{
			if (m > x)
				x = m;
			q->rep[m]++;
		}
	}

	/*
	 * score the candidate lengths from x down to 2
	 *
	 * a candidate must divide total (when known) and have more
	 * repetitions than the current best n; g counts the bytes that
	 * differ between each i byte slice and the slice before it,
	 * scaled by i and by rep[i]; the lowest score f wins and the
	 * <= lets a later (smaller) i win a tie
	 *
	 * m counts the candidates that were scored
	 */

	n = 0;
	m = 0;
	f = ~0UL;
	for (i = x; i > 1; i--)
	{
		if ((total <= 0 || !(total % i)) && q->rep[i] > q->rep[n])
		{
			m++;
			g = 0;
			for (j = i; j < size - i; j += i)
				for (k = 0; k < i; k++)
					if (s[j + k] != s[j + k - i])
						g++;

			/* rep[i] > rep[n] >= 0 here so the divisor is never 0 */

			g = (((g * 100) / i) * 100) / q->rep[i];
			if (g <= f)
			{
				f = g;
				n = i;
			}
		}
	}

	/*
	 * weak evidence on a small file (1 < total < 256): treat the
	 * whole file as one fixed record unless the sample contains a
	 * terminator byte, in which case the format is undetermined
	 */

	if (m <= 1 && n <= 2 && total > 1 && total < 256)
	{
		n = 0;
		for (i = 0; i < size; i++)
			for (j = 0; j < elementsof(terminators); j++)
				if (s[i] == terminators[j])
					n++;
		n = n ? 0 : total;
	}
	free(q);
	return n ? REC_F_TYPE(n) : REC_N_TYPE();
}
#if MAIN

/*
 * test driver, compiled only when MAIN is defined
 *
 * the block of standard input returned by sfreserve() is used as the
 * sample and sfsize() of standard input as the total size; the
 * recfmt() result is printed on standard output as a decimal number
 *
 * exit status is 1 if sfreserve() returns no data, 0 otherwise
 */

int
main(void)
{
	void*	s;
	size_t	size;
	off_t	total;

	if (!(s = sfreserve(sfstdin, SF_UNBOUND, 0)))
	{
		sfprintf(sfstderr, "read error\n");
		return 1;
	}

	/* sfvalue() is read right after sfreserve() to get the sample size */

	size = sfvalue(sfstdin);
	total = sfsize(sfstdin);
	sfprintf(sfstdout, "%d\n", recfmt(s, size, total));
	return 0;
}

#endif