split.c revision 3f54fd611f536639ec30dd53c48e5ec1897cc7d9
/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1989-2011 AT&T Intellectual Property *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.eclipse.org/org/documents/epl-v10.html *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* split.c
* David Korn
* AT&T Research
*/
static const char split_usage[] =
"[-?\n@(#)$Id: split (AT&T Research) 2006-09-19 $\n]"
USAGE_LICENSE
"[+NAME?split - split files into pieces]"
"[+DESCRIPTION?\bsplit\b reads an input file and writes one or more"
" output files so that \bcat\b(1) on these files will produce"
" the input file. The default size for each piece is 1000 lines."
" The suffix consists of \asuffix_len\a lower case characters"
" from the POSIX locale.]"
"[+?If \aprefix\a is specified it will be used as a prefix for each"
" of the resulting files from the split operation. If \aprefix\a"
" is specified, the prefix \bx\b will be used.]"
"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bsplit\b"
" copies from standard input starting at the current location.]"
"[+?The option arguments for \b-b\b and \b-C\b can optionally be followed"
" by one of the following characters to specify a different"
" unit other than a single byte:]{"
" [+b?512 bytes.]"
" [+k?1-killobytes.]"
" [+m?1-megabyte.]"
" [+g?1-gigabyte.]"
" [+t?1-terabyte.]"
" }"
"[+?For backwards compatibility, \b-\b\aline_count\a is equivalent to"
" \b-l\b \aline_count\a.]"
"[l:lines]#[line_count:=1000?\aline_count\a specified the number of lines"
" for each piece except the last. If the input does not end in"
" a newline, the partial line is included in the last piece.]"
"[a|n:suffix-length]#[suffix_len:=2?\asuffix_len\a defines the number of"
" letters that form the suffix portion of the file names for each of"
" the pieces that the file is split into.]"
"[b:bytes]#[n?Splits the file into byte size pieces defined by \an\a"
" rather than lines.]"
"[C:line-bytes]#[n?Splits the file into lines totaling a most \an\a bytes.]"
"\n"
"\n[ file [ name ] ]\n"
"\n"
"[+EXIT STATUS]{"
" [+0?Successful completion.]"
" [+>0?An error occurred.]"
"}"
"[+SEE ALSO? \bcsplit\b(1), \bcat\b(1)]"
;
static const char csplit_usage[] =
"[-?\n@(#)$Id: csplit (AT&T Research) 2003-08-21 $\n]"
USAGE_LICENSE
"[+NAME?csplit - split a file into sections determined by context lines]"
"[+DESCRIPTION?\bcsplit\b creates zero or more output files containing"
" sections of the given input \afile\a, or the standard input if the"
" name \b-\b is given. By default, \bcsplit\b prints the number of"
" bytes written to each output file after it has been created.]"
"[+?The contents of the output files are determined by the \apattern\a"
" arguments. An error occurs if a pattern argument refers to a"
" nonexistent line of the input file, such as if no remaining line"
" matches a given regular expression. After all the given patterns have"
" been matched, any remaining output is copied into one last output"
" file. The types of pattern arguments are:]{"
" [+line?Create an output file containing the current line up"
" to (but not including) line \aline\a (a positive"
" integer) of the input file. If followed by a repeat"
" count, also create an output file containing the"
" next \aline\a lines of the input file once for each"
" repeat.]"
" [+/regexp/[offset]]?Create an output file containing the"
" current line up to (but not including) the next line"
" of the input file that contains a match for"
" \aregexp\a. The optional \aoffset\a is a \b+\b or"
" \b-\b followed by a positive integer. If it is given,"
" the input up to the matching line plus or minus"
" \aoffset\a is put into the output file, and the line"
" after that begins the next section of input.]"
" [+%regexp%[offset]]?Like the previous type, except that it"
" does not create an output file, so that section of"
" the input file is effectively ignored.]"
" [+{repeat-count}?Repeat the previous pattern \arepeat-count\a"
" (a positive integer) additional times. An asterisk"
" may be given in place of the (integer) repeat count,"
" in which case the preceeding pattern is repeated as"
" many times as necessary until the input is exausted.]"
" }"
"[+?The output file names consist of a prefix followed by a suffix. By"
" default, the suffix is merely an ascending linear sequence of two-digit"
" decimal numbers starting with 00 and ranging up to 99, however this"
" default may be overridden by either the \b--digits\b option or by the"
" \b--suffix-format\b option (see below.) In any case, concatenating"
" the output files in sorted order by file name produces the original"
" input file, in order. The default output file name prefix is \bxx\b.]"
"[+?By default, if \bcsplit\b encounters an error or receives a hangup,"
" interrupt, quit, or terminate signal, it removes any output files"
" that it has created so far before it exits.]"
"[b:suffix-format?Use the \bprintf\b(3) \aformat\a to generate the file"
" name suffix.]:[format:=\b%02d\b]"
"[f:prefix?Use \aprefix\a to generate the file name prefix.]:[prefix:=\bxx\b]"
"[k:keep-files?Do not remove output files on errors.]"
"[a|n:digits?Use \adigits\a in the generated file name suffixes.]#[digits:=2]"
"[s:silent|quiet?Do not print output file counts and sizes.]"
"[z:elide-empty-files?Remove empty output files.]"
"\n"
"\nfile arg ...\n"
"\n"
"[+EXIT STATUS?]{"
" [+0?Successful completion.]"
" [+>0?An error occurred.]"
"}"
"[+SEE ALSO? \bsplit\b(1), \bcat\b(1)]"
;
#include <cmd.h>
#include <regex.h>
#define S_FLAG 001
#define K_FLAG 002
#define C_FLAG 004
#define B_FLAG 010
#define Z_FLAG 020
#define M_FLAG 040
#define OP_LINES 0
#define OP_SEARCH 1
#define OP_SKIP 2
#define OP_ABSOLUTE 3
#define BLK_SIZE 2048
struct fname
{
char* fname;
char* format;
char* suffix;
char* last;
char low;
char high;
int count;
};
struct op
{
struct op* next;
Sfoff_t size;
size_t repeat;
int flags;
regex_t* re;
};
/*
* create an operation structure
*/
static struct op*
getop(struct op** prev, Sfoff_t size, size_t repeat, int flags, int re)
{
struct op* op;
if (op = newof(0, struct op, 1, re ? sizeof(regex_t) : 0))
{
op->repeat = repeat;
op->flags = flags;
op->size = size;
op->next = 0;
if (re)
op->re = (regex_t*)(op + 1);
*prev = op;
}
return op;
}
/*
* process /expr/offset arguments
* returns new operation structure which is added to linked list
*/
static struct op*
getexpr(struct op** prev, const char* arg)
{
char* cp = (char*)arg;
char* ep;
int n;
struct op* op;
if (op = getop(prev, 0, 1, *cp == '/' ? OP_SEARCH : OP_SKIP, 1))
{
if (n = regcomp(op->re, cp, REG_DELIMITED|REG_NOSUB))
{
regfatal(op->re, 2, n);
return 0;
}
cp += op->re->re_npat;
if (*cp)
{
op->size = strtoll(cp, &ep, 10);
if (*ep)
error(ERROR_exit(1), "%s: invalid offset", cp);
}
}
return op;
}
/*
* set up file name generator whose form is <prefix>... where ... is
* suflen characters from low..high
* returns a pointer to a structure that can be used to create
* file names
*/
static struct fname*
setfname(const char* prefix, char* format, int suflen, int low, int high)
{
struct fname* fp;
int flen;
int slen;
int len;
char* cp;
flen = strlen(prefix);
len = flen + suflen + 1;
if (format)
{
slen = strlen(format);
len += flen + slen + 1;
}
else
slen = 0;
if (fp = newof(0, struct fname, 1, len))
{
cp = (char*)(fp + 1);
if (format)
{
strcpy(fp->format = cp, prefix);
cp += flen;
strcpy(cp, format);
cp += slen + 1;
}
fp->low = low;
fp->high = high;
fp->count = 0;
strcpy(fp->fname = cp, prefix);
cp += flen;
fp->suffix = cp;
while (suflen-- > 0)
*cp++ = low;
*cp-- = 0;
fp->last = cp;
(*cp)--;
flen = _POSIX_NAME_MAX;
if (cp = strrchr(fp->fname, '/'))
{
cp++;
len = strlen(cp);
if (len > flen)
{
*(cp - 1) = 0;
flen = (int)strtol(astconf("NAME_MAX", fp->fname, NiL), NiL, 0);
*(cp - 1) = '/';
}
}
else
{
cp = fp->fname;
if (len > flen)
flen = (int)strtol(astconf("NAME_MAX", ".", NiL), NiL, 0);
}
if (len > flen)
error(ERROR_exit(1), "%s: filename too long", prefix);
}
return fp;
}
/*
* return next sequential file name
*/
static char*
getfname(struct fname* fp)
{
register char* cp = fp->last;
if (fp->format)
return sfprints(fp->format, fp->count++);
while (++(*cp) > fp->high)
{
*cp-- = fp->low;
if (cp < fp->suffix)
{
error(0, "file limit reached");
return 0;
}
}
fp->count++;
return fp->fname;
}
/*
* remove all generated files
*/
static void
removeall(struct fname* fp)
{
register char* cp = fp->suffix;
while (*cp)
*cp++ = fp->low;
*(cp - 1) -= 1;
while (fp->count-- > 0)
{
remove(getfname(fp));
fp->count--;
}
fp->count = 0;
}
static int
msize(Sfio_t* in, long len)
{
Sfoff_t off = sftell(in);
register char* cp;
register char* dp;
register long m;
register long n = len;
register long nlen = 0;
if (sfsize(in) - off <= len)
return len;
while (nlen == 0 && n > 0)
{
n -= BLK_SIZE;
if (n < 0)
n = 0;
sfseek(in, off + n, SEEK_SET);
if (!(dp = cp = sfreserve(in, BLK_SIZE, 0)))
return len;
m = BLK_SIZE;
while (m-- > 0)
{
if (*cp++ == '\n')
nlen = n + (cp - dp);
}
}
if (n > 0)
sfseek(in, off, SEEK_SET);
return nlen ? nlen : len;
}
static int
split(Sfio_t* in, struct fname* fp, struct op* op, int flags)
{
register char* cp;
register char* s;
Sfoff_t len;
Sfoff_t z;
Sfoff_t size;
size_t repeat;
int c;
register Sfio_t* out = 0;
register char* peek = 0;
register long n = 0;
int delim = (flags & B_FLAG) ? -1 : '\n';
size_t lineno = 1;
while (op)
{
if (op->flags == OP_LINES)
len = op->size;
repeat = op->repeat;
do
{
if (op->flags != OP_SKIP)
{
if (!(cp = getfname(fp)))
goto err;
if (!(out = sfopen(NiL, cp, "w")))
{
fp->count--;
error(ERROR_SYSTEM|2, "%s: cannot create", cp);
goto err;
}
}
if (op->flags == OP_ABSOLUTE || op->flags == OP_LINES)
{
if (op->flags == OP_ABSOLUTE)
len = op->size - lineno;
if (peek)
{
if ((n = sfputr(out, peek, delim)) <= 0)
goto done;
peek = 0;
if (len > 0)
len--;
lineno++;
}
if (len)
{
z = (flags & M_FLAG) ? msize(in, len) : len;
if ((n = sfmove(in, out, z, delim)) < z || n < 0)
goto done;
lineno += n;
}
}
else
{
if (peek)
{
if (out && (n = sfputr(out, peek, delim)) <= 0)
goto done;
lineno++;
peek = 0;
}
while (s = sfgetr(in, delim, 1))
{
if (!(c = regexec(op->re, s, 0, NiL, 0)))
break;
lineno++;
if (c != REG_NOMATCH)
{
regfatal(op->re, 2, c);
goto err;
}
if (out && (n = sfputr(out, s, delim)) <= 0)
goto done;
}
if (!(peek = s))
{
while (op->next)
op = op->next;
repeat = 1;
}
}
if (out)
{
size = sfseek(out, (Sfoff_t)0, SEEK_END);
if (!(flags & S_FLAG))
sfprintf(sfstdout, "%I*d\n", sizeof(size), size);
sfclose(out);
out = 0;
if ((flags & Z_FLAG) && size <= 0)
remove(cp);
}
} while (!repeat || --repeat);
op = op->next;
}
done:
if (out)
{
sfclose(out);
if (n <= 0)
remove(cp);
}
if (n >= 0)
return 0;
err:
if (!(flags & K_FLAG))
removeall(fp);
return 1;
}
int
main(int argc, char** argv)
{
struct fname* fp;
struct op* top;
char* cp;
char* prefix;
const char* usage;
Sfio_t* in;
int flags;
ssize_t n;
char* format = 0;
Sfoff_t size = 10000;
int suflen = 2;
if (cp = strrchr(*argv, '/'))
cp++;
else
cp = *argv;
error_info.id = cp;
if (streq(cp, "split"))
{
usage = split_usage;
flags = S_FLAG|K_FLAG;
prefix = "x";
}
else
{
usage = csplit_usage;
flags = C_FLAG;
prefix = "xx";
}
for (;;)
{
switch (optget(argv, usage))
{
case 0:
break;
case 'l':
flags &= ~(B_FLAG|M_FLAG);
if ((size = opt_info.number) <= 0)
error(1, "%s: invalid size", opt_info.arg);
continue;
case 'k':
flags |= K_FLAG;
continue;
case 's':
flags |= S_FLAG;
continue;
case 'z':
flags |= Z_FLAG;
continue;
case 'f':
prefix = opt_info.arg;
continue;
case 'a':
case 'n':
suflen = opt_info.num;
continue;
case 'C':
flags |= M_FLAG;
case 'b':
if (flags & S_FLAG)
{
if ((size = opt_info.number) <= 0)
error(1, "%s: invalid size", opt_info.arg);
flags |= B_FLAG;
}
else
format = opt_info.arg;
continue;
case ':':
error(2, "%s", opt_info.arg);
break;
case '?':
error(ERROR_usage(2), "%s", opt_info.arg);
break;
}
break;
}
argv += opt_info.index;
argc -= opt_info.index;
if (error_info.errors || !(flags & C_FLAG) && argc > 2 || (flags & C_FLAG) && argc < 2)
error(ERROR_usage(2), "%s", optusage(NiL));
cp = *argv++;
if (flags & C_FLAG)
{
struct op* op = 0;
char* sp;
while (sp = *argv++)
{
switch (*sp)
{
case '/':
case '?':
case '%':
op = getexpr(op ? &op->next : &top, sp);
break;
case '{':
if (!op)
error(ERROR_exit(1), "%s: pattern expected for repeat count", *(argv - 1));
if (*++sp == '*' && *(sp + 1) == '}' && !*(sp + 2))
op->repeat = 0;
else
{
if ((n = strtol(sp, &sp, 10)) <= 0 || *sp != '}' || *(sp + 1))
error(ERROR_exit(1), "%s: invalid repeat count", *(argv - 1));
op->repeat = n + 1;
}
if (op->flags == OP_ABSOLUTE)
op->flags = OP_LINES;
break;
default:
if ((size = strtoll(sp, &sp, 10)) <= 0 || *sp)
error(ERROR_exit(1), "%s: invalid line number", *(argv - 1));
op = getop(op ? &op->next : &top, size, 1, OP_ABSOLUTE, 0);
break;
}
}
op = getop(op ? &op->next : &top, SF_UNBOUND, 1, OP_LINES, 0);
fp = setfname(prefix, format, suflen, '0', '9');
}
else
{
if (cp && *argv)
prefix = *argv;
getop(&top, size, SF_UNBOUND, OP_LINES, 0);
fp = setfname(prefix, format, suflen, 'a', 'z');
}
if (!cp || streq(cp, "-"))
in = sfstdin;
else if (!(in = sfopen(NiL, cp, "r")))
error(ERROR_system(1), "%s: cannot open", cp);
n = split(in, fp, top, flags);
if (in != sfstdin)
sfclose(in);
return n;
}