cut.c revision 3e14f97f673e8a630f076077de35afdd43dc1587
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin/***********************************************************************
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* This software is part of the ast package *
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner* Copyright (c) 1992-2010 AT&T Intellectual Property *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* and is licensed under the *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* Common Public License, Version 1.0 *
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin* by AT&T Intellectual Property *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* A copy of the License is available at *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* Information and Software Systems Research *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* AT&T Research *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* Florham Park NJ *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* Glenn Fowler <gsf@research.att.com> *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin* David Korn <dgk@research.att.com> *
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin***********************************************************************/
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * David Korn
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * AT&T Bell Laboratories
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * cut fields or columns from each line of a file
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chinstatic const char usage[] =
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner"[-?\n@(#)$Id: cut (AT&T Research) 2009-12-04 $\n]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[+NAME?cut - cut out selected columns or fields of each line of a file]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[+DESCRIPTION?\bcut\b bytes, characters, or character-delimited fields "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "from one or more files, concatenating them on standard output.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[+?The option argument \alist\a is a comma-separated or blank-separated "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "list of positive numbers and ranges. Ranges can be of three "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "forms. The first is two positive integers separated by a hyphen "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "(\alow\a\b-\b\ahigh\a), which represents all fields from \alow\a to "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "\ahigh\a. The second is a positive number preceded by a hyphen "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "(\b-\b\ahigh\a), which represents all fields from field \b1\b to "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "\ahigh\a. The last is a positive number followed by a hyphen "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "(\alow\a\b-\b), which represents all fields from \alow\a to the "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "last field, inclusive. Elements in the \alist\a can be repeated, "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "can overlap, and can appear in any order. The order of the "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "output is that of the input.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[+?One and only one of \b-b\b, \b-c\b, or \b-f\b must be specified.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[+?If no \afile\a is given, or if the \afile\a is \b-\b, \bcut\b "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "cuts from standard input. The start of the file is defined "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "as the current offset.]"
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner"[b:bytes]:[list?\bcut\b based on a list of byte counts.]"
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner"[c:characters]:[list?\bcut\b based on a list of character counts.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[d:delimiter]:[delim?The field character for the \b-f\b option is set "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "to \adelim\a. The default is the \btab\b character.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[f:fields]:[list?\bcut\b based on fields separated by the delimiter "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "character specified with the \b-d\b option.]"
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner"[n!:split?Split multibyte characters selected by the \b-b\b option.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[R|r:reclen]#[reclen?If \areclen\a > 0, the input will be read as fixed length "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "records of length \areclen\a when used with the \b-b\b or \b-c\b "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "option.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[s:suppress|only-delimited?Suppress lines with no delimiter characters, "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "when used with the \b-f\b option. By default, lines with no "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "delimiters will be passed in untouched.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[D:line-delimeter|output-delimiter]:[ldelim?The line delimiter character for "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "the \b-f\b option is set to \aldelim\a. The default is the "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "\bnewline\b character.]"
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner"[N!:newline?Output new-lines at end of each record when used "
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "with the \b-b\b or \b-c\b option.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"\n[file ...]\n"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[+EXIT STATUS?]{"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "[+0?All files processed successfully.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin "[+>0?One or more files failed to open or could not be read.]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin"[+SEE ALSO?\bpaste\b(1), \bgrep\b(1)]"
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chintypedef struct Cut_s
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner#define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * compare the first of an array of integers
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulknermycomp(register const void* a, register const void* b)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (*((int*)a) < *((int*)b))
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (*((int*)a) > *((int*)b))
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulknercutinit(int mode, char* str, Delim_t* wdelim, Delim_t* ldelim, size_t reclen)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int* lp;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int c;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int n = 0;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int range = 0;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (!(cut = (Cut_t*)stakalloc(sizeof(Cut_t) + strlen(cp) * sizeof(int))))
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner memset(cut->space, 0, sizeof(cut->space) / 2);
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner memset(cut->space + sizeof(cut->space) / 2, SP_WIDE, sizeof(cut->space) / 2);
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner memset(cut->space, 0, sizeof(cut->space));
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner cut->eob = (ldelim->len == 1) ? ldelim->chr : 0;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner cut->nosplit = (mode&(C_BYTES|C_NOSPLIT)) == (C_BYTES|C_NOSPLIT) && cut->mb;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner switch(c = *cp++)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin case '\t':
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner /*FALLTHROUGH*/
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin register int *dp;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner qsort(lp=cut->list,n,2*sizeof(*lp),mycomp);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* eliminate overlapping regions */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* convert ranges into gaps */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* NOTREACHED */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * cut each line of file <fdin> and put results to <fdout> using the list in <cut>
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulknercutcols(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int c;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int len;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int ncol = 0;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register char* bp;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (!bp && !(bp = sfgetr(fdin, 0, SF_LASTR)))
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register const char* s = bp;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int z;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (!(*s & 0x80))
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner else if ((z = mblen(s, w)) <= 0)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register const char* s = bp;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int w = len;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register int z;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner while (w > 0 && ncol > 0)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (!(*s & 0x80) || (z = mblen(s, w)) <= 0)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (!cut->nlflag && (skip || must || cut->reclen))
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner sfwrite(fdout, cut->ldelim.str, cut->ldelim.len);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * cut each line of file <fdin> and put results to <fdout> using the list in <cut>
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin * stream <fdin> must be line buffered
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulknercutfields(Cut_t* cut, Sfio_t* fdin, Sfio_t* fdout)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin register unsigned char *cp;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register unsigned char *wp;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin register int c, nfields;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin register unsigned char *copy;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register unsigned char *ep;
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* process each buffer */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner while ((bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) && (c = sfvalue(fdin)) > 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* process each line in the buffer */
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* skip over non-delimiter characters */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner /* mb char possibly spanning buffer boundary -- fun stuff */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if ((c = cp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (!(bp = (unsigned char*)sfreserve(fdin, SF_UNBOUND, -1)) || (c = sfvalue(fdin)) <= 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* check for end-of-line */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner /* restore cut->last character */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if ((c = wp - copy) > 0 && sfwrite(fdout, (char*)copy, c) < 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* set to delimiter unless the first field */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if (copy && (c=cp-copy)>0 && (!nodelim || !cut->sflag) && sfwrite(fdout,(char*)copy,c)< 0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* see whether to save in tmp file */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner if(inword && nodelim && !cut->sflag && (c=cp-first)>0)
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin /* copy line to tmpfile in case no fields */
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulknerb_cut(int argc, char** argv, void* context)
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner register char* cp = 0;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner ldelim.chr = *(unsigned char*)opt_info.arg;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner wdelim.chr = *(unsigned char*)opt_info.arg;
3e14f97f673e8a630f076077de35afdd43dc1587Roger A. Faulkner error(ERROR_usage(2), "%s", opt_info.arg);
da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968chin error(3, "non-empty b, c or f option must be specified");