wclib.c revision 34f9b3eef6fdadbda0a846aa4d68691ac40eace5
/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1992-2009 AT&T Intellectual Property *
* and is licensed under the *
* Common Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.opensource.org/licenses/cpl1.0.txt *
* (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* David Korn <dgk@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* David Korn
* AT&T Bell Laboratories
*
* library interface for word count
*/
#include <cmd.h>
#include <wc.h>
#include <ctype.h>
#if _hdr_wchar && _hdr_wctype && _lib_iswctype
#include <wchar.h>
#include <wctype.h>
#include <lc.h>
#else
#ifndef iswspace
#define iswspace(x) isspace(x)
#endif
#endif
#define WC_SP 0x08
#define WC_NL 0x10
#define WC_MB 0x20
#define WC_ERR 0x40
#define eol(c) ((c)&WC_NL)
#define mbc(c) ((c)&WC_MB)
#define spc(c) ((c)&WC_SP)
#define mbwc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n)
Wc_t* wc_init(int mode)
{
register int n;
register int w;
Wc_t* wp;
if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
return 0;
if (!mbwide())
wp->mb = 0;
#if _hdr_wchar && _hdr_wctype && _lib_iswctype
else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
wp->mb = 1;
#endif
else
wp->mb = -1;
w = mode & WC_WORDS;
for (n = (1<<CHAR_BIT); --n >= 0;)
wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
wp->type['\n'] = WC_SP|WC_NL;
if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
{
for (n = 0; n < 64; n++)
{
wp->type[0x80+n] |= WC_MB;
if (n<32)
wp->type[0xc0+n] |= WC_MB+1;
else if (n<48)
wp->type[0xc0+n] |= WC_MB+2;
else if (n<56)
wp->type[0xc0+n] |= WC_MB+3;
else if (n<60)
wp->type[0xc0+n] |= WC_MB+4;
else if (n<62)
wp->type[0xc0+n] |= WC_MB+5;
}
wp->type[0xc0] = WC_MB|WC_ERR;
wp->type[0xc1] = WC_MB|WC_ERR;
wp->type[0xfe] = WC_MB|WC_ERR;
wp->type[0xff] = WC_MB|WC_ERR;
}
wp->mode = mode;
return wp;
}
static int invalid(const char *file, int nlines)
{
error_info.file = (char*)file;
error_info.line = nlines;
error(ERROR_SYSTEM|1, "invalid multibyte character");
error_info.file = 0;
error_info.line = 0;
return nlines;
}
/*
* handle utf space characters
*/
static int chkstate(int state, register unsigned int c)
{
switch(state)
{
case 1:
state = (c==0x9a?4:0);
break;
case 2:
state = ((c==0x80||c==0x81)?6+(c&1):0);
break;
case 3:
state = (c==0x80?5:0);
break;
case 4:
state = (c==0x80?10:0);
break;
case 5:
state = (c==0x80?10:0);
break;
case 6:
state = 0;
if(c==0xa0 || c==0xa1)
return(10);
else if((c&0xf0)== 0x80)
{
if((c&=0xf)==7)
return(iswspace(0x2007)?10:0);
if(c<=0xb)
return(10);
}
else if(c==0xaf && iswspace(0x202f))
return(10);
break;
case 7:
state = (c==0x9f?10:0);
break;
case 8:
return (iswspace(c)?10:0);
}
return state;
}
/*
* compute the line, word, and character count for file <fd>
*/
int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
{
register char* type = wp->type;
register unsigned char* cp;
register Sfoff_t nchars;
register Sfoff_t nwords;
register Sfoff_t nlines;
register Sfoff_t eline = -1;
register Sfoff_t longest = 0;
register ssize_t c;
register unsigned char* endbuff;
register int lasttype = WC_SP;
unsigned int lastchar;
ssize_t n;
ssize_t o;
unsigned char* buff;
wchar_t x;
unsigned char side[32];
sfset(fd,SF_WRITE,1);
nlines = nwords = nchars = 0;
wp->longest = 0;
if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
{
cp = buff = endbuff = 0;
for (;;)
{
if (cp >= endbuff || (n = mbwc(x, cp, endbuff-cp)) < 0)
{
if ((o = endbuff-cp) < sizeof(side))
{
if (buff)
{
if (o)
memcpy(side, cp, o);
mbinit();
}
else
o = 0;
cp = side + o;
if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
{
if ((nchars - longest) > wp->longest)
wp->longest = nchars - longest;
break;
}
if ((c = sizeof(side) - o) > n)
c = n;
if (c)
memcpy(cp, buff, c);
endbuff = buff + n;
cp = side;
x = mbchar(cp);
if ((cp-side) < o)
{
cp = buff;
nchars += (cp-side) - 1;
}
else
cp = buff + (cp-side) - o;
}
else
{
cp++;
x = -1;
}
if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
eline = invalid(file, nlines);
}
else
cp += n ? n : 1;
if (x == '\n')
{
if ((nchars - longest) > wp->longest)
wp->longest = nchars - longest;
longest = nchars + 1;
nlines++;
lasttype = 1;
}
else if (iswspace(x))
lasttype = 1;
else if (lasttype)
{
lasttype = 0;
nwords++;
}
nchars++;
}
}
else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
{
if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
{
while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
{
nchars += c;
endbuff = cp + c;
if (*--endbuff == '\n')
nlines++;
else
*endbuff = '\n';
for (;;)
if (*cp++ == '\n')
{
if (cp > endbuff)
break;
nlines++;
}
}
}
else
{
while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
{
nchars += c;
/* check to see whether first character terminates word */
if (c==1)
{
if (eol(lasttype))
nlines++;
if ((c = type[*cp]) && !lasttype)
nwords++;
lasttype = c;
continue;
}
if (!lasttype && type[*cp])
nwords++;
lastchar = cp[--c];
*(endbuff = cp+c) = '\n';
c = lasttype;
/* process each buffer */
for (;;)
{
/* process spaces and new-lines */
do
{
if (eol(c))
for (;;)
{
/* check for end of buffer */
if (cp > endbuff)
goto beob;
nlines++;
if (*cp != '\n')
break;
cp++;
}
} while (c = type[*cp++]);
/* skip over word characters */
while (!(c = type[*cp++]));
nwords++;
}
beob:
if ((cp -= 2) >= buff)
c = type[*cp];
else
c = lasttype;
lasttype = type[lastchar];
/* see if was in word */
if (!c && !lasttype)
nwords--;
}
if (eol(lasttype))
nlines++;
else if (!lasttype)
nwords++;
}
}
else
{
int lineoff=0;
int skip=0;
int adjust=0;
int state=0;
int oldc;
int xspace;
int wasspace = 1;
unsigned char* start;
lastchar = 0;
start = (endbuff = side) + 1;
xspace = iswspace(0xa0) || iswspace(0x85);
while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
{
nchars += c;
start = cp-lineoff;
/* check to see whether first character terminates word */
if(c==1)
{
if(eol(lasttype))
nlines++;
if((c = type[*cp]) && !lasttype)
nwords++;
lasttype = c;
endbuff = start;
continue;
}
lastchar = cp[--c];
endbuff = cp+c;
cp[c] = '\n';
if(mbc(lasttype))
{
c = lasttype;
goto mbyte;
}
if(!lasttype && spc(type[*cp]))
nwords++;
c = lasttype;
/* process each buffer */
for (;;)
{
/* process spaces and new-lines */
spaces:
do
{
if (eol(c))
{
/* check for end of buffer */
if (cp > endbuff)
goto eob;
if(wp->mode&WC_LONGEST)
{
if((cp-start)-adjust > longest)
longest = (cp-start)-adjust-1;
start = cp;
}
nlines++;
nchars -= adjust;
adjust = 0;
}
} while (spc(c = type[*cp++]));
wasspace=1;
if(mbc(c))
{
mbyte:
do
{
if(c&WC_ERR)
goto err;
if(skip && (c&7))
break;
if(!skip)
{
if(!(c&7))
{
skip=1;
break;
}
skip = (c&7);
adjust += skip;
state = 0;
if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
oldc = *cp;
else if(xspace && cp[-1]==0xc2)
{
state = 8;
oldc = *cp;
}
}
else
{
skip--;
if(state && (state=chkstate(state,oldc)))
{
if(state==10)
{
if(!wasspace)
nwords++;
wasspace = 1;
state=0;
goto spaces;
}
oldc = *cp;
}
}
} while (mbc(c = type[*cp++]));
wasspace = 0;
if(skip)
{
if(eol(c) && (cp > endbuff))
goto eob;
err:
skip = 0;
state = 0;
if(eline!=nlines && !(wp->mode & WC_QUIET))
eline = invalid(file, nlines);
while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
c=type[*cp++];
if(eol(c) && (cp > endbuff))
{
c = WC_MB|WC_ERR;
goto eob;
}
if(mbc(c))
goto mbyte;
else if(c&WC_SP)
goto spaces;
}
if(spc(c))
{
nwords++;
continue;
}
}
/* skip over word characters */
while(!(c = type[*cp++]));
if(mbc(c))
goto mbyte;
nwords++;
}
eob:
lineoff = cp-start;
if((cp -= 2) >= buff)
c = type[*cp];
else
c = lasttype;
lasttype = type[lastchar];
/* see if was in word */
if(!c && !lasttype)
nwords--;
}
if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
wp->longest = longest;
if (eol(lasttype))
nlines++;
else if (!lasttype)
nwords++;
nchars -= adjust;
}
wp->chars = nchars;
wp->words = nwords;
wp->lines = nlines;
return 0;
}