/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright 1986, 1994 by Mortice Kern Systems Inc. All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* awk -- mainline, yylex, etc.
*
*/
#include "awk.h"
#include "y.tab.h"
#include <stdarg.h>
#include <unistd.h>
#include <locale.h>
#include <search.h>
static char *filename;
#ifdef DEBUG
/*
* Debug switch: set to 1 by the 'd' option case in the mainline; when it
* is set, the DEBUG build of yylex prints the numeric value of each token.
*/
static int dflag;
#endif
/* Forward declarations of file-local routines. */
static void awkarginit(int c, char **av);
static void awkvarinit(void);
static int usage(void);
char *_cmdname;
/*
* mainline for awk
*/
int
{
char *cmd;
/*
* At this point only messaging should be internationalized.
* numbers are still scanned as in the Posix locale.
*/
#if !defined(TEXT_DOMAIN)
#endif
(void) textdomain(TEXT_DOMAIN);
awkvarinit();
/* running = 1; */
break;
if (*ap == '\0') {
break;
}
++argv;
--argc;
break;
switch (*ap) {
#ifdef DEBUG
case 'd':
dflag = 1;
continue;
#endif
case 'f':
if (argc < 2) {
gettext("Missing script file\n"));
return (1);
}
--argc;
++argv;
continue;
case 'F':
if (argc < 2) {
gettext("Missing field separator\n"));
return (1);
}
--argc;
++argv;
} else
++ap;
break;
case 'v': {
if (argc < 2) {
gettext("Missing variable assignment\n"));
return (1);
}
/*
* Ensure the variable expression
* is valid (correct form).
*/
*vp = '\0';
*vp = '=';
} else {
"Invalid form for variable "
"assignment: %S\n"), arg);
return (1);
}
--argc;
++argv;
continue;
}
default:
return (usage());
}
break;
}
if (save_ptr)
}
if (argc < 2)
return (usage());
--argc;
++argv;
}
/* running = 0; */
(void) yyparse();
lineno = 0;
/*
* Ok, done parsing, so now activate the rest of the nls stuff, set
* the radix character.
*/
awk();
/* NOTREACHED */
return (0);
}
/*
* Do initial setup of buffers, etc.
* This must be called before most processing
* and especially before lexical analysis.
* Variables initialised here will be overruled by command
* line parameter initialisation.
*/
static void
{
gettext("not enough available file descriptors"));
exit(1);
}
#ifdef A_ZERO_POINTERS
#else
{
/* initialize file descriptor table */
}
}
#endif
constundef->n_strlen = 0;
{
}
}
{
case SVAR:
case VAR:
running = 1;
else {
}
running = 0;
break;
case KEYWORD:
break;
}
}
}
}
/*
* Initialise awk ARGC, ARGV variables.
*/
static void
{
int i;
running = 1;
for (i = 0; i < ac; ++i) {
}
running = 0;
}
/*
* Clean up when done parsing a function.
* All formal parameters, because of a deal (funparm) in
* yylex, get put into the symbol table in front of any
* global variable of the same name. When the entire
* function is parsed, remove these formal dummy nodes
* from the symbol table but retain the nodes because
* the generated tree points at them.
*/
void
{
}
/*
* The lexical analyzer.
*/
int
yylex()
#ifdef DEBUG
{
int l;
l = yyhex();
if (dflag)
(void) printf("%d\n", l);
return (l);
}
yyhex()
#endif
{
int i;
static int savetoken = 0;
static int wasfield;
static int isfuncdef;
static struct ctosymstruct {
} ctosym[] = {
{ '}', RBRACE }, { 0, 0 }
};
if (savetoken) {
c = savetoken;
savetoken = 0;
} else if (redelim != '\0') {
c = redelim;
redelim = 0;
catterm = 0;
savetoken = c;
if (iswalpha(c) || c == '_') {
c = lexid(c);
} else if (iswdigit(c) || c == '.') {
c = lexnumber(c);
} else if (isWblank(c)) {
continue;
} else switch (c) {
case 032: /* ^Z */
continue;
#endif
case '"':
c = lexstring(c);
break;
case '#':
;
lexungetc(c);
continue;
case '+':
c = INC;
else if (c1 == '=')
c = AADD;
else
break;
case '-':
c = DEC;
else if (c1 == '=')
c = ASUB;
else
break;
case '*':
c = AMUL;
else if (c1 == '*') {
c = AEXP;
else {
c = EXP;
}
} else
break;
case '^':
c = AEXP;
} else {
c = EXP;
}
break;
case '/':
c = ADIV;
else
break;
case '%':
c = AREM;
else
break;
case '&':
c = AND;
else
break;
case '|':
c = OR;
else {
if (inprint)
c = PIPE;
}
break;
case '>':
c = GE;
else if (c1 == '>')
c = APPEND;
else {
c = WRITE;
}
break;
case '<':
c = LE;
else
break;
case '!':
c = NE;
else if (c1 == '~')
c = NRE;
else
break;
case '=':
c = EQ;
else {
c = ASG;
}
break;
case '\n':
switch (lexlast) {
case ')':
c = ';';
break;
}
/*FALLTHRU*/
case AND:
case OR:
case COMMA:
case '{':
case ELSE:
case ';':
case DO:
continue;
case '}':
if (nbrace != 0)
continue;
default:
c = ';';
break;
}
break;
case ELSE:
if (lexlast != ';') {
c = ';';
}
break;
case '(':
++nparen;
break;
case ')':
if (--nparen < 0)
break;
case '{':
nbrace++;
break;
case '}':
if (--nbrace < 0) {
brk[0] = '{';
}
if (lexlast != ';') {
savetoken = c;
c = ';';
}
break;
case '[':
++nbracket;
break;
case ']':
if (--nbracket < 0) {
brk[0] = '[';
}
break;
case '\\':
continue;
break;
case ',':
c = COMMA;
break;
case '?':
c = QUEST;
break;
case ':':
c = COLON;
break;
default:
if (!iswprint(c))
gettext("invalid character \"%s\""),
toprint(c));
break;
}
break;
}
switch (c) {
case ']':
++catterm;
break;
case VAR:
if (catterm) {
savetoken = c;
c = CONCAT;
catterm = 0;
} else if (!isfuncdef) {
++catterm;
}
isfuncdef = 0;
break;
case PARM:
case CONSTANT:
if (catterm) {
savetoken = c;
c = CONCAT;
catterm = 0;
} else {
if (lexlast == '$')
wasfield = 2;
++catterm;
}
break;
case INC:
case DEC:
break;
/*FALLTHRU*/
case UFUNC:
case FUNC:
case GETLINE:
case '!':
case '$':
case '(':
if (catterm) {
savetoken = c;
c = CONCAT;
catterm = 0;
}
break;
/* { */ case '}':
if (nbrace == 0)
savetoken = ';';
/*FALLTHRU*/
case ';':
inprint = 0;
/*FALLTHRU*/
default:
if (c == DEFFUNC)
isfuncdef = 1;
catterm = 0;
}
lexlast = c;
if (wasfield)
wasfield--;
/*
* Map character constants to symbolic names.
*/
for (i = 0; ctosym[i].c != 0; i++)
if (c == ctosym[i].c) {
break;
}
return ((int)c);
}
/*
* Read a number for the lexical analyzer.
* Input is the first character of the number.
* Return value is the lexical type.
*/
static int
{
int dotfound = 0;
int efound = 0;
do {
if (iswdigit(c))
;
else if (c == '.') {
if (dotfound++)
break;
} else if (c == 'e' || c == 'E') {
lexungetc(c);
c = 'e';
} else
*cp++ = 'e';
if (efound++)
break;
} else
break;
*cp++ = c;
*cp = '\0';
return (DOT);
lexungetc(c);
errno = 0;
else
return (CONSTANT);
}
/*
* Read an identifier.
* Input is first character of identifier.
* Return VAR.
*/
static int
{
size_t i;
do {
*cp++ = c;
c = lexgetc();
*cp = '\0';
lexungetc(c);
case KEYWORD:
switch (np->n_keywtype) {
case PRINT:
case PRINTF:
++inprint;
default:
return ((int)np->n_keywtype);
}
/* NOTREACHED */
case ARRAY:
case VAR:
/*
* If reading the argument list, create a dummy node
* for the duration of that function. These variables
* can be removed from the symbol table at function end
* but they must still exist because the execution tree
* knows about them.
*/
if (funparm) {
(i+1) * sizeof (wchar_t));
(!doing_begin || begin_getline))) {
/*
* If the user program references NF or sets
* FS either outside of a begin block or
* in a begin block after a getline then the
* input line will be split immediately upon read
* rather than when a field is first referenced.
*/
needsplit = 1;
} else if (np == varENVIRON)
needenviron = 1;
/*FALLTHRU*/
case PARM:
return (VAR);
case UFUNC:
/*
* It is ok to redefine functions as parameters
*/
if (funparm) goto do_funparm;
/*FALLTHRU*/
case FUNC:
case GETLINE:
/*
* When a getline is encountered, clear the 'doing_begin' flag.
* This will force the 'needsplit' flag to be set, even inside
* a begin block, if FS is altered. (See VAR case above)
*/
if (doing_begin)
begin_getline = 1;
}
/* NOTREACHED */
return (0);
}
/*
* Read a string for the lexical analyzer.
* `endc' terminates the string.
*/
static int
{
return (CONSTANT);
}
/*
* Read a regular expression.
*/
static int
{
return (URE);
}
/*
* Process a string, converting the escape characters as required by
* 1003.2. The processed string ends up in the global linebuf[]. This
* routine also changes the value of 'progfd' - the program file
* descriptor, so it should be used with some care. It is presently used to
*/
void
{
}
/*
* Read a string or regular expression, terminated by ``endc'',
* for lexical analyzer, processing escape sequences.
* Return string length.
*/
static size_t
{
wint_t c;
int n, max;
if (first_time == 1) {
first_time = 0;
}
if (c == '\n')
if (c == '\\') {
switch (c = lexgetc(), c) {
case '\\':
if (regx)
*cp++ = '\\';
break;
case '/':
c = '/';
break;
case 'n':
c = '\n';
break;
case 'b':
c = '\b';
break;
case 't':
c = '\t';
break;
case 'r':
c = '\r';
break;
case 'f':
c = '\f';
break;
case 'v':
c = '\v';
break;
case 'a':
c = (char)0x07;
break;
case 'x':
n = 0;
if (iswdigit(c))
c -= '0';
else if (iswupper(c))
c -= 'A'-10;
else
c -= 'a'-10;
n = (n<<4) + c;
}
lexungetc(c);
c = n;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
#if 0
/*
* Posix.2 draft 10 disallows the use of back-referencing - it explicitly
* requires processing of the octal escapes both in strings and
* regular expressions. The following code is disabled instead of
* removed as back-referencing may be reintroduced in a future draft
* of the standard.
*/
/*
* For regular expressions, we disallow
* \ooo to mean octal character, in favour
* of back referencing.
*/
if (regx) {
*cp++ = '\\';
break;
}
#endif
max = 3;
n = 0;
do {
n = (n<<3) + c-'0';
break;
} while (--max);
lexungetc(c);
/*
* an octal escape sequence must have at least
* 2 digits after the backslash, otherwise
* it gets passed straight thru for possible
* use in backreferencing.
*/
if (max == 3) {
*cp++ = '\\';
n += '0';
}
c = n;
break;
case '\n':
continue;
default:
if (c != endc || cmd_line_operand) {
*cp++ = '\\';
if (c == endc)
lexungetc(c);
}
}
}
if (c == WEOF)
*cp++ = c;
}
*cp = '\0';
}
/*
* Build a regular expression NODE.
* Argument is the string holding the expression.
*/
NODE *
{
int n;
int m;
char *p;
p = (char *)emalloc(m);
awkerr("/%S/: %s", s, p);
}
return (np);
}
/*
* Get a character for the lexical analyser routine.
* Returns the character read, or WEOF. Each newline returned
* increments lineno, and each non-WEOF character is also stored
* through conptr.
*/
static wint_t
lexgetc()
{
wint_t c;
;
else {
/* proglen counts the characters left at progptr; none left yields WEOF */
if (proglen-- <= 0)
c = WEOF;
else
c = *progptr++;
} else {
else
lineno = 1;
== FNULL) {
exit(1);
}
}
}
}
/* keep the line counter in step with the input consumed */
if (c == '\n')
++lineno;
/* record the character; the un-get routine further down backs conptr up */
if (c != WEOF)
*conptr++ = c;
return (c);
}
/*
* Return (push back) a character for the lexical analyser.
* Only one returned character is legitimate; this is not enforced.
* The visible statements mirror the bookkeeping done by lexgetc:
* the line count, the conptr position and the progptr/proglen pair
* are each stepped back.
*/
static void
{
/* lexgetc counted this newline; take it back */
if (c == '\n')
--lineno;
/* lexgetc stored the character through conptr; step back and clear the slot */
if (c != WEOF) {
*--conptr = '\0';
}
return;
}
/* WEOF is never written back into the program text */
if (c == WEOF)
return;
/* reverse of lexgetc's "c = *progptr++" with "proglen--" */
*--progptr = c;
proglen++;
}
/*
* Syntax errors during parsing.
*/
void
yyerror(char *s, ...)
{
else
awkerr(s);
}
/*
* Error routine for all awk errors.
*/
/* ARGSUSED */
void
{
}
/*
* Error routine like "awkerr" except that it prints out
* a message that includes an errno-specific indication.
*/
/* ARGSUSED */
void
{
}
/*
* Common internal routine for awkerr, awkperr
*/
static void
{
if (running) {
if (phase == 0)
else
} else if (lineno != 0) {
}
if (perr == 1)
int n;
int c;
n = NCONTEXT;
do {
if ((c = *cp++) != '\0')
stderr);
} while (--n != 0);
}
exit(1);
}
wchar_t *
emalloc(unsigned n)
{
return (cp);
}
wchar_t *
{
return (cp);
}
/*
* Usage message for awk.
* Always returns 2; the mainline returns this value directly as its own
* result when an option is not recognised or too few arguments remain.
*/
static int
usage()
{
"Usage: awk [-F ERE] [-v var=val] 'program' [var=val ...] [file ...]\n"
" awk [-F ERE] -f progfile ... [-v var=val] [var=val ...] [file ...]\n"));
return (2);
}
static wchar_t *
{
if (op != 0)
}
char *
{
static char *op = 0;
if (op != 0)
}
/*
* Solaris port - following functions are typical MKS functions written
* to work for Solaris.
*/
wchar_t *
mbstowcsdup(s)
char *s;
{
int n;
wchar_t *w;
n = strlen(s) + 1;
return (NULL);
return (NULL);
return (w);
}
char *
{
int n;
char *mb;
/* Fetch memory for worst case string length */
n = wslen(w) + 1;
n *= MB_CUR_MAX;
return (NULL);
}
/* Convert the string */
return (0);
}
/* Shrink the string down */
return (NULL);
}
return (mb);
}
/*
* Printable 'control-sequence' strings for the character values 0..31
* and 127.
*
* Layout: slot 0 holds the string for value 127; the string for any
* value v in 0..31 lives in slot v + 1, so the table has 33 entries.
*/
static const char *const upe_ctrls[] =
{
	/* value 127 */
	"^?",
	/* values 0..3 */
	"^@", "^A", "^B", "^C",
	/* values 4..7 */
	"^D", "^E", "^F", "^G",
	/* values 8..11 */
	"^H", "^I", "^J", "^K",
	/* values 12..15 */
	"^L", "^M", "^N", "^O",
	/* values 16..19 */
	"^P", "^Q", "^R", "^S",
	/* values 20..23 */
	"^T", "^U", "^V", "^W",
	/* values 24..27 */
	"^X", "^Y", "^Z", "^[",
	/* values 28..31 */
	"^\\", "^]", "^^", "^_"
};
/*
* Return a printable string corresponding to the given character value. If
* the character is printable, simply return it as the string. If it is in
* the range specified by table 5-101 in the UPE, return the corresponding
* string. Otherwise, return an octal escape sequence.
*/
static const char *
toprint(c)
wchar_t c;
{
int n, len;
unsigned char *ptr;
/* Should never happen */
return (buf);
}
mbch[n] = '\0';
if (iswprint(c)) {
return (mbch);
} else if (c == 127) {
return (upe_ctrls[0]);
} else if (c < 32) {
/* Print as in Table 5-101 in the UPE */
return (upe_ctrls[c+1]);
} else {
/* Print as an octal escape sequence */
}
return (buf);
}
static int
{
int c = 0;
while (c < off) {
int n;
break;
if (n == -1)
n = 1;
c += n;
s++;
}
return (s - astring);
}
static int nregunref;
struct reghashq {
};
struct regcache {
int refcnt;
};
/*
* Generate a hash value of the given wchar string.
* The hashing method is similar to what Java does for strings: a
* multiply-by-31 accumulation over the characters. Here it is followed
* by several shift/add/xor mixing steps and a reduction modulo NREGHASH.
*
* NOTE(review): the accumulator k is a signed int. The multiply/add and
* the left shifts can therefore overflow on long inputs, which is
* undefined behaviour in C. A negative k also makes k % NREGHASH negative
* before the conversion to uint_t, giving a very large value instead of
* one below NREGHASH -- unless NREGHASH is an unsigned constant (its
* definition is not visible here). Consider an unsigned accumulator;
* confirm against the definition of NREGHASH and the callers.
*/
static uint_t
{
int k = 0;
/* accumulate every character up to (not including) the terminator */
while (*str != L'\0')
k = (31 * k) + *str++;
/* extra mixing of the accumulated value */
k += ~(k << 9);
k ^= (k >> 14);
k += (k << 4);
k ^= (k >> 10);
/* reduce modulo NREGHASH */
return (k % NREGHASH);
}
int
{
char *mbpattern;
int ret;
break;
}
/* update link. put this one at the beginning */
}
nregunref--; /* no longer unref'ed */
return (REG_OK);
}
return (REG_ESPACE);
return (ret);
return (REG_ESPACE);
return (REG_ESPACE);
}
return (ret);
}
void
{
int cnt;
return;
/* this cache has no reference */
return;
/*
* We've got too much unref'ed regex. Free half of least
* used regex.
*/
cnt = 0;
continue;
/* free half of them */
continue;
/* detach and free */
/* free up */
nregunref--;
}
}
{
}
int
int flags)
{
char *mbs;
int i;
return (REG_ESPACE);
return (REG_ESPACE);
}
int j, k;
for (j = 0; j < nsub; j++) {
}
}
}
}
if (mbsub)
return (i);
}
/*
* Substitution routine. Going by the inline comments below, it copies the
* text preceding each match, writes the replacement for the matched
* substring, and finally copies the rest of the text, terminating the
* output with a NUL wide character.
* The visible return statements yield REG_ESPACE, REG_EFATAL or regerr;
* the conditions selecting between them are not visible in this excerpt.
*/
int
int len, /* destination length */
int *globp) /* IN: occurrence, 0 for all; OUT: substitutions */
{
int i;
wchar_t c;
int flags;
int regerr;
/* handle overflow of dst. we need "i" more bytes */
#ifdef OVERFLOW
#define OVERFLOW(i) { \
goto nospace; \
}
#endif
return (REG_ESPACE);
return (REG_EFATAL);
glob = 0; /* match count */
flags = 0;
/* Copy text preceding match */
OVERFLOW(i)
while (i--)
oglob++;
} else
/* Perform replacement of matched substring */
/*
* Walk the replacement text one wide character at a time; '&' and
* '\\' get special handling before a character is stored through op.
*/
while ((c = *xp++) != '\0') {
if (c == '&')
else if (c == '\\') {
else if (*xp != '\0')
c = *xp++;
}
*op++ = c;
OVERFLOW(1)
OVERFLOW(i)
while (i--)
}
}
break;
/* If empty match copy next char */
OVERFLOW(1)
}
/*
* From here on flags carries REG_NOTBOL. Presumably it is passed to the
* matcher so that later matches are not treated as starting at the
* beginning of a line -- the call is not visible here; confirm.
*/
flags = REG_NOTBOL;
}
return (regerr);
/* Copy rest of text */
OVERFLOW(i)
while (i--)
*op++ = '\0';
return (REG_ESPACE);
}
}