ppfsm.c revision 1
1N/A/*********************************************************************** 1N/A* This software is part of the ast package * 1N/A* Copyright (c) 1986-2010 AT&T Intellectual Property * 1N/A* and is licensed under the * 1N/A* Common Public License, Version 1.0 * 1N/A* by AT&T Intellectual Property * 1N/A* A copy of the License is available at * 1N/A* (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) * 1N/A* Information and Software Systems Research * 1N/A* Glenn Fowler <gsf@research.att.com> * 1N/A***********************************************************************/ 1N/A * preprocessor and proto lexical analyzer fsm 1N/A * define PROTOMAIN for standalone proto 1N/A * lexical FSM encoding 1N/A * derived from a standalone ansi cpp by Dennis Ritchie 1N/A * modified for libpp by Glenn Fowler 1N/A * fsm[] is initialized from fsminit[]. The encoding is blown out into 1N/A * fsm[] for time efficiency. When in state state, and one of the 1N/A * characters in ch arrives, enter nextstate. States >= TERMINAL are 1N/A * either final, or at least require special action. In fsminit[] there 1N/A * is a line for each <state,charset,nextstate>. Early entries are 1N/A * overwritten by later ones. C_XXX is the universal set and should 1N/A * always be first. Some of the fsminit[] entries are templates for 1N/A * groups of states. The OP entries trigger the state copies. States 1N/A * above TERMINAL are represented in fsm[] as negative values. S_TOK and 1N/A * S_TOKB encode the resulting token type in the upper bits. These actions 1N/A * differ in that S_TOKB has a lookahead char. 1N/A * fsm[] has three start states: 1N/A * PROTO proto (ANSI -> K&R,C++,ANSI) 1N/A * QUICK standalone ppcpp() 1N/A * TOKEN tokenizing pplex() 1N/A * If the next state remains the same then the fsm[] transition value is 0. 1N/A * MAX+1 is a power of 2 so that fsm[state][EOF==MAX+1] actually accesses 1N/A * fsm[state+1][0] which is ~S_EOB for all states. This preserves the 1N/A * power of 2 fsm[] row size for efficient array indexing. Thanks to 1N/A * D. G. Korn for the last two observations. The pseudo non-terminal state 1N/A * fsm[TERMINAL][state+1] is used to differentiate EOB from EOF. 1N/A * The bit layout is: 1N/A * TERM arg SPLICE next 1N/A * NOTE: these must be `control' characters for all native codesets 1N/A * currently ok for {ascii,ebcdic1,ebcdic2,ebcdic3} 1N/A unsigned char ch[
4];
/* and see one of these */ 1N/A /* proto start state */ 1N/A/* proto {do,else,extern,for,if,inline,return,static,typedef,va_start,void,while,NoN} */ 1N/A /* proto reserved {va_start} */ 1N/A /* proto reserved {return} */ 1N/A /* proto reserved {if} */ 1N/A /* proto reserved {while} */ 1N/A /* proto reserved {else} */ 1N/A /* proto reserved {inline} */ 1N/A /* proto reserved {do,for,void} */ 1N/A /* proto reserved {static} */ 1N/A /* proto reserved {extern} */ 1N/A /* proto reserved {typedef} */ 1N/A /* saw /, perhaps start of comment */ 1N/A /* saw / *, start of comment */ 1N/A /* saw the * possibly ending a comment */ 1N/A /* saw / in / * comment, possible malformed nest */ 1N/A /* saw / /, start of comment */ 1N/A /* saw / in / / comment, possible malformed nest */ 1N/A /* saw * in / /, possible malformed nest */ 1N/A /* normal identifier -- always a macro candidate */ 1N/A /* saw ., operator or dbl constant */ 1N/A /* saw .., possible ... */ 1N/A /* saw L (possible start of normal wide literal) */ 1N/A /* saw " or ' beginning literal */ 1N/A /* saw \ in literal */ 1N/A /* eat malformed numeric constant */ 1N/A /* eat malformed numeric fraction|exponent */ 1N/A /* saw white space, eat it up */ 1N/A /* quick template */ 1N/A /* copy QUICK to QUICK+1 through MAC0+1 */ 1N/A /* quick start state */ 1N/A /* grab non-macro tokens */ 1N/A /* grab numeric and invalid tokens */ 1N/A /* grab exponent token */ 1N/A /* saw *, grab possible bad comment terminator */ 1N/A /* saw L (possible start of wide string or first macro char) */ 1N/A /* macro candidate template */ 1N/A /* copy MAC0+1 to MAC0+2 through MACN */ 1N/A /* saw L (possible start of wide string or macro L) */ 1N/A /* macro hit template */ 1N/A /* copy HIT0+1 to HIT0+2 through HITN */ 1N/A /* saw L (possible start of wide literal) */ 1N/A /* (!PROTOMAIN COM1) saw /, perhaps start of comment or /= */ 1N/A /* normal start state */ 1N/A /* saw 0, possible oct|hex|dec|dbl constant */ 1N/A /* saw 0<oct>, oct constant */ 1N/A /* oct constant qualifier */ 1N/A /* saw 0 [xX], hex constant */ 1N/A /* hex constant qualifier */ 1N/A /* hex [eE][-+] botch */ 1N/A /* hex dbl fraction */ 1N/A /* optional hex dbl exponent sign */ 1N/A /* mandatory hex dbl exponent first digit */ 1N/A /* hex dbl exponent digits */ 1N/A /* hex dbl constant qualifier */ 1N/A /* saw <dec>, dec constant */ 1N/A /* dec constant qualifier */ 1N/A /* saw ., operator or dbl constant */ 1N/A /* optional dbl exponent sign */ 1N/A /* mandatory dbl exponent first digit */ 1N/A /* dbl exponent digits */ 1N/A /* dbl constant qualifier */ 1N/A /* saw < starting include header */ 1N/A /* saw <binop><space> expecting = */ 1N/Astatic char spl[] = {
'\\',
'\r', 0 };
1N/Astatic char aln[] =
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_$@";
1N/Astatic char aln[] =
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_";
1N/Astatic char hex[] =
"fedcbaFEDCBA9876543210";
1N/A * runtime FSM modifications 1N/A * ppfsm(FSM_INIT,0) must be called first 1N/A else error(
2,
"%c: cannot add to identifier set", c);
1N/A for (i = 0; i <
sizeof(
fp->
ch) && (c =
fp->
ch[i]); i++)
1N/A * install splice special cases 1N/A * and same non-terminal transitions 1N/A * default character types 1N/A else error(
2,
"%c: cannot add to quote set", c);
1N/A * file buffer refill 1N/A * c is current input char 1N/A static char ket[] = { 0,
'}',
'\n', 0 };