4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * This file contains code for
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * int rexpr(char *expr, char *s);
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * which answers
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * 1 if 's' is in the language described by the regular expression 'expr'
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * 0 if it is not
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * -1 if the regular expression is invalid
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Language membership is determined by constructing a non-deterministic
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * finite automata (NFA) from the regular expression. A depth-
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * first-search is performed on the NFA (graph) to check for a match of 's'.
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Each non-epsilon arc consumes one character from 's'. Backtracking is
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * performed to check all possible paths through the NFA.
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Regular expressions follow the meta-language:
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <regExpr> ::= <andExpr> ( '|' <andExpr> )*
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <andExpr> ::= <expr> ( <expr> )*
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * | '(' <regExpr> ')' <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * | '{' <regExpr> '}' <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * | <atom> <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <repeatSymbol> ::= { '*' | '+' }
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <atomList> ::= <atom> ( <atom> )*
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * | { <atomList> } <atom> '-' <atom> { <atomList> }
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <atom> ::= Token[Atom]
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * ~ means complement the set in [..]. i.e. all characters not listed
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * * means match 0 or more times (can be on expression or atom)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * + means match 1 or more times (can be on expression or atom)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * {} optional
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * () grouping
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * [] set of atoms
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * x-y all characters from x to y (found only in [..])
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * \xx the character with value xx
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Examples:
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * match 1 or more lower-case letters (e.g. variable)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * 0x[0-9A-Fa-f]+
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * match a hex number with 0x on front (e.g. 0xA1FF)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * [0-9]+.[0-9]+{e[0-9]+}
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * match a floating point number (e.g. 3.14e21)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Code example:
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Terence Parr
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Purdue University
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * April 1991
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic void next( void );
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label );
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int regExpr();
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int andExpr();
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int expr();
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int atomList();
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic void next();
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * return 1 if s in language described by expr
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * 0 if s is not
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * -1 if expr is an invalid regular expression
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync /* free all your memory */
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync while ( p!=NULL ) { q = p->track; free(p); p = q; }
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * do a depth-first-search on the NFA looking for a path from start to
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * accept state labelled with the characters of 's'.
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync if ( automaton == accept && *s == '\0' ) return 1; /* match */
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync for (p=automaton->arcs; p!=NULL; p=p->next) /* try all arcs */
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync else if ( p->label == *s )
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <regExpr> ::= <andExpr> ( '|' {<andExpr>} )*
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Return -1 if syntax error
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Return 0 if none found
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * Return 1 if a regExrp was found
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int regExpr(g)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <andExpr> ::= <expr> ( <expr> )*
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int andExpr(g)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' )
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <expr> ::= {'~'} '[' <atomList> ']' <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * | '(' <regExpr> ')' <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * | '{' <regExpr> '}' <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * | <atom> <repeatSymbol>
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic int expr(g)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync char s[257]; /* alloc space for string of char in [] */
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync /* S p e c i a l C a s e O p t i o n a l { } */
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <repeatSymbol> ::= { '*' | '+' }
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync case '*' : *g = BuildNFA_Astar( *g ); next(); break;
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync case '+' : *g = BuildNFA_Aplus( *g ); next(); break;
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * <atomList> ::= <atom> { <atom> }*
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * { <atomList> } <atom> '-' <atom> { <atomList> }
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * a-b is same as ab
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync * q-a is same as q
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync char *s = p;
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync/* a somewhat stupid lexical analyzer */
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic void next()
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' &&
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync/* N F A B u i l d i n g R o u t i n e s */
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync if ( freelist != NULL ) p->track = (ArcPtr) freelist;
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsyncstatic void ArcBetweenGraphNodes(NodePtr i,NodePtr j,int label)
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync return( g );
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync return( g );
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync return( g );
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync if ( s == NULL ) return g;
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync while ( *s != '\0' )
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync return( g );
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync return( A );
4fd606d1f5abe38e1f42c38de1d2e895166bd0f4vboxsync return( g );