ndr_lex.c revision d0e518695adc90b82233b99af7dffbb3d3f92c00
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <errno.h>
#include <stdarg.h>
#include "ndrgen.h"
#include "y.tab.h"
/*
* C-like lexical analysis.
*
* 1. Define a "struct node"
* 2. Define a "struct symbol" that encapsulates a struct node.
* 3. Define a "struct integer" that encapsulates a struct node.
* 4. Set the YACC stack type in the grammar:
* %{
* #define YYSTYPE struct node *
* %}
* 5. Define %token's in the grammar for IDENTIFIER, STRING and INTEGER.
* Using "_KW" as a suffix for keyword tokens, i.e. "struct" is
* "%token STRUCT_KW":
* // atomic values
* %token INTEGER STRING IDENTIFIER
* // keywords
* %token STRUCT_KW CASE_KW
* // operators
* %token PLUS MINUS ASSIGN ARROW
* // overloaded tokens (++ --, < > <= >=, == !=, += -= *= ...)
* %token INCOP RELOP EQUOP ASSOP
* 6. It's easiest to use the yacc(1) generated token numbers for node
* labels. For node labels that are not actually part of the grammar,
* use a %token with an L_ prefix:
* // node labels (can't be generated by lex)
* %token L_LT L_LTE L_GT L_GTE L_EQU L_NEQ
* 7. Call set_lex_input() before parsing.
*/
/*
 * Character classification helpers used by the lexer.
 *
 * These are macros and evaluate their argument more than once, so the
 * argument must be free of side effects (never pass getch(fp) directly).
 * Every use of a macro parameter is parenthesized so that an argument
 * containing low-precedence operators is classified as a whole.
 */
#define	SQ	'\''
#define	DQ	'"'
#define	isquote(c)	((c) == SQ || (c) == DQ)
/* Word separators for str_to_sv(); note that this set includes '\n'. */
#define	iswhite(c)	\
	((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\f')
#define	is_between(c, l, u)	((l) <= (c) && (c) <= (u))
/*
 * Blanks skipped between tokens by yylex().  '\n' is deliberately not in
 * this set because yylex() handles it separately to count lines.
 */
#define	is_white(c)	\
	((c) == ' ' || (c) == '\r' || (c) == '\t' || (c) == '\f')
#define	is_lower(c)	is_between((c), 'a', 'z')
#define	is_upper(c)	is_between((c), 'A', 'Z')
#define	is_alpha(c)	(is_lower(c) || is_upper(c))
#define	is_digit(c)	is_between((c), '0', '9')
/* First character of an identifier, and any following character. */
#define	is_sstart(c)	(is_alpha(c) || (c) == '_')
#define	is_sfollow(c)	(is_sstart(c) || is_digit(c))
#define	is_xdigit(c)	\
	(is_digit(c) || is_between((c), 'A', 'F') || is_between((c), 'a', 'f'))
/* Head of the singly linked list of every symbol entered so far. */
ndr_symbol_t *symbol_list;
/* Head of the list of distinct integer constants (see int_enter()). */
static ndr_integer_t *integer_list;
/* Stream that yylex() reads from; set by set_lex_input(). */
static FILE *lex_infp;
/* Symbol naming the current input file; replaced by "# line file" markers. */
static ndr_symbol_t *file_name;
/* Current input line number, as reported by compile_error(). */
int line_number;
/* Number of errors reported through compile_error(). */
int n_compile_error;
/* Non-zero while the next character read is the first one on its line. */
static int lex_at_bol;
/* In yacc(1) generated parser */
extern struct node *yylval;
/*
* The keywtable[] and optable[] could be external to this lex
* and it would all still work.
*/
/*
 * Reserved words.  Each entry gives the spelling, the yacc token that
 * yylex() returns for it, and a third value that is non-zero only for
 * the BASIC_TYPE entries (1, 2 or 4).
 * NOTE(review): the third value looks like the type's size in bytes;
 * confirm against the ndr_keyword_t definition in ndrgen.h.
 * The all-zero entry terminates the table for keyw_tab_init().
 */
static ndr_keyword_t keywtable[] = {
{ "struct", STRUCT_KW, 0 },
{ "union", UNION_KW, 0 },
{ "typedef", TYPEDEF_KW, 0 },
{ "interface", INTERFACE_KW, 0 },
{ "uuid", UUID_KW, 0 },
{ "_no_reorder", _NO_REORDER_KW, 0 },
{ "extern", EXTERN_KW, 0 },
{ "reference", REFERENCE_KW, 0 },
{ "align", ALIGN_KW, 0 },
{ "operation", OPERATION_KW, 0 },
{ "in", IN_KW, 0 },
{ "out", OUT_KW, 0 },
{ "string", STRING_KW, 0 },
{ "size_is", SIZE_IS_KW, 0 },
{ "length_is", LENGTH_IS_KW, 0 },
{ "switch_is", SWITCH_IS_KW, 0 },
{ "case", CASE_KW, 0 },
{ "default", DEFAULT_KW, 0 },
{ "transmit_as", TRANSMIT_AS_KW, 0 },
{ "arg_is", ARG_IS_KW, 0 },
{ "char", BASIC_TYPE, 1 },
{ "uchar", BASIC_TYPE, 1 },
{ "wchar", BASIC_TYPE, 2 },
{ "short", BASIC_TYPE, 2 },
{ "ushort", BASIC_TYPE, 2 },
{ "long", BASIC_TYPE, 4 },
{ "ulong", BASIC_TYPE, 4 },
{0}
};
/*
 * Punctuation tokens.  They are entered into the symbol table just like
 * keywords, and yylex() finds them with sym_find().  Every entry here is
 * a single character; yylex() also tries a two-character spelling first,
 * which only matches if such an entry is added.
 * The all-zero entry terminates the table for keyw_tab_init().
 */
static ndr_keyword_t optable[] = {
{ "{", LC, 0 },
{ "}", RC, 0 },
{ "(", LP, 0 },
{ ")", RP, 0 },
{ "[", LB, 0 },
{ "]", RB, 0 },
{ "*", STAR, 0 },
{ ";", SEMI, 0 },
{0}
};
/* Forward declarations of the file-local helpers defined further down. */
static int getch(FILE *fp);
static ndr_integer_t *int_enter(long);
static ndr_symbol_t *sym_find(char *);
static int str_to_sv(char *, char *sv[]);
/*
 * Register every entry of a keyword table in the symbol table and
 * attach the entry to its symbol, so that yylex() can map a spelling
 * to its token.  The table ends at the first entry with a NULL name.
 */
static void
keyw_tab_init(ndr_keyword_t kwtable[])
{
	ndr_keyword_t *entry;

	for (entry = kwtable; entry->name; entry++) {
		ndr_symbol_t *sym = sym_enter(entry->name);

		sym->kw = entry;
	}
}
/*
 * Prepare the lexer for a new input stream: load the keyword and
 * operator tables into the symbol table, record the stream and its
 * name, and reset the position to the start of line 1.
 */
void
set_lex_input(FILE *fp, char *name)
{
	/* Keywords and operators are recognized through the symbol table. */
	keyw_tab_init(keywtable);
	keyw_tab_init(optable);

	file_name = sym_enter(name);
	lex_infp = fp;

	lex_at_bol = 1;
	line_number = 1;
}
/*
 * Fetch the next character from the input stream.
 * Returns the character, or EOF at end of input or on a read error.
 */
static int
getch(FILE *fp)
{
	int ch = getc(fp);

	return (ch);
}
/*
 * Store one character of the lexeme being collected.  'lim' points at
 * the last byte of the buffer, which is reserved for the terminating
 * NUL; characters that do not fit are dropped and *truncated is set.
 * Returns the updated write position.
 */
static char *
lex_put(char *p, const char *lim, int c, int *truncated)
{
	if (p < lim)
		*p++ = (char)c;
	else
		*truncated = 1;
	return (p);
}

/*
 * Lexical analyzer called by the yacc(1) generated parser.
 *
 * Reads from the stream given to set_lex_input() and returns the next
 * token number, or EOF at end of input.  yylval is set to the node of
 * the symbol (keywords, identifiers, operators) or of the integer
 * constant that was recognized.
 *
 * Newlines, blanks, "//" comments and slash-star comments are skipped.
 * A line starting with '#' is consumed whole; if it has the form
 * "# <line> <file> ..." it updates the file name and line number used
 * in error messages.
 *
 * Lexemes are collected in a fixed-size buffer.  Over-long identifiers
 * and integer constants are truncated and reported via compile_error();
 * over-long '#' lines are truncated silently.
 */
int
yylex(void)
{
	char lexeme[512];
	char *const lim = lexeme + sizeof (lexeme) - 1;	/* keep room for NUL */
	char *p;
	FILE *fp = lex_infp;
	int c, xc;
	int truncated;
	ndr_symbol_t *sym;
	ndr_integer_t *intg;

top:
	p = lexeme;
	truncated = 0;

	c = getch(fp);
	if (c == EOF)
		return (EOF);

	if (c == '\n') {
		line_number++;
		lex_at_bol = 1;
		goto top;
	}

	/*
	 * Handle preprocessor lines.  This just notes
	 * which file we're processing.
	 */
	if (c == '#' && lex_at_bol) {
		/*
		 * str_to_sv() does not bound its output.  A buffer of at
		 * most sizeof (lexeme) - 1 characters holds no more than
		 * sizeof (lexeme) / 2 words, plus one slot for the NULL
		 * terminator, so this vector can never be overrun.
		 */
		char *sv[sizeof (lexeme) / 2 + 1];
		int sc;

		while ((c = getch(fp)) != EOF && c != '\n')
			p = lex_put(p, lim, c, &truncated);
		*p = 0;

		/* Only "# <line> <file>" (blank after '#') is understood. */
		sc = (*lexeme == ' ') ? str_to_sv(lexeme, sv) : 0;
		if (sc < 2) {
			/*
			 * Not a line marker.  The line still occupies one
			 * input line, so count the newline just consumed;
			 * otherwise later line numbers would be off by one.
			 */
			if (c == '\n')
				line_number++;
			goto top;
		}

		/*
		 * Line marker: the newline is intentionally not counted,
		 * because the marker gives the number of the next line.
		 */
		file_name = sym_enter(sv[1]);
		line_number = atoi(sv[0]);	/* for next input line */
		lex_at_bol = 1;
		goto top;
	}

	lex_at_bol = 0;

	/*
	 * Skip white space
	 */
	if (is_white(c))
		goto top;

	/*
	 * Symbol?  Might be a keyword or just an identifier
	 */
	if (is_sstart(c)) {
		do {
			p = lex_put(p, lim, c, &truncated);
			c = getch(fp);
		} while (is_sfollow(c));
		(void) ungetc(c, fp);	/* no-op when c is EOF */
		*p = 0;

		if (truncated) {
			compile_error("identifier too long "
			    "(truncated to %d characters)",
			    (int)(sizeof (lexeme) - 1));
		}

		sym = sym_enter(lexeme);
		yylval = &sym->s_node;
		if (sym->kw)
			return (sym->kw->token);
		return (IDENTIFIER);
	}

	/*
	 * Integer constant?
	 */
	if (is_digit(c)) {
		p = lex_put(p, lim, c, &truncated);

		if (c == '0') {
			c = getch(fp);
			if (c == 'x' || c == 'X') {
				/* handle hex specially */
				do {
					p = lex_put(p, lim, c, &truncated);
					c = getch(fp);
				} while (is_xdigit(c));
				goto convert_icon;
			} else if (c == 'b' || c == 'B' ||
			    c == 'd' || c == 'D' ||
			    c == 'o' || c == 'O') {
				/*
				 * NOTE(review): strtol() with base 0 is only
				 * specified to understand the "0x" and "0"
				 * prefixes, so on such libraries these forms
				 * convert as a plain "0".  Confirm what the
				 * b/d/o prefixes are meant to denote.
				 */
				do {
					p = lex_put(p, lim, c, &truncated);
					c = getch(fp);
				} while (is_digit(c));
				goto convert_icon;
			}
			(void) ungetc(c, fp);
		}

		/* could be anything */
		c = getch(fp);
		while (is_digit(c)) {
			p = lex_put(p, lim, c, &truncated);
			c = getch(fp);
		}

convert_icon:
		*p = 0;
		(void) ungetc(c, fp);	/* no-op when c is EOF */

		if (truncated) {
			compile_error("integer constant too long "
			    "(truncated to %d characters)",
			    (int)(sizeof (lexeme) - 1));
		}

		intg = int_enter(strtol(lexeme, 0, 0));
		yylval = &intg->s_node;
		return (INTEGER);
	}

	/* Could handle strings.  We don't seem to need them yet */

	yylval = 0;		/* cleared until an operator symbol is found */
	xc = getch(fp);		/* get look-ahead for two-char lexemes */

	lexeme[0] = (char)c;
	lexeme[1] = (char)xc;
	lexeme[2] = 0;

	/*
	 * Look for to-end-of-line comment
	 */
	if (c == '/' && xc == '/') {
		/* eat the comment */
		while ((c = getch(fp)) != EOF && c != '\n')
			;
		(void) ungetc(c, fp);	/* put back newline so it is counted */
		goto top;
	}

	/*
	 * Look for multi-line comment
	 */
	if (c == '/' && xc == '*') {
		/* eat the comment; xc remembers the previous character */
		xc = -1;
		while ((c = getch(fp)) != EOF) {
			if (xc == '*' && c == '/') {
				/* that's it */
				break;
			}
			xc = c;
			if (c == '\n')
				line_number++;
		}
		goto top;
	}

	/*
	 * Use symbol table lookup for two-character and
	 * one character operator tokens.  A symbol without a keyword
	 * attached (an identifier or file name that happens to have the
	 * same spelling) is not an operator and must not be dereferenced.
	 */
	sym = sym_find(lexeme);
	if (sym != NULL && sym->kw != NULL) {
		yylval = &sym->s_node;
		return (sym->kw->token);
	}

	/* Try a one-character form */
	(void) ungetc(xc, fp);
	lexeme[1] = 0;
	sym = sym_find(lexeme);
	if (sym != NULL && sym->kw != NULL) {
		yylval = &sym->s_node;
		return (sym->kw->token);
	}

	compile_error("unrecognized character 0x%02x", c);
	goto top;
}
/*
 * Look up a symbol by name without creating it.
 * Returns the matching symbol, or a null pointer if the name has not
 * been entered.
 */
static ndr_symbol_t *
sym_find(char *name)
{
	ndr_symbol_t *sym = symbol_list;

	while (sym != NULL) {
		if (strcmp(sym->name, name) == 0)
			break;
		sym = sym->next;
	}
	return (sym);
}
/*
 * Return the symbol with the given name, creating it and appending it
 * to the end of symbol_list if it does not exist yet.  A new symbol
 * gets a private copy of the name and a node labelled IDENTIFIER that
 * points back at the symbol.  Running out of memory is fatal.
 */
ndr_symbol_t *
sym_enter(char *name)
{
	ndr_symbol_t **link = &symbol_list;
	ndr_symbol_t *sym;

	/* Walk to the match, or to the terminating link of the list. */
	while ((sym = *link) != NULL) {
		if (strcmp(sym->name, name) == 0)
			return (sym);
		link = &sym->next;
	}

	sym = ndr_alloc(1, sizeof (*sym));
	sym->name = strdup(name);
	if (sym->name == NULL)
		fatal_error("%s", strerror(ENOMEM));

	sym->s_node.label = IDENTIFIER;
	sym->s_node.n_sym = sym;

	*link = sym;
	return (sym);
}
/*
 * Return the integer-constant record for 'value', creating it and
 * appending it to integer_list on first use, so that equal constants
 * share a single node.  A new record's node is labelled INTEGER and
 * carries the value.
 */
static ndr_integer_t *
int_enter(long value)
{
	ndr_integer_t **link = &integer_list;
	ndr_integer_t *ent;

	while ((ent = *link) != NULL) {
		if (ent->value == value)
			return (ent);
		link = &ent->next;
	}

	ent = ndr_alloc(1, sizeof (*ent));
	ent->value = value;
	ent->s_node.label = INTEGER;
	ent->s_node.n_int = value;

	*link = ent;
	return (ent);
}
/*
 * Allocate zero-filled memory for 'nelem' elements of 'elsize' bytes.
 * Allocation failure is reported through fatal_error(), so callers do
 * not need to check the result.
 */
void *
ndr_alloc(size_t nelem, size_t elsize)
{
	void *mem = calloc(nelem, elsize);

	if (mem != NULL)
		return (mem);

	fatal_error("%s", strerror(ENOMEM));
	/* NOTREACHED */
	return (NULL);
}
/*
 * Report an error in the input being compiled.
 *
 * The message is formatted printf-style and written to stderr together
 * with the file name and line number currently tracked by the lexer, so
 * that all such diagnostics look alike.  Each call also bumps
 * n_compile_error.
 */
void
compile_error(const char *fmt, ...)
{
	char msg[NDLBUFSZ];
	va_list args;

	va_start(args, fmt);
	(void) vsnprintf(msg, sizeof (msg), fmt, args);
	va_end(args);

	n_compile_error++;
	(void) fprintf(stderr, "ndrgen: compile error: %s:%d: %s\n",
	    file_name->name, line_number, msg);
}
/*
 * Report an unrecoverable error and terminate.
 *
 * The message is formatted printf-style, written to stderr, and the
 * process then exits with status 1.  This function does not return.
 */
void
fatal_error(const char *fmt, ...)
{
	char msg[NDLBUFSZ];
	va_list args;

	va_start(args, fmt);
	(void) vsnprintf(msg, sizeof (msg), fmt, args);
	va_end(args);

	(void) fprintf(stderr, "ndrgen: fatal error: %s\n", msg);
	exit(1);
}
/*
 * Construct a new node.
 *
 * The node receives the given label, the file name and line number the
 * lexer is currently at, and three arguments taken from the variable
 * argument list.  Exactly three pointer arguments are always read, so
 * every caller must pass three (padding with null pointers as needed).
 */
struct node *
n_cons(int label, ...)
{
	ndr_node_t *np = ndr_alloc(1, sizeof (ndr_node_t));
	va_list ap;
	int i;

	np->label = label;
	np->file_name = file_name;
	np->line_number = line_number;

	va_start(ap, label);
	for (i = 0; i < 3; i++)
		np->n_arg[i] = va_arg(ap, void *);
	va_end(ap);

	return (np);
}
/*
 * Append np2 to the end of the list that starts at np1.  Intended for
 * left-recursive list rules in the grammar:
 *
 *	list:	item
 *	    |	list item	={ n_splice($1, $2); }
 *	    ;
 *
 * np1 must point at a node; the list is walked from there to its tail.
 */
void
n_splice(struct node *np1, struct node *np2)
{
	struct node *tail;

	for (tail = np1; tail->n_next; tail = tail->n_next)
		;
	tail->n_next = np2;
}
/*
 * Split a string into words, in place.
 *
 * Words are separated by white space (iswhite()).  Text between a pair
 * of matching single or double quotes is copied into the current word
 * verbatim, white space included, with the quotes themselves removed.
 * The words are compacted toward the front of 'buf' and NUL-terminated
 * there; sv[] is filled with pointers to them, followed by a null
 * pointer.
 *
 * The caller must supply an sv[] with room for every word plus the
 * terminator; no bound is checked here.
 *
 * Returns the number of words.
 */
static int
str_to_sv(char *buf, char *sv[])
{
	char *src = buf;	/* read position */
	char *dst = buf;	/* write position, never ahead of src */
	int nwords = 0;
	int in_word = 0;
	int ch;

	while ((ch = *src++) != 0) {
		if (!in_word) {
			if (iswhite(ch))
				continue;	/* still between words */
			sv[nwords++] = dst;
			in_word = 1;
		}

		if (iswhite(ch)) {
			/* end of word */
			*dst++ = 0;
			in_word = 0;
			continue;
		}

		if (!isquote(ch)) {
			/* ordinary character inside a word */
			*dst++ = ch;
			continue;
		}

		/* Quoted run: copy up to the matching quote. */
		{
			int closer = ch;

			while ((ch = *src++) != 0 && ch != closer)
				*dst++ = ch;
			if (ch == 0)
				break;		/* unterminated quote */
		}
	}

	if (in_word)
		*dst = 0;

	sv[nwords] = (char *)0;
	return (nwords);
}