scanner.c revision 48edc7cf07b5dccc3ad84bf2dafe4150bd666d60
/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
*/
/*
* This file contains the "scanner", which tokenizes charmap files
* for iconv for processing by the higher level grammar processor.
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <widec.h>
#include <assert.h>
#include "charmap.h"
#include "parser.tab.h"
/* Comment character: yylex() discards input from it to end of line. */
int com_char = '#';
/* Escape character: seeing it makes the scanner set the escaped flag. */
int esc_char = '\\';
/* presumably the minimum bytes per multibyte character -- TODO confirm */
int mb_cur_min = 1;
/* Upper bound on bytes per character enforced by get_wide(). */
int mb_cur_max = MB_LEN_MAX;
/* Line number; reset_scanner() restarts it at 1. */
int lineno = 1;
/* Running count of warnings issued. */
int warnings = 0;
/* Line counter moved by scanc()/unscanc() as newlines come and go. */
static int nextline;
/* Name of the current input; defaults to "<stdin>". */
static const char *filename = "<stdin>";
/* Non-zero while yylex() is between an opening and closing '"'. */
static int instring = 0;
/* Non-zero when the previous character was esc_char. */
static int escaped = 0;
/*
* Token space ... grows on demand.
* tokidx is non-zero while a token is being assembled; toksz is grown
* in steps of 64 by add_tok().
*/
static int tokidx;
static int toksz = 0;
/* Set once any token is seen on a line, so empty lines yield no T_NL. */
static int hadtok = 0;
/*
* The last keyword seen. This is useful to trigger the special lexer rules
* for "copy" and also collating symbols and elements.
*/
int last_kw = 0;
/*
 * One reserved-word table entry: the grammar token id together with the
 * spelling that selects it.
 */
struct token {
	int id;
	const char *name;
};

/*
 * Reserved words.  The table is terminated by an entry whose id is -1
 * and whose name is NULL.
 */
static struct token keywords[] = {
	{ .id = T_COM_CHAR,		.name = "comment_char" },
	{ .id = T_ESC_CHAR,		.name = "escape_char" },
	{ .id = T_END,			.name = "END" },

	/*
	 * Keywords used in the charmap file.  Solaris originally wrapped
	 * some of these in angle brackets; that was dropped to keep the
	 * parser simple.  The first of these items are "global items."
	 */
	{ .id = T_CHARMAP,		.name = "CHARMAP" },
	{ .id = T_WIDTH,		.name = "WIDTH" },
	{ .id = T_WIDTH_DEFAULT,	.name = "WIDTH_DEFAULT" },

	{ .id = -1,			.name = NULL },
};
/*
* These special words are only used in a charmap file, enclosed in <>.
*/
{ T_COM_CHAR, "comment_char" },
{ T_ESC_CHAR, "escape_char" },
{ T_CODE_SET, "code_set_name" },
{ T_MB_CUR_MAX, "mb_cur_max" },
{ T_MB_CUR_MIN, "mb_cur_min" },
{ -1, NULL },
};
/*
 * Zero-terminated list of top level category keywords.  Only the
 * terminator is present, so loops that walk it until a zero entry do
 * nothing.
 */
static int categories[] = { 0 };
/*
* Put the scanner back into its initial state for a new input (fname).
* The comment and escape characters return to their defaults ('#' and
* the backslash), the in-string and escaped flags are cleared, both
* line counters restart at 1, and any partial token and the last
* keyword seen are forgotten.  The visible paths also fall back to
* "<stdin>" as the file name and exit with status 1 on a failure branch.
*/
void
reset_scanner(const char *fname)
{
filename = "<stdin>";
} else {
exit(1);
}
}
/* back to the default comment and escape characters */
com_char = '#';
esc_char = '\\';
instring = 0;
escaped = 0;
lineno = 1;
nextline = 1;
/* drop any partially assembled token */
tokidx = 0;
last_kw = 0;
}
#define hex(x) \
static int
scanc(void)
{
/*
* Hand the scanned character c back to the caller.  A newline bumps
* nextline first, so the line accounting stays in step with the input.
* Callers compare the result against EOF and '\n'.
*/
int c;
if (c == '\n') {
nextline++;
}
return (c);
}
/*
* Put character c back so that it can be scanned again.  This undoes
* the line accounting done by scanc(): putting back a newline
* decrements nextline.  A failure is reported through yyerror() as
* "ungetc failed".
*/
static void
unscanc(int c)
{
if (c == '\n') {
nextline--;
}
yyerror(_("ungetc failed"));
}
}
/*
* Scan a byte value written in hexadecimal (get_byte() calls this for
* an 'x' or 'X').  Bad input is reported through yyerror() as
* "malformed hex digit" and yields 0; otherwise the value v is returned.
*/
static int
scan_hex_byte(void)
{
int v;
yyerror(_("malformed hex digit"));
return (0);
}
yyerror(_("malformed hex digit"));
return (0);
}
return (v);
}
/*
* Scan a byte value written in decimal (get_byte() calls this for a
* 'd' or 'D').  The digits c1, c2 and c3 are accumulated most
* significant first.  Bad input is reported through yyerror() as
* "malformed decimal digit" and yields 0; otherwise the value b is
* returned.
*/
static int
scan_dec_byte(void)
{
int b;
yyerror(_("malformed decimal digit"));
return (0);
}
/* first digit */
b = c1 - '0';
yyerror(_("malformed decimal digit"));
return (0);
}
/* shift in the second digit */
b *= 10;
b += (c2 - '0');
} else {
/* shift in the third digit */
b *= 10;
b += (c3 - '0');
}
return (b);
}
/*
* Scan a byte value written in octal (get_byte() calls this after
* putting back a leading digit '0'-'7').  The digits c1, c2 and c3 are
* accumulated most significant first.  Bad input is reported through
* yyerror() as "malformed octal digit" and yields 0; otherwise the
* value b is returned.
*/
static int
scan_oct_byte(void)
{
int b;
b = 0;
yyerror(_("malformed octal digit"));
return (0);
}
/* first digit */
b = c1 - '0';
yyerror(_("malformed octal digit"));
return (0);
}
/* shift in the second digit */
b *= 8;
b += (c2 - '0');
} else {
/* shift in the third digit */
b *= 8;
b += (c3 - '0');
}
return (b);
}
/*
* Add character c to the token being assembled.  Token space is grown
* 64 at a time (toksz); when that fails, "out of memory" is reported
* through yyerror() and the token state (tokidx, toksz) is reset to
* zero before returning.
*/
void
add_tok(int c)
{
toksz += 64;
yyerror(_("out of memory"));
tokidx = 0;
toksz = 0;
return;
}
}
}
/*
* Scan one numerically specified byte.  The next scanned character
* selects the radix: 'd'/'D' decimal, 'x'/'X' hexadecimal, and a digit
* '0'-'7' octal.  Any other character is put back and EOF is returned.
*/
static int
get_byte(void)
{
int c;
unscanc(c);
return (EOF);
}
c = scanc();
switch (c) {
case 'd':
case 'D':
return (scan_dec_byte());
case 'x':
case 'X':
return (scan_hex_byte());
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
/* put the character back so we can get it */
unscanc(c);
return (scan_oct_byte());
default:
/* not a numeric byte: leave the character for the caller */
unscanc(c);
return (EOF);
}
}
/*
 * Translate the character following an escape into the character it
 * stands for.  The letters n, r, t, f, v, b and a map to the matching
 * C control characters; every other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		int letter;
		int value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	size_t idx;

	for (idx = 0; idx < sizeof (escapes) / sizeof (escapes[0]); idx++) {
		if (escapes[idx].letter == c) {
			return (escapes[idx].value);
		}
	}

	/* not a recognized escape letter: it stands for itself */
	return (c);
}
/*
* Scan a numerically escaped character (yylex() calls this for numeric
* escapes).  Returns T_CHAR with the result in yylval.mbs, or T_NULL
* after reporting an error through yyerror() when mb_cur_max exceeds
* MB_LEN_MAX or when the sequence reaches mb_cur_max bytes and another
* character follows.
*/
int
get_wide(void)
{
/* NB: yylval.mbs[0] is the length */
int mbi = 0;
int c;
if (mb_cur_max > MB_LEN_MAX) {
yyerror(_("max multibyte character size too big"));
return (T_NULL);
}
for (;;) {
break;
if (mbi == mb_cur_max) {
/* too long: leave the offending character for the caller */
unscanc(c);
yyerror(_("length > mb_cur_max"));
return (T_NULL);
}
}
/* result in yylval.mbs */
return (T_CHAR);
}
/*
* Scan a symbolic name; yylex() calls this after seeing '<', and the
* name ends at '>'.  Escaped characters are translated with
* get_escaped() and an escaped newline is dropped.  Results visible
* here: last_kw for the few symbols handled as keywords, T_SYMBOL for
* an undefined symbol, T_NULL after "missing symbolic name", and EOF
* after a final "unterminated symbolic name".
*/
int
get_symbol(void)
{
int c;
if (escaped) {
escaped = 0;
if (c == '\n')
continue;
add_tok(get_escaped(c));
continue;
}
if (c == esc_char) {
escaped = 1;
continue;
}
if (c == '\n') { /* well that's strange! */
yyerror(_("unterminated symbolic name"));
continue;
}
if (c == '>') { /* end of symbol */
/*
* This restarts the token from the beginning
* the next time we scan a character. (This
* token is complete.)
*/
yyerror(_("missing symbolic name"));
return (T_NULL);
}
tokidx = 0;
/*
* A few symbols are handled as keywords outside
* of the normal categories.
*/
int i;
0) {
return (last_kw);
}
}
}
/* it's an undefined symbol */
perror("malloc");
exit(1);
}
toksz = 0;
tokidx = 0;
return (T_SYMBOL);
}
add_tok(c);
}
yyerror(_("unterminated symbolic name"));
return (EOF);
}
/*
* Classify the token assembled so far; yylex() calls this when a
* delimiter ends a token in progress.  Results visible here: T_NULL,
* T_ELLIPSIS (which deliberately leaves last_kw alone), T_NUMBER
* (reporting "malformed number" when characters remain after the
* number), T_CHAR for a single character, and T_NAME for anything else.
* The token state (tokidx, toksz) is reset on the way out.
*/
static int
consume_token(void)
{
int i;
tokidx = 0;
return (T_NULL);
/*
* this one is special, because we don't want it to alter the
* last_kw field.
*/
return (T_ELLIPSIS);
}
/* search for reserved words first */
int j;
continue;
}
/* clear the top level category if we're done with it */
}
/* set the top level category if we're changing */
for (j = 0; categories[j]; j++) {
if (categories[j] != last_kw)
continue;
}
}
/* maybe it's a numeric constant? */
char *eptr;
if (*eptr != 0)
yyerror(_("malformed number"));
return (T_NUMBER);
}
/*
* A single lone character is treated as a character literal.
* To avoid duplication of effort, we stick in the charmap.
*/
if (len == 1) {
return (T_CHAR);
}
/* anything else is treated as a symbolic name */
toksz = 0;
tokidx = 0;
return (T_NAME);
}
/*
 * Discard input up to and including the next newline.  If the input
 * ends before a newline is found, "missing newline" is reported through
 * errf() and the function returns.
 */
void
scan_to_eol(void)
{
	int ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n') {
			break;
		}
		if (ch == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}

	/* the only way out of the loop is on a newline */
	assert(ch == '\n');
}
/*
* Lexer entry point: return the next token.
*
* Inside a quoted string (instring) characters come back one at a time
* as T_CHAR, numeric escapes go through get_wide(), '<' starts a symbol,
* and the closing '"' clears instring and returns T_QUOTE.
*
* Outside a string: an escaped newline is eaten, numeric escapes go
* through get_wide(), and input from com_char to end of line is
* dropped.  A newline returns T_NL only if the line had a token
* (hadtok).  Punctuation returns its own token, '<' starts a symbol,
* '"' opens a string, blanks and tabs are ignored, and any other
* character is added to the current token, which consume_token()
* classifies once a delimiter arrives.  EOF is returned at the end.
*/
int
yylex(void)
{
int c;
/* special handling for quoted string */
if (instring) {
if (escaped) {
escaped = 0;
/* if newline, just eat and forget it */
if (c == '\n')
continue;
if (strchr("xXd01234567", c)) {
unscanc(c);
return (get_wide());
}
return (T_CHAR);
}
if (c == esc_char) {
escaped = 1;
continue;
}
switch (c) {
case '<':
return (get_symbol());
case '>':
/* oops! should generate syntax error */
return (T_GT);
case '"':
instring = 0;
return (T_QUOTE);
default:
return (T_CHAR);
}
}
/* escaped characters first */
if (escaped) {
escaped = 0;
if (c == '\n') {
/* eat the newline */
continue;
}
hadtok = 1;
if (tokidx) {
/* an escape mid-token is nonsense */
return (T_NULL);
}
/* numeric escapes are treated as wide characters */
if (strchr("xXd01234567", c)) {
unscanc(c);
return (get_wide());
}
add_tok(get_escaped(c));
continue;
}
/* if it is the escape character itself note it */
if (c == esc_char) {
escaped = 1;
continue;
}
/* remove from the comment char to end of line */
if (c == com_char) {
while (c != '\n') {
/* end of file without newline! */
return (EOF);
}
}
assert(c == '\n');
if (!hadtok) {
/*
* If there were no tokens on this line,
* then just pretend it didn't exist at all.
*/
continue;
}
hadtok = 0;
return (T_NL);
}
/*
* These are all token delimiters. If there
* is a token already in progress, we need to
* process it.
*/
unscanc(c);
return (consume_token());
}
switch (c) {
case '\n':
if (!hadtok) {
/*
* If the line was completely devoid of tokens,
* then just ignore it.
*/
continue;
}
/* we're starting a new line, reset the token state */
hadtok = 0;
return (T_NL);
case ',':
hadtok = 1;
return (T_COMMA);
case ';':
hadtok = 1;
return (T_SEMI);
case '(':
hadtok = 1;
return (T_LPAREN);
case ')':
hadtok = 1;
return (T_RPAREN);
case '>':
hadtok = 1;
return (T_GT);
case '<':
/* symbol start! */
hadtok = 1;
return (get_symbol());
case ' ':
case '\t':
/* whitespace, just ignore it */
continue;
case '"':
hadtok = 1;
instring = 1;
return (T_QUOTE);
default:
/* ordinary character: becomes part of the current token */
hadtok = 1;
add_tok(c);
continue;
}
}
return (EOF);
}
void
{
exit(1);
}
void
{
char *msg;
exit(1);
}
void
{
char *msg;
warnings++;
}