cmd/iconv/scanner.c

	scanner.c revision 48edc7cf07b5dccc3ad84bf2dafe4150bd666d60
2N/A/*
2N/A * This file and its contents are supplied under the terms of the
2N/A * Common Development and Distribution License ("CDDL"), version 1.0.
2N/A * You may only use this file in accordance with the terms of version
2N/A * 1.0 of the CDDL.
2N/A *
2N/A * A full copy of the text of the CDDL should have accompanied this
2N/A * source.  A copy of the CDDL is also available via the Internet at
2N/A * http://www.illumos.org/license/CDDL.
2N/A */
2N/A
2N/A/*
2N/A * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
2N/A */
2N/A
2N/A/*
2N/A * This file contains the "scanner", which tokenizes charmap files
2N/A * for iconv for processing by the higher level grammar processor.
2N/A */
2N/A
2N/A#include <stdio.h>
2N/A#include <stdlib.h>
2N/A#include <ctype.h>
2N/A#include <limits.h>
2N/A#include <string.h>
2N/A#include <widec.h>
2N/A#include <sys/types.h>
2N/A#include <assert.h>
2N/A#include "charmap.h"
2N/A#include "parser.tab.h"
2N/A
2N/Aint         com_char = '#';
2N/Aint         esc_char = '\\';
2N/Aint         mb_cur_min = 1;
2N/Aint         mb_cur_max = MB_LEN_MAX;
2N/Aint         lineno = 1;
2N/Aint         warnings = 0;
2N/Astatic int      nextline;
2N/Astatic FILE     *input = stdin;
2N/Astatic const char   *filename = "<stdin>";
2N/Astatic int      instring = 0;
2N/Astatic int      escaped = 0;
2N/A
/*
 * Token space ... grows on demand.
 *
 * token is the NUL-terminated text accumulated by add_tok(), tokidx is
 * the number of characters currently stored in it and toksz is the
 * allocated size of the buffer.  hadtok records whether anything has
 * been scanned on the current line, so that empty lines can be skipped.
 */
static char *token = NULL;
static int tokidx;
static int toksz = 0;
static int hadtok = 0;
2N/A
2N/A/*
2N/A * The last keyword seen.  This is useful to trigger the special lexer rules
2N/A * for "copy" and also collating symbols and elements.
2N/A */
2N/Aint last_kw = 0;
2N/Astatic int  category = T_END;
2N/A
2N/Astatic struct token {
2N/A    int id;
2N/A    const char *name;
2N/A} keywords[] = {
2N/A    { T_COM_CHAR,       "comment_char" },
2N/A    { T_ESC_CHAR,       "escape_char" },
2N/A    { T_END,        "END" },
2N/A
2N/A    /*
2N/A     * These are keywords used in the charmap file.  Note that
2N/A     * Solaris orginally used angle brackets to wrap some of them,
2N/A     * but we removed that to simplify our parser.  The first of these
2N/A     * items are "global items."
2N/A     */
2N/A    { T_CHARMAP,        "CHARMAP" },
2N/A    { T_WIDTH,      "WIDTH" },
2N/A    { T_WIDTH_DEFAULT,  "WIDTH_DEFAULT" },
2N/A
2N/A    { -1, NULL },
2N/A};
2N/A
2N/A/*
2N/A * These special words are only used in a charmap file, enclosed in <>.
2N/A */
2N/Astatic struct token symwords[] = {
2N/A    { T_COM_CHAR,       "comment_char" },
2N/A    { T_ESC_CHAR,       "escape_char" },
2N/A    { T_CODE_SET,       "code_set_name" },
2N/A    { T_MB_CUR_MAX,     "mb_cur_max" },
2N/A    { T_MB_CUR_MIN,     "mb_cur_min" },
2N/A    { -1, NULL },
2N/A};
2N/A
2N/Astatic int categories[] = {
2N/A    T_CHARMAP,
2N/A    0
2N/A};
2N/A
2N/Avoid
2N/Areset_scanner(const char *fname)
2N/A{
2N/A    if (fname == NULL) {
        filename = "<stdin>";
        input = stdin;
    } else {
        if (input != stdin)
            (void) fclose(input);
        if ((input = fopen(fname, "r")) == NULL) {
            perror(fname);
            exit(1);
        }
        filename = fname;
    }
    com_char = '#';
    esc_char = '\\';
    instring = 0;
    escaped = 0;
    lineno = 1;
    nextline = 1;
    tokidx = 0;
    last_kw = 0;
    category = T_END;
}

/*
 * hex(x) yields the numeric value (0-15) of the hexadecimal digit x;
 * the caller must already know that x is a hex digit.  isodigit(x) is
 * true when x is an octal digit.  Both evaluate their argument more
 * than once, so the argument must not have side effects.
 */
#define hex(x)  \
    (isdigit(x) ? ((x) - '0') : \
    ((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define isodigit(x) (((x) >= '0') && ((x) <= '7'))

/*
 * Fetch the next character from the input.  lineno is brought up to date
 * with the line that character is on, and reading a newline advances
 * the line number that will apply to whatever follows it.
 */
static int
scanc(void)
{
    int ch = getc(input);

    lineno = nextline;
    if (ch == '\n')
        nextline++;
    return (ch);
}

/*
 * Push a character obtained from scanc() back onto the input, undoing
 * the line accounting if it was a newline.
 *
 * EOF is accepted and ignored: ungetc() always fails when asked to push
 * back EOF, and the stream will simply report end of file again on the
 * next read.  Without this, every lookahead that happens to hit the end
 * of the input would abort with "ungetc failed".
 */
static void
unscanc(int c)
{
    if (c == EOF) {
        return;
    }
    if (c == '\n') {
        nextline--;
    }
    if (ungetc(c, input) < 0) {
        yyerror(_("ungetc failed"));
    }
}

/*
 * Read exactly two hexadecimal digits from the input and return the
 * value they encode.  Anything other than a hex digit is reported
 * through yyerror().
 */
static int
scan_hex_byte(void)
{
    int value = 0;
    int i;

    for (i = 0; i < 2; i++) {
        int digit = scanc();

        if (!isxdigit(digit)) {
            yyerror(_("malformed hex digit"));
            return (0);
        }
        /* shift the previous nibble up and append this one */
        value = (value << 4) | hex(digit);
    }
    return (value);
}

/*
 * Read a decimal number of two or three digits from the input and
 * return its value.  A missing first or second digit is reported
 * through yyerror(); a non-digit in third position is pushed back.
 */
static int
scan_dec_byte(void)
{
    int value = 0;
    int digit;
    int i;

    /* the first two digits are mandatory */
    for (i = 0; i < 2; i++) {
        digit = scanc();
        if (!isdigit(digit)) {
            yyerror(_("malformed decimal digit"));
            return (0);
        }
        value = (value * 10) + (digit - '0');
    }

    /* the third digit is optional */
    digit = scanc();
    if (isdigit(digit)) {
        value = (value * 10) + (digit - '0');
    } else {
        unscanc(digit);
    }
    return (value);
}

/*
 * Read an octal number of two or three digits from the input and
 * return its value.  A missing first or second digit is reported
 * through yyerror(); a non-octal character in third position is
 * pushed back.
 */
static int
scan_oct_byte(void)
{
    int value = 0;
    int digit;
    int i;

    /* the first two digits are mandatory */
    for (i = 0; i < 2; i++) {
        digit = scanc();
        if (!isodigit(digit)) {
            yyerror(_("malformed octal digit"));
            return (0);
        }
        value = (value * 8) + (digit - '0');
    }

    /* the third digit is optional */
    digit = scanc();
    if (isodigit(digit)) {
        value = (value * 8) + (digit - '0');
    } else {
        unscanc(digit);
    }
    return (value);
}

/*
 * Append the character c to the token being accumulated, keeping the
 * buffer NUL-terminated.  The buffer is enlarged in steps of 64 bytes
 * whenever there is no room for the character plus its terminator.
 *
 * If the buffer cannot be enlarged the failure is reported through
 * yyerror() and the character is dropped; the existing buffer and its
 * recorded size are left untouched.
 */
void
add_tok(int c)
{
    if ((tokidx + 1) >= toksz) {
        int newsz = toksz + 64;
        char *grown = realloc(token, newsz);

        /*
         * Only commit the new pointer and size once realloc() has
         * succeeded, so a failure never loses the old buffer.
         */
        if (grown == NULL) {
            yyerror(_("out of memory"));
            return;
        }
        token = grown;
        toksz = newsz;
    }

    token[tokidx++] = (char)c;
    token[tokidx] = 0;
}

/*
 * Try to read one numerically escaped byte: the escape character
 * followed by 'd'/'D' and decimal digits, 'x'/'X' and hex digits, or
 * octal digits.  Returns the byte value, or EOF when the input does not
 * start with such an escape, in which case what was read is pushed back.
 */
static int
get_byte(void)
{
    int lead;
    int kind;

    lead = scanc();
    if (lead != esc_char) {
        unscanc(lead);
        return (EOF);
    }

    kind = scanc();
    if (kind == 'd' || kind == 'D')
        return (scan_dec_byte());
    if (kind == 'x' || kind == 'X')
        return (scan_hex_byte());

    /*
     * An octal digit is part of the number itself, so it goes back for
     * scan_oct_byte() to read; anything else is not ours to consume.
     */
    unscanc(kind);
    if (isodigit(kind))
        return (scan_oct_byte());

    unscanc(esc_char);
    return (EOF);
}

/*
 * Translate the character following an escape into the character it
 * stands for.  The letters n, r, t, f, v, b and a map to the matching
 * control characters; every other value is returned unchanged.
 */
int
get_escaped(int c)
{
    static const struct {
        int letter;
        int value;
    } escapes[] = {
        { 'n', '\n' },
        { 'r', '\r' },
        { 't', '\t' },
        { 'f', '\f' },
        { 'v', '\v' },
        { 'b', '\b' },
        { 'a', '\a' },
    };
    size_t i;

    for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
        if (escapes[i].letter == c)
            return (escapes[i].value);
    }
    return (c);
}

/*
 * Collect a run of numerically escaped bytes into yylval.mbs as one
 * multibyte character.  The byte count is stored in yylval.mbs[0] and
 * the bytes themselves, NUL-terminated, follow from yylval.mbs[1].
 *
 * Returns T_CHAR on success.  A sequence longer than mb_cur_max, or an
 * mb_cur_max larger than MB_LEN_MAX, is reported through yyerror().
 */
int
get_wide(void)
{
    char *out = yylval.mbs + 1;    /* slot 0 is reserved for the length */
    int nbytes = 0;
    int byte;

    *out = 0;
    if (mb_cur_max > MB_LEN_MAX) {
        yyerror(_("max multibyte character size too big"));
        return (T_NULL);
    }

    while ((byte = get_byte()) != EOF) {
        if (nbytes == mb_cur_max) {
            unscanc(byte);
            yyerror(_("length > mb_cur_max"));
            return (T_NULL);
        }
        out[nbytes] = byte;
        nbytes++;
        out[nbytes] = 0;
    }

    /* result in yylval.mbs */
    yylval.mbs[0] = nbytes;
    return (T_CHAR);
}

/*
 * Scan a symbolic name; the opening '<' has already been consumed and
 * the name runs up to the closing '>'.  Escape sequences inside the
 * name are resolved, and an escaped newline is dropped.
 *
 * Returns the keyword id when no category is open and the name is one
 * of the special symwords; otherwise returns T_SYMBOL with a freshly
 * allocated copy of the name in yylval.token.  An empty name, a newline
 * inside the name or end of input are reported through yyerror().
 */
int
get_symbol(void)
{
    int c;

    while ((c = scanc()) != EOF) {
        if (escaped) {
            escaped = 0;
            if (c == '\n')
                continue;
            add_tok(get_escaped(c));
            continue;
        }
        if (c == esc_char) {
            escaped = 1;
            continue;
        }
        if (c == '\n') {    /* well that's strange! */
            yyerror(_("unterminated symbolic name"));
            continue;
        }
        if (c == '>') {     /* end of symbol */

            /*
             * The token buffer is reused between tokens, so a
             * non-NULL buffer does not mean a name was scanned:
             * it may still hold the text of an earlier token.
             * Only tokidx tells whether "<>" was empty.
             */
            if (token == NULL || tokidx == 0) {
                yyerror(_("missing symbolic name"));
                return (T_NULL);
            }

            /*
             * This restarts the token from the beginning
             * the next time we scan a character.  (This
             * token is complete.)
             */
            tokidx = 0;

            /*
             * A few symbols are handled as keywords outside
             * of the normal categories.
             */
            if (category == T_END) {
                int i;
                for (i = 0; symwords[i].name != NULL; i++) {
                    if (strcmp(token, symwords[i].name) ==
                        0) {
                        last_kw = symwords[i].id;
                        return (last_kw);
                    }
                }
            }
            /* its an undefined symbol */
            yylval.token = strdup(token);
            if (yylval.token == NULL) {
                perror("malloc");
                exit(1);
            }
            /* the copy is handed out; do not leak our buffer */
            free(token);
            token = NULL;
            toksz = 0;
            tokidx = 0;
            return (T_SYMBOL);
        }
        add_tok(c);
    }

    yyerror(_("unterminated symbolic name"));
    return (EOF);
}


/*
 * Classify the token accumulated so far and reset the accumulator.
 *
 * Returns T_NULL if there is no token buffer, T_ELLIPSIS for "...", the
 * keyword id for a reserved word, T_NUMBER (value in yylval.num) for a
 * decimal constant, T_CHAR (in yylval.mbs) for a single character, and
 * otherwise T_NAME with a freshly allocated copy of the text in
 * yylval.token.  Running out of memory for that copy is fatal.
 */
static int
consume_token(void)
{
    int len = tokidx;
    int i;

    tokidx = 0;
    if (token == NULL)
        return (T_NULL);

    /*
     * this one is special, because we don't want it to alter the
     * last_kw field.
     */
    if (strcmp(token, "...") == 0) {
        return (T_ELLIPSIS);
    }

    /* search for reserved words first */
    for (i = 0; keywords[i].name; i++) {
        int j;
        if (strcmp(keywords[i].name, token) != 0) {
            continue;
        }

        last_kw = keywords[i].id;

        /* clear the top level category if we're done with it */
        if (last_kw == T_END) {
            category = T_END;
        }

        /* set the top level category if we're changing */
        for (j = 0; categories[j]; j++) {
            if (categories[j] != last_kw)
                continue;
            category = last_kw;
        }

        return (keywords[i].id);
    }

    /*
     * maybe its a numeric constant?  (The casts keep bytes with the
     * high bit set from reaching isdigit() as negative values.)
     */
    if (isdigit((unsigned char)*token) ||
        (*token == '-' && isdigit((unsigned char)token[1]))) {
        char *eptr;
        yylval.num = strtol(token, &eptr, 10);
        if (*eptr != 0)
            yyerror(_("malformed number"));
        return (T_NUMBER);
    }

    /*
     * A single lone character is treated as a character literal.
     * To avoid duplication of effort, we stick in the charmap.
     */
    if (len == 1) {
        yylval.mbs[0] = 1; /* length */
        yylval.mbs[1] = token[0];
        yylval.mbs[2] = '\0';
        return (T_CHAR);
    }

    /* anything else is treated as a symbolic name */
    yylval.token = strdup(token);
    if (yylval.token == NULL) {
        perror("malloc");
        exit(1);
    }
    /* the copy is handed out; do not leak our buffer */
    free(token);
    token = NULL;
    toksz = 0;
    tokidx = 0;
    return (T_NAME);
}

/*
 * Discard input up to and including the next newline.  Reaching the end
 * of the input before a newline is reported through errf().
 */
void
scan_to_eol(void)
{
    int ch;

    for (;;) {
        ch = scanc();
        if (ch == '\n')
            break;
        if (ch == EOF) {
            /* end of file without newline! */
            errf(_("missing newline"));
            return;
        }
    }
    assert(ch == '\n');
}

/*
 * The lexer entry point: return the id of the next token in the input,
 * or EOF once the input is exhausted.  Token values are passed back in
 * yylval (mbs for T_CHAR, num for T_NUMBER, token for names and
 * symbols).
 *
 * Two modes are handled.  Inside a quoted string (instring) every
 * character comes back on its own as a T_CHAR, apart from symbols in
 * <>, numeric escapes and the closing quote.  Outside a string,
 * characters are gathered with add_tok() until a delimiter is seen, at
 * which point consume_token() classifies what was gathered.
 *
 * T_NL is only returned for lines on which something was scanned
 * (hadtok); blank lines and comment-only lines are skipped silently.
 */
int
yylex(void)
{
    int     c;

    while ((c = scanc()) != EOF) {

        /* special handling for quoted string */
        if (instring) {
            if (escaped) {
                escaped = 0;

                /* if newline, just eat and forget it */
                if (c == '\n')
                    continue;

                /*
                 * A numeric escape: put both the selector
                 * and the escape character back so that
                 * get_wide() sees the whole sequence.
                 */
                if (strchr("xXd01234567", c)) {
                    unscanc(c);
                    unscanc(esc_char);
                    return (get_wide());
                }
                yylval.mbs[0] = 1; /* length */
                yylval.mbs[1] = get_escaped(c);
                yylval.mbs[2] = '\0';
                return (T_CHAR);
            }
            if (c == esc_char) {
                escaped = 1;
                continue;
            }
            switch (c) {
            case '<':
                return (get_symbol());
            case '>':
                /* oops! should generate syntax error  */
                return (T_GT);
            case '"':
                /* closing quote ends string mode */
                instring = 0;
                return (T_QUOTE);
            default:
                yylval.mbs[0] = 1; /* length */
                yylval.mbs[1] = c;
                yylval.mbs[2] = '\0';
                return (T_CHAR);
            }
        }

        /* escaped characters first */
        if (escaped) {
            escaped = 0;
            if (c == '\n') {
                /* eat the newline */
                continue;
            }
            hadtok = 1;
            if (tokidx) {
                /* an escape mid-token is nonsense */
                return (T_NULL);
            }

            /* numeric escapes are treated as wide characters */
            if (strchr("xXd01234567", c)) {
                unscanc(c);
                unscanc(esc_char);
                return (get_wide());
            }

            add_tok(get_escaped(c));
            continue;
        }

        /* if it is the escape character itself note it */
        if (c == esc_char) {
            escaped = 1;
            continue;
        }

        /* remove from the comment char to end of line */
        if (c == com_char) {
            while (c != '\n') {
                if ((c = scanc()) == EOF) {
                    /* end of file without newline! */
                    return (EOF);
                }
            }
            assert(c == '\n');
            if (!hadtok) {
                /*
                 * If there were no tokens on this line,
                 * then just pretend it didn't exist at all.
                 */
                continue;
            }
            hadtok = 0;
            return (T_NL);
        }

        if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
            /*
             * These are all token delimiters.  If there
             * is a token already in progress, we need to
             * process it.
             *
             * The delimiter is pushed back so that it is
             * handled on its own by the next call.
             */
            unscanc(c);
            return (consume_token());
        }

        switch (c) {
        case '\n':
            if (!hadtok) {
                /*
                 * If the line was completely devoid of tokens,
                 * then just ignore it.
                 */
                continue;
            }
            /* we're starting a new line, reset the token state */
            hadtok = 0;
            return (T_NL);
        case ',':
            hadtok = 1;
            return (T_COMMA);
        case ';':
            hadtok = 1;
            return (T_SEMI);
        case '(':
            hadtok = 1;
            return (T_LPAREN);
        case ')':
            hadtok = 1;
            return (T_RPAREN);
        case '>':
            hadtok = 1;
            return (T_GT);
        case '<':
            /* symbol start! */
            hadtok = 1;
            return (get_symbol());
        case ' ':
        case '\t':
            /* whitespace, just ignore it */
            continue;
        case '"':
            /* opening quote starts string mode */
            hadtok = 1;
            instring = 1;
            return (T_QUOTE);
        default:
            /* an ordinary character: it becomes part of the token */
            hadtok = 1;
            add_tok(c);
            continue;
        }
    }
    return (EOF);
}

void
yyerror(const char *msg)
{
    (void) fprintf(stderr, _("%s: %d: error: %s\n"),
        filename, lineno, msg);
    exit(1);
}

/*
 * Report a fatal error using a printf-style format.  The formatted
 * message is written to stderr, prefixed with the input name and the
 * current line number, and the program then exits with status 1; this
 * function does not return.
 *
 * If the message cannot be formatted, the raw format string is printed
 * in its place so that the error is still reported.
 */
void
errf(const char *fmt, ...)
{
    char    *msg = NULL;

    va_list va;
    va_start(va, fmt);
    /* on failure the contents of msg are not defined; do not use them */
    if (vasprintf(&msg, fmt, va) < 0)
        msg = NULL;
    va_end(va);

    (void) fprintf(stderr, _("%s: %d: error: %s\n"),
        filename, lineno, (msg != NULL) ? msg : fmt);
    free(msg);
    exit(1);
}

/*
 * Report a warning using a printf-style format.  The formatted message
 * is written to stderr, prefixed with the input name and the current
 * line number, and the global warnings counter is incremented.
 *
 * If the message cannot be formatted, the raw format string is printed
 * in its place so that the warning is still reported.
 */
void
warn(const char *fmt, ...)
{
    char    *msg = NULL;

    va_list va;
    va_start(va, fmt);
    /* on failure the contents of msg are not defined; do not use them */
    if (vasprintf(&msg, fmt, va) < 0)
        msg = NULL;
    va_end(va);

    (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
        filename, lineno, (msg != NULL) ? msg : fmt);
    free(msg);
    warnings++;
}