msgcvt.c revision da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968
/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 2000-2007 AT&T Knowledge Ventures *
* and is licensed under the *
* Common Public License, Version 1.0 *
* by AT&T Knowledge Ventures *
* *
* A copy of the License is available at *
* http://www.opensource.org/licenses/cpl1.0.txt *
* (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* Glenn Fowler
* AT&T Research
*/
/*
 * option and manual description string handed to optget() in main()
 * it describes the -h/--html, -m/--msg and -r/--raw options
 */
static const char usage[] =
"[-?\n@(#)$Id: msgcvt (AT&T Research) 2000-05-01 $\n]"
USAGE_LICENSE
"[+NAME?msgcvt - convert message file to/from html]"
"[+DESCRIPTION?\bmsgcvt\b reads a \bgencat\b(1) format file on the standard"
" input and converts it to \bhtml\b on the standard output. The input"
" file must contain the control statement \b$quote \"\b and use the \""
" character to quote message text. The output is in a form suitable for"
" automatic translation by web sites like"
" \bhttp://babelfish.altavista.com/\b or filters like"
" \btranslate\b(1).]"
"[h:html?Generate \bhtml\b from \bgencat\b(1) input. This is the default.]"
"[m:msg?Generate a \bgencat\b(1) message file from (presumably translated)"
" \bhtml\b. Wide characters are UTF-8 encoded.]"
"[r:raw?The message file is raw message text, one message per line, with no"
" quoting or line numbering.]"
"[+SEE ALSO?\bgencat\b(1), \bmsgcc\b(1), \bmsggen\b(1), \btranslate\b(1)]"
;
#include <ast.h>
#include <ctype.h>
#include <error.h>
/* converter flag bits */
#define MSG_RAW (1<<0) /* set by -r: raw message text */
#define MSG_SPLICE (1<<1) /* set by msg2html() when a line ends in \ */
/*
 * if s points at a white space character, or at a literal \n or \t
 * escape sequence, then advance s past it and evaluate non-zero;
 * otherwise leave s alone and evaluate to 0
 *
 * s must be a modifiable char* lvalue and is evaluated more than once
 * the character is converted to unsigned char before it is classified
 * because isspace() is undefined for negative values other than EOF
 */
#define SPACE(s) ((isspace((unsigned char)*(s))&&((s)+=1))||(*(s)=='\\'&&(*((s)+1)=='n'||*((s)+1)=='t')&&((s)+=2)))
/* converter signature: main() calls it as (input stream, output stream, flags) */
typedef void (*Convert_f)(Sfio_t*, Sfio_t*, int);
/*
 * one named HTML character entity
 */
typedef struct
{
	const char*	name;	/* entity name, without the & and ; */
	int		code;	/* character value the entity stands for */
} Code_t;

/*
 * the entity names recognized by decode()
 * the table is searched linearly and names are case sensitive
 */
static const Code_t codes[] =
{
	{ "aacute",	225 },
	{ "Aacute",	193 },
	{ "acirc",	226 },
	{ "Acirc",	194 },
	{ "aelig",	230 },
	{ "AElig",	198 },
	{ "agrave",	224 },
	{ "Agrave",	192 },
	{ "amp",	'&' },
	{ "aring",	229 },
	{ "Aring",	197 },
	{ "atilde",	227 },
	{ "Atilde",	195 },
	{ "auml",	228 },
	{ "Auml",	196 },
	{ "ccedil",	231 },
	{ "Ccedil",	199 },
	{ "copy",	169 },
	{ "eacute",	233 },
	{ "Eacute",	201 },
	{ "ecirc",	234 },
	{ "Ecirc",	202 },
	{ "egrave",	232 },
	{ "Egrave",	200 },
	{ "euml",	235 },
	{ "Euml",	203 },
	{ "gt",		'>' },
	{ "iacute",	237 },
	{ "Iacute",	205 },
	{ "icirc",	238 },
	{ "Icirc",	206 },
	{ "igrave",	236 },
	{ "Igrave",	204 },
	{ "iuml",	239 },
	{ "Iuml",	207 },
	{ "lt",		'<' },
	{ "nbsp",	' ' },
	{ "ntilde",	241 },
	{ "Ntilde",	209 },
	{ "oacute",	243 },
	{ "Oacute",	211 },
	{ "ocirc",	244 },
	{ "Ocirc",	212 },
	{ "ograve",	242 },
	{ "Ograve",	210 },
	{ "oslash",	248 },
	{ "Oslash",	216 },
	{ "otilde",	245 },
	{ "Otilde",	213 },
	{ "ouml",	246 },
	{ "Ouml",	214 },
	{ "quot",	'"' },
	{ "reg",	174 },
	{ "szlig",	223 },
	{ "uacute",	250 },
	{ "Uacute",	218 },
	{ "ucirc",	251 },
	{ "Ucirc",	219 },
	{ "ugrave",	249 },
	{ "Ugrave",	217 },
	{ "uuml",	252 },
	{ "Uuml",	220 },
	{ "yuml",	255 },
};
/*
 * decode one HTML character reference from ip
 *
 * called after the introducing '&' has been read
 * the text up to the terminating ';' is either # followed by a decimal
 * number or one of the names in codes[]
 * a '&' met while the name is being collected restarts the collection
 *
 * returns the decoded character value
 * if the reference is malformed or unknown then a level 1 error()
 * message is issued, the collected name characters are pushed back
 * onto ip and '&' is returned so that the caller takes the '&' literally
 * a terminating ';' that was already read is not pushed back
 * '&' is also returned, with no message, if ip is at EOF
 */
static int
decode(Sfio_t* ip)
{
	register int	c;
	register int	i;	/* number of characters collected in name[] */
	register int	k;	/* codes[] lookup index */
	char		name[32];

	if ((c = sfgetc(ip)) == EOF)
		return '&';
	name[0] = c;
	i = 1;
	if (c != '#' && !isalpha(c))
		goto bad;
	while ((c = sfgetc(ip)) != EOF && c != ';')
	{
		if (c == '&')
			i = 0;
		else
		{
			name[i++] = c;
			/* '#' is only accepted as the first character; leave room for the terminating 0 */
			if (!isalnum(c) && (i > 1 || c != '#') || i >= (elementsof(name) - 1))
				goto bad;
		}
	}
	name[i] = 0;
	if (name[0] == '#')
	{
		switch (c = strtol(name + 1, NiL, 10))
		{
		case 91:
			c = '[';
			break;
		case 93:
			c = ']';
			break;
		}
	}
	else
	{
		/*
		 * the lookup uses its own index: i must keep counting the
		 * characters in name[] because the bad: code below uses it
		 * to terminate name[] and to push the characters back
		 */
		for (k = 0; k < elementsof(codes); k++)
			if (streq(codes[k].name, name))
				break;
		if (k >= elementsof(codes))
			goto bad;
		c = codes[k].code;
	}
	return c;
 bad:
	name[i] = 0;
	/* c is ';' only when a complete reference was read but not found in codes[] */
	if (c == ';')
		error(1, "&%s: unknown HTML special character -- & assumed", name);
	else
		error(1, "&%s: invalid HTML special character -- & assumed", name);
	/* push back in reverse so that name[0] is the next character read */
	while (i--)
		sfungetc(ip, name[i]);
	return '&';
}
/*
 * write the character value w to op UTF-8 encoded
 *
 * 0..0x7F is written as one byte, 0x80..0x7FF as two bytes and
 * 0x800..0xFFFF as three bytes
 * any other value, including a negative one, is written as a single '?'
 * returns the value of the last sfputc() call
 */
static int
sfpututf(Sfio_t* op, register int w)
{
	if (!(w & ~0x7F))
		return sfputc(op, w);
	if (!(w & ~0x7FF))
		sfputc(op, 0xC0 | (w >> 6));
	else if (!(w & ~0xFFFF))
	{
		sfputc(op, 0xE0 | (w >> 12));
		/*
		 * the mask must be applied before the 0x80 marker is added:
		 * 0x80 + (w >> 6) & 0x3F parses as (0x80 + (w >> 6)) & 0x3F
		 * which drops the marker from the continuation byte
		 */
		sfputc(op, 0x80 | ((w >> 6) & 0x3F));
	}
	else
		return sfputc(op, '?');
	/* trailing continuation byte shared by the 2 and 3 byte forms */
	return sfputc(op, 0x80 | (w & 0x3F));
}
/*
 * read characters from ip until one is not white space
 * returns that character, or EOF if the input ran out first
 */
static int
sfnext(Sfio_t* ip)
{
	register int	ch;

	do
	{
		ch = sfgetc(ip);
	} while (isspace(ch));
	return ch;
}
/*
 * convert the html on ip back into a message file on op
 *
 * only the text between an <OL START="550717"> tag (the tag written by
 * msg2html()) and the following </OL> is converted; input outside of
 * such a list is read and discarded
 * inside the list each <LI> followed by a number starts a new message,
 * written on a new line as the number, a space and the message text
 * in double quotes
 * character references are decoded by decode() and written by sfpututf()
 * if flags is non-zero then '"' is copied without a preceding backslash
 * and newlines in the text are copied instead of being folded into the
 * surrounding white space
 */
static void
html2msg(register Sfio_t* ip, register Sfio_t* op, int flags)
{
register int c;
/* 0 until the first message has been started, then the closing quote still owed to op */
register int q;
again:
/* discard input up to and including the next <OL START="550717"> tag */
while ((c = sfgetc(ip)) != EOF)
if (c == '<')
{
/* sfnext() allows white space between the characters; after OL at least one space is required */
if ((c = sfnext(ip)) == 'O' &&
(c = sfnext(ip)) == 'L' &&
isspace(c = sfgetc(ip)) &&
(c = sfnext(ip)) == 'S' &&
(c = sfnext(ip)) == 'T' &&
(c = sfnext(ip)) == 'A' &&
(c = sfnext(ip)) == 'R' &&
(c = sfnext(ip)) == 'T' &&
(c = sfnext(ip)) == '=' &&
(c = sfnext(ip)) == '"' &&
(c = sfnext(ip)) == '5' &&
(c = sfnext(ip)) == '5' &&
(c = sfnext(ip)) == '0' &&
(c = sfnext(ip)) == '7' &&
(c = sfnext(ip)) == '1' &&
(c = sfnext(ip)) == '7' &&
(c = sfnext(ip)) == '"' &&
(c = sfnext(ip)) == '>')
break;
/* some other tag: skip the rest of it */
while (c != EOF && c != '>')
c = sfgetc(ip);
}
/* drop the white space that follows the list tag */
if ((c = sfnext(ip)) != EOF)
sfungetc(ip, c);
q = 0;
/*
 * convert the list body one character at a time
 * continue goes on with the next character; a break out of the outer
 * switch reaches the break after it and ends the conversion
 */
for (;;)
{
switch (c = sfgetc(ip))
{
case EOF:
break;
case '&':
c = decode(ip);
sfpututf(op, c);
/* a reference that decodes to white space also absorbs the white space after it */
/* NOTE(review): c may be any value strtol() produced in decode(); confirm isspace() accepts values outside the unsigned char range here */
if (isspace(c))
{
while (isspace(c = sfgetc(ip)));
if (c == EOF)
break;
sfungetc(ip, c);
}
continue;
case '<':
/* a tag: the recognized ones are handled here, the rest of any tag is skipped after this switch */
switch (c = sfnext(ip))
{
case '/':
/* </OL> ends the list: close the last message and look for the next list */
if ((c = sfnext(ip)) == 'O' &&
(c = sfgetc(ip)) == 'L' &&
(c = sfnext(ip)) == '>')
{
if (q)
{
sfputc(op, q);
/* q is reset to 0 again after the again: scan */
q = '"';
}
goto again;
}
break;
case 'B':
/* <BR> becomes a space */
if ((c = sfgetc(ip)) == 'R' &&
(c = sfnext(ip)) == '>')
sfputc(op, ' ');
break;
case 'L':
/* <LI> followed by a digit starts a new message */
if ((c = sfgetc(ip)) == 'I' &&
(c = sfnext(ip)) == '>' &&
isdigit(c = sfnext(ip)))
{
/* close the previous message, if any */
if (q)
sfputc(op, q);
else
q = '"';
sfputc(op, '\n');
/* copy the message number */
do
{
sfputc(op, c);
} while (isdigit(c = sfgetc(ip)));
if (c == EOF)
break;
/* separator and opening quote of the message text */
sfputc(op, ' ');
sfputc(op, '"');
if (isspace(c))
c = sfnext(ip);
/* consume the second <LI> that msg2html() writes after the number */
if (c == '<' &&
(c = sfnext(ip)) == 'L' &&
(c = sfgetc(ip)) == 'I' &&
(c = sfnext(ip)) == '>')
/* great */;
continue;
}
break;
case 'P':
/* <P> becomes a newline */
if ((c = sfnext(ip)) == '>')
sfputc(op, '\n');
/* <PCLASS="..."> (as matched here): the attribute value is copied to op with references decoded */
else if (c == 'C' &&
(c = sfgetc(ip)) == 'L' &&
(c = sfgetc(ip)) == 'A' &&
(c = sfgetc(ip)) == 'S' &&
(c = sfgetc(ip)) == 'S' &&
(c = sfnext(ip)) == '=' &&
(c = sfnext(ip)) == '"')
for (;;)
{
switch (c = sfgetc(ip))
{
case EOF:
case '"':
break;
case '&':
c = decode(ip);
sfpututf(op, c);
continue;
default:
sfpututf(op, c);
continue;
}
break;
}
break;
}
/* skip whatever is left of the tag */
while (c != EOF && c != '>')
c = sfgetc(ip);
/* stop at EOF, otherwise peek one character ahead and put it back */
if (c == EOF || (c = sfgetc(ip)) == EOF)
break;
sfungetc(ip, c);
continue;
case '"':
/* quotes are backslash escaped unless flags is set */
if (!flags)
sfputc(op, '\\');
sfputc(op, c);
continue;
case '\n':
if (flags)
{
sfputc(op, c);
continue;
}
/*FALLTHROUGH*/
case ' ':
case '\t':
/* fold a run of white space into at most one space */
while ((c = sfgetc(ip)) != EOF)
if (c == '&')
{
/* the reference ends the run: write a separating space unless it decodes to white space itself */
c = decode(ip);
if (!isspace(c))
sfputc(op, ' ');
sfpututf(op, c);
break;
}
else if (!isspace(c))
{
if (c == '<')
{
/* look at the character after '<' and put both back */
c = sfgetc(ip);
if (c == EOF)
break;
sfungetc(ip, c);
sfungetc(ip, '<');
/* no space is written before a tag that starts with <L or </ */
if (c != 'L' && c != '/')
sfputc(op, ' ');
}
else
{
/* c cannot be EOF inside this loop, so the character is always put back */
if (c != EOF)
sfungetc(ip, c);
sfputc(op, ' ');
}
break;
}
continue;
case '\r':
case '[':
case ']':
/* these characters are dropped from the output */
continue;
default:
sfpututf(op, c);
continue;
}
break;
}
/* close the last message and end the output with a newline */
if (q)
sfputc(op, q);
sfputc(op, '\n');
}
/*
 * write c to op
 * the characters < > " & [ and ] are written as HTML character
 * references, every other character is written unchanged
 */
static void
encode(Sfio_t* op, register int c)
{
	switch (c)
	{
	case '<':
		sfprintf(op, "&lt;");
		break;
	case '>':
		sfprintf(op, "&gt;");
		break;
	case '"':
		sfprintf(op, "&quot;");
		break;
	case '&':
		sfprintf(op, "&amp;");
		break;
	case '[':
		sfprintf(op, "&#091;");
		break;
	case ']':
		sfprintf(op, "&#093;");
		break;
	default:
		sfputc(op, c);
		break;
	}
}
/*
 * convert the message file on ip to html on op
 *
 * the output is an html document whose body is one <OL START="550717">
 * list; each numbered message becomes <LI>number<LI>text
 * lines that start with $ are written as <P CLASS="..."> with the whole
 * line as the attribute value; other lines that do not start with a
 * digit are dropped
 * within the message text $names, % format specifications, backslash
 * escapes other than \n and \t and some punctuation are placed inside
 * <P CLASS="..."> -- presumably so that a translator leaves them
 * alone; TODO confirm
 * flags is a set of MSG_* bits; while any bit is set a line is not
 * parsed for a message number and <P> is written in front of it instead
 */
static void
msg2html(register Sfio_t* ip, register Sfio_t* op, register int flags)
{
register char* s;
register int c;
/* non-zero while a <P CLASS=" attribute value is open on op */
register int q;
/* non-zero if the previous line was a $ line */
register int p;
sfprintf(op, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"><HTML><HEAD><!-- text massaged for external translation --></HEAD><BODY>\n");
sfprintf(op, "<OL START=\"550717\">\n");
p = q = 0;
/* each record returned by sfgetr() is scanned below as a 0 terminated string */
while (s = sfgetr(ip, '\n', 1))
{
error_info.line++;
/* flags is non-zero for MSG_RAW and also for a MSG_SPLICE left by the previous line */
if (flags)
sfprintf(op, "<P>");
else
{
if (*s == '$')
{
/* consecutive $ lines are separated by <P> */
if (p)
sfprintf(op, "<P>");
else
p = 1;
/* NOTE(review): the format has no conversion, so the s argument is unused */
sfprintf(op, "<P CLASS=\"", s);
while (c = *s++)
encode(op, c);
sfprintf(op, "\">\n");
continue;
}
p = 0;
/* lines that do not start with a message number are dropped */
if (!isdigit(*s))
continue;
sfprintf(op, "<LI>");
while (isdigit(c = *s++))
sfputc(op, c);
sfprintf(op, "<LI>");
/* skip to the character after the opening quote of the message text */
while (c && c != '"')
c = *s++;
/* no quote on the line: step back onto the terminating 0 */
if (!c)
s--;
else if (isspace(*s))
{
/* white space right after the opening quote is written as <BR> */
s++;
sfprintf(op, "<BR>");
}
}
/*
 * convert the text one character at a time
 * continue goes on with the next character; a break out of the switch
 * reaches the break after it and ends the line
 */
for (;;)
{
switch (c = *s++)
{
case 0:
/* end of line: a splice applies to one continuation line only */
flags &= ~MSG_SPLICE;
if (q)
{
q = 0;
sfprintf(op, "\">");
}
sfputc(op, '\n');
break;
case '<':
sfprintf(op, "&lt;");
continue;
case '>':
sfprintf(op, "&gt;");
continue;
case '&':
sfprintf(op, "&amp;");
continue;
case '[':
sfprintf(op, "&#091;");
continue;
case ']':
sfprintf(op, "&#093;");
continue;
case '$':
/* $ and the alphanumeric characters after it go inside CLASS="..." */
if (!q)
{
q = 1;
sfprintf(op, "<P CLASS=\"");
}
sfputc(op, c);
while (isalnum(c = *s++))
sfputc(op, c);
/* put back the character that ended the name */
s--;
continue;
case '%':
/* a % sequence goes inside CLASS="..." */
if (!q)
{
q = 1;
sfprintf(op, "<P CLASS=\"");
}
sfputc(op, c);
if (*s == '%')
sfputc(op, *s++);
else
/*
 * copy until a lower case letter other than h or l has been copied,
 * or until a letter that is not followed by another letter has been
 * copied; stop early at the end of the line or at a quote
 */
do
{
if (!(c = *s++) || c == '"')
{
s--;
break;
}
encode(op, c);
} while (!isalpha(c) || (!islower(c) || c == 'h' || c == 'l') && isalpha(*s));
/* white space (or \n or \t) right after it is written as &nbsp; */
if (SPACE(s))
sfprintf(op, "&nbsp;");
continue;
case '"':
/* unless MSG_RAW is set a quote ends the message text: s is pointed at an empty string so that case 0 runs next */
if (!(flags & MSG_RAW))
{
s = "";
continue;
}
/*FALLTHROUGH*/
case '\'':
case ':':
case '/':
case '+':
case '@':
/* these characters open CLASS="..." if it is not already open */
if (!q)
{
q = 1;
sfprintf(op, "<P CLASS=\"");
}
/*FALLTHROUGH*/
case '.':
case ',':
/* . and , are copied without opening or closing CLASS="..." */
sfputc(op, c);
if (SPACE(s))
sfprintf(op, "&nbsp;");
continue;
case '\\':
/* a backslash at the end of the line: note the splice and end the line without the case 0 processing */
if (!(c = *s++))
{
flags |= MSG_SPLICE;
break;
}
/* escapes other than \n and \t go inside CLASS="..." */
if (c != 'n' && c != 't')
{
if (!q)
{
q = 1;
sfprintf(op, "<P CLASS=\"");
}
sfputc(op, '\\');
encode(op, c);
if (c == 'b')
{
/*
 * after \b the following text is copied too, up to the end of the
 * line, a quote, a single ? or a following \a, \b or \0 escape
 * NOTE(review): this looks like the \b...\b markup used in optget()
 * usage strings such as the one in this file -- confirm
 */
for (;;)
{
if (!(c = *s++) || c == '"')
{
s--;
break;
}
if (c == '?')
{
/* ?? is copied; a single ? ends the copy and is left for the outer loop */
if (*s != '?')
{
s--;
break;
}
sfputc(op, c);
sfputc(op, *s++);
continue;
}
if (c == '\\')
{
if (!*s)
break;
sfputc(op, c);
/* \a, \b and \0 end the copy after being written */
if (*s == 'a' || *s == 'b' || *s == '0')
{
sfputc(op, *s++);
break;
}
c = *s++;
}
encode(op, c);
}
}
else if (isdigit(c) && isdigit(*s))
{
/* a backslash followed by digits: copy up to two more digits */
sfputc(op, *s++);
if (isdigit(*s))
sfputc(op, *s++);
}
if (SPACE(s))
sfprintf(op, "&nbsp;");
continue;
}
/* \n and \t are handled as white space */
/*FALLTHROUGH*/
case ' ':
case '\t':
/* skip the rest of a run of white space and \n and \t escapes */
while (isspace(*s) || *s == '\\' && (*(s + 1) == 'n' || *(s + 1) == 't') && s++)
s++;
/* white space in front of a quote closes an open CLASS="..." or is written as <BR> */
if (*s == '"')
{
if (q)
{
q = 0;
sfprintf(op, " \">");
}
else
sfprintf(op, "<BR>");
continue;
}
/* otherwise the run is written as one space */
c = ' ';
/*FALLTHROUGH*/
default:
/* any other character closes an open CLASS="..." and is copied */
if (q)
{
q = 0;
sfprintf(op, "\">");
}
sfputc(op, c);
continue;
}
break;
}
}
sfprintf(op, "</OL>\n");
sfprintf(op, "</BODY></HTML>\n");
error_info.line = 0;
}
/*
 * parse the command line options, then run the selected converter
 * from the standard input to the standard output
 * msg2html() is the default converter, -m selects html2msg()
 * the return value is 1 if error_info.errors is non-zero, otherwise 0
 */
int
main(int argc, char** argv)
{
	Convert_f	cvt = msg2html;
	int		flags = 0;
	int		opt;

	NoP(argc);
	error_info.id = "msgcvt";
	for (;;)
	{
		opt = optget(argv, usage);
		if (opt == 'h')
			cvt = msg2html;
		else if (opt == 'm')
			cvt = html2msg;
		else if (opt == 'r')
			flags |= MSG_RAW;
		else if (opt == '?')
			error(ERROR_USAGE|4, "%s", opt_info.arg);
		else if (opt == ':')
			error(2, "%s", opt_info.arg);
		else
		{
			/* any other value ends the option scan */
			break;
		}
	}
	argv += opt_info.index;
	if (error_info.errors)
		error(ERROR_USAGE|4, "%s", optusage(NiL));
	(*cvt)(sfstdin, sfstdout, flags);
	return error_info.errors != 0;
}