man/src/getNAME.c

	getNAME.c revision 7c478bd95313f5f23a4c958a745db2134aa03244
/*
 * Copyright (c) 1998 by Sun Microsystems, Inc.
 * All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	All Rights Reserved   */

/*
 * Copyright (c) 1980 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 */

#pragma	ident	"%Z%%M%	%I%	%E% SMI"	/* SVr4.0 1.1	*/

/*
 * Get name sections from manual pages.
 *	-t	for building toc
 *	-i	for building intro entries
 *	other	apropos database
 */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <locale.h>
#include <wchar.h>
#include <errno.h>
#include <sys/param.h>

#define	PLEN	3	/* prefix length "man" */

/* Current working directory, filled in by getcwd() in main(). */
static char path[MAXPATHLEN+1];
/* Non-zero once -t has been seen (incremented per occurrence). */
static int tocrc;
/* Non-zero once -i has been seen (incremented per occurrence). */
static int intro;
/* argv[0]; used as the prefix of every diagnostic. */
static char *progname;

/* Forward declarations of helpers defined further down in this file. */
static void trimln(char *);
static void roff_trim(char *cp);
static void doname(char *);
static void section(char *, char *);
static void split(char *, char *);
static void dorefname(char *);
static void troffpage(char *);
static void sgmlpage(char *);

/*
 * Test to see if this is an SGML manpage or a regular manpage
 * Unless the first line begins with <!DOCTYPE, we assume it isn't.
 *
 * The probe is done with read(2) on the underlying descriptor rather
 * than through stdio.  The descriptor is always repositioned to the
 * start of the file afterwards - also when the file is shorter than
 * the magic string - so that the caller's parser sees the whole file.
 *
 * Returns 1 for an SGML page, 0 otherwise (including when the read
 * fails or the descriptor cannot be rewound).
 */
static int
issgml(FILE *fp)
{
	static const char magic[] = "<!DOCTYPE";
	char buf[sizeof (magic)];
	const size_t n = sizeof (magic) - 1;
	int fd = fileno(fp);
	ssize_t got;

	got = read(fd, buf, n);

	/* rewind unconditionally; a short read has moved the offset too */
	if (lseek(fd, 0, SEEK_SET) != 0)
		return (0);
	if (got < 0 || (size_t)got != n)
		return (0);
	return (strncmp(magic, buf, n) == 0);
}

/*
 * Entry point: parse the -i and -t flags, remember the current
 * directory in path, then reopen each file operand on stdin and hand it
 * to the SGML or troff parser as appropriate.
 *
 * A file that cannot be opened is reported on stderr and skipped.  An
 * unknown option or a getcwd() failure terminates with exit status 1;
 * otherwise 0 is returned.
 */
int
main(int argc, char *argv[])
{
	int c;

	(void) setlocale(LC_ALL, "");

	progname = argv[0];

	/* getopt() signals the end of the options with -1 */
	while ((c = getopt(argc, argv, "it")) != -1) {
		switch (c) {
		case 't':
			tocrc++;
			break;
		case 'i':
			intro++;
			break;
		case '?':
		default:
			(void) fprintf(stderr,
			    "usage: %s [-i][-t] files..\n", progname);
			exit(1);
		}
	}

	if (getcwd(path, sizeof (path)) == NULL) {
		/* path holds nothing useful on failure; report the cause */
		(void) fprintf(stderr, "%s: getcwd: %s\n", progname,
		    strerror(errno));
		exit(1);
	}

	for (; optind < argc; optind++) {
		char *name = argv[optind];

		if (freopen(name, "r", stdin) == NULL) {
			(void) fprintf(stderr,
			    "%s: %s: %s\n", progname, name, strerror(errno));
			continue;
		}

		/*
		 * Most of the info we care about is in the first kbyte
		 */
		(void) setvbuf(stdin, NULL, _IOFBF, 1024);

		if (issgml(stdin))
			sgmlpage(name);
		else
			troffpage(name);
	}

	return (0);
}

/*
 * Return non-zero when line starts with '.' immediately followed by
 * the two characters of either upper or lower.
 */
static int
is_request(const char *line, const char *upper, const char *lower)
{
	return (line[0] == '.' &&
	    (strncmp(line + 1, upper, 2) == 0 ||
	    strncmp(line + 1, lower, 2) == 0));
}

/*
 * Parse a troff-format manpage
 *
 * Reads stdin up to the first .TH (or .th) line, then up to the first
 * .SH (or .sh) line, and finally emits every line up to the next
 * .SH/.sh or end of file.  Nothing is printed when either of the first
 * two requests is missing.  Lines beginning with .\" are dropped.
 */
static void
troffpage(char *name)
{
	char headbuf[BUFSIZ];
	char linbuf[BUFSIZ];
	int nlines = 0;

	/* locate the title header */
	do {
		if (fgets(headbuf, sizeof (headbuf), stdin) == NULL)
			return;
	} while (!is_request(headbuf, "TH", "th"));

	/* locate the first section heading */
	do {
		if (fgets(linbuf, sizeof (linbuf), stdin) == NULL)
			return;
	} while (!is_request(linbuf, "SH", "sh"));

	trimln(headbuf);
	if (tocrc)
		doname(name);
	if (!intro)
		section(name, headbuf);

	/* everything up to the next section heading is the name text */
	while (fgets(linbuf, sizeof (linbuf), stdin) != NULL) {
		if (is_request(linbuf, "SH", "sh"))
			break;
		if (linbuf[0] == '.' && linbuf[1] == '\\' && linbuf[2] == '"')
			continue;

		trimln(linbuf);
		roff_trim(linbuf);
		if (intro) {
			split(linbuf, name);
			continue;
		}
		/* join the lines with single blanks */
		if (nlines++ != 0)
			(void) printf(" ");
		(void) printf("%s", linbuf);
	}
	(void) printf("\n");
}


/*
 * Substitute section defined in page with new section spec
 * of the form xx/yy where xx is the section suffix of the
 * directory and yy is the filename extension (unless xx
 * and yy are equal, in which case xx is the section).
 * Pages should be placed in their proper directory with the
 * proper name to simplify things.
 *
 * For example take the following names:
 *    man1/ar.1v	(1/1V)
 *    man1/find.1	(1)
 *    man1/loco		(1/)
 *
 * name is the page's path as given by the caller; when it contains no
 * '/' the directory is taken from the global path.  buf is the header
 * line and is modified in place.  The rewritten header is written to
 * stdout followed by a tab.
 */
static void
section(char *name, char *buf)
{
	char scratch[MAXPATHLEN+1];
	char *p = buf;
	char *dir, *fname;
	char *dp, *np;
	size_t plen = PLEN;
	size_t dlen;
	int i;

	/*
	 * split dirname and filename; the copy is bounded so that an
	 * over-long name cannot overrun scratch
	 */
	(void) snprintf(scratch, sizeof (scratch), "%s", name);
	if ((fname = strrchr(scratch, '/')) == NULL) {
		fname = name;
		dir = path;
	} else {
		dir = scratch;
		*fname++ = '\0';
	}

	/*
	 * dp -> last component of the directory.  A directory without
	 * any '/' (e.g. "man1") is itself that component.
	 */
	dp = strrchr(dir, '/');
	dp = (dp != NULL) ? dp + 1 : dir;

	/*
	 * a component starting with 's' (presumably an "sman"
	 * directory) has a prefix that is one character longer
	 */
	if (*dp == 's')
		plen++;

	/* skip the prefix, but never step beyond the terminator */
	dlen = strlen(dp);
	dp += (dlen < plen) ? dlen : plen;

	np = strrchr(fname, '.');
	np = (np != NULL) ? np + 1 : "";

	/* advance p to the third blank-separated field of buf */
	for (i = 0; i < 2; i++) {
		while (*p && *p != ' ' && *p != '\t')
			p++;
		while (*p == ' ' || *p == '\t')
			p++;
		if (!*p)
			break;
	}

	/*
	 * Cut buf just in front of the old section.  When the header
	 * has no such field p already sits on the terminator and must
	 * not be moved past it.
	 */
	if (*p != '\0')
		*p++ = '\0';
	(void) printf("%s", buf);
	if (strcmp(np, dp) == 0)
		(void) printf("%s", dp);
	else
		(void) printf("%s/%s", dp, np);

	/* drop what is left of the old section word, keep the rest */
	while (*p && *p != ' ' && *p != '\t')
		p++;
	(void) printf("%s\t", p);
}

/*
 * Remove one trailing newline from the NUL-terminated string cp, if
 * there is one.  An empty string is left alone; nothing in front of cp
 * is ever examined.
 */
static void
trimln(char *cp)
{
	size_t len = strlen(cp);

	if (len > 0 && cp[len - 1] == '\n')
		cp[len - 1] = '\0';
}

/*
 * Strip troff markup from the NUL-terminated line cp, in place:
 *
 *  - if the line starts with '.', everything up to and including the
 *    first blank is removed (the whole line when there is no blank);
 *  - every "\fN" (N a digit) and "\f(XX" font escape is removed.
 *
 * The text is compacted with a read and a write cursor, so no
 * overlapping string copies are needed, consecutive escapes are all
 * recognised, and a truncated "\f(" at the end of the line cannot move
 * the read cursor beyond the terminator.
 */
static void
roff_trim(char *cp)
{
	char *src = cp;
	char *dst = cp;
	int i;

	if (*src == '.') {
		while (*src != ' ' && *src != '\0')
			src++;
		if (*src == ' ')
			src++;
	}

	while (*src != '\0') {
		if (src[0] == '\\' && src[1] == 'f') {
			if (src[2] >= '0' && src[2] <= '9') {
				src += 3;
				continue;
			}
			if (src[2] == '(') {
				/* "\f(" plus at most two name characters */
				src += 3;
				for (i = 0; i < 2 && *src != '\0'; i++)
					src++;
				continue;
			}
		}
		*dst++ = *src++;
	}
	*dst = '\0';
}

/*
 * Write name to stdout in the form "stem(suffix) ", where suffix is
 * whatever follows the last '.' in name.  A name without any '.' is
 * written as "name() ".
 */
static void
doname(char *name)
{
	const char *last = strrchr(name, '.');
	const char *stem_end = (last != NULL) ? last : name + strlen(name);
	const char *suffix = (last != NULL) ? last + 1 : stem_end;
	const char *cp;

	for (cp = name; cp < stem_end; cp++)
		(void) putchar(*cp);
	(void) putchar('(');
	for (cp = suffix; *cp != '\0'; cp++)
		(void) putchar(*cp);
	(void) putchar(')');
	(void) putchar(' ');
}

/*
 * Break a "name1, name2 \- description" line into one output record
 * per name.  Each record is written to stdout as
 *
 *	<name> TAB <page reference> TAB <description>
 *
 * with records separated by newlines (no newline after the last one).
 * The line is cut at its first '-'; a line without one produces no
 * output.  line is modified in place; name is the page's file name and
 * is passed on to dorefname().
 *
 * All backward scans are bounded by the start of the text they work
 * on, so a line that begins with '-' or ',' cannot make them read or
 * write in front of the buffer.
 */
static void
split(char *line, char *name)
{
	char *cp, *dp;
	char *sp, *sep;

	cp = strchr(line, '-');
	if (cp == NULL)
		return;
	sp = cp + 1;

	/* drop the blanks and backslashes in front of the '-' */
	while (cp > line &&
	    (cp[-1] == ' ' || cp[-1] == '\t' || cp[-1] == '\\'))
		cp--;
	*cp = '\0';

	/* the description starts at the first non-blank after the '-' */
	while (*sp == ' ' || *sp == '\t')
		sp++;

	for (sep = "", dp = line; dp && *dp; dp = cp, sep = "\n") {
		cp = strchr(dp, ',');
		if (cp) {
			char *tp = cp;

			/* trim blanks before the comma, within this name */
			while (tp > dp && (tp[-1] == ' ' || tp[-1] == '\t'))
				tp--;
			*tp = '\0';
			/* the next name starts after the comma and blanks */
			for (++cp; *cp == ' ' || *cp == '\t'; cp++)
				;
		}
		(void) printf("%s%s\t", sep, dp);
		dorefname(name);
		(void) printf("\t%s", sp);
	}
}

/*
 * Write name to stdout as "stem.suffix", where suffix is whatever
 * follows the last '.' in name.  A name without any '.' is written
 * with a trailing '.'.
 */
static void
dorefname(char *name)
{
	const char *last = strrchr(name, '.');
	const char *stem_end = (last != NULL) ? last : name + strlen(name);
	const char *suffix = (last != NULL) ? last + 1 : stem_end;
	const char *cp;

	for (cp = name; cp < stem_end; cp++)
		(void) putchar(*cp);
	(void) putchar('.');
	for (cp = suffix; *cp != '\0'; cp++)
		(void) putchar(*cp);
}

/*
 * The rest of the routines in the file form a simplistic parser
 * for SGML manpages.  We assume the input is syntactically correct
 * SGML, and that the fields occur in the input file in order.
 */

/*
 * Some utilities for constructing arbitrary length wide character strings
 */

typedef struct {
	wchar_t *str;	/* heap buffer holding the characters */
	size_t size;	/* allocated size of str, in bytes */
	long index;	/* slot in str that putwstring() writes next */
} string_t;

/* Both values are byte counts: they are compared with / added to size. */
#define	DEF_STR_SIZE	16
#define	DEF_STR_GROWTH	16

/*
 * Report an allocation failure on stderr and terminate with exit
 * status 1.  where is a short tag naming the failing call site.
 */
static void
outofspace(char *where)
{
	(void) fprintf(stderr, "%s: '%s' - out of memory\n",
	    progname, where);

	exit(1);
}

/*
 * Allocate an empty string with room for initial wide characters
 * (never less than DEF_STR_SIZE bytes).  Exits through outofspace()
 * when memory cannot be obtained.
 */
static string_t *
newstring(size_t initial)
{
	size_t nbytes = initial * sizeof (wchar_t);
	string_t *s;

	if ((s = malloc(sizeof (*s))) == NULL)
		outofspace("new s");

	if (nbytes < DEF_STR_SIZE)
		nbytes = DEF_STR_SIZE;

	if ((s->str = malloc(nbytes)) == NULL)
		outofspace("new str");

	s->str[0] = L'\0';
	s->index = 0;
	s->size = nbytes;
	return (s);
}

/*
 * Release the string *s together with its character buffer and set
 * the caller's pointer to NULL.  *s must not be NULL.
 */
static void
delstring(string_t **s)
{
	string_t *victim = *s;

	*s = NULL;
	free(victim->str);
	victim->str = NULL;
	free(victim);
}

/*
 * Return the wide character buffer of s, or a pointer to a static
 * empty wide string when s is NULL.
 */
static wchar_t *
getwstring(string_t *s)
{
	static const wchar_t empty = L'\0';

	return ((s != NULL) ? s->str : (wchar_t *)&empty);
}

/*
 * Return a newly allocated multibyte copy of s; the caller frees it.
 * Exits through outofspace() when memory cannot be obtained.
 *
 * wctomb() returns -1 for a wide character that has no representation
 * in the current locale.  Such characters are skipped; adding the -1
 * to the output pointer would move it backwards and corrupt the result.
 */
static char *
getcstring(string_t *s)
{
	size_t len = (wcslen(s->str) + 1) * MB_CUR_MAX;
	char *cstr = malloc(len);
	char *p = cstr;
	wchar_t *wp;

	if (cstr == NULL)
		outofspace("getc");
	for (wp = s->str; *wp != L'\0'; wp++) {
		int n = wctomb(p, *wp);

		if (n > 0)
			p += n;
	}
	*p = '\0';
	return (cstr);
}

/*
 * Append the wide string str to s, enlarging the buffer by the size of
 * str (terminator included) first.  Afterwards index is one past the
 * terminator.  Exits through outofspace() when the buffer cannot be
 * enlarged.
 */
static void
appendwstring(string_t *s, const wchar_t *str)
{
	size_t extra = (wcslen(str) + 1) * sizeof (wchar_t);

	s->size += extra;
	if ((s->str = realloc(s->str, s->size)) == NULL)
		outofspace("appendw");

	(void) wcscat(s->str, str);
	s->index = wcslen(s->str) + 1;
}

/*
 * Store wc in the next free slot of s, enlarging the buffer by
 * DEF_STR_GROWTH bytes first when that slot would reach the end of the
 * allocation.  Exits through outofspace() when the buffer cannot be
 * enlarged.
 */
static void
putwstring(string_t *s, wchar_t wc)
{
	size_t needed = (s->index + 1) * sizeof (wchar_t);

	if (needed >= s->size) {
		s->size += DEF_STR_GROWTH;
		if ((s->str = realloc(s->str, s->size)) == NULL)
			outofspace("put");
	}
	s->str[s->index] = wc;
	s->index++;
}

/*
 * Find the closing > of an SGML comment block
 * (allowing for multibyte, embedded, comments)
 *
 * One '<' is taken as already open on entry.  Wide characters are
 * consumed from stdin until every '<' seen has been matched by a '>',
 * or until end of input.
 */
static void
eatcomments(void)
{
	int depth = 1;
	wint_t wc;

	while (depth != 0) {
		wc = getwchar();
		if (wc == WEOF)
			return;
		if (wc == L'<')
			depth++;
		else if (wc == L'>')
			depth--;
	}
}

/*
 * Find the next token on stdin.
 * Handles nested comment strings, and removes any trailing newlines
 * from the stream after the closing '>'.
 *
 * The text between '<' and '>' is stored NUL-terminated in tokbuf,
 * which holds tokbuflen bytes.  A token that does not fit is
 * truncated; one byte is always kept free for the terminator.
 *
 * Returns 1 when a token was stored, 0 at end of input or when the
 * input cannot be decoded.
 */
static int
find_token(char *tokbuf, size_t tokbuflen)
{
	int c;
	wint_t wc;
	size_t len = 0;

top:
	while ((wc = getwchar()) != WEOF)
		if (wc == L'<')
			break;

	if (wc == WEOF && errno == EILSEQ)
		return (0);

	switch (c = getchar()) {
	case EOF:
		return (0);
	default:
		(void) ungetc(c, stdin);
		break;
	case '!':
		/* "<!" opens a comment or declaration: skip it entirely */
		eatcomments();
		goto top;
	}

	while ((c = getchar()) != EOF) {
		if (c == '>') {
			/* swallow the newlines that follow the tag */
			while ((c = getchar()) != EOF)
				if (c != '\n') {
					(void) ungetc(c, stdin);
					break;
				}
			if (tokbuflen > 0)
				tokbuf[len] = '\0';
			return (1);
		}
		/* keep the last byte of tokbuf for the terminator */
		if (len + 1 < tokbuflen)
			tokbuf[len++] = (char)c;
	}

	return (0);
}

/*
 * This structure is filled out during the parsing of each page we encounter
 *
 * sgmlpage() zeroes it and sets name; every string_t member stays NULL
 * until the handler for the corresponding tag stores a string in it.
 */
typedef struct {
	char *name;		/* file name of the page, as given to sgmlpage() */
	string_t *title;	/* text of <refentrytitle> */
	string_t *volnum;	/* text of <manvolnum> */
	string_t *date;		/* text of <refmiscinfo class="date"> */
	string_t *names;	/* <refdescriptor>/<refname> texts joined by ", " */
	string_t *purpose;	/* text of <refpurpose> */
} manpage_t;

/*
 * Write a printf-style diagnostic about page m to stderr, prefixed
 * with the program name and the page's file name.  No newline is
 * added; the format has to supply it.
 */
static void
warning(manpage_t *m, const char *fmt, ...)
{
	va_list args;

	(void) fprintf(stderr, "%s: %s - ", progname, m->name);

	va_start(args, fmt);
	(void) vfprintf(stderr, fmt, args);
	va_end(args);
}

/*
 * Fetch a string from stdin, terminated by the endtoken.
 * These strings may be localized, so do this with wide characters.
 * Hack: skip over (completely ignore) all other tokens
 * Hack: map all &blort; constructs to spaces.
 *
 * A newline becomes a blank unless the next character is '<'.  The
 * returned string is NUL-terminated and owned by the caller.  initial
 * is the expected length and is handed to newstring().  When reading
 * stops because of an undecodable byte sequence a warning naming
 * endtoken is issued for page m.
 */
static string_t *
filestring(manpage_t *m, size_t initial, char *endtoken)
{
	char tokbuf[BUFSIZ * MB_LEN_MAX];
	string_t *s = newstring(initial);
	wint_t wc;

	/*
	 * getwchar() returns WEOF both at end of file and on a
	 * conversion error, and the two are told apart through errno
	 * below.  Clear it first so that a value left over from earlier
	 * work is not reported as an error in this string.
	 */
	errno = 0;

	while ((wc = getwchar()) != WEOF)
		switch (wc) {
		case L'\n':
			/* peek at the character after the newline */
			if ((wc = getwchar()) != WEOF)
				(void) ungetwc(wc, stdin);
			if (wc != L'<')
				putwstring(s, L' ');
			break;
		case L'<':
			/* let find_token() see the '<' again */
			(void) ungetwc(wc, stdin);
			if (!find_token(tokbuf, sizeof (tokbuf)) ||
			    strcasecmp(endtoken, tokbuf) == 0)
				goto done;
			break;
		case L'&':
			while ((wc = getwchar()) != WEOF)
				if (wc == L';')
					break;
			wc = L' ';
			/* FALLTHROUGH */
		default:
			putwstring(s, wc);
			break;
		}

	if (errno == EILSEQ)
		warning(m, "%s while parsing %s\n", strerror(errno), endtoken);
done:
	putwstring(s, L'\0');
	return (s);
}

/*
 * <refentrytitle> TITLE </refentrytitle>
 *
 * Stores the title text in m->title.  A repeated tag is reported and
 * the earlier title is released before it is replaced, so it is not
 * leaked.  Always returns 1 (keep parsing).
 */
static int
refentrytitle(manpage_t *m)
{
	if (m->title != NULL) {
		warning(m, "repeated refentrytitle\n");
		delstring(&m->title);
	}
	m->title = filestring(m, 8, "/refentrytitle");
	return (1);
}

/*
 * <manvolnum> MANVOLNUM </manvolnum>
 *
 * Stores the volume text in m->volnum.  A repeated tag is reported and
 * the earlier value is released before it is replaced, so it is not
 * leaked.  Always returns 1 (keep parsing).
 */
static int
manvolnum(manpage_t *m)
{
	if (m->volnum != NULL) {
		warning(m, "repeated manvolnum\n");
		delstring(&m->volnum);
	}
	m->volnum = filestring(m, 3, "/manvolnum");
	return (1);
}

/*
 * <refmiscinfo class="date"> DATE </refmiscinfo>
 *
 * Stores the date text in m->date.  A repeated tag is reported and the
 * earlier value is released before it is replaced, so it is not
 * leaked.  Always returns 1 (keep parsing).
 */
static int
refmiscinfo_date(manpage_t *m)
{
	if (m->date != NULL) {
		warning(m, "repeated date\n");
		delstring(&m->date);
	}
	m->date = filestring(m, 11, "/refmiscinfo");
	return (1);
}

/*
 * .. </refmeta>
 *
 * Builds a header line of the form
 *
 *	.TH title volnum "date"
 *
 * from the strings collected so far (a missing one is printed as an
 * empty string) and emits it the same way troffpage() emits a real
 * header.  The three strings are released afterwards.  Always returns
 * 1 (keep parsing).
 *
 * The wide strings are formatted with the standard %ls conversion.
 */
static int
print_refmeta(manpage_t *m)
{
	char headbuf[BUFSIZ];

	(void) snprintf(headbuf, sizeof (headbuf), ".TH %ls %ls \"%ls\"",
	    getwstring(m->title), getwstring(m->volnum), getwstring(m->date));

	trimln(headbuf);
	if (tocrc)
		doname(m->name);
	if (!intro)
		section(m->name, headbuf);

	if (m->title)
		delstring(&m->title);
	if (m->volnum)
		delstring(&m->volnum);
	if (m->date)
		delstring(&m->date);

	return (1);
}

/*
 * Read the text up to the closing tag term and add it to m->names.
 * The first name becomes the list itself; every later one is appended
 * after a ", " separator and its temporary string is released.
 * Always returns 1 (keep parsing).
 */
static int
appendname(manpage_t *m, char *term)
{
	string_t *piece = filestring(m, 0, term);

	if (m->names == NULL) {
		m->names = piece;
		return (1);
	}

	appendwstring(m->names, L", ");
	appendwstring(m->names, getwstring(piece));
	delstring(&piece);
	return (1);
}

/*
 * <refdescriptor> REFDESCRIPTOR </refdescriptor>
 *
 * The descriptor text is added to the list of names; the result of
 * appendname() is passed straight back.
 */
static int
refdescriptor(manpage_t *m)
{
	int rc = appendname(m, "/refdescriptor");

	return (rc);
}

/*
 * <refname> REFNAME </refname>
 *
 * The name text is added to the list of names; the result of
 * appendname() is passed straight back.
 */
static int
refname(manpage_t *m)
{
	int rc = appendname(m, "/refname");

	return (rc);
}

/*
 * <refpurpose> PURPOSE </refpurpose>
 *
 * Stores the purpose text in m->purpose.  A repeated tag is reported
 * and the earlier value is released before it is replaced, so it is
 * not leaked.  Always returns 1 (keep parsing).
 */
static int
refpurpose(manpage_t *m)
{
	if (m->purpose != NULL) {
		warning(m, "repeated refpurpose\n");
		delstring(&m->purpose);
	}
	m->purpose = filestring(m, 0, "/refpurpose");
	return (1);
}

/*
 * .. </refnamediv> - this is our chance to bail out.
 *
 * When names were collected, " \- " and the purpose are appended to
 * them.  The result is handed to split() in intro mode and printed as
 * it is otherwise.  The collected strings are released, the output
 * line is ended, and 0 is returned to stop the parser.
 *
 * The wide string is printed with the standard %ls conversion.
 */
static int
terminate(manpage_t *m)
{
	if (m->names) {
		appendwstring(m->names, L" \\- ");
		appendwstring(m->names, getwstring(m->purpose));
		if (intro) {
			/* split() works on multibyte text */
			char *buf = getcstring(m->names);

			split(buf, m->name);
			free(buf);
		} else
			(void) printf("%ls", getwstring(m->names));
		delstring(&m->names);
	}

	if (m->purpose)
		delstring(&m->purpose);

	(void) printf("\n");
	return (0);
}


/*
 * Basic control structure of the SGML "parser".
 * It's very simplistic - when named tags are encountered in the
 * input stream, control is transferred to the corresponding routine.
 * No checking is done for correct pairing of tags.  A few other hacks
 * are sneaked into the lexical routines above.
 * Output is generated after seeing the /refmeta and /refnamediv
 * closing tags.
 */
/*
 * Tag dispatch table used by sgmlpage(): name is compared with each
 * token without regard to case, and the action of the first match is
 * called.  An action returning 0 ends the parse.  The entry with a
 * null name ends the table.
 */
static const struct {
	char *name;
	int (*action)(manpage_t *);
} acts[] = {
	{ "refentrytitle",		refentrytitle },
	{ "manvolnum",			manvolnum },
	{ "refmiscinfo class=\"date\"",	refmiscinfo_date },
	{ "/refmeta",			print_refmeta },
	{ "refdescriptor",		refdescriptor },
	{ "refname",			refname },
	{ "refpurpose",			refpurpose },
	{ "/refnamediv",		terminate },
	{ 0 }
};

static void
sgmlpage(char *name)
{
	int rc = 1, a;
	char tokbuf[BUFSIZ];
	manpage_t manpage, *m = &manpage;

	(void) memset(m, 0, sizeof (*m));
	m->name = name;

	do {
		if (!find_token(tokbuf, sizeof (tokbuf)))
			break;
		for (a = 0; acts[a].name; a++) {
			if (strcasecmp(acts[a].name, tokbuf) != 0)
				continue;
			rc = acts[a].action(m);
			break;
		}
	} while (rc);
}