/*
 * cmd/html/htmlrefs.c
 * htmlrefs.c revision 3f54fd611f536639ec30dd53c48e5ec1897cc7d9
 */
/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*          Copyright (c) 1996-2012 AT&T Intellectual Property          *
*                      and is licensed under the                       *
*                 Eclipse Public License, Version 1.0                  *
*                    by AT&T Intellectual Property                     *
*                                                                      *
*                A copy of the License is available at                 *
*          http://www.eclipse.org/org/documents/epl-v10.html           *
*         (with md5 checksum b35adb5213ca9657e911e9befb180842)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                                                                      *
***********************************************************************/
#pragma prototyped
/*
 * Glenn Fowler
 * AT&T Research
 */

/*
 * optget(3) usage description: the command id string, the NAME and
 * DESCRIPTION text, one bracketed entry per option, the operand
 * synopsis and the EXAMPLES / SEE ALSO sections.
 * main() passes this string to optget() to parse the command line, so
 * everything below is runtime data, not commentary.
 */

static const char usage[] =
"[-?\n@(#)$Id: htmlrefs (AT&T Research) 2012-01-01 $\n]"
USAGE_LICENSE
"[+NAME?htmlrefs - list html url references]"
"[+DESCRIPTION?\bhtmlrefs\b lists url references from the"
"   local closure of the input \bhtml\b \afile\as. If \afile\a is not"
"   specified then the top level default user file is read. The \bhtml\b"
"   parse is rudimentary; don't use \bhtmlrefs\b to detect valid \bhtml\b"
"   files.]"
"[+?The top level references are determined in this order (the \b--index\b,"
"   \b--root\b and \b--user\b options influence the order):]{"
"   [+$HOME/index.html?Pseudo index containing"
"       \b<LINK href=\b\adir\a \brel=\b\atype\a\b>\b references to"
"       top level directories. \atype\a may be one of:]{"
"       [+document-root?The document root directory containing URL"
"           target documents. Exactly one \bdocument-root\b must"
"           be specified.]"
"       [+program-root?The program root directory containing CGI"
"           support programs and scripts. This type is optional."
"           If specified then the program root directory should"
"           contain a pseudo index for its references.]"
"       [+data-root?The data root directory containing CGI"
"           support data. This type is optional. If specified then"
"           the data root directory should contain a pseudo index"
"           for its references.]"
"       [+dynamic?All files under \adir\a are considered referenced.]"
"       [+host?Provides a default value for the \b--hosts\b option.]"
"       [+ignore?\adir\a is a \bksh\b(1) pattern of paths to ignore.]"
"       [+internal?If \b--external\b is on then \adir\a is a \bksh\b(1)"
"           pattern of internal paths.]"
"       [+secure?Files under this dir are accessed by \bhttps:\b only.]"
"   }"
"   [+$HOME/wwwfiles/index.html?]"
"   [+$HOME/public_html/index.html?]"
"}"
"[a:all?List all references whether they exist or not.]"
"[c:copy?Copy the selected references to \adirectory\a which must already"
"   exist. If \b--external\b is also specified then lines between"
"   \b<!--INTERNAL-->\b ... \b<!--/INTERNAL-->\b lines are not"
"   copied. If \b--unreferenced\b is also specified then files and"
"   directories in \adirectory\a that have not been copied are"
"   removed. Target file modification times are set to match source"
"   times so that future copies can be avoided.]:[directory]"
"[d:dependents?List each selected local file followed by \b:\b and a list of"
"   all local files referring to the file.]"
"[e:external?Do not list references inside \b<!--INTERNAL-->\b ..."
"   \b<!--/INTERNAL-->\b lines. See \bmm2html\b(1) for an html"
"   generator that inserts these lines.]"
"[F:force?By default files are not copied if the source and target size and"
"   modification times match. \b--force\b forces all files to be copied.]"
"[h:hosts?Check only references matching the \bksh\b(1) pattern"
"   \bhttp://\b\apattern\a\b/\b.]:[pattern]"
"[i:index?\aname\a specifies the page named by directory"
"   references.]:[name:=index.html]"
"[k:keep?\apattern\a is used to match file base names that are always"
"   considered referenced.]:[pattern:=.htaccess]"
"[l:limit?Limit \b--copy\b and \b--remove\b operations to path names matching"
"   \apattern\a.]:[pattern]"
"[m:missing?List missing local file references.]"
"[n!:exec?Enable file modification operations. \b--noexec\b lists the"
"   operations but does not do them.]"
"[p!:perlwarn?Check HTML files for unintentional embedded \bperl\b(1)"
"   constructs: a left bracket followed by one of \b-+!$*#\b. Manually"
"   translating left bracket to \b&#0091;\b avoids unwanted \bperl\b"
"   interactions (why didn't they use tags like everyone else?)"
"   \bmm2html\b(1) and \boptget\b(3) do the translation by default.]"
"[X:remove?Unreferenced files are removed when \b--unreferenced\b and"
"   \b--nocopy\b are specified.]"
"[r:root?The local \adirectory\a for \b--user\b"
"   references.]:[directory:=~\auser\a]"
"[K:skip?\apattern\a is used to match file base names that are never"
"   considered referenced.]:[pattern:=00-INDEX-00]"
"[s:strict?By default unreferenced \b--index\b files and the containing"
"   directory are considered referenced; \b--strict\b considers"
"   unreferenced \b--index\b files unreferenced.]"
"[S:symlink?Instruct \b--copy\b to \bsymlink\b(2) files that do not contain"
"   \b<!--INTERNAL-->\b ... \b<!--/INTERNAL-->\b or are not in"
"   \b/cgi-bin/\b.]"
"[u:user?\b~\b\aname\a translates to the \b--root\b"
"   directory.]:[name:=caller-uid]"
"[v:verbose?List files as they are copied (see \b--copy\b.)]"
"[w:warn?Produce a warning diagnostic for missing files.]"
"[x:unreferenced?If \b--copy\b is also specified then remove files and"
"   directories in the \b--copy\b \adirectory\a that have not been copied."
"   Otherwise list unreferenced files in the \b--root\b directory."
"   A directory that contains no referenced files but does contain an"
"   \b--index\b file is considered referenced (along with the \b--index\b"
"   file) unless \b--strict\b is enabled.]"

"\n"
"\n[ file ... ]\n"
"\n"

"[+EXAMPLES]{"
"   [+htmlrefs --hosts=www.research.att.com --missing?List missing"
"       references to the local host \bwww.research.att.com\b.]"
"   [+htmlrefs -n -h www.research.att.com -c ~/external/wwwfiles -e -x?Copy"
"       the local hierarchy to \b~/external/wwwfiles\b for external"
"       release, and remove unreferenced files in the copy.]"
"}"
"[+SEE ALSO?\bhtml2rtf\b(1), \bmm2html\b(1)]"
;

#include <ast.h>
#include <cdt.h>
#include <ctype.h>
#include <error.h>
#include <fts.h>
#include <glob.h>
#include <pwd.h>
#include <tm.h>

/*
 * these match the defaults documented in the usage text for --index,
 * --keep and --skip -- presumably applied in main(), TODO confirm
 */

#define INDEX           "index.html"
#define KEEP            ".htaccess"
#define SKIP            "00-INDEX-00"

/*
 * flag bits for File_t.flags and for the flags argument of add() / check()
 *
 *  COPIED      add() skips all reference path resolution when set
 *  DIRECTORY   set by add() on the directory entries it creates
 *  EXTERNAL    refs() scan state: outside an INTERNAL section
 *  INTERNAL    refs() scan state: inside an INTERNAL section
 *  FILTER      set by refs() on a file that contains an INTERNAL section
 *  SECURE      reference is an https: / secure reference
 *
 * CHECKED, COPY, MISSING, SCANNED and VERBOSE are not used in the code
 * visible in this part of the file
 */

#define CHECKED         0x001
#define COPIED          0x002
#define COPY            0x004
#define DIRECTORY       0x008
#define EXTERNAL        0x010
#define FILTER          0x020
#define INTERNAL        0x040
#define MISSING         0x080
#define SCANNED         0x100
#define SECURE          0x200
#define VERBOSE         0x400

/*
 * attribute name matcher states in refs(): HIT means the attribute name
 * and its '=' were recognized and the value is being collected, MISS
 * means the current attribute is not one of interest
 */

#define HIT         (-1)
#define MISS            (-2)

/*
 * append c to the array buf at cursor s and advance s
 * the last byte of buf is reserved so that the collected string can
 * always be terminated with *s = 0 without writing past the end of buf
 * evaluates to -1 (and drops c) when buf is full
 * buf must be a true array because sizeof is applied to it
 */

#define STUFF(s, buf, c)    (((s) < &(buf)[sizeof(buf) - 1]) ? (*(s)++ = (c)) : -1)

struct List_s;

/*
 * counted string: data and its length in size
 * size == 0 is tested throughout this file to mean "not set"
 */

typedef struct String_s
{
    char*       data;
    unsigned int    size;
} String_t;

/*
 * one entry in the state->files dictionary, keyed by name
 * (the discipline key is offsetof(File_t, name))
 */

typedef struct File_s
{
    Dtlink_t    link;           /* cdt dictionary link              */
    unsigned long   time;       /* not used in the code visible here        */
    unsigned int    flags;      /* CHECKED ... VERBOSE bits         */
    struct List_s*  refs;       /* files that refer to this one (--dependents)  */
    char        name[1];        /* path name, allocated oversize by newof() */
} File_t;

/*
 * singly linked list node: one file that refers to the File_t whose
 * refs member heads the list; add() pushes new nodes at the front
 */

typedef struct List_s
{
    struct List_s*  next;
    File_t*     file;
} List_t;

/*
 * program state, allocated once in main()
 */

typedef struct State_s
{
    /* dictionary of File_t entries keyed by name and its discipline */
    Dtdisc_t    disc;
    Dt_t*       files;

    /*
     * boolean / numeric settings named after the command line options
     * (main() initializes exec and perlwarn to 1); more is not an
     * option: add() sets it to 1 whenever it inserts a new file
     */
    int     all;
    int     dependents;
    int     exec;
    int     external;
    int     force;
    int     missing;
    int     more;
    int     perlwarn;
    int     remove;
    int     strict;
    int     symlink;
    int     unreferenced;
    int     verbose;
    int     warn;

    /*
     * string settings; dataroot, documentroot and programroot are set
     * by rootdir() from <LINK rel=...> values, hosts may be defaulted
     * and ignore / internal are accumulated by refs()
     */
    String_t    copy;
    String_t    dataroot;
    String_t    documentroot;
    String_t    hosts;
    String_t    ignore;
    String_t    index;
    String_t    internal;
    String_t    keep;
    String_t    limit;
    String_t    programroot;
    String_t    root;
    String_t    skip;
    String_t    user;

    /*
     * scratch path buffers: buf is used by add() and scan(), dir holds
     * the glob pattern in check(), tmp holds the secure/ path in add()
     */
    char        buf[PATH_MAX];
    char        dir[PATH_MAX];
    char        tmp[PATH_MAX];
} State_t;

/*
 * markers delimiting text that is for internal use only
 * internal[] opens such a section; external[] is the closing marker,
 * i.e. the point where externally visible text resumes
 */

static const char   internal[] = "<!--INTERNAL-->";
static const char   external[] = "<!--/INTERNAL-->";

/*
 * return 1 if name is to be kept, 0 otherwise
 * name is dropped when its base name matches the --skip pattern
 * or, for mode >= 0, when access(name, mode) fails
 */

static int
keep(State_t* state, const char* name, int mode)
{
    const char* base;

    if (state->skip.size)
    {
        base = strrchr(name, '/');
        base = base ? base + 1 : name;
        if (strmatch(base, state->skip.data))
            return 0;
    }
    return !(mode >= 0 && access(name, mode));
}

/*
 * check for glob(dir/name)
 * each match that is not yet in the file dictionary and that passes
 * keep() is inserted with the given flags
 */

static void
check(register State_t* state, const char* dir, const char* name, unsigned int flags)
{
    register File_t*    fp;
    register char*      match;
    register char**     pp;
    glob_t          gl;

    memset(&gl, 0, sizeof(gl));
    sfsprintf(state->dir, sizeof(state->dir) - 1, "%s/(%s)", dir, name);
    if (glob(state->dir, GLOB_AUGMENTED|GLOB_DISC|GLOB_STACK, 0, &gl))
        return;
    pp = gl.gl_pathv;
    while (match = *pp++)
    {
        if (dtmatch(state->files, match))
            continue;
        if (!keep(state, match, F_OK))
            continue;
        fp = newof(0, File_t, 1, strlen(match));
        if (!fp)
            error(ERROR_SYSTEM|3, "out of space [file]");
        strcpy(fp->name, match);
        dtinsert(state->files, fp);
        fp->flags |= flags;
    }
}

/*
 * add reference path s
 *
 * unless COPIED is set in flags the reference is first resolved to a
 * local path: with --hosts, http:// and https:// references to a
 * matching host and /~ and /~user references are mapped under the
 * document root (or the --root directory); relative references are
 * prefixed by the first prefix chars of path; directory references
 * get the --index name appended
 *
 * the resolved path and its not yet known parent directories are
 * entered in the file dictionary and, with --dependents, ref is
 * recorded as a referrer of the file
 *
 * s may be modified in place
 * returns the file entry, 0 if the reference is not local or is skipped
 */

static File_t*
add(register State_t* state, register char* s, unsigned int flags, const char* path, int prefix, File_t* ref)
{
    register char*      t;
    register File_t*    fp;
    register File_t*    dp;
    register List_t*    lp;
    char*           u;
    struct stat     st;

    if (!(flags & COPIED))
    {
        /* a reference inherits the secure attribute of its referrer */
        if (ref && (ref->flags & SECURE))
            flags |= SECURE;
        if (state->hosts.size)
        {
            if (t = strchr(s, ':'))
            {
                /*
                 * t points to the ':' so t - s + 3 is the length
                 * of "scheme://" and t + 3 is the first host char
                 */
                if (strneq(s, "http://", t - s + 3))
                {
                    s = t + 3;
                    flags &= ~SECURE;
                }
                else if (strneq(s, "https://", t - s + 3))
                {
                    s = t + 3;
                    flags |= SECURE;
                }
                else
                    return 0;
                /* isolate the host name for the --hosts match */
                if (t = strchr(s, '/'))
                    *t = 0;
                if (!strmatch(s, state->hosts.data))
                    return 0;
                if (t)
                    *(s = t) = '/';
                else
                    s = "/";
            }
            if (*s == '/')
            {
                if (ref && !streq(s, ref->name))
                {
                    /* only /~/ and /~user/ map to local files */
                    if (*(s + 1) != '~')
                        return 0;
                    if (*(s + 2) == '/')
                        s += 2;
                    else if (!state->user.size || !strneq(s + 2, state->user.data, state->user.size) || *(s + 2 + state->user.size) != '/')
                        return 0;
                    else
                        s += 2 + state->user.size;
                    if (state->documentroot.size)
                    {
                        sfsprintf(state->buf, sizeof(state->buf) - 1, "%s%s%s", state->documentroot.data, (flags & SECURE) ? "/secure" : "", s);
                        pathcanon(s = state->buf, sizeof(state->buf), 0);
                    }
                    else if (state->root.size)
                    {
                        /* root + optional /secure + the reference path */
                        sfsprintf(state->buf, sizeof(state->buf) - 1, "%s%s%s", state->root.data, (flags & SECURE) ? "/secure" : "", s);
                        pathcanon(s = state->buf, sizeof(state->buf), 0);
                    }
                }
            }
            else if (prefix)
            {
                /* relative to the directory of the referring path */
                sfsprintf(state->buf, sizeof(state->buf) - 1, "%-.*s%s", prefix, path, s);
                pathcanon(s = state->buf, sizeof(state->buf), 0);
            }
            else if (flags & SECURE)
            {
                sfsprintf(state->tmp, sizeof(state->tmp), "secure/%s", s);
                s = state->tmp;
            }
        }
        /* drop a leading ./ */
        if (*s == '.' && *(s + 1) == '/')
            while (*++s == '/');
        if (!*s)
            s = "/";
        /* t points just past the path with trailing /'s ignored */
        for (t = s + strlen(s); t > s && *(t - 1) == '/'; t--);
        if (*t == '/' || !stat(s, &st) && S_ISDIR(st.st_mode))
        {
            /* directory reference: append the --index name */
            if (s >= state->buf && s < state->buf + sizeof(state->buf))
            {
                if (!*t)
                    *t = '/';
                sfsprintf(t + 1, sizeof(state->buf) - (t - s + 2), "%s", state->index.data);
            }
            else
            {
                sfsprintf(state->buf, sizeof(state->buf) - 1, "%-.*s/%s", (int)(t - s), s, state->index.data);
                s = state->buf;
            }
        }
    }
    if (!(fp = (File_t*)dtmatch(state->files, s)))
    {
        if (!keep(state, s, -1))
            return 0;
        if (!(fp = newof(0, File_t, 1, strlen(s))))
            error(ERROR_SYSTEM|3, "out of space [file]");
        strcpy(fp->name, s);
        dtinsert(state->files, fp);
        state->more = 1;
        /*
         * enter each parent directory that is not yet known, from
         * the innermost outwards, stopping at the first known one
         * or once the remaining prefix is no longer than --root
         */
        if (t = strrchr(s, '/'))
            do
            {
                *t = 0;
                if (dp = (File_t*)dtmatch(state->files, s))
                {
                    *t = '/';
                    break;
                }
                if (!(dp = newof(0, File_t, 1, strlen(s))))
                    error(ERROR_SYSTEM|3, "out of space [file]");
                strcpy(dp->name, s);
                dtinsert(state->files, dp);
                dp->flags |= DIRECTORY|flags;
                if (!(flags & COPIED))
                {
                    if (!state->strict)
                        check(state, s, state->index.data, flags);
                    if (state->keep.size)
                        check(state, s, state->keep.data, flags);
                }
                u = strrchr(s, '/');
                *t = '/';
            } while ((t = u) && (t - s) > state->root.size);
    }
    fp->flags |= flags;
    if (ref && state->dependents)
    {
        /* record ref as a referrer of fp, once */
        for (lp = fp->refs; lp && lp->file != ref; lp = lp->next);
        if (!lp)
        {
            if (!(lp = newof(0, List_t, 1, 0)))
                error(ERROR_SYSTEM|3, "out of space [file]");
            lp->file = ref;
            lp->next = fp->refs;
            fp->refs = lp;
        }
    }
    return fp;
}

/*
 * fts comparison function: order directory entries by name
 */

static int
order(FTSENT* const* a, FTSENT* const* b)
{
    const FTSENT*   lhs = *a;
    const FTSENT*   rhs = *b;

    return strcmp(lhs->fts_name, rhs->fts_name);
}

/*
 * parse and set root dir r from s
 * possibly using tmp buffer buf
 *
 * the root dir is s with its last /component removed; if that is not
 * an absolute path it is made relative to the --root directory by
 * formatting it into buf (of size z)
 * s is modified temporarily and restored before return
 */

static void
rootdir(State_t* state, register String_t* r, register char* s, char* buf, size_t z)
{
    register char*  slash;
    register int    len;

    slash = strrchr(s, '/');
    if (slash)
        *slash = 0;
    if (*s != '/')
    {
        len = sfsprintf(buf, z, "%s/%s", state->root.data, s);
        s = buf;
    }
    else
        len = strlen(s);
    r->data = strdup(s);
    if (!r->data)
        error(ERROR_SYSTEM|3, "out of space [rootdir]");
    r->size = len;
    if (slash)
        *slash = '/';
}

/*
 * return next directory entry
 *
 * with --external a directory is skipped, along with everything under
 * it, when its --index file contains the INTERNAL marker on a line at
 * or before the one containing </HEAD>
 * returns 0 when the stream is exhausted
 */

static FTSENT*
scan(State_t* state, FTS* fts)
{
    FTSENT* ent;
    Sfio_t* sp;
    char*   line;
    int private;

    for (;;)
    {
        if (!(ent = fts_read(fts)))
            return ent;
        if (!state->external || ent->fts_info != FTS_D)
            return ent;
        sfsprintf(state->buf, sizeof(state->buf) - 1, "%s/%s", ent->fts_path, state->index.data);
        if (!(sp = sfopen(NiL, state->buf, "r")))
            return ent;
        private = 0;
        while (line = sfgetr(sp, '\n', 1))
        {
            if (strgrpmatch(line, internal, NiL, 0, 0))
            {
                private = 1;
                break;
            }
            if (strgrpmatch(line, "</HEAD>", NiL, 0, STR_ICASE))
                break;
        }
        sfclose(sp);
        if (!private)
            return ent;
        if (fts_set(NiL, ent, FTS_SKIP))
            error(1, "%s: cannot skip", ent->fts_path);
    }
}

/*
 * process refs in path
 *
 * scans the text on ip one character at a time and calls add() for
 * each url reference found:
 *
 *  A and LINK tags         href= values (and rel= values)
 *  FRAME, IMG and SCRIPT tags  src= values
 *
 * for A / LINK a rel= value that follows a successfully added href=
 * in the same tag selects special handling of the href target:
 * data-root, document-root, program-root, host(s), secure, dynamic,
 * ignore and internal
 *
 * with --external, references between the INTERNAL markers are not
 * followed and ref (if any) is marked FILTER
 *
 * prefix is the length of the directory part of path, passed to add()
 * to resolve relative references
 *
 * the scan stops at end of file or at the first character that is
 * neither printable nor space
 */

static void
refs(register State_t* state, const char* path, register Sfio_t* ip, File_t* ref)
{
    register int    c;
    register int    q;
    register int    r;
    register int    a;
    register char*  s;
    char*       p;
    char*       t;
    const char* x;
    File_t*     f;
    String_t*   v;
    int     m;
    int     perlwarn;
    int     prefix;
    unsigned int    secure;
    unsigned int    flags;

    char        buf[8 * 1024];

    perlwarn = state->perlwarn && strmatch(path, "*.(html|htm|HTML|HTM)");
    prefix = (s = strrchr(path, '/')) ? s - (char*)path + 1 : 0;
    flags = EXTERNAL;
    for (;;)
    {
        switch (c = sfgetc(ip))
        {
        case EOF:
            break;
        case '<':
            /* collect the tag name in buf; s - buf is its length */
            q = 0;
            s = buf;
            for (;;)
            {
                switch (c = sfgetc(ip))
                {
                case EOF:
                    return;
                case '>':
                    sfungetc(ip, c);
                    break;
                default:
                    if (isspace(c))
                        break;
                    STUFF(s, buf, c);
                    continue;
                }
                break;
            }
            q = 0;
            /* A or LINK */
            if (flags != INTERNAL && (s == (buf + 1) && (buf[0] == 'A' || buf[0] == 'a') || s == (buf + 4) && (buf[0] == 'L' || buf[0] == 'l') && (buf[1] == 'I' || buf[1] == 'i') && (buf[2] == 'N' || buf[2] == 'n') && (buf[3] == 'K' || buf[3] == 'k')))
            {
                /*
                 * q is the current quote char or 0
                 * r counts the matched letters of the attribute
                 * name: 1..4 for href, a+1..a+4 with a == 10 for
                 * rel; '=' after the last letter yields HIT and
                 * the value is then collected in buf
                 * a != 0 marks a rel= value
                 * f is the file added for the href= of this tag
                 */
                s = buf;
                r = a = 0;
                f = 0;
                for (;;)
                {
                    switch (c = sfgetc(ip))
                    {
                    case EOF:
                        return;
                    case '\'':
                    case '"':
                        if (q == c)
                            q = 0;
                        else if (q == 0)
                            q = c;
                        else if (r == HIT)
                            STUFF(s, buf, c);
                        continue;
                    case '>':
                    case ' ':
                    case '\t':
                    case '\n':
                        if (!q)
                        {
                            if (r == HIT)
                            {
                                /*UNDENT...*/

    /* end of an attribute value: terminate it and rewind the cursor */
    *s = 0;
    s = buf;
    if (!a)
        f = add(state, s, flags, path, prefix, ref);
    else if (f)
    {
        /* rel= value in buf applies to the href target f */
        p = f->name;
        if (!strcasecmp(s, "data-root"))
            rootdir(state, &state->dataroot, p, buf, sizeof(buf));
        else if (!strcasecmp(s, "document-root"))
            rootdir(state, &state->documentroot, p, buf, sizeof(buf));
        else if (!strcasecmp(s, "host") || !strcasecmp(s, "hosts"))
        {
            /* default only: an already set hosts pattern wins */
            if (!state->hosts.size && (state->hosts.size = strlen(p)) && !(state->hosts.data = strdup(p)))
                error(ERROR_SYSTEM|3, "out of space [hosts]");
        }
        else if (!strcasecmp(s, "program-root"))
            rootdir(state, &state->programroot, p, buf, sizeof(buf));
        else if ((secure = strcasecmp(s, "secure") ? 0 : SECURE) || !strcasecmp(s, "dynamic"))
        {
            FTS*    fts;
            FTSENT* ent;

            /*
             * walk the directory containing the href target and
             * add every entry returned by scan()
             */
            if (t = strrchr(p, '/'))
                *t = 0;
            fts = fts_open((char**)p, FTS_ONEPATH|FTS_META|FTS_PHYSICAL|FTS_NOPOSTORDER, order);
            if (t)
                *t = '/';
            if (fts)
            {
                while (ent = scan(state, fts))
                    add(state, ent->fts_path + prefix, flags|secure, f->name, prefix, f);
                if (fts_close(fts))
                    error(ERROR_SYSTEM|2, "%s: directory read error", p);
            }
        }
        else if (!strcasecmp(s, "ignore") && (v = &state->ignore) || state->external && !strcasecmp(s, "internal") && (v = &state->internal))
        {
            /*
             * append one more alternative to the pattern in v
             * x is kept separate from s so that the value cursor
             * s still points into buf for the rest of the tag
             */
            if (state->copy.size)
            {
                x = state->copy.data;
                p += state->root.size;
            }
            else
                x = "";
            if (t = strrchr(p, '/'))
                *t = 0;
            /* 7: the '|' separator, the 5 char suffix and the 0 */
            m = v->size + strlen(x) + strlen(p) + 7;
            if (!(v->data = newof(v->data, char, m, 0)))
                error(ERROR_SYSTEM|3, "out of space [path pattern]");
            v->size += sfsprintf(v->data + v->size, m - (int)v->size, "%s%s%s?(/*)", v->size ? "|" : "", x, p);
            if (t)
                *t = '/';
        }
    }

                                /*...INDENT*/
                            }
                            if (c == '>')
                                break;
                            r = a = 0;
                        }
                        else if (r == HIT)
                            STUFF(s, buf, c);
                        continue;
                    case '#':
                    case '?':
                        /* the value ends at a fragment or query */
                        if (r == HIT)
                            STUFF(s, buf, 0);
                        continue;
                    case 'H':
                    case 'h':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == 0) ? 1 : MISS;
                        continue;
                    case 'R':
                    case 'r':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                        {
                            /* a leading r starts a rel match */
                            if (r == 0)
                            {
                                a = 10;
                                r = a + 1;
                            }
                            r = (r == (a + 1)) ? (a + 2) : MISS;
                        }
                        continue;
                    case 'E':
                    case 'e':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == (a + 2)) ? (a + 3) : MISS;
                        continue;
                    case 'F':
                    case 'f':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == 3) ? 4 : MISS;
                        continue;
                    case 'L':
                    case 'l':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == (a + 3)) ? (a + 4) : MISS;
                        continue;
                    case '=':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == (a + 4)) ? HIT : MISS;
                        continue;
                    default:
                        if (r == HIT)
                            STUFF(s, buf, c);
                        continue;
                    }
                    break;
                }
            }
            /* FRAME, IMG or SCRIPT */
            else if (flags != INTERNAL && (s == (buf + 5) && (buf[0] == 'F' || buf[0] == 'f') && (buf[1] == 'R' || buf[1] == 'r') && (buf[2] == 'A' || buf[2] == 'a') && (buf[3] == 'M' || buf[3] == 'm') && (buf[4] == 'E' || buf[4] == 'e') || s == (buf + 3) && (buf[0] == 'I' || buf[0] == 'i') && (buf[1] == 'M' || buf[1] == 'm') && (buf[2] == 'G' || buf[2] == 'g') || s == (buf + 6) && (buf[0] == 'S' || buf[0] == 's') && (buf[1] == 'C' || buf[1] == 'c') && (buf[2] == 'R' || buf[2] == 'r') && (buf[3] == 'I' || buf[3] == 'i') && (buf[4] == 'P' || buf[4] == 'p') && (buf[5] == 'T' || buf[5] == 't')))
            {
                /* same scheme as above, matching src= only */
                s = buf;
                r = 0;
                for (;;)
                {
                    switch (c = sfgetc(ip))
                    {
                    case EOF:
                        return;
                    case '\'':
                    case '"':
                        if (q == c)
                            q = 0;
                        else if (q == 0)
                            q = c;
                        else if (r == HIT)
                            STUFF(s, buf, c);
                        continue;
                    case '>':
                    case ' ':
                    case '\t':
                    case '\n':
                        if (!q)
                        {
                            if (r == HIT)
                            {
                                *s = 0;
                                s = buf;
                                add(state, s, flags, path, prefix, ref);
                            }
                            if (c == '>')
                                break;
                            r = 0;
                        }
                        else if (r == HIT)
                            STUFF(s, buf, c);
                        continue;
                    case 'S':
                    case 's':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == 0) ? 1 : MISS;
                        continue;
                    case 'R':
                    case 'r':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == 1) ? 2 : MISS;
                        continue;
                    case 'C':
                    case 'c':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == 2) ? 3 : MISS;
                        continue;
                    case '=':
                        if (r == HIT)
                            STUFF(s, buf, c);
                        else if (!q)
                            r = (r == 3) ? HIT : MISS;
                        continue;
                    default:
                        if (r == HIT)
                            STUFF(s, buf, c);
                        continue;
                    }
                    break;
                }
            }
            else
            {
                if (state->external)
                {
                    /*
                     * buf holds the tag without its leading '<'
                     * and trailing '>', hence the + 1 and - 3
                     */
                    if (flags == EXTERNAL)
                    {
                        if (s == (buf + sizeof(internal) - 3) && strneq(buf, internal + 1, sizeof(internal) - 3))
                        {
                            flags = INTERNAL;
                            if (ref)
                                ref->flags |= FILTER;
                        }
                    }
                    else
                    {
                        if (s == (buf + sizeof(external) - 3) && strneq(buf, external + 1, sizeof(external) - 3))
                            flags = EXTERNAL;
                    }
                }
                /* skip to the unquoted '>' that ends the tag */
                for (;;)
                {
                    switch (c = sfgetc(ip))
                    {
                    case EOF:
                        return;
                    case '\'':
                    case '"':
                        if (q == c)
                            q = 0;
                        else if (q == 0)
                            q = c;
                        continue;
                    case '>':
                        if (q == 0)
                            break;
                        continue;
                    default:
                        continue;
                    }
                    break;
                }
            }
            continue;
        case '[':
            /* --perlwarn: warn once per file, peeking at the next char */
            if (perlwarn && (c = sfgetc(ip)) != EOF)
            {
                sfungetc(ip, c);
                switch (c)
                {
                case '-':
                case '+':
                case '!':
                case '$':
                case '*':
                case '#':
                    error(1, "%s: file contains embedded perl constructs", path);
                    perlwarn = 0;
                    break;
                }
            }
            continue;
        default:
            /* give up on the first non-text character */
            if ((iscntrl(c) || !isprint(c)) && !isspace(c))
                break;
            continue;
        }
        break;
    }
}

/*
 * filter out internal text
 * return: <0:error 0:drop >0:keep
 *
 * copies ip to op line by line, dropping each run of lines from an
 * INTERNAL marker line through the matching closing marker line
 * returns 0 if an INTERNAL section opened before </HEAD> is never
 * closed, otherwise non-zero only if more than one line was copied
 */

static int
filter(register State_t* state, register Sfio_t* ip, Sfio_t* op)
{
    register char*  line;
    register size_t len;
    register size_t kept = 0;
    register int    head = 1;

    while (line = sfgetr(ip, '\n', head))
    {
        len = sfvalue(ip);
        if (len == sizeof(internal) && strneq(line, internal, sizeof(internal) - 1))
        {
            /* discard up to and including the closing marker line */
            do
            {
                line = sfgetr(ip, '\n', 0);
            } while (line && (sfvalue(ip) != sizeof(external) || !strneq(line, external, sizeof(external) - 1)));
            if (line)
                continue;
            if (head)
                return 0;
            break;
        }
        /* in the head the newline was replaced by 0, so put it back */
        if (head)
            sfputr(op, line, '\n');
        else
            sfwrite(op, line, len);
        kept++;
        if (head && strgrpmatch(line, "</HEAD>", NiL, 0, STR_ICASE))
            head = 0;
    }
    /* copy a final partial line, if any */
    if (sfvalue(ip) && (line = sfgetr(ip, -1, 0)) && (len = sfvalue(ip)))
        sfwrite(op, line, len);
    return kept > 1;
}

/*
 * htmlrefs entry point
 *
 *	parse the options into a freshly allocated State_t
 *	fill in the index/keep/skip/user defaults
 *	determine the root directory when none or a relative one was given
 *	queue the file operands (or root/index when there are none)
 *	scan queued files until a pass leaves state->more clear
 *	then exactly one of
 *		copy (-c dir given): copy, filter, symlink or mkdir each
 *			file under the copy directory and optionally
 *			remove what is unreferenced there
 *		unreferenced (no -c): list or remove unreferenced files
 *			under the root directories
 *		otherwise: list the files, optionally with dependents
 *
 * return: 0 if no errors were counted in error_info.errors, 1 otherwise
 */

int
main(int argc, char** argv)
{
    register char*      s;
    register char*      p;
    register Sfio_t*    ip;
    register State_t*   state;
    register File_t*    fp;
    register List_t*    lp;
    FTS*            fts;
    FTSENT*         ent;
    struct passwd*      pwd;
    Sfio_t*         op;
    char*           dirs[4];
    int         i;
    int         n;
    struct stat     st;
    struct stat     ts;

    /* home directory candidates; slots 0 and 1 are filled in at runtime */
    static const char*  www[] = { 0, 0, "wwwfiles", "public_html" };

    NoP(argc);
    error_info.id = "htmlrefs";
    if (!(state = newof(0, State_t, 1, 0)))
        error(ERROR_SYSTEM|3, "out of space [state]");
    state->disc.key = offsetof(File_t, name);
    state->disc.size = 0;
    if (!(state->files = dtopen(&state->disc, Dtoset)))
        error(ERROR_SYSTEM|3, "out of space [dict]");
    state->exec = 1;
    state->perlwarn = 1;

    /* option parse: each case records its value in state and continues */
    for (;;)
    {
        switch (optget(argv, usage))
        {
        case 'a':
            state->all = opt_info.num;
            continue;
        case 'c':
            state->copy.size = strlen(state->copy.data = opt_info.arg);
            continue;
        case 'd':
            state->dependents = opt_info.num;
            continue;
        case 'e':
            state->external = opt_info.num;
            continue;
        case 'F':
            state->force = opt_info.num;
            continue;
        case 'h':
            state->hosts.size = strlen(state->hosts.data = opt_info.arg);
            continue;
        case 'i':
            state->index.size = strlen(state->index.data = opt_info.arg);
            continue;
        case 'k':
            state->keep.size = strlen(state->keep.data = opt_info.arg);
            continue;
        case 'K':
            state->skip.size = strlen(state->skip.data = opt_info.arg);
            continue;
        case 'l':
            state->limit.size = strlen(state->limit.data = opt_info.arg);
            continue;
        case 'm':
            state->missing = opt_info.num ? MISSING : 0;
            continue;
        case 'n':
            state->exec = opt_info.num;
            continue;
        case 'r':
            state->root.size = strlen(state->root.data = opt_info.arg);
            continue;
        case 's':
            state->strict = opt_info.num;
            continue;
        case 'u':
            state->user.size = strlen(state->user.data = opt_info.arg);
            continue;
        case 'v':
            state->verbose = opt_info.num;
            continue;
        case 'w':
            state->warn = opt_info.num;
            continue;
        case 'x':
            state->unreferenced = opt_info.num;
            continue;
        case 'S':
            state->symlink = opt_info.num;
            /* must not fall into 'X': that would also set state->remove */
            continue;
        case 'X':
            state->remove = opt_info.num;
            continue;
        case '?':
            error(ERROR_USAGE|4, "%s", opt_info.arg);
            continue;
        case ':':
            error(2, "%s", opt_info.arg);
            continue;
        }
        break;
    }
    argv += opt_info.index;
    if (error_info.errors)
        error(ERROR_USAGE|4, "%s", optusage(NiL));
    if (state->copy.size && (stat(state->copy.data, &st) || !S_ISDIR(st.st_mode)))
        error(ERROR_SYSTEM|3, "%s: not a directory", state->copy.data);

    /* defaults for the options that were not specified */
    if (!state->index.size)
        state->index.size = strlen(state->index.data = INDEX);
    if (!state->keep.size)
        state->keep.size = strlen(state->keep.data = KEEP);
    if (!state->skip.size)
        state->skip.size = strlen(state->skip.data = SKIP);
    if (!state->user.size)
        state->user.size = strlen(state->user.data = fmtuid(geteuid()));

    /*
     * no root or a relative root: probe the user's home directory for,
     * in order, the index file, the relative root, wwwfiles and
     * public_html -- the first one that exists determines the root
     */

    if (!state->root.size || *state->root.data != '/')
    {
        www[0] = (const char*)state->index.data;
        if (state->root.size)
            www[1] = (const char*)state->root.data;
        if (!(pwd = getpwnam(state->user.data)))
            error(3, "%s: unknown user", state->user.data);
        s = pwd->pw_dir;
        for (i = 0; i < elementsof(www); i++)
            if (www[i])
            {
                n = sfsprintf(state->buf, sizeof(state->buf) - 1, "%s/%s", s, www[i]);
                if (!access(state->buf, F_OK))
                {
                    /*
                     * slot 0 is the index file itself, so the root
                     * is the home directory; for the other slots the
                     * root is the path that was just probed
                     */

                    if (i == 0)
                        n = strlen(s);
                    else
                        s = state->buf;
                    if (!(state->root.data = strdup(s)))
                        error(ERROR_SYSTEM|3, "out of space [root]");
                    state->root.size = n;
                    break;
                }
            }
    }

    /* queue the operands; with state->more still clear fall back to root/index */
    while (s = *argv++)
        add(state, s, EXTERNAL|VERBOSE, NiL, 0, NiL);
    if (!state->more)
    {
        sfsprintf(state->buf, sizeof(state->buf) - 1, "%s/%s", state->root.data, state->index.data);
        add(state, state->buf, EXTERNAL|VERBOSE, NiL, 0, NiL);
    }

    /*
     * scan each file not yet marked SCANNED; repeat while a pass sets
     * state->more again (presumably add() sets it for new files --
     * verify against add())
     */

    while (state->more)
    {
        state->more = 0;
        for (fp = (File_t*)dtfirst(state->files); fp; fp = (File_t*)dtnext(state->files, fp))
        {
            if (!(fp->flags & SCANNED))
            {
                fp->flags |= SCANNED;
                if (streq(fp->name, "-") || streq(fp->name, "/dev/stdin") || streq(fp->name, "/dev/fd/0"))
                    ip = sfstdin;
                else if (!(ip = sfopen(NiL, fp->name, "r")))
                {
                    fp->flags |= MISSING;
                    if (state->warn || (fp->flags & VERBOSE))
                        error(ERROR_SYSTEM|2, "%s: cannot read", fp->name);
                    continue;
                }
                refs(state, fp->name, ip, fp);
                if (ip != sfstdin)
                    sfclose(ip);
            }
        }
    }
    if (state->copy.size)
    {
        /*
         * copy mode: p is the destination path, formed from the copy
         * directory plus the source name with the root prefix skipped
         */

        p = state->buf;
        for (fp = (File_t*)dtfirst(state->files); fp; fp = (File_t*)dtnext(state->files, fp))
            if (!(fp->flags & (CHECKED|COPIED|MISSING)))
            {
                fp->flags |= CHECKED;
                sfsprintf(p, sizeof(state->buf) - 1, "%s%s", state->copy.data, fp->name + state->root.size);
                if (state->internal.size && strmatch(p, state->internal.data))
                    continue;
                add(state, p, COPIED, NiL, 0, NiL);
                if (stat(fp->name, &st))
                    error(ERROR_SYSTEM|3, "%s: cannot stat", fp->name);
                if (state->limit.size && !strmatch(p, state->limit.data))
                    continue;

                /* a destination that cannot be stat'd is treated as absent */
                if (stat(p, &ts))
                {
                    ts.st_mtime = 0;
                    ts.st_mode = 0;
                }

                /* cgi and html destinations get the COPY flag, which excludes them from symlinking below */
                if (strmatch(p, "*/cgi-bin/*|*.cgi|*.html"))
                    fp->flags |= COPY;
                if (!state->exec)
                {
                    /* state->exec clear: only print the action that would be taken */
                    if (fp->flags & DIRECTORY)
                    {
                        if (!ts.st_mtime)
                            sfprintf(sfstdout, " mkdir %s\n", p);
                    }
                    else if (state->force || st.st_mtime != ts.st_mtime)
                    {
                        if (fp->flags & FILTER)
                            sfprintf(sfstdout, "filter %s\n", p);
                        else if (state->symlink && !(fp->flags & COPY))
                            sfprintf(sfstdout, "  link %s\n", p);
                        else
                            sfprintf(sfstdout, "  copy %s\n", p);
                    }
                }
                else if (fp->flags & DIRECTORY)
                {
                    if (!ts.st_mtime)
                    {
                        if (state->verbose)
                            sfprintf(sfstdout, " mkdir %s\n", p);
                        if (mkdir(p, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH))
                            error(ERROR_SYSTEM|2, "%s: cannot create directory", p);
                    }
                }
                else if (state->symlink && !(fp->flags & (COPY|FILTER)))
                {
                    if (st.st_mtime != ts.st_mtime)
                    {
                        if (state->verbose)
                            sfprintf(sfstdout, " ln -s %s %s\n", fp->name, p);
                        if (ts.st_mtime)
                            remove(p);
                        if (symlink(fp->name, p))
                            error(ERROR_SYSTEM|2, "%s: cannot symlink to %s", fp->name, p);
                    }
                }
                else if (state->force || st.st_mtime != ts.st_mtime)
                {
                    if (!(ip = sfopen(NiL, fp->name, "r")))
                        error(ERROR_SYSTEM|2, "%s: cannot read", fp->name);
                    else if (!(op = sfopen(NiL, p, "w")))
                    {
                        error(ERROR_SYSTEM|2, "%s: cannot write", p);
                        sfclose(ip);
                    }
                    else
                    {
                        /* n: <0 read error, 0 drop the destination, >0 keep it */
                        if (fp->flags & FILTER)
                        {
                            if (state->verbose)
                                sfprintf(sfstdout, "filter %s\n", p);
                            n = filter(state, ip, op);
                        }
                        else
                        {
                            if (state->verbose)
                                sfprintf(sfstdout, "  copy %s\n", p);
                            if (sfmove(ip, op, SF_UNBOUND, -1) >= 0 && sfeof(ip))
                                n = 1;
                            else
                                n = -1;
                        }
                        if (n < 0)
                            error(ERROR_SYSTEM|2, "%s: read error", fp->name);
                        if (sfclose(op))
                            error(ERROR_SYSTEM|2, "%s: write error", p);
                        sfclose(ip);
                        if (n > 0)
                        {
                            /* give the destination the source permission bits and mtime */
                            if ((st.st_mode &= S_IPERM) != (ts.st_mode &= S_IPERM) && chmod(p, st.st_mode))
                                error(ERROR_SYSTEM|2, "%s: cannot set mode", p);
                            if (touch(p, st.st_mtime, st.st_mtime, 0))
                                error(ERROR_SYSTEM|2, "%s: cannot set times", p);
                        }
                        else if (!n)
                        {
                            if (state->verbose)
                                sfprintf(sfstdout, " %s %s\n", (fp->flags & DIRECTORY) ? "rmdir" : "   rm", p);
                            if (((fp->flags & DIRECTORY) ? rmdir : remove)(p))
                                error(ERROR_SYSTEM|2, "%s: cannot remove", p);
                        }
                    }
                }
            }
        if (state->unreferenced)
        {
            /*
             * walk the copy directory and remove (or, with state->exec
             * clear, only report) the entries that are not flagged
             * COPIED, not ignored and within the limit pattern
             */

            if (!(fts = fts_open((char**)state->copy.data, FTS_ONEPATH|FTS_META|FTS_PHYSICAL|FTS_NOPREORDER, order)))
                error(ERROR_SYSTEM|3, "%s: cannot search directory", state->copy.data);
            while (ent = scan(state, fts))
                if ((!(fp = dtmatch(state->files, ent->fts_path)) || !(fp->flags & COPIED)) && (!state->ignore.size || !strmatch(ent->fts_path, state->ignore.data)) && (!state->limit.size || strmatch(ent->fts_path, state->limit.data)))
                {
                    if (state->verbose || !state->exec)
                        sfprintf(sfstdout, " %s %s\n", (ent->fts_info & FTS_D) ? "rmdir" : "   rm", ent->fts_path);
                    if (state->exec && ((ent->fts_info & FTS_D) ? rmdir : remove)(ent->fts_path))
                        error(ERROR_SYSTEM|2, "%s: cannot remove", ent->fts_path);
                }
            if (fts_close(fts))
                error(ERROR_SYSTEM|3, "%s: directory read error", state->copy.data);
        }
    }
    else if (state->unreferenced)
    {
        /*
         * unreferenced mode without a copy directory: walk the
         * document root (or the root, "." if there is none) plus the
         * data and program roots when they are set
         */

        i = 0;
        if (state->documentroot.data)
            dirs[i++] = state->documentroot.data;
        else
        {
            if (!state->root.data)
                state->root.size = strlen(state->root.data = ".");
            dirs[i++] = state->root.data;
        }
        if (state->dataroot.data)
            dirs[i++] = state->dataroot.data;
        if (state->programroot.data)
            dirs[i++] = state->programroot.data;
        dirs[i] = 0;
        if (!(fts = fts_open(dirs, FTS_META|FTS_PHYSICAL|FTS_NOPREORDER, order)))
            error(ERROR_SYSTEM|3, "%s: cannot search directory", state->root.data);
        while (ent = scan(state, fts))
            if (!dtmatch(state->files, ent->fts_path) && (!strmatch(ent->fts_name, state->keep.data) || state->skip.size && strmatch(ent->fts_name, state->skip.data) || state->ignore.size && strmatch(ent->fts_path, state->ignore.data)))
            {
                if (state->strict || !streq(ent->fts_name, state->index.data))
                {
                    /* list by default; remove only when state->remove is set */
                    if (!state->remove)
                        sfprintf(sfstdout, "%s\n", fmtquote(ent->fts_path, "\"", "\"", ent->fts_pathlen, 0));
                    else if (!state->limit.size || strmatch(ent->fts_path, state->limit.data))
                    {
                        if (state->verbose || !state->exec)
                            sfprintf(sfstdout, " %s %s\n", (ent->fts_info & FTS_D) ? "rmdir" : "   rm", ent->fts_path);
                        if (state->exec && ((ent->fts_info & FTS_D) ? rmdir : remove)(ent->fts_path))
                            error(ERROR_SYSTEM|2, "%s: cannot remove", ent->fts_path);
                    }
                }
                else if (s = strrchr(ent->fts_path, '/'))
                {
                    /*
                     * non-strict index file: add its directory
                     * instead, temporarily cutting the path at the
                     * last '/'
                     */

                    *s = 0;
                    add(state, ent->fts_path, COPIED, NiL, 0, NiL);
                    *s = '/';
                }
            }
        if (fts_close(fts))
            error(ERROR_SYSTEM|3, "%s: directory read error", state->root.data);
    }
    else
    {
        /*
         * list mode: one quoted name per line for all files, or only
         * for those whose MISSING flag equals state->missing; with
         * dependents, append " :" and the quoted names on fp->refs
         */

        for (fp = (File_t*)dtfirst(state->files); fp; fp = (File_t*)dtnext(state->files, fp))
            if (state->all || (fp->flags & MISSING) == state->missing)
            {
                sfprintf(sfstdout, "%s", fmtquote(fp->name, "\"", "\"", strlen(fp->name), 0));
                if (state->dependents && fp->refs)
                {
                    sfputc(sfstdout, ' ');
                    sfputc(sfstdout, ':');
                    for (lp = fp->refs; lp; lp = lp->next)
                        sfprintf(sfstdout, " %s", fmtquote(lp->file->name, "\"", "\"", strlen(lp->file->name), 0));
                }
                sfputc(sfstdout, '\n');
            }
    }
    return error_info.errors != 0;
}