htmlrefs.c revision 3f54fd611f536639ec30dd53c48e5ec1897cc7d9
/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1996-2012 AT&T Intellectual Property *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.eclipse.org/org/documents/epl-v10.html *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* Glenn Fowler <gsf@research.att.com> *
* *
***********************************************************************/
#pragma prototyped
/*
* Glenn Fowler
* AT&T Research
*/
/*
 * Command usage/manual text for htmlrefs, written in the bracketed
 * option-description notation ([x:name?text], [+SECTION?text], \b \a
 * font escapes).
 * NOTE(review): presumably handed to the option parser in main; that
 * call is not visible in this view -- confirm.
 * NOTE(review): the --symlink entry below ends with "or are not in" and
 * has no closing `]` before the --user entry starts; its final line
 * appears to be missing -- restore from upstream.
 * NOTE(review): the second EXAMPLES entry reads "...?Copy" followed by
 * "release, and remove ..."; a line appears to be missing there too.
 */
static const char usage[] =
"[-?\n@(#)$Id: htmlrefs (AT&T Research) 2012-01-01 $\n]"
"[+NAME?htmlrefs - list html url references]"
"[+DESCRIPTION?\bhtmlrefs\b lists url references from the"
" local closure of the input \bhtml\b \afile\as. If \afile\a is not"
" specified then the top level default user file is read. The \bhtml\b"
" parse is rudimentary; don't use \bhtmlrefs\b to detect valid \bhtml\b"
" files.]"
"[+?The top level references are determined in this order (the \b--index\b,"
" \b--root\b and \b--user\b options influence the order):]{"
" [+$HOME/index.html?Pseudo index containing"
" \b<LINK href=\b\adir\a \brel=\b\atype\a\b>\b references to"
" top level directories. \atype\a may be one of:]{"
" [+document-root?The document root directory containing URL"
" target documents. Exactly one \bdocument-root\b must"
" be specified.]"
" [+program-root?The program root directory containing CGI"
" support programs and scripts. This type is optional."
" If specified then the program root directory should"
" contain a pseudo index for its references.]"
" [+data-root?The data root directory containing CGI"
" support data. This type is optional. If specified then"
" the data root directory should contain a pseudo index"
" for its references.]"
" [+dynamic?All files under \adir\a are considered referenced.]"
" [+host?Provides a default value for the \b--hosts\b option.]"
" [+ignore?\adir\a is a \bksh\b(1) pattern of paths to ignore.]"
" [+internal?If \b--external\b is on then \adir\a is a \bksh\b(1)"
" pattern of internal paths.]"
" [+secure?Files under this dir are accessed by \bhttps:\b only.]"
" }"
" [+$HOME/wwwfiles/index.html?]"
" [+$HOME/public_html/index.html?]"
"}"
"[a:all?List all references whether they exist or not.]"
"[c:copy?Copy the selected references to \adirectory\a which must already"
" exist. If \b--external\b is also specified then lines between"
" \b<!--INTERNAL-->\b ... \b<!--/INTERNAL-->\b lines are not"
" copied. If \b--unreferenced\b is also specified then files and"
" directories in \adirectory\a that have not been copied are"
" removed. Target file modification times are set to match source"
" times so that future copies can be avoided.]:[directory]"
"[d:dependents?List each selected local file followed by \b:\b and a list of"
" all local files referring to the file.]"
"[e:external?Do not list references inside \b<!--INTERNAL-->\b ..."
" \b<!--/INTERNAL-->\b lines. See \bmm2html\b(1) for an html"
" generator that inserts these lines.]"
"[F:force?By default files are not copied if the source and target size and"
" modification times match. \b--force\b forces all files to be copied.]"
"[h:hosts?Check only references matching the \bksh\b(1) pattern"
" \bhttp://\b\apattern\a\b/\b.]:[pattern]"
"[i:index?\aname\a specifies the page named by directory"
" references.]:[name:=index.html]"
"[k:keep?\apattern\a is used to match file base names that are always"
" considered referenced.]:[pattern:=.htaccess]"
"[l:limit?Limit \b--copy\b and \b--remove\b operations to path names matching"
" \apattern\a.]:[pattern]"
"[m:missing?List missing local file references.]"
"[n!:exec?Enable file modification operations. \b--noexec\b lists the"
" operations but does not do them.]"
"[p!:perlwarn?Check HTML files for unintentional embedded \bperl\b(1)"
" constructs: a left bracket followed by one of \b-+!$*#\b. Manually"
" translating left bracket to \b[\b avoids unwanted \bperl\b"
" interactions (why didn't they use tags like everyone else?)"
" \bmm2html\b(1) and \boptget\b(3) do the translation by default.]"
"[X:remove?Unreferenced files are removed when \b--unreferenced\b and"
" \b--nocopy\b are specified.]"
"[r:root?The local \adirectory\a for \b--user\b"
" references.]:[directory:=~\auser\a]"
"[K:skip?\apattern\a is used to match file base names that are never"
" considered referenced.]:[pattern:=00-INDEX-00]"
"[s:strict?By default unreferenced \b--index\b files and the containing"
" directory are considered referenced; \b--strict\b considers"
" unreferenced \b--index\b files unreferenced.]"
"[S:symlink?Instruct \b--copy\b to \bsymlink\b(2) files that do not contain"
" \b<!--INTERNAL-->\b ... \b<!--/INTERNAL-->\b or are not in"
"[u:user?\b~\b\aname\a translates to the \b--root\b"
" directory.]:[name:=caller-uid]"
"[v:verbose?List files as they are copied (see \b--copy\b.)]"
"[w:warn?Produce a warning diagnostic for missing files.]"
"[x:unreferenced?If \b--copy\b is also specified then remove files and"
" directories in the \b--copy\b \adirectory\a that have not been copied."
" Otherwise list unreferenced files in the \b--root\b directory."
" A directory that contains no referenced files but does contain an"
" \b--index\b file is considered referenced (along with the \b--index\b"
" file) unless \b--strict\b is enabled.]"
"\n"
"\n[ file ... ]\n"
"\n"
"[+EXAMPLES]{"
" [+htmlrefs --hosts=www.research.att.com --missing?List missing"
" references to the local host \bwww.research.att.com\b.]"
" [+htmlrefs -n -h www.research.att.com -c ~/external/wwwfiles -e -x?Copy"
" release, and remove unreferenced files in the copy.]"
"}"
"[+SEE ALSO?\bhtml2rtf\b(1), \bmm2html\b(1)]"
;
#include <ast.h>
#include <cdt.h>
#include <ctype.h>
#include <error.h>
#include <fts.h>
#include <glob.h>
#include <pwd.h>
#include <tm.h>
/*
 * default option values; these match the defaults advertised in the
 * usage text for --index, --keep and --skip respectively
 */
#define INDEX "index.html"
#define KEEP ".htaccess"
#define SKIP "00-INDEX-00"
/*
 * single-bit flags; SECURE selects the "/secure" path component in add()
 * and COPIED is tested on file table entries in main
 * NOTE(review): the meaning of the remaining bits is inferred from their
 * names only -- confirm against the code that sets them
 */
#define CHECKED 0x001
#define COPIED 0x002
#define COPY 0x004
#define DIRECTORY 0x008
#define EXTERNAL 0x010
#define FILTER 0x020
#define INTERNAL 0x040
#define MISSING 0x080
#define SCANNED 0x100
#define SECURE 0x200
#define VERBOSE 0x400
/*
 * negative sentinel values for the attribute match state (r) kept by the
 * tag scanner; non-negative values of r are match progress counters
 */
#define HIT (-1)
#define MISS (-2)
struct List_s;
typedef struct String_s
{
char* data;
unsigned int size;
} String_t;
/*
 * per file reference record
 * NOTE(review): name is declared with one element and is used as a full
 * path elsewhere, so records are presumably over-allocated to hold the
 * path -- the allocation is not visible in this view; additional members
 * may also be missing from this view
 */
typedef struct File_s
{
unsigned long time; /* NOTE(review): presumably a modification time -- confirm */
unsigned int flags; /* bit flags (COPIED etc.) */
char name[1]; /* path name, first byte of the in-line string */
} File_t;
/*
 * list node type
 * NOTE(review): no members are visible in this view; the real member
 * list must be restored before this compiles as intended
 */
typedef struct List_s
{
} List_t;
/*
 * program state shared by all functions
 * the int members below carry the same names as the long options in the
 * usage text (--all, --dependents, --exec, ...)
 * NOTE(review): other members referenced by the code (buf, root,
 * documentroot, user, ignore, internal, limit, keep, skip, copy, files)
 * are not declared in the visible part of this struct
 */
typedef struct State_s
{
int all;
int dependents;
int exec;
int external;
int force;
int missing;
int more; /* NOTE(review): no matching option in the usage text; purpose not visible */
int perlwarn;
int remove;
int strict;
int symlink;
int unreferenced;
int verbose;
int warn;
} State_t;
/*
 * line markers delimiting internal-only text: internal[] is the opening
 * marker and external[] is the closing marker (the point where external
 * text resumes)
 */
static const char internal[] = "<!--INTERNAL-->";
static const char external[] = "<!--/INTERNAL-->";
/*
 * NOTE(review): the declarator line (function name and parameters) and
 * most conditions of this helper are not visible in this view.
 * What is visible: it works on a base name pointer s derived from a
 * `name` argument (s = name when the preceding test fails, otherwise s is
 * advanced by one) and returns 0 or 1.
 */
static int
{
char* s;
{
s++;
else
s = (char*)name;
return 0;
}
return 0;
return 1;
}
/*
 * NOTE(review): the original description, the declarator line and the
 * body statements of this helper are not visible in this view; only the
 * two local declarations remain, so its purpose cannot be stated here
 */
static void
{
register char* s;
register char** p;
{
}
}
/*
 * add reference path s
 *
 * state	program state; state->buf receives the composed local path
 * s		the reference text; it is edited in place while being parsed
 *		(separators are temporarily overwritten with 0 and restored)
 * flags	reference flags; SECURE inserts "/secure" after the root
 * path		NOTE(review): not used in the visible part of this function
 * prefix	selects the prefix branch for references that do not start
 *		with '/' (branch body not visible in this view)
 * ref		NOTE(review): not used in the visible part of this function
 *
 * returns the file record (fp) for the reference, or 0 if the reference
 * is rejected (for example a '/' reference whose second character is not
 * '~', or a ~user that does not match state->user)
 *
 * NOTE(review): several conditions and declarations (fp, lp) are missing
 * from this view; the statements below are kept in their original order
 */
static File_t*
add(register State_t* state, register char* s, unsigned int flags, const char* path, int prefix, File_t* ref)
{
register char* t;
char* u;
{
{
/* a ':' introduces a scheme; step past it or reject the reference */
if (t = strchr(s, ':'))
{
{
s = t + 3;
}
{
s = t + 4;
}
else
return 0;
/* isolate the host part, then restore the '/' and continue with the path */
if (t = strchr(s, '/'))
*t = 0;
return 0;
if (t)
*(s = t) = '/';
else
s = "/";
}
if (*s == '/')
{
{
if (*(s + 1) != '~')
return 0;
if (*(s + 2) == '/')
s += 2;
else if (!state->user.size || !strneq(s + 2, state->user.data, state->user.size) || *(s + 2 + state->user.size) != '/')
return 0;
else
{
sfsprintf(state->buf, sizeof(state->buf) - 1, "%s%s%s", state->documentroot.data, (flags & SECURE) ? "/secure" : "", s);
}
{
/*
 * three arguments need three conversions; with "%s%s" the
 * reference path s was dropped from the composed path
 */
sfsprintf(state->buf, sizeof(state->buf) - 1, "%s%s%s", state->root.data, (flags & SECURE) ? "/secure" : "", s);
}
}
}
else if (prefix)
{
}
{
}
}
/* skip a leading "./" and any slashes after it */
if (*s == '.' && *(s + 1) == '/')
while (*++s == '/');
if (!*s)
s = "/";
{
{
if (!*t)
*t = '/';
}
else
{
}
}
}
{
return 0;
/* walk the parent directories of s, right to left */
if (t = strrchr(s, '/'))
do
{
*t = 0;
{
*t = '/';
break;
}
{
}
u = strrchr(s, '/');
*t = '/';
}
{
if (!lp)
{
}
}
return fp;
}
/*
 * order directory stream by name
 *
 * NOTE(review): the declarator line and body are not visible in this
 * view; main passes a function named order as the comparison argument of
 * fts_open(), which is presumably this one
 */
static int
{
}
/*
 * parse and set root dir r from s
 * possibly using tmp buffer buf
 *
 * s is cut at its last '/' for the duration of the call and the '/' is
 * put back before returning, so only the directory part is considered.
 * If that part is absolute its length becomes r->size; otherwise s is
 * redirected to buf.
 * NOTE(review): the declarator line, the statements that fill buf and
 * compute n in the relative case, and the assignment of r->data are not
 * visible in this view
 */
static void
{
register char* t;
register int n;
/* temporarily drop the last path component */
if (t = strrchr(s, '/'))
*t = 0;
if (*s == '/')
n = strlen(s);
else
{
s = buf;
}
r->size = n;
/* restore the caller's string */
if (t)
*t = '/';
}
/*
 * return next directory entry
 *
 * loops until an entry that is not skipped is found and returns it (ent)
 * NOTE(review): the declarator line, the read of the next entry and the
 * tests that set skip are not visible in this view; what is done with a
 * skipped entry before `continue` is likewise not visible
 */
static FTSENT*
{
char* s;
int skip;
{
{
{
skip = 0;
{
{
skip = 1;
break;
}
break;
}
if (skip)
{
continue;
}
}
}
break;
}
return ent;
}
/*
 * process refs in path
 *
 * character driven scan of one html file:
 *   '<' starts a tag; the tag name is collected in buf (s marks its end)
 *       A and LINK tags are scanned for attributes spelled with the
 *       letters H R E F L and '=' (presumably HREF= and REL= -- confirm)
 *       FRAME, IMG and SCRIPT tags are scanned for attributes spelled
 *       with S R C and '=' (presumably SRC= -- confirm)
 *       any other tag is skipped up to its unquoted '>'
 *   '[' followed by one of - + ! $ * # clears perlwarn (see --perlwarn)
 * q holds the active quote character (0 if none); r is the attribute
 * match state: 0 at the start of a word, HIT once the attribute name and
 * '=' have matched (the value is then being collected), MISS otherwise.
 * For LINK tags the rel value selects document-root, program-root,
 * data-root, ignore or internal handling of the href target f.
 * Returns to the caller on EOF, including EOF inside a tag.
 *
 * NOTE(review): the declarator line, every `switch (c = ...)` header and
 * most statements that store characters or act on a HIT are missing from
 * this view; the description above covers only what remains visible
 */
static void
{
register int c;
register int q;
register int r;
register int a;
register char* s;
char* p;
char* t;
File_t* f;
String_t* v;
int m;
int perlwarn;
int prefix;
unsigned int secure;
unsigned int flags;
for (;;)
{
{
case EOF:
break;
case '<':
q = 0;
s = buf;
/* collect the tag name up to '>' or white space */
for (;;)
{
{
case EOF:
return;
case '>':
break;
default:
if (isspace(c))
break;
continue;
}
break;
}
q = 0;
/* <A ...> or <LINK ...>, any case, unless flags is exactly INTERNAL */
if (flags != INTERNAL && (s == (buf + 1) && (buf[0] == 'A' || buf[0] == 'a') || s == (buf + 4) && (buf[0] == 'L' || buf[0] == 'l') && (buf[1] == 'I' || buf[1] == 'i') && (buf[2] == 'N' || buf[2] == 'n') && (buf[3] == 'K' || buf[3] == 'k')))
{
s = buf;
r = a = 0;
f = 0;
for (;;)
{
{
case EOF:
return;
case '\'':
case '"':
if (q == c)
q = 0;
else if (q == 0)
q = c;
else if (r == HIT)
continue;
case '>':
case ' ':
case '\t':
case '\n':
/* an unquoted delimiter ends the current attribute word */
if (!q)
{
if (r == HIT)
{
/*UNDENT...*/
*s = 0;
s = buf;
if (!a)
else if (f)
{
p = f->name;
if (!strcasecmp(s, "data-root"))
else if (!strcasecmp(s, "document-root"))
{
}
else if (!strcasecmp(s, "program-root"))
{
if (t = strrchr(p, '/'))
*t = 0;
if (t)
*t = '/';
if (fts)
{
}
}
else if (!strcasecmp(s, "ignore") && (v = &state->ignore) || state->external && !strcasecmp(s, "internal") && (v = &state->internal))
{
{
}
else
s = "";
if (t = strrchr(p, '/'))
*t = 0;
if (t)
*t = '/';
}
}
/*...INDENT*/
}
if (c == '>')
break;
r = a = 0;
}
else if (r == HIT)
continue;
case '#':
case '?':
if (r == HIT)
continue;
case 'H':
case 'h':
if (r == HIT)
else if (!q)
r = (r == 0) ? 1 : MISS;
continue;
case 'R':
case 'r':
if (r == HIT)
else if (!q)
{
if (r == 0)
{
a = 10;
r = a + 1;
}
}
continue;
case 'E':
case 'e':
if (r == HIT)
else if (!q)
continue;
case 'F':
case 'f':
if (r == HIT)
else if (!q)
continue;
case 'L':
case 'l':
if (r == HIT)
else if (!q)
continue;
case '=':
if (r == HIT)
else if (!q)
continue;
default:
if (r == HIT)
continue;
}
break;
}
}
/* <FRAME ...>, <IMG ...> or <SCRIPT ...>, any case */
else if (flags != INTERNAL && (s == (buf + 5) && (buf[0] == 'F' || buf[0] == 'f') && (buf[1] == 'R' || buf[1] == 'r') && (buf[2] == 'A' || buf[2] == 'a') && (buf[3] == 'M' || buf[3] == 'm') && (buf[4] == 'E' || buf[4] == 'e') || s == (buf + 3) && (buf[0] == 'I' || buf[0] == 'i') && (buf[1] == 'M' || buf[1] == 'm') && (buf[2] == 'G' || buf[2] == 'g') || s == (buf + 6) && (buf[0] == 'S' || buf[0] == 's') && (buf[1] == 'C' || buf[1] == 'c') && (buf[2] == 'R' || buf[2] == 'r') && (buf[3] == 'I' || buf[3] == 'i') && (buf[4] == 'P' || buf[4] == 'p') && (buf[5] == 'T' || buf[5] == 't')))
{
s = buf;
r = 0;
for (;;)
{
{
case EOF:
return;
case '\'':
case '"':
if (q == c)
q = 0;
else if (q == 0)
q = c;
else if (r == HIT)
continue;
case '>':
case ' ':
case '\t':
case '\n':
if (!q)
{
if (r == HIT)
{
*s = 0;
s = buf;
}
if (c == '>')
break;
r = 0;
}
else if (r == HIT)
continue;
case 'S':
case 's':
if (r == HIT)
else if (!q)
r = (r == 0) ? 1 : MISS;
continue;
case 'R':
case 'r':
if (r == HIT)
else if (!q)
continue;
case 'C':
case 'c':
if (r == HIT)
else if (!q)
continue;
case '=':
if (r == HIT)
else if (!q)
continue;
default:
if (r == HIT)
continue;
}
break;
}
}
else
{
{
{
{
}
}
else
{
}
}
/* any other tag: skip to the closing '>' that is outside quotes */
for (;;)
{
{
case EOF:
return;
case '\'':
case '"':
if (q == c)
q = 0;
else if (q == 0)
q = c;
continue;
case '>':
if (q == 0)
break;
continue;
default:
continue;
}
break;
}
}
continue;
case '[':
{
/* left bracket followed by one of the characters listed under --perlwarn */
switch (c)
{
case '-':
case '+':
case '!':
case '$':
case '*':
case '#':
perlwarn = 0;
break;
}
}
continue;
default:
break;
continue;
}
break;
}
}
/*
 * filter out internal text
 * return: <0:error 0:drop >0:keep
 *
 * reads the input stream ip line by line; when the opening marker is
 * seen, lines are discarded up to and including the closing marker line
 * (a line of exactly sizeof(external) bytes, newline included, that
 * starts with external[]). If the input ends inside an internal region
 * that began before any other line was seen (head still set) the result
 * is 0.
 * NOTE(review): the declarator line, the line read/write statements, the
 * declaration of lines and the error return are not visible in this
 * view; the visible returns only yield 0 or the value of `lines > 1`
 */
static int
{
register char* s;
register size_t n;
register int head = 1;
for (;;)
{
break;
{
if (head)
else
lines++;
head = 0;
}
else
{
/* discard lines until the closing marker line or end of input */
while ((s = sfgetr(ip, '\n', 0)) && (sfvalue(ip) != sizeof(external) || !strneq(s, external, sizeof(external) - 1)));
if (!s)
{
if (head)
return 0;
break;
}
}
}
return lines > 1;
}
/*
 * htmlrefs entry point
 *
 * 1. option loop: one case per option letter listed in the usage text
 *    ('?' and ':' are the help and error cases)
 * 2. top level setup over the www[] table of default index locations
 * 3. each file operand is taken from argv in turn
 * 4. one of three result phases:
 *    - a first branch whose condition is not visible; it matches paths
 *      against "*\/cgi-bin/\*|*.cgi|*.html" (slashes unescaped in the
 *      code) and, with --unreferenced, walks state->copy.data and acts
 *      on entries that were not COPIED, are not ignored and match --limit
 *    - --unreferenced on its own: walks dirs[] and acts on entries that
 *      are not in state->files
 *    - otherwise: the default branch
 * returns 1 if any error was reported through error_info, else 0
 *
 * NOTE(review): the declarator line, the option switch header, every
 * case body and most statements in the phases are missing from this
 * view; the summary above is limited to what remains visible
 */
int
{
register char* s;
register char* p;
char* dirs[4];
int i;
int n;
for (;;)
{
{
case 'a':
continue;
case 'c':
continue;
case 'd':
continue;
case 'e':
continue;
case 'F':
continue;
case 'h':
continue;
case 'i':
continue;
case 'k':
continue;
case 'K':
continue;
case 'l':
continue;
case 'm':
continue;
case 'n':
continue;
case 'r':
continue;
case 's':
continue;
case 'u':
continue;
case 'v':
continue;
case 'w':
continue;
case 'x':
continue;
case 'S':
case 'X':
continue;
case '?':
continue;
case ':':
continue;
}
break;
}
if (error_info.errors)
{
for (i = 0; i < elementsof(www); i++)
if (www[i])
{
{
if (i == 0)
n = strlen(s);
else
break;
}
}
}
/* remaining operands are the html files to scan */
while (s = *argv++)
{
}
{
{
{
{
continue;
}
}
}
}
{
{
continue;
continue;
{
}
if (strmatch(p, "*/cgi-bin/*|*.cgi|*.html"))
{
{
}
{
else
}
}
{
{
}
}
{
{
remove(p);
}
}
{
{
}
else
{
{
}
else
{
n = 1;
else
n = -1;
}
if (n < 0)
if (n > 0)
{
}
else if (!n)
{
}
}
}
}
if (state->unreferenced)
{
/* FTS_ONEPATH: the first argument is a single path, not a vector */
if (!(fts = fts_open((char**)state->copy.data, FTS_ONEPATH|FTS_META|FTS_PHYSICAL|FTS_NOPREORDER, order)))
if ((!(fp = dtmatch(state->files, ent->fts_path)) || !(fp->flags & COPIED)) && (!state->ignore.size || !strmatch(ent->fts_path, state->ignore.data)) && (!state->limit.size || strmatch(ent->fts_path, state->limit.data)))
{
}
}
}
else if (state->unreferenced)
{
i = 0;
else
{
}
dirs[i] = 0;
if (!dtmatch(state->files, ent->fts_path) && (!strmatch(ent->fts_name, state->keep.data) || state->skip.size && strmatch(ent->fts_name, state->skip.data) || state->ignore.size && strmatch(ent->fts_path, state->ignore.data)))
{
{
{
}
}
{
*s = 0;
*s = '/';
}
}
}
else
{
{
{
}
}
}
return error_info.errors != 0;
}