#!/usr/bin/ksh93
# crawlsrccomments.sh revision 3e14f97f673e8a630f076077de35afdd43dc1587
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
#
# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script (see the example below).
if [[ "${LC_ALL}" != "" ]] ; then
export \
LC_MONETARY="${LC_ALL}" \
LC_MESSAGES="${LC_ALL}" \
LC_COLLATE="${LC_ALL}" \
LC_CTYPE="${LC_ALL}"
unset LC_ALL
fi
export LC_NUMERIC=C
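#
# Example of the problem this avoids (illustration only): with a de_DE.* locale
# in effect, a command such as
#   printf "%.2f\n" 1.5
# may print "1,50" (or even reject the argument "1.5"), which breaks any later
# arithmetic done on such output; LC_NUMERIC=C keeps '.' as the radix point.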
# constant values for tokenizer/parser stuff
compound -r ch=(
newline=$'\n'
tab=$'\t'
formfeed=$'\f'
)
function fatal_error
{
print -u2 "${progname}: $*"
exit 1
}
function printmsg
{
print -u2 "$*"
}
function attrstrtoattrarray
{
#set -o xtrace
typeset s="$1"
nameref aa=$2 # attribute array
integer aa_count=0
typeset nextattr
integer currattrlen=0
typeset tagstr
typeset tagval
while (( ${#s} > 0 )) ; do
# skip leading whitespace
while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
(( currattrlen++ ))
done
s="${s:currattrlen:${#s}}"
# anything left ?
(( ${#s} == 0 )) && break
# Pattern tests:
#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
# All pattern combined via eregex (w|x|y|z):
#x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
currattrlen=$(( ${#s} - ${#nextattr}))
# add entry
tagstr="${s:0:currattrlen}"
if [[ "${tagstr}" == *=* ]] ; then
# normal case: attribute with value
tagval="${tagstr#*=}"
# strip quotes ('' or "")
if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
tagval="${tagval:1:${#tagval}-2}"
fi
aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
else
# special case for HTML where you have something like <foo baz>
aa[${aa_count}]=( name="${tagstr}" )
fi
(( aa_count++ ))
(( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
done
}
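#
# Example (illustration only; variable names are hypothetical):
#   typeset -a attrs
#   attrstrtoattrarray 'width="100" height=200 checked' attrs
#   print -v attrs
# yields attrs[0]=( name=width value=100 ), attrs[1]=( name=height value=200 )
# and attrs[2]=( name=checked ), i.e. surrounding quotes are stripped and
# attributes without a value only get a "name" member.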
# XML document handler
function handle_xml_document
{
#set -o xtrace
nameref callbacks=${1}
typeset tag_type="${2}"
typeset tag_value="${3}"
typeset tag_attributes="${4}"
nameref doc=${callbacks["arg_tree"]}
nameref nodepath="${stack.items[stack.pos]}"
nameref nodesnum="${stack.items[stack.pos]}num"
case "${tag_type}" in
tag_comment)
nodepath[${nodesnum}]+=(
typeset tagtype="comment"
typeset tagvalue="${tag_value}"
)
(( nodesnum++ ))
;;
esac
# print "xmltok: '${tag_type}' = '${tag_value}'"
}
function xml_tok
{
typeset buf=""
typeset namebuf=""
typeset attrbuf=""
typeset c=""
typeset isendtag # bool: true/false
typeset issingletag # bool: true/false (used for tags like "<br />")
nameref callbacks=${1}
[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"
while IFS='' read -r -N 1 c ; do
isendtag=false
if [[ "$c" == "<" ]] ; then
# flush any text content
if [[ "$buf" != "" ]] ; then
[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
buf=""
fi
IFS='' read -r -N 1 c
if [[ "$c" == "/" ]] ; then
isendtag=true
else
buf="$c"
fi
IFS='' read -r -d '>' c
buf+="$c"
# handle comments
if [[ "$buf" == ~(El)!-- ]] ; then
# did we read the comment completely ?
if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
buf+=">"
while [[ "$buf" != ~(Elr)!--.*-- ]] ; do
IFS='' read -r -N 1 c || break
buf+="$c"
done
fi
[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
buf=""
continue
fi
# check if the tag starts and ends at the same time (like "<br />")
if [[ "${buf}" == ~(Er).*/ ]] ; then
issingletag=true
buf="${buf%*/}"
else
issingletag=false
fi
# check if the tag has attributes (e.g. space after name)
if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
else
namebuf="$buf"
attrbuf=""
fi
if ${isendtag} ; then
[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
else
[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"
# handle tags like <br/> (which are start- and end-tag in one piece)
if ${issingletag} ; then
[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
fi
fi
buf=""
else
buf+="$c"
fi
done
[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"
print # final newline to make filters like "sed" happy
}
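#
# Usage sketch (hypothetical, for illustration only): print every comment
# found in an XML/HTML stream read from stdin:
#   function demo_comment_cb { print -r -- "comment: ${3}" ; }
#   typeset -A demo_callbacks=( [tag_comment]="demo_comment_cb" )
#   xml_tok demo_callbacks <"input.html"
# Each callback is invoked with the callback array name, the token type and
# the token value (plus the attribute string for "tag_begin").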
# enumerate comments in a shell (or shell-like) script
function enumerate_comments_shell
{
set -o errexit
typeset input_file="$1"
nameref comment_array="$2"
integer max_num_comments="$3"
integer ca=0 # index in "comment_array"
integer res=0
typeset comment=""
while (( res == 0 )) ; do
IFS='' read -r line
(( res=$? ))
if [[ "${line}" == ~(El)#.* ]] ; then
comment+="${line#\#}${ch.newline}"
else
if [[ "$comment" != "" ]] ; then
comment_array[ca++]="${comment}"
comment=""
if (( ca > max_num_comments )) ; then
break
fi
fi
fi
done <"${input_file}"
return 0
}
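#
# Example (illustration only; names are hypothetical):
#   typeset -a sh_comments
#   enumerate_comments_shell "./mymodule.sh" sh_comments 64
#   printf "%s\n----\n" "${sh_comments[@]}"
# Each array element holds one block of consecutive '#' comment lines with
# the leading '#' stripped.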
# enumerate comments in a troff document
function enumerate_comments_troff
{
set -o errexit
typeset input_file="$1"
nameref comment_array="$2"
integer max_num_comments="$3"
integer ca=0 # index in "comment_array"
integer res=0
typeset comment=""
while (( res == 0 )) ; do
IFS='' read -r line
(( res=$? ))
if [[ "${line}" == ~(El)\.*\\\" ]] ; then
comment+="${line#~(El)\.*\\\"}${ch.newline}"
else
if [[ "$comment" != "" ]] ; then
comment_array[ca++]="${comment}"
comment=""
if (( ca > max_num_comments )) ; then
break
fi
fi
fi
done <"${input_file}"
return 0
}
# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
function enumerate_comments_cpp
{
set -o errexit
# set -o nounset
integer err=0
typeset input_file="$1"
nameref comment_array="$2"
integer max_num_comments="$3"
integer max_filesize_for_scan="$4"
integer ca=0 # index in "comment_array"
typeset content
integer content_length
integer file_pos # file position
compound line_pos=(
integer x=0 # X position in line
integer y=0 # Y position in line (line number)
)
typeset c c2
typeset comment
compound state=(
# C comment state
typeset in_c_comment=false
# C++ comment state
compound cxx=(
typeset in_comment=false
typeset comment_continued=false
# position of current //-pos
compound comment_pos=(
integer x=-1
integer y=-1
)
# position of previous //-pos
compound comment_prev_pos=(
integer x=-1
integer y=-1
)
)
# literal state
typeset in_sq_literal=false # single-quote literal
typeset in_dq_literal=false # double-quote literal
)
content="$(< "${input_file}")"
# Truncate file to "max_filesize_for_scan" characters.
# This was originally added to work around a performance problem with
# the ${str:offset:chunksize} operator, which scales badly with the number
# of characters in ksh93 version 's'.
if (( ${#content} > max_filesize_for_scan )) ; then
print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
"${input_file}" \
max_filesize_for_scan
content="${content:0:max_filesize_for_scan}"
fi
content_length=${#content}
# Iterate through the source code. The last character
# (when file_pos == content_length) will be empty to indicate
# EOF (this is needed for cases where a C++ comment is not
# terminated by a newline... ;-/)
for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
c2="${content:file_pos:2}"
c="${c2:0:1}"
if [[ "$c" == "${ch.newline}" ]] ; then
(( line_pos.x=0, line_pos.y++ ))
else
(( line_pos.x++ ))
fi
if ${state.in_c_comment} ; then
if [[ "$c2" == "*/" ]] ; then
(( file_pos++, line_pos.x++ ))
state.in_c_comment=false
# flush comment text
comment_array[ca++]="${comment}"
comment=""
if (( ca > max_num_comments )) ; then
break
fi
else
comment+="$c"
fi
elif ${state.cxx.in_comment} ; then
if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
state.cxx.in_comment=false
# flush comment text
if ${state.cxx.comment_continued} ; then
comment_array[ca-1]+="${ch.newline}${comment}"
(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
else
comment_array[ca++]="${comment}"
(( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
fi
comment=""
if (( ca > max_num_comments )) ; then
break
fi
else
comment+="$c"
fi
elif ${state.in_sq_literal} ; then
if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
state.in_sq_literal=false
fi
elif ${state.in_dq_literal} ; then
if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
state.in_dq_literal=false
fi
else
if [[ "$c2" == "/*" ]] ; then
(( file_pos++, line_pos.x++ ))
state.in_c_comment=true
comment=""
elif [[ "$c2" == "//" ]] ; then
(( file_pos++, line_pos.x++ ))
if (( state.cxx.comment_prev_pos.x == line_pos.x && \
state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
state.cxx.comment_continued=true
else
state.cxx.comment_continued=false
fi
(( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
state.cxx.in_comment=true
comment=""
elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
state.in_sq_literal=true
elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
state.in_dq_literal=true
fi
fi
done
if [[ "$comment" != "" ]] ; then
print -u2 "## ERROR: Comment text buffer not empty at EOF."
err=1
fi
if ${state.in_c_comment} ; then
print -u2 "## ERROR: C comment did not close before EOF."
err=1
fi
if ${state.cxx.in_comment} ; then
print -u2 "## ERROR: C++ comment did not close before EOF."
err=1
fi
if ${state.in_dq_literal} ; then
print -u2 "## ERROR: Double-quoted literal did not close before EOF."
err=1
fi
# We treat this one only as a warning since things like "foo.html.cpp" may
# trigger this condition accidentally
if ${state.in_sq_literal} ; then
print -u2 "## WARNING: Single-quoted literal did not close before EOF."
fi
return $err
}
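#
# Example (illustration only; names are hypothetical):
#   typeset -a c_comments
#   if enumerate_comments_cpp "foo.c" c_comments 1024 $((256*1024)) ; then
#       printf "%s\n----\n" "${c_comments[@]}"
#   fi
# Both /*...*/ and //-style comments are collected; consecutive //-comments
# which start in the same column are merged into a single entry.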
# determine file type
function get_file_format
{
set -o errexit
typeset filename="$1"
nameref file_format="$2"
typeset fileeval # evaluation result of /usr/bin/file
# check whether "filename" is a plain, readable file
[[ ! -f "$filename" ]] && return 1
[[ ! -r "$filename" ]] && return 1
# In theory this code would exclusively look at the contents of
# the file to figure out its file format - unfortunately
# /usr/bin/file is virtually useless for many file formats (its
# heuristics, matching and output are unreliable), therefore
# we have to use a multi-stage approach which looks
# at the file's content if possible and at the filename
# otherwise. Fun... ;-(
# pass one: Find matches for file formats where /usr/bin/file
# is known to be unreliable:
case "$filename" in
*.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
file_format="c_source"
return 0
;;
*Imakefile)
file_format="imakefile"
return 0
;;
*Makefile)
file_format="makefile"
return 0
;;
esac
# pass two: match by file content via /usr/bin/file
fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
case "$fileeval" in
~(E)roff)
file_format="troff"
return 0
;;
~(E)html\ document)
file_format="html"
return 0
;;
~(E)sgml\ document)
file_format="sgml"
return 0
;;
~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
file_format="shell"
return 0
;;
~(E)executable.*/perl\ script)
file_format="perl"
return 0
;;
esac
# pass three: fall back to filename matching
case "$filename" in
*.man)
file_format="troff"
return 0
;;
*.html)
file_format="html"
return 0
;;
*.sgml)
file_format="sgml"
return 0
;;
*.xml)
file_format="xml"
return 0
;;
*.png)
file_format="image_png"
return 0
;;
*.xcf)
file_format="image_xcf"
return 0
;;
*.shar)
file_format="archive_shell"
return 0
;;
*.sh)
file_format="shell"
return 0
;;
*.pcf)
file_format="font_pcf"
return 0
;;
*.bdf)
file_format="font_bdf"
return 0
;;
*.pmf)
file_format="font_pmf"
return 0
;;
*.ttf | *.otf)
file_format="font_ttf"
return 0
;;
*.pfa | *.pfb)
file_format="font_postscript"
return 0
;;
esac
return 1
}
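#
# Example (illustration only; filename is hypothetical):
#   typeset fmt
#   if get_file_format "usr/src/lib/demo.c" fmt ; then
#       print "detected format: ${fmt}"   # prints "detected format: c_source"
#   fi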
function extract_comments
{
set -o errexit
nameref records="$1"
typeset filename="$2"
integer max_num_comments="$3"
integer max_filesize_for_scan="$4"
typeset datatype=""
records[${filename}]=(
typeset filename="$filename"
typeset fileformat_found="false" # "true" or "false"
typeset file_format=""
typeset -A hashsum
typeset comments_parsed="false" # "true" or "false"
typeset -a comments
)
records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"
if get_file_format "$filename" datatype ; then
records[${filename}].fileformat_found="true"
records[${filename}].file_format="$datatype"
else
return 1
fi
case "$datatype" in
c_source|imakefile)
enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
records[${filename}].comments_parsed=true
;;
shell|makefile)
enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
records[${filename}].comments_parsed=true
;;
troff)
enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
records[${filename}].comments_parsed=true
;;
# NOTE: Disabled for now
#xml|html|sgml)
# enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
# records[${filename}].comments_parsed=true
# ;;
esac
return 0
}
# parse HTTP return code, cookies etc.
function parse_http_response
{
nameref response="$1"
typeset h statuscode statusmsg i
# we use '\r' as additional IFS to filter the final '\r'
IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
[[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
response.statuscode="$statuscode"
response.statusmsg="$statusmsg"
# skip remaining headers
while IFS='' read -r i ; do
[[ "$i" == $'\r' ]] && break
# strip '\r' at the end
i="${i/~(Er)$'\r'/}"
case "$i" in
~(Eli)Content-Type:.*)
response.content_type="${i/~(El).*:[[:blank:]]*/}"
;;
~(Eli)Content-Length:[[:blank:]]*[0-9]*)
integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
;;
~(Eli)Transfer-Encoding:.*)
response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
;;
esac
done
return 0
}
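#
# Usage sketch (illustration only): read the header block of an HTTP response
# from an already-open connection (descriptor "netfd" assumed) and inspect the
# parsed fields:
#   compound resp
#   parse_http_response resp <&${netfd}
#   print -v resp   # e.g. statuscode=200, statusmsg=OK, transfer_encoding=chunked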
function cat_http_body
{
typeset emode="$1"
typeset hexchunksize="0"
integer chunksize=0
if [[ "${emode}" == "chunked" ]] ; then
while IFS=$'\r' read hexchunksize &&
[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] &&
(( chunksize=$( printf "16#%s\n" "${hexchunksize}" ) )) && (( chunksize > 0 )) ; do
dd bs=1 count="${chunksize}" 2>/dev/null
done
else
cat
fi
return 0
}
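#
# Note on "chunked" transfer encoding (illustration only): the body arrives as
# a sequence of <hex length>CRLF<data> blocks terminated by a zero length, e.g.
#   4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n
# decodes to "Wikipedia"; the loop above reads each hex length line and then
# copies exactly that many bytes with dd.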
function cat_url
{
typeset protocol="${1%://*}"
typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"
if [[ "${protocol}" == "file" ]] ; then
cat "${path1}"
return $?
elif [[ "${protocol}" == ~(Elr)http(|s) ]] ; then
typeset host="${path1%%/*}"
typeset path="${path1#*/}"
typeset port="${host##*:}"
integer netfd
compound httpresponse # http response
# If URL did not contain a port number in the host part then look at the
# protocol to get the port number
if [[ "${port}" == "${host}" ]] ; then
case "${protocol}" in
"http") port=80 ;;
"https") port=443 ;;
*) port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
esac
else
host="${host%:*}"
fi
printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"
# prechecks
[[ "${protocol}" != "" ]] || { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
[[ "${port}" != "" ]] || { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; }
[[ "${host}" != "" ]] || { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; }
[[ "${path}" != "" ]] || { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; }
# open TCP channel
if [[ "${protocol}" == "https" ]] ; then
compound sslfifo
sslfifo.dir="$(mktemp -d)"
sslfifo.in="${sslfifo.dir}/in"
sslfifo.out="${sslfifo.dir}/out"
# register an EXIT trap and use "errexit" so we bail out at the first error
# (this saves lots of if/fi tests for error checking)
trap "rm -r \"${sslfifo.dir}\"" EXIT
set -o errexit
mkfifo "${sslfifo.in}" "${sslfifo.out}"
# create async openssl child to handle https
openssl s_client -quiet -connect "${host}:${port}" <"${sslfifo.in}" >>"${sslfifo.out}" &
# send HTTP request
request="GET /${path} HTTP/1.1\r\n"
request+="Host: ${host}\r\n"
request+="User-Agent: crawlsrccomments/ksh93(ssl) (2010-03-27; $(uname -s -r -p))\r\n"
request+="Connection: close\r\n"
print -n -- "${request}\r\n" >> "${sslfifo.in}"
# collect response and send it to stdout
{
parse_http_response httpresponse
cat_http_body "${httpresponse.transfer_encoding}"
} <"${sslfifo.out}"
wait || { print -u2 -f "%s: openssl failed.\n" "$0" ; exit 1 ; }
return 0
else
redirect {netfd}<> "/dev/tcp/${host}/${port}"
(( $? != 0 )) && { print -u2 -f "%s: Could not open %s\n" "$0" "${1}" ; return 1 ; }
# send HTTP request
request="GET /${path} HTTP/1.1\r\n"
request+="Host: ${host}\r\n"
request+="User-Agent: crawlsrccomments/ksh93 (2010-03-27; $(uname -s -r -p))\r\n"
request+="Connection: close\r\n"
print -n -- "${request}\r\n" >&${netfd}
# collect response and send it to stdout
parse_http_response httpresponse <&${netfd}
cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}
# close connection
redirect {netfd}<&-
return 0
fi
else
return 1
fi
# notreached
}
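#
# Examples (illustration only):
#   cat_url "file:///etc/release"
#   cat_url "http://www.example.com/index.html" >/tmp/index.html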
function print_stats
{
set -o errexit
nameref records=$1
typeset i
# gather some statistics
compound stats=(
integer files_with_comments=0
integer files_without_comments=0
integer files_without_known_format=0
integer files_with_license_info=0
integer files_without_license_info=0
integer total_num_files=0
)
for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
if "${records[$i].comments_parsed}" ; then
(( stats.files_with_comments++ ))
else
(( stats.files_without_comments++ ))
fi
if ! "${records[$i].fileformat_found}" ; then
(( stats.files_without_known_format++ ))
fi
if "${records[$i].license_info_found}" ; then
(( stats.files_with_license_info++ ))
else
(( stats.files_without_license_info++ ))
fi
(( stats.total_num_files++ ))
done
print -v stats
return 0
}
function print_comments_plain
{
set -o errexit
nameref records=$1
nameref options=$2
typeset i j
for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
nameref node=records[$i]
if [[ "${options.filepattern.accept}" != "" ]] && \
[[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
continue
fi
if [[ "${options.filepattern.reject}" != "" ]] && \
[[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
continue
fi
node.license_info_found=false
if ! "${node.comments_parsed}" ; then
continue
fi
for j in "${!node.comments[@]}" ; do
typeset s="${node.comments[$j]}"
typeset match=false
if [[ "${options.commentpattern.accept}" != "" ]] && \
[[ "$s" == ${options.commentpattern.accept} ]] ; then
match=true
fi
if [[ "${options.commentpattern.reject}" != "" ]] && \
[[ "$s" == ${options.commentpattern.reject} ]] ; then
match=false
fi
if "${match}" ; then
printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
printf "%s\n" "$s"
node.license_info_found=true
fi
done
if ! "${node.license_info_found}" ; then
printf "## no match found in '%s'," "${node.filename}"
printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
"${node.comments_parsed}" \
"${node.fileformat_found}" \
"${node.file_format}"
fi
done
return 0
}
function print_comments_duplicates_compressed
{
set -o errexit
nameref records=$1
nameref options=$2
typeset i j
typeset -A hashed_comments
integer num_hashed_comments
for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
nameref node=records[$i]
if [[ "${options.filepattern.accept}" != "" ]] && \
[[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
continue
fi
if [[ "${options.filepattern.reject}" != "" ]] && \
[[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
continue
fi
node.license_info_found=false
if ! "${node.comments_parsed}" ; then
continue
fi
for j in "${!node.comments[@]}" ; do
typeset s="${node.comments[$j]}"
typeset match=false
if [[ "${options.commentpattern.accept}" != "" ]] && \
[[ "$s" == ${options.commentpattern.accept} ]] ; then
match=true
fi
if [[ "${options.commentpattern.reject}" != "" ]] && \
[[ "$s" == ${options.commentpattern.reject} ]] ; then
match=false
fi
if "${match}" ; then
typeset -l hashstring # lowercase
# compress the comment (e.g. convert whitespace and '.,:;()"' characters to newlines) ...
hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
# ... and then create a MD5 hash from this string
hash="$(sum -x md5 <<<"${hashstring}")"
nameref hc_node=hashed_comments[${hash}]
if [[ "${hc_node}" == "" ]] ; then
# build node if there isn't one yet
typeset -a hc_node.fileids
typeset hc_node.comment="$s"
fi
hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )
node.license_info_found=true
fi
done
if ! "${node.license_info_found}" ; then
printf "## no match found in "
printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
"${node.comments_parsed}" \
"${node.fileformat_found}" \
"${node.file_format}"
fi
done
# print comments and all fileids (filename+hash sums) which include this comment
for i in "${!hashed_comments[@]}" ; do
printf "\f## The comment (ID=%s) ..." "${i}"
printf "\n-- snip --"
printf "\n%s" "${hashed_comments[${i}].comment}"
printf "\n-- snip --"
printf "\n... applies to the following files:\n"
printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array memeber
done
return 0
}
function do_crawl
{
set -o errexit
compound options=(
integer max_filesize_for_scan=$((256*1024))
integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
)
shift
while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
case ${OPT} in
S) options.max_filesize_for_scan="${OPTARG}" ;;
N) options.max_num_comments="${OPTARG}" ;;
*) usage do_crawl_usage ;;
esac
done
shift $((OPTIND-1))
compound scan=(
typeset -A records
)
# read filenames from stdin
while read i ; do
printf "## scanning %s ...\n" "$i"
extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
done
# print compound variable array (we strip the "typeset -A records" for now)
print -v scan >"crawlsrccomments_extracted_comments.cpv"
print "# Wrote results to crawlsrccomments_extracted_comments.cpv"
return 0
}
function do_getcomments
{
set -o errexit
# vars
compound scan
typeset database
typeset tmp
compound options=(
typeset database="crawlsrccomments_extracted_comments.cpv"
typeset print_stats=false
typeset zapduplicates=false
compound filepattern=(
typeset accept="*"
typeset reject=""
)
compound commentpattern=(
typeset accept="~(Ei)(license|copyright)"
typeset reject=""
)
)
shift
while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
case ${OPT} in
c) options.commentpattern.accept="${OPTARG}" ;;
C) options.commentpattern.reject="${OPTARG}" ;;
D) options.database="${OPTARG}" ;;
l) options.filepattern.accept="${OPTARG}" ;;
L) options.filepattern.reject="${OPTARG}" ;;
S) options.print_stats=true ;;
+S) options.print_stats=false ;;
Z) options.zapduplicates=true ;;
+Z) options.zapduplicates=false ;;
*) usage do_getcomments_usage ;;
esac
done
shift $((OPTIND-1))
# array of temporary files which should be cleaned-up upon exit
typeset -a tmpfiles
trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT
# Support for HTTP URLs
if [[ "${options.database}" == ~(El)(http|https)://.* ]] ; then
database="/tmp/extract_license_cat_url_${PPID}_$$.tmp"
tmpfiles+=( "${database}" )
print -u2 "# Loading URL..."
cat_url "${options.database}" >"${database}"
print -u2 "# Loading URL done."
else
database="${options.database}"
fi
if [[ ! -r "${database}" ]] ; then
fatal_error "Can't read ${database}."
fi
# Support for compressed database files
case "$(LC_ALL=C /usr/bin/file "${database}")" in
*bzip2*)
tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
tmpfiles+=( "${tmp}" )
print -u2 "# Uncompressing data (bzip2) ..."
bzcat <"${database}" >"${tmp}"
print -u2 "# Uncompression done."
database="${tmp}"
;;
*gzip*)
tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
tmpfiles+=( "${tmp}" )
print -u2 "# Uncompressing data (gzip) ..."
gunzip -c <"${database}" >"${tmp}"
print -u2 "# Uncompression done."
database="${tmp}"
;;
esac
# Read compound variable which contain all recorded comments
print -u2 "# reading records..."
read -C scan <"${database}" || fatal_error 'Error reading data.'
print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"
# print comments
print -u2 "# processing data..."
print "## comments start:"
if "${options.zapduplicates}" ; then
print_comments_duplicates_compressed scan.records options
else
print_comments_plain scan.records options
fi
print "## comments end"
print -u2 "# processing data done."
if "${options.print_stats}" ; then
print_stats scan.records
fi
return 0
}
function usage
{
nameref usagemsg=$1
OPTIND=0
getopts -a "${progname}" "${usagemsg}" OPT '-?'
exit 2
}
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
license information from the "\bgetcomments\b" database
file created by \bcrawl\b. The script allows various
filters (see options below) to be applied to the database.]
[+?The license extraction is done in two steps - first the crawler subcommand
\bcrawl\b scans all source files, extracts
the comments and stores this information in a "database" file called
"crawlsrccomments_extracted_comments.cpv", and then \bgetcomments\b allows
queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
a list of source code files from stdin, determines the type of
syntax used by these files, extracts
comments from the source code and stores this information in a
"database"-like file called "crawlsrccomments_extracted_comments.cpv", which can then
be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum number of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
a list of source code files from stdin, determines the type of
syntax used by these files, extracts
comments from the source code and stores this information in a
"database"-like file called "crawlsrccomments_extracted_comments.cpv", which can then
be processed by the \bgetcomments\b subcommand or similar processing tools.]
[crawl|getcomments] options
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
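# Typical usage (illustration only):
#   find usr/src -type f -name '*.[ch]' | ./crawlsrccomments.sh crawl
#   ./crawlsrccomments.sh getcomments -S -c '~(Ei)(license|copyright)' \
#       >license_report.txt
#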
# program start
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."
# exit at the first error we hit
set -o errexit
typeset progname="${ basename "${0}" ; }"
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
case ${OPT} in
*) usage crawlsrccomments_usage ;;
esac
done
shift $((OPTIND-1))
typeset cmd="$1"
case "$cmd" in
"crawl")
progname+=" ${cmd}"
do_crawl "$@"
exit $?
;;
"getcomments")
progname+=" ${cmd}"
do_getcomments "$@"
exit $?
;;
*)
usage crawlsrccomments_usage
;;
esac
fatal_error "not reached."
# EOF.