crawlsrccomments.sh revision 7c2fbfb345896881c631598ee3852ce9ce33fb07
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# CDDL HEADER START
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# The contents of this file are subject to the terms of the
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Common Development and Distribution License (the "License").
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# You may not use this file except in compliance with the License.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# See the License for the specific language governing permissions
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# and limitations under the License.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# When distributing Covered Code, include this CDDL HEADER in each
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# If applicable, add the following below this CDDL HEADER, with the
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# fields enclosed by brackets "[]" replaced with your own identifying
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# information: Portions Copyright [yyyy] [name of copyright owner]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# CDDL HEADER END
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Use is subject to license terms.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Solaris needs /usr/xpg6/bin:/usr/xpg4/bin because the tools in /usr/bin are not POSIX-conformant
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chinexport PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Make sure all math stuff runs in the "C" locale to avoid problems
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# with alternative radix point representations (e.g. ',' instead of
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# '.' in de_DE.*-locales). This needs to be set _before_ any
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# floating-point constants are defined in this script.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# constant values for tokenizer/parser stuff
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#set -o xtrace
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin #x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin #x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin #x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin #x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin #x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # special case for HTML where you have something like <foo baz>
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# XML document handler
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#set -o xtrace
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# print "xmltok: '${tag_type}' = '${tag_value}'"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset issingletag # bool: true/false (used for tags like "<br />")
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # check if the tag starts and ends at the same time (like "<br />")
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # check if the tag has attributes (e.g. space after name)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # handle tags like <br/> (which are start- and end-tag in one piece)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print # final newline to make filters like "sed" happy
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# enumerate comments in a shell (or shell-like) script
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# enumerate comments in a troff document
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# enumerate comments in files which are preprocessed by
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# CPP (e.g. C, C++, Imakefile etc.)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# set -o nounset
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset in_comment=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer x=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer y=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer x=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer y=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset in_sq_literal=false # single-quote literal
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset in_dq_literal=false # double-quote literal
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # Truncate file to "max_filesize_for_scan" characters.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # This was originally added to work around a performance problem with
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # the ${str:offset:chunksize} operator which scales badly in ksh93
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if (( ${#content} > max_filesize_for_scan )) ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # Iterate through the source code. The last character
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # (when file_pos == content_length) will be empty to indicate
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # a C++ comment is not terminated by a newline... ;-/)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_sq_literal=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif ${state.in_dq_literal} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_dq_literal=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( file_pos++, line_pos.x++ ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_c_comment=true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( file_pos++, line_pos.x++ ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if (( state.cxx.comment_prev_pos.x == line_pos.x && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_continued=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_dq_literal=true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$comment" != "" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: Comment text buffer not empty at EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.in_c_comment} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: C comment did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.cxx.in_comment} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: C++ comment did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.in_dq_literal} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: Double-quoted literal did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # We treat this one only as warning since things like "foo.html.cpp" may
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # trigger this condition accidentally
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.in_sq_literal} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## WARNING: Single-quoted literal did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# determine file type
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chinfunction get_file_format
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin set -o errexit
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset filename="$1"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin nameref file_format="$2"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset fileeval # evaluation result of /usr/bin/file
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # check whether "filename" is a plain, readable file
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -f "$filename" ]] && return 1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ ! -r "$filename" ]] && return 1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # In theory this code would exclusively look at the contents of
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # the file to figure out its file format - unfortunately
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # /usr/bin/file is virtually useless (the heuristics, matching
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # and output unreliable) for many file formats and therefore
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # we have to do a multi-stage approach which looks
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # at the file's content if possible and at the filename
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # pass one: Find matches for file formats where /usr/bin/file
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin *.[ch] | *.cpp | *.cc | *.cxx | *.hxx)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # pass two: match by file content via /usr/bin/file
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin ~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset fileformat_found="false" # "true" or "false"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset comments_parsed="false" # "true" or "false"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# parse HTTP return code, cookies etc.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # we use '\r' as additional IFS to filter the final '\r'
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin IFS=$' \t\r' read -r h statuscode statusmsg # read HTTP/1.[01] <code>
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin response.content_type="${i/~(El).*:[[:blank:]]*/}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${hexchunksize}" == ~(Elri)[0-9abcdef]* ]] &&
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( chunksize=16#${hexchunksize} )) && (( chunksize > 0 )) ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # If URL did not contain a port number in the host part then look at the
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin *) port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${port}" == "" ]] && { print -u2 -f "%s: port not set.\n" "$0" ; return 1 ; }
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${host}" == "" ]] && { print -u2 -f "%s: host not set.\n" "$0" ; return 1 ; }
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${path}" == "" ]] && { print -u2 -f "%s: path not set.\n" "$0" ; return 1 ; }
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin request+="User-Agent: crawlsrccomments/ksh93 (2008-06-14; $(uname -s -r -p))\r\n"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin redirect {netfd}<&-
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.filepattern.accept}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.filepattern.reject}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset match=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.commentpattern.accept}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "$s" == ${options.commentpattern.accept} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.commentpattern.reject}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "$s" == ${options.commentpattern.reject} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin printf "## no match found in '%s'," "${node.filename}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.filepattern.accept}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.filepattern.reject}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset match=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.commentpattern.accept}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "$s" == ${options.commentpattern.accept} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.commentpattern.reject}" != "" ]] && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin [[ "$s" == ${options.commentpattern.reject} ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # compress the comment (e.g. convert whitespace and '.,:;()"' to newline characters) ...
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # ... and then create a MD5 hash from this string
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin nameref hc_node=hashed_comments[${hash}]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # print comments and all fileids (filename+hash sums) which include this comment
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array memeber
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin while read i ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # print compound variable array (we strip the "typeset -A records" for now)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin sed $'s/^#.*$//;s/^\(//;s/^\)//;s/^\ttypeset -A records=\(//;s/^\t\)//' >"crawlsrccomments_extracted_comments.cpv"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print "# Wrote results to crawlsrccomments_extracted_comments.cpv"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset database="crawlsrccomments_extracted_comments.cpv"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # array of temporary files which should be cleaned-up upon exit
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "${options.database}" == ~(El)http://.* ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin database="/tmp/extract_license_cat_http_${PPID}_$$.tmp"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin case "$(LC_ALL=C /usr/bin/file "${database}")" in
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # Read compound variable which contain all recorded comments
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin } | read -C scan.records || fatal_error 'Error reading data.'
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print_comments_duplicates_compressed scan.records options
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[-?\n@(#)\$Id: getcomments (Roland Mainz) 2008-10-14 \$\n]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[-author?Roland Mainz <roland.mainz@sun.com>]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+NAME?getcomments - extract license information from source files]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+DESCRIPTION?\bgetcomments\b is a small utilty script which extracts
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin license information from the "\bgetcomments\b"-database
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin file created by \bcrawl\b. The script allows various
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin filters (see options below) to be applied on the database]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+?The license extraction is done in two steps - first a crawler script
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin called \bcrawl\b will scan all source files, extract
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin the comments and stores this information in a "database" file called
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin "crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin queries on this database.]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[D:database?Database file for input (either file or http://-URL).]:[database]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[S:stats?Print statistics.]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[Z:zapsimilar?Combine similar/duplicate comments in the report.]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[-?\n@(#)\$Id: crawl (Roland Mainz) 2008-10-14 \$\n]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[-author?Roland Mainz <roland.mainz@sun.com>]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+NAME?crawl - crawl comment information from source files]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+DESCRIPTION?\bcrawl\b is a small utilty script which reads
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin a list of source code files from stdin, determinates the type of
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin syntax used by these files and then extracts
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comments from the source code and stores this information into a
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin "database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin be processed by \bextract_license\b or similar processing tools.]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin Defaults to 256K characters.]:[numchars]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2008-10-14 \$\n]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[-author?Roland Mainz <roland.mainz@sun.com>]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+NAME?crawlsrccomments - extract and filter comment information from source files]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+DESCRIPTION?\bcrawlsrccomments\b is a small utilty script which reads
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin a list of source code files from stdin, determinates the type of
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin syntax used by these files and then extracts
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comments from the source code and stores this information into a
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin "database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin be processed by \bextract_license\b or similar processing tools.]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[crawl|getcomments] options
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# program start
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chinbuiltin sum || fatal_error "sum builtin not found."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# exit at the first error we hit
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chinwhile getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do