crawlsrccomments.sh revision 7c2fbfb345896881c631598ee3852ce9ce33fb07
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#!/usr/bin/ksh93
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# CDDL HEADER START
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# The contents of this file are subject to the terms of the
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Common Development and Distribution License (the "License").
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# You may not use this file except in compliance with the License.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# or http://www.opensolaris.org/os/licensing.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# See the License for the specific language governing permissions
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# and limitations under the License.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# When distributing Covered Code, include this CDDL HEADER in each
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# If applicable, add the following below this CDDL HEADER, with the
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# fields enclosed by brackets "[]" replaced with your own identifying
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# information: Portions Copyright [yyyy] [name of copyright owner]
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# CDDL HEADER END
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# Use is subject to license terms.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin#
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Solaris ships non-POSIX-conformant tools in /usr/bin, so the XPG6/XPG4
# directories have to be searched first.
PATH=/usr/xpg6/bin:/usr/xpg4/bin:/bin:/usr/bin
export PATH
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
#
# If LC_ALL is set its value is copied to the individual non-numeric
# categories (including LC_TIME, so that the user's time locale is not
# silently dropped) and LC_ALL itself is unset before LC_NUMERIC is
# forced to "C".
if [[ -n "${LC_ALL}" ]] ; then
	export \
		LC_MONETARY="${LC_ALL}" \
		LC_MESSAGES="${LC_ALL}" \
		LC_COLLATE="${LC_ALL}" \
		LC_CTYPE="${LC_ALL}" \
		LC_TIME="${LC_ALL}"
	unset LC_ALL
fi
export LC_NUMERIC=C
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Read-only compound variable holding the control characters which the
# tokenizer/parser code refers to by name (e.g. "${ch.newline}").
typeset -r ch=( newline=$'\n' tab=$'\t' formfeed=$'\f' )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Print "<progname>: <message>" to stderr and terminate with exit code 1.
# All arguments are joined into the message; "progname" is read from the
# caller's scope.
function fatal_error
{
	typeset message="$*"

	print -u2 "${progname}: ${message}"
	exit 1
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Write all arguments, joined into one line, to stderr.
function printmsg
{
	typeset text="$*"

	print -u2 "${text}"
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Split an XML/HTML attribute string into an array of attributes.
#
# Arguments:
#   $1 - attribute string, e.g.: foo=bar baz="a b" huz='c d' flag
#   $2 - name of the array variable which receives the result; each
#        element is a compound variable with the member "name" and,
#        if the attribute has a value, "value" (surrounding '' or ""
#        quotes are stripped)
#
# The function calls fatal_error if more than 1000 attributes are found
# (this is an assertion to catch a parser which makes no progress).
function attrstrtoattrarray
{
#set -o xtrace
	typeset s="$1"
	nameref aa=$2		# attribute array
	integer aa_count=0
	typeset nextattr
	integer currattrlen=0	# length of the attribute consumed in the previous round
	typeset tagstr
	typeset tagval

	while (( ${#s} > 0 )) ; do
		# skip the previous attribute (first "currattrlen" characters)
		# and any whitespaces which follow it
		while [[ "${s:currattrlen:1}" == ~(E)[[:blank:][:space:]] ]] ; do
			(( currattrlen++ ))
		done
		s="${s:currattrlen:${#s}}"

		# anything left ?
		(( ${#s} == 0 )) && break

		# Pattern tests:
		#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
		#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
		#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\']*\'}"
		#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
		# All pattern combined via eregex (w|x|y|z).
		# The single-quoted alternative must stop at the next "'"
		# (and not at the next '"'), otherwise two single-quoted
		# attributes in a row are swallowed as one.
		# NOTE(review): "_-:" inside the bracket expressions reads as
		# a range from '_' to ':'; if the three literals '_', '-' and
		# ':' are meant the '-' should be moved to the end - confirm.
		nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\']*\'|[[:alnum:]_-:]*)}"
		currattrlen=$(( ${#s} - ${#nextattr}))

		# add entry
		tagstr="${s:0:currattrlen}"
		if [[ "${tagstr}" == *=* ]] ; then
			# normal case: attribute with value

			tagval="${tagstr#*=}"

			# strip quotes ('' or "")
			if [[ "${tagval}" == ~(Elr)(\'.*\'|\".*\") ]] ; then
				tagval="${tagval:1:${#tagval}-2}"
			fi

			aa[${aa_count}]=( name="${tagstr%%=*}" value="${tagval}" )
		else
			# special case for HTML where you have something like <foo baz>
			aa[${aa_count}]=( name="${tagstr}" )
		fi
		(( aa_count++ ))
		(( aa_count > 1000 )) && fatal_error "$0: aa_count too large" # assert
	done
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# XML document handler (callback for xml_tok)
#
# Arguments:
#   $1 - name of the callback table
#   $2 - token type; only "tag_comment" is acted upon
#   $3 - token value (the comment text for "tag_comment")
#   $4 - attribute string (not used here)
#
# A comment token is appended to the node array whose name is stored in
# "stack.items[stack.pos]"; the matching "<name>num" counter is bumped.
# NOTE(review): "stack" is not defined in this function - presumably it
# is provided by the caller's scope; confirm.
function handle_xml_document
{
#set -o xtrace
	typeset tag_type="${2}"
	typeset tag_value="${3}"
	typeset tag_attributes="${4}"
	nameref callbacks=${1}
	nameref doc=${callbacks["arg_tree"]}
	nameref nodepath="${stack.items[stack.pos]}"
	nameref nodesnum="${stack.items[stack.pos]}num"

	if [[ "${tag_type}" == "tag_comment" ]] ; then
		nodepath[${nodesnum}]+=(
			typeset tagtype="comment"
			typeset tagvalue="${tag_value}"
		)
		(( nodesnum++ ))
	fi

#	print "xmltok: '${tag_type}' = '${tag_value}'"
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Simple XML/HTML tokenizer.
#
# Reads the document from stdin and reports each token to the handler
# functions listed in the associative array whose name is passed as $1.
# A handler is only called if its entry is non-empty; it is invoked as
#	<handler> <callback-table-name> <event> [<value> [<attributes>]]
# for the following events:
#	document_start
#	tag_text	<text between two tags>
#	tag_comment	<comment text without "<!--" and "-->">
#	tag_begin	<tag name> <attribute string>
#	tag_end		<tag name>
#	document_end	"exit_success"
# A newline is written to stdout after the document has been processed.
function xml_tok
{
	typeset buf=""
	typeset namebuf=""
	typeset attrbuf=""
	typeset c=""
	typeset isendtag # bool: true/false
	typeset issingletag # bool: true/false (used for tags like "<br />")
	typeset comment_closed # bool: true/false (did we see the closing "-->" ?)
	nameref callbacks=${1}

	[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

	while IFS='' read -r -N 1 c ; do
		isendtag=false

		if [[ "$c" == "<" ]] ; then
			# flush any text content
			if [[ "$buf" != "" ]] ; then
				[[ ! -z "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "$buf"
				buf=""
			fi

			# first character after '<' tells whether this is an end tag
			IFS='' read -r -N 1 c
			if [[ "$c" == "/" ]] ; then
				isendtag=true
			else
				buf="$c"
			fi
			# read the remainder of the tag (the '>' itself is consumed)
			IFS='' read -r -d '>' c
			buf+="$c"

			# handle comments
			if [[ "$buf" == ~(El)!-- ]] ; then
				comment_closed=true

				# did we read the comment completely ?
				if [[ "$buf" != ~(Elr)!--.*-- ]] ; then
					# No: the '>' which stopped the read above is
					# part of the comment text. Continue until the
					# real terminator "-->" shows up and consume
					# its '>' too, otherwise it would leak into
					# the following text node.
					buf+=">"
					comment_closed=false

					while IFS='' read -r -N 1 c ; do
						buf+="$c"
						if [[ "$buf" == '!--'*'-->' ]] ; then
							buf="${buf%?}"
							comment_closed=true
							break
						fi
					done
				fi

				# strip the leading "!--" and, if the comment was
				# terminated properly, the trailing "--"
				if ${comment_closed} ; then
					buf="${buf:3:${#buf}-5}"
				else
					# EOF inside the comment: keep all of the text
					buf="${buf:3}"
				fi

				[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf}"
				buf=""
				continue
			fi

			# check if the tag starts and ends at the same time (like "<br />")
			if [[ "${buf}" == ~(Er).*/ ]] ; then
				issingletag=true
				buf="${buf%*/}"
			else
				issingletag=false
			fi

			# check if the tag has attributes (e.g. space after name)
			if [[ "$buf" == ~(E)[[:space:][:blank:]] ]] ; then
				namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
				attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
			else
				namebuf="$buf"
				attrbuf=""
			fi

			if ${isendtag} ; then
				[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
			else
				[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"

				# handle tags like <br/> (which are start- and end-tag in one piece)
				if ${issingletag} ; then
					[[ ! -z "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "$namebuf"
				fi
			fi
			buf=""
		else
			# plain text: collect until the next '<'
			buf+="$c"
		fi
	done

	[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

	print # final newline to make filters like "sed" happy
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# enumerate comments in a shell (or shell-like) script
#
# Arguments:
#   $1 - path of the file to scan
#   $2 - name of the array variable which receives the comments
#   $3 - comment limit; scanning stops as soon as the number of stored
#        comments exceeds this value
#
# Consecutive lines which start with '#' are merged into one array
# element (the leading '#' is removed and each line is terminated with
# a newline). Always returns 0.
function enumerate_comments_shell
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line=""	# local, must not leak into the caller's scope

	# The loop body runs one more time after "read" hit EOF so that
	# the last (possibly unterminated) line is processed, too.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)#.* ]] ; then
			comment+="${line#\#}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# A comment is still pending here if the file ended with a comment
	# line which was not terminated by a newline - do not lose it.
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
		comment=""
	fi

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# enumerate comments in a troff document
#
# Arguments:
#   $1 - path of the file to scan
#   $2 - name of the array variable which receives the comments
#   $3 - comment limit; scanning stops as soon as the number of stored
#        comments exceeds this value
#
# Consecutive lines which start with the troff comment introducer
# (optional dots followed by \") are merged into one array element
# (the introducer is removed and each line is terminated with a
# newline). Always returns 0.
function enumerate_comments_troff
{
	set -o errexit

	typeset input_file="$1"
	nameref comment_array="$2"
	integer max_num_comments="$3"
	integer ca=0 # index in "comment_array"

	integer res=0

	typeset comment=""
	typeset line=""	# local, must not leak into the caller's scope

	# The loop body runs one more time after "read" hit EOF so that
	# the last (possibly unterminated) line is processed, too.
	while (( res == 0 )) ; do
		IFS='' read -r line
		(( res=$? ))

		if [[ "${line}" == ~(El)\.*\\\" ]] ; then
			comment+="${line#~(El)\.*\\\"}${ch.newline}"
		else
			if [[ "$comment" != "" ]] ; then
				comment_array[ca++]="${comment}"
				comment=""

				if (( ca > max_num_comments )) ; then
					break
				fi
			fi
		fi
	done <"${input_file}"

	# A comment is still pending here if the file ended with a comment
	# line which was not terminated by a newline - do not lose it.
	if [[ "$comment" != "" ]] ; then
		comment_array[ca++]="${comment}"
		comment=""
	fi

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# enumerate comments in files which are preprocessed by
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# CPP (e.g. C, C++, Imakefile etc.)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chinfunction enumerate_comments_cpp
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin{
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin set -o errexit
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin# set -o nounset
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer err=0
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset input_file="$1"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin nameref comment_array="$2"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer max_num_comments="$3"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer max_filesize_for_scan="$4"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer ca=0 # index in "comment_array"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset content
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer content_length
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer file_pos # file position
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset line_pos=(
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer x=0 # X position in line
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer y=0 # Y position in line (line number)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset c c2
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset comment
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset state=(
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # C comment state
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset in_c_comment=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # C++ comment state
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset cxx=(
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset in_comment=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset comment_continued=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # position of current //-pos
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset comment_pos=(
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer x=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer y=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # position of previous //-pos
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset comment_prev_pos=(
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer x=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin integer y=-1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # literal state
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset in_sq_literal=false # single-quote literal
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin typeset in_dq_literal=false # double-quote literal
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin )
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin content="$(< "${input_file}")"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # Truncate file to "max_filesize_for_scan" charatcters.
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # This was originally added to work around a performance problem with
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # the ${str:offset:chunksize} operator which scales badly in ksh93
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # version 's' with the number of characters
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if (( ${#content} > max_filesize_for_scan )) ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 -f "## WARNING: File '%s' truncated to %d characters\n" \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin "${input_file}" \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin max_filesize_for_scan
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin content="${content:0:max_filesize_for_scan}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin content_length=${#content}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # Iterate through the source code. The last character
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # (when file_pos == content_length) will be empty to indicate
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # EOF (this is needed for cases like when
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # a C++ comment is not terminated by a newline... ;-/)
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin for (( file_pos=0 ; file_pos <= content_length ; file_pos++ )) ; do
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin c2="${content:file_pos:2}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin c="${c2:0:1}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c" == "${ch.newline}" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( line_pos.x=0, line_pos.y++ ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin else
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( line_pos.x++ ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.in_c_comment} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c2" == "*/" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( file_pos++, line_pos.x++ ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_c_comment=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # flush comment text
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment_array[ca++]="${comment}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment=""
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if (( ca > max_num_comments )) ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin break
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin else
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment+="$c"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif ${state.cxx.in_comment} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c" == "${ch.newline}" || "$c" == "" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.in_comment=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin # flush comment text
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.cxx.comment_continued} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment_array[ca-1]+="${ch.newline}${comment}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin else
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment_array[ca++]="${comment}"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( state.cxx.comment_prev_pos.x=state.cxx.comment_pos.x ,
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_prev_pos.y=state.cxx.comment_pos.y ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment=""
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if (( ca > max_num_comments )) ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin break
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin else
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment+="$c"
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif ${state.in_sq_literal} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_sq_literal=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif ${state.in_dq_literal} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_dq_literal=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin else
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$c2" == "/*" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( file_pos++, line_pos.x++ ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_c_comment=true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment=""
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif [[ "$c2" == "//" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( file_pos++, line_pos.x++ ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if (( state.cxx.comment_prev_pos.x == line_pos.x && \
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_continued=true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin else
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.comment_continued=false
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin (( state.cxx.comment_pos.x=line_pos.x , state.cxx.comment_pos.y=line_pos.y ))
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.cxx.in_comment=true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin comment=""
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_sq_literal=true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin state.in_dq_literal=true
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin done
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if [[ "$comment" != "" ]] ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: Comment text buffer not empty at EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin err=1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.in_c_comment} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: C comment did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin err=1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.cxx.in_comment} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: C++ comment did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin err=1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.in_dq_literal} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## ERROR: Double-quoted literal did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin err=1
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
	# We treat this one only as a warning since things like "foo.html.cpp" may
	# trigger this condition accidentally
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin if ${state.in_sq_literal} ; then
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin print -u2 "## WARNING: Single-quoted literal did not close before EOF."
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin fi
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin return $err
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Determine the format of a file.
#
# Arguments:
#   $1 - name of the file to examine
#   $2 - name of the variable which receives the format name
#
# Returns 0 and stores the format name in the variable named by $2 when
# one of the rules below matched. Returns 1 (without assigning to that
# variable) if the file is not a plain, readable file or if no rule
# matched.
function get_file_format
{
	set -o errexit

	typeset filename="$1"
	nameref file_format="$2"

	typeset fileeval # output of /usr/bin/file for "filename"

	# only plain, readable files can be classified
	[[ -f "$filename" ]] || return 1
	[[ -r "$filename" ]] || return 1

	# Ideally the format would be derived from the file's contents
	# alone. Unfortunately /usr/bin/file is unreliable (heuristics,
	# matching and output) for many formats, so the lookup is done
	# in several stages which look at the file's content where
	# possible and at its name otherwise.

	# pass one: formats for which /usr/bin/file is known to be
	# unreliable are identified by name only
	case "$filename" in
		*.[ch] | *.cpp | *.cc | *.cxx | *.hxx) file_format="c_source"  ; return 0 ;;
		*Imakefile)                            file_format="imakefile" ; return 0 ;;
		*Makefile)                             file_format="makefile"  ; return 0 ;;
	esac

	# pass two: match by file content, using the (unanchored) extended
	# regular expressions below against the output of /usr/bin/file
	fileeval="$(LC_ALL=C /usr/bin/file "$filename")"
	case "$fileeval" in
		~(E)roff)
			file_format="troff" ; return 0 ;;
		~(E)html\ document)
			file_format="html"  ; return 0 ;;
		~(E)sgml\ document)
			file_format="sgml"  ; return 0 ;;
		~(E)executable.*(shell|(/|/r|/pf)(sh|ksh|ksh93|rksh93|dtksh|tksh|bash))\ script)
			file_format="shell" ; return 0 ;;
		~(E)executable.*/perl\ script)
			file_format="perl"  ; return 0 ;;
	esac

	# pass three: fall back to matching the filename suffix
	case "$filename" in
		*.man)         file_format="troff"           ; return 0 ;;
		*.html)        file_format="html"            ; return 0 ;;
		*.sgml)        file_format="sgml"            ; return 0 ;;
		*.xml)         file_format="xml"             ; return 0 ;;
		*.png)         file_format="image_png"       ; return 0 ;;
		*.xcf)         file_format="image_xcf"       ; return 0 ;;
		*.shar)        file_format="archive_shell"   ; return 0 ;;
		*.sh)          file_format="shell"           ; return 0 ;;
		*.pcf)         file_format="font_pcf"        ; return 0 ;;
		*.bdf)         file_format="font_bdf"        ; return 0 ;;
		*.pmf)         file_format="font_pmf"        ; return 0 ;;
		*.ttf | *.otf) file_format="font_ttf"        ; return 0 ;;
		*.pfa | *.pfb) file_format="font_postscript" ; return 0 ;;
	esac

	# nothing matched
	return 1
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Create the record for one file and fill it with the file's hash sums,
# its format and (if a scanner exists for that format) its comments.
#
# Arguments:
#   $1 - name of the associative array which holds the records; the new
#        record is stored under the index "filename"
#   $2 - name of the file to process
#   $3 - maximum number of comments, handed through to the scanner
#   $4 - maximum file size for scanning, handed through to the scanner
#
# Returns 1 if get_file_format() could not identify the file (the record
# then exists with fileformat_found="false"), 0 otherwise. A failing
# scanner does not change the return code; it only leaves
# comments_parsed at "false".
function extract_comments
{
	set -o errexit

	nameref records="$1"
	typeset filename="$2"
	integer max_num_comments="$3"
	integer max_filesize_for_scan="$4"

	typeset datatype=""
	typeset scanner=""  # name of the enumerate_comments_*() function to run

	# create the record with its default values
	records[${filename}]=(
		typeset filename="$filename"

		typeset fileformat_found="false" # "true" or "false"
		typeset file_format=""

		typeset -A hashsum

		typeset comments_parsed="false" # "true" or "false"
		typeset -a comments
	)

	records[${filename}].hashsum["md5"]="$(sum -x md5 < "$filename")"
	records[${filename}].hashsum["sha1"]="$(sum -x sha1 < "$filename")"

	# without a known file format there is nothing more to do
	get_file_format "$filename" datatype || return 1
	records[${filename}].fileformat_found="true"
	records[${filename}].file_format="$datatype"

	# pick the comment scanner which handles this file format
	case "$datatype" in
		c_source|imakefile) scanner="enumerate_comments_cpp"   ;;
		shell|makefile)     scanner="enumerate_comments_shell" ;;
		troff)              scanner="enumerate_comments_troff" ;;
		# NOTE: Disabled for now
		#xml|html|sgml)     scanner="enumerate_comments_xml"   ;;
	esac

	if [[ "$scanner" != "" ]] ; then
		"$scanner" "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
			records[${filename}].comments_parsed=true
	fi

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Parse the status line and the headers of a HTTP response read from
# stdin.
#
# Arguments:
#   $1 - name of the variable which receives the result. The members
#        "statuscode" and "statusmsg" are always set on success;
#        "content_type", "content_length" and "transfer_encoding" are
#        only set if the matching header was seen.
#
# Input is consumed up to and including the empty line which ends the
# header section, i.e. the body (if any) is left unread on stdin.
#
# Returns 0 on success and 1 (with a message on stderr) if the status
# line does not start with "HTTP/" or carries no numeric status code.
function parse_http_response
{
	nameref response="$1"
	typeset h statuscode statusmsg i

	# we use '\r' as additional IFS to filter the final '\r'
	IFS=$' \t\r' read -r h statuscode statusmsg  # read HTTP/1.[01] <code>
	[[ "$h" != ~(Eil)HTTP/.* ]] && { print -u2 -f $"%s: HTTP/ header missing\n" "$0" ; return 1 ; }
	# "[0-9]+" and not "[0-9]*": the latter also matches an empty string
	# and would let a status line without any code pass this check
	[[ "$statuscode" != ~(Elr)[0-9]+ ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
	response.statuscode="$statuscode"
	response.statusmsg="$statusmsg"

	# process/skip the remaining headers
	while IFS='' read -r i ; do
		# strip '\r' at the end
		i="${i/~(Er)$'\r'/}"

		# An empty line ends the header section. Testing after the
		# '\r' was stripped accepts both "\r\n" and a bare "\n".
		[[ "$i" == "" ]] && break

		# The header name is removed via "[^:]*:" (up to the FIRST
		# colon) so that values which contain a ':' themselves are
		# not truncated.
		case "$i" in
			~(Eli)Content-Type:.*)
				response.content_type="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			~(Elri)Content-Length:[[:blank:]]*[0-9]+[[:blank:]]*)
				integer response.content_length="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
			~(Eli)Transfer-Encoding:.*)
				response.transfer_encoding="${i/~(El)[^:]*:[[:blank:]]*/}"
				;;
		esac
	done

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Copy a HTTP message body from stdin to stdout.
#
# Arguments:
#   $1 - transfer encoding; for "chunked" the chunk framing is removed,
#        any other value causes stdin to be copied unchanged
#
# Always returns 0.
function cat_http_body
{
	typeset emode="$1"
	typeset hexchunksize="0"
	integer chunksize=0

	if [[ "${emode}" == "chunked" ]] ; then
		while IFS='' read -r hexchunksize ; do
			hexchunksize="${hexchunksize//$'\r'/}"

			# The data of each chunk is followed by a line
			# terminator, which shows up here as an empty line.
			# Skip it instead of treating it as the end of the
			# body (which would drop every chunk but the first).
			[[ "${hexchunksize}" == "" ]] && continue

			# anything which is not a hexadecimal number ends the body
			[[ "${hexchunksize}" == ~(Elri)[0-9abcdef]+ ]] || break

			# a chunk size of zero marks the last chunk
			(( (chunksize=16#${hexchunksize}) > 0 )) || break

			dd bs=1 count="${chunksize}" 2>/dev/null
		done
	else
		cat
	fi

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Fetch a document via HTTP and write its body to stdout.
#
# Arguments:
#   $1 - URL in the form "protocol://host[:port]/path"
#
# If the URL carries no port number, 80 is used for "http" and the
# services database (getent) is consulted for any other protocol.
#
# Returns 0 on success and 1 (with a message on stderr) if a part of the
# URL is missing, the connection could not be opened or the response
# headers could not be parsed.
function cat_http
{
	typeset protocol="${1%://*}"
	typeset path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"

	typeset host="${path1%%/*}"
	typeset path=""
	typeset port="${host##*:}"
	typeset request # kept local so that it does not leak to the caller

	integer netfd
	typeset -C httpresponse # http response

	# Only strip the host part if there is a '/' at all - for URLs like
	# "http://foo.bat.net" the expansion "${path1#*/}" returns the host
	# name unchanged, which would then be requested as path.
	[[ "${path1}" == */* ]] && path="${path1#*/}"

	# If URL did not contain a port number in the host part then look at the
	# protocol to get the port number
	if [[ "${port}" == "${host}" ]] ; then
		case "${protocol}" in
			"http") port=80 ;;
			*)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
		esac
	else
		host="${host%:*}"
	fi

	printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

	# prechecks
	[[ "${protocol}" == "" ]] && { print -u2 -f "%s: protocol not set.\n" "$0" ; return 1 ; }
	[[ "${port}"     == "" ]] && { print -u2 -f "%s: port not set.\n"     "$0" ; return 1 ; }
	[[ "${host}"     == "" ]] && { print -u2 -f "%s: host not set.\n"     "$0" ; return 1 ; }
	[[ "${path}"     == "" ]] && { print -u2 -f "%s: path not set.\n"     "$0" ; return 1 ; }

	# open TCP channel
	redirect {netfd}<>"/dev/tcp/${host}/${port}"
	(( $? != 0 )) && { print -u2 -f "%s: Couldn't open %s\n" "$0" "${1}" ; return 1 ; }

	# send HTTP request ("print" expands the "\r\n" sequences)
	request="GET /${path} HTTP/1.1\r\n"
	request+="Host: ${host}\r\n"
	request+="User-Agent: crawlsrccomments/ksh93 (2008-06-14; $(uname -s -r -p))\r\n"
	request+="Connection: close\r\n"
	print -n -- "${request}\r\n" >&${netfd}

	# collect response and send it to stdout; give up (after closing
	# the connection) if the response headers are not usable
	if ! parse_http_response httpresponse <&${netfd} ; then
		redirect {netfd}<&-
		return 1
	fi
	cat_http_body "${httpresponse.transfer_encoding}" <&${netfd}

	# close connection
	redirect {netfd}<&-

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Print statistics about the collected file records to stdout.
#
# Takes no arguments; the records are read from the variable "records"
# of the enclosing scope (an array indexed by file name whose elements
# carry the members "comments_parsed", "fileformat_found" and
# "license_info_found").
#
# The counters are printed as one compound variable via printf "%B".
# Returns 0.
function print_stats
{
	set -o errexit

	typeset i # loop variable, kept local like in the other print functions

	# gather some statistics
	typeset stats=(
		integer files_with_comments=0
		integer files_without_comments=0

		integer files_without_known_format=0

		integer files_with_license_info=0
		integer files_without_license_info=0

		integer total_num_files=0
	)

	# Note: the counters are incremented via "+= 1" instead of "++"
	# since "(( x++ ))" has a non-zero exit status when "x" was 0,
	# which is unwanted while "errexit" is active.
	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		if "${records[$i].comments_parsed}" ; then
			(( stats.files_with_comments += 1 ))
		else
			(( stats.files_without_comments += 1 ))
		fi

		if ! "${records[$i].fileformat_found}" ; then
			(( stats.files_without_known_format += 1 ))
		fi

		# "license_info_found" is not part of a freshly created
		# record; it is only added by the print_comments_*()
		# functions, which skip files rejected by the file name
		# filters. Default to "false" instead of trying to execute
		# an empty command name.
		if "${records[$i].license_info_found:-false}" ; then
			(( stats.files_with_license_info += 1 ))
		else
			(( stats.files_without_license_info += 1 ))
		fi

		(( stats.total_num_files += 1 ))
	done

	printf "%B\n" stats
	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# Print all comments which match the comment patterns, one file after
# another (sorted by record index).
#
# Arguments:
#   $1 - name of the array which holds the file records
#   $2 - name of the options variable; the members used are
#        filepattern.accept, filepattern.reject, commentpattern.accept
#        and commentpattern.reject (shell patterns, empty == not set)
#
# A comment is printed if it matches commentpattern.accept and does not
# match commentpattern.reject. As a side effect "license_info_found" is
# set to "true" or "false" in every record which passed the file name
# filters. Returns 0.
function print_comments_plain
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j

	for i in $(printf "%s\n" "${!records[@]}" | sort) ; do
		nameref node=records[$i]

		# file name filters: skip files which do not match the "accept"
		# pattern or which match the "reject" pattern
		[[ "${options.filepattern.accept}" != "" && "${node.filename}" != ${options.filepattern.accept} ]] && continue
		[[ "${options.filepattern.reject}" != "" && "${node.filename}" == ${options.filepattern.reject} ]] && continue

		node.license_info_found=false

		# nothing to search in if no comments were extracted
		"${node.comments_parsed}" || continue

		for j in "${!node.comments[@]}" ; do
			typeset text="${node.comments[$j]}"

			# the comment must match the "accept" pattern ...
			[[ "${options.commentpattern.accept}" != "" && "$text" == ${options.commentpattern.accept} ]] || continue
			# ... and must not match the "reject" pattern
			[[ "${options.commentpattern.reject}" != "" && "$text" == ${options.commentpattern.reject} ]] && continue

			printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
			printf "%s\n" "$text"
			node.license_info_found=true
		done

		# report files in which no comment matched
		if "${node.license_info_found}" ; then
			continue
		fi
		printf "## no match found in '%s'," "${node.filename}"
		printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
			"${node.comments_parsed}" \
			"${node.fileformat_found}" \
			"${node.file_format}"
	done

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
#
# print_comments_duplicates_compressed - report matching comments with
# duplicates folded together.
#
# Arguments:
#	$1 - name of the associative array of file records.  For each record
#	     the fields filename, comments[], comments_parsed,
#	     fileformat_found, file_format and hashsum["md5"|"sha1"] are
#	     read and the field license_info_found is (re)written.
#	$2 - name of the options compound variable.  filepattern.accept,
#	     filepattern.reject, commentpattern.accept and
#	     commentpattern.reject are used as shell patterns; an empty
#	     pattern disables the corresponding filter.
#
# Output (stdout):
#	- one "## no match found in ..." line for every parsed record which
#	  passed the file filters but had no matching comment
#	- afterwards, for every distinct comment, the comment text and the
#	  list of files (name + md5/sha1 sums) it was found in
#
# Two comments count as the same comment when their lowercased text is
# equal after the normalization done below; the MD5 sum of the normalized
# text serves as comment ID.
#
# Always returns 0 (errors abort via errexit).
#
function print_comments_duplicates_compressed
{
	set -o errexit

	nameref records=$1
	nameref options=$2
	typeset i j
	typeset s match
	typeset hash			# keep the comment ID local to this function
	typeset -l hashstring		# lowercase
	typeset -A hashed_comments

	# Walk the record keys in sorted order.  The keys are read line by
	# line so that they are not subjected to word splitting or pathname
	# expansion.
	printf "%s\n" "${!records[@]}" | sort | while IFS= read -r i ; do
		# printf emits a single empty line if there are no records
		[[ "$i" != "" ]] || continue

		nameref node=records[$i]

		# file name filters
		if [[ "${options.filepattern.accept}" != "" ]] && \
		   [[ "${node.filename}" != ${options.filepattern.accept} ]] ; then
			continue
		fi
		if [[ "${options.filepattern.reject}" != "" ]] && \
		   [[ "${node.filename}" == ${options.filepattern.reject} ]] ; then
			continue
		fi

		node.license_info_found=false

		# records without parsed comments are skipped silently
		if ! "${node.comments_parsed}" ; then
			continue
		fi

		for j in "${!node.comments[@]}" ; do
			s="${node.comments[$j]}"
			match=false

			# a comment matches if it is accepted and not rejected
			if [[ "${options.commentpattern.accept}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.accept} ]] ; then
				match=true
			fi
			if [[ "${options.commentpattern.reject}" != "" ]] && \
			   [[ "$s" == ${options.commentpattern.reject} ]] ; then
				match=false
			fi

			if "${match}" ; then
				# compress the comment (e.g. convert whitespaces and '.,:;()"' to
				# ${ch.newline}, which is defined outside this function) ...
				hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
				# ... and then create a MD5 hash from this string
				hash="$(sum -x md5 <<<"${hashstring}")"

				nameref hc_node=hashed_comments[${hash}]

				if [[ "${hc_node}" == "" ]] ; then
					# build node if there isn't one yet; the first
					# occurrence supplies the text which gets printed
					typeset -a hc_node.fileids
					typeset hc_node.comment="$s"
				fi

				hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )

				node.license_info_found=true
			fi
		done

		if ! "${node.license_info_found}" ; then
			printf "## no match found in "
			printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
			printf "comments_parsed=%s, fileformat_found=%s, file_format=%s\n" \
				"${node.comments_parsed}" \
				"${node.fileformat_found}" \
				"${node.file_format}"
		fi
	done

	# print comments and all fileids (filename+hash sums) which include this comment
	for i in "${!hashed_comments[@]}" ; do
		printf "\f## The comment (ID=%s) ..." "${i}"
		printf "\n-- snip --"
		printf "\n%s" "${hashed_comments[${i}].comment}"
		printf "\n-- snip --"
		printf "\n... applies to the following files:\n"
		printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array member
	done

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
#
# do_crawl - implementation of the "crawl" sub-command.
#
# Arguments: the sub-command word followed by its options (-S numchars,
# -N numcomments; see do_crawl_usage).
#
# Reads one file name per line from stdin, hands each one to
# extract_comments and finally writes the collected records to
# "crawlsrccomments_extracted_comments.cpv" in the current directory.
# Progress messages go to stdout.
#
# Always returns 0; a failing extract_comments call is ignored on purpose
# so that one unreadable file does not stop the crawl.
#
function do_crawl
{
	set -o errexit

	typeset i	# current file name (local, do not leak it to the caller)

	typeset options=(
		integer max_filesize_for_scan=$((256*1024))
		integer max_num_comments=$((2**62)) # FIXME: This should be "+Inf" (=Infinite)
	)

	# drop the sub-command word which the caller passes as first argument
	shift
	while getopts -a "${progname}" "${do_crawl_usage}" OPT "$@" ; do
		printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			S)	options.max_filesize_for_scan="${OPTARG}"	;;
			N)	options.max_num_comments="${OPTARG}"		;;
			*)	usage do_crawl_usage				;;
		esac
	done
	shift $((OPTIND-1))

	typeset scan=(
		typeset -A records
	)

	# read filenames from stdin; IFS= and -r keep leading/trailing blanks
	# and backslashes in the names intact
	while IFS= read -r i ; do
		printf "## scanning %s ...\n" "$i"
		# NOTE(review): the limits are passed as (max_num_comments,
		# max_filesize_for_scan) - confirm that this matches the
		# parameter order expected by extract_comments.
		extract_comments scan.records "$i" "${options.max_num_comments}" "${options.max_filesize_for_scan}" || true
	done

	# print compound variable array (we strip the "typeset -A records" for now)
	printf "%B\n" scan |
		sed $'s/^#.*$//;s/^\(//;s/^\)//;s/^\ttypeset -A records=\(//;s/^\t\)//' >"crawlsrccomments_extracted_comments.cpv"

	print "# Wrote results to crawlsrccomments_extracted_comments.cpv"

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
#
# do_getcomments - implementation of the "getcomments" sub-command.
#
# Arguments: the sub-command word followed by its options (see
# do_getcomments_usage).
#
# Loads the record database (a local file or a http:// URL, optionally
# bzip2- or gzip-compressed), applies the file/comment filters and prints
# the matching comments to stdout.  Progress messages go to stderr.
#
# Temporary files are created with mktemp and collected in "tmpfiles" for
# the EXIT trap below.
#
# Returns 0 on success; errors end the script via fatal_error/errexit.
#
function do_getcomments
{
	set -o errexit

	# vars
	typeset scan=(
		typeset -A records
	)
	typeset database
	typeset tmp
	typeset filetype

	typeset options=(
		typeset database="crawlsrccomments_extracted_comments.cpv"

		typeset print_stats=false
		typeset zapduplicates=false
		typeset filepattern=(
			typeset accept="*"
			typeset reject=""
		)
		typeset commentpattern=(
			typeset accept="~(Ei)(license|copyright)"
			typeset reject=""
		)
	)

	# drop the sub-command word which the caller passes as first argument
	shift
	while getopts -a "${progname}" "${do_getcomments_usage}" OPT "$@" ; do
	#	printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
		case ${OPT} in
			c)	options.commentpattern.accept="${OPTARG}"	;;
			C)	options.commentpattern.reject="${OPTARG}"	;;
			D)	options.database="${OPTARG}"			;;
			l)	options.filepattern.accept="${OPTARG}"		;;
			L)	options.filepattern.reject="${OPTARG}"		;;
			S)	options.print_stats=true			;;
			+S)	options.print_stats=false			;;
			Z)	options.zapduplicates=true			;;
			+Z)	options.zapduplicates=false			;;
			*)	usage do_getcomments_usage			;;
		esac
	done
	shift $((OPTIND-1))

	# array of temporary files which should be cleaned-up upon exit
	# (rm -f: a file which is already gone must not abort the cleanup)
	# NOTE(review): confirm that the function-local "tmpfiles" array is
	# still visible at the time ksh93 runs this EXIT trap.
	typeset -a tmpfiles
	trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -f -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT

	# Support for HTTP URLs
	if [[ "${options.database}" == ~(El)http://.* ]] ; then
		# mktemp instead of a predictable /tmp name
		database="$(mktemp "/tmp/extract_license_cat_http_XXXXXX")" || fatal_error "Can't create temporary file."
		tmpfiles+=( "${database}" )
		print -u2 "# Loading URL..."
		cat_http "${options.database}" >"${database}"
		print -u2 "# Loading URL done."
	else
		database="${options.database}"
	fi

	if [[ ! -r "${database}" ]] ; then
		fatal_error "Can't read ${database}."
	fi

	# Support for compressed database files.
	# file(1) prefixes its answer with "<name>:"; strip that prefix so that
	# a file name containing "gzip" or "bzip2" cannot trigger a false match.
	# A failing file(1) is treated like "not compressed".
	filetype="$(LC_ALL=C /usr/bin/file "${database}")" || filetype=""
	filetype="${filetype#"${database}":}"
	case "${filetype}" in
		*bzip2*)
			tmp="$(mktemp "/tmp/extract_license_bzcat_XXXXXX")" || fatal_error "Can't create temporary file."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (bzip2) ..."
			bzcat <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
		*gzip*)
			tmp="$(mktemp "/tmp/extract_license_gunzip_XXXXXX")" || fatal_error "Can't create temporary file."
			tmpfiles+=( "${tmp}" )
			print -u2 "# Uncompressing data (gzip) ..."
			gunzip -c <"${database}" >"${tmp}"
			print -u2 "# Uncompression done."
			database="${tmp}"
			;;
	esac

	# Read compound variable which contain all recorded comments.
	# The database file lacks the enclosing parentheses (do_crawl strips
	# them), so they are added back before the data reaches "read -C".
	print -u2 "# reading records..."
	{
		printf "("
		cat "${database}"
		printf ")\n"
	} | read -C scan.records || fatal_error 'Error reading data.'
	print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"

	# print comments
	print -u2 "# processing data..."
	print "## comments start:"
	if "${options.zapduplicates}" ; then
		print_comments_duplicates_compressed scan.records options
	else
		print_comments_plain scan.records options
	fi
	print "## comments end"
	print -u2 "# processing data done."

	if "${options.print_stats}" ; then
		print_stats
	fi

	return 0
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
#
# usage - let getopts print the usage message, then terminate the script.
#
# Arguments:
#	$1 - NAME of the variable which holds the getopts option
#	     specification (e.g. do_crawl_usage), not its value
#
# Does not return; the script exits with status 2.
#
function usage
{
	typeset -n usage_spec_ref="$1"

	# restart option parsing and feed getopts the pseudo-option '-?'
	OPTIND=0
	getopts -a "${progname}" "${usage_spec_ref}" OPT '-?'

	exit 2
}
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# getopts option specification and help text of the "getcomments"
# sub-command (consumed by do_getcomments and usage).
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
	license information from the "\bgetcomments\b"-database
	file created by \bcrawl\b. The script allows various
	filters (see options below) to be applied on the database]
[+?The license extraction is done in two steps - first a crawler script
	called \bcrawl\b will scan all source files, extract
	the comments and store this information in a "database" file called
	"crawlsrccomments_extracted_comments.cpv" and then \bgetcomments\b allows
	queries on this database.]
[D:database?Database file for input (either file or http://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[Z:zapsimilar?Combine similar/duplicate comments in the report.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# getopts option specification and help text of the "crawl" sub-command
# (consumed by do_crawl and usage).
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
	Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# getopts specification and help text of the top-level command, which only
# dispatches to the "crawl" and "getcomments" sub-commands.
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2008-10-14 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
	a list of source code files from stdin, determines the type of
	syntax used by these files and then extracts
	comments from the source code and stores this information into a
	"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
	be processed by \bgetcomments\b or similar processing tools.]

[crawl|getcomments] options

[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
7c2fbfb345896881c631598ee3852ce9ce33fb07April Chin
# program start

# Ask for the builtin versions of the utilities used by this script; only
# the "sum" builtin is checked, and the script stops if it is unavailable.
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."

# exit at the first error we hit
set -o errexit

typeset progname="$(basename "${0}")"

# The top-level command has no options of its own: whatever getopts hands
# back here ends in the usage message.
while getopts -a "${progname}" "${crawlsrccomments_usage}" OPT ; do
	usage crawlsrccomments_usage
done
shift $((OPTIND-1))

# first remaining argument selects the sub-command
typeset cmd="$1"

case "${cmd}" in
	crawl | getcomments)
		# the handler is do_<sub-command>; it receives all arguments
		# including the sub-command word itself
		progname+=" ${cmd}"
		"do_${cmd}" "$@"
		exit $?
		;;
	*)
		usage crawlsrccomments_usage
		;;
esac

fatal_error "not reached."
# EOF.