#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
#
# Make sure all math stuff runs in the "C" locale to avoid problems
# with alternative radix point representations (e.g. ',' instead of
# '.' in de_DE.*-locales). This needs to be set _before_ any
# floating-point constants are defined in this script.
if [[ "${LC_ALL}" != "" ]] ; then
export \
LC_MONETARY="${LC_ALL}" \
LC_MESSAGES="${LC_ALL}" \
LC_COLLATE="${LC_ALL}" \
LC_CTYPE="${LC_ALL}"
unset LC_ALL
fi
export LC_NUMERIC=C
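# Illustration (not part of the original logic): under a de_DE.* locale
# ksh93 may format and parse floating-point values with a ',' radix
# point, e.g. something like
#   LC_NUMERIC=de_DE.UTF-8 printf "%.1f\n" 1.5   # may print "1,5"
#   LC_NUMERIC=C           printf "%.1f\n" 1.5   # prints "1.5"
# which is why LC_NUMERIC is pinned to "C" above.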
compound -r ch=(
newline=$'\n'
tab=$'\t'
formfeed=$'\f'
)
function fatal_error
{
print -u2 "${progname}: $*"
exit 1
}
function printmsg
{
print -u2 "$*"
}
function attrstrtoattrarray
{
#set -o xtrace
typeset s="$1"
nameref aa=$2 # attribute array
integer aa_count=0
typeset nextattr
integer currattrlen=0
typeset tagstr
typeset tagval
while (( ${#s} > 0 )) ; do
# skip whitespace
done
s="${s:currattrlen:${#s}}"
# anything left ?
(( ${#s} == 0 )) && break
# Pattern tests:
#x="foo=bar huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=[^[:blank:]\"]*}"
#x='foo="ba=r o" huz=123' ; print "${x##~(E)[[:alnum:]_-:]*=\"[^\"]*\"}"
#x="foo='ba=r o' huz=123" ; print "${x##~(E)[[:alnum:]_-:]*=\'[^\"]*\'}"
#x="foox huz=123" ; print "${x##~(E)[[:alnum:]_-:]*}"
# All pattern combined via eregex (w|x|y|z):
#x='foo="bar=o" huz=123' ; print "${x##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\')}"
nextattr="${s##~(E)([[:alnum:]_-:]*=[^[:blank:]\"]*|[[:alnum:]_-:]*=\"[^\"]*\"|[[:alnum:]_-:]*=\'[^\"]*\'|[[:alnum:]_-:]*)}"
# add entry
tagstr="${s:0:currattrlen}"
if [[ "${tagstr}" == *=* ]] ; then
# normal case: attribute with value
tagval="${tagstr#*=}"
# strip quotes ('' or "")
tagval="${tagval:1:${#tagval}-2}"
fi
else
# special case for HTML where you have something like <foo baz>
fi
done
}
# XML document handler
function handle_xml_document
{
#set -o xtrace
nameref callbacks=${1}
typeset tag_type="${2}"
typeset tag_value="${3}"
typeset tag_attributes="${4}"
nameref nodepath="${stack.items[stack.pos]}"
nameref nodesnum="${stack.items[stack.pos]}num"
typeset tagtype="comment"
typeset tagvalue="${tag_value}"
)
;;
esac
# print "xmltok: '${tag_type}' = '${tag_value}'"
}
function xml_tok
{
typeset buf=""
typeset namebuf=""
typeset attrbuf=""
typeset c=""
typeset issingletag # bool: true/false (used for tags like "<br />")
nameref callbacks=${1}
[[ ! -z "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"
isendtag=false
if [[ "$c" == "<" ]] ; then
# flush any text content
buf=""
fi
if [[ "$c" == "/" ]] ; then
isendtag=true
else
buf="$c"
fi
buf+="$c"
# handle comments
# did we read the comment completely ?
buf+=">"
buf+="$c"
done
fi
[[ ! -z "${callbacks["tag_comment"]}" ]] && ${callbacks["tag_comment"]} "${1}" "tag_comment" "${buf:3:${#buf}-5}"
buf=""
continue
fi
# check if the tag starts and ends at the same time (like "<br />")
issingletag=true
buf="${buf%*/}"
else
issingletag=false
fi
# check if the tag has attributes (e.g. space after name)
namebuf="${buf%%~(E)[[:space:][:blank:]].*}"
attrbuf="${buf#~(E).*[[:space:][:blank:]]}"
else
attrbuf=""
fi
if ${isendtag} ; then
else
[[ ! -z "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "$namebuf" "$attrbuf"
# handle tags like <br/> (which are start- and end-tag in one piece)
if ${issingletag} ; then
fi
fi
buf=""
else
buf+="$c"
fi
done
[[ ! -z "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"
print # final newline to make filters like "sed" happy
}
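# Example (illustration only; the array name and the exact set of events
# wired up here are assumptions, and xml_tok is assumed to read its input
# from standard input): a callback table for xml_tok could be an
# associative array mapping event names to handler functions, e.g.
#   typeset -A xdoc_callbacks=(
#       [document_start]="handle_xml_document"
#       [document_end]="handle_xml_document"
#       [tag_begin]="handle_xml_document"
#       [tag_comment]="handle_xml_document"
#   )
#   xml_tok xdoc_callbacks <input.xml
# xml_tok only invokes handlers for events that are actually set.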
# enumerate comments in a shell (or shell-like) script
function enumerate_comments_shell
{
set -o errexit
typeset input_file="$1"
nameref comment_array="$2"
integer max_num_comments="$3"
integer ca=0 # index in "comment_array"
integer res=0
typeset comment=""
comment+="${line#\#}${ch.newline}"
else
comment_array[ca++]="${comment}"
comment=""
break
fi
fi
fi
done <"${input_file}"
return 0
}
# enumerate comments in a troff document
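# (troff comments are, roughly, lines of the form:  .\" comment text
#  - the pattern in the loop below strips the leading '.\"' and keeps
#  the comment text)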
function enumerate_comments_troff
{
set -o errexit
typeset input_file="$1"
nameref comment_array="$2"
integer max_num_comments="$3"
integer ca=0 # index in "comment_array"
integer res=0
typeset comment=""
comment+="${line#~(El)\.*\\\"}${ch.newline}"
else
comment_array[ca++]="${comment}"
comment=""
break
fi
fi
fi
done <"${input_file}"
return 0
}
# enumerate comments in files which are preprocessed by
# CPP (e.g. C, C++, Imakefile etc.)
function enumerate_comments_cpp
{
set -o errexit
# set -o nounset
integer err=0
typeset input_file="$1"
nameref comment_array="$2"
integer max_num_comments="$3"
integer max_filesize_for_scan="$4"
integer ca=0 # index in "comment_array"
typeset content
integer content_length
integer file_pos # file position
compound line_pos=(
integer x=0 # X position in line
integer y=0 # Y position in line (line number)
)
typeset c c2
typeset comment
compound state=(
# C comment state
typeset in_c_comment=false
# C++ comment state
compound cxx=(
typeset in_comment=false
typeset comment_continued=false
# position of the current '//' comment start
compound comment_pos=(
integer x=-1
integer y=-1
)
# position of the previous '//' comment start
compound comment_prev_pos=(
integer x=-1
integer y=-1
)
)
# literal state
typeset in_sq_literal=false # single-quote literal
typeset in_dq_literal=false # double-quote literal
)
# Truncate file to "max_filesize_for_scan" characters.
# This was originally added to work around a performance problem with
# the ${str:offset:chunksize} operator, which scales badly in ksh93
# version 's' with the number of characters.
if (( ${#content} > max_filesize_for_scan )) ; then
"${input_file}" \
content="${content:0:max_filesize_for_scan}"
fi
content_length=${#content}
# Iterate through the source code. The last character
# (when file_pos == content_length) will be empty to indicate
# EOF (this is needed for cases where a C++ comment is not
# terminated by a newline... ;-/)
c2="${content:file_pos:2}"
c="${c2:0:1}"
else
fi
if ${state.in_c_comment} ; then
state.in_c_comment=false
# flush comment text
comment_array[ca++]="${comment}"
comment=""
break
fi
else
comment+="$c"
fi
elif ${state.cxx.in_comment} ; then
state.cxx.in_comment=false
# flush comment text
if ${state.cxx.comment_continued} ; then
comment_array[ca-1]+="${ch.newline}${comment}"
else
comment_array[ca++]="${comment}"
fi
comment=""
break
fi
else
comment+="$c"
fi
elif ${state.in_sq_literal} ; then
if [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
state.in_sq_literal=false
fi
elif ${state.in_dq_literal} ; then
state.in_dq_literal=false
fi
else
(( file_pos++, line_pos.x++ ))
state.in_c_comment=true
comment=""
(( file_pos++, line_pos.x++ ))
if (( state.cxx.comment_prev_pos.x == line_pos.x && \
state.cxx.comment_prev_pos.y == (line_pos.y-1) )) ; then
state.cxx.comment_continued=true
else
state.cxx.comment_continued=false
fi
state.cxx.in_comment=true
comment=""
elif [[ "$c" == "'" && "${content:file_pos-1:1}" != '\' ]] ; then
state.in_sq_literal=true
elif [[ "$c" == '"' && "${content:file_pos-1:1}" != '\' ]] ; then
state.in_dq_literal=true
fi
fi
done
if [[ "$comment" != "" ]] ; then
print -u2 "## ERROR: Comment text buffer not empty at EOF."
err=1
fi
if ${state.in_c_comment} ; then
print -u2 "## ERROR: C comment did not close before EOF."
err=1
fi
if ${state.cxx.in_comment} ; then
print -u2 "## ERROR: C++ comment did not close before EOF."
err=1
fi
if ${state.in_dq_literal} ; then
print -u2 "## ERROR: Double-quoted literal did not close before EOF."
err=1
fi
# We treat this one only as a warning since things like "foo.html.cpp" may
# trigger this condition accidentally
if ${state.in_sq_literal} ; then
print -u2 "## WARNING: Single-quoted literal did not close before EOF."
fi
return $err
}
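# Example call (illustration; the variable name and the limits are
# placeholders): extract up to 1024 comments from "foo.c", scanning at
# most 256K characters:
#   typeset -a c_comments
#   enumerate_comments_cpp "foo.c" c_comments 1024 $((256*1024)) ||
#       print -u2 "comment extraction reported problems"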
# determine file type
function get_file_format
{
set -o errexit
typeset filename="$1"
nameref file_format="$2"
# check whether "filename" is a plain, readable file
[[ ! -f "$filename" ]] && return 1
[[ ! -r "$filename" ]] && return 1
# In theory this code would exclusively look at the contents of
# the file to figure out its file format - unfortunately
# content-based detection is slow (and its output unreliable) for
# many file formats and therefore we have to do a multi-stage approach
# which looks at the file's content if possible and at the filename
# otherwise. Fun... ;-(
# is known to be unreliable:
file_format="c_source"
return 0
;;
file_format="imakefile"
return 0
;;
*Makefile)
file_format="makefile"
return 0
;;
esac
file_format="troff"
return 0
;;
file_format="html"
return 0
;;
file_format="sgml"
return 0
;;
file_format="shell"
return 0
;;
file_format="perl"
return 0
;;
esac
# pass three: fallback to filename matching
*.man)
file_format="troff"
return 0
;;
*.html)
file_format="html"
return 0
;;
*.sgml)
file_format="sgml"
return 0
;;
*.xml)
file_format="xml"
return 0
;;
*.png)
file_format="image_png"
return 0
;;
*.xcf)
file_format="image_xcf"
return 0
;;
*.shar)
file_format="archive_shell"
return 0
;;
*.sh)
file_format="shell"
return 0
;;
*.pcf)
file_format="font_pcf"
return 0
;;
*.bdf)
file_format="font_bdf"
return 0
;;
*.pmf)
file_format="font_pmf"
return 0
;;
file_format="font_ttf"
return 0
;;
file_format="font_postscript"
return 0
;;
esac
return 1
}
function extract_comments
{
set -o errexit
nameref records="$1"
typeset filename="$2"
integer max_num_comments="$3"
integer max_filesize_for_scan="$4"
typeset datatype=""
typeset file_format=""
typeset -A hashsum
typeset -a comments
)
records[${filename}].fileformat_found="true"
records[${filename}].file_format="$datatype"
else
return 1
fi
enumerate_comments_cpp "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
records[${filename}].comments_parsed=true
;;
enumerate_comments_shell "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
records[${filename}].comments_parsed=true
;;
enumerate_comments_troff "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
records[${filename}].comments_parsed=true
;;
# NOTE: Disabled for now
#xml|html|sgml)
# enumerate_comments_xml "${filename}" "records[${filename}].comments" ${max_num_comments} ${max_filesize_for_scan} && \
# records[${filename}].comments_parsed=true
# ;;
esac
return 0
}
# parse HTTP return code, cookies etc.
function parse_http_response
{
nameref response="$1"
typeset h statuscode statusmsg i
# we use '\r' as additional IFS to filter the final '\r'
[[ "$statuscode" != ~(Elr)[0-9]* ]] && { print -u2 -f $"%s: invalid status code\n" "$0" ; return 1 ; }
# skip remaining headers
[[ "$i" == $'\r' ]] && break
# strip '\r' at the end
i="${i/~(Er)$'\r'/}"
response.content_type="${i/~(El).*:[[:blank:]]*/}"
;;
integer response.content_length="${i/~(El).*:[[:blank:]]*/}"
;;
response.transfer_encoding="${i/~(El).*:[[:blank:]]*/}"
;;
esac
done
return 0
}
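# Illustration of the headers parse_http_response consumes (the values
# below are made up): a response starting with
#   HTTP/1.1 200 OK
#   Content-Type: text/html
#   Content-Length: 1234
#   Transfer-Encoding: chunked
# would leave response.content_type="text/html",
# response.content_length=1234 and response.transfer_encoding="chunked".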
function cat_http_body
{
typeset emode="$1"
typeset hexchunksize="0"
integer chunksize=0
if [[ "${emode}" == "chunked" ]] ; then
done
else
fi
return 0
}
function cat_url
{
typeset protocol="${1%://*}"
if [[ "${protocol}" == "file" ]] ; then
cat "${path1}"
return $?
typeset host="${path1%%/*}"
typeset path="${path1#*/}"
typeset port="${host##*:}"
integer netfd
compound httpresponse # http response
# If the URL did not contain a port number in the host part then look at the
# protocol to get the port number
if [[ "${port}" == "${host}" ]] ; then
esac
else
host="${host%:*}"
fi
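# Illustration (assuming path1 holds the URL with the "protocol://"
# prefix already stripped): for "http://example.org:8080/dir/index.html"
# this yields protocol="http", host="example.org", port=8080 and
# path="dir/index.html".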
printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"
# prechecks
# open TCP channel
if [[ "${protocol}" == "https" ]] ; then
compound sslfifo
sslfifo.in="${sslfifo.dir}/in"
sslfifo.out="${sslfifo.dir}/out"
# register an EXIT trap and use "errexit" to bail out at the first error
set -o errexit
mkfifo "${sslfifo.in}" "${sslfifo.out}"
# create async openssl child to handle https
# send HTTP request
request="GET /${path} HTTP/1.1\r\n"
request+="Host: ${host}\r\n"
request+="Connection: close\r\n"
# collect response and send it to stdout
{
cat_http_body "${httpresponse.transfer_encoding}"
return 0
else
# send HTTP request
request="GET /${path} HTTP/1.1\r\n"
request+="Host: ${host}\r\n"
request+="Connection: close\r\n"
# collect response and send it to stdout
# close connection
redirect {netfd}<&-
return 0
fi
else
return 1
fi
# notreached
}
function print_stats
{
set -o errexit
# gather some statistics
compound stats=(
integer files_with_comments=0
integer files_without_comments=0
integer files_without_known_format=0
integer files_with_license_info=0
integer files_without_license_info=0
integer total_num_files=0
)
if "${records[$i].comments_parsed}" ; then
else
fi
if ! "${records[$i].fileformat_found}" ; then
fi
if "${records[$i].license_info_found}" ; then
else
fi
done
print -v stats
return 0
}
function print_comments_plain
{
set -o errexit
nameref records=$1
nameref options=$2
typeset i j
continue
fi
continue
fi
node.license_info_found=false
continue
fi
typeset s="${node.comments[$j]}"
typeset match=false
match=true
fi
match=false
fi
if "${match}" ; then
printf "\f#### filename='%s',\tcomment=%s\n" "${node.filename}" "$j"
printf "%s\n" "$s"
fi
done
printf "## no match found in '%s'," "${node.filename}"
"${node.file_format}"
fi
done
return 0
}
function print_comments_duplicates_compressed
{
set -o errexit
nameref records=$1
nameref options=$2
typeset i j
typeset -A hashed_comments
integer num_hashed_comments
continue
fi
continue
fi
node.license_info_found=false
continue
fi
typeset s="${node.comments[$j]}"
typeset match=false
match=true
fi
match=false
fi
if "${match}" ; then
typeset -l hashstring # lowercase
# compress the comment (e.g. convert whitespace and '.,:;()"' characters to newlines) ...
hashstring="${s//+([\n\r\t\v*#.,:;\(\)\"[:space:][:blank:]])/${ch.newline}}"
# ... and then create a MD5 hash from this string
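# Illustration (made-up input): s='Copyright (c) 2010 Foo, Inc.'
# compresses to the lowercased, newline-separated string
# 'copyright<newline>c<newline>2010<newline>foo<newline>inc<newline>',
# so trivial formatting differences do not change the hash.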
nameref hc_node=hashed_comments[${hash}]
if [[ "${hc_node}" == "" ]] ; then
# build node if there isn't one yet
typeset -a hc_node.fileids
typeset hc_node.comment="$s"
fi
hc_node.fileids+=( "$(printf "%s (md5='%s', sha1='%s')\n" "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}")" )
fi
done
printf "%s (md5='%s', sha1='%s'), " "${node.filename}" "${node.hashsum["md5"]}" "${node.hashsum["sha1"]}"
"${node.file_format}"
fi
done
# print comments and all fileids (filename+hash sums) which include this comment
printf "\f## The comment (ID=%s) ..." "${i}"
printf "\n-- snip --"
printf "\n%s" "${hashed_comments[${i}].comment}"
printf "\n-- snip --"
printf "\n... applies to the following files:\n"
printf "\t%s\n" "${hashed_comments[${i}].fileids[@]}" # printf repeats the format string for each array memeber
done
return 0
}
function do_crawl
{
set -o errexit
compound options=(
)
shift
printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
*) usage do_crawl_usage ;;
esac
done
compound scan=(
typeset -A records
)
# read filenames from stdin
while read i ; do
printf "## scanning %s ...\n" "$i"
extract_comments scan.records "$i" ${options.max_num_comments} ${options.max_filesize_for_scan} || true
done
# print compound variable array (we strip the "typeset -A records" for now)
print -v scan >"crawlsrccomments_extracted_comments.cpv"
print "# Wrote results to crawlsrccomments_extracted_comments.cpv"
return 0
}
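# Example invocation (illustration; assumes the script is installed as
# "crawlsrccomments"):
#   find src -type f -name '*.c' -print | crawlsrccomments crawl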
function do_getcomments
{
set -o errexit
# vars
compound scan
typeset database
typeset tmp
compound options=(
typeset print_stats=false
typeset zapduplicates=false
compound filepattern=(
typeset accept="*"
typeset reject=""
)
compound commentpattern=(
typeset accept="~(Ei)(license|copyright)"
typeset reject=""
)
)
shift
# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
S) options.print_stats=true ;;
+S) options.print_stats=false ;;
Z) options.zapduplicates=true ;;
+Z) options.zapduplicates=false ;;
*) usage do_getcomments_usage ;;
esac
done
# array of temporary files which should be cleaned up upon exit
typeset -a tmpfiles
trap 'set -o errexit ; print -u2 "# Cleaning up..." ; ((${#tmpfiles[@]} > 0)) && rm -- "${tmpfiles[@]}" ; print -u2 "# Done."' EXIT
# Support for HTTP URLs
database="/tmp/extract_license_cat_url_${PPID}_$$.tmp"
print -u2 "# Loading URL..."
print -u2 "# Loading URL done."
else
database="${options.database}"
fi
if [[ ! -r "${database}" ]] ; then
fatal_error "Can't read ${database}."
fi
# Support for compressed database files
*bzip2*)
tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
print -u2 "# Uncompressing data (bzip2) ..."
print -u2 "# Uncompression done."
database="${tmp}"
;;
*gzip*)
tmp="/tmp/extract_license_bzcat_${PPID}_$$.tmp"
print -u2 "# Uncompressing data (gzip) ..."
print -u2 "# Uncompression done."
database="${tmp}"
;;
esac
# Read the compound variable which contains all recorded comments
print -u2 "# reading records..."
print -u2 -f "# reading %d records done.\n" "${#scan.records[@]}"
# print comments
print -u2 "# processing data..."
print "## comments start:"
else
fi
print "## comments end"
print -u2 "# processing data done."
fi
return 0
}
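# Example invocation (illustration; assumes the script is installed as
# "crawlsrccomments"):
#   crawlsrccomments getcomments -S -c '~(Ei)copyright' \
#       -D crawlsrccomments_extracted_comments.cpv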
function usage
{
nameref usagemsg=$1
OPTIND=0
exit 2
}
typeset -r do_getcomments_usage=$'+
[-?\n@(#)\$Id: getcomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?getcomments - extract license information from source files]
[+DESCRIPTION?\bgetcomments\b is a small utility script which extracts
license information from the "\bgetcomments\b"-database
file created by \bcrawl\b. The script allows various
filters (see options below) to be applied to the database]
[+?The license extraction is done in two steps - first a crawler script
called \bcrawl\b will scan all source files, extract
the comments and store this information in a "database" file called
"crawlsrccomments_extracted_comments.cpv" and then \bextract_license\b allows
queries on this database.]
[D:database?Database file for input (either file, http:// or https://-URL).]:[database]
[l:acceptfilepattern?Process only files which match pattern.]:[pattern]
[L:rejectfilepattern?Process only files which do not match pattern.]:[pattern]
[c:acceptcommentpattern?Match comments which match pattern. Defaults to ~(Ei)(license|copyright)]:[pattern]
[C:rejectcommentpattern?Discard comments which match pattern. Defaults to ""]:[pattern]
[S:stats?Print statistics.]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
typeset -r do_crawl_usage=$'+
[-?\n@(#)\$Id: crawl (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawl - crawl comment information from source files]
[+DESCRIPTION?\bcrawl\b is a small utility script which reads
a list of source code files from stdin, determines the type of
syntax used by these files and then extracts
comments from the source code and stores this information into a
"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
be processed by \bextract_license\b or similar processing tools.]
[S:scanmaxcharacters?Scan a maximum number of numchars characters for comments.
Defaults to 256K characters.]:[numchars]
[N:maxnumcomments?Maximum numbers of comments to crawl. Defaults to "+Infinite"]:[numcomments]
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
typeset -r crawlsrccomments_usage=$'+
[-?\n@(#)\$Id: crawlsrccomments (Roland Mainz) 2010-03-27 \$\n]
[-author?Roland Mainz <roland.mainz@sun.com>]
[-author?Roland Mainz <roland.mainz@nrubsig.org>]
[+NAME?crawlsrccomments - extract and filter comment information from source files]
[+DESCRIPTION?\bcrawlsrccomments\b is a small utility script which reads
a list of source code files from stdin, determines the type of
syntax used by these files and then extracts
comments from the source code and stores this information into a
"database"-like file called "crawlsrccomments_extracted_comments.cpv" which can then
be processed by \bextract_license\b or similar processing tools.]
[crawl|getcomments] options
[+SEE ALSO?\bksh93\b(1), \bsvcprop\b(1)]
'
# program start
builtin basename
builtin cat
builtin date
builtin uname
builtin rm
builtin sum || fatal_error "sum builtin not found."
# exit at the first error we hit
set -o errexit
# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
*) usage crawlsrccomments_usage ;;
esac
done
typeset cmd="$1"
"crawl")
progname+=" ${cmd}"
do_crawl "$@"
exit $?
;;
"getcomments")
progname+=" ${cmd}"
do_getcomments "$@"
exit $?
;;
*)
;;
esac
fatal_error "not reached."
# EOF.