copyright-extractor revision 20126
#
# Script for extracting copyright and licensing information from source code
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
#
use strict;
use warnings;
use Cwd;
my $min_merge = 10;
print "copyright-extractor [options] <source directory>\n";
print "\n";
print "Options:\n";
print " -r, --raw\n";
print " Print raw comments only, do not attempt to merge,\n";
print " only unify identical comments.\n";
print " -c, --copyright-first\n";
print " Attempt to move copyright statements to the start of the\n";
print " comment block.\n";
print " Note: when using this option, there is a chance that\n";
print " lines get mixed up if a copyright statement extends to\n";
print " more than one line.\n";
print " -m n --min=n\n";
print " only merge if there are at least n consecutive identical\n";
print " -g, --gpl\n";
print " Add the disclaimer about GPLv2 to the beginning of the\n";
print " -O, --omitted\n";
print " Print a list of files that were not checked\n";
print " -h, --help\n";
print " Print this usage information\n";
print " -d n, --debug=n\n";
print " Turn on debug output.\n";
}
my %blurbs;
my $gpl_found = 0;
my @files_omitted;
my $debug = 0;
my $dumb_mode = 0;
my $copyright_first = 0;
my $gpl_disclaimer = 0;
my $print_omitted = 0;
my @dirs;
sub process_args {
my $arg = shift;
print "Try --help for usage.\n";
exit (1);
}
}
sub process_options {
}
# a very simple file type check based on the file name
# fname: the file name to classify
# Returns: one of the above contants
my $fname = shift;
if ($fname =~ /([~]$|\/(ChangeLog|configure\.in|Makefile|ltmain\.sh|README|NEWS|INSTALL|HACKING|configure$|config\.)$)/) {
# some file names to ignore
return FTYPE_IGNORE;
# some more file names to ignore
return FTYPE_IGNORE;
return FTYPE_C;
return FTYPE_PERL;
return FTYPE_PYTHON;
return FTYPE_SHELL;
return FTYPE_JAVA;
} else {
# FIXME: could do something smart here
return FTYPE_IGNORE;
}
}
# return 1 if the string includes words that suggest that the string
# is some sort of legal text. If none of these words appear in the
# string, this program will ignore it and assume that it's some other
# comment that happens to be at the beginning of the file
my $str = shift;
if ($str =~ /(licen[cs]|legal|terms|condition|copyright|rights|\(c\)|copying|distribut|gpl|gnu|source code|software|permission|authori)/) {
return 1;
}
return 0;
}
# extract the comments
my $fname = shift;
my $blurb;
my $line;
next if $line =~ /^#!/;
# not a comment line
} else {
}
}
}
next;
}
$line =~ s/^#//;
# delete certain types of comments, like emacs mode spec, etc
$line =~ s/^\s*-\*-.*-\*-\s*$//;
}
}
} else {
}
}
}
my $fname = shift;
my $blurb;
my $in_comment_block = 0;
my $line;
$line =~ s/\*\/.*//;
$in_comment_block = 0;
$line =~ s/^\/\///;
}
} else {
$line =~ s/^.*\/\//\/\//;
$line =~ s/^.*\/\*/\/\*/;
} else {
next;
}
}
$in_comment_block = 1;
$line =~ s/^\s*\/\*//;
$line =~ s/^\s*\/\///;
# add to blurb if not the start of the blurb
} else {
# end of comment block
} else {
}
}
}
next;
}
}
# delete certain types of comments, like emacs mode spec, etc
$line =~ s/^\s*-\*-.*-\*-\s*$//;
$line =~ s/^\s*\**\s*\@-type\@\s*$//;
}
}
} else {
}
}
}
my $fname = shift;
my $blurb;
my $in_comment_block = 0;
my $line;
$line =~ s/"""//;
$in_comment_block = 0;
$line =~ s/^#//;
}
} else {
$in_comment_block = 1;
$line =~ s/^\s*"""//;
$in_comment_block = 1;
$line =~ s/^\s*r"""//;
$line =~ s/^\s*"""//;
} elsif ($line eq '') {
# add to blurb if not the start of the blurb
} else {
# end of comment block
if (defined ($blurb)) {
if(is_legalese ($blurb)) {
} else {
my @b = ($blurb);
}
}
}
$blurb = undef;
next;
}
}
# delete certain types of comments, like emacs mode spec, etc
$line =~ s/^\s*-\*-.*-\*-\s*$//;
$line =~ s/^\s*vim(:\S+=\S+)+\s*$//;
$line =~ s/^\s\$Id:.*\$\s*$//;
chomp ($line);
if (defined $blurb) {
} elsif ($line ne '') {
}
$line = undef;
}
close SRCFILE;
} else {
my @b = ($blurb);
}
}
}
sub extract_comments($);
# process a directory or a file recursively: extract the comments
# from the beginning of each file and save them in %blurbs
sub extract_comments($) {
my $fname = shift;
if (-d $fname) {
# directory -> process recursively
my @thefiles= readdir(DIR);
closedir(DIR);
foreach my $f (@thefiles) {
next if $f eq '.';
next if $f eq '..';
next if $f eq '.libs';
next if $f eq 'intl';
}
} elsif (-f $fname) {
# regular file -> identify file type and read comments
return if $ftype == FTYPE_IGNORE;
if ($ftype == FTYPE_C) {
extract_comments_c ($fname);
} elsif ($ftype == FTYPE_PERL) {
extract_comments_shell ($fname);
} elsif ($ftype == FTYPE_SHELL) {
extract_comments_shell ($fname);
} elsif ($ftype == FTYPE_PYTHON) {
extract_comments_python ($fname);
} elsif ($ftype == FTYPE_JAVA) {
extract_comments_c ($fname);
}
} else {
}
}
# like uniq(1)
sub uniq (@) {
my @list = @_;
my $prev;
if (not @list) {
return @list;
}
my @uniq_list = ($prev);
foreach my $str (@list) {
push (@uniq_list, $str);
}
return @uniq_list;
}
# return the number of lines in str
sub line_count ($) {
my $str = shift;
return ($str =~ tr/\n//) + 1;
}
# return 1 if str is a member of the list, 0 otherwise
sub is_member ($@) {
my $str = shift;
my @list = @_;
foreach my $s (@list) {
if ($str eq $s) {
return 1;
}
}
return 0;
}
sub do_merge_comments ($$$$$);
# Args: references to lists of strings (lines of the texts)
#
# ml1: lines from the first text already processed
# l1: remaining lines of the 1st text
# nl1: remaining normalised lines of the 1st text
# l2: remaining lines of the 2nd text
# nl2: remaining normalised lines of the 1st text
#
# Return: list of merged lines
sub do_merge_comments ($$$$$) {
my $ml1_ref = shift;
my $l1_ref = shift;
my $nl1_ref = shift;
my $l2_ref = shift;
my $nl2_ref = shift;
my @mlines1 = @$ml1_ref;
my @nmlines1;
my @lines1 = @$l1_ref;
my @norm_lines1 = @$nl1_ref;
my @lines2 = @$l2_ref;
my @norm_lines2 = @$nl2_ref;
my @nmlines2;
my @mlines2;
my @merged_lines;
my $line1;
my $norm_line1;
my $line2;
my $norm_line2;
if ($debug > 2) {
if (@mlines1) {
foreach my $l (@mlines1) {
}
}
foreach my $l (@lines1) {
}
foreach my $l (@lines2) {
}
}
if (not @lines1) {
push (@merged_lines, @mlines1);
push (@merged_lines, @lines2);
return @merged_lines;
}
if (not @lines2) {
push (@merged_lines, @mlines1);
push (@merged_lines, @lines1);
return @merged_lines;
}
# first save the lines only appearing in lines1,
# stop at the first 2 common lines that are not empty
while (@lines1) {
$line1 = shift (@lines1);
$norm_line1 = shift (@norm_lines1);
if (($norm_line1 ne '') and
is_member ($norm_line1, @norm_lines2)) {
last;
} else {
push (@mlines1, $line1);
push (@nmlines1, $norm_line1);
}
}
# now save the lines appearing in lines2 before the common line
while (@lines2) {
$line2 = shift (@lines2);
$norm_line2 = shift (@norm_lines2);
if ($norm_line2 ne $norm_line1) {
push (@mlines2, $line2);
push (@nmlines2, $line2);
} else {
last;
}
}
my @common_lines;
my @ncommon_lines;
# now save the first common line
if ($norm_line1 eq $norm_line2) {
if ($debug > 3) {
}
@common_lines = ($line1);
@ncommon_lines = ($norm_line2);
} else {
# no common lines were found
# lines1 should be empty, all lines moved to mlines1
push (@merged_lines, @mlines1);
push (@merged_lines, @mlines2);
return @merged_lines;
}
# save all common lines
while (@lines1 and @lines2) {
$line1 = shift (@lines1);
$norm_line1 = shift (@norm_lines1);
$line2 = shift (@lines2);
$norm_line2 = shift (@norm_lines2);
if ($norm_line1 ne $norm_line2) {
if ($debug > 3) {
}
unshift (@lines1, $line1);
unshift (@norm_lines1, $norm_line1);
unshift (@lines2, $line2);
unshift (@norm_lines2, $norm_line2);
last;
} else {
if ($debug > 3) {
}
push (@common_lines, $line1);
push (@ncommon_lines, $norm_line1);
}
}
# only merge if the number of common lines is at least $min_merge
# or we are at the end of one of the texts or if at the
# beginning of the 2nd text
if (($#common_lines >= $min_merge) or
(not @lines1) or (not @lines2) or
(not @mlines2)) {
if ($debug > 1) {
foreach my $l (@common_lines) {
}
}
# first the lines from the 1st text
push (@merged_lines, @mlines1);
# then the lines from the 2nd text
push (@merged_lines, @mlines2);
# finally the common lines
push (@merged_lines, @common_lines);
} else {
# don't merge
# add the common lines to the processed part of the 1st text
push (@mlines1, @common_lines);
# add the common lines back to the unprocessed part of the 2nd text
unshift (@lines2, @common_lines);
# add the lines before the common lines back to the unprocessed
# part of the 2nd text
unshift (@lines2, @mlines2);
# add the normalised common lines back to
# the unprocessed part of the 2nd text
unshift (@norm_lines2, @ncommon_lines);
# add the normalised lines before the common lines back to
# the unprocessed part of the 2nd text
unshift (@norm_lines2, @nmlines2);
# add the normalised common lines back to
# try to merge the rest of the texts
my @more_merged_lines = do_merge_comments (\@mlines1,
\@lines1, \@norm_lines1, \@lines2, \@norm_lines2);
push (@merged_lines, @more_merged_lines);
return @merged_lines;
}
if (not @lines1) {
push (@merged_lines, @lines2);
} elsif (not @lines2) {
push (@merged_lines, @lines1);
} else {
# repeat the process for the remaining lines
my @l1;
my @more_merged_lines = do_merge_comments (\@l1,
\@lines1, \@norm_lines1, \@lines2, \@norm_lines2);
push (@merged_lines, @more_merged_lines);
}
return @merged_lines;
}
sub merge_comments ($$) {
my $str1 = shift;
my $str2 = shift;
my @lines1 = split /\n/, $str1;
my @lines2 = split /\n/, $str2;
my @norm_lines1;
my @norm_lines2;
foreach my $l0 (@lines1) {
# ignore whitespace differences
$l1 =~ s/\s+/ /g;
$l1 =~ s/^ //g;
chomp ($l1);
$l1 =~ s/ $//g;
push (@norm_lines1, $l1);
}
foreach my $l0 (@lines2) {
# ignore whitespace differences
$l2 =~ s/\s+/ /g;
$l2 =~ s/^ //g;
chomp ($l2);
$l2 =~ s/ $//g;
push (@norm_lines2, $l2);
}
my @l0;
my @merged_lines = do_merge_comments (\@l0, \@lines1, \@norm_lines1,
\@lines2, \@norm_lines2);
my $merged_str;
if ($copyright_first) {
my @copyright_lines;
my @non_cr_lines;
foreach my $line (@merged_lines) {
if ($line =~ /^\s*(copyright|\(c\)|©|author:|all rights reserved)/i) {
push (@copyright_lines, $line);
} else {
push (@non_cr_lines, $line);
}
}
@copyright_lines = sort (@copyright_lines);
@copyright_lines = uniq (@copyright_lines);
} else {
}
return $merged_str;
}
my @all_comments;
my %comments;
sub unify_comments () {
foreach my $fname (keys %blurbs) {
$gpl_found = 1;
}
} else {
}
}
@all_comments = (keys %comments);
}
}
sub smart_merge_comments () {
my @temp_all_comments = @all_comments;
@all_comments = ();
my $i = 0;
while ($i <= $#temp_all_comments) {
my $did_merge = 0;
my $c1 = $temp_all_comments[$i];
for (my $j = $i+1; $j <= $#temp_all_comments; $j++) {
my $c2 = $temp_all_comments[$j];
my $c12_merged = merge_comments ($c1, $c2);
my $c12_lc = line_count ($c12_merged);
# if more than 10 lines or more than 25% saved then
# keep the merged comments
if ($debug > 0) {
print "++++++++++++++++++++++++++++++++++++++++++++++\n";
print "==============================================\n";
}
$temp_all_comments[$j] = $c12_merged;
$did_merge = 1;
last;
}
}
if (not $did_merge) {
push (@all_comments, $c1);
}
$i++;
}
}
sub print_comments () {
if ($gpl_found and $gpl_disclaimer) {
For the avoidance of doubt, except that if any license choice other
than GPL or LGPL is available it will apply instead, Oracle elects to
use only the General Public License version 2 (GPLv2) at this time
for any software where a choice of GPL license versions is made
available with the language indicating that GPLv2 or any later
version may be used, or where a choice of which version of the GPL
is applied is otherwise unspecified.
--------------------------------------------------------------------
__EOF
}
foreach my $comment (@all_comments) {
print $comment;
print "\n\n" .
"--------------------------------------------------------------------" .
"\n\n";
}
}
sub main() {
my $srcdir;
process_options ();
if (not @dirs) {
usage();
exit (1);
}
foreach my $srcdir (@dirs) {
if ($srcdir =~ /^\./) {
$srcdir = getcwd();
}
extract_comments ($srcdir);
}
unify_comments ();
if (not $dumb_mode) {
smart_merge_comments ();
}
print_comments ();
if ($print_omitted and @files_omitted) {
foreach my $fname (@files_omitted) {
}
}
}
main();