Cross Reference: Stemmer.java
xref
: /
openjdk7
/
hotspot
/
test
/
compiler
/
7070134
/
Stemmer.java
Home
History
Annotate
Line#
Navigate
Download
Search
only in
./
2665
N/A
/**
2665
N/A
*
@test
2665
N/A
*
@bug
7070134
2665
N/A
*
@summary
Hotspot crashes with sigsegv from PorterStemmer
2665
N/A
*
2665
N/A
*
@run
shell Test7070134.sh
2665
N/A
*/
2665
N/A
2665
N/A
/*
2665
N/A
2665
N/A
Porter stemmer in Java. The original paper is in
2665
N/A
2665
N/A
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
2665
N/A
no. 3, pp 130-137,
2665
N/A
3368
N/A
http://www.tartarus.org/~martin/PorterStemmer
3368
N/A
3368
N/A
The software is completely free for any purpose, unless notes at the head
3368
N/A
of the program text indicates otherwise (which is rare). In any case,
3368
N/A
the notes about licensing are never more restrictive than the BSD License.
3368
N/A
3368
N/A
In every case where the software is not written by me (Martin Porter),
3368
N/A
this licensing arrangement has been endorsed by the contributor, and it is
3368
N/A
therefore unnecessary to ask the contributor again to confirm it.
3368
N/A
3368
N/A
I have not asked any contributors (or their employers, if they have them)
3368
N/A
for proofs that they have the right to distribute their software in this way.
2665
N/A
2665
N/A
History:
2665
N/A
2665
N/A
Release 1
2665
N/A
2665
N/A
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
2665
N/A
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
2665
N/A
is then out outside the bounds of b.
2665
N/A
2665
N/A
Release 2
2665
N/A
2665
N/A
Similarly,
2665
N/A
2665
N/A
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
2665
N/A
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
2665
N/A
b[j] is then outside the bounds of b.
2665
N/A
2665
N/A
Release 3
2665
N/A
2665
N/A
Considerably revised 4/9/00 in the light of many helpful suggestions
2665
N/A
from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
2665
N/A
2665
N/A
Release 4
2665
N/A
2665
N/A
*/
2665
N/A
2665
N/A
import
java
.
io
.*;
2665
N/A
2665
N/A
/**
2665
N/A
* Stemmer, implementing the Porter Stemming Algorithm
2665
N/A
*
2665
N/A
* The Stemmer class transforms a word into its root form. The input
2665
N/A
* word can be provided a character at time (by calling add()), or at once
2665
N/A
* by calling one of the various stem(something) methods.
2665
N/A
*/
2665
N/A
2665
N/A
class
Stemmer
2665
N/A
{
private
char
[] b;
2665
N/A
private
int
i,
/* offset into b */
2665
N/A
i_end
,
/* offset to end of stemmed word */
2665
N/A
j, k;
2665
N/A
private
static
final
int
INC
=
50
;
2665
N/A
/* unit of size whereby b is increased */
2665
N/A
public
Stemmer
()
2665
N/A
{ b =
new
char
[
INC
];
2665
N/A
i =
0
;
2665
N/A
i_end
=
0
;
2665
N/A
}
2665
N/A
2665
N/A
/**
2665
N/A
* Add a character to the word being stemmed. When you are finished
2665
N/A
* adding characters, you can call stem(void) to stem the word.
2665
N/A
*/
2665
N/A
2665
N/A
public
void
add
(
char
ch
)
2665
N/A
{
if
(i == b.
length
)
2665
N/A
{
char
[]
new_b
=
new
char
[i+
INC
];
2665
N/A
for
(
int
c =
0
; c < i; c++)
new_b
[c] = b[c];
2665
N/A
b =
new_b
;
2665
N/A
}
2665
N/A
b[i++] =
ch
;
2665
N/A
}
2665
N/A
2665
N/A
2665
N/A
/** Adds wLen characters to the word being stemmed contained in a portion
2665
N/A
* of a char[] array. This is like repeated calls of add(char ch), but
2665
N/A
* faster.
2665
N/A
*/
2665
N/A
2665
N/A
public
void
add
(
char
[] w,
int
wLen
)
2665
N/A
{
if
(i+
wLen
>= b.
length
)
2665
N/A
{
char
[]
new_b
=
new
char
[i+
wLen
+
INC
];
2665
N/A
for
(
int
c =
0
; c < i; c++)
new_b
[c] = b[c];
2665
N/A
b =
new_b
;
2665
N/A
}
2665
N/A
for
(
int
c =
0
; c <
wLen
; c++) b[i++] = w[c];
2665
N/A
}
2665
N/A
2665
N/A
/**
2665
N/A
* After a word has been stemmed, it can be retrieved by toString(),
2665
N/A
* or a reference to the internal buffer can be retrieved by getResultBuffer
2665
N/A
* and getResultLength (which is generally more efficient.)
2665
N/A
*/
2665
N/A
public
String
toString
() {
return
new
String
(b,
0
,
i_end
); }
2665
N/A
2665
N/A
/**
2665
N/A
* Returns the length of the word resulting from the stemming process.
2665
N/A
*/
2665
N/A
public
int
getResultLength
() {
return
i_end
; }
2665
N/A
2665
N/A
/**
2665
N/A
* Returns a reference to a character buffer containing the results of
2665
N/A
* the stemming process. You also need to consult getResultLength()
2665
N/A
* to determine the length of the result.
2665
N/A
*/
2665
N/A
public
char
[]
getResultBuffer
() {
return
b; }
2665
N/A
2665
N/A
/* cons(i) is true <=> b[i] is a consonant. */
2665
N/A
2665
N/A
private
final
boolean
cons
(
int
i)
2665
N/A
{
switch
(b[i])
2665
N/A
{
case
'a'
:
case
'e'
:
case
'i'
:
case
'o'
:
case
'u'
:
return
false
;
2665
N/A
case
'y'
:
return
(i==
0
) ?
true
: !
cons
(i-
1
);
2665
N/A
default
:
return
true
;
2665
N/A
}
2665
N/A
}
2665
N/A
2665
N/A
/* m() measures the number of consonant sequences between 0 and j. if c is
2665
N/A
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
2665
N/A
presence,
2665
N/A
2665
N/A
<c><v> gives 0
2665
N/A
<c>vc<v> gives 1
2665
N/A
<c>vcvc<v> gives 2
2665
N/A
<c>vcvcvc<v> gives 3
2665
N/A
....
2665
N/A
*/
2665
N/A
2665
N/A
private
final
int
m()
2665
N/A
{
int
n =
0
;
2665
N/A
int
i =
0
;
2665
N/A
while
(
true
)
2665
N/A
{
if
(i > j)
return
n;
2665
N/A
if
(!
cons
(i))
break
; i++;
2665
N/A
}
2665
N/A
i++;
2665
N/A
while
(
true
)
2665
N/A
{
while
(
true
)
2665
N/A
{
if
(i > j)
return
n;
2665
N/A
if
(
cons
(i))
break
;
2665
N/A
i++;
2665
N/A
}
2665
N/A
i++;
2665
N/A
n++;
2665
N/A
while
(
true
)
2665
N/A
{
if
(i > j)
return
n;
2665
N/A
if
(!
cons
(i))
break
;
2665
N/A
i++;
2665
N/A
}
2665
N/A
i++;
2665
N/A
}
2665
N/A
}
2665
N/A
2665
N/A
/* vowelinstem() is true <=> 0,...j contains a vowel */
2665
N/A
2665
N/A
private
final
boolean
vowelinstem
()
2665
N/A
{
int
i;
for
(i =
0
; i <= j; i++)
if
(!
cons
(i))
return
true
;
2665
N/A
return
false
;
2665
N/A
}
2665
N/A
2665
N/A
/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
2665
N/A
2665
N/A
private
final
boolean
doublec
(
int
j)
2665
N/A
{
if
(j <
1
)
return
false
;
2665
N/A
if
(b[j] != b[j-
1
])
return
false
;
2665
N/A
return
cons
(j);
2665
N/A
}
2665
N/A
2665
N/A
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
2665
N/A
and also if the second c is not w,x or y. this is used when trying to
2665
N/A
restore an e at the end of a short word. e.g.
2665
N/A
2665
N/A
cav(e), lov(e), hop(e), crim(e), but
2665
N/A
snow, box, tray.
2665
N/A
2665
N/A
*/
2665
N/A
2665
N/A
private
final
boolean
cvc
(
int
i)
2665
N/A
{
if
(i <
2
|| !
cons
(i) ||
cons
(i-
1
) || !
cons
(i-
2
))
return
false
;
2665
N/A
{
int
ch
= b[i];
2665
N/A
if
(
ch
==
'w'
||
ch
==
'x'
||
ch
==
'y'
)
return
false
;
2665
N/A
}
2665
N/A
return
true
;
2665
N/A
}
2665
N/A
2665
N/A
private
final
boolean
ends
(
String
s)
2665
N/A
{
int
l = s.
length
();
2665
N/A
int
o = k-l+
1
;
2665
N/A
if
(o <
0
)
return
false
;
2665
N/A
for
(
int
i =
0
; i < l; i++)
if
(b[o+i] != s.
charAt
(i))
return
false
;
2665
N/A
j = k-l;
2665
N/A
return
true
;
2665
N/A
}
2665
N/A
2665
N/A
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
2665
N/A
k. */
2665
N/A
2665
N/A
private
final
void
setto
(
String
s)
2665
N/A
{
int
l = s.
length
();
2665
N/A
int
o = j+
1
;
2665
N/A
for
(
int
i =
0
; i < l; i++) b[o+i] = s.
charAt
(i);
2665
N/A
k = j+l;
2665
N/A
}
2665
N/A
2665
N/A
/* r(s) is used further down. */
2665
N/A
2665
N/A
private
final
void
r(
String
s) {
if
(m() >
0
)
setto
(s); }
2665
N/A
2665
N/A
/* step1() gets rid of plurals and -ed or -ing. e.g.
2665
N/A
2665
N/A
caresses -> caress
2665
N/A
ponies -> poni
2665
N/A
ties -> ti
2665
N/A
caress -> caress
2665
N/A
cats -> cat
2665
N/A
2665
N/A
feed -> feed
2665
N/A
agreed -> agree
2665
N/A
disabled -> disable
2665
N/A
2665
N/A
matting -> mat
2665
N/A
mating -> mate
2665
N/A
meeting -> meet
2665
N/A
milling -> mill
2665
N/A
messing -> mess
2665
N/A
2665
N/A
meetings -> meet
2665
N/A
2665
N/A
*/
2665
N/A
2665
N/A
private
final
void
step1
()
2665
N/A
{
if
(b[k] ==
's'
)
2665
N/A
{
if
(
ends
(
"sses"
)) k -=
2
;
else
2665
N/A
if
(
ends
(
"ies"
))
setto
(
"i"
);
else
2665
N/A
if
(b[k-
1
] !=
's'
) k--;
2665
N/A
}
2665
N/A
if
(
ends
(
"eed"
)) {
if
(m() >
0
) k--; }
else
2665
N/A
if
((
ends
(
"ed"
) ||
ends
(
"ing"
)) &&
vowelinstem
())
2665
N/A
{ k = j;
2665
N/A
if
(
ends
(
"at"
))
setto
(
"ate"
);
else
2665
N/A
if
(
ends
(
"bl"
))
setto
(
"ble"
);
else
2665
N/A
if
(
ends
(
"iz"
))
setto
(
"ize"
);
else
2665
N/A
if
(
doublec
(k))
2665
N/A
{ k--;
2665
N/A
{
int
ch
= b[k];
2665
N/A
if
(
ch
==
'l'
||
ch
==
's'
||
ch
==
'z'
) k++;
2665
N/A
}
2665
N/A
}
2665
N/A
else
if
(m() ==
1
&&
cvc
(k))
setto
(
"e"
);
2665
N/A
}
2665
N/A
}
2665
N/A
2665
N/A
/* step2() turns terminal y to i when there is another vowel in the stem. */
2665
N/A
2665
N/A
private
final
void
step2
() {
if
(
ends
(
"y"
) &&
vowelinstem
()) b[k] =
'i'
; }
2665
N/A
2665
N/A
/* step3() maps double suffices to single ones. so -ization ( = -ize plus
2665
N/A
-ation) maps to -ize etc. note that the string before the suffix must give
2665
N/A
m() > 0. */
2665
N/A
2665
N/A
private
final
void
step3
() {
if
(k ==
0
)
return
;
/* For Bug 1 */
switch
(b[k-
1
])
2665
N/A
{
2665
N/A
case
'a'
:
if
(
ends
(
"ational"
)) { r(
"ate"
);
break
; }
2665
N/A
if
(
ends
(
"tional"
)) { r(
"tion"
);
break
; }
2665
N/A
break
;
2665
N/A
case
'c'
:
if
(
ends
(
"enci"
)) { r(
"ence"
);
break
; }
2665
N/A
if
(
ends
(
"anci"
)) { r(
"ance"
);
break
; }
2665
N/A
break
;
2665
N/A
case
'e'
:
if
(
ends
(
"izer"
)) { r(
"ize"
);
break
; }
2665
N/A
break
;
2665
N/A
case
'l'
:
if
(
ends
(
"bli"
)) { r(
"ble"
);
break
; }
2665
N/A
if
(
ends
(
"alli"
)) { r(
"al"
);
break
; }
2665
N/A
if
(
ends
(
"entli"
)) { r(
"ent"
);
break
; }
2665
N/A
if
(
ends
(
"eli"
)) { r(
"e"
);
break
; }
2665
N/A
if
(
ends
(
"ousli"
)) { r(
"ous"
);
break
; }
2665
N/A
break
;
2665
N/A
case
'o'
:
if
(
ends
(
"ization"
)) { r(
"ize"
);
break
; }
2665
N/A
if
(
ends
(
"ation"
)) { r(
"ate"
);
break
; }
2665
N/A
if
(
ends
(
"ator"
)) { r(
"ate"
);
break
; }
2665
N/A
break
;
2665
N/A
case
's'
:
if
(
ends
(
"alism"
)) { r(
"al"
);
break
; }
2665
N/A
if
(
ends
(
"iveness"
)) { r(
"ive"
);
break
; }
2665
N/A
if
(
ends
(
"fulness"
)) { r(
"ful"
);
break
; }
2665
N/A
if
(
ends
(
"ousness"
)) { r(
"ous"
);
break
; }
2665
N/A
break
;
2665
N/A
case
't'
:
if
(
ends
(
"aliti"
)) { r(
"al"
);
break
; }
2665
N/A
if
(
ends
(
"iviti"
)) { r(
"ive"
);
break
; }
2665
N/A
if
(
ends
(
"biliti"
)) { r(
"ble"
);
break
; }
2665
N/A
break
;
2665
N/A
case
'g'
:
if
(
ends
(
"logi"
)) { r(
"log"
);
break
; }
2665
N/A
} }
2665
N/A
2665
N/A
/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
2665
N/A
2665
N/A
private
final
void
step4
() {
switch
(b[k])
2665
N/A
{
2665
N/A
case
'e'
:
if
(
ends
(
"icate"
)) { r(
"ic"
);
break
; }
2665
N/A
if
(
ends
(
"ative"
)) { r(
""
);
break
; }
2665
N/A
if
(
ends
(
"alize"
)) { r(
"al"
);
break
; }
2665
N/A
break
;
2665
N/A
case
'i'
:
if
(
ends
(
"iciti"
)) { r(
"ic"
);
break
; }
2665
N/A
break
;
2665
N/A
case
'l'
:
if
(
ends
(
"ical"
)) { r(
"ic"
);
break
; }
2665
N/A
if
(
ends
(
"ful"
)) { r(
""
);
break
; }
2665
N/A
break
;
2665
N/A
case
's'
:
if
(
ends
(
"ness"
)) { r(
""
);
break
; }
2665
N/A
break
;
2665
N/A
} }
2665
N/A
2665
N/A
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
2665
N/A
2665
N/A
private
final
void
step5
()
2665
N/A
{
if
(k ==
0
)
return
;
/* for Bug 1 */
switch
(b[k-
1
])
2665
N/A
{
case
'a'
:
if
(
ends
(
"al"
))
break
;
return
;
2665
N/A
case
'c'
:
if
(
ends
(
"ance"
))
break
;
2665
N/A
if
(
ends
(
"ence"
))
break
;
return
;
2665
N/A
case
'e'
:
if
(
ends
(
"er"
))
break
;
return
;
2665
N/A
case
'i'
:
if
(
ends
(
"ic"
))
break
;
return
;
2665
N/A
case
'l'
:
if
(
ends
(
"able"
))
break
;
2665
N/A
if
(
ends
(
"ible"
))
break
;
return
;
2665
N/A
case
'n'
:
if
(
ends
(
"ant"
))
break
;
2665
N/A
if
(
ends
(
"ement"
))
break
;
2665
N/A
if
(
ends
(
"ment"
))
break
;
2665
N/A
/* element etc. not stripped before the m */
2665
N/A
if
(
ends
(
"ent"
))
break
;
return
;
2665
N/A
case
'o'
:
if
(
ends
(
"ion"
) && j >=
0
&& (b[j] ==
's'
|| b[j] ==
't'
))
break
;
2665
N/A
/* j >= 0 fixes Bug 2 */
2665
N/A
if
(
ends
(
"ou"
))
break
;
return
;
2665
N/A
/* takes care of -ous */
2665
N/A
case
's'
:
if
(
ends
(
"ism"
))
break
;
return
;
2665
N/A
case
't'
:
if
(
ends
(
"ate"
))
break
;
2665
N/A
if
(
ends
(
"iti"
))
break
;
return
;
2665
N/A
case
'u'
:
if
(
ends
(
"ous"
))
break
;
return
;
2665
N/A
case
'v'
:
if
(
ends
(
"ive"
))
break
;
return
;
2665
N/A
case
'z'
:
if
(
ends
(
"ize"
))
break
;
return
;
2665
N/A
default
:
return
;
2665
N/A
}
2665
N/A
if
(m() >
1
) k = j;
2665
N/A
}
2665
N/A
2665
N/A
/* step6() removes a final -e if m() > 1. */
2665
N/A
2665
N/A
private
final
void
step6
()
2665
N/A
{ j = k;
2665
N/A
if
(b[k] ==
'e'
)
2665
N/A
{
int
a = m();
2665
N/A
if
(a >
1
|| a ==
1
&& !
cvc
(k-
1
)) k--;
2665
N/A
}
2665
N/A
if
(b[k] ==
'l'
&&
doublec
(k) && m() >
1
) k--;
2665
N/A
}
2665
N/A
2665
N/A
/** Stem the word placed into the Stemmer buffer through calls to add().
2665
N/A
* Returns true if the stemming process resulted in a word different
2665
N/A
* from the input. You can retrieve the result with
2665
N/A
* getResultLength()/getResultBuffer() or toString().
2665
N/A
*/
2665
N/A
public
void
stem
()
2665
N/A
{ k = i -
1
;
2665
N/A
if
(k >
1
) {
step1
();
step2
();
step3
();
step4
();
step5
();
step6
(); }
2665
N/A
i_end
= k+
1
; i =
0
;
2665
N/A
}
2665
N/A
2665
N/A
/** Test program for demonstrating the Stemmer. It reads text from a
2665
N/A
* a list of files, stems each word, and writes the result to standard
2665
N/A
* output. Note that the word stemmed is expected to be in lower case:
2665
N/A
* forcing lower case must be done outside the Stemmer class.
2665
N/A
* Usage: Stemmer file-name file-name ...
2665
N/A
*/
2665
N/A
public
static
void
main
(
String
[]
args
)
2665
N/A
{
2665
N/A
char
[] w =
new
char
[
501
];
2665
N/A
Stemmer
s =
new
Stemmer
();
2665
N/A
for
(
int
i =
0
; i <
args
.
length
; i++)
2665
N/A
try
2665
N/A
{
2665
N/A
FileInputStream
in
=
new
FileInputStream
(
args
[i]);
2665
N/A
2665
N/A
try
2665
N/A
{
while
(
true
)
2665
N/A
2665
N/A
{
int
ch
=
in
.
read
();
2665
N/A
if
(
Character
.
isLetter
((
char
)
ch
))
2665
N/A
{
2665
N/A
int
j =
0
;
2665
N/A
while
(
true
)
2665
N/A
{
ch
=
Character
.
toLowerCase
((
char
)
ch
);
2665
N/A
w[j] = (
char
)
ch
;
2665
N/A
if
(j <
500
) j++;
2665
N/A
ch
=
in
.
read
();
2665
N/A
if
(!
Character
.
isLetter
((
char
)
ch
))
2665
N/A
{
2665
N/A
/* to test add(char ch) */
2665
N/A
for
(
int
c =
0
; c < j; c++) s.
add
(w[c]);
2665
N/A
2665
N/A
/* or, to test add(char[] w, int j) */
2665
N/A
/* s.add(w, j); */
2665
N/A
2665
N/A
s.
stem
();
2665
N/A
{
String
u;
2665
N/A
2665
N/A
/* and now, to test toString() : */
2665
N/A
u = s.
toString
();
2665
N/A
2665
N/A
/* to test getResultBuffer(), getResultLength() : */
2665
N/A
/* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
2665
N/A
2665
N/A
System
.
out
.
print
(u);
2665
N/A
}
2665
N/A
break
;
2665
N/A
}
2665
N/A
}
2665
N/A
}
2665
N/A
if
(
ch
<
0
)
break
;
2665
N/A
System
.
out
.
print
((
char
)
ch
);
2665
N/A
}
2665
N/A
}
2665
N/A
catch
(
IOException
e)
2665
N/A
{
System
.
out
.
println
(
"error reading "
+
args
[i]);
2665
N/A
break
;
2665
N/A
}
2665
N/A
}
2665
N/A
catch
(
FileNotFoundException
e)
2665
N/A
{
System
.
out
.
println
(
"file "
+
args
[i] +
" not found"
);
2665
N/A
break
;
2665
N/A
}
2665
N/A
}
2665
N/A
}