/**
* @test
* @bug 7070134
* @summary Hotspot crashes with sigsegv from PorterStemmer
*
* @run shell Test7070134.sh
*/
/*
Porter stemmer in Java. The original paper is in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,
The software is completely free for any purpose, unless notes at the head
of the program text indicates otherwise (which is rare). In any case,
the notes about licensing are never more restrictive than the BSD License.
In every case where the software is not written by me (Martin Porter),
this licensing arrangement has been endorsed by the contributor, and it is
therefore unnecessary to ask the contributor again to confirm it.
I have not asked any contributors (or their employers, if they have them)
for proofs that they have the right to distribute their software in this way.
History:
Release 1
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then out outside the bounds of b.
Release 2
Similarly,
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.
Release 3
Considerably revised 4/9/00 in the light of many helpful suggestions
from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
Release 4
*/
/**
* Stemmer, implementing the Porter Stemming Algorithm
*
* The Stemmer class transforms a word into its root form. The input
* word can be provided a character at time (by calling add()), or at once
* by calling one of the various stem(something) methods.
*/
class Stemmer
{ private char[] b;
private int i, /* offset into b */
j, k;
/* unit of size whereby b is increased */
public Stemmer()
{ b = new char[INC];
i = 0;
i_end = 0;
}
/**
* Add a character to the word being stemmed. When you are finished
* adding characters, you can call stem(void) to stem the word.
*/
{ if (i == b.length)
for (int c = 0; c < i; c++) new_b[c] = b[c];
b = new_b;
}
b[i++] = ch;
}
/** Adds wLen characters to the word being stemmed contained in a portion
* of a char[] array. This is like repeated calls of add(char ch), but
* faster.
*/
for (int c = 0; c < i; c++) new_b[c] = b[c];
b = new_b;
}
for (int c = 0; c < wLen; c++) b[i++] = w[c];
}
/**
* After a word has been stemmed, it can be retrieved by toString(),
* or a reference to the internal buffer can be retrieved by getResultBuffer
* and getResultLength (which is generally more efficient.)
*/
/**
* Returns the length of the word resulting from the stemming process.
*/
/**
* Returns a reference to a character buffer containing the results of
* the stemming process. You also need to consult getResultLength()
* to determine the length of the result.
*/
public char[] getResultBuffer() { return b; }
/* cons(i) is true <=> b[i] is a consonant. */
private final boolean cons(int i)
{ switch (b[i])
{ case 'a': case 'e': case 'i': case 'o': case 'u': return false;
default: return true;
}
}
/* m() measures the number of consonant sequences between 0 and j. if c is
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
presence,
<c><v> gives 0
<c>vc<v> gives 1
<c>vcvc<v> gives 2
<c>vcvcvc<v> gives 3
....
*/
private final int m()
{ int n = 0;
int i = 0;
while(true)
{ if (i > j) return n;
if (! cons(i)) break; i++;
}
i++;
while(true)
{ while(true)
{ if (i > j) return n;
if (cons(i)) break;
i++;
}
i++;
n++;
while(true)
{ if (i > j) return n;
if (! cons(i)) break;
i++;
}
i++;
}
}
/* vowelinstem() is true <=> 0,...j contains a vowel */
private final boolean vowelinstem()
{ int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
return false;
}
/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
private final boolean doublec(int j)
{ if (j < 1) return false;
if (b[j] != b[j-1]) return false;
return cons(j);
}
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
and also if the second c is not w,x or y. this is used when trying to
restore an e at the end of a short word. e.g.
cav(e), lov(e), hop(e), crim(e), but
snow, box, tray.
*/
private final boolean cvc(int i)
{ int ch = b[i];
}
return true;
}
{ int l = s.length();
int o = k-l+1;
if (o < 0) return false;
for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
j = k-l;
return true;
}
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
k. */
{ int l = s.length();
int o = j+1;
for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
k = j+l;
}
/* r(s) is used further down. */
/* step1() gets rid of plurals and -ed or -ing. e.g.
caresses -> caress
ponies -> poni
ties -> ti
caress -> caress
cats -> cat
feed -> feed
agreed -> agree
disabled -> disable
matting -> mat
mating -> mate
meeting -> meet
milling -> mill
messing -> mess
meetings -> meet
*/
private final void step1()
{ if (b[k] == 's')
if (b[k-1] != 's') k--;
}
{ k = j;
if (doublec(k))
{ k--;
{ int ch = b[k];
}
}
}
}
/* step2() turns terminal y to i when there is another vowel in the stem. */
/* step3() maps double suffices to single ones. so -ization ( = -ize plus
-ation) maps to -ize etc. note that the string before the suffix must give
m() > 0. */
{
break;
break;
break;
break;
break;
break;
break;
} }
/* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
private final void step4() { switch (b[k])
{
break;
break;
break;
break;
} }
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
private final void step5()
{ if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
if (ends("ence")) break; return;
if (ends("ible")) break; return;
if (ends("ement")) break;
if (ends("ment")) break;
/* element etc. not stripped before the m */
if (ends("ent")) break; return;
/* j >= 0 fixes Bug 2 */
if (ends("ou")) break; return;
/* takes care of -ous */
if (ends("iti")) break; return;
default: return;
}
if (m() > 1) k = j;
}
/* step6() removes a final -e if m() > 1. */
private final void step6()
{ j = k;
if (b[k] == 'e')
{ int a = m();
}
}
/** Stem the word placed into the Stemmer buffer through calls to add().
* Returns true if the stemming process resulted in a word different
* from the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public void stem()
{ k = i - 1;
}
/** Test program for demonstrating the Stemmer. It reads text from a
* a list of files, stems each word, and writes the result to standard
* output. Note that the word stemmed is expected to be in lower case:
* forcing lower case must be done outside the Stemmer class.
* Usage: Stemmer file-name file-name ...
*/
{
char[] w = new char[501];
try
{
try
{ while(true)
{
int j = 0;
while(true)
w[j] = (char) ch;
if (j < 500) j++;
{
/* to test add(char ch) */
for (int c = 0; c < j; c++) s.add(w[c]);
/* or, to test add(char[] w, int j) */
/* s.add(w, j); */
s.stem();
{ String u;
/* and now, to test toString() : */
u = s.toString();
/* to test getResultBuffer(), getResultLength() : */
/* u = new String(s.getResultBuffer(), 0, s.getResultLength()); */
}
break;
}
}
}
if (ch < 0) break;
}
}
catch (IOException e)
break;
}
}
catch (FileNotFoundException e)
break;
}
}
}