DoubleMetaphoneApproximateMatchingRule.java revision 00870f7dc92907ec0b59c43b783cde7367071ff0
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at
* trunk/opends/resource/legal-notices/OpenDS.LICENSE. If applicable,
* add the following below this CDDL HEADER, with the fields enclosed
* by brackets "[]" replaced with your own identifying information:
* Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
*
* Portions Copyright 2006-2007 Sun Microsystems, Inc.
*/
/**
* This class defines an approximate matching rule based on the Double Metaphone
* algorithm. The Metaphone and Double Metaphone algorithms were originally
* devised by Lawrence Philips (published in the December 1990 issue of
* <I>Computer Language</I> and the
* <A HREF="http://www.cuj.com/documents/s=8038/cuj0006philips/">June 2000 issue
* of <I>C/C++ Users Journal</I></A>, respectively), and this version of the
* algorithm is based on a version modified by Kevin Atkinson to include
* bugfixes and additional functionality (source is available
* <A HREF="http://aspell.net/metaphone/dmetaph.cpp">here</A> and additional
* Metaphone and Double Metaphone information is available at
* <A HREF="http://aspell.net/metaphone/">http://aspell.net/metaphone/</A>).
* This implementation is largely the same as the one provided by Kevin
* Atkinson, but it has been re-written for better readability, for more
* efficiency, to get rid of checks for conditions that can't possibly happen,
* and to get rid of redundant checks that aren't needed. It has also been
* updated to always only generate a single value rather than one or possibly
* two values.
*/
public class DoubleMetaphoneApproximateMatchingRule
extends ApproximateMatchingRule
{
/**
* The tracer object for the debug logger.
*/
/**
* Creates a new instance of this double metaphone approximate matching rule.
*/
{
super();
}
/**
* {@inheritDoc}
*/
{
// No initialization is required.
}
/**
* Retrieves the common name for this matching rule.
*
* @return The common name for this matching rule, or <CODE>null</CODE> if
* it does not have a name.
*/
{
return AMR_DOUBLE_METAPHONE_NAME;
}
/**
* Retrieves the OID for this matching rule.
*
* @return The OID for this matching rule.
*/
{
return AMR_DOUBLE_METAPHONE_OID;
}
/**
* Retrieves the description for this matching rule.
*
* @return The description for this matching rule, or <CODE>null</CODE> if
* there is none.
*/
public String getDescription()
{
// There is no standard description for this matching rule.
return AMR_DOUBLE_METAPHONE_DESCRIPTION;
}
/**
* Retrieves the OID of the syntax with which this matching rule is
* associated.
*
* @return The OID of the syntax with which this matching rule is associated.
*/
public String getSyntaxOID()
{
// Approximate matching is really only appropriate for DirectoryString
// values.
return SYNTAX_DIRECTORY_STRING_OID;
}
/**
* Retrieves the normalized form of the provided value, which is best suited
* for efficiently performing matching operations on that value.
*
* @param value The value to be normalized.
*
* @return The normalized version of the provided value.
*
* @throws DirectoryException If the provided value is invalid according to
* the associated attribute syntax.
*/
throws DirectoryException
{
if (length == 0)
{
// The value is empty, so it is already normalized.
return new ASN1OctetString();
}
// Pad the value to allow for checks to go past the end of the value.
// The metaphone value that is being constructed.
// Skip over GN, KN, PN, WR, and PS at the beginning of a word.
int pos = 0;
{
pos++;
}
// 'X' at the beginning of a word will sound like Z, but Z will always be
// mapped to S.
{
pos++;
}
// Loop until we have at least four metaphone characters or have reached the
// end of the string.
{
// Check the character at the current position against various targets.
char posMinusFour;
char posMinusThree;
char posMinusTwo;
char posMinusOne;
char posPlusOne;
char posPlusTwo;
{
case 'A':
case 'E':
case 'I':
case 'O':
case 'U':
case 'Y':
// All initial vowels map to 'A'. All others will be ignored.
if (pos == 0)
{
}
pos++;
break;
case 'B':
// B and BB will be mapped to P, with the exception of "MB" as in
// "crumb", but that will be handled elsewhere.
{
pos++;
}
break;
case 'C':
// Check for various Germanic sequences, which will be mapped to 'K'.
// This basically includes all occurrences of "ACH" where the
// preceding character is not a vowel and the following character is
// neither an 'E' nor an 'I' except in "BACHER" and "MACHER".
if ((pos > 1) &&
((posPlusTwo != 'E') ||
{
pos += 2;
break;
}
// Check for a special case of "caesar", which will be maped to 'S'.
{
pos += 2;
break;
}
// CH can be treated in lots of different ways.
{
// Check for "chia" as in "chianti" and map to 'K'.
{
pos += 2;
break;
}
// Check for "chae" as in "michael" and map to 'K'.
{
pos += 2;
break;
}
// Check for a Greek root at the beginning of the value like
// chemistry or chorus and map to 'K'.
{
pos += 2;
break;
}
// Check for "CH" values that produce a "KH" sound that will be
// mapped to 'K'.
if (isGermanic(valueString) ||
(posPlusTwo == 'S') ||
(((pos == 0) ||
(posMinusOne == 'E'))) &&
(posPlusTwo == 'W'))))
{
pos += 2;
break;
}
// All other "CH" values.
if (pos > 0)
{
{
}
else
{
}
}
else
{
}
pos += 2;
break;
}
// Check for "CZ" as in "czerny" but not "wicz" and map to 'S'.
if ((posPlusOne == 'Z') &&
{
pos += 2;
break;
}
// Check for "CIA" as in "focaccia" and map to 'X'.
{
pos += 3;
break;
}
// Check for a double C but not in values that start with "McC"
if ((posPlusOne == 'C') &&
{
{
{
// Values like "accident", "accede", and "succeed".
pos += 2;
break;
}
else
{
// Values like "bacci" or "bertucci".
pos += 3;
break;
}
}
else
{
// This is Pierce's Rule, whatever that means.
pos += 2;
break;
}
}
// Check for CK, CG, or CQ and map to 'K'. Check for CI, CE, and CY
// and map to "S".
{
pos += 2;
break;
}
// Check for CI, CE, or CY and map to 'S'.
{
pos += 2;
break;
}
// All other cases of "C" will be mapped to 'K'. However, the number
// of positions that we skip ahead may vary. If there is a value that
// consists of two words like "mac caffrey", then skip ahead three.
// For the character combinations of "CK" and "CQ", then skip ahead
// two. For the character combinations of "CC" except "CCE" and
// "CCI", then skip ahead two. For all other cases, skip ahead one.
{
case ' ':
{
case 'C':
case 'Q':
case 'G':
pos += 3;
break;
default:
pos++;
break;
}
break;
case 'K':
case 'Q':
pos += 2;
break;
case 'C':
{
case 'E':
case 'I':
pos++;
break;
default:
pos += 2;
break;
}
break;
default:
pos++;
}
break;
case 'D':
// DG will be mapped to either 'J' (in cases like edge) or 'TK' (in
// cases like Edgar).
{
{
pos += 3;
break;
}
else
{
pos += 2;
break;
}
}
// DT and DD will be mapped to 'T'.
{
pos += 2;
break;
}
// All other cases will be mapped to 'T'.
pos++;
break;
case 'F':
// F always maps to F. If there is a double F, then skip the second
// one.
pos++;
{
pos++;
}
break;
case 'G':
{
// A "GH" that is not preceded by a vowel will be mapped to 'K'.
{
pos += 2;
break;
}
if (pos == 0)
{
{
// Words like ghislane or ghiradelli
}
else
{
}
pos += 2;
break;
}
// A refined version of Parker's Rule.
if (((pos > 1) &&
((pos > 2) &&
((pos > 3) &&
(posMinusFour == 'H'))))
{
pos += 2;
break;
}
else
{
{
// Words like laugh, McLaughlin, cough, rough are mapped to 'F'.
}
{
}
pos += 2;
break;
}
}
if (posPlusOne == 'N')
{
(! isSlavoGermanic(valueString)))
{
pos += 2;
break;
}
else
{
(! isSlavoGermanic(valueString)))
{
}
else
{
}
pos += 2;
break;
}
}
// GLI as in tagliaro will be mapped to "KL".
{
pos += 2;
break;
}
// Forms of GY, GE, and GI at the beginning of a word will map to 'K'.
if ((pos == 0) &&
((posPlusOne == 'Y') ||
{
pos += 2;
break;
}
// Some occurrences of GER and GY in a word will be mapped to 'K'.
(posPlusOne == 'Y')) &&
(posMinusOne != 'I') &&
{
pos += 2;
break;
}
// Check for Italian uses like 'biaggi" and map to 'J'.
(posPlusOne == 'Y') ||
{
// Germanic uses will be mapped to 'K'.
if (isGermanic(valueString) ||
{
}
else
{
}
pos += 2;
break;
}
// All other cases will be mapped to 'K'. If there is a double G,
// then skip two. Otherwise, just skip one.
pos++;
if (posPlusOne == 'G')
{
pos++;
}
break;
case 'H':
// The letter 'H' will only be processed if it is immediately followed
// by a vowel and is either the start of the word or preceded by a
// vowel.
{
{
pos++;
}
}
pos++;
break;
case 'J':
// Take care of obvious Spanish uses that should map to 'H'.
{
pos++;
break;
}
{
{
}
else
{
}
pos++;
break;
}
// All other cases will be mapped to 'J'.
{
pos++;
}
pos++;
break;
case 'K':
// 'K' will always be mapped to 'K'. KK will be treated like K.
{
pos++;
}
pos++;
break;
case 'L':
// 'L' will always be mapped to 'L'. LL will be treated like L, even
// for potential Spanish uses.
{
pos++;
}
pos++;
break;
case 'M':
// 'M' will always be mapped to 'M'. MM will be treated like M.
// UMB in cases like "dumb" and "thumb" will be treated like M.
{
pos++;
}
{
{
pos++;
}
}
pos++;
break;
case 'N':
// 'N' will always be mapped to 'N'. NN will be treated like N.
{
pos++;
}
pos++;
break;
case 'P':
// PH will be mapped to 'F'.
{
pos += 2;
break;
}
// All other cases will be mapped to 'P', with PP and PB being treated
// like P.
{
pos++;
}
pos++;
break;
case 'Q':
// 'Q' will always be mapped to 'K'. QQ will be treated like Q.
{
pos++;
}
pos++;
break;
case 'R':
// Ignore R at the end of French words.
{
pos++;
break;
}
// All other cases will be mapped to 'R', with RR treated like R.
{
pos++;
}
pos++;
break;
case 'S':
// Special cases like isle and carlysle will be silent.
{
pos++;
break;
}
// Special case of sugar mapped to 'X'.
{
pos++;
break;
}
// SH is generally mapped to 'X', but not in Germanic cases.
{
{
}
else
{
}
pos += 2;
break;
}
// Italian and Armenian cases will map to "S".
{
pos += 3;
break;
}
// SZ should be mapped to 'S'.
if (posPlusOne == 'Z')
{
pos += 2;
break;
}
// Various combinations at the beginning of words will be mapped to
// 'S'.
if ((pos == 0) &&
{
pos++;
break;
}
// SC should be mapped to either SK, X, or S.
if (posPlusOne == 'C')
{
{
{
}
else
{
}
pos += 3;
break;
}
(posPlusTwo == 'Y'))
{
pos += 3;
break;
}
pos += 3;
break;
}
// Ignore a trailing S in French words. All others will be mapped to
// 'S'.
{
}
{
pos++;
}
pos++;
break;
case 'T':
// "TION", "TIA", and "TCH" will be mapped to 'X'.
{
pos += 3;
break;
}
// TH or TTH will be mapped to either T (for Germanic cases) or
// 0 (zero) for the rest.
{
if (isGermanic(valueString) ||
{
}
else
{
}
pos += 2;
break;
}
// All other cases will map to T, with TT and TD being treated like T.
{
pos++;
}
pos++;
break;
case 'V':
// 'V' will always be mapped to 'F', with VV treated like V.
{
pos++;
}
pos++;
break;
case 'W':
// WR should always map to R.
{
pos += 2;
break;
}
// W[AEIOUYH] at the beginning of the word should be mapped to A.
{
// FIXME -- This isn't in the algorithm as written. Should it be?
pos += 2;
break;
}
// A Polish value like WICZ or WITZ should be mapped to TS.
{
pos += 4;
break;
}
// Otherwise, we'll just skip it.
pos++;
break;
case 'X':
// X maps to KS except at the end of French words.
{
}
(posPlusOne == 'X'))
{
pos++;
}
pos++;
break;
case 'Z':
// Chinese usages like zhao will map to J.
{
pos += 2;
break;
}
// All other cases map to "S". ZZ will be treated like Z.
if (posPlusOne == 'Z')
{
pos++;
}
pos++;
break;
case '\u00C7': // C with a cedilla
// This will always be mapped to 'S'.
pos++;
break;
case '\u00D1': // N with a tilde
// This will always be mapped to 'N'.
pos++;
break;
default:
// We don't have any special treatment for this character, so skip it.
pos++;
break;
}
}
}
/**
* Indicates whether the two provided normalized values are approximately
* equal to each other.
*
* @param value1 The normalized form of the first value to compare.
* @param value2 The normalized form of the second value to compare.
*
* @return <CODE>true</CODE> if the provided values are approximately equal,
* or <CODE>false</CODE> if not.
*/
{
// If the values have been normalized, then we just need to compare their
// byte arrays.
}
/**
* Indicates whether the provided value has the given substring at the
* specified position.
*
* @param value The value containing the range for which to make the
* determination.
* @param start The position in the value at which to start the
* comparison.
* @param substring The substring to compare against the specified value
* range.
*
* @return <CODE>true</CODE> if the specified portion of the value matches
* the given substring, or <CODE>false</CODE> if it does not.
*/
{
try
{
// This can happen since a lot of the rules "look behind" and
// rightfully don't check if it's the first character
if (start < 0) {
return false;
}
// value isn't big enough to do the comparison
{
return false;
}
{
{
return false;
}
}
return true;
}
catch (Exception e)
{
if (debugEnabled())
{
}
return false;
}
}
/**
* Indicates whether the provided character is a vowel (including "Y").
*
* @param c The character for which to make the determination.
*
* @return <CODE>true</CODE> if the provided character is a vowel, or
* <CODE>false</CODE> if not.
*/
private boolean isVowel(char c)
{
switch (c)
{
case 'A':
case 'E':
case 'I':
case 'O':
case 'U':
case 'Y':
return true;
default:
return false;
}
}
/**
* Indicates whether the provided string appears to be Slavo-Germanic.
*
* @param s The string for which to make the determination.
*
* @return <CODE>true</CODE> if the provided string appears to be
* Slavo-Germanic, or <CODE>false</CODE> if not.
*/
private boolean isSlavoGermanic(String s)
{
s.contains("WITZ"));
}
/**
* Indicates whether the provided string appears Germanic (starts with "VAN ",
* "VON ", or "SCH").
*
* @param s The string for which to make the determination.
*
* @return <CODE>true</CODE> if the provided string appears Germanic, or
* <CODE>false</CODE> if not.
*/
private boolean isGermanic(String s)
{
s.startsWith("SCH"));
}
}