ldap/schema/DoubleMetaphoneApproximateMatchingRuleImpl.java

	DoubleMetaphoneApproximateMatchingRuleImpl.java revision 52674c7996797d28e6671590293a44c1e5f93017
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at legal-notices/CDDLv1_0.txt
 * or http://forgerock.org/license/CDDLv1.0.html.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at legal-notices/CDDLv1_0.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information:
 *      Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 *
 *      Copyright 2009 Sun Microsystems, Inc.
 *      Portions copyright 2014-2015 ForgeRock AS
 */
package org.forgerock.opendj.ldap.schema;

import static org.forgerock.opendj.ldap.schema.SchemaConstants.*;

import java.util.Locale;

import org.forgerock.i18n.LocalizableMessage;
import org.forgerock.i18n.slf4j.LocalizedLogger;
import org.forgerock.opendj.ldap.ByteSequence;
import org.forgerock.opendj.ldap.ByteString;

/**
 * This class defines an approximate matching rule based on the Double Metaphone
 * algorithm. The Metaphone and Double Metaphone algorithms were originally
 * devised by Lawrence Philips (published in the December 1990 issue of
 * <I>Computer Language</I> and the <A
 * HREF="http://www.cuj.com/documents/s=8038/cuj0006philips/">June 2000 issue of
 * <I>C/C++ Users Journal</I></A>, respectively), and this version of the
 * algorithm is based on a version modified by Kevin Atkinson to include
 * bugfixes and additional functionality (source is available <A
 * HREF="http://aspell.net/metaphone/dmetaph.cpp">here</A> and additional
 * Metaphone and Double Metaphone information is available at <A
 * HREF="http://aspell.net/metaphone/">http://aspell.net/metaphone/</A>). This
 * implementation is largely the same as the one provided by Kevin Atkinson, but
 * it has been re-written for better readability, for more efficiency, to get
 * rid of checks for conditions that can't possibly happen, and to get rid of
 * redundant checks that aren't needed. It has also been updated to always only
 * generate a single value rather than one or possibly two values.
 */
final class DoubleMetaphoneApproximateMatchingRuleImpl extends AbstractApproximateMatchingRuleImpl {

    /** Logger for this class, obtained from {@link LocalizedLogger#getLoggerForThisClass()}. */
    private static final LocalizedLogger logger = LocalizedLogger.getLoggerForThisClass();

    /**
     * Creates a new instance of this matching rule implementation, passing
     * {@code AMR_DOUBLE_METAPHONE_NAME} to the superclass constructor.
     */
    DoubleMetaphoneApproximateMatchingRuleImpl() {
        super(AMR_DOUBLE_METAPHONE_NAME);
    }

    /**
     * Computes the Double Metaphone code which is used as the normalized form
     * of the provided value for approximate matching.
     * <p>
     * The value is converted to upper case, padded with trailing spaces so
     * that look-ahead checks never run past the end of the string, and then
     * scanned from left to right. Each recognized character sequence appends
     * zero, one, or two characters to the code, and scanning stops as soon as
     * the code holds at least four characters or the value has been consumed.
     * Because some rules append two characters at once, the resulting code
     * may be longer than four characters.
     *
     * @param schema
     *            The schema. It is not consulted by this implementation.
     * @param value
     *            The attribute value to be normalized.
     * @return The metaphone code for the value, or an empty byte string if
     *         the value is empty.
     */
    public ByteString normalizeAttributeValue(final Schema schema, final ByteSequence value) {
        // Convert to upper case before measuring the value: case conversion
        // may change the number of characters (for example a sharp s becomes
        // "SS"), and both the loop bound and the "last character" checks below
        // must refer to the string that is actually scanned. Locale.ROOT keeps
        // the result independent of the JVM default locale (for example the
        // Turkish dotted capital I would never match the 'I' rules).
        final String upperValue = value.toString().toUpperCase(Locale.ROOT);
        final int length = upperValue.length();
        if (length == 0) {
            // The value is empty, so it is already normalized.
            return ByteString.empty();
        }

        final int last = length - 1;

        // Pad the value to allow for checks to go past the end of the value.
        final String valueString = upperValue + "     ";

        // The metaphone value that is being constructed.
        final StringBuilder metaphone = new StringBuilder(4);

        // Skip over GN, KN, PN, WR, and PS at the beginning of a word.
        int pos = 0;
        String substring = valueString.substring(0, 2);
        if (substring.equals("GN") || substring.equals("KN") || substring.equals("PN")
                || substring.equals("WR") || substring.equals("PS")) {
            pos++;
        } else if (valueString.charAt(0) == 'X') {
            // 'X' at the beginning of a word will sound like Z, but Z will
            // always be mapped to S.
            metaphone.append("S");
            pos++;
        }

        // Loop until we have at least four metaphone characters or have
        // reached the end of the string.
        while (metaphone.length() < 4 && pos < length) {
            // Check the character at the current position against various targets.
            // The locals below cache characters around the current position;
            // each one is assigned immediately before it is first read.
            char posMinusFour;
            char posMinusThree;
            char posMinusTwo;
            char posMinusOne;
            char posPlusOne;
            char posPlusTwo;
            switch (valueString.charAt(pos)) {
            case 'A':
            case 'E':
            case 'I':
            case 'O':
            case 'U':
            case 'Y':
                // All initial vowels map to 'A'. All others will be ignored.
                if (pos == 0) {
                    metaphone.append("A");
                }

                pos++;
                break;

            case 'B':
                // B and BB will be mapped to P, with the exception of "MB" as
                // in "crumb", but that will be handled elsewhere.
                metaphone.append("P");

                if (valueString.charAt(++pos) == 'B') {
                    pos++;
                }

                break;

            case 'C':
                // Check for various Germanic sequences, which will be mapped to 'K'.
                // This basically includes all occurrences of "ACH" where
                // the preceding character is not a vowel and the following
                // character is neither an 'E' nor an 'I' except in "BACHER" and
                // "MACHER".
                if (pos > 1
                        && !isVowel(posMinusTwo = valueString.charAt(pos - 2))
                        && hasSubstring(valueString, pos - 1, "ACH")
                        && (posPlusTwo = valueString.charAt(pos + 2)) != 'I'
                        && (posPlusTwo != 'E'
                            || (valueString.charAt(pos + 3) == 'R'
                                && (posMinusTwo == 'B' || posMinusTwo == 'M')))) {
                    metaphone.append("K");
                    pos += 2;
                    break;
                }

                // Check for a special case of "caesar", which will be mapped to 'S'.
                if (pos == 0 && hasSubstring(valueString, pos + 1, "AESAR")) {
                    metaphone.append("S");
                    pos += 2;
                    break;
                }

                // CH can be treated in lots of different ways.
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'H') {
                    // Check for "chia" as in "chianti" and map to 'K'.
                    if (hasSubstring(valueString, pos + 2, "IA")) {
                        metaphone.append("K");
                        pos += 2;
                        break;
                    }

                    // Check for "chae" as in "michael" and map to 'K'.
                    if (hasSubstring(valueString, pos + 2, "AE")) {
                        metaphone.append("K");
                        pos += 2;
                        break;
                    }

                    // Check for a Greek root at the beginning of the value like
                    // chemistry or chorus and map to 'K'.
                    if (pos == 0
                            && !hasSubstring(valueString, 2, "ORE")
                            && (hasSubstring(valueString, 2, "ARAC")
                                    || hasSubstring(valueString, 2, "ARIS")
                                    || hasSubstring(valueString, 2, "OR")
                                    || hasSubstring(valueString, 2, "YM")
                                    || hasSubstring(valueString, 2, "IA")
                                    || hasSubstring(valueString, 2, "EM"))) {
                        metaphone.append("K");
                        pos += 2;
                        break;
                    }

                    // Check for "CH" values that produce a "KH" sound that will
                    // be mapped to 'K'. Note that '&&' binds tighter than '||',
                    // so the last two parenthesized groups form a single
                    // alternative: (start of value or preceded by A/O/U/E)
                    // AND (followed by one of L/R/N/M/B/H/F/V/W).
                    if (isGermanic(valueString)
                            || hasSubstring(valueString, pos - 2, "ORCHES")
                            || hasSubstring(valueString, pos - 2, "ARCHIT")
                            || hasSubstring(valueString, pos - 2, "ORCHID")
                            || (posPlusTwo = valueString.charAt(pos + 2)) == 'T'
                            || posPlusTwo == 'S'
                            || (pos == 0 || (posMinusOne = valueString.charAt(pos - 1)) == 'A'
                                    || posMinusOne == 'O' || posMinusOne == 'U' || posMinusOne == 'E')
                            && (posPlusTwo == 'L' || posPlusTwo == 'R' || posPlusTwo == 'N'
                                    || posPlusTwo == 'M' || posPlusTwo == 'B' || posPlusTwo == 'H'
                                    || posPlusTwo == 'F' || posPlusTwo == 'V' || posPlusTwo == 'W')) {
                        metaphone.append("K");
                        pos += 2;
                        break;
                    }

                    // All other "CH" values.
                    if (pos > 0) {
                        if (hasSubstring(valueString, 0, "MC")) {
                            metaphone.append("K");
                        } else {
                            metaphone.append("X");
                        }
                    } else {
                        metaphone.append("X");
                    }

                    pos += 2;
                    break;
                }

                // Check for "CZ" as in "czerny" but not "wicz" and map to 'S'.
                if (posPlusOne == 'Z' && !hasSubstring(valueString, pos - 2, "WI")) {
                    metaphone.append("S");
                    pos += 2;
                    break;
                }

                // Check for "CIA" as in "focaccia" and map to 'X'.
                if (posPlusOne == 'I' && valueString.charAt(pos + 2) == 'A') {
                    metaphone.append("X");
                    pos += 3;
                    break;
                }

                // Check for a double C but not in values that start with "McC"
                if (posPlusOne == 'C' && !(pos == 1 && valueString.charAt(0) == 'M')) {
                    posPlusTwo = valueString.charAt(pos + 2);
                    if ((posPlusTwo == 'I' || posPlusTwo == 'E' || posPlusTwo == 'H')
                            && !(posPlusTwo == 'H' && valueString.charAt(pos + 3) == 'U')) {
                        if ((pos == 1 && valueString.charAt(pos - 1) == 'A')
                                || hasSubstring(valueString, pos - 1, "UCCEE")
                                || hasSubstring(valueString, pos - 1, "UCCES")) {
                            // Values like "accident", "accede", and "succeed".
                            metaphone.append("K");
                            pos += 2;
                            break;
                        } else {
                            // Values like "bacci" or "bertucci".
                            metaphone.append("X");
                            pos += 3;
                            break;
                        }
                    } else {
                        // This is Pierce's Rule, whatever that means.
                        metaphone.append("K");
                        pos += 2;
                        break;
                    }
                }

                // Check for CK, CG, or CQ and map to 'K'. Check for CI, CE, and
                // CY and map to "S".
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'K' || posPlusOne == 'G' || posPlusOne == 'Q') {
                    metaphone.append("K");
                    pos += 2;
                    break;
                }

                // Check for CI, CE, or CY and map to 'S'.
                if (posPlusOne == 'I' || posPlusOne == 'E' || posPlusOne == 'Y') {
                    metaphone.append("S");
                    pos += 2;
                    break;
                }

                // All other cases of "C" will be mapped to 'K'. However, the
                // number of positions that we skip ahead may vary. If there is
                // a value that consists of two words like "mac caffrey", then
                // skip ahead three. For the character combinations of "CK" and
                // "CQ", then skip ahead two. For the character combinations of
                // "CC" except "CCE" and "CCI", then skip ahead two. For all
                // other cases, skip ahead one.
                metaphone.append("K");
                switch (valueString.charAt(pos + 1)) {
                case ' ':
                    switch (valueString.charAt(pos + 2)) {
                    case 'C':
                    case 'Q':
                    case 'G':
                        pos += 3;
                        break;
                    default:
                        pos++;
                        break;
                    }
                    break;

                case 'K':
                case 'Q':
                    pos += 2;
                    break;

                case 'C':
                    switch (valueString.charAt(pos + 2)) {
                    case 'E':
                    case 'I':
                        pos++;
                        break;
                    default:
                        pos += 2;
                        break;
                    }
                    break;
                default:
                    pos++;
                }
                break;

            case 'D':
                // DG will be mapped to either 'J' (in cases like edge) or 'TK'
                // (in cases like Edgar).
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'G') {
                    posPlusTwo = valueString.charAt(pos + 2);
                    if (posPlusTwo == 'I' || posPlusTwo == 'E' || posPlusTwo == 'Y') {
                        metaphone.append("J");
                        pos += 3;
                    } else {
                        metaphone.append("TK");
                        pos += 2;
                    }
                    break;
                }

                // DT and DD will be mapped to 'T'.
                if (posPlusOne == 'T' || posPlusOne == 'D') {
                    metaphone.append("T");
                    pos += 2;
                    break;
                }

                // All other cases will be mapped to 'T'.
                metaphone.append("T");
                pos++;
                break;

            case 'F':
                // F always maps to F. If there is a double F, then skip the second one.
                metaphone.append("F");
                pos++;
                if (valueString.charAt(pos) == 'F') {
                    pos++;
                }
                break;

            case 'G':
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'H') {
                    // A "GH" that is not preceded by a vowel will be mapped to 'K'.
                    if (pos > 0 && !isVowel(valueString.charAt(pos - 1))) {
                        metaphone.append("K");
                        pos += 2;
                        break;
                    }

                    if (pos == 0) {
                        if (valueString.charAt(pos + 2) == 'I') {
                            // Words like ghislane or ghiradelli
                            metaphone.append("J");
                        } else {
                            metaphone.append("K");
                        }

                        pos += 2;
                        break;
                    }

                    // A refined version of Parker's Rule. The "GH" is silent
                    // when a B, H, or D occurs two or three characters back,
                    // or a B or H occurs four characters back.
                    if (pos > 1
                            && ((posMinusTwo = valueString.charAt(pos - 2)) == 'B'
                                    || posMinusTwo == 'H' || posMinusTwo == 'D')
                            || pos > 2
                            && ((posMinusThree = valueString.charAt(pos - 3)) == 'B'
                                    || posMinusThree == 'H' || posMinusThree == 'D')
                            || pos > 3
                            && ((posMinusFour = valueString.charAt(pos - 4)) == 'B' || posMinusFour == 'H')) {
                        pos += 2;
                        break;
                    } else {
                        if (pos > 2
                                && valueString.charAt(pos - 1) == 'U'
                                && ((posMinusThree = valueString.charAt(pos - 3)) == 'C'
                                        || posMinusThree == 'G' || posMinusThree == 'L'
                                        || posMinusThree == 'R' || posMinusThree == 'T')) {
                            // Words like laugh, McLaughlin, cough, rough are mapped to 'F'.
                            metaphone.append("F");
                        } else if (pos > 0 && valueString.charAt(pos - 1) != 'I') {
                            metaphone.append("K");
                        }

                        pos += 2;
                        break;
                    }
                }

                if (posPlusOne == 'N') {
                    if (pos == 1 && isVowel(valueString.charAt(0)) && !isSlavoGermanic(valueString)) {
                        metaphone.append("KN");
                        pos += 2;
                        break;
                    } else {
                        if (!hasSubstring(valueString, pos + 2, "EY")
                                && !isSlavoGermanic(valueString)) {
                            metaphone.append("N");
                        } else {
                            metaphone.append("KN");
                        }

                        pos += 2;
                        break;
                    }
                }

                // GLI as in tagliaro will be mapped to "KL".
                if (posPlusOne == 'L' && valueString.charAt(pos + 2) == 'I') {
                    metaphone.append("KL");
                    pos += 2;
                    break;
                }

                // Forms of GY, GE, and GI at the beginning of a word will map to 'K'.
                if (pos == 0
                        && (posPlusOne == 'Y'
                                || (substring = valueString.substring(pos + 1, pos + 3)).equals("ES")
                                || substring.equals("EP")
                                || substring.equals("EB") || substring.equals("EL")
                                || substring.equals("EY") || substring.equals("IB")
                                || substring.equals("IL") || substring.equals("IN")
                                || substring.equals("IE") || substring.equals("EI")
                                || substring.equals("ER"))) {
                    metaphone.append("K");
                    pos += 2;
                    break;
                }

                // Some occurrences of GER and GY in a word will be mapped to 'K'.
                // The look-behind at pos - 1 is only reached when pos > 0: an
                // initial "GER" or "GY" has already been handled by the check
                // above.
                posPlusTwo = valueString.charAt(pos + 2);
                if (((posPlusOne == 'E' && posPlusTwo == 'R') || posPlusOne == 'Y')
                        && (posMinusOne = valueString.charAt(pos - 1)) != 'E' && posMinusOne != 'I'
                        && !hasSubstring(valueString, 0, "DANGER")
                        && !hasSubstring(valueString, 0, "RANGER")
                        && !hasSubstring(valueString, 0, "MANGER")
                        && !hasSubstring(valueString, pos - 1, "RGY")
                        && !hasSubstring(valueString, pos - 1, "OGY")) {
                    metaphone.append("K");
                    pos += 2;
                    break;
                }

                // Check for Italian uses like 'biaggi" and map to 'J'.
                if (posPlusOne == 'E' || posPlusOne == 'I' || posPlusOne == 'Y'
                        || hasSubstring(valueString, pos - 1, "AGGI")
                        || hasSubstring(valueString, pos - 1, "OGGI")) {
                    // Germanic uses will be mapped to 'K'.
                    if (isGermanic(valueString) || hasSubstring(valueString, pos + 1, "ET")) {
                        metaphone.append("K");
                    } else {
                        metaphone.append("J");
                    }

                    pos += 2;
                    break;
                }

                // All other cases will be mapped to 'K'. If there is a double
                // G, then skip two. Otherwise, just skip one.
                metaphone.append("K");
                pos++;

                if (posPlusOne == 'G') {
                    pos++;
                }

                break;

            case 'H':
                // The letter 'H' will only be processed if it is immediately
                // followed by a vowel and is either the start of the word or
                // preceded by a vowel.
                if (isVowel(valueString.charAt(pos + 1))
                        && (pos == 0 || isVowel(valueString.charAt(pos - 1)))) {
                    metaphone.append("H");
                    pos++;
                }

                pos++;
                break;

            case 'J':
                // Take care of obvious Spanish uses that should map to 'H'.
                if (hasSubstring(valueString, 0, "SAN ")) {
                    metaphone.append("H");
                    pos++;
                    break;
                }

                if (hasSubstring(valueString, pos, "JOSE")) {
                    if (pos == 0 && valueString.charAt(pos + 4) == ' ') {
                        metaphone.append("H");
                    } else {
                        metaphone.append("J");
                    }

                    pos++;
                    break;
                }

                // All other cases will be mapped to 'J'.
                metaphone.append("J");

                if (valueString.charAt(pos + 1) == 'J') {
                    pos++;
                }

                pos++;
                break;

            case 'K':
                // 'K' will always be mapped to 'K'. KK will be treated like K.
                metaphone.append("K");

                if (valueString.charAt(pos + 1) == 'K') {
                    pos++;
                }

                pos++;
                break;

            case 'L':
                // 'L' will always be mapped to 'L'. LL will be treated like L,
                // even for potential Spanish uses.
                metaphone.append("L");

                if (valueString.charAt(pos + 1) == 'L') {
                    pos++;
                }

                pos++;
                break;

            case 'M':
                // 'M' will always be mapped to 'M'. MM will be treated like M.
                // UMB in cases like "dumb" and "thumb" will be treated like M.
                metaphone.append("M");

                if (valueString.charAt(pos + 1) == 'M') {
                    pos++;
                } else if (hasSubstring(valueString, pos - 1, "UMB")
                        && (pos + 1 == last || hasSubstring(valueString, pos + 2, "ER"))) {
                    pos++;
                }

                pos++;
                break;

            case 'N':
                // 'N' will always be mapped to 'N'. NN will be treated like N.
                metaphone.append("N");

                if (valueString.charAt(pos + 1) == 'N') {
                    pos++;
                }

                pos++;
                break;

            case 'P':
                // PH will be mapped to 'F'.
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'H') {
                    metaphone.append("F");
                    pos += 2;
                    break;
                }

                // All other cases will be mapped to 'P', with PP and PB being
                // treated like P.
                metaphone.append("P");

                if (posPlusOne == 'P' || posPlusOne == 'B') {
                    pos++;
                }

                pos++;
                break;

            case 'Q':
                // 'Q' will always be mapped to 'K'. QQ will be treated like Q.
                metaphone.append("K");

                if (valueString.charAt(pos + 1) == 'Q') {
                    pos++;
                }

                pos++;
                break;

            case 'R':
                // Ignore R at the end of French words.
                if (pos == last && !isSlavoGermanic(valueString)
                        && hasSubstring(valueString, pos - 2, "IE")
                        && !hasSubstring(valueString, pos - 4, "ME")
                        && !hasSubstring(valueString, pos - 4, "MA")) {
                    pos++;
                    break;
                }

                // All other cases will be mapped to 'R', with RR treated like R.
                metaphone.append("R");

                if (valueString.charAt(pos + 1) == 'R') {
                    pos++;
                }

                pos++;
                break;

            case 'S':
                // Special cases like isle and carlysle will be silent.
                if (hasSubstring(valueString, pos - 1, "ISL")
                        || hasSubstring(valueString, pos - 1, "YSL")) {
                    pos++;
                    break;
                }

                // Special case of sugar mapped to 'X'.
                if (hasSubstring(valueString, pos + 1, "UGAR")) {
                    metaphone.append("X");
                    pos++;
                    break;
                }

                // SH is generally mapped to 'X', but not in Germanic cases.
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'H') {
                    if (hasSubstring(valueString, pos + 1, "HEIM")
                            || hasSubstring(valueString, pos + 1, "HOEK")
                            || hasSubstring(valueString, pos + 1, "HOLM")
                            || hasSubstring(valueString, pos + 1, "HOLZ")) {
                        metaphone.append("S");
                    } else {
                        metaphone.append("X");
                    }

                    pos += 2;
                    break;
                }

                // Italian and Armenian cases will map to "S".
                if (hasSubstring(valueString, pos + 1, "IO")
                        || hasSubstring(valueString, pos + 1, "IA")) {
                    metaphone.append("S");
                    pos += 3;
                    break;
                }

                // SZ should be mapped to 'S'.
                if (posPlusOne == 'Z') {
                    metaphone.append("S");
                    pos += 2;
                    break;
                }

                // Various combinations at the beginning of words will be mapped to 'S'.
                if (pos == 0
                        && (posPlusOne == 'M' || posPlusOne == 'N' || posPlusOne == 'L' || posPlusOne == 'W')) {
                    metaphone.append("S");
                    pos++;
                    break;
                }

                // SC should be mapped to either SK, X, or S.
                if (posPlusOne == 'C') {
                    posPlusTwo = valueString.charAt(pos + 2);
                    if (posPlusTwo == 'H') {
                        if (hasSubstring(valueString, pos + 3, "OO")
                                || hasSubstring(valueString, pos + 3, "UY")
                                || hasSubstring(valueString, pos + 3, "ED")
                                || hasSubstring(valueString, pos + 3, "EM")) {
                            metaphone.append("SK");
                        } else {
                            metaphone.append("X");
                        }

                        pos += 3;
                        break;
                    }

                    if (posPlusTwo == 'I' || posPlusTwo == 'E' || posPlusTwo == 'Y') {
                        metaphone.append("S");
                        pos += 3;
                        break;
                    }

                    metaphone.append("SK");
                    pos += 3;
                    break;
                }

                // Ignore a trailing S in French words. All others will be
                // mapped to 'S'.
                if (!(pos == last && (hasSubstring(valueString, pos - 2, "AI")
                        || hasSubstring(valueString, pos - 2, "OI")))) {
                    metaphone.append("S");
                }

                if (posPlusOne == 'S' || posPlusOne == 'Z') {
                    pos++;
                }

                pos++;
                break;

            case 'T':
                // "TION", "TIA", and "TCH" will be mapped to 'X'.
                if (hasSubstring(valueString, pos, "TION") || hasSubstring(valueString, pos, "TIA")
                        || hasSubstring(valueString, pos, "TCH")) {
                    metaphone.append("X");
                    pos += 3;
                    break;
                }

                // TH or TTH will be mapped to either T (for Germanic cases) or
                // 0 (zero) for the rest.
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'H'
                        || (posPlusOne == 'T' && valueString.charAt(pos + 2) == 'H')) {
                    if (isGermanic(valueString) || hasSubstring(valueString, pos + 2, "OM")
                            || hasSubstring(valueString, pos + 2, "AM")) {
                        metaphone.append("T");
                    } else {
                        metaphone.append("0");
                    }

                    pos += 2;
                    break;
                }

                // All other cases will map to T, with TT and TD being treated like T.
                metaphone.append("T");

                if (posPlusOne == 'T' || posPlusOne == 'D') {
                    pos++;
                }

                pos++;
                break;

            case 'V':
                // 'V' will always be mapped to 'F', with VV treated like V.
                metaphone.append("F");

                if (valueString.charAt(pos + 1) == 'V') {
                    pos++;
                }

                pos++;
                break;

            case 'W':
                // WR should always map to R.
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'R') {
                    metaphone.append("R");
                    pos += 2;
                    break;
                }

                // W[AEIOUYH] at the beginning of the word should be mapped to A.
                if (pos == 0 && (isVowel(posPlusOne) || posPlusOne == 'H')) {
                    metaphone.append("A");

                    // FIXME -- This isn't in the algorithm as written. Should it be?
                    pos += 2;
                    break;
                }

                // A Polish value like WICZ or WITZ should be mapped to TS. The
                // match is anchored on the 'W' itself (pos) so that the four
                // characters skipped below are exactly the ones that matched.
                if (hasSubstring(valueString, pos, "WICZ")
                        || hasSubstring(valueString, pos, "WITZ")) {
                    metaphone.append("TS");
                    pos += 4;
                    break;
                }

                // Otherwise, we'll just skip it.
                pos++;
                break;

            case 'X':
                // X maps to KS except at the end of French words.
                if (!(pos == last && (hasSubstring(valueString, pos - 3, "IAU")
                        || hasSubstring(valueString, pos - 3, "EAU")
                        || hasSubstring(valueString, pos - 2, "AU")
                        || hasSubstring(valueString, pos - 2, "OU")))) {
                    metaphone.append("KS");
                }

                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'C' || posPlusOne == 'X') {
                    pos++;
                }

                pos++;
                break;

            case 'Z':
                // Chinese usages like zhao will map to J.
                posPlusOne = valueString.charAt(pos + 1);
                if (posPlusOne == 'H') {
                    metaphone.append("J");
                    pos += 2;
                    break;
                }

                // All other cases map to "S". ZZ will be treated like Z.
                metaphone.append("S");

                if (posPlusOne == 'Z') {
                    pos++;
                }

                pos++;
                break;

            case '\u00C7': // C with a cedilla
                // This will always be mapped to 'S'.
                metaphone.append("S");
                pos++;
                break;

            case '\u00D1': // N with a tilde
                // This will always be mapped to 'N'.
                metaphone.append("N");
                pos++;
                break;

            default:
                // We don't have any special treatment for this character, so
                // skip it.
                pos++;
                break;
            }
        }

        return ByteString.valueOf(metaphone);
    }

    /**
     * Indicates whether the provided value has the given substring at the
     * specified position.
     * <P>
     * A position at which the substring cannot fit (a negative start, or a
     * start so close to the end of the value that the substring would run past
     * it) yields <CODE>false</CODE> rather than an exception, so callers can
     * look behind or ahead of the current position without checking bounds
     * themselves.
     *
     * @param value
     *            The value containing the range for which to make the
     *            determination.
     * @param start
     *            The position in the value at which to start the comparison.
     *            It may be negative or beyond the end of the value.
     * @param substring
     *            The substring to compare against the specified value range.
     * @return <CODE>true</CODE> if the specified portion of the value matches
     *         the given substring, or <CODE>false</CODE> if it does not, if the
     *         substring does not fit at that position, or if the comparison
     *         failed unexpectedly.
     */
    private boolean hasSubstring(final String value, final int start, final String substring) {
        try {
            // A lot of the rules "look behind" and rightfully don't check
            // whether they are at the first character, so a negative start is
            // an expected input and is rejected before anything else.
            //
            // String.startsWith(prefix, offset) does the remaining range
            // checking itself: it returns false when the prefix would run past
            // the end of the value, and it does so without computing
            // start + substring.length(), which could overflow an int.
            return start >= 0 && value.startsWith(substring, start);
        } catch (final Exception e) {
            // Best effort: an unexpected failure (for example a null argument)
            // is reported at debug level and treated as "no match".
            logger.debug(LocalizableMessage.raw(
                "Unable to check that '%s' has substring '%s' at position %d: %s", value, substring, start, e));
            return false;
        }
    }

    /**
     * Determines whether the provided string looks Germanic, which here means
     * that it begins with "VAN ", "VON ", or "SCH". The comparison is
     * case-sensitive and the first two prefixes include their trailing space.
     *
     * @param s
     *            The string to examine.
     * @return <CODE>true</CODE> if the string begins with one of the prefixes
     *         listed above, or <CODE>false</CODE> otherwise.
     */
    private boolean isGermanic(final String s) {
        // "SCH" needs no separator after it.
        if (s.startsWith("SCH")) {
            return true;
        }

        // "VAN" and "VON" only count when followed by a space, so a value such
        // as "VANCE" does not qualify.
        return s.startsWith("VAN ") || s.startsWith("VON ");
    }

    /**
     * Determines whether the provided string appears to be Slavo-Germanic,
     * which here means that it contains "W", "K", "CZ", or "WITZ" anywhere
     * within it. The comparison is case-sensitive.
     *
     * @param s
     *            The string to examine.
     * @return <CODE>true</CODE> if the string contains at least one of the
     *         markers listed above, or <CODE>false</CODE> otherwise.
     */
    private boolean isSlavoGermanic(final String s) {
        // Single-letter markers first.
        final boolean hasMarkerLetter = s.indexOf('W') >= 0 || s.indexOf('K') >= 0;
        if (hasMarkerLetter) {
            return true;
        }

        // Multi-letter markers. "WITZ" is already implied by the 'W' test
        // above; it is kept so that the full list of markers stays visible.
        return s.indexOf("CZ") >= 0 || s.indexOf("WITZ") >= 0;
    }

    /**
     * Determines whether the provided character is one of the upper-case
     * letters treated as a vowel here: A, E, I, O, U, or Y.
     *
     * @param c
     *            The character to examine.
     * @return <CODE>true</CODE> if the character is one of the six letters
     *         listed above, or <CODE>false</CODE> for any other character
     *         (including their lower-case forms).
     */
    private boolean isVowel(final char c) {
        // A lookup in the fixed vowel list; indexOf yields -1 for any
        // character that is not in it.
        return "AEIOUY".indexOf(c) >= 0;
    }
}