text-wordbreak.js revision cc70141342b8690e43bef6d8cef0c38fcfba8227
/**
* Provides utility methods for splitting strings on word breaks and determining
* whether a character index represents a word boundary, using the generic word
* breaking algorithm defined in the Unicode Text Segmentation guidelines
* (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
* Annex #29</a>).
*
* @module text
* @submodule text-wordbreak
* @class Text.WordBreak
* @static
*/
// Constants representing code point classifications. The values are
// consecutive integers from 0 (ALETTER) to 12 (OTHER) so that they can double
// as indices into the SETS array below.
ALETTER = 0,
MIDNUMLET = 1,
MIDLETTER = 2,
MIDNUM = 3,
NUMERIC = 4,
CR = 5,
LF = 6,
NEWLINE = 7,
EXTEND = 8,
FORMAT = 9,
KATAKANA = 10,
EXTENDNUMLET = 11,
OTHER = 12,
// RegExp objects generated from code point data. Each regex matches a single
// character against a set of Unicode code points. The index of each item in
// this array must match its corresponding code point constant value defined
// above.
//
// NOTE(review): only the `extendnumlet` entry is visible here. As written it
// sits at index 0 (ALETTER) rather than index 11 (EXTENDNUMLET), which
// contradicts the index rule stated above; the entries for the preceding
// classifications appear to be missing from this view. Confirm against the
// complete file.
SETS = [
new RegExp(WBData.extendnumlet)
],
// Empty string literal kept as a named constant.
EMPTY_STRING = '',
// Unanchored pattern: matches if the tested string contains any whitespace
// character.
WHITESPACE = /\s/,
WordBreak = {
// -- Public Static Methods ------------------------------------------------
/**
* Splits the specified string into an array of individual words.
*
* @method getWords
* @param {String} string String to split.
* @param {Object} options (optional) Options object containing zero or more
* of the following properties. A falsy value is replaced with an empty
* object before any option is read.
*
* <dl>
* <dt>ignoreCase (Boolean)</dt>
* <dd>
* If <code>true</code>, the string will be converted to lowercase
* before being split. Default is <code>false</code>.
* </dd>
*
* <dt>includePunctuation (Boolean)</dt>
* <dd>
* If <code>true</code>, the returned array will include punctuation
* characters. Default is <code>false</code>.
* </dd>
*
* <dt>includeWhitespace (Boolean)</dt>
* <dd>
* If <code>true</code>, the returned array will include whitespace
* characters. Default is <code>false</code>.
* </dd>
* </dl>
* @return {Array} Array of words.
* @static
*/
// NOTE(review): the method header (property name and function signature) and
// several statements of the body are absent from the visible source. For
// example, `len` is read by the loop below but is not declared in the visible
// lines, and the declaration list below ends on a dangling comma. Confirm
// against the complete file before relying on this block.
var i = 0,
// Buffer for the word currently being built; reset to a fresh array after
// each boundary check in the loop below.
word = [],
// Accumulated result; this is the value the method returns.
words = [],
chr,
// Fall back to an empty options object so the option reads below never
// dereference a missing value.
if (!options) {
options = {};
}
// NOTE(review): this branch is empty in the visible code, although the doc
// comment above says the string is lowercased when ignoreCase is true.
// Confirm against the complete file.
if (options.ignoreCase) {
}
// Loop through each character in the classification map and determine
// whether it precedes a word boundary, building an array of distinct
// words as we go.
for (; i < len; ++i) {
// Append this character to the current word.
// If there's a word boundary between the current character and the
// next character, append the current word to the words array and
// start building a new word.
if (word &&
}
word = [];
}
}
return words;
},
/**
* Returns an array containing only unique words from the specified string,
* so that a word occurring several times in the input appears once in the
* result. For example, the string <code>'foo bar baz foo'</code> would result
* in the array <code>['foo', 'bar', 'baz']</code>.
*
* @method getUniqueWords
* @param {String} string String to split.
* @param {Object} options (optional) Options (see <code>getWords()</code>
* for details).
* @return {Array} Array of unique words.
* @static
*/
// NOTE(review): the method header and body are absent from the visible
// source; only the closing of the property remains. Confirm against the
// complete file.
},
/**
* Returns <code>true</code> if the specified character index is a word
* boundary.
*
* @method isWordBoundary
* @param {String} string String to test.
* @param {Number} index Character index to test within the string.
* @return {Boolean} <code>true</code> for a word boundary,
* <code>false</code> otherwise.
* @static
*/
// NOTE(review): the method header and body are absent from the visible
// source; only the closing of the property remains. Confirm against the
// complete file.
},
// -- Protected Static Methods ---------------------------------------------
/**
* Returns a character classification map for the specified string.
*
* @method _classify
* @param {String} string String to classify.
* @return {Array} Classification map.
* @protected
* @static
*/
// NOTE(review): the method header and parts of the body are absent from the
// visible source. `stringLength` and `setsLength` are read by the loops below
// but are not declared in the visible lines, and no visible statement writes
// to `map`. Confirm against the complete file.
var chr,
// Array returned at the end of the method.
map = [],
i = 0,
j,
set,
type;
// Outer loop: one iteration per index below stringLength.
for (; i < stringLength; ++i) {
// Inner loop: walks candidate classification indices 0 .. setsLength - 1.
for (j = 0; j < setsLength; ++j) {
// Record the current candidate index as the type and stop scanning.
// NOTE(review): the condition guarding this assignment is not visible.
type = j;
break;
}
}
}
return map;
},
/**
* Returns <code>true</code> if there is a word boundary at or after the
* specified character index, <code>false</code> otherwise.
*
* The rules below are checked in the order they appear in the source, which
* is not the numeric order of the rules in UAX #29; the first rule whose
* condition matches decides the result, and <code>true</code> is returned
* when none matches.
*
* @method _isWordBoundary
* @param {Array} map Character classification map generated by
* <code>_classify</code>.
* @param {Number} index Character index to test.
* @return {Boolean}
* @protected
* @static
*/
// NOTE(review): the method header, the remainder of the variable
// declarations, and most rule conditions are absent from the visible source;
// only the rule comments, some condition fragments, and the return statements
// remain. Confirm against the complete file.
var prevType,
// WB5. Don't break between most letters.
return false;
}
// WB6. Don't break letters across certain punctuation.
nextNextType === ALETTER) {
return false;
}
// WB7. Don't break letters across certain punctuation.
return false;
}
// WB8-WB10. Don't break inside sequences of digits, or digits
// adjacent to letters.
return false;
}
// WB11. Don't break inside numeric sequences like "3.2" or
// "3,456.789".
return false;
}
// WB12. Don't break inside numeric sequences like "3.2" or
// "3,456.789".
nextNextType === NUMERIC) {
return false;
}
// WB4. Ignore format and extend characters.
return false;
}
// WB3. Don't break inside CRLF.
return false;
}
// WB3a. Break before newlines (including CR and LF).
// NOTE(review): UAX #29 defines WB3a as a break *after* Newline/CR/LF and
// WB3b as a break *before* them, so the two descriptions here look swapped
// relative to the spec. The conditions are not visible; confirm.
return true;
}
// WB3b. Break after newlines (including CR and LF).
return true;
}
// WB13. Don't break between Katakana characters.
return false;
}
// WB13a. Don't break from extenders: no break when the next character is
// an EXTENDNUMLET and the current character is one of the listed types.
if (nextType === EXTENDNUMLET &&
type === EXTENDNUMLET)) {
return false;
}
// WB13b. Don't break from extenders: no break when the current character
// is an EXTENDNUMLET and the next character is one of the listed types.
if (type === EXTENDNUMLET &&
return false;
}
// Break after any character not covered by the rules above.
return true;
}
};