regcmp.c revision 2
2N/A * The contents of this file are subject to the terms of the 2N/A * Common Development and Distribution License (the "License"). 2N/A * You may not use this file except in compliance with the License. 2N/A * See the License for the specific language governing permissions 2N/A * and limitations under the License. 2N/A * When distributing Covered Code, include this CDDL HEADER in each 2N/A * If applicable, add the following below this CDDL HEADER, with the 2N/A * fields enclosed by brackets "[]" replaced with your own identifying 2N/A * information: Portions Copyright [yyyy] [name of copyright owner] 2N/A * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 2N/A * Use is subject to license terms. 2N/A/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 2N/A/* All Rights Reserved */ 2N/A#
pragma ident "%Z%%M% %I% %E% SMI" 2N/A * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS. 2N/A * IT IS **NOT** CHARACTER SET INDEPENDENT. 2N/A/* CONSTANTS SHARED WITH regex() */ 2N/A/* PRIVATE CONSTANTS */ 2N/A/* PRIVATE GLOBAL VARIABLES */ 2N/A/* DECLARATIONS OF PRIVATE FUNCTIONS */ 2N/A/* DEFINITIONS OF PUBLIC VARIABLES */ 2N/A * define thread-specific storage for __i_size 2N/A/* DEFINITION OF regcmp() */ 2N/A * When compiling a regular expression, regcmp() generates at most 2N/A * two extra single-byte characters for each character in the 2N/A * expression, so allocating three times the number of bytes in all 2N/A * the strings that comprise the regular expression will ensure that 2N/A * regcmp() won't overwrite the end of the allocated block when 2N/A * compiling the expression. 2N/A /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */ 2N/A }
else /* (char_size == 0 ) */ {
2N/A /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */ 2N/A }
else if /* (char_size == 0) && */ (
next_argp != (
char *)0) {
2N/A /* ((char_size==0) && (next_argp==(char *)0)) */ 2N/A * the regular expression is "^" 2N/A /* COMPILE THE REGULAR EXPRESSION */ 2N/A * At the end of each iteration get the next character 2N/A * from the regular expression and increment regexp to 2N/A * point to the following character. Exit when all 2N/A * the characters in all the strings in the argument 2N/A * list have been read. 2N/A * No fall-through. Each case ends with either 2N/A * a break or an error exit. Each case starts 2N/A * with compilep addressing the next location to 2N/A * be written in the compiled regular expression, 2N/A * and with regexp addressing the next character 2N/A * to be read from the regular expression being 2N/A * compiled. Each case that doesn't return 2N/A * increments regexp to address the next character 2N/A * to be read from the regular expression and 2N/A * increments compilep to address the next 2N/A * location to be written in the compiled 2N/A * regular expression. 2N/A * NOTE: The comments for each case give the meaning 2N/A * of the regular expression compiled by the case 2N/A * and the character string written to the compiled 2N/A * regular expression by the case. Each single 2N/A * character written to the compiled regular expression is 2N/A * shown enclosed in angle brackets (<>). Each 2N/A * compiled regular expression begins with a marker 2N/A * character which is shown as a named constant 2N/A * (e.g. <ASCII_CHAR>). Character constants are 2N/A * shown enclosed in single quotes (e.g. <'$'>). 2N/A * All other single characters written to the 2N/A * compiled regular expression are shown as lower 2N/A * case variable names (e.g. <ascii_char> or 2N/A * <multibyte_char>). Multicharacter 2N/A * strings written to the compiled regular expression 2N/A * are shown as variable names followed by ellipses 2N/A * (e.g. <regex...>). 2N/A /* end of string marker or simple dollar sign */ 2N/A /* compiles to <END_OF_STRING_MARK> or */ 2N/A /* <ASCII_CHAR><'$'> */ 2N/A break;
/* end case DOLLAR_SIGN */ 2N/A /* compiles to <ANY_CHAR> */ 2N/A break;
/* end case DOT */ 2N/A * compiles to <ASCII_CHAR><ascii_char> or 2N/A * <MULTIBYTE_CHAR><multibyte_char> 2N/A break;
/* end case '\\' */ 2N/A /* start of a character class expression */ 2N/A * [^...c...] compiles to 2N/A * <NOT_IN_CLASS><class_length><...c...> 2N/A * [^...a-z...] compiles to 2N/A * <NOT_IN_CLASS><class_length><...a<THRU>z...> 2N/A * [...c...] compiles to 2N/A * <IN_CLASS><class_length><...c...> 2N/A * [...a-z...] compiles to 2N/A * <IN_CLASS><class_length><...a<THRU>z...> 2N/A * NOTE: <class_length> includes the 2N/A * <class_length> byte 2N/A /* DETERMINE THE CLASS TYPE */ 2N/A * NOTE: This algorithm checks the value of the 2N/A * to find out if regcmp() 2N/A * is compiling the regular expression in a 2N/A /* leave space for <class_length> */ 2N/A /* leave space for <class_length> */ 2N/A /* COMPILE THE CLASS */ 2N/A * check for a leading right square bracket, 2N/A * the leading RIGHT_SQUARE_BRACKET may 2N/A * be part of a character range 2N/A * expression like "[]-\]" 2N/A * decode the character in the following 2N/A * while loop and decide then if it can 2N/A * be the first character 2N/A * in a character range expression 2N/A * if a DASH follows current_char, 2N/A * current_char, the DASH and the 2N/A * character that follows the DASH 2N/A * may form a character range 2N/A }
else if /* (current_char == DASH) && */ 2N/A * current_char is a DASH, but 2N/A * either begins the entire 2N/A * character class or follows a 2N/A * character that's already 2N/A * part of a character range 2N/A * expression, so it simply 2N/A * represents the DASH character 2N/A * if another DASH follows this 2N/A * one, this DASH is part 2N/A * of a character range expression 2N/A }
else /* ((current_char == DASH && */ 2N/A /* (dash_indicates_range == B_TRUE)) */ {
2N/A * the DASH appears after a single 2N/A * character that isn't 2N/A * already part of a character 2N/A * range expression, so it 2N/A * and the characters preceding 2N/A * and following it can form a 2N/A * character range expression 2N/A * the preceding DASH is 2N/A * the last character in the 2N/A * class and represents the 2N/A * DASH character itself 2N/A * the DASH is part of a 2N/A * character range 2N/A * expression; encode the 2N/A * rest of the expression 2N/A * if a DASH follows this 2N/A * expression, 2N/A * it represents the DASH 2N/A /* GET THE NEXT CHARACTER */ 2N/A /* end while (current_char != RIGHT_SQUARE_BRACKET) */ 2N/A /* INSERT THE LENGTH OF THE CLASS INTO THE */ 2N/A /* COMPILED EXPRESSION */ 2N/A break;
/* end case LEFT_SQUARE_BRACKET */ 2N/A * start of a parenthesized group of regular 2N/A * expressions compiles to <'\0'><'\0'>, leaving 2N/A * space in the compiled regular expression for 2N/A * <group_type|ADDED_LENGTH_BITS><group_length> 2N/A * groups can contain groups, so group 2N/A * must be saved and restored in sequence 2N/A break;
/* end case LEFT_PAREN */ 2N/A /* end of a marked group of regular expressions */ 2N/A * (<regex>)$0-9 compiles to 2N/A * <SAVED_GROUP><substringn><compiled_regex...>\ 2N/A * <END_SAVED_GROUP><substringn><return_arg_number> 2N/A * (<regex>)* compiles to 2N/A * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS> 2N/A * <group_length> <compiled_regex...> 2N/A * <END_GROUP|ZERO_OR_MORE><groupn> 2N/A * (<regex>)+ compiles to 2N/A * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS> 2N/A * <group_length> 2N/A * <compiled_regex...><END_GROUP|ONE_OR_MORE> 2N/A * <groupn> 2N/A * (<regex>){...} compiles to 2N/A * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\ 2N/A * <compiled_regex...><END_GROUP|COUNT><groupn>\ 2N/A * <minimum_repeat_count><maximum_repeat_count> 2N/A * otherwise (<regex>) compiles to 2N/A * <SIMPLE_GROUP><blank><compiled_regex...> 2N/A * <END_GROUP><groupn> 2N/A * group_length + (256 * ADDED_LENGTH_BITS) == 2N/A * length_of(<compiled_regex...><END_GROUP|...> 2N/A * length_of(<group_type|ADDED_LENGTH_BITS> 2N/A * <group_length>\ <compiled_regex...>) 2N/A * groupn no longer seems to be used, but the code 2N/A * still computes it to preserve backward 2N/A * compatibility 2N/A * with earlier versions of regex(). 2N/A /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */ 2N/A break;
/* end case RIGHT_PAREN */ 2N/A case STAR:
/* zero or more repetitions of the */ 2N/A /* preceding expression */ 2N/A * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\ 2N/A * <compiled_regex...> 2N/A * (<regex...>)* compiles to 2N/A * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\ 2N/A * <group_length><compiled_regex...>\ 2N/A * <END_GROUP|ZERO_OR_MORE><groupn> 2N/A break;
/* end case '*' */ 2N/A /* one or more repetitions of the preceding */ 2N/A * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\ 2N/A * <compiled_regex...> (<regex...>)+ compiles to 2N/A * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\ 2N/A * <group_length><compiled_regex...>\ 2N/A * <END_GROUP|ONE_OR_MORE><groupn> 2N/A (
unsigned char)((
unsigned int)*
2N/A break;
/* end case '+' */ 2N/A * repeat the preceding regular expression 2N/A * at least min_count times 2N/A * and at most max_count times 2N/A * <regex...>{min_count} compiles to 2N/A * <regex type|COUNT><compiled_regex...> 2N/A * <min_count><min_count> 2N/A * <regex...>{min_count,} compiles to 2N/A * <regex type|COUNT><compiled_regex...> 2N/A * <min_count><UNLIMITED> 2N/A * <regex...>{min_count,max_count} compiles to 2N/A * <regex type|COUNT><compiled_regex...> 2N/A * <min_count><max_count> 2N/A * (<regex...>){min_count,max_count} compiles to 2N/A * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\ 2N/A * <compiled_regex...><END_GROUP|COUNT><groupn>\ 2N/A * <minimum_match_count><maximum_match_count> 2N/A }
else {
/* {min_count,max_count} */ 2N/A }
else {
/* invalid expression */ 2N/A break;
/* end case LEFT_CURLY_BRACE */ 2N/A default:
/* a single non-special character */ 2N/A * compiles to <ASCII_CHAR><ascii_char> or 2N/A * <MULTIBYTE_CHAR><multibyte_char> 2N/A }
/* end switch (current_char) */ 2N/A /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */ 2N/A }
else if /* (char_size == 0) && */ (
next_argp != (
char *)0) {
2N/A }
else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
2N/A /* unmatched parentheses */ 2N/A }
/* end for (;;) */ 2N/A/* DEFINITIONS OF PRIVATE FUNCTIONS */ 2N/A if ((
unsigned int)
wchar <= (
unsigned int)
0x7f) {
2N/A if ((
unsigned int)
wchar <= (
unsigned int)
0x7f) {
2N/A }
else if ((
unsigned char)*
regexp <= (
unsigned char)
0x7f) {