/* Extended regular expression matching and search library.
Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
const re_node_set *nodes,
const re_node_set *nodes,
unsigned int context,
/* Functions for string operation. */
/* This function allocates the buffers. It is necessary to call
re_string_reconstruct before using the object. */
static reg_errcode_t
{
/* Ensure at least one character fits into the buffers. */
return ret;
return REG_NOERROR;
}
/* This function allocates the buffers and initializes them. */
static reg_errcode_t
{
if (len > 0)
{
return ret;
}
if (icase)
{
#ifdef RE_ENABLE_I18N
{
while (1)
{
return ret;
break;
break;
return ret;
}
}
else
#endif /* RE_ENABLE_I18N */
}
else
{
#ifdef RE_ENABLE_I18N
else
#endif /* RE_ENABLE_I18N */
{
else
{
}
}
}
return REG_NOERROR;
}
/* Helper functions for re_string_allocate, and re_string_construct. */
static reg_errcode_t
{
#ifdef RE_ENABLE_I18N
{
/* Avoid overflow. */
return REG_ESPACE;
return REG_ESPACE;
{
return REG_ESPACE;
}
}
#endif /* RE_ENABLE_I18N */
if (pstr->mbs_allocated)
{
return REG_ESPACE;
}
return REG_NOERROR;
}
static void
{
}
#ifdef RE_ENABLE_I18N
/* Build wide character buffer PSTR->WCS.
If the byte sequence of the string is:
<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
then the wide character buffer will be:
<wc1> , WEOF , <wc2> , WEOF , <wc3>
We use WEOF for padding; it indicates that the position isn't
the first byte of a multibyte character.
Note that this function assumes PSTR->VALID_LEN elements are already
built and starts from PSTR->VALID_LEN. */
static void
{
#ifdef _LIBC
#else
unsigned char buf[64];
#endif
/* Build the buffers from pstr->valid_len to either pstr->len or
pstr->bufs_len. */
{
const char *p;
/* Apply the translation if needed. */
{
int i, ch;
{
}
p = (const char *) buf;
}
else
{
/* The buffer doesn't have enough space; stop building. */
break;
}
{
/* We treat these cases as a singlebyte character. */
mbclen = 1;
}
/* Write wide character and padding. */
/* Write paddings. */
}
}
/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
but for REG_ICASE. */
static reg_errcode_t
{
#ifdef _LIBC
#else
#endif
/* The following optimization assumes that ASCII characters can be
mapped to wide characters with a simple cast. */
{
{
{
/* In case of a singlebyte character. */
/* The next step uses the assumption that wchar_t is encoded
ASCII-safe: all ASCII values can be converted like this. */
++byte_idx;
continue;
}
{
{
else
{
goto offsets_needed;
}
}
else
/* Write paddings. */
}
{
/* It is an invalid character or '\0'. Just use the byte. */
/* And also cast it to wide char. */
}
else
{
/* The buffer doesn't have enough space; stop building. */
break;
}
}
return REG_NOERROR;
}
else
{
const char *p;
{
int i, ch;
{
}
p = (const char *) buf;
}
else
{
{
{
size_t i;
{
break;
}
{
return REG_ESPACE;
}
if (!pstr->offsets_needed)
{
}
for (i = 1; i < mbcdlen; ++i)
{
}
continue;
}
else
}
else
{
size_t i;
for (i = 0; i < mbclen; ++i)
}
/* Write paddings. */
}
{
/* It is an invalid character or '\0'. Just use the byte. */
++src_idx;
/* And also cast it to wide char. */
}
else
{
/* The buffer doesn't have enough space; stop building. */
break;
}
}
return REG_NOERROR;
}
/* Skip characters until the index becomes greater than NEW_RAW_IDX.
Return the index. */
static Idx
{
/* Skip the characters which are not necessary to check. */
{
{
/* We treat these cases as a single byte character. */
if (mbclen == 0 || remain_len == 0)
wc = L'\0';
else
mbclen = 1;
}
else
/* Then proceed to the next character. */
rawbuf_idx += mbclen;
}
return rawbuf_idx;
}
#endif /* RE_ENABLE_I18N */
/* Build the buffer PSTR->MBS, and apply the translation if we need.
This function is used in case of REG_ICASE. */
static void
{
{
else
}
}
/* Apply TRANS to the buffer in PSTR. */
static void
{
{
}
}
/* This function reconstructs the buffers.
Concretely, it converts to wide characters if pstr->mb_cur_max > 1,
converts to upper case if REG_ICASE is set, and applies the translation. */
static reg_errcode_t
{
else
{
/* Reset buffer. */
#ifdef RE_ENABLE_I18N
#endif /* RE_ENABLE_I18N */
pstr->raw_mbs_idx = 0;
pstr->valid_raw_len = 0;
pstr->offsets_needed = 0;
if (!pstr->mbs_allocated)
}
{
/* Should the already checked characters be kept? */
{
/* Yes, move them to the front of the buffer. */
#ifdef RE_ENABLE_I18N
{
do
{
else
break;
}
++mid;
eflags);
/* This can be quite complicated, so handle specially
only the common and easy case where the character with
different length representation of lower and upper
case is present at or after offset. */
{
}
else
{
/* Otherwise, just find out how long the partial multibyte
character at offset is and fill it with WEOF/255. */
pstr->offsets_needed = 0;
--mid;
break;
else
++mid;
else
{
{
}
}
}
}
else
#endif
{
eflags);
#ifdef RE_ENABLE_I18N
#endif /* RE_ENABLE_I18N */
#if DEBUG
#endif
}
}
else
{
#ifdef RE_ENABLE_I18N
/* No, skip all characters until IDX. */
{
pstr->offsets_needed = 0;
}
#endif
#ifdef RE_ENABLE_I18N
{
{
/* Special case UTF-8. Multi-byte chars start with any
byte other than 0x80 - 0xbf. */
#ifdef _LIBC
/* We know the wchar_t encoding is UCS4, so for the simple
case, ASCII characters, skip the conversion step. */
{
/* pstr->valid_len = 0; */
}
else
#endif
for (; p >= end; --p)
if ((*p & 0xc0) != 0x80)
{
#if 0 /* dead code: buf is set but never used */
unsigned char buf[6];
{
while (--i >= 0)
}
#endif
/* XXX Don't use mbrtowc, we know which conversion
to use (UTF-8 -> UCS4). */
&cur_state);
{
sizeof (mbstate_t));
}
break;
}
}
else
&& IS_WIDE_WORD_CHAR (wc))
: ((IS_WIDE_NEWLINE (wc)
&& pstr->newline_anchor)
? CONTEXT_NEWLINE : 0));
{
if (pstr->mbs_allocated)
}
}
else
#endif /* RE_ENABLE_I18N */
{
pstr->valid_raw_len = 0;
? CONTEXT_NEWLINE : 0));
}
}
}
/* Then build the buffers. */
#ifdef RE_ENABLE_I18N
{
{
return ret;
}
else
}
else
#endif /* RE_ENABLE_I18N */
{
}
else
return REG_NOERROR;
}
static unsigned char
{
int ch;
/* Handle the common (easiest) cases first. */
#ifdef RE_ENABLE_I18N
#endif
#ifdef RE_ENABLE_I18N
if (pstr->offsets_needed)
#endif
#ifdef RE_ENABLE_I18N
/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
this function returns CAPITAL LETTER I instead of first byte of
DOTLESS SMALL LETTER I. The latter would confuse the parser,
since peek_byte_case doesn't advance cur_idx in any way. */
#endif
return ch;
}
static unsigned char
{
return re_string_fetch_byte (pstr);
#ifdef RE_ENABLE_I18N
if (pstr->offsets_needed)
{
int ch;
/* For tr_TR.UTF-8 [[:islower:]] there is
[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
in that case the whole multi-byte character and return
the original letter. On the other side, with
[[: DOTLESS SMALL LETTER I return [[:I, as doing
anything else would complicate things too much. */
return re_string_fetch_byte (pstr);
return re_string_fetch_byte (pstr);
return ch;
}
#endif
}
static void
{
#ifdef RE_ENABLE_I18N
#endif /* RE_ENABLE_I18N */
if (pstr->mbs_allocated)
}
/* Return the context at IDX in INPUT. */
static unsigned int
{
int c;
/* In this case, we use the value stored in input->tip_context,
since we can't know the character in input->mbs[-1] here. */
return input->tip_context;
#ifdef RE_ENABLE_I18N
{
{
#ifdef DEBUG
/* It must not happen. */
#endif
--wc_idx;
if (! REG_VALID_INDEX (wc_idx))
return input->tip_context;
}
return CONTEXT_WORD;
? CONTEXT_NEWLINE : 0);
}
else
#endif
{
return CONTEXT_WORD;
}
}
/* Functions for set operation. */
static reg_errcode_t
{
return REG_ESPACE;
return REG_NOERROR;
}
static reg_errcode_t
{
{
return REG_ESPACE;
}
return REG_NOERROR;
}
static reg_errcode_t
{
return REG_ESPACE;
{
}
else
{
{
}
else
{
}
}
return REG_NOERROR;
}
static reg_errcode_t
{
{
{
return REG_ESPACE;
}
}
else
return REG_NOERROR;
}
/* Calculate the intersection of the sets SRC1 and SRC2, and merge it into
DEST. The return value is an error code, or REG_NOERROR on success.
Note: We assume dest->elems is NULL when dest->alloc is 0. */
static reg_errcode_t
const re_node_set *src2)
{
return REG_NOERROR;
/* We need dest->nelem + 2 * elems_in_intersection; this is a
conservative estimate. */
{
return REG_ESPACE;
}
/* Find the items in the intersection of SRC1 and SRC2, and copy
into the top of DEST those that are not already in DEST itself. */
for (;;)
{
{
/* Try to find the item in DEST. Maybe we could binary search? */
--id;
break;
}
/* Lower the highest of the two items. */
{
if (! REG_VALID_INDEX (--i2))
break;
}
else
{
if (! REG_VALID_INDEX (--i1))
break;
}
}
/* Now copy. When DELTA becomes zero, the remaining
DEST elements are already in place; this is more or
less the same loop that is in re_node_set_merge. */
for (;;)
{
{
/* Copy from the top. */
if (delta == 0)
break;
}
else
{
/* Slide from the bottom. */
if (! REG_VALID_INDEX (--id))
break;
}
}
/* Copy remaining SRC elements. */
return REG_NOERROR;
}
/* Calculate the union of the sets SRC1 and SRC2, and store it in
DEST. The return value is an error code, or REG_NOERROR on success. */
static reg_errcode_t
const re_node_set *src2)
{
{
return REG_ESPACE;
}
else
{
else
return REG_NOERROR;
}
{
{
continue;
}
++i2;
}
{
}
{
}
return REG_NOERROR;
}
/* Calculate the union of the sets DEST and SRC, and store it in
DEST. The return value is an error code, or REG_NOERROR on success. */
static reg_errcode_t
{
return REG_NOERROR;
{
return REG_ESPACE;
}
{
return REG_NOERROR;
}
/* Copy into the top of DEST the items of SRC that are not
found in DEST. Maybe we could binary search in DEST? */
{
else /* if (dest->elems[id] > src->elems[is]) */
--id;
}
if (REG_VALID_INDEX (is))
{
/* If DEST is exhausted, the remaining items of SRC must be unique. */
}
if (delta == 0)
return REG_NOERROR;
/* Now copy. When DELTA becomes zero, the remaining
DEST elements are already in place. */
for (;;)
{
{
/* Copy from the top. */
if (delta == 0)
break;
}
else
{
/* Slide from the bottom. */
if (! REG_VALID_INDEX (--id))
{
/* Copy remaining SRC elements. */
break;
}
}
}
return REG_NOERROR;
}
/* Insert the new element ELEM into the re_node_set SET.
SET should not already contain ELEM.
Return true if successful. */
static bool
{
/* In case the set is empty. */
{
/* We already guaranteed above that set->alloc != 0. */
return true;
}
/* Reallocate if needed. */
{
return false;
}
/* Move the elements that follow the new element. Test the
first element separately to skip a check in the inner loop. */
{
idx = 0;
}
else
{
}
/* Insert the new element. */
return true;
}
/* Insert the new element ELEM into the re_node_set SET.
SET should not already have any element greater than or equal to ELEM.
Return true if successful. */
static bool
{
/* Reallocate if needed. */
{
return false;
}
/* Insert the new element. */
return true;
}
/* Compare two node sets SET1 and SET2.
Return true if SET1 and SET2 are equivalent. */
static bool
{
Idx i;
return false;
return false;
return true;
}
/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
static Idx
{
return 0;
/* Binary search the element. */
idx = 0;
{
else
}
}
static void
{
return;
}
/* Add the token TOKEN to dfa->nodes, and return the index of the token.
Or return REG_MISSING if an error occurred. */
static Idx
{
{
MAX (sizeof (re_token_t),
MAX (sizeof (re_node_set),
sizeof (Idx)));
/* Avoid overflows. */
return REG_MISSING;
return REG_MISSING;
return REG_MISSING;
}
#ifdef RE_ENABLE_I18N
{
}
#endif
}
static inline re_hashval_t
{
Idx i;
return hash;
}
/* Search for the state whose node_set is equivalent to NODES.
Return the pointer to the state, if we found it in the DFA.
Otherwise create a new one and return it. In case of an error
return NULL and set the error code in ERR.
Note: - We treat NULL as the invalid state, so it is possible that the
return value is NULL while ERR is REG_NOERROR.
- We never return a non-NULL value in case of any error; this is for
optimization. */
static re_dfastate_t *
const re_node_set *nodes)
{
Idx i;
#ifdef lint
/* Suppress bogus uninitialized-variable warnings. */
*err = REG_NOERROR;
#endif
{
*err = REG_NOERROR;
return NULL;
}
{
continue;
return state;
}
/* There is no appropriate state in the dfa; create a new one. */
*err = REG_ESPACE;
return new_state;
}
/* Search for the state whose node_set is equivalent to NODES and
whose context is equivalent to CONTEXT.
Return the pointer to the state, if we found it in the DFA.
Otherwise create a new one and return it. In case of an error
return NULL and set the error code in ERR.
Note: - We treat NULL as the invalid state, so it is possible that the
return value is NULL while ERR is REG_NOERROR.
- We never return a non-NULL value in case of any error; this is for
optimization. */
static re_dfastate_t *
{
Idx i;
#ifdef lint
/* Suppress bogus uninitialized-variable warnings. */
*err = REG_NOERROR;
#endif
{
*err = REG_NOERROR;
return NULL;
}
{
return state;
}
/* There is no appropriate state in `dfa'; create a new one. */
*err = REG_ESPACE;
return new_state;
}
/* Finish initialization of the new state NEWSTATE and, using its hash value
HASH, put it in the appropriate bucket of DFA's state table. The return
value indicates the error code on failure. */
static reg_errcode_t
{
Idx i;
return REG_ESPACE;
{
return REG_ESPACE;
}
{
return REG_ESPACE;
}
return REG_NOERROR;
}
static void
{
{
}
}
/* Create a new state which is independent of contexts.
Return the new state on success, otherwise return NULL. */
static re_dfastate_t *
{
Idx i;
return NULL;
{
return NULL;
}
{
continue;
#ifdef RE_ENABLE_I18N
#endif /* RE_ENABLE_I18N */
/* If the state has the halt node, the state is a halt state. */
else if (type == OP_BACK_REF)
}
{
}
return newstate;
}
/* Create a new state which depends on the context CONTEXT.
Return the new state on success, otherwise return NULL. */
static re_dfastate_t *
{
return NULL;
{
return NULL;
}
{
continue;
#ifdef RE_ENABLE_I18N
#endif /* RE_ENABLE_I18N */
/* If the state has the halt node, the state is a halt state. */
else if (type == OP_BACK_REF)
if (constraint)
{
{
{
return NULL;
}
!= REG_NOERROR)
return NULL;
nctx_nodes = 0;
}
{
++nctx_nodes;
}
}
}
{
}
return newstate;
}