regcomp.c revision eda71b4a8fb1d0b34d0f08c47b43af49428d24c3
/*
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Henry Spencer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#include "lint.h"
#include "file64.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <regex.h>
#include <wchar.h>
#include <wctype.h>
#include "runetype.h"
#include "collate.h"
#include "utils.h"
#include "regex2.h"
#include "cname.h"
#include "mblocal.h"
/*
* parse structure, passed up and down to avoid global variables and
* other clumsinesses
*/
struct parse {
char *next; /* next character in RE */
char *end; /* end of string (-> NUL normally) */
int error; /* has an error been seen? */
int ncsalloc; /* number of csets allocated */
struct re_guts *g;
};
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === regcomp.c === */
static void nonnewline(struct parse *p);
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */
/*
* macros for use with parse structure
* BEWARE: these know that the parse structure is named `p' !!!
*/
#ifndef NDEBUG
static int never = 0; /* for use in asserts; shuts lint up */
#else
#endif
/*
* regcomp - interface for parser and compilation
*/
int /* 0 success, otherwise REG_something */
const char *_RESTRICT_KYWD pattern,
int cflags)
{
struct re_guts *g;
int i;
#ifdef REDEBUG
#define GOODFLAGS(f) (f)
#else
#endif
/* We had REG_INVARG, but we don't have that on Solaris. */
return (REG_EFATAL);
return (REG_EFATAL);
} else
/* do the mallocs early so failure handling is easy */
if (g == NULL)
return (REG_ESPACE);
p->slen = 0;
free((char *)g);
return (REG_ESPACE);
}
/* set things up */
p->g = g;
p->error = 0;
p->ncsalloc = 0;
for (i = 0; i < NPAREN; i++) {
p->pbegin[i] = 0;
p->pend[i] = 0;
}
g->ncsets = 0;
g->iflags = 0;
g->nbol = 0;
g->neol = 0;
g->moffset = -1;
g->mlen = 0;
g->nsub = 0;
g->backrefs = 0;
/* do it */
g->firststate = THERE();
if (cflags®_EXTENDED)
else if (cflags®_NOSPEC)
p_str(p);
else
/* tidy up loose ends and fill things in */
stripsnug(p, g);
findmust(p, g);
/*
* only use Boyer-Moore algorithm if the pattern is bigger
* than three characters
*/
if (g->mlen > 3) {
computejumps(p, g);
computematchjumps(p, g);
}
}
#ifndef REDEBUG
/* not debugging, so can't rely on the assert() in regexec() */
#endif
/* win or lose, we're done */
if (p->error != 0) /* lose */
return (p->error);
}
/*
* p_ere - ERE parser top level, concatenation and alternation
*/
static void
{
char c;
for (;;) {
/* do a bunch of concatenated expressions */
p_ere_exp(p);
/* require nonempty */
if (!EAT('|'))
break; /* NOTE BREAK OUT */
if (first) {
first = 0;
}
}
if (!first) { /* tail-end fixups */
}
}
/*
* p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
*/
static void
{
char c;
int count;
int count2;
int wascaret = 0;
c = GETNEXT();
switch (c) {
case '(':
p->g->nsub++;
if (!SEE(')'))
p_ere(p, ')');
}
break;
#ifndef POSIX_MISTAKE
case ')': /* happens only if no current unmatched ( */
/*
* You may ask, why the ifndef? Because I didn't notice
* this until slightly too late for 1003.2, and none of the
* other 1003.2 regular-expression reviewers noticed it at
* all. So an unmatched ) is legal POSIX, at least until
* we can get it fixed.
*/
break;
#endif
case '^':
p->g->nbol++;
wascaret = 1;
break;
case '$':
p->g->neol++;
break;
case '|':
break;
case '*':
case '+':
case '?':
break;
case '.':
if (p->g->cflags®_NEWLINE)
nonnewline(p);
else
break;
case '[':
p_bracket(p);
break;
case '\\':
break;
case '{': /* okay as ordinary except if digit follows */
/* FALLTHROUGH */
default:
p->next--;
break;
}
if (!MORE())
return;
c = PEEK();
/* we call { a repetition if followed by a digit */
if (!(c == '*' || c == '+' || c == '?' ||
return; /* no repetition, we're done */
NEXT();
switch (c) {
case '*': /* implemented as +? */
/* this case does not require the (y|) trick, noKLUDGE */
break;
case '+':
break;
case '?':
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
break;
case '{':
if (EAT(',')) {
} else /* single number with comma */
} else /* just a single number */
NEXT();
}
break;
}
if (!MORE())
return;
c = PEEK();
if (!(c == '*' || c == '+' || c == '?' ||
return;
}
/*
* p_str - string (no metacharacters) "parser"
*/
static void
{
while (MORE())
}
/*
* p_bre - BRE parser top level, anchoring and concatenation
*
* This implementation is a bit of a kludge, in that a trailing $ is first
* taken as an ordinary character and then revised to be an anchor.
* The amount of lookahead needed to avoid this kludge is excessive.
*/
static void
{
int wasdollar = 0;
if (EAT('^')) {
p->g->nbol++;
}
first = 0;
}
if (wasdollar) { /* oops, that was a trailing anchor */
DROP(1);
p->g->neol++;
}
}
/*
* p_simp_re - parse a simple RE, an atom possibly followed by a repetition
*/
static int /* was the simple RE an unbackslashed $? */
int starordinary) /* is a leading * an ordinary character? */
{
int c;
int count;
int count2;
int i;
c = GETNEXT();
if (c == '\\') {
}
switch (c) {
case '.':
if (p->g->cflags®_NEWLINE)
nonnewline(p);
else
break;
case '[':
p_bracket(p);
break;
case BACKSL|'{':
break;
case BACKSL|'(':
p->g->nsub++;
/* the MORE here is an error heuristic */
}
break;
case BACKSL|'}':
break;
case BACKSL|'1':
case BACKSL|'2':
case BACKSL|'3':
case BACKSL|'4':
case BACKSL|'5':
case BACKSL|'6':
case BACKSL|'7':
case BACKSL|'8':
case BACKSL|'9':
i = (c&~BACKSL) - '0';
if (p->pend[i] != 0) {
} else
p->g->backrefs = 1;
break;
case '*':
/* FALLTHROUGH */
default:
p->next--;
break;
}
/* this case does not require the (y|) trick, noKLUDGE */
if (EAT(',')) {
} else /* single number with comma */
} else /* just a single number */
NEXT();
}
} else if (c == '$') /* $ (but not \$) ends it */
return (1);
return (0);
}
/*
* p_count - parse a repetition count
*/
static int /* the value */
{
int count = 0;
int ndigits = 0;
ndigits++;
}
return (count);
}
/*
* p_bracket - parse a bracketed character list
*/
static void
{
/* Dept of Truly Sickening Special-Case Kludges */
NEXTn(6);
return;
}
NEXTn(6);
return;
}
return;
if (EAT('^'))
if (EAT(']'))
else if (EAT('-'))
if (EAT('-'))
if (p->error != 0) /* don't mess things up further */
return;
} else
}
/*
* p_b_term - parse one term of a bracketed character list
*/
static void
{
char c;
wint_t i;
/* classify what we've got */
case '[':
break;
case '-':
return; /* NOTE RETURN */
break;
default:
c = '\0';
break;
}
switch (c) {
case ':': /* character class */
NEXT2();
c = PEEK();
p_b_cclass(p, cs);
break;
case '=': /* equivalence class */
NEXT2();
c = PEEK();
p_b_eclass(p, cs);
break;
default: /* symbol, ordinary character, or range */
start = p_b_symbol(p);
/* range */
NEXT();
if (EAT('-'))
finish = '-';
else
finish = p_b_symbol(p);
} else
else {
if (__collate_load_error) {
} else {
finish) <= 0, REG_ERANGE);
for (i = 0; i <= UCHAR_MAX; i++) {
if (__collate_range_cmp(start, i) <=
0 &&
__collate_range_cmp(i, finish) <=
0)
}
}
}
break;
}
}
/*
* p_b_cclass - parse a character-class name and deal with it
*/
static void
{
char clname[16];
NEXT();
return;
}
return;
}
}
/*
* p_b_eclass - parse an equivalence-class name and deal with it
*
* This implementation is incomplete. xxx
*/
static void
{
wint_t c;
c = p_b_coll_elem(p, '=');
}
/*
* p_b_symbol - parse a character or [..]ed multicharacter collating symbol
*/
static wint_t /* value of symbol */
p_b_symbol(struct parse *p)
{
return (WGETNEXT());
/* collating symbol */
return (value);
}
/*
* p_b_coll_elem - parse a collating-element name and look it up
*/
static wint_t /* value of collating element */
p_b_coll_elem(struct parse *p,
{
int len;
NEXT();
if (!MORE()) {
return (0);
}
return (wc); /* single character */
else
return (0);
}
/*
* othercase - return the case counterpart of an alphabetic
*/
static wint_t /* if no counterpart, return ch */
{
else /* peculiar, but could happen */
return (ch);
}
/*
* bothcases - emit a dualcase version of a two-case character
*
* Boy, is this implementation ever a kludge...
*/
static void
{
size_t n;
bracket[n] = ']';
p_bracket(p);
}
/*
* ordinary - emit an ordinary character
*/
static void
{
else {
/*
* Kludge: character is too big to fit into an OCHAR operand.
* Emit a singleton set.
*/
return;
}
}
/*
* nonnewline - emit REG_NEWLINE version of OANY
*
* Boy, is this implementation ever a kludge...
*/
static void
nonnewline(struct parse *p)
{
char bracket[4];
bracket[0] = '^';
p_bracket(p);
}
/*
* repeat - generate code for a bounded repetition, recursively if needed
*/
static void
int from, /* repeated from this number */
int to) /* to this number of times (maybe INFINITY) */
{
#define N 2
#define INF 3
#define REP(f, t) ((f)*8 + (t))
if (p->error != 0) /* head off possible runaway recursion */
return;
case REP(0, 0): /* must be user doing this */
break;
case REP(0, N): /* as x{1,n}? */
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
break;
/* done */
break;
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
break;
break;
case REP(N, N): /* as xx{m-1,n-1} */
break;
break;
default: /* "can't happen" */
break;
}
}
/*
* wgetnext - helper function for WGETNEXT() macro. Gets the next wide
* character from the parse struct, signals a REG_ILLSEQ error if the
* character can't be converted. Returns the number of bytes consumed.
*/
static wint_t
{
size_t n;
return (0);
}
if (n == 0)
n = 1;
p->next += n;
return (wc);
}
/*
* seterr - set an error condition
*/
static int /* useless but makes type checking happy */
{
if (p->error == 0) /* keep earliest error condition */
p->error = e;
return (0); /* make the return value well-defined */
}
/*
* allocset - allocate a set of characters for []
*/
static cset *
{
return (NULL);
}
return (cs);
}
/*
* freeset - free a now-unused set
*/
static void
{
p->g->ncsets--;
}
/*
* singleton - Determine whether a set contains only one character,
* returning it if so, otherwise returning OUT.
*/
static wint_t
{
wint_t i, s, n;
for (i = n = 0; i < NC; i++)
n++;
s = i;
}
if (n == 1)
return (s);
/* Don't bother handling the other cases. */
return (OUT);
}
/*
* CHadd - add character to character set.
*/
static void
{
else {
return;
}
}
}
}
/*
* CHaddrange - add all characters in the range [min,max] to a character set.
*/
static void
{
return;
return;
}
}
/*
* CHaddtype - add all characters of a certain type to a character set.
*/
static void
{
wint_t i;
for (i = 0; i < NC; i++)
return;
}
}
/*
* dupl - emit a duplicate of a bunch of sops
*/
static sopno /* start of duplicate */
{
if (len == 0)
return (ret);
return (ret);
}
/*
* doemit - emit a strip operator
*
* It might seem better to implement this as a macro with a function as
* hard-case backup, but it's just too big and messy unless there are
* some changes to the data structures. Maybe later.
*/
static void
{
/* avoid making error situations worse */
if (p->error != 0)
return;
/* deal with oversize operands ("can't happen", more or less) */
/* deal with undersized strip */
/* finally, it's all reduced to the easy case */
}
/*
* doinsert - insert a sop into the strip
*/
static void
{
sop s;
int i;
/* avoid making error situations worse */
if (p->error != 0)
return;
/* adjust paren pointers */
for (i = 1; i < NPAREN; i++) {
p->pbegin[i]++;
}
p->pend[i]++;
}
}
}
/*
* dofwd - complete a forward reference
*/
static void
{
/* avoid making error situations worse */
if (p->error != 0)
return;
}
/*
* enlarge - enlarge the strip
*/
static void
{
return;
return;
}
}
/*
* stripsnug - compact the strip
*/
static void
{
}
}
/*
* findmust - fill in must and mlen with longest mandatory literal string
*
* This algorithm could do fancy things like analyzing the operands of |
* for common subsequences. Someday. This code is simple and finds most
* of the interesting cases.
*
* Note that must and mlen got initialized during setup.
*/
static void
{
sop s;
char *cp;
int offset;
char buf[MB_LEN_MAX];
/* avoid making error situations worse */
if (p->error != 0)
return;
/*
* It's not generally safe to do a ``char'' substring search on
* multibyte character strings, but it's safe for at least
* UTF-8 (see RFC 3629).
*/
if (MB_CUR_MAX > 1 &&
return;
/* find the longest OCHAR sequence in strip */
newlen = 0;
offset = 0;
g->moffset = 0;
do {
s = *scan++;
switch (OP(s)) {
case OCHAR: /* sequence member */
if (newlen == 0) { /* new sequence */
}
goto toohard;
break;
case OPLUS_: /* things that don't break one */
case OLPAREN:
case ORPAREN:
break;
case OQUEST_: /* things that must be skipped */
case OCH_:
scan--;
do {
s = *scan;
/* assert() interferes w debug printouts */
return;
}
/* FALLTHROUGH */
case OBOW: /* things that break a sequence */
case OEOW:
case OBOL:
case OEOL:
case O_QUEST:
case O_CH:
case OEND:
if (offset > -1) {
} else
} else {
if (offset > -1)
}
newlen = 0;
break;
case OANY:
if (offset > -1) {
} else
} else {
if (offset > -1)
}
if (offset > -1)
offset++;
newlen = 0;
break;
case OANYOF: /* may or may not invalidate offset */
/* First, everything as OANY */
if (offset > -1) {
} else
} else {
if (offset > -1)
}
if (offset > -1)
offset++;
newlen = 0;
break;
default:
/*
* Anything here makes it impossible or too hard
* to calculate the offset -- so we give up;
* save the last known good offset, in case the
* must sequence doesn't occur later.
*/
if (offset > -1)
else
}
offset = -1;
newlen = 0;
break;
}
if (g->mlen == 0) { /* there isn't one */
g->moffset = -1;
return;
}
/* turn it into a character string */
g->mlen = 0;
g->moffset = -1;
return;
}
continue;
}
}
/*
* altoffset - choose biggest offset among multiple choices
*
* Compute, recursively if necessary, the largest offset among multiple
* re paths.
*/
static int
{
int largest;
int try;
sop s;
/* If we gave up already on offsets, return */
if (offset == -1)
return (-1);
largest = 0;
try = 0;
s = *scan++;
switch (OP(s)) {
case OOR1:
try = 0;
break;
case OQUEST_:
case OCH_:
if (try == -1)
return (-1);
scan--;
do {
s = *scan;
return (-1);
/*
* We must skip to the next position, or we'll
* leave altoffset() too early.
*/
scan++;
break;
case OANYOF:
case OCHAR:
case OANY:
try++;
/*FALLTHRU*/
case OBOW:
case OEOW:
case OLPAREN:
case ORPAREN:
case OOR2:
break;
default:
try = -1;
break;
}
if (try == -1)
return (-1);
s = *scan++;
}
}
/*
* computejumps - compute char jumps for BM scan
*
* This algorithm assumes g->must exists and is has size greater than
* zero. It's based on the algorithm found on Computer Algorithms by
* Sara Baase.
*
* A char jump is the number of characters one needs to jump based on
* the value of the character from the text that was mismatched.
*/
static void
{
int ch;
int mindex;
/* Avoid making errors worse */
if (p->error != 0)
return;
return;
/* Adjust for signed chars, if necessary */
/*
* If the character does not exist in the pattern, the jump
* is equal to the number of characters in the pattern.
*/
/*
* If the character does exist, compute the jump that would
* take us to the last character in the pattern equal to it
* (notice that we match right to left, so that last character
* is the first one that would be matched).
*/
}
/*
* computematchjumps - compute match jumps for BM scan
*
* This algorithm assumes g->must exists and is has size greater than
* zero. It's based on the algorithm found on Computer Algorithms by
* Sara Baase.
*
* A match jump is the number of characters one needs to advance based
* on the already-matched suffix.
* Notice that all values here are minus (g->mlen-1), because of the way
* the search algorithm works.
*/
static void
{
int mindex; /* General "must" iterator */
int suffix; /* Keeps track of matching suffix */
int ssuffix; /* Keeps track of suffixes' suffix */
int *pmatches;
/*
* pmatches[k] points to the next i
* such that i+1...mlen is a substring
* of k+1...k+mlen-i-1
*/
/* Avoid making errors worse */
if (p->error != 0)
return;
return;
}
return;
/* Set maximum possible jump for each character in the pattern */
/* Compute pmatches[] */
/*
* If a mismatch is found, interrupting the substring,
* compute the matchjump for that position. If no
* mismatch is found, then a text substring mismatched
* against the suffix will also mismatch against the
* substring.
*/
}
}
/*
* Compute the matchjump up to the last substring found to jump
* to the beginning of the largest must pattern prefix matching
* it's own suffix.
*/
suffix++;
}
}
}
/*
* pluscount - count + nesting
*/
static sopno /* nesting depth */
{
sop s;
if (p->error != 0)
return (0); /* there may not be an OEND */
do {
s = *scan++;
switch (OP(s)) {
case OPLUS_:
plusnest++;
break;
case O_PLUS:
plusnest--;
break;
}
if (plusnest != 0)
return (maxnest);
}