engine.c revision 84441f85b19f6b8080883f30109e58e43c893709
1N/A/*
1N/A * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
1N/A * Copyright (c) 1992, 1993, 1994 Henry Spencer.
1N/A * Copyright (c) 1992, 1993, 1994
1N/A * The Regents of the University of California. All rights reserved.
1N/A *
1N/A * This code is derived from software contributed to Berkeley by
1N/A * Henry Spencer.
1N/A *
1N/A * Redistribution and use in source and binary forms, with or without
1N/A * modification, are permitted provided that the following conditions
1N/A * are met:
1N/A * 1. Redistributions of source code must retain the above copyright
1N/A * notice, this list of conditions and the following disclaimer.
1N/A * 2. Redistributions in binary form must reproduce the above copyright
1N/A * notice, this list of conditions and the following disclaimer in the
1N/A * documentation and/or other materials provided with the distribution.
1N/A * 4. Neither the name of the University nor the names of its contributors
1N/A * may be used to endorse or promote products derived from this software
1N/A * without specific prior written permission.
1N/A *
1N/A * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1N/A * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1N/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1N/A * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
1N/A * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1N/A * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
1N/A * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1N/A * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1N/A * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
1N/A * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
1N/A * SUCH DAMAGE.
1N/A */
1N/A
1N/A/*
1N/A * The matching engine and friends. This file is #included by regexec.c
1N/A * after suitable #defines of a variety of macros used herein, so that
1N/A * different state representations can be used without duplicating masses
1N/A * of code.
1N/A */
1N/A
1N/A#ifdef SNAMES
1N/A#define matcher smatcher
1N/A#define fast sfast
1N/A#define slow sslow
1N/A#define dissect sdissect
1N/A#define backref sbackref
1N/A#define step sstep
1N/A#define print sprint
1N/A#define at sat
1N/A#define match smat
1N/A#endif
1N/A#ifdef LNAMES
1N/A#define matcher lmatcher
1N/A#define fast lfast
1N/A#define slow lslow
1N/A#define dissect ldissect
1N/A#define backref lbackref
1N/A#define step lstep
1N/A#define print lprint
1N/A#define at lat
1N/A#define match lmat
1N/A#endif
1N/A#ifdef MNAMES
1N/A#define matcher mmatcher
1N/A#define fast mfast
1N/A#define slow mslow
1N/A#define dissect mdissect
1N/A#define backref mbackref
1N/A#define step mstep
1N/A#define print mprint
1N/A#define at mat
1N/A#define match mmat
1N/A#endif
1N/A
1N/A/* another structure passed up and down to avoid zillions of parameters */
1N/Astruct match {
1N/A struct re_guts *g;
1N/A int eflags;
1N/A regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
1N/A const char *offp; /* offsets work from here */
1N/A const char *beginp; /* start of string -- virtual NUL precedes */
1N/A const char *endp; /* end of string -- virtual NUL here */
1N/A const char *coldp; /* can be no match starting before here */
1N/A const char **lastpos; /* [nplus+1] */
1N/A STATEVARS;
1N/A states st; /* current states */
1N/A states fresh; /* states for a fresh start */
1N/A states tmp; /* temporary */
1N/A states empty; /* empty set of states */
1N/A mbstate_t mbs; /* multibyte conversion state */
1N/A};
1N/A
1N/A/* ========= begin header generated by ./mkh ========= */
1N/A#ifdef __cplusplus
1N/Aextern "C" {
1N/A#endif
1N/A
1N/A/* === engine.c === */
1N/Astatic int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int);
1N/Astatic const char *dissect(struct match *, const char *, const char *,
1N/A sopno, sopno);
1N/Astatic const char *backref(struct match *, const char *, const char *, sopno,
1N/A sopno, sopno, int);
1N/Astatic const char *fast(struct match *, const char *, const char *, sopno,
1N/A sopno);
1N/Astatic const char *slow(struct match *, const char *, const char *, sopno,
1N/A sopno);
1N/Astatic states step(struct re_guts *, sopno, sopno, states, wint_t, states);
1N/A#define MAX_RECURSION 100
1N/A#define BOL (OUT-1)
1N/A#define EOL (BOL-1)
1N/A#define BOLEOL (BOL-2)
1N/A#define NOTHING (BOL-3)
1N/A#define BOW (BOL-4)
1N/A#define EOW (BOL-5)
1N/A#define BADCHAR (BOL-6)
1N/A#define NONCHAR(c) ((c) <= OUT)
1N/A#ifdef REDEBUG
1N/Astatic void print(struct match *, const char *, states, int, FILE *);
1N/A#endif
1N/A#ifdef REDEBUG
1N/Astatic void at(struct match *, const char *, const char *, const char *,
1N/A sopno, sopno);
1N/A#endif
1N/A#ifdef REDEBUG
1N/Astatic const char *pchar(int ch);
1N/A#endif
1N/A
1N/A#ifdef __cplusplus
1N/A}
1N/A#endif
1N/A/* ========= end header generated by ./mkh ========= */
1N/A
1N/A#ifdef REDEBUG
1N/A#define SP(t, s, c) print(m, t, s, c, stdout)
1N/A#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
1N/A#define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
1N/A#else
1N/A#define SP(t, s, c) /* nothing */
1N/A#define AT(t, p1, p2, s1, s2) /* nothing */
1N/A#define NOTE(s) /* nothing */
1N/A#endif
1N/A
1N/A/*
1N/A * matcher - the actual matching engine
1N/A */
1N/Astatic int /* 0 success, REG_NOMATCH failure */
1N/Amatcher(struct re_guts *g,
1N/A const char *string,
1N/A size_t nmatch,
1N/A regmatch_t pmatch[],
1N/A int eflags)
1N/A{
1N/A const char *endp;
1N/A int i;
1N/A struct match mv;
1N/A struct match *m = &mv;
1N/A const char *dp;
1N/A const sopno gf = g->firststate+1; /* +1 for OEND */
1N/A const sopno gl = g->laststate;
1N/A const char *start;
1N/A const char *stop;
1N/A /* Boyer-Moore algorithms variables */
1N/A const char *pp;
1N/A int cj, mj;
1N/A const char *mustfirst;
1N/A const char *mustlast;
1N/A int *matchjump;
1N/A int *charjump;
1N/A
1N/A /* simplify the situation where possible */
1N/A if (g->cflags&REG_NOSUB)
1N/A nmatch = 0;
1N/A
1N/A if (eflags&REG_STARTEND) {
1N/A start = string + pmatch[0].rm_so;
1N/A stop = string + pmatch[0].rm_eo;
1N/A } else {
1N/A start = string;
1N/A stop = start + strlen(start);
1N/A }
1N/A
1N/A if (stop < start)
1N/A return (REG_EFATAL);
1N/A
1N/A /* prescreening; this does wonders for this rather slow code */
1N/A if (g->must != NULL) {
1N/A if (g->charjump != NULL && g->matchjump != NULL) {
1N/A mustfirst = g->must;
1N/A mustlast = g->must + g->mlen - 1;
1N/A charjump = g->charjump;
1N/A matchjump = g->matchjump;
1N/A pp = mustlast;
1N/A for (dp = start+g->mlen-1; dp < stop; ) {
1N/A /* Fast skip non-matches */
1N/A while (dp < stop && charjump[(int)*dp])
1N/A dp += charjump[(int)*dp];
1N/A
1N/A if (dp >= stop)
1N/A break;
1N/A
1N/A /* Greedy matcher */
1N/A /*
1N/A * We depend on not being used for
1N/A * for strings of length 1
1N/A */
1N/A while (*--dp == *--pp && pp != mustfirst)
1N/A ;
1N/A
1N/A if (*dp == *pp)
1N/A break;
1N/A
1N/A /* Jump to next possible match */
1N/A mj = matchjump[pp - mustfirst];
1N/A cj = charjump[(int)*dp];
1N/A dp += (cj < mj ? mj : cj);
1N/A pp = mustlast;
1N/A }
1N/A if (pp != mustfirst)
1N/A return (REG_NOMATCH);
1N/A } else {
1N/A for (dp = start; dp < stop; dp++)
1N/A if (*dp == g->must[0] &&
1N/A stop - dp >= g->mlen &&
1N/A memcmp(dp, g->must, (size_t)g->mlen) == 0)
1N/A break;
1N/A if (dp == stop) /* we didn't find g->must */
1N/A return (REG_NOMATCH);
1N/A }
1N/A }
1N/A
1N/A /* match struct setup */
1N/A m->g = g;
1N/A m->eflags = eflags;
1N/A m->pmatch = NULL;
1N/A m->lastpos = NULL;
1N/A m->offp = string;
1N/A m->beginp = start;
1N/A m->endp = stop;
1N/A STATESETUP(m, 4);
1N/A SETUP(m->st);
1N/A SETUP(m->fresh);
1N/A SETUP(m->tmp);
1N/A SETUP(m->empty);
1N/A CLEAR(m->empty);
1N/A ZAPSTATE(&m->mbs);
1N/A
1N/A /* Adjust start according to moffset, to speed things up */
1N/A if (g->moffset > -1)
1N/A start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
1N/A
1N/A SP("mloop", m->st, *start);
1N/A
1N/A /* this loop does only one repetition except for backrefs */
1N/A for (;;) {
1N/A endp = fast(m, start, stop, gf, gl);
1N/A if (endp == NULL) { /* a miss */
1N/A if (m->pmatch != NULL)
1N/A free((char *)m->pmatch);
1N/A if (m->lastpos != NULL)
1N/A free((char *)m->lastpos);
1N/A STATETEARDOWN(m);
1N/A return (REG_NOMATCH);
1N/A }
1N/A if (nmatch == 0 && !g->backrefs)
1N/A break; /* no further info needed */
1N/A
1N/A /* where? */
1N/A assert(m->coldp != NULL);
1N/A for (;;) {
1N/A NOTE("finding start");
1N/A endp = slow(m, m->coldp, stop, gf, gl);
1N/A if (endp != NULL)
1N/A break;
1N/A assert(m->coldp < m->endp);
1N/A m->coldp += XMBRTOWC(NULL, m->coldp,
1N/A m->endp - m->coldp, &m->mbs, 0);
1N/A }
1N/A if (nmatch == 1 && !g->backrefs)
1N/A break; /* no further info needed */
1N/A
1N/A /* oh my, he wants the subexpressions... */
1N/A if (m->pmatch == NULL)
1N/A m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
1N/A sizeof (regmatch_t));
1N/A if (m->pmatch == NULL) {
1N/A STATETEARDOWN(m);
1N/A return (REG_ESPACE);
1N/A }
1N/A for (i = 1; i <= m->g->nsub; i++)
1N/A m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
1N/A /* NB: FreeBSD has REG_BACKR, we do not */
1N/A if (!g->backrefs /* && !(m->eflags&REG_BACKR) */) {
1N/A NOTE("dissecting");
1N/A dp = dissect(m, m->coldp, endp, gf, gl);
1N/A } else {
1N/A if (g->nplus > 0 && m->lastpos == NULL)
1N/A m->lastpos = malloc((g->nplus+1) *
1N/A sizeof (const char *));
1N/A if (g->nplus > 0 && m->lastpos == NULL) {
1N/A free(m->pmatch);
1N/A STATETEARDOWN(m);
1N/A return (REG_ESPACE);
1N/A }
1N/A NOTE("backref dissect");
1N/A dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
1N/A }
1N/A if (dp != NULL)
1N/A break;
1N/A
1N/A /* uh-oh... we couldn't find a subexpression-level match */
1N/A assert(g->backrefs); /* must be back references doing it */
1N/A assert(g->nplus == 0 || m->lastpos != NULL);
1N/A for (;;) {
1N/A if (dp != NULL || endp <= m->coldp)
1N/A break; /* defeat */
1N/A NOTE("backoff");
1N/A endp = slow(m, m->coldp, endp-1, gf, gl);
1N/A if (endp == NULL)
1N/A break; /* defeat */
1N/A /* try it on a shorter possibility */
1N/A#ifndef NDEBUG
1N/A for (i = 1; i <= m->g->nsub; i++) {
1N/A assert(m->pmatch[i].rm_so == -1);
1N/A assert(m->pmatch[i].rm_eo == -1);
1N/A }
1N/A#endif
1N/A NOTE("backoff dissect");
1N/A dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
1N/A }
1N/A assert(dp == NULL || dp == endp);
1N/A if (dp != NULL) /* found a shorter one */
1N/A break;
1N/A
1N/A /* despite initial appearances, there is no match here */
1N/A NOTE("false alarm");
1N/A /* recycle starting later */
1N/A start = m->coldp + XMBRTOWC(NULL, m->coldp,
1N/A stop - m->coldp, &m->mbs, 0);
1N/A assert(start <= stop);
1N/A }
1N/A
1N/A /* fill in the details if requested */
1N/A if (nmatch > 0) {
1N/A pmatch[0].rm_so = m->coldp - m->offp;
1N/A pmatch[0].rm_eo = endp - m->offp;
1N/A }
1N/A if (nmatch > 1) {
1N/A assert(m->pmatch != NULL);
1N/A for (i = 1; i < nmatch; i++)
1N/A if (i <= m->g->nsub)
1N/A pmatch[i] = m->pmatch[i];
1N/A else {
1N/A pmatch[i].rm_so = -1;
1N/A pmatch[i].rm_eo = -1;
1N/A }
1N/A }
1N/A
1N/A if (m->pmatch != NULL)
1N/A free((char *)m->pmatch);
1N/A if (m->lastpos != NULL)
1N/A free((char *)m->lastpos);
1N/A STATETEARDOWN(m);
1N/A return (0);
1N/A}
1N/A
1N/A/*
1N/A * dissect - figure out what matched what, no back references
1N/A */
1N/Astatic const char *
1N/Adissect(struct match *m, const char *start, const char *stop, sopno startst,
1N/A sopno stopst)
1N/A{
1N/A int i;
1N/A sopno ss; /* start sop of current subRE */
1N/A sopno es; /* end sop of current subRE */
1N/A const char *sp; /* start of string matched by it */
1N/A const char *stp; /* string matched by it cannot pass here */
1N/A const char *rest; /* start of rest of string */
1N/A const char *tail; /* string unmatched by rest of RE */
1N/A sopno ssub; /* start sop of subsubRE */
1N/A sopno esub; /* end sop of subsubRE */
1N/A const char *ssp; /* start of string matched by subsubRE */
1N/A const char *sep; /* end of string matched by subsubRE */
1N/A const char *oldssp; /* previous ssp */
1N/A const char *dp;
1N/A
1N/A AT("diss", start, stop, startst, stopst);
1N/A sp = start;
1N/A for (ss = startst; ss < stopst; ss = es) {
1N/A /* identify end of subRE */
1N/A es = ss;
1N/A switch (OP(m->g->strip[es])) {
1N/A case OPLUS_:
1N/A case OQUEST_:
1N/A es += OPND(m->g->strip[es]);
1N/A break;
1N/A case OCH_:
1N/A while (OP(m->g->strip[es]) != O_CH)
1N/A es += OPND(m->g->strip[es]);
1N/A break;
1N/A }
1N/A es++;
1N/A
1N/A /* figure out what it matched */
1N/A switch (OP(m->g->strip[ss])) {
1N/A case OEND:
1N/A assert(0);
1N/A break;
1N/A case OCHAR:
1N/A sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
1N/A break;
1N/A case OBOL:
1N/A case OEOL:
1N/A case OBOW:
1N/A case OEOW:
1N/A break;
1N/A case OANY:
1N/A case OANYOF:
1N/A sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
1N/A break;
1N/A case OBACK_:
1N/A case O_BACK:
1N/A assert(0);
1N/A break;
1N/A /* cases where length of match is hard to find */
1N/A case OQUEST_:
1N/A stp = stop;
1N/A for (;;) {
1N/A /* how long could this one be? */
1N/A rest = slow(m, sp, stp, ss, es);
1N/A assert(rest != NULL); /* it did match */
1N/A /* could the rest match the rest? */
1N/A tail = slow(m, rest, stop, es, stopst);
1N/A if (tail == stop)
1N/A break; /* yes! */
1N/A /* no -- try a shorter match for this one */
1N/A stp = rest - 1;
1N/A assert(stp >= sp); /* it did work */
1N/A }
1N/A ssub = ss + 1;
1N/A esub = es - 1;
1N/A /* did innards match? */
1N/A if (slow(m, sp, rest, ssub, esub) != NULL) {
1N/A dp = dissect(m, sp, rest, ssub, esub);
1N/A assert(dp == rest);
1N/A#if defined(__lint)
1N/A (void) dp;
1N/A#endif
1N/A } else /* no */
1N/A assert(sp == rest);
1N/A sp = rest;
1N/A break;
1N/A case OPLUS_:
1N/A stp = stop;
1N/A for (;;) {
1N/A /* how long could this one be? */
1N/A rest = slow(m, sp, stp, ss, es);
1N/A assert(rest != NULL); /* it did match */
1N/A /* could the rest match the rest? */
1N/A tail = slow(m, rest, stop, es, stopst);
1N/A if (tail == stop)
1N/A break; /* yes! */
1N/A /* no -- try a shorter match for this one */
1N/A stp = rest - 1;
1N/A assert(stp >= sp); /* it did work */
1N/A }
1N/A ssub = ss + 1;
1N/A esub = es - 1;
1N/A ssp = sp;
1N/A oldssp = ssp;
1N/A for (;;) { /* find last match of innards */
1N/A sep = slow(m, ssp, rest, ssub, esub);
1N/A if (sep == NULL || sep == ssp)
1N/A break; /* failed or matched null */
1N/A oldssp = ssp; /* on to next try */
1N/A ssp = sep;
1N/A }
1N/A if (sep == NULL) {
1N/A /* last successful match */
1N/A sep = ssp;
1N/A ssp = oldssp;
1N/A }
1N/A assert(sep == rest); /* must exhaust substring */
1N/A assert(slow(m, ssp, sep, ssub, esub) == rest);
1N/A dp = dissect(m, ssp, sep, ssub, esub);
1N/A assert(dp == sep);
1N/A sp = rest;
1N/A break;
1N/A case OCH_:
1N/A stp = stop;
1N/A for (;;) {
1N/A /* how long could this one be? */
1N/A rest = slow(m, sp, stp, ss, es);
1N/A assert(rest != NULL); /* it did match */
1N/A /* could the rest match the rest? */
1N/A tail = slow(m, rest, stop, es, stopst);
1N/A if (tail == stop)
1N/A break; /* yes! */
1N/A /* no -- try a shorter match for this one */
1N/A stp = rest - 1;
1N/A assert(stp >= sp); /* it did work */
1N/A }
1N/A ssub = ss + 1;
1N/A esub = ss + OPND(m->g->strip[ss]) - 1;
1N/A assert(OP(m->g->strip[esub]) == OOR1);
1N/A for (;;) { /* find first matching branch */
1N/A if (slow(m, sp, rest, ssub, esub) == rest)
1N/A break; /* it matched all of it */
1N/A /* that one missed, try next one */
1N/A assert(OP(m->g->strip[esub]) == OOR1);
1N/A esub++;
1N/A assert(OP(m->g->strip[esub]) == OOR2);
1N/A ssub = esub + 1;
1N/A esub += OPND(m->g->strip[esub]);
1N/A if (OP(m->g->strip[esub]) == OOR2)
1N/A esub--;
1N/A else
1N/A assert(OP(m->g->strip[esub]) == O_CH);
1N/A }
1N/A dp = dissect(m, sp, rest, ssub, esub);
1N/A assert(dp == rest);
1N/A sp = rest;
1N/A break;
1N/A case O_PLUS:
1N/A case O_QUEST:
1N/A case OOR1:
1N/A case OOR2:
1N/A case O_CH:
1N/A assert(0);
1N/A break;
1N/A case OLPAREN:
1N/A i = OPND(m->g->strip[ss]);
1N/A assert(0 < i && i <= m->g->nsub);
1N/A m->pmatch[i].rm_so = sp - m->offp;
1N/A break;
1N/A case ORPAREN:
1N/A i = OPND(m->g->strip[ss]);
1N/A assert(0 < i && i <= m->g->nsub);
1N/A m->pmatch[i].rm_eo = sp - m->offp;
1N/A break;
1N/A default: /* uh oh */
1N/A assert(0);
1N/A break;
1N/A }
1N/A }
1N/A
1N/A assert(sp == stop);
1N/A return (sp);
1N/A}
1N/A
1N/A/*
1N/A * backref - figure out what matched what, figuring in back references
1N/A */
1N/Astatic const char *
1N/Abackref(struct match *m, const char *start, const char *stop, sopno startst,
1N/A sopno stopst, sopno lev, /* PLUS nesting level */
1N/A int rec)
1N/A{
1N/A int i;
1N/A sopno ss; /* start sop of current subRE */
1N/A const char *sp; /* start of string matched by it */
1N/A sopno ssub; /* start sop of subsubRE */
1N/A sopno esub; /* end sop of subsubRE */
1N/A const char *ssp; /* start of string matched by subsubRE */
1N/A const char *dp;
1N/A size_t len;
1N/A int hard;
1N/A sop s;
1N/A regoff_t offsave;
1N/A cset *cs;
1N/A wint_t wc;
1N/A
1N/A AT("back", start, stop, startst, stopst);
1N/A sp = start;
1N/A
1N/A /* get as far as we can with easy stuff */
1N/A hard = 0;
1N/A for (ss = startst; !hard && ss < stopst; ss++)
1N/A switch (OP(s = m->g->strip[ss])) {
1N/A case OCHAR:
1N/A if (sp == stop)
1N/A return (NULL);
1N/A sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
1N/A if (wc != OPND(s))
1N/A return (NULL);
1N/A break;
1N/A case OANY:
1N/A if (sp == stop)
1N/A return (NULL);
1N/A sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
1N/A if (wc == BADCHAR)
1N/A return (NULL);
1N/A break;
1N/A case OANYOF:
1N/A if (sp == stop)
1N/A return (NULL);
1N/A cs = &m->g->sets[OPND(s)];
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc == BADCHAR || !CHIN(cs, wc))
return (NULL);
break;
case OBOL:
if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
(sp < m->endp && *(sp-1) == '\n' &&
(m->g->cflags&REG_NEWLINE))) {
break;
}
else
return (NULL);
break;
case OEOL:
if ((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
(sp < m->endp && *sp == '\n' &&
(m->g->cflags&REG_NEWLINE))) {
break;
}
else
return (NULL);
break;
case OBOW:
if (((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
(sp < m->endp && *(sp-1) == '\n' &&
(m->g->cflags&REG_NEWLINE)) ||
(sp > m->beginp && !ISWORD(*(sp-1)))) &&
(sp < m->endp && ISWORD(*sp))) {
break;
} else
return (NULL);
break;
case OEOW:
if (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
(sp < m->endp && *sp == '\n' &&
(m->g->cflags&REG_NEWLINE)) ||
(sp < m->endp && !ISWORD(*sp))) &&
(sp > m->beginp && ISWORD(*(sp-1)))) {
break;
} else
return (NULL);
break;
case O_QUEST:
break;
case OOR1: /* matches null but needs to skip */
ss++;
s = m->g->strip[ss];
do {
assert(OP(s) == OOR2);
ss += OPND(s);
} while (OP(s = m->g->strip[ss]) != O_CH);
/* note that the ss++ gets us past the O_CH */
break;
default: /* have to make a choice */
hard = 1;
break;
}
if (!hard) { /* that was it! */
if (sp != stop)
return (NULL);
return (sp);
}
ss--; /* adjust for the for's final increment */
/* the hard stuff */
AT("hard", sp, stop, ss, stopst);
s = m->g->strip[ss];
switch (OP(s)) {
case OBACK_: /* the vilest depths */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
if (m->pmatch[i].rm_eo == -1)
return (NULL);
assert(m->pmatch[i].rm_so != -1);
len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
if (len == 0 && rec++ > MAX_RECURSION)
return (NULL);
assert(stop - m->beginp >= len);
if (sp > stop - len)
return (NULL); /* not enough left to match */
ssp = m->offp + m->pmatch[i].rm_so;
if (memcmp(sp, ssp, len) != 0)
return (NULL);
while (m->g->strip[ss] != SOP(O_BACK, i))
ss++;
return (backref(m, sp+len, stop, ss+1, stopst, lev, rec));
break;
case OQUEST_: /* to null or not */
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return (dp); /* not */
return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
break;
case OPLUS_:
assert(m->lastpos != NULL);
assert(lev+1 <= m->g->nplus);
m->lastpos[lev+1] = sp;
return (backref(m, sp, stop, ss+1, stopst, lev+1, rec));
break;
case O_PLUS:
if (sp == m->lastpos[lev]) /* last pass matched null */
return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
/* try another pass */
m->lastpos[lev] = sp;
dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
if (dp == NULL)
return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
else
return (dp);
break;
case OCH_: /* find the right one, if any */
ssub = ss + 1;
esub = ss + OPND(s) - 1;
assert(OP(m->g->strip[esub]) == OOR1);
for (;;) { /* find first matching branch */
dp = backref(m, sp, stop, ssub, esub, lev, rec);
if (dp != NULL)
return (dp);
/* that one missed, try next one */
if (OP(m->g->strip[esub]) == O_CH)
return (NULL); /* there is none */
esub++;
assert(OP(m->g->strip[esub]) == OOR2);
ssub = esub + 1;
esub += OPND(m->g->strip[esub]);
if (OP(m->g->strip[esub]) == OOR2)
esub--;
else
assert(OP(m->g->strip[esub]) == O_CH);
}
break;
case OLPAREN: /* must undo assignment if rest fails */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_so;
m->pmatch[i].rm_so = sp - m->offp;
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return (dp);
m->pmatch[i].rm_so = offsave;
return (NULL);
break;
case ORPAREN: /* must undo assignment if rest fails */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_eo;
m->pmatch[i].rm_eo = sp - m->offp;
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return (dp);
m->pmatch[i].rm_eo = offsave;
return (NULL);
break;
default: /* uh oh */
assert(0);
break;
}
/* "can't happen" */
assert(0);
return (NULL);
}
/*
* fast - step through the string at top speed
*/
static const char *
fast(struct match *m, const char *start, const char *stop, sopno startst,
sopno stopst)
{
states st = m->st;
states fresh = m->fresh;
states tmp = m->tmp;
const char *p = start;
wint_t c;
wint_t lastc; /* previous c */
wint_t flagch;
int i;
const char *coldp; /* last p after which no match was underway */
size_t clen;
CLEAR(st);
SET1(st, startst);
SP("fast", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
ASSIGN(fresh, st);
SP("start", st, *p);
coldp = NULL;
if (start == m->beginp)
c = OUT;
else {
/*
* XXX Wrong if the previous character was multi-byte.
* Newline never is (in encodings supported by FreeBSD),
* so this only breaks the ISWORD tests below.
*/
c = (uch)*(start - 1);
}
for (;;) {
/* next character */
lastc = c;
if (p == m->endp) {
clen = 0;
c = OUT;
} else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
if (EQ(st, fresh))
coldp = p;
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
i = 0;
if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
(lastc == OUT && !(m->eflags&REG_NOTBOL))) {
flagch = BOL;
i = m->g->nbol;
}
if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
(c == OUT && !(m->eflags&REG_NOTEOL))) {
flagch = (flagch == BOL) ? BOLEOL : EOL;
i += m->g->neol;
}
if (i != 0) {
for (; i > 0; i--)
st = step(m->g, startst, stopst, st,
flagch, st);
SP("boleol", st, c);
}
/* how about a word boundary? */
if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
(c != OUT && ISWORD(c))) {
flagch = BOW;
}
if ((lastc != OUT && ISWORD(lastc)) &&
(flagch == EOL || (c != OUT && !ISWORD(c)))) {
flagch = EOW;
}
if (flagch == BOW || flagch == EOW) {
st = step(m->g, startst, stopst, st, flagch, st);
SP("boweow", st, c);
}
/* are we done? */
if (ISSET(st, stopst) || p == stop || clen > stop - p)
break; /* NOTE BREAK OUT */
/* no, we must deal with this character */
ASSIGN(tmp, st);
ASSIGN(st, fresh);
assert(c != OUT);
st = step(m->g, startst, stopst, tmp, c, st);
SP("aft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
p += clen;
}
assert(coldp != NULL);
m->coldp = coldp;
if (ISSET(st, stopst))
return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
else
return (NULL);
}
/*
* slow - step through the string more deliberately
*/
static const char *
slow(struct match *m, const char *start, const char *stop, sopno startst,
sopno stopst)
{
states st = m->st;
states empty = m->empty;
states tmp = m->tmp;
const char *p = start;
wint_t c;
wint_t lastc; /* previous c */
wint_t flagch;
int i;
const char *matchp; /* last p at which a match ended */
size_t clen;
AT("slow", start, stop, startst, stopst);
CLEAR(st);
SET1(st, startst);
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
matchp = NULL;
if (start == m->beginp)
c = OUT;
else {
/*
* XXX Wrong if the previous character was multi-byte.
* Newline never is (in encodings supported by FreeBSD),
* so this only breaks the ISWORD tests below.
*/
c = (uch)*(start - 1);
}
for (;;) {
/* next character */
lastc = c;
if (p == m->endp) {
c = OUT;
clen = 0;
} else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
i = 0;
if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
(lastc == OUT && !(m->eflags&REG_NOTBOL))) {
flagch = BOL;
i = m->g->nbol;
}
if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
(c == OUT && !(m->eflags&REG_NOTEOL))) {
flagch = (flagch == BOL) ? BOLEOL : EOL;
i += m->g->neol;
}
if (i != 0) {
for (; i > 0; i--)
st = step(m->g, startst, stopst, st,
flagch, st);
SP("sboleol", st, c);
}
/* how about a word boundary? */
if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
(c != OUT && ISWORD(c))) {
flagch = BOW;
}
if ((lastc != OUT && ISWORD(lastc)) &&
(flagch == EOL || (c != OUT && !ISWORD(c)))) {
flagch = EOW;
}
if (flagch == BOW || flagch == EOW) {
st = step(m->g, startst, stopst, st, flagch, st);
SP("sboweow", st, c);
}
/* are we done? */
if (ISSET(st, stopst))
matchp = p;
if (EQ(st, empty) || p == stop || clen > stop - p)
break; /* NOTE BREAK OUT */
/* no, we must deal with this character */
ASSIGN(tmp, st);
ASSIGN(st, empty);
assert(c != OUT);
st = step(m->g, startst, stopst, tmp, c, st);
SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
p += clen;
}
return (matchp);
}
/*
* step - map set of states reachable before char to set reachable after
*/
static states
step(struct re_guts *g,
sopno start, /* start state within strip */
sopno stop, /* state after stop state within strip */
states bef, /* states reachable before */
wint_t ch, /* character or NONCHAR code */
states aft) /* states already known reachable after */
{
cset *cs;
sop s;
sopno pc;
onestate here; /* note, macros know this name */
sopno look;
int i;
for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
s = g->strip[pc];
switch (OP(s)) {
case OEND:
assert(pc == stop-1);
break;
case OCHAR:
/* only characters can match */
assert(!NONCHAR(ch) || ch != OPND(s));
if (ch == OPND(s))
FWD(aft, bef, 1);
break;
case OBOL:
if (ch == BOL || ch == BOLEOL)
FWD(aft, bef, 1);
break;
case OEOL:
if (ch == EOL || ch == BOLEOL)
FWD(aft, bef, 1);
break;
case OBOW:
if (ch == BOW)
FWD(aft, bef, 1);
break;
case OEOW:
if (ch == EOW)
FWD(aft, bef, 1);
break;
case OANY:
if (!NONCHAR(ch))
FWD(aft, bef, 1);
break;
case OANYOF:
cs = &g->sets[OPND(s)];
if (!NONCHAR(ch) && CHIN(cs, ch))
FWD(aft, bef, 1);
break;
case OBACK_: /* ignored here */
case O_BACK:
FWD(aft, aft, 1);
break;
case OPLUS_: /* forward, this is just an empty */
FWD(aft, aft, 1);
break;
case O_PLUS: /* both forward and back */
FWD(aft, aft, 1);
i = ISSETBACK(aft, OPND(s));
BACK(aft, aft, OPND(s));
if (!i && ISSETBACK(aft, OPND(s))) {
/* oho, must reconsider loop body */
pc -= OPND(s) + 1;
INIT(here, pc);
}
break;
case OQUEST_: /* two branches, both forward */
FWD(aft, aft, 1);
FWD(aft, aft, OPND(s));
break;
case O_QUEST: /* just an empty */
FWD(aft, aft, 1);
break;
case OLPAREN: /* not significant here */
case ORPAREN:
FWD(aft, aft, 1);
break;
case OCH_: /* mark the first two branches */
FWD(aft, aft, 1);
assert(OP(g->strip[pc+OPND(s)]) == OOR2);
FWD(aft, aft, OPND(s));
break;
case OOR1: /* done a branch, find the O_CH */
if (ISSTATEIN(aft, here)) {
for (look = 1;
OP(s = g->strip[pc+look]) != O_CH;
look += OPND(s))
assert(OP(s) == OOR2);
FWD(aft, aft, look + 1);
}
break;
case OOR2: /* propagate OCH_'s marking */
FWD(aft, aft, 1);
if (OP(g->strip[pc+OPND(s)]) != O_CH) {
assert(OP(g->strip[pc+OPND(s)]) == OOR2);
FWD(aft, aft, OPND(s));
}
break;
case O_CH: /* just empty */
FWD(aft, aft, 1);
break;
default: /* ooooops... */
assert(0);
break;
}
}
return (aft);
}
#ifdef REDEBUG
/*
* print - print a set of states
*/
static void
print(struct match *m, const char *caption, states st, int ch, FILE *d)
{
struct re_guts *g = m->g;
int i;
int first = 1;
if (!(m->eflags&REG_TRACE))
return;
(void) fprintf(d, "%s", caption);
if (ch != '\0')
(void) fprintf(d, " %s", pchar(ch));
for (i = 0; i < g->nstates; i++)
if (ISSET(st, i)) {
(void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
first = 0;
}
(void) fprintf(d, "\n");
}
/*
* at - print current situation
*/
static void
at(struct match *m, const char *title, const char *start, const char *stop,
sopno startst, sopno stopst)
{
if (!(m->eflags&REG_TRACE))
return;
(void) printf("%s %s-", title, pchar(*start));
(void) printf("%s ", pchar(*stop));
(void) printf("%ld-%ld\n", (long)startst, (long)stopst);
}
#ifndef PCHARDONE
#define PCHARDONE /* never again */
/*
* pchar - make a character printable
*
* Is this identical to regchar() over in debug.c? Well, yes. But a
* duplicate here avoids having a debugging-capable regexec.o tied to
* a matching debug.o, and this is convenient. It all disappears in
* the non-debug compilation anyway, so it doesn't matter much.
*/
static const char *
pchar(int ch)
{
static char pbuf[10];
if (isprint((uch)ch) || ch == ' ')
(void) sprintf(pbuf, "%c", ch);
else
(void) sprintf(pbuf, "\\%o", ch);
return (pbuf);
}
#endif
#endif
#undef matcher
#undef fast
#undef slow
#undef dissect
#undef backref
#undef step
#undef print
#undef at
#undef match