port/locale/engine.c

/*
 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2012 Milan Jurik. All rights reserved.
 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
 * Copyright (c) 1992, 1993, 1994
 *  The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Henry Spencer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * The matching engine and friends.  This file is #included by regexec.c
 * after suitable #defines of a variety of macros used herein, so that
 * different state representations can be used without duplicating masses
 * of code.
 */

#ifdef SNAMES
#define matcher smatcher
#define fast    sfast
#define slow    sslow
#define dissect sdissect
#define backref sbackref
#define step    sstep
#define print   sprint
#define at  sat
#define match   smat
#endif
#ifdef LNAMES
#define matcher lmatcher
#define fast    lfast
#define slow    lslow
#define dissect ldissect
#define backref lbackref
#define step    lstep
#define print   lprint
#define at  lat
#define match   lmat
#endif
#ifdef MNAMES
#define matcher mmatcher
#define fast    mfast
#define slow    mslow
#define dissect mdissect
#define backref mbackref
#define step    mstep
#define print   mprint
#define at  mat
#define match   mmat
#endif

/* another structure passed up and down to avoid zillions of parameters */
struct match {
    struct re_guts *g;
    int eflags;
    regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
    const char *offp;   /* offsets work from here */
    const char *beginp; /* start of string -- virtual NUL precedes */
    const char *endp;   /* end of string -- virtual NUL here */
    const char *coldp;  /* can be no match starting before here */
    const char **lastpos;   /* [nplus+1] */
    STATEVARS;
    states st;      /* current states */
    states fresh;       /* states for a fresh start */
    states tmp;     /* temporary */
    states empty;       /* empty set of states */
    mbstate_t mbs;      /* multibyte conversion state */
};

/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif

/* === engine.c === */
static int matcher(struct re_guts *, const char *, size_t, regmatch_t[], int);
static const char *dissect(struct match *, const char *, const char *,
    sopno, sopno);
static const char *backref(struct match *, const char *, const char *, sopno,
    sopno, sopno, int);
static const char *fast(struct match *, const char *, const char *, sopno,
    sopno);
static const char *slow(struct match *, const char *, const char *, sopno,
    sopno);
static states step(struct re_guts *, sopno, sopno, states, wint_t, states);
#define MAX_RECURSION   100
#define BOL (OUT-1)
#define EOL (BOL-1)
#define BOLEOL  (BOL-2)
#define NOTHING (BOL-3)
#define BOW (BOL-4)
#define EOW (BOL-5)
#define BADCHAR (BOL-6)
#define NONCHAR(c)  ((c) <= OUT)
#ifdef REDEBUG
static void print(struct match *, const char *, states, int, FILE *);
#endif
#ifdef REDEBUG
static void at(struct match *, const char *, const char *, const char *,
    sopno, sopno);
#endif
#ifdef REDEBUG
static const char *pchar(int ch);
#endif

#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */

#ifdef REDEBUG
#define SP(t, s, c) print(m, t, s, c, stdout)
#define AT(t, p1, p2, s1, s2)   at(m, t, p1, p2, s1, s2)
#define NOTE(str)   { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
#else
#define SP(t, s, c) /* nothing */
#define AT(t, p1, p2, s1, s2)   /* nothing */
#define NOTE(s) /* nothing */
#endif

/*
 * matcher - the actual matching engine
 */
static int          /* 0 success, REG_NOMATCH failure */
matcher(struct re_guts *g,
    const char *string,
    size_t nmatch,
    regmatch_t pmatch[],
    int eflags)
{
    const char *endp;
    int i;
    struct match mv;
    struct match *m = &mv;
    const char *dp;
    const sopno gf = g->firststate+1;   /* +1 for OEND */
    const sopno gl = g->laststate;
    const char *start;
    const char *stop;
    /* Boyer-Moore algorithms variables */
    const char *pp;
    int cj, mj;
    const char *mustfirst;
    const char *mustlast;
    int *matchjump;
    int *charjump;

    /* simplify the situation where possible */
    if (g->cflags&REG_NOSUB)
        nmatch = 0;

    if (eflags&REG_STARTEND) {
        start = string + pmatch[0].rm_so;
        stop = string + pmatch[0].rm_eo;
    } else {
        start = string;
        stop = start + strlen(start);
    }

    if (stop < start)
        return (REG_EFATAL);

    /* prescreening; this does wonders for this rather slow code */
    if (g->must != NULL) {
        if (g->charjump != NULL && g->matchjump != NULL) {
            mustfirst = g->must;
            mustlast = g->must + g->mlen - 1;
            charjump = g->charjump;
            matchjump = g->matchjump;
            pp = mustlast;
            for (dp = start+g->mlen-1; dp < stop; ) {
                /* Fast skip non-matches */
                while (dp < stop && charjump[(int)*dp])
                    dp += charjump[(int)*dp];

                if (dp >= stop)
                    break;

                /* Greedy matcher */
                /*
                 * We depend on not being used for
                 * for strings of length 1
                 */
                while (*--dp == *--pp && pp != mustfirst)
                    ;

                if (*dp == *pp)
                    break;

                /* Jump to next possible match */
                mj = matchjump[pp - mustfirst];
                cj = charjump[(int)*dp];
                dp += (cj < mj ? mj : cj);
                pp = mustlast;
            }
            if (pp != mustfirst)
                return (REG_NOMATCH);
        } else {
            for (dp = start; dp < stop; dp++)
                if (*dp == g->must[0] &&
                    stop - dp >= g->mlen &&
                    memcmp(dp, g->must, (size_t)g->mlen) == 0)
                    break;
            if (dp == stop)     /* we didn't find g->must */
                return (REG_NOMATCH);
        }
    }

    /* match struct setup */
    m->g = g;
    m->eflags = eflags;
    m->pmatch = NULL;
    m->lastpos = NULL;
    m->offp = string;
    m->beginp = start;
    m->endp = stop;
    STATESETUP(m, 4);
    SETUP(m->st);
    SETUP(m->fresh);
    SETUP(m->tmp);
    SETUP(m->empty);
    CLEAR(m->empty);
    ZAPSTATE(&m->mbs);

    /* Adjust start according to moffset, to speed things up */
    if (g->moffset > -1)
        start = ((dp - g->moffset) < start) ? start : dp - g->moffset;

    SP("mloop", m->st, *start);

    /* this loop does only one repetition except for backrefs */
    for (;;) {
        endp = fast(m, start, stop, gf, gl);
        if (endp == NULL) {     /* a miss */
            if (m->pmatch != NULL)
                free((char *)m->pmatch);
            if (m->lastpos != NULL)
                free((char *)m->lastpos);
            STATETEARDOWN(m);
            return (REG_NOMATCH);
        }
        if (nmatch == 0 && !g->backrefs)
            break;      /* no further info needed */

        /* where? */
        assert(m->coldp != NULL);
        for (;;) {
            NOTE("finding start");
            endp = slow(m, m->coldp, stop, gf, gl);
            if (endp != NULL)
                break;
            assert(m->coldp < m->endp);
            m->coldp += XMBRTOWC(NULL, m->coldp,
                m->endp - m->coldp, &m->mbs, 0);
        }
        if (nmatch == 1 && !g->backrefs)
            break;      /* no further info needed */

        /* oh my, he wants the subexpressions... */
        if (m->pmatch == NULL)
            m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
                sizeof (regmatch_t));
        if (m->pmatch == NULL) {
            STATETEARDOWN(m);
            return (REG_ESPACE);
        }
        for (i = 1; i <= m->g->nsub; i++)
            m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
        /* NB: FreeBSD has REG_BACKR, we do not */
        if (!g->backrefs /* && !(m->eflags&REG_BACKR) */) {
            NOTE("dissecting");
            dp = dissect(m, m->coldp, endp, gf, gl);
        } else {
            if (g->nplus > 0 && m->lastpos == NULL)
                m->lastpos = malloc((g->nplus+1) *
                    sizeof (const char *));
            if (g->nplus > 0 && m->lastpos == NULL) {
                free(m->pmatch);
                STATETEARDOWN(m);
                return (REG_ESPACE);
            }
            NOTE("backref dissect");
            dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
        }
        if (dp != NULL)
            break;

        /* uh-oh... we couldn't find a subexpression-level match */
        assert(g->backrefs);    /* must be back references doing it */
        assert(g->nplus == 0 || m->lastpos != NULL);
        for (;;) {
            if (dp != NULL || endp <= m->coldp)
                break;      /* defeat */
            NOTE("backoff");
            endp = slow(m, m->coldp, endp-1, gf, gl);
            if (endp == NULL)
                break;      /* defeat */
            /* try it on a shorter possibility */
#ifndef NDEBUG
            for (i = 1; i <= m->g->nsub; i++) {
                assert(m->pmatch[i].rm_so == -1);
                assert(m->pmatch[i].rm_eo == -1);
            }
#endif
            NOTE("backoff dissect");
            dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
        }
        assert(dp == NULL || dp == endp);
        if (dp != NULL)     /* found a shorter one */
            break;

        /* despite initial appearances, there is no match here */
        NOTE("false alarm");
        /* recycle starting later */
        start = m->coldp + XMBRTOWC(NULL, m->coldp,
            stop - m->coldp, &m->mbs, 0);
        assert(start <= stop);
    }

    /* fill in the details if requested */
    if (nmatch > 0) {
        pmatch[0].rm_so = m->coldp - m->offp;
        pmatch[0].rm_eo = endp - m->offp;
    }
    if (nmatch > 1) {
        assert(m->pmatch != NULL);
        for (i = 1; i < nmatch; i++)
            if (i <= m->g->nsub)
                pmatch[i] = m->pmatch[i];
            else {
                pmatch[i].rm_so = -1;
                pmatch[i].rm_eo = -1;
            }
    }

    if (m->pmatch != NULL)
        free((char *)m->pmatch);
    if (m->lastpos != NULL)
        free((char *)m->lastpos);
    STATETEARDOWN(m);
    return (0);
}

/*
 * dissect - figure out what matched what, no back references
 */
static const char *
dissect(struct match *m, const char *start, const char *stop, sopno startst,
    sopno stopst)
{
    int i;
    sopno ss;       /* start sop of current subRE */
    sopno es;       /* end sop of current subRE */
    const char *sp;     /* start of string matched by it */
    const char *stp;    /* string matched by it cannot pass here */
    const char *rest;   /* start of rest of string */
    const char *tail;   /* string unmatched by rest of RE */
    sopno ssub;     /* start sop of subsubRE */
    sopno esub;     /* end sop of subsubRE */
    const char *ssp;    /* start of string matched by subsubRE */
    const char *sep;    /* end of string matched by subsubRE */
    const char *oldssp; /* previous ssp */
    const char *dp;

    AT("diss", start, stop, startst, stopst);
    sp = start;
    for (ss = startst; ss < stopst; ss = es) {
        /* identify end of subRE */
        es = ss;
        switch (OP(m->g->strip[es])) {
        case OPLUS_:
        case OQUEST_:
            es += OPND(m->g->strip[es]);
            break;
        case OCH_:
            while (OP(m->g->strip[es]) != O_CH)
                es += OPND(m->g->strip[es]);
            break;
        }
        es++;

        /* figure out what it matched */
        switch (OP(m->g->strip[ss])) {
        case OEND:
            assert(0);
            break;
        case OCHAR:
            sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
            break;
        case OBOL:
        case OEOL:
        case OBOW:
        case OEOW:
            break;
        case OANY:
        case OANYOF:
            sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
            break;
        case OBACK_:
        case O_BACK:
            assert(0);
            break;
        /* cases where length of match is hard to find */
        case OQUEST_:
            stp = stop;
            for (;;) {
                /* how long could this one be? */
                rest = slow(m, sp, stp, ss, es);
                assert(rest != NULL);   /* it did match */
                /* could the rest match the rest? */
                tail = slow(m, rest, stop, es, stopst);
                if (tail == stop)
                    break;      /* yes! */
                /* no -- try a shorter match for this one */
                stp = rest - 1;
                assert(stp >= sp);  /* it did work */
            }
            ssub = ss + 1;
            esub = es - 1;
            /* did innards match? */
            if (slow(m, sp, rest, ssub, esub) != NULL) {
                dp = dissect(m, sp, rest, ssub, esub);
                assert(dp == rest);
#if defined(__lint)
                (void) dp;
#endif
            } else      /* no */
                assert(sp == rest);
            sp = rest;
            break;
        case OPLUS_:
            stp = stop;
            for (;;) {
                /* how long could this one be? */
                rest = slow(m, sp, stp, ss, es);
                assert(rest != NULL);   /* it did match */
                /* could the rest match the rest? */
                tail = slow(m, rest, stop, es, stopst);
                if (tail == stop)
                    break;      /* yes! */
                /* no -- try a shorter match for this one */
                stp = rest - 1;
                assert(stp >= sp);  /* it did work */
            }
            ssub = ss + 1;
            esub = es - 1;
            ssp = sp;
            oldssp = ssp;
            for (;;) {  /* find last match of innards */
                sep = slow(m, ssp, rest, ssub, esub);
                if (sep == NULL || sep == ssp)
                    break;  /* failed or matched null */
                oldssp = ssp;   /* on to next try */
                ssp = sep;
            }
            if (sep == NULL) {
                /* last successful match */
                sep = ssp;
                ssp = oldssp;
            }
            assert(sep == rest);    /* must exhaust substring */
            assert(slow(m, ssp, sep, ssub, esub) == rest);
            dp = dissect(m, ssp, sep, ssub, esub);
            assert(dp == sep);
            sp = rest;
            break;
        case OCH_:
            stp = stop;
            for (;;) {
                /* how long could this one be? */
                rest = slow(m, sp, stp, ss, es);
                assert(rest != NULL);   /* it did match */
                /* could the rest match the rest? */
                tail = slow(m, rest, stop, es, stopst);
                if (tail == stop)
                    break;      /* yes! */
                /* no -- try a shorter match for this one */
                stp = rest - 1;
                assert(stp >= sp);  /* it did work */
            }
            ssub = ss + 1;
            esub = ss + OPND(m->g->strip[ss]) - 1;
            assert(OP(m->g->strip[esub]) == OOR1);
            for (;;) {  /* find first matching branch */
                if (slow(m, sp, rest, ssub, esub) == rest)
                    break;  /* it matched all of it */
                /* that one missed, try next one */
                assert(OP(m->g->strip[esub]) == OOR1);
                esub++;
                assert(OP(m->g->strip[esub]) == OOR2);
                ssub = esub + 1;
                esub += OPND(m->g->strip[esub]);
                if (OP(m->g->strip[esub]) == OOR2)
                    esub--;
                else
                    assert(OP(m->g->strip[esub]) == O_CH);
            }
            dp = dissect(m, sp, rest, ssub, esub);
            assert(dp == rest);
            sp = rest;
            break;
        case O_PLUS:
        case O_QUEST:
        case OOR1:
        case OOR2:
        case O_CH:
            assert(0);
            break;
        case OLPAREN:
            i = OPND(m->g->strip[ss]);
            assert(0 < i && i <= m->g->nsub);
            m->pmatch[i].rm_so = sp - m->offp;
            break;
        case ORPAREN:
            i = OPND(m->g->strip[ss]);
            assert(0 < i && i <= m->g->nsub);
            m->pmatch[i].rm_eo = sp - m->offp;
            break;
        default:        /* uh oh */
            assert(0);
            break;
        }
    }

    assert(sp == stop);
    return (sp);
}

/*
 * backref - figure out what matched what, figuring in back references
 */
static const char *
backref(struct match *m, const char *start, const char *stop, sopno startst,
    sopno stopst, sopno lev,        /* PLUS nesting level */
    int rec)
{
    int i;
    sopno ss;       /* start sop of current subRE */
    const char *sp;     /* start of string matched by it */
    sopno ssub;     /* start sop of subsubRE */
    sopno esub;     /* end sop of subsubRE */
    const char *ssp;    /* start of string matched by subsubRE */
    const char *dp;
    size_t len;
    int hard;
    sop s;
    regoff_t offsave;
    cset *cs;
    wint_t wc;

    AT("back", start, stop, startst, stopst);
    sp = start;

    /* get as far as we can with easy stuff */
    hard = 0;
    for (ss = startst; !hard && ss < stopst; ss++)
        switch (OP(s = m->g->strip[ss])) {
        case OCHAR:
            if (sp == stop)
                return (NULL);
            sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
            if (wc != OPND(s))
                return (NULL);
            break;
        case OANY:
            if (sp == stop)
                return (NULL);
            sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
            if (wc == BADCHAR)
                return (NULL);
            break;
        case OANYOF:
            if (sp == stop)
                return (NULL);
            cs = &m->g->sets[OPND(s)];
            sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
            if (wc == BADCHAR || !CHIN(cs, wc))
                return (NULL);
            break;
        case OBOL:
            if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
                (sp < m->endp && *(sp-1) == '\n' &&
                (m->g->cflags&REG_NEWLINE))) {
                break;
            }
            return (NULL);
        case OEOL:
            if ((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
                (sp < m->endp && *sp == '\n' &&
                (m->g->cflags&REG_NEWLINE))) {
                break;
            }
            return (NULL);
        case OBOW:
            if (((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
                (sp < m->endp && *(sp-1) == '\n' &&
                (m->g->cflags&REG_NEWLINE)) ||
                (sp > m->beginp && !ISWORD(*(sp-1)))) &&
                (sp < m->endp && ISWORD(*sp))) {
                break;
            }
            return (NULL);
        case OEOW:
            if (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
                (sp < m->endp && *sp == '\n' &&
                (m->g->cflags&REG_NEWLINE)) ||
                (sp < m->endp && !ISWORD(*sp))) &&
                (sp > m->beginp && ISWORD(*(sp-1)))) {
                break;
            }
            return (NULL);
        case O_QUEST:
            break;
        case OOR1:  /* matches null but needs to skip */
            ss++;
            s = m->g->strip[ss];
            do {
                assert(OP(s) == OOR2);
                ss += OPND(s);
            } while (OP(s = m->g->strip[ss]) != O_CH);
            /* note that the ss++ gets us past the O_CH */
            break;
        default:    /* have to make a choice */
            hard = 1;
            break;
        }
    if (!hard) {        /* that was it! */
        if (sp != stop)
            return (NULL);
        return (sp);
    }
    ss--;           /* adjust for the for's final increment */

    /* the hard stuff */
    AT("hard", sp, stop, ss, stopst);
    s = m->g->strip[ss];
    switch (OP(s)) {
    case OBACK_:        /* the vilest depths */
        i = OPND(s);
        assert(0 < i && i <= m->g->nsub);
        if (m->pmatch[i].rm_eo == -1)
            return (NULL);
        assert(m->pmatch[i].rm_so != -1);
        len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
        if (len == 0 && rec++ > MAX_RECURSION)
            return (NULL);
        assert(stop - m->beginp >= len);
        if (sp > stop - len)
            return (NULL);  /* not enough left to match */
        ssp = m->offp + m->pmatch[i].rm_so;
        if (memcmp(sp, ssp, len) != 0)
            return (NULL);
        while (m->g->strip[ss] != SOP(O_BACK, i))
            ss++;
        return (backref(m, sp+len, stop, ss+1, stopst, lev, rec));
    case OQUEST_:       /* to null or not */
        dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
        if (dp != NULL)
            return (dp);    /* not */
        return (backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
    case OPLUS_:
        assert(m->lastpos != NULL);
        assert(lev+1 <= m->g->nplus);
        m->lastpos[lev+1] = sp;
        return (backref(m, sp, stop, ss+1, stopst, lev+1, rec));
    case O_PLUS:
        if (sp == m->lastpos[lev])  /* last pass matched null */
            return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
        /* try another pass */
        m->lastpos[lev] = sp;
        dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
        if (dp == NULL)
            return (backref(m, sp, stop, ss+1, stopst, lev-1, rec));
        return (dp);
    case OCH_:      /* find the right one, if any */
        ssub = ss + 1;
        esub = ss + OPND(s) - 1;
        assert(OP(m->g->strip[esub]) == OOR1);
        for (;;) {  /* find first matching branch */
            dp = backref(m, sp, stop, ssub, esub, lev, rec);
            if (dp != NULL)
                return (dp);
            /* that one missed, try next one */
            if (OP(m->g->strip[esub]) == O_CH)
                return (NULL);  /* there is none */
            esub++;
            assert(OP(m->g->strip[esub]) == OOR2);
            ssub = esub + 1;
            esub += OPND(m->g->strip[esub]);
            if (OP(m->g->strip[esub]) == OOR2)
                esub--;
            else
                assert(OP(m->g->strip[esub]) == O_CH);
        }
        /* NOTREACHED */
        break;
    case OLPAREN:       /* must undo assignment if rest fails */
        i = OPND(s);
        assert(0 < i && i <= m->g->nsub);
        offsave = m->pmatch[i].rm_so;
        m->pmatch[i].rm_so = sp - m->offp;
        dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
        if (dp != NULL)
            return (dp);
        m->pmatch[i].rm_so = offsave;
        return (NULL);
    case ORPAREN:       /* must undo assignment if rest fails */
        i = OPND(s);
        assert(0 < i && i <= m->g->nsub);
        offsave = m->pmatch[i].rm_eo;
        m->pmatch[i].rm_eo = sp - m->offp;
        dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
        if (dp != NULL)
            return (dp);
        m->pmatch[i].rm_eo = offsave;
        return (NULL);
    default:        /* uh oh */
        assert(0);
        break;
    }

    /* "can't happen" */
    assert(0);
    return (NULL);
}

/*
 * fast - step through the string at top speed
 */
static const char *
fast(struct match *m, const char *start, const char *stop, sopno startst,
    sopno stopst)
{
    states st = m->st;
    states fresh = m->fresh;
    states tmp = m->tmp;
    const char *p = start;
    wint_t c;
    wint_t lastc;       /* previous c */
    wint_t flagch;
    int i;
    const char *coldp;  /* last p after which no match was underway */
    size_t clen;

    CLEAR(st);
    SET1(st, startst);
    SP("fast", st, *p);
    st = step(m->g, startst, stopst, st, NOTHING, st);
    ASSIGN(fresh, st);
    SP("start", st, *p);
    coldp = NULL;
    if (start == m->beginp)
        c = OUT;
    else {
        /*
         * XXX Wrong if the previous character was multi-byte.
         * Newline never is (in encodings supported by FreeBSD),
         * so this only breaks the ISWORD tests below.
         */
        c = (uch)*(start - 1);
    }
    for (;;) {
        /* next character */
        lastc = c;
        if (p == m->endp) {
            clen = 0;
            c = OUT;
        } else
            clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
        if (EQ(st, fresh))
            coldp = p;

        /* is there an EOL and/or BOL between lastc and c? */
        flagch = '\0';
        i = 0;
        if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
            (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
            flagch = BOL;
            i = m->g->nbol;
        }
        if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
            (c == OUT && !(m->eflags&REG_NOTEOL))) {
            flagch = (flagch == BOL) ? BOLEOL : EOL;
            i += m->g->neol;
        }
        if (i != 0) {
            for (; i > 0; i--)
                st = step(m->g, startst, stopst, st,
                    flagch, st);
            SP("boleol", st, c);
        }

        /* how about a word boundary? */
        if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
            (c != OUT && ISWORD(c))) {
            flagch = BOW;
        }
        if ((lastc != OUT && ISWORD(lastc)) &&
            (flagch == EOL || (c != OUT && !ISWORD(c)))) {
            flagch = EOW;
        }
        if (flagch == BOW || flagch == EOW) {
            st = step(m->g, startst, stopst, st, flagch, st);
            SP("boweow", st, c);
        }

        /* are we done? */
        if (ISSET(st, stopst) || p == stop || clen > stop - p)
            break;      /* NOTE BREAK OUT */

        /* no, we must deal with this character */
        ASSIGN(tmp, st);
        ASSIGN(st, fresh);
        assert(c != OUT);
        st = step(m->g, startst, stopst, tmp, c, st);
        SP("aft", st, c);
        assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
        p += clen;
    }

    assert(coldp != NULL);
    m->coldp = coldp;
    if (ISSET(st, stopst))
        return (p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
    else
        return (NULL);
}

/*
 * slow - step through the string more deliberately
 */
static const char *
slow(struct match *m, const char *start, const char *stop, sopno startst,
    sopno stopst)
{
    states st = m->st;
    states empty = m->empty;
    states tmp = m->tmp;
    const char *p = start;
    wint_t c;
    wint_t lastc;       /* previous c */
    wint_t flagch;
    int i;
    const char *matchp; /* last p at which a match ended */
    size_t clen;

    AT("slow", start, stop, startst, stopst);
    CLEAR(st);
    SET1(st, startst);
    SP("sstart", st, *p);
    st = step(m->g, startst, stopst, st, NOTHING, st);
    matchp = NULL;
    if (start == m->beginp)
        c = OUT;
    else {
        /*
         * XXX Wrong if the previous character was multi-byte.
         * Newline never is (in encodings supported by FreeBSD),
         * so this only breaks the ISWORD tests below.
         */
        c = (uch)*(start - 1);
    }
    for (;;) {
        /* next character */
        lastc = c;
        if (p == m->endp) {
            c = OUT;
            clen = 0;
        } else
            clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);

        /* is there an EOL and/or BOL between lastc and c? */
        flagch = '\0';
        i = 0;
        if ((lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
            (lastc == OUT && !(m->eflags&REG_NOTBOL))) {
            flagch = BOL;
            i = m->g->nbol;
        }
        if ((c == '\n' && m->g->cflags&REG_NEWLINE) ||
            (c == OUT && !(m->eflags&REG_NOTEOL))) {
            flagch = (flagch == BOL) ? BOLEOL : EOL;
            i += m->g->neol;
        }
        if (i != 0) {
            for (; i > 0; i--)
                st = step(m->g, startst, stopst, st,
                    flagch, st);
            SP("sboleol", st, c);
        }

        /* how about a word boundary? */
        if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
            (c != OUT && ISWORD(c))) {
            flagch = BOW;
        }
        if ((lastc != OUT && ISWORD(lastc)) &&
            (flagch == EOL || (c != OUT && !ISWORD(c)))) {
            flagch = EOW;
        }
        if (flagch == BOW || flagch == EOW) {
            st = step(m->g, startst, stopst, st, flagch, st);
            SP("sboweow", st, c);
        }

        /* are we done? */
        if (ISSET(st, stopst))
            matchp = p;
        if (EQ(st, empty) || p == stop || clen > stop - p)
            break;      /* NOTE BREAK OUT */

        /* no, we must deal with this character */
        ASSIGN(tmp, st);
        ASSIGN(st, empty);
        assert(c != OUT);
        st = step(m->g, startst, stopst, tmp, c, st);
        SP("saft", st, c);
        assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
        p += clen;
    }

    return (matchp);
}


/*
 * step - map set of states reachable before char to set reachable after
 */
static states
step(struct re_guts *g,
    sopno start,        /* start state within strip */
    sopno stop,     /* state after stop state within strip */
    states bef,     /* states reachable before */
    wint_t ch,      /* character or NONCHAR code */
    states aft)     /* states already known reachable after */
{
    cset *cs;
    sop s;
    sopno pc;
    onestate here;      /* note, macros know this name */
    sopno look;
    int i;

    for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
        s = g->strip[pc];
        switch (OP(s)) {
        case OEND:
            assert(pc == stop-1);
            break;
        case OCHAR:
            /* only characters can match */
            assert(!NONCHAR(ch) || ch != OPND(s));
            if (ch == OPND(s))
                FWD(aft, bef, 1);
            break;
        case OBOL:
            if (ch == BOL || ch == BOLEOL)
                FWD(aft, bef, 1);
            break;
        case OEOL:
            if (ch == EOL || ch == BOLEOL)
                FWD(aft, bef, 1);
            break;
        case OBOW:
            if (ch == BOW)
                FWD(aft, bef, 1);
            break;
        case OEOW:
            if (ch == EOW)
                FWD(aft, bef, 1);
            break;
        case OANY:
            if (!NONCHAR(ch))
                FWD(aft, bef, 1);
            break;
        case OANYOF:
            cs = &g->sets[OPND(s)];
            if (!NONCHAR(ch) && CHIN(cs, ch))
                FWD(aft, bef, 1);
            break;
        case OBACK_:        /* ignored here */
        case O_BACK:
            FWD(aft, aft, 1);
            break;
        case OPLUS_:        /* forward, this is just an empty */
            FWD(aft, aft, 1);
            break;
        case O_PLUS:        /* both forward and back */
            FWD(aft, aft, 1);
            i = ISSETBACK(aft, OPND(s));
            BACK(aft, aft, OPND(s));
            if (!i && ISSETBACK(aft, OPND(s))) {
                /* oho, must reconsider loop body */
                pc -= OPND(s) + 1;
                INIT(here, pc);
            }
            break;
        case OQUEST_:       /* two branches, both forward */
            FWD(aft, aft, 1);
            FWD(aft, aft, OPND(s));
            break;
        case O_QUEST:       /* just an empty */
            FWD(aft, aft, 1);
            break;
        case OLPAREN:       /* not significant here */
        case ORPAREN:
            FWD(aft, aft, 1);
            break;
        case OCH_:      /* mark the first two branches */
            FWD(aft, aft, 1);
            assert(OP(g->strip[pc+OPND(s)]) == OOR2);
            FWD(aft, aft, OPND(s));
            break;
        case OOR1:      /* done a branch, find the O_CH */
            if (ISSTATEIN(aft, here)) {
                for (look = 1;
                    OP(s = g->strip[pc+look]) != O_CH;
                    look += OPND(s))
                    assert(OP(s) == OOR2);
                FWD(aft, aft, look + 1);
            }
            break;
        case OOR2:      /* propagate OCH_'s marking */
            FWD(aft, aft, 1);
            if (OP(g->strip[pc+OPND(s)]) != O_CH) {
                assert(OP(g->strip[pc+OPND(s)]) == OOR2);
                FWD(aft, aft, OPND(s));
            }
            break;
        case O_CH:      /* just empty */
            FWD(aft, aft, 1);
            break;
        default:        /* ooooops... */
            assert(0);
            break;
        }
    }

    return (aft);
}

#ifdef REDEBUG
/*
 * print - print a set of states
 */
static void
print(struct match *m, const char *caption, states st, int ch, FILE *d)
{
    struct re_guts *g = m->g;
    int i;
    int first = 1;

    if (!(m->eflags&REG_TRACE))
        return;

    (void) fprintf(d, "%s", caption);
    if (ch != '\0')
        (void) fprintf(d, " %s", pchar(ch));
    for (i = 0; i < g->nstates; i++)
        if (ISSET(st, i)) {
            (void) fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
            first = 0;
        }
    (void) fprintf(d, "\n");
}

/*
 * at - print current situation
 */
static void
at(struct match *m, const char *title, const char *start, const char *stop,
    sopno startst, sopno stopst)
{
    if (!(m->eflags&REG_TRACE))
        return;

    (void) printf("%s %s-", title, pchar(*start));
    (void) printf("%s ", pchar(*stop));
    (void) printf("%ld-%ld\n", (long)startst, (long)stopst);
}

#ifndef PCHARDONE
#define PCHARDONE   /* never again */
/*
 * pchar - make a character printable
 *
 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
 * duplicate here avoids having a debugging-capable regexec.o tied to
 * a matching debug.o, and this is convenient.  It all disappears in
 * the non-debug compilation anyway, so it doesn't matter much.
 */
static const char *
pchar(int ch)
{
    static char pbuf[10];

    if (isprint((uch)ch) || ch == ' ')
        (void) sprintf(pbuf, "%c", ch);
    else
        (void) sprintf(pbuf, "\\%o", ch);
    return (pbuf);
}
#endif
#endif

#undef  matcher
#undef  fast
#undef  slow
#undef  dissect
#undef  backref
#undef  step
#undef  print
#undef  at
#undef  match