engine.c revision 84441f85b19f6b8080883f30109e58e43c893709
1N/A * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 1N/A * Copyright (c) 1992, 1993, 1994 Henry Spencer. 1N/A * Copyright (c) 1992, 1993, 1994 1N/A * The Regents of the University of California. All rights reserved. 1N/A * This code is derived from software contributed to Berkeley by 1N/A * Redistribution and use in source and binary forms, with or without 1N/A * modification, are permitted provided that the following conditions 1N/A * 1. Redistributions of source code must retain the above copyright 1N/A * notice, this list of conditions and the following disclaimer. 1N/A * 2. Redistributions in binary form must reproduce the above copyright 1N/A * notice, this list of conditions and the following disclaimer in the 1N/A * documentation and/or other materials provided with the distribution. 1N/A * 4. Neither the name of the University nor the names of its contributors 1N/A * may be used to endorse or promote products derived from this software 1N/A * without specific prior written permission. 1N/A * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 1N/A * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1N/A * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1N/A * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 1N/A * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1N/A * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 1N/A * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 1N/A * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 1N/A * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 1N/A * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 1N/A * The matching engine and friends. This file is #included by regexec.c 1N/A * after suitable #defines of a variety of macros used herein, so that 1N/A * different state representations can be used without duplicating masses 1N/A/* another structure passed up and down to avoid zillions of parameters */ 1N/A const char *
offp;
/* offsets work from here */ 1N/A const char *
beginp;
/* start of string -- virtual NUL precedes */ 1N/A const char *
endp;
/* end of string -- virtual NUL here */ 1N/A const char *
coldp;
/* can be no match starting before here */ 1N/A/* ========= begin header generated by ./mkh ========= */ 1N/Astatic void at(
struct match *,
const char *,
const char *,
const char *,
1N/A/* ========= end header generated by ./mkh ========= */ 1N/A#
define SP(t, s, c)
/* nothing */ 1N/A * matcher - the actual matching engine 1N/Astatic int /* 0 success, REG_NOMATCH failure */ 1N/A /* Boyer-Moore algorithms variables */ 1N/A /* simplify the situation where possible */ 1N/A /* prescreening; this does wonders for this rather slow code */ 1N/A /* Fast skip non-matches */ 1N/A /* Greedy matcher */ 1N/A * We depend on not being used for 1N/A * for strings of length 1 1N/A /* Jump to next possible match */ 1N/A /* match struct setup */ 1N/A /* Adjust start according to moffset, to speed things up */ 1N/A /* this loop does only one repetition except for backrefs */ 1N/A break;
/* no further info needed */ 1N/A break;
/* no further info needed */ 1N/A /* oh my, he wants the subexpressions... */ 1N/A /* NB: FreeBSD has REG_BACKR, we do not */ 1N/A sizeof (
const char *));
1N/A /* uh-oh... we couldn't find a subexpression-level match */ 1N/A /* try it on a shorter possibility */ 1N/A for (i =
1; i <= m->g->
nsub; i++) {
1N/A /* despite initial appearances, there is no match here */ 1N/A /* recycle starting later */ 1N/A /* fill in the details if requested */ 1N/A * dissect - figure out what matched what, no back references 1N/A const char *
sp;
/* start of string matched by it */ 1N/A const char *
stp;
/* string matched by it cannot pass here */ 1N/A const char *
rest;
/* start of rest of string */ 1N/A const char *
tail;
/* string unmatched by rest of RE */ 1N/A const char *
ssp;
/* start of string matched by subsubRE */ 1N/A const char *
sep;
/* end of string matched by subsubRE */ 1N/A /* identify end of subRE */ 1N/A /* figure out what it matched */ 1N/A /* cases where length of match is hard to find */ 1N/A /* how long could this one be? */ 1N/A /* could the rest match the rest? */ 1N/A /* no -- try a shorter match for this one */ 1N/A /* did innards match? */ 1N/A /* how long could this one be? */ 1N/A /* could the rest match the rest? */ 1N/A /* no -- try a shorter match for this one */ 1N/A for (;;) {
/* find last match of innards */ 1N/A break;
/* failed or matched null */ 1N/A /* last successful match */ 1N/A /* how long could this one be? */ 1N/A /* could the rest match the rest? */ 1N/A /* no -- try a shorter match for this one */ 1N/A for (;;) {
/* find first matching branch */ 1N/A break;
/* it matched all of it */ 1N/A /* that one missed, try next one */ 1N/A default:
/* uh oh */ 1N/A * backref - figure out what matched what, figuring in back references 1N/A const char *
sp;
/* start of string matched by it */ 1N/A const char *
ssp;
/* start of string matched by subsubRE */ 1N/A /* get as far as we can with easy stuff */ case OOR1:
/* matches null but needs to skip */ /* note that the ss++ gets us past the O_CH */ default:
/* have to make a choice */ if (!
hard) {
/* that was it! */ ss--;
/* adjust for the for's final increment */ case OBACK_:
/* the vilest depths */ return (
NULL);
/* not enough left to match */ case OCH_:
/* find the right one, if any */ for (;;) {
/* find first matching branch */ /* that one missed, try next one */ return (
NULL);
/* there is none */ case OLPAREN:
/* must undo assignment if rest fails */ case ORPAREN:
/* must undo assignment if rest fails */ * fast - step through the string at top speed const char *
coldp;
/* last p after which no match was underway */ * XXX Wrong if the previous character was multi-byte. * Newline never is (in encodings supported by FreeBSD), * so this only breaks the ISWORD tests below. /* is there an EOL and/or BOL between lastc and c? */ /* how about a word boundary? */ break;
/* NOTE BREAK OUT */ /* no, we must deal with this character */ * slow - step through the string more deliberately const char *
matchp;
/* last p at which a match ended */ * XXX Wrong if the previous character was multi-byte. * Newline never is (in encodings supported by FreeBSD), * so this only breaks the ISWORD tests below. /* is there an EOL and/or BOL between lastc and c? */ /* how about a word boundary? */ break;
/* NOTE BREAK OUT */ /* no, we must deal with this character */ * step - map set of states reachable before char to set reachable after sopno stop,
/* state after stop state within strip */ wint_t ch,
/* character or NONCHAR code */ states aft)
/* states already known reachable after */ /* only characters can match */ case OBACK_:
/* ignored here */ case OPLUS_:
/* forward, this is just an empty */ case O_PLUS:
/* both forward and back */ /* oho, must reconsider loop body */ case OQUEST_:
/* two branches, both forward */ case OLPAREN:
/* not significant here */ case OCH_:
/* mark the first two branches */ case OOR1:
/* done a branch, find the O_CH */ case OOR2:
/* propagate OCH_'s marking */ case O_CH:
/* just empty */ default:
/* ooooops... */ * print - print a set of states * at - print current situation * pchar - make a character printable * Is this identical to regchar() over in debug.c? Well, yes. But a * duplicate here avoids having a debugging-capable regexec.o tied to * a matching debug.o, and this is convenient. It all disappears in * the non-debug compilation anyway, so it doesn't matter much.