/* regcomp.c
*/
/*
* "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
*/
/* NOTE: this is derived from Henry Spencer's regexp code, and should not
* confused with the original package (see point 3 below). Thanks, Henry!
*/
/* Additional note: this code is very heavily munged from Henry's version
* in places. In some spots I've traded clarity for efficiency, so don't
* blame Henry for some of the lack of readability.
*/
/* The names of the functions have been changed from regcomp and
* regexec to pregcomp and pregexec in order to avoid conflicts
* with the POSIX routines of the same names.
*/
#ifdef PERL_EXT_RE_BUILD
/* need to replace pregcomp et al, so enable that */
# ifndef PERL_IN_XSUB_RE
# define PERL_IN_XSUB_RE
# endif
/* need access to debugger hooks */
# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
# define DEBUGGING
# endif
#endif
#ifdef PERL_IN_XSUB_RE
/* We *really* need to overwrite these symbols: */
/* *These* symbols are masked to allow static link. */
# define PERL_NO_GET_CONTEXT
#endif
/*SUPPRESS 112*/
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
* Copyright (c) 1986 by University of Toronto.
* Written by Henry Spencer. Not derived from licensed software.
*
* Permission is granted to anyone to use this software for any
* purpose on any computer system, and to redistribute it freely,
* subject to the following restrictions:
*
* 1. The author is not responsible for the consequences of use of
* this software, no matter how awful, even if they arise
* from defects in it.
*
* 2. The origin of this software must not be misrepresented, either
* by explicit claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
*
*
**** Alterations to Henry's code are...
****
**** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
**** 2000, 2001, 2002, 2003, by Larry Wall and others
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
*
* Beware that some of this code is subtly aware of the way operator
* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
*/
#include "EXTERN.h"
#define PERL_IN_REGCOMP_C
#include "perl.h"
#ifndef PERL_IN_XSUB_RE
# include "INTERN.h"
#endif
#define REG_COMP_C
#include "regcomp.h"
#ifdef op
#endif /* op */
#ifdef MSDOS
# if defined(BUGGY_MSC6)
/* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
# endif /* BUGGY_MSC6 */
#endif /* MSDOS */
#ifndef STATIC
#define STATIC static
#endif
typedef struct RExC_state_t {
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
#if ADD_TO_REGEXEC
#endif
} RExC_state_t;
((*s) == '{' && regcurly(s)))
#ifdef SPSTART
#endif
/*
* Flags to be passed up and down.
*/
/* Length of a variant. */
typedef struct scan_data_t {
} scan_data_t;
/*
* Forward declarations for pregcomp()'s friends.
*/
0, 0, 0, 0, 0, 0};
#ifdef NO_UNARY_PLUS
#else
#endif
/* length of regex to show in messages that don't mark a position within */
/*
* If MARKER[12] are adjusted, be sure to adjust the constants at the top
*/
/*
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
* arg. Show regex, up to a maximum length. If it's too long, chop and add
* "...".
*/
char *ellipses = ""; \
\
if (!SIZE_ONLY) \
if (len > RegexLengthToShowInErrorMessages) { \
/* chop 10 shorter than the max, to ensure meaning of "..." */ \
ellipses = "..."; \
} \
} STMT_END
/*
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
* args. Show regex, up to a maximum length. If it's too long, chop and add
* "...".
*/
char *ellipses = ""; \
\
if (!SIZE_ONLY) \
if (len > RegexLengthToShowInErrorMessages) { \
/* chop 10 shorter than the max, to ensure meaning of "..." */ \
ellipses = "..."; \
} \
} STMT_END
/*
* Simple_vFAIL -- like FAIL, but marks the current location in the scan
*/
} STMT_END
/*
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
*/
if (!SIZE_ONLY) \
Simple_vFAIL(m); \
} STMT_END
/*
* Like Simple_vFAIL(), but accepts two arguments.
*/
} STMT_END
/*
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
*/
if (!SIZE_ONLY) \
Simple_vFAIL2(m, a1); \
} STMT_END
/*
* Like Simple_vFAIL(), but accepts three arguments.
*/
} STMT_END
/*
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
*/
if (!SIZE_ONLY) \
} STMT_END
/*
* Like Simple_vFAIL(), but accepts four arguments.
*/
} STMT_END
/*
* Like Simple_vFAIL(), but accepts five arguments.
*/
} STMT_END
} STMT_END
"%s" REPORT_LOCATION, \
} STMT_END
} STMT_END
} STMT_END
} STMT_END
} STMT_END
/* Allow for side effects in s */
if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
} STMT_END
/* Macros for recording node offsets. 20001227 mjd@plover.com
* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
* element 2*n-1 of the array. Element #2n holds the byte length node #n.
* Element 0 holds the number n.
*/
#define MJD_OFFSET_DEBUG(x)
/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
if((node) < 0) { \
} else { \
} \
} \
} STMT_END
if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
if((node) < 0) { \
} else { \
} \
} \
} STMT_END
/* Get offsets and lengths */
/* Mark that we cannot extend a found fixed substring at this point.
Updata the longest found anchored substring and the longest found
floating substrings if needed. */
STATIC void
{
else
}
else {
data->offset_float_max = (l
else
}
}
{
}
}
/* Can match anything (initialization) */
STATIC void
{
if (LOC)
}
/* Can match anything (initialization) */
STATIC int
{
int value;
return 1;
return 0;
if (!ANYOF_BITMAP_TESTALLSET(cl))
return 0;
return 1;
}
/* Can match anything (initialization) */
STATIC void
{
}
STATIC void
{
if (LOC)
}
/* 'And' a given class with another one. Can create false positives */
/* We assume that cl is not inverted */
STATIC void
struct regnode_charclass_class *and_with)
{
int i;
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
else
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
}
}
/* 'OR' a given class with another one. Can create false positives */
/* We assume that cl is not inverted */
STATIC void
S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
{
/* We do not use
* (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
* <= (B1 | !B2) | (CL1 | !CL2)
* which is wasteful if CL2 is small, but we ignore CL2:
* (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
* XXXX Can we handle case-fold? Unclear:
* (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
* (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
*/
int i;
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
} /* XXXX: logic is complicated otherwise */
else {
}
} else {
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
int i;
/* OR char bitmap and class bitmap separately */
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
}
}
else { /* XXXX: logic is complicated, leave it along for a moment. */
}
}
}
}
}
/*
* There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
* These need to be revisited when a newer toolchain becomes available.
*/
#if defined(__sparc64__) && defined(__GNUC__)
# endif
#endif
/* REx optimizer. Converts nodes into quickier variants "in place".
Finds fixed substrings. */
/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
to the position after last scanned or to NULL. */
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
{
/* Peephole optimizer: */
/* Merge several consecutive EXACTish nodes into one. */
#ifdef DEBUGGING
#endif
/* Skip NOTHING, merge EXACT*. */
while (n &&
&& NEXT_OFF(n)
stringok = 0;
next = n + NODE_STEP_REGNODE;
#ifdef DEBUGGING
if (stringok)
stop = n;
#endif
n = regnext(n);
}
else if (stringok) {
break;
next = n + NODE_SZ_STR(n);
/* Now we can overwrite *n : */
#ifdef DEBUGGING
#endif
n = nnext;
}
}
/*
Two problematic code points in Unicode casefolding of EXACT nodes:
U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
which casefold to
Unicode UTF-8
U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
This means that in case-insensitive matching (or "loose matching",
as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
length of the above casefolded versions) can match a target string
of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
This would rather mess up the minimum length computation.
What we'll do is to look for the tail four bytes, and then peek
at the preceding two bytes to see whether we need to decrease
the minimum length by four (six minus two).
Thanks to the design of UTF-8, there cannot be false matches:
A sequence of valid UTF-8 bytes cannot be a subsequence of
another valid sequence of UTF-8 bytes.
*/
for (s = s0 + 2;
s = t + 4) {
min -= 4;
}
}
#ifdef DEBUGGING
/* Allow dumping */
while (n <= stop) {
NEXT_OFF(n) = 0;
}
n++;
}
#endif
}
/* Follow the next-chain of the current node and optimize
away all the NOTHINGs from it. */
? I32_MAX
/* I32 may be smaller than U16 on CRAYs! */
int noff;
/* Skip NOTHING and LONGJMP. */
while ((n = regnext(n))
else
}
/* The principal pseudo-switch. Cannot be a switch, since we
look into several different things. */
if (flags & SCF_DO_STCLASS)
num++;
if (data) {
}
else
if (flags & SCF_DO_STCLASS) {
f = SCF_DO_STCLASS_AND;
}
if (flags & SCF_WHILEM_VISITED_POS)
f |= SCF_WHILEM_VISITED_POS;
/* we suppose the run is continuous, last=next...*/
pars++;
if (data)
if (flags & SCF_DO_STCLASS)
break;
}
min1 = 0;
if (flags & SCF_DO_SUBSTR) {
}
if (flags & SCF_DO_STCLASS_OR) {
if (min1) {
flags &= ~SCF_DO_STCLASS;
}
}
else if (flags & SCF_DO_STCLASS_AND) {
if (min1) {
flags &= ~SCF_DO_STCLASS;
}
else {
/* Switch to OR mode: cache the old value of
* data->start_class */
struct regnode_charclass_class);
flags &= ~SCF_DO_STCLASS_AND;
struct regnode_charclass_class);
}
}
}
else /* single branch is optimized. */
continue;
}
if (UTF) {
l = utf8_length(s, s + l);
}
min += l;
/* The code below prefers earlier match for fixed
offset, later match for variable offset. */
}
{
}
if (UTF)
}
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
if (uc >= 0x100 ||
)
compat = 0;
if (compat)
if (uc < 0x100)
}
else if (flags & SCF_DO_STCLASS_OR) {
/* false positive possible if the class is case-folded */
if (uc < 0x100)
else
}
flags &= ~SCF_DO_STCLASS;
}
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR)
if (UTF) {
l = utf8_length(s, s + l);
}
min += l;
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
if (uc >= 0x100 ||
compat = 0;
if (compat) {
}
}
else if (flags & SCF_DO_STCLASS_OR) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
if (uc < 0x100)
}
}
flags &= ~SCF_DO_STCLASS;
}
case WHILEM: /* End of (?:...)* . */
goto finish;
case PLUS:
mincount = 1;
goto do_curly;
}
}
if (flags & SCF_DO_SUBSTR)
min++;
/* Fall through. */
case STAR:
if (flags & SCF_DO_STCLASS) {
mincount = 0;
goto do_curly;
}
if (flags & SCF_DO_SUBSTR) {
}
goto optimize_curly_tail;
case CURLY:
}
if (flags & SCF_DO_SUBSTR) {
}
if (data) {
if (is_inf)
}
if (flags & SCF_DO_STCLASS) {
f |= SCF_DO_STCLASS_AND;
f &= ~SCF_DO_STCLASS_OR;
}
/* These are the cases when once a subexpression
fails at a particular position, it cannot succeed
even after backtracking at the enclosing scope.
XXXX what if minimal match and we are at the
initial run of {n,m}? */
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
mincount == 0
? (f & ~SCF_DO_SUBSTR) : f);
if (flags & SCF_DO_STCLASS)
if (flags & SCF_DO_STCLASS_OR) {
}
else if (flags & SCF_DO_STCLASS_AND) {
/* Switch to OR mode: cache the old value of
* data->start_class */
struct regnode_charclass_class);
flags &= ~SCF_DO_STCLASS_AND;
struct regnode_charclass_class);
}
} else { /* Non-zero len */
if (flags & SCF_DO_STCLASS_OR) {
}
else if (flags & SCF_DO_STCLASS_AND)
flags &= ~SCF_DO_STCLASS;
}
if (!scan) /* It was not CURLYX, but CURLY. */
if (ckWARN(WARN_REGEXP)
/* ? quantifier ok, except for (?{ ... }) */
{
"Quantifier unexpected on zero-length expression");
}
/* Try powerful optimization CURLYX => CURLYN. */
/* Try to optimize to CURLYN. */
#ifdef DEBUGGING
#endif
/* Skip open. */
goto nogo;
#ifdef DEBUGGING
#endif
goto nogo;
/* Now we know that nxt2 is the only contents: */
#ifdef DEBUGGING
#endif
}
nogo:
/* Try optimization CURLYX => CURLYM. */
&& !deltanext ) {
/* XXXX How to optimize if data == 0? */
/* Optimize to a simpler form. */
/* Need to optimize away parenths. */
/* Set the parenth number. */
FAIL("Panic opt close");
#ifdef DEBUGGING
#endif
#if 0
else
}
}
#endif
/* Optimize again: */
NULL, 0);
}
else
}
&& (flags & SCF_WHILEM_VISITED_POS)
/* See the comment on a similar expression above.
However, this time it not a subexpression
we care about, but the expression itself. */
/* Find WHILEM (as in regexec.c) */
}
pars++;
if (flags & SCF_DO_SUBSTR) {
#if defined(SPARC64_GCC_WORKAROUND)
I32 b = 0;
STRLEN l = 0;
char *s = NULL;
b = pos_before;
else
b = data->last_start_min;
l = 0;
#else
STRLEN l;
#endif
if (UTF)
l -= old;
/* Get the added string: */
if (UTF)
if (deltanext == 0 && pos_before == b) {
/* What was added is a constant string */
if (mincount > 1) {
/* Add additional parts. */
{
}
}
} else {
/* start offset must point into the last copy */
}
}
/* It is counted once already... */
/* Cannot extend fixed substrings found inside
the group. */
? I32_MAX
}
}
}
}
continue;
default: /* REF and CLUMP only? */
if (flags & SCF_DO_SUBSTR) {
}
if (flags & SCF_DO_STCLASS_OR)
flags &= ~SCF_DO_STCLASS;
break;
}
}
int value = 0;
if (flags & SCF_DO_SUBSTR) {
}
min++;
if (flags & SCF_DO_STCLASS) {
/* Some of the logic below assumes that switching
locale on will only add false positives. */
case SANY:
default:
/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
break;
case REG_ANY:
goto do_default;
}
break;
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
(struct regnode_charclass_class*)scan);
else
(struct regnode_charclass_class*)scan);
break;
case ALNUM:
if (flags & SCF_DO_STCLASS_AND) {
}
}
else {
else {
}
}
break;
case ALNUML:
if (flags & SCF_DO_STCLASS_AND) {
}
else {
}
break;
case NALNUM:
if (flags & SCF_DO_STCLASS_AND) {
}
}
else {
else {
}
}
break;
case NALNUML:
if (flags & SCF_DO_STCLASS_AND) {
}
else {
}
break;
case SPACE:
if (flags & SCF_DO_STCLASS_AND) {
}
}
else {
else {
}
}
break;
case SPACEL:
if (flags & SCF_DO_STCLASS_AND) {
}
else {
}
break;
case NSPACE:
if (flags & SCF_DO_STCLASS_AND) {
}
}
else {
else {
}
}
break;
case NSPACEL:
if (flags & SCF_DO_STCLASS_AND) {
}
}
else {
}
break;
case DIGIT:
if (flags & SCF_DO_STCLASS_AND) {
}
else {
else {
}
}
break;
case NDIGIT:
if (flags & SCF_DO_STCLASS_AND) {
}
else {
else {
}
}
break;
}
if (flags & SCF_DO_STCLASS_OR)
flags &= ~SCF_DO_STCLASS;
}
}
: SF_BEFORE_SEOL);
}
/* Lookahead/lookbehind */
int f = 0;
if (data) {
}
else
f |= SCF_DO_STCLASS_AND;
}
if (flags & SCF_WHILEM_VISITED_POS)
f |= SCF_WHILEM_VISITED_POS;
if (deltanext) {
vFAIL("Variable length lookbehind not implemented");
}
}
}
pars++;
if (data)
if (f & SCF_DO_STCLASS_AND) {
if (was)
}
}
pars++;
}
is_par = 0; /* Disable optimization */
}
if (data)
}
if (data)
}
if (flags & SCF_DO_SUBSTR) {
}
flags &= ~SCF_DO_STCLASS;
}
/* Else: zero-length, ignore. */
}
is_par = 0;
}
}
if (flags & SCF_DO_STCLASS_OR)
return min;
}
{
char, struct reg_data);
}
else {
char, struct reg_data);
}
}
void
{
int i = 0;
char *s = PerlEnv_getenv("PERL_RE_COLORS");
if (s) {
while (++i < 6) {
s = strchr(s, '\t');
if (s) {
*s = '\0';
PL_colors[i] = ++s;
}
else
PL_colors[i] = s = "";
}
} else {
while (i < 6)
PL_colors[i++] = "";
}
PL_colorset = 1;
}
/*
- pregcomp - compile a regular expression into internal code
*
* We can't allocate space until we know how big the compiled form will be,
* but we can't compile it (and thus know how big it is) until we've got a
* place to put the code. So we cheat: we compile it twice, once with code
* generation turned off and size counting turned on, and once "for real".
* This also means that we don't allocate space until we are sure that the
* thing really will compile successfully, and we never have to move the
* code and thus invalidate pointers into it. (Note that it has to be in
* one piece because free() must be able to free it all.) [NB: not true in perl]
*
* Beware that the optimization-preparation code in here knows about some
* of the structure of the compiled regexp. [I'll say.]
*/
regexp *
{
register regexp *r;
FAIL("NULL regexp argument");
DEBUG_r({
if (!PL_colorset) reginitcolors();
});
RExC_precomp = exp;
RExC_sawback = 0;
RExC_seen = 0;
RExC_seen_evals = 0;
RExC_extralen = 0;
/* First pass: determine size, legality. */
RExC_parse = exp;
RExC_start = exp;
RExC_naughty = 0;
RExC_npar = 1;
RExC_size = 0L;
RExC_emit = &PL_regdummy;
RExC_whilem_seen = 0;
#if 0 /* REGC() is (currently) a NOP at the first pass.
* Clever compilers notice this and complain. --jhi */
#endif
return(NULL);
}
if (RExC_utf8 && !RExC_orig_utf8) {
/* It's possible to write a regexp in ascii that represents unicode
codepoints outside of the byte range, such as via \x{100}. If we
detect such a sequence we have to convert the entire pattern to utf8
and then recompile, as our sizing calculation will have been based
on 1 byte == 1 character, but we will need to use utf8 to encode
at least some part of the pattern, and therefore must convert the whole
thing.
XXX: somehow figure out how to make this less expensive...
-- dmq */
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
goto redo_first_pass;
}
/* Small enough for pointer-storage convention?
If extralen==0, this means that we will not need long jumps. */
else
RExC_extralen = 0;
if (RExC_whilem_seen > 15)
RExC_whilem_seen = 15;
/* Allocate space and initialize. */
char, regexp);
if (r == NULL)
FAIL("Regexp out of space");
#ifdef DEBUGGING
/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
#endif
r->refcnt = 1;
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
r->endp = 0; /* Useful during FAIL. */
if (r->offsets) {
}
RExC_rx = r;
/* Second pass: emit code. */
RExC_parse = exp;
RExC_naughty = 0;
RExC_npar = 1;
RExC_emit_start = r->program;
/* Store the count of eval-groups for security checks: */
r->data = 0;
return(NULL);
/* Dig out information for optimizations. */
if (UTF)
r->regstclass = NULL;
r->reganch |= ROPT_NAUGHTY;
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
/* XXXX Should not we check for something else? Usually it is OPEN1... */
int stclass_flag;
/* Skip introductions and multiplicators >= 1. */
/* An OR of *one* alternative - should not happen now. */
/* An {n,m} with n>0 */
sawplus = 1;
else
}
/* Starting-point info. */
; /* Empty, get anchored substr later. */
r->regstclass = first;
}
r->regstclass = first;
r->regstclass = first;
: ROPT_ANCH_BOL));
goto again;
}
r->reganch |= ROPT_ANCH_GPOS;
goto again;
}
{
/* turn .* into ^.* with an implied $*=1 */
else
goto again;
}
/* x+ must match at the 1st pos of run of x's */
/* Scan is after the zeroth branch, first is atomic matcher. */
/*
* If there's something expensive in the r.e., find the
* longest literal string that must appear and make it the
* regmust. Resolve ties in favor of later strings, since
* the regstart check works with the beginning of the r.e.
* and avoiding duplication strengthens checking. Not a
* strong reason, but sufficient in the absence of others.
* [Now we resolve ties in favor of the earlier string if
* it happens that c_offset_min has been invalidated, since the
* earlier string may buy us something the later one won't.]
*/
minlen = 0;
if (!r->regstclass) {
} else /* XXXX Check for BOUND? */
stclass_flag = 0;
r->reganch |= ROPT_CHECK_ALL;
|| (RExC_flags & PMf_MULTILINE)))) {
int t;
goto remove_float; /* As in (a)+. */
r->float_substr = Nullsv;
} else {
r->float_utf8 = Nullsv;
}
|| (RExC_flags & PMf_MULTILINE)));
}
else {
longest_float_length = 0;
}
|| (RExC_flags & PMf_MULTILINE)))) {
int t;
r->anchored_substr = Nullsv;
} else {
r->anchored_utf8 = Nullsv;
}
|| (RExC_flags & PMf_MULTILINE)));
}
else {
longest_fixed_length = 0;
}
if (r->regstclass
r->regstclass = NULL;
&& stclass_flag
{
struct regnode_charclass_class);
struct regnode_charclass_class);
"synthetic stclass `%s'.\n",
}
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
if (longest_fixed_length > longest_float_length) {
r->check_substr = r->anchored_substr;
r->check_utf8 = r->anchored_utf8;
if (r->reganch & ROPT_ANCH_SINGLE)
r->reganch |= ROPT_NOSCAN;
}
else {
r->check_substr = r->float_substr;
r->check_utf8 = r->float_utf8;
}
/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
This should be changed ASAP! */
r->reganch |= RE_USE_INTUIT;
r->reganch |= RE_INTUIT_TAIL;
}
}
else {
/* Several toplevels. Best we can is to set minlen. */
minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
{
struct regnode_charclass_class);
struct regnode_charclass_class);
"synthetic stclass `%s'.\n",
}
}
if (RExC_seen & REG_SEEN_GPOS)
r->reganch |= ROPT_GPOS_SEEN;
if (RExC_seen & REG_SEEN_LOOKBEHIND)
r->reganch |= ROPT_LOOKBEHIND_SEEN;
if (RExC_seen & REG_SEEN_EVAL)
r->reganch |= ROPT_EVAL_SEEN;
if (RExC_seen & REG_SEEN_CANY)
r->reganch |= ROPT_CANY_SEEN;
return(r);
}
/*
- reg - regular expression, i.e. main body or parenthesized thing
*
* Caller must absorb opening parenthesis.
*
* Combining parenthesis handling with the base level of regular expression
* is a trifle forced, but the need to tie the tails of the branches to what
* follows makes it hard to avoid.
*/
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
/* for (?g), (?gc), and (?o) warnings; warning
about (?c) will warn about (?g) -- japhy */
char c;
*flagp = 0; /* Tentatively. */
/* Make an OPEN node, if parenthesized. */
if (paren) {
int logical = 0;
RExC_parse++;
paren = *RExC_parse++;
switch (paren) {
case '<': /* (?<...) */
if (*RExC_parse == '!')
paren = ',';
goto unknown;
RExC_parse++;
case '=': /* (?=...) */
case '!': /* (?!...) */
case ':': /* (?:...) */
case '>': /* (?>...) */
break;
case '$': /* (?$...) */
case '@': /* (?@...) */
break;
case '#': /* (?#...) */
RExC_parse++;
if (*RExC_parse != ')')
FAIL("Sequence (?#... not terminated");
return NULL;
case 'p': /* (?p...) */
/* FALL THROUGH*/
case '?': /* (??...) */
logical = 1;
if (*RExC_parse != '{')
goto unknown;
paren = *RExC_parse++;
/* FALL THROUGH */
case '{': /* (?{...}) */
{
char c;
char *s = RExC_parse;
while (count && (c = *RExC_parse)) {
RExC_parse++;
else if (c == '{')
count++;
else if (c == '}')
count--;
RExC_parse++;
}
if (*RExC_parse != ')')
{
RExC_parse = s;
vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
}
if (!SIZE_ONLY) {
if (RExC_parse - 1 - s)
else
/* re_dup will OpREFCNT_inc */
}
else { /* First pass */
if (PL_reginterp_cnt < ++RExC_seen_evals
&& IN_PERL_RUNTIME)
/* No compiled RE interpolated, has runtime
components ===> unsafe. */
FAIL("Eval-group not allowed at runtime, use re 'eval'");
if (PL_tainting && PL_tainted)
FAIL("Eval-group in insecure regular expression");
}
if (logical) {
if (!SIZE_ONLY)
/* deal with the length of this later - MJD */
return ret;
}
return ret;
}
case '(': /* (?(?{...})...) and (?(?=...)...) */
{
if (!SIZE_ONLY)
goto insert_if;
}
}
/* (?(1)...) */
while (isDIGIT(*RExC_parse))
RExC_parse++;
vFAIL("Switch condition not recognized");
else
c = *nextchar(pRExC_state);
if (c == '|') {
c = *nextchar(pRExC_state);
}
else
if (c != ')')
vFAIL("Switch (?(condition)... contains too many branches");
if (lastbr) {
}
else
return ret;
}
else {
}
}
case 0:
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
default:
--RExC_parse;
parse_flags: /* (?i) */
/* (?g), (?gc) and (?o) are useless here
and must be globally applied -- japhy */
if (! (wastedflags & wflagbit) ) {
wastedflags |= wflagbit;
RExC_parse + 1,
"Useless (%s%c) - %suse /%c modifier",
);
}
}
}
else if (*RExC_parse == 'c') {
if (! (wastedflags & wasted_c) ) {
wastedflags |= wasted_gc;
RExC_parse + 1,
"Useless (%sc) - %suse /gc modifier",
);
}
}
}
++RExC_parse;
}
if (*RExC_parse == '-') {
wastedflags = 0; /* reset so (?g-c) warns twice */
++RExC_parse;
goto parse_flags;
}
RExC_flags |= posflags;
RExC_flags &= ~negflags;
if (*RExC_parse == ':') {
RExC_parse++;
paren = ':';
break;
}
if (*RExC_parse != ')') {
RExC_parse++;
}
return NULL;
}
}
else { /* (...) */
RExC_npar++;
open = 1;
}
}
else /* ! paren */
/* Pick up the branches, linking them together. */
/* branch_len = (paren != 0); */
return(NULL);
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
}
else { /* MJD */
}
have_branch = 1;
if (SIZE_ONLY)
}
else if (paren == ':') {
}
if (open) { /* Starts with OPEN. */
}
while (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
}
if (SIZE_ONLY)
return(NULL);
}
/* Make a closing node, and hook it on the end. */
switch (paren) {
case ':':
break;
case 1:
break;
case '<':
case ',':
case '=':
case '!':
/* FALL THROUGH */
case '>':
break;
case 0:
break;
}
if (have_branch) {
/* Hook the tails of the branches to the closing node. */
}
}
}
{
char *p;
if (paren == '>')
}
}
/* Check for proper termination. */
if (paren) {
vFAIL("Unmatched (");
}
}
if (*RExC_parse == ')') {
RExC_parse++;
vFAIL("Unmatched )");
}
else
/* NOTREACHED */
}
return(ret);
}
/*
- regbranch - one alternative of an | operator
*
* Implements the concatenation operator.
*/
{
if (first)
else {
if (!SIZE_ONLY && RExC_extralen)
else {
}
}
RExC_parse--;
continue;
return(NULL);
}
else {
RExC_naughty++;
}
c++;
}
}
if (c == 1) {
}
return(ret);
}
/*
- regpiece - something followed by possible [*+?]
*
* Note that the branching code sequences used for ? and the general cases
* of * and + are somewhat optimized: they use the same NOTHING node as
* both the endmarker for their branch list and the body of the last branch.
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
*/
{
register char op;
register char *next;
char *maxpos;
char *parse_start;
return(NULL);
}
op = *RExC_parse;
if (*next == ',') {
if (maxpos)
break;
else
}
next++;
}
if (!maxpos)
RExC_parse++;
if (*maxpos == ',')
maxpos++;
else
maxpos = RExC_parse;
RExC_parse = next;
}
else {
w->flags = 0;
if (!SIZE_ONLY && RExC_extralen) {
}
/* MJD hk */
if (!SIZE_ONLY && RExC_extralen)
if (SIZE_ONLY)
}
if (min > 0)
if (max > 0)
vFAIL("Can't do {n,m} with n > m");
if (!SIZE_ONLY) {
}
goto nest_check;
}
}
return(ret);
}
#if 0 /* Now runtime fix should be reliable. */
/* if this is reinstated, don't forget to put this back into perldiag:
=item Regexp *+ operand could be empty at {#} in regex m/%s/
(F) The part of the regexp subject to either the * or + quantifier
could match an empty string. The {#} shows in the regular
expression about where the problem was discovered.
*/
vFAIL("Regexp *+ operand could be empty");
#endif
RExC_naughty += 4;
}
else if (op == '*') {
min = 0;
goto do_curly;
}
RExC_naughty += 3;
}
else if (op == '+') {
min = 1;
goto do_curly;
}
else if (op == '?') {
goto do_curly;
}
"%.*s matches null string many times",
}
if (*RExC_parse == '?') {
}
if (ISMULT2(RExC_parse)) {
RExC_parse++;
vFAIL("Nested quantifiers");
}
return(ret);
}
/*
- regatom - the lowest level
*
* Optimization: gobbles an entire sequence of ordinary characters so that
* it can turn them into a single node, which is smaller to store and
* faster to run. Backslashed characters are exceptions, each becoming a
* separate node; the code is simpler that way and it's not worth fixing.
*
* [Yes, it is worth fixing, some scripts can run twice the speed.] */
{
switch (*RExC_parse) {
case '^':
if (RExC_flags & PMf_MULTILINE)
else if (RExC_flags & PMf_SINGLELINE)
else
break;
case '$':
if (*RExC_parse)
if (RExC_flags & PMf_MULTILINE)
else if (RExC_flags & PMf_SINGLELINE)
else
break;
case '.':
if (RExC_flags & PMf_SINGLELINE)
else
RExC_naughty++;
break;
case '[':
{
if (*RExC_parse != ']') {
vFAIL("Unmatched [");
}
break;
}
case '(':
if (RExC_parse == RExC_end) {
/* Make parent create an empty node if needed. */
return(NULL);
}
goto tryagain;
}
return(NULL);
}
break;
case '|':
case ')':
return NULL;
}
vFAIL("Internal urp");
/* Supposed to be caught earlier. */
break;
case '{':
if (!regcurly(RExC_parse)) {
RExC_parse++;
goto defchar;
}
/* FALL THROUGH */
case '?':
case '+':
case '*':
RExC_parse++;
vFAIL("Quantifier follows nothing");
break;
case '\\':
switch (*++RExC_parse) {
case 'A':
break;
case 'G':
break;
case 'Z':
RExC_seen_zerolen++; /* Do not optimize RE away */
break;
case 'z':
RExC_seen_zerolen++; /* Do not optimize RE away */
break;
case 'C':
break;
case 'X':
break;
case 'w':
break;
case 'W':
break;
case 'b':
break;
case 'B':
break;
case 's':
break;
case 'S':
break;
case 'd':
break;
case 'D':
break;
case 'p':
case 'P':
{
/* a lovely hack--pretend we saw [\pX] instead */
if (!RExC_end) {
RExC_parse += 2;
vFAIL2("Missing right brace on \\%c{}", c);
}
RExC_end++;
}
else {
if (RExC_end > oldregxend)
}
RExC_parse--;
RExC_parse--;
}
break;
case 'n':
case 'r':
case 't':
case 'f':
case 'e':
case 'a':
case 'x':
case 'c':
case '0':
goto defchar;
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
goto defchar;
else {
while (isDIGIT(*RExC_parse))
RExC_parse++;
vFAIL("Reference to nonexistent group");
RExC_sawback = 1;
num);
/* override incorrect value set in reganode MJD */
RExC_parse--;
}
}
break;
case '\0':
if (RExC_parse >= RExC_end)
FAIL("Trailing \\");
/* FALL THROUGH */
default:
/* Do not generate `unrecognized' warnings here, we fall
back into the quick-grab loop below */
parse_start--;
goto defchar;
}
break;
case '#':
if (RExC_flags & PMf_EXTENDED) {
if (RExC_parse < RExC_end)
goto tryagain;
}
/* FALL THROUGH */
default: {
register char *p;
char *oldp, *s;
RExC_parse++;
ender = 0;
len++)
{
oldp = p;
if (RExC_flags & PMf_EXTENDED)
switch (*p) {
case '^':
case '$':
case '.':
case '[':
case '(':
case ')':
case '|':
goto loopdone;
case '\\':
switch (*++p) {
case 'A':
case 'C':
case 'X':
case 'G':
case 'Z':
case 'z':
case 'w':
case 'W':
case 'b':
case 'B':
case 's':
case 'S':
case 'd':
case 'D':
case 'p':
case 'P':
--p;
goto loopdone;
case 'n':
ender = '\n';
p++;
break;
case 'r':
ender = '\r';
p++;
break;
case 't':
ender = '\t';
p++;
break;
case 'f':
ender = '\f';
p++;
break;
case 'e':
p++;
break;
case 'a':
p++;
break;
case 'x':
if (*++p == '{') {
char* e = strchr(p, '}');
if (!e) {
RExC_parse = p + 1;
vFAIL("Missing right brace on \\x{}");
}
else {
numlen = e - p - 1;
if (ender > 0xff)
RExC_utf8 = 1;
p = e + 1;
}
}
else {
numlen = 2;
p += numlen;
}
break;
case 'c':
p++;
break;
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8':case '9':
if (*p == '0' ||
numlen = 3;
p += numlen;
}
else {
--p;
goto loopdone;
}
break;
case '\0':
if (p >= RExC_end)
FAIL("Trailing \\");
/* FALL THROUGH */
default:
goto normal_default;
}
break;
default:
if (UTF8_IS_START(*p) && UTF) {
&numlen, 0);
p += numlen;
}
else
ender = *p++;
break;
}
if (RExC_flags & PMf_EXTENDED)
/* Prime the casefolded buffer. */
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
else if (UTF) {
if (FOLD) {
/* Emit all the Unicode characters. */
if (numlen > 0) {
s += unilen;
/* In EBCDIC the numlen
* and unilen can differ. */
break;
}
else
break; /* "Can't happen." */
}
}
else {
if (unilen > 0) {
s += unilen;
}
}
}
else {
len++;
}
break;
}
if (UTF) {
if (FOLD) {
/* Emit all the Unicode characters. */
if (numlen > 0) {
s += unilen;
/* In EBCDIC the numlen
* and unilen can differ. */
break;
}
else
break;
}
}
else {
if (unilen > 0) {
s += unilen;
}
}
len--;
}
else
}
RExC_parse = p - 1;
{
/* len is STRLEN which is unsigned, need to copy to signed */
if (iv < 0)
vFAIL("Internal disaster");
}
if (len > 0)
if (!SIZE_ONLY)
if (SIZE_ONLY)
else
}
break;
}
/* If the encoding pragma is in effect recode the text of
* any EXACT-kind nodes. */
if (RExC_utf8)
RExC_utf8 = 1;
if (!SIZE_ONLY) {
(int)newlen, s));
} else
}
}
return(ret);
}
STATIC char *
{
while (p < e) {
if (isSPACE(*p))
++p;
else if (*p == '#') {
do {
p++;
} while (p < e && *p != '\n');
}
else
break;
}
return p;
}
/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
Character classes ([:foo:]) can also be negated ([:^foo:]).
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
but trigger failures because they are currently unimplemented. */
{
char *posixcc = 0;
/* I smell either [: or [= or [. -- POSIX has been here, right? */
char c = UCHARAT(RExC_parse);
char* s = RExC_parse++;
RExC_parse++;
if (RExC_parse == RExC_end)
/* Grandfather lone [:, [=, [. */
RExC_parse = s;
else {
char* t = RExC_parse++; /* skip over the c */
RExC_parse++; /* skip over the ending ] */
posixcc = s + 1;
if (*s == ':') {
switch (*posixcc) {
case 'a':
break;
case 'b':
break;
case 'c':
break;
case 'd':
break;
case 'g':
break;
case 'l':
break;
case 'p':
break;
case 's':
break;
case 'u':
break;
case 'w': /* this is not POSIX, this is the Perl \w */
skip = 4;
}
break;
case 'x':
skip = 6;
}
break;
}
if (namedclass == OOB_NAMEDCLASS ||
{
Simple_vFAIL3("POSIX class [:%.*s:] unknown",
t - s - 1, s + 1);
}
} else if (!SIZE_ONLY) {
/* [[=foo=]] and [[.foo.]] are still future. */
/* adjust RExC_parse so the warning shows after
the class closes */
RExC_parse++;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
} else {
/* Maternal grandfather:
* "[:" ending in ":" but not in ":]" */
RExC_parse = s;
}
}
}
return namedclass;
}
STATIC void
{
char *s = RExC_parse;
char c = *s++;
while(*s && isALNUM(*s))
s++;
if (*s && c == *s && s[1] == ']') {
if (ckWARN(WARN_REGEXP))
vWARN3(s+2,
"POSIX syntax [%c %c] belongs inside character classes",
c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
if (POSIXCC_NOTYET(c)) {
/* adjust RExC_parse so the error shows after
the class closes */
;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
}
}
}
{
char *rangebegin = 0;
bool need_class = 0;
register char *e;
UV n;
#ifdef EBCDIC
#endif
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) = 0;
RExC_naughty++;
RExC_parse++;
if (!SIZE_ONLY)
}
if (SIZE_ONLY)
RExC_size += ANYOF_SKIP;
else {
RExC_emit += ANYOF_SKIP;
if (FOLD)
if (LOC)
}
/* allow 1st char to be ] (allowing it to be - is dealt with later) */
goto charclassloop;
if (!range)
if (UTF) {
&numlen, 0);
RExC_parse += numlen;
}
else
else if (value == '\\') {
if (UTF) {
&numlen, 0);
RExC_parse += numlen;
}
else
/* Some compilers cannot handle switching on 64-bit integer
* values, therefore value cannot be an UV. Yes, this will
* be a problem later if we want switch on Unicode.
* A similar issue a little bit later when switching on
* namedclass. --jhi */
case 'p':
case 'P':
if (RExC_parse >= RExC_end)
if (*RExC_parse == '{') {
if (!e)
vFAIL2("Missing right brace on \\%c{}", c);
RExC_parse++;
if (e == RExC_parse)
vFAIL2("Empty \\%c{}", c);
n = e - RExC_parse;
n--;
}
else {
e = RExC_parse;
n = 1;
}
if (!SIZE_ONLY) {
RExC_parse++;
n--;
RExC_parse++;
n--;
}
}
if (value == 'p')
"+utf8::%.*s\n", (int)n, RExC_parse);
else
"!utf8::%.*s\n", (int)n, RExC_parse);
}
RExC_parse = e + 1;
continue;
case 'x':
if (*RExC_parse == '{') {
if (!e)
vFAIL("Missing right brace on \\x{}");
numlen = e - RExC_parse;
RExC_parse = e + 1;
}
else {
numlen = 2;
RExC_parse += numlen;
}
break;
case 'c':
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
numlen = 3;
RExC_parse += numlen;
break;
}
default:
"Unrecognized escape \\%c in character class passed through",
(int)value);
break;
}
} /* end of \blah */
#ifdef EBCDIC
else
#endif
if (!SIZE_ONLY && !need_class)
need_class = 1;
/* a bad range like a-\d, a-[:digit:] ? */
if (range) {
if (!SIZE_ONLY) {
if (ckWARN(WARN_REGEXP))
"False [] range \"%*.*s\"",
if (prevvalue < 256) {
}
else {
}
}
range = 0; /* this was not a true range */
}
if (!SIZE_ONLY) {
if (namedclass > OOB_NAMEDCLASS)
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
case ANYOF_ALNUM:
if (LOC)
else {
}
break;
case ANYOF_NALNUM:
if (LOC)
else {
}
break;
case ANYOF_ALNUMC:
if (LOC)
else {
}
break;
case ANYOF_NALNUMC:
if (LOC)
else {
}
break;
case ANYOF_ALPHA:
if (LOC)
else {
}
break;
case ANYOF_NALPHA:
if (LOC)
else {
}
break;
case ANYOF_ASCII:
if (LOC)
else {
#ifndef EBCDIC
#else /* EBCDIC */
}
#endif /* EBCDIC */
}
break;
case ANYOF_NASCII:
if (LOC)
else {
#ifndef EBCDIC
#else /* EBCDIC */
}
#endif /* EBCDIC */
}
break;
case ANYOF_BLANK:
if (LOC)
else {
}
break;
case ANYOF_NBLANK:
if (LOC)
else {
}
break;
case ANYOF_CNTRL:
if (LOC)
else {
}
break;
case ANYOF_NCNTRL:
if (LOC)
else {
}
break;
case ANYOF_DIGIT:
if (LOC)
else {
/* consecutive digits assumed */
}
break;
case ANYOF_NDIGIT:
if (LOC)
else {
/* consecutive digits assumed */
}
break;
case ANYOF_GRAPH:
if (LOC)
else {
}
break;
case ANYOF_NGRAPH:
if (LOC)
else {
}
break;
case ANYOF_LOWER:
if (LOC)
else {
}
break;
case ANYOF_NLOWER:
if (LOC)
else {
}
break;
case ANYOF_PRINT:
if (LOC)
else {
}
break;
case ANYOF_NPRINT:
if (LOC)
else {
}
break;
case ANYOF_PSXSPC:
if (LOC)
else {
}
break;
case ANYOF_NPSXSPC:
if (LOC)
else {
}
break;
case ANYOF_PUNCT:
if (LOC)
else {
}
break;
case ANYOF_NPUNCT:
if (LOC)
else {
}
break;
case ANYOF_SPACE:
if (LOC)
else {
}
break;
case ANYOF_NSPACE:
if (LOC)
else {
}
break;
case ANYOF_UPPER:
if (LOC)
else {
}
break;
case ANYOF_NUPPER:
if (LOC)
else {
}
break;
case ANYOF_XDIGIT:
if (LOC)
else {
}
break;
case ANYOF_NXDIGIT:
if (LOC)
else {
}
break;
default:
vFAIL("Invalid [::] class");
break;
}
if (LOC)
continue;
}
} /* end of namedclass \blah */
if (range) {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
range = 0; /* not a valid range */
}
}
else {
RExC_parse++;
/* a bad range like \w-, [:word:]- ? */
if (namedclass > OOB_NAMEDCLASS) {
if (ckWARN(WARN_REGEXP))
"False [] range \"%*.*s\"",
if (!SIZE_ONLY)
} else
continue; /* but do it the next time */
}
}
/* now is the next time */
if (!SIZE_ONLY) {
IV i;
if (prevvalue < 256) {
#ifdef EBCDIC
/* In EBCDIC [\x89-\x91] should include
* the \x8e but [i-j] should not. */
if (literal_endpoint == 2 &&
{
if (isLOWER(i))
ANYOF_BITMAP_SET(ret, i);
} else {
if (isUPPER(i))
ANYOF_BITMAP_SET(ret, i);
}
}
else
#endif
ANYOF_BITMAP_SET(ret, i);
}
}
else if (prevnatvalue == natvalue) {
if (FOLD) {
/* If folding and foldable and a single
* character, insert also the folded version
* to the charclass. */
if (f != value) {
else {
/* Any multicharacter foldings
* require the following transform:
* [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
* where E folds into "pq" and F folds
* into "rst", all other characters
* fold to single characters. We save
* away these multicharacter foldings,
* to be later saved as part of the
* additional "s" data. */
if (!unicode_alternate)
unicode_alternate = newAV();
}
}
/* If folding and the value is one of the Greek
* sigmas insert a few more sigmas to make the
* folding rules of the sigmas to work right.
* Note that not all the possible combinations
* are handled here: some of them are handled
* by the standard folding rules, and some of
* them (literal or EXACTF cases) are handled
* during runtime in regexec.c:S_find_byclass(). */
if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
}
else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
}
}
}
#ifdef EBCDIC
literal_endpoint = 0;
#endif
}
range = 0; /* this range (if it was one) is done now */
}
if (need_class) {
if (SIZE_ONLY)
else
}
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
/* If the only flag is folding (plus possibly inversion). */
) {
}
}
}
/* optimize inverted simple patterns (e.g. [^a-z]) */
if (!SIZE_ONLY && optimize_invert &&
/* If the only flag is inversion. */
}
if (!SIZE_ONLY) {
/* The 0th element stores the character class description
* in its textual form: used later (regexec.c:Perl_regclass_swash())
* to initialize the appropriate swash (which gets stored in
* the 1st element), and also useful for dumping the regnode.
* The 2nd element stores the multicharacter foldings,
* used later (regexec.c:S_reginclass()). */
}
return ret;
}
STATIC char*
{
for (;;) {
while (*RExC_parse != ')') {
if (RExC_parse == RExC_end)
FAIL("Sequence (?#... not terminated");
RExC_parse++;
}
RExC_parse++;
continue;
}
if (RExC_flags & PMf_EXTENDED) {
if (isSPACE(*RExC_parse)) {
RExC_parse++;
continue;
}
else if (*RExC_parse == '#') {
while (RExC_parse < RExC_end)
if (*RExC_parse++ == '\n') break;
continue;
}
}
return retval;
}
}
/*
- reg_node - emit a node
*/
{
if (SIZE_ONLY) {
RExC_size += 1;
return(ret);
}
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
"reg_node", __LINE__,
? "Overwriting end of array!\n" : "OK",
RExC_offsets[0]));
}
return(ret);
}
/*
- reganode - emit a node with an argument
*/
{
if (SIZE_ONLY) {
RExC_size += 2;
return(ret);
}
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
"reganode",
"Overwriting end of array!\n" : "OK",
RExC_offsets[0]));
}
return(ret);
}
/*
- reguni - emit (if appropriate) a Unicode character
*/
STATIC void
{
}
/*
- reginsert - insert an operator in front of already-emitted operand
*
* Means relocating the operand.
*/
STATIC void
{
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
if (SIZE_ONLY) {
return;
}
if (RExC_offsets) { /* MJD 20010112 */
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
"reg_insert",
? "Overwriting end of array!\n" : "OK",
RExC_offsets[0]));
}
}
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
"reginsert",
? "Overwriting end of array!\n" : "OK",
RExC_offsets[0]));
}
}
/*
- regtail - set the next-pointer at the end of a node chain of p to val.
*/
STATIC void
{
if (SIZE_ONLY)
return;
/* Find last node. */
scan = p;
for (;;) {
break;
}
}
else {
}
}
/*
- regoptail - regtail on operand of first argument; nop if operandless
*/
STATIC void
{
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
return;
}
}
else
return;
}
/*
- regcurly - a little FSA that accepts {\d+,?\d*}
*/
{
if (*s++ != '{')
return FALSE;
if (!isDIGIT(*s))
return FALSE;
while (isDIGIT(*s))
s++;
if (*s == ',')
s++;
while (isDIGIT(*s))
s++;
if (*s != '}')
return FALSE;
return TRUE;
}
#ifdef DEBUGGING
{
/* While that wasn't END last time... */
l--;
/* Where, what. */
goto after_print;
else
: next);
}
}
}
}
}
/* arglen 1 + class block */
? ANYOF_CLASS_SKIP : ANYOF_SKIP);
}
/* Literal string, where present. */
}
else {
}
l++;
l--;
}
return node;
}
#endif /* DEBUGGING */
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
void
{
#ifdef DEBUGGING
/* Header fields of interest. */
if (r->anchored_substr)
PL_colors[0],
SvPVX(r->anchored_substr),
PL_colors[1],
(IV)r->anchored_offset);
else if (r->anchored_utf8)
PL_colors[0],
SvPVX(r->anchored_utf8),
PL_colors[1],
(IV)r->anchored_offset);
if (r->float_substr)
PL_colors[0],
SvPVX(r->float_substr),
PL_colors[1],
else if (r->float_utf8)
PL_colors[0],
SvPVX(r->float_utf8),
PL_colors[1],
if (r->check_substr || r->check_utf8)
r->check_substr == r->float_substr
&& r->check_utf8 == r->float_utf8
? "(checking floating" : "(checking anchored");
if (r->reganch & ROPT_NOSCAN)
if (r->reganch & ROPT_CHECK_ALL)
if (r->check_substr || r->check_utf8)
if (r->regstclass) {
}
if (r->reganch & ROPT_ANCH_BOL)
if (r->reganch & ROPT_ANCH_MBOL)
if (r->reganch & ROPT_ANCH_SBOL)
if (r->reganch & ROPT_ANCH_GPOS)
}
if (r->reganch & ROPT_GPOS_SEEN)
if (r->reganch & ROPT_IMPLICIT)
if (r->reganch & ROPT_EVAL_SEEN)
if (r->offsets) {
U32 i;
for (i = 1; i <= len; i++)
}
#endif /* DEBUGGING */
}
#ifdef DEBUGGING
STATIC void
{
else if (c == '-' || c == ']' || c == '\\' || c == '^')
else
}
#endif /* DEBUGGING */
/*
- regprop - printable representation of opcode
*/
void
{
#ifdef DEBUGGING
register int k;
/* It would be nice to FAIL() here, but this may be called from
regexec.c, and it would be hard to supply pRExC_state. */
if (k == EXACT) {
/* Using is_utf8_string() is a crude hack but it may
* be the best for now since we have no flag "this EXACTish
* node was UTF-8" --jhi */
char *s = do_utf8 ?
STRING(o);
strlen(s) :
STR_LEN(o);
PL_colors[0],
len, s,
PL_colors[1]);
}
else if (k == CURLY) {
}
else if (k == LOGICAL)
else if (k == ANYOF) {
* ANYOF_ #xdefines in regcomp.h */
"\\w",
"\\W",
"\\s",
"\\S",
"\\d",
"\\D",
"[:alnum:]",
"[:^alnum:]",
"[:alpha:]",
"[:^alpha:]",
"[:ascii:]",
"[:^ascii:]",
"[:ctrl:]",
"[:^ctrl:]",
"[:graph:]",
"[:^graph:]",
"[:lower:]",
"[:^lower:]",
"[:print:]",
"[:^print:]",
"[:punct:]",
"[:^punct:]",
"[:upper:]",
"[:^upper:]",
"[:xdigit:]",
"[:^xdigit:]",
"[:space:]",
"[:^space:]",
"[:blank:]",
"[:^blank:]"
};
if (flags & ANYOF_LOCALE)
if (flags & ANYOF_FOLD)
if (flags & ANYOF_INVERT)
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
if (rangestart == -1)
rangestart = i;
} else if (rangestart != -1) {
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++)
else {
}
rangestart = -1;
}
}
if (o->flags & ANYOF_CLASS)
for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
if (ANYOF_CLASS_TEST(o,i))
if (flags & ANYOF_UNICODE)
else if (flags & ANYOF_UNICODE_ALL)
{
if (lv) {
if (sw) {
for (i = 0; i <= 256; i++) { /* just the first 256 */
U8 *e = uvchr_to_utf8(s, i);
if (rangestart == -1)
rangestart = i;
} else if (rangestart != -1) {
U8 *p;
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
}
else {
for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
}
rangestart = -1;
}
}
}
{
char *origs = s;
while(*s && *s != '\n') s++;
if (*s == '\n') {
char *t = ++s;
while (*s) {
if (*s == '\n')
*s = ' ';
s++;
}
if (s[-1] == ' ')
s[-1] = 0;
}
}
}
}
}
#endif /* DEBUGGING */
}
SV *
{ /* Assume that RE_INTUIT is set */
if (!PL_colorset) reginitcolors();
"%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
PL_colors[4],
s,
PL_colors[1],
} );
}
void
{
#ifdef DEBUGGING
#endif
if (!r || (--r->refcnt > 0))
return;
DEBUG_r({
int len;
char *s;
if (!PL_colorset)
"%sFreeing REx:%s `%s%*.*s%s%s'\n",
PL_colors[1],
});
if (r->precomp)
if (r->offsets) /* 20010421 MJD */
if (RX_MATCH_COPIED(r))
if (r->substrs) {
if (r->anchored_substr)
if (r->anchored_utf8)
if (r->float_substr)
SvREFCNT_dec(r->float_substr);
if (r->float_utf8)
SvREFCNT_dec(r->float_utf8);
}
if (r->data) {
while (--n >= 0) {
/* If you add a ->what type here, update the comment in regcomp.h */
case 's':
break;
case 'f':
break;
case 'p':
break;
case 'o':
if (new_comppad == NULL)
/* Watch out for global destruction's random ordering. */
);
}
new_comppad = NULL;
break;
case 'n':
break;
default:
}
}
}
Safefree(r);
}
/*
- regnext - dig the "next" pointer out of a node
*
* [Note, when REGALIGN is defined there are two places in regmatch()
* that bypass this code for speed.]
*/
regnode *
{
if (p == &PL_regdummy)
return(NULL);
if (offset == 0)
return(NULL);
return(p+offset);
}
STATIC void
{
char *message;
if (l1 > 510)
l1 = 510;
#ifdef I_STDARG
/* ANSI variant takes additional second argument */
#else
#endif
if (l1 > 512)
l1 = 512;
}
/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
void
{
PL_reg_start_tmp = 0;
PL_reg_start_tmpl = 0;
PL_reg_oldsavedlen = 0;
PL_reg_maxiter = 0;
PL_reg_leftiter = 0;
PL_reg_poscache_size = 0;
{
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
U32 i;
}
}
}
#ifdef DEBUGGING
#endif
}
static void
{
ReREFCNT_dec((regexp *)r);
}