_sre.c revision 4fd606d1f5abe38e1f42c38de1d2e895166bd0f4
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* partial history:
* 1999-10-24 fl created (based on existing template matcher code)
* 2000-03-06 fl first alpha, sort of
* 2000-08-01 fl fixes for 1.6b1
* 2000-08-07 fl use PyOS_CheckStack() if available
* 2000-09-20 fl added expand method
* 2001-03-20 fl lots of fixes for 2.1b2
* 2001-04-15 fl export copyright as Python attribute, not global
* 2001-04-28 fl added __copy__ methods (work in progress)
* 2001-05-14 fl fixes for 1.5.2 compatibility
* 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
* 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
* 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
* 2001-10-24 fl added finditer primitive (for 2.2 only)
* 2003-04-18 mvl fully support 4-byte codes
* 2003-10-17 gn implemented non recursive scheme
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* This version of the SRE library can be redistributed under CNRI's
* Python 1.6 license. For any other use, please contact Secret Labs
* AB (info@pythonware.com).
*
* Portions of this engine have been developed in cooperation with
* CNRI. Hewlett-Packard provided funding for 1.6 integration and
* other compatibility work.
*/
/* Get rid of these macros to prevent collisions between EFI and Python in this file. */
#ifndef SRE_RECURSIVE
static char copyright[] =
" SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
#define PY_SSIZE_T_CLEAN
#include "Python.h"
#include "structmember.h" /* offsetof */
#include "sre.h"
#include <ctype.h>
/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif
#define SRE_PY_MODULE "re"
/* defining this one enables tracing */
#if PY_VERSION_HEX >= 0x01060000
/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE
#endif
#endif
/* -------------------------------------------------------------------- */
/* optional features */
/* enables fast searching */
#define USE_FAST_SEARCH
/* enables aggressive inlining (always on for Visual C) */
#if PY_VERSION_HEX < 0x01060000
#endif
/* -------------------------------------------------------------------- */
#if defined(_MSC_VER)
/* fastest possible local call under MSVC */
#elif defined(USE_INLINE)
#else
#endif
/* error codes */
#if defined(VERBOSE)
#else
#define TRACE(v)
#endif
/* -------------------------------------------------------------------- */
/* search engine state */
/* default character predicates (run sre_chars.py to regenerate tables) */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16
/* FIXME: this assumes ASCII. create tables in init_sre() instead */
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };
#define SRE_IS_DIGIT(ch)\
#define SRE_IS_SPACE(ch)\
#define SRE_IS_LINEBREAK(ch)\
#define SRE_IS_ALNUM(ch)\
#define SRE_IS_WORD(ch)\
{
}
/* locale-specific character predicates */
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
* warnings when c's type supports only numbers < N+1 */
static unsigned int sre_lower_locale(unsigned int ch)
{
}
/* unicode-specific character predicates */
#if defined(HAVE_UNICODE)
static unsigned int sre_lower_unicode(unsigned int ch)
{
}
#endif
LOCAL(int)
{
switch (category) {
case SRE_CATEGORY_DIGIT:
return SRE_IS_DIGIT(ch);
case SRE_CATEGORY_NOT_DIGIT:
return !SRE_IS_DIGIT(ch);
case SRE_CATEGORY_SPACE:
return SRE_IS_SPACE(ch);
case SRE_CATEGORY_NOT_SPACE:
return !SRE_IS_SPACE(ch);
case SRE_CATEGORY_WORD:
return SRE_IS_WORD(ch);
case SRE_CATEGORY_NOT_WORD:
return !SRE_IS_WORD(ch);
case SRE_CATEGORY_LINEBREAK:
return SRE_IS_LINEBREAK(ch);
return !SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_LOC_WORD:
return SRE_LOC_IS_WORD(ch);
return !SRE_LOC_IS_WORD(ch);
#if defined(HAVE_UNICODE)
case SRE_CATEGORY_UNI_DIGIT:
return SRE_UNI_IS_DIGIT(ch);
return !SRE_UNI_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_SPACE:
return SRE_UNI_IS_SPACE(ch);
return !SRE_UNI_IS_SPACE(ch);
case SRE_CATEGORY_UNI_WORD:
return SRE_UNI_IS_WORD(ch);
return !SRE_UNI_IS_WORD(ch);
return SRE_UNI_IS_LINEBREAK(ch);
return !SRE_UNI_IS_LINEBREAK(ch);
#else
case SRE_CATEGORY_UNI_DIGIT:
return SRE_IS_DIGIT(ch);
return !SRE_IS_DIGIT(ch);
case SRE_CATEGORY_UNI_SPACE:
return SRE_IS_SPACE(ch);
return !SRE_IS_SPACE(ch);
case SRE_CATEGORY_UNI_WORD:
return SRE_LOC_IS_WORD(ch);
return !SRE_LOC_IS_WORD(ch);
return SRE_IS_LINEBREAK(ch);
return !SRE_IS_LINEBREAK(ch);
#endif
}
return 0;
}
/* helpers */
static void
{
if (state->data_stack) {
}
}
static int
{
void* stack;
if (!stack) {
return SRE_ERROR_MEMORY;
}
}
return 0;
}
/* generate 8-bit version */
#define SRE_CHAR unsigned char
#define SRE_CHARSET sre_charset
#define SRE_MATCH_CONTEXT sre_match_context
#define SRE_SEARCH sre_search
#if defined(HAVE_UNICODE)
#define SRE_RECURSIVE
#include "_sre.c"
/* generate 16-bit unicode version */
#define SRE_CHAR Py_UNICODE
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch
#endif
#endif /* SRE_RECURSIVE */
/* -------------------------------------------------------------------- */
/* String matching engine */
/* the following section is compiled twice, with different character
settings */
LOCAL(int)
{
/* check if pointer is at given position */
switch (at) {
case SRE_AT_BEGINNING:
case SRE_AT_BEGINNING_STRING:
case SRE_AT_BEGINNING_LINE:
case SRE_AT_END:
SRE_IS_LINEBREAK((int) ptr[0])) ||
case SRE_AT_END_LINE:
SRE_IS_LINEBREAK((int) ptr[0]));
case SRE_AT_END_STRING:
case SRE_AT_BOUNDARY:
return 0;
SRE_IS_WORD((int) ptr[0]) : 0;
case SRE_AT_NON_BOUNDARY:
return 0;
SRE_IS_WORD((int) ptr[0]) : 0;
case SRE_AT_LOC_BOUNDARY:
return 0;
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
case SRE_AT_LOC_NON_BOUNDARY:
return 0;
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
#if defined(HAVE_UNICODE)
case SRE_AT_UNI_BOUNDARY:
return 0;
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
case SRE_AT_UNI_NON_BOUNDARY:
return 0;
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
#endif
}
return 0;
}
LOCAL(int)
{
/* check if character is a member of the given set */
int ok = 1;
for (;;) {
switch (*set++) {
case SRE_OP_FAILURE:
return !ok;
case SRE_OP_LITERAL:
/* <LITERAL> <code> */
return ok;
set++;
break;
case SRE_OP_CATEGORY:
/* <CATEGORY> <code> */
return ok;
set += 1;
break;
case SRE_OP_CHARSET:
if (sizeof(SRE_CODE) == 2) {
/* <CHARSET> <bitmap> (16 bits per code word) */
return ok;
set += 16;
}
else {
/* <CHARSET> <bitmap> (32 bits per code word) */
return ok;
set += 8;
}
break;
case SRE_OP_RANGE:
/* <RANGE> <lower> <upper> */
return ok;
set += 2;
break;
case SRE_OP_NEGATE:
break;
case SRE_OP_BIGCHARSET:
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
{
if (sizeof(SRE_CODE) == 2) {
set += 128;
return ok;
}
else {
/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
* warnings when c's type supports only numbers < N+1 */
if (!(ch & ~65535))
else
block = -1;
set += 64;
if (block >=0 &&
return ok;
}
break;
}
default:
/* internal error -- there's not much we can do about it
here, so let's just pretend it didn't match... */
return 0;
}
}
}
{
Py_ssize_t i;
/* adjust end */
switch (pattern[0]) {
case SRE_OP_IN:
/* repeated set */
ptr++;
break;
case SRE_OP_ANY:
/* repeated dot wildcard. */
ptr++;
break;
case SRE_OP_ANY_ALL:
/* repeated dot wildcard. skip to the end of the target
string, and backtrack from there */
break;
case SRE_OP_LITERAL:
/* repeated literal */
ptr++;
break;
case SRE_OP_LITERAL_IGNORE:
/* repeated literal */
ptr++;
break;
case SRE_OP_NOT_LITERAL:
/* repeated non-literal */
ptr++;
break;
/* repeated non-literal */
ptr++;
break;
default:
/* repeated single character pattern */
if (i < 0)
return i;
if (!i)
break;
}
}
}
#if 0 /* not used in this release */
LOCAL(int)
{
/* check if an SRE_OP_INFO block matches at the current position.
returns the number of SRE_CODE objects to skip if successful, 0
if no match */
Py_ssize_t i;
/* check minimal length */
return 0;
/* check known prefix */
/* <length> <skip> <prefix data> <overlap data> */
for (i = 0; i < pattern[5]; i++)
return 0;
}
return pattern[0];
}
#endif
/* The macros below should be used to protect recursive SRE_MATCH()
* calls that *failed* and do *not* return immediately (IOW, those
* that will backtrack). Explaining:
*
* - Recursive SRE_MATCH() returned true: that's usually a success
* (besides atypical cases like ASSERT_NOT), therefore there's no
* reason to restore lastmark;
*
* - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
* is returning to the caller: If the current SRE_MATCH() is the
* top function of the recursion, returning false will be a matching
* failure, and it doesn't matter where lastmark is pointing to.
* If it's *not* the top function, it will be a recursive SRE_MATCH()
* failure by itself, and the calling SRE_MATCH() will have to deal
* with the failure by the same rules explained here (it will restore
* lastmark by itself if necessary);
*
* - Recursive SRE_MATCH() returned false, and will continue the
* outside 'for' loop: must be protected when breaking, since the next
* OP could potentially depend on lastmark;
*
* - Recursive SRE_MATCH() returned false, and will be called again
* loop iteration, since the recursive SRE_MATCH() could do anything,
* and could potentially depend on lastmark.
*
* For more information, check the discussion at SF patch #712900.
*/
#define LASTMARK_SAVE() \
do { \
} while (0)
#define LASTMARK_RESTORE() \
do { \
} while (0)
#define RETURN_ERROR(i) do { return i; } while(0)
#define RETURN_ON_ERROR(i) \
do { if (i < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
#define SFY(x) #x
do { \
TRACE(("allocating %s in %d (%d)\n", \
if (j < 0) return j; \
if (ctx_pos != -1) \
} \
} while (0)
do { \
} while (0)
do { \
TRACE(("copy data in %p to %d (%d)\n", \
if (j < 0) return j; \
if (ctx_pos != -1) \
} \
} while (0)
do { \
TRACE(("copy data to %p from %d (%d)\n", \
if (discard) \
} while (0)
do { \
TRACE(("discard data from %d (%d)\n", \
} while(0)
#define DATA_PUSH(x) \
DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
#define DATA_POP_DISCARD(x) \
DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
do if (lastmark > 0) { \
i = lastmark; /* ctx->lastmark may change if reallocated */ \
} while (0)
do if (lastmark > 0) { \
} while (0)
#define MARK_POP_KEEP(lastmark) \
do if (lastmark > 0) { \
} while (0)
#define MARK_POP_DISCARD(lastmark) \
do if (lastmark > 0) { \
} while (0)
#define JUMP_NONE 0
#define JUMP_MAX_UNTIL_1 1
#define JUMP_MAX_UNTIL_2 2
#define JUMP_MAX_UNTIL_3 3
#define JUMP_MIN_UNTIL_1 4
#define JUMP_MIN_UNTIL_2 5
#define JUMP_MIN_UNTIL_3 6
#define JUMP_REPEAT 7
#define JUMP_REPEAT_ONE_1 8
#define JUMP_REPEAT_ONE_2 9
#define JUMP_MIN_REPEAT_ONE 10
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
goto entrance; \
jumplabel: \
while (0) /* gcc doesn't like labels at end of scopes */ \
typedef struct {
union {
} u;
/* check if string matches the given pattern. returns <0 for
error, 0 for failure, and 1 for success */
{
Py_ssize_t i, ret = 0;
unsigned int sigcount=0;
/* optimization info block */
/* <INFO> <1=skip> <2=flags> <3=min> ... */
TRACE(("reject (got %d chars, need %d)\n",
}
}
for (;;) {
++sigcount;
case SRE_OP_MARK:
/* set mark */
/* <MARK> <gid> */
if (i & 1)
/* state->lastmark is the highest valid index in the
state->mark array. If it is increased by more than 1,
the intervening marks must be set to NULL to signal
that these marks have not been encountered. */
while (j < i)
}
break;
case SRE_OP_LITERAL:
/* match literal string */
/* <LITERAL> <code> */
break;
case SRE_OP_NOT_LITERAL:
/* match anything that is not literal character */
/* <NOT_LITERAL> <code> */
break;
case SRE_OP_SUCCESS:
/* end of pattern */
case SRE_OP_AT:
/* match at given position */
/* <AT> <code> */
break;
case SRE_OP_CATEGORY:
/* match at given category */
/* <CATEGORY> <code> */
break;
case SRE_OP_ANY:
/* match anything (except a newline) */
/* <ANY> */
break;
case SRE_OP_ANY_ALL:
/* match anything */
/* <ANY_ALL> */
break;
case SRE_OP_IN:
/* match set member (or non_member) */
/* <IN> <skip> <set> */
break;
case SRE_OP_LITERAL_IGNORE:
TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
break;
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
break;
case SRE_OP_IN_IGNORE:
break;
case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */
/* <JUMP> <offset> */
break;
case SRE_OP_BRANCH:
/* alternation */
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
continue;
continue;
if (ret) {
}
}
case SRE_OP_REPEAT_ONE:
/* match repeated sequence (maximizing regexp) */
/* this operator only works if the repeated item is
exactly one character wide, and we're not already
collecting backtracking points. for other cases,
use the MAX_REPEAT operator */
/* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
RETURN_FAILURE; /* cannot match */
/* when we arrive here, count contains the number of
matches, and ctx->ptr points to the tail of the target
string. check if the rest of the pattern matches,
and backtrack if not. */
/* tail is empty. we're finished */
}
/* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */
for (;;) {
}
break;
if (ret) {
}
}
} else {
/* general case */
if (ret) {
}
}
}
case SRE_OP_MIN_REPEAT_ONE:
/* match repeated sequence (minimizing regexp) */
/* this operator only works if the repeated item is
exactly one character wide, and we're not already
collecting backtracking points. for other cases,
use the MIN_REPEAT operator */
/* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
RETURN_FAILURE; /* cannot match */
else {
/* count using pattern min as the maximum */
/* didn't match minimum number of times */
/* advance past minimum matches of repeat */
}
/* tail is empty. we're finished */
} else {
/* general case */
if (ret) {
}
if (ret == 0)
break;
}
}
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
/* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
/* install new repeat context */
}
if (ret) {
}
case SRE_OP_MAX_UNTIL:
/* maximizing repeat */
/* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
/* FIXME: we probably need to deal with zero-width
matches in here... */
/* not enough matches */
if (ret) {
}
}
/* we may have enough matches, but if we can
match another item, do so */
/* zero-width match protection */
if (ret) {
}
}
/* cannot match more repeated items here. make sure the
tail matches */
case SRE_OP_MIN_UNTIL:
/* minimizing repeat */
/* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
/* not enough matches */
if (ret) {
}
}
/* see if the tail matches */
if (ret) {
}
if (ret) {
}
case SRE_OP_GROUPREF:
/* match backreference */
{
Py_ssize_t groupref = i+i;
} else {
if (!p || !e || e < p)
while (p < e) {
}
}
}
break;
case SRE_OP_GROUPREF_IGNORE:
/* match backreference */
{
Py_ssize_t groupref = i+i;
} else {
if (!p || !e || e < p)
while (p < e) {
}
}
}
break;
case SRE_OP_GROUPREF_EXISTS:
/* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
{
Py_ssize_t groupref = i+i;
break;
} else {
if (!p || !e || e < p) {
break;
}
}
}
break;
case SRE_OP_ASSERT:
/* assert subpattern */
/* <ASSERT> <skip> <back> <pattern> */
break;
case SRE_OP_ASSERT_NOT:
/* assert not subpattern */
/* <ASSERT_NOT> <skip> <back> <pattern> */
if (ret) {
}
}
break;
case SRE_OP_FAILURE:
/* immediate failure */
default:
}
}
exit:
if (ctx_pos == -1)
return ret;
switch (jump) {
case JUMP_MAX_UNTIL_2:
goto jump_max_until_2;
case JUMP_MAX_UNTIL_3:
goto jump_max_until_3;
case JUMP_MIN_UNTIL_2:
goto jump_min_until_2;
case JUMP_MIN_UNTIL_3:
goto jump_min_until_3;
case JUMP_BRANCH:
goto jump_branch;
case JUMP_MAX_UNTIL_1:
goto jump_max_until_1;
case JUMP_MIN_UNTIL_1:
goto jump_min_until_1;
case JUMP_REPEAT:
goto jump_repeat;
case JUMP_REPEAT_ONE_1:
goto jump_repeat_one_1;
case JUMP_REPEAT_ONE_2:
goto jump_repeat_one_2;
case JUMP_MIN_REPEAT_ONE:
goto jump_min_repeat_one;
case JUMP_ASSERT:
goto jump_assert;
case JUMP_ASSERT_NOT:
goto jump_assert_not;
case JUMP_NONE:
break;
}
return ret; /* should never get here */
}
{
Py_ssize_t status = 0;
Py_ssize_t prefix_len = 0;
Py_ssize_t prefix_skip = 0;
int flags = 0;
if (pattern[0] == SRE_OP_INFO) {
/* optimization info block */
/* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
/* adjust end point (but make sure we leave at least one
character in there, so literal search will work) */
}
if (flags & SRE_INFO_PREFIX) {
/* pattern starts with a known prefix */
/* <length> <skip> <prefix data> <overlap data> */
} else if (flags & SRE_INFO_CHARSET)
/* pattern starts with a character from a known set */
/* <charset> */
}
#if defined(USE_FAST_SEARCH)
if (prefix_len > 1) {
/* pattern starts with a known prefix. use the overlap
table to skip forward as fast as we possibly can */
Py_ssize_t i = 0;
for (;;) {
if (!i)
break;
else
i = overlap[i];
} else {
if (++i == prefix_len) {
/* found a potential match */
if (flags & SRE_INFO_LITERAL)
return 1; /* we got all of it */
if (status != 0)
return status;
/* close but no cigar -- try again */
i = overlap[i];
}
break;
}
}
ptr++;
}
return 0;
}
#endif
if (pattern[0] == SRE_OP_LITERAL) {
/* pattern starts with a literal character. this is used
for short prefixes, and if fast search is disabled */
for (;;) {
ptr++;
return 0;
if (flags & SRE_INFO_LITERAL)
return 1; /* we got all of it */
if (status != 0)
break;
}
} else if (charset) {
/* pattern starts with a character from a known set */
for (;;) {
ptr++;
return 0;
if (status != 0)
break;
ptr++;
}
} else
/* general case */
if (status != 0)
break;
}
return status;
}
LOCAL(int)
{
/* check if given string is a literal template (i.e. no escapes) */
while (len-- > 0)
if (*ptr++ == '\\')
return 0;
return 1;
}
#if !defined(SRE_RECURSIVE)
/* -------------------------------------------------------------------- */
/* factories and destructors */
/* see sre.h for object declarations */
static PyObject *
{
}
static PyObject *
{
return NULL;
if (flags & SRE_FLAG_LOCALE)
if (flags & SRE_FLAG_UNICODE)
#if defined(HAVE_UNICODE)
#else
#endif
}
LOCAL(void)
{
/* FIXME: dynamic! */
/*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
}
static void*
{
/* given a python object, return a data pointer, a length (in
characters), and a character size. return NULL if the object
is not a string (or not compatible) */
int charsize;
void* ptr;
#if defined(HAVE_UNICODE)
if (PyUnicode_Check(string)) {
/* unicode strings doesn't always support the buffer interface */
/* bytes = PyUnicode_GET_DATA_SIZE(string); */
charsize = sizeof(Py_UNICODE);
} else {
#endif
/* get pointer to string buffer */
return NULL;
}
/* determine buffer size */
if (bytes < 0) {
return NULL;
}
/* determine character size */
#if PY_VERSION_HEX >= 0x01060000
#else
#endif
charsize = 1;
#if defined(HAVE_UNICODE)
charsize = sizeof(Py_UNICODE);
#endif
else {
return NULL;
}
#if defined(HAVE_UNICODE)
}
#endif
*p_charsize = charsize;
return ptr;
}
{
/* prepare state object */
int charsize;
void* ptr;
if (!ptr)
return NULL;
/* adjust boundaries */
if (start < 0)
start = 0;
if (end < 0)
end = 0;
#if defined(HAVE_UNICODE)
#else
#endif
else
return string;
}
LOCAL(void)
{
}
/* calculate offset from start of string */
{
Py_ssize_t i, j;
if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
if (empty)
/* want empty string */
i = j = 0;
else {
return Py_None;
}
} else {
}
return PySequence_GetSlice(string, i, j);
}
static void
pattern_error(int status)
{
switch (status) {
"maximum recursion limit exceeded"
);
break;
case SRE_ERROR_MEMORY:
break;
case SRE_ERROR_INTERRUPTED:
/* An exception has already been raised, so let it fly */
break;
default:
"internal error in regular expression engine"
);
}
}
static void
{
}
static PyObject*
{
int status;
Py_ssize_t start = 0;
return NULL;
if (!string)
return NULL;
} else {
#if defined(HAVE_UNICODE)
#endif
}
if (PyErr_Occurred())
return NULL;
state_fini(&state);
}
static PyObject*
{
int status;
Py_ssize_t start = 0;
return NULL;
if (!string)
return NULL;
} else {
#if defined(HAVE_UNICODE)
#endif
}
state_fini(&state);
if (PyErr_Occurred())
return NULL;
}
static PyObject*
{
if (!args)
return NULL;
if (!name)
return NULL;
if (!mod)
return NULL;
if (!func)
return NULL;
return result;
}
#ifdef USE_BUILTIN_COPY
static int
{
"copy", "deepcopy",
);
if (!copy)
return 0;
return 1; /* success */
}
#endif
static PyObject*
{
/* join list elements */
#if PY_VERSION_HEX >= 0x01060000
#endif
if (!joiner)
return NULL;
if (PyList_GET_SIZE(list) == 0) {
return joiner;
}
#if PY_VERSION_HEX >= 0x01060000
if (!function) {
return NULL;
}
if (!args) {
return NULL;
}
#else
"string", "join",
);
#endif
return result;
}
static PyObject*
{
int status;
Py_ssize_t i, b, e;
Py_ssize_t start = 0;
return NULL;
if (!string)
return NULL;
list = PyList_New(0);
if (!list) {
state_fini(&state);
return NULL;
}
state_reset(&state);
} else {
#if defined(HAVE_UNICODE)
#endif
}
if (PyErr_Occurred())
goto error;
if (status <= 0) {
if (status == 0)
break;
goto error;
}
/* don't bother to build a match object */
case 0:
if (!item)
goto error;
break;
case 1:
if (!item)
goto error;
break;
default:
if (!item)
goto error;
if (!o) {
goto error;
}
PyTuple_SET_ITEM(item, i, o);
}
break;
}
if (status < 0)
goto error;
else
}
state_fini(&state);
return list;
state_fini(&state);
return NULL;
}
#if PY_VERSION_HEX >= 0x02020000
static PyObject*
{
if (!scanner)
return NULL;
if (!search)
return NULL;
return iterator;
}
#endif
static PyObject*
{
int status;
Py_ssize_t n;
Py_ssize_t i;
void* last;
Py_ssize_t maxsplit = 0;
return NULL;
if (!string)
return NULL;
list = PyList_New(0);
if (!list) {
state_fini(&state);
return NULL;
}
n = 0;
state_reset(&state);
} else {
#if defined(HAVE_UNICODE)
#endif
}
if (PyErr_Occurred())
goto error;
if (status <= 0) {
if (status == 0)
break;
goto error;
}
break;
/* skip one character */
continue;
}
/* get segment before this match */
);
if (!item)
goto error;
if (status < 0)
goto error;
/* add groups (if any) */
if (!item)
goto error;
if (status < 0)
goto error;
}
n = n + 1;
}
/* get segment following last match (even if empty) */
);
if (!item)
goto error;
if (status < 0)
goto error;
state_fini(&state);
return list;
state_fini(&state);
return NULL;
}
static PyObject*
{
void* ptr;
int status;
Py_ssize_t n;
Py_ssize_t i, b, e;
int bint;
int filter_is_callable;
if (PyCallable_Check(ptemplate)) {
filter_is_callable = 1;
} else {
/* if not callable, check if it's a literal string */
int literal;
b = bint;
if (ptr) {
if (b == 1) {
} else {
#if defined(HAVE_UNICODE)
#endif
}
} else {
PyErr_Clear();
literal = 0;
}
if (literal) {
filter_is_callable = 0;
} else {
/* not a literal; hand it over to the template compiler */
SRE_PY_MODULE, "_subx",
);
if (!filter)
return NULL;
}
}
if (!string) {
return NULL;
}
list = PyList_New(0);
if (!list) {
state_fini(&state);
return NULL;
}
n = i = 0;
state_reset(&state);
} else {
#if defined(HAVE_UNICODE)
#endif
}
if (PyErr_Occurred())
goto error;
if (status <= 0) {
if (status == 0)
break;
goto error;
}
if (i < b) {
/* get segment before this match */
if (!item)
goto error;
if (status < 0)
goto error;
} else if (i == b && i == e && n > 0)
/* ignore empty match on latest position */
goto next;
if (filter_is_callable) {
/* pass match object through filter */
if (!match)
goto error;
if (!args) {
goto error;
}
if (!item)
goto error;
} else {
/* filter is literal string */
}
/* add to list */
if (status < 0)
goto error;
}
i = e;
n = n + 1;
next:
/* move on */
else
}
/* get segment following last match */
if (!item)
goto error;
if (status < 0)
goto error;
}
state_fini(&state);
/* convert list to single string (also removes list) */
if (!item)
return NULL;
if (subn)
return item;
state_fini(&state);
return NULL;
}
static PyObject*
{
Py_ssize_t count = 0;
return NULL;
}
static PyObject*
{
Py_ssize_t count = 0;
return NULL;
}
static PyObject*
{
#ifdef USE_BUILTIN_COPY
int offset;
if (!copy)
return NULL;
#else
return NULL;
#endif
}
static PyObject*
{
#ifdef USE_BUILTIN_COPY
if (!copy)
return NULL;
return NULL;
}
#else
return NULL;
#endif
}
"match(string[, pos[, endpos]]) --> match object or None.\n\
Matches zero or more characters at the beginning of the string");
"search(string[, pos[, endpos]]) --> match object or None.\n\
Scan through string looking for a match, and return a corresponding\n\
MatchObject instance. Return None if no position in the string matches.");
"split(string[, maxsplit = 0]) --> list.\n\
Split string by the occurrences of pattern.");
"findall(string[, pos[, endpos]]) --> list.\n\
Return a list of all non-overlapping matches of pattern in string.");
"finditer(string[, pos[, endpos]]) --> iterator.\n\
Return an iterator over all non-overlapping matches for the \n\
RE pattern in string. For each match, the iterator returns a\n\
match object.");
"sub(repl, string[, count = 0]) --> newstring\n\
Return the string obtained by replacing the leftmost non-overlapping\n\
occurrences of pattern in string by the replacement repl.");
"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
Return the tuple (new_string, number_of_subs_made) found by replacing\n\
the leftmost non-overlapping occurrences of pattern with the\n\
replacement repl.");
static PyMethodDef pattern_methods[] = {
#if PY_VERSION_HEX >= 0x02020000
#endif
};
static PyObject*
{
if (res)
return res;
PyErr_Clear();
/* attributes */
}
return self->groupindex;
}
return NULL;
}
sizeof(PatternObject), sizeof(SRE_CODE),
0, /*tp_print*/
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_HAVE_WEAKREFS, /* tp_flags */
pattern_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
};
static PyObject *
{
/* "compile" pattern descriptor to pattern object */
Py_ssize_t i, n;
int flags = 0;
Py_ssize_t groups = 0;
&groupindex, &indexgroup))
return NULL;
n = PyList_GET_SIZE(code);
/* coverity[ampersand_in_size] */
if (!self)
return NULL;
for (i = 0; i < n; i++) {
: PyLong_AsUnsignedLong(o);
"regular expression code size limit exceeded");
break;
}
}
if (PyErr_Occurred()) {
return NULL;
}
return NULL;
}
}
/* -------------------------------------------------------------------- */
/* Code validation */
/* To learn more about this code, have a look at the _compile() function in
Lib/sre_compile.py. The validation functions below checks the code array
for conformance with the code patterns generated there.
The nice thing about the generated code is that it is position-independent:
all jumps are relative jumps forward. Also, jumps don't cross each other:
the target of a later jump is always earlier than the target of an earlier
jump. IOW, this is okay:
J---------J-------T--------T
\ \_____/ /
\______________________/
but this is not:
J---------J-------T--------T
\_________\_____/ /
\____________/
It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
bytes wide (the latter if Python is compiled for "wide" unicode support).
*/
/* Defining this one enables tracing of the validator */
/* Trace macro for the validator */
#if defined(VVERBOSE)
#else
#define VTRACE(v)
#endif
/* Report failure */
/* Extract opcode, argument, or skip count from code array */
#define GET_OP \
do { \
} while (0)
#define GET_ARG \
do { \
} while (0)
#define GET_SKIP_ADJ(adj) \
do { \
VTRACE(("%lu (skip to %p)\n", \
FAIL; \
code++; \
} while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
static int
{
/* Some variables are manipulated by the macros above */
int i;
switch (op) {
case SRE_OP_NEGATE:
break;
case SRE_OP_LITERAL:
break;
case SRE_OP_RANGE:
break;
case SRE_OP_CHARSET:
FAIL;
break;
case SRE_OP_BIGCHARSET:
GET_ARG; /* Number of blocks */
FAIL;
/* Make sure that each byte points to a valid block */
for (i = 0; i < 256; i++) {
FAIL;
}
FAIL;
break;
case SRE_OP_CATEGORY:
switch (arg) {
case SRE_CATEGORY_DIGIT:
case SRE_CATEGORY_NOT_DIGIT:
case SRE_CATEGORY_SPACE:
case SRE_CATEGORY_NOT_SPACE:
case SRE_CATEGORY_WORD:
case SRE_CATEGORY_NOT_WORD:
case SRE_CATEGORY_LINEBREAK:
case SRE_CATEGORY_LOC_WORD:
case SRE_CATEGORY_UNI_DIGIT:
case SRE_CATEGORY_UNI_SPACE:
case SRE_CATEGORY_UNI_WORD:
break;
default:
FAIL;
}
break;
default:
FAIL;
}
}
return 1;
}
static int
{
/* Some variables are manipulated by the macros above */
FAIL;
switch (op) {
case SRE_OP_MARK:
/* We don't check whether marks are properly nested; the
sre_match() code is robust even if they don't, and the worst
you can get is nonsensical match results. */
FAIL;
}
break;
case SRE_OP_LITERAL:
case SRE_OP_NOT_LITERAL:
case SRE_OP_LITERAL_IGNORE:
/* The arg is just a character, nothing to check */
break;
case SRE_OP_SUCCESS:
case SRE_OP_FAILURE:
/* Nothing to check; these normally end the matching process */
break;
case SRE_OP_AT:
switch (arg) {
case SRE_AT_BEGINNING:
case SRE_AT_BEGINNING_STRING:
case SRE_AT_BEGINNING_LINE:
case SRE_AT_END:
case SRE_AT_END_LINE:
case SRE_AT_END_STRING:
case SRE_AT_BOUNDARY:
case SRE_AT_NON_BOUNDARY:
case SRE_AT_LOC_BOUNDARY:
case SRE_AT_LOC_NON_BOUNDARY:
case SRE_AT_UNI_BOUNDARY:
case SRE_AT_UNI_NON_BOUNDARY:
break;
default:
FAIL;
}
break;
case SRE_OP_ANY:
case SRE_OP_ANY_ALL:
/* These have no operands */
break;
case SRE_OP_IN:
case SRE_OP_IN_IGNORE:
/* Stop 1 before the end; we check the FAILURE below */
FAIL;
FAIL;
break;
case SRE_OP_INFO:
{
/* A minimal info field is
<INFO> <1=skip> <2=flags> <3=min> <4=max>;
If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
more follows. */
GET_ARG; /* min */
GET_ARG; /* max */
/* Check that only valid flags are present */
if ((flags & ~(SRE_INFO_PREFIX |
SRE_INFO_CHARSET)) != 0)
FAIL;
/* PREFIX and CHARSET are mutually exclusive */
if ((flags & SRE_INFO_PREFIX) &&
(flags & SRE_INFO_CHARSET))
FAIL;
/* LITERAL implies PREFIX */
if ((flags & SRE_INFO_LITERAL) &&
!(flags & SRE_INFO_PREFIX))
FAIL;
/* Validate the prefix */
if (flags & SRE_INFO_PREFIX) {
GET_ARG; /* prefix skip */
/* Here comes the prefix string */
FAIL;
code += prefix_len;
/* And here comes the overlap table */
FAIL;
/* Each overlap value should be < prefix_len */
for (i = 0; i < prefix_len; i++) {
if (code[i] >= prefix_len)
FAIL;
}
code += prefix_len;
}
/* Validate the charset */
if (flags & SRE_INFO_CHARSET) {
FAIL;
FAIL;
}
FAIL;
}
}
break;
case SRE_OP_BRANCH:
{
for (;;) {
if (skip == 0)
break;
/* Stop 2 before the end; we check the JUMP below */
FAIL;
/* Check that it ends with a JUMP, and that each JUMP
has the same target */
if (op != SRE_OP_JUMP)
FAIL;
FAIL;
}
}
break;
case SRE_OP_REPEAT_ONE:
case SRE_OP_MIN_REPEAT_ONE:
{
FAIL;
#ifdef Py_UNICODE_WIDE
if (max > 65535)
FAIL;
#endif
FAIL;
if (op != SRE_OP_SUCCESS)
FAIL;
}
break;
case SRE_OP_REPEAT:
{
FAIL;
#ifdef Py_UNICODE_WIDE
if (max > 65535)
FAIL;
#endif
FAIL;
FAIL;
}
break;
case SRE_OP_GROUPREF:
case SRE_OP_GROUPREF_IGNORE:
FAIL;
break;
case SRE_OP_GROUPREF_EXISTS:
/* The regex syntax for this is: '(?(group)then|else)', where
'group' is either an integer group number or a group name,
'then' and 'else' are sub-regexes, and 'else' is optional. */
FAIL;
GET_SKIP_ADJ(1);
code--; /* The skip is relative to the first arg! */
/* There are two possibilities here: if there is both a 'then'
part and an 'else' part, the generated code looks like:
GROUPREF_EXISTS
<group>
<skipyes>
...then part...
JUMP
<skipno>
(<skipyes> jumps here)
...else part...
(<skipno> jumps here)
If there is only a 'then' part, it looks like:
GROUPREF_EXISTS
<group>
<skip>
...then part...
(<skip> jumps here)
There is no direct way to decide which it is, and we don't want
to allow arbitrary jumps anywhere in the code; so we just look
for a JUMP opcode preceding our skip target.
*/
{
VTRACE(("both then and else parts present\n"));
FAIL;
FAIL;
}
else {
VTRACE(("only a then part present\n"));
FAIL;
}
break;
case SRE_OP_ASSERT:
case SRE_OP_ASSERT_NOT:
GET_ARG; /* 0 for lookahead, width for lookbehind */
code--; /* Back up over arg to simplify math below */
if (arg & 0x80000000)
FAIL; /* Width too large */
/* Stop 1 before the end; we check the SUCCESS below */
FAIL;
if (op != SRE_OP_SUCCESS)
FAIL;
break;
default:
FAIL;
}
}
VTRACE(("okay\n"));
return 1;
}
static int
{
FAIL;
if (groups == 0) /* fix for simplejson */
}
static int
{
{
return 0;
}
else
VTRACE(("Success!\n"));
return 1;
}
/* -------------------------------------------------------------------- */
/* match methods */
static void
{
}
static PyObject*
{
/* raise IndexError if we were given a bad group number */
"no such group"
);
return NULL;
}
index *= 2;
/* return default value if the string or group is undefined */
return def;
}
return PySequence_GetSlice(
);
}
static Py_ssize_t
{
Py_ssize_t i;
if (PyInt_Check(index))
return PyInt_AsSsize_t(index);
i = -1;
if (index) {
i = PyInt_AsSsize_t(index);
} else
PyErr_Clear();
}
return i;
}
static PyObject*
{
}
static PyObject*
{
/* delegate to Python code */
return call(
SRE_PY_MODULE, "_expand",
);
}
static PyObject*
{
Py_ssize_t i, size;
switch (size) {
case 0:
break;
case 1:
break;
default:
/* fetch multiple items */
if (!result)
return NULL;
for (i = 0; i < size; i++) {
);
if (!item) {
return NULL;
}
}
break;
}
return result;
}
static PyObject*
{
return NULL;
if (!result)
return NULL;
if (!item) {
return NULL;
}
}
return result;
}
static PyObject*
{
return NULL;
result = PyDict_New();
return result;
if (!keys)
goto failed;
int status;
if (!key)
goto failed;
if (!value) {
goto failed;
}
if (status < 0)
goto failed;
}
return result;
return NULL;
}
static PyObject*
{
return NULL;
"no such group"
);
return NULL;
}
/* mark is -1 if group is undefined */
}
static PyObject*
{
return NULL;
"no such group"
);
return NULL;
}
/* mark is -1 if group is undefined */
}
{
if (!pair)
return NULL;
if (!item)
goto error;
if (!item)
goto error;
return pair;
return NULL;
}
static PyObject*
{
return NULL;
"no such group"
);
return NULL;
}
/* marks are -1 if group is undefined */
}
static PyObject*
{
if (!regs)
return NULL;
if (!item) {
return NULL;
}
}
return regs;
}
static PyObject*
{
#ifdef USE_BUILTIN_COPY
if (!copy)
return NULL;
/* this value a constant, but any compiler should be able to
figure that out all by itself */
#else
return NULL;
#endif
}
static PyObject*
{
#ifdef USE_BUILTIN_COPY
if (!copy)
return NULL;
return NULL;
}
#else
return NULL;
#endif
}
static PyMethodDef match_methods[] = {
};
static PyObject*
{
if (res)
return res;
PyErr_Clear();
return Py_None;
}
);
if (result)
return result;
PyErr_Clear();
}
return Py_None;
}
} else {
return Py_None;
}
}
} else
return match_regs(self);
}
}
return NULL;
}
/* FIXME: implement setattr("string", None) as a special case (to
detach the associated string, if any */
sizeof(MatchObject), sizeof(Py_ssize_t),
0, /*tp_print*/
};
static PyObject*
{
/* create match object (from state object) */
Py_ssize_t i, j;
char* base;
int n;
if (status > 0) {
/* create match object (with room for extra group marks) */
/* coverity[ampersand_in_size] */
if (!match)
return NULL;
/* fill in group slices */
} else
} else if (status == 0) {
/* no match */
return Py_None;
}
/* internal error */
return NULL;
}
/* -------------------------------------------------------------------- */
/* scanner methods (experimental) */
static void
{
}
static PyObject*
{
int status;
} else {
#if defined(HAVE_UNICODE)
#endif
}
if (PyErr_Occurred())
return NULL;
else
return match;
}
static PyObject*
{
int status;
} else {
#if defined(HAVE_UNICODE)
#endif
}
if (PyErr_Occurred())
return NULL;
else
return match;
}
static PyMethodDef scanner_methods[] = {
};
static PyObject*
{
if (res)
return res;
PyErr_Clear();
/* attributes */
}
return NULL;
}
sizeof(ScannerObject), 0,
0, /*tp_print*/
};
static PyObject*
{
/* create search state object */
Py_ssize_t start = 0;
return NULL;
/* create scanner object */
if (!self)
return NULL;
if (!string) {
return NULL;
}
}
static PyMethodDef _functions[] = {
};
#if PY_VERSION_HEX < 0x02030000
#else
PyMODINIT_FUNC init_sre(void)
#endif
{
PyObject* m;
PyObject* d;
PyObject* x;
/* Patch object types */
return;
if (m == NULL)
return;
d = PyModule_GetDict(m);
x = PyInt_FromLong(SRE_MAGIC);
if (x) {
PyDict_SetItemString(d, "MAGIC", x);
Py_DECREF(x);
}
x = PyInt_FromLong(sizeof(SRE_CODE));
if (x) {
PyDict_SetItemString(d, "CODESIZE", x);
Py_DECREF(x);
}
x = PyString_FromString(copyright);
if (x) {
PyDict_SetItemString(d, "copyright", x);
Py_DECREF(x);
}
}
#endif /* !defined(SRE_RECURSIVE) */
/* vim:ts=4:sw=4:et
*/