/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2014 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This header contains definitions that are shared between the different
modules, but which are not relevant to the exported API. This includes some
functions whose names all begin with "_pcre_", "_pcre16_" or "_pcre32_"
depending on the PRIV macro. */
#ifndef PCRE_INTERNAL_H
#define PCRE_INTERNAL_H
/* Define PCRE_DEBUG to get debugging output on stdout. */
#if 0
#define PCRE_DEBUG
#endif
/* PCRE is compiled as an 8 bit library if it is not requested otherwise. */
#if !defined COMPILE_PCRE16 && !defined COMPILE_PCRE32
#define COMPILE_PCRE8
#endif
/* If SUPPORT_UCP is defined, SUPPORT_UTF must also be defined. The
"configure" script ensures this, but not everybody uses "configure". */
#if defined SUPPORT_UCP && !(defined SUPPORT_UTF)
#endif
/* We define SUPPORT_UTF if SUPPORT_UTF8 is enabled for compatibility
reasons with existing code. */
#if defined SUPPORT_UTF8 && !(defined SUPPORT_UTF)
#endif
/* Fixme: SUPPORT_UTF8 should be eventually disappear from the code.
Until then we define it if SUPPORT_UTF is defined. */
#if defined SUPPORT_UTF && !(defined SUPPORT_UTF8)
#endif
/* We do not support both EBCDIC and UTF-8/16/32 at the same time. The "configure"
script prevents both being selected, but not everybody uses "configure". */
#if defined EBCDIC && defined SUPPORT_UTF
#endif
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
inline, and there are *still* stupid compilers about that don't like indented
pre-processor statements, or at least there were when I first wrote this. After
all, it had only been about 10 years then...
It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
be absolutely sure we get our version. */
#ifdef PCRE_DEBUG
#else
#endif
/* Standard C headers plus the external interface definition. The only time
setjmp and stdarg are used is when NO_RECURSE is set. */
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Valgrind (memcheck) support */
#ifdef SUPPORT_VALGRIND
#include <valgrind/memcheck.h>
#endif
/* When compiling a DLL for Windows, the exported symbols have to be declared
using some MS magic. I found some useful information on this web page:
http://msdn2.microsoft.com/en-us/library/y4h7bcy6(VS.80).aspx. According to the
information there, using __declspec(dllexport) without "extern" we have a
definition; with "extern" we have a declaration. The settings here override the
setting in pcre.h (which is included below); it defines only PCRE_EXP_DECL,
which is all that is needed for applications (they just import the symbols). We
use:
PCRE_EXP_DECL for declarations
PCRE_EXP_DEFN for definitions of exported functions
PCRE_EXP_DATA_DEFN for definitions of exported variables
The reason for the two DEFN macros is that in non-Windows environments, one
does not want to have "extern" before variable definitions because it leads to
compiler warnings. So we distinguish between functions and variables. In
Windows, the two should always be the same.
The reason for wrapping this in #ifndef PCRE_EXP_DECL is so that pcretest,
which is an application, but needs to import this file in order to "peek" at
internals, can #include pcre.h first to get an application's-eye view.
In principle, people compiling for non-Windows, non-Unix-like (i.e. uncommon,
special-purpose environments) might want to stick other stuff in front of
exported symbols. That's why, in the non-Windows case, we set PCRE_EXP_DEFN and
PCRE_EXP_DATA_DEFN only if they are not already set. */
#ifndef PCRE_EXP_DECL
# ifdef _WIN32
# ifndef PCRE_STATIC
# else
# define PCRE_EXP_DECL extern
# define PCRE_EXP_DEFN
# define PCRE_EXP_DATA_DEFN
# endif
# else
# ifdef __cplusplus
# else
# define PCRE_EXP_DECL extern
# endif
# ifndef PCRE_EXP_DEFN
# endif
# ifndef PCRE_EXP_DATA_DEFN
# define PCRE_EXP_DATA_DEFN
# endif
# endif
#endif
/* When compiling with the MSVC compiler, it is sometimes necessary to include
a "calling convention" before exported function names. (This is secondhand
information; I know nothing about MSVC myself). For example, something like
void __cdecl function(....)
might be needed. In order so make this easy, all the exported functions have
PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not
set, we ensure here that it has no effect. */
#ifndef PCRE_CALL_CONVENTION
#define PCRE_CALL_CONVENTION
#endif
/* We need to have types that specify unsigned 8, 16 and 32-bit integers. We
cannot determine these outside the compilation (e.g. by running a program as
part of "configure") because PCRE is often cross-compiled for use on other
systems. Instead we make use of the maximum sizes that are available at
preprocessor time in standard C environments. */
typedef unsigned char pcre_uint8;
#if USHRT_MAX == 65535
typedef unsigned short pcre_uint16;
typedef short pcre_int16;
typedef unsigned int pcre_uint16;
typedef int pcre_int16;
#else
#endif
#if UINT_MAX == 4294967295U
typedef unsigned int pcre_uint32;
typedef int pcre_int32;
typedef unsigned long int pcre_uint32;
typedef long int pcre_int32;
#else
#endif
/* When checking for integer overflow in pcre_compile(), we need to handle
large integers. If a 64-bit integer type is available, we can use that.
Otherwise we have to cast to double, which of course requires floating point
arithmetic. Handle this by defining a macro for the appropriate type. If
stdint.h is available, include it; it may define INT64_MAX. Systems that do not
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
by "configure". */
#if defined HAVE_STDINT_H
#include <stdint.h>
#elif defined HAVE_INTTYPES_H
#include <inttypes.h>
#endif
#else
#define INT64_OR_DOUBLE double
#endif
/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
However, we leave the interface to the outside world as char * or short *,
because that should make things easier for callers. This character type is
called pcre_uchar.
The IN_UCHARS macro multiply its argument with the byte size of the current
pcre_uchar type. Useful for memcpy and such operations, whose require the
The MAX_255 macro checks whether its pcre_uchar input is less than 256.
The TABLE_GET macro is designed for accessing elements of tables whose contain
exactly 256 items. When the character is able to contain more than 256
items, some check is needed before accessing these tables.
*/
#if defined COMPILE_PCRE8
typedef unsigned char pcre_uchar;
#define IN_UCHARS(x) (x)
#elif defined COMPILE_PCRE16
#if USHRT_MAX != 65535
/* This is a warning message. Change PCRE_UCHAR16 to a 16 bit data type in
pcre.h(.in) and disable (comment out) this message. */
#endif
#elif defined COMPILE_PCRE32
#else
#endif /* COMPILE_PCRE[8|16|32] */
/* This is an unsigned int value that no character can ever have. UTF-8
characters only go up to 0x7fffffff (though Unicode doesn't go beyond
0x0010ffff). */
/* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
"any" and "anycrlf" at present). The following macros are used to package up
testing for newlines. NLBLOCK, PSSTART, and PSEND are defined in the various
modules to indicate in which datablock the parameters exist, and what the
/* This macro checks for a newline at the given position */
#define IS_NEWLINE(p) \
: \
) \
)
/* This macro checks for a newline immediately preceding the given position */
#define WAS_NEWLINE(p) \
: \
) \
)
/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
with a custom type. This makes it possible, for example, to allow pcre_exec()
to process subject strings that are discontinuous by using a smart pointer
class. It must always be possible to inspect all of the subject string in
pcre_exec() because of the way it backtracks. Two macros are required in the
normal case, for sign-unspecified and unsigned char pointers. The former is
used for the external interface and appears in pcre.h, which is why its name
must begin with PCRE_. */
#ifdef CUSTOM_SUBJECT_PTR
#else
#endif
/* Include the public PCRE header and the definitions of UCP character property
values. */
#include "pcre.h"
#include "ucp.h"
#ifdef COMPILE_PCRE32
/* Assert that the public PCRE_UCHAR32 is a 32-bit type */
#endif
/* When compiling for use with the Virtual Pascal compiler, these functions
need to have their names changed. PCRE must be compiled with the -DVPCOMPAT
option on the command line. */
#ifdef VPCOMPAT
#else /* VPCOMPAT */
/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
define a macro for memmove() if HAVE_MEMMOVE is false, provided that HAVE_BCOPY
is set. Otherwise, include an emulating function for those systems that have
neither (there some non-Unix environments where this is the case). */
#ifndef HAVE_MEMMOVE
#ifdef HAVE_BCOPY
#else /* HAVE_BCOPY */
static void *
{
size_t i;
unsigned char *dest = (unsigned char *)d;
const unsigned char *src = (const unsigned char *)s;
{
dest += n;
src += n;
return (void *)dest;
}
else
{
return (void *)(dest - n);
}
}
#endif /* not HAVE_BCOPY */
#endif /* not HAVE_MEMMOVE */
#endif /* not VPCOMPAT */
/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
in big-endian order) by default. These are used, for example, to link from the
start of a subpattern to its alternatives and its end. The use of 2 bytes per
offset limits the size of the compiled regex to around 64K, which is big enough
for almost everybody. However, I received a request for an even bigger limit.
For this reason, and also to make the code easier to maintain, the storing and
loading of offsets from the byte string is now handled by the macros that are
defined here.
The macros are controlled by the value of LINK_SIZE. This defaults to 2 in
the config.h file, but can be overridden by using -D on the command line. This
is automated on Unix systems via the "configure" command. */
#if defined COMPILE_PCRE8
#if LINK_SIZE == 2
#define PUT(a,n,d) \
(a[n] = (d) >> 8), \
(a[(n)+1] = (d) & 255)
#define GET(a,n) \
(((a)[n] << 8) | (a)[(n)+1])
#define PUT(a,n,d) \
(a[n] = (d) >> 16), \
(a[(n)+1] = (d) >> 8), \
(a[(n)+2] = (d) & 255)
#define GET(a,n) \
(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2])
#define PUT(a,n,d) \
(a[n] = (d) >> 24), \
(a[(n)+1] = (d) >> 16), \
(a[(n)+2] = (d) >> 8), \
(a[(n)+3] = (d) & 255)
#define GET(a,n) \
(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3])
/* Keep it positive */
#else
#endif
#elif defined COMPILE_PCRE16
#if LINK_SIZE == 2
/* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */
#define PUT(a,n,d) \
(a[n] = (d))
#define GET(a,n) \
(a[n])
/* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */
#define PUT(a,n,d) \
(a[n] = (d) >> 16), \
(a[(n)+1] = (d) & 65535)
#define GET(a,n) \
(((a)[n] << 16) | (a)[(n)+1])
/* Keep it positive */
#else
#endif
#elif defined COMPILE_PCRE32
/* Only supported LINK_SIZE is 4 */
/* Redefine LINK_SIZE as a multiple of sizeof(pcre_uchar) */
#define PUT(a,n,d) \
(a[n] = (d))
#define GET(a,n) \
(a[n])
/* Keep it positive */
#else
#endif /* COMPILE_PCRE[8|16|32] */
/* Convenience macro defined in terms of the others */
/* PCRE uses some other 2-byte quantities that do not change when the size of
offsets changes. There are used for repeat counts and for other things such as
capturing parenthesis numbers in back references. */
#if defined COMPILE_PCRE8
#define PUT2(a,n,d) \
a[n] = (d) >> 8; \
a[(n)+1] = (d) & 255
/* For reasons that I do not understand, the expression in this GET2 macro is
treated by gcc as a signed expression, even when a is declared as unsigned. It
seems that any kind of arithmetic results in a signed value. */
#define GET2(a,n) \
(unsigned int)(((a)[n] << 8) | (a)[(n)+1])
#elif defined COMPILE_PCRE16
#define PUT2(a,n,d) \
a[n] = d
#define GET2(a,n) \
a[n]
#elif defined COMPILE_PCRE32
#define PUT2(a,n,d) \
a[n] = d
#define GET2(a,n) \
a[n]
#else
#endif /* COMPILE_PCRE[8|16|32] */
/* The maximum length of a MARK name is currently one data unit; it may be
changed in future to be a fixed number of bytes or to depend on LINK_SIZE. */
#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
#else
#endif
/* There is a proposed future special "UTF-21" mode, in which only the lowest
21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
high-order bits available to the application for other uses. In preparation for
the future implementation of this mode, there are macros that load a data item
and, if in this special mode, mask it to 21 bits. These macros all have names
starting with UCHAR21. In all other modes, including the normal 32-bit
library, the macros all have the same simple definitions. When the new mode is
implemented, it is expected that these definitions will be varied appropriately
using #ifdef when compiling the library that supports the special mode. */
/* When UTF encoding is being used, a character is no longer just a single
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
handling generate simple sequences when used in the basic mode, and more
complicated ones for UTF characters. GETCHARLENTEST and other macros are not
used when UTF is not supported. To make sure they can never even appear when
UTF support is omitted, we don't even define them. */
#ifndef SUPPORT_UTF
/* #define MAX_VALUE_FOR_SINGLE_CHAR */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCHAR(c) */
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define ACROSSCHAR(condition, eptr, action) */
#else /* SUPPORT_UTF */
/* Tests whether the code point needs extra characters to decode. */
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer. */
{ \
if ((c & 0x20) == 0) \
else if ((c & 0x10) == 0) \
else if ((c & 0x08) == 0) \
else if ((c & 0x04) == 0) \
else \
}
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. */
{ \
if ((c & 0x20) == 0) \
else if ((c & 0x10) == 0) \
{ \
eptr += 2; \
} \
else if ((c & 0x08) == 0) \
{ \
eptr += 3; \
} \
else if ((c & 0x04) == 0) \
{ \
eptr += 4; \
} \
else \
{ \
eptr += 5; \
} \
}
#if defined COMPILE_PCRE8
/* These macros were originally written in the form of loops that used data
from the tables whose names start with PRIV(utf8_table). They were rewritten by
a user so as not to use loops, because in some environments this gives a
significant performance advantage, and it seems never to do any harm. */
/* Tells the biggest code point which can be encoded as a single character. */
/* Tests whether the code point needs extra characters to decode. */
/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
Otherwise it has an undefined behaviour. */
/* Returns TRUE, if the given character is not the first character
of a UTF sequence. */
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
c = *eptr; \
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */
c = *eptr; \
/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */
c = *eptr++; \
/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */
c = *eptr++; \
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer, incrementing the length. */
{ \
if ((c & 0x20) == 0) \
{ \
len++; \
} \
else if ((c & 0x10) == 0) \
{ \
len += 2; \
} \
else if ((c & 0x08) == 0) \
{\
len += 3; \
} \
else if ((c & 0x04) == 0) \
{ \
len += 4; \
} \
else \
{\
len += 5; \
} \
}
/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */
c = *eptr; \
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */
c = *eptr; \
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. */
/* Same as above, just in the other direction. */
/* Same as above, but it allows a fully customizable form. */
#elif defined COMPILE_PCRE16
/* Tells the biggest code point which can be encoded as a single character. */
/* Tests whether the code point needs extra characters to decode. */
/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
Otherwise it has an undefined behaviour. */
/* Returns TRUE, if the given character is not the first character
of a UTF sequence. */
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. */
/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */
c = *eptr; \
/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */
c = *eptr; \
/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. */
/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */
c = *eptr++; \
/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */
c = *eptr++; \
/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */
/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */
c = *eptr; \
/* Get the next UTF-816character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */
c = *eptr; \
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code. */
/* Same as above, just in the other direction. */
/* Same as above, but it allows a fully customizable form. */
#elif defined COMPILE_PCRE32
/* These are trivial for the 32-bit library, since all UTF-32 characters fit
into one pcre_uchar unit. */
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCHAR(c) (0)
/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */
c = *(eptr);
/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */
c = *(eptr);
/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */
c = *((eptr)++);
/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */
c = *((eptr)++);
/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */
/* Get the next UTF-32character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */
GETCHARTEST(c, eptr)
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-32 only
code.
These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */
/* Same as above, just in the other direction. */
/* Same as above, but it allows a fully customizable form. */
#else
#endif /* COMPILE_PCRE[8|16|32] */
#endif /* SUPPORT_UTF */
/* Tests for Unicode horizontal and vertical whitespace characters must check a
number of different values. Using a switch statement for this generates the
fastest code (no loop, no memory access), and there are several places in the
interpreter code where this happens. In order to ensure that all the case lists
remain in step, we use macros so that there is only one place where the lists
are defined.
These values are also required as lists in pcre_compile.c when processing \h,
\H, \v and \V in a character class. The lists are defined in pcre_tables.c, but
macros that define the values are here so that all the definitions are
together. The lists must be in ascending character order, terminated by
NOTACHAR (which is 0xffffffff).
Any changes should ensure that the various macros are kept in step with each
other. NOTE: The values also appear in pcre_jit_compile.c. */
#ifndef EBCDIC
#define HSPACE_LIST \
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
#define HSPACE_MULTIBYTE_CASES \
case 0x1680: /* OGHAM SPACE MARK */ \
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ \
case 0x2000: /* EN QUAD */ \
case 0x2001: /* EM QUAD */ \
case 0x2002: /* EN SPACE */ \
case 0x2003: /* EM SPACE */ \
case 0x2004: /* THREE-PER-EM SPACE */ \
case 0x2005: /* FOUR-PER-EM SPACE */ \
case 0x2006: /* SIX-PER-EM SPACE */ \
case 0x2007: /* FIGURE SPACE */ \
case 0x2008: /* PUNCTUATION SPACE */ \
case 0x2009: /* THIN SPACE */ \
case 0x200A: /* HAIR SPACE */ \
case 0x202f: /* NARROW NO-BREAK SPACE */ \
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ \
case 0x3000 /* IDEOGRAPHIC SPACE */
#define HSPACE_BYTE_CASES \
case CHAR_HT: \
case CHAR_SPACE: \
case 0xa0 /* NBSP */
#define HSPACE_CASES \
#define VSPACE_LIST \
#define VSPACE_MULTIBYTE_CASES \
case 0x2028: /* LINE SEPARATOR */ \
case 0x2029 /* PARAGRAPH SEPARATOR */
#define VSPACE_BYTE_CASES \
case CHAR_LF: \
case CHAR_VT: \
case CHAR_FF: \
case CHAR_CR: \
case CHAR_NEL
#define VSPACE_CASES \
/* ------ EBCDIC environments ------ */
#else
#define HSPACE_BYTE_CASES \
case CHAR_HT: \
case CHAR_SPACE
#ifdef EBCDIC_NL25
#define VSPACE_LIST \
#else
#define VSPACE_LIST \
#endif
#define VSPACE_BYTE_CASES \
case CHAR_LF: \
case CHAR_VT: \
case CHAR_FF: \
case CHAR_CR: \
case CHAR_NEL
#endif /* EBCDIC */
/* ------ End of whitespace macros ------ */
/* Private flags containing information about the compiled regex. They used to
live at the top end of the options word, but that got almost full, so they were
moved to a 16-bit flags word - which got almost full, so now they are in a
32-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as the
restrictions on partial matching have been lifted. It remains for backwards
compatibility. */
#if defined COMPILE_PCRE8
#elif defined COMPILE_PCRE16
#elif defined COMPILE_PCRE32
#endif
/* Flags for the "extra" block produced by pcre_study(). */
/* Masks for identifying the public options that are permitted at compile
time, run time, or study time, respectively. */
#define PUBLIC_COMPILE_OPTIONS \
#define PUBLIC_EXEC_OPTIONS \
#define PUBLIC_DFA_EXEC_OPTIONS \
#define PUBLIC_STUDY_OPTIONS \
#define PUBLIC_JIT_EXEC_OPTIONS \
/* Magic number to provide a small check against being handed junk. */
/* This variable is used to detect a loaded regular expression
in different endianness. */
/* The maximum remaining length of subject we are prepared to search for a
req_byte match. */
/* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
environments where these macros are defined elsewhere. Unfortunately, there
is no way to do the same for the typedef. */
typedef int BOOL;
#ifndef FALSE
#define FALSE 0
#endif
/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
character constants like '*' because the compiler would emit their EBCDIC code,
is enabled. When UTF-8 support is not enabled, the definitions use character
literals. Both character and string versions of each character are needed, and
there are some longer strings as well.
This means that, on EBCDIC platforms, the PCRE library can handle either
EBCDIC, or UTF-8, but not both. To support both in the same compiled library
would need different lookups depending on whether PCRE_UTF8 was set or not.
which would reduce performance. For a theoretical use (which nobody has asked
for) in a minority area (EBCDIC platforms), this is not sensible. Any
application that did need both could compile two versions of the library, using
macros to give the functions distinct names. */
#ifndef SUPPORT_UTF
/* UTF-8 support is not enabled; use the platform-dependent character literals
so that PCRE works in both ASCII and EBCDIC environments, but only in non-UTF
mode. Newline characters are problematic in EBCDIC. Though it has CR and LF
characters, a common practice has been to use its NL (0x15) character as the
line terminator in C-like processing environments. However, sometimes the LF
(0x25) character is used instead, according to this Unicode document:
PCRE defaults EBCDIC NL to 0x15, but has a build-time option to select 0x25
instead. Whichever is *not* chosen is defined as NEL.
In both ASCII and EBCDIC environments, CHAR_NL and CHAR_LF are synonyms for the
same code point. */
#ifdef EBCDIC
#ifndef EBCDIC_NL25
#else
#endif
#else /* Not EBCDIC */
compatibility. NEL is the Unicode newline character; make sure it is
a positive value. */
#endif /* EBCDIC */
/* The remaining definitions work in both environments. */
#else /* SUPPORT_UTF */
/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
works in both modes non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode
only. */
#define STRING_WEIRD_STARTWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_LESS_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
#define STRING_WEIRD_ENDWORD STR_LEFT_SQUARE_BRACKET STR_COLON STR_GREATER_THAN_SIGN STR_COLON STR_RIGHT_SQUARE_BRACKET STR_RIGHT_SQUARE_BRACKET
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
#define STRING_NO_AUTO_POSSESS_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_A STR_U STR_T STR_O STR_UNDERSCORE STR_P STR_O STR_S STR_S STR_E STR_S STR_S STR_RIGHT_PARENTHESIS
#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
#define STRING_LIMIT_MATCH_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_M STR_A STR_T STR_C STR_H STR_EQUALS_SIGN
#define STRING_LIMIT_RECURSION_EQ STR_L STR_I STR_M STR_I STR_T STR_UNDERSCORE STR_R STR_E STR_C STR_U STR_R STR_S STR_I STR_O STR_N STR_EQUALS_SIGN
#endif /* SUPPORT_UTF */
/* Escape items that are just an encoding of a particular data value. */
#ifndef ESC_e
#endif
#ifndef ESC_f
#endif
#ifndef ESC_n
#endif
#ifndef ESC_r
#endif
/* We can't officially use ESC_t because it is a POSIX reserved identifier
(presumably because of all the others like size_t). */
#ifndef ESC_tee
#endif
/* Codes for different types of Unicode property */
/* The following special properties are used only in XCLASS items, when POSIX
classes are specified and PCRE_UCP is set - in other words, for Unicode
handling of these classes. They are not available via the \p or \P escapes like
those in the above list, and so they do not take part in the autopossessifying
table. */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain characters with values greater than 255. */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns 0
for a data character. Also, they must appear in the same order as in the
opcode definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In
non-DOTALL mode, "." behaves like \N.
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
when PCRE_UCP is set and replacement of \d etc by \p sequences is required.
They must be contiguous, and remain in order so that the replacements can be
looked up from a table.
Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
check_escape(). There are two tests in the code for an escape
greater than ESC_b and less than ESC_Z to detect the types that may be
repeated. These are the types that consume characters. If any new escapes are
put in between that don't consume a character, that code will have to change.
*/
/********************** Opcode definitions ******************/
/****** NOTE NOTE NOTE ******
Starting from 1 (i.e. after OP_END), the values up to OP_EOD must correspond in
order to the list of escapes immediately above. Furthermore, values up to
OP_DOLLM must not be changed without adjusting the table called autoposstab in
Whenever this list is updated, the two macro definitions that follow must be
updated to match. The possessification table called "opcode_possessify" in
pcre_compile.c must also be updated, and also the tables called "coptable"
and "poptable" in pcre_dfa_exec.c.
****** NOTE NOTE NOTE ******/
/* The values between FIRST_AUTOTAB_OP and LAST_AUTOTAB_RIGHT_OP, inclusive,
are used in a table for deciding whether a repeated character type can be
auto-possessified. */
enum {
/* Values corresponding to backslashed metacharacters */
/* Line end assertions */
/* Single characters; caseful must precede the caseless ones */
/* The following sets of 13 opcodes must always be kept in step because
the offset from the first one is used to generate the others. */
/* Repeated characters; caseful must precede the caseless ones */
/* Repeated characters; caseless must follow the caseful ones */
/* The negated ones must follow the non-negated ones, and match them */
/* Negated repeated character, caseful; must precede the caseless ones */
/* Negated repeated character, caseless; must follow the caseful ones */
/* Character types */
/* These are used for character classes and back references; only the
first six are the same as the sets above. */
/* End of quantifier opcodes */
class - the difference is relevant only when a
character > 255 is encountered. */
class. This does both positive and negative. */
/* The assertions must come before BRA, CBRA, ONCE, and COND, and the four
asserts must remain in order. */
/* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately
after the assertions, with ONCE first, as there's a test for >= ONCE for a
subpattern that isn't an assertion. The POS versions must immediately follow
the non-POS versions in each case. */
/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
/* The next two pairs must (respectively) be kept together. */
/* These are backtracking control verbs */
/* These are forced failure and success verbs */
/* This is used to skip a subpattern with a {0} quantifier */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
some in the past. */
};
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
called "opcode_possessify" in pcre_compile.c and "coptable" and "poptable" in
pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
for debugging, and some of them are only partial names. The macro is referenced
only in pcre_printint.c, which fills out the full names in many cases (and in
some cases doesn't actually use these names at all). */
#define OP_NAME_LIST \
"End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
"extuni", "\\Z", "\\z", \
"$", "$", "^", "^", "char", "chari", "not", "noti", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", \
"{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
"*+","++", "?+", "{", \
"*", "*?", "+", "+?", "?", "??", "{", "{", \
"*+","++", "?+", "{", \
"class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \
"Recurse", "Callout", \
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
"Reverse", "Assert", "Assert not", "AssertB", "AssertB not", \
"Once", "Once_NC", \
"Bra", "BraPos", "CBra", "CBraPos", \
"Cond", \
"SBra", "SBraPos", "SCBra", "SCBraPos", \
"SCond", \
"Cond ref", "Cond dnref", "Cond rec", "Cond dnrec", "Cond def", \
"Brazero", "Braminzero", "Braposzero", \
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
"*THEN", "*THEN", "*COMMIT", "*FAIL", \
"*ACCEPT", "*ASSERT_ACCEPT", \
"Close", "Skip zero"
/* This macro defines the length of fixed length operations in the compiled
regex. The lengths are used when searching for specific things, and also in the
debugging printing of a compiled regex. We use a macro so that it can be
defined close to the definitions of the opcodes themselves.
As things have been extended, some of these are no longer fixed lenths, but are
minima instead. For example, the length of a single-character repeat may vary
in UTF-8 mode. The code that uses this table must know about such things. */
#define OP_LENGTHS \
1, /* End */ \
1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
1, 1, 1, /* Any, AllAny, Anybyte */ \
3, 3, /* \P, \p */ \
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
1, /* \X */ \
1, 1, 1, 1, 1, 1, /* \Z, \z, $, $M ^, ^M */ \
2, /* Char - the minimum length */ \
2, /* Chari - the minimum length */ \
2, /* not */ \
2, /* noti */ \
/* Positive single-char repeats ** These are */ \
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
2, 2, 2, 2, 2, 2, /* *I, *?I, +I, +?I, ?I, ??I ** UTF-8 */ \
/* Negative single-char repeats - only for chars < 256 */ \
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
2, 2, 2, 2, 2, 2, /* NOT *I, *?I, +I, +?I, ?I, ??I */ \
/* Positive type repeats */ \
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
/* Character class & ref repeats */ \
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
0, /* XCLASS - variable length */ \
1, /* DEF */ \
1, 1, 1, /* BRAZERO, BRAMINZERO, BRAPOSZERO */ \
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
1, 3, /* SKIP, SKIP_ARG */ \
1, 3, /* THEN, THEN_ARG */ \
1, 1, 1, 1, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */ \
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the table called eint in
pcreposix.c must be updated. */
/* JIT compiling modes. The function list is indexed by them. */
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
offset to the name table so that if a regex is compiled on one host, saved, and
then run on another where the size of pointers is different, all might still
be well.
The size of the structure must be a multiple of 8 bytes. For the case of
compiled-on-4 and run-on-8, we include an extra pointer that is always NULL so
that there are an even number of pointers which therefore are a multiple of 8
bytes.
It is necessary to fork the struct for the 32 bit library, since it needs to
use pcre_uint32 for first_char and req_char. We can't put an ifdef inside the
typedef because pcretest needs access to the struct of the 8-, 16- and 32-bit
variants.
*** WARNING ***
When new fields are added to these structures, remember to adjust the code in
pcre_byte_order.c that is concerned with swapping the byte order of the fields
when a compiled regex is reloaded on a host with different endianness.
*** WARNING ***
There is also similar byte-flipping code in pcretest.c, which is used for
testing the byte-flipping features. It must also be kept in step.
*** WARNING ***
*/
typedef struct real_pcre8_or_16 {
typedef struct real_pcre32 {
} real_pcre32;
#if defined COMPILE_PCRE8
#elif defined COMPILE_PCRE16
#elif defined COMPILE_PCRE32
#endif
/* Assert that the size of REAL_PCRE is divisible by 8 */
/* Needed in pcretest to access some fields in the real_pcre* structures
* directly. They're unified for 8/16/32 bits since the structs only differ
* after these fields; if that ever changes, need to fork those defines into
* 8/16 and 32 bit versions. */
/* The format of the block used to store data from pcre_study(). The same
remark (see NOTE above) about extending this structure applies. */
typedef struct pcre_study_data {
/* Structure for building a chain of open capturing subpatterns during
compiling, so that instructions to close them can be compiled when (*ACCEPT) is
encountered. This is also used to identify subpatterns that contain recursive
back references to themselves, so that they can be made atomic. */
typedef struct open_capitem {
} open_capitem;
/* Structure for building a list of named groups during the first pass of
compiling. */
typedef struct named_group {
} named_group;
/* Structure for passing "static" information around between the functions
doing the compiling, so that they are thread-safe. */
typedef struct compile_data {
} compile_data;
/* Structure for maintaining a chain of pointers to the currently incomplete
branches, for testing for left recursion while compiling. */
typedef struct branch_chain {
} branch_chain;
/* Structure for items in a linked list that represents an explicit recursive
call within the pattern; used by pcre_exec(). */
typedef struct recursion_info {
/* A similar structure for pcre_dfa_exec(). */
typedef struct dfa_recursion_info {
int group_num;
/* Structure for building a chain of data for holding the values of the subject
pointer at the start of each subpattern, so as to detect when an empty string
has been matched by a subpattern - to break infinite loops; used by
pcre_exec(). */
typedef struct eptrblock {
} eptrblock;
/* Structure for passing "static" information around between the functions
doing traditional NFA matching, so that they are thread-safe. */
typedef struct match_data {
#ifdef NO_RECURSE
#endif
} match_data;
/* A similar structure is used for the same purpose by the DFA matching
functions. */
typedef struct dfa_match_data {
/* Bit definitions for entries in the pcre_ctypes table. */
/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
of bits for a class map. Some classes are built by combining these tables. */
/* Offsets of the various tables from the base tables pointer, and
total length. */
#define lcc_offset 0
/* Internal function and data prefixes. */
#if defined COMPILE_PCRE8
#ifndef PUBL
#endif
#ifndef PRIV
#endif
#elif defined COMPILE_PCRE16
#ifndef PUBL
#endif
#ifndef PRIV
#endif
#elif defined COMPILE_PCRE32
#ifndef PUBL
#endif
#ifndef PRIV
#endif
#else
#endif /* COMPILE_PCRE[8|16|32] */
/* Layout of the UCP type table that translates property names into types and
codes. Each entry used to point directly to a name, but to reduce the number of
relocations in shared libraries, it now has an offset into a single string
instead. */
typedef struct {
/* Internal shared data tables. These are tables that are used by more than one
of the exported public functions. They have to be "external" in the C sense,
but are not part of the PCRE public API. The data for these tables is in the
pcre_tables.c module. */
#ifdef COMPILE_PCRE8
extern const int PRIV(utf8_table1)[];
extern const int PRIV(utf8_table1_size);
extern const int PRIV(utf8_table2)[];
extern const int PRIV(utf8_table3)[];
#endif /* COMPILE_PCRE8 */
/* Internal shared functions. These are functions that are used by more than
one of the exported public functions. They have to be "external" in the C
sense, but are not part of the PCRE public API. */
/* String comparison functions. */
#if defined COMPILE_PCRE8
const pcre_uchar *);
const char *);
const pcre_uchar *, unsigned int num);
const char *, unsigned int num);
#endif /* COMPILE_PCRE[8|16|32] */
#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
#elif defined COMPILE_PCRE32
const pcre_uchar *);
const char *);
#endif /* COMPILE_PCRE[8|16|32] */
int *, BOOL);
int *, BOOL);
#ifdef SUPPORT_JIT
const pcre_uchar *, int, int, int, int *, int);
extern int PRIV(jit_get_size)(void *);
extern const char* PRIV(jit_get_target)(void);
#endif
/* Unicode character database (UCD) */
typedef struct {
} ucd_record;
#ifdef SUPPORT_JIT
extern const int PRIV(ucp_typerange)[];
#endif
#ifdef SUPPORT_UCP
/* UCD access macros */
#endif /* SUPPORT_UCP */
#endif
/* End of pcre_internal.h */