mirror of
https://git.freebsd.org/ports.git
synced 2025-07-07 20:39:18 -04:00
10262 lines
346 KiB
Text
10262 lines
346 KiB
Text
diff -ruN ../pcre.orig/config.m4 ./config.m4
|
|
--- ../pcre.orig/config.m4 Mon Dec 4 19:01:53 2006
|
|
+++ ./config.m4 Fri Feb 9 22:31:18 2007
|
|
@@ -13,7 +13,7 @@
|
|
|
|
if test "$PHP_PCRE_REGEX" != "no"; then
|
|
if test "$PHP_PCRE_REGEX" = "yes"; then
|
|
- PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -I@ext_srcdir@/pcrelib)
|
|
+ PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -I@ext_srcdir@/pcrelib)
|
|
PHP_ADD_BUILD_DIR($ext_builddir/pcrelib)
|
|
PHP_INSTALL_HEADERS([ext/pcre], [php_pcre.h pcrelib/])
|
|
AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ])
|
|
diff -ruN ../pcre.orig/pcrelib/dftables.c ./pcrelib/dftables.c
|
|
--- ../pcre.orig/pcrelib/dftables.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/dftables.c Fri Feb 9 22:31:19 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -86,7 +86,16 @@
|
|
fprintf(f,
|
|
"This file contains the default tables for characters with codes less than\n"
|
|
"128 (ASCII characters). These tables are used when no external tables are\n"
|
|
- "passed to PCRE. */\n\n"
|
|
+ "passed to PCRE.\n\n");
|
|
+fprintf(f,
|
|
+ "The following #include is present because without it gcc 4.x may remove\n"
|
|
+ "the array definition from the final binary if PCRE is built into a static\n"
|
|
+ "library and dead code stripping is activated. This leads to link errors.\n"
|
|
+ "Pulling in the header ensures that the array gets flagged as \"someone\n"
|
|
+ "outside this compilation unit might reference this\" and so it will always\n"
|
|
+ "be supplied to the linker. */\n\n"
|
|
+ "#include \"pcre_internal.h\"\n\n");
|
|
+fprintf(f,
|
|
"const unsigned char _pcre_default_tables[] = {\n\n"
|
|
"/* This table is a lower casing table. */\n\n");
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/pcre.h ./pcrelib/pcre.h
|
|
--- ../pcre.orig/pcrelib/pcre.h Wed Jan 3 19:32:27 2007
|
|
+++ ./pcrelib/pcre.h Fri Feb 9 22:31:19 2007
|
|
@@ -5,7 +5,7 @@
|
|
/* This is the public header file for the PCRE library, to be #included by
|
|
applications that call the PCRE functions.
|
|
|
|
- Copyright (c) 1997-2005 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -38,7 +38,7 @@
|
|
|
|
#ifndef _PCRE_H
|
|
#define _PCRE_H
|
|
-
|
|
+
|
|
#include "php_compat.h"
|
|
|
|
/* The current PCRE version information. */
|
|
@@ -54,10 +54,10 @@
|
|
cannot run ./configure. As it now stands, this file need not be edited in that
|
|
circumstance. */
|
|
|
|
-#define PCRE_MAJOR 6
|
|
-#define PCRE_MINOR 7
|
|
+#define PCRE_MAJOR 7
|
|
+#define PCRE_MINOR 0
|
|
#define PCRE_PRERELEASE
|
|
-#define PCRE_DATE 04-Jul-2006
|
|
+#define PCRE_DATE 18-Dec-2006
|
|
|
|
/* Win32 uses DLL by default; it needs special stuff for exported functions
|
|
when building PCRE. */
|
|
@@ -120,6 +120,7 @@
|
|
#define PCRE_NEWLINE_CR 0x00100000
|
|
#define PCRE_NEWLINE_LF 0x00200000
|
|
#define PCRE_NEWLINE_CRLF 0x00300000
|
|
+#define PCRE_NEWLINE_ANY 0x00400000
|
|
|
|
/* Exec-time and get/set-time error codes */
|
|
|
|
@@ -127,7 +128,8 @@
|
|
#define PCRE_ERROR_NULL (-2)
|
|
#define PCRE_ERROR_BADOPTION (-3)
|
|
#define PCRE_ERROR_BADMAGIC (-4)
|
|
-#define PCRE_ERROR_UNKNOWN_NODE (-5)
|
|
+#define PCRE_ERROR_UNKNOWN_OPCODE (-5)
|
|
+#define PCRE_ERROR_UNKNOWN_NODE (-5) /* For backward compatibility */
|
|
#define PCRE_ERROR_NOMEMORY (-6)
|
|
#define PCRE_ERROR_NOSUBSTRING (-7)
|
|
#define PCRE_ERROR_MATCHLIMIT (-8)
|
|
@@ -144,6 +146,8 @@
|
|
#define PCRE_ERROR_DFA_WSSIZE (-19)
|
|
#define PCRE_ERROR_DFA_RECURSE (-20)
|
|
#define PCRE_ERROR_RECURSIONLIMIT (-21)
|
|
+#define PCRE_ERROR_NULLWSLIMIT (-22)
|
|
+#define PCRE_ERROR_BADNEWLINE (-23)
|
|
|
|
/* Request types for pcre_fullinfo() */
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_compile.c ./pcrelib/pcre_compile.c
|
|
--- ../pcre.orig/pcrelib/pcre_compile.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_compile.c Fri Feb 9 22:31:19 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -42,7 +42,11 @@
|
|
supporting internal functions that are not used by other modules. */
|
|
|
|
|
|
-#define NLBLOCK cd /* The block containing newline information */
|
|
+#define NLBLOCK cd /* Block containing newline information */
|
|
+#define PSSTART start_pattern /* Field containing processed string start */
|
|
+#define PSEND end_pattern /* Field containing processed string end */
|
|
+
|
|
+
|
|
#include "pcre_internal.h"
|
|
|
|
|
|
@@ -54,18 +58,23 @@
|
|
#endif
|
|
|
|
|
|
-
|
|
/*************************************************
|
|
* Code parameters and static tables *
|
|
*************************************************/
|
|
|
|
-/* Maximum number of items on the nested bracket stacks at compile time. This
|
|
-applies to the nesting of all kinds of parentheses. It does not limit
|
|
-un-nested, non-capturing parentheses. This number can be made bigger if
|
|
-necessary - it is used to dimension one int and one unsigned char vector at
|
|
-compile time. */
|
|
+/* This value specifies the size of stack workspace that is used during the
|
|
+first pre-compile phase that determines how much memory is required. The regex
|
|
+is partly compiled into this space, but the compiled parts are discarded as
|
|
+soon as they can be, so that hopefully there will never be an overrun. The code
|
|
+does, however, check for an overrun. The largest amount I've seen used is 218,
|
|
+so this number is very generous.
|
|
+
|
|
+The same workspace is used during the second, actual compile phase for
|
|
+remembering forward references to groups so that they can be filled in at the
|
|
+end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
|
|
+is 4 there is plenty of room. */
|
|
|
|
-#define BRASTACK_SIZE 200
|
|
+#define COMPILE_WORK_SIZE (4096)
|
|
|
|
|
|
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
|
|
@@ -79,10 +88,10 @@
|
|
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
|
|
'@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
|
|
0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
|
|
--ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
|
|
+-ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
|
|
-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
|
|
'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
|
|
- 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
|
|
+ 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
|
|
-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
|
|
0, 0, -ESC_z /* x - z */
|
|
};
|
|
@@ -98,7 +107,7 @@
|
|
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
|
|
/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
|
|
/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
|
|
-/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
|
|
+/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
|
|
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
|
|
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
|
|
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
|
|
@@ -107,7 +116,7 @@
|
|
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
|
|
/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
|
|
-/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
|
|
+/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
|
|
/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
|
|
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
|
|
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
|
|
@@ -156,8 +165,13 @@
|
|
};
|
|
|
|
|
|
+#define STRING(a) # a
|
|
+#define XSTRING(s) STRING(s)
|
|
+
|
|
/* The texts of compile-time error messages. These are "char *" because they
|
|
-are passed to the outside world. */
|
|
+are passed to the outside world. Do not ever re-use any error number, because
|
|
+they are documented. Always add a new error instead. Messages marked DEAD below
|
|
+are no longer used. */
|
|
|
|
static const char *error_texts[] = {
|
|
"no error",
|
|
@@ -172,7 +186,7 @@
|
|
"range out of order in character class",
|
|
"nothing to repeat",
|
|
/* 10 */
|
|
- "operand of unlimited repeat could match the empty string",
|
|
+ "operand of unlimited repeat could match the empty string", /** DEAD **/
|
|
"internal error: unexpected repeat",
|
|
"unrecognized character after (?",
|
|
"POSIX named classes are supported only within a class",
|
|
@@ -182,7 +196,7 @@
|
|
"erroffset passed as NULL",
|
|
"unknown option bit(s) set",
|
|
"missing ) after comment",
|
|
- "parentheses nested too deeply",
|
|
+ "parentheses nested too deeply", /** DEAD **/
|
|
/* 20 */
|
|
"regular expression too large",
|
|
"failed to get memory",
|
|
@@ -199,7 +213,7 @@
|
|
"unknown POSIX class name",
|
|
"POSIX collating elements are not supported",
|
|
"this version of PCRE is not compiled with PCRE_UTF8 support",
|
|
- "spare error",
|
|
+ "spare error", /** DEAD **/
|
|
"character value in \\x{...} sequence is too large",
|
|
/* 35 */
|
|
"invalid condition (?(0)",
|
|
@@ -210,18 +224,25 @@
|
|
/* 40 */
|
|
"recursive call could loop indefinitely",
|
|
"unrecognized character after (?P",
|
|
- "syntax error after (?P",
|
|
+ "syntax error in subpattern name (missing terminator)",
|
|
"two named subpatterns have the same name",
|
|
"invalid UTF-8 string",
|
|
/* 45 */
|
|
"support for \\P, \\p, and \\X has not been compiled",
|
|
"malformed \\P or \\p sequence",
|
|
"unknown property name after \\P or \\p",
|
|
- "subpattern name is too long (maximum 32 characters)",
|
|
- "too many named subpatterns (maximum 10,000)",
|
|
+ "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
|
|
+ "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
|
|
/* 50 */
|
|
"repeated subpattern is too long",
|
|
- "octal value is greater than \\377 (not in UTF-8 mode)"
|
|
+ "octal value is greater than \\377 (not in UTF-8 mode)",
|
|
+ "internal error: overran compiling workspace",
|
|
+ "internal error: previously-checked referenced subpattern not found",
|
|
+ "DEFINE group contains more than one branch",
|
|
+ /* 55 */
|
|
+ "repeating a DEFINE group is not allowed",
|
|
+ "inconsistent NEWLINE options",
|
|
+ "\\g is not followed by an (optionally braced) non-zero number"
|
|
};
|
|
|
|
|
|
@@ -352,8 +373,8 @@
|
|
/* Definition to allow mutual recursion */
|
|
|
|
static BOOL
|
|
- compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
|
|
- int *, int *, branch_chain *, compile_data *);
|
|
+ compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
|
|
+ int *, branch_chain *, compile_data *, int *);
|
|
|
|
|
|
|
|
@@ -363,9 +384,11 @@
|
|
|
|
/* This function is called when a \ has been encountered. It either returns a
|
|
positive value for a simple escape such as \n, or a negative value which
|
|
-encodes one of the more complicated things such as \d. When UTF-8 is enabled,
|
|
-a positive value greater than 255 may be returned. On entry, ptr is pointing at
|
|
-the \. On exit, it is on the final character of the escape sequence.
|
|
+encodes one of the more complicated things such as \d. A backreference to group
|
|
+n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
|
|
+UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
|
|
+ptr is pointing at the \. On exit, it is on the final character of the escape
|
|
+sequence.
|
|
|
|
Arguments:
|
|
ptrptr points to the pattern position pointer
|
|
@@ -412,6 +435,8 @@
|
|
else
|
|
{
|
|
const uschar *oldptr;
|
|
+ BOOL braced, negated;
|
|
+
|
|
switch (c)
|
|
{
|
|
/* A number of Perl escapes are not handled by PCRE. We give an explicit
|
|
@@ -425,6 +450,48 @@
|
|
*errorcodeptr = ERR37;
|
|
break;
|
|
|
|
+ /* \g must be followed by a number, either plain or braced. If positive, it
|
|
+ is an absolute backreference. If negative, it is a relative backreference.
|
|
+ This is a Perl 5.10 feature. */
|
|
+
|
|
+ case 'g':
|
|
+ if (ptr[1] == '{')
|
|
+ {
|
|
+ braced = TRUE;
|
|
+ ptr++;
|
|
+ }
|
|
+ else braced = FALSE;
|
|
+
|
|
+ if (ptr[1] == '-')
|
|
+ {
|
|
+ negated = TRUE;
|
|
+ ptr++;
|
|
+ }
|
|
+ else negated = FALSE;
|
|
+
|
|
+ c = 0;
|
|
+ while ((digitab[ptr[1]] & ctype_digit) != 0)
|
|
+ c = c * 10 + *(++ptr) - '0';
|
|
+
|
|
+ if (c == 0 || (braced && *(++ptr) != '}'))
|
|
+ {
|
|
+ *errorcodeptr = ERR57;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (negated)
|
|
+ {
|
|
+ if (c > bracount)
|
|
+ {
|
|
+ *errorcodeptr = ERR15;
|
|
+ return 0;
|
|
+ }
|
|
+ c = bracount - (c - 1);
|
|
+ }
|
|
+
|
|
+ c = -(ESC_REF + c);
|
|
+ break;
|
|
+
|
|
/* The handling of escape sequences consisting of a string of digits
|
|
starting with one that is not zero is not straightforward. By experiment,
|
|
the way Perl works seems to be as follows:
|
|
@@ -532,7 +599,9 @@
|
|
}
|
|
break;
|
|
|
|
- /* Other special escapes not starting with a digit are straightforward */
|
|
+ /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
|
|
+ This coding is ASCII-specific, but then the whole concept of \cx is
|
|
+ ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
|
|
|
|
case 'c':
|
|
c = *(++ptr);
|
|
@@ -542,10 +611,6 @@
|
|
return 0;
|
|
}
|
|
|
|
- /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
|
|
- is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
|
|
- (However, an EBCDIC equivalent has now been added.) */
|
|
-
|
|
#if !EBCDIC /* ASCII coding */
|
|
if (c >= 'a' && c <= 'z') c -= 32;
|
|
c ^= 0x40;
|
|
@@ -772,42 +837,111 @@
|
|
|
|
|
|
/*************************************************
|
|
-* Find forward referenced named subpattern *
|
|
+* Find forward referenced subpattern *
|
|
*************************************************/
|
|
|
|
-/* This function scans along a pattern looking for capturing subpatterns, and
|
|
-counting them. If it finds a named pattern that matches the name it is given,
|
|
-it returns its number. This is used for forward references to named
|
|
-subpatterns. We know that if (?P< is encountered, the name will be terminated
|
|
-by '>' because that is checked in the first pass.
|
|
+/* This function scans along a pattern's text looking for capturing
|
|
+subpatterns, and counting them. If it finds a named pattern that matches the
|
|
+name it is given, it returns its number. Alternatively, if the name is NULL, it
|
|
+returns when it reaches a given numbered subpattern. This is used for forward
|
|
+references to subpatterns. We know that if (?P< is encountered, the name will
|
|
+be terminated by '>' because that is checked in the first pass.
|
|
|
|
Arguments:
|
|
- pointer current position in the pattern
|
|
- count current count of capturing parens
|
|
- name name to seek
|
|
- namelen name length
|
|
+ ptr current position in the pattern
|
|
+ count current count of capturing parens so far encountered
|
|
+ name name to seek, or NULL if seeking a numbered subpattern
|
|
+ lorn name length, or subpattern number if name is NULL
|
|
+ xmode TRUE if we are in /x mode
|
|
|
|
Returns: the number of the named subpattern, or -1 if not found
|
|
*/
|
|
|
|
static int
|
|
-find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)
|
|
+find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
|
|
+ BOOL xmode)
|
|
{
|
|
const uschar *thisname;
|
|
+
|
|
for (; *ptr != 0; ptr++)
|
|
{
|
|
- if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }
|
|
+ int term;
|
|
+
|
|
+ /* Skip over backslashed characters and also entire \Q...\E */
|
|
+
|
|
+ if (*ptr == '\\')
|
|
+ {
|
|
+ if (*(++ptr) == 0) return -1;
|
|
+ if (*ptr == 'Q') for (;;)
|
|
+ {
|
|
+ while (*(++ptr) != 0 && *ptr != '\\');
|
|
+ if (*ptr == 0) return -1;
|
|
+ if (*(++ptr) == 'E') break;
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* Skip over character classes */
|
|
+
|
|
+ if (*ptr == '[')
|
|
+ {
|
|
+ while (*(++ptr) != ']')
|
|
+ {
|
|
+ if (*ptr == '\\')
|
|
+ {
|
|
+ if (*(++ptr) == 0) return -1;
|
|
+ if (*ptr == 'Q') for (;;)
|
|
+ {
|
|
+ while (*(++ptr) != 0 && *ptr != '\\');
|
|
+ if (*ptr == 0) return -1;
|
|
+ if (*(++ptr) == 'E') break;
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* Skip comments in /x mode */
|
|
+
|
|
+ if (xmode && *ptr == '#')
|
|
+ {
|
|
+ while (*(++ptr) != 0 && *ptr != '\n');
|
|
+ if (*ptr == 0) return -1;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* An opening parenthesis must now be a real metacharacter */
|
|
+
|
|
if (*ptr != '(') continue;
|
|
- if (ptr[1] != '?') { count++; continue; }
|
|
- if (ptr[2] == '(') { ptr += 2; continue; }
|
|
- if (ptr[2] != 'P' || ptr[3] != '<') continue;
|
|
+ if (ptr[1] != '?')
|
|
+ {
|
|
+ count++;
|
|
+ if (name == NULL && count == lorn) return count;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ ptr += 2;
|
|
+ if (*ptr == 'P') ptr++; /* Allow optional P */
|
|
+
|
|
+ /* We have to disambiguate (?<! and (?<= from (?<name> */
|
|
+
|
|
+ if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
|
|
+ *ptr != '\'')
|
|
+ continue;
|
|
+
|
|
count++;
|
|
- ptr += 4;
|
|
+
|
|
+ if (name == NULL && count == lorn) return count;
|
|
+ term = *ptr++;
|
|
+ if (term == '<') term = '>';
|
|
thisname = ptr;
|
|
- while (*ptr != '>') ptr++;
|
|
- if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0)
|
|
+ while (*ptr != term) ptr++;
|
|
+ if (name != NULL && lorn == ptr - thisname &&
|
|
+ strncmp((const char *)name, (const char *)thisname, lorn) == 0)
|
|
return count;
|
|
}
|
|
+
|
|
return -1;
|
|
}
|
|
|
|
@@ -862,7 +996,8 @@
|
|
|
|
case OP_CALLOUT:
|
|
case OP_CREF:
|
|
- case OP_BRANUMBER:
|
|
+ case OP_RREF:
|
|
+ case OP_DEF:
|
|
code += _pcre_OP_lengths[*code];
|
|
break;
|
|
|
|
@@ -907,14 +1042,14 @@
|
|
{
|
|
int d;
|
|
register int op = *cc;
|
|
- if (op >= OP_BRA) op = OP_BRA;
|
|
|
|
switch (op)
|
|
{
|
|
+ case OP_CBRA:
|
|
case OP_BRA:
|
|
case OP_ONCE:
|
|
case OP_COND:
|
|
- d = find_fixedlength(cc, options);
|
|
+ d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
|
|
if (d < 0) return d;
|
|
branchlength += d;
|
|
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
|
@@ -949,8 +1084,9 @@
|
|
/* Skip over things that don't match chars */
|
|
|
|
case OP_REVERSE:
|
|
- case OP_BRANUMBER:
|
|
case OP_CREF:
|
|
+ case OP_RREF:
|
|
+ case OP_DEF:
|
|
case OP_OPT:
|
|
case OP_CALLOUT:
|
|
case OP_SOD:
|
|
@@ -1094,21 +1230,18 @@
|
|
|
|
if (c == OP_XCLASS) code += GET(code, 1);
|
|
|
|
- /* Handle bracketed group */
|
|
+ /* Handle capturing bracket */
|
|
|
|
- else if (c > OP_BRA)
|
|
+ else if (c == OP_CBRA)
|
|
{
|
|
- int n = c - OP_BRA;
|
|
- if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
|
|
+ int n = GET2(code, 1+LINK_SIZE);
|
|
if (n == number) return (uschar *)code;
|
|
- code += _pcre_OP_lengths[OP_BRA];
|
|
+ code += _pcre_OP_lengths[c];
|
|
}
|
|
|
|
- /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
|
|
- that are followed by a character may be followed by a multi-byte character.
|
|
- The length in the table is a minimum, so we have to scan along to skip the
|
|
- extra bytes. All opcodes are less than 128, so we can use relatively
|
|
- efficient code. */
|
|
+ /* In UTF-8 mode, opcodes that are followed by a character may be followed by
|
|
+ a multi-byte character. The length in the table is a minimum, so we have to
|
|
+ arrange to skip the extra bytes. */
|
|
|
|
else
|
|
{
|
|
@@ -1120,13 +1253,17 @@
|
|
case OP_EXACT:
|
|
case OP_UPTO:
|
|
case OP_MINUPTO:
|
|
+ case OP_POSUPTO:
|
|
case OP_STAR:
|
|
case OP_MINSTAR:
|
|
+ case OP_POSSTAR:
|
|
case OP_PLUS:
|
|
case OP_MINPLUS:
|
|
+ case OP_POSPLUS:
|
|
case OP_QUERY:
|
|
case OP_MINQUERY:
|
|
- while ((*code & 0xc0) == 0x80) code++;
|
|
+ case OP_POSQUERY:
|
|
+ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
|
|
break;
|
|
}
|
|
}
|
|
@@ -1164,18 +1301,10 @@
|
|
|
|
if (c == OP_XCLASS) code += GET(code, 1);
|
|
|
|
- /* All bracketed groups have the same length. */
|
|
-
|
|
- else if (c > OP_BRA)
|
|
- {
|
|
- code += _pcre_OP_lengths[OP_BRA];
|
|
- }
|
|
-
|
|
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
|
|
that are followed by a character may be followed by a multi-byte character.
|
|
- The length in the table is a minimum, so we have to scan along to skip the
|
|
- extra bytes. All opcodes are less than 128, so we can use relatively
|
|
- efficient code. */
|
|
+ The length in the table is a minimum, so we have to arrange to skip the extra
|
|
+ bytes. */
|
|
|
|
else
|
|
{
|
|
@@ -1187,13 +1316,17 @@
|
|
case OP_EXACT:
|
|
case OP_UPTO:
|
|
case OP_MINUPTO:
|
|
+ case OP_POSUPTO:
|
|
case OP_STAR:
|
|
case OP_MINSTAR:
|
|
+ case OP_POSSTAR:
|
|
case OP_PLUS:
|
|
case OP_MINPLUS:
|
|
+ case OP_POSPLUS:
|
|
case OP_QUERY:
|
|
case OP_MINQUERY:
|
|
- while ((*code & 0xc0) == 0x80) code++;
|
|
+ case OP_POSQUERY:
|
|
+ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
|
|
break;
|
|
}
|
|
}
|
|
@@ -1207,10 +1340,11 @@
|
|
*************************************************/
|
|
|
|
/* This function scans through a branch of a compiled pattern to see whether it
|
|
-can match the empty string or not. It is called only from could_be_empty()
|
|
-below. Note that first_significant_code() skips over assertions. If we hit an
|
|
-unclosed bracket, we return "empty" - this means we've struck an inner bracket
|
|
-whose current branch will already have been scanned.
|
|
+can match the empty string or not. It is called from could_be_empty()
|
|
+below and from compile_branch() when checking for an unlimited repeat of a
|
|
+group that can match nothing. Note that first_significant_code() skips over
|
|
+assertions. If we hit an unclosed bracket, we return "empty" - this means we've
|
|
+struck an inner bracket whose current branch will already have been scanned.
|
|
|
|
Arguments:
|
|
code points to start of search
|
|
@@ -1224,7 +1358,7 @@
|
|
could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
|
|
{
|
|
register int c;
|
|
-for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
|
|
+for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
|
|
code < endcode;
|
|
code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
|
|
{
|
|
@@ -1232,7 +1366,7 @@
|
|
|
|
c = *code;
|
|
|
|
- if (c >= OP_BRA)
|
|
+ if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
|
|
{
|
|
BOOL empty_branch;
|
|
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
|
|
@@ -1248,11 +1382,18 @@
|
|
}
|
|
while (*code == OP_ALT);
|
|
if (!empty_branch) return FALSE; /* All branches are non-empty */
|
|
- code += 1 + LINK_SIZE;
|
|
- c = *code;
|
|
+
|
|
+ /* Move past the KET and fudge things so that the increment in the "for"
|
|
+ above has no effect. */
|
|
+
|
|
+ c = OP_END;
|
|
+ code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
|
|
+ continue;
|
|
}
|
|
|
|
- else switch (c)
|
|
+ /* Handle the other opcodes */
|
|
+
|
|
+ switch (c)
|
|
{
|
|
/* Check for quantifiers after a class */
|
|
|
|
@@ -1308,12 +1449,15 @@
|
|
case OP_NOT:
|
|
case OP_PLUS:
|
|
case OP_MINPLUS:
|
|
+ case OP_POSPLUS:
|
|
case OP_EXACT:
|
|
case OP_NOTPLUS:
|
|
case OP_NOTMINPLUS:
|
|
+ case OP_NOTPOSPLUS:
|
|
case OP_NOTEXACT:
|
|
case OP_TYPEPLUS:
|
|
case OP_TYPEMINPLUS:
|
|
+ case OP_TYPEPOSPLUS:
|
|
case OP_TYPEEXACT:
|
|
return FALSE;
|
|
|
|
@@ -1325,16 +1469,19 @@
|
|
case OP_ALT:
|
|
return TRUE;
|
|
|
|
- /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
|
|
- followed by a multibyte character */
|
|
+ /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
|
|
+ MINUPTO, and POSUPTO may be followed by a multibyte character */
|
|
|
|
#ifdef SUPPORT_UTF8
|
|
case OP_STAR:
|
|
case OP_MINSTAR:
|
|
+ case OP_POSSTAR:
|
|
case OP_QUERY:
|
|
case OP_MINQUERY:
|
|
+ case OP_POSQUERY:
|
|
case OP_UPTO:
|
|
case OP_MINUPTO:
|
|
+ case OP_POSUPTO:
|
|
if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
|
|
break;
|
|
#endif
|
|
@@ -1452,26 +1599,57 @@
|
|
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
|
|
it, after it has been compiled. This means that any OP_RECURSE items within it
|
|
that refer to the group itself or any contained groups have to have their
|
|
-offsets adjusted. That is the job of this function. Before it is called, the
|
|
-partially compiled regex must be temporarily terminated with OP_END.
|
|
+offsets adjusted. That is one of the jobs of this function. Before it is called,
|
|
+the partially compiled regex must be temporarily terminated with OP_END.
|
|
+
|
|
+This function has been extended with the possibility of forward references for
|
|
+recursions and subroutine calls. It must also check the list of such references
|
|
+for the group we are dealing with. If it finds that one of the recursions in
|
|
+the current group is on this list, it adjusts the offset in the list, not the
|
|
+value in the reference (which is a group number).
|
|
|
|
Arguments:
|
|
group points to the start of the group
|
|
adjust the amount by which the group is to be moved
|
|
utf8 TRUE in UTF-8 mode
|
|
cd contains pointers to tables etc.
|
|
+ save_hwm the hwm forward reference pointer at the start of the group
|
|
|
|
Returns: nothing
|
|
*/
|
|
|
|
static void
|
|
-adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
|
|
+adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
|
|
+ uschar *save_hwm)
|
|
{
|
|
uschar *ptr = group;
|
|
while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
|
|
{
|
|
- int offset = GET(ptr, 1);
|
|
- if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
|
|
+ int offset;
|
|
+ uschar *hc;
|
|
+
|
|
+ /* See if this recursion is on the forward reference list. If so, adjust the
|
|
+ reference. */
|
|
+
|
|
+ for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
|
|
+ {
|
|
+ offset = GET(hc, 0);
|
|
+ if (cd->start_code + offset == ptr + 1)
|
|
+ {
|
|
+ PUT(hc, 0, offset + adjust);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Otherwise, adjust the recursion offset if it's after the start of this
|
|
+ group. */
|
|
+
|
|
+ if (hc >= cd->hwm)
|
|
+ {
|
|
+ offset = GET(ptr, 1);
|
|
+ if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
|
|
+ }
|
|
+
|
|
ptr += 1 + LINK_SIZE;
|
|
}
|
|
}
|
|
@@ -1550,12 +1728,13 @@
|
|
*/
|
|
|
|
static BOOL
|
|
-get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
|
|
+get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
|
|
+ unsigned int *odptr)
|
|
{
|
|
-int c, othercase, next;
|
|
+unsigned int c, othercase, next;
|
|
|
|
for (c = *cptr; c <= d; c++)
|
|
- { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
|
|
+ { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
|
|
|
|
if (c > d) return FALSE;
|
|
|
|
@@ -1576,17 +1755,249 @@
|
|
#endif /* SUPPORT_UCP */
|
|
|
|
|
|
+
|
|
+/*************************************************
|
|
+* Check if auto-possessifying is possible *
|
|
+*************************************************/
|
|
+
|
|
+/* This function is called for unlimited repeats of certain items, to see
|
|
+whether the next thing could possibly match the repeated item. If not, it makes
|
|
+sense to automatically possessify the repeated item.
|
|
+
|
|
+Arguments:
|
|
+ op_code the repeated op code
|
|
+ item data for this item, depends on the opcode
|
|
+ utf8 TRUE in UTF-8 mode
|
|
+ utf8_char used for utf8 character bytes, NULL if not relevant
|
|
+ ptr next character in pattern
|
|
+ options options bits
|
|
+ cd contains pointers to tables etc.
|
|
+
|
|
+Returns: TRUE if possessifying is wanted
|
|
+*/
|
|
+
|
|
+static BOOL
|
|
+check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
|
|
+ const uschar *ptr, int options, compile_data *cd)
|
|
+{
|
|
+int next;
|
|
+
|
|
+/* Skip whitespace and comments in extended mode */
|
|
+
|
|
+if ((options & PCRE_EXTENDED) != 0)
|
|
+ {
|
|
+ for (;;)
|
|
+ {
|
|
+ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
|
|
+ if (*ptr == '#')
|
|
+ {
|
|
+ while (*(++ptr) != 0)
|
|
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
|
|
+ }
|
|
+ else break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+/* If the next item is one that we can handle, get its value. A non-negative
|
|
+value is a character, a negative value is an escape value. */
|
|
+
|
|
+if (*ptr == '\\')
|
|
+ {
|
|
+ int temperrorcode = 0;
|
|
+ next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
|
|
+ if (temperrorcode != 0) return FALSE;
|
|
+ ptr++; /* Point after the escape sequence */
|
|
+ }
|
|
+
|
|
+else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
|
|
+ {
|
|
+#ifdef SUPPORT_UTF8
|
|
+ if (utf8) { GETCHARINC(next, ptr); } else
|
|
+#endif
|
|
+ next = *ptr++;
|
|
+ }
|
|
+
|
|
+else return FALSE;
|
|
+
|
|
+/* Skip whitespace and comments in extended mode */
|
|
+
|
|
+if ((options & PCRE_EXTENDED) != 0)
|
|
+ {
|
|
+ for (;;)
|
|
+ {
|
|
+ while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
|
|
+ if (*ptr == '#')
|
|
+ {
|
|
+ while (*(++ptr) != 0)
|
|
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
|
|
+ }
|
|
+ else break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+/* If the next thing is itself optional, we have to give up. */
|
|
+
|
|
+if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
|
|
+ return FALSE;
|
|
+
|
|
+/* Now compare the next item with the previous opcode. If the previous is a
|
|
+positive single character match, "item" either contains the character or, if
|
|
+"item" is greater than 127 in utf8 mode, the character's bytes are in
|
|
+utf8_char. */
|
|
+
|
|
+
|
|
+/* Handle cases when the next item is a character. */
|
|
+
|
|
+if (next >= 0) switch(op_code)
|
|
+ {
|
|
+ case OP_CHAR:
|
|
+#ifdef SUPPORT_UTF8
|
|
+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
|
|
+#endif
|
|
+ return item != next;
|
|
+
|
|
+ /* For CHARNC (caseless character) we must check the other case. If we have
|
|
+ Unicode property support, we can use it to test the other case of
|
|
+ high-valued characters. */
|
|
+
|
|
+ case OP_CHARNC:
|
|
+#ifdef SUPPORT_UTF8
|
|
+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
|
|
+#endif
|
|
+ if (item == next) return FALSE;
|
|
+#ifdef SUPPORT_UTF8
|
|
+ if (utf8)
|
|
+ {
|
|
+ unsigned int othercase;
|
|
+ if (next < 128) othercase = cd->fcc[next]; else
|
|
+#ifdef SUPPORT_UCP
|
|
+ othercase = _pcre_ucp_othercase((unsigned int)next);
|
|
+#else
|
|
+ othercase = NOTACHAR;
|
|
+#endif
|
|
+ return (unsigned int)item != othercase;
|
|
+ }
|
|
+ else
|
|
+#endif /* SUPPORT_UTF8 */
|
|
+ return (item != cd->fcc[next]); /* Non-UTF-8 mode */
|
|
+
|
|
+ /* For OP_NOT, "item" must be a single-byte character. */
|
|
+
|
|
+ case OP_NOT:
|
|
+ if (next < 0) return FALSE; /* Not a character */
|
|
+ if (item == next) return TRUE;
|
|
+ if ((options & PCRE_CASELESS) == 0) return FALSE;
|
|
+#ifdef SUPPORT_UTF8
|
|
+ if (utf8)
|
|
+ {
|
|
+ unsigned int othercase;
|
|
+ if (next < 128) othercase = cd->fcc[next]; else
|
|
+#ifdef SUPPORT_UCP
|
|
+ othercase = _pcre_ucp_othercase(next);
|
|
+#else
|
|
+ othercase = NOTACHAR;
|
|
+#endif
|
|
+ return (unsigned int)item == othercase;
|
|
+ }
|
|
+ else
|
|
+#endif /* SUPPORT_UTF8 */
|
|
+ return (item == cd->fcc[next]); /* Non-UTF-8 mode */
|
|
+
|
|
+ case OP_DIGIT:
|
|
+ return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
|
|
+
|
|
+ case OP_NOT_DIGIT:
|
|
+ return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
|
|
+
|
|
+ case OP_WHITESPACE:
|
|
+ return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
|
|
+
|
|
+ case OP_NOT_WHITESPACE:
|
|
+ return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
|
|
+
|
|
+ case OP_WORDCHAR:
|
|
+ return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
|
|
+
|
|
+ case OP_NOT_WORDCHAR:
|
|
+ return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
|
|
+
|
|
+ default:
|
|
+ return FALSE;
|
|
+ }
|
|
+
|
|
+
|
|
+/* Handle the case when the next item is \d, \s, etc. */
|
|
+
|
|
+switch(op_code)
|
|
+ {
|
|
+ case OP_CHAR:
|
|
+ case OP_CHARNC:
|
|
+#ifdef SUPPORT_UTF8
|
|
+ if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
|
|
+#endif
|
|
+ switch(-next)
|
|
+ {
|
|
+ case ESC_d:
|
|
+ return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
|
|
+
|
|
+ case ESC_D:
|
|
+ return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
|
|
+
|
|
+ case ESC_s:
|
|
+ return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
|
|
+
|
|
+ case ESC_S:
|
|
+ return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
|
|
+
|
|
+ case ESC_w:
|
|
+ return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
|
|
+
|
|
+ case ESC_W:
|
|
+ return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
|
|
+
|
|
+ default:
|
|
+ return FALSE;
|
|
+ }
|
|
+
|
|
+ case OP_DIGIT:
|
|
+ return next == -ESC_D || next == -ESC_s || next == -ESC_W;
|
|
+
|
|
+ case OP_NOT_DIGIT:
|
|
+ return next == -ESC_d;
|
|
+
|
|
+ case OP_WHITESPACE:
|
|
+ return next == -ESC_S || next == -ESC_d || next == -ESC_w;
|
|
+
|
|
+ case OP_NOT_WHITESPACE:
|
|
+ return next == -ESC_s;
|
|
+
|
|
+ case OP_WORDCHAR:
|
|
+ return next == -ESC_W || next == -ESC_s;
|
|
+
|
|
+ case OP_NOT_WORDCHAR:
|
|
+ return next == -ESC_w || next == -ESC_d;
|
|
+
|
|
+ default:
|
|
+ return FALSE;
|
|
+ }
|
|
+
|
|
+/* Control does not reach here */
|
|
+}
|
|
+
|
|
+
|
|
+
|
|
/*************************************************
|
|
* Compile one branch *
|
|
*************************************************/
|
|
|
|
-/* Scan the pattern, compiling it into the code vector. If the options are
|
|
+/* Scan the pattern, compiling it into the code vector. If the options are
|
|
changed during the branch, the pointer is used to change the external options
|
|
-bits.
|
|
+bits. This function is used during the pre-compile phase when we are trying
|
|
+to find out the amount of memory needed, as well as during the real compile
|
|
+phase. The value of lengthptr distinguishes the two phases.
|
|
|
|
Arguments:
|
|
optionsptr pointer to the option bits
|
|
- brackets points to number of extracting brackets used
|
|
codeptr points to the pointer to the current code point
|
|
ptrptr points to the current pattern pointer
|
|
errorcodeptr points to error code variable
|
|
@@ -1594,15 +2005,17 @@
|
|
reqbyteptr set to the last literal character required, else < 0
|
|
bcptr points to current branch chain
|
|
cd contains pointers to tables etc.
|
|
+ lengthptr NULL during the real compile phase
|
|
+ points to length accumulator during pre-compile phase
|
|
|
|
Returns: TRUE on success
|
|
FALSE, with *errorcodeptr set non-zero on error
|
|
*/
|
|
|
|
static BOOL
|
|
-compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
|
|
- const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
|
|
- int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
|
|
+compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
|
|
+ int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
|
|
+ compile_data *cd, int *lengthptr)
|
|
{
|
|
int repeat_type, op_type;
|
|
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
|
|
@@ -1613,8 +2026,11 @@
|
|
int req_caseopt, reqvary, tempreqvary;
|
|
int options = *optionsptr;
|
|
int after_manual_callout = 0;
|
|
+int length_prevgroup = 0;
|
|
register int c;
|
|
register uschar *code = *codeptr;
|
|
+uschar *last_code = code;
|
|
+uschar *orig_code = code;
|
|
uschar *tempcode;
|
|
BOOL inescq = FALSE;
|
|
BOOL groupsetfirstbyte = FALSE;
|
|
@@ -1622,6 +2038,7 @@
|
|
const uschar *tempptr;
|
|
uschar *previous = NULL;
|
|
uschar *previous_callout = NULL;
|
|
+uschar *save_hwm = NULL;
|
|
uschar classbits[32];
|
|
|
|
#ifdef SUPPORT_UTF8
|
|
@@ -1631,6 +2048,11 @@
|
|
uschar utf8_char[6];
|
|
#else
|
|
BOOL utf8 = FALSE;
|
|
+uschar *utf8_char = NULL;
|
|
+#endif
|
|
+
|
|
+#ifdef DEBUG
|
|
+if (lengthptr != NULL) DPRINTF((">> start branch\n"));
|
|
#endif
|
|
|
|
/* Set up the default and non-default settings for greediness */
|
|
@@ -1664,6 +2086,7 @@
|
|
BOOL negate_class;
|
|
BOOL possessive_quantifier;
|
|
BOOL is_quantifier;
|
|
+ BOOL is_recurse;
|
|
int class_charcount;
|
|
int class_lastchar;
|
|
int newoptions;
|
|
@@ -1671,13 +2094,68 @@
|
|
int skipbytes;
|
|
int subreqbyte;
|
|
int subfirstbyte;
|
|
+ int terminator;
|
|
int mclength;
|
|
uschar mcbuffer[8];
|
|
|
|
- /* Next byte in the pattern */
|
|
+ /* Get next byte in the pattern */
|
|
|
|
c = *ptr;
|
|
|
|
+ /* If we are in the pre-compile phase, accumulate the length used for the
|
|
+ previous cycle of this loop. */
|
|
+
|
|
+ if (lengthptr != NULL)
|
|
+ {
|
|
+#ifdef DEBUG
|
|
+ if (code > cd->hwm) cd->hwm = code; /* High water info */
|
|
+#endif
|
|
+ if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
|
|
+ {
|
|
+ *errorcodeptr = ERR52;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
+ /* There is at least one situation where code goes backwards: this is the
|
|
+ case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
|
|
+ the class is simply eliminated. However, it is created first, so we have to
|
|
+ allow memory for it. Therefore, don't ever reduce the length at this point.
|
|
+ */
|
|
+
|
|
+ if (code < last_code) code = last_code;
|
|
+ *lengthptr += code - last_code;
|
|
+ DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
|
|
+
|
|
+ /* If "previous" is set and it is not at the start of the work space, move
|
|
+ it back to there, in order to avoid filling up the work space. Otherwise,
|
|
+ if "previous" is NULL, reset the current code pointer to the start. */
|
|
+
|
|
+ if (previous != NULL)
|
|
+ {
|
|
+ if (previous > orig_code)
|
|
+ {
|
|
+ memmove(orig_code, previous, code - previous);
|
|
+ code -= previous - orig_code;
|
|
+ previous = orig_code;
|
|
+ }
|
|
+ }
|
|
+ else code = orig_code;
|
|
+
|
|
+ /* Remember where this code item starts so we can pick up the length
|
|
+ next time round. */
|
|
+
|
|
+ last_code = code;
|
|
+ }
|
|
+
|
|
+ /* In the real compile phase, just check the workspace used by the forward
|
|
+ reference list. */
|
|
+
|
|
+ else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
|
|
+ {
|
|
+ *errorcodeptr = ERR52;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
/* If in \Q...\E, check for the end; if not, we have a literal */
|
|
|
|
if (inescq && c != 0)
|
|
@@ -1692,7 +2170,8 @@
|
|
{
|
|
if (previous_callout != NULL)
|
|
{
|
|
- complete_callout(previous_callout, ptr, cd);
|
|
+ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
|
|
+ complete_callout(previous_callout, ptr, cd);
|
|
previous_callout = NULL;
|
|
}
|
|
if ((options & PCRE_AUTO_CALLOUT) != 0)
|
|
@@ -1713,7 +2192,8 @@
|
|
if (!is_quantifier && previous_callout != NULL &&
|
|
after_manual_callout-- <= 0)
|
|
{
|
|
- complete_callout(previous_callout, ptr, cd);
|
|
+ if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
|
|
+ complete_callout(previous_callout, ptr, cd);
|
|
previous_callout = NULL;
|
|
}
|
|
|
|
@@ -1724,12 +2204,12 @@
|
|
if ((cd->ctypes[c] & ctype_space) != 0) continue;
|
|
if (c == '#')
|
|
{
|
|
- while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
|
|
- if (*ptr != 0)
|
|
+ while (*(++ptr) != 0)
|
|
{
|
|
- ptr += cd->nllen - 1;
|
|
- continue;
|
|
+ if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
|
|
}
|
|
+ if (*ptr != 0) continue;
|
|
+
|
|
/* Else fall through to handle end of string */
|
|
c = 0;
|
|
}
|
|
@@ -1745,17 +2225,23 @@
|
|
|
|
switch(c)
|
|
{
|
|
- /* The branch terminates at end of string, |, or ). */
|
|
-
|
|
- case 0:
|
|
- case '|':
|
|
+ /* ===================================================================*/
|
|
+ case 0: /* The branch terminates at string end */
|
|
+ case '|': /* or | or ) */
|
|
case ')':
|
|
*firstbyteptr = firstbyte;
|
|
*reqbyteptr = reqbyte;
|
|
*codeptr = code;
|
|
*ptrptr = ptr;
|
|
+ if (lengthptr != NULL)
|
|
+ {
|
|
+ *lengthptr += code - last_code; /* To include callout length */
|
|
+ DPRINTF((">> end branch\n"));
|
|
+ }
|
|
return TRUE;
|
|
|
|
+
|
|
+ /* ===================================================================*/
|
|
/* Handle single-character metacharacters. In multiline mode, ^ disables
|
|
the setting of any following char as a first character. */
|
|
|
|
@@ -1784,6 +2270,8 @@
|
|
*code++ = OP_ANY;
|
|
break;
|
|
|
|
+
|
|
+ /* ===================================================================*/
|
|
/* Character classes. If the included characters are all < 256, we build a
|
|
32-byte bitmap of the permitted characters, except in the special case
|
|
where there is only one such character. For negated classes, we build the
|
|
@@ -1822,32 +2310,32 @@
|
|
}
|
|
|
|
/* Keep a count of chars with values < 256 so that we can optimize the case
|
|
- of just a single character (as long as it's < 256). For higher valued UTF-8
|
|
- characters, we don't yet do any optimization. */
|
|
+ of just a single character (as long as it's < 256). However, for higher
|
|
+ valued UTF-8 characters, we don't yet do any optimization. */
|
|
|
|
class_charcount = 0;
|
|
class_lastchar = -1;
|
|
|
|
+ /* Initialize the 32-char bit map to all zeros. We build the map in a
|
|
+ temporary bit of memory, in case the class contains only 1 character (less
|
|
+ than 256), because in that case the compiled code doesn't use the bit map.
|
|
+ */
|
|
+
|
|
+ memset(classbits, 0, 32 * sizeof(uschar));
|
|
+
|
|
#ifdef SUPPORT_UTF8
|
|
class_utf8 = FALSE; /* No chars >= 256 */
|
|
- class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
|
|
+ class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
|
|
#endif
|
|
|
|
- /* Initialize the 32-char bit map to all zeros. We have to build the
|
|
- map in a temporary bit of store, in case the class contains only 1
|
|
- character (< 256), because in that case the compiled code doesn't use the
|
|
- bit map. */
|
|
-
|
|
- memset(classbits, 0, 32 * sizeof(uschar));
|
|
-
|
|
/* Process characters until ] is reached. By writing this as a "do" it
|
|
- means that an initial ] is taken as a data character. The first pass
|
|
- through the regex checked the overall syntax, so we don't need to be very
|
|
- strict here. At the start of the loop, c contains the first byte of the
|
|
- character. */
|
|
+ means that an initial ] is taken as a data character. At the start of the
|
|
+ loop, c contains the first byte of the character. */
|
|
|
|
- do
|
|
+ if (c != 0) do
|
|
{
|
|
+ const uschar *oldptr;
|
|
+
|
|
#ifdef SUPPORT_UTF8
|
|
if (utf8 && c > 127)
|
|
{ /* Braces are required because the */
|
|
@@ -1859,13 +2347,13 @@
|
|
|
|
if (inescq)
|
|
{
|
|
- if (c == '\\' && ptr[1] == 'E')
|
|
+ if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
|
|
{
|
|
- inescq = FALSE;
|
|
- ptr++;
|
|
- continue;
|
|
+ inescq = FALSE; /* Reset literal state */
|
|
+ ptr++; /* Skip the 'E' */
|
|
+ continue; /* Carry on with next */
|
|
}
|
|
- else goto LONE_SINGLE_CHARACTER;
|
|
+ goto CHECK_RANGE; /* Could be range if \E follows */
|
|
}
|
|
|
|
/* Handle POSIX class names. Perl allows a negation extension of the
|
|
@@ -1956,19 +2444,20 @@
|
|
}
|
|
|
|
/* Backslash may introduce a single character, or it may introduce one
|
|
- of the specials, which just set a flag. Escaped items are checked for
|
|
- validity in the pre-compiling pass. The sequence \b is a special case.
|
|
- Inside a class (and only there) it is treated as backspace. Elsewhere
|
|
- it marks a word boundary. Other escapes have preset maps ready to
|
|
- or into the one we are building. We assume they have more than one
|
|
+ of the specials, which just set a flag. The sequence \b is a special
|
|
+ case. Inside a class (and only there) it is treated as backspace.
|
|
+ Elsewhere it marks a word boundary. Other escapes have preset maps ready
|
|
+ to or into the one we are building. We assume they have more than one
|
|
character in them, so set class_charcount bigger than one. */
|
|
|
|
if (c == '\\')
|
|
{
|
|
- c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
|
|
+ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
|
|
+ if (*errorcodeptr != 0) goto FAILED;
|
|
|
|
if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
|
|
else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
|
|
+ else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
|
|
else if (-c == ESC_Q) /* Handle start of quoted string */
|
|
{
|
|
if (ptr[1] == '\\' && ptr[2] == 'E')
|
|
@@ -1983,7 +2472,10 @@
|
|
{
|
|
register const uschar *cbits = cd->cbits;
|
|
class_charcount += 2; /* Greater than 1 is what matters */
|
|
- switch (-c)
|
|
+
|
|
+ /* Save time by not doing this in the pre-compile phase. */
|
|
+
|
|
+ if (lengthptr == NULL) switch (-c)
|
|
{
|
|
case ESC_d:
|
|
for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
|
|
@@ -2011,52 +2503,91 @@
|
|
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
|
|
continue;
|
|
|
|
-#ifdef SUPPORT_UCP
|
|
- case ESC_p:
|
|
- case ESC_P:
|
|
- {
|
|
- BOOL negated;
|
|
- int pdata;
|
|
- int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
|
|
- if (ptype < 0) goto FAILED;
|
|
- class_utf8 = TRUE;
|
|
- *class_utf8data++ = ((-c == ESC_p) != negated)?
|
|
- XCL_PROP : XCL_NOTPROP;
|
|
- *class_utf8data++ = ptype;
|
|
- *class_utf8data++ = pdata;
|
|
- class_charcount -= 2; /* Not a < 256 character */
|
|
- }
|
|
+ case ESC_E: /* Perl ignores an orphan \E */
|
|
continue;
|
|
-#endif
|
|
-
|
|
- /* Unrecognized escapes are faulted if PCRE is running in its
|
|
- strict mode. By default, for compatibility with Perl, they are
|
|
- treated as literals. */
|
|
|
|
- default:
|
|
- if ((options & PCRE_EXTRA) != 0)
|
|
- {
|
|
- *errorcodeptr = ERR7;
|
|
- goto FAILED;
|
|
- }
|
|
- c = *ptr; /* The final character */
|
|
- class_charcount -= 2; /* Undo the default count from above */
|
|
+ default: /* Not recognized; fall through */
|
|
+ break; /* Need "default" setting to stop compiler warning. */
|
|
}
|
|
- }
|
|
|
|
- /* Fall through if we have a single character (c >= 0). This may be
|
|
- > 256 in UTF-8 mode. */
|
|
+ /* In the pre-compile phase, just do the recognition. */
|
|
|
|
- } /* End of backslash handling */
|
|
+ else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
|
|
+ c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
|
|
+
|
|
+ /* We need to deal with \P and \p in both phases. */
|
|
+
|
|
+#ifdef SUPPORT_UCP
|
|
+ if (-c == ESC_p || -c == ESC_P)
|
|
+ {
|
|
+ BOOL negated;
|
|
+ int pdata;
|
|
+ int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
|
|
+ if (ptype < 0) goto FAILED;
|
|
+ class_utf8 = TRUE;
|
|
+ *class_utf8data++ = ((-c == ESC_p) != negated)?
|
|
+ XCL_PROP : XCL_NOTPROP;
|
|
+ *class_utf8data++ = ptype;
|
|
+ *class_utf8data++ = pdata;
|
|
+ class_charcount -= 2; /* Not a < 256 character */
|
|
+ continue;
|
|
+ }
|
|
+#endif
|
|
+ /* Unrecognized escapes are faulted if PCRE is running in its
|
|
+ strict mode. By default, for compatibility with Perl, they are
|
|
+ treated as literals. */
|
|
+
|
|
+ if ((options & PCRE_EXTRA) != 0)
|
|
+ {
|
|
+ *errorcodeptr = ERR7;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
+ class_charcount -= 2; /* Undo the default count from above */
|
|
+ c = *ptr; /* Get the final character and fall through */
|
|
+ }
|
|
+
|
|
+ /* Fall through if we have a single character (c >= 0). This may be
|
|
+ greater than 256 in UTF-8 mode. */
|
|
+
|
|
+ } /* End of backslash handling */
|
|
|
|
/* A single character may be followed by '-' to form a range. However,
|
|
Perl does not permit ']' to be the end of the range. A '-' character
|
|
- here is treated as a literal. */
|
|
+ at the end is treated as a literal. Perl ignores orphaned \E sequences
|
|
+ entirely. The code for handling \Q and \E is messy. */
|
|
+
|
|
+ CHECK_RANGE:
|
|
+ while (ptr[1] == '\\' && ptr[2] == 'E')
|
|
+ {
|
|
+ inescq = FALSE;
|
|
+ ptr += 2;
|
|
+ }
|
|
+
|
|
+ oldptr = ptr;
|
|
|
|
- if (ptr[1] == '-' && ptr[2] != ']')
|
|
+ if (!inescq && ptr[1] == '-')
|
|
{
|
|
int d;
|
|
ptr += 2;
|
|
+ while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
|
|
+
|
|
+ /* If we hit \Q (not followed by \E) at this point, go into escaped
|
|
+ mode. */
|
|
+
|
|
+ while (*ptr == '\\' && ptr[1] == 'Q')
|
|
+ {
|
|
+ ptr += 2;
|
|
+ if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
|
|
+ inescq = TRUE;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (*ptr == 0 || (!inescq && *ptr == ']'))
|
|
+ {
|
|
+ ptr = oldptr;
|
|
+ goto LONE_SINGLE_CHARACTER;
|
|
+ }
|
|
|
|
#ifdef SUPPORT_UTF8
|
|
if (utf8)
|
|
@@ -2071,27 +2602,34 @@
|
|
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
|
|
in such circumstances. */
|
|
|
|
- if (d == '\\')
|
|
+ if (!inescq && d == '\\')
|
|
{
|
|
- const uschar *oldptr = ptr;
|
|
- d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
|
|
+ d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
|
|
+ if (*errorcodeptr != 0) goto FAILED;
|
|
|
|
- /* \b is backslash; \X is literal X; any other special means the '-'
|
|
- was literal */
|
|
+ /* \b is backspace; \X is literal X; \R is literal R; any other
|
|
+ special means the '-' was literal */
|
|
|
|
if (d < 0)
|
|
{
|
|
if (d == -ESC_b) d = '\b';
|
|
- else if (d == -ESC_X) d = 'X'; else
|
|
+ else if (d == -ESC_X) d = 'X';
|
|
+ else if (d == -ESC_R) d = 'R'; else
|
|
{
|
|
- ptr = oldptr - 2;
|
|
+ ptr = oldptr;
|
|
goto LONE_SINGLE_CHARACTER; /* A few lines below */
|
|
}
|
|
}
|
|
}
|
|
|
|
- /* The check that the two values are in the correct order happens in
|
|
- the pre-pass. Optimize one-character ranges */
|
|
+ /* Check that the two values are in the correct order. Optimize
|
|
+ one-character ranges */
|
|
+
|
|
+ if (d < c)
|
|
+ {
|
|
+ *errorcodeptr = ERR8;
|
|
+ goto FAILED;
|
|
+ }
|
|
|
|
if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
|
|
|
|
@@ -2112,9 +2650,9 @@
|
|
#ifdef SUPPORT_UCP
|
|
if ((options & PCRE_CASELESS) != 0)
|
|
{
|
|
- int occ, ocd;
|
|
- int cc = c;
|
|
- int origd = d;
|
|
+ unsigned int occ, ocd;
|
|
+ unsigned int cc = c;
|
|
+ unsigned int origd = d;
|
|
while (get_othercase_range(&cc, origd, &occ, &ocd))
|
|
{
|
|
if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
|
|
@@ -2172,7 +2710,12 @@
|
|
ranges that lie entirely within 0-127 when there is UCP support; else
|
|
for partial ranges without UCP support. */
|
|
|
|
- for (; c <= d; c++)
|
|
+ class_charcount += d - c + 1;
|
|
+ class_lastchar = d;
|
|
+
|
|
+ /* We can save a bit of time by skipping this in the pre-compile. */
|
|
+
|
|
+ if (lengthptr == NULL) for (; c <= d; c++)
|
|
{
|
|
classbits[c/8] |= (1 << (c&7));
|
|
if ((options & PCRE_CASELESS) != 0)
|
|
@@ -2180,8 +2723,6 @@
|
|
int uc = cd->fcc[c]; /* flip case */
|
|
classbits[uc/8] |= (1 << (uc&7));
|
|
}
|
|
- class_charcount++; /* in case a one-char range */
|
|
- class_lastchar = c;
|
|
}
|
|
|
|
continue; /* Go get the next char in the class */
|
|
@@ -2205,8 +2746,8 @@
|
|
#ifdef SUPPORT_UCP
|
|
if ((options & PCRE_CASELESS) != 0)
|
|
{
|
|
- int othercase;
|
|
- if ((othercase = _pcre_ucp_othercase(c)) >= 0)
|
|
+ unsigned int othercase;
|
|
+ if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
|
|
{
|
|
*class_utf8data++ = XCL_SINGLE;
|
|
class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
|
|
@@ -2231,10 +2772,15 @@
|
|
}
|
|
}
|
|
|
|
- /* Loop until ']' reached; the check for end of string happens inside the
|
|
- loop. This "while" is the end of the "do" above. */
|
|
+ /* Loop until ']' reached. This "while" is the end of the "do" above. */
|
|
|
|
- while ((c = *(++ptr)) != ']' || inescq);
|
|
+ while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
|
|
+
|
|
+ if (c == 0) /* Missing terminating ']' */
|
|
+ {
|
|
+ *errorcodeptr = ERR6;
|
|
+ goto FAILED;
|
|
+ }
|
|
|
|
/* If class_charcount is 1, we saw precisely one character whose value is
|
|
less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
|
|
@@ -2298,7 +2844,7 @@
|
|
|
|
/* If there are characters with values > 255, we have to compile an
|
|
extended class, with its own opcode. If there are no characters < 256,
|
|
- we can omit the bitmap. */
|
|
+ we can omit the bitmap in the actual compiled code. */
|
|
|
|
#ifdef SUPPORT_UTF8
|
|
if (class_utf8)
|
|
@@ -2308,24 +2854,17 @@
|
|
code += LINK_SIZE;
|
|
*code = negate_class? XCL_NOT : 0;
|
|
|
|
- /* If the map is required, install it, and move on to the end of
|
|
- the extra data */
|
|
+ /* If the map is required, move up the extra data to make room for it;
|
|
+ otherwise just move the code pointer to the end of the extra data. */
|
|
|
|
if (class_charcount > 0)
|
|
{
|
|
*code++ |= XCL_MAP;
|
|
+ memmove(code + 32, code, class_utf8data - code);
|
|
memcpy(code, classbits, 32);
|
|
- code = class_utf8data;
|
|
- }
|
|
-
|
|
- /* If the map is not required, slide down the extra data. */
|
|
-
|
|
- else
|
|
- {
|
|
- int len = class_utf8data - (code + 33);
|
|
- memmove(code + 1, code + 33, len);
|
|
- code += len + 1;
|
|
+ code = class_utf8data + 32;
|
|
}
|
|
+ else code = class_utf8data;
|
|
|
|
/* Now fill in the complete length of the item */
|
|
|
|
@@ -2342,7 +2881,8 @@
|
|
if (negate_class)
|
|
{
|
|
*code++ = OP_NCLASS;
|
|
- for (c = 0; c < 32; c++) code[c] = ~classbits[c];
|
|
+ if (lengthptr == NULL) /* Save time in the pre-compile phase */
|
|
+ for (c = 0; c < 32; c++) code[c] = ~classbits[c];
|
|
}
|
|
else
|
|
{
|
|
@@ -2352,6 +2892,8 @@
|
|
code += 32;
|
|
break;
|
|
|
|
+
|
|
+ /* ===================================================================*/
|
|
/* Various kinds of repeat; '{' is not necessarily a quantifier, but this
|
|
has been tested above. */
|
|
|
|
@@ -2419,20 +2961,6 @@
|
|
}
|
|
else repeat_type = greedy_default;
|
|
|
|
- /* If previous was a recursion, we need to wrap it inside brackets so that
|
|
- it can be replicated if necessary. */
|
|
-
|
|
- if (*previous == OP_RECURSE)
|
|
- {
|
|
- memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
|
|
- code += 1 + LINK_SIZE;
|
|
- *previous = OP_BRA;
|
|
- PUT(previous, 1, code - previous);
|
|
- *code = OP_KET;
|
|
- PUT(code, 1, code - previous);
|
|
- code += 1 + LINK_SIZE;
|
|
- }
|
|
-
|
|
/* If previous was a character match, abolish the item and generate a
|
|
repeat item instead. If a char item has a minumum of more than one, ensure
|
|
that it is set in reqbyte - it might not be if a sequence such as x{3} is
|
|
@@ -2466,18 +2994,40 @@
|
|
if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
|
|
}
|
|
|
|
+ /* If the repetition is unlimited, it pays to see if the next thing on
|
|
+ the line is something that cannot possibly match this character. If so,
|
|
+ automatically possessifying this item gains some performance in the case
|
|
+ where the match fails. */
|
|
+
|
|
+ if (!possessive_quantifier &&
|
|
+ repeat_max < 0 &&
|
|
+ check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
|
|
+ options, cd))
|
|
+ {
|
|
+ repeat_type = 0; /* Force greedy */
|
|
+ possessive_quantifier = TRUE;
|
|
+ }
|
|
+
|
|
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
|
|
}
|
|
|
|
/* If previous was a single negated character ([^a] or similar), we use
|
|
one of the special opcodes, replacing it. The code is shared with single-
|
|
character repeats by setting opt_type to add a suitable offset into
|
|
- repeat_type. OP_NOT is currently used only for single-byte chars. */
|
|
+ repeat_type. We can also test for auto-possessification. OP_NOT is
|
|
+ currently used only for single-byte chars. */
|
|
|
|
else if (*previous == OP_NOT)
|
|
{
|
|
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
|
|
c = previous[1];
|
|
+ if (!possessive_quantifier &&
|
|
+ repeat_max < 0 &&
|
|
+ check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
|
|
+ {
|
|
+ repeat_type = 0; /* Force greedy */
|
|
+ possessive_quantifier = TRUE;
|
|
+ }
|
|
goto OUTPUT_SINGLE_REPEAT;
|
|
}
|
|
|
|
@@ -2495,6 +3045,14 @@
|
|
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
|
|
c = *previous;
|
|
|
|
+ if (!possessive_quantifier &&
|
|
+ repeat_max < 0 &&
|
|
+ check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
|
|
+ {
|
|
+ repeat_type = 0; /* Force greedy */
|
|
+ possessive_quantifier = TRUE;
|
|
+ }
|
|
+
|
|
OUTPUT_SINGLE_REPEAT:
|
|
if (*previous == OP_PROP || *previous == OP_NOTPROP)
|
|
{
|
|
@@ -2535,7 +3093,7 @@
|
|
}
|
|
|
|
/* A repeat minimum of 1 is optimized into some special cases. If the
|
|
- maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
|
|
+ maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
|
|
left in place and, if the maximum is greater than 1, we use OP_UPTO with
|
|
one less than the maximum. */
|
|
|
|
@@ -2588,7 +3146,8 @@
|
|
}
|
|
|
|
/* Else insert an UPTO if the max is greater than the min, again
|
|
- preceded by the character, for the previously inserted code. */
|
|
+ preceded by the character, for the previously inserted code. If the
|
|
+ UPTO is just for 1 instance, we can use QUERY instead. */
|
|
|
|
else if (repeat_max != repeat_min)
|
|
{
|
|
@@ -2607,8 +3166,16 @@
|
|
*code++ = prop_value;
|
|
}
|
|
repeat_max -= repeat_min;
|
|
- *code++ = OP_UPTO + repeat_type;
|
|
- PUT2INC(code, 0, repeat_max);
|
|
+
|
|
+ if (repeat_max == 1)
|
|
+ {
|
|
+ *code++ = OP_QUERY + repeat_type;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ *code++ = OP_UPTO + repeat_type;
|
|
+ PUT2INC(code, 0, repeat_max);
|
|
+ }
|
|
}
|
|
}
|
|
|
|
@@ -2675,14 +3242,30 @@
|
|
/* If previous was a bracket group, we may have to replicate it in certain
|
|
cases. */
|
|
|
|
- else if (*previous >= OP_BRA || *previous == OP_ONCE ||
|
|
- *previous == OP_COND)
|
|
+ else if (*previous == OP_BRA || *previous == OP_CBRA ||
|
|
+ *previous == OP_ONCE || *previous == OP_COND)
|
|
{
|
|
register int i;
|
|
int ketoffset = 0;
|
|
int len = code - previous;
|
|
uschar *bralink = NULL;
|
|
|
|
+ /* Repeating a DEFINE group is pointless */
|
|
+
|
|
+ if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
|
|
+ {
|
|
+ *errorcodeptr = ERR55;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
+ /* This is a paranoid check to stop integer overflow later on */
|
|
+
|
|
+ if (len > MAX_DUPLENGTH)
|
|
+ {
|
|
+ *errorcodeptr = ERR50;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
/* If the maximum repeat count is unlimited, find the end of the bracket
|
|
by scanning through from the start, and compute the offset back to it
|
|
from the current code pointer. There may be an OP_OPT setting following
|
|
@@ -2717,13 +3300,14 @@
|
|
/* If the maximum is 1 or unlimited, we just have to stick in the
|
|
BRAZERO and do no more at this point. However, we do need to adjust
|
|
any OP_RECURSE calls inside the group that refer to the group itself or
|
|
- any internal group, because the offset is from the start of the whole
|
|
- regex. Temporarily terminate the pattern while doing this. */
|
|
+ any internal or forward referenced group, because the offset is from
|
|
+ the start of the whole regex. Temporarily terminate the pattern while
|
|
+ doing this. */
|
|
|
|
if (repeat_max <= 1)
|
|
{
|
|
*code = OP_END;
|
|
- adjust_recurse(previous, 1, utf8, cd);
|
|
+ adjust_recurse(previous, 1, utf8, cd, save_hwm);
|
|
memmove(previous+1, previous, len);
|
|
code++;
|
|
*previous++ = OP_BRAZERO + repeat_type;
|
|
@@ -2741,7 +3325,7 @@
|
|
{
|
|
int offset;
|
|
*code = OP_END;
|
|
- adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
|
|
+ adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
|
|
memmove(previous + 2 + LINK_SIZE, previous, len);
|
|
code += 2 + LINK_SIZE;
|
|
*previous++ = OP_BRAZERO + repeat_type;
|
|
@@ -2761,19 +3345,41 @@
|
|
/* If the minimum is greater than zero, replicate the group as many
|
|
times as necessary, and adjust the maximum to the number of subsequent
|
|
copies that we need. If we set a first char from the group, and didn't
|
|
- set a required char, copy the latter from the former. */
|
|
+ set a required char, copy the latter from the former. If there are any
|
|
+ forward reference subroutine calls in the group, there will be entries on
|
|
+ the workspace list; replicate these with an appropriate increment. */
|
|
|
|
else
|
|
{
|
|
if (repeat_min > 1)
|
|
{
|
|
- if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
|
|
- for (i = 1; i < repeat_min; i++)
|
|
+ /* In the pre-compile phase, we don't actually do the replication. We
|
|
+ just adjust the length as if we had. */
|
|
+
|
|
+ if (lengthptr != NULL)
|
|
+ *lengthptr += (repeat_min - 1)*length_prevgroup;
|
|
+
|
|
+ /* This is compiling for real */
|
|
+
|
|
+ else
|
|
{
|
|
- memcpy(code, previous, len);
|
|
- code += len;
|
|
+ if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
|
|
+ for (i = 1; i < repeat_min; i++)
|
|
+ {
|
|
+ uschar *hc;
|
|
+ uschar *this_hwm = cd->hwm;
|
|
+ memcpy(code, previous, len);
|
|
+ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
|
|
+ {
|
|
+ PUT(cd->hwm, 0, GET(hc, 0) + len);
|
|
+ cd->hwm += LINK_SIZE;
|
|
+ }
|
|
+ save_hwm = this_hwm;
|
|
+ code += len;
|
|
+ }
|
|
}
|
|
}
|
|
+
|
|
if (repeat_max > 0) repeat_max -= repeat_min;
|
|
}
|
|
|
|
@@ -2781,12 +3387,27 @@
|
|
the maximum is limited, it replicates the group in a nested fashion,
|
|
remembering the bracket starts on a stack. In the case of a zero minimum,
|
|
the first one was set up above. In all cases the repeat_max now specifies
|
|
- the number of additional copies needed. */
|
|
+ the number of additional copies needed. Again, we must remember to
|
|
+ replicate entries on the forward reference list. */
|
|
|
|
if (repeat_max >= 0)
|
|
{
|
|
- for (i = repeat_max - 1; i >= 0; i--)
|
|
+ /* In the pre-compile phase, we don't actually do the replication. We
|
|
+ just adjust the length as if we had. For each repetition we must add 1
|
|
+ to the length for BRAZERO and for all but the last repetition we must
|
|
+ add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
|
|
+
|
|
+ if (lengthptr != NULL && repeat_max > 0)
|
|
+ *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
|
|
+ 2 - 2*LINK_SIZE; /* Last one doesn't nest */
|
|
+
|
|
+ /* This is compiling for real */
|
|
+
|
|
+ else for (i = repeat_max - 1; i >= 0; i--)
|
|
{
|
|
+ uschar *hc;
|
|
+ uschar *this_hwm = cd->hwm;
|
|
+
|
|
*code++ = OP_BRAZERO + repeat_type;
|
|
|
|
/* All but the final copy start a new nesting, maintaining the
|
|
@@ -2802,6 +3423,12 @@
|
|
}
|
|
|
|
memcpy(code, previous, len);
|
|
+ for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
|
|
+ {
|
|
+ PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
|
|
+ cd->hwm += LINK_SIZE;
|
|
+ }
|
|
+ save_hwm = this_hwm;
|
|
code += len;
|
|
}
|
|
|
|
@@ -2824,9 +3451,34 @@
|
|
/* If the maximum is unlimited, set a repeater in the final copy. We
|
|
can't just offset backwards from the current code point, because we
|
|
don't know if there's been an options resetting after the ket. The
|
|
- correct offset was computed above. */
|
|
+ correct offset was computed above.
|
|
+
|
|
+ Then, when we are doing the actual compile phase, check to see whether
|
|
+ this group is a non-atomic one that could match an empty string. If so,
|
|
+ convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
|
|
+ that runtime checking can be done. [This check is also applied to
|
|
+ atomic groups at runtime, but in a different way.] */
|
|
|
|
- else code[-ketoffset] = OP_KETRMAX + repeat_type;
|
|
+ else
|
|
+ {
|
|
+ uschar *ketcode = code - ketoffset;
|
|
+ uschar *bracode = ketcode - GET(ketcode, 1);
|
|
+ *ketcode = OP_KETRMAX + repeat_type;
|
|
+ if (lengthptr == NULL && *bracode != OP_ONCE)
|
|
+ {
|
|
+ uschar *scode = bracode;
|
|
+ do
|
|
+ {
|
|
+ if (could_be_empty_branch(scode, ketcode, utf8))
|
|
+ {
|
|
+ *bracode += OP_SBRA - OP_BRA;
|
|
+ break;
|
|
+ }
|
|
+ scode += GET(scode, 1);
|
|
+ }
|
|
+ while (*scode == OP_ALT);
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
/* Else there's some kind of shambles */
|
|
@@ -2837,22 +3489,53 @@
|
|
goto FAILED;
|
|
}
|
|
|
|
- /* If the character following a repeat is '+', we wrap the entire repeated
|
|
- item inside OP_ONCE brackets. This is just syntactic sugar, taken from
|
|
- Sun's Java package. The repeated item starts at tempcode, not at previous,
|
|
- which might be the first part of a string whose (former) last char we
|
|
- repeated. However, we don't support '+' after a greediness '?'. */
|
|
+ /* If the character following a repeat is '+', or if certain optimization
|
|
+ tests above succeeded, possessive_quantifier is TRUE. For some of the
|
|
+ simpler opcodes, there is a special alternative opcode for this. For
|
|
+ anything else, we wrap the entire repeated item inside OP_ONCE brackets.
|
|
+ The '+' notation is just syntactic sugar, taken from Sun's Java package,
|
|
+ but the special opcodes can optimize it a bit. The repeated item starts at
|
|
+ tempcode, not at previous, which might be the first part of a string whose
|
|
+ (former) last char we repeated.
|
|
+
|
|
+ Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
|
|
+ an 'upto' may follow. We skip over an 'exact' item, and then test the
|
|
+ length of what remains before proceeding. */
|
|
|
|
if (possessive_quantifier)
|
|
{
|
|
- int len = code - tempcode;
|
|
- memmove(tempcode + 1+LINK_SIZE, tempcode, len);
|
|
- code += 1 + LINK_SIZE;
|
|
- len += 1 + LINK_SIZE;
|
|
- tempcode[0] = OP_ONCE;
|
|
- *code++ = OP_KET;
|
|
- PUTINC(code, 0, len);
|
|
- PUT(tempcode, 1, len);
|
|
+ int len;
|
|
+ if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
|
|
+ *tempcode == OP_NOTEXACT)
|
|
+ tempcode += _pcre_OP_lengths[*tempcode];
|
|
+ len = code - tempcode;
|
|
+ if (len > 0) switch (*tempcode)
|
|
+ {
|
|
+ case OP_STAR: *tempcode = OP_POSSTAR; break;
|
|
+ case OP_PLUS: *tempcode = OP_POSPLUS; break;
|
|
+ case OP_QUERY: *tempcode = OP_POSQUERY; break;
|
|
+ case OP_UPTO: *tempcode = OP_POSUPTO; break;
|
|
+
|
|
+ case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
|
|
+ case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
|
|
+ case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
|
|
+ case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
|
|
+
|
|
+ case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
|
|
+ case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
|
|
+ case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
|
|
+ case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
|
|
+
|
|
+ default:
|
|
+ memmove(tempcode + 1+LINK_SIZE, tempcode, len);
|
|
+ code += 1 + LINK_SIZE;
|
|
+ len += 1 + LINK_SIZE;
|
|
+ tempcode[0] = OP_ONCE;
|
|
+ *code++ = OP_KET;
|
|
+ PUTINC(code, 0, len);
|
|
+ PUT(tempcode, 1, len);
|
|
+ break;
|
|
+ }
|
|
}
|
|
|
|
/* In all case we no longer have a previous item. We also set the
|
|
@@ -2865,162 +3548,275 @@
|
|
break;
|
|
|
|
|
|
- /* Start of nested bracket sub-expression, or comment or lookahead or
|
|
- lookbehind or option setting or condition. First deal with special things
|
|
- that can come after a bracket; all are introduced by ?, and the appearance
|
|
- of any of them means that this is not a referencing group. They were
|
|
- checked for validity in the first pass over the string, so we don't have to
|
|
- check for syntax errors here. */
|
|
+ /* ===================================================================*/
|
|
+ /* Start of nested parenthesized sub-expression, or comment or lookahead or
|
|
+ lookbehind or option setting or condition or all the other extended
|
|
+ parenthesis forms. First deal with the specials; all are introduced by ?,
|
|
+ and the appearance of any of them means that this is not a capturing
|
|
+ group. */
|
|
|
|
case '(':
|
|
newoptions = options;
|
|
skipbytes = 0;
|
|
+ bravalue = OP_CBRA;
|
|
+ save_hwm = cd->hwm;
|
|
|
|
if (*(++ptr) == '?')
|
|
{
|
|
- int set, unset;
|
|
+ int i, set, unset, namelen;
|
|
int *optset;
|
|
+ const uschar *name;
|
|
+ uschar *slot;
|
|
|
|
switch (*(++ptr))
|
|
{
|
|
case '#': /* Comment; skip to ket */
|
|
ptr++;
|
|
- while (*ptr != ')') ptr++;
|
|
+ while (*ptr != 0 && *ptr != ')') ptr++;
|
|
+ if (*ptr == 0)
|
|
+ {
|
|
+ *errorcodeptr = ERR18;
|
|
+ goto FAILED;
|
|
+ }
|
|
continue;
|
|
|
|
- case ':': /* Non-extracting bracket */
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
+ case ':': /* Non-capturing bracket */
|
|
bravalue = OP_BRA;
|
|
ptr++;
|
|
break;
|
|
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
case '(':
|
|
bravalue = OP_COND; /* Conditional group */
|
|
|
|
- /* A condition can be a number, referring to a numbered group, a name,
|
|
- referring to a named group, 'R', referring to recursion, or an
|
|
- assertion. There are two unfortunate ambiguities, caused by history.
|
|
- (a) 'R' can be the recursive thing or the name 'R', and (b) a number
|
|
- could be a name that consists of digits. In both cases, we look for a
|
|
- name first; if not found, we try the other cases. If the first
|
|
- character after (?( is a word character, we know the rest up to ) will
|
|
- also be word characters because the syntax was checked in the first
|
|
- pass. */
|
|
-
|
|
- if ((cd->ctypes[ptr[1]] & ctype_word) != 0)
|
|
- {
|
|
- int i, namelen;
|
|
- int condref = 0;
|
|
- const uschar *name;
|
|
- uschar *slot = cd->name_table;
|
|
+ /* A condition can be an assertion, a number (referring to a numbered
|
|
+ group), a name (referring to a named group), or 'R', referring to
|
|
+ recursion. R<digits> and R&name are also permitted for recursion tests.
|
|
+
|
|
+ There are several syntaxes for testing a named group: (?(name)) is used
|
|
+ by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
|
|
+
|
|
+ There are two unfortunate ambiguities, caused by history. (a) 'R' can
|
|
+ be the recursive thing or the name 'R' (and similarly for 'R' followed
|
|
+ by digits), and (b) a number could be a name that consists of digits.
|
|
+ In both cases, we look for a name first; if not found, we try the other
|
|
+ cases. */
|
|
+
|
|
+ /* For conditions that are assertions, check the syntax, and then exit
|
|
+ the switch. This will take control down to where bracketed groups,
|
|
+ including assertions, are processed. */
|
|
|
|
- /* This is needed for all successful cases. */
|
|
+ if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
|
|
+ break;
|
|
|
|
- skipbytes = 3;
|
|
+ /* Most other conditions use OP_CREF (a couple change to OP_RREF
|
|
+ below), and all need to skip 3 bytes at the start of the group. */
|
|
|
|
- /* Read the name, but also get it as a number if it's all digits */
|
|
+ code[1+LINK_SIZE] = OP_CREF;
|
|
+ skipbytes = 3;
|
|
|
|
- name = ++ptr;
|
|
- while (*ptr != ')')
|
|
- {
|
|
- if (condref >= 0)
|
|
- condref = ((digitab[*ptr] & ctype_digit) != 0)?
|
|
- condref * 10 + *ptr - '0' : -1;
|
|
- ptr++;
|
|
- }
|
|
- namelen = ptr - name;
|
|
+ /* Check for a test for recursion in a named group. */
|
|
+
|
|
+ if (ptr[1] == 'R' && ptr[2] == '&')
|
|
+ {
|
|
+ terminator = -1;
|
|
+ ptr += 2;
|
|
+ code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
|
|
+ }
|
|
+
|
|
+ /* Check for a test for a named group's having been set, using the Perl
|
|
+ syntax (?(<name>) or (?('name') */
|
|
+
|
|
+ else if (ptr[1] == '<')
|
|
+ {
|
|
+ terminator = '>';
|
|
ptr++;
|
|
+ }
|
|
+ else if (ptr[1] == '\'')
|
|
+ {
|
|
+ terminator = '\'';
|
|
+ ptr++;
|
|
+ }
|
|
+ else terminator = 0;
|
|
|
|
- for (i = 0; i < cd->names_found; i++)
|
|
- {
|
|
- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
|
|
- slot += cd->name_entry_size;
|
|
- }
|
|
+ /* We now expect to read a name; anything else is an error */
|
|
|
|
- /* Found a previous named subpattern */
|
|
+ if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
|
|
+ {
|
|
+ ptr += 1; /* To get the right offset */
|
|
+ *errorcodeptr = ERR28;
|
|
+ goto FAILED;
|
|
+ }
|
|
|
|
- if (i < cd->names_found)
|
|
- {
|
|
- condref = GET2(slot, 0);
|
|
- code[1+LINK_SIZE] = OP_CREF;
|
|
- PUT2(code, 2+LINK_SIZE, condref);
|
|
- }
|
|
+ /* Read the name, but also get it as a number if it's all digits */
|
|
|
|
- /* Search the pattern for a forward reference */
|
|
+ recno = 0;
|
|
+ name = ++ptr;
|
|
+ while ((cd->ctypes[*ptr] & ctype_word) != 0)
|
|
+ {
|
|
+ if (recno >= 0)
|
|
+ recno = ((digitab[*ptr] & ctype_digit) != 0)?
|
|
+ recno * 10 + *ptr - '0' : -1;
|
|
+ ptr++;
|
|
+ }
|
|
+ namelen = ptr - name;
|
|
|
|
- else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)
|
|
- {
|
|
- code[1+LINK_SIZE] = OP_CREF;
|
|
- PUT2(code, 2+LINK_SIZE, i);
|
|
- }
|
|
+ if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
|
|
+ {
|
|
+ ptr--; /* Error offset */
|
|
+ *errorcodeptr = ERR26;
|
|
+ goto FAILED;
|
|
+ }
|
|
|
|
- /* Check for 'R' for recursion */
|
|
+ /* Do no further checking in the pre-compile phase. */
|
|
|
|
- else if (namelen == 1 && *name == 'R')
|
|
- {
|
|
- code[1+LINK_SIZE] = OP_CREF;
|
|
- PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
|
|
- }
|
|
+ if (lengthptr != NULL) break;
|
|
|
|
- /* Check for a subpattern number */
|
|
+ /* In the real compile we do the work of looking for the actual
|
|
+ reference. */
|
|
|
|
- else if (condref > 0)
|
|
- {
|
|
- code[1+LINK_SIZE] = OP_CREF;
|
|
- PUT2(code, 2+LINK_SIZE, condref);
|
|
- }
|
|
+ slot = cd->name_table;
|
|
+ for (i = 0; i < cd->names_found; i++)
|
|
+ {
|
|
+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
|
|
+ slot += cd->name_entry_size;
|
|
+ }
|
|
|
|
- /* Either an unidentified subpattern, or a reference to (?(0) */
|
|
+ /* Found a previous named subpattern */
|
|
|
|
- else
|
|
+ if (i < cd->names_found)
|
|
+ {
|
|
+ recno = GET2(slot, 0);
|
|
+ PUT2(code, 2+LINK_SIZE, recno);
|
|
+ }
|
|
+
|
|
+ /* Search the pattern for a forward reference */
|
|
+
|
|
+ else if ((i = find_parens(ptr, cd->bracount, name, namelen,
|
|
+ (options & PCRE_EXTENDED) != 0)) > 0)
|
|
+ {
|
|
+ PUT2(code, 2+LINK_SIZE, i);
|
|
+ }
|
|
+
|
|
+ /* If terminator == 0 it means that the name followed directly after
|
|
+ the opening parenthesis [e.g. (?(abc)...] and in this case there are
|
|
+ some further alternatives to try. For the cases where terminator != 0
|
|
+ [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
|
|
+ now checked all the possibilities, so give an error. */
|
|
+
|
|
+ else if (terminator != 0)
|
|
+ {
|
|
+ *errorcodeptr = ERR15;
|
|
+ goto FAILED;
|
|
+ }
|
|
+
|
|
+ /* Check for (?(R) for recursion. Allow digits after R to specify a
|
|
+ specific group number. */
|
|
+
|
|
+ else if (*name == 'R')
|
|
+ {
|
|
+ recno = 0;
|
|
+ for (i = 1; i < namelen; i++)
|
|
{
|
|
- *errorcodeptr = (condref == 0)? ERR35: ERR15;
|
|
- goto FAILED;
|
|
+ if ((digitab[name[i]] & ctype_digit) == 0)
|
|
+ {
|
|
+ *errorcodeptr = ERR15;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ recno = recno * 10 + name[i] - '0';
|
|
}
|
|
+ if (recno == 0) recno = RREF_ANY;
|
|
+ code[1+LINK_SIZE] = OP_RREF; /* Change test type */
|
|
+ PUT2(code, 2+LINK_SIZE, recno);
|
|
+ }
|
|
+
|
|
+ /* Similarly, check for the (?(DEFINE) "condition", which is always
|
|
+ false. */
|
|
+
|
|
+ else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
|
|
+ {
|
|
+ code[1+LINK_SIZE] = OP_DEF;
|
|
+ skipbytes = 1;
|
|
+ }
|
|
+
|
|
+ /* Check for the "name" actually being a subpattern number. */
|
|
+
|
|
+ else if (recno > 0)
|
|
+ {
|
|
+ PUT2(code, 2+LINK_SIZE, recno);
|
|
}
|
|
|
|
- /* For conditions that are assertions, we just fall through, having
|
|
- set bravalue above. */
|
|
+ /* Either an unidentified subpattern, or a reference to (?(0) */
|
|
|
|
+ else
|
|
+ {
|
|
+ *errorcodeptr = (recno == 0)? ERR35: ERR15;
|
|
+ goto FAILED;
|
|
+ }
|
|
break;
|
|
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
case '=': /* Positive lookahead */
|
|
bravalue = OP_ASSERT;
|
|
ptr++;
|
|
break;
|
|
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
case '!': /* Negative lookahead */
|
|
bravalue = OP_ASSERT_NOT;
|
|
ptr++;
|
|
break;
|
|
|
|
- case '<': /* Lookbehinds */
|
|
- switch (*(++ptr))
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
+ case '<': /* Lookbehind or named define */
|
|
+ switch (ptr[1])
|
|
{
|
|
case '=': /* Positive lookbehind */
|
|
bravalue = OP_ASSERTBACK;
|
|
- ptr++;
|
|
+ ptr += 2;
|
|
break;
|
|
|
|
case '!': /* Negative lookbehind */
|
|
bravalue = OP_ASSERTBACK_NOT;
|
|
- ptr++;
|
|
+ ptr += 2;
|
|
break;
|
|
+
|
|
+ default: /* Could be name define, else bad */
|
|
+ if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
|
|
+ ptr++; /* Correct offset for error */
|
|
+ *errorcodeptr = ERR24;
|
|
+ goto FAILED;
|
|
}
|
|
break;
|
|
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
case '>': /* One-time brackets */
|
|
bravalue = OP_ONCE;
|
|
ptr++;
|
|
break;
|
|
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
case 'C': /* Callout - may be followed by digits; */
|
|
previous_callout = code; /* Save for later completion */
|
|
after_manual_callout = 1; /* Skip one item before completing */
|
|
- *code++ = OP_CALLOUT; /* Already checked that the terminating */
|
|
- { /* closing parenthesis is present. */
|
|
+ *code++ = OP_CALLOUT;
|
|
+ {
|
|
int n = 0;
|
|
while ((digitab[*(++ptr)] & ctype_digit) != 0)
|
|
n = n * 10 + *ptr - '0';
|
|
+ if (*ptr != ')')
|
|
+ {
|
|
+ *errorcodeptr = ERR39;
|
|
+ goto FAILED;
|
|
+ }
|
|
if (n > 255)
|
|
{
|
|
*errorcodeptr = ERR38;
|
|
@@ -3034,134 +3830,232 @@
|
|
previous = NULL;
|
|
continue;
|
|
|
|
- case 'P': /* Named subpattern handling */
|
|
- if (*(++ptr) == '<') /* Definition */
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
+ case 'P': /* Python-style named subpattern handling */
|
|
+ if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
|
|
+ {
|
|
+ is_recurse = *ptr == '>';
|
|
+ terminator = ')';
|
|
+ goto NAMED_REF_OR_RECURSE;
|
|
+ }
|
|
+ else if (*ptr != '<') /* Test for Python-style definition */
|
|
+ {
|
|
+ *errorcodeptr = ERR41;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ /* Fall through to handle (?P< as (?< is handled */
|
|
+
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
+ DEFINE_NAME: /* Come here from (?< handling */
|
|
+ case '\'':
|
|
{
|
|
- int i, namelen;
|
|
- uschar *slot = cd->name_table;
|
|
- const uschar *name; /* Don't amalgamate; some compilers */
|
|
- name = ++ptr; /* grumble at autoincrement in declaration */
|
|
+ terminator = (*ptr == '<')? '>' : '\'';
|
|
+ name = ++ptr;
|
|
+
|
|
+ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
|
|
+ namelen = ptr - name;
|
|
|
|
- while (*ptr++ != '>');
|
|
- namelen = ptr - name - 1;
|
|
+ /* In the pre-compile phase, just do a syntax check. */
|
|
|
|
- for (i = 0; i < cd->names_found; i++)
|
|
+ if (lengthptr != NULL)
|
|
+ {
|
|
+ if (*ptr != terminator)
|
|
+ {
|
|
+ *errorcodeptr = ERR42;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ if (cd->names_found >= MAX_NAME_COUNT)
|
|
+ {
|
|
+ *errorcodeptr = ERR49;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ if (namelen + 3 > cd->name_entry_size)
|
|
+ {
|
|
+ cd->name_entry_size = namelen + 3;
|
|
+ if (namelen > MAX_NAME_SIZE)
|
|
+ {
|
|
+ *errorcodeptr = ERR48;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* In the real compile, create the entry in the table */
|
|
+
|
|
+ else
|
|
{
|
|
- int crc = memcmp(name, slot+2, namelen);
|
|
- if (crc == 0)
|
|
+ slot = cd->name_table;
|
|
+ for (i = 0; i < cd->names_found; i++)
|
|
{
|
|
- if (slot[2+namelen] == 0)
|
|
+ int crc = memcmp(name, slot+2, namelen);
|
|
+ if (crc == 0)
|
|
{
|
|
- if ((options & PCRE_DUPNAMES) == 0)
|
|
+ if (slot[2+namelen] == 0)
|
|
{
|
|
- *errorcodeptr = ERR43;
|
|
- goto FAILED;
|
|
+ if ((options & PCRE_DUPNAMES) == 0)
|
|
+ {
|
|
+ *errorcodeptr = ERR43;
|
|
+ goto FAILED;
|
|
+ }
|
|
}
|
|
+ else crc = -1; /* Current name is substring */
|
|
}
|
|
- else crc = -1; /* Current name is substring */
|
|
- }
|
|
- if (crc < 0)
|
|
- {
|
|
- memmove(slot + cd->name_entry_size, slot,
|
|
- (cd->names_found - i) * cd->name_entry_size);
|
|
- break;
|
|
+ if (crc < 0)
|
|
+ {
|
|
+ memmove(slot + cd->name_entry_size, slot,
|
|
+ (cd->names_found - i) * cd->name_entry_size);
|
|
+ break;
|
|
+ }
|
|
+ slot += cd->name_entry_size;
|
|
}
|
|
- slot += cd->name_entry_size;
|
|
- }
|
|
|
|
- PUT2(slot, 0, *brackets + 1);
|
|
- memcpy(slot + 2, name, namelen);
|
|
- slot[2+namelen] = 0;
|
|
- cd->names_found++;
|
|
- goto NUMBERED_GROUP;
|
|
+ PUT2(slot, 0, cd->bracount + 1);
|
|
+ memcpy(slot + 2, name, namelen);
|
|
+ slot[2+namelen] = 0;
|
|
+ }
|
|
}
|
|
|
|
- if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
|
|
- {
|
|
- int i, namelen;
|
|
- int type = *ptr++;
|
|
- const uschar *name = ptr;
|
|
- uschar *slot = cd->name_table;
|
|
+ /* In both cases, count the number of names we've encountered. */
|
|
|
|
- while (*ptr != ')') ptr++;
|
|
- namelen = ptr - name;
|
|
+ ptr++; /* Move past > or ' */
|
|
+ cd->names_found++;
|
|
+ goto NUMBERED_GROUP;
|
|
|
|
- for (i = 0; i < cd->names_found; i++)
|
|
+
|
|
+ /* ------------------------------------------------------------ */
|
|
+ case '&': /* Perl recursion/subroutine syntax */
|
|
+ terminator = ')';
|
|
+ is_recurse = TRUE;
|
|
+ /* Fall through */
|
|
+
|
|
+ /* We come here from the Python syntax above that handles both
|
|
+ references (?P=name) and recursion (?P>name), as well as falling
|
|
+ through from the Perl recursion syntax (?&name). */
|
|
+
|
|
+ NAMED_REF_OR_RECURSE:
|
|
+ name = ++ptr;
|
|
+ while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
|
|
+ namelen = ptr - name;
|
|
+
|
|
+ /* In the pre-compile phase, do a syntax check and set a dummy
|
|
+ reference number. */
|
|
+
|
|
+ if (lengthptr != NULL)
|
|
+ {
|
|
+ if (*ptr != terminator)
|
|
{
|
|
- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
|
|
- slot += cd->name_entry_size;
|
|
+ *errorcodeptr = ERR42;
|
|
+ goto FAILED;
|
|
}
|
|
-
|
|
- if (i < cd->names_found) /* Back reference */
|
|
+ if (namelen > MAX_NAME_SIZE)
|
|
+ {
|
|
+ *errorcodeptr = ERR48;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ recno = 0;
|
|
+ }
|
|
+
|
|
+ /* In the real compile, seek the name in the table */
|
|
+
|
|
+ else
|
|
+ {
|
|
+ slot = cd->name_table;
|
|
+ for (i = 0; i < cd->names_found; i++)
|
|
+ {
|
|
+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
|
|
+ slot += cd->name_entry_size;
|
|
+ }
|
|
+
|
|
+ if (i < cd->names_found) /* Back reference */
|
|
{
|
|
recno = GET2(slot, 0);
|
|
}
|
|
else if ((recno = /* Forward back reference */
|
|
- find_named_parens(ptr, *brackets, name, namelen)) <= 0)
|
|
+ find_parens(ptr, cd->bracount, name, namelen,
|
|
+ (options & PCRE_EXTENDED) != 0)) <= 0)
|
|
{
|
|
*errorcodeptr = ERR15;
|
|
goto FAILED;
|
|
}
|
|
+ }
|
|
|
|
- if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
|
|
-
|
|
- /* Back reference */
|
|
+ /* In both phases, we can now go to the code that handles numerical
|
|
+ recursion or backreferences. */
|
|
|
|
- previous = code;
|
|
- *code++ = OP_REF;
|
|
- PUT2INC(code, 0, recno);
|
|
- cd->backref_map |= (recno < 32)? (1 << recno) : 1;
|
|
- if (recno > cd->top_backref) cd->top_backref = recno;
|
|
- continue;
|
|
- }
|
|
+ if (is_recurse) goto HANDLE_RECURSION;
|
|
+ else goto HANDLE_REFERENCE;
|
|
|
|
- /* Should never happen */
|
|
- break;
|
|
|
|
- case 'R': /* Pattern recursion */
|
|
+ /* ------------------------------------------------------------ */
|
|
+ case 'R': /* Recursion */
|
|
ptr++; /* Same as (?0) */
|
|
/* Fall through */
|
|
|
|
- /* Recursion or "subroutine" call */
|
|
|
|
- case '0': case '1': case '2': case '3': case '4':
|
|
- case '5': case '6': case '7': case '8': case '9':
|
|
+ /* ------------------------------------------------------------ */
|
|
+ case '0': case '1': case '2': case '3': case '4': /* Recursion or */
|
|
+ case '5': case '6': case '7': case '8': case '9': /* subroutine */
|
|
{
|
|
const uschar *called;
|
|
recno = 0;
|
|
while((digitab[*ptr] & ctype_digit) != 0)
|
|
recno = recno * 10 + *ptr++ - '0';
|
|
+ if (*ptr != ')')
|
|
+ {
|
|
+ *errorcodeptr = ERR29;
|
|
+ goto FAILED;
|
|
+ }
|
|
|
|
/* Come here from code above that handles a named recursion */
|
|
|
|
HANDLE_RECURSION:
|
|
|
|
previous = code;
|
|
+ called = cd->start_code;
|
|
|
|
- /* Find the bracket that is being referenced. Temporarily end the
|
|
- regex in case it doesn't exist. */
|
|
+ /* When we are actually compiling, find the bracket that is being
|
|
+ referenced. Temporarily end the regex in case it doesn't exist before
|
|
+ this point. If we end up with a forward reference, first check that
|
|
+ the bracket does occur later so we can give the error (and position)
|
|
+ now. Then remember this forward reference in the workspace so it can
|
|
+ be filled in at the end. */
|
|
|
|
- *code = OP_END;
|
|
- called = (recno == 0)? cd->start_code :
|
|
- find_bracket(cd->start_code, utf8, recno);
|
|
- if (called == NULL)
|
|
+ if (lengthptr == NULL)
|
|
{
|
|
- *errorcodeptr = ERR15;
|
|
- goto FAILED;
|
|
- }
|
|
+ *code = OP_END;
|
|
+ if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
|
|
|
|
- /* If the subpattern is still open, this is a recursive call. We
|
|
- check to see if this is a left recursion that could loop for ever,
|
|
- and diagnose that case. */
|
|
+ /* Forward reference */
|
|
|
|
- if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
|
|
- {
|
|
- *errorcodeptr = ERR40;
|
|
- goto FAILED;
|
|
+ if (called == NULL)
|
|
+ {
|
|
+ if (find_parens(ptr, cd->bracount, NULL, recno,
|
|
+ (options & PCRE_EXTENDED) != 0) < 0)
|
|
+ {
|
|
+ *errorcodeptr = ERR15;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ called = cd->start_code + recno;
|
|
+ PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
|
|
+ }
|
|
+
|
|
+ /* If not a forward reference, and the subpattern is still open,
|
|
+ this is a recursive call. We check to see if this is a left
|
|
+ recursion that could loop for ever, and diagnose that case. */
|
|
+
|
|
+ else if (GET(called, 1) == 0 &&
|
|
+ could_be_empty(called, code, bcptr, utf8))
|
|
+ {
|
|
+ *errorcodeptr = ERR40;
|
|
+ goto FAILED;
|
|
+ }
|
|
}
|
|
|
|
/* Insert the recursion/subroutine item, automatically wrapped inside
|
|
- "once" brackets. */
|
|
+ "once" brackets. Set up a "previous group" length so that a
|
|
+ subsequent quantifier will work. */
|
|
|
|
*code = OP_ONCE;
|
|
PUT(code, 1, 2 + 2*LINK_SIZE);
|
|
@@ -3174,12 +4068,18 @@
|
|
*code = OP_KET;
|
|
PUT(code, 1, 2 + 2*LINK_SIZE);
|
|
code += 1 + LINK_SIZE;
|
|
+
|
|
+ length_prevgroup = 3 + 3*LINK_SIZE;
|
|
}
|
|
+
|
|
+ /* Can't determine a first byte now */
|
|
+
|
|
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
|
|
continue;
|
|
|
|
- /* Character after (? not specially recognized */
|
|
|
|
- default: /* Option setting */
|
|
+ /* ------------------------------------------------------------ */
|
|
+ default: /* Other characters: check option setting */
|
|
set = unset = 0;
|
|
optset = &set;
|
|
|
|
@@ -3189,13 +4089,21 @@
|
|
{
|
|
case '-': optset = &unset; break;
|
|
|
|
+ case 'J': /* Record that it changed in the external options */
|
|
+ *optset |= PCRE_DUPNAMES;
|
|
+ cd->external_options |= PCRE_JCHANGED;
|
|
+ break;
|
|
+
|
|
case 'i': *optset |= PCRE_CASELESS; break;
|
|
- case 'J': *optset |= PCRE_DUPNAMES; break;
|
|
case 'm': *optset |= PCRE_MULTILINE; break;
|
|
case 's': *optset |= PCRE_DOTALL; break;
|
|
case 'x': *optset |= PCRE_EXTENDED; break;
|
|
case 'U': *optset |= PCRE_UNGREEDY; break;
|
|
case 'X': *optset |= PCRE_EXTRA; break;
|
|
+
|
|
+ default: *errorcodeptr = ERR12;
|
|
+ ptr--; /* Correct the offset */
|
|
+ goto FAILED;
|
|
}
|
|
}
|
|
|
|
@@ -3204,32 +4112,54 @@
|
|
newoptions = (options | set) & (~unset);
|
|
|
|
/* If the options ended with ')' this is not the start of a nested
|
|
- group with option changes, so the options change at this level. Compile
|
|
- code to change the ims options if this setting actually changes any of
|
|
- them. We also pass the new setting back so that it can be put at the
|
|
- start of any following branches, and when this group ends (if we are in
|
|
- a group), a resetting item can be compiled.
|
|
-
|
|
- Note that if this item is right at the start of the pattern, the
|
|
- options will have been abstracted and made global, so there will be no
|
|
- change to compile. */
|
|
+ group with option changes, so the options change at this level. If this
|
|
+ item is right at the start of the pattern, the options can be
|
|
+ abstracted and made external in the pre-compile phase, and ignored in
|
|
+ the compile phase. This can be helpful when matching -- for instance in
|
|
+ caseless checking of required bytes.
|
|
+
|
|
+ If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
|
|
+ definitely *not* at the start of the pattern because something has been
|
|
+ compiled. In the pre-compile phase, however, the code pointer can have
|
|
+ that value after the start, because it gets reset as code is discarded
|
|
+ during the pre-compile. However, this can happen only at top level - if
|
|
+ we are within parentheses, the starting BRA will still be present. At
|
|
+ any parenthesis level, the length value can be used to test if anything
|
|
+ has been compiled at that level. Thus, a test for both these conditions
|
|
+ is necessary to ensure we correctly detect the start of the pattern in
|
|
+ both phases.
|
|
+
|
|
+ If we are not at the pattern start, compile code to change the ims
|
|
+ options if this setting actually changes any of them. We also pass the
|
|
+ new setting back so that it can be put at the start of any following
|
|
+ branches, and when this group ends (if we are in a group), a resetting
|
|
+ item can be compiled. */
|
|
|
|
if (*ptr == ')')
|
|
{
|
|
- if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
|
|
+ if (code == cd->start_code + 1 + LINK_SIZE &&
|
|
+ (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
|
|
{
|
|
- *code++ = OP_OPT;
|
|
- *code++ = newoptions & PCRE_IMS;
|
|
+ cd->external_options = newoptions;
|
|
+ options = newoptions;
|
|
}
|
|
+ else
|
|
+ {
|
|
+ if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
|
|
+ {
|
|
+ *code++ = OP_OPT;
|
|
+ *code++ = newoptions & PCRE_IMS;
|
|
+ }
|
|
|
|
- /* Change options at this level, and pass them back for use
|
|
- in subsequent branches. Reset the greedy defaults and the case
|
|
- value for firstbyte and reqbyte. */
|
|
-
|
|
- *optionsptr = options = newoptions;
|
|
- greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
|
|
- greedy_non_default = greedy_default ^ 1;
|
|
- req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
|
|
+ /* Change options at this level, and pass them back for use
|
|
+ in subsequent branches. Reset the greedy defaults and the case
|
|
+ value for firstbyte and reqbyte. */
|
|
+
|
|
+ *optionsptr = options = newoptions;
|
|
+ greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
|
|
+ greedy_non_default = greedy_default ^ 1;
|
|
+ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
|
|
+ }
|
|
|
|
previous = NULL; /* This item can't be repeated */
|
|
continue; /* It is complete */
|
|
@@ -3242,58 +4172,56 @@
|
|
|
|
bravalue = OP_BRA;
|
|
ptr++;
|
|
- }
|
|
- }
|
|
+ } /* End of switch for character following (? */
|
|
+ } /* End of (? handling */
|
|
|
|
- /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
|
|
- non-capturing and behave like (?:...) brackets */
|
|
+ /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
|
|
+ all unadorned brackets become non-capturing and behave like (?:...)
|
|
+ brackets. */
|
|
|
|
else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
|
|
{
|
|
bravalue = OP_BRA;
|
|
}
|
|
|
|
- /* Else we have a referencing group; adjust the opcode. If the bracket
|
|
- number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
|
|
- arrange for the true number to follow later, in an OP_BRANUMBER item. */
|
|
+ /* Else we have a capturing group. */
|
|
|
|
else
|
|
{
|
|
NUMBERED_GROUP:
|
|
- if (++(*brackets) > EXTRACT_BASIC_MAX)
|
|
- {
|
|
- bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
|
|
- code[1+LINK_SIZE] = OP_BRANUMBER;
|
|
- PUT2(code, 2+LINK_SIZE, *brackets);
|
|
- skipbytes = 3;
|
|
- }
|
|
- else bravalue = OP_BRA + *brackets;
|
|
+ cd->bracount += 1;
|
|
+ PUT2(code, 1+LINK_SIZE, cd->bracount);
|
|
+ skipbytes = 2;
|
|
}
|
|
|
|
- /* Process nested bracketed re. Assertions may not be repeated, but other
|
|
- kinds can be. We copy code into a non-register variable in order to be able
|
|
- to pass its address because some compilers complain otherwise. Pass in a
|
|
- new setting for the ims options if they have changed. */
|
|
+ /* Process nested bracketed regex. Assertions may not be repeated, but
|
|
+ other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
|
|
+ non-register variable in order to be able to pass its address because some
|
|
+ compilers complain otherwise. Pass in a new setting for the ims options if
|
|
+ they have changed. */
|
|
|
|
previous = (bravalue >= OP_ONCE)? code : NULL;
|
|
*code = bravalue;
|
|
tempcode = code;
|
|
tempreqvary = cd->req_varyopt; /* Save value before bracket */
|
|
+ length_prevgroup = 0; /* Initialize for pre-compile phase */
|
|
|
|
if (!compile_regex(
|
|
newoptions, /* The complete new option state */
|
|
options & PCRE_IMS, /* The previous ims option state */
|
|
- brackets, /* Extracting bracket count */
|
|
&tempcode, /* Where to put code (updated) */
|
|
&ptr, /* Input pointer (updated) */
|
|
errorcodeptr, /* Where to put an error message */
|
|
(bravalue == OP_ASSERTBACK ||
|
|
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
|
|
- skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
|
|
+ skipbytes, /* Skip over bracket number */
|
|
&subfirstbyte, /* For possible first char */
|
|
&subreqbyte, /* For possible last char */
|
|
bcptr, /* Current branch chain */
|
|
- cd)) /* Tables block */
|
|
+ cd, /* Tables block */
|
|
+ (lengthptr == NULL)? NULL : /* Actual compile phase */
|
|
+ &length_prevgroup /* Pre-compile phase */
|
|
+ ))
|
|
goto FAILED;
|
|
|
|
/* At the end of compiling, code is still pointing to the start of the
|
|
@@ -3302,9 +4230,9 @@
|
|
is on the bracket. */
|
|
|
|
/* If this is a conditional bracket, check that there are no more than
|
|
- two branches in the group. */
|
|
+ two branches in the group, or just one if it's a DEFINE group. */
|
|
|
|
- else if (bravalue == OP_COND)
|
|
+ if (bravalue == OP_COND)
|
|
{
|
|
uschar *tc = code;
|
|
int condcount = 0;
|
|
@@ -3315,29 +4243,77 @@
|
|
}
|
|
while (*tc != OP_KET);
|
|
|
|
- if (condcount > 2)
|
|
+ /* A DEFINE group is never obeyed inline (the "condition" is always
|
|
+ false). It must have only one branch. */
|
|
+
|
|
+ if (code[LINK_SIZE+1] == OP_DEF)
|
|
{
|
|
- *errorcodeptr = ERR27;
|
|
- goto FAILED;
|
|
+ if (condcount > 1)
|
|
+ {
|
|
+ *errorcodeptr = ERR54;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ bravalue = OP_DEF; /* Just a flag to suppress char handling below */
|
|
+ }
|
|
+
|
|
+ /* A "normal" conditional group. If there is just one branch, we must not
|
|
+ make use of its firstbyte or reqbyte, because this is equivalent to an
|
|
+ empty second branch. */
|
|
+
|
|
+ else
|
|
+ {
|
|
+ if (condcount > 2)
|
|
+ {
|
|
+ *errorcodeptr = ERR27;
|
|
+ goto FAILED;
|
|
+ }
|
|
+ if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
|
|
}
|
|
+ }
|
|
+
|
|
+ /* Error if hit end of pattern */
|
|
|
|
- /* If there is just one branch, we must not make use of its firstbyte or
|
|
- reqbyte, because this is equivalent to an empty second branch. */
|
|
+ if (*ptr != ')')
|
|
+ {
|
|
+ *errorcodeptr = ERR14;
|
|
+ goto FAILED;
|
|
+ }
|
|
|
|
- if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
|
|
+ /* In the pre-compile phase, update the length by the length of the nested
|
|
+ group, less the brackets at either end. Then reduce the compiled code to
|
|
+ just the brackets so that it doesn't use much memory if it is duplicated by
|
|
+ a quantifier. */
|
|
+
|
|
+ if (lengthptr != NULL)
|
|
+ {
|
|
+ *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
|
|
+ code++;
|
|
+ PUTINC(code, 0, 1 + LINK_SIZE);
|
|
+ *code++ = OP_KET;
|
|
+ PUTINC(code, 0, 1 + LINK_SIZE);
|
|
}
|
|
|
|
- /* Handle updating of the required and first characters. Update for normal
|
|
- brackets of all kinds, and conditions with two branches (see code above).
|
|
- If the bracket is followed by a quantifier with zero repeat, we have to
|
|
- back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
|
|
- main loop so that they can be accessed for the back off. */
|
|
+ /* Otherwise update the main code pointer to the end of the group. */
|
|
+
|
|
+ else code = tempcode;
|
|
+
|
|
+ /* For a DEFINE group, required and first character settings are not
|
|
+ relevant. */
|
|
+
|
|
+ if (bravalue == OP_DEF) break;
|
|
+
|
|
+ /* Handle updating of the required and first characters for other types of
|
|
+ group. Update for normal brackets of all kinds, and conditions with two
|
|
+ branches (see code above). If the bracket is followed by a quantifier with
|
|
+ zero repeat, we have to back off. Hence the definition of zeroreqbyte and
|
|
+ zerofirstbyte outside the main loop so that they can be accessed for the
|
|
+ back off. */
|
|
|
|
zeroreqbyte = reqbyte;
|
|
zerofirstbyte = firstbyte;
|
|
groupsetfirstbyte = FALSE;
|
|
|
|
- if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
|
|
+ if (bravalue >= OP_ONCE)
|
|
{
|
|
/* If we have not yet set a firstbyte in this branch, take it from the
|
|
subpattern, remembering that it was set here so that a repeat of more
|
|
@@ -3378,35 +4354,22 @@
|
|
firstbyte, looking for an asserted first char. */
|
|
|
|
else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
|
|
+ break; /* End of processing '(' */
|
|
|
|
- /* Now update the main code pointer to the end of the group. */
|
|
-
|
|
- code = tempcode;
|
|
-
|
|
- /* Error if hit end of pattern */
|
|
-
|
|
- if (*ptr != ')')
|
|
- {
|
|
- *errorcodeptr = ERR14;
|
|
- goto FAILED;
|
|
- }
|
|
- break;
|
|
-
|
|
- /* Check \ for being a real metacharacter; if not, fall through and handle
|
|
- it as a data character at the start of a string. Escape items are checked
|
|
- for validity in the pre-compiling pass. */
|
|
-
|
|
- case '\\':
|
|
- tempptr = ptr;
|
|
- c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
|
|
|
|
- /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
|
|
+ /* ===================================================================*/
|
|
+ /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
|
|
are arranged to be the negation of the corresponding OP_values. For the
|
|
back references, the values are ESC_REF plus the reference number. Only
|
|
back references and those types that consume a character may be repeated.
|
|
We can test for values between ESC_b and ESC_Z for the latter; this may
|
|
have to change if any new ones are ever created. */
|
|
|
|
+ case '\\':
|
|
+ tempptr = ptr;
|
|
+ c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
|
|
+ if (*errorcodeptr != 0) goto FAILED;
|
|
+
|
|
if (c < 0)
|
|
{
|
|
if (-c == ESC_Q) /* Handle start of quoted string */
|
|
@@ -3416,6 +4379,8 @@
|
|
continue;
|
|
}
|
|
|
|
+ if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
|
|
+
|
|
/* For metasequences that actually match a character, we disable the
|
|
setting of a first character if it hasn't already been set. */
|
|
|
|
@@ -3427,18 +4392,33 @@
|
|
zerofirstbyte = firstbyte;
|
|
zeroreqbyte = reqbyte;
|
|
|
|
- /* Back references are handled specially */
|
|
+ /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
|
|
+
|
|
+ if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
|
|
+ {
|
|
+ is_recurse = FALSE;
|
|
+ terminator = (*(++ptr) == '<')? '>' : '\'';
|
|
+ goto NAMED_REF_OR_RECURSE;
|
|
+ }
|
|
+
|
|
+ /* Back references are handled specially; must disable firstbyte if
|
|
+ not set to cope with cases like (?=(\w+))\1: which would otherwise set
|
|
+ ':' later. */
|
|
|
|
if (-c >= ESC_REF)
|
|
{
|
|
- int number = -c - ESC_REF;
|
|
+ recno = -c - ESC_REF;
|
|
+
|
|
+ HANDLE_REFERENCE: /* Come here from named backref handling */
|
|
+ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
|
|
previous = code;
|
|
*code++ = OP_REF;
|
|
- PUT2INC(code, 0, number);
|
|
+ PUT2INC(code, 0, recno);
|
|
+ cd->backref_map |= (recno < 32)? (1 << recno) : 1;
|
|
+ if (recno > cd->top_backref) cd->top_backref = recno;
|
|
}
|
|
|
|
- /* So are Unicode property matches, if supported. We know that get_ucp
|
|
- won't fail because it was tested in the pre-pass. */
|
|
+ /* So are Unicode property matches, if supported. */
|
|
|
|
#ifdef SUPPORT_UCP
|
|
else if (-c == ESC_P || -c == ESC_p)
|
|
@@ -3446,15 +4426,26 @@
|
|
BOOL negated;
|
|
int pdata;
|
|
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
|
|
+ if (ptype < 0) goto FAILED;
|
|
previous = code;
|
|
*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
|
|
*code++ = ptype;
|
|
*code++ = pdata;
|
|
}
|
|
+#else
|
|
+
|
|
+ /* If Unicode properties are not supported, \X, \P, and \p are not
|
|
+ allowed. */
|
|
+
|
|
+ else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
|
|
+ {
|
|
+ *errorcodeptr = ERR45;
|
|
+ goto FAILED;
|
|
+ }
|
|
#endif
|
|
|
|
- /* For the rest, we can obtain the OP value by negating the escape
|
|
- value */
|
|
+ /* For the rest (including \X when Unicode properties are supported), we
|
|
+ can obtain the OP value by negating the escape value. */
|
|
|
|
else
|
|
{
|
|
@@ -3478,9 +4469,10 @@
|
|
mcbuffer[0] = c;
|
|
mclength = 1;
|
|
}
|
|
-
|
|
goto ONE_CHAR;
|
|
|
|
+
|
|
+ /* ===================================================================*/
|
|
/* Handle a literal character. It is guaranteed not to be whitespace or #
|
|
when the extended flag is set. If we are in UTF-8 mode, it may be a
|
|
multi-byte literal character. */
|
|
@@ -3491,7 +4483,7 @@
|
|
mcbuffer[0] = c;
|
|
|
|
#ifdef SUPPORT_UTF8
|
|
- if (utf8 && (c & 0xc0) == 0xc0)
|
|
+ if (utf8 && c >= 0xc0)
|
|
{
|
|
while ((ptr[1] & 0xc0) == 0x80)
|
|
mcbuffer[mclength++] = *(++ptr);
|
|
@@ -3542,6 +4534,7 @@
|
|
}
|
|
} /* end of big loop */
|
|
|
|
+
|
|
/* Control never reaches here by falling through, only by a goto for all the
|
|
error states. Pass back the position in the pattern so that it can be displayed
|
|
to the user for diagnosing the error. */
|
|
@@ -3558,35 +4551,40 @@
|
|
* Compile sequence of alternatives *
|
|
*************************************************/
|
|
|
|
-/* On entry, ptr is pointing past the bracket character, but on return
|
|
-it points to the closing bracket, or vertical bar, or end of string.
|
|
-The code variable is pointing at the byte into which the BRA operator has been
|
|
-stored. If the ims options are changed at the start (for a (?ims: group) or
|
|
-during any branch, we need to insert an OP_OPT item at the start of every
|
|
-following branch to ensure they get set correctly at run time, and also pass
|
|
-the new options into every subsequent branch compile.
|
|
+/* On entry, ptr is pointing past the bracket character, but on return it
|
|
+points to the closing bracket, or vertical bar, or end of string. The code
|
|
+variable is pointing at the byte into which the BRA operator has been stored.
|
|
+If the ims options are changed at the start (for a (?ims: group) or during any
|
|
+branch, we need to insert an OP_OPT item at the start of every following branch
|
|
+to ensure they get set correctly at run time, and also pass the new options
|
|
+into every subsequent branch compile.
|
|
+
|
|
+This function is used during the pre-compile phase when we are trying to find
|
|
+out the amount of memory needed, as well as during the real compile phase. The
|
|
+value of lengthptr distinguishes the two phases.
|
|
|
|
Argument:
|
|
options option bits, including any changes for this subpattern
|
|
oldims previous settings of ims option bits
|
|
- brackets -> int containing the number of extracting brackets used
|
|
codeptr -> the address of the current code pointer
|
|
ptrptr -> the address of the current pattern pointer
|
|
errorcodeptr -> pointer to error code variable
|
|
lookbehind TRUE if this is a lookbehind assertion
|
|
- skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
|
|
+ skipbytes skip this many bytes at start (for brackets and OP_COND)
|
|
firstbyteptr place to put the first required character, or a negative number
|
|
reqbyteptr place to put the last required character, or a negative number
|
|
bcptr pointer to the chain of currently open branches
|
|
cd points to the data block with tables pointers etc.
|
|
+ lengthptr NULL during the real compile phase
|
|
+ points to length accumulator during pre-compile phase
|
|
|
|
-Returns: TRUE on success
|
|
+Returns: TRUE on success
|
|
*/
|
|
|
|
static BOOL
|
|
-compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
|
|
- const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
|
|
- int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
|
|
+compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
|
|
+ int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
|
|
+ int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
|
|
{
|
|
const uschar *ptr = *ptrptr;
|
|
uschar *code = *codeptr;
|
|
@@ -3595,6 +4593,7 @@
|
|
uschar *reverse_count = NULL;
|
|
int firstbyte, reqbyte;
|
|
int branchfirstbyte, branchreqbyte;
|
|
+int length;
|
|
branch_chain bc;
|
|
|
|
bc.outer = bcptr;
|
|
@@ -3602,6 +4601,20 @@
|
|
|
|
firstbyte = reqbyte = REQ_UNSET;
|
|
|
|
+/* Accumulate the length for use in the pre-compile phase. Start with the
|
|
+length of the BRA and KET and any extra bytes that are required at the
|
|
+beginning. We accumulate in a local variable to save frequent testing of
|
|
+lengthptr for NULL. We cannot do this by looking at the value of code at the
|
|
+start and end of each alternative, because compiled items are discarded during
|
|
+the pre-compile phase so that the work space is not exceeded. */
|
|
+
|
|
+length = 2 + 2*LINK_SIZE + skipbytes;
|
|
+
|
|
+/* WARNING: If the above line is changed for any reason, you must also change
|
|
+the code that abstracts option settings at the start of the pattern and makes
|
|
+them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
|
|
+pre-compile phase to find out whether anything has yet been compiled or not. */
|
|
+
|
|
/* Offset is set zero to mark that this bracket is still open */
|
|
|
|
PUT(code, 1, 0);
|
|
@@ -3617,6 +4630,7 @@
|
|
{
|
|
*code++ = OP_OPT;
|
|
*code++ = options & PCRE_IMS;
|
|
+ length += 2;
|
|
}
|
|
|
|
/* Set up dummy OP_REVERSE if lookbehind assertion */
|
|
@@ -3626,73 +4640,80 @@
|
|
*code++ = OP_REVERSE;
|
|
reverse_count = code;
|
|
PUTINC(code, 0, 0);
|
|
+ length += 1 + LINK_SIZE;
|
|
}
|
|
|
|
- /* Now compile the branch */
|
|
+ /* Now compile the branch; in the pre-compile phase its length gets added
|
|
+ into the length. */
|
|
|
|
- if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
|
|
- &branchfirstbyte, &branchreqbyte, &bc, cd))
|
|
+ if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
|
|
+ &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
|
|
{
|
|
*ptrptr = ptr;
|
|
return FALSE;
|
|
}
|
|
|
|
- /* If this is the first branch, the firstbyte and reqbyte values for the
|
|
- branch become the values for the regex. */
|
|
+ /* In the real compile phase, there is some post-processing to be done. */
|
|
|
|
- if (*last_branch != OP_ALT)
|
|
+ if (lengthptr == NULL)
|
|
{
|
|
- firstbyte = branchfirstbyte;
|
|
- reqbyte = branchreqbyte;
|
|
- }
|
|
+ /* If this is the first branch, the firstbyte and reqbyte values for the
|
|
+ branch become the values for the regex. */
|
|
|
|
- /* If this is not the first branch, the first char and reqbyte have to
|
|
- match the values from all the previous branches, except that if the previous
|
|
- value for reqbyte didn't have REQ_VARY set, it can still match, and we set
|
|
- REQ_VARY for the regex. */
|
|
+ if (*last_branch != OP_ALT)
|
|
+ {
|
|
+ firstbyte = branchfirstbyte;
|
|
+ reqbyte = branchreqbyte;
|
|
+ }
|
|
|
|
- else
|
|
- {
|
|
- /* If we previously had a firstbyte, but it doesn't match the new branch,
|
|
- we have to abandon the firstbyte for the regex, but if there was previously
|
|
- no reqbyte, it takes on the value of the old firstbyte. */
|
|
+ /* If this is not the first branch, the first char and reqbyte have to
|
|
+ match the values from all the previous branches, except that if the
|
|
+ previous value for reqbyte didn't have REQ_VARY set, it can still match,
|
|
+ and we set REQ_VARY for the regex. */
|
|
|
|
- if (firstbyte >= 0 && firstbyte != branchfirstbyte)
|
|
+ else
|
|
{
|
|
- if (reqbyte < 0) reqbyte = firstbyte;
|
|
- firstbyte = REQ_NONE;
|
|
- }
|
|
+ /* If we previously had a firstbyte, but it doesn't match the new branch,
|
|
+ we have to abandon the firstbyte for the regex, but if there was
|
|
+ previously no reqbyte, it takes on the value of the old firstbyte. */
|
|
+
|
|
+ if (firstbyte >= 0 && firstbyte != branchfirstbyte)
|
|
+ {
|
|
+ if (reqbyte < 0) reqbyte = firstbyte;
|
|
+ firstbyte = REQ_NONE;
|
|
+ }
|
|
|
|
- /* If we (now or from before) have no firstbyte, a firstbyte from the
|
|
- branch becomes a reqbyte if there isn't a branch reqbyte. */
|
|
+ /* If we (now or from before) have no firstbyte, a firstbyte from the
|
|
+ branch becomes a reqbyte if there isn't a branch reqbyte. */
|
|
|
|
- if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
|
|
- branchreqbyte = branchfirstbyte;
|
|
+ if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
|
|
+ branchreqbyte = branchfirstbyte;
|
|
|
|
- /* Now ensure that the reqbytes match */
|
|
+ /* Now ensure that the reqbytes match */
|
|
|
|
- if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
|
|
- reqbyte = REQ_NONE;
|
|
- else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
|
|
- }
|
|
+ if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
|
|
+ reqbyte = REQ_NONE;
|
|
+ else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
|
|
+ }
|
|
|
|
- /* If lookbehind, check that this branch matches a fixed-length string,
|
|
- and put the length into the OP_REVERSE item. Temporarily mark the end of
|
|
- the branch with OP_END. */
|
|
+ /* If lookbehind, check that this branch matches a fixed-length string, and
|
|
+ put the length into the OP_REVERSE item. Temporarily mark the end of the
|
|
+ branch with OP_END. */
|
|
|
|
- if (lookbehind)
|
|
- {
|
|
- int length;
|
|
- *code = OP_END;
|
|
- length = find_fixedlength(last_branch, options);
|
|
- DPRINTF(("fixed length = %d\n", length));
|
|
- if (length < 0)
|
|
+ if (lookbehind)
|
|
{
|
|
- *errorcodeptr = (length == -2)? ERR36 : ERR25;
|
|
- *ptrptr = ptr;
|
|
- return FALSE;
|
|
+ int fixed_length;
|
|
+ *code = OP_END;
|
|
+ fixed_length = find_fixedlength(last_branch, options);
|
|
+ DPRINTF(("fixed length = %d\n", fixed_length));
|
|
+ if (fixed_length < 0)
|
|
+ {
|
|
+ *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
|
|
+ *ptrptr = ptr;
|
|
+ return FALSE;
|
|
+ }
|
|
+ PUT(reverse_count, 0, fixed_length);
|
|
}
|
|
- PUT(reverse_count, 0, length);
|
|
}
|
|
|
|
/* Reached end of expression, either ')' or end of pattern. Go back through
|
|
@@ -3706,15 +4727,15 @@
|
|
|
|
if (*ptr != '|')
|
|
{
|
|
- int length = code - last_branch;
|
|
+ int branch_length = code - last_branch;
|
|
do
|
|
{
|
|
int prev_length = GET(last_branch, 1);
|
|
- PUT(last_branch, 1, length);
|
|
- length = prev_length;
|
|
- last_branch -= length;
|
|
+ PUT(last_branch, 1, branch_length);
|
|
+ branch_length = prev_length;
|
|
+ last_branch -= branch_length;
|
|
}
|
|
- while (length > 0);
|
|
+ while (branch_length > 0);
|
|
|
|
/* Fill in the ket */
|
|
|
|
@@ -3728,6 +4749,7 @@
|
|
{
|
|
*code++ = OP_OPT;
|
|
*code++ = oldims;
|
|
+ length += 2;
|
|
}
|
|
|
|
/* Set values to pass back */
|
|
@@ -3736,6 +4758,7 @@
|
|
*ptrptr = ptr;
|
|
*firstbyteptr = firstbyte;
|
|
*reqbyteptr = reqbyte;
|
|
+ if (lengthptr != NULL) *lengthptr += length;
|
|
return TRUE;
|
|
}
|
|
|
|
@@ -3749,6 +4772,7 @@
|
|
bc.current = last_branch = code;
|
|
code += 1 + LINK_SIZE;
|
|
ptr++;
|
|
+ length += 1 + LINK_SIZE;
|
|
}
|
|
/* Control never reaches here */
|
|
}
|
|
@@ -3799,24 +4823,29 @@
|
|
unsigned int backref_map)
|
|
{
|
|
do {
|
|
- const uschar *scode =
|
|
- first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
|
|
+ const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
|
|
+ options, PCRE_MULTILINE, FALSE);
|
|
register int op = *scode;
|
|
|
|
+ /* Non-capturing brackets */
|
|
+
|
|
+ if (op == OP_BRA)
|
|
+ {
|
|
+ if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
|
|
+ }
|
|
+
|
|
/* Capturing brackets */
|
|
|
|
- if (op > OP_BRA)
|
|
+ else if (op == OP_CBRA)
|
|
{
|
|
- int new_map;
|
|
- op -= OP_BRA;
|
|
- if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
|
|
- new_map = bracket_map | ((op < 32)? (1 << op) : 1);
|
|
+ int n = GET2(scode, 1+LINK_SIZE);
|
|
+ int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
|
|
if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
|
|
}
|
|
|
|
/* Other brackets */
|
|
|
|
- else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
|
|
+ else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
|
|
{
|
|
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
|
|
}
|
|
@@ -3824,7 +4853,8 @@
|
|
/* .* is not anchored unless DOTALL is set and it isn't in brackets that
|
|
are or may be referenced. */
|
|
|
|
- else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
|
|
+ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
|
|
+ op == OP_TYPEPOSSTAR) &&
|
|
(*options & PCRE_DOTALL) != 0)
|
|
{
|
|
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
|
|
@@ -3869,30 +4899,35 @@
|
|
unsigned int backref_map)
|
|
{
|
|
do {
|
|
- const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
|
|
- FALSE);
|
|
+ const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
|
|
+ NULL, 0, FALSE);
|
|
register int op = *scode;
|
|
|
|
+ /* Non-capturing brackets */
|
|
+
|
|
+ if (op == OP_BRA)
|
|
+ {
|
|
+ if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
|
|
+ }
|
|
+
|
|
/* Capturing brackets */
|
|
|
|
- if (op > OP_BRA)
|
|
+ else if (op == OP_CBRA)
|
|
{
|
|
- int new_map;
|
|
- op -= OP_BRA;
|
|
- if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
|
|
- new_map = bracket_map | ((op < 32)? (1 << op) : 1);
|
|
+ int n = GET2(scode, 1+LINK_SIZE);
|
|
+ int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
|
|
if (!is_startline(scode, new_map, backref_map)) return FALSE;
|
|
}
|
|
|
|
/* Other brackets */
|
|
|
|
- else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
|
|
+ else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
|
|
{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
|
|
|
|
/* .* means "start at start or after \n" if it isn't in brackets that
|
|
may be referenced. */
|
|
|
|
- else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
|
|
+ else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
|
|
{
|
|
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
|
|
}
|
|
@@ -3941,14 +4976,13 @@
|
|
first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
|
|
register int op = *scode;
|
|
|
|
- if (op >= OP_BRA) op = OP_BRA;
|
|
-
|
|
switch(op)
|
|
{
|
|
default:
|
|
return -1;
|
|
|
|
case OP_BRA:
|
|
+ case OP_CBRA:
|
|
case OP_ASSERT:
|
|
case OP_ONCE:
|
|
case OP_COND:
|
|
@@ -3964,6 +4998,7 @@
|
|
case OP_CHARNC:
|
|
case OP_PLUS:
|
|
case OP_MINPLUS:
|
|
+ case OP_POSPLUS:
|
|
if (!inassert) return -1;
|
|
if (c < 0)
|
|
{
|
|
@@ -4012,37 +5047,36 @@
|
|
}
|
|
|
|
|
|
-
|
|
PCRE_DATA_SCOPE pcre *
|
|
pcre_compile2(const char *pattern, int options, int *errorcodeptr,
|
|
const char **errorptr, int *erroroffset, const unsigned char *tables)
|
|
{
|
|
real_pcre *re;
|
|
-int length = 1 + LINK_SIZE; /* For initial BRA plus length */
|
|
-int c, firstbyte, reqbyte, newline;
|
|
-int bracount = 0;
|
|
-int branch_extra = 0;
|
|
-int branch_newextra;
|
|
-int item_count = -1;
|
|
-int name_count = 0;
|
|
-int max_name_size = 0;
|
|
-int lastitemlength = 0;
|
|
+int length = 1; /* For final END opcode */
|
|
+int firstbyte, reqbyte, newline;
|
|
int errorcode = 0;
|
|
#ifdef SUPPORT_UTF8
|
|
BOOL utf8;
|
|
-BOOL class_utf8;
|
|
#endif
|
|
-BOOL inescq = FALSE;
|
|
-BOOL capturing;
|
|
-unsigned int brastackptr = 0;
|
|
size_t size;
|
|
uschar *code;
|
|
const uschar *codestart;
|
|
const uschar *ptr;
|
|
compile_data compile_block;
|
|
compile_data *cd = &compile_block;
|
|
-int brastack[BRASTACK_SIZE];
|
|
-uschar bralenstack[BRASTACK_SIZE];
|
|
+
|
|
+/* This space is used for "compiling" into during the first phase, when we are
|
|
+computing the amount of memory that is needed. Compiled items are thrown away
|
|
+as soon as possible, so that a fairly large buffer should be sufficient for
|
|
+this purpose. The same space is used in the second phase for remembering where
|
|
+to fill in forward references to subpatterns. */
|
|
+
|
|
+uschar cworkspace[COMPILE_WORK_SIZE];
|
|
+
|
|
+
|
|
+/* Set this early so that early errors get offset 0. */
|
|
+
|
|
+ptr = (const uschar *)pattern;
|
|
|
|
/* We can't pass back an error message if errorptr is NULL; I guess the best we
|
|
can do is just return NULL, but we can set a code value if there is a code
|
|
@@ -4075,7 +5109,7 @@
|
|
(*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
|
|
{
|
|
errorcode = ERR44;
|
|
- goto PCRE_EARLY_ERROR_RETURN;
|
|
+ goto PCRE_UTF8_ERROR_RETURN;
|
|
}
|
|
#else
|
|
if ((options & PCRE_UTF8) != 0)
|
|
@@ -4099,34 +5133,43 @@
|
|
cd->cbits = tables + cbits_offset;
|
|
cd->ctypes = tables + ctypes_offset;
|
|
|
|
-/* Handle different types of newline. The two bits give four cases. The current
|
|
-code allows for one- or two-byte sequences. */
|
|
+/* Handle different types of newline. The three bits give seven cases. The
|
|
+current code allows for fixed one- or two-byte sequences, plus "any". */
|
|
|
|
-switch (options & PCRE_NEWLINE_CRLF)
|
|
+switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
|
|
{
|
|
- default: newline = NEWLINE; break; /* Compile-time default */
|
|
+ case 0: newline = NEWLINE; break; /* Compile-time default */
|
|
case PCRE_NEWLINE_CR: newline = '\r'; break;
|
|
case PCRE_NEWLINE_LF: newline = '\n'; break;
|
|
case PCRE_NEWLINE_CR+
|
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
|
|
+ case PCRE_NEWLINE_ANY: newline = -1; break;
|
|
+ default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
|
|
}
|
|
|
|
-if (newline > 255)
|
|
+if (newline < 0)
|
|
{
|
|
- cd->nllen = 2;
|
|
- cd->nl[0] = (newline >> 8) & 255;
|
|
- cd->nl[1] = newline & 255;
|
|
+ cd->nltype = NLTYPE_ANY;
|
|
}
|
|
else
|
|
{
|
|
- cd->nllen = 1;
|
|
- cd->nl[0] = newline;
|
|
+ cd->nltype = NLTYPE_FIXED;
|
|
+ if (newline > 255)
|
|
+ {
|
|
+ cd->nllen = 2;
|
|
+ cd->nl[0] = (newline >> 8) & 255;
|
|
+ cd->nl[1] = newline & 255;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ cd->nllen = 1;
|
|
+ cd->nl[0] = newline;
|
|
+ }
|
|
}
|
|
|
|
-/* Maximum back reference and backref bitmap. This is updated for numeric
|
|
-references during the first pass, but for named references during the actual
|
|
-compile pass. The bitmap records up to 31 back references to help in deciding
|
|
-whether (.*) can be treated as anchored or not. */
|
|
+/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
|
|
+references to help in deciding whether (.*) can be treated as anchored or not.
|
|
+*/
|
|
|
|
cd->top_backref = 0;
|
|
cd->backref_map = 0;
|
|
@@ -4136,1041 +5179,151 @@
|
|
DPRINTF(("------------------------------------------------------------------\n"));
|
|
DPRINTF(("%s\n", pattern));
|
|
|
|
-/* The first thing to do is to make a pass over the pattern to compute the
|
|
-amount of store required to hold the compiled code. This does not have to be
|
|
-perfect as long as errors are overestimates. At the same time we can detect any
|
|
-flag settings right at the start, and extract them. Make an attempt to correct
|
|
-for any counted white space if an "extended" flag setting appears late in the
|
|
-pattern. We can't be so clever for #-comments. */
|
|
-
|
|
-ptr = (const uschar *)(pattern - 1);
|
|
-while ((c = *(++ptr)) != 0)
|
|
- {
|
|
- int min, max;
|
|
- int class_optcount;
|
|
- int bracket_length;
|
|
- int duplength;
|
|
+/* Pretend to compile the pattern while actually just accumulating the length
|
|
+of memory required. This behaviour is triggered by passing a non-NULL final
|
|
+argument to compile_regex(). We pass a block of workspace (cworkspace) for it
|
|
+to compile parts of the pattern into; the compiled code is discarded when it is
|
|
+no longer needed, so hopefully this workspace will never overflow, though there
|
|
+is a test for its doing so. */
|
|
|
|
- /* If we are inside a \Q...\E sequence, all chars are literal */
|
|
+cd->bracount = 0;
|
|
+cd->names_found = 0;
|
|
+cd->name_entry_size = 0;
|
|
+cd->name_table = NULL;
|
|
+cd->start_workspace = cworkspace;
|
|
+cd->start_code = cworkspace;
|
|
+cd->hwm = cworkspace;
|
|
+cd->start_pattern = (const uschar *)pattern;
|
|
+cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
|
|
+cd->req_varyopt = 0;
|
|
+cd->nopartial = FALSE;
|
|
+cd->external_options = options;
|
|
|
|
- if (inescq)
|
|
- {
|
|
- if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
|
|
- goto NORMAL_CHAR;
|
|
- }
|
|
+/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
|
|
+don't need to look at the result of the function here. The initial options have
|
|
+been put into the cd block so that they can be changed if an option setting is
|
|
+found within the regex right at the beginning. Bringing initial option settings
|
|
+outside can help speed up starting point checks. */
|
|
|
|
- /* Otherwise, first check for ignored whitespace and comments */
|
|
+code = cworkspace;
|
|
+*code = OP_BRA;
|
|
+(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
|
|
+ &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
|
|
+if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
|
|
|
|
- if ((options & PCRE_EXTENDED) != 0)
|
|
- {
|
|
- if ((cd->ctypes[c] & ctype_space) != 0) continue;
|
|
- if (c == '#')
|
|
- {
|
|
- while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
|
|
- if (*ptr != 0)
|
|
- {
|
|
- ptr += cd->nllen - 1;
|
|
- continue;
|
|
- }
|
|
- break; /* End loop at end of pattern */
|
|
- }
|
|
- }
|
|
+DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
|
|
+ cd->hwm - cworkspace));
|
|
|
|
- item_count++; /* Is zero for the first non-comment item */
|
|
+if (length > MAX_PATTERN_SIZE)
|
|
+ {
|
|
+ errorcode = ERR20;
|
|
+ goto PCRE_EARLY_ERROR_RETURN;
|
|
+ }
|
|
|
|
- /* Allow space for auto callout before every item except quantifiers. */
|
|
+/* Compute the size of data block needed and get it, either from malloc or
|
|
+an externally provided function. Integer overflow should no longer be possible
|
|
+because nowadays we limit the maximum value of cd->names_found and
|
|
+cd->name_entry_size. */
|
|
|
|
- if ((options & PCRE_AUTO_CALLOUT) != 0 &&
|
|
- c != '*' && c != '+' && c != '?' &&
|
|
- (c != '{' || !is_counted_repeat(ptr + 1)))
|
|
- length += 2 + 2*LINK_SIZE;
|
|
+size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
|
|
+re = (real_pcre *)(pcre_malloc)(size);
|
|
|
|
- switch(c)
|
|
- {
|
|
- /* A backslashed item may be an escaped data character or it may be a
|
|
- character type. */
|
|
+if (re == NULL)
|
|
+ {
|
|
+ errorcode = ERR21;
|
|
+ goto PCRE_EARLY_ERROR_RETURN;
|
|
+ }
|
|
|
|
- case '\\':
|
|
- c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
|
|
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
|
|
+/* Put in the magic number, and save the sizes, initial options, and character
|
|
+table pointer. NULL is used for the default character tables. The nullpad field
|
|
+is at the end; it's there to help in the case when a regex compiled on a system
|
|
+with 4-byte pointers is run on another with 8-byte pointers. */
|
|
|
|
- lastitemlength = 1; /* Default length of last item for repeats */
|
|
+re->magic_number = MAGIC_NUMBER;
|
|
+re->size = size;
|
|
+re->options = cd->external_options;
|
|
+re->dummy1 = 0;
|
|
+re->first_byte = 0;
|
|
+re->req_byte = 0;
|
|
+re->name_table_offset = sizeof(real_pcre);
|
|
+re->name_entry_size = cd->name_entry_size;
|
|
+re->name_count = cd->names_found;
|
|
+re->ref_count = 0;
|
|
+re->tables = (tables == _pcre_default_tables)? NULL : tables;
|
|
+re->nullpad = NULL;
|
|
|
|
- if (c >= 0) /* Data character */
|
|
- {
|
|
- length += 2; /* For a one-byte character */
|
|
+/* The starting points of the name/number translation table and of the code are
|
|
+passed around in the compile data block. The start/end pattern and initial
|
|
+options are already set from the pre-compile phase, as is the name_entry_size
|
|
+field. Reset the bracket count and the names_found field. Also reset the hwm
|
|
+field; this time it's used for remembering forward references to subpatterns.
|
|
+*/
|
|
|
|
-#ifdef SUPPORT_UTF8
|
|
- if (utf8 && c > 127)
|
|
- {
|
|
- int i;
|
|
- for (i = 0; i < _pcre_utf8_table1_size; i++)
|
|
- if (c <= _pcre_utf8_table1[i]) break;
|
|
- length += i;
|
|
- lastitemlength += i;
|
|
- }
|
|
-#endif
|
|
+cd->bracount = 0;
|
|
+cd->names_found = 0;
|
|
+cd->name_table = (uschar *)re + re->name_table_offset;
|
|
+codestart = cd->name_table + re->name_entry_size * re->name_count;
|
|
+cd->start_code = codestart;
|
|
+cd->hwm = cworkspace;
|
|
+cd->req_varyopt = 0;
|
|
+cd->nopartial = FALSE;
|
|
|
|
- continue;
|
|
- }
|
|
+/* Set up a starting, non-extracting bracket, then compile the expression. On
|
|
+error, errorcode will be set non-zero, so we don't need to look at the result
|
|
+of the function here. */
|
|
|
|
- /* If \Q, enter "literal" mode */
|
|
+ptr = (const uschar *)pattern;
|
|
+code = (uschar *)codestart;
|
|
+*code = OP_BRA;
|
|
+(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
|
|
+ &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
|
|
+re->top_bracket = cd->bracount;
|
|
+re->top_backref = cd->top_backref;
|
|
|
|
- if (-c == ESC_Q)
|
|
- {
|
|
- inescq = TRUE;
|
|
- continue;
|
|
- }
|
|
+if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
|
|
|
|
- /* \X is supported only if Unicode property support is compiled */
|
|
+/* If not reached end of pattern on success, there's an excess bracket. */
|
|
|
|
-#ifndef SUPPORT_UCP
|
|
- if (-c == ESC_X)
|
|
- {
|
|
- errorcode = ERR45;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
-#endif
|
|
+if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
|
|
|
|
- /* \P and \p are for Unicode properties, but only when the support has
|
|
- been compiled. Each item needs 3 bytes. */
|
|
+/* Fill in the terminating state and check for disastrous overflow, but
|
|
+if debugging, leave the test till after things are printed out. */
|
|
|
|
- else if (-c == ESC_P || -c == ESC_p)
|
|
- {
|
|
-#ifdef SUPPORT_UCP
|
|
- BOOL negated;
|
|
- BOOL pdata;
|
|
- length += 3;
|
|
- lastitemlength = 3;
|
|
- if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
|
|
- goto PCRE_ERROR_RETURN;
|
|
- continue;
|
|
-#else
|
|
- errorcode = ERR45;
|
|
- goto PCRE_ERROR_RETURN;
|
|
+*code++ = OP_END;
|
|
+
|
|
+#ifndef DEBUG
|
|
+if (code - codestart > length) errorcode = ERR23;
|
|
#endif
|
|
- }
|
|
|
|
- /* Other escapes need one byte */
|
|
+/* Fill in any forward references that are required. */
|
|
|
|
- length++;
|
|
+while (errorcode == 0 && cd->hwm > cworkspace)
|
|
+ {
|
|
+ int offset, recno;
|
|
+ const uschar *groupptr;
|
|
+ cd->hwm -= LINK_SIZE;
|
|
+ offset = GET(cd->hwm, 0);
|
|
+ recno = GET(codestart, offset);
|
|
+ groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
|
|
+ if (groupptr == NULL) errorcode = ERR53;
|
|
+ else PUT(((uschar *)codestart), offset, groupptr - codestart);
|
|
+ }
|
|
|
|
- /* A back reference needs an additional 2 bytes, plus either one or 5
|
|
- bytes for a repeat. We also need to keep the value of the highest
|
|
- back reference. */
|
|
+/* Give an error if there's a back reference to a non-existent capturing
|
|
+subpattern. */
|
|
|
|
- if (c <= -ESC_REF)
|
|
- {
|
|
- int refnum = -c - ESC_REF;
|
|
- cd->backref_map |= (refnum < 32)? (1 << refnum) : 1;
|
|
- if (refnum > cd->top_backref)
|
|
- cd->top_backref = refnum;
|
|
- length += 2; /* For single back reference */
|
|
- if (ptr[1] == '{' && is_counted_repeat(ptr+2))
|
|
- {
|
|
- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
|
|
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
|
|
- if ((min == 0 && (max == 1 || max == -1)) ||
|
|
- (min == 1 && max == -1))
|
|
- length++;
|
|
- else length += 5;
|
|
- if (ptr[1] == '?') ptr++;
|
|
- }
|
|
- }
|
|
- continue;
|
|
-
|
|
- case '^': /* Single-byte metacharacters */
|
|
- case '.':
|
|
- case '$':
|
|
- length++;
|
|
- lastitemlength = 1;
|
|
- continue;
|
|
-
|
|
- case '*': /* These repeats won't be after brackets; */
|
|
- case '+': /* those are handled separately */
|
|
- case '?':
|
|
- length++;
|
|
- goto POSESSIVE; /* A few lines below */
|
|
-
|
|
- /* This covers the cases of braced repeats after a single char, metachar,
|
|
- class, or back reference. */
|
|
-
|
|
- case '{':
|
|
- if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
|
|
- ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
|
|
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
|
|
-
|
|
- /* These special cases just insert one extra opcode */
|
|
-
|
|
- if ((min == 0 && (max == 1 || max == -1)) ||
|
|
- (min == 1 && max == -1))
|
|
- length++;
|
|
-
|
|
- /* These cases might insert additional copies of a preceding character. */
|
|
-
|
|
- else
|
|
- {
|
|
- if (min != 1)
|
|
- {
|
|
- length -= lastitemlength; /* Uncount the original char or metachar */
|
|
- if (min > 0) length += 3 + lastitemlength;
|
|
- }
|
|
- length += lastitemlength + ((max > 0)? 3 : 1);
|
|
- }
|
|
-
|
|
- if (ptr[1] == '?') ptr++; /* Needs no extra length */
|
|
-
|
|
- POSESSIVE: /* Test for possessive quantifier */
|
|
- if (ptr[1] == '+')
|
|
- {
|
|
- ptr++;
|
|
- length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
|
|
- }
|
|
- continue;
|
|
-
|
|
- /* An alternation contains an offset to the next branch or ket. If any ims
|
|
- options changed in the previous branch(es), and/or if we are in a
|
|
- lookbehind assertion, extra space will be needed at the start of the
|
|
- branch. This is handled by branch_extra. */
|
|
-
|
|
- case '|':
|
|
- length += 1 + LINK_SIZE + branch_extra;
|
|
- continue;
|
|
-
|
|
- /* A character class uses 33 characters provided that all the character
|
|
- values are less than 256. Otherwise, it uses a bit map for low valued
|
|
- characters, and individual items for others. Don't worry about character
|
|
- types that aren't allowed in classes - they'll get picked up during the
|
|
- compile. A character class that contains only one single-byte character
|
|
- uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
|
|
- where we can. (In UTF-8 mode we can do this only for chars < 128.) */
|
|
-
|
|
- case '[':
|
|
- if (*(++ptr) == '^')
|
|
- {
|
|
- class_optcount = 10; /* Greater than one */
|
|
- ptr++;
|
|
- }
|
|
- else class_optcount = 0;
|
|
-
|
|
-#ifdef SUPPORT_UTF8
|
|
- class_utf8 = FALSE;
|
|
-#endif
|
|
-
|
|
- /* Written as a "do" so that an initial ']' is taken as data */
|
|
-
|
|
- if (*ptr != 0) do
|
|
- {
|
|
- /* Inside \Q...\E everything is literal except \E */
|
|
-
|
|
- if (inescq)
|
|
- {
|
|
- if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
|
|
- inescq = FALSE;
|
|
- ptr += 1;
|
|
- continue;
|
|
- }
|
|
-
|
|
- /* Outside \Q...\E, check for escapes */
|
|
-
|
|
- if (*ptr == '\\')
|
|
- {
|
|
- c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
|
|
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
|
|
-
|
|
- /* \b is backspace inside a class; \X is literal */
|
|
-
|
|
- if (-c == ESC_b) c = '\b';
|
|
- else if (-c == ESC_X) c = 'X';
|
|
-
|
|
- /* \Q enters quoting mode */
|
|
-
|
|
- else if (-c == ESC_Q)
|
|
- {
|
|
- inescq = TRUE;
|
|
- continue;
|
|
- }
|
|
-
|
|
- /* Handle escapes that turn into characters */
|
|
-
|
|
- if (c >= 0) goto NON_SPECIAL_CHARACTER;
|
|
-
|
|
- /* Escapes that are meta-things. The normal ones just affect the
|
|
- bit map, but Unicode properties require an XCLASS extended item. */
|
|
-
|
|
- else
|
|
- {
|
|
- class_optcount = 10; /* \d, \s etc; make sure > 1 */
|
|
-#ifdef SUPPORT_UTF8
|
|
- if (-c == ESC_p || -c == ESC_P)
|
|
- {
|
|
- if (!class_utf8)
|
|
- {
|
|
- class_utf8 = TRUE;
|
|
- length += LINK_SIZE + 2;
|
|
- }
|
|
- length += 3;
|
|
- }
|
|
-#endif
|
|
- }
|
|
- }
|
|
-
|
|
- /* Check the syntax for POSIX stuff. The bits we actually handle are
|
|
- checked during the real compile phase. */
|
|
-
|
|
- else if (*ptr == '[' &&
|
|
- (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
|
|
- check_posix_syntax(ptr, &ptr, cd))
|
|
- {
|
|
- ptr++;
|
|
- class_optcount = 10; /* Make sure > 1 */
|
|
- }
|
|
-
|
|
- /* Anything else increments the possible optimization count. We have to
|
|
- detect ranges here so that we can compute the number of extra ranges for
|
|
- caseless wide characters when UCP support is available. If there are wide
|
|
- characters, we are going to have to use an XCLASS, even for single
|
|
- characters. */
|
|
-
|
|
- else
|
|
- {
|
|
- int d;
|
|
-
|
|
- GET_ONE_CHARACTER:
|
|
-
|
|
-#ifdef SUPPORT_UTF8
|
|
- if (utf8)
|
|
- {
|
|
- int extra = 0;
|
|
- GETCHARLEN(c, ptr, extra);
|
|
- ptr += extra;
|
|
- }
|
|
- else c = *ptr;
|
|
-#else
|
|
- c = *ptr;
|
|
-#endif
|
|
-
|
|
- /* Come here from handling \ above when it escapes to a char value */
|
|
-
|
|
- NON_SPECIAL_CHARACTER:
|
|
- class_optcount++;
|
|
-
|
|
- d = -1;
|
|
- if (ptr[1] == '-')
|
|
- {
|
|
- uschar const *hyptr = ptr++;
|
|
- if (ptr[1] == '\\')
|
|
- {
|
|
- ptr++;
|
|
- d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
|
|
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
|
|
- if (-d == ESC_b) d = '\b'; /* backspace */
|
|
- else if (-d == ESC_X) d = 'X'; /* literal X in a class */
|
|
- }
|
|
- else if (ptr[1] != 0 && ptr[1] != ']')
|
|
- {
|
|
- ptr++;
|
|
-#ifdef SUPPORT_UTF8
|
|
- if (utf8)
|
|
- {
|
|
- int extra = 0;
|
|
- GETCHARLEN(d, ptr, extra);
|
|
- ptr += extra;
|
|
- }
|
|
- else
|
|
-#endif
|
|
- d = *ptr;
|
|
- }
|
|
- if (d < 0) ptr = hyptr; /* go back to hyphen as data */
|
|
- }
|
|
-
|
|
- /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
|
|
- 127 for caseless matching, we will need to use an XCLASS. */
|
|
-
|
|
- if (d >= 0)
|
|
- {
|
|
- class_optcount = 10; /* Ensure > 1 */
|
|
- if (d < c)
|
|
- {
|
|
- errorcode = ERR8;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
-
|
|
-#ifdef SUPPORT_UTF8
|
|
- if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
|
|
- {
|
|
- uschar buffer[6];
|
|
- if (!class_utf8) /* Allow for XCLASS overhead */
|
|
- {
|
|
- class_utf8 = TRUE;
|
|
- length += LINK_SIZE + 2;
|
|
- }
|
|
-
|
|
-#ifdef SUPPORT_UCP
|
|
- /* If we have UCP support, find out how many extra ranges are
|
|
- needed to map the other case of characters within this range. We
|
|
- have to mimic the range optimization here, because extending the
|
|
- range upwards might push d over a boundary that makes is use
|
|
- another byte in the UTF-8 representation. */
|
|
-
|
|
- if ((options & PCRE_CASELESS) != 0)
|
|
- {
|
|
- int occ, ocd;
|
|
- int cc = c;
|
|
- int origd = d;
|
|
- while (get_othercase_range(&cc, origd, &occ, &ocd))
|
|
- {
|
|
- if (occ >= c && ocd <= d) continue; /* Skip embedded */
|
|
-
|
|
- if (occ < c && ocd >= c - 1) /* Extend the basic range */
|
|
- { /* if there is overlap, */
|
|
- c = occ; /* noting that if occ < c */
|
|
- continue; /* we can't have ocd > d */
|
|
- } /* because a subrange is */
|
|
- if (ocd > d && occ <= d + 1) /* always shorter than */
|
|
- { /* the basic range. */
|
|
- d = ocd;
|
|
- continue;
|
|
- }
|
|
-
|
|
- /* An extra item is needed */
|
|
-
|
|
- length += 1 + _pcre_ord2utf8(occ, buffer) +
|
|
- ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
|
|
- }
|
|
- }
|
|
-#endif /* SUPPORT_UCP */
|
|
-
|
|
- /* The length of the (possibly extended) range */
|
|
-
|
|
- length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
|
|
- }
|
|
-#endif /* SUPPORT_UTF8 */
|
|
-
|
|
- }
|
|
-
|
|
- /* We have a single character. There is nothing to be done unless we
|
|
- are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
|
|
- allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
|
|
- support. */
|
|
-
|
|
- else
|
|
- {
|
|
-#ifdef SUPPORT_UTF8
|
|
- if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
|
|
- {
|
|
- uschar buffer[6];
|
|
- class_optcount = 10; /* Ensure > 1 */
|
|
- if (!class_utf8) /* Allow for XCLASS overhead */
|
|
- {
|
|
- class_utf8 = TRUE;
|
|
- length += LINK_SIZE + 2;
|
|
- }
|
|
-#ifdef SUPPORT_UCP
|
|
- length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
|
|
- (1 + _pcre_ord2utf8(c, buffer));
|
|
-#else /* SUPPORT_UCP */
|
|
- length += 1 + _pcre_ord2utf8(c, buffer);
|
|
-#endif /* SUPPORT_UCP */
|
|
- }
|
|
-#endif /* SUPPORT_UTF8 */
|
|
- }
|
|
- }
|
|
- }
|
|
- while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
|
|
-
|
|
- if (*ptr == 0) /* Missing terminating ']' */
|
|
- {
|
|
- errorcode = ERR6;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
-
|
|
- /* We can optimize when there was only one optimizable character. Repeats
|
|
- for positive and negated single one-byte chars are handled by the general
|
|
- code. Here, we handle repeats for the class opcodes. */
|
|
-
|
|
- if (class_optcount == 1) length += 3; else
|
|
- {
|
|
- length += 33;
|
|
-
|
|
- /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
|
|
- we also need extra for wrapping the whole thing in a sub-pattern. */
|
|
-
|
|
- if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
|
|
- {
|
|
- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
|
|
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
|
|
- if ((min == 0 && (max == 1 || max == -1)) ||
|
|
- (min == 1 && max == -1))
|
|
- length++;
|
|
- else length += 5;
|
|
- if (ptr[1] == '+')
|
|
- {
|
|
- ptr++;
|
|
- length += 2 + 2*LINK_SIZE;
|
|
- }
|
|
- else if (ptr[1] == '?') ptr++;
|
|
- }
|
|
- }
|
|
- continue;
|
|
-
|
|
- /* Brackets may be genuine groups or special things */
|
|
-
|
|
- case '(':
|
|
- branch_newextra = 0;
|
|
- bracket_length = 1 + LINK_SIZE;
|
|
- capturing = FALSE;
|
|
-
|
|
- /* Handle special forms of bracket, which all start (? */
|
|
-
|
|
- if (ptr[1] == '?')
|
|
- {
|
|
- int set, unset;
|
|
- int *optset;
|
|
-
|
|
- switch (c = ptr[2])
|
|
- {
|
|
- /* Skip over comments entirely */
|
|
- case '#':
|
|
- ptr += 3;
|
|
- while (*ptr != 0 && *ptr != ')') ptr++;
|
|
- if (*ptr == 0)
|
|
- {
|
|
- errorcode = ERR18;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- continue;
|
|
-
|
|
- /* Non-referencing groups and lookaheads just move the pointer on, and
|
|
- then behave like a non-special bracket, except that they don't increment
|
|
- the count of extracting brackets. Ditto for the "once only" bracket,
|
|
- which is in Perl from version 5.005. */
|
|
-
|
|
- case ':':
|
|
- case '=':
|
|
- case '!':
|
|
- case '>':
|
|
- ptr += 2;
|
|
- break;
|
|
-
|
|
- /* Named subpatterns are an extension copied from Python */
|
|
-
|
|
- case 'P':
|
|
- ptr += 3;
|
|
-
|
|
- /* Handle the definition of a named subpattern */
|
|
-
|
|
- if (*ptr == '<')
|
|
- {
|
|
- const uschar *p; /* Don't amalgamate; some compilers */
|
|
- p = ++ptr; /* grumble at autoincrement in declaration */
|
|
- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
|
|
- if (*ptr != '>')
|
|
- {
|
|
- errorcode = ERR42;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- name_count++;
|
|
- if (name_count > MAX_NAME_COUNT)
|
|
- {
|
|
- errorcode = ERR49;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- if (ptr - p > max_name_size)
|
|
- {
|
|
- max_name_size = (ptr - p);
|
|
- if (max_name_size > MAX_NAME_SIZE)
|
|
- {
|
|
- errorcode = ERR48;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- }
|
|
- capturing = TRUE; /* Named parentheses are always capturing */
|
|
- break; /* Go handle capturing parentheses */
|
|
- }
|
|
-
|
|
- /* Handle back references and recursive calls to named subpatterns */
|
|
-
|
|
- if (*ptr == '=' || *ptr == '>')
|
|
- {
|
|
- length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */
|
|
- while ((cd->ctypes[*(++ptr)] & ctype_word) != 0);
|
|
- if (*ptr != ')')
|
|
- {
|
|
- errorcode = ERR42;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- goto RECURSE_CHECK_QUANTIFIED;
|
|
- }
|
|
-
|
|
- /* Unknown character after (?P */
|
|
-
|
|
- errorcode = ERR41;
|
|
- goto PCRE_ERROR_RETURN;
|
|
-
|
|
- /* (?R) specifies a recursive call to the regex, which is an extension
|
|
- to provide the facility which can be obtained by (?p{perl-code}) in
|
|
- Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
|
|
-
|
|
- From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
|
|
- the appropriate numbered brackets. This includes both recursive and
|
|
- non-recursive calls. (?R) is now synonymous with (?0). */
|
|
-
|
|
- case 'R':
|
|
- ptr++;
|
|
-
|
|
- case '0': case '1': case '2': case '3': case '4':
|
|
- case '5': case '6': case '7': case '8': case '9':
|
|
- ptr += 2;
|
|
- if (c != 'R')
|
|
- while ((digitab[*(++ptr)] & ctype_digit) != 0);
|
|
- if (*ptr != ')')
|
|
- {
|
|
- errorcode = ERR29;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */
|
|
-
|
|
- /* If this item is quantified, it will get wrapped inside brackets so
|
|
- as to use the code for quantified brackets. We jump down and use the
|
|
- code that handles this for real brackets. Come here from code for
|
|
- named recursions/subroutines. */
|
|
-
|
|
- RECURSE_CHECK_QUANTIFIED:
|
|
- if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
|
|
- {
|
|
- length += 2 + 2 * LINK_SIZE; /* to make bracketed */
|
|
- duplength = 5 + 3 * LINK_SIZE;
|
|
- goto HANDLE_QUANTIFIED_BRACKETS;
|
|
- }
|
|
- continue;
|
|
-
|
|
- /* (?C) is an extension which provides "callout" - to provide a bit of
|
|
- the functionality of the Perl (?{...}) feature. An optional number may
|
|
- follow (default is zero). */
|
|
-
|
|
- case 'C':
|
|
- ptr += 2;
|
|
- while ((digitab[*(++ptr)] & ctype_digit) != 0);
|
|
- if (*ptr != ')')
|
|
- {
|
|
- errorcode = ERR39;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- length += 2 + 2*LINK_SIZE;
|
|
- continue;
|
|
-
|
|
- /* Lookbehinds are in Perl from version 5.005 */
|
|
-
|
|
- case '<':
|
|
- ptr += 3;
|
|
- if (*ptr == '=' || *ptr == '!')
|
|
- {
|
|
- branch_newextra = 1 + LINK_SIZE;
|
|
- length += 1 + LINK_SIZE; /* For the first branch */
|
|
- break;
|
|
- }
|
|
- errorcode = ERR24;
|
|
- goto PCRE_ERROR_RETURN;
|
|
-
|
|
- /* Conditionals are in Perl from version 5.005. The bracket must either
|
|
- be followed by a number (for bracket reference) or by an assertion
|
|
- group. PCRE extends this by allowing a name to reference a named group;
|
|
- unfortunately, previously 'R' was implemented for a recursion test.
|
|
- When this is compiled, we look for the named group 'R' first. At this
|
|
- point we just do a basic syntax check. */
|
|
-
|
|
- case '(':
|
|
- if ((cd->ctypes[ptr[3]] & ctype_word) != 0)
|
|
- {
|
|
- ptr += 4;
|
|
- length += 3;
|
|
- while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
|
|
- if (*ptr != ')')
|
|
- {
|
|
- errorcode = ERR26;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- }
|
|
- else /* An assertion must follow */
|
|
- {
|
|
- ptr++; /* Can treat like ':' as far as spacing is concerned */
|
|
- if (ptr[2] != '?' ||
|
|
- (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
|
|
- {
|
|
- ptr += 2; /* To get right offset in message */
|
|
- errorcode = ERR28;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- }
|
|
- break;
|
|
-
|
|
- /* Else loop checking valid options until ) is met. Anything else is an
|
|
- error. If we are without any brackets, i.e. at top level, the settings
|
|
- act as if specified in the options, so massage the options immediately.
|
|
- This is for backward compatibility with Perl 5.004. */
|
|
-
|
|
- default:
|
|
- set = unset = 0;
|
|
- optset = &set;
|
|
- ptr += 2;
|
|
-
|
|
- for (;; ptr++)
|
|
- {
|
|
- c = *ptr;
|
|
- switch (c)
|
|
- {
|
|
- case 'i':
|
|
- *optset |= PCRE_CASELESS;
|
|
- continue;
|
|
-
|
|
- case 'J':
|
|
- *optset |= PCRE_DUPNAMES;
|
|
- options |= PCRE_JCHANGED; /* Record that it changed */
|
|
- continue;
|
|
-
|
|
- case 'm':
|
|
- *optset |= PCRE_MULTILINE;
|
|
- continue;
|
|
-
|
|
- case 's':
|
|
- *optset |= PCRE_DOTALL;
|
|
- continue;
|
|
-
|
|
- case 'x':
|
|
- *optset |= PCRE_EXTENDED;
|
|
- continue;
|
|
-
|
|
- case 'X':
|
|
- *optset |= PCRE_EXTRA;
|
|
- continue;
|
|
-
|
|
- case 'U':
|
|
- *optset |= PCRE_UNGREEDY;
|
|
- continue;
|
|
-
|
|
- case '-':
|
|
- optset = &unset;
|
|
- continue;
|
|
-
|
|
- /* A termination by ')' indicates an options-setting-only item; if
|
|
- this is at the very start of the pattern (indicated by item_count
|
|
- being zero), we use it to set the global options. This is helpful
|
|
- when analyzing the pattern for first characters, etc. Otherwise
|
|
- nothing is done here and it is handled during the compiling
|
|
- process.
|
|
-
|
|
- We allow for more than one options setting at the start. If such
|
|
- settings do not change the existing options, nothing is compiled.
|
|
- However, we must leave space just in case something is compiled.
|
|
- This can happen for pathological sequences such as (?i)(?-i)
|
|
- because the global options will end up with -i set. The space is
|
|
- small and not significant. (Before I did this there was a reported
|
|
- bug with (?i)(?-i) in a machine-generated pattern.)
|
|
-
|
|
- [Historical note: Up to Perl 5.8, options settings at top level
|
|
- were always global settings, wherever they appeared in the pattern.
|
|
- That is, they were equivalent to an external setting. From 5.8
|
|
- onwards, they apply only to what follows (which is what you might
|
|
- expect).] */
|
|
-
|
|
- case ')':
|
|
- if (item_count == 0)
|
|
- {
|
|
- options = (options | set) & (~unset);
|
|
- set = unset = 0; /* To save length */
|
|
- item_count--; /* To allow for several */
|
|
- length += 2;
|
|
- }
|
|
-
|
|
- /* Fall through */
|
|
-
|
|
- /* A termination by ':' indicates the start of a nested group with
|
|
- the given options set. This is again handled at compile time, but
|
|
- we must allow for compiled space if any of the ims options are
|
|
- set. We also have to allow for resetting space at the end of
|
|
- the group, which is why 4 is added to the length and not just 2.
|
|
- If there are several changes of options within the same group, this
|
|
- will lead to an over-estimate on the length, but this shouldn't
|
|
- matter very much. We also have to allow for resetting options at
|
|
- the start of any alternations, which we do by setting
|
|
- branch_newextra to 2. */
|
|
-
|
|
- case ':':
|
|
- if (((set|unset) & PCRE_IMS) != 0)
|
|
- {
|
|
- length += 4;
|
|
- branch_newextra = 2;
|
|
- }
|
|
- goto END_OPTIONS;
|
|
-
|
|
- /* Unrecognized option character */
|
|
-
|
|
- default:
|
|
- errorcode = ERR12;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- }
|
|
-
|
|
- /* If we hit a closing bracket, that's it - this is a freestanding
|
|
- option-setting. We need to ensure that branch_extra is updated if
|
|
- necessary. The only values branch_newextra can have here are 0 or 2.
|
|
- If the value is 2, then branch_extra must either be 2 or 5, depending
|
|
- on whether this is a lookbehind group or not. */
|
|
-
|
|
- END_OPTIONS:
|
|
- if (c == ')')
|
|
- {
|
|
- if (branch_newextra == 2 &&
|
|
- (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
|
|
- branch_extra += branch_newextra;
|
|
- continue;
|
|
- }
|
|
-
|
|
- /* If options were terminated by ':' control comes here. This is a
|
|
- non-capturing group with an options change. There is nothing more that
|
|
- needs to be done because "capturing" is already set FALSE by default;
|
|
- we can just fall through. */
|
|
-
|
|
- }
|
|
- }
|
|
-
|
|
- /* Ordinary parentheses, not followed by '?', are capturing unless
|
|
- PCRE_NO_AUTO_CAPTURE is set. */
|
|
-
|
|
- else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
|
|
-
|
|
- /* Capturing brackets must be counted so we can process escapes in a
|
|
- Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
|
|
- an additional 3 bytes of memory per capturing bracket. */
|
|
-
|
|
- if (capturing)
|
|
- {
|
|
- bracount++;
|
|
- if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
|
|
- }
|
|
-
|
|
- /* Save length for computing whole length at end if there's a repeat that
|
|
- requires duplication of the group. Also save the current value of
|
|
- branch_extra, and start the new group with the new value. If non-zero, this
|
|
- will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
|
|
-
|
|
- if (brastackptr >= sizeof(brastack)/sizeof(int))
|
|
- {
|
|
- errorcode = ERR19;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
-
|
|
- bralenstack[brastackptr] = branch_extra;
|
|
- branch_extra = branch_newextra;
|
|
-
|
|
- brastack[brastackptr++] = length;
|
|
- length += bracket_length;
|
|
- continue;
|
|
-
|
|
- /* Handle ket. Look for subsequent max/min; for certain sets of values we
|
|
- have to replicate this bracket up to that many times. If brastackptr is
|
|
- 0 this is an unmatched bracket which will generate an error, but take care
|
|
- not to try to access brastack[-1] when computing the length and restoring
|
|
- the branch_extra value. */
|
|
-
|
|
- case ')':
|
|
- length += 1 + LINK_SIZE;
|
|
- if (brastackptr > 0)
|
|
- {
|
|
- duplength = length - brastack[--brastackptr];
|
|
- branch_extra = bralenstack[brastackptr];
|
|
- /* This is a paranoid check to stop integer overflow later on */
|
|
- if (duplength > MAX_DUPLENGTH)
|
|
- {
|
|
- errorcode = ERR50;
|
|
- goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- }
|
|
- else duplength = 0;
|
|
-
|
|
- /* The following code is also used when a recursion such as (?3) is
|
|
- followed by a quantifier, because in that case, it has to be wrapped inside
|
|
- brackets so that the quantifier works. The value of duplength must be
|
|
- set before arrival. */
|
|
-
|
|
- HANDLE_QUANTIFIED_BRACKETS:
|
|
-
|
|
- /* Leave ptr at the final char; for read_repeat_counts this happens
|
|
- automatically; for the others we need an increment. */
|
|
-
|
|
- if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
|
|
- {
|
|
- ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
|
|
- if (errorcode != 0) goto PCRE_ERROR_RETURN;
|
|
- }
|
|
- else if (c == '*') { min = 0; max = -1; ptr++; }
|
|
- else if (c == '+') { min = 1; max = -1; ptr++; }
|
|
- else if (c == '?') { min = 0; max = 1; ptr++; }
|
|
- else { min = 1; max = 1; }
|
|
-
|
|
- /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
|
|
- group, and if the maximum is greater than zero, we have to replicate
|
|
- maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
|
|
- bracket set. */
|
|
-
|
|
- if (min == 0)
|
|
- {
|
|
- length++;
|
|
- if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
|
|
- }
|
|
-
|
|
- /* When the minimum is greater than zero, we have to replicate up to
|
|
- minval-1 times, with no additions required in the copies. Then, if there
|
|
- is a limited maximum we have to replicate up to maxval-1 times allowing
|
|
- for a BRAZERO item before each optional copy and nesting brackets for all
|
|
- but one of the optional copies. */
|
|
-
|
|
- else
|
|
- {
|
|
- length += (min - 1) * duplength;
|
|
- if (max > min) /* Need this test as max=-1 means no limit */
|
|
- length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
|
|
- - (2 + 2*LINK_SIZE);
|
|
- }
|
|
-
|
|
- /* Allow space for once brackets for "possessive quantifier" */
|
|
-
|
|
- if (ptr[1] == '+')
|
|
- {
|
|
- ptr++;
|
|
- length += 2 + 2*LINK_SIZE;
|
|
- }
|
|
- continue;
|
|
-
|
|
- /* Non-special character. It won't be space or # in extended mode, so it is
|
|
- always a genuine character. If we are in a \Q...\E sequence, check for the
|
|
- end; if not, we have a literal. */
|
|
-
|
|
- default:
|
|
- NORMAL_CHAR:
|
|
-
|
|
- if (inescq && c == '\\' && ptr[1] == 'E')
|
|
- {
|
|
- inescq = FALSE;
|
|
- ptr++;
|
|
- continue;
|
|
- }
|
|
-
|
|
- length += 2; /* For a one-byte character */
|
|
- lastitemlength = 1; /* Default length of last item for repeats */
|
|
-
|
|
- /* In UTF-8 mode, check for additional bytes. */
|
|
-
|
|
-#ifdef SUPPORT_UTF8
|
|
- if (utf8 && (c & 0xc0) == 0xc0)
|
|
- {
|
|
- while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
|
|
- { /* because the end is marked */
|
|
- lastitemlength++; /* by a zero byte. */
|
|
- length++;
|
|
- ptr++;
|
|
- }
|
|
- }
|
|
-#endif
|
|
-
|
|
- continue;
|
|
- }
|
|
- }
|
|
-
|
|
-length += 2 + LINK_SIZE; /* For final KET and END */
|
|
-
|
|
-if ((options & PCRE_AUTO_CALLOUT) != 0)
|
|
- length += 2 + 2*LINK_SIZE; /* For final callout */
|
|
-
|
|
-if (length > MAX_PATTERN_SIZE)
|
|
- {
|
|
- errorcode = ERR20;
|
|
- goto PCRE_EARLY_ERROR_RETURN;
|
|
- }
|
|
-
|
|
-/* Compute the size of data block needed and get it, either from malloc or
|
|
-externally provided function. Integer overflow should no longer be possible
|
|
-because nowadays we limit the maximum value of name_count and max_name size. */
|
|
-
|
|
-size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
|
|
-re = (real_pcre *)(pcre_malloc)(size);
|
|
-
|
|
-if (re == NULL)
|
|
- {
|
|
- errorcode = ERR21;
|
|
- goto PCRE_EARLY_ERROR_RETURN;
|
|
- }
|
|
-
|
|
-/* Put in the magic number, and save the sizes, options, and character table
|
|
-pointer. NULL is used for the default character tables. The nullpad field is at
|
|
-the end; it's there to help in the case when a regex compiled on a system with
|
|
-4-byte pointers is run on another with 8-byte pointers. */
|
|
-
|
|
-re->magic_number = MAGIC_NUMBER;
|
|
-re->size = size;
|
|
-re->options = options;
|
|
-re->dummy1 = 0;
|
|
-re->name_table_offset = sizeof(real_pcre);
|
|
-re->name_entry_size = max_name_size + 3;
|
|
-re->name_count = name_count;
|
|
-re->ref_count = 0;
|
|
-re->tables = (tables == _pcre_default_tables)? NULL : tables;
|
|
-re->nullpad = NULL;
|
|
-
|
|
-/* The starting points of the name/number translation table and of the code are
|
|
-passed around in the compile data block. */
|
|
-
|
|
-cd->names_found = 0;
|
|
-cd->name_entry_size = max_name_size + 3;
|
|
-cd->name_table = (uschar *)re + re->name_table_offset;
|
|
-codestart = cd->name_table + re->name_entry_size * re->name_count;
|
|
-cd->start_code = codestart;
|
|
-cd->start_pattern = (const uschar *)pattern;
|
|
-cd->req_varyopt = 0;
|
|
-cd->nopartial = FALSE;
|
|
-
|
|
-/* Set up a starting, non-extracting bracket, then compile the expression. On
|
|
-error, errorcode will be set non-zero, so we don't need to look at the result
|
|
-of the function here. */
|
|
-
|
|
-ptr = (const uschar *)pattern;
|
|
-code = (uschar *)codestart;
|
|
-*code = OP_BRA;
|
|
-bracount = 0;
|
|
-(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
|
|
- &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd);
|
|
-re->top_bracket = bracount;
|
|
-re->top_backref = cd->top_backref;
|
|
-
|
|
-if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
|
|
-
|
|
-/* If not reached end of pattern on success, there's an excess bracket. */
|
|
-
|
|
-if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
|
|
-
|
|
-/* Fill in the terminating state and check for disastrous overflow, but
|
|
-if debugging, leave the test till after things are printed out. */
|
|
-
|
|
-*code++ = OP_END;
|
|
-
|
|
-#ifndef DEBUG
|
|
-if (code - codestart > length) errorcode = ERR23;
|
|
-#endif
|
|
-
|
|
-/* Give an error if there's back reference to a non-existent capturing
|
|
-subpattern. */
|
|
-
|
|
-if (re->top_backref > re->top_bracket) errorcode = ERR15;
|
|
+if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
|
|
|
|
/* Failed to compile, or error while post-processing */
|
|
|
|
if (errorcode != 0)
|
|
{
|
|
(pcre_free)(re);
|
|
- PCRE_ERROR_RETURN:
|
|
- *erroroffset = ptr - (const uschar *)pattern;
|
|
PCRE_EARLY_ERROR_RETURN:
|
|
+ *erroroffset = ptr - (const uschar *)pattern;
|
|
+#ifdef SUPPORT_UTF8
|
|
+ PCRE_UTF8_ERROR_RETURN:
|
|
+#endif
|
|
*errorptr = error_texts[errorcode];
|
|
if (errorcodeptr != NULL) *errorcodeptr = errorcode;
|
|
return NULL;
|
|
@@ -5180,15 +5333,15 @@
|
|
the pattern is anchored by virtue of ^ characters or \A or anything else (such
|
|
as starting with .* when DOTALL is set).
|
|
|
|
-Otherwise, if we know what the first character has to be, save it, because that
|
|
+Otherwise, if we know what the first byte has to be, save it, because that
|
|
speeds up unanchored matches no end. If not, see if we can set the
|
|
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
|
|
start with ^. and also when all branches start with .* for non-DOTALL matches.
|
|
*/
|
|
|
|
-if ((options & PCRE_ANCHORED) == 0)
|
|
+if ((re->options & PCRE_ANCHORED) == 0)
|
|
{
|
|
- int temp_options = options;
|
|
+ int temp_options = re->options; /* May get changed during these scans */
|
|
if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
|
|
re->options |= PCRE_ANCHORED;
|
|
else
|
|
@@ -5273,7 +5426,7 @@
|
|
if (errorcodeptr != NULL) *errorcodeptr = ERR23;
|
|
return NULL;
|
|
}
|
|
-#endif
|
|
+#endif /* DEBUG */
|
|
|
|
return (pcre *)re;
|
|
}
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_exec.c ./pcrelib/pcre_exec.c
|
|
--- ../pcre.orig/pcrelib/pcre_exec.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_exec.c Fri Feb 9 22:31:19 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -42,25 +42,22 @@
|
|
pattern matching using an NFA algorithm, trying to mimic Perl as closely as
|
|
possible. There are also some static supporting functions. */
|
|
|
|
-#define NLBLOCK md /* The block containing newline information */
|
|
+#define NLBLOCK md /* Block containing newline information */
|
|
+#define PSSTART start_subject /* Field containing processed string start */
|
|
+#define PSEND end_subject /* Field containing processed string end */
|
|
+
|
|
#include "pcre_internal.h"
|
|
|
|
+/* The chain of eptrblocks for tail recursions uses memory in stack workspace,
|
|
+obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
|
|
|
|
-/* Structure for building a chain of data that actually lives on the
|
|
-stack, for holding the values of the subject pointer at the start of each
|
|
-subpattern, so as to detect when an empty string has been matched by a
|
|
-subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
|
|
-are on the heap, not on the stack. */
|
|
-
|
|
-typedef struct eptrblock {
|
|
- struct eptrblock *epb_prev;
|
|
- USPTR epb_saved_eptr;
|
|
-} eptrblock;
|
|
+#define EPTR_WORK_SIZE (1000)
|
|
|
|
/* Flag bits for the match() function */
|
|
|
|
-#define match_condassert 0x01 /* Called to check a condition assertion */
|
|
-#define match_isgroup 0x02 /* Set if start of bracketed group */
|
|
+#define match_condassert 0x01 /* Called to check a condition assertion */
|
|
+#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
|
|
+#define match_tail_recursed 0x04 /* Tail recursive call */
|
|
|
|
/* Non-error returns from the match() function. Error returns are externally
|
|
defined PCRE_ERROR_xxx codes, which are all negative. */
|
|
@@ -101,7 +98,7 @@
|
|
static void
|
|
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
|
|
{
|
|
-int c;
|
|
+unsigned int c;
|
|
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
|
|
while (length-- > 0)
|
|
if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
|
|
@@ -291,7 +288,6 @@
|
|
|
|
BOOL Xcur_is_word;
|
|
BOOL Xcondition;
|
|
- BOOL Xminimize;
|
|
BOOL Xprev_is_word;
|
|
|
|
unsigned long int Xoriginal_ims;
|
|
@@ -303,11 +299,10 @@
|
|
int Xprop_category;
|
|
int Xprop_chartype;
|
|
int Xprop_script;
|
|
- int *Xprop_test_variable;
|
|
#endif
|
|
|
|
int Xctype;
|
|
- int Xfc;
|
|
+ unsigned int Xfc;
|
|
int Xfi;
|
|
int Xlength;
|
|
int Xmax;
|
|
@@ -340,10 +335,7 @@
|
|
* Match from current position *
|
|
*************************************************/
|
|
|
|
-/* On entry ecode points to the first opcode, and eptr to the first character
|
|
-in the subject string, while eptrb holds the value of eptr at the start of the
|
|
-last bracketed group - used for breaking infinite loops matching zero-length
|
|
-strings. This function is called recursively in many circumstances. Whenever it
|
|
+/* This function is called recursively in many circumstances. Whenever it
|
|
returns a negative (error) response, the outer incarnation must also return the
|
|
same response.
|
|
|
|
@@ -353,8 +345,8 @@
|
|
made performance worse.
|
|
|
|
Arguments:
|
|
- eptr pointer in subject
|
|
- ecode position in code
|
|
+ eptr pointer to current character in subject
|
|
+ ecode pointer to current position in compiled code
|
|
offset_top current top pointer
|
|
md pointer to "static" info for the match
|
|
ims current /i, /m, and /s options
|
|
@@ -362,7 +354,9 @@
|
|
brackets - for testing for empty matches
|
|
flags can contain
|
|
match_condassert - this is an assertion condition
|
|
- match_isgroup - this is the start of a bracketed group
|
|
+ match_cbegroup - this is the start of an unlimited repeat
|
|
+ group that can match an empty string
|
|
+ match_tail_recursed - this is a tail_recursed group
|
|
rdepth the recursion depth
|
|
|
|
Returns: MATCH_MATCH if matched ) these values are >= 0
|
|
@@ -377,14 +371,16 @@
|
|
int flags, unsigned int rdepth)
|
|
{
|
|
/* These variables do not need to be preserved over recursion in this function,
|
|
-so they can be ordinary variables in all cases. Mark them with "register"
|
|
-because they are used a lot in loops. */
|
|
+so they can be ordinary variables in all cases. Mark some of them with
|
|
+"register" because they are used a lot in loops. */
|
|
|
|
register int rrc; /* Returns from recursive calls */
|
|
register int i; /* Used for loops not involving calls to RMATCH() */
|
|
-register unsigned int c; /* Character values not kept over RMATCH() calls */
|
|
+register unsigned int c; /* Character values not kept over RMATCH() calls */
|
|
register BOOL utf8; /* Local copy of UTF-8 flag for speed */
|
|
|
|
+BOOL minimize, possessive; /* Quantifier options */
|
|
+
|
|
/* When recursion is not being used, all "local" variables that have to be
|
|
preserved over calls to RMATCH() are part of a "frame" which is obtained from
|
|
heap storage. Set up the top-level frame here; others are obtained from the
|
|
@@ -434,7 +430,6 @@
|
|
|
|
#define cur_is_word frame->Xcur_is_word
|
|
#define condition frame->Xcondition
|
|
-#define minimize frame->Xminimize
|
|
#define prev_is_word frame->Xprev_is_word
|
|
|
|
#define original_ims frame->Xoriginal_ims
|
|
@@ -446,7 +441,6 @@
|
|
#define prop_category frame->Xprop_category
|
|
#define prop_chartype frame->Xprop_chartype
|
|
#define prop_script frame->Xprop_script
|
|
-#define prop_test_variable frame->Xprop_test_variable
|
|
#endif
|
|
|
|
#define ctype frame->Xctype
|
|
@@ -470,7 +464,7 @@
|
|
get preserved during recursion in the normal way. In this environment, fi and
|
|
i, and fc and c, can be the same variables. */
|
|
|
|
-#else
|
|
+#else /* NO_RECURSE not defined */
|
|
#define fi i
|
|
#define fc c
|
|
|
|
@@ -489,7 +483,6 @@
|
|
/* that do not have to be preserved over */
|
|
BOOL cur_is_word; /* a recursive call to RMATCH(). */
|
|
BOOL condition;
|
|
-BOOL minimize;
|
|
BOOL prev_is_word;
|
|
|
|
unsigned long int original_ims;
|
|
@@ -501,7 +494,6 @@
|
|
int prop_category;
|
|
int prop_chartype;
|
|
int prop_script;
|
|
-int *prop_test_variable;
|
|
#endif
|
|
|
|
int ctype;
|
|
@@ -516,7 +508,7 @@
|
|
int stacksave[REC_STACK_SAVE_MAX];
|
|
|
|
eptrblock newptrb;
|
|
-#endif
|
|
+#endif /* NO_RECURSE */
|
|
|
|
/* These statements are here to stop the compiler complaining about unitialized
|
|
variables. */
|
|
@@ -524,9 +516,9 @@
|
|
#ifdef SUPPORT_UCP
|
|
prop_value = 0;
|
|
prop_fail_result = 0;
|
|
-prop_test_variable = NULL;
|
|
#endif
|
|
|
|
+
|
|
/* This label is used for tail recursion, which is used in a few cases even
|
|
when NO_RECURSE is not defined, in order to reduce the amount of stack that is
|
|
used. Thanks to Ian Taylor for noticing this possibility and sending the
|
|
@@ -556,24 +548,34 @@
|
|
utf8 = FALSE;
|
|
#endif
|
|
|
|
-/* At the start of a bracketed group, add the current subject pointer to the
|
|
-stack of such pointers, to be re-instated at the end of the group when we hit
|
|
-the closing ket. When match() is called in other circumstances, we don't add to
|
|
-this stack. */
|
|
+/* At the start of a group with an unlimited repeat that may match an empty
|
|
+string, the match_cbegroup flag is set. When this is the case, add the current
|
|
+subject pointer to the chain of such remembered pointers, to be checked when we
|
|
+hit the closing ket, in order to break infinite loops that match no characters.
|
|
+When match() is called in other circumstances, don't add to the chain. If this
|
|
+is a tail recursion, use a block from the workspace, as the one on the stack is
|
|
+already used. */
|
|
|
|
-if ((flags & match_isgroup) != 0)
|
|
+if ((flags & match_cbegroup) != 0)
|
|
{
|
|
- newptrb.epb_prev = eptrb;
|
|
- newptrb.epb_saved_eptr = eptr;
|
|
- eptrb = &newptrb;
|
|
+ eptrblock *p;
|
|
+ if ((flags & match_tail_recursed) != 0)
|
|
+ {
|
|
+ if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
|
|
+ p = md->eptrchain + md->eptrn++;
|
|
+ }
|
|
+ else p = &newptrb;
|
|
+ p->epb_saved_eptr = eptr;
|
|
+ p->epb_prev = eptrb;
|
|
+ eptrb = p;
|
|
}
|
|
|
|
-/* Now start processing the operations. */
|
|
+/* Now start processing the opcodes. */
|
|
|
|
for (;;)
|
|
{
|
|
+ minimize = possessive = FALSE;
|
|
op = *ecode;
|
|
- minimize = FALSE;
|
|
|
|
/* For partial matching, remember if we ever hit the end of the subject after
|
|
matching at least one subject character. */
|
|
@@ -583,33 +585,30 @@
|
|
eptr > md->start_match)
|
|
md->hitend = TRUE;
|
|
|
|
- /* Opening capturing bracket. If there is space in the offset vector, save
|
|
- the current subject position in the working slot at the top of the vector. We
|
|
- mustn't change the current values of the data slot, because they may be set
|
|
- from a previous iteration of this group, and be referred to by a reference
|
|
- inside the group.
|
|
-
|
|
- If the bracket fails to match, we need to restore this value and also the
|
|
- values of the final offsets, in case they were set by a previous iteration of
|
|
- the same bracket.
|
|
-
|
|
- If there isn't enough space in the offset vector, treat this as if it were a
|
|
- non-capturing bracket. Don't worry about setting the flag for the error case
|
|
- here; that is handled in the code for KET. */
|
|
-
|
|
- if (op > OP_BRA)
|
|
+ switch(op)
|
|
{
|
|
- number = op - OP_BRA;
|
|
-
|
|
- /* For extended extraction brackets (large number), we have to fish out the
|
|
- number from a dummy opcode at the start. */
|
|
-
|
|
- if (number > EXTRACT_BASIC_MAX)
|
|
- number = GET2(ecode, 2+LINK_SIZE);
|
|
+ /* Handle a capturing bracket. If there is space in the offset vector, save
|
|
+ the current subject position in the working slot at the top of the vector.
|
|
+ We mustn't change the current values of the data slot, because they may be
|
|
+ set from a previous iteration of this group, and be referred to by a
|
|
+ reference inside the group.
|
|
+
|
|
+ If the bracket fails to match, we need to restore this value and also the
|
|
+ values of the final offsets, in case they were set by a previous iteration
|
|
+ of the same bracket.
|
|
+
|
|
+ If there isn't enough space in the offset vector, treat this as if it were
|
|
+ a non-capturing bracket. Don't worry about setting the flag for the error
|
|
+ case here; that is handled in the code for KET. */
|
|
+
|
|
+ case OP_CBRA:
|
|
+ case OP_SCBRA:
|
|
+ number = GET2(ecode, 1+LINK_SIZE);
|
|
offset = number << 1;
|
|
|
|
#ifdef DEBUG
|
|
- printf("start bracket %d subject=", number);
|
|
+ printf("start bracket %d\n", number);
|
|
+ printf("subject=");
|
|
pchars(eptr, 16, TRUE, md);
|
|
printf("\n");
|
|
#endif
|
|
@@ -624,10 +623,11 @@
|
|
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
|
|
md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
|
|
|
|
+ flags = (op == OP_SCBRA)? match_cbegroup : 0;
|
|
do
|
|
{
|
|
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
|
|
- match_isgroup);
|
|
+ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
|
|
+ ims, eptrb, flags);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
md->capture_last = save_capture_last;
|
|
ecode += GET(ecode, 1);
|
|
@@ -643,39 +643,35 @@
|
|
RRETURN(MATCH_NOMATCH);
|
|
}
|
|
|
|
- /* Insufficient room for saving captured contents */
|
|
-
|
|
- else op = OP_BRA;
|
|
- }
|
|
-
|
|
- /* Other types of node can be handled by a switch */
|
|
+ /* Insufficient room for saving captured contents. Treat as a non-capturing
|
|
+ bracket. */
|
|
|
|
- switch(op)
|
|
- {
|
|
- case OP_BRA: /* Non-capturing bracket: optimized */
|
|
- DPRINTF(("start bracket 0\n"));
|
|
-
|
|
- /* Loop for all the alternatives */
|
|
+ DPRINTF(("insufficient capture room: treat as non-capturing\n"));
|
|
|
|
+ /* Non-capturing bracket. Loop for all the alternatives. When we get to the
|
|
+ final alternative within the brackets, we would return the result of a
|
|
+ recursive call to match() whatever happened. We can reduce stack usage by
|
|
+ turning this into a tail recursion. */
|
|
+
|
|
+ case OP_BRA:
|
|
+ case OP_SBRA:
|
|
+ DPRINTF(("start non-capturing bracket\n"));
|
|
+ flags = (op >= OP_SBRA)? match_cbegroup : 0;
|
|
for (;;)
|
|
{
|
|
- /* When we get to the final alternative within the brackets, we would
|
|
- return the result of a recursive call to match() whatever happened. We
|
|
- can reduce stack usage by turning this into a tail recursion. */
|
|
-
|
|
if (ecode[GET(ecode, 1)] != OP_ALT)
|
|
- {
|
|
- ecode += 1 + LINK_SIZE;
|
|
- flags = match_isgroup;
|
|
- DPRINTF(("bracket 0 tail recursion\n"));
|
|
- goto TAIL_RECURSE;
|
|
- }
|
|
+ {
|
|
+ ecode += _pcre_OP_lengths[*ecode];
|
|
+ flags |= match_tail_recursed;
|
|
+ DPRINTF(("bracket 0 tail recursion\n"));
|
|
+ goto TAIL_RECURSE;
|
|
+ }
|
|
|
|
/* For non-final alternatives, continue the loop for a NOMATCH result;
|
|
otherwise return. */
|
|
|
|
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
|
|
- match_isgroup);
|
|
+ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
|
|
+ eptrb, flags);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode += GET(ecode, 1);
|
|
}
|
|
@@ -688,54 +684,72 @@
|
|
obeyed, we can use tail recursion to avoid using another stack frame. */
|
|
|
|
case OP_COND:
|
|
- if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
|
|
+ case OP_SCOND:
|
|
+ if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
|
|
+ {
|
|
+ offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
|
|
+ condition = md->recursive != NULL &&
|
|
+ (offset == RREF_ANY || offset == md->recursive->group_num);
|
|
+ ecode += condition? 3 : GET(ecode, 1);
|
|
+ }
|
|
+
|
|
+ else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
|
|
{
|
|
offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
|
|
- condition = (offset == CREF_RECURSE * 2)?
|
|
- (md->recursive != NULL) :
|
|
- (offset < offset_top && md->offset_vector[offset] >= 0);
|
|
- ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));
|
|
- flags = match_isgroup;
|
|
- goto TAIL_RECURSE;
|
|
+ condition = offset < offset_top && md->offset_vector[offset] >= 0;
|
|
+ ecode += condition? 3 : GET(ecode, 1);
|
|
+ }
|
|
+
|
|
+ else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
|
|
+ {
|
|
+ condition = FALSE;
|
|
+ ecode += GET(ecode, 1);
|
|
}
|
|
|
|
/* The condition is an assertion. Call match() to evaluate it - setting
|
|
- the final argument TRUE causes it to stop at the end of an assertion. */
|
|
+ the final argument match_condassert causes it to stop at the end of an
|
|
+ assertion. */
|
|
|
|
else
|
|
{
|
|
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
|
|
- match_condassert | match_isgroup);
|
|
+ match_condassert);
|
|
if (rrc == MATCH_MATCH)
|
|
{
|
|
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
|
|
+ condition = TRUE;
|
|
+ ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
|
|
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
|
|
}
|
|
else if (rrc != MATCH_NOMATCH)
|
|
{
|
|
RRETURN(rrc); /* Need braces because of following else */
|
|
}
|
|
- else ecode += GET(ecode, 1);
|
|
+ else
|
|
+ {
|
|
+ condition = FALSE;
|
|
+ ecode += GET(ecode, 1);
|
|
+ }
|
|
+ }
|
|
|
|
- /* We are now at the branch that is to be obeyed. As there is only one,
|
|
- we can use tail recursion to avoid using another stack frame. */
|
|
+ /* We are now at the branch that is to be obeyed. As there is only one,
|
|
+ we can use tail recursion to avoid using another stack frame. If the second
|
|
+ alternative doesn't exist, we can just plough on. */
|
|
|
|
+ if (condition || *ecode == OP_ALT)
|
|
+ {
|
|
ecode += 1 + LINK_SIZE;
|
|
- flags = match_isgroup;
|
|
+ flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
|
|
goto TAIL_RECURSE;
|
|
}
|
|
- /* Control never reaches here */
|
|
-
|
|
- /* Skip over conditional reference or large extraction number data if
|
|
- encountered. */
|
|
-
|
|
- case OP_CREF:
|
|
- case OP_BRANUMBER:
|
|
- ecode += 3;
|
|
+ else
|
|
+ {
|
|
+ ecode += 1 + LINK_SIZE;
|
|
+ }
|
|
break;
|
|
|
|
- /* End of the pattern. If we are in a recursion, we should restore the
|
|
- offsets appropriately and continue from after the call. */
|
|
+
|
|
+ /* End of the pattern. If we are in a top-level recursion, we should
|
|
+ restore the offsets appropriately and continue from after the call. */
|
|
|
|
case OP_END:
|
|
if (md->recursive != NULL && md->recursive->group_num == 0)
|
|
@@ -777,8 +791,7 @@
|
|
case OP_ASSERTBACK:
|
|
do
|
|
{
|
|
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
|
|
- match_isgroup);
|
|
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
|
|
if (rrc == MATCH_MATCH) break;
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode += GET(ecode, 1);
|
|
@@ -804,8 +817,7 @@
|
|
case OP_ASSERTBACK_NOT:
|
|
do
|
|
{
|
|
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
|
|
- match_isgroup);
|
|
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
|
|
if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode += GET(ecode,1);
|
|
@@ -826,8 +838,8 @@
|
|
#ifdef SUPPORT_UTF8
|
|
if (utf8)
|
|
{
|
|
- c = GET(ecode,1);
|
|
- for (i = 0; i < c; i++)
|
|
+ i = GET(ecode, 1);
|
|
+ while (i-- > 0)
|
|
{
|
|
eptr--;
|
|
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
|
|
@@ -840,7 +852,7 @@
|
|
/* No UTF-8 support, or not in UTF-8 mode: count is byte count */
|
|
|
|
{
|
|
- eptr -= GET(ecode,1);
|
|
+ eptr -= GET(ecode, 1);
|
|
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
|
|
}
|
|
|
|
@@ -897,13 +909,8 @@
|
|
case OP_RECURSE:
|
|
{
|
|
callpat = md->start_code + GET(ecode, 1);
|
|
- new_recursive.group_num = *callpat - OP_BRA;
|
|
-
|
|
- /* For extended extraction brackets (large number), we have to fish out
|
|
- the number from a dummy opcode at the start. */
|
|
-
|
|
- if (new_recursive.group_num > EXTRACT_BASIC_MAX)
|
|
- new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
|
|
+ new_recursive.group_num = (callpat == md->start_code)? 0 :
|
|
+ GET2(callpat, 1 + LINK_SIZE);
|
|
|
|
/* Add to "recursing stack" */
|
|
|
|
@@ -936,10 +943,11 @@
|
|
restore the offset and recursion data. */
|
|
|
|
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
|
|
+ flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
|
|
do
|
|
{
|
|
- RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
|
|
- eptrb, match_isgroup);
|
|
+ RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
|
|
+ md, ims, eptrb, flags);
|
|
if (rrc == MATCH_MATCH)
|
|
{
|
|
DPRINTF(("Recursion matched\n"));
|
|
@@ -983,7 +991,7 @@
|
|
do
|
|
{
|
|
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
|
|
- eptrb, match_isgroup);
|
|
+ eptrb, 0);
|
|
if (rrc == MATCH_MATCH) break;
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode += GET(ecode,1);
|
|
@@ -997,7 +1005,7 @@
|
|
/* Continue as from after the assertion, updating the offsets high water
|
|
mark, since extracts may have been taken. */
|
|
|
|
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
|
|
+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
|
|
|
|
offset_top = md->end_offset_top;
|
|
eptr = md->end_match_ptr;
|
|
@@ -1031,15 +1039,15 @@
|
|
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode = prev;
|
|
- flags = match_isgroup;
|
|
+ flags = match_tail_recursed;
|
|
goto TAIL_RECURSE;
|
|
}
|
|
else /* OP_KETRMAX */
|
|
{
|
|
- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
|
|
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode += 1 + LINK_SIZE;
|
|
- flags = 0;
|
|
+ flags = match_tail_recursed;
|
|
goto TAIL_RECURSE;
|
|
}
|
|
/* Control never gets here */
|
|
@@ -1060,38 +1068,44 @@
|
|
case OP_BRAZERO:
|
|
{
|
|
next = ecode+1;
|
|
- RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
|
|
+ RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
do next += GET(next,1); while (*next == OP_ALT);
|
|
- ecode = next + 1+LINK_SIZE;
|
|
+ ecode = next + 1 + LINK_SIZE;
|
|
}
|
|
break;
|
|
|
|
case OP_BRAMINZERO:
|
|
{
|
|
next = ecode+1;
|
|
- do next += GET(next,1); while (*next == OP_ALT);
|
|
- RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
|
|
- match_isgroup);
|
|
+ do next += GET(next, 1); while (*next == OP_ALT);
|
|
+ RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode++;
|
|
}
|
|
break;
|
|
|
|
- /* End of a group, repeated or non-repeating. If we are at the end of
|
|
- an assertion "group", stop matching and return MATCH_MATCH, but record the
|
|
- current high water mark for use by positive assertions. Do this also
|
|
- for the "once" (not-backup up) groups. */
|
|
+ /* End of a group, repeated or non-repeating. */
|
|
|
|
case OP_KET:
|
|
case OP_KETRMIN:
|
|
case OP_KETRMAX:
|
|
prev = ecode - GET(ecode, 1);
|
|
- saved_eptr = eptrb->epb_saved_eptr;
|
|
|
|
- /* Back up the stack of bracket start pointers. */
|
|
+ /* If this was a group that remembered the subject start, in order to break
|
|
+ infinite repeats of empty string matches, retrieve the subject start from
|
|
+ the chain. Otherwise, set it NULL. */
|
|
+
|
|
+ if (*prev >= OP_SBRA)
|
|
+ {
|
|
+ saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
|
|
+ eptrb = eptrb->epb_prev; /* Backup to previous group */
|
|
+ }
|
|
+ else saved_eptr = NULL;
|
|
|
|
- eptrb = eptrb->epb_prev;
|
|
+ /* If we are at the end of an assertion group, stop matching and return
|
|
+ MATCH_MATCH, but record the current high water mark for use by positive
|
|
+ assertions. Do this also for the "once" (atomic) groups. */
|
|
|
|
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
|
|
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
|
|
@@ -1102,18 +1116,15 @@
|
|
RRETURN(MATCH_MATCH);
|
|
}
|
|
|
|
- /* In all other cases except a conditional group we have to check the
|
|
- group number back at the start and if necessary complete handling an
|
|
- extraction by setting the offsets and bumping the high water mark. */
|
|
+ /* For capturing groups we have to check the group number back at the start
|
|
+ and if necessary complete handling an extraction by setting the offsets and
|
|
+ bumping the high water mark. Note that whole-pattern recursion is coded as
|
|
+ a recurse into group 0, so it won't be picked up here. Instead, we catch it
|
|
+ when the OP_END is reached. Other recursion is handled here. */
|
|
|
|
- if (*prev != OP_COND)
|
|
+ if (*prev == OP_CBRA || *prev == OP_SCBRA)
|
|
{
|
|
- number = *prev - OP_BRA;
|
|
-
|
|
- /* For extended extraction brackets (large number), we have to fish out
|
|
- the number from a dummy opcode at the start. */
|
|
-
|
|
- if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
|
|
+ number = GET2(prev, 1+LINK_SIZE);
|
|
offset = number << 1;
|
|
|
|
#ifdef DEBUG
|
|
@@ -1121,42 +1132,34 @@
|
|
printf("\n");
|
|
#endif
|
|
|
|
- /* Test for a numbered group. This includes groups called as a result
|
|
- of recursion. Note that whole-pattern recursion is coded as a recurse
|
|
- into group 0, so it won't be picked up here. Instead, we catch it when
|
|
- the OP_END is reached. */
|
|
-
|
|
- if (number > 0)
|
|
+ md->capture_last = number;
|
|
+ if (offset >= md->offset_max) md->offset_overflow = TRUE; else
|
|
{
|
|
- md->capture_last = number;
|
|
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
|
|
- {
|
|
- md->offset_vector[offset] =
|
|
- md->offset_vector[md->offset_end - number];
|
|
- md->offset_vector[offset+1] = eptr - md->start_subject;
|
|
- if (offset_top <= offset) offset_top = offset + 2;
|
|
- }
|
|
-
|
|
- /* Handle a recursively called group. Restore the offsets
|
|
- appropriately and continue from after the call. */
|
|
-
|
|
- if (md->recursive != NULL && md->recursive->group_num == number)
|
|
- {
|
|
- recursion_info *rec = md->recursive;
|
|
- DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
|
|
- md->recursive = rec->prevrec;
|
|
- md->start_match = rec->save_start;
|
|
- memcpy(md->offset_vector, rec->offset_save,
|
|
- rec->saved_max * sizeof(int));
|
|
- ecode = rec->after_call;
|
|
- ims = original_ims;
|
|
- break;
|
|
- }
|
|
+ md->offset_vector[offset] =
|
|
+ md->offset_vector[md->offset_end - number];
|
|
+ md->offset_vector[offset+1] = eptr - md->start_subject;
|
|
+ if (offset_top <= offset) offset_top = offset + 2;
|
|
+ }
|
|
+
|
|
+ /* Handle a recursively called group. Restore the offsets
|
|
+ appropriately and continue from after the call. */
|
|
+
|
|
+ if (md->recursive != NULL && md->recursive->group_num == number)
|
|
+ {
|
|
+ recursion_info *rec = md->recursive;
|
|
+ DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
|
|
+ md->recursive = rec->prevrec;
|
|
+ md->start_match = rec->save_start;
|
|
+ memcpy(md->offset_vector, rec->offset_save,
|
|
+ rec->saved_max * sizeof(int));
|
|
+ ecode = rec->after_call;
|
|
+ ims = original_ims;
|
|
+ break;
|
|
}
|
|
}
|
|
|
|
- /* Reset the value of the ims flags, in case they got changed during
|
|
- the group. */
|
|
+ /* For both capturing and non-capturing groups, reset the value of the ims
|
|
+ flags, in case they got changed during the group. */
|
|
|
|
ims = original_ims;
|
|
DPRINTF(("ims reset to %02lx\n", ims));
|
|
@@ -1177,20 +1180,22 @@
|
|
preceding bracket, in the appropriate order. In the second case, we can use
|
|
tail recursion to avoid using another stack frame. */
|
|
|
|
+ flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
|
|
+
|
|
if (*ecode == OP_KETRMIN)
|
|
{
|
|
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode = prev;
|
|
- flags = match_isgroup;
|
|
+ flags |= match_tail_recursed;
|
|
goto TAIL_RECURSE;
|
|
}
|
|
else /* OP_KETRMAX */
|
|
{
|
|
- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
|
|
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
ecode += 1 + LINK_SIZE;
|
|
- flags = 0;
|
|
+ flags = match_tail_recursed;
|
|
goto TAIL_RECURSE;
|
|
}
|
|
/* Control never gets here */
|
|
@@ -1202,9 +1207,7 @@
|
|
if ((ims & PCRE_MULTILINE) != 0)
|
|
{
|
|
if (eptr != md->start_subject &&
|
|
- (eptr == md->end_subject ||
|
|
- eptr < md->start_subject + md->nllen ||
|
|
- !IS_NEWLINE(eptr - md->nllen)))
|
|
+ (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
|
|
RRETURN(MATCH_NOMATCH);
|
|
ecode++;
|
|
break;
|
|
@@ -1244,7 +1247,7 @@
|
|
if (!md->endonly)
|
|
{
|
|
if (eptr != md->end_subject &&
|
|
- (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
|
|
+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
|
|
RRETURN(MATCH_NOMATCH);
|
|
ecode++;
|
|
break;
|
|
@@ -1263,7 +1266,7 @@
|
|
|
|
case OP_EODN:
|
|
if (eptr != md->end_subject &&
|
|
- (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
|
|
+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
|
|
RRETURN(MATCH_NOMATCH);
|
|
ecode++;
|
|
break;
|
|
@@ -1319,8 +1322,7 @@
|
|
case OP_ANY:
|
|
if ((ims & PCRE_DOTALL) == 0)
|
|
{
|
|
- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
|
|
- RRETURN(MATCH_NOMATCH);
|
|
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
|
|
}
|
|
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
|
|
if (utf8)
|
|
@@ -1414,6 +1416,26 @@
|
|
ecode++;
|
|
break;
|
|
|
|
+ case OP_ANYNL:
|
|
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
|
|
+ GETCHARINCTEST(c, eptr);
|
|
+ switch(c)
|
|
+ {
|
|
+ default: RRETURN(MATCH_NOMATCH);
|
|
+ case 0x000d:
|
|
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
|
|
+ break;
|
|
+ case 0x000a:
|
|
+ case 0x000b:
|
|
+ case 0x000c:
|
|
+ case 0x0085:
|
|
+ case 0x2028:
|
|
+ case 0x2029:
|
|
+ break;
|
|
+ }
|
|
+ ecode++;
|
|
+ break;
|
|
+
|
|
#ifdef SUPPORT_UCP
|
|
/* Check the next character by Unicode property. We will get here only
|
|
if the support is in the binary; otherwise a compile-time error occurs. */
|
|
@@ -1456,7 +1478,6 @@
|
|
|
|
default:
|
|
RRETURN(PCRE_ERROR_INTERNAL);
|
|
- break;
|
|
}
|
|
|
|
ecode += 3;
|
|
@@ -1926,7 +1947,7 @@
|
|
|
|
else
|
|
{
|
|
- int dc;
|
|
+ unsigned int dc;
|
|
GETCHARINC(dc, eptr);
|
|
ecode += length;
|
|
|
|
@@ -1953,13 +1974,17 @@
|
|
}
|
|
break;
|
|
|
|
- /* Match a single character repeatedly; different opcodes share code. */
|
|
+ /* Match a single character repeatedly. */
|
|
|
|
case OP_EXACT:
|
|
min = max = GET2(ecode, 1);
|
|
ecode += 3;
|
|
goto REPEATCHAR;
|
|
|
|
+ case OP_POSUPTO:
|
|
+ possessive = TRUE;
|
|
+ /* Fall through */
|
|
+
|
|
case OP_UPTO:
|
|
case OP_MINUPTO:
|
|
min = 0;
|
|
@@ -1968,6 +1993,27 @@
|
|
ecode += 3;
|
|
goto REPEATCHAR;
|
|
|
|
+ case OP_POSSTAR:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = INT_MAX;
|
|
+ ecode++;
|
|
+ goto REPEATCHAR;
|
|
+
|
|
+ case OP_POSPLUS:
|
|
+ possessive = TRUE;
|
|
+ min = 1;
|
|
+ max = INT_MAX;
|
|
+ ecode++;
|
|
+ goto REPEATCHAR;
|
|
+
|
|
+ case OP_POSQUERY:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = 1;
|
|
+ ecode++;
|
|
+ goto REPEATCHAR;
|
|
+
|
|
case OP_STAR:
|
|
case OP_MINSTAR:
|
|
case OP_PLUS:
|
|
@@ -2003,10 +2049,9 @@
|
|
uschar occhars[8];
|
|
|
|
#ifdef SUPPORT_UCP
|
|
- int othercase;
|
|
+ unsigned int othercase;
|
|
if ((ims & PCRE_CASELESS) != 0 &&
|
|
- (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
|
|
- othercase >= 0)
|
|
+ (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
|
|
oclength = _pcre_ord2utf8(othercase, occhars);
|
|
#endif /* SUPPORT_UCP */
|
|
|
|
@@ -2042,7 +2087,8 @@
|
|
}
|
|
/* Control never gets here */
|
|
}
|
|
- else
|
|
+
|
|
+ else /* Maximize */
|
|
{
|
|
pp = eptr;
|
|
for (i = min; i < max; i++)
|
|
@@ -2056,6 +2102,8 @@
|
|
eptr += oclength;
|
|
}
|
|
}
|
|
+
|
|
+ if (possessive) continue;
|
|
while (eptr >= pp)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2110,7 +2158,7 @@
|
|
}
|
|
/* Control never gets here */
|
|
}
|
|
- else
|
|
+ else /* Maximize */
|
|
{
|
|
pp = eptr;
|
|
for (i = min; i < max; i++)
|
|
@@ -2118,6 +2166,7 @@
|
|
if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
|
|
eptr++;
|
|
}
|
|
+ if (possessive) continue;
|
|
while (eptr >= pp)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2146,7 +2195,7 @@
|
|
}
|
|
/* Control never gets here */
|
|
}
|
|
- else
|
|
+ else /* Maximize */
|
|
{
|
|
pp = eptr;
|
|
for (i = min; i < max; i++)
|
|
@@ -2154,6 +2203,7 @@
|
|
if (eptr >= md->end_subject || fc != *eptr) break;
|
|
eptr++;
|
|
}
|
|
+ if (possessive) continue;
|
|
while (eptr >= pp)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2206,6 +2256,34 @@
|
|
ecode += 3;
|
|
goto REPEATNOTCHAR;
|
|
|
|
+ case OP_NOTPOSSTAR:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = INT_MAX;
|
|
+ ecode++;
|
|
+ goto REPEATNOTCHAR;
|
|
+
|
|
+ case OP_NOTPOSPLUS:
|
|
+ possessive = TRUE;
|
|
+ min = 1;
|
|
+ max = INT_MAX;
|
|
+ ecode++;
|
|
+ goto REPEATNOTCHAR;
|
|
+
|
|
+ case OP_NOTPOSQUERY:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = 1;
|
|
+ ecode++;
|
|
+ goto REPEATNOTCHAR;
|
|
+
|
|
+ case OP_NOTPOSUPTO:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = GET2(ecode, 1);
|
|
+ ecode += 3;
|
|
+ goto REPEATNOTCHAR;
|
|
+
|
|
case OP_NOTSTAR:
|
|
case OP_NOTMINSTAR:
|
|
case OP_NOTPLUS:
|
|
@@ -2245,7 +2323,7 @@
|
|
/* UTF-8 mode */
|
|
if (utf8)
|
|
{
|
|
- register int d;
|
|
+ register unsigned int d;
|
|
for (i = 1; i <= min; i++)
|
|
{
|
|
GETCHARINC(d, eptr);
|
|
@@ -2270,7 +2348,7 @@
|
|
/* UTF-8 mode */
|
|
if (utf8)
|
|
{
|
|
- register int d;
|
|
+ register unsigned int d;
|
|
for (fi = min;; fi++)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2306,7 +2384,7 @@
|
|
/* UTF-8 mode */
|
|
if (utf8)
|
|
{
|
|
- register int d;
|
|
+ register unsigned int d;
|
|
for (i = min; i < max; i++)
|
|
{
|
|
int len = 1;
|
|
@@ -2316,7 +2394,8 @@
|
|
if (fc == d) break;
|
|
eptr += len;
|
|
}
|
|
- for(;;)
|
|
+ if (possessive) continue;
|
|
+ for(;;)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
@@ -2333,6 +2412,7 @@
|
|
if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
|
|
eptr++;
|
|
}
|
|
+ if (possessive) continue;
|
|
while (eptr >= pp)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2354,7 +2434,7 @@
|
|
/* UTF-8 mode */
|
|
if (utf8)
|
|
{
|
|
- register int d;
|
|
+ register unsigned int d;
|
|
for (i = 1; i <= min; i++)
|
|
{
|
|
GETCHARINC(d, eptr);
|
|
@@ -2377,7 +2457,7 @@
|
|
/* UTF-8 mode */
|
|
if (utf8)
|
|
{
|
|
- register int d;
|
|
+ register unsigned int d;
|
|
for (fi = min;; fi++)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2412,7 +2492,7 @@
|
|
/* UTF-8 mode */
|
|
if (utf8)
|
|
{
|
|
- register int d;
|
|
+ register unsigned int d;
|
|
for (i = min; i < max; i++)
|
|
{
|
|
int len = 1;
|
|
@@ -2421,6 +2501,7 @@
|
|
if (fc == d) break;
|
|
eptr += len;
|
|
}
|
|
+ if (possessive) continue;
|
|
for(;;)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2438,6 +2519,7 @@
|
|
if (eptr >= md->end_subject || fc == *eptr) break;
|
|
eptr++;
|
|
}
|
|
+ if (possessive) continue;
|
|
while (eptr >= pp)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -2469,6 +2551,34 @@
|
|
ecode += 3;
|
|
goto REPEATTYPE;
|
|
|
|
+ case OP_TYPEPOSSTAR:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = INT_MAX;
|
|
+ ecode++;
|
|
+ goto REPEATTYPE;
|
|
+
|
|
+ case OP_TYPEPOSPLUS:
|
|
+ possessive = TRUE;
|
|
+ min = 1;
|
|
+ max = INT_MAX;
|
|
+ ecode++;
|
|
+ goto REPEATTYPE;
|
|
+
|
|
+ case OP_TYPEPOSQUERY:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = 1;
|
|
+ ecode++;
|
|
+ goto REPEATTYPE;
|
|
+
|
|
+ case OP_TYPEPOSUPTO:
|
|
+ possessive = TRUE;
|
|
+ min = 0;
|
|
+ max = GET2(ecode, 1);
|
|
+ ecode += 3;
|
|
+ goto REPEATTYPE;
|
|
+
|
|
case OP_TYPESTAR:
|
|
case OP_TYPEMINSTAR:
|
|
case OP_TYPEPLUS:
|
|
@@ -2571,7 +2681,6 @@
|
|
|
|
default:
|
|
RRETURN(PCRE_ERROR_INTERNAL);
|
|
- break;
|
|
}
|
|
}
|
|
|
|
@@ -2611,9 +2720,7 @@
|
|
for (i = 1; i <= min; i++)
|
|
{
|
|
if (eptr >= md->end_subject ||
|
|
- ((ims & PCRE_DOTALL) == 0 &&
|
|
- eptr <= md->end_subject - md->nllen &&
|
|
- IS_NEWLINE(eptr)))
|
|
+ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
|
|
RRETURN(MATCH_NOMATCH);
|
|
eptr++;
|
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
|
|
@@ -2624,6 +2731,28 @@
|
|
eptr += min;
|
|
break;
|
|
|
|
+ case OP_ANYNL:
|
|
+ for (i = 1; i <= min; i++)
|
|
+ {
|
|
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
|
|
+ GETCHARINC(c, eptr);
|
|
+ switch(c)
|
|
+ {
|
|
+ default: RRETURN(MATCH_NOMATCH);
|
|
+ case 0x000d:
|
|
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
|
|
+ break;
|
|
+ case 0x000a:
|
|
+ case 0x000b:
|
|
+ case 0x000c:
|
|
+ case 0x0085:
|
|
+ case 0x2028:
|
|
+ case 0x2029:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+
|
|
case OP_NOT_DIGIT:
|
|
for (i = 1; i <= min; i++)
|
|
{
|
|
@@ -2692,7 +2821,8 @@
|
|
#endif /* SUPPORT_UTF8 */
|
|
|
|
/* Code for the non-UTF-8 case for minimum matching of operators other
|
|
- than OP_PROP and OP_NOTPROP. */
|
|
+ than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
|
|
+ number of bytes present, as this was tested above. */
|
|
|
|
switch(ctype)
|
|
{
|
|
@@ -2701,8 +2831,7 @@
|
|
{
|
|
for (i = 1; i <= min; i++)
|
|
{
|
|
- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
|
|
- RRETURN(MATCH_NOMATCH);
|
|
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
|
|
eptr++;
|
|
}
|
|
}
|
|
@@ -2713,6 +2842,28 @@
|
|
eptr += min;
|
|
break;
|
|
|
|
+ /* Because of the CRLF case, we can't assume the minimum number of
|
|
+ bytes are present in this case. */
|
|
+
|
|
+ case OP_ANYNL:
|
|
+ for (i = 1; i <= min; i++)
|
|
+ {
|
|
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
|
|
+ switch(*eptr++)
|
|
+ {
|
|
+ default: RRETURN(MATCH_NOMATCH);
|
|
+ case 0x000d:
|
|
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
|
|
+ break;
|
|
+ case 0x000a:
|
|
+ case 0x000b:
|
|
+ case 0x000c:
|
|
+ case 0x0085:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+
|
|
case OP_NOT_DIGIT:
|
|
for (i = 1; i <= min; i++)
|
|
if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
|
|
@@ -2774,7 +2925,7 @@
|
|
GETCHARINC(c, eptr);
|
|
if (prop_fail_result) RRETURN(MATCH_NOMATCH);
|
|
}
|
|
- break;
|
|
+ /* Control never gets here */
|
|
|
|
case PT_LAMP:
|
|
for (fi = min;; fi++)
|
|
@@ -2789,7 +2940,7 @@
|
|
prop_chartype == ucp_Lt) == prop_fail_result)
|
|
RRETURN(MATCH_NOMATCH);
|
|
}
|
|
- break;
|
|
+ /* Control never gets here */
|
|
|
|
case PT_GC:
|
|
for (fi = min;; fi++)
|
|
@@ -2802,7 +2953,7 @@
|
|
if ((prop_category == prop_value) == prop_fail_result)
|
|
RRETURN(MATCH_NOMATCH);
|
|
}
|
|
- break;
|
|
+ /* Control never gets here */
|
|
|
|
case PT_PC:
|
|
for (fi = min;; fi++)
|
|
@@ -2815,7 +2966,7 @@
|
|
if ((prop_chartype == prop_value) == prop_fail_result)
|
|
RRETURN(MATCH_NOMATCH);
|
|
}
|
|
- break;
|
|
+ /* Control never gets here */
|
|
|
|
case PT_SC:
|
|
for (fi = min;; fi++)
|
|
@@ -2828,11 +2979,10 @@
|
|
if ((prop_script == prop_value) == prop_fail_result)
|
|
RRETURN(MATCH_NOMATCH);
|
|
}
|
|
- break;
|
|
+ /* Control never gets here */
|
|
|
|
default:
|
|
RRETURN(PCRE_ERROR_INTERNAL);
|
|
- break;
|
|
}
|
|
}
|
|
|
|
@@ -2876,7 +3026,7 @@
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
if (fi >= max || eptr >= md->end_subject ||
|
|
(ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
|
|
- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
|
|
+ IS_NEWLINE(eptr)))
|
|
RRETURN(MATCH_NOMATCH);
|
|
|
|
GETCHARINC(c, eptr);
|
|
@@ -2888,6 +3038,23 @@
|
|
case OP_ANYBYTE:
|
|
break;
|
|
|
|
+ case OP_ANYNL:
|
|
+ switch(c)
|
|
+ {
|
|
+ default: RRETURN(MATCH_NOMATCH);
|
|
+ case 0x000d:
|
|
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
|
|
+ break;
|
|
+ case 0x000a:
|
|
+ case 0x000b:
|
|
+ case 0x000c:
|
|
+ case 0x0085:
|
|
+ case 0x2028:
|
|
+ case 0x2029:
|
|
+ break;
|
|
+ }
|
|
+ break;
|
|
+
|
|
case OP_NOT_DIGIT:
|
|
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
|
|
RRETURN(MATCH_NOMATCH);
|
|
@@ -2932,8 +3099,7 @@
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
|
if (fi >= max || eptr >= md->end_subject ||
|
|
- ((ims & PCRE_DOTALL) == 0 &&
|
|
- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
|
|
+ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
|
|
RRETURN(MATCH_NOMATCH);
|
|
|
|
c = *eptr++;
|
|
@@ -2945,6 +3111,21 @@
|
|
case OP_ANYBYTE:
|
|
break;
|
|
|
|
+ case OP_ANYNL:
|
|
+ switch(c)
|
|
+ {
|
|
+ default: RRETURN(MATCH_NOMATCH);
|
|
+ case 0x000d:
|
|
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
|
|
+ break;
|
|
+ case 0x000a:
|
|
+ case 0x000b:
|
|
+ case 0x000c:
|
|
+ case 0x0085:
|
|
+ break;
|
|
+ }
|
|
+ break;
|
|
+
|
|
case OP_NOT_DIGIT:
|
|
if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
|
|
break;
|
|
@@ -2977,7 +3158,7 @@
|
|
/* Control never gets here */
|
|
}
|
|
|
|
- /* If maximizing it is worth using inline code for speed, doing the type
|
|
+ /* If maximizing, it is worth using inline code for speed, doing the type
|
|
test once at the start (i.e. keep it out of the loop). Again, keep the
|
|
UTF-8 and UCP stuff separate. */
|
|
|
|
@@ -3058,6 +3239,7 @@
|
|
|
|
/* eptr is now past the end of the maximum run */
|
|
|
|
+ if (possessive) continue;
|
|
for(;;)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -3093,6 +3275,7 @@
|
|
|
|
/* eptr is now past the end of the maximum run */
|
|
|
|
+ if (possessive) continue;
|
|
for(;;)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -3135,9 +3318,7 @@
|
|
{
|
|
for (i = min; i < max; i++)
|
|
{
|
|
- if (eptr >= md->end_subject ||
|
|
- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
|
|
- break;
|
|
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
|
|
eptr++;
|
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
|
|
}
|
|
@@ -3161,9 +3342,7 @@
|
|
{
|
|
for (i = min; i < max; i++)
|
|
{
|
|
- if (eptr >= md->end_subject ||
|
|
- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
|
|
- break;
|
|
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
|
|
eptr++;
|
|
}
|
|
break;
|
|
@@ -3171,7 +3350,8 @@
|
|
else
|
|
{
|
|
c = max - min;
|
|
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
|
|
+ if (c > (unsigned int)(md->end_subject - eptr))
|
|
+ c = md->end_subject - eptr;
|
|
eptr += c;
|
|
}
|
|
}
|
|
@@ -3181,10 +3361,32 @@
|
|
|
|
case OP_ANYBYTE:
|
|
c = max - min;
|
|
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
|
|
+ if (c > (unsigned int)(md->end_subject - eptr))
|
|
+ c = md->end_subject - eptr;
|
|
eptr += c;
|
|
break;
|
|
|
|
+ case OP_ANYNL:
|
|
+ for (i = min; i < max; i++)
|
|
+ {
|
|
+ int len = 1;
|
|
+ if (eptr >= md->end_subject) break;
|
|
+ GETCHARLEN(c, eptr, len);
|
|
+ if (c == 0x000d)
|
|
+ {
|
|
+ if (++eptr >= md->end_subject) break;
|
|
+ if (*eptr == 0x000a) eptr++;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (c != 0x000a && c != 0x000b && c != 0x000c &&
|
|
+ c != 0x0085 && c != 0x2028 && c != 0x2029)
|
|
+ break;
|
|
+ eptr += len;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+
|
|
case OP_NOT_DIGIT:
|
|
for (i = min; i < max; i++)
|
|
{
|
|
@@ -3257,6 +3459,7 @@
|
|
|
|
/* eptr is now past the end of the maximum run */
|
|
|
|
+ if (possessive) continue;
|
|
for(;;)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -3277,9 +3480,7 @@
|
|
{
|
|
for (i = min; i < max; i++)
|
|
{
|
|
- if (eptr >= md->end_subject ||
|
|
- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
|
|
- break;
|
|
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
|
|
eptr++;
|
|
}
|
|
break;
|
|
@@ -3288,10 +3489,30 @@
|
|
|
|
case OP_ANYBYTE:
|
|
c = max - min;
|
|
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
|
|
+ if (c > (unsigned int)(md->end_subject - eptr))
|
|
+ c = md->end_subject - eptr;
|
|
eptr += c;
|
|
break;
|
|
|
|
+ case OP_ANYNL:
|
|
+ for (i = min; i < max; i++)
|
|
+ {
|
|
+ if (eptr >= md->end_subject) break;
|
|
+ c = *eptr;
|
|
+ if (c == 0x000d)
|
|
+ {
|
|
+ if (++eptr >= md->end_subject) break;
|
|
+ if (*eptr == 0x000a) eptr++;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
|
|
+ break;
|
|
+ eptr++;
|
|
+ }
|
|
+ }
|
|
+ break;
|
|
+
|
|
case OP_NOT_DIGIT:
|
|
for (i = min; i < max; i++)
|
|
{
|
|
@@ -3352,6 +3573,7 @@
|
|
|
|
/* eptr is now past the end of the maximum run */
|
|
|
|
+ if (possessive) continue;
|
|
while (eptr >= pp)
|
|
{
|
|
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
|
|
@@ -3366,14 +3588,12 @@
|
|
}
|
|
/* Control never gets here */
|
|
|
|
- /* There's been some horrible disaster. Since all codes > OP_BRA are
|
|
- for capturing brackets, and there shouldn't be any gaps between 0 and
|
|
- OP_BRA, arrival here can only mean there is something seriously wrong
|
|
- in the code above or the OP_xxx definitions. */
|
|
+ /* There's been some horrible disaster. Arrival here can only mean there is
|
|
+ something seriously wrong in the code above or the OP_xxx definitions. */
|
|
|
|
default:
|
|
DPRINTF(("Unknown opcode %d\n", *ecode));
|
|
- RRETURN(PCRE_ERROR_UNKNOWN_NODE);
|
|
+ RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
|
|
}
|
|
|
|
/* Do not stick any code in here without much thought; it is assumed
|
|
@@ -3411,7 +3631,6 @@
|
|
|
|
#undef cur_is_word
|
|
#undef condition
|
|
-#undef minimize
|
|
#undef prev_is_word
|
|
|
|
#undef original_ims
|
|
@@ -3484,6 +3703,7 @@
|
|
BOOL firstline;
|
|
BOOL first_byte_caseless = FALSE;
|
|
BOOL req_byte_caseless = FALSE;
|
|
+BOOL utf8;
|
|
match_data match_block;
|
|
match_data *md = &match_block;
|
|
const uschar *tables;
|
|
@@ -3491,6 +3711,7 @@
|
|
USPTR start_match = (USPTR)subject + start_offset;
|
|
USPTR end_subject;
|
|
USPTR req_byte_ptr = start_match - 1;
|
|
+eptrblock eptrchain[EPTR_WORK_SIZE];
|
|
|
|
pcre_study_data internal_study;
|
|
const pcre_study_data *study;
|
|
@@ -3567,7 +3788,7 @@
|
|
end_subject = md->end_subject;
|
|
|
|
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
|
|
-md->utf8 = (re->options & PCRE_UTF8) != 0;
|
|
+utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
|
|
|
|
md->notbol = (options & PCRE_NOTBOL) != 0;
|
|
md->noteol = (options & PCRE_NOTEOL) != 0;
|
|
@@ -3576,6 +3797,7 @@
|
|
md->hitend = FALSE;
|
|
|
|
md->recursive = NULL; /* No recursion at top level */
|
|
+md->eptrchain = eptrchain; /* Make workspace generally available */
|
|
|
|
md->lcc = tables + lcc_offset;
|
|
md->ctypes = tables + ctypes_offset;
|
|
@@ -3583,26 +3805,36 @@
|
|
/* Handle different types of newline. The two bits give four cases. If nothing
|
|
is set at run time, whatever was used at compile time applies. */
|
|
|
|
-switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
|
|
- PCRE_NEWLINE_CRLF)
|
|
+switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
|
|
+ PCRE_NEWLINE_BITS)
|
|
{
|
|
- default: newline = NEWLINE; break; /* Compile-time default */
|
|
+ case 0: newline = NEWLINE; break; /* Compile-time default */
|
|
case PCRE_NEWLINE_CR: newline = '\r'; break;
|
|
case PCRE_NEWLINE_LF: newline = '\n'; break;
|
|
case PCRE_NEWLINE_CR+
|
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
|
|
+ case PCRE_NEWLINE_ANY: newline = -1; break;
|
|
+ default: return PCRE_ERROR_BADNEWLINE;
|
|
}
|
|
|
|
-if (newline > 255)
|
|
+if (newline < 0)
|
|
{
|
|
- md->nllen = 2;
|
|
- md->nl[0] = (newline >> 8) & 255;
|
|
- md->nl[1] = newline & 255;
|
|
+ md->nltype = NLTYPE_ANY;
|
|
}
|
|
else
|
|
{
|
|
- md->nllen = 1;
|
|
- md->nl[0] = newline;
|
|
+ md->nltype = NLTYPE_FIXED;
|
|
+ if (newline > 255)
|
|
+ {
|
|
+ md->nllen = 2;
|
|
+ md->nl[0] = (newline >> 8) & 255;
|
|
+ md->nl[1] = newline & 255;
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ md->nllen = 1;
|
|
+ md->nl[0] = newline;
|
|
+ }
|
|
}
|
|
|
|
/* Partial matching is supported only for a restricted set of regexes at the
|
|
@@ -3615,7 +3847,7 @@
|
|
back the character offset. */
|
|
|
|
#ifdef SUPPORT_UTF8
|
|
-if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
|
|
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
|
|
{
|
|
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
|
|
return PCRE_ERROR_BADUTF8;
|
|
@@ -3707,10 +3939,13 @@
|
|
req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
|
|
}
|
|
|
|
+
|
|
+/* ==========================================================================*/
|
|
+
|
|
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
|
|
the loop runs just once. */
|
|
|
|
-do
|
|
+for(;;)
|
|
{
|
|
USPTR save_end_subject = end_subject;
|
|
|
|
@@ -3725,14 +3960,14 @@
|
|
|
|
/* Advance to a unique first char if possible. If firstline is TRUE, the
|
|
start of the match is constrained to the first line of a multiline string.
|
|
- Implement this by temporarily adjusting end_subject so that we stop scanning
|
|
- at a newline. If the match fails at the newline, later code breaks this loop.
|
|
- */
|
|
+ That is, the match must be before or at the first newline. Implement this by
|
|
+ temporarily adjusting end_subject so that we stop scanning at a newline. If
|
|
+ the match fails at the newline, later code breaks this loop. */
|
|
|
|
if (firstline)
|
|
{
|
|
USPTR t = start_match;
|
|
- while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
|
|
+ while (t < md->end_subject && !IS_NEWLINE(t)) t++;
|
|
end_subject = t;
|
|
}
|
|
|
|
@@ -3753,11 +3988,9 @@
|
|
|
|
else if (startline)
|
|
{
|
|
- if (start_match >= md->start_subject + md->nllen +
|
|
- start_offset)
|
|
+ if (start_match > md->start_subject + start_offset)
|
|
{
|
|
- while (start_match <= end_subject &&
|
|
- !IS_NEWLINE(start_match - md->nllen))
|
|
+ while (start_match <= end_subject && !WAS_NEWLINE(start_match))
|
|
start_match++;
|
|
}
|
|
}
|
|
@@ -3793,8 +4026,8 @@
|
|
|
|
HOWEVER: when the subject string is very, very long, searching to its end can
|
|
take a long time, and give bad performance on quite ordinary patterns. This
|
|
- showed up when somebody was matching /^C/ on a 32-megabyte string... so we
|
|
- don't do this when the string is sufficiently long.
|
|
+ showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
|
|
+ string... so we don't do this when the string is sufficiently long.
|
|
|
|
ALSO: this processing is disabled when partial matching is requested.
|
|
*/
|
|
@@ -3826,9 +4059,14 @@
|
|
}
|
|
}
|
|
|
|
- /* If we can't find the required character, break the matching loop */
|
|
+ /* If we can't find the required character, break the matching loop,
|
|
+ forcing a match failure. */
|
|
|
|
- if (p >= end_subject) break;
|
|
+ if (p >= end_subject)
|
|
+ {
|
|
+ rc = MATCH_NOMATCH;
|
|
+ break;
|
|
+ }
|
|
|
|
/* If we have found the required character, save the point where we
|
|
found it, so that we don't search again next time round the loop if
|
|
@@ -3838,49 +4076,70 @@
|
|
}
|
|
}
|
|
|
|
- /* When a match occurs, substrings will be set for all internal extractions;
|
|
- we just need to set up the whole thing as substring 0 before returning. If
|
|
- there were too many extractions, set the return code to zero. In the case
|
|
- where we had to get some local store to hold offsets for backreferences, copy
|
|
- those back references that we can. In this case there need not be overflow
|
|
- if certain parts of the pattern were not used. */
|
|
+ /* OK, we can now run the match. */
|
|
|
|
md->start_match = start_match;
|
|
md->match_call_count = 0;
|
|
+ md->eptrn = 0; /* Next free eptrchain slot */
|
|
+ rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
|
|
|
|
- rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);
|
|
+ /* Any return other than MATCH_NOMATCH breaks the loop. */
|
|
|
|
- /* When the result is no match, if the subject's first character was a
|
|
- newline and the PCRE_FIRSTLINE option is set, break (which will return
|
|
- PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
|
|
- newline in the subject. Otherwise, advance the pointer to the next character
|
|
- and continue - but the continuation will actually happen only when the
|
|
- pattern is not anchored. */
|
|
+ if (rc != MATCH_NOMATCH) break;
|
|
|
|
- if (rc == MATCH_NOMATCH)
|
|
- {
|
|
- if (firstline &&
|
|
- start_match <= md->end_subject - md->nllen &&
|
|
- IS_NEWLINE(start_match))
|
|
- break;
|
|
- start_match++;
|
|
+ /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
|
|
+ newline in the subject (though it may continue over the newline). Therefore,
|
|
+ if we have just failed to match, starting at a newline, do not continue. */
|
|
+
|
|
+ if (firstline && IS_NEWLINE(start_match)) break;
|
|
+
|
|
+ /* Advance the match position by one character. */
|
|
+
|
|
+ start_match++;
|
|
#ifdef SUPPORT_UTF8
|
|
- if (md->utf8)
|
|
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
|
|
- start_match++;
|
|
+ if (utf8)
|
|
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
|
|
+ start_match++;
|
|
#endif
|
|
- continue;
|
|
- }
|
|
|
|
- if (rc != MATCH_MATCH)
|
|
- {
|
|
- DPRINTF((">>>> error: returning %d\n", rc));
|
|
- return rc;
|
|
- }
|
|
+ /* Break the loop if the pattern is anchored or if we have passed the end of
|
|
+ the subject. */
|
|
+
|
|
+ if (anchored || start_match > end_subject) break;
|
|
+
|
|
+ /* If we have just passed a CR and the newline option is CRLF or ANY, and we
|
|
+ are now at a LF, advance the match position by one more character. */
|
|
+
|
|
+ if (start_match[-1] == '\r' &&
|
|
+ (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
|
|
+ start_match < end_subject &&
|
|
+ *start_match == '\n')
|
|
+ start_match++;
|
|
+
|
|
+ } /* End of for(;;) "bumpalong" loop */
|
|
+
|
|
+/* ==========================================================================*/
|
|
+
|
|
+/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
|
|
+conditions is true:
|
|
|
|
- /* We have a match! Copy the offset information from temporary store if
|
|
- necessary */
|
|
+(1) The pattern is anchored;
|
|
|
|
+(2) We are past the end of the subject;
|
|
+
|
|
+(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
|
|
+ this option requests that a match occur at or before the first newline in
|
|
+ the subject.
|
|
+
|
|
+When we have a match and the offset vector is big enough to deal with any
|
|
+backreferences, captured substring offsets will already be set up. In the case
|
|
+where we had to get some local store to hold offsets for backreference
|
|
+processing, copy those that we can. In this case there need not be overflow if
|
|
+certain parts of the pattern were not used, even though there are more
|
|
+capturing parentheses than vector slots. */
|
|
+
|
|
+if (rc == MATCH_MATCH)
|
|
+ {
|
|
if (using_temporary_offsets)
|
|
{
|
|
if (offsetcount >= 4)
|
|
@@ -3889,15 +4148,18 @@
|
|
(offsetcount - 2) * sizeof(int));
|
|
DPRINTF(("Copied offsets from temporary memory\n"));
|
|
}
|
|
- if (md->end_offset_top > offsetcount)
|
|
- md->offset_overflow = TRUE;
|
|
-
|
|
+ if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
|
|
DPRINTF(("Freeing temporary memory\n"));
|
|
(pcre_free)(md->offset_vector);
|
|
}
|
|
|
|
+ /* Set the return code to the number of captured strings, or 0 if there are
|
|
+ too many to fit into the vector. */
|
|
+
|
|
rc = md->offset_overflow? 0 : md->end_offset_top/2;
|
|
|
|
+ /* If there is space, set up the whole thing as substring 0. */
|
|
+
|
|
if (offsetcount < 2) rc = 0; else
|
|
{
|
|
offsets[0] = start_match - md->start_subject;
|
|
@@ -3908,9 +4170,8 @@
|
|
return rc;
|
|
}
|
|
|
|
-/* This "while" is the end of the "do" above */
|
|
-
|
|
-while (!anchored && start_match <= end_subject);
|
|
+/* Control gets here if there has been an error, or if the overall match
|
|
+attempt has failed at all permitted starting positions. */
|
|
|
|
if (using_temporary_offsets)
|
|
{
|
|
@@ -3918,7 +4179,12 @@
|
|
(pcre_free)(md->offset_vector);
|
|
}
|
|
|
|
-if (md->partial && md->hitend)
|
|
+if (rc != MATCH_NOMATCH)
|
|
+ {
|
|
+ DPRINTF((">>>> error: returning %d\n", rc));
|
|
+ return rc;
|
|
+ }
|
|
+else if (md->partial && md->hitend)
|
|
{
|
|
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
|
|
return PCRE_ERROR_PARTIAL;
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_globals.c ./pcrelib/pcre_globals.c
|
|
--- ../pcre.orig/pcrelib/pcre_globals.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_globals.c Fri Feb 9 22:31:19 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -51,6 +51,18 @@
|
|
|
|
|
|
#ifndef VPCOMPAT
|
|
+
|
|
+/**************************************************************************
|
|
+This code used to be here for use when compiling as a C++ library. However,
|
|
+according to Dair Grant it is not needed: "
|
|
+
|
|
+ Including 'extern "C"' in the declaration generates an "initialized and
|
|
+ declared `extern'" warning from gcc 4.0.1. Since we include pcre_internal.h,
|
|
+ which includes pcre.h, which declares these prototypes within an extern "C" {}
|
|
+ block, we shouldn't need the prefix here.
|
|
+
|
|
+So, from Release 7.0 I have cut this out.
|
|
+
|
|
#ifdef __cplusplus
|
|
extern "C" void *(*pcre_malloc)(size_t) = malloc;
|
|
extern "C" void (*pcre_free)(void *) = free;
|
|
@@ -58,12 +70,13 @@
|
|
extern "C" void (*pcre_stack_free)(void *) = free;
|
|
extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
|
|
#else
|
|
+**************************************************************************/
|
|
+
|
|
void *(*pcre_malloc)(size_t) = malloc;
|
|
void (*pcre_free)(void *) = free;
|
|
void *(*pcre_stack_malloc)(size_t) = malloc;
|
|
void (*pcre_stack_free)(void *) = free;
|
|
int (*pcre_callout)(pcre_callout_block *) = NULL;
|
|
-#endif
|
|
#endif
|
|
|
|
/* End of pcre_globals.c */
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_internal.h ./pcrelib/pcre_internal.h
|
|
--- ../pcre.orig/pcrelib/pcre_internal.h Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_internal.h Fri Feb 9 22:31:20 2007
|
|
@@ -7,7 +7,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -54,12 +54,16 @@
|
|
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
|
|
inline, and there are *still* stupid compilers about that don't like indented
|
|
pre-processor statements, or at least there were when I first wrote this. After
|
|
-all, it had only been about 10 years then... */
|
|
+all, it had only been about 10 years then...
|
|
|
|
+It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
|
|
+be absolutely sure we get our version. */
|
|
+
|
|
+#undef DPRINTF
|
|
#ifdef DEBUG
|
|
#define DPRINTF(p) printf p
|
|
#else
|
|
-#define DPRINTF(p) /*nothing*/
|
|
+#define DPRINTF(p) /* Nothing */
|
|
#endif
|
|
|
|
|
|
@@ -118,13 +122,48 @@
|
|
|
|
typedef unsigned char uschar;
|
|
|
|
-/* PCRE is able to support 3 different kinds of newline (CR, LF, CRLF). The
|
|
-following macro is used to package up testing for newlines. NLBLOCK is defined
|
|
-in the various modules to indicate in which datablock the parameters exist. */
|
|
+/* This is an unsigned int value that no character can ever have. UTF-8
|
|
+characters only go up to 0x7fffffff (though Unicode doesn't go beyond
|
|
+0x0010ffff). */
|
|
+
|
|
+#define NOTACHAR 0xffffffff
|
|
+
|
|
+/* PCRE is able to support several different kinds of newline (CR, LF, CRLF,
|
|
+and "all" at present). The following macros are used to package up testing for
|
|
+newlines. NLBLOCK, PSSTART, and PSEND are defined in the various modules to
|
|
+indicate in which datablock the parameters exist, and what the start/end of
|
|
+string field names are. */
|
|
+
|
|
+#define NLTYPE_FIXED 0 /* Newline is a fixed length string */
|
|
+#define NLTYPE_ANY 1 /* Newline is any Unicode line ending */
|
|
+
|
|
+/* This macro checks for a newline at the given position */
|
|
|
|
#define IS_NEWLINE(p) \
|
|
- ((p)[0] == NLBLOCK->nl[0] && \
|
|
- (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]))
|
|
+ ((NLBLOCK->nltype != NLTYPE_FIXED)? \
|
|
+ ((p) < NLBLOCK->PSEND && \
|
|
+ _pcre_is_newline((p), NLBLOCK->PSEND, &(NLBLOCK->nllen), utf8) \
|
|
+ ) \
|
|
+ : \
|
|
+ ((p) <= NLBLOCK->PSEND - NLBLOCK->nllen && \
|
|
+ (p)[0] == NLBLOCK->nl[0] && \
|
|
+ (NLBLOCK->nllen == 1 || (p)[1] == NLBLOCK->nl[1]) \
|
|
+ ) \
|
|
+ )
|
|
+
|
|
+/* This macro checks for a newline immediately preceding the given position */
|
|
+
|
|
+#define WAS_NEWLINE(p) \
|
|
+ ((NLBLOCK->nltype != NLTYPE_FIXED)? \
|
|
+ ((p) > NLBLOCK->PSSTART && \
|
|
+ _pcre_was_newline((p), NLBLOCK->PSSTART, &(NLBLOCK->nllen), utf8) \
|
|
+ ) \
|
|
+ : \
|
|
+ ((p) >= NLBLOCK->PSSTART + NLBLOCK->nllen && \
|
|
+ (p)[-NLBLOCK->nllen] == NLBLOCK->nl[0] && \
|
|
+ (NLBLOCK->nllen == 1 || (p)[-NLBLOCK->nllen+1] == NLBLOCK->nl[1]) \
|
|
+ ) \
|
|
+ )
|
|
|
|
/* When PCRE is compiled as a C++ library, the subject pointer can be replaced
|
|
with a custom type. This makes it possible, for example, to allow pcre_exec()
|
|
@@ -282,7 +321,7 @@
|
|
|
|
#define GETCHAR(c, eptr) \
|
|
c = *eptr; \
|
|
- if ((c & 0xc0) == 0xc0) \
|
|
+ if (c >= 0xc0) \
|
|
{ \
|
|
int gcii; \
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
|
@@ -300,7 +339,7 @@
|
|
|
|
#define GETCHARTEST(c, eptr) \
|
|
c = *eptr; \
|
|
- if (utf8 && (c & 0xc0) == 0xc0) \
|
|
+ if (utf8 && c >= 0xc0) \
|
|
{ \
|
|
int gcii; \
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
|
@@ -318,7 +357,7 @@
|
|
|
|
#define GETCHARINC(c, eptr) \
|
|
c = *eptr++; \
|
|
- if ((c & 0xc0) == 0xc0) \
|
|
+ if (c >= 0xc0) \
|
|
{ \
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
|
int gcss = 6*gcaa; \
|
|
@@ -334,7 +373,7 @@
|
|
|
|
#define GETCHARINCTEST(c, eptr) \
|
|
c = *eptr++; \
|
|
- if (utf8 && (c & 0xc0) == 0xc0) \
|
|
+ if (utf8 && c >= 0xc0) \
|
|
{ \
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
|
int gcss = 6*gcaa; \
|
|
@@ -351,7 +390,7 @@
|
|
|
|
#define GETCHARLEN(c, eptr, len) \
|
|
c = *eptr; \
|
|
- if ((c & 0xc0) == 0xc0) \
|
|
+ if (c >= 0xc0) \
|
|
{ \
|
|
int gcii; \
|
|
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
|
@@ -404,20 +443,21 @@
|
|
/* Masks for identifying the public options that are permitted at compile
|
|
time, run time, or study time, respectively. */
|
|
|
|
+#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY)
|
|
+
|
|
#define PUBLIC_OPTIONS \
|
|
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
|
|
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
|
|
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
|
|
- PCRE_DUPNAMES|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
|
|
+ PCRE_DUPNAMES|PCRE_NEWLINE_BITS)
|
|
|
|
#define PUBLIC_EXEC_OPTIONS \
|
|
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
|
- PCRE_PARTIAL|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF)
|
|
+ PCRE_PARTIAL|PCRE_NEWLINE_BITS)
|
|
|
|
#define PUBLIC_DFA_EXEC_OPTIONS \
|
|
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
|
- PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_CR| \
|
|
- PCRE_NEWLINE_LF)
|
|
+ PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS)
|
|
|
|
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
|
|
|
|
@@ -449,9 +489,7 @@
|
|
#define FALSE 0
|
|
#define TRUE 1
|
|
|
|
-/* Escape items that are just an encoding of a particular data value. Note that
|
|
-ESC_n is defined as yet another macro, which is set in config.h to either \n
|
|
-(the default) or \r (which some people want). */
|
|
+/* Escape items that are just an encoding of a particular data value. */
|
|
|
|
#ifndef ESC_e
|
|
#define ESC_e 27
|
|
@@ -462,7 +500,7 @@
|
|
#endif
|
|
|
|
#ifndef ESC_n
|
|
-#define ESC_n NEWLINE
|
|
+#define ESC_n '\n'
|
|
#endif
|
|
|
|
#ifndef ESC_r
|
|
@@ -501,21 +539,28 @@
|
|
their negation. Also, they must appear in the same order as in the opcode
|
|
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
|
|
corresponds to "." rather than an escape sequence. The final one must be
|
|
-ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two
|
|
-tests in the code for an escape greater than ESC_b and less than ESC_Z to
|
|
-detect the types that may be repeated. These are the types that consume
|
|
-characters. If any new escapes are put in between that don't consume a
|
|
+ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc).
|
|
+There are two tests in the code for an escape greater than ESC_b and less than
|
|
+ESC_Z to detect the types that may be repeated. These are the types that
|
|
+consume characters. If any new escapes are put in between that don't consume a
|
|
character, that code will have to change. */
|
|
|
|
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
|
|
- ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_X, ESC_Z, ESC_z, ESC_E,
|
|
- ESC_Q, ESC_REF };
|
|
+ ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z,
|
|
+ ESC_E, ESC_Q, ESC_k, ESC_REF };
|
|
+
|
|
|
|
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
|
|
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
|
|
OP_EOD must correspond in order to the list of escapes immediately above.
|
|
-Note that whenever this list is updated, the two macro definitions that follow
|
|
-must also be updated to match. */
|
|
+
|
|
+To keep stored, compiled patterns compatible, new opcodes should be added
|
|
+immediately before OP_BRA, where (since release 7.0) a gap is left for this
|
|
+purpose.
|
|
+
|
|
+*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
|
|
+that follow must also be updated to match. There is also a table called
|
|
+"coptable" in pcre_dfa_exec.c that must be updated. */
|
|
|
|
enum {
|
|
OP_END, /* 0 End of pattern */
|
|
@@ -536,110 +581,122 @@
|
|
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
|
|
OP_NOTPROP, /* 13 \P (not Unicode property) */
|
|
OP_PROP, /* 14 \p (Unicode property) */
|
|
- OP_EXTUNI, /* 15 \X (extended Unicode sequence */
|
|
- OP_EODN, /* 16 End of data or \n at end of data: \Z. */
|
|
- OP_EOD, /* 17 End of data: \z */
|
|
-
|
|
- OP_OPT, /* 18 Set runtime options */
|
|
- OP_CIRC, /* 19 Start of line - varies with multiline switch */
|
|
- OP_DOLL, /* 20 End of line - varies with multiline switch */
|
|
- OP_CHAR, /* 21 Match one character, casefully */
|
|
- OP_CHARNC, /* 22 Match one character, caselessly */
|
|
- OP_NOT, /* 23 Match one character, not the following one */
|
|
-
|
|
- OP_STAR, /* 24 The maximizing and minimizing versions of */
|
|
- OP_MINSTAR, /* 25 all these opcodes must come in pairs, with */
|
|
- OP_PLUS, /* 26 the minimizing one second. */
|
|
- OP_MINPLUS, /* 27 This first set applies to single characters */
|
|
- OP_QUERY, /* 28 */
|
|
- OP_MINQUERY, /* 29 */
|
|
- OP_UPTO, /* 30 From 0 to n matches */
|
|
- OP_MINUPTO, /* 31 */
|
|
- OP_EXACT, /* 32 Exactly n matches */
|
|
-
|
|
- OP_NOTSTAR, /* 33 The maximizing and minimizing versions of */
|
|
- OP_NOTMINSTAR, /* 34 all these opcodes must come in pairs, with */
|
|
- OP_NOTPLUS, /* 35 the minimizing one second. */
|
|
- OP_NOTMINPLUS, /* 36 This set applies to "not" single characters */
|
|
- OP_NOTQUERY, /* 37 */
|
|
- OP_NOTMINQUERY, /* 38 */
|
|
- OP_NOTUPTO, /* 39 From 0 to n matches */
|
|
- OP_NOTMINUPTO, /* 40 */
|
|
- OP_NOTEXACT, /* 41 Exactly n matches */
|
|
-
|
|
- OP_TYPESTAR, /* 42 The maximizing and minimizing versions of */
|
|
- OP_TYPEMINSTAR, /* 43 all these opcodes must come in pairs, with */
|
|
- OP_TYPEPLUS, /* 44 the minimizing one second. These codes must */
|
|
- OP_TYPEMINPLUS, /* 45 be in exactly the same order as those above. */
|
|
- OP_TYPEQUERY, /* 46 This set applies to character types such as \d */
|
|
- OP_TYPEMINQUERY, /* 47 */
|
|
- OP_TYPEUPTO, /* 48 From 0 to n matches */
|
|
- OP_TYPEMINUPTO, /* 49 */
|
|
- OP_TYPEEXACT, /* 50 Exactly n matches */
|
|
-
|
|
- OP_CRSTAR, /* 51 The maximizing and minimizing versions of */
|
|
- OP_CRMINSTAR, /* 52 all these opcodes must come in pairs, with */
|
|
- OP_CRPLUS, /* 53 the minimizing one second. These codes must */
|
|
- OP_CRMINPLUS, /* 54 be in exactly the same order as those above. */
|
|
- OP_CRQUERY, /* 55 These are for character classes and back refs */
|
|
- OP_CRMINQUERY, /* 56 */
|
|
- OP_CRRANGE, /* 57 These are different to the three sets above. */
|
|
- OP_CRMINRANGE, /* 58 */
|
|
+ OP_ANYNL, /* 15 \R (any newline sequence) */
|
|
+ OP_EXTUNI, /* 16 \X (extended Unicode sequence) */
|
|
+ OP_EODN, /* 17 End of data or \n at end of data: \Z. */
|
|
+ OP_EOD, /* 18 End of data: \z */
|
|
+
|
|
+ OP_OPT, /* 19 Set runtime options */
|
|
+ OP_CIRC, /* 20 Start of line - varies with multiline switch */
|
|
+ OP_DOLL, /* 21 End of line - varies with multiline switch */
|
|
+ OP_CHAR, /* 22 Match one character, casefully */
|
|
+ OP_CHARNC, /* 23 Match one character, caselessly */
|
|
+ OP_NOT, /* 24 Match one character, not the following one */
|
|
+
|
|
+ OP_STAR, /* 25 The maximizing and minimizing versions of */
|
|
+ OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */
|
|
+ OP_PLUS, /* 27 the minimizing one second. */
|
|
+ OP_MINPLUS, /* 28 This first set applies to single characters.*/
|
|
+ OP_QUERY, /* 29 */
|
|
+ OP_MINQUERY, /* 30 */
|
|
+
|
|
+ OP_UPTO, /* 31 From 0 to n matches */
|
|
+ OP_MINUPTO, /* 32 */
|
|
+ OP_EXACT, /* 33 Exactly n matches */
|
|
+
|
|
+ OP_POSSTAR, /* 34 Possessified star */
|
|
+ OP_POSPLUS, /* 35 Possessified plus */
|
|
+ OP_POSQUERY, /* 36 Possessified query */
|
|
+ OP_POSUPTO, /* 37 Possessified upto */
|
|
+
|
|
+ OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */
|
|
+ OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */
|
|
+ OP_NOTPLUS, /* 40 the minimizing one second. They must be in */
|
|
+ OP_NOTMINPLUS, /* 41 exactly the same order as those above. */
|
|
+ OP_NOTQUERY, /* 42 This set applies to "not" single characters. */
|
|
+ OP_NOTMINQUERY, /* 43 */
|
|
+
|
|
+ OP_NOTUPTO, /* 44 From 0 to n matches */
|
|
+ OP_NOTMINUPTO, /* 45 */
|
|
+ OP_NOTEXACT, /* 46 Exactly n matches */
|
|
+
|
|
+ OP_NOTPOSSTAR, /* 47 Possessified versions */
|
|
+ OP_NOTPOSPLUS, /* 48 */
|
|
+ OP_NOTPOSQUERY, /* 49 */
|
|
+ OP_NOTPOSUPTO, /* 50 */
|
|
+
|
|
+ OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */
|
|
+ OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */
|
|
+ OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */
|
|
+ OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */
|
|
+ OP_TYPEQUERY, /* 55 This set applies to character types such as \d */
|
|
+ OP_TYPEMINQUERY, /* 56 */
|
|
+
|
|
+ OP_TYPEUPTO, /* 57 From 0 to n matches */
|
|
+ OP_TYPEMINUPTO, /* 58 */
|
|
+ OP_TYPEEXACT, /* 59 Exactly n matches */
|
|
+
|
|
+ OP_TYPEPOSSTAR, /* 60 Possessified versions */
|
|
+ OP_TYPEPOSPLUS, /* 61 */
|
|
+ OP_TYPEPOSQUERY, /* 62 */
|
|
+ OP_TYPEPOSUPTO, /* 63 */
|
|
+
|
|
+ OP_CRSTAR, /* 64 The maximizing and minimizing versions of */
|
|
+ OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */
|
|
+ OP_CRPLUS, /* 66 the minimizing one second. These codes must */
|
|
+ OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */
|
|
+ OP_CRQUERY, /* 68 These are for character classes and back refs */
|
|
+ OP_CRMINQUERY, /* 69 */
|
|
+ OP_CRRANGE, /* 70 These are different to the three sets above. */
|
|
+ OP_CRMINRANGE, /* 71 */
|
|
|
|
- OP_CLASS, /* 59 Match a character class, chars < 256 only */
|
|
- OP_NCLASS, /* 60 Same, but the bitmap was created from a negative
|
|
+ OP_CLASS, /* 72 Match a character class, chars < 256 only */
|
|
+ OP_NCLASS, /* 73 Same, but the bitmap was created from a negative
|
|
class - the difference is relevant only when a UTF-8
|
|
character > 255 is encountered. */
|
|
|
|
- OP_XCLASS, /* 61 Extended class for handling UTF-8 chars within the
|
|
+ OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the
|
|
class. This does both positive and negative. */
|
|
|
|
- OP_REF, /* 62 Match a back reference */
|
|
- OP_RECURSE, /* 63 Match a numbered subpattern (possibly recursive) */
|
|
- OP_CALLOUT, /* 64 Call out to external function if provided */
|
|
-
|
|
- OP_ALT, /* 65 Start of alternation */
|
|
- OP_KET, /* 66 End of group that doesn't have an unbounded repeat */
|
|
- OP_KETRMAX, /* 67 These two must remain together and in this */
|
|
- OP_KETRMIN, /* 68 order. They are for groups the repeat for ever. */
|
|
-
|
|
- /* The assertions must come before ONCE and COND */
|
|
-
|
|
- OP_ASSERT, /* 69 Positive lookahead */
|
|
- OP_ASSERT_NOT, /* 70 Negative lookahead */
|
|
- OP_ASSERTBACK, /* 71 Positive lookbehind */
|
|
- OP_ASSERTBACK_NOT, /* 72 Negative lookbehind */
|
|
- OP_REVERSE, /* 73 Move pointer back - used in lookbehind assertions */
|
|
-
|
|
- /* ONCE and COND must come after the assertions, with ONCE first, as there's
|
|
- a test for >= ONCE for a subpattern that isn't an assertion. */
|
|
-
|
|
- OP_ONCE, /* 74 Once matched, don't back up into the subpattern */
|
|
- OP_COND, /* 75 Conditional group */
|
|
- OP_CREF, /* 76 Used to hold an extraction string number (cond ref) */
|
|
-
|
|
- OP_BRAZERO, /* 77 These two must remain together and in this */
|
|
- OP_BRAMINZERO, /* 78 order. */
|
|
-
|
|
- OP_BRANUMBER, /* 79 Used for extracting brackets whose number is greater
|
|
- than can fit into an opcode. */
|
|
-
|
|
- OP_BRA /* 80 This and greater values are used for brackets that
|
|
- extract substrings up to EXTRACT_BASIC_MAX. After
|
|
- that, use is made of OP_BRANUMBER. */
|
|
-};
|
|
-
|
|
-/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
|
|
-study.c that all opcodes are less than 128 in value. This makes handling UTF-8
|
|
-character sequences easier. */
|
|
-
|
|
-/* The highest extraction number before we have to start using additional
|
|
-bytes. (Originally PCRE didn't have support for extraction counts highter than
|
|
-this number.) The value is limited by the number of opcodes left after OP_BRA,
|
|
-i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
|
|
-opcodes. */
|
|
+ OP_REF, /* 75 Match a back reference */
|
|
+ OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */
|
|
+ OP_CALLOUT, /* 77 Call out to external function if provided */
|
|
+
|
|
+ OP_ALT, /* 78 Start of alternation */
|
|
+ OP_KET, /* 79 End of group that doesn't have an unbounded repeat */
|
|
+ OP_KETRMAX, /* 80 These two must remain together and in this */
|
|
+ OP_KETRMIN, /* 81 order. They are for groups that repeat for ever. */
|
|
+
|
|
+ /* The assertions must come before BRA, CBRA, ONCE, and COND.*/
|
|
+
|
|
+ OP_ASSERT, /* 82 Positive lookahead */
|
|
+ OP_ASSERT_NOT, /* 83 Negative lookahead */
|
|
+ OP_ASSERTBACK, /* 84 Positive lookbehind */
|
|
+ OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */
|
|
+ OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */
|
|
+
|
|
+ /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
|
|
+ as there's a test for >= ONCE for a subpattern that isn't an assertion. */
|
|
+
|
|
+ OP_ONCE, /* 87 Atomic group */
|
|
+ OP_BRA, /* 88 Start of non-capturing bracket */
|
|
+ OP_CBRA, /* 89 Start of capturing bracket */
|
|
+ OP_COND, /* 90 Conditional group */
|
|
+
|
|
+ /* These three must follow the previous three, in the same order. There's a
|
|
+ check for >= SBRA to distinguish the two sets. */
|
|
+
|
|
+ OP_SBRA, /* 91 Start of non-capturing bracket, check empty */
|
|
+ OP_SCBRA, /* 92 Start of capturing bracket, check empty */
|
|
+ OP_SCOND, /* 93 Conditional group, check empty */
|
|
+
|
|
+ OP_CREF, /* 94 Used to hold a capture number as condition */
|
|
+ OP_RREF, /* 95 Used to hold a recursion number as condition */
|
|
+ OP_DEF, /* 96 The DEFINE condition */
|
|
|
|
-#define EXTRACT_BASIC_MAX 100
|
|
+ OP_BRAZERO, /* 97 These two must remain together and in this */
|
|
+ OP_BRAMINZERO /* 98 order. */
|
|
+};
|
|
|
|
|
|
/* This macro defines textual names for all the opcodes. These are used only
|
|
@@ -648,17 +705,21 @@
|
|
#define OP_NAME_LIST \
|
|
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
|
|
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
|
|
- "notprop", "prop", "extuni", \
|
|
+ "notprop", "prop", "anynl", "extuni", \
|
|
"\\Z", "\\z", \
|
|
"Opt", "^", "$", "char", "charnc", "not", \
|
|
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
|
+ "*+","++", "?+", "{", \
|
|
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
|
+ "*+","++", "?+", "{", \
|
|
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
|
+ "*+","++", "?+", "{", \
|
|
"*", "*?", "+", "+?", "?", "??", "{", "{", \
|
|
"class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
|
|
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
|
|
- "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cond ref",\
|
|
- "Brazero", "Braminzero", "Branumber", "Bra"
|
|
+ "AssertB", "AssertB not", "Reverse", \
|
|
+ "Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \
|
|
+ "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero"
|
|
|
|
|
|
/* This macro defines the length of fixed length operations in the compiled
|
|
@@ -674,7 +735,7 @@
|
|
1, /* End */ \
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \B, \D, \d, \S, \s, \W, \w */ \
|
|
1, 1, /* Any, Anybyte */ \
|
|
- 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \
|
|
+ 3, 3, 1, 1, /* NOTPROP, PROP, ANYNL, EXTUNI */ \
|
|
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
|
|
2, /* Char - the minimum length */ \
|
|
2, /* Charnc - the minimum length */ \
|
|
@@ -682,12 +743,15 @@
|
|
/* Positive single-char repeats ** These are */ \
|
|
2, 2, 2, 2, 2, 2, /* *, *?, +, +?, ?, ?? ** minima in */ \
|
|
4, 4, 4, /* upto, minupto, exact ** UTF-8 mode */ \
|
|
+ 2, 2, 2, 4, /* *+, ++, ?+, upto+ */ \
|
|
/* Negative single-char repeats - only for chars < 256 */ \
|
|
2, 2, 2, 2, 2, 2, /* NOT *, *?, +, +?, ?, ?? */ \
|
|
4, 4, 4, /* NOT upto, minupto, exact */ \
|
|
+ 2, 2, 2, 4, /* Possessive *, +, ?, upto */ \
|
|
/* Positive type repeats */ \
|
|
2, 2, 2, 2, 2, 2, /* Type *, *?, +, +?, ?, ?? */ \
|
|
4, 4, 4, /* Type upto, minupto, exact */ \
|
|
+ 2, 2, 2, 4, /* Possessive *+, ++, ?+, upto+ */ \
|
|
/* Character class & ref repeats */ \
|
|
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \
|
|
5, 5, /* CRRANGE, CRMINRANGE */ \
|
|
@@ -706,17 +770,22 @@
|
|
1+LINK_SIZE, /* Assert behind */ \
|
|
1+LINK_SIZE, /* Assert behind not */ \
|
|
1+LINK_SIZE, /* Reverse */ \
|
|
- 1+LINK_SIZE, /* Once */ \
|
|
+ 1+LINK_SIZE, /* ONCE */ \
|
|
+ 1+LINK_SIZE, /* BRA */ \
|
|
+ 3+LINK_SIZE, /* CBRA */ \
|
|
1+LINK_SIZE, /* COND */ \
|
|
+ 1+LINK_SIZE, /* SBRA */ \
|
|
+ 3+LINK_SIZE, /* SCBRA */ \
|
|
+ 1+LINK_SIZE, /* SCOND */ \
|
|
3, /* CREF */ \
|
|
+ 3, /* RREF */ \
|
|
+ 1, /* DEF */ \
|
|
1, 1, /* BRAZERO, BRAMINZERO */ \
|
|
- 3, /* BRANUMBER */ \
|
|
- 1+LINK_SIZE /* BRA */ \
|
|
|
|
|
|
-/* A magic value for OP_CREF to indicate the "in recursion" condition. */
|
|
+/* A magic value for OP_RREF to indicate the "any recursion" condition. */
|
|
|
|
-#define CREF_RECURSE 0xffff
|
|
+#define RREF_ANY 0xffff
|
|
|
|
/* Error code numbers. They are given names so that they can more easily be
|
|
tracked. */
|
|
@@ -726,7 +795,7 @@
|
|
ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
|
|
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
|
|
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
|
|
- ERR50, ERR51 };
|
|
+ ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 };
|
|
|
|
/* The real format of the start of the pcre block; the index of names and the
|
|
code vector run on as long as necessary after the end. We store an explicit
|
|
@@ -781,17 +850,23 @@
|
|
const uschar *fcc; /* Points to case-flipping table */
|
|
const uschar *cbits; /* Points to character type table */
|
|
const uschar *ctypes; /* Points to table of type maps */
|
|
+ const uschar *start_workspace;/* The start of working space */
|
|
const uschar *start_code; /* The start of the compiled code */
|
|
const uschar *start_pattern; /* The start of the pattern */
|
|
+ const uschar *end_pattern; /* The end of the pattern */
|
|
+ uschar *hwm; /* High watermark of workspace */
|
|
uschar *name_table; /* The name/number table */
|
|
int names_found; /* Number of entries so far */
|
|
int name_entry_size; /* Size of each entry */
|
|
+ int bracount; /* Count of capturing parens */
|
|
int top_backref; /* Maximum back reference */
|
|
unsigned int backref_map; /* Bitmap of low back refs */
|
|
+ int external_options; /* External (initial) options */
|
|
int req_varyopt; /* "After variable item" flag for reqbyte */
|
|
BOOL nopartial; /* Set TRUE if partial won't work */
|
|
- int nllen; /* 1 or 2 for newline string length */
|
|
- uschar nl[4]; /* Newline string */
|
|
+ int nltype; /* Newline type */
|
|
+ int nllen; /* Newline string length */
|
|
+ uschar nl[4]; /* Newline string when fixed length */
|
|
} compile_data;
|
|
|
|
/* Structure for maintaining a chain of pointers to the currently incomplete
|
|
@@ -824,6 +899,16 @@
|
|
|
|
struct heapframe;
|
|
|
|
+/* Structure for building a chain of data for holding the values of the subject
|
|
+pointer at the start of each subpattern, so as to detect when an empty string
|
|
+has been matched by a subpattern - to break infinite loops. */
|
|
+
|
|
+typedef struct eptrblock {
|
|
+ struct eptrblock *epb_prev;
|
|
+ USPTR epb_saved_eptr;
|
|
+} eptrblock;
|
|
+
|
|
+
|
|
/* Structure for passing "static" information around between the functions
|
|
doing traditional NFA matching, so that they are thread-safe. */
|
|
|
|
@@ -834,8 +919,9 @@
|
|
int *offset_vector; /* Offset vector */
|
|
int offset_end; /* One past the end */
|
|
int offset_max; /* The maximum usable for return data */
|
|
- int nllen; /* 1 or 2 for newline string length */
|
|
- uschar nl[4]; /* Newline string */
|
|
+ int nltype; /* Newline type */
|
|
+ int nllen; /* Newline string length */
|
|
+ uschar nl[4]; /* Newline string when fixed */
|
|
const uschar *lcc; /* Points to lower casing table */
|
|
const uschar *ctypes; /* Points to table of type maps */
|
|
BOOL offset_overflow; /* Set if too many extractions */
|
|
@@ -854,6 +940,8 @@
|
|
int end_offset_top; /* Highwater mark at end of match */
|
|
int capture_last; /* Most recent capture number */
|
|
int start_offset; /* The start offset value */
|
|
+ eptrblock *eptrchain; /* Chain of eptrblocks for tail recursions */
|
|
+ int eptrn; /* Next free eptrblock */
|
|
recursion_info *recursive; /* Linked list of recursion data */
|
|
void *callout_data; /* To pass back to callouts */
|
|
struct heapframe *thisframe; /* Used only when compiling for no recursion */
|
|
@@ -869,8 +957,9 @@
|
|
const uschar *tables; /* Character tables */
|
|
int moptions; /* Match options */
|
|
int poptions; /* Pattern options */
|
|
- int nllen; /* 1 or 2 for newline string length */
|
|
- uschar nl[4]; /* Newline string */
|
|
+ int nltype; /* Newline type */
|
|
+ int nllen; /* Newline string length */
|
|
+ uschar nl[4]; /* Newline string when fixed */
|
|
void *callout_data; /* To pass back to callouts */
|
|
} dfa_match_data;
|
|
|
|
@@ -941,13 +1030,17 @@
|
|
one of the exported public functions. They have to be "external" in the C
|
|
sense, but are not part of the PCRE public API. */
|
|
|
|
-extern int _pcre_ord2utf8(int, uschar *);
|
|
-extern real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *,
|
|
- const pcre_study_data *, pcre_study_data *);
|
|
-extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
|
|
-extern int _pcre_ucp_othercase(const int);
|
|
-extern int _pcre_valid_utf8(const uschar *, int);
|
|
-extern BOOL _pcre_xclass(int, const uschar *);
|
|
+extern BOOL _pcre_is_newline(const uschar *, const uschar *, int *,
|
|
+ BOOL);
|
|
+extern int _pcre_ord2utf8(int, uschar *);
|
|
+extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
|
|
+ const pcre_study_data *, pcre_study_data *);
|
|
+extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
|
|
+extern unsigned int _pcre_ucp_othercase(const unsigned int);
|
|
+extern int _pcre_valid_utf8(const uschar *, int);
|
|
+extern BOOL _pcre_was_newline(const uschar *, const uschar *, int *,
|
|
+ BOOL);
|
|
+extern BOOL _pcre_xclass(int, const uschar *);
|
|
|
|
#endif
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_maketables.c ./pcrelib/pcre_maketables.c
|
|
--- ../pcre.orig/pcrelib/pcre_maketables.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_maketables.c Fri Feb 9 22:31:20 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -130,7 +130,7 @@
|
|
meta-character, which in this sense is any character that terminates a run
|
|
of data characters. */
|
|
|
|
- if (strchr("*+?{^.$|()[", i) != 0) x += ctype_meta;
|
|
+ if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta;
|
|
*p++ = x;
|
|
}
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_newline.c ./pcrelib/pcre_newline.c
|
|
--- ../pcre.orig/pcrelib/pcre_newline.c Thu Jan 1 01:00:00 1970
|
|
+++ ./pcrelib/pcre_newline.c Fri Feb 9 20:48:47 2007
|
|
@@ -0,0 +1,135 @@
|
|
+/*************************************************
|
|
+* Perl-Compatible Regular Expressions *
|
|
+*************************************************/
|
|
+
|
|
+/* PCRE is a library of functions to support regular expressions whose syntax
|
|
+and semantics are as close as possible to those of the Perl 5 language.
|
|
+
|
|
+ Written by Philip Hazel
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
+
|
|
+-----------------------------------------------------------------------------
|
|
+Redistribution and use in source and binary forms, with or without
|
|
+modification, are permitted provided that the following conditions are met:
|
|
+
|
|
+ * Redistributions of source code must retain the above copyright notice,
|
|
+ this list of conditions and the following disclaimer.
|
|
+
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
+ notice, this list of conditions and the following disclaimer in the
|
|
+ documentation and/or other materials provided with the distribution.
|
|
+
|
|
+ * Neither the name of the University of Cambridge nor the names of its
|
|
+ contributors may be used to endorse or promote products derived from
|
|
+ this software without specific prior written permission.
|
|
+
|
|
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
+POSSIBILITY OF SUCH DAMAGE.
|
|
+-----------------------------------------------------------------------------
|
|
+*/
|
|
+
|
|
+
|
|
+/* This module contains internal functions for testing newlines when more than
|
|
+one kind of newline is to be recognized. When a newline is found, its length is
|
|
+returned. In principle, we could implement several newline "types", each
|
|
+referring to a different set of newline characters. At present, PCRE supports
|
|
+only NLTYPE_FIXED, which gets handled without these functions, and NLTYPE_ANY,
|
|
+so for now the type isn't passed into the functions. It can easily be added
|
|
+later if required. The full list of Unicode newline characters is taken from
|
|
+http://unicode.org/unicode/reports/tr18/. */
|
|
+
|
|
+
|
|
+#include "pcre_internal.h"
|
|
+
|
|
+
|
|
+
|
|
+/*************************************************
|
|
+* Check for newline at given position *
|
|
+*************************************************/
|
|
+
|
|
+/* It is guaranteed that the initial value of ptr is less than the end of the
|
|
+string that is being processed.
|
|
+
|
|
+Arguments:
|
|
+ ptr pointer to possible newline
|
|
+ endptr pointer to the end of the string
|
|
+ lenptr where to return the length
|
|
+ utf8 TRUE if in utf8 mode
|
|
+
|
|
+Returns: TRUE or FALSE
|
|
+*/
|
|
+
|
|
+BOOL
|
|
+_pcre_is_newline(const uschar *ptr, const uschar *endptr, int *lenptr,
|
|
+ BOOL utf8)
|
|
+{
|
|
+int c;
|
|
+if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
|
|
+switch(c)
|
|
+ {
|
|
+ case 0x000a: /* LF */
|
|
+ case 0x000b: /* VT */
|
|
+ case 0x000c: *lenptr = 1; return TRUE; /* FF */
|
|
+ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1;
|
|
+ return TRUE; /* CR */
|
|
+ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
|
|
+ case 0x2028: /* LS */
|
|
+ case 0x2029: *lenptr = 3; return TRUE; /* PS */
|
|
+ default: return FALSE;
|
|
+ }
|
|
+}
|
|
+
|
|
+
|
|
+
|
|
+/*************************************************
|
|
+* Check for newline at previous position *
|
|
+*************************************************/
|
|
+
|
|
+/* It is guaranteed that the initial value of ptr is greater than the start of
|
|
+the string that is being processed.
|
|
+
|
|
+Arguments:
|
|
+ ptr pointer to possible newline
|
|
+ startptr pointer to the start of the string
|
|
+ lenptr where to return the length
|
|
+ utf8 TRUE if in utf8 mode
|
|
+
|
|
+Returns: TRUE or FALSE
|
|
+*/
|
|
+
|
|
+BOOL
|
|
+_pcre_was_newline(const uschar *ptr, const uschar *startptr, int *lenptr,
|
|
+ BOOL utf8)
|
|
+{
|
|
+int c;
|
|
+ptr--;
|
|
+if (utf8)
|
|
+ {
|
|
+ BACKCHAR(ptr);
|
|
+ GETCHAR(c, ptr);
|
|
+ }
|
|
+else c = *ptr;
|
|
+switch(c)
|
|
+ {
|
|
+ case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1;
|
|
+ return TRUE; /* LF */
|
|
+ case 0x000b: /* VT */
|
|
+ case 0x000c: /* FF */
|
|
+ case 0x000d: *lenptr = 1; return TRUE; /* CR */
|
|
+ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */
|
|
+ case 0x2028: /* LS */
|
|
+ case 0x2029: *lenptr = 3; return TRUE; /* PS */
|
|
+ default: return FALSE;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* End of pcre_newline.c */
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_printint.src ./pcrelib/pcre_printint.src
|
|
--- ../pcre.orig/pcrelib/pcre_printint.src Wed Aug 30 22:00:22 2006
|
|
+++ ./pcrelib/pcre_printint.src Fri Feb 9 22:31:20 2007
|
|
@@ -49,9 +49,19 @@
|
|
compiled regex for debugging purposes. */
|
|
|
|
|
|
+/* Macro that decides whether a character should be output as a literal or in
|
|
+hexadecimal. We don't use isprint() because that can vary from system to system
|
|
+(even without the use of locales) and we want the output always to be the same,
|
|
+for testing purposes. This macro is used in pcretest as well as in this file. */
|
|
+
|
|
+#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
|
|
+
|
|
+/* The table of operator names. */
|
|
+
|
|
static const char *OP_names[] = { OP_NAME_LIST };
|
|
|
|
|
|
+
|
|
/*************************************************
|
|
* Print single- or multi-byte character *
|
|
*************************************************/
|
|
@@ -63,7 +73,7 @@
|
|
|
|
if (!utf8 || (c & 0xc0) != 0xc0)
|
|
{
|
|
- if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
|
|
+ if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
|
|
return 0;
|
|
}
|
|
else
|
|
@@ -160,16 +170,6 @@
|
|
|
|
fprintf(f, "%3d ", (int)(code - codestart));
|
|
|
|
- if (*code >= OP_BRA)
|
|
- {
|
|
- if (*code - OP_BRA > EXTRACT_BASIC_MAX)
|
|
- fprintf(f, "%3d Bra extra\n", GET(code, 1));
|
|
- else
|
|
- fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA);
|
|
- code += _pcre_OP_lengths[OP_BRA];
|
|
- continue;
|
|
- }
|
|
-
|
|
switch(*code)
|
|
{
|
|
case OP_END:
|
|
@@ -203,6 +203,14 @@
|
|
fprintf(f, "\n");
|
|
continue;
|
|
|
|
+ case OP_CBRA:
|
|
+ case OP_SCBRA:
|
|
+ fprintf(f, "%3d %s %d", GET(code, 1), OP_names[*code],
|
|
+ GET2(code, 1+LINK_SIZE));
|
|
+ break;
|
|
+
|
|
+ case OP_BRA:
|
|
+ case OP_SBRA:
|
|
case OP_KETRMAX:
|
|
case OP_KETRMIN:
|
|
case OP_ALT:
|
|
@@ -213,33 +221,45 @@
|
|
case OP_ASSERTBACK_NOT:
|
|
case OP_ONCE:
|
|
case OP_COND:
|
|
+ case OP_SCOND:
|
|
case OP_REVERSE:
|
|
fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
|
|
break;
|
|
|
|
- case OP_BRANUMBER:
|
|
- printf("%3d %s", GET2(code, 1), OP_names[*code]);
|
|
+ case OP_CREF:
|
|
+ fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
|
|
break;
|
|
|
|
- case OP_CREF:
|
|
- if (GET2(code, 1) == CREF_RECURSE)
|
|
- fprintf(f, " Cond recurse");
|
|
+ case OP_RREF:
|
|
+ c = GET2(code, 1);
|
|
+ if (c == RREF_ANY)
|
|
+ fprintf(f, " Cond recurse any");
|
|
else
|
|
- fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
|
|
+ fprintf(f, " Cond recurse %d", c);
|
|
+ break;
|
|
+
|
|
+ case OP_DEF:
|
|
+ fprintf(f, " Cond def");
|
|
break;
|
|
|
|
case OP_STAR:
|
|
case OP_MINSTAR:
|
|
+ case OP_POSSTAR:
|
|
case OP_PLUS:
|
|
case OP_MINPLUS:
|
|
+ case OP_POSPLUS:
|
|
case OP_QUERY:
|
|
case OP_MINQUERY:
|
|
+ case OP_POSQUERY:
|
|
case OP_TYPESTAR:
|
|
case OP_TYPEMINSTAR:
|
|
+ case OP_TYPEPOSSTAR:
|
|
case OP_TYPEPLUS:
|
|
case OP_TYPEMINPLUS:
|
|
+ case OP_TYPEPOSPLUS:
|
|
case OP_TYPEQUERY:
|
|
case OP_TYPEMINQUERY:
|
|
+ case OP_TYPEPOSQUERY:
|
|
fprintf(f, " ");
|
|
if (*code >= OP_TYPESTAR)
|
|
{
|
|
@@ -257,17 +277,20 @@
|
|
case OP_EXACT:
|
|
case OP_UPTO:
|
|
case OP_MINUPTO:
|
|
+ case OP_POSUPTO:
|
|
fprintf(f, " ");
|
|
extra = print_char(f, code+3, utf8);
|
|
fprintf(f, "{");
|
|
- if (*code != OP_EXACT) fprintf(f, ",");
|
|
+ if (*code != OP_EXACT) fprintf(f, "0,");
|
|
fprintf(f, "%d}", GET2(code,1));
|
|
if (*code == OP_MINUPTO) fprintf(f, "?");
|
|
+ else if (*code == OP_POSUPTO) fprintf(f, "+");
|
|
break;
|
|
|
|
case OP_TYPEEXACT:
|
|
case OP_TYPEUPTO:
|
|
case OP_TYPEMINUPTO:
|
|
+ case OP_TYPEPOSUPTO:
|
|
fprintf(f, " %s", OP_names[code[3]]);
|
|
if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
|
|
{
|
|
@@ -278,20 +301,26 @@
|
|
if (*code != OP_TYPEEXACT) fprintf(f, "0,");
|
|
fprintf(f, "%d}", GET2(code,1));
|
|
if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
|
|
+ else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
|
|
break;
|
|
|
|
case OP_NOT:
|
|
- if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
|
|
+ c = code[1];
|
|
+ if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
|
|
else fprintf(f, " [^\\x%02x]", c);
|
|
break;
|
|
|
|
case OP_NOTSTAR:
|
|
case OP_NOTMINSTAR:
|
|
+ case OP_NOTPOSSTAR:
|
|
case OP_NOTPLUS:
|
|
case OP_NOTMINPLUS:
|
|
+ case OP_NOTPOSPLUS:
|
|
case OP_NOTQUERY:
|
|
case OP_NOTMINQUERY:
|
|
- if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
|
|
+ case OP_NOTPOSQUERY:
|
|
+ c = code[1];
|
|
+ if (PRINTABLE(c)) fprintf(f, " [^%c]", c);
|
|
else fprintf(f, " [^\\x%02x]", c);
|
|
fprintf(f, "%s", OP_names[*code]);
|
|
break;
|
|
@@ -299,11 +328,14 @@
|
|
case OP_NOTEXACT:
|
|
case OP_NOTUPTO:
|
|
case OP_NOTMINUPTO:
|
|
- if (isprint(c = code[3])) fprintf(f, " [^%c]{", c);
|
|
+ case OP_NOTPOSUPTO:
|
|
+ c = code[3];
|
|
+ if (PRINTABLE(c)) fprintf(f, " [^%c]{", c);
|
|
else fprintf(f, " [^\\x%02x]{", c);
|
|
if (*code != OP_NOTEXACT) fprintf(f, "0,");
|
|
fprintf(f, "%d}", GET2(code,1));
|
|
if (*code == OP_NOTMINUPTO) fprintf(f, "?");
|
|
+ else if (*code == OP_NOTPOSUPTO) fprintf(f, "+");
|
|
break;
|
|
|
|
case OP_RECURSE:
|
|
@@ -363,12 +395,14 @@
|
|
for (j = i+1; j < 256; j++)
|
|
if ((ccode[j/8] & (1 << (j&7))) == 0) break;
|
|
if (i == '-' || i == ']') fprintf(f, "\\");
|
|
- if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i);
|
|
+ if (PRINTABLE(i)) fprintf(f, "%c", i);
|
|
+ else fprintf(f, "\\x%02x", i);
|
|
if (--j > i)
|
|
{
|
|
if (j != i + 1) fprintf(f, "-");
|
|
if (j == '-' || j == ']') fprintf(f, "\\");
|
|
- if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j);
|
|
+ if (PRINTABLE(j)) fprintf(f, "%c", j);
|
|
+ else fprintf(f, "\\x%02x", j);
|
|
}
|
|
i = j;
|
|
}
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_scanner.cc ./pcrelib/pcre_scanner.cc
|
|
--- ../pcre.orig/pcrelib/pcre_scanner.cc Mon Mar 6 22:45:57 2006
|
|
+++ ./pcrelib/pcre_scanner.cc Fri Feb 9 22:31:20 2007
|
|
@@ -43,6 +43,7 @@
|
|
input_(data_),
|
|
skip_(NULL),
|
|
should_skip_(false),
|
|
+ skip_repeat_(false),
|
|
save_comments_(false),
|
|
comments_(NULL),
|
|
comments_offset_(0) {
|
|
@@ -53,6 +54,7 @@
|
|
input_(data_),
|
|
skip_(NULL),
|
|
should_skip_(false),
|
|
+ skip_repeat_(false),
|
|
save_comments_(false),
|
|
comments_(NULL),
|
|
comments_offset_(0) {
|
|
@@ -63,15 +65,31 @@
|
|
delete comments_;
|
|
}
|
|
|
|
+void Scanner::SetSkipExpression(const char* re) {
|
|
+ delete skip_;
|
|
+ if (re != NULL) {
|
|
+ skip_ = new RE(re);
|
|
+ should_skip_ = true;
|
|
+ skip_repeat_ = true;
|
|
+ ConsumeSkip();
|
|
+ } else {
|
|
+ skip_ = NULL;
|
|
+ should_skip_ = false;
|
|
+ skip_repeat_ = false;
|
|
+ }
|
|
+}
|
|
+
|
|
void Scanner::Skip(const char* re) {
|
|
delete skip_;
|
|
if (re != NULL) {
|
|
skip_ = new RE(re);
|
|
should_skip_ = true;
|
|
+ skip_repeat_ = false;
|
|
ConsumeSkip();
|
|
} else {
|
|
skip_ = NULL;
|
|
should_skip_ = false;
|
|
+ skip_repeat_ = false;
|
|
}
|
|
}
|
|
|
|
@@ -118,19 +136,22 @@
|
|
|
|
// helper function to consume *skip_ and honour save_comments_
|
|
void Scanner::ConsumeSkip() {
|
|
+ const char* start_data = input_.data();
|
|
+ while (skip_->Consume(&input_)) {
|
|
+ if (!skip_repeat_) {
|
|
+ // Only one skip allowed.
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
if (save_comments_) {
|
|
- if (NULL == comments_) {
|
|
+ if (comments_ == NULL) {
|
|
comments_ = new vector<StringPiece>;
|
|
}
|
|
- const char *start_data = input_.data();
|
|
- skip_->Consume(&input_);
|
|
// already pointing one past end, so no need to +1
|
|
int length = input_.data() - start_data;
|
|
if (length > 0) {
|
|
comments_->push_back(StringPiece(start_data, length));
|
|
}
|
|
- } else {
|
|
- skip_->Consume(&input_);
|
|
}
|
|
}
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_scanner.h ./pcrelib/pcre_scanner.h
|
|
--- ../pcre.orig/pcrelib/pcre_scanner.h Tue Aug 9 01:59:00 2005
|
|
+++ ./pcrelib/pcre_scanner.h Fri Feb 9 22:31:20 2007
|
|
@@ -36,7 +36,7 @@
|
|
// Scanner scanner(input);
|
|
// string var;
|
|
// int number;
|
|
-// scanner.Skip("\\s+"); // Skip any white space we encounter
|
|
+// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
|
|
// while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
|
|
// ...;
|
|
// }
|
|
@@ -90,10 +90,16 @@
|
|
// skipped. For example, a programming language scanner would use
|
|
// a skip RE that matches white space and comments.
|
|
//
|
|
- // scanner.Skip("(\\s|//.*|/[*](.|\n)*?[*]/)*");
|
|
+ // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
|
|
+ //
|
|
+ // Skipping repeats as long as it succeeds. We used to let people do
|
|
+ // this by writing "(...)*" in the regular expression, but that added
|
|
+ // up to lots of recursive calls within the pcre library, so now we
|
|
+ // control repetition explicitly via the function call API.
|
|
//
|
|
// You can pass NULL for "re" if you do not want any data to be skipped.
|
|
- void Skip(const char* re);
|
|
+ void Skip(const char* re); // DEPRECATED; does *not* repeat
|
|
+ void SetSkipExpression(const char* re);
|
|
|
|
// Temporarily pause "skip"ing. This
|
|
// Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
|
|
@@ -109,12 +115,13 @@
|
|
/***** Special wrappers around SetSkip() for some common idioms *****/
|
|
|
|
// Arranges to skip whitespace, C comments, C++ comments.
|
|
- // The overall RE is a repeated disjunction of the following REs:
|
|
+ // The overall RE is a disjunction of the following REs:
|
|
// \\s whitespace
|
|
// //.*\n C++ comment
|
|
// /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x)
|
|
+ // We get repetition via the semantics of SetSkipExpression, not by using *
|
|
void SkipCXXComments() {
|
|
- Skip("((\\s|//.*\n|/[*](.|\n)*?[*]/)*)");
|
|
+ SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
|
|
}
|
|
|
|
void set_save_comments(bool comments) {
|
|
@@ -143,6 +150,7 @@
|
|
StringPiece input_; // Unprocessed input
|
|
RE* skip_; // If non-NULL, RE for skipping input
|
|
bool should_skip_; // If true, use skip_
|
|
+ bool skip_repeat_; // If true, repeat skip_ as long as it works
|
|
bool save_comments_; // If true, aggregate the skip expression
|
|
|
|
// the skipped comments
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_scanner_unittest.cc ./pcrelib/pcre_scanner_unittest.cc
|
|
--- ../pcre.orig/pcrelib/pcre_scanner_unittest.cc Mon Mar 6 22:45:57 2006
|
|
+++ ./pcrelib/pcre_scanner_unittest.cc Fri Feb 9 22:31:20 2007
|
|
@@ -33,10 +33,13 @@
|
|
// functionality.
|
|
|
|
#include <stdio.h>
|
|
+#include <string>
|
|
#include <vector>
|
|
#include <pcre_stringpiece.h>
|
|
#include <pcre_scanner.h>
|
|
|
|
+#define FLAGS_unittest_stack_size 49152
|
|
+
|
|
// Dies with a fatal error if the two values are not equal.
|
|
#define CHECK_EQ(a, b) do { \
|
|
if ( (a) != (b) ) { \
|
|
@@ -116,8 +119,31 @@
|
|
comments.resize(0);
|
|
}
|
|
|
|
+static void TestBigComment() {
|
|
+ string input;
|
|
+ for (int i = 0; i < 1024; ++i) {
|
|
+ char buf[1024];
|
|
+ snprintf(buf, sizeof(buf), " # Comment %d\n", i);
|
|
+ input += buf;
|
|
+ }
|
|
+ input += "name = value;\n";
|
|
+
|
|
+ Scanner s(input.c_str());
|
|
+ s.SetSkipExpression("\\s+|#.*\n");
|
|
+
|
|
+ string name;
|
|
+ string value;
|
|
+ s.Consume("(\\w+) = (\\w+);", &name, &value);
|
|
+ CHECK_EQ(name, "name");
|
|
+ CHECK_EQ(value, "value");
|
|
+}
|
|
+
|
|
+// TODO: also test scanner and big-comment in a thread with a
|
|
+// small stack size
|
|
+
|
|
int main(int argc, char** argv) {
|
|
TestScanner();
|
|
+ TestBigComment();
|
|
|
|
// Done
|
|
printf("OK\n");
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_study.c ./pcrelib/pcre_study.c
|
|
--- ../pcre.orig/pcrelib/pcre_study.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_study.c Fri Feb 9 22:31:20 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -45,6 +45,11 @@
|
|
#include "pcre_internal.h"
|
|
|
|
|
|
+/* Returns from set_start_bits() */
|
|
+
|
|
+enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
|
|
+
|
|
+
|
|
/*************************************************
|
|
* Set a bit and maybe its alternate case *
|
|
*************************************************/
|
|
@@ -72,12 +77,16 @@
|
|
|
|
|
|
/*************************************************
|
|
-* Create bitmap of starting chars *
|
|
+* Create bitmap of starting bytes *
|
|
*************************************************/
|
|
|
|
-/* This function scans a compiled unanchored expression and attempts to build a
|
|
-bitmap of the set of initial characters. If it can't, it returns FALSE. As time
|
|
-goes by, we may be able to get more clever at doing this.
|
|
+/* This function scans a compiled unanchored expression recursively and
|
|
+attempts to build a bitmap of the set of possible starting bytes. As time goes
|
|
+by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
|
|
+useful for parenthesized groups in patterns such as (a*)b where the group
|
|
+provides some optional starting bytes but scanning must continue at the outer
|
|
+level to find at least one mandatory byte. At the outermost level, this
|
|
+function fails unless the result is SSB_DONE.
|
|
|
|
Arguments:
|
|
code points to an expression
|
|
@@ -86,14 +95,17 @@
|
|
utf8 TRUE if in UTF-8 mode
|
|
cd the block with char table pointers
|
|
|
|
-Returns: TRUE if table built, FALSE otherwise
|
|
+Returns: SSB_FAIL => Failed to find any starting bytes
|
|
+ SSB_DONE => Found mandatory starting bytes
|
|
+ SSB_CONTINUE => Found optional starting bytes
|
|
*/
|
|
|
|
-static BOOL
|
|
+static int
|
|
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
|
|
BOOL utf8, compile_data *cd)
|
|
{
|
|
register int c;
|
|
+int yield = SSB_DONE;
|
|
|
|
#if 0
|
|
/* ========================================================================= */
|
|
@@ -114,36 +126,60 @@
|
|
|
|
do
|
|
{
|
|
- const uschar *tcode = code + 1 + LINK_SIZE;
|
|
+ const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
|
|
BOOL try_next = TRUE;
|
|
|
|
- while (try_next)
|
|
+ while (try_next) /* Loop for items in this branch */
|
|
{
|
|
- /* If a branch starts with a bracket or a positive lookahead assertion,
|
|
- recurse to set bits from within them. That's all for this branch. */
|
|
-
|
|
- if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
|
|
+ int rc;
|
|
+ switch(*tcode)
|
|
{
|
|
- if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
|
|
- return FALSE;
|
|
- try_next = FALSE;
|
|
- }
|
|
+ /* Fail if we reach something we don't understand */
|
|
|
|
- else switch(*tcode)
|
|
- {
|
|
default:
|
|
- return FALSE;
|
|
+ return SSB_FAIL;
|
|
|
|
- /* Skip over callout */
|
|
+ /* If we hit a bracket or a positive lookahead assertion, recurse to set
|
|
+ bits from within the subpattern. If it can't find anything, we have to
|
|
+ give up. If it finds some mandatory character(s), we are done for this
|
|
+ branch. Otherwise, carry on scanning after the subpattern. */
|
|
+
|
|
+ case OP_BRA:
|
|
+ case OP_SBRA:
|
|
+ case OP_CBRA:
|
|
+ case OP_SCBRA:
|
|
+ case OP_ONCE:
|
|
+ case OP_ASSERT:
|
|
+ rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
|
|
+ if (rc == SSB_FAIL) return SSB_FAIL;
|
|
+ if (rc == SSB_DONE) try_next = FALSE; else
|
|
+ {
|
|
+ do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
|
|
+ tcode += 1 + LINK_SIZE;
|
|
+ }
|
|
+ break;
|
|
|
|
- case OP_CALLOUT:
|
|
- tcode += 2 + 2*LINK_SIZE;
|
|
+ /* If we hit ALT or KET, it means we haven't found anything mandatory in
|
|
+ this branch, though we might have found something optional. For ALT, we
|
|
+ continue with the next alternative, but we have to arrange that the final
|
|
+ result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
|
|
+ return SSB_CONTINUE: if this is the top level, that indicates failure,
|
|
+ but after a nested subpattern, it causes scanning to continue. */
|
|
+
|
|
+ case OP_ALT:
|
|
+ yield = SSB_CONTINUE;
|
|
+ try_next = FALSE;
|
|
break;
|
|
|
|
- /* Skip over extended extraction bracket number */
|
|
+ case OP_KET:
|
|
+ case OP_KETRMAX:
|
|
+ case OP_KETRMIN:
|
|
+ return SSB_CONTINUE;
|
|
|
|
- case OP_BRANUMBER:
|
|
- tcode += 3;
|
|
+ /* Skip over callout */
|
|
+
|
|
+ case OP_CALLOUT:
|
|
+ tcode += 2 + 2*LINK_SIZE;
|
|
break;
|
|
|
|
/* Skip over lookbehind and negative lookahead assertions */
|
|
@@ -152,7 +188,7 @@
|
|
case OP_ASSERTBACK:
|
|
case OP_ASSERTBACK_NOT:
|
|
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
|
|
- tcode += 1+LINK_SIZE;
|
|
+ tcode += 1 + LINK_SIZE;
|
|
break;
|
|
|
|
/* Skip over an option setting, changing the caseless flag */
|
|
@@ -166,27 +202,30 @@
|
|
|
|
case OP_BRAZERO:
|
|
case OP_BRAMINZERO:
|
|
- if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
|
|
- return FALSE;
|
|
+ if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
|
|
+ return SSB_FAIL;
|
|
/* =========================================================================
|
|
See the comment at the head of this function concerning the next line,
|
|
which was an old fudge for the benefit of OS/2.
|
|
dummy = 1;
|
|
========================================================================= */
|
|
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
|
|
- tcode += 1+LINK_SIZE;
|
|
+ tcode += 1 + LINK_SIZE;
|
|
break;
|
|
|
|
/* Single-char * or ? sets the bit and tries the next item */
|
|
|
|
case OP_STAR:
|
|
case OP_MINSTAR:
|
|
+ case OP_POSSTAR:
|
|
case OP_QUERY:
|
|
case OP_MINQUERY:
|
|
+ case OP_POSQUERY:
|
|
set_bit(start_bits, tcode[1], caseless, cd);
|
|
tcode += 2;
|
|
#ifdef SUPPORT_UTF8
|
|
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
|
|
+ if (utf8 && tcode[-1] >= 0xc0)
|
|
+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
|
|
#endif
|
|
break;
|
|
|
|
@@ -194,10 +233,12 @@
|
|
|
|
case OP_UPTO:
|
|
case OP_MINUPTO:
|
|
+ case OP_POSUPTO:
|
|
set_bit(start_bits, tcode[3], caseless, cd);
|
|
tcode += 4;
|
|
#ifdef SUPPORT_UTF8
|
|
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
|
|
+ if (utf8 && tcode[-1] >= 0xc0)
|
|
+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
|
|
#endif
|
|
break;
|
|
|
|
@@ -210,6 +251,7 @@
|
|
case OP_CHARNC:
|
|
case OP_PLUS:
|
|
case OP_MINPLUS:
|
|
+ case OP_POSPLUS:
|
|
set_bit(start_bits, tcode[1], caseless, cd);
|
|
try_next = FALSE;
|
|
break;
|
|
@@ -283,16 +325,19 @@
|
|
|
|
case OP_TYPEUPTO:
|
|
case OP_TYPEMINUPTO:
|
|
+ case OP_TYPEPOSUPTO:
|
|
tcode += 2; /* Fall through */
|
|
|
|
case OP_TYPESTAR:
|
|
case OP_TYPEMINSTAR:
|
|
+ case OP_TYPEPOSSTAR:
|
|
case OP_TYPEQUERY:
|
|
case OP_TYPEMINQUERY:
|
|
+ case OP_TYPEPOSQUERY:
|
|
switch(tcode[1])
|
|
{
|
|
case OP_ANY:
|
|
- return FALSE;
|
|
+ return SSB_FAIL;
|
|
|
|
case OP_NOT_DIGIT:
|
|
for (c = 0; c < 32; c++)
|
|
@@ -418,7 +463,7 @@
|
|
code += GET(code, 1); /* Advance to next branch */
|
|
}
|
|
while (*code == OP_ALT);
|
|
-return TRUE;
|
|
+return yield;
|
|
}
|
|
|
|
|
|
@@ -492,8 +537,8 @@
|
|
/* See if we can find a fixed set of initial characters for the pattern. */
|
|
|
|
memset(start_bits, 0, 32 * sizeof(uschar));
|
|
-if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
|
|
- (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
|
|
+if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
|
|
+ (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
|
|
|
|
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
|
|
the latter, which is pointed to by the former, which may also get additional
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_tables.c ./pcrelib/pcre_tables.c
|
|
--- ../pcre.orig/pcrelib/pcre_tables.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_tables.c Fri Feb 9 22:31:20 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -72,9 +72,8 @@
|
|
const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
|
|
const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
|
|
|
-/* Table of the number of extra characters, indexed by the first character
|
|
-masked with 0x3f. The highest number for a valid UTF-8 character is in fact
|
|
-0x3d. */
|
|
+/* Table of the number of extra bytes, indexed by the first byte masked with
|
|
+0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
|
|
|
|
const uschar _pcre_utf8_table4[] = {
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
@@ -89,6 +88,7 @@
|
|
{ "Any", PT_ANY, 0 },
|
|
{ "Arabic", PT_SC, ucp_Arabic },
|
|
{ "Armenian", PT_SC, ucp_Armenian },
|
|
+ { "Balinese", PT_SC, ucp_Balinese },
|
|
{ "Bengali", PT_SC, ucp_Bengali },
|
|
{ "Bopomofo", PT_SC, ucp_Bopomofo },
|
|
{ "Braille", PT_SC, ucp_Braille },
|
|
@@ -104,6 +104,7 @@
|
|
{ "Common", PT_SC, ucp_Common },
|
|
{ "Coptic", PT_SC, ucp_Coptic },
|
|
{ "Cs", PT_PC, ucp_Cs },
|
|
+ { "Cuneiform", PT_SC, ucp_Cuneiform },
|
|
{ "Cypriot", PT_SC, ucp_Cypriot },
|
|
{ "Cyrillic", PT_SC, ucp_Cyrillic },
|
|
{ "Deseret", PT_SC, ucp_Deseret },
|
|
@@ -146,6 +147,7 @@
|
|
{ "N", PT_GC, ucp_N },
|
|
{ "Nd", PT_PC, ucp_Nd },
|
|
{ "New_Tai_Lue", PT_SC, ucp_New_Tai_Lue },
|
|
+ { "Nko", PT_SC, ucp_Nko },
|
|
{ "Nl", PT_PC, ucp_Nl },
|
|
{ "No", PT_PC, ucp_No },
|
|
{ "Ogham", PT_SC, ucp_Ogham },
|
|
@@ -158,6 +160,8 @@
|
|
{ "Pd", PT_PC, ucp_Pd },
|
|
{ "Pe", PT_PC, ucp_Pe },
|
|
{ "Pf", PT_PC, ucp_Pf },
|
|
+ { "Phags_Pa", PT_SC, ucp_Phags_Pa },
|
|
+ { "Phoenician", PT_SC, ucp_Phoenician },
|
|
{ "Pi", PT_PC, ucp_Pi },
|
|
{ "Po", PT_PC, ucp_Po },
|
|
{ "Ps", PT_PC, ucp_Ps },
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_ucp_searchfuncs.c ./pcrelib/pcre_ucp_searchfuncs.c
|
|
--- ../pcre.orig/pcrelib/pcre_ucp_searchfuncs.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_ucp_searchfuncs.c Fri Feb 9 22:31:20 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -131,11 +131,11 @@
|
|
Arguments:
|
|
c the character value
|
|
|
|
-Returns: the other case or -1 if none
|
|
+Returns: the other case or NOTACHAR if none
|
|
*/
|
|
|
|
-int
|
|
-_pcre_ucp_othercase(const int c)
|
|
+unsigned int
|
|
+_pcre_ucp_othercase(const unsigned int c)
|
|
{
|
|
int bot = 0;
|
|
int top = sizeof(ucp_table)/sizeof(cnode);
|
|
@@ -161,14 +161,14 @@
|
|
}
|
|
}
|
|
|
|
-/* Found an entry in the table. Return -1 for a range entry. Otherwise return
|
|
-the other case if there is one, else -1. */
|
|
+/* Found an entry in the table. Return NOTACHAR for a range entry. Otherwise
|
|
+return the other case if there is one, else NOTACHAR. */
|
|
|
|
-if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return -1;
|
|
+if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return NOTACHAR;
|
|
|
|
offset = ucp_table[mid].f1 & f1_casemask;
|
|
if ((offset & f1_caseneg) != 0) offset |= f1_caseneg;
|
|
-return (offset == 0)? -1 : c + offset;
|
|
+return (offset == 0)? NOTACHAR : c + offset;
|
|
}
|
|
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_valid_utf8.c ./pcrelib/pcre_valid_utf8.c
|
|
--- ../pcre.orig/pcrelib/pcre_valid_utf8.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_valid_utf8.c Fri Feb 9 22:31:20 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -79,7 +79,7 @@
|
|
register int ab;
|
|
register int c = *p;
|
|
if (c < 128) continue;
|
|
- if ((c & 0xc0) != 0xc0) return p - string;
|
|
+ if (c < 0xc0) return p - string;
|
|
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
|
if (length < ab) return p - string;
|
|
length -= ab;
|
|
diff -ruN ../pcre.orig/pcrelib/pcre_version.c ./pcrelib/pcre_version.c
|
|
--- ../pcre.orig/pcrelib/pcre_version.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcre_version.c Fri Feb 9 22:31:20 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -49,16 +49,38 @@
|
|
* Return version string *
|
|
*************************************************/
|
|
|
|
+/* These macros are the standard way of turning unquoted text into C strings.
|
|
+They allow macros like PCRE_MAJOR to be defined without quotes, which is
|
|
+convenient for user programs that want to test its value. */
|
|
+
|
|
#define STRING(a) # a
|
|
#define XSTRING(s) STRING(s)
|
|
|
|
+/* A problem turned up with PCRE_PRERELEASE, which is defined empty for
|
|
+production releases. Originally, it was used naively in this code:
|
|
+
|
|
+ return XSTRING(PCRE_MAJOR)
|
|
+ "." XSTRING(PCRE_MINOR)
|
|
+ XSTRING(PCRE_PRERELEASE)
|
|
+ " " XSTRING(PCRE_DATE);
|
|
+
|
|
+However, when PCRE_PRERELEASE is empty, this leads to an attempted expansion of
|
|
+STRING(). The C standard states: "If (before argument substitution) any
|
|
+argument consists of no preprocessing tokens, the behavior is undefined." It
|
|
+turns out that gcc treats this case as a single empty string - which is what we
|
|
+really want - but Visual C grumbles about the lack of an argument for the
|
|
+macro. Unfortunately, both are within their rights. To cope with both ways of
|
|
+handling this, I had to resort to some messy hackery that does a test at run time.
|
|
+I could find no way of detecting that a macro is defined as an empty string at
|
|
+pre-processor time. This hack uses a standard trick for avoiding calling
|
|
+the STRING macro with an empty argument when doing the test. */
|
|
+
|
|
PCRE_DATA_SCOPE const char *
|
|
pcre_version(void)
|
|
{
|
|
-return XSTRING(PCRE_MAJOR)
|
|
- "." XSTRING(PCRE_MINOR)
|
|
- XSTRING(PCRE_PRERELEASE)
|
|
- " " XSTRING(PCRE_DATE);
|
|
+return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)?
|
|
+ XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) :
|
|
+ XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE);
|
|
}
|
|
|
|
/* End of pcre_version.c */
|
|
diff -ruN ../pcre.orig/pcrelib/pcrecpp.cc ./pcrelib/pcrecpp.cc
|
|
--- ../pcre.orig/pcrelib/pcrecpp.cc Wed Aug 30 22:00:22 2006
|
|
+++ ./pcrelib/pcrecpp.cc Fri Feb 9 22:31:20 2007
|
|
@@ -61,7 +61,7 @@
|
|
// If the user doesn't ask for any options, we just use this one
|
|
static RE_Options default_options;
|
|
|
|
-void RE::Init(const char* pat, const RE_Options* options) {
|
|
+void RE::Init(const string& pat, const RE_Options* options) {
|
|
pattern_ = pat;
|
|
if (options == NULL) {
|
|
options_ = default_options;
|
|
@@ -78,7 +78,7 @@
|
|
// conservative in that it may treat some "simple" patterns
|
|
// as "complex" (e.g., if the vertical bar is in a character
|
|
// class or is escaped). But it seems good enough.
|
|
- if (strchr(pat, '|') == NULL) {
|
|
+ if (strchr(pat.c_str(), '|') == NULL) {
|
|
// Simple pattern: we can use position-based checks to perform
|
|
// fully anchored matches
|
|
re_full_ = re_partial_;
|
|
@@ -89,12 +89,18 @@
|
|
}
|
|
}
|
|
|
|
-RE::~RE() {
|
|
+void RE::Cleanup() {
|
|
if (re_full_ != NULL && re_full_ != re_partial_) (*pcre_free)(re_full_);
|
|
if (re_partial_ != NULL) (*pcre_free)(re_partial_);
|
|
if (error_ != &empty_string) delete error_;
|
|
}
|
|
|
|
+
|
|
+RE::~RE() {
|
|
+ Cleanup();
|
|
+}
|
|
+
|
|
+
|
|
pcre* RE::Compile(Anchor anchor) {
|
|
// First, convert RE_Options into pcre options
|
|
int pcre_options = 0;
|
|
@@ -424,6 +430,34 @@
|
|
return Rewrite(out, rewrite, text, vec, matches);
|
|
}
|
|
|
|
+/*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
|
|
+ string result;
|
|
+
|
|
+ // Escape any ascii character not in [A-Za-z_0-9].
|
|
+ //
|
|
+ // Note that it's legal to escape a character even if it has no
|
|
+ // special meaning in a regular expression -- so this function does
|
|
+ // that. (This also makes it identical to the perl function of the
|
|
+ // same name; see `perldoc -f quotemeta`.)
|
|
+ for (int ii = 0; ii < unquoted.size(); ++ii) {
|
|
+ // Note that using 'isalnum' here raises the benchmark time from
|
|
+ // 32ns to 58ns:
|
|
+ if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
|
|
+ (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
|
|
+ (unquoted[ii] < '0' || unquoted[ii] > '9') &&
|
|
+ unquoted[ii] != '_' &&
|
|
+ // If this is part of a UTF8 or Latin1 character, we need
|
|
+ // to copy this byte without escaping. Experimentally this is
|
|
+ // what works correctly with the regexp library.
|
|
+ !(unquoted[ii] & 128)) {
|
|
+ result += '\\';
|
|
+ }
|
|
+ result += unquoted[ii];
|
|
+ }
|
|
+
|
|
+ return result;
|
|
+}
|
|
+
|
|
/***** Actual matching and rewriting code *****/
|
|
|
|
int RE::TryMatch(const StringPiece& text,
|
|
@@ -809,14 +843,14 @@
|
|
return parse_##name##_radix(str, n, dest, 0); \
|
|
}
|
|
|
|
-DEFINE_INTEGER_PARSERS(short);
|
|
-DEFINE_INTEGER_PARSERS(ushort);
|
|
-DEFINE_INTEGER_PARSERS(int);
|
|
-DEFINE_INTEGER_PARSERS(uint);
|
|
-DEFINE_INTEGER_PARSERS(long);
|
|
-DEFINE_INTEGER_PARSERS(ulong);
|
|
-DEFINE_INTEGER_PARSERS(longlong);
|
|
-DEFINE_INTEGER_PARSERS(ulonglong);
|
|
+DEFINE_INTEGER_PARSERS(short) /* */
|
|
+DEFINE_INTEGER_PARSERS(ushort) /* */
|
|
+DEFINE_INTEGER_PARSERS(int) /* Don't use semicolons after these */
|
|
+DEFINE_INTEGER_PARSERS(uint) /* statements because they can cause */
|
|
+DEFINE_INTEGER_PARSERS(long) /* compiler warnings if the checking */
|
|
+DEFINE_INTEGER_PARSERS(ulong) /* level is turned up high enough. */
|
|
+DEFINE_INTEGER_PARSERS(longlong) /* */
|
|
+DEFINE_INTEGER_PARSERS(ulonglong) /* */
|
|
|
|
#undef DEFINE_INTEGER_PARSERS
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/pcrecpp.h ./pcrelib/pcrecpp.h
|
|
--- ../pcre.orig/pcrelib/pcrecpp.h Mon Mar 6 22:45:57 2006
|
|
+++ ./pcrelib/pcrecpp.h Fri Feb 9 22:31:20 2007
|
|
@@ -112,6 +112,12 @@
|
|
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
|
// NULL (the corresponding matched sub-pattern is not copied)
|
|
//
|
|
+// CAVEAT: An optional sub-pattern that does not exist in the matched
|
|
+// string is assigned the empty string. Therefore, the following will
|
|
+// return false (because the empty string is not a valid number):
|
|
+// int number;
|
|
+// pcrecpp::RE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
|
+//
|
|
// -----------------------------------------------------------------------
|
|
// DO_MATCH
|
|
//
|
|
@@ -488,8 +494,25 @@
|
|
// pass in a string or a "const char*" wherever an "RE" is expected.
|
|
RE(const char* pat) { Init(pat, NULL); }
|
|
RE(const char *pat, const RE_Options& option) { Init(pat, &option); }
|
|
- RE(const string& pat) { Init(pat.c_str(), NULL); }
|
|
- RE(const string& pat, const RE_Options& option) { Init(pat.c_str(), &option); }
|
|
+ RE(const string& pat) { Init(pat, NULL); }
|
|
+ RE(const string& pat, const RE_Options& option) { Init(pat, &option); }
|
|
+
|
|
+ // Copy constructor & assignment - note that these are expensive
|
|
+ // because they recompile the expression.
|
|
+ RE(const RE& re) { Init(re.pattern_, &re.options_); }
|
|
+ const RE& operator=(const RE& re) { // Copy assignment; returns *this as a const reference
|
|
+ if (this != &re) { // Self-assignment leaves the object untouched
|
|
+ Cleanup(); // Private helper declared further down; presumably discards the current compiled state - confirm in pcrecpp.cc
|
|
+
|
|
+ // This is the code that originally came from Google
|
|
+ // Init(re.pattern_.c_str(), &re.options_);
|
|
+
|
|
+ // This is the replacement from Ari Pollak (Init() now takes a const string&, so pattern_ is passed directly)
|
|
+ Init(re.pattern_, &re.options_);
|
|
+ }
|
|
+ return *this;
|
|
+ }
|
|
+
|
|
|
|
~RE();
|
|
|
|
@@ -589,6 +612,15 @@
|
|
const StringPiece &text,
|
|
string *out) const;
|
|
|
|
+ // Escapes all potentially meaningful regexp characters in
|
|
+ // 'unquoted'. The returned string, used as a regular expression,
|
|
+ // will exactly match the original string. For example,
|
|
+ // 1.5-2.0?
|
|
+ // may become:
|
|
+ // 1\.5\-2\.0\?
|
|
+ static string QuoteMeta(const StringPiece& unquoted);
|
|
+
|
|
+
|
|
/***** Generic matching interface *****/
|
|
|
|
// Type of match (TODO: Should be restructured as part of RE_Options)
|
|
@@ -611,7 +643,8 @@
|
|
|
|
private:
|
|
|
|
- void Init(const char* pattern, const RE_Options* options);
|
|
+ void Init(const string& pattern, const RE_Options* options);
|
|
+ void Cleanup();
|
|
|
|
// Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
|
|
// pairs of integers for the beginning and end positions of matched
|
|
@@ -655,11 +688,6 @@
|
|
pcre* re_full_; // For full matches
|
|
pcre* re_partial_; // For partial matches
|
|
const string* error_; // Error indicator (or points to empty string)
|
|
-
|
|
- // Don't allow the default copy or assignment constructors --
|
|
- // they're expensive and too easy to do by accident.
|
|
- RE(const RE&);
|
|
- void operator=(const RE&);
|
|
};
|
|
|
|
} // namespace pcrecpp
|
|
diff -ruN ../pcre.orig/pcrelib/pcrecpp_unittest.cc ./pcrelib/pcrecpp_unittest.cc
|
|
--- ../pcre.orig/pcrelib/pcrecpp_unittest.cc Wed Aug 30 22:00:22 2006
|
|
+++ ./pcrelib/pcrecpp_unittest.cc Fri Feb 9 22:31:20 2007
|
|
@@ -1,4 +1,6 @@
|
|
-// Copyright (c) 2005, Google Inc.
|
|
+// -*- coding: utf-8 -*-
|
|
+//
|
|
+// Copyright (c) 2005 - 2006, Google Inc.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
@@ -445,6 +447,80 @@
|
|
CHECK(re4.FullMatch(text_bad) == false);
|
|
}
|
|
|
|
+// A meta-quoted string, interpreted as a pattern, should always match
|
|
+// the original unquoted string.
|
|
+static void TestQuoteMeta(string unquoted, RE_Options options = RE_Options()) {
|
|
+ string quoted = RE::QuoteMeta(unquoted);
|
|
+ RE re(quoted, options);
|
|
+ CHECK(re.FullMatch(unquoted));
|
|
+}
|
|
+
|
|
+// A string containing meaningful regexp characters, once meta-quoted,
|
|
+// should not match the other strings that the unquoted pattern would match.
|
|
+static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
|
|
+ RE_Options options = RE_Options()) {
|
|
+ string quoted = RE::QuoteMeta(unquoted);
|
|
+ RE re(quoted, options);
|
|
+ CHECK(!re.FullMatch(should_not_match));
|
|
+}
|
|
+
|
|
+// Tests that quoted meta characters match their original strings,
|
|
+// and that a few things that shouldn't match indeed do not.
|
|
+static void TestQuotaMetaSimple() {
|
|
+ TestQuoteMeta("foo");
|
|
+ TestQuoteMeta("foo.bar");
|
|
+ TestQuoteMeta("foo\\.bar");
|
|
+ TestQuoteMeta("[1-9]");
|
|
+ TestQuoteMeta("1.5-2.0?");
|
|
+ TestQuoteMeta("\\d");
|
|
+ TestQuoteMeta("Who doesn't like ice cream?");
|
|
+ TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
|
|
+ TestQuoteMeta("((?!)xxx).*yyy");
|
|
+ TestQuoteMeta("([");
|
|
+}
|
|
+
|
|
+static void TestQuoteMetaSimpleNegative() {
|
|
+ NegativeTestQuoteMeta("foo", "bar");
|
|
+ NegativeTestQuoteMeta("...", "bar");
|
|
+ NegativeTestQuoteMeta("\\.", ".");
|
|
+ NegativeTestQuoteMeta("\\.", "..");
|
|
+ NegativeTestQuoteMeta("(a)", "a");
|
|
+ NegativeTestQuoteMeta("(a|b)", "a");
|
|
+ NegativeTestQuoteMeta("(a|b)", "(a)");
|
|
+ NegativeTestQuoteMeta("(a|b)", "a|b");
|
|
+ NegativeTestQuoteMeta("[0-9]", "0");
|
|
+ NegativeTestQuoteMeta("[0-9]", "0-9");
|
|
+ NegativeTestQuoteMeta("[0-9]", "[9]");
|
|
+ NegativeTestQuoteMeta("((?!)xxx)", "xxx");
|
|
+}
|
|
+
|
|
+static void TestQuoteMetaLatin1() {
|
|
+ TestQuoteMeta("3\xb2 = 9");
|
|
+}
|
|
+
|
|
+static void TestQuoteMetaUtf8() {
|
|
+#ifdef SUPPORT_UTF8
|
|
+ TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", pcrecpp::UTF8());
|
|
+ TestQuoteMeta("xyz", pcrecpp::UTF8()); // No fancy utf8
|
|
+ TestQuoteMeta("\xc2\xb0", pcrecpp::UTF8()); // 2-byte utf8 (degree symbol)
|
|
+ TestQuoteMeta("27\xc2\xb0 degrees", pcrecpp::UTF8()); // As a middle character
|
|
+ TestQuoteMeta("\xe2\x80\xb3", pcrecpp::UTF8()); // 3-byte utf8 (double prime)
|
|
+ TestQuoteMeta("\xf0\x9d\x85\x9f", pcrecpp::UTF8()); // 4-byte utf8 (music note)
|
|
+ TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, but should still work
|
|
+ NegativeTestQuoteMeta("27\xc2\xb0", // 2-byte utf (degree symbol)
|
|
+ "27\\\xc2\\\xb0",
|
|
+ pcrecpp::UTF8());
|
|
+#endif
|
|
+}
|
|
+
|
|
+static void TestQuoteMetaAll() {
|
|
+ printf("Testing QuoteMeta\n");
|
|
+ TestQuotaMetaSimple();
|
|
+ TestQuoteMetaSimpleNegative();
|
|
+ TestQuoteMetaLatin1();
|
|
+ TestQuoteMetaUtf8();
|
|
+}
|
|
+
|
|
//
|
|
// Options tests contributed by
|
|
// Giuseppe Maxia, CTO, Stardata s.r.l.
|
|
@@ -667,6 +743,35 @@
|
|
Test_all_options();
|
|
}
|
|
|
|
+static void TestConstructors() {
|
|
+ printf("Testing constructors\n");
|
|
+ // The subject spans three lines, so the pattern needs the dotall option to match it.
|
|
+ RE_Options options;
|
|
+ options.set_dotall(true);
|
|
+ const char *str = "HELLO\n" "cruel\n" "world";
|
|
+
|
|
+ RE orig("HELLO.*world", options);
|
|
+ CHECK(orig.FullMatch(str));
|
|
+ // A copy-constructed RE must carry over both the pattern and the options.
|
|
+ RE copy1(orig);
|
|
+ CHECK(copy1.FullMatch(str));
|
|
+ // Assignment must replace a non-matching pattern with the assigned one.
|
|
+ RE copy2("not a match");
|
|
+ CHECK(!copy2.FullMatch(str));
|
|
+ copy2 = copy1;
|
|
+ CHECK(copy2.FullMatch(str));
|
|
+ copy2 = orig;
|
|
+ CHECK(copy2.FullMatch(str));
|
|
+
|
|
+ // Make sure when we assign to ourselves, nothing bad happens
|
|
+ orig = orig;
|
|
+ copy1 = copy1;
|
|
+ copy2 = copy2;
|
|
+ CHECK(orig.FullMatch(str));
|
|
+ CHECK(copy1.FullMatch(str));
|
|
+ CHECK(copy2.FullMatch(str));
|
|
+}
|
|
+
|
|
int main(int argc, char** argv) {
|
|
// Treat any flag as --help
|
|
if (argc > 1 && argv[1][0] == '-') {
|
|
@@ -985,11 +1090,14 @@
|
|
CHECK(RE("h.*o").PartialMatch("hello!"));
|
|
CHECK(RE("((((((((((((((((((((x))))))))))))))))))))").PartialMatch("x"));
|
|
|
|
+ /***** other tests *****/
|
|
+
|
|
RadixTests();
|
|
TestReplace();
|
|
TestExtract();
|
|
TestConsume();
|
|
TestFindAndConsume();
|
|
+ TestQuoteMetaAll();
|
|
TestMatchNumberPeculiarity();
|
|
|
|
// Check the pattern() accessor
|
|
@@ -1108,6 +1216,9 @@
|
|
if (getenv("VERBOSE_TEST") != NULL)
|
|
VERBOSE_TEST = true;
|
|
TestOptions();
|
|
+
|
|
+ // Test the constructors
|
|
+ TestConstructors();
|
|
|
|
// Done
|
|
printf("OK\n");
|
|
diff -ruN ../pcre.orig/pcrelib/pcregrep.c ./pcrelib/pcregrep.c
|
|
--- ../pcre.orig/pcrelib/pcregrep.c Wed Jan 3 21:08:37 2007
|
|
+++ ./pcrelib/pcregrep.c Tue Feb 27 04:31:14 2007
|
|
@@ -6,7 +6,7 @@
|
|
its pattern matching. On a Unix or Win32 system it can recurse into
|
|
directories.
|
|
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -56,7 +56,7 @@
|
|
|
|
typedef int BOOL;
|
|
|
|
-#define VERSION "4.3 01-Jun-2006"
|
|
+#define VERSION "4.4 29-Nov-2006"
|
|
#define MAX_PATTERN_COUNT 100
|
|
|
|
#if BUFSIZ > 8192
|
|
@@ -65,7 +65,6 @@
|
|
#define MBUFTHIRD 8192
|
|
#endif
|
|
|
|
-
|
|
/* Values for the "filenames" variable, which specifies options for file name
|
|
output. The order is important; it is assumed that a file name is wanted for
|
|
all values greater than FN_DEFAULT. */
|
|
@@ -83,6 +82,10 @@
|
|
#define PO_LINE_MATCH 0x0002
|
|
#define PO_FIXED_STRINGS 0x0004
|
|
|
|
+/* Line ending types */
|
|
+
|
|
+enum { EL_LF, EL_CR, EL_CRLF, EL_ANY };
|
|
+
|
|
|
|
|
|
/*************************************************
|
|
@@ -100,8 +103,7 @@
|
|
static const char *jfriedl_postfix = "";
|
|
#endif
|
|
|
|
-static int endlinebyte = '\n'; /* Last byte of endline sequence */
|
|
-static int endlineextra = 0; /* Extra bytes for endline sequence */
|
|
+static int endlinetype;
|
|
|
|
static char *colour_string = (char *)"1;31";
|
|
static char *colour_option = NULL;
|
|
@@ -142,6 +144,7 @@
|
|
static BOOL only_matching = FALSE;
|
|
static BOOL quiet = FALSE;
|
|
static BOOL silent = FALSE;
|
|
+static BOOL utf8 = FALSE;
|
|
|
|
/* Structure for options and list of them */
|
|
|
|
@@ -219,6 +222,16 @@
|
|
static const char *suffix[] = {
|
|
"", "\\b", ")$", ")$", "\\E", "\\E\\b", "\\E)$", "\\E)$" };
|
|
|
|
+/* UTF-8 tables - used only when the newline setting is "any". */
|
|
+/* utf8_table3: mask for the data bits of a lead byte, indexed by the number of additional bytes. */
|
|
+const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
|
|
+/* utf8_table4: number of additional bytes, indexed by the low 6 bits of a lead byte (>= 0xc0). */
|
|
+const char utf8_table4[] = {
|
|
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
+ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
|
+
|
|
|
|
|
|
/*************************************************
|
|
@@ -471,6 +484,216 @@
|
|
|
|
|
|
/*************************************************
|
|
+* Find end of line *
|
|
+*************************************************/
|
|
+
|
|
+/* The length of the endline sequence that is found is set via lenptr. This may
|
|
+be zero at the very end of the file if there is no line-ending sequence there.
|
|
+The line-ending convention is taken from the file-scope variable endlinetype.
|
|
+Arguments:
|
|
+ p current position in line
|
|
+ endptr end of available data
|
|
+ lenptr where to put the length of the eol sequence
|
|
+
|
|
+Returns: pointer to the byte after the eol sequence (the start of the next line), or endptr if none is found
|
|
+*/
|
|
+
|
|
+static char *
|
|
+end_of_line(char *p, char *endptr, int *lenptr)
|
|
+{
|
|
+switch(endlinetype)
|
|
+ {
|
|
+ default: /* Just in case */
|
|
+ case EL_LF:
|
|
+ while (p < endptr && *p != '\n') p++;
|
|
+ if (p < endptr)
|
|
+ {
|
|
+ *lenptr = 1;
|
|
+ return p + 1;
|
|
+ }
|
|
+ *lenptr = 0;
|
|
+ return endptr;
|
|
+
|
|
+ case EL_CR:
|
|
+ while (p < endptr && *p != '\r') p++;
|
|
+ if (p < endptr)
|
|
+ {
|
|
+ *lenptr = 1;
|
|
+ return p + 1;
|
|
+ }
|
|
+ *lenptr = 0;
|
|
+ return endptr;
|
|
+
|
|
+ case EL_CRLF:
|
|
+ for (;;)
|
|
+ {
|
|
+ while (p < endptr && *p != '\r') p++;
|
|
+ if (++p >= endptr) /* Step past the CR; no LF can follow if that reaches the end */
|
|
+ {
|
|
+ *lenptr = 0;
|
|
+ return endptr;
|
|
+ }
|
|
+ if (*p == '\n') /* CR then LF is a line ending; a lone CR is not, so keep looking */
|
|
+ {
|
|
+ *lenptr = 2;
|
|
+ return p + 1;
|
|
+ }
|
|
+ }
|
|
+ break; /* Not reached: every path out of the loop above returns */
|
|
+
|
|
+ case EL_ANY:
|
|
+ while (p < endptr)
|
|
+ {
|
|
+ int extra = 0;
|
|
+ register int c = *((unsigned char *)p);
|
|
+
|
|
+ if (utf8 && c >= 0xc0) /* Lead byte: decode the multi-byte UTF-8 character into c */
|
|
+ {
|
|
+ int gcii, gcss;
|
|
+ extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
|
+ gcss = 6*extra;
|
|
+ c = (c & utf8_table3[extra]) << gcss;
|
|
+ for (gcii = 1; gcii <= extra; gcii++) /* NOTE(review): p[gcii] is read without a check against endptr */
|
|
+ {
|
|
+ gcss -= 6;
|
|
+ c |= (p[gcii] & 0x3f) << gcss;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ p += 1 + extra; /* Advance past the whole character */
|
|
+
|
|
+ switch (c)
|
|
+ {
|
|
+ case 0x0a: /* LF */
|
|
+ case 0x0b: /* VT */
|
|
+ case 0x0c: /* FF */
|
|
+ *lenptr = 1;
|
|
+ return p;
|
|
+
|
|
+ case 0x0d: /* CR */
|
|
+ if (p < endptr && *p == 0x0a) /* CR immediately followed by LF counts as one two-byte ending */
|
|
+ {
|
|
+ *lenptr = 2;
|
|
+ p++;
|
|
+ }
|
|
+ else *lenptr = 1;
|
|
+ return p;
|
|
+
|
|
+ case 0x85: /* NEL */
|
|
+ *lenptr = utf8? 2 : 1; /* NEL is two bytes long when encoded in UTF-8 */
|
|
+ return p;
|
|
+
|
|
+ case 0x2028: /* LS */
|
|
+ case 0x2029: /* PS */
|
|
+ *lenptr = 3; /* These values can only arise from a three-byte UTF-8 sequence */
|
|
+ return p;
|
|
+
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+ } /* End of loop for ANY case */
|
|
+
|
|
+ *lenptr = 0; /* Must have hit the end */
|
|
+ return endptr;
|
|
+ } /* End of overall switch */
|
|
+}
|
|
+
|
|
+
|
|
+
|
|
+/*************************************************
|
|
+* Find start of previous line *
|
|
+*************************************************/
|
|
+
|
|
+/* This is called when looking back for before lines to print. It steps back
|
|
+over the line ending that precedes p, then scans back to the ending before that.
|
|
+Arguments:
|
|
+ p start of the subsequent line
|
|
+ startptr start of available data
|
|
+
|
|
+Returns: pointer to the start of the previous line
|
|
+*/
|
|
+
|
|
+static char *
|
|
+previous_line(char *p, char *startptr)
|
|
+{
|
|
+switch(endlinetype)
|
|
+ {
|
|
+ default: /* Just in case */
|
|
+ case EL_LF:
|
|
+ p--; /* Step back onto the LF that ends the previous line */
|
|
+ while (p > startptr && p[-1] != '\n') p--;
|
|
+ return p;
|
|
+
|
|
+ case EL_CR:
|
|
+ p--; /* Step back onto the CR that ends the previous line */
|
|
+ while (p > startptr && p[-1] != '\r') p--; /* CR, not LF, ends lines in this mode */
|
|
+ return p;
|
|
+
|
|
+ case EL_CRLF:
|
|
+ for (;;)
|
|
+ {
|
|
+ p -= 2; /* Skip the two bytes just before p, then look for an earlier LF */
|
|
+ while (p > startptr && p[-1] != '\n') p--;
|
|
+ if (p <= startptr + 1 || p[-2] == '\r') return p; /* Accept only an LF preceded by CR, or stop at the start of data */
|
|
+ }
|
|
+ return p; /* But control should never get here */
|
|
+
|
|
+ case EL_ANY:
|
|
+ if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--; /* Step over the ending before p, treating CRLF as one unit */
|
|
+ if (utf8) while ((*p & 0xc0) == 0x80) p--; /* In UTF-8 mode, move back to the first byte of that character */
|
|
+
|
|
+ while (p > startptr)
|
|
+ {
|
|
+ register int c;
|
|
+ char *pp = p - 1;
|
|
+
|
|
+ if (utf8)
|
|
+ {
|
|
+ int extra = 0;
|
|
+ while ((*pp & 0xc0) == 0x80) pp--; /* Find the first byte of the preceding character */
|
|
+ c = *((unsigned char *)pp);
|
|
+ if (c >= 0xc0)
|
|
+ {
|
|
+ int gcii, gcss;
|
|
+ extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
|
+ gcss = 6*extra;
|
|
+ c = (c & utf8_table3[extra]) << gcss;
|
|
+ for (gcii = 1; gcii <= extra; gcii++)
|
|
+ {
|
|
+ gcss -= 6;
|
|
+ c |= (pp[gcii] & 0x3f) << gcss;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ else c = *((unsigned char *)pp);
|
|
+
|
|
+ switch (c)
|
|
+ {
|
|
+ case 0x0a: /* LF */
|
|
+ case 0x0b: /* VT */
|
|
+ case 0x0c: /* FF */
|
|
+ case 0x0d: /* CR */
|
|
+ case 0x85: /* NEL */
|
|
+ case 0x2028: /* LS */
|
|
+ case 0x2029: /* PS */
|
|
+ return p;
|
|
+
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ p = pp; /* Back one character */
|
|
+ } /* End of loop for ANY case */
|
|
+
|
|
+ return startptr; /* Hit start of data */
|
|
+ } /* End of overall switch */
|
|
+}
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+
|
|
+/*************************************************
|
|
* Print the previous "after" lines *
|
|
*************************************************/
|
|
|
|
@@ -495,13 +718,13 @@
|
|
int count = 0;
|
|
while (lastmatchrestart < endptr && count++ < after_context)
|
|
{
|
|
+ int ellength;
|
|
char *pp = lastmatchrestart;
|
|
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
|
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
|
|
- while (*pp != endlinebyte) pp++;
|
|
- fwrite(lastmatchrestart, 1, pp - lastmatchrestart + (1 + endlineextra),
|
|
- stdout);
|
|
- lastmatchrestart = pp + 1;
|
|
+ pp = end_of_line(pp, endptr, &ellength);
|
|
+ fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
|
+ lastmatchrestart = pp;
|
|
}
|
|
hyphenpending = TRUE;
|
|
}
|
|
@@ -558,7 +781,7 @@
|
|
|
|
while (ptr < endptr)
|
|
{
|
|
- int i;
|
|
+ int i, endlinelength;
|
|
int mrc = 0;
|
|
BOOL match = FALSE;
|
|
char *t = ptr;
|
|
@@ -571,11 +794,10 @@
|
|
line. In multiline mode the PCRE_FIRSTLINE option is used for compiling, so
|
|
that any match is constrained to be in the first line. */
|
|
|
|
- linelength = 0;
|
|
- while (t < endptr && *t++ != endlinebyte) linelength++;
|
|
+ t = end_of_line(t, endptr, &endlinelength);
|
|
+ linelength = t - ptr - endlinelength;
|
|
length = multiline? endptr - ptr : linelength;
|
|
|
|
-
|
|
/* Extra processing for Jeffrey Friedl's debugging. */
|
|
|
|
#ifdef JFRIEDL_DEBUG
|
|
@@ -706,13 +928,13 @@
|
|
|
|
if (after_context > 0 && lastmatchnumber > 0)
|
|
{
|
|
+ int ellength;
|
|
int linecount = 0;
|
|
char *p = lastmatchrestart;
|
|
|
|
while (p < ptr && linecount < after_context)
|
|
{
|
|
- while (*p != endlinebyte) p++;
|
|
- p++;
|
|
+ p = end_of_line(p, ptr, &ellength);
|
|
linecount++;
|
|
}
|
|
|
|
@@ -725,10 +947,9 @@
|
|
char *pp = lastmatchrestart;
|
|
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
|
if (number) fprintf(stdout, "%d-", lastmatchnumber++);
|
|
- while (*pp != endlinebyte) pp++;
|
|
- fwrite(lastmatchrestart, 1, pp - lastmatchrestart +
|
|
- (1 + endlineextra), stdout);
|
|
- lastmatchrestart = pp + 1;
|
|
+ pp = end_of_line(pp, endptr, &ellength);
|
|
+ fwrite(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
|
|
+ lastmatchrestart = pp;
|
|
}
|
|
if (lastmatchrestart != ptr) hyphenpending = TRUE;
|
|
}
|
|
@@ -754,8 +975,7 @@
|
|
linecount < before_context)
|
|
{
|
|
linecount++;
|
|
- p--;
|
|
- while (p > buffer && p[-1] != endlinebyte) p--;
|
|
+ p = previous_line(p, buffer);
|
|
}
|
|
|
|
if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted)
|
|
@@ -763,12 +983,13 @@
|
|
|
|
while (p < ptr)
|
|
{
|
|
+ int ellength;
|
|
char *pp = p;
|
|
if (printname != NULL) fprintf(stdout, "%s-", printname);
|
|
if (number) fprintf(stdout, "%d-", linenumber - linecount--);
|
|
- while (*pp != endlinebyte) pp++;
|
|
- fwrite(p, 1, pp - p + (1 + endlineextra), stdout);
|
|
- p = pp + 1;
|
|
+ pp = end_of_line(pp, endptr, &ellength);
|
|
+ fwrite(p, 1, pp - p, stdout);
|
|
+ p = pp;
|
|
}
|
|
}
|
|
|
|
@@ -788,11 +1009,16 @@
|
|
|
|
if (multiline)
|
|
{
|
|
+ int ellength;
|
|
char *endmatch = ptr + offsets[1];
|
|
t = ptr;
|
|
- while (t < endmatch) { if (*t++ == endlinebyte) linenumber++; }
|
|
- while (endmatch < endptr && *endmatch != endlinebyte) endmatch++;
|
|
- linelength = endmatch - ptr;
|
|
+ while (t < endmatch)
|
|
+ {
|
|
+ t = end_of_line(t, endptr, &ellength);
|
|
+ if (t <= endmatch) linenumber++; else break;
|
|
+ }
|
|
+ endmatch = end_of_line(endmatch, endptr, &ellength);
|
|
+ linelength = endmatch - ptr - ellength;
|
|
}
|
|
|
|
/*** NOTE: Use only fwrite() to output the data line, so that binary
|
|
@@ -824,9 +1050,7 @@
|
|
fprintf(stdout, "%c[00m", 0x1b);
|
|
fwrite(ptr + offsets[1], 1, linelength - offsets[1], stdout);
|
|
}
|
|
- else fwrite(ptr, 1, linelength, stdout);
|
|
-
|
|
- fprintf(stdout, "\n");
|
|
+ else fwrite(ptr, 1, linelength + endlinelength, stdout);
|
|
}
|
|
|
|
/* End of doing what has to be done for a match */
|
|
@@ -836,13 +1060,13 @@
|
|
/* Remember where the last match happened for after_context. We remember
|
|
where we are about to restart, and that line's number. */
|
|
|
|
- lastmatchrestart = ptr + linelength + 1;
|
|
+ lastmatchrestart = ptr + linelength + endlinelength;
|
|
lastmatchnumber = linenumber + 1;
|
|
}
|
|
|
|
/* Advance to after the newline and increment the line number. */
|
|
|
|
- ptr += linelength + 1;
|
|
+ ptr += linelength + endlinelength;
|
|
linenumber++;
|
|
|
|
/* If we haven't yet reached the end of the file (the buffer is full), and
|
|
@@ -964,8 +1188,7 @@
|
|
while ((nextfile = readdirectory(dir)) != NULL)
|
|
{
|
|
int frc, blen;
|
|
- sprintf(buffer, "%.512s%c%.128s", pathname, sep, nextfile);
|
|
- blen = strlen(buffer);
|
|
+ blen = slprintf(buffer, sizeof(buffer), "%.512s%c%.128s", pathname, sep, nextfile);
|
|
|
|
if (exclude_compiled != NULL &&
|
|
pcre_exec(exclude_compiled, NULL, buffer, blen, 0, 0, NULL, 0) >= 0)
|
|
@@ -1057,7 +1280,7 @@
|
|
{
|
|
int n;
|
|
char s[4];
|
|
- if (op->one_char > 0) sprintf(s, "-%c,", op->one_char); else strcpy(s, " ");
|
|
+ if (op->one_char > 0) snprintf(s, sizeof(s), "-%c,", op->one_char); else strcpy(s, " ");
|
|
printf(" %s --%s%n", s, op->long_name, &n);
|
|
n = 30 - n;
|
|
if (n < 1) n = 1;
|
|
@@ -1098,7 +1321,7 @@
|
|
case 'q': quiet = TRUE; break;
|
|
case 'r': dee_action = dee_RECURSE; break;
|
|
case 's': silent = TRUE; break;
|
|
- case 'u': options |= PCRE_UTF8; break;
|
|
+ case 'u': options |= PCRE_UTF8; utf8 = TRUE; break;
|
|
case 'v': invert = TRUE; break;
|
|
case 'w': process_options |= PO_WORD_MATCH; break;
|
|
case 'x': process_options |= PO_LINE_MATCH; break;
|
|
@@ -1131,7 +1354,7 @@
|
|
{
|
|
static char buffer[8];
|
|
char *p = buffer;
|
|
-sprintf(p, "%d", n);
|
|
+snprintf(p, sizeof(buffer), "%d", n);
|
|
while (*p != 0) p++;
|
|
switch (n%10)
|
|
{
|
|
@@ -1177,7 +1400,7 @@
|
|
return FALSE;
|
|
}
|
|
|
|
-sprintf(buffer, "%s%.*s%s", prefix[process_options], MBUFTHIRD, pattern,
|
|
+snprintf(buffer, sizeof(buffer), "%s%.*s%s", prefix[process_options], MBUFTHIRD, pattern,
|
|
suffix[process_options]);
|
|
pattern_list[pattern_count] =
|
|
pcre_compile(buffer, options, &error, &errptr, pcretables);
|
|
@@ -1231,14 +1454,16 @@
|
|
{
|
|
if ((process_options & PO_FIXED_STRINGS) != 0)
|
|
{
|
|
+ char *eop = pattern + strlen(pattern);
|
|
char buffer[MBUFTHIRD];
|
|
for(;;)
|
|
{
|
|
- char *p = strchr(pattern, endlinebyte);
|
|
- if (p == NULL)
|
|
+ int ellength;
|
|
+ char *p = end_of_line(pattern, eop, &ellength);
|
|
+ if (ellength == 0)
|
|
return compile_single_pattern(pattern, options, filename, count);
|
|
- sprintf(buffer, "%.*s", p - pattern - endlineextra, pattern);
|
|
- pattern = p + 1;
|
|
+ snprintf(buffer, sizeof(buffer), "%.*s", p - pattern - ellength, pattern);
|
|
+ pattern = p;
|
|
if (!compile_single_pattern(buffer, options, filename, count))
|
|
return FALSE;
|
|
}
|
|
@@ -1267,7 +1492,9 @@
|
|
const char *locale_from = "--locale";
|
|
const char *error;
|
|
|
|
-/* Set the default line ending value from the default in the PCRE library. */
|
|
+/* Set the default line ending value from the default in the PCRE library;
|
|
+"lf", "cr", "crlf", and "any" are supported. Anything else is treated as "lf".
|
|
+*/
|
|
|
|
(void)pcre_config(PCRE_CONFIG_NEWLINE, &i);
|
|
switch(i)
|
|
@@ -1275,6 +1502,7 @@
|
|
default: newline = (char *)"lf"; break;
|
|
case '\r': newline = (char *)"cr"; break;
|
|
case ('\r' << 8) | '\n': newline = (char *)"crlf"; break;
|
|
+ case -1: newline = (char *)"any"; break;
|
|
}
|
|
|
|
/* Process the options */
|
|
@@ -1350,8 +1578,8 @@
|
|
char buff1[24];
|
|
char buff2[24];
|
|
int baselen = opbra - op->long_name;
|
|
- sprintf(buff1, "%.*s", baselen, op->long_name);
|
|
- sprintf(buff2, "%s%.*s", buff1, strlen(op->long_name) - baselen - 2,
|
|
+ snprintf(buff1, sizeof(buff1), "%.*s", baselen, op->long_name);
|
|
+ snprintf(buff2, sizeof(buff2), "%s%.*s", buff1, strlen(op->long_name) - baselen - 2,
|
|
opbra + 1);
|
|
if (strcmp(arg, buff1) == 0 || strcmp(arg, buff2) == 0)
|
|
break;
|
|
@@ -1565,16 +1793,22 @@
|
|
if (strcmp(newline, "cr") == 0 || strcmp(newline, "CR") == 0)
|
|
{
|
|
pcre_options |= PCRE_NEWLINE_CR;
|
|
- endlinebyte = '\r';
|
|
+ endlinetype = EL_CR;
|
|
}
|
|
else if (strcmp(newline, "lf") == 0 || strcmp(newline, "LF") == 0)
|
|
{
|
|
pcre_options |= PCRE_NEWLINE_LF;
|
|
+ endlinetype = EL_LF;
|
|
}
|
|
else if (strcmp(newline, "crlf") == 0 || strcmp(newline, "CRLF") == 0)
|
|
{
|
|
pcre_options |= PCRE_NEWLINE_CRLF;
|
|
- endlineextra = 1;
|
|
+ endlinetype = EL_CRLF;
|
|
+ }
|
|
+else if (strcmp(newline, "any") == 0 || strcmp(newline, "ANY") == 0)
|
|
+ {
|
|
+ pcre_options |= PCRE_NEWLINE_ANY;
|
|
+ endlinetype = EL_ANY;
|
|
}
|
|
else
|
|
{
|
|
@@ -1700,7 +1934,7 @@
|
|
if (error != NULL)
|
|
{
|
|
char s[16];
|
|
- if (pattern_count == 1) s[0] = 0; else sprintf(s, " number %d", j);
|
|
+ if (pattern_count == 1) s[0] = 0; else snprintf(s, sizeof(s), " number %d", j);
|
|
fprintf(stderr, "pcregrep: Error while studying regex%s: %s\n", s, error);
|
|
return 2;
|
|
}
|
|
diff -ruN ../pcre.orig/pcrelib/pcreposix.c ./pcrelib/pcreposix.c
|
|
--- ../pcre.orig/pcrelib/pcreposix.c Mon Jan 1 10:36:04 2007
|
|
+++ ./pcrelib/pcreposix.c Sat Feb 24 04:30:55 2007
|
|
@@ -6,7 +6,7 @@
|
|
and semantics are as close as possible to those of the Perl 5 language.
|
|
|
|
Written by Philip Hazel
|
|
- Copyright (c) 1997-2007 University of Cambridge
|
|
+ Copyright (c) 1997-2006 University of Cambridge
|
|
|
|
-----------------------------------------------------------------------------
|
|
Redistribution and use in source and binary forms, with or without
|
|
@@ -78,7 +78,7 @@
|
|
REG_BADPAT, /* unrecognized character after (?< */
|
|
REG_BADPAT, /* lookbehind assertion is not fixed length */
|
|
REG_BADPAT, /* malformed number or name after (?( */
|
|
- REG_BADPAT, /* conditional group containe more than two branches */
|
|
+ REG_BADPAT, /* conditional group contains more than two branches */
|
|
REG_BADPAT, /* assertion expected after (?( */
|
|
REG_BADPAT, /* (?R or (?digits must be followed by ) */
|
|
REG_ECTYPE, /* unknown POSIX class name */
|
|
@@ -93,7 +93,7 @@
|
|
REG_BADPAT, /* closing ) for (?C expected */
|
|
REG_BADPAT, /* recursive call could loop indefinitely */
|
|
REG_BADPAT, /* unrecognized character after (?P */
|
|
- REG_BADPAT, /* syntax error after (?P */
|
|
+ REG_BADPAT, /* syntax error in subpattern name (missing terminator) */
|
|
REG_BADPAT, /* two named subpatterns have the same name */
|
|
REG_BADPAT, /* invalid UTF-8 string */
|
|
REG_BADPAT, /* support for \P, \p, and \X has not been compiled */
|
|
@@ -102,7 +102,13 @@
|
|
REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */
|
|
REG_BADPAT, /* too many named subpatterns (maximum 10,000) */
|
|
REG_BADPAT, /* repeated subpattern is too long */
|
|
- REG_BADPAT /* octal value is greater than \377 (not in UTF-8 mode) */
|
|
+ REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */
|
|
+ REG_BADPAT, /* internal error: overran compiling workspace */
|
|
+ REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */
|
|
+ REG_BADPAT, /* DEFINE group contains more than one branch */
|
|
+ REG_BADPAT, /* repeating a DEFINE group is not allowed */
|
|
+ REG_INVARG, /* inconsistent NEWLINE options */
|
|
+ REG_BADPAT /* \g is not followed by an (optionally braced) non-zero number */
|
|
};
|
|
|
|
/* Table of texts corresponding to POSIX error codes */
|
|
@@ -152,7 +158,7 @@
|
|
if (errbuf_size > 0)
|
|
{
|
|
if (addlength > 0 && errbuf_size >= length + addlength)
|
|
- sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset);
|
|
+ snprintf(errbuf, errbuf_size, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset);
|
|
else
|
|
{
|
|
strncpy(errbuf, message, errbuf_size - 1);
|
|
diff -ruN ../pcre.orig/pcrelib/pcretest.c ./pcrelib/pcretest.c
|
|
--- ../pcre.orig/pcrelib/pcretest.c Wed Aug 30 22:00:22 2006
|
|
+++ ./pcrelib/pcretest.c Fri Feb 9 22:31:20 2007
|
|
@@ -44,10 +44,29 @@
|
|
#include <locale.h>
|
|
#include <errno.h>
|
|
|
|
-#ifndef _WIN32
|
|
-#include <sys/resource.h>
|
|
+
|
|
+/* A number of things vary for Windows builds. Originally, pcretest opened its
|
|
+input and output without "b"; then I was told that "b" was needed in some
|
|
+environments, so it was added for release 5.0 to both the input and output. (It
|
|
+makes no difference on Unix-like systems.) Later I was told that it is wrong
|
|
+for the input on Windows. I've now abstracted the modes into two macros that
|
|
+are set here, to make it easier to fiddle with them, and removed "b" from the
|
|
+input mode under Windows. */
|
|
+
|
|
+#if defined(_WIN32) || defined(WIN32)
|
|
+#include <io.h> /* For _setmode() */
|
|
+#include <fcntl.h> /* For _O_BINARY */
|
|
+#define INPUT_MODE "r"
|
|
+#define OUTPUT_MODE "wb"
|
|
+
|
|
+#else
|
|
+#include <sys/time.h> /* These two includes are needed */
|
|
+#include <sys/resource.h> /* for setrlimit(). */
|
|
+#define INPUT_MODE "rb"
|
|
+#define OUTPUT_MODE "wb"
|
|
#endif
|
|
|
|
+
|
|
#define PCRE_SPY /* For Win32 build, import data, not export */
|
|
|
|
/* We include pcre_internal.h because we need the internal info for displaying
|
|
@@ -74,10 +93,18 @@
|
|
|
|
/* We also need the pcre_printint() function for printing out compiled
|
|
patterns. This function is in a separate file so that it can be included in
|
|
-pcre_compile.c when that module is compiled with debugging enabled. */
|
|
+pcre_compile.c when that module is compiled with debugging enabled.
|
|
+
|
|
+The definition of the macro PRINTABLE, which determines whether to print an
|
|
+output character as-is or as a hex value when showing compiled patterns, is
|
|
+contained in this file. We use it here also, in cases when the locale has not
|
|
+been explicitly changed, so as to get consistent output from systems that
|
|
+differ in their output from isprint() even in the "C" locale. */
|
|
|
|
#include "pcre_printint.src"
|
|
|
|
+#define PRINTHEX(c) (locale_set? isprint(c) : PRINTABLE(c))
|
|
+
|
|
|
|
/* It is possible to compile this test program without including support for
|
|
testing the POSIX interface, though this is not available via the standard
|
|
@@ -103,6 +130,8 @@
|
|
#endif
|
|
#endif
|
|
|
|
+/* This is the default loop count for timing. */
|
|
+
|
|
#define LOOPREPEAT 500000
|
|
|
|
/* Static variables */
|
|
@@ -114,6 +143,7 @@
|
|
static int callout_fail_count;
|
|
static int callout_fail_id;
|
|
static int first_callout;
|
|
+static int locale_set = 0;
|
|
static int show_malloc;
|
|
static int use_utf8;
|
|
static size_t gotten_store;
|
|
@@ -157,6 +187,7 @@
|
|
for (;;)
|
|
{
|
|
int rlen = buffer_size - (here - buffer);
|
|
+
|
|
if (rlen > 1000)
|
|
{
|
|
int dlen;
|
|
@@ -213,7 +244,7 @@
|
|
|
|
/* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
|
|
around with conditional compilation, just do the job by hand. It is only used
|
|
-for unpicking the -o argument, so just keep it simple.
|
|
+for unpicking arguments, so just keep it simple.
|
|
|
|
Arguments:
|
|
str string to be converted
|
|
@@ -311,6 +342,8 @@
|
|
Returns: number of characters placed in the buffer
|
|
*/
|
|
|
|
+#if !defined NOUTF8
|
|
+
|
|
static int
|
|
ord2utf8(int cvalue, uschar *utf8bytes)
|
|
{
|
|
@@ -327,6 +360,8 @@
|
|
return i + 1;
|
|
}
|
|
|
|
+#endif
|
|
+
|
|
|
|
|
|
/*************************************************
|
|
@@ -353,16 +388,19 @@
|
|
{
|
|
length -= rc - 1;
|
|
p += rc;
|
|
- if (c < 256 && isprint(c))
|
|
+ if (PRINTHEX(c))
|
|
{
|
|
if (f != NULL) fprintf(f, "%c", c);
|
|
yield++;
|
|
}
|
|
else
|
|
{
|
|
- int n;
|
|
- if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
|
|
- yield += n;
|
|
+ int n = 4;
|
|
+ if (f != NULL) fprintf(f, "\\x{%02x}", c);
|
|
+ yield += (n <= 0x000000ff)? 2 :
|
|
+ (n <= 0x00000fff)? 3 :
|
|
+ (n <= 0x0000ffff)? 4 :
|
|
+ (n <= 0x000fffff)? 5 : 6;
|
|
}
|
|
continue;
|
|
}
|
|
@@ -371,7 +409,8 @@
|
|
|
|
/* Not UTF-8, or malformed UTF-8 */
|
|
|
|
- if (isprint(c = *(p++)))
|
|
+ c = *p++;
|
|
+ if (PRINTHEX(c))
|
|
{
|
|
if (f != NULL) fprintf(f, "%c", c);
|
|
yield++;
|
|
@@ -614,7 +653,7 @@
|
|
*************************************************/
|
|
|
|
/* This is used both at compile and run-time to check for <xxx> escapes, where
|
|
-xxx is LF, CR, or CRLF. Print a message and return 0 if there is no match.
|
|
+xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match.
|
|
|
|
Arguments:
|
|
p points after the leading '<'
|
|
@@ -629,6 +668,7 @@
|
|
if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR;
|
|
if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF;
|
|
if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
|
|
+if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY;
|
|
fprintf(f, "Unknown newline type at: <%s\n", p);
|
|
return 0;
|
|
}
|
|
@@ -636,6 +676,38 @@
|
|
|
|
|
|
/*************************************************
|
|
+* Usage function *
|
|
+*************************************************/
|
|
+
|
|
+static void
|
|
+usage(void)
|
|
+{
|
|
+printf("Usage: pcretest [options] [<input> [<output>]]\n");
|
|
+printf(" -b show compiled code (bytecode)\n");
|
|
+printf(" -C show PCRE compile-time options and exit\n");
|
|
+printf(" -d debug: show compiled code and information (-b and -i)\n");
|
|
+#if !defined NODFA
|
|
+printf(" -dfa force DFA matching for all subjects\n");
|
|
+#endif
|
|
+printf(" -help show usage information\n");
|
|
+printf(" -i show information about compiled patterns\n"
|
|
+ " -m output memory used information\n"
|
|
+ " -o <n> set size of offsets vector to <n>\n");
|
|
+#if !defined NOPOSIX
|
|
+printf(" -p use POSIX interface\n");
|
|
+#endif
|
|
+printf(" -q quiet: do not output PCRE version number at start\n");
|
|
+printf(" -S <n> set stack size to <n> megabytes\n");
|
|
+printf(" -s output store (memory) used information\n"
|
|
+ " -t time compilation and execution\n");
|
|
+printf(" -t <n> time compilation and execution, repeating <n> times\n");
|
|
+printf(" -tm time execution (matching) only\n");
|
|
+printf(" -tm <n> time execution (matching) only, repeating <n> times\n");
|
|
+}
|
|
+
|
|
+
|
|
+
|
|
+/*************************************************
|
|
* Main Program *
|
|
*************************************************/
|
|
|
|
@@ -650,6 +722,7 @@
|
|
int study_options = 0;
|
|
int op = 1;
|
|
int timeit = 0;
|
|
+int timeitm = 0;
|
|
int showinfo = 0;
|
|
int showstore = 0;
|
|
int quiet = 0;
|
|
@@ -681,16 +754,19 @@
|
|
dbuffer = (unsigned char *)malloc(buffer_size);
|
|
pbuffer = (unsigned char *)malloc(buffer_size);
|
|
|
|
-/* The outfile variable is static so that new_malloc can use it. The _setmode()
|
|
-stuff is some magic that I don't understand, but which apparently does good
|
|
-things in Windows. It's related to line terminations. */
|
|
-
|
|
-#if defined(_WIN32) || defined(WIN32)
|
|
-_setmode( _fileno( stdout ), 0x8000 );
|
|
-#endif /* defined(_WIN32) || defined(WIN32) */
|
|
+/* The outfile variable is static so that new_malloc can use it. */
|
|
|
|
outfile = stdout;
|
|
|
|
+/* The following _setmode() stuff is some Windows magic that puts stdout into
|
|
+binary mode, so that LF is not expanded to CRLF. At least, that's what
|
|
+I've been told: never having used Windows I take this all on trust. Originally
|
|
+it set 0x8000, but then I was advised that _O_BINARY was better. */
|
|
+
|
|
+#if defined(_WIN32) || defined(WIN32)
|
|
+_setmode( _fileno( stdout ), _O_BINARY );
|
|
+#endif
|
|
+
|
|
/* Scan options */
|
|
|
|
while (argc > 1 && argv[op][0] == '-')
|
|
@@ -699,8 +775,8 @@
|
|
|
|
if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
|
|
showstore = 1;
|
|
- else if (strcmp(argv[op], "-t") == 0) timeit = 1;
|
|
else if (strcmp(argv[op], "-q") == 0) quiet = 1;
|
|
+ else if (strcmp(argv[op], "-b") == 0) debug = 1;
|
|
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
|
|
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
|
|
#if !defined NODFA
|
|
@@ -713,11 +789,25 @@
|
|
op++;
|
|
argc--;
|
|
}
|
|
+ else if (strcmp(argv[op], "-t") == 0 || strcmp(argv[op], "-tm") == 0)
|
|
+ {
|
|
+ int both = argv[op][2] == 0;
|
|
+ int temp;
|
|
+ if (argc > 2 && (temp = get_value((unsigned char *)argv[op+1], &endptr),
|
|
+ *endptr == 0))
|
|
+ {
|
|
+ timeitm = temp;
|
|
+ op++;
|
|
+ argc--;
|
|
+ }
|
|
+ else timeitm = LOOPREPEAT;
|
|
+ if (both) timeit = timeitm;
|
|
+ }
|
|
else if (strcmp(argv[op], "-S") == 0 && argc > 2 &&
|
|
((stack_size = get_value((unsigned char *)argv[op+1], &endptr)),
|
|
*endptr == 0))
|
|
{
|
|
-#ifdef _WIN32
|
|
+#if defined(_WIN32) || defined(WIN32)
|
|
printf("PCRE: -S not supported on this OS\n");
|
|
exit(1);
|
|
#else
|
|
@@ -749,7 +839,8 @@
|
|
printf(" %sUnicode properties support\n", rc? "" : "No ");
|
|
(void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
|
|
printf(" Newline sequence is %s\n", (rc == '\r')? "CR" :
|
|
- (rc == '\n')? "LF" : "CRLF");
|
|
+ (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :
|
|
+ (rc == -1)? "ANY" : "???");
|
|
(void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
|
|
printf(" Internal link size = %d\n", rc);
|
|
(void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
|
|
@@ -762,24 +853,16 @@
|
|
printf(" Match recursion uses %s\n", rc? "stack" : "heap");
|
|
exit(0);
|
|
}
|
|
+ else if (strcmp(argv[op], "-help") == 0 ||
|
|
+ strcmp(argv[op], "--help") == 0)
|
|
+ {
|
|
+ usage();
|
|
+ goto EXIT;
|
|
+ }
|
|
else
|
|
{
|
|
printf("** Unknown or malformed option %s\n", argv[op]);
|
|
- printf("Usage: pcretest [options] [<input> [<output>]]\n");
|
|
- printf(" -C show PCRE compile-time options and exit\n");
|
|
- printf(" -d debug: show compiled code; implies -i\n");
|
|
-#if !defined NODFA
|
|
- printf(" -dfa force DFA matching for all subjects\n");
|
|
-#endif
|
|
- printf(" -i show information about compiled pattern\n"
|
|
- " -m output memory used information\n"
|
|
- " -o <n> set size of offsets vector to <n>\n");
|
|
-#if !defined NOPOSIX
|
|
- printf(" -p use POSIX interface\n");
|
|
-#endif
|
|
- printf(" -S <n> set stack size to <n> megabytes\n");
|
|
- printf(" -s output store (memory) used information\n"
|
|
- " -t time compilation and execution\n");
|
|
+ usage();
|
|
yield = 1;
|
|
goto EXIT;
|
|
}
|
|
@@ -803,7 +886,7 @@
|
|
|
|
if (argc > 1)
|
|
{
|
|
- infile = fopen(argv[op], "rb");
|
|
+ infile = fopen(argv[op], INPUT_MODE);
|
|
if (infile == NULL)
|
|
{
|
|
printf("** Failed to open %s\n", argv[op]);
|
|
@@ -814,7 +897,7 @@
|
|
|
|
if (argc > 2)
|
|
{
|
|
- outfile = fopen(argv[op+1], "wb");
|
|
+ outfile = fopen(argv[op+1], OUTPUT_MODE);
|
|
if (outfile == NULL)
|
|
{
|
|
printf("** Failed to open %s\n", argv[op+1]);
|
|
@@ -859,7 +942,7 @@
|
|
int do_showinfo = showinfo;
|
|
int do_showrest = 0;
|
|
int do_flip = 0;
|
|
- int erroroffset, len, delimiter;
|
|
+ int erroroffset, len, delimiter, poffset;
|
|
|
|
use_utf8 = 0;
|
|
|
|
@@ -969,6 +1052,7 @@
|
|
}
|
|
|
|
pp = p;
|
|
+ poffset = p - buffer;
|
|
|
|
for(;;)
|
|
{
|
|
@@ -989,6 +1073,11 @@
|
|
if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
|
|
}
|
|
|
|
+ /* The buffer may have moved while being extended; reset the start of data
|
|
+ pointer to the correct relative point in the buffer. */
|
|
+
|
|
+ p = buffer + poffset;
|
|
+
|
|
/* If the first character after the delimiter is backslash, make
|
|
the pattern end with backslash. This is purely to provide a way
|
|
of testing for the error message when a pattern ends with backslash. */
|
|
@@ -1020,6 +1109,7 @@
|
|
|
|
case '+': do_showrest = 1; break;
|
|
case 'A': options |= PCRE_ANCHORED; break;
|
|
+ case 'B': do_debug = 1; break;
|
|
case 'C': options |= PCRE_AUTO_CALLOUT; break;
|
|
case 'D': do_debug = do_showinfo = 1; break;
|
|
case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
|
|
@@ -1042,14 +1132,16 @@
|
|
|
|
case 'L':
|
|
ppp = pp;
|
|
- /* The '\r' test here is so that it works on Windows */
|
|
- while (*ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
|
|
+ /* The '\r' test here is so that it works on Windows. */
|
|
+ /* The NUL test is just in case this is an unterminated line. */
|
|
+ while (*ppp != 0 && *ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
|
|
*ppp = 0;
|
|
if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
|
|
{
|
|
fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
|
|
goto SKIP_DATA;
|
|
}
|
|
+ locale_set = 1;
|
|
tables = pcre_maketables();
|
|
pp = ppp;
|
|
break;
|
|
@@ -1116,19 +1208,19 @@
|
|
#endif /* !defined NOPOSIX */
|
|
|
|
{
|
|
- if (timeit)
|
|
+ if (timeit > 0)
|
|
{
|
|
register int i;
|
|
clock_t time_taken;
|
|
clock_t start_time = clock();
|
|
- for (i = 0; i < LOOPREPEAT; i++)
|
|
+ for (i = 0; i < timeit; i++)
|
|
{
|
|
re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
|
|
if (re != NULL) free(re);
|
|
}
|
|
time_taken = clock() - start_time;
|
|
- fprintf(outfile, "Compile time %.3f milliseconds\n",
|
|
- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
|
+ fprintf(outfile, "Compile time %.4f milliseconds\n",
|
|
+ (((double)time_taken * 1000.0) / (double)timeit) /
|
|
(double)CLOCKS_PER_SEC);
|
|
}
|
|
|
|
@@ -1180,17 +1272,17 @@
|
|
|
|
if (do_study)
|
|
{
|
|
- if (timeit)
|
|
+ if (timeit > 0)
|
|
{
|
|
register int i;
|
|
clock_t time_taken;
|
|
clock_t start_time = clock();
|
|
- for (i = 0; i < LOOPREPEAT; i++)
|
|
+ for (i = 0; i < timeit; i++)
|
|
extra = pcre_study(re, study_options, &error);
|
|
time_taken = clock() - start_time;
|
|
if (extra != NULL) free(extra);
|
|
- fprintf(outfile, " Study time %.3f milliseconds\n",
|
|
- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
|
+ fprintf(outfile, " Study time %.4f milliseconds\n",
|
|
+ (((double)time_taken * 1000.0) / (double)timeit) /
|
|
(double)CLOCKS_PER_SEC);
|
|
}
|
|
extra = pcre_study(re, study_options, &error);
|
|
@@ -1233,6 +1325,12 @@
|
|
|
|
SHOW_INFO:
|
|
|
|
+ if (do_debug)
|
|
+ {
|
|
+ fprintf(outfile, "------------------------------------------------------------------\n");
|
|
+ pcre_printint(re, outfile);
|
|
+ }
|
|
+
|
|
if (do_showinfo)
|
|
{
|
|
unsigned long int get_options, all_options;
|
|
@@ -1243,12 +1341,6 @@
|
|
int nameentrysize, namecount;
|
|
const uschar *nametable;
|
|
|
|
- if (do_debug)
|
|
- {
|
|
- fprintf(outfile, "------------------------------------------------------------------\n");
|
|
- pcre_printint(re, outfile);
|
|
- }
|
|
-
|
|
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
|
|
new_info(re, NULL, PCRE_INFO_SIZE, &size);
|
|
new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
|
|
@@ -1327,7 +1419,7 @@
|
|
((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "",
|
|
((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : "");
|
|
|
|
- switch (get_options & PCRE_NEWLINE_CRLF)
|
|
+ switch (get_options & PCRE_NEWLINE_BITS)
|
|
{
|
|
case PCRE_NEWLINE_CR:
|
|
fprintf(outfile, "Forced newline sequence: CR\n");
|
|
@@ -1341,6 +1433,10 @@
|
|
fprintf(outfile, "Forced newline sequence: CRLF\n");
|
|
break;
|
|
|
|
+ case PCRE_NEWLINE_ANY:
|
|
+ fprintf(outfile, "Forced newline sequence: ANY\n");
|
|
+ break;
|
|
+
|
|
default:
|
|
break;
|
|
}
|
|
@@ -1358,7 +1454,7 @@
|
|
int ch = first_char & 255;
|
|
const char *caseless = ((first_char & REQ_CASELESS) == 0)?
|
|
"" : " (caseless)";
|
|
- if (isprint(ch))
|
|
+ if (PRINTHEX(ch))
|
|
fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
|
|
else
|
|
fprintf(outfile, "First char = %d%s\n", ch, caseless);
|
|
@@ -1373,7 +1469,7 @@
|
|
int ch = need_char & 255;
|
|
const char *caseless = ((need_char & REQ_CASELESS) == 0)?
|
|
"" : " (caseless)";
|
|
- if (isprint(ch))
|
|
+ if (PRINTHEX(ch))
|
|
fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
|
|
else
|
|
fprintf(outfile, "Need char = %d%s\n", ch, caseless);
|
|
@@ -1409,7 +1505,7 @@
|
|
fprintf(outfile, "\n ");
|
|
c = 2;
|
|
}
|
|
- if (isprint(i) && i != ' ')
|
|
+ if (PRINTHEX(i) && i != ' ')
|
|
{
|
|
fprintf(outfile, "%c ", i);
|
|
c += 2;
|
|
@@ -1468,6 +1564,7 @@
|
|
strerror(errno));
|
|
}
|
|
else fprintf(outfile, "Study data written to %s\n", to_file);
|
|
+
|
|
}
|
|
}
|
|
fclose(f);
|
|
@@ -1866,7 +1963,7 @@
|
|
|
|
for (;; gmatched++) /* Loop for /g or /G */
|
|
{
|
|
- if (timeit)
|
|
+ if (timeitm > 0)
|
|
{
|
|
register int i;
|
|
clock_t time_taken;
|
|
@@ -1876,7 +1973,7 @@
|
|
if (all_use_dfa || use_dfa)
|
|
{
|
|
int workspace[1000];
|
|
- for (i = 0; i < LOOPREPEAT; i++)
|
|
+ for (i = 0; i < timeitm; i++)
|
|
count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset,
|
|
options | g_notempty, use_offsets, use_size_offsets, workspace,
|
|
sizeof(workspace)/sizeof(int));
|
|
@@ -1884,13 +1981,13 @@
|
|
else
|
|
#endif
|
|
|
|
- for (i = 0; i < LOOPREPEAT; i++)
|
|
+ for (i = 0; i < timeitm; i++)
|
|
count = pcre_exec(re, extra, (char *)bptr, len,
|
|
start_offset, options | g_notempty, use_offsets, use_size_offsets);
|
|
|
|
time_taken = clock() - start_time;
|
|
- fprintf(outfile, "Execute time %.3f milliseconds\n",
|
|
- (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
|
|
+ fprintf(outfile, "Execute time %.4f milliseconds\n",
|
|
+ (((double)time_taken * 1000.0) / (double)timeitm) /
|
|
(double)CLOCKS_PER_SEC);
|
|
}
|
|
|
|
@@ -1966,7 +2063,28 @@
|
|
|
|
if (count >= 0)
|
|
{
|
|
- int i;
|
|
+ int i, maxcount;
|
|
+
|
|
+#if !defined NODFA
|
|
+ if (all_use_dfa || use_dfa) maxcount = use_size_offsets/2; else
|
|
+#endif
|
|
+ maxcount = use_size_offsets/3;
|
|
+
|
|
+ /* This is a check against a lunatic return value. */
|
|
+
|
|
+ if (count > maxcount)
|
|
+ {
|
|
+ fprintf(outfile,
|
|
+ "** PCRE error: returned count %d is too big for offset size %d\n",
|
|
+ count, use_size_offsets);
|
|
+ count = use_size_offsets/3;
|
|
+ if (do_g || do_G)
|
|
+ {
|
|
+ fprintf(outfile, "** /%c loop abandoned\n", do_g? 'g' : 'G');
|
|
+ do_g = do_G = FALSE; /* Break g/G loop */
|
|
+ }
|
|
+ }
|
|
+
|
|
for (i = 0; i < count * 2; i += 2)
|
|
{
|
|
if (use_offsets[i] < 0)
|
|
@@ -2165,6 +2283,7 @@
|
|
{
|
|
new_free((void *)tables);
|
|
setlocale(LC_CTYPE, "C");
|
|
+ locale_set = 0;
|
|
}
|
|
}
|
|
|
|
diff -ruN ../pcre.orig/pcrelib/ucp.h ./pcrelib/ucp.h
|
|
--- ../pcre.orig/pcrelib/ucp.h Mon Mar 6 22:45:57 2006
|
|
+++ ./pcrelib/ucp.h Fri Feb 9 22:31:20 2007
|
|
@@ -6,7 +6,9 @@
|
|
#define _UCP_H
|
|
|
|
/* This file contains definitions of the property values that are returned by
|
|
-the function _pcre_ucp_findprop(). */
|
|
+the function _pcre_ucp_findprop(). New values that are added for new releases
|
|
+of Unicode should always be at the end of each enum, for backwards
|
|
+compatibility. */
|
|
|
|
/* These are the general character categories. */
|
|
|
|
@@ -118,7 +120,12 @@
|
|
ucp_Tibetan,
|
|
ucp_Tifinagh,
|
|
ucp_Ugaritic,
|
|
- ucp_Yi
|
|
+ ucp_Yi,
|
|
+ ucp_Balinese, /* New for Unicode 5.0.0 */
|
|
+ ucp_Cuneiform, /* New for Unicode 5.0.0 */
|
|
+ ucp_Nko, /* New for Unicode 5.0.0 */
|
|
+ ucp_Phags_Pa, /* New for Unicode 5.0.0 */
|
|
+ ucp_Phoenician /* New for Unicode 5.0.0 */
|
|
};
|
|
|
|
#endif
|
|
diff -ruN ../pcre.orig/pcrelib/ucpinternal.h ./pcrelib/ucpinternal.h
|
|
--- ../pcre.orig/pcrelib/ucpinternal.h Mon Mar 6 22:45:57 2006
|
|
+++ ./pcrelib/ucpinternal.h Fri Feb 9 22:31:20 2007
|
|
@@ -2,6 +2,9 @@
|
|
* Unicode Property Table handler *
|
|
*************************************************/
|
|
|
|
+#ifndef _UCPINTERNAL_H
|
|
+#define _UCPINTERNAL_H
|
|
+
|
|
/* Internal header file defining the layout of the bits in each pair of 32-bit
|
|
words that form a data item in the table. */
|
|
|
|
@@ -83,5 +86,7 @@
|
|
(7) Otherwise, set the bottom to one element past the current point and goto
|
|
(2).
|
|
*/
|
|
+
|
|
+#endif /* _UCPINTERNAL_H */
|
|
|
|
/* End of ucpinternal.h */
|
|
diff -ruN ../pcre.orig/pcrelib/ucptable.c ./pcrelib/ucptable.c
|
|
--- ../pcre.orig/pcrelib/ucptable.c Mon Mar 6 22:45:57 2006
|
|
+++ ./pcrelib/ucptable.c Fri Feb 9 22:31:20 2007
|
|
@@ -1,5 +1,6 @@
|
|
/* This source module is automatically generated from the Unicode
|
|
-property table. See ucpinternal.h for a description of the layout. */
|
|
+property table. See ucpinternal.h for a description of the layout.
|
|
+This version was made from the Unicode 5.0.0 tables. */
|
|
|
|
static cnode ucp_table[] = {
|
|
{ 0x09800000, 0x0000001f },
|
|
@@ -298,7 +299,7 @@
|
|
{ 0x2100017d, 0x24000001 },
|
|
{ 0x2100017e, 0x1400ffff },
|
|
{ 0x2100017f, 0x1400fed4 },
|
|
- { 0x21000180, 0x14000000 },
|
|
+ { 0x21000180, 0x140000c3 },
|
|
{ 0x21000181, 0x240000d2 },
|
|
{ 0x21000182, 0x24000001 },
|
|
{ 0x21000183, 0x1400ffff },
|
|
@@ -475,13 +476,27 @@
|
|
{ 0x21000232, 0x24000001 },
|
|
{ 0x21000233, 0x1400ffff },
|
|
{ 0x21800234, 0x14000005 },
|
|
- { 0x2100023a, 0x24000000 },
|
|
+ { 0x2100023a, 0x24002a2b },
|
|
{ 0x2100023b, 0x24000001 },
|
|
{ 0x2100023c, 0x1400ffff },
|
|
{ 0x2100023d, 0x2400ff5d },
|
|
- { 0x2100023e, 0x24000000 },
|
|
+ { 0x2100023e, 0x24002a28 },
|
|
{ 0x2180023f, 0x14000001 },
|
|
- { 0x21000241, 0x24000053 },
|
|
+ { 0x21000241, 0x24000001 },
|
|
+ { 0x21000242, 0x1400ffff },
|
|
+ { 0x21000243, 0x2400ff3d },
|
|
+ { 0x21000244, 0x24000045 },
|
|
+ { 0x21000245, 0x24000047 },
|
|
+ { 0x21000246, 0x24000001 },
|
|
+ { 0x21000247, 0x1400ffff },
|
|
+ { 0x21000248, 0x24000001 },
|
|
+ { 0x21000249, 0x1400ffff },
|
|
+ { 0x2100024a, 0x24000001 },
|
|
+ { 0x2100024b, 0x1400ffff },
|
|
+ { 0x2100024c, 0x24000001 },
|
|
+ { 0x2100024d, 0x1400ffff },
|
|
+ { 0x2100024e, 0x24000001 },
|
|
+ { 0x2100024f, 0x1400ffff },
|
|
{ 0x21800250, 0x14000002 },
|
|
{ 0x21000253, 0x1400ff2e },
|
|
{ 0x21000254, 0x1400ff32 },
|
|
@@ -499,25 +514,30 @@
|
|
{ 0x21800264, 0x14000003 },
|
|
{ 0x21000268, 0x1400ff2f },
|
|
{ 0x21000269, 0x1400ff2d },
|
|
- { 0x2180026a, 0x14000004 },
|
|
+ { 0x2100026a, 0x14000000 },
|
|
+ { 0x2100026b, 0x140029f7 },
|
|
+ { 0x2180026c, 0x14000002 },
|
|
{ 0x2100026f, 0x1400ff2d },
|
|
{ 0x21800270, 0x14000001 },
|
|
{ 0x21000272, 0x1400ff2b },
|
|
{ 0x21800273, 0x14000001 },
|
|
{ 0x21000275, 0x1400ff2a },
|
|
- { 0x21800276, 0x14000009 },
|
|
+ { 0x21800276, 0x14000006 },
|
|
+ { 0x2100027d, 0x140029e7 },
|
|
+ { 0x2180027e, 0x14000001 },
|
|
{ 0x21000280, 0x1400ff26 },
|
|
{ 0x21800281, 0x14000001 },
|
|
{ 0x21000283, 0x1400ff26 },
|
|
{ 0x21800284, 0x14000003 },
|
|
{ 0x21000288, 0x1400ff26 },
|
|
- { 0x21000289, 0x14000000 },
|
|
+ { 0x21000289, 0x1400ffbb },
|
|
{ 0x2100028a, 0x1400ff27 },
|
|
{ 0x2100028b, 0x1400ff27 },
|
|
- { 0x2180028c, 0x14000005 },
|
|
+ { 0x2100028c, 0x1400ffb9 },
|
|
+ { 0x2180028d, 0x14000004 },
|
|
{ 0x21000292, 0x1400ff25 },
|
|
{ 0x21000293, 0x14000000 },
|
|
- { 0x21000294, 0x1400ffad },
|
|
+ { 0x21000294, 0x1c000000 },
|
|
{ 0x21800295, 0x1400001a },
|
|
{ 0x218002b0, 0x18000011 },
|
|
{ 0x098002c2, 0x60000003 },
|
|
@@ -532,6 +552,9 @@
|
|
{ 0x1b800346, 0x30000029 },
|
|
{ 0x13800374, 0x60000001 },
|
|
{ 0x1300037a, 0x18000000 },
|
|
+ { 0x1300037b, 0x14000082 },
|
|
+ { 0x1300037c, 0x14000082 },
|
|
+ { 0x1300037d, 0x14000082 },
|
|
{ 0x0900037e, 0x54000000 },
|
|
{ 0x13800384, 0x60000001 },
|
|
{ 0x13000386, 0x24000026 },
|
|
@@ -647,7 +670,9 @@
|
|
{ 0x130003fa, 0x24000001 },
|
|
{ 0x130003fb, 0x1400ffff },
|
|
{ 0x130003fc, 0x14000000 },
|
|
- { 0x138003fd, 0x24000002 },
|
|
+ { 0x130003fd, 0x2400ff7e },
|
|
+ { 0x130003fe, 0x2400ff7e },
|
|
+ { 0x130003ff, 0x2400ff7e },
|
|
{ 0x0c000400, 0x24000050 },
|
|
{ 0x0c000401, 0x24000050 },
|
|
{ 0x0c000402, 0x24000050 },
|
|
@@ -835,7 +860,7 @@
|
|
{ 0x0c0004bd, 0x1400ffff },
|
|
{ 0x0c0004be, 0x24000001 },
|
|
{ 0x0c0004bf, 0x1400ffff },
|
|
- { 0x0c0004c0, 0x24000000 },
|
|
+ { 0x0c0004c0, 0x2400000f },
|
|
{ 0x0c0004c1, 0x24000001 },
|
|
{ 0x0c0004c2, 0x1400ffff },
|
|
{ 0x0c0004c3, 0x24000001 },
|
|
@@ -850,6 +875,7 @@
|
|
{ 0x0c0004cc, 0x1400ffff },
|
|
{ 0x0c0004cd, 0x24000001 },
|
|
{ 0x0c0004ce, 0x1400ffff },
|
|
+ { 0x0c0004cf, 0x1400fff1 },
|
|
{ 0x0c0004d0, 0x24000001 },
|
|
{ 0x0c0004d1, 0x1400ffff },
|
|
{ 0x0c0004d2, 0x24000001 },
|
|
@@ -892,6 +918,12 @@
|
|
{ 0x0c0004f7, 0x1400ffff },
|
|
{ 0x0c0004f8, 0x24000001 },
|
|
{ 0x0c0004f9, 0x1400ffff },
|
|
+ { 0x0c0004fa, 0x24000001 },
|
|
+ { 0x0c0004fb, 0x1400ffff },
|
|
+ { 0x0c0004fc, 0x24000001 },
|
|
+ { 0x0c0004fd, 0x1400ffff },
|
|
+ { 0x0c0004fe, 0x24000001 },
|
|
+ { 0x0c0004ff, 0x1400ffff },
|
|
{ 0x0c000500, 0x24000001 },
|
|
{ 0x0c000501, 0x1400ffff },
|
|
{ 0x0c000502, 0x24000001 },
|
|
@@ -908,6 +940,10 @@
|
|
{ 0x0c00050d, 0x1400ffff },
|
|
{ 0x0c00050e, 0x24000001 },
|
|
{ 0x0c00050f, 0x1400ffff },
|
|
+ { 0x0c000510, 0x24000001 },
|
|
+ { 0x0c000511, 0x1400ffff },
|
|
+ { 0x0c000512, 0x24000001 },
|
|
+ { 0x0c000513, 0x1400ffff },
|
|
{ 0x01000531, 0x24000030 },
|
|
{ 0x01000532, 0x24000030 },
|
|
{ 0x01000533, 0x24000030 },
|
|
@@ -989,8 +1025,7 @@
|
|
{ 0x01000587, 0x14000000 },
|
|
{ 0x09000589, 0x54000000 },
|
|
{ 0x0100058a, 0x44000000 },
|
|
- { 0x19800591, 0x30000028 },
|
|
- { 0x198005bb, 0x30000002 },
|
|
+ { 0x19800591, 0x3000002c },
|
|
{ 0x190005be, 0x54000000 },
|
|
{ 0x190005bf, 0x30000000 },
|
|
{ 0x190005c0, 0x54000000 },
|
|
@@ -1043,6 +1078,13 @@
|
|
{ 0x37800780, 0x1c000025 },
|
|
{ 0x378007a6, 0x3000000a },
|
|
{ 0x370007b1, 0x1c000000 },
|
|
+ { 0x3f8007c0, 0x34000009 },
|
|
+ { 0x3f8007ca, 0x1c000020 },
|
|
+ { 0x3f8007eb, 0x30000008 },
|
|
+ { 0x3f8007f4, 0x18000001 },
|
|
+ { 0x3f0007f6, 0x68000000 },
|
|
+ { 0x3f8007f7, 0x54000002 },
|
|
+ { 0x3f0007fa, 0x18000000 },
|
|
{ 0x0e800901, 0x30000001 },
|
|
{ 0x0e000903, 0x28000000 },
|
|
{ 0x0e800904, 0x1c000035 },
|
|
@@ -1059,7 +1101,7 @@
|
|
{ 0x09800964, 0x54000001 },
|
|
{ 0x0e800966, 0x34000009 },
|
|
{ 0x09000970, 0x54000000 },
|
|
- { 0x0e00097d, 0x1c000000 },
|
|
+ { 0x0e80097b, 0x1c000004 },
|
|
{ 0x02000981, 0x30000000 },
|
|
{ 0x02800982, 0x28000001 },
|
|
{ 0x02800985, 0x1c000007 },
|
|
@@ -1203,7 +1245,9 @@
|
|
{ 0x1c800cd5, 0x28000001 },
|
|
{ 0x1c000cde, 0x1c000000 },
|
|
{ 0x1c800ce0, 0x1c000001 },
|
|
+ { 0x1c800ce2, 0x30000001 },
|
|
{ 0x1c800ce6, 0x34000009 },
|
|
+ { 0x1c800cf1, 0x68000001 },
|
|
{ 0x24800d02, 0x28000001 },
|
|
{ 0x24800d05, 0x1c000007 },
|
|
{ 0x24800d0e, 0x1c000002 },
|
|
@@ -1452,13 +1496,33 @@
|
|
{ 0x05801a17, 0x30000001 },
|
|
{ 0x05801a19, 0x28000002 },
|
|
{ 0x05801a1e, 0x54000001 },
|
|
+ { 0x3d801b00, 0x30000003 },
|
|
+ { 0x3d001b04, 0x28000000 },
|
|
+ { 0x3d801b05, 0x1c00002e },
|
|
+ { 0x3d001b34, 0x30000000 },
|
|
+ { 0x3d001b35, 0x28000000 },
|
|
+ { 0x3d801b36, 0x30000004 },
|
|
+ { 0x3d001b3b, 0x28000000 },
|
|
+ { 0x3d001b3c, 0x30000000 },
|
|
+ { 0x3d801b3d, 0x28000004 },
|
|
+ { 0x3d001b42, 0x30000000 },
|
|
+ { 0x3d801b43, 0x28000001 },
|
|
+ { 0x3d801b45, 0x1c000006 },
|
|
+ { 0x3d801b50, 0x34000009 },
|
|
+ { 0x3d801b5a, 0x54000006 },
|
|
+ { 0x3d801b61, 0x68000009 },
|
|
+ { 0x3d801b6b, 0x30000008 },
|
|
+ { 0x3d801b74, 0x68000008 },
|
|
{ 0x21801d00, 0x1400002b },
|
|
{ 0x21801d2c, 0x18000035 },
|
|
{ 0x21801d62, 0x14000015 },
|
|
{ 0x0c001d78, 0x18000000 },
|
|
- { 0x21801d79, 0x14000021 },
|
|
+ { 0x21801d79, 0x14000003 },
|
|
+ { 0x21001d7d, 0x14000ee6 },
|
|
+ { 0x21801d7e, 0x1400001c },
|
|
{ 0x21801d9b, 0x18000024 },
|
|
- { 0x1b801dc0, 0x30000003 },
|
|
+ { 0x1b801dc0, 0x3000000a },
|
|
+ { 0x1b801dfe, 0x30000001 },
|
|
{ 0x21001e00, 0x24000001 },
|
|
{ 0x21001e01, 0x1400ffff },
|
|
{ 0x21001e02, 0x24000001 },
|
|
@@ -1967,7 +2031,7 @@
|
|
{ 0x1b8020dd, 0x2c000003 },
|
|
{ 0x1b0020e1, 0x30000000 },
|
|
{ 0x1b8020e2, 0x2c000002 },
|
|
- { 0x1b8020e5, 0x30000006 },
|
|
+ { 0x1b8020e5, 0x3000000a },
|
|
{ 0x09802100, 0x68000001 },
|
|
{ 0x09002102, 0x24000000 },
|
|
{ 0x09802103, 0x68000003 },
|
|
@@ -1995,7 +2059,7 @@
|
|
{ 0x0900212e, 0x68000000 },
|
|
{ 0x0900212f, 0x14000000 },
|
|
{ 0x09802130, 0x24000001 },
|
|
- { 0x09002132, 0x68000000 },
|
|
+ { 0x21002132, 0x2400001c },
|
|
{ 0x09002133, 0x24000000 },
|
|
{ 0x09002134, 0x14000000 },
|
|
{ 0x09802135, 0x1c000003 },
|
|
@@ -2008,7 +2072,8 @@
|
|
{ 0x09802146, 0x14000003 },
|
|
{ 0x0900214a, 0x68000000 },
|
|
{ 0x0900214b, 0x64000000 },
|
|
- { 0x0900214c, 0x68000000 },
|
|
+ { 0x0980214c, 0x68000001 },
|
|
+ { 0x2100214e, 0x1400ffe4 },
|
|
{ 0x09802153, 0x3c00000c },
|
|
{ 0x09002160, 0x38000010 },
|
|
{ 0x09002161, 0x38000010 },
|
|
@@ -2042,7 +2107,9 @@
|
|
{ 0x0900217d, 0x3800fff0 },
|
|
{ 0x0900217e, 0x3800fff0 },
|
|
{ 0x0900217f, 0x3800fff0 },
|
|
- { 0x09802180, 0x38000003 },
|
|
+ { 0x09802180, 0x38000002 },
|
|
+ { 0x09002183, 0x24000001 },
|
|
+ { 0x21002184, 0x1400ffff },
|
|
{ 0x09802190, 0x64000004 },
|
|
{ 0x09802195, 0x68000004 },
|
|
{ 0x0980219a, 0x64000001 },
|
|
@@ -2073,10 +2140,9 @@
|
|
{ 0x0900237c, 0x64000000 },
|
|
{ 0x0980237d, 0x6800001d },
|
|
{ 0x0980239b, 0x64000018 },
|
|
- { 0x090023b4, 0x58000000 },
|
|
- { 0x090023b5, 0x48000000 },
|
|
- { 0x090023b6, 0x54000000 },
|
|
- { 0x098023b7, 0x68000024 },
|
|
+ { 0x098023b4, 0x68000027 },
|
|
+ { 0x098023dc, 0x64000005 },
|
|
+ { 0x098023e2, 0x68000005 },
|
|
{ 0x09802400, 0x68000026 },
|
|
{ 0x09802440, 0x6800000a },
|
|
{ 0x09802460, 0x3c00003b },
|
|
@@ -2143,7 +2209,7 @@
|
|
{ 0x09802600, 0x6800006e },
|
|
{ 0x0900266f, 0x64000000 },
|
|
{ 0x09802670, 0x6800002c },
|
|
- { 0x098026a0, 0x68000011 },
|
|
+ { 0x098026a0, 0x68000012 },
|
|
{ 0x09802701, 0x68000003 },
|
|
{ 0x09802706, 0x68000003 },
|
|
{ 0x0980270c, 0x6800001b },
|
|
@@ -2174,6 +2240,7 @@
|
|
{ 0x098027c0, 0x64000004 },
|
|
{ 0x090027c5, 0x58000000 },
|
|
{ 0x090027c6, 0x48000000 },
|
|
+ { 0x098027c7, 0x64000003 },
|
|
{ 0x098027d0, 0x64000015 },
|
|
{ 0x090027e6, 0x58000000 },
|
|
{ 0x090027e7, 0x48000000 },
|
|
@@ -2215,7 +2282,8 @@
|
|
{ 0x090029fc, 0x58000000 },
|
|
{ 0x090029fd, 0x48000000 },
|
|
{ 0x098029fe, 0x64000101 },
|
|
- { 0x09802b00, 0x68000013 },
|
|
+ { 0x09802b00, 0x6800001a },
|
|
+ { 0x09802b20, 0x68000003 },
|
|
{ 0x11002c00, 0x24000030 },
|
|
{ 0x11002c01, 0x24000030 },
|
|
{ 0x11002c02, 0x24000030 },
|
|
@@ -2310,6 +2378,23 @@
|
|
{ 0x11002c5c, 0x1400ffd0 },
|
|
{ 0x11002c5d, 0x1400ffd0 },
|
|
{ 0x11002c5e, 0x1400ffd0 },
|
|
+ { 0x21002c60, 0x24000001 },
|
|
+ { 0x21002c61, 0x1400ffff },
|
|
+ { 0x21002c62, 0x2400d609 },
|
|
+ { 0x21002c63, 0x2400f11a },
|
|
+ { 0x21002c64, 0x2400d619 },
|
|
+ { 0x21002c65, 0x1400d5d5 },
|
|
+ { 0x21002c66, 0x1400d5d8 },
|
|
+ { 0x21002c67, 0x24000001 },
|
|
+ { 0x21002c68, 0x1400ffff },
|
|
+ { 0x21002c69, 0x24000001 },
|
|
+ { 0x21002c6a, 0x1400ffff },
|
|
+ { 0x21002c6b, 0x24000001 },
|
|
+ { 0x21002c6c, 0x1400ffff },
|
|
+ { 0x21002c74, 0x14000000 },
|
|
+ { 0x21002c75, 0x24000001 },
|
|
+ { 0x21002c76, 0x1400ffff },
|
|
+ { 0x21002c77, 0x14000000 },
|
|
{ 0x0a002c80, 0x24000001 },
|
|
{ 0x0a002c81, 0x1400ffff },
|
|
{ 0x0a002c82, 0x24000001 },
|
|
@@ -2559,6 +2644,8 @@
|
|
{ 0x3c80a016, 0x1c000476 },
|
|
{ 0x3c80a490, 0x68000036 },
|
|
{ 0x0980a700, 0x60000016 },
|
|
+ { 0x0980a717, 0x18000003 },
|
|
+ { 0x0980a720, 0x60000001 },
|
|
{ 0x3080a800, 0x1c000001 },
|
|
{ 0x3000a802, 0x28000000 },
|
|
{ 0x3080a803, 0x1c000002 },
|
|
@@ -2570,6 +2657,8 @@
|
|
{ 0x3080a825, 0x30000001 },
|
|
{ 0x3000a827, 0x28000000 },
|
|
{ 0x3080a828, 0x68000003 },
|
|
+ { 0x4080a840, 0x1c000033 },
|
|
+ { 0x4080a874, 0x54000003 },
|
|
{ 0x1780ac00, 0x1c002ba3 },
|
|
{ 0x0980d800, 0x1000037f },
|
|
{ 0x0980db80, 0x1000007f },
|
|
@@ -2765,13 +2854,15 @@
|
|
{ 0x1301018a, 0x3c000000 },
|
|
{ 0x29810300, 0x1c00001e },
|
|
{ 0x29810320, 0x3c000003 },
|
|
- { 0x12810330, 0x1c000019 },
|
|
+ { 0x12810330, 0x1c000010 },
|
|
+ { 0x12010341, 0x38000000 },
|
|
+ { 0x12810342, 0x1c000007 },
|
|
{ 0x1201034a, 0x38000000 },
|
|
{ 0x3b810380, 0x1c00001d },
|
|
{ 0x3b01039f, 0x54000000 },
|
|
{ 0x2a8103a0, 0x1c000023 },
|
|
{ 0x2a8103c8, 0x1c000007 },
|
|
- { 0x2a0103d0, 0x68000000 },
|
|
+ { 0x2a0103d0, 0x54000000 },
|
|
{ 0x2a8103d1, 0x38000004 },
|
|
{ 0x0d010400, 0x24000028 },
|
|
{ 0x0d010401, 0x24000028 },
|
|
@@ -2861,6 +2952,9 @@
|
|
{ 0x0b810837, 0x1c000001 },
|
|
{ 0x0b01083c, 0x1c000000 },
|
|
{ 0x0b01083f, 0x1c000000 },
|
|
+ { 0x41810900, 0x1c000015 },
|
|
+ { 0x41810916, 0x3c000003 },
|
|
+ { 0x4101091f, 0x54000000 },
|
|
{ 0x1e010a00, 0x1c000000 },
|
|
{ 0x1e810a01, 0x30000002 },
|
|
{ 0x1e810a05, 0x30000001 },
|
|
@@ -2872,6 +2966,9 @@
|
|
{ 0x1e010a3f, 0x30000000 },
|
|
{ 0x1e810a40, 0x3c000007 },
|
|
{ 0x1e810a50, 0x54000008 },
|
|
+ { 0x3e812000, 0x1c00036e },
|
|
+ { 0x3e812400, 0x38000062 },
|
|
+ { 0x3e812470, 0x54000003 },
|
|
{ 0x0981d000, 0x680000f5 },
|
|
{ 0x0981d100, 0x68000026 },
|
|
{ 0x0981d12a, 0x6800003a },
|
|
@@ -2890,6 +2987,7 @@
|
|
{ 0x1381d242, 0x30000002 },
|
|
{ 0x1301d245, 0x68000000 },
|
|
{ 0x0981d300, 0x68000056 },
|
|
+ { 0x0981d360, 0x3c000011 },
|
|
{ 0x0981d400, 0x24000019 },
|
|
{ 0x0981d41a, 0x14000019 },
|
|
{ 0x0981d434, 0x24000019 },
|
|
@@ -2957,6 +3055,8 @@
|
|
{ 0x0981d7aa, 0x14000018 },
|
|
{ 0x0901d7c3, 0x64000000 },
|
|
{ 0x0981d7c4, 0x14000005 },
|
|
+ { 0x0901d7ca, 0x24000000 },
|
|
+ { 0x0901d7cb, 0x14000000 },
|
|
{ 0x0981d7ce, 0x34000031 },
|
|
{ 0x16820000, 0x1c00a6d6 },
|
|
{ 0x1682f800, 0x1c00021d },
|