PCRE2Project · carenas · Apr 14, 2023 · carenas · Apr 15, 2023 · PhilipHazel
diff --git a/doc/pcre2_set_compile_extra_options.3 b/doc/pcre2_set_compile_extra_options.3
@@ -20,7 +20,7 @@ options are:
 .sp
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     Allow \eK in lookarounds
 .\" JOIN
-  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  Allow \ex{d800} to \ex{dfff}
+  PCRE2_EXTRA_ALLOW_SURROGATES         Allow \ex{d800} to \ex{dfff}
                                          in UTF-8 and UTF-32 modes
 .\" JOIN
   PCRE2_EXTRA_ALT_BSUX                 Extended alternate \eu, \eU, and

diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
@@ -1824,8 +1824,8 @@ Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the
 error that is given if an escape sequence for an invalid Unicode code point is
 encountered in the pattern. In particular, the so-called "surrogate" code
 points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences
-such as \ex{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra
-option, as described in the section entitled "Extra compile options"
+such as \ex{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATES extra option,
+as described in the section entitled "Extra compile options"
 .\" HTML <a href="#extracompileoptions">
 .\" </a>
 below.
@@ -1907,7 +1907,7 @@ assertions, following Perl's lead. This option is provided to re-enable the
 previous behaviour (act in positive lookarounds, ignore in negative ones) in
 case anybody is relying on it.
 .sp
-  PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
+  PCRE2_EXTRA_ALLOW_SURROGATES
 .sp
 This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is
 forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate"
@@ -1924,10 +1924,16 @@ for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does
 not disable the error that occurs, because it applies only to the testing of
 input strings for UTF validity.
 .P
-If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code
-point values in UTF-8 and UTF-32 patterns no longer provoke errors and are
+If the extra option PCRE2_EXTRA_ALLOW_SURROGATES is set, surrogate code point
+values in UTF-8 and UTF-32 patterns no longer provoke errors and are
 incorporated in the compiled pattern. However, they can only match subject
 characters if the matching function is called with PCRE2_NO_UTF_CHECK set.
+.P
+Before 10.43 this option was known as PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES and
+that is still available for backward compatibility, but the new name should be
+used in new code to better reflect that it also applies to characters in that
+range in UTF-32 as part of the pattern or subject, including characters encoded
+in UTF-8 if found in the subject.
 .sp
   PCRE2_EXTRA_ALT_BSUX
 .sp

diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
@@ -538,8 +538,8 @@ limited to certain values, as follows:
 Invalid Unicode code points are all those in the range 0xd800 to 0xdfff (the
 so-called "surrogate" code points). The check for these can be disabled by the
 caller of \fBpcre2_compile()\fP by setting the option
-PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. However, this is possible only in UTF-8
-and UTF-32 modes, because these values are not representable in UTF-16.
+PCRE2_EXTRA_ALLOW_SURROGATES. However, this is possible only in UTF-8 and
+UTF-32 modes, because these values are not representable in UTF-16.
 .
 .
 .SS "Escape sequences in character classes"
@@ -1436,8 +1436,8 @@ inclusive. They can also be used for code points specified numerically, for
 example [\e000-\e037]. Ranges can include any characters that are valid for the
 current mode. In any UTF mode, the so-called "surrogate" characters (those
 whose code points lie between 0xd800 and 0xdfff inclusive) may not be specified
-explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables
-this check). However, ranges such as [\ex{d7ff}-\ex{e000}], which include the
+explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATES option disables the
+check). However, ranges such as [\ex{d7ff}-\ex{e000}], which include the
 surrogates, are always permitted.
 .P
 There is a special case in EBCDIC environments for ranges whose end points are

diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
@@ -577,7 +577,7 @@ for a description of the effects of these options.
 .sp
       allow_empty_class         set PCRE2_ALLOW_EMPTY_CLASS
       allow_lookaround_bsk      set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
-      allow_surrogate_escapes   set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES
+      allow_surrogates          set PCRE2_EXTRA_ALLOW_SURROGATES
       alt_bsux                  set PCRE2_ALT_BSUX
       alt_circumflex            set PCRE2_ALT_CIRCUMFLEX
       alt_verbnames             set PCRE2_ALT_VERBNAMES

diff --git a/doc/pcre2unicode.3 b/doc/pcre2unicode.3
@@ -309,7 +309,7 @@ UTF-32.)
 Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is
 given if an escape sequence for an invalid Unicode code point is encountered in
 the pattern. If you want to allow escape sequences such as \ex{d800} (a
-surrogate code point) you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra
+surrogate code point) you can set the PCRE2_EXTRA_ALLOW_SURROGATES extra
 option. However, this is possible only in UTF-8 and UTF-32 modes, because these
 values are not representable in UTF-16.
 .

diff --git a/src/pcre2.h.in b/src/pcre2.h.in
@@ -146,7 +146,7 @@ D   is inspected during pcre2_dfa_match() execution
 
 /* An additional compile options word is available in the compile context. */
 
-#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  0x00000001u  /* C */
+#define PCRE2_EXTRA_ALLOW_SURROGATES         0x00000001u  /* C */
 #define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    0x00000002u  /* C */
 #define PCRE2_EXTRA_MATCH_WORD               0x00000004u  /* C */
 #define PCRE2_EXTRA_MATCH_LINE               0x00000008u  /* C */
@@ -160,6 +160,9 @@ D   is inspected during pcre2_dfa_match() execution
 #define PCRE2_EXTRA_ASCII_POSIX              0x00000800u  /* C */
 #define PCRE2_EXTRA_ASCII_DIGIT              0x00001000u  /* C */
 
+/* Backward compatibility */
+#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  PCRE2_EXTRA_ALLOW_SURROGATES
+
 /* These are for pcre2_jit_compile(). */
 
 #define PCRE2_JIT_COMPLETE        0x00000001u  /* For full matching */

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
@@ -783,7 +783,7 @@ are allowed. */
 
 #define PUBLIC_COMPILE_EXTRA_OPTIONS \
    (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
-    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
+    PCRE2_EXTRA_ALLOW_SURROGATES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
     PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
     PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
     PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
@@ -1691,7 +1691,7 @@ else
         if (c > 0x10ffffU) *errorcodeptr = ERR77;
         else
           if (c >= 0xd800 && c <= 0xdfff &&
-              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
+              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATES) == 0)
                 *errorcodeptr = ERR73;
         }
       else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
@@ -1886,7 +1886,7 @@ else
       else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
         {
         if (utf && c >= 0xd800 && c <= 0xdfff &&
-            (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
+            (xoptions & PCRE2_EXTRA_ALLOW_SURROGATES) == 0)
           {
           ptr--;
           *errorcodeptr = ERR73;
@@ -1959,7 +1959,7 @@ else
         else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
           {
           if (utf && c >= 0xd800 && c <= 0xdfff &&
-              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
+              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATES) == 0)
             {
             ptr--;
             *errorcodeptr = ERR73;
@@ -10177,23 +10177,29 @@ if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
 
 /* Check UTF. We have the original options in 'options', with that value as
 modified by (*UTF) etc in cb->external_options. The extra option
-PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
+PCRE2_EXTRA_ALLOW_SURROGATES is not permitted in UTF-16 mode because the
 surrogate code points cannot be represented in UTF-16. */
 
 utf = (cb.external_options & PCRE2_UTF) != 0;
 if (utf)
   {
+  BOOL strict = TRUE;
+
+#if PCRE2_CODE_UNIT_WIDTH != 16
+  strict = (ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATES) == 0;
+#endif
+
   if ((options & PCRE2_NEVER_UTF) != 0)
     {
     errorcode = ERR74;
     goto HAD_EARLY_ERROR;
     }
   if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
-       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
+       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset, strict)) != 0)
     goto HAD_ERROR;  /* Offset was set by valid_utf() */
 
 #if PCRE2_CODE_UNIT_WIDTH == 16
-  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
+  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATES) != 0)
     {
     errorcode = ERR91;
     goto HAD_EARLY_ERROR;

diff --git a/src/pcre2_convert.c b/src/pcre2_convert.c
@@ -1090,7 +1090,7 @@ if (utf)
 if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0)
   {
   PCRE2_SIZE erroroffset;
-  rc = PRIV(valid_utf)(pattern, plength, &erroroffset);
+  rc = PRIV(valid_utf)(pattern, plength, &erroroffset, TRUE);
   if (rc != 0)
     {
     *bufflenptr = erroroffset;

diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
@@ -3575,7 +3575,8 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
   offset to be an absolute offset in the whole string. */
 
   match_data->rc = PRIV(valid_utf)(check_subject,
-    length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
+                          length - (PCRE2_SIZE)(check_subject - subject),
+                          &(match_data->startchar), TRUE);
   if (match_data->rc != 0)
     {
     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);

diff --git a/src/pcre2_error.c b/src/pcre2_error.c
@@ -177,7 +177,7 @@ static const unsigned char compile_error_texts[] =
   "internal error: unknown code in parsed pattern\0"
   /* 90 */
   "internal error: bad code value in parsed_skip()\0"
-  "PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0"
+  "PCRE2_EXTRA_ALLOW_SURROGATES is not allowed in UTF-16 mode\0"
   "invalid option bits with PCRE2_LITERAL\0"
   "\\N{U+dddd} is supported only in Unicode (UTF) mode\0"
   "invalid hyphen in option setting\0"

diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
@@ -206,7 +206,7 @@ Unicode doesn't go beyond 0x0010ffff. */
 
 /* This is the largest valid UTF/Unicode code point. */
 
-#define MAX_UTF_CODE_POINT 0x10ffff
+#define MAX_UTF_CODE_POINT 0x10ffffu
 
 /* Compile-time positive error numbers (all except UTF errors, which are
 negative) start at this value. It should probably never be changed, in case
@@ -2036,7 +2036,8 @@ extern PCRE2_SIZE   _pcre2_strlen(PCRE2_SPTR);
 extern int          _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t);
 extern int          _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t);
 extern int          _pcre2_study(pcre2_real_code *);
-extern int          _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *);
+extern int          _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *,
+                      BOOL);
 extern BOOL         _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR,
                       uint32_t *, BOOL);
 extern BOOL         _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL);

diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h
@@ -471,7 +471,7 @@ code. */
 /* These are trivial for the 32-bit library, since all UTF-32 characters fit
 into one PCRE2_UCHAR unit. */
 
-#define MAX_UTF_SINGLE_CU (0x10ffffu)
+#define MAX_UTF_SINGLE_CU 0x10ffffu
 #define HAS_EXTRALEN(c) (0)
 #define GET_EXTRALEN(c) (0)
 #define NOT_FIRSTCU(c) (0)

diff --git a/src/pcre2_match.c b/src/pcre2_match.c
@@ -6551,7 +6551,7 @@ if (use_jit)
     invalid code point to be an absolute offset in the whole string. */
 
     match_data->rc = PRIV(valid_utf)(start_match,
-      length - (start_match - subject), &(match_data->startchar));
+      length - (start_match - subject), &(match_data->startchar), TRUE);
     if (match_data->rc != 0)
       {
       match_data->startchar += start_match - subject;
@@ -6598,14 +6598,15 @@ If we get here in those circumstances, it means the subject string is valid,
 but for some reason JIT matching was not successful. There is no need to check
 the subject again.
 
-We check only the portion of the subject that might be be inspected during
+We check only the portion of the subject that might be inspected during
 matching - from the offset minus the maximum lookbehind to the given length.
 This saves time when a small part of a large subject is being matched by the
 use of a starting offset. Note that the maximum lookbehind is a number of
 characters, not code units.
 
 Note also that support for invalid UTF forces a check, overriding the setting
-of PCRE2_NO_CHECK_UTF. */
+of PCRE2_NO_UTF_CHECK, so valid_utf() has to be told not to error if a
+surrogate is found and the PCRE2_EXTRA_ALLOW_SURROGATES setting is used. */
 
 #ifdef SUPPORT_UNICODE
 if (utf &&
@@ -6685,8 +6686,11 @@ if (utf &&
 
   for (;;)
     {
+    BOOL strict = (re->extra_options & PCRE2_EXTRA_ALLOW_SURROGATES) == 0;
+
     match_data->rc = PRIV(valid_utf)(mb->check_subject,
-      length - (mb->check_subject - subject), &(match_data->startchar));
+                                     length - (mb->check_subject - subject),
+                                     &(match_data->startchar), strict);
 
     if (match_data->rc == 0) break;   /* Valid UTF string */
 
@@ -7461,7 +7465,7 @@ if (utf && end_subject != true_end_subject &&
 
     mb->check_subject = start_match;
     rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
-      &(match_data->startchar));
+                         &(match_data->startchar), TRUE);
 
     /* The rest of the subject is valid UTF. */
 

diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
@@ -345,7 +345,7 @@ if (length == PCRE2_ZERO_TERMINATED)
 #ifdef SUPPORT_UNICODE
 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
   {
-  rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar));
+  rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar), TRUE);
   if (rc != 0)
     {
     match_data->leftchar = 0;