grep-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[RFC PATCH v2] pcre: migrate to pcre2


From: Carlo Marcelo Arenas Belón
Subject: [RFC PATCH v2] pcre: migrate to pcre2
Date: Wed, 13 Oct 2021 20:12:23 -0700

This patch assumes a very recent version of the pcre2 library
(PCRE2_MATCH_INVALID_UTF comes with 10.34), but a test for it is still missing.

I removed some optimizations that might need to be brought back once tested,
had to add some casts to get rid of some warnings, which I didn't really like,
and may be missing some error checking.

Signed-off-by: Carlo Marcelo Arenas Belón <carenas@gmail.com>
---
 configure.ac             |   2 +-
 m4/{pcre.m4 => pcre2.m4} |  23 +++--
 src/pcresearch.c         | 210 ++++++++++++++-------------------------
 tests/filename-lineno.pl |   4 +-
 4 files changed, 90 insertions(+), 149 deletions(-)
 rename m4/{pcre.m4 => pcre2.m4} (67%)

diff --git a/configure.ac b/configure.ac
index c49ec4a..9291cee 100644
--- a/configure.ac
+++ b/configure.ac
@@ -197,7 +197,7 @@ if test "$ac_use_included_regex" = no; then
   AC_MSG_WARN([Included lib/regex.c not used])
 fi
 
-gl_FUNC_PCRE
+gl_FUNC_PCRE2
 AM_CONDITIONAL([USE_PCRE], [test $use_pcre = yes])
 
 case $host_os in
diff --git a/m4/pcre.m4 b/m4/pcre2.m4
similarity index 67%
rename from m4/pcre.m4
rename to m4/pcre2.m4
index 78b7fda..7970c4e 100644
--- a/m4/pcre.m4
+++ b/m4/pcre2.m4
@@ -1,15 +1,15 @@
-# pcre.m4 - check for libpcre support
+# pcre2.m4 - check for libpcre2 support
 
 # Copyright (C) 2010-2021 Free Software Foundation, Inc.
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
 # with or without modifications, as long as this notice is preserved.
 
-AC_DEFUN([gl_FUNC_PCRE],
+AC_DEFUN([gl_FUNC_PCRE2],
 [
   AC_ARG_ENABLE([perl-regexp],
     AS_HELP_STRING([--disable-perl-regexp],
-                   [disable perl-regexp (pcre) support]),
+                   [disable perl-regexp (pcre2) support]),
     [case $enableval in
        yes|no) test_pcre=$enableval;;
        *) AC_MSG_ERROR([invalid value $enableval for --disable-perl-regexp]);;
@@ -21,24 +21,25 @@ AC_DEFUN([gl_FUNC_PCRE],
   use_pcre=no
 
   if test $test_pcre != no; then
-    PKG_CHECK_MODULES([PCRE], [libpcre], [], [: ${PCRE_LIBS=-lpcre}])
+    PKG_CHECK_MODULES([PCRE], [libpcre2-8], [], [: ${PCRE_LIBS=-lpcre2-8}])
 
-    AC_CACHE_CHECK([for pcre_compile], [pcre_cv_have_pcre_compile],
+    AC_CACHE_CHECK([for pcre2_compile], [pcre_cv_have_pcre2_compile],
       [pcre_saved_CFLAGS=$CFLAGS
        pcre_saved_LIBS=$LIBS
        CFLAGS="$CFLAGS $PCRE_CFLAGS"
        LIBS="$PCRE_LIBS $LIBS"
        AC_LINK_IFELSE(
-         [AC_LANG_PROGRAM([[#include <pcre.h>
+         [AC_LANG_PROGRAM([[#define PCRE2_CODE_UNIT_WIDTH 8
+                            #include <pcre2.h>
                           ]],
-            [[pcre *p = pcre_compile (0, 0, 0, 0, 0);
+            [[pcre2_code *p = pcre2_compile (0, 0, 0, 0, 0, 0);
               return !p;]])],
-         [pcre_cv_have_pcre_compile=yes],
-         [pcre_cv_have_pcre_compile=no])
+         [pcre_cv_have_pcre2_compile=yes],
+         [pcre_cv_have_pcre2_compile=no])
        CFLAGS=$pcre_saved_CFLAGS
        LIBS=$pcre_saved_LIBS])
 
-    if test "$pcre_cv_have_pcre_compile" = yes; then
+    if test "$pcre_cv_have_pcre2_compile" = yes; then
       use_pcre=yes
     elif test $test_pcre = maybe; then
       AC_MSG_WARN([AC_PACKAGE_NAME will be built without pcre support.])
@@ -50,7 +51,7 @@ AC_DEFUN([gl_FUNC_PCRE],
   if test $use_pcre = yes; then
     AC_DEFINE([HAVE_LIBPCRE], [1],
       [Define to 1 if you have the Perl Compatible Regular Expressions
-       library (-lpcre).])
+       library (-lpcre2).])
   else
     PCRE_CFLAGS=
     PCRE_LIBS=
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 3bdaee9..a6a4bb0 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -17,93 +17,48 @@
    02110-1301, USA.  */
 
 /* Written August 1992 by Mike Haertel. */
+/* Updated for PCRE2 by Carlo Arenas. */
 
 #include <config.h>
 #include "search.h"
 #include "die.h"
 
-#include <pcre.h>
-
-/* This must be at least 2; everything after that is for performance
-   in pcre_exec.  */
-enum { NSUB = 300 };
-
-#ifndef PCRE_EXTRA_MATCH_LIMIT_RECURSION
-# define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
-#endif
-#ifndef PCRE_STUDY_JIT_COMPILE
-# define PCRE_STUDY_JIT_COMPILE 0
-#endif
-#ifndef PCRE_STUDY_EXTRA_NEEDED
-# define PCRE_STUDY_EXTRA_NEEDED 0
-#endif
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 
 struct pcre_comp
 {
   /* Compiled internal form of a Perl regular expression.  */
-  pcre *cre;
-
-  /* Additional information about the pattern.  */
-  pcre_extra *extra;
-
-#if PCRE_STUDY_JIT_COMPILE
-  /* The JIT stack and its maximum size.  */
-  pcre_jit_stack *jit_stack;
-  int jit_stack_size;
-#endif
-
-  /* Table, indexed by ! (flag & PCRE_NOTBOL), of whether the empty
-     string matches when that flag is used.  */
-  int empty_match[2];
+  pcre2_code *cre;
+  pcre2_match_context *mcontext;
+  pcre2_match_data *data;
 };
 
-
 /* Match the already-compiled PCRE pattern against the data in SUBJECT,
    of size SEARCH_BYTES and starting with offset SEARCH_OFFSET, with
-   options OPTIONS, and storing resulting matches into SUB.  Return
-   the (nonnegative) match location or a (negative) error number.  */
+   options OPTIONS.
+   Return the (nonnegative) match location or a (negative) error number.  */
 static int
 jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
-          int search_offset, int options, int *sub)
+          int search_offset, uint32_t options)
 {
   while (true)
     {
-      int e = pcre_exec (pc->cre, pc->extra, subject, search_bytes,
-                         search_offset, options, sub, NSUB);
+      int e = pcre2_match (pc->cre, (PCRE2_SPTR8)subject, search_bytes,
+                         search_offset, options, pc->data, pc->mcontext);
 
-#if PCRE_STUDY_JIT_COMPILE
-      if (e == PCRE_ERROR_JIT_STACKLIMIT
-          && 0 < pc->jit_stack_size && pc->jit_stack_size <= INT_MAX / 2)
+      if (e == PCRE2_ERROR_JIT_STACKLIMIT && !pc->mcontext)
         {
-          int old_size = pc->jit_stack_size;
-          int new_size = pc->jit_stack_size = old_size * 2;
-          if (pc->jit_stack)
-            pcre_jit_stack_free (pc->jit_stack);
-          pc->jit_stack = pcre_jit_stack_alloc (old_size, new_size);
-          if (!pc->jit_stack)
+          /* The PCRE documentation says that a 32 KiB stack is the default.  */
+          pcre2_jit_stack *s = pcre2_jit_stack_create (64 << 10, INT_MAX / 2,
+                                                       NULL);
+          pc->mcontext = pcre2_match_context_create (NULL);
+          if (!pc->mcontext || !s)
             die (EXIT_TROUBLE, 0,
                  _("failed to allocate memory for the PCRE JIT stack"));
-          pcre_assign_jit_stack (pc->extra, NULL, pc->jit_stack);
+          pcre2_jit_stack_assign (pc->mcontext, NULL, s);
           continue;
         }
-#endif
-
-#if PCRE_EXTRA_MATCH_LIMIT_RECURSION
-      if (e == PCRE_ERROR_RECURSIONLIMIT
-          && (PCRE_STUDY_EXTRA_NEEDED || pc->extra))
-        {
-          unsigned long lim
-            = (pc->extra->flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION
-               ? pc->extra->match_limit_recursion
-               : 0);
-          if (lim <= ULONG_MAX / 2)
-            {
-              pc->extra->match_limit_recursion = lim ? 2 * lim : (1 << 24) - 1;
-              pc->extra->flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
-              continue;
-            }
-        }
-#endif
 
       return e;
     }
@@ -115,27 +70,26 @@ jit_exec (struct pcre_comp *pc, char const *subject, int search_bytes,
 void *
 Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
 {
-  int e;
-  char const *ep;
+  PCRE2_SIZE e;
+  int ec;
+  PCRE2_UCHAR8 ep[128];
   static char const wprefix[] = "(?<!\\w)(?:";
   static char const wsuffix[] = ")(?!\\w)";
-  static char const xprefix[] = "^(?:";
-  static char const xsuffix[] = ")$";
-  int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
-                         sizeof xprefix - 1 + sizeof xsuffix - 1);
-  char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
-  int flags = PCRE_DOLLAR_ENDONLY | (match_icase ? PCRE_CASELESS : 0);
+  size_t fix_len_max = sizeof wprefix - 1 + sizeof wsuffix - 1;
+  unsigned char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
+  uint32_t flags = PCRE2_DOLLAR_ENDONLY | (match_icase ? PCRE2_CASELESS : 0);
   char *patlim = pattern + size;
-  char *n = re;
+  char *n = (char *)re;
   char const *p;
   char const *pnul;
   struct pcre_comp *pc = xcalloc (1, sizeof (*pc));
+  pcre2_compile_context *ccontext = NULL;
 
   if (localeinfo.multibyte)
     {
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
-      flags |= PCRE_UTF8;
+      flags |= PCRE2_UTF | PCRE2_NEVER_BACKSLASH_C | PCRE2_MATCH_INVALID_UTF;
     }
 
   /* FIXME: Remove this restriction.  */
@@ -145,8 +99,6 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
   *n = '\0';
   if (match_words)
     strcpy (n, wprefix);
-  if (match_lines)
-    strcpy (n, xprefix);
   n += strlen (n);
 
   /* The PCRE interface doesn't allow NUL bytes in the pattern, so
@@ -169,36 +121,32 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
   *patlim = '\n';
 
   if (match_words)
-    strcpy (n, wsuffix);
+    {
+      strcpy (n, wsuffix);
+      size += fix_len_max;
+    }
   if (match_lines)
-    strcpy (n, xsuffix);
+    {
+      ccontext = pcre2_compile_context_create(NULL);
+      uint32_t extra_options = PCRE2_EXTRA_MATCH_LINE;
 
-  pc->cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
-  if (!pc->cre)
-    die (EXIT_TROUBLE, 0, "%s", ep);
+      pcre2_set_compile_extra_options(ccontext, extra_options);
+    }
 
-  int pcre_study_flags = PCRE_STUDY_EXTRA_NEEDED | PCRE_STUDY_JIT_COMPILE;
-  pc->extra = pcre_study (pc->cre, pcre_study_flags, &ep);
-  if (ep)
+  pc->cre = pcre2_compile (re, size, flags, &ec, &e, ccontext);
+  if (!pc->cre) {
+    pcre2_get_error_message (ec, ep, 128);
     die (EXIT_TROUBLE, 0, "%s", ep);
+  }
 
-#if PCRE_STUDY_JIT_COMPILE
-  if (pcre_fullinfo (pc->cre, pc->extra, PCRE_INFO_JIT, &e))
-    die (EXIT_TROUBLE, 0, _("internal error (should never happen)"));
+  pc->data = pcre2_match_data_create_from_pattern (pc->cre, NULL);
 
-  /* The PCRE documentation says that a 32 KiB stack is the default.  */
-  if (e)
-    pc->jit_stack_size = 32 << 10;
-#endif
+  ec = pcre2_jit_compile (pc->cre, PCRE2_JIT_COMPLETE);
+  if (ec && ec != PCRE2_ERROR_JIT_BADOPTION && ec != PCRE2_ERROR_NOMEMORY)
+    die (EXIT_TROUBLE, 0, _("JIT internal error: %d"), ec);
 
   free (re);
 
-  int sub[NSUB];
-  pc->empty_match[false] = pcre_exec (pc->cre, pc->extra, "", 0, 0,
-                                      PCRE_NOTBOL, sub, NSUB);
-  pc->empty_match[true] = pcre_exec (pc->cre, pc->extra, "", 0, 0, 0, sub,
-                                     NSUB);
-
   return pc;
 }
 
@@ -206,11 +154,11 @@ ptrdiff_t
 Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
           char const *start_ptr)
 {
-  int sub[NSUB];
+  PCRE2_SIZE *sub;
   char const *p = start_ptr ? start_ptr : buf;
   bool bol = p[-1] == eolbyte;
   char const *line_start = buf;
-  int e = PCRE_ERROR_NOMATCH;
+  int e = PCRE2_ERROR_NOMATCH;
   char const *line_end;
   struct pcre_comp *pc = vcp;
 
@@ -243,41 +191,25 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
 
           int search_offset = p - subject;
 
-          /* Check for an empty match; this is faster than letting
-             pcre_exec do it.  */
-          if (p == line_end)
-            {
-              sub[0] = sub[1] = search_offset;
-              e = pc->empty_match[bol];
-              break;
-            }
-
           int options = 0;
           if (!bol)
-            options |= PCRE_NOTBOL;
+            options |= PCRE2_NOTBOL;
 
           e = jit_exec (pc, subject, line_end - subject, search_offset,
-                        options, sub);
-          if (e != PCRE_ERROR_BADUTF8)
+                        options);
+          if (e != PCRE2_ERROR_BADUTFOFFSET)
             break;
-          int valid_bytes = sub[0];
+
+          sub = pcre2_get_ovector_pointer(pc->data);
+          int valid_bytes = *sub;
 
           if (search_offset <= valid_bytes)
             {
               /* Try to match the string before the encoding error.  */
-              if (valid_bytes == 0)
-                {
-                  /* Handle the empty-match case specially, for speed.
-                     This optimization is valid if VALID_BYTES is zero,
-                     which means SEARCH_OFFSET is also zero.  */
-                  sub[1] = 0;
-                  e = pc->empty_match[bol];
-                }
-              else
-                e = jit_exec (pc, subject, valid_bytes, search_offset,
-                              options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub);
-
-              if (e != PCRE_ERROR_NOMATCH)
+              e = jit_exec (pc, subject, valid_bytes, search_offset,
+                            options | PCRE2_NO_UTF_CHECK | PCRE2_NOTEOL);
+
+              if (e != PCRE2_ERROR_NOMATCH)
                 break;
 
               /* Treat the encoding error as data that cannot match.  */
@@ -288,7 +220,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
           subject += valid_bytes + 1;
         }
 
-      if (e != PCRE_ERROR_NOMATCH)
+      if (e != PCRE2_ERROR_NOMATCH)
         break;
       bol = true;
       p = subject = line_start = line_end + 1;
@@ -299,24 +231,31 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
     {
       switch (e)
         {
-        case PCRE_ERROR_NOMATCH:
+        case PCRE2_ERROR_NOMATCH:
           break;
 
-        case PCRE_ERROR_NOMEMORY:
+        case PCRE2_ERROR_NOMEMORY:
           die (EXIT_TROUBLE, 0, _("%s: memory exhausted"), input_filename ());
 
-#if PCRE_STUDY_JIT_COMPILE
-        case PCRE_ERROR_JIT_STACKLIMIT:
+        case PCRE2_ERROR_JIT_STACKLIMIT:
           die (EXIT_TROUBLE, 0, _("%s: exhausted PCRE JIT stack"),
                input_filename ());
-#endif
 
-        case PCRE_ERROR_MATCHLIMIT:
+        case PCRE2_ERROR_DEPTHLIMIT:
+          die (EXIT_TROUBLE, 0,
+               _("%s: exceeded PCRE's nested backtracking limit"),
+               input_filename ());
+
+        case PCRE2_ERROR_HEAPLIMIT:
+          die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's heap limit"),
+               input_filename ());
+
+        case PCRE2_ERROR_MATCHLIMIT:
           die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's backtracking limit"),
                input_filename ());
 
-        case PCRE_ERROR_RECURSIONLIMIT:
-          die (EXIT_TROUBLE, 0, _("%s: exceeded PCRE's recursion limit"),
+        case PCRE2_ERROR_RECURSELOOP:
+          die (EXIT_TROUBLE, 0, _("%s: PCRE detected recurse loop"),
                input_filename ());
 
         default:
@@ -332,8 +271,9 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size,
     }
   else
     {
-      char const *matchbeg = subject + sub[0];
-      char const *matchend = subject + sub[1];
+      sub = pcre2_get_ovector_pointer(pc->data);
+      char const *matchbeg = subject + *sub;
+      char const *matchend = subject + *(sub + 1);
       char const *beg;
       char const *end;
       if (start_ptr)
diff --git a/tests/filename-lineno.pl b/tests/filename-lineno.pl
index 1e84b45..1ff3d6a 100755
--- a/tests/filename-lineno.pl
+++ b/tests/filename-lineno.pl
@@ -101,13 +101,13 @@ my @Tests =
    ],
    ['invalid-re-P-paren', '-P ")"', {EXIT=>2},
     {ERR => $ENV{PCRE_WORKS} == 1
-       ? "$prog: unmatched parentheses\n"
+       ? "$prog: unmatched closing parenthesis\n"
        : $no_pcre
     },
    ],
    ['invalid-re-P-star-paren', '-P "a.*)"', {EXIT=>2},
     {ERR => $ENV{PCRE_WORKS} == 1
-       ? "$prog: unmatched parentheses\n"
+       ? "$prog: unmatched closing parenthesis\n"
        : $no_pcre
     },
    ],
-- 
2.33.0.1155.gbdb71ac078




reply via email to

[Prev in Thread] Current Thread [Next in Thread]