>From 86ec0ec94e175d96a8910acfff8bb31735078ed5 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Wed, 6 Jan 2016 22:40:23 -0800 Subject: [PATCH] Improve on fix for Bug#22181 * src/pcresearch.c (Pexecute): Update subject when skipping past easily-determined encoding errors, as this is faster than letting pcre_exec skip them. On my platform this improves performance 4.7x on a benchmark created via "yes $(printf '\200\200\200\200 \200\200\200\200\200\200\200\200\200\200\200\200\200\200\200\200x\n') | head -n 1000000 >j; grep -oP y j" in a UTF-8 locale. Rework code that deals with PCRE_ERROR_BADUTF8 return, to avoid an incorrect (albeit currently harmless) 'bol = false' assignment. --- src/pcresearch.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/pcresearch.c b/src/pcresearch.c index 8f3d935..c0b8678 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -229,6 +229,7 @@ Pexecute (char *buf, size_t size, size_t *match_size, while (mbclen_cache[to_uchar (*p)] == (size_t) -1) { p++; + subject = p; bol = false; } @@ -269,29 +270,30 @@ Pexecute (char *buf, size_t size, size_t *match_size, } int valid_bytes = sub[0]; - /* Try to match the string before the encoding error. */ - if (valid_bytes < search_offset) - e = PCRE_ERROR_NOMATCH; - else if (valid_bytes == 0) + if (search_offset <= valid_bytes) { - /* Handle the empty-match case specially, for speed. - This optimization is valid if VALID_BYTES is zero, - which means SEARCH_OFFSET is also zero. */ - sub[1] = 0; - e = empty_match[bol]; - } - else - e = jit_exec (subject, valid_bytes, search_offset, - options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub); + /* Try to match the string before the encoding error. */ + if (valid_bytes == 0) + { + /* Handle the empty-match case specially, for speed. + This optimization is valid if VALID_BYTES is zero, + which means SEARCH_OFFSET is also zero. */ + sub[1] = 0; + e = empty_match[bol]; + } + else + e = jit_exec (subject, valid_bytes, search_offset, + options | PCRE_NO_UTF8_CHECK | PCRE_NOTEOL, sub); - if (e != PCRE_ERROR_NOMATCH) - break; + if (e != PCRE_ERROR_NOMATCH) + break; + + /* Treat the encoding error as data that cannot match. */ + p = subject + valid_bytes + 1; + bol = false; + } - /* Treat the encoding error as data that cannot match. */ subject += valid_bytes + 1; - if (p < subject) - p = subject; - bol = false; } if (e != PCRE_ERROR_NOMATCH) -- 2.5.0