[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 07/10] regex: fix longstanding backref match bug
From: |
Paul Eggert |
Subject: |
[PATCH 07/10] regex: fix longstanding backref match bug |
Date: |
Fri, 5 Feb 2021 17:25:59 -0800 |
This fixes a longstanding glibc bug concerning backreferences
<https://sourceware.org/11053> (2009-12-04).
* lib/regexec.c (proceed_next_node, push_fail_stack)
(pop_fail_stack): Push and pop the previous registers
as well as the current ones. All callers changed.
(set_regs): Also pop if CUR_NODE has already been checked,
so that it does not get added as a duplicate set entry.
(update_regs): Fix comment location.
* tests/test-regex.c (tests): New constant.
(bug_regex11): New test function.
(main): Bump alarm value. Call new test function.
---
ChangeLog | 13 ++++++
lib/regexec.c | 26 +++++++----
tests/test-regex.c | 113 ++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 141 insertions(+), 11 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 74304474b..bd7d1fa16 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
2021-02-05 Paul Eggert <eggert@cs.ucla.edu>
+ regex: fix longstanding backref match bug
+ This fixes a longstanding glibc bug concerning backreferences
+ <https://sourceware.org/11053> (2009-12-04).
+ * lib/regexec.c (proceed_next_node, push_fail_stack)
+ (pop_fail_stack): Push and pop the previous registers
+ as well as the current ones. All callers changed.
+ (set_regs): Also pop if CUR_NODE has already been checked,
+ so that it does not get added as a duplicate set entry.
+ (update_regs): Fix comment location.
+ * tests/test-regex.c (tests): New constant.
+ (bug_regex11): New test function.
+ (main): Bump alarm value. Call new test function.
+
regex: avoid duplicate in espilon closure
* lib/regcomp.c (calc_eclosure_iter): Insert NODE into epsilon
closure first rather than last. Otherwise, the epsilon closure
diff --git a/lib/regexec.c b/lib/regexec.c
index fdd2e373e..424bc8d15 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -59,7 +59,7 @@ static void update_regs (const re_dfa_t *dfa, regmatch_t
*pmatch,
Idx cur_idx, Idx nmatch);
static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
Idx str_idx, Idx dest_node, Idx nregs,
- regmatch_t *regs,
+ regmatch_t *regs, regmatch_t *prevregs,
re_node_set *eps_via_nodes);
static reg_errcode_t set_regs (const regex_t *preg,
const re_match_context_t *mctx,
@@ -1211,6 +1211,7 @@ check_halt_state_context (const re_match_context_t *mctx,
static Idx
proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
+ regmatch_t *prevregs,
Idx *pidx, Idx node, re_node_set *eps_via_nodes,
struct re_fail_stack_t *fs)
{
@@ -1243,7 +1244,7 @@ proceed_next_node (const re_match_context_t *mctx, Idx
nregs, regmatch_t *regs,
/* Otherwise, push the second epsilon-transition on the fail
stack. */
else if (fs != NULL
&& push_fail_stack (fs, *pidx, candidate, nregs, regs,
- eps_via_nodes))
+ prevregs, eps_via_nodes))
return -2;
/* We know we are going to exit. */
@@ -1316,7 +1317,8 @@ proceed_next_node (const re_match_context_t *mctx, Idx
nregs, regmatch_t *regs,
static reg_errcode_t
__attribute_warn_unused_result__
push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
- Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
+ Idx nregs, regmatch_t *regs, regmatch_t *prevregs,
+ re_node_set *eps_via_nodes)
{
reg_errcode_t err;
Idx num = fs->num++;
@@ -1332,23 +1334,26 @@ push_fail_stack (struct re_fail_stack_t *fs, Idx
str_idx, Idx dest_node,
}
fs->stack[num].idx = str_idx;
fs->stack[num].node = dest_node;
- fs->stack[num].regs = re_malloc (regmatch_t, nregs);
+ fs->stack[num].regs = re_malloc (regmatch_t, 2 * nregs);
if (fs->stack[num].regs == NULL)
return REG_ESPACE;
memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
+ memcpy (fs->stack[num].regs + nregs, prevregs, sizeof (regmatch_t) * nregs);
err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
return err;
}
static Idx
pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
- regmatch_t *regs, re_node_set *eps_via_nodes)
+ regmatch_t *regs, regmatch_t *prevregs,
+ re_node_set *eps_via_nodes)
{
if (fs == NULL || fs->num == 0)
return -1;
Idx num = --fs->num;
*pidx = fs->stack[num].idx;
memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
+ memcpy (prevregs, fs->stack[num].regs + nregs, sizeof (regmatch_t) * nregs);
re_node_set_free (eps_via_nodes);
re_free (fs->stack[num].regs);
*eps_via_nodes = fs->stack[num].eps_via_nodes;
@@ -1408,7 +1413,8 @@ set_regs (const regex_t *preg, const re_match_context_t
*mctx, size_t nmatch,
{
update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
- if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
+ if ((idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
+ || re_node_set_contains (&eps_via_nodes, cur_node))
{
Idx reg_idx;
cur_node = -1;
@@ -1418,7 +1424,7 @@ set_regs (const regex_t *preg, const re_match_context_t
*mctx, size_t nmatch,
if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
{
cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
- &eps_via_nodes);
+ prev_idx_match, &eps_via_nodes);
break;
}
}
@@ -1431,7 +1437,8 @@ set_regs (const regex_t *preg, const re_match_context_t
*mctx, size_t nmatch,
}
/* Proceed to next node. */
- cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
+ cur_node = proceed_next_node (mctx, nmatch, pmatch, prev_idx_match,
+ &idx, cur_node,
&eps_via_nodes, fs);
if (__glibc_unlikely (cur_node < 0))
@@ -1443,7 +1450,8 @@ set_regs (const regex_t *preg, const re_match_context_t
*mctx, size_t nmatch,
free_fail_stack_return (fs);
return REG_ESPACE;
}
- cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch, &eps_via_nodes);
+ cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
+ prev_idx_match, &eps_via_nodes);
if (cur_node < 0)
{
re_node_set_free (&eps_via_nodes);
diff --git a/tests/test-regex.c b/tests/test-regex.c
index 58f8200f2..a14619805 100644
--- a/tests/test-regex.c
+++ b/tests/test-regex.c
@@ -55,6 +55,111 @@ really_utf8 (void)
return strcmp (locale_charset (), "UTF-8") == 0;
}
+/* Tests supposed to match; copied from glibc posix/bug-regex11.c. */
+static struct
+{
+ const char *pattern;
+ const char *string;
+ int flags, nmatch;
+ regmatch_t rm[5];
+} const tests[] = {
+ /* Test for newline handling in regex. */
+ { "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } },
+ /* Other tests. */
+ { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } },
+ { ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
+ { { 0, 21 }, { 15, 16 }, { 16, 18 } } },
+ { ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
+ { { 0, 21 }, { 8, 9 }, { 9, 10 } } },
+ { "^\\(a*\\)\\1\\{9\\}\\(a\\{0,9\\}\\)\\([0-9]*;.*[^a]\\2\\([0-9]\\)\\)",
+ "a1;;0a1aa2aaa3aaaa4aaaaa5aaaaaa6aaaaaaa7aaaaaaaa8aaaaaaaaa9aa2aa1a0", 0,
+ 5, { { 0, 67 }, { 0, 0 }, { 0, 1 }, { 1, 67 }, { 66, 67 } } },
+ /* Test for BRE expression anchoring. POSIX says just that this may match;
+ in glibc regex it always matched, so avoid changing it. */
+ { "\\(^\\|foo\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
+ { "\\(foo\\|^\\)bar", "bar", 0, 2, { { 0, 3 }, { -1, -1 } } },
+ /* In ERE this must be treated as an anchor. */
+ { "(^|foo)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
+ { "(foo|^)bar", "bar", REG_EXTENDED, 2, { { 0, 3 }, { -1, -1 } } },
+ /* Here ^ cannot be treated as an anchor according to POSIX. */
+ { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+ { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+ /* More tests on backreferences. */
+ { "()\\1", "x", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+ { "()x\\1", "x", REG_EXTENDED, 2, { { 0, 1 }, { 0, 0 } } },
+ { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+ { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4
} } },
+ { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4
} } },
+ { "(b)()c\\1", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 1 }, { 1, 1 } } },
+ { "()(b)c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
+ { "a(b)()c\\1", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 2 }, { 2, 2 } } },
+ { "a()(b)c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } } },
+ { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 0, 1 } } },
+ { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } },
+ { "a()(b)\\1c\\2", "abcb", REG_EXTENDED, 3, { { 0, 4 }, { 1, 1 }, { 1, 2 } }
},
+ { "a()d(b)\\1c\\2", "adbcb", REG_EXTENDED, 3, { { 0, 5 }, { 1, 1 }, { 2, 3 }
} },
+ { "a(b())\\2\\1", "abbbb", REG_EXTENDED, 3, { { 0, 3 }, { 1, 2 }, { 2, 2 } }
},
+ { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } }
},
+ { "^([^,]*),\\1,\\1$", "a,a,a", REG_EXTENDED, 2, { { 0, 5 }, { 0, 1 } } },
+ { "^([^,]*),\\1,\\1$", "ab,ab,ab", REG_EXTENDED, 2, { { 0, 8 }, { 0, 2 } } },
+ { "^([^,]*),\\1,\\1,\\1$", "abc,abc,abc,abc", REG_EXTENDED, 2,
+ { { 0, 15 }, { 0, 3 } } },
+ { "^(.?)(.?)(.?)(.?)(.?).?\\5\\4\\3\\2\\1$",
+ "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+ "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+ "abcdedcba", REG_EXTENDED, 1, { { 0, 9 } } },
+ /* XXX Not used since they fail so far. */
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$|^.?$",
+ "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
+ "level", REG_NOSUB | REG_EXTENDED, 0, { { -1, -1 } } },
+ { "^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\\9\\8\\7\\6\\5\\4\\3\\2\\1$",
+ "ababababa", REG_EXTENDED, 1, { { 0, 9 } } },
+};
+
+static void
+bug_regex11 (void)
+{
+ regex_t re;
+ regmatch_t rm[5];
+ size_t i;
+ int n;
+
+ for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
+ {
+ n = regcomp (&re, tests[i].pattern, tests[i].flags);
+ if (n != 0)
+ {
+ char buf[500];
+ regerror (n, &re, buf, sizeof (buf));
+ report_error ("%s: regcomp %zd failed: %s", tests[i].pattern, i, buf);
+ continue;
+ }
+
+ if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0))
+ {
+ report_error ("%s: regexec %zd failed", tests[i].pattern, i);
+ regfree (&re);
+ continue;
+ }
+
+ for (n = 0; n < tests[i].nmatch; ++n)
+ if (rm[n].rm_so != tests[i].rm[n].rm_so
+ || rm[n].rm_eo != tests[i].rm[n].rm_eo)
+ {
+ if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1)
+ break;
+ report_error ("%s: regexec %zd match failure rm[%d] %d..%d",
+ tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo);
+ break;
+ }
+
+ regfree (&re);
+ }
+}
+
int
main (void)
{
@@ -65,11 +170,15 @@ main (void)
struct re_registers regs;
#if HAVE_DECL_ALARM
- /* Some builds of glibc go into an infinite loop on this test. */
- int alarm_value = 2;
+ /* In case a bug causes glibc to go into an infinite loop.
+ The tests should take less than 10 s on a reasonably modern CPU. */
+ int alarm_value = 1000;
signal (SIGALRM, SIG_DFL);
alarm (alarm_value);
#endif
+
+ bug_regex11 ();
+
if (setlocale (LC_ALL, "en_US.UTF-8"))
{
{
--
2.27.0
- [PATCH 01/10] regex: improve comments, Paul Eggert, 2021/02/05
- [PATCH 04/10] regex: make it easier to merge into glibc, Paul Eggert, 2021/02/05
- [PATCH 05/10] regex-tests: fix typo, Paul Eggert, 2021/02/05
- [PATCH 03/10] regex: minor refactoring, Paul Eggert, 2021/02/05
- [PATCH 10/10] regex: fix comment location, Paul Eggert, 2021/02/05
- [PATCH 08/10] regex: debug check for set member duplicates, Paul Eggert, 2021/02/05
- [PATCH 02/10] regex: avoid undefined behavior, Paul Eggert, 2021/02/05
- [PATCH 06/10] regex: avoid duplicate in espilon closure, Paul Eggert, 2021/02/05
- [PATCH 07/10] regex: fix longstanding backref match bug,
Paul Eggert <=
- [PATCH 09/10] regex-tests: add bug 11053 test, Paul Eggert, 2021/02/05