[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
RRI patches for grep
From: Aharon Robbins
Subject: RRI patches for grep
Date: Fri, 27 Apr 2012 12:07:27 +0300
User-agent: Heirloom mailx 12.4 7/29/08
Here are the updated RRI patches for grep. First one is for dfa.c and
doc/grep.texi. NOT handled is removal of hard-locale.[ch] from lib/ and
from the make infrastructure.
The second patch is for gnulib. Both are relative to master in both
git repos as of less than an hour ago.
Thanks,
Arnold
------------------
From 9b16fdee4edf2b4ea8fc4cfc6b6c45bde6ec8cd4 Mon Sep 17 00:00:00 2001
From: Arnold D. Robbins <address@hidden>
Date: Fri, 27 Apr 2012 12:03:16 +0300
Subject: [PATCH] Implement/Document Rational Range Interpretation.
---
doc/grep.texi | 21 ++++++++++++++++-----
src/dfa.c | 40 ++++++----------------------------------
2 files changed, 22 insertions(+), 39 deletions(-)
diff --git a/doc/grep.texi b/doc/grep.texi
index 000a844..3af72f3 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -958,9 +958,7 @@ They are omitted (i.e., false) by default and become true
when specified.
@cindex character type
@cindex national language support
@cindex NLS
-These variables specify the locale for the @code{LC_COLLATE} category,
-which determines the collating sequence
-used to interpret range expressions like @samp{[a-z]}.
+These variables specify the locale for the @code{LC_COLLATE} category.
@item LC_ALL
@itemx LC_CTYPE
@@ -1221,7 +1219,12 @@ For example, the regular expression
Within a bracket expression, a @dfn{range expression} consists of two
characters separated by a hyphen.
It matches any single character that
-sorts between the two characters, inclusive, using the locale's
+sorts between the two characters, inclusive,
+using the machine's character set.
+
+Up to and including version 2.12 of @command{grep},
+range expressions would match any single character that sorted between
+the two characters, inclusive, using the current locale's
collating sequence and character set.
For example, in the default C
locale, @samp{[a-d]} is equivalent to @samp{[abcd]}.
@@ -1230,9 +1233,17 @@ characters in dictionary order, and in these locales
@samp{[a-d]} is
typically not equivalent to @samp{[abcd]};
it might be equivalent to @samp{[aBbCcDd]}, for example.
To obtain the traditional interpretation
-of bracket expressions, you can use the @samp{C} locale by setting the
+of bracket expressions, it was necessary to use the @samp{C} locale
+by setting the
@env{LC_ALL} environment variable to the value @samp{C}.
+Since the current POSIX standard now makes the behavior of range expressions
+be implementation-defined, instead of requiring the locale's
+collating order, @command{grep} has reverted to the traditional Unix
+behavior of defining ranges based on the machine character set.@footnote{This
+is known as ``Rational Range Interpretation,'' a lovely phrase
+coined by Karl Berry.}
+
Finally, certain named classes of characters are predefined within
bracket expressions, as follows.
Their interpretation depends on the @code{LC_CTYPE} locale;
diff --git a/src/dfa.c b/src/dfa.c
index 1cbe537..c690e10 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -29,6 +29,7 @@
#include <limits.h>
#include <string.h>
#include <locale.h>
+#include <stdbool.h>
#define STREQ(a, b) (strcmp (a, b) == 0)
@@ -46,7 +47,7 @@
#include "gettext.h"
#define _(str) gettext (str)
-#include "mbsupport.h" /* defines MBS_SUPPORT if appropriate */
+#include "mbsupport.h" /* defines MBS_SUPPORT to 1 or 0, as appropriate */
#include <wchar.h>
#include <wctype.h>
@@ -56,7 +57,6 @@
#include "regex.h"
#include "dfa.h"
-#include "hard-locale.h"
#include "xalloc.h"
/* HPUX, define those as macros in sys/param.h */
@@ -777,7 +777,6 @@ static int laststart; /* True if we're separated from beginning or (,
only by zero-width characters. */
static size_t parens; /* Count of outstanding left parens. */
static int minrep, maxrep; /* Repeat counts for {m,n}. */
-static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
static int cur_mb_len = 1; /* Length of the multibyte representation of
wctok. */
@@ -1111,26 +1110,8 @@ parse_bracket_exp (void)
c1 = tolower (c1);
c2 = tolower (c2);
}
- if (!hard_LC_COLLATE)
- for (c = c1; c <= c2; c++)
- setbit_case_fold_c (c, ccl);
- else
- {
- /* Defer to the system regex library about the meaning
- of range expressions. */
- regex_t re;
- char pattern[6] = { '[', c1, '-', c2, ']', 0 };
- char subject[2] = { 0, 0 };
- regcomp (&re, pattern, REG_NOSUB);
- for (c = 0; c < NOTCHAR; ++c)
- {
- subject[0] = c;
- if (!(case_fold && isupper (c))
- && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
- setbit_case_fold_c (c, ccl);
- }
- regfree (&re);
- }
+ for (c = c1; c <= c2; c++)
+ setbit_case_fold_c (c, ccl);
}
colon_warning_state |= 8;
@@ -1878,9 +1859,6 @@ dfaparse (char const *s, size_t len, struct dfa *d)
lasttok = END;
laststart = 1;
parens = 0;
-#ifdef LC_COLLATE
- hard_LC_COLLATE = hard_locale (LC_COLLATE);
-#endif
if (MB_CUR_MAX > 1)
{
cur_mb_len = 0;
@@ -2966,7 +2944,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
with which this operator match. */
int op_len; /* Length of the operator. */
char buffer[128];
- wchar_t wcbuf[6];
/* Pointer to the structure to which we are currently referring. */
struct mb_char_classes *work_mbc;
@@ -3039,16 +3016,11 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
}
}
- wcbuf[0] = wc;
- wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0';
-
/* match with a range? */
for (i = 0; i < work_mbc->nranges; i++)
{
- wcbuf[2] = work_mbc->range_sts[i];
- wcbuf[4] = work_mbc->range_ends[i];
-
- if (wcscoll (wcbuf, wcbuf + 2) >= 0 && wcscoll (wcbuf + 4, wcbuf) >= 0)
+ if (work_mbc->range_sts[i] <= wc &&
+ wc <= work_mbc->range_ends[i])
goto charset_matched;
}
--
1.7.1
From 5c7665f2ced46d2e830958bce1bf46469995d3de Mon Sep 17 00:00:00 2001
From: Arnold D. Robbins <address@hidden>
Date: Fri, 27 Apr 2012 12:04:22 +0300
Subject: [PATCH] Implement Rational Range Interpretation in Gnulib.
---
lib/regcomp.c | 13 +++----------
lib/regexec.c | 12 ++----------
2 files changed, 5 insertions(+), 20 deletions(-)
diff --git a/lib/regcomp.c b/lib/regcomp.c
index b51a9a6..7748535 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -2702,7 +2702,6 @@ build_range_exp (const reg_syntax_t syntax,
wchar_t wc;
wint_t start_wc;
wint_t end_wc;
- wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2716,11 +2715,7 @@ build_range_exp (const reg_syntax_t syntax,
? __btowc (end_ch) : end_elem->opr.wch);
if (start_wc == WEOF || end_wc == WEOF)
return REG_ECOLLATE;
- cmp_buf[0] = start_wc;
- cmp_buf[4] = end_wc;
-
- if (BE ((syntax & RE_NO_EMPTY_RANGES)
- && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
+ else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
return REG_ERANGE;
/* Got valid collation sequence values, add them as a new entry.
@@ -2761,10 +2756,8 @@ build_range_exp (const reg_syntax_t syntax,
/* Build the table for single byte characters. */
for (wc = 0; wc < SBC_MAX; ++wc)
{
- cmp_buf[2] = wc;
- if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
- bitset_set (sbcset, wc);
+ if (start_wc <= wc && wc <= end_wc)
+ bitset_set (sbcset, wc);
}
}
# else /* not RE_ENABLE_I18N */
diff --git a/lib/regexec.c b/lib/regexec.c
index 92efb44..5a6a0dc 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -3986,18 +3986,10 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
# endif /* _LIBC */
{
/* match with range expression? */
-#if __GNUC__ >= 2 && ! (__STDC_VERSION__ < 199901L && defined __STRICT_ANSI__)
- wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
-#else
- wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
- cmp_buf[2] = wc;
-#endif
for (i = 0; i < cset->nranges; ++i)
{
- cmp_buf[0] = cset->range_starts[i];
- cmp_buf[4] = cset->range_ends[i];
- if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
- && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
+ if (cset->range_starts[i] <= wc
+ && wc <= cset->range_ends[i])
{
match_len = char_len;
goto check_node_accept_bytes_match;
--
1.7.1
- RRI patches for grep,
Aharon Robbins <=