bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [bug #33198] Incorrect bracket expression when parsing in ru_RU.KOI8-R (Russian locale)


From: Jim Meyering
Subject: Re: [bug #33198] Incorrect bracket expression when parsing in ru_RU.KOI8-R (Russian locale)
Date: Thu, 02 Jun 2011 23:32:09 +0200

Santiago Ruano Rincón wrote:
> Follow-up Comment #3, bug #33198 (project grep):
> It seems the problem is still unsolved. I've tried both, 2.8 and patching 2.7,
> but I got the same results. Igor Ladygin confirms this.
>
> address@hidden:~$ echo Пример| LC_ALL=ru_RU.KOI8-R grep -qE "[Пп]";
> echo $?
> 1

Here's a slightly better patch.
The dfa.c diff is the same, but I've corrected the test name
and added/corrected log comments.


From cbd5055c976ebc93b657dcdf3783cc91de4f68ed Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Thu, 2 Jun 2011 18:03:49 +0200
Subject: [PATCH 1/2] fix the [...] bug also for relatively unusual uni-byte
 encodings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* src/dfa.c (setbit_case_fold): Also handle uni-byte locales
like the one mentioned in the original report: see 2011-05-07
commit d98338eb.  Re-reported by Santiago Ruano Rincón.
Note that most uni-byte locales are not affected.
* NEWS (Bug fixes): Mention it.
---
 NEWS      |    4 ++++
 src/dfa.c |    7 +++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/NEWS b/NEWS
index 312c803..67b3fad 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,10 @@ GNU grep NEWS                                    -*- outline -*-

 ** Bug fixes

+  echo c|grep '[c]' would fail for any c in 0x80..0xff, with a uni-byte
+  encoding for which the byte-to-wide-char mapping is nontrivial.  For
+  example, the ISO-8859-1 locales are not affected, but ru_RU.KOI8-R is.
+
   grep -P no longer aborts when PCRE's backtracking limit is exceeded
   Before, echo aaaaaaaaaaaaaab |grep -P '((a+)*)+$' would abort.  Now,
   it diagnoses the problem and exits with status 2.
diff --git a/src/dfa.c b/src/dfa.c
index b41cbb6..0ce6242 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -573,8 +573,11 @@ setbit_case_fold (
   else
     {
 #if MBS_SUPPORT
-      int b2 = wctob ((unsigned char) b);
-      if (b2 == EOF || b2 == b)
+      /* Below, note how when b2 != b and we have a uni-byte locale
+         (MB_CUR_MAX == 1), we set b = b2.  I.e., in a uni-byte locale,
+         we can safely call setbit with a non-EOF value returned by wctob.  */
+      int b2 = wctob (b);
+      if (b2 == EOF || b2 == b || (MB_CUR_MAX == 1 ? (b=b2), 1 : 0))
 #endif
         setbit (b, c);
     }
--
1.7.6.rc0.254.gf37de


From 713515f036767f4d0c1a162d5263f119bb1d92b4 Mon Sep 17 00:00:00 2001
From: Jim Meyering <address@hidden>
Date: Thu, 2 Jun 2011 11:01:35 +0200
Subject: [PATCH 2/2] tests: exercise a uni-byte [...] bug: requires
 ru_RU.KOI8-R

* tests/unibyte-bracket-expr: New file.
* tests/Makefile.am (TESTS): Add it.
* tests/init.cfg (require_ru_RU_koi8_r): New function.
---
 tests/Makefile.am          |    1 +
 tests/init.cfg             |    9 +++++++++
 tests/unibyte-bracket-expr |   41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 0 deletions(-)
 create mode 100644 tests/unibyte-bracket-expr

diff --git a/tests/Makefile.am b/tests/Makefile.am
index a01b004..f354e4a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -63,6 +63,7 @@ TESTS =                                               \
   inconsistent-range                            \
   khadafy                                      \
   max-count-vs-context                         \
+  unibyte-bracket-expr                         \
   high-bit-range                               \
   options                                      \
   pcre                                         \
diff --git a/tests/init.cfg b/tests/init.cfg
index 3429f0d..f6ead9c 100644
--- a/tests/init.cfg
+++ b/tests/init.cfg
@@ -69,3 +69,12 @@ require_en_utf8_locale_()
     *) skip_test_ 'en_US.UTF-8 locale not found' ;;
   esac
 }
+
+require_ru_RU_koi8_r()  # Skip the calling test unless ru_RU.KOI8-R looks usable.
+{
+  path_prepend_ .  # presumably puts the current dir on PATH so get-mb-cur-max is found -- TODO confirm
+  case $(get-mb-cur-max ru_RU.KOI8-R) in  # helper assumed to print MB_CUR_MAX for that locale -- verify
+    1) ;;  # output "1": continue with the test
+    *) skip_test_ 'ru_RU.KOI8-R locale not found' ;;  # any other output: skip
+  esac
+}
diff --git a/tests/unibyte-bracket-expr b/tests/unibyte-bracket-expr
new file mode 100644
index 0000000..a0b51dd
--- /dev/null
+++ b/tests/unibyte-bracket-expr
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Exercise a DFA range bug that arises only with a unibyte encoding
+# for which the wide-char-to-single-byte mapping is nontrivial.
+# E.g., the regexp, [C] would fail to match C in a unibyte locale like
+# ru_RU.KOI8-R for any C whose wide-char representation differed from
+# its single-byte equivalent.
+
+# Copyright (C) 2011 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+require_ru_RU_koi8_r
+LC_ALL=ru_RU.KOI8-R
+export LC_ALL
+
+fail=0
+
+for i in 8 9 a b c d e f; do
+  for j in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
+    in=in-$i$j
+    octal=$(printf '%o' "0x$i$j")  # "\xHH" is not a POSIX printf escape; use octal
+    b=$(printf "\\$octal")         # the single byte 0x$i$j
+    echo "$b" > "$in" || framework_failure_
+    grep "[$b]" "$in" > out || fail=1   # [b] must match the line holding b
+    compare out "$in" || fail=1         # ...and output exactly that line
+  done
+done
+
+Exit $fail
--
1.7.6.rc0.254.gf37de



reply via email to

[Prev in Thread] Current Thread [Next in Thread]