From 0dc82d895855fb3f9c95b14315b9813a47fb743e Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Fri, 7 Mar 2014 19:40:28 -0800 Subject: [PATCH] fgrep: fix case-fold incompatibility with plain 'grep' fgrep converted to lowercase, whereas the regex code converted to uppercase. The resulting behaviors don't agree in offbeat cases like Greek sigmas and Turkish Is. Fix this by changing fgrep to agree with the regex code. * src/kwsearch.c (Fcompile, Fexecute): * src/searchutils.c (kwsinit, mbtoupper): Convert to uppercase, not to lowercase, for compatibility with plain 'grep'. * src/search.h, src/searchutils.c (mbtoupper): Rename from mbtolower, since it now converts to uppercase. All uses changed. * tests/case-fold-titlecase: Add tests for this. --- src/kwsearch.c | 6 +++--- src/search.h | 2 +- src/searchutils.c | 33 ++++++++++++++++----------------- tests/case-fold-titlecase | 4 ++++ 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/kwsearch.c b/src/kwsearch.c index 06f0b79..dd01518 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -36,7 +36,7 @@ Fcompile (char const *pattern, size_t size) size_t psize = size; mb_len_map_t *map = NULL; char const *pat = (match_icase && MB_CUR_MAX > 1 - ? mbtolower (pattern, &psize, &map) + ? mbtoupper (pattern, &psize, &map) : pattern); kwsinit (&kwset); @@ -75,7 +75,7 @@ Fcompile (char const *pattern, size_t size) error (EXIT_TROUBLE, 0, "%s", err); } -/* Apply the MAP (created by mbtolower) to the lowercase-buffer-relative +/* Apply the MAP (created by mbtoupper) to the uppercase-buffer-relative *OFF and *LEN, converting them to be relative to the original buffer. */ static void @@ -110,7 +110,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size, { if (match_icase) { - char *case_buf = mbtolower (buf, &size, &map); + char *case_buf = mbtoupper (buf, &size, &map); if (start_ptr) start_ptr = case_buf + (start_ptr - buf); buf = case_buf; diff --git a/src/search.h b/src/search.h index 91f0271..69e3afd 100644 --- a/src/search.h +++ b/src/search.h @@ -45,7 +45,7 @@ typedef signed char mb_len_map_t; /* searchutils.c */ extern void kwsinit (kwset_t *); -extern char *mbtolower (const char *, size_t *, mb_len_map_t **); +extern char *mbtoupper (const char *, size_t *, mb_len_map_t **); extern void build_mbclen_cache (void); extern bool is_mb_middle (const char **, const char *, const char *, size_t); diff --git a/src/searchutils.c b/src/searchutils.c index 7363701..babb31f 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -37,7 +37,7 @@ kwsinit (kwset_t *kwset) if (match_icase && MB_CUR_MAX == 1) { for (i = 0; i < NCHAR; ++i) - trans[i] = tolower (i); + trans[i] = toupper (i); *kwset = kwsalloc (trans); } @@ -49,38 +49,37 @@ kwsinit (kwset_t *kwset) } #if MBS_SUPPORT -/* Convert the *N-byte string, BEG, to lower-case, and write the +/* Convert BEG, an *N-byte string, to uppercase, and write the NUL-terminated result into malloc'd storage. Upon success, set *N to the length (in bytes) of the resulting string (not including the - trailing NUL byte), and return a pointer to the lower-case string. - Upon memory allocation failure, this function exits. - Note that on input, *N must be larger than zero. + trailing NUL byte), and return a pointer to the uppercase string. + Upon memory allocation failure, exit. *N must be positive. - Note that while this function returns a pointer to malloc'd storage, + Although this function returns a pointer to malloc'd storage, the caller must not free it, since this function retains a pointer to the buffer and reuses it on any subsequent call. As a consequence, this function is not thread-safe. - When each character in the lower-case result string has the same length + When each character in the uppercase result string has the same length as the corresponding character in the input string, set *LEN_MAP_P to NULL. Otherwise, set it to a malloc'd buffer (like the returned buffer, this must not be freed by caller) of the same length as the result string. (*LEN_MAP_P)[J] is the change in byte-length of the character in BEG that formed byte J of the result as it was converted to - lower-case. It is usually zero. For the upper-case Turkish I-with-dot - it is -1, since the upper-case character occupies two bytes, while the - lower-case one occupies only one byte. For the Turkish-I-without-dot - in the tr_TR.utf8 locale, it is 1 because the lower-case representation + uppercase. It is usually zero. For lowercase Turkish dotless I it + is -1, since the lowercase input occupies two bytes, while the + uppercase output occupies only one byte. For lowercase I in the + tr_TR.utf8 locale, it is 1 because the uppercase Turkish dotted I is one byte longer than the original. When that happens, we have two or more slots in *LEN_MAP_P for each such character. We store the difference in the first one and 0's in any remaining slots. This map is used by the caller to convert offset,length pairs that - reference the lower-case result to numbers that refer to the matched + reference the uppercase result to numbers that refer to the matched part of the original buffer. */ char * -mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) +mbtoupper (const char *beg, size_t *n, mb_len_map_t **len_map_p) { static char *out; static mb_len_map_t *len_map; @@ -94,7 +93,7 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) if (*n > outalloc || outalloc == 0) { - outalloc = MAX(1, *n); + outalloc = MAX (1, *n); out = xrealloc (out, outalloc); len_map = xrealloc (len_map, outalloc); } @@ -175,8 +174,8 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) /* Handle Unicode characters beyond the base plane. */ if (mbclen == 4) { - /* towlower, taking wint_t (4 bytes), handles UCS-4 values. */ - wci = towlower (wci); + /* towupper, taking wint_t (4 bytes), handles UCS-4 values. */ + wci = towupper (wci); if (wci >= 0x10000) { wci -= 0x10000; @@ -197,7 +196,7 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) } else #endif - ombclen = wcrtomb (p, towlower ((wint_t) wc), &os); + ombclen = wcrtomb (p, towupper (wc), &os); *m = mbclen - ombclen; memset (m + 1, 0, ombclen - 1); m += ombclen; diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase index f16022b..ba320c7 100755 --- a/tests/case-fold-titlecase +++ b/tests/case-fold-titlecase @@ -162,6 +162,8 @@ do grep -i "\\(\\)\\1$pat" in >out-regex || fail=1 grep -i "$pat" in >out-dfa || fail=1 compare_ out-regex out-dfa || fail=1 + grep -iF "$pat" in >out-fixed || fail=1 + compare_ out-regex out-fixed || fail=1 done done @@ -180,6 +182,8 @@ if test "$(get-mb-cur-max el_GR.iso88597)" -eq 1; then grep -i "\\(\\)\\1$pat" in >out-regex || fail=1 grep -i "$pat" in >out-dfa || fail=1 compare_ out-regex out-dfa || fail=1 + grep -iF "$pat" in >out-fixed || fail=1 + compare_ out-regex out-fixed || fail=1 done fi -- 1.8.5.3