>From 9b2a998512f4e0b47a4e08ee225807f9d4d49a7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Tue, 18 Jan 2011 07:57:59 +0000 Subject: [PATCH] unistr/u*-strstr: optimize the general case * lib/unistr/u-strstr.h (U8_STRSTR): A new define so we can do a compile time check for code to use for the UTF-8 case. * lib/unistr/u8-strstr.c (u8_strstr): Use strstr() for UTF-8 and needles bigger than 1 byte as it's simpler and probably faster. * lib/unistr/u16-strstr.c (u16_strstr): Try Knuth-Morris-Pratt first, and fail back to the simple scan if not enough memory. * lib/unistr/u32-strstr.c (u32_strstr): Likewise. * modules/unistr/u8-strstr: Depend on strstr so that we have both a functional and efficient implementation. * modules/unistr/u16-strstr: Include str-kmp.h and malloca. * modules/unistr/u32-strstr: Likewise. * lib/str-kmp.h: Generalise to use a definable UNIT, and drop the "unibyte" part of the name. * lib/mbsstr.c: Adjust to the modified str-kmp.h interface. * lib/mbscasestr.c: Likewise. * modules/unistr/u8-strstr-tests: Add a module to check performance. * modules/unistr/u16-strstr-tests: Likewise. * modules/unistr/u32-strstr-tests: Likewise. * tests/unistr/test-u-strstr.h: Test the linear performance characteristics of uN-strstr(). * tests/unistr/test-u8-strstr.c: Define and call the corresponding test function. * tests/unistr/test-u16-strstr.c: Likewise. * tests/unistr/test-u32-strstr.c: Likewise. --- ChangeLog | 24 ++++++++++++++++ lib/mbscasestr.c | 6 +++- lib/mbsstr.c | 8 +++-- lib/str-kmp.h | 27 +++++++++--------- lib/unistr/u-strstr.h | 47 +++++++++++++++++++++++--------- lib/unistr/u16-strstr.c | 1 + lib/unistr/u32-strstr.c | 1 + lib/unistr/u8-strstr.c | 3 ++ modules/unistr/u16-strstr | 5 +++- modules/unistr/u16-strstr-tests | 14 +++++++++ modules/unistr/u32-strstr | 5 +++- modules/unistr/u32-strstr-tests | 14 +++++++++ modules/unistr/u8-strstr | 1 + modules/unistr/u8-strstr-tests | 14 +++++++++ tests/unistr/test-u-strstr.h | 56 +++++++++++++++++++++++++++++++++++++++ tests/unistr/test-u16-strstr.c | 35 ++++++++++++++++++++++++ tests/unistr/test-u32-strstr.c | 35 ++++++++++++++++++++++++ tests/unistr/test-u8-strstr.c | 35 ++++++++++++++++++++++++ 18 files changed, 296 insertions(+), 35 deletions(-) create mode 100644 modules/unistr/u16-strstr-tests create mode 100644 modules/unistr/u32-strstr-tests create mode 100644 modules/unistr/u8-strstr-tests create mode 100644 tests/unistr/test-u-strstr.h create mode 100644 tests/unistr/test-u16-strstr.c create mode 100644 tests/unistr/test-u32-strstr.c create mode 100644 tests/unistr/test-u8-strstr.c diff --git a/ChangeLog b/ChangeLog index 1afdc08..beae33e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +2011-01-18 Pádraig Brady + + unistr/uN-strstr: optimize the general case + * lib/unistr/u8-strstr.c (u8_strstr): Use strstr() as it's faster. + * lib/unistr/u16-strstr.c (u16_strstr) : Try Knuth-Morris-Pratt. + * lib/unistr/u32-strstr.c (u32_strstr) : Likewise. + * modules/unistr/u8-strstr: Depend on strstr to ensure a functional + and efficient strstr(). + * modules/unistrs/u16-strstr: Include str-kmp.h. + * modules/unistrs/u32-strstr: Likewise. + * lib/str-kmp.h: Support definable UNITs. + * lib/mbsstr.c: Adjust for the changed str-kmp.h. + * lib/mbscasestr.c: Likewise. + * modules/unistr/u8-strstr-tests: Add a module to check performance. + * modules/unistr/u16-strstr-tests: Likewise. + * modules/unistr/u32-strstr-tests: Likewise. + * tests/unistr/test-u-strstr.h: Test the linear performance + characteristics of uN-strstr(). + * tests/unistr/test-u8-strstr.c: Define and call the corresponding + test function. + * tests/unistr/test-u16-strstr.c: Likewise. + * tests/unistr/test-u32-strstr.c: Likewise. + + 2011-01-07 Pádraig Brady ignore-value: fixup comments, and add Eric Blake diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c index 379b2fb..98eb077 100644 --- a/lib/mbscasestr.c +++ b/lib/mbscasestr.c @@ -30,6 +30,7 @@ #define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch)) /* Knuth-Morris-Pratt algorithm. */ +#define UNIT unsigned char #define CANON_ELEMENT(c) TOLOWER (c) #include "str-kmp.h" @@ -369,9 +370,10 @@ mbscasestr (const char *haystack, const char *needle) { /* Try the Knuth-Morris-Pratt algorithm. */ const char *result; + size_t needle_len = strlen (needle - 1); bool success = - knuth_morris_pratt_unibyte (haystack, needle - 1, - &result); + knuth_morris_pratt (haystack, needle - 1, needle_len, + &result); if (success) return (char *) result; try_kmp = false; diff --git a/lib/mbsstr.c b/lib/mbsstr.c index 513dde1..994b028 100644 --- a/lib/mbsstr.c +++ b/lib/mbsstr.c @@ -28,6 +28,7 @@ /* Knuth-Morris-Pratt algorithm. */ #define CANON_ELEMENT(c) c +#define UNIT unsigned char #include "str-kmp.h" /* Knuth-Morris-Pratt algorithm. @@ -339,10 +340,11 @@ mbsstr (const char *haystack, const char *needle) if (needle_last_ccount == NULL) { /* Try the Knuth-Morris-Pratt algorithm. */ - const char *result; + const UNIT *result; + size_t needle_len = strlen (needle - 1); bool success = - knuth_morris_pratt_unibyte (haystack, needle - 1, - &result); + knuth_morris_pratt (haystack, needle - 1, needle_len, + &result); if (success) return (char *) result; try_kmp = false; diff --git a/lib/str-kmp.h b/lib/str-kmp.h index 95a73f5..41c71b7 100644 --- a/lib/str-kmp.h +++ b/lib/str-kmp.h @@ -1,4 +1,4 @@ -/* Substring search in a NUL terminated string of 'char' elements, +/* Substring search in a NUL terminated string of 'UNIT' elements, using the Knuth-Morris-Pratt algorithm. Copyright (C) 2005-2011 Free Software Foundation, Inc. Written by Bruno Haible , 2005. @@ -18,10 +18,9 @@ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ /* Before including this file, you need to define: + UNIT The type to operate on CANON_ELEMENT(c) A macro that canonicalizes an element right after - it has been fetched from one of the two strings. - The argument is an 'unsigned char'; the result - must be an 'unsigned char' as well. */ + it has been fetched from needle or haystack. */ /* Knuth-Morris-Pratt algorithm. See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm @@ -29,10 +28,10 @@ Return true and set *RESULTP if the search was completed. Return false if it was aborted because not enough memory was available. */ static bool -knuth_morris_pratt_unibyte (const char *haystack, const char *needle, - const char **resultp) +knuth_morris_pratt (const UNIT *haystack, const UNIT *needle, + size_t needle_len, const UNIT **resultp) { - size_t m = strlen (needle); + size_t m = needle_len; /* Allocate the table. */ size_t *table = (size_t *) nmalloca (m, sizeof (size_t)); @@ -66,14 +65,14 @@ knuth_morris_pratt_unibyte (const char *haystack, const char *needle, The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold for x < table[i-1], by induction. Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - unsigned char b = CANON_ELEMENT ((unsigned char) needle[i - 1]); + UNIT b = CANON_ELEMENT (needle[i - 1]); for (;;) { /* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold for x < i-1-j. Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */ - if (b == CANON_ELEMENT ((unsigned char) needle[j])) + if (b == CANON_ELEMENT (needle[j])) { /* Set table[i] := i-1-j. */ table[i] = i - ++j; @@ -108,17 +107,17 @@ knuth_morris_pratt_unibyte (const char *haystack, const char *needle, /* Search, using the table to accelerate the processing. */ { size_t j; - const char *rhaystack; - const char *phaystack; + const UNIT *rhaystack; + const UNIT *phaystack; *resultp = NULL; j = 0; rhaystack = haystack; phaystack = haystack; /* Invariant: phaystack = rhaystack + j. */ - while (*phaystack != '\0') - if (CANON_ELEMENT ((unsigned char) needle[j]) - == CANON_ELEMENT ((unsigned char) *phaystack)) + while (*phaystack != 0) + if (CANON_ELEMENT (needle[j]) + == CANON_ELEMENT (*phaystack)) { j++; phaystack++; diff --git a/lib/unistr/u-strstr.h b/lib/unistr/u-strstr.h index dee538d..1473a39 100644 --- a/lib/unistr/u-strstr.h +++ b/lib/unistr/u-strstr.h @@ -15,6 +15,12 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ +#ifndef U8_STRSTR +# define CANON_ELEMENT(c) c +# include "malloca.h" +# include "str-kmp.h" +#endif + UNIT * FUNC (const UNIT *haystack, const UNIT *needle) { @@ -38,22 +44,35 @@ FUNC (const UNIT *haystack, const UNIT *needle) } #endif - /* Search for needle's first unit. */ - for (; *haystack != 0; haystack++) - if (*haystack == first) - { - /* Compare with needle's remaining units. */ - const UNIT *hptr = haystack + 1; - const UNIT *nptr = needle + 1; - for (;;) +#ifdef U8_STRSTR + return (uint8_t *) strstr ((const char *) haystack, (const char *) needle); +#else + const UNIT *result; + size_t needle_len = U_STRLEN (needle); +# define SIMPLE_SCAN_FASTER_LIMIT 6 + if (needle_len > SIMPLE_SCAN_FASTER_LIMIT + && knuth_morris_pratt (haystack, needle, needle_len, &result)) + return (UNIT *) result; + else + { + /* Search for needle's first unit. */ + for (; *haystack != 0; haystack++) + if (*haystack == first) { - if (*hptr != *nptr) - break; - hptr++; nptr++; - if (*nptr == 0) - return (UNIT *) haystack; + /* Compare with needle's remaining units. */ + const UNIT *hptr = haystack + 1; + const UNIT *nptr = needle + 1; + for (;;) + { + if (*hptr != *nptr) + break; + hptr++; nptr++; + if (*nptr == 0) + return (UNIT *) haystack; + } } - } + } return NULL; +#endif } diff --git a/lib/unistr/u16-strstr.c b/lib/unistr/u16-strstr.c index 06f17b5..ac3089c 100644 --- a/lib/unistr/u16-strstr.c +++ b/lib/unistr/u16-strstr.c @@ -26,4 +26,5 @@ #define UNIT uint16_t #define U_STRCHR u16_strchr #define U_STRMBTOUC u16_strmbtouc +#define U_STRLEN u16_strlen #include "u-strstr.h" diff --git a/lib/unistr/u32-strstr.c b/lib/unistr/u32-strstr.c index 99ccef3..6a6bad8 100644 --- a/lib/unistr/u32-strstr.c +++ b/lib/unistr/u32-strstr.c @@ -23,4 +23,5 @@ #define FUNC u32_strstr #define UNIT uint32_t #define U_STRCHR u32_strchr +#define U_STRLEN u32_strlen #include "u-strstr.h" diff --git a/lib/unistr/u8-strstr.c b/lib/unistr/u8-strstr.c index 0638baa..d38c21e 100644 --- a/lib/unistr/u8-strstr.c +++ b/lib/unistr/u8-strstr.c @@ -20,10 +20,13 @@ /* Specification. */ #include "unistr.h" +#include + /* FIXME: Maybe walking the string via u8_mblen is a win? */ #define FUNC u8_strstr #define UNIT uint8_t #define U_STRCHR u8_strchr #define U_STRMBTOUC u8_strmbtouc +#define U8_STRSTR 1 #include "u-strstr.h" diff --git a/modules/unistr/u16-strstr b/modules/unistr/u16-strstr index 014ae11..30135d3 100644 --- a/modules/unistr/u16-strstr +++ b/modules/unistr/u16-strstr @@ -4,18 +4,21 @@ Substring test for UTF-16 strings. Files: lib/unistr/u16-strstr.c lib/unistr/u-strstr.h +lib/str-kmp.h Depends-on: unistr/base unistr/u16-strchr unistr/u16-strmbtouc +unistr/u16-strlen +malloca configure.ac: gl_LIBUNISTRING_MODULE([0.9], [unistr/u16-strstr]) Makefile.am: if LIBUNISTRING_COMPILE_UNISTR_U16_STRSTR -lib_SOURCES += unistr/u16-strstr.c +lib_SOURCES += unistr/u16-strstr.c str-kmp.h endif Include: diff --git a/modules/unistr/u16-strstr-tests b/modules/unistr/u16-strstr-tests new file mode 100644 index 0000000..e180308 --- /dev/null +++ b/modules/unistr/u16-strstr-tests @@ -0,0 +1,14 @@ +Files: +tests/unistr/test-u16-strstr.c +tests/unistr/test-u-strstr.h +tests/macros.h + +Depends-on: + +configure.ac: + +Makefile.am: +TESTS += test-u16-strstr +check_PROGRAMS += test-u16-strstr +test_u16_strstr_SOURCES = unistr/test-u16-strstr.c +test_u16_strstr_LDADD = $(LDADD) $(LIBUNISTRING) diff --git a/modules/unistr/u32-strstr b/modules/unistr/u32-strstr index 45533c2..4e47320 100644 --- a/modules/unistr/u32-strstr +++ b/modules/unistr/u32-strstr @@ -4,17 +4,20 @@ Substring test for UTF-32 strings. Files: lib/unistr/u32-strstr.c lib/unistr/u-strstr.h +lib/str-kmp.h Depends-on: unistr/base unistr/u32-strchr +unistr/u32-strlen +malloca configure.ac: gl_LIBUNISTRING_MODULE([0.9], [unistr/u32-strstr]) Makefile.am: if LIBUNISTRING_COMPILE_UNISTR_U32_STRSTR -lib_SOURCES += unistr/u32-strstr.c +lib_SOURCES += unistr/u32-strstr.c str-kmp.h endif Include: diff --git a/modules/unistr/u32-strstr-tests b/modules/unistr/u32-strstr-tests new file mode 100644 index 0000000..2e7c26e --- /dev/null +++ b/modules/unistr/u32-strstr-tests @@ -0,0 +1,14 @@ +Files: +tests/unistr/test-u32-strstr.c +tests/unistr/test-u-strstr.h +tests/macros.h + +Depends-on: + +configure.ac: + +Makefile.am: +TESTS += test-u32-strstr +check_PROGRAMS += test-u32-strstr +test_u32_strstr_SOURCES = unistr/test-u32-strstr.c +test_u32_strstr_LDADD = $(LDADD) $(LIBUNISTRING) diff --git a/modules/unistr/u8-strstr b/modules/unistr/u8-strstr index 30d99a4..4f3a97b 100644 --- a/modules/unistr/u8-strstr +++ b/modules/unistr/u8-strstr @@ -9,6 +9,7 @@ Depends-on: unistr/base unistr/u8-strchr unistr/u8-strmbtouc +strstr configure.ac: gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-strstr]) diff --git a/modules/unistr/u8-strstr-tests b/modules/unistr/u8-strstr-tests new file mode 100644 index 0000000..51020db --- /dev/null +++ b/modules/unistr/u8-strstr-tests @@ -0,0 +1,14 @@ +Files: +tests/unistr/test-u8-strstr.c +tests/unistr/test-u-strstr.h +tests/macros.h + +Depends-on: + +configure.ac: + +Makefile.am: +TESTS += test-u8-strstr +check_PROGRAMS += test-u8-strstr +test_u8_strstr_SOURCES = unistr/test-u8-strstr.c +test_u8_strstr_LDADD = $(LDADD) $(LIBUNISTRING) diff --git a/tests/unistr/test-u-strstr.h b/tests/unistr/test-u-strstr.h new file mode 100644 index 0000000..aca4a2a --- /dev/null +++ b/tests/unistr/test-u-strstr.h @@ -0,0 +1,56 @@ +/* Test of uN_strstr() functions. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Written by Pádraig Brady , 2011. */ + +#include +#include + +/* Test linear performance characteristics. */ + +static void +test_u_strstr (void) +{ + size_t m = 1000000; + UNIT *haystack = malloc ((2 * m + 1) * (sizeof (UNIT))); + UNIT *needle = malloc ((m + 1) * (sizeof (UNIT))); + if (haystack != NULL && needle != NULL) + { + size_t i; + const UNIT *result; + + for (i=0; i<2*m; i++) + haystack[i]='A'; + haystack[2*m-1]='B'; + haystack[2*m]=0; + + for (i=0; i. */ + +/* Written by Pádraig Brady , 2011. */ + +#include + +#include "unistr.h" + +#include "macros.h" + +#define U_STRSTR u16_strstr +#define UNIT uint16_t +#include "test-u-strstr.h" + +int +main (void) +{ + test_u_strstr (); + + return 0; +} diff --git a/tests/unistr/test-u32-strstr.c b/tests/unistr/test-u32-strstr.c new file mode 100644 index 0000000..6d85339 --- /dev/null +++ b/tests/unistr/test-u32-strstr.c @@ -0,0 +1,35 @@ +/* Test of u16_strstr() function. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Written by Pádraig Brady , 2011. */ + +#include + +#include "unistr.h" + +#include "macros.h" + +#define U_STRSTR u32_strstr +#define UNIT uint32_t +#include "test-u-strstr.h" + +int +main (void) +{ + test_u_strstr (); + + return 0; +} diff --git a/tests/unistr/test-u8-strstr.c b/tests/unistr/test-u8-strstr.c new file mode 100644 index 0000000..a3cba36 --- /dev/null +++ b/tests/unistr/test-u8-strstr.c @@ -0,0 +1,35 @@ +/* Test of u8_strstr() function. + Copyright (C) 2011 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Written by Pádraig Brady , 2011. */ + +#include + +#include "unistr.h" + +#include "macros.h" + +#define U_STRSTR u8_strstr +#define UNIT uint8_t +#include "test-u-strstr.h" + +int +main (void) +{ + test_u_strstr (); + + return 0; +} -- 1.7.3.4