>From 9b2a998512f4e0b47a4e08ee225807f9d4d49a7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?=
Date: Tue, 18 Jan 2011 07:57:59 +0000
Subject: [PATCH] unistr/u*-strstr: optimize the general case
* lib/unistr/u-strstr.h (U8_STRSTR): A new define so we can
do a compile time check for code to use for the UTF-8 case.
* lib/unistr/u8-strstr.c (u8_strstr): Use strstr() for UTF-8 and
needles bigger than 1 byte as it's simpler and probably faster.
* lib/unistr/u16-strstr.c (u16_strstr): Try Knuth-Morris-Pratt
first, and fail back to the simple scan if not enough memory.
* lib/unistr/u32-strstr.c (u32_strstr): Likewise.
* modules/unistr/u8-strstr: Depend on strstr so that we have
both a functional and efficient implementation.
* modules/unistr/u16-strstr: Include str-kmp.h and malloca.
* modules/unistr/u32-strstr: Likewise.
* lib/str-kmp.h: Generalise to use a definable UNIT, and
drop the "unibyte" part of the name.
* lib/mbsstr.c: Adjust to the modified str-kmp.h interface.
* lib/mbscasestr.c: Likewise.
* modules/unistr/u8-strstr-tests: Add a module to check performance.
* modules/unistr/u16-strstr-tests: Likewise.
* modules/unistr/u32-strstr-tests: Likewise.
* tests/unistr/test-u-strstr.h: Test the linear performance
characteristics of uN-strstr().
* tests/unistr/test-u8-strstr.c: Define and call the corresponding
test function.
* tests/unistr/test-u16-strstr.c: Likewise.
* tests/unistr/test-u32-strstr.c: Likewise.
---
ChangeLog | 24 ++++++++++++++++
lib/mbscasestr.c | 6 +++-
lib/mbsstr.c | 8 +++--
lib/str-kmp.h | 27 +++++++++---------
lib/unistr/u-strstr.h | 47 +++++++++++++++++++++++---------
lib/unistr/u16-strstr.c | 1 +
lib/unistr/u32-strstr.c | 1 +
lib/unistr/u8-strstr.c | 3 ++
modules/unistr/u16-strstr | 5 +++-
modules/unistr/u16-strstr-tests | 14 +++++++++
modules/unistr/u32-strstr | 5 +++-
modules/unistr/u32-strstr-tests | 14 +++++++++
modules/unistr/u8-strstr | 1 +
modules/unistr/u8-strstr-tests | 14 +++++++++
tests/unistr/test-u-strstr.h | 56 +++++++++++++++++++++++++++++++++++++++
tests/unistr/test-u16-strstr.c | 35 ++++++++++++++++++++++++
tests/unistr/test-u32-strstr.c | 35 ++++++++++++++++++++++++
tests/unistr/test-u8-strstr.c | 35 ++++++++++++++++++++++++
18 files changed, 296 insertions(+), 35 deletions(-)
create mode 100644 modules/unistr/u16-strstr-tests
create mode 100644 modules/unistr/u32-strstr-tests
create mode 100644 modules/unistr/u8-strstr-tests
create mode 100644 tests/unistr/test-u-strstr.h
create mode 100644 tests/unistr/test-u16-strstr.c
create mode 100644 tests/unistr/test-u32-strstr.c
create mode 100644 tests/unistr/test-u8-strstr.c
diff --git a/ChangeLog b/ChangeLog
index 1afdc08..beae33e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+2011-01-18 Pádraig Brady
+
+ unistr/uN-strstr: optimize the general case
+ * lib/unistr/u8-strstr.c (u8_strstr): Use strstr() as it's faster.
+ * lib/unistr/u16-strstr.c (u16_strstr) : Try Knuth-Morris-Pratt.
+ * lib/unistr/u32-strstr.c (u32_strstr) : Likewise.
+ * modules/unistr/u8-strstr: Depend on strstr to ensure a functional
+ and efficient strstr().
+ * modules/unistrs/u16-strstr: Include str-kmp.h.
+ * modules/unistrs/u32-strstr: Likewise.
+ * lib/str-kmp.h: Support definable UNITs.
+ * lib/mbsstr.c: Adjust for the changed str-kmp.h.
+ * lib/mbscasestr.c: Likewise.
+ * modules/unistr/u8-strstr-tests: Add a module to check performance.
+ * modules/unistr/u16-strstr-tests: Likewise.
+ * modules/unistr/u32-strstr-tests: Likewise.
+ * tests/unistr/test-u-strstr.h: Test the linear performance
+ characteristics of uN-strstr().
+ * tests/unistr/test-u8-strstr.c: Define and call the corresponding
+ test function.
+ * tests/unistr/test-u16-strstr.c: Likewise.
+ * tests/unistr/test-u32-strstr.c: Likewise.
+
+
2011-01-07 Pádraig Brady
ignore-value: fixup comments, and add Eric Blake
diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c
index 379b2fb..98eb077 100644
--- a/lib/mbscasestr.c
+++ b/lib/mbscasestr.c
@@ -30,6 +30,7 @@
#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
/* Knuth-Morris-Pratt algorithm. */
+#define UNIT unsigned char
#define CANON_ELEMENT(c) TOLOWER (c)
#include "str-kmp.h"
@@ -369,9 +370,10 @@ mbscasestr (const char *haystack, const char *needle)
{
/* Try the Knuth-Morris-Pratt algorithm. */
const char *result;
+ size_t needle_len = strlen (needle - 1);
bool success =
- knuth_morris_pratt_unibyte (haystack, needle - 1,
- &result);
+ knuth_morris_pratt (haystack, needle - 1, needle_len,
+ &result);
if (success)
return (char *) result;
try_kmp = false;
diff --git a/lib/mbsstr.c b/lib/mbsstr.c
index 513dde1..994b028 100644
--- a/lib/mbsstr.c
+++ b/lib/mbsstr.c
@@ -28,6 +28,7 @@
/* Knuth-Morris-Pratt algorithm. */
#define CANON_ELEMENT(c) c
+#define UNIT unsigned char
#include "str-kmp.h"
/* Knuth-Morris-Pratt algorithm.
@@ -339,10 +340,11 @@ mbsstr (const char *haystack, const char *needle)
if (needle_last_ccount == NULL)
{
/* Try the Knuth-Morris-Pratt algorithm. */
- const char *result;
+ const UNIT *result;
+ size_t needle_len = strlen (needle - 1);
bool success =
- knuth_morris_pratt_unibyte (haystack, needle - 1,
- &result);
+ knuth_morris_pratt (haystack, needle - 1, needle_len,
+ &result);
if (success)
return (char *) result;
try_kmp = false;
diff --git a/lib/str-kmp.h b/lib/str-kmp.h
index 95a73f5..41c71b7 100644
--- a/lib/str-kmp.h
+++ b/lib/str-kmp.h
@@ -1,4 +1,4 @@
-/* Substring search in a NUL terminated string of 'char' elements,
+/* Substring search in a NUL terminated string of 'UNIT' elements,
using the Knuth-Morris-Pratt algorithm.
Copyright (C) 2005-2011 Free Software Foundation, Inc.
Written by Bruno Haible , 2005.
@@ -18,10 +18,9 @@
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
/* Before including this file, you need to define:
+ UNIT The type to operate on
CANON_ELEMENT(c) A macro that canonicalizes an element right after
- it has been fetched from one of the two strings.
- The argument is an 'unsigned char'; the result
- must be an 'unsigned char' as well. */
+ it has been fetched from needle or haystack. */
/* Knuth-Morris-Pratt algorithm.
See http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm
@@ -29,10 +28,10 @@
Return true and set *RESULTP if the search was completed.
Return false if it was aborted because not enough memory was available. */
static bool
-knuth_morris_pratt_unibyte (const char *haystack, const char *needle,
- const char **resultp)
+knuth_morris_pratt (const UNIT *haystack, const UNIT *needle,
+ size_t needle_len, const UNIT **resultp)
{
- size_t m = strlen (needle);
+ size_t m = needle_len;
/* Allocate the table. */
size_t *table = (size_t *) nmalloca (m, sizeof (size_t));
@@ -66,14 +65,14 @@ knuth_morris_pratt_unibyte (const char *haystack, const char *needle,
The inequality needle[x..i-1] != needle[0..i-1-x] is known to hold
for x < table[i-1], by induction.
Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
- unsigned char b = CANON_ELEMENT ((unsigned char) needle[i - 1]);
+ UNIT b = CANON_ELEMENT (needle[i - 1]);
for (;;)
{
/* Invariants: The inequality needle[x..i-1] != needle[0..i-1-x]
is known to hold for x < i-1-j.
Furthermore, if j>0: needle[i-1-j..i-2] = needle[0..j-1]. */
- if (b == CANON_ELEMENT ((unsigned char) needle[j]))
+ if (b == CANON_ELEMENT (needle[j]))
{
/* Set table[i] := i-1-j. */
table[i] = i - ++j;
@@ -108,17 +107,17 @@ knuth_morris_pratt_unibyte (const char *haystack, const char *needle,
/* Search, using the table to accelerate the processing. */
{
size_t j;
- const char *rhaystack;
- const char *phaystack;
+ const UNIT *rhaystack;
+ const UNIT *phaystack;
*resultp = NULL;
j = 0;
rhaystack = haystack;
phaystack = haystack;
/* Invariant: phaystack = rhaystack + j. */
- while (*phaystack != '\0')
- if (CANON_ELEMENT ((unsigned char) needle[j])
- == CANON_ELEMENT ((unsigned char) *phaystack))
+ while (*phaystack != 0)
+ if (CANON_ELEMENT (needle[j])
+ == CANON_ELEMENT (*phaystack))
{
j++;
phaystack++;
diff --git a/lib/unistr/u-strstr.h b/lib/unistr/u-strstr.h
index dee538d..1473a39 100644
--- a/lib/unistr/u-strstr.h
+++ b/lib/unistr/u-strstr.h
@@ -15,6 +15,12 @@
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see . */
+#ifndef U8_STRSTR
+# define CANON_ELEMENT(c) c
+# include "malloca.h"
+# include "str-kmp.h"
+#endif
+
UNIT *
FUNC (const UNIT *haystack, const UNIT *needle)
{
@@ -38,22 +44,35 @@ FUNC (const UNIT *haystack, const UNIT *needle)
}
#endif
- /* Search for needle's first unit. */
- for (; *haystack != 0; haystack++)
- if (*haystack == first)
- {
- /* Compare with needle's remaining units. */
- const UNIT *hptr = haystack + 1;
- const UNIT *nptr = needle + 1;
- for (;;)
+#ifdef U8_STRSTR
+ return (uint8_t *) strstr ((const char *) haystack, (const char *) needle);
+#else
+ const UNIT *result;
+ size_t needle_len = U_STRLEN (needle);
+# define SIMPLE_SCAN_FASTER_LIMIT 6
+ if (needle_len > SIMPLE_SCAN_FASTER_LIMIT
+ && knuth_morris_pratt (haystack, needle, needle_len, &result))
+ return (UNIT *) result;
+ else
+ {
+ /* Search for needle's first unit. */
+ for (; *haystack != 0; haystack++)
+ if (*haystack == first)
{
- if (*hptr != *nptr)
- break;
- hptr++; nptr++;
- if (*nptr == 0)
- return (UNIT *) haystack;
+ /* Compare with needle's remaining units. */
+ const UNIT *hptr = haystack + 1;
+ const UNIT *nptr = needle + 1;
+ for (;;)
+ {
+ if (*hptr != *nptr)
+ break;
+ hptr++; nptr++;
+ if (*nptr == 0)
+ return (UNIT *) haystack;
+ }
}
- }
+ }
return NULL;
+#endif
}
diff --git a/lib/unistr/u16-strstr.c b/lib/unistr/u16-strstr.c
index 06f17b5..ac3089c 100644
--- a/lib/unistr/u16-strstr.c
+++ b/lib/unistr/u16-strstr.c
@@ -26,4 +26,5 @@
#define UNIT uint16_t
#define U_STRCHR u16_strchr
#define U_STRMBTOUC u16_strmbtouc
+#define U_STRLEN u16_strlen
#include "u-strstr.h"
diff --git a/lib/unistr/u32-strstr.c b/lib/unistr/u32-strstr.c
index 99ccef3..6a6bad8 100644
--- a/lib/unistr/u32-strstr.c
+++ b/lib/unistr/u32-strstr.c
@@ -23,4 +23,5 @@
#define FUNC u32_strstr
#define UNIT uint32_t
#define U_STRCHR u32_strchr
+#define U_STRLEN u32_strlen
#include "u-strstr.h"
diff --git a/lib/unistr/u8-strstr.c b/lib/unistr/u8-strstr.c
index 0638baa..d38c21e 100644
--- a/lib/unistr/u8-strstr.c
+++ b/lib/unistr/u8-strstr.c
@@ -20,10 +20,13 @@
/* Specification. */
#include "unistr.h"
+#include
+
/* FIXME: Maybe walking the string via u8_mblen is a win? */
#define FUNC u8_strstr
#define UNIT uint8_t
#define U_STRCHR u8_strchr
#define U_STRMBTOUC u8_strmbtouc
+#define U8_STRSTR 1
#include "u-strstr.h"
diff --git a/modules/unistr/u16-strstr b/modules/unistr/u16-strstr
index 014ae11..30135d3 100644
--- a/modules/unistr/u16-strstr
+++ b/modules/unistr/u16-strstr
@@ -4,18 +4,21 @@ Substring test for UTF-16 strings.
Files:
lib/unistr/u16-strstr.c
lib/unistr/u-strstr.h
+lib/str-kmp.h
Depends-on:
unistr/base
unistr/u16-strchr
unistr/u16-strmbtouc
+unistr/u16-strlen
+malloca
configure.ac:
gl_LIBUNISTRING_MODULE([0.9], [unistr/u16-strstr])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U16_STRSTR
-lib_SOURCES += unistr/u16-strstr.c
+lib_SOURCES += unistr/u16-strstr.c str-kmp.h
endif
Include:
diff --git a/modules/unistr/u16-strstr-tests b/modules/unistr/u16-strstr-tests
new file mode 100644
index 0000000..e180308
--- /dev/null
+++ b/modules/unistr/u16-strstr-tests
@@ -0,0 +1,14 @@
+Files:
+tests/unistr/test-u16-strstr.c
+tests/unistr/test-u-strstr.h
+tests/macros.h
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u16-strstr
+check_PROGRAMS += test-u16-strstr
+test_u16_strstr_SOURCES = unistr/test-u16-strstr.c
+test_u16_strstr_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u32-strstr b/modules/unistr/u32-strstr
index 45533c2..4e47320 100644
--- a/modules/unistr/u32-strstr
+++ b/modules/unistr/u32-strstr
@@ -4,17 +4,20 @@ Substring test for UTF-32 strings.
Files:
lib/unistr/u32-strstr.c
lib/unistr/u-strstr.h
+lib/str-kmp.h
Depends-on:
unistr/base
unistr/u32-strchr
+unistr/u32-strlen
+malloca
configure.ac:
gl_LIBUNISTRING_MODULE([0.9], [unistr/u32-strstr])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U32_STRSTR
-lib_SOURCES += unistr/u32-strstr.c
+lib_SOURCES += unistr/u32-strstr.c str-kmp.h
endif
Include:
diff --git a/modules/unistr/u32-strstr-tests b/modules/unistr/u32-strstr-tests
new file mode 100644
index 0000000..2e7c26e
--- /dev/null
+++ b/modules/unistr/u32-strstr-tests
@@ -0,0 +1,14 @@
+Files:
+tests/unistr/test-u32-strstr.c
+tests/unistr/test-u-strstr.h
+tests/macros.h
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u32-strstr
+check_PROGRAMS += test-u32-strstr
+test_u32_strstr_SOURCES = unistr/test-u32-strstr.c
+test_u32_strstr_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u8-strstr b/modules/unistr/u8-strstr
index 30d99a4..4f3a97b 100644
--- a/modules/unistr/u8-strstr
+++ b/modules/unistr/u8-strstr
@@ -9,6 +9,7 @@ Depends-on:
unistr/base
unistr/u8-strchr
unistr/u8-strmbtouc
+strstr
configure.ac:
gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-strstr])
diff --git a/modules/unistr/u8-strstr-tests b/modules/unistr/u8-strstr-tests
new file mode 100644
index 0000000..51020db
--- /dev/null
+++ b/modules/unistr/u8-strstr-tests
@@ -0,0 +1,14 @@
+Files:
+tests/unistr/test-u8-strstr.c
+tests/unistr/test-u-strstr.h
+tests/macros.h
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u8-strstr
+check_PROGRAMS += test-u8-strstr
+test_u8_strstr_SOURCES = unistr/test-u8-strstr.c
+test_u8_strstr_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/tests/unistr/test-u-strstr.h b/tests/unistr/test-u-strstr.h
new file mode 100644
index 0000000..aca4a2a
--- /dev/null
+++ b/tests/unistr/test-u-strstr.h
@@ -0,0 +1,56 @@
+/* Test of uN_strstr() functions.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see . */
+
+/* Written by Pádraig Brady , 2011. */
+
+#include
+#include
+
+/* Test linear performance characteristics. */
+
+static void
+test_u_strstr (void)
+{
+ size_t m = 1000000;
+ UNIT *haystack = malloc ((2 * m + 1) * (sizeof (UNIT)));
+ UNIT *needle = malloc ((m + 1) * (sizeof (UNIT)));
+ if (haystack != NULL && needle != NULL)
+ {
+ size_t i;
+ const UNIT *result;
+
+ for (i=0; i<2*m; i++)
+ haystack[i]='A';
+ haystack[2*m-1]='B';
+ haystack[2*m]=0;
+
+ for (i=0; i. */
+
+/* Written by Pádraig Brady , 2011. */
+
+#include
+
+#include "unistr.h"
+
+#include "macros.h"
+
+#define U_STRSTR u16_strstr
+#define UNIT uint16_t
+#include "test-u-strstr.h"
+
+int
+main (void)
+{
+ test_u_strstr ();
+
+ return 0;
+}
diff --git a/tests/unistr/test-u32-strstr.c b/tests/unistr/test-u32-strstr.c
new file mode 100644
index 0000000..6d85339
--- /dev/null
+++ b/tests/unistr/test-u32-strstr.c
@@ -0,0 +1,35 @@
+/* Test of u16_strstr() function.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see . */
+
+/* Written by Pádraig Brady , 2011. */
+
+#include
+
+#include "unistr.h"
+
+#include "macros.h"
+
+#define U_STRSTR u32_strstr
+#define UNIT uint32_t
+#include "test-u-strstr.h"
+
+int
+main (void)
+{
+ test_u_strstr ();
+
+ return 0;
+}
diff --git a/tests/unistr/test-u8-strstr.c b/tests/unistr/test-u8-strstr.c
new file mode 100644
index 0000000..a3cba36
--- /dev/null
+++ b/tests/unistr/test-u8-strstr.c
@@ -0,0 +1,35 @@
+/* Test of u8_strstr() function.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see . */
+
+/* Written by Pádraig Brady , 2011. */
+
+#include
+
+#include "unistr.h"
+
+#include "macros.h"
+
+#define U_STRSTR u8_strstr
+#define UNIT uint8_t
+#include "test-u-strstr.h"
+
+int
+main (void)
+{
+ test_u_strstr ();
+
+ return 0;
+}
--
1.7.3.4