[PATCH 02/15] Sort the keywords before building the trie

bug-grep
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 02/15] Sort the keywords before building the trie

From:	Nick Cleaton
Subject:	[PATCH 02/15] Sort the keywords before building the trie
Date:	Sat, 2 Oct 2010 07:07:44 +0100
Collect and sort all of the reversed keywords before starting work
on building the trie data structure.

Use a sort implementation that compacts shared prefixes; the output
of the sorting stage is a sorted list of reversed keywords stored
with common prefix compaction.  Each reversed keyword is stored as
it's cpl (Common Prefix Length) with the previous string and its
unique suffix.  For example, if the reversed keywords are "foo",
"bar" and "baz" then the sorted set is stored as:

   cpl=0, suffix="bar"  /* bar */
   cpl=2, suffix="z"    /* baz */
   cpl=0, suffix="foo"  /* foo */

This format has some useful properties during the trie building stage,
for example the total length of all of the suffixes gives the number
of trie nodes that will be required.

The main reason for choosing a compacting sort rather than just using
qsort() is to avoid a usage case where storing and sorting the
keywords consumes much more memory than the final trie: many long
common prefixes and/or duplicates among the reversed keywords.
---
 .x-sc_prohibit_xalloc_without_use |    1 +
 po/POTFILES.in                    |    1 +
 src/Makefile.am                   |    4 +-
 src/cpsort.c                      |  441 +++++++++++++++++++++++++++++++++++++
 src/cpsort.h                      |   70 ++++++
 src/kwset.c                       |  340 ++++++++++++++++++-----------
 6 files changed, 729 insertions(+), 128 deletions(-)
 create mode 100644 src/cpsort.c
 create mode 100644 src/cpsort.h

diff --git a/.x-sc_prohibit_xalloc_without_use 
b/.x-sc_prohibit_xalloc_without_use
index 6860c2b..20e8595 100644
--- a/.x-sc_prohibit_xalloc_without_use
+++ b/.x-sc_prohibit_xalloc_without_use
@@ -1 +1,2 @@
 ^src/kwset\.c$
+^src/cpsort\.c$
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 15ded0c..a6282a3 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -25,6 +25,7 @@ lib/regcomp.c
 lib/version-etc.c
 lib/xalloc-die.c
 lib/xstrtol-error.c
+src/cpsort.c
 src/dfa.c
 src/dfasearch.c
 src/egrep.c
diff --git a/src/Makefile.am b/src/Makefile.am
index 689ccab..2b649b4 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,11 +25,11 @@ bin_PROGRAMS = grep egrep fgrep
 grep_SOURCES  = grep.c
 egrep_SOURCES = egrep.c
 fgrep_SOURCES = fgrep.c
-noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h
+noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h cpsort.h
 
 noinst_LIBRARIES = libgrep.a
 libgrep_a_SOURCES = kwset.c dfa.c searchutils.c dfasearch.c kwsearch.c \
-       pcresearch.c main.c
+       pcresearch.c cpsort.c main.c
 
 # Sometimes, the expansion of $(LIBINTL) includes -lc which may
 # include modules defining variables like `optind', so libgreputils.a
diff --git a/src/cpsort.c b/src/cpsort.c
new file mode 100644
index 0000000..263d895
--- /dev/null
+++ b/src/cpsort.c
@@ -0,0 +1,441 @@
+/* cpsort.c - common prefix compacting string sort.
+   Copyright (C) 2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* An incremental mergesort implementation with a compact data storage
+   format that takes advantage of common prefixes between the strings
+   to be sorted.  */
+
+#include <config.h>
+#include <sys/types.h>
+#include "system.h"
+#include "minmax.h"
+#include "cpsort.h"
+
+#ifdef GREP
+# include "xalloc.h"
+# undef malloc
+# define malloc xmalloc
+#endif
+
+#define SWAP(a, b) { typeof(a) swaptmp = (b); (b) = (a); (a) = swaptmp; }
+
+#define MALLOC(ptr, count) ( (ptr) = malloc( (count) * sizeof(*(ptr)) ) )
+
+/* An element of a linked list of sorted blocks of strings that can be
+   merged to form the sorted set.  Smaller blocks are kept nearer to
+   the head of the list.  */
+struct mergeblock
+{
+  struct mergeblock *next;      /* Next block in the linked list. */
+  int str_count;                /* The number of strings in this block. */
+  struct cpsort_md *strs;       /* The metadata for the strings. */
+  unsigned char *suffixes;      /* The concatenated suffixes. */
+  size_t tot_suf_len;           /* Size of the suffixes vector. */
+};
+
+/* A list of 1 or more strings that were added to the cpsort in sorted
+   order.  Integration into the mergeblock linked list is defered until
+   an out of order input is encountered, so that the sort takes
+   advantange of pre-sorted sections of input. */
+struct presorted_group
+{
+  unsigned char *last;          /* The most recently added string. */
+  size_t last_len;              /* Length of the 'last' string. */
+  size_t last_cap;              /* Capacity of the 'last' buffer. */
+  int count;                    /* The number of strings in the group. */
+  struct cpsort_md *strs;       /* The metadata for the strings. */
+  size_t strs_cap;              /* Capacity of the 'strs' buffer. */
+  unsigned char *suffixes;      /* The concatenated suffixes. */
+  size_t tot_suf_len;           /* Size of the suffixes vector. */
+  size_t suffixes_cap;          /* Capacity of the 'suffixes' buffer. */
+};
+
+/* The opaque structure returned to the caller. */
+struct cpsort
+{
+  int str_count;                /* The number of strings added so far. */
+  struct presorted_group psg;   /* Strings encountered in sorted order. */
+  struct mergeblock mb_list;    /* A list of mergeblocks holding the set. */
+};
+
+/* Set up an empty struct presorted_group. */
+static void
+init_empty_psg (struct presorted_group *psg)
+{
+  psg->last = NULL;
+  psg->last_len = 0;
+  psg->last_cap = 0;
+  psg->count = 0;
+  psg->strs = NULL;
+  psg->strs_cap = 0;
+  psg->suffixes = NULL;
+  psg->tot_suf_len = 0;
+  psg->suffixes_cap = 0;
+}
+
+/* Return an opaque pointer to a newly allocated cpsort, or NULL
+   if enough memory cannot be obtained.  */
+cpsort_t
+cpsort_alloc (void)
+{
+  struct cpsort *c;
+
+  if ((c = malloc (sizeof *c)))
+    {
+      c->str_count = 0;
+      init_empty_psg (&c->psg);
+      c->mb_list.next = NULL;
+      c->mb_list.str_count = 0;
+      c->mb_list.strs = NULL;
+      c->mb_list.suffixes = NULL;
+      c->mb_list.tot_suf_len = 0;
+    }
+  return c;
+}
+
+/* Merge two adjacent blocks in the linked list into one. */
+static const char *
+merge_blocks (struct mergeblock *start)
+{
+  unsigned char *dc_base, *a, *b, *dest, *dest_store;
+  struct cpsort_md *a_elt, *b_elt, *dest_md, *dest_md_store;
+  int cpl_ab, cpl_a_dest, cpl_b_dest, b_lt_a;
+
+  MALLOC (dest_store, start->tot_suf_len + start->next->tot_suf_len);
+  MALLOC (dest_md_store, start->str_count + start->next->str_count + 1);
+  if (!dest_store || !dest_md_store)
+    {
+      free (dest_store);
+      free (dest_md_store);
+      return _("memory exhausted");
+    }
+
+  dest = dest_store;
+  dest_md = dest_md_store;
+  a = dc_base = start->suffixes;
+  b = start->next->suffixes;
+  a_elt = start->strs;
+  b_elt = start->next->strs;
+  cpl_a_dest = cpl_b_dest = cpl_ab = 0;
+
+  while (1)
+    {
+      /* Update cpl_ab, and determine which input is lexically lower. */
+      if (cpl_b_dest < cpl_a_dest)
+        b_lt_a = 0;
+      else if (cpl_a_dest < cpl_b_dest)
+        {
+          cpl_ab = cpl_a_dest;
+          b_lt_a = 1;
+        }
+      else
+        {
+          /* Find cpl_ab (which may be greater than the inputs' mutual CPL
+             with dest) and work out which input is lexically lower.  */
+          unsigned char *a_str = a;
+          unsigned char *b_str = b + (cpl_b_dest - b_elt->cpl);
+          int b_len = b_elt->suf_len - (cpl_b_dest - b_elt->cpl);
+          unsigned char *a_str_stop = a_str + MIN (a_elt->suf_len, b_len);
+
+          while (a_str < a_str_stop && *a_str == *b_str)
+            a_str++, b_str++;
+          cpl_ab += a_str - a;
+          if (a_str != a_str_stop)
+            b_lt_a = *b_str < *a_str ? 1 : 0;
+          else
+            {
+              if (b_len < a_elt->suf_len)
+                b_lt_a = 1;
+              else
+                {
+                  if (b_len == a_elt->suf_len)
+                    {
+                      /* Duplicate.  Keep the lower index value. */
+                      if (b_elt->index < a_elt->index)
+                        a_elt->index = b_elt->index;
+
+                      /* Advance b, to skip the duplicate. */
+                      b += b_elt++->suf_len;
+                      if (b_elt->index < 0)
+                        {
+                          b_elt = a_elt;
+                          break;
+                        }
+                      if (b_elt->cpl < cpl_ab)
+                        cpl_ab = b_elt->cpl;
+                    }
+                  b_lt_a = 0;
+                }
+            }
+        }
+
+      if (b_lt_a)
+        {
+          /* Deferred copy of 1 or more strings from source a. */
+          if (dc_base < a)
+            {
+              memcpy (dest, dc_base, a - dc_base);
+              dest += a - dc_base;
+            }
+
+          /* Defer the write of b's suffix. */
+          dc_base = b + (cpl_b_dest - b_elt->cpl);
+
+          /* Swap the input sources. */
+          SWAP (a, b);
+          SWAP (a_elt, b_elt);
+          cpl_a_dest = cpl_b_dest;
+        }
+
+      /* Copy metadata from a to dest. */
+      dest_md->index = a_elt->index;
+      dest_md->suf_len = a_elt->suf_len - (cpl_a_dest - a_elt->cpl);
+      dest_md->cpl = cpl_a_dest;
+      dest_md++;
+
+      /* Advance input a to the next string. */
+      a += a_elt++->suf_len;
+      cpl_a_dest = a_elt->cpl;
+      cpl_b_dest = cpl_ab;
+
+      if (a_elt->index < 0)
+        {
+          memcpy (dest, dc_base, a - dc_base);
+          dest += a - dc_base;
+
+          dc_base = b + (cpl_b_dest - b_elt->cpl);
+
+          dest_md->index = b_elt->index;
+          dest_md->cpl = cpl_b_dest;
+          dest_md->suf_len = b_elt->suf_len - (cpl_b_dest - b_elt->cpl);
+          dest_md++;
+          b_elt++;
+
+          break;
+        }
+
+    }
+
+  /* Copy from dc_base up to the end of whichever input it points into. */
+  {
+    unsigned char *dc_end = start->suffixes + start->tot_suf_len;
+    if (dc_base < start->suffixes || dc_end < dc_base)
+      dc_end = start->next->suffixes + start->next->tot_suf_len;
+    memcpy (dest, dc_base, dc_end - dc_base);
+    dest += dc_end - dc_base;
+  }
+
+  /* Copy remaining metadata elts from input b. */
+  while (b_elt->index != -1)
+    {
+      dest_md->index = b_elt->index;
+      dest_md->cpl = b_elt->cpl;
+      dest_md++->suf_len = b_elt++->suf_len;
+    }
+  dest_md->index = -1;          /* Terminate the vector. */
+
+  /* Store the combination in the first block, remove the second. */
+  free (start->suffixes);
+  free (start->strs);
+  free (start->next->suffixes);
+  free (start->next->strs);
+  {
+    struct mergeblock *tmp = start->next;
+    start->next = start->next->next;
+    free (tmp);
+  }
+  start->suffixes = dest_store;
+  start->tot_suf_len = dest - dest_store;
+  start->strs = dest_md_store;
+  start->str_count = dest_md - dest_md_store;
+
+  return NULL;
+}
+
+/* Expand a malloc()ed buffer to the required size. */
+static const char *
+expand_buf (void **buf, size_t * current_capacity, size_t need_capacity)
+{
+  if (*current_capacity < need_capacity)
+    {
+      size_t cap = *current_capacity;
+      void *newbuf;
+
+      if (cap == 0)
+        cap = sizeof (struct cpsort_md);
+      while (cap < need_capacity)
+        cap *= 2;
+
+      newbuf = realloc (*buf, cap);
+      if (!newbuf)
+        return _("memory exhausted");
+
+      *buf = newbuf;
+      *current_capacity = cap;
+    }
+
+  return NULL;
+}
+
+/* Move the current pre-sorted group into the mergeblock linked list. */
+static const char *
+flush_sorted_group (cpsort_t c)
+{
+  struct mergeblock *mb, *head;
+  char const *err;
+
+  MALLOC (mb, 1);
+  if (!mb)
+    return _("memory exhausted");
+
+  /* Terminate the vector of cpsort_md structs. */
+  c->psg.strs[c->psg.count].index = -1;
+
+  /* Move the data to the mergeblock. */
+  mb->str_count = c->psg.count;
+  mb->suffixes = c->psg.suffixes;
+  mb->strs = c->psg.strs;
+  mb->tot_suf_len = c->psg.tot_suf_len;
+  init_empty_psg (&c->psg);
+
+  /* Install the new mergeblock in the linked list, keeping the list
+     in ascending str_count order. */
+  head = &c->mb_list;
+  while (head->next && head->next->str_count < mb->str_count)
+    head = head->next;
+  mb->next = head->next;
+  head->next = mb;
+
+  /* Merge the new mergeblock with the smaller one before, if the sizes
+     are similar enough. */
+  if (mb->str_count < 2 * head->str_count)
+    {
+      if ((err = merge_blocks (head)))
+        return err;
+      mb = head;
+    }
+
+  /* Merge the new mergeblock with the larger ones after, if the sizes
+     are similar enough. */
+  while (mb->next && mb->next->str_count < 2 * mb->str_count)
+    if ((err = merge_blocks (mb)))
+      return err;
+
+  return NULL;
+}
+
+/* Add the given string to the cpsort.  Return NULL for success, or an
+   error message.  */
+const char *
+cpsort_incr (cpsort_t c, char const *str, size_t len)
+{
+  char const *err;
+  unsigned char const *ustr = str;
+  int cpl, minlen, is_pre_sorted;
+
+  /* Determine the common prefix length between this string and the
+     last string in the group. */
+  minlen = MIN (c->psg.last_len, len);
+  for (cpl = 0; cpl < minlen && c->psg.last[cpl] == ustr[cpl]; ++cpl)
+    ;
+
+  /* Determine whether or not the input continues to be pre-sorted. */
+  if (cpl != minlen)
+    is_pre_sorted = (c->psg.last[cpl] <= ustr[cpl]);
+  else
+    {
+      if (c->psg.last_len == len && c->psg.count != 0)
+        {
+          /* Duplicate, discard. */
+          c->str_count++;
+          return NULL;
+        }
+      is_pre_sorted = (c->psg.last_len <= len);
+    }
+
+  /* If this input is out of order, empty the group before adding it. */
+  if (!is_pre_sorted)
+    {
+      if ((err = flush_sorted_group (c)))
+        return err;
+      cpl = 0;
+    }
+
+  /* Add the new input to the sorted group. */
+  if ((err = expand_buf ((void **) &c->psg.last, &c->psg.last_cap, len)))
+    return err;
+  if ((err = expand_buf ((void **) &c->psg.strs, &c->psg.strs_cap,
+                         (c->psg.count + 2) * sizeof (c->psg.strs[0]))))
+    return err;
+  if ((err = expand_buf ((void **) &c->psg.suffixes, &c->psg.suffixes_cap,
+                         c->psg.tot_suf_len + (len - cpl))))
+    return err;
+
+  memcpy (c->psg.last, str, len);
+  c->psg.last_len = len;
+  c->psg.strs[c->psg.count].cpl = cpl;
+  c->psg.strs[c->psg.count].suf_len = len - cpl;
+  c->psg.strs[c->psg.count].index = c->str_count++;
+  c->psg.count++;
+  memcpy (c->psg.suffixes + c->psg.tot_suf_len, str + cpl, len - cpl);
+  c->psg.tot_suf_len += len - cpl;
+
+  return NULL;
+}
+
+/* Put a sorted dump of the cpsort in the specified cpsort_dump struct.
+   Return NULL for success, or an error message.  */
+const char *
+cpsort_makedump (cpsort_t cps, struct cpsort_dump *dump)
+{
+  struct mergeblock *mb;
+  char const *err;
+
+  /* Finish the mergesort. */
+  flush_sorted_group (cps);
+  mb = cps->mb_list.next;
+  while (mb->next)
+    if ((err = merge_blocks (mb)))
+      return err;
+
+  dump->count = mb->str_count;
+  dump->strs = mb->strs;
+  dump->tot_suf_len = mb->tot_suf_len;
+  dump->suffixes = (char *) mb->suffixes;
+
+  return NULL;
+}
+
+/* Deallocate the cpsort and all its associated storage. */
+void
+cpsort_free (cpsort_t cps)
+{
+  struct mergeblock *mb, *prev;
+
+  mb = cps->mb_list.next;
+  while (mb)
+    {
+      prev = mb;
+      mb = mb->next;
+      free (prev->suffixes);
+      free (prev->strs);
+      free (prev);
+    }
+
+  free (cps);
+}
diff --git a/src/cpsort.h b/src/cpsort.h
new file mode 100644
index 0000000..87a5f03
--- /dev/null
+++ b/src/cpsort.h
@@ -0,0 +1,70 @@
+/* cpsort.h - header declaring the common prefix sort library.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written March 2011 by Nick Cleaton <address@hidden>
+
+   A sortable set of strings.  Strings are added to the set one at a time,
+   and the set can be fetched in sorted order.  Ascending index numbers are
+   assigned to the strings as they are added.
+
+   The sorted strings are returned with common prefix compacation, meaning
+   that common prefixes between adjacent strings are not duplicated; instead
+   each string is labeled with its common prefix length and is stored with
+   the common prefix excluded.
+
+   This is motivated by a requirement in kwset.c, where a sort replaces
+   older code that built a trie from the unsorted input strings.  To ensure
+   that the new code does not perform siginficantly worse than the old code
+   on any benchmark, the sort must be memory efficient when there are many
+   long common input prefixes and/or duplicate inputs.  */
+
+
+/* Meta-data for one string in a sorted dump of a cpsort. */
+struct cpsort_md
+{
+  int index;                    /* Index number of this string. */
+  int cpl;                      /* Common Prefix Length with previous. */
+  int suf_len;                  /* Length, excluding the common prefix. */
+};
+
+/* A sorted dump of a cpsort. */
+struct cpsort_dump
+{
+  struct cpsort_md *strs;       /* The elements of the sorted dump. */
+  char *suffixes;               /* The concatenated string suffixes. */
+  int count;                    /* The number of strings. */
+  int tot_suf_len;              /* The total length of the suffixes. */
+};
+
+struct cpsort;
+typedef struct cpsort *cpsort_t;
+
+/* Return an opaque pointer to a newly allocated cpsort, or NULL
+   if enough memory cannot be obtained.  */
+extern cpsort_t cpsort_alloc (void);
+
+/* Add the given string to the cpsort.  Return NULL for success, or an
+   error message.  */
+extern const char *cpsort_incr (cpsort_t, char const *, size_t);
+
+/* Put a sorted dump of the cpsort in the specified cpcsort_dump struct.
+   Return NULL for success, or an error message.  */
+extern const char *cpsort_makedump (cpsort_t, struct cpsort_dump *);
+
+/* Deallocate the cpsort and all its associated storage.  */
+extern void cpsort_free (cpsort_t);
diff --git a/src/kwset.c b/src/kwset.c
index 18a0f61..d91d52a 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -34,6 +34,7 @@
 #include "system.h"
 #include "kwset.h"
 #include "obstack.h"
+#include "cpsort.h"
 
 #define link kwset_link
 
@@ -49,6 +50,9 @@
 
 #define U(c) ((unsigned char) (c))
 
+#define MALLOC(ptr, count) ( (ptr) = malloc ( (count) * sizeof (*(ptr)) ) )
+#define FREENULL(ptr) ( free (ptr), (ptr) = NULL )
+
 /* Element of a trie representing a set of reversed keywords.  Holds data
    for one trie node and for the edge that leads to it.  Sibling nodes are
    arranged as a balanced tree.  */
@@ -72,6 +76,7 @@ struct trie
 struct kwset
 {
   struct obstack obstack;      /* Obstack for node allocation. */
+  cpsort_t cps;                 /* Cpsort to accumulate words. */
   int words;                   /* Number of words in the trie. */
   struct trie *trie;           /* The trie itself. */
   int mind;                    /* Minimum depth of an accepting node. */
@@ -81,6 +86,9 @@ struct kwset
   char *target;                        /* Target string if there's only one. */
   int mind2;                   /* Used in Boyer-Moore search for one string. */
   char const *trans;           /* Character translation table. */
+  char *kwbuf;                  /* Buffer to hold a reversed keyword. */
+  int kwbuf_capacity;           /* Capacity of the buffer. */
+  struct trie **prev_kw_tries;  /* The tries for the last keyword added. */
 };
 
 /* Allocate and initialize a keyword set object, returning an opaque
@@ -95,6 +103,7 @@ kwsalloc (char const *trans)
     return NULL;
 
   obstack_init(&kwset->obstack);
+  kwset->cps = cpsort_alloc ();
   kwset->words = 0;
   kwset->trie
     = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie));
@@ -114,6 +123,9 @@ kwsalloc (char const *trans)
   kwset->maxd = -1;
   kwset->target = NULL;
   kwset->trans = trans;
+  kwset->kwbuf = NULL;
+  kwset->kwbuf_capacity = 0;
+  kwset->prev_kw_tries = NULL;
 
   return (kwset_t) kwset;
 }
@@ -127,150 +139,219 @@ kwsalloc (char const *trans)
 const char *
 kwsincr (kwset_t kws, char const *text, size_t len)
 {
+  char const *err, *end;
+  char *dest;
+
+  if (kws->kwbuf_capacity < len)
+    {
+      free (kws->kwbuf);
+      kws->kwbuf_capacity = MAX (len, 2 * kws->kwbuf_capacity);
+      MALLOC (kws->kwbuf, kws->kwbuf_capacity);
+      if (!kws->kwbuf)
+        return _("memory exhausted");
+    }
+
+  dest = kws->kwbuf;
+  end = text + len;
+  while (text < end)
+    *dest++ = kws->trans ? kws->trans[U (*--end)] : *--end;
+
+  if ((err = cpsort_incr (kws->cps, kws->kwbuf, len)))
+    return err;
+
+  ++kws->words;
+
+  /* Keep track of the longest and shortest string of the keyword set. */
+  if ((int)len < kws->mind)
+    kws->mind = (int)len;
+  if ((int)len > kws->maxd)
+    kws->maxd = (int)len;
+
+  return NULL;
+}
+
+
+/* Add a keyword to the under-construction trie. */
+static char const *
+install_suffix (kwset_t kws, struct cpsort_md *md, char const *text)
+{
   struct kwset *kwset;
   struct trie *trie;
-  unsigned char label;
   struct trie *link;
-  int depth;
+  int depth, trie_depth = md->cpl, len = md->suf_len;
   struct trie *links[DEPTH_SIZE];
   enum { L, R } dirs[DEPTH_SIZE];
   struct trie *t, *r, *l, *rl, *lr;
 
   kwset = (struct kwset *) kws;
-  trie = kwset->trie;
-  text += len;
 
-  /* Descend the trie (built of reversed keywords) character-by-character,
-     installing new nodes when necessary. */
-  while (len--)
+  /* Start at the trie node at which this suffix slots in. */
+  trie = kwset->prev_kw_tries[trie_depth];
+  if (len == 0)
     {
-      label = kwset->trans ? kwset->trans[U(*--text)] : *--text;
+      trie->accepting = 1 + 2 * md->index;
+      return NULL;
+    }
 
-      /* Descend the tree of outgoing links for this trie node,
-         looking for the current character and keeping track
-         of the path followed. */
-      link = trie->links;
-      links[0] = (struct trie *) &trie->links;
-      dirs[0] = L;
-      depth = 1;
+  /* Descend the tree of outgoing links, looking for the place that the first
+     character of the suffix goes and keeping track of the path followed. */
+  link = trie->links;
+  links[0] = (struct trie *) &trie->links;
+  dirs[0] = L;
+  depth = 1;
 
-      while (link && label != link->label)
-        {
-          links[depth] = link;
-          if (label < link->label)
-            dirs[depth++] = L, link = link->llink;
-          else
-            dirs[depth++] = R, link = link->rlink;
-        }
+  while (link)
+    {
+      links[depth] = link;
+      if ((unsigned char) text[0] < link->label)
+        dirs[depth++] = L, link = link->llink;
+      else
+        dirs[depth++] = R, link = link->rlink;
+    }
 
-      /* The current character doesn't have an outgoing link at
-         this trie node, so build a new trie node and install
-         a link in the current trie node's tree. */
-      if (!link)
-        {
-          link = (struct trie *) obstack_alloc(&kwset->obstack,
-                                               sizeof (struct trie));
-          if (!link)
-            return _("memory exhausted");
-          link->llink = NULL;
-          link->rlink = NULL;
-          link->accepting = 0;
-          link->links = NULL;
-          link->parent = trie;
-          link->next = NULL;
-          link->fail = NULL;
-          link->depth = trie->depth + 1;
-          link->shift = 0;
-          link->label = label;
-          link->balance = 0;
-
-          /* Install the new tree node in its parent. */
-          if (dirs[--depth] == L)
-            links[depth]->llink = link;
-          else
-            links[depth]->rlink = link;
+  /* Install a new trie node for the first character of this suffix. */
+  link
+    = (struct trie *) obstack_alloc (&kwset->obstack, sizeof (struct trie));
+  if (!link)
+    return _("memory exhausted");
+  link->llink = NULL;
+  link->rlink = NULL;
+  link->balance = 0;
+  if (dirs[--depth] == L)
+    links[depth]->llink = link;
+  else
+    links[depth]->rlink = link;
+
+  /* Back up the tree fixing the balance flags. */
+  while (depth && !links[depth]->balance)
+    {
+      if (dirs[depth] == L)
+        --links[depth]->balance;
+      else
+        ++links[depth]->balance;
+      --depth;
+    }
 
-          /* Back up the tree fixing the balance flags. */
-          while (depth && !links[depth]->balance)
+  /* Rebalance the tree by pointer rotations if necessary. */
+  if (depth && ((dirs[depth] == L && --links[depth]->balance)
+                || (dirs[depth] == R && ++links[depth]->balance)))
+    {
+      switch (links[depth]->balance)
+        {
+        case (char) -2:
+          switch (dirs[depth + 1])
             {
-              if (dirs[depth] == L)
-                --links[depth]->balance;
-              else
-                ++links[depth]->balance;
-              --depth;
+            case L:
+              r = links[depth], t = r->llink, rl = t->rlink;
+              t->rlink = r, r->llink = rl;
+              t->balance = r->balance = 0;
+              break;
+            case R:
+              r = links[depth], l = r->llink, t = l->rlink;
+              rl = t->rlink, lr = t->llink;
+              t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
+              l->balance = t->balance != 1 ? 0 : -1;
+              r->balance = t->balance != (char) -1 ? 0 : 1;
+              t->balance = 0;
+              break;
+            default:
+              abort ();
             }
-
-          /* Rebalance the tree by pointer rotations if necessary. */
-          if (depth && ((dirs[depth] == L && --links[depth]->balance)
-                        || (dirs[depth] == R && ++links[depth]->balance)))
+          break;
+        case 2:
+          switch (dirs[depth + 1])
             {
-              switch (links[depth]->balance)
-                {
-                case (char) -2:
-                  switch (dirs[depth + 1])
-                    {
-                    case L:
-                      r = links[depth], t = r->llink, rl = t->rlink;
-                      t->rlink = r, r->llink = rl;
-                      t->balance = r->balance = 0;
-                      break;
-                    case R:
-                      r = links[depth], l = r->llink, t = l->rlink;
-                      rl = t->rlink, lr = t->llink;
-                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
-                      l->balance = t->balance != 1 ? 0 : -1;
-                      r->balance = t->balance != (char) -1 ? 0 : 1;
-                      t->balance = 0;
-                      break;
-                    default:
-                      abort ();
-                    }
-                  break;
-                case 2:
-                  switch (dirs[depth + 1])
-                    {
-                    case R:
-                      l = links[depth], t = l->rlink, lr = t->llink;
-                      t->llink = l, l->rlink = lr;
-                      t->balance = l->balance = 0;
-                      break;
-                    case L:
-                      l = links[depth], r = l->rlink, t = r->llink;
-                      lr = t->llink, rl = t->rlink;
-                      t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
-                      l->balance = t->balance != 1 ? 0 : -1;
-                      r->balance = t->balance != (char) -1 ? 0 : 1;
-                      t->balance = 0;
-                      break;
-                    default:
-                      abort ();
-                    }
-                  break;
-                default:
-                  abort ();
-                }
-
-              if (dirs[depth - 1] == L)
-                links[depth - 1]->llink = t;
-              else
-                links[depth - 1]->rlink = t;
+            case R:
+              l = links[depth], t = l->rlink, lr = t->llink;
+              t->llink = l, l->rlink = lr;
+              t->balance = l->balance = 0;
+              break;
+            case L:
+              l = links[depth], r = l->rlink, t = r->llink;
+              lr = t->llink, rl = t->rlink;
+              t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
+              l->balance = t->balance != 1 ? 0 : -1;
+              r->balance = t->balance != (char) -1 ? 0 : 1;
+              t->balance = 0;
+              break;
+            default:
+              abort ();
             }
+          break;
+        default:
+          abort ();
         }
 
+      if (dirs[depth - 1] == L)
+        links[depth - 1]->llink = t;
+      else
+        links[depth - 1]->rlink = t;
+    }
+
+
+  /* Populate trie nodes for the characters of this suffix.  The first node
+     has already been allocated and installed, the others are simple because
+     they have no siblings yet. */
+  while (1)
+    {
+      link->accepting = 0;
+      link->links = NULL;
+      link->parent = trie;
+      link->next = NULL;
+      link->fail = NULL;
+      link->depth = trie->depth + 1;
+      link->shift = 0;
+      link->label = *text++;
+      kwset->prev_kw_tries[++trie_depth] = link;
+
+      if (--len == 0)
+        break;
+
       trie = link;
+      link = (struct trie *) obstack_alloc (&kwset->obstack,
+                                            sizeof (struct trie));
+      if (!link)
+        return _("memory exhausted");
+      link->llink = NULL;
+      link->rlink = NULL;
+      link->balance = 0;
+      trie->links = link;
     }
 
   /* Mark the node we finally reached as accepting, encoding the
-     index number of this word in the keyword set so far. */
-  if (!trie->accepting)
-    trie->accepting = 1 + 2 * kwset->words;
-  ++kwset->words;
+     index number of this word in the keyword set. */
+  link->accepting = 1 + 2 * md->index;
 
-  /* Keep track of the longest and shortest string of the keyword set. */
-  if (trie->depth < kwset->mind)
-    kwset->mind = trie->depth;
-  if (trie->depth > kwset->maxd)
-    kwset->maxd = trie->depth;
+  return NULL;
+}
+
+static char const *
+build_trie_from_sorted_keywords (struct kwset *kwset)
+{
+  struct cpsort_dump cpsdump;
+  char const *err, *text;
+  int i;
+
+  if ((err = cpsort_makedump (kwset->cps, &cpsdump)))
+    return err;
+
+  /* Set up a vector by depth of the last trie node visited. */
+  MALLOC (kwset->prev_kw_tries, kwset->maxd + 1);
+  if (!kwset->prev_kw_tries)
+    return _("memory exhausted");
+  kwset->prev_kw_tries[0] = kwset->trie;
+
+  /* Add each reversed keyword to the trie, in sorted order. */
+  text = cpsdump.suffixes;
+  for (i = 0; i < cpsdump.count; i++)
+    {
+      err = install_suffix (kwset, cpsdump.strs + i, text);
+      if (err)
+        return err;
+      text += cpsdump.strs[i].suf_len;
+    }
+
+  FREENULL (kwset->prev_kw_tries);
 
   return NULL;
 }
@@ -376,7 +457,7 @@ kwsprep (kwset_t kws)
   struct kwset *kwset;
   int i;
   struct trie *curr;
-  char const *trans;
+  char const *trans, *err;
   unsigned char delta[NCHAR];
 
   kwset = (struct kwset *) kws;
@@ -392,15 +473,12 @@ kwsprep (kwset_t kws)
     {
       char c;
 
-      /* Looking for just one string.  Extract it from the trie. */
+      /* Looking for just one string.  Extract it from the buffer. */
       kwset->target = obstack_alloc(&kwset->obstack, kwset->mind);
       if (!kwset->target)
         return _("memory exhausted");
-      for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
-        {
-          kwset->target[i] = curr->links->label;
-          curr = curr->links;
-        }
+      for (i = 0; i < kwset->mind; ++i)
+        kwset->target[i] = kwset->kwbuf[kwset->mind - (i + 1)];
       /* Build the Boyer Moore delta.  Boy that's easy compared to CW. */
       for (i = 0; i < kwset->mind; ++i)
         delta[U(kwset->target[i])] = kwset->mind - (i + 1);
@@ -417,6 +495,12 @@ kwsprep (kwset_t kws)
       struct trie *fail;
       struct trie *last, *next[NCHAR];
 
+      if ((err = build_trie_from_sorted_keywords (kwset)))
+        return err;
+
+      cpsort_free (kwset->cps);
+      kwset->cps = NULL;
+
       /* Traverse the nodes of the trie in level order, simultaneously
          computing the delta table, failure function, and shift function. */
       for (curr = last = kwset->trie; curr; curr = curr->next)
@@ -756,5 +840,9 @@ kwsfree (kwset_t kws)
 
   kwset = (struct kwset *) kws;
   obstack_free(&kwset->obstack, NULL);
+  if (kwset->cps)
+    cpsort_free (kwset->cps);
+  free(kwset->kwbuf);
+  free(kwset->prev_kw_tries);
   free(kws);
 }
-- 
1.5.6.3
[Prev in Thread]
Current Thread
[Next in Thread]
Proof of concept: kwset running 3 times smaller, Nick Cleaton, 2010/10/02
- [PATCH 01/15] Merge trie and tree into a single struct, Nick Cleaton, 2010/10/02
  - [PATCH 02/15] Sort the keywords before building the trie, Nick Cleaton <=
  - [PATCH 03/15] Lay out the trie more systematically, Nick Cleaton, 2010/10/02
  - [PATCH 04/15] Lay out the trees more systematically, Nick Cleaton, 2010/10/02
  - [PATCH 05/15] Avoid using llink and rlink during prep, Nick Cleaton, 2010/10/02
  - [PATCH 06/15] Defer llink/rlink plumbing to the end of prep, Nick Cleaton, 2010/10/02
  - [PATCH 07/15] Share storage between llink and fail, Nick Cleaton, 2010/10/02
  - [PATCH 08/15] Avoid using trie->depth during matching, Nick Cleaton, 2010/10/02
  - [PATCH 09/15] Share storage between rlink and depth, Nick Cleaton, 2010/10/02
  - [PATCH 10/15] Eliminate the trie->parent field, Nick Cleaton, 2010/10/02
  - [PATCH 11/15] Eliminate the trie->next field, Nick Cleaton, 2010/10/02
  - [PATCH 12/15] Eliminate the trie->maxshift field, Nick Cleaton, 2010/10/02
Prev by Date: [PATCH 01/15] Merge trie and tree into a single struct
Next by Date: [PATCH 03/15] Lay out the trie more systematically
Previous by thread: [PATCH 01/15] Merge trie and tree into a single struct
Next by thread: [PATCH 03/15] Lay out the trie more systematically
Index(es):
- Date
- Thread