bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 03/15] Lay out the trie more systematically


From: Nick Cleaton
Subject: [PATCH 03/15] Lay out the trie more systematically
Date: Sat, 2 Oct 2010 07:07:44 +0100

Allocate a single storage vector for all of the trie nodes; this is
possible since the total number of trie nodes can be obtained from the
sorted keyword list.

Lay out the nodes in depth first order, except that sibling nodes are
placed together for locality of reference during tree searches.
---
 src/kwset.c |  195 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 159 insertions(+), 36 deletions(-)

diff --git a/src/kwset.c b/src/kwset.c
index d91d52a..e4712fb 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -33,7 +33,6 @@
 #include <sys/types.h>
 #include "system.h"
 #include "kwset.h"
-#include "obstack.h"
 #include "cpsort.h"
 
 #define link kwset_link
@@ -72,10 +71,17 @@ struct trie
   int maxshift;                        /* Max shift of self and descendents. */
 };
 
+/* A record of a trie node that has one or more siblings. */
+struct broodnote
+{
+  int index;                    /* Keyword where this node is first hit. */
+  int depth;                    /* Depth of the node. */
+  int count;                    /* Number of siblings, including self. */
+};
+
 /* Structure returned opaquely to the caller, containing everything. */
 struct kwset
 {
-  struct obstack obstack;      /* Obstack for node allocation. */
   cpsort_t cps;                 /* Cpsort to accumulate words. */
   int words;                   /* Number of words in the trie. */
   struct trie *trie;           /* The trie itself. */
@@ -88,7 +94,11 @@ struct kwset
   char const *trans;           /* Character translation table. */
   char *kwbuf;                  /* Buffer to hold a reversed keyword. */
   int kwbuf_capacity;           /* Capacity of the buffer. */
-  struct trie **prev_kw_tries;  /* The tries for the last keyword added. */
+  struct trie **prev_kw_tries;  /* The trie nodes of the last keyword added. */
+  struct trie *node_storage;    /* Vector of trie nodes. */
+  struct trie *next_free_node;  /* Next unallocated trie in node_storage. */
+  struct broodnote *bn_cursor;  /* A cursor for walking the broodnotes. */
+  struct trie *next_prealloced; /* Linked list of pre-allocated trie nodes. */
 };
 
 /* Allocate and initialize a keyword set object, returning an opaque
@@ -102,23 +112,9 @@ kwsalloc (char const *trans)
   if (!kwset)
     return NULL;
 
-  obstack_init(&kwset->obstack);
   kwset->cps = cpsort_alloc ();
   kwset->words = 0;
-  kwset->trie
-    = (struct trie *) obstack_alloc(&kwset->obstack, sizeof (struct trie));
-  if (!kwset->trie)
-    {
-      kwsfree((kwset_t) kwset);
-      return NULL;
-    }
-  kwset->trie->accepting = 0;
-  kwset->trie->links = NULL;
-  kwset->trie->parent = NULL;
-  kwset->trie->next = NULL;
-  kwset->trie->fail = NULL;
-  kwset->trie->depth = 0;
-  kwset->trie->shift = 0;
+  kwset->trie = NULL;
   kwset->mind = INT_MAX;
   kwset->maxd = -1;
   kwset->target = NULL;
@@ -126,6 +122,10 @@ kwsalloc (char const *trans)
   kwset->kwbuf = NULL;
   kwset->kwbuf_capacity = 0;
   kwset->prev_kw_tries = NULL;
+  kwset->node_storage = NULL;
+  kwset->next_free_node = NULL;
+  kwset->bn_cursor = NULL;
+  kwset->next_prealloced = NULL;
 
   return (kwset_t) kwset;
 }
@@ -170,10 +170,117 @@ kwsincr (kwset_t kws, char const *text, size_t len)
   return NULL;
 }
 
+/* Compute the shape of the trie that will be built from the sorted set of
+   reversed keywords, and record it as a vector of broodnotes.  Make a
+   broodnote for each node that is the first of 2 or more siblings.
+
+   Return a pointer into the broodnote storage buffer, pointing to the
+   first broodnote.  The broodnote storage buffer must have space for 2
+   more broodnotes than the number of strings in the sorted keyword set.
+
+   Arrange the broodnote vector in index and depth order and terminate it
+   with a guard, so that code can walk the broodnote vector in tandem with
+   working through the keywords and not worry about running run off the
+   top of the broodnotes. */
+static struct broodnote *
+compute_brood_notes (struct kwset *kwset, struct cpsort_dump *keywords,
+                     struct broodnote *storage_buffer)
+{
+  struct broodnote *brood_dest, *brood_stack;
+  struct cpsort_md *md;
+
+  /* We'll build the vector of broodnotes in reverse order, working down
+     from the top of the storage. */
+  brood_dest = storage_buffer + keywords->count + 1;
+
+  /* Install the guard at the end of the broodnote vector. */
+  brood_dest->count = 0;
+  brood_dest->depth = 0;
+  brood_dest->index = INT_MAX;
+  brood_dest--;
+
+  /* A stack of in-progress brood notes building up from the bottom of
+     the storage, with a guard that will never be popped. */
+  brood_stack = storage_buffer;
+  brood_stack->count = 0;
+  brood_stack->depth = -9999;
+
+  /* Walk the keyword list from end to start. */
+  for (md = keywords->strs + keywords->count - 1; keywords->strs <= md; --md)
+    {
+      int is_suffix_of_previous;
+
+      if (md == keywords->strs)
+        is_suffix_of_previous = 1;
+      else
+        is_suffix_of_previous =
+          md->cpl == md[-1].suf_len + md[-1].cpl ? 1 : 0;
+
+      while (md->cpl - is_suffix_of_previous < brood_stack->depth - 1)
+        {
+          /* Brood note complete, move it to final position. */
+          brood_dest->depth = brood_stack->depth;
+          brood_dest->count = brood_stack->count;
+          brood_dest->index = md - keywords->strs;
+          brood_dest--;
+          brood_stack--;
+        }
+      if (is_suffix_of_previous)
+        continue;
+      if (md->cpl == brood_stack->depth - 1)
+        brood_stack->count++;
+      else
+        {
+          /* We've found 2 siblings of a new brood. */
+          brood_stack++;
+          brood_stack->count = 2;
+          brood_stack->depth = md->cpl + 1;
+        }
+    }
+
+  return brood_dest + 1;
+}
+
+/* Used to allocate trie nodes from the node vector when building the trie
+   from the sorted reversed keyword list.  */
+static struct trie *
+allocate_trie_node (struct kwset *kwset, int index, int depth)
+{
+  if (kwset->bn_cursor->index == index && kwset->bn_cursor->depth == depth)
+    {
+      /* Found a broodnote for this node, which is the first of a group of
+         2 or more siblings.  Pre-allocate a block of nodes for the siblings,
+         so that they are kept adjacent.  Pre-allocated nodes are stored in
+         a linked list (using 'links' as the list pointer) and tagged with
+         the depth at which they are to be used. */
+      int siblings = kwset->bn_cursor->count - 1;
+
+      while (siblings--)
+        {
+          kwset->next_free_node->depth = depth;
+          kwset->next_free_node->links = kwset->next_prealloced;
+          kwset->next_prealloced = kwset->next_free_node++;
+        }
+      kwset->bn_cursor++;
+      return kwset->next_free_node++;
+    }
+
+  if (kwset->next_prealloced && kwset->next_prealloced->depth == depth)
+    {
+      /* This node is part of a group of siblings for which a block of
+         adjacent nodes has been pre-allocated.  */
+      struct trie *t = kwset->next_prealloced;
+      kwset->next_prealloced = kwset->next_prealloced->links;
+      return t;
+    }
+
+  return kwset->next_free_node++;
+}
 
 /* Add a keyword to the under-construction trie. */
-static char const *
-install_suffix (kwset_t kws, struct cpsort_md *md, char const *text)
+static void
+install_suffix (kwset_t kws, struct cpsort_md *md, char const *text,
+                int index)
 {
   struct kwset *kwset;
   struct trie *trie;
@@ -190,7 +297,7 @@ install_suffix (kwset_t kws, struct cpsort_md *md, char 
const *text)
   if (len == 0)
     {
       trie->accepting = 1 + 2 * md->index;
-      return NULL;
+      return;
     }
 
   /* Descend the tree of outgoing links, looking for the place that the first
@@ -210,10 +317,7 @@ install_suffix (kwset_t kws, struct cpsort_md *md, char 
const *text)
     }
 
   /* Install a new trie node for the first character of this suffix. */
-  link
-    = (struct trie *) obstack_alloc (&kwset->obstack, sizeof (struct trie));
-  if (!link)
-    return _("memory exhausted");
+  link = allocate_trie_node (kwset, index, trie_depth + 1);
   link->llink = NULL;
   link->rlink = NULL;
   link->balance = 0;
@@ -308,10 +412,7 @@ install_suffix (kwset_t kws, struct cpsort_md *md, char 
const *text)
         break;
 
       trie = link;
-      link = (struct trie *) obstack_alloc (&kwset->obstack,
-                                            sizeof (struct trie));
-      if (!link)
-        return _("memory exhausted");
+      link = allocate_trie_node (kwset, index, trie_depth + 1);
       link->llink = NULL;
       link->rlink = NULL;
       link->balance = 0;
@@ -321,37 +422,58 @@ install_suffix (kwset_t kws, struct cpsort_md *md, char 
const *text)
   /* Mark the node we finally reached as accepting, encoding the
      index number of this word in the keyword set. */
   link->accepting = 1 + 2 * md->index;
-
-  return NULL;
 }
 
 static char const *
 build_trie_from_sorted_keywords (struct kwset *kwset)
 {
   struct cpsort_dump cpsdump;
+  struct broodnote *bn_store;
   char const *err, *text;
   int i;
 
   if ((err = cpsort_makedump (kwset->cps, &cpsdump)))
     return err;
 
+  /* Allocate a single vector to hold all of the trie nodes. */
+  MALLOC (kwset->node_storage, 1 + cpsdump.tot_suf_len);
+  if (!kwset->node_storage)
+    return _("memory exhausted");
+  kwset->next_free_node = kwset->node_storage;
+
+  /* Install the top level trie node. */
+  kwset->trie = kwset->next_free_node++;
+  kwset->trie->accepting = 0;
+  kwset->trie->links = NULL;
+  kwset->trie->parent = NULL;
+  kwset->trie->next = NULL;
+  kwset->trie->fail = NULL;
+  kwset->trie->depth = 0;
+  kwset->trie->shift = 0;
+
   /* Set up a vector by depth of the last trie node visited. */
   MALLOC (kwset->prev_kw_tries, kwset->maxd + 1);
   if (!kwset->prev_kw_tries)
     return _("memory exhausted");
   kwset->prev_kw_tries[0] = kwset->trie;
 
+  /* Compute the shape that the trie will take. */
+  MALLOC (bn_store, cpsdump.count + 2);
+  if (!bn_store)
+    return _("memory exhausted");
+  kwset->bn_cursor = compute_brood_notes (kwset, &cpsdump, bn_store);
+
   /* Add each reversed keyword to the trie, in sorted order. */
   text = cpsdump.suffixes;
   for (i = 0; i < cpsdump.count; i++)
     {
-      err = install_suffix (kwset, cpsdump.strs + i, text);
-      if (err)
-        return err;
+      install_suffix (kwset, cpsdump.strs + i, text, i);
       text += cpsdump.strs[i].suf_len;
     }
 
   FREENULL (kwset->prev_kw_tries);
+  kwset->bn_cursor = NULL;
+  free (bn_store);
 
   return NULL;
 }
@@ -474,7 +596,7 @@ kwsprep (kwset_t kws)
       char c;
 
       /* Looking for just one string.  Extract it from the buffer. */
-      kwset->target = obstack_alloc(&kwset->obstack, kwset->mind);
+      MALLOC (kwset->target, kwset->mind);
       if (!kwset->target)
         return _("memory exhausted");
       for (i = 0; i < kwset->mind; ++i)
@@ -839,10 +961,11 @@ kwsfree (kwset_t kws)
   struct kwset *kwset;
 
   kwset = (struct kwset *) kws;
-  obstack_free(&kwset->obstack, NULL);
   if (kwset->cps)
     cpsort_free (kwset->cps);
   free(kwset->kwbuf);
+  free(kwset->target);
   free(kwset->prev_kw_tries);
+  free(kwset->node_storage);
   free(kws);
 }
-- 
1.5.6.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]