[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 01/15] Merge trie and tree into a single struct
From: |
Nick Cleaton |
Subject: |
[PATCH 01/15] Merge trie and tree into a single struct |
Date: |
Sat, 2 Oct 2010 07:07:44 +0100 |
Store data for a trie node and the edge that leads to it in a single
struct. Previously, trie and tree structs were always allocated in
pairs (except for the top level trie node, which has no inbound edge)
so merging them removes a level of indirection, saves some space and
simplifies some code.
---
src/kwset.c | 104 +++++++++++++++++++++++++---------------------------------
1 files changed, 45 insertions(+), 59 deletions(-)
diff --git a/src/kwset.c b/src/kwset.c
index d85fa30..18a0f61 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -49,21 +49,17 @@
#define U(c) ((unsigned char) (c))
-/* Balanced tree of edges and labels leaving a given trie node. */
-struct tree
-{
- struct tree *llink; /* Left link; MUST be first field. */
- struct tree *rlink; /* Right link (to larger labels). */
- struct trie *trie; /* Trie node pointed to by this edge. */
- unsigned char label; /* Label on this edge. */
- char balance; /* Difference in depths of subtrees. */
-};
-
-/* Node of a trie representing a set of reversed keywords. */
+/* Element of a trie representing a set of reversed keywords. Holds data
+ for one trie node and for the edge that leads to it. Sibling nodes are
+ arranged as a balanced tree. */
struct trie
{
+ struct trie *llink; /* Left tree link; MUST be first field. */
+ struct trie *rlink; /* Right tree link (to larger labels). */
+ unsigned char label; /* Label on the incoming edge. */
+ char balance; /* Difference in depths of subtrees. */
unsigned int accepting; /* Word index of accepted word, or zero. */
- struct tree *links; /* Tree of edges leaving this node. */
+ struct trie *links; /* Tree of edges leaving this node. */
struct trie *parent; /* Parent of this node. */
struct trie *next; /* List of all trie nodes in level order. */
struct trie *fail; /* Aho-Corasick failure function. */
@@ -134,11 +130,11 @@ kwsincr (kwset_t kws, char const *text, size_t len)
struct kwset *kwset;
struct trie *trie;
unsigned char label;
- struct tree *link;
+ struct trie *link;
int depth;
- struct tree *links[DEPTH_SIZE];
+ struct trie *links[DEPTH_SIZE];
enum { L, R } dirs[DEPTH_SIZE];
- struct tree *t, *r, *l, *rl, *lr;
+ struct trie *t, *r, *l, *rl, *lr;
kwset = (struct kwset *) kws;
trie = kwset->trie;
@@ -154,7 +150,7 @@ kwsincr (kwset_t kws, char const *text, size_t len)
looking for the current character and keeping track
of the path followed. */
link = trie->links;
- links[0] = (struct tree *) &trie->links;
+ links[0] = (struct trie *) &trie->links;
dirs[0] = L;
depth = 1;
@@ -172,26 +168,19 @@ kwsincr (kwset_t kws, char const *text, size_t len)
a link in the current trie node's tree. */
if (!link)
{
- link = (struct tree *) obstack_alloc(&kwset->obstack,
- sizeof (struct tree));
+ link = (struct trie *) obstack_alloc(&kwset->obstack,
+ sizeof (struct trie));
if (!link)
return _("memory exhausted");
link->llink = NULL;
link->rlink = NULL;
- link->trie = (struct trie *) obstack_alloc(&kwset->obstack,
- sizeof (struct trie));
- if (!link->trie)
- {
- obstack_free(&kwset->obstack, link);
- return _("memory exhausted");
- }
- link->trie->accepting = 0;
- link->trie->links = NULL;
- link->trie->parent = trie;
- link->trie->next = NULL;
- link->trie->fail = NULL;
- link->trie->depth = trie->depth + 1;
- link->trie->shift = 0;
+ link->accepting = 0;
+ link->links = NULL;
+ link->parent = trie;
+ link->next = NULL;
+ link->fail = NULL;
+ link->depth = trie->depth + 1;
+ link->shift = 0;
link->label = label;
link->balance = 0;
@@ -268,7 +257,7 @@ kwsincr (kwset_t kws, char const *text, size_t len)
}
}
- trie = link->trie;
+ trie = link;
}
/* Mark the node we finally reached as accepting, encoding the
@@ -289,23 +278,23 @@ kwsincr (kwset_t kws, char const *text, size_t len)
/* Enqueue the trie nodes referenced from the given tree in the
given queue. */
static void
-enqueue (struct tree *tree, struct trie **last)
+enqueue (struct trie *tree, struct trie **last)
{
if (!tree)
return;
enqueue(tree->llink, last);
enqueue(tree->rlink, last);
- (*last) = (*last)->next = tree->trie;
+ (*last) = (*last)->next = tree;
}
/* Compute the Aho-Corasick failure function for the trie nodes referenced
from the given tree, given the failure function for their parent as
well as a last resort failure node. */
static void
-treefails (struct tree const *tree, struct trie const *fail,
+treefails (struct trie *tree, struct trie const *fail,
struct trie *recourse)
{
- struct tree *link;
+ struct trie *link;
if (!tree)
return;
@@ -325,19 +314,19 @@ treefails (struct tree const *tree, struct trie const
*fail,
link = link->rlink;
if (link)
{
- tree->trie->fail = link->trie;
+ tree->fail = link;
return;
}
fail = fail->fail;
}
- tree->trie->fail = recourse;
+ tree->fail = recourse;
}
/* Set delta entries for the links of the given tree such that
the preexisting delta value is larger than the current depth. */
static void
-treedelta (struct tree const *tree,
+treedelta (struct trie const *tree,
unsigned int depth,
unsigned char delta[])
{
@@ -351,7 +340,7 @@ treedelta (struct tree const *tree,
/* Return true if A has every label in B. */
static int
-hasevery (struct tree const *a, struct tree const *b)
+hasevery (struct trie const *a, struct trie const *b)
{
if (!b)
return 1;
@@ -370,13 +359,13 @@ hasevery (struct tree const *a, struct tree const *b)
/* Compute a vector, indexed by character code, of the trie nodes
referenced from the given tree. */
static void
-treenext (struct tree const *tree, struct trie *next[])
+treenext (struct trie *tree, struct trie *next[])
{
if (!tree)
return;
treenext(tree->llink, next);
treenext(tree->rlink, next);
- next[tree->label] = tree->trie;
+ next[tree->label] = tree;
}
/* Compute the shift for each trie node, as well as the delta
@@ -410,7 +399,7 @@ kwsprep (kwset_t kws)
for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
{
kwset->target[i] = curr->links->label;
- curr = curr->links->trie;
+ curr = curr->links;
}
/* Build the Boyer Moore delta. Boy that's easy compared to CW. */
for (i = 0; i < kwset->mind; ++i)
@@ -595,7 +584,6 @@ cwexec (kwset_t kws, char const *text, size_t len, struct
kwsmatch *kwsmatch)
unsigned char const *delta;
int d;
char const *end, *qlim;
- struct tree const *tree;
char const *trans;
#ifdef lint
@@ -652,15 +640,14 @@ cwexec (kwset_t kws, char const *text, size_t len, struct
kwsmatch *kwsmatch)
while (beg > text)
{
c = trans ? trans[U(*--beg)] : *--beg;
- tree = trie->links;
- while (tree && c != tree->label)
- if (c < tree->label)
- tree = tree->llink;
+ trie = trie->links;
+ while (trie && c != trie->label)
+ if (c < trie->label)
+ trie = trie->llink;
else
- tree = tree->rlink;
- if (tree)
+ trie = trie->rlink;
+ if (trie)
{
- trie = tree->trie;
if (trie->accepting)
{
mch = beg;
@@ -703,15 +690,14 @@ cwexec (kwset_t kws, char const *text, size_t len, struct
kwsmatch *kwsmatch)
while (beg > text)
{
c = trans ? trans[U(*--beg)] : *--beg;
- tree = trie->links;
- while (tree && c != tree->label)
- if (c < tree->label)
- tree = tree->llink;
+ trie = trie->links;
+ while (trie && c != trie->label)
+ if (c < trie->label)
+ trie = trie->llink;
else
- tree = tree->rlink;
- if (tree)
+ trie = trie->rlink;
+ if (trie)
{
- trie = tree->trie;
if (trie->accepting && beg <= mch)
{
lmch = beg;
--
1.5.6.3
- Proof of concept: kwset running 3 times smaller, Nick Cleaton, 2010/10/02
- [PATCH 01/15] Merge trie and tree into a single struct,
Nick Cleaton <=
- [PATCH 02/15] Sort the keywords before building the trie, Nick Cleaton, 2010/10/02
- [PATCH 03/15] Lay out the trie more systematically, Nick Cleaton, 2010/10/02
- [PATCH 04/15] Lay out the trees more systematically, Nick Cleaton, 2010/10/02
- [PATCH 05/15] Avoid using llink and rlink during prep, Nick Cleaton, 2010/10/02
- [PATCH 06/15] Defer llink/rlink plumbing to the end of prep, Nick Cleaton, 2010/10/02
- [PATCH 07/15] Share storage between llink and fail, Nick Cleaton, 2010/10/02
- [PATCH 08/15] Avoid using trie->depth during matching, Nick Cleaton, 2010/10/02
- [PATCH 09/15] Share storage between rlink and depth, Nick Cleaton, 2010/10/02
- [PATCH 10/15] Eliminate the trie->parent field, Nick Cleaton, 2010/10/02
- [PATCH 11/15] Eliminate the trie->next field, Nick Cleaton, 2010/10/02