bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 13/15] Reduce trie->accepting to a flag


From: Nick Cleaton
Subject: [PATCH 13/15] Reduce trie->accepting to a flag
Date: Sat, 2 Oct 2010 07:07:44 +0100

The use pattern of kwset in grep is such that an int trie->accepting
value isn't really needed - a flag is enough.
---
 src/cpsort.c    |   31 +++++++++++++++++--------------
 src/cpsort.h    |    7 ++++---
 src/dfasearch.c |   24 +++++++++---------------
 src/kwsearch.c  |    2 +-
 src/kwset.c     |   18 ++++++++++--------
 src/kwset.h     |    9 +++++----
 6 files changed, 46 insertions(+), 45 deletions(-)

diff --git a/src/cpsort.c b/src/cpsort.c
index 263d895..fc561f6 100644
--- a/src/cpsort.c
+++ b/src/cpsort.c
@@ -165,13 +165,13 @@ merge_blocks (struct mergeblock *start)
                 {
                   if (b_len == a_elt->suf_len)
                     {
-                      /* Duplicate.  Keep the lower index value. */
-                      if (b_elt->index < a_elt->index)
-                        a_elt->index = b_elt->index;
+                      /* Duplicate.  Prefer a flag=0 word. */
+                      if (b_elt->flag < a_elt->flag)
+                        a_elt->flag = b_elt->flag;
 
                       /* Advance b, to skip the duplicate. */
                       b += b_elt++->suf_len;
-                      if (b_elt->index < 0)
+                      if (b_elt->flag < 0)
                         {
                           b_elt = a_elt;
                           break;
@@ -203,7 +203,7 @@ merge_blocks (struct mergeblock *start)
         }
 
       /* Copy metadata from a to dest. */
-      dest_md->index = a_elt->index;
+      dest_md->flag = a_elt->flag;
       dest_md->suf_len = a_elt->suf_len - (cpl_a_dest - a_elt->cpl);
       dest_md->cpl = cpl_a_dest;
       dest_md++;
@@ -213,14 +213,14 @@ merge_blocks (struct mergeblock *start)
       cpl_a_dest = a_elt->cpl;
       cpl_b_dest = cpl_ab;
 
-      if (a_elt->index < 0)
+      if (a_elt->flag < 0)
         {
           memcpy (dest, dc_base, a - dc_base);
           dest += a - dc_base;
 
           dc_base = b + (cpl_b_dest - b_elt->cpl);
 
-          dest_md->index = b_elt->index;
+          dest_md->flag = b_elt->flag;
           dest_md->cpl = cpl_b_dest;
           dest_md->suf_len = b_elt->suf_len - (cpl_b_dest - b_elt->cpl);
           dest_md++;
@@ -241,13 +241,13 @@ merge_blocks (struct mergeblock *start)
   }
 
   /* Copy remaining metadata elts from input b. */
-  while (b_elt->index != -1)
+  while (b_elt->flag != -1)
     {
-      dest_md->index = b_elt->index;
+      dest_md->flag = b_elt->flag;
       dest_md->cpl = b_elt->cpl;
       dest_md++->suf_len = b_elt++->suf_len;
     }
-  dest_md->index = -1;          /* Terminate the vector. */
+  dest_md->flag = -1;          /* Terminate the vector. */
 
   /* Store the combination in the first block, remove the second. */
   free (start->suffixes);
@@ -304,7 +304,7 @@ flush_sorted_group (cpsort_t c)
     return _("memory exhausted");
 
   /* Terminate the vector of cpsort_md structs. */
-  c->psg.strs[c->psg.count].index = -1;
+  c->psg.strs[c->psg.count].flag = -1;
 
   /* Move the data to the mergeblock. */
   mb->str_count = c->psg.count;
@@ -342,7 +342,7 @@ flush_sorted_group (cpsort_t c)
 /* Add the given string to the cpsort.  Return NULL for success, or an
    error message.  */
 const char *
-cpsort_incr (cpsort_t c, char const *str, size_t len)
+cpsort_incr (cpsort_t c, char const *str, size_t len, int flag)
 {
   char const *err;
   unsigned char const *ustr = str;
@@ -361,7 +361,10 @@ cpsort_incr (cpsort_t c, char const *str, size_t len)
     {
       if (c->psg.last_len == len && c->psg.count != 0)
         {
-          /* Duplicate, discard. */
+          /* Duplicate, discard.  Keep the flag=0 instance if there
+             is one. */
+          if (!flag)
+            c->psg.strs[c->psg.count-1].flag = 0;
           c->str_count++;
           return NULL;
         }
@@ -390,7 +393,7 @@ cpsort_incr (cpsort_t c, char const *str, size_t len)
   c->psg.last_len = len;
   c->psg.strs[c->psg.count].cpl = cpl;
   c->psg.strs[c->psg.count].suf_len = len - cpl;
-  c->psg.strs[c->psg.count].index = c->str_count++;
+  c->psg.strs[c->psg.count].flag = flag ? 1 : 0;
   c->psg.count++;
   memcpy (c->psg.suffixes + c->psg.tot_suf_len, str + cpl, len - cpl);
   c->psg.tot_suf_len += len - cpl;
diff --git a/src/cpsort.h b/src/cpsort.h
index 87a5f03..e3f7cd4 100644
--- a/src/cpsort.h
+++ b/src/cpsort.h
@@ -37,7 +37,7 @@
 /* Meta-data for one string in a sorted dump of a cpsort. */
 struct cpsort_md
 {
-  int index;                    /* Index number of this string. */
+  int flag;                     /* The flag for this string, 0 or 1. */
   int cpl;                      /* Common Prefix Length with previous. */
   int suf_len;                  /* Length, excluding the common prefix. */
 };
@@ -59,8 +59,9 @@ typedef struct cpsort *cpsort_t;
 extern cpsort_t cpsort_alloc (void);
 
 /* Add the given string to the cpsort.  Return NULL for success, or an
-   error message.  */
-extern const char *cpsort_incr (cpsort_t, char const *, size_t);
+   error message.  The int value is the flag value to assign to this
+   string, 0 or 1. */
+extern const char *cpsort_incr (cpsort_t, char const *, size_t, int);
 
 /* Put a sorted dump of the cpsort in the specified cpcsort_dump struct.
    Return NULL for success, or an error message.  */
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 780891f..446a9e4 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -68,13 +68,8 @@ dfawarn (char const *mesg)
     dfaerror (mesg);
 }
 
-/* Number of compiled fixed strings known to exactly match the regexp.
-   If kwsexec returns < kwset_exact_matches, then we don't need to
-   call the regexp matcher at all. */
-static int kwset_exact_matches;
-
 static char const *
-kwsincr_case (const char *must)
+kwsincr_case (const char *must, int flag)
 {
   const char *buf;
   size_t n;
@@ -86,7 +81,7 @@ kwsincr_case (const char *must)
   else
 #endif
     buf = must;
-  return kwsincr (kwset, buf, n);
+  return kwsincr (kwset, buf, n, flag);
 }
 
 /* If the DFA turns out to have some set of fixed strings one of
@@ -104,23 +99,22 @@ kwsmusts (void)
     {
       kwsinit (&kwset);
       /* First, we compile in the substrings known to be exact
-         matches.  The kwset matcher will return the index
-         of the matching string that it chooses. */
+         matches.  We tell kwset to set the flag to 0 for these
+         strings. */
       for (; dm; dm = dm->next)
         {
           if (!dm->exact)
             continue;
-          ++kwset_exact_matches;
-          if ((err = kwsincr_case (dm->must)) != NULL)
+          if ((err = kwsincr_case (dm->must, 0)) != NULL)
             error (EXIT_TROUBLE, 0, "%s", err);
         }
-      /* Now, we compile the substrings that will require
-         the use of the regexp matcher.  */
+      /* Now, we compile the substrings that will require the use
+       * of the regexp matcher.  Flag=1 in kwset for these. */
       for (dm = dfamusts (dfa); dm; dm = dm->next)
         {
           if (dm->exact)
             continue;
-          if ((err = kwsincr_case (dm->must)) != NULL)
+          if ((err = kwsincr_case (dm->must, 1)) != NULL)
             error (EXIT_TROUBLE, 0, "%s", err);
         }
       if ((err = kwsprep (kwset)) != NULL)
@@ -261,7 +255,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
               match = beg;
               while (beg > buf && beg[-1] != eol)
                 --beg;
-              if (kwsm.index < kwset_exact_matches)
+              if (!kwsm.flag)
                 {
 #if MBS_SUPPORT
                   if (mb_start < beg)
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 9244a8a..f15a6b1 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -66,7 +66,7 @@ Fcompile (char const *pattern, size_t size)
 #endif
         }
 
-      if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
+      if ((err = kwsincr (kwset, beg, end - beg, 0)) != NULL)
         error (EXIT_TROUBLE, 0, "%s", err);
       beg = lim;
     }
diff --git a/src/kwset.c b/src/kwset.c
index e23cbca..aa75fbd 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -71,7 +71,7 @@ struct trie
   rd;
   unsigned char label;         /* Label on the incoming edge. */
   unsigned char is_lastkid;     /* True for the last of a group of siblings. */
-  unsigned int accepting;      /* Word index of accepted word, or zero. */
+  unsigned char accepting;     /* Flag value of accepted word, or zero. */
   struct trie *links;          /* Tree of edges leaving this node. */
   int shift;                   /* Shift function for search failures. */
 };
@@ -105,6 +105,7 @@ struct kwset
   struct trie *next_free_node;  /* Next unallocated trie in node_storage. */
   struct broodnote *bn_cursor;  /* A cursor for walking the broodnotes. */
   struct trie *next_prealloced; /* Linked list of pre-allocated trie nodes. */
+  int first_string_flag;        /* Flag value on first kwsincr() call. */
 };
 
 /* Allocate and initialize a keyword set object, returning an opaque
@@ -144,7 +145,7 @@ kwsalloc (char const *trans)
 /* Add the given string to the contents of the keyword set.  Return NULL
    for success, an error message otherwise. */
 const char *
-kwsincr (kwset_t kws, char const *text, size_t len)
+kwsincr (kwset_t kws, char const *text, size_t len, int flag)
 {
   char const *err, *end;
   char *dest;
@@ -163,10 +164,11 @@ kwsincr (kwset_t kws, char const *text, size_t len)
   while (text < end)
     *dest++ = kws->trans ? kws->trans[U (*--end)] : *--end;
 
-  if ((err = cpsort_incr (kws->cps, kws->kwbuf, len)))
+  if ((err = cpsort_incr (kws->cps, kws->kwbuf, len, flag ? 1 : 0)))
     return err;
 
-  ++kws->words;
+  if (!kws->words++)
+    kws->first_string_flag = flag ? 1 : 0;
 
   /* Keep track of the longest and shortest string of the keyword set. */
   if ((int)len < kws->mind)
@@ -358,8 +360,8 @@ install_suffix (kwset_t kws, struct cpsort_md *md, char 
const *text,
     }
 
   /* Mark the node we finally reached as accepting, encoding the
-     index number of this word in the keyword set. */
-  trie->accepting = 1 + 2 * md->index;
+     flag value for this keyword. */
+  trie->accepting = 1 + 2 * md->flag;
 
   trie->links = NULL;
 }
@@ -874,7 +876,7 @@ cwexec (kwset_t kws, char const *text, size_t len, struct 
kwsmatch *kwsmatch)
         d = 1;
     }
 
-  kwsmatch->index = accept->accepting / 2;
+  kwsmatch->flag = accept->accepting / 2;
   kwsmatch->offset[0] = mch - text;
   kwsmatch->size[0] = mch_len;
 
@@ -895,7 +897,7 @@ kwsexec (kwset_t kws, char const *text, size_t size, struct 
kwsmatch *kwsmatch)
       size_t ret = bmexec (kws, text, size);
       if (ret != (size_t) -1)
         {
-          kwsmatch->index = 0;
+          kwsmatch->flag = kwset->first_string_flag;
           kwsmatch->offset[0] = ret;
           kwsmatch->size[0] = kwset->mind;
         }
diff --git a/src/kwset.h b/src/kwset.h
index 35caa41..6acb1c1 100644
--- a/src/kwset.h
+++ b/src/kwset.h
@@ -23,7 +23,7 @@
 
 struct kwsmatch
 {
-  int index;                   /* Index number of matching keyword. */
+  int flag;                    /* Flag value for the matching keyword. */
   size_t offset[1];            /* Offset of each submatch. */
   size_t size[1];              /* Length of each submatch. */
 };
@@ -40,9 +40,10 @@ typedef struct kwset *kwset_t;
 extern kwset_t kwsalloc (char const *);
 
 /* Incrementally extend the keyword set to include the given string.
-   Return NULL for success, or an error message.  Remember an index
-   number for each keyword included in the set. */
-extern const char *kwsincr (kwset_t, char const *, size_t);
+   Return NULL for success, or an error message.  The final arg is
+   a flag that determines whether or not kwsmatch.flag should be set
+   for matches on this keyword. */
+extern const char *kwsincr (kwset_t, char const *, size_t, int);
 
 /* When the keyword set has been completely built, prepare it for
    use.  Return NULL for success, or an error message. */
-- 
1.5.6.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]