From 25f72238cdda4f3372aaa9181075f975832ef50f Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 15 Mar 2014 14:41:52 +0900 Subject: [PATCH 1/2] grep: optimization by using the Galil rule for Boyer-Moore algorithm in KWSet The Boyer-Moore algorithm runs in O(m n) in the worst case, which perhaps it may be much slower than the DFA. The Galil rule enables to change O(m n) into O(n) for its case without overheads and/or slow-down for other cases by avoiding to compare more than once for a position in the text. To use the Galil rule, looking for the delta2 shift at each position from the trie instead of `mind2' value. I prepare following string, which makes a worst case for Boyer-Moore algorithm, to measure the performance. yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 > k I run the test with the patch (best-of-5 trials): env LC_ALL=C time -p src/grep kjjjjjjjjjjjjjjjjjjj k real 0.70 user 0.32 sys 0.38 Back out that commit (temporarily), recompile, and rerun the experiment: env LC_ALL=C time -p src/grep kjjjjjjjjjjjjjjjjjjj k real 3.97 user 3.56 sys 0.40 * src/kwset.c (struct kwset): Replace member `mind2' to `shift'. (kwsprep): Looking for the delta2 shift. (bmexec): Use it. --- src/kwset.c | 214 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 127 insertions(+), 87 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index 410e046..e0bf6b2 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -83,7 +83,7 @@ struct kwset unsigned char delta[NCHAR]; /* Delta table for rapid search. */ struct trie *next[NCHAR]; /* Table of children of the root. */ char *target; /* Target string if there's only one. */ - int mind2; /* Used in Boyer-Moore search for one string. */ + int *shift; /* Used in Boyer-Moore search for one string. */ char const *trans; /* Character translation table. */ }; @@ -397,12 +397,70 @@ kwsprep (kwset_t kws) node at which an outgoing edge is labeled by that character. */ memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR); + struct trie *fail; + struct trie *last, *next[NCHAR]; + + /* Traverse the nodes of the trie in level order, simultaneously + computing the delta table, failure function, and shift function. */ + for (curr = last = kwset->trie; curr; curr = curr->next) + { + /* Enqueue the immediate descendants in the level order queue. */ + enqueue(curr->links, &last); + + curr->shift = kwset->mind; + curr->maxshift = kwset->mind; + + /* Update the delta table for the descendants of this node. */ + treedelta(curr->links, curr->depth, delta); + + /* Compute the failure function for the descendants of this node. */ + treefails(curr->links, curr->fail, kwset->trie); + + /* Update the shifts at each node in the current node's chain + of fails back to the root. */ + for (fail = curr->fail; fail; fail = fail->fail) + { + /* If the current node has some outgoing edge that the fail + doesn't, then the shift at the fail should be no larger + than the difference of their depths. */ + if (!hasevery(fail->links, curr->links)) + if (curr->depth - fail->depth < fail->shift) + fail->shift = curr->depth - fail->depth; + + /* If the current node is accepting then the shift at the + fail and its descendants should be no larger than the + difference of their depths. */ + if (curr->accepting && fail->maxshift > curr->depth - fail->depth) + fail->maxshift = curr->depth - fail->depth; + } + } + + /* Traverse the trie in level order again, fixing up all nodes whose + shift exceeds their inherited maxshift. */ + for (curr = kwset->trie->next; curr; curr = curr->next) + { + if (curr->maxshift > curr->parent->maxshift) + curr->maxshift = curr->parent->maxshift; + if (curr->shift > curr->maxshift) + curr->shift = curr->maxshift; + } + + /* Create a vector, indexed by character code, of the outgoing links + from the root node. */ + for (i = 0; i < NCHAR; ++i) + next[i] = NULL; + treenext(kwset->trie->links, next); + + if ((trans = kwset->trans) != NULL) + for (i = 0; i < NCHAR; ++i) + kwset->next[i] = next[U(trans[i])]; + else + memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); + /* Check if we can use the simple boyer-moore algorithm, instead of the hairy commentz-walter algorithm. */ if (kwset->words == 1 && kwset->trans == NULL) { - char c; - /* Looking for just one string. Extract it from the trie. */ kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); if (!kwset->target) @@ -410,80 +468,22 @@ kwsprep (kwset_t kws) for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) { kwset->target[i] = curr->links->label; - curr = curr->links->trie; + curr = curr->next; } - /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ - for (i = 0; i < kwset->mind; ++i) - delta[U(kwset->target[i])] = kwset->mind - (i + 1); - /* Find the minimal delta2 shift that we might make after - a backwards match has failed. */ - c = kwset->target[kwset->mind - 1]; - for (i = kwset->mind - 2; i >= 0; --i) - if (kwset->target[i] == c) - break; - kwset->mind2 = kwset->mind - (i + 1); - } - else - { - struct trie *fail; - struct trie *last, *next[NCHAR]; - - /* Traverse the nodes of the trie in level order, simultaneously - computing the delta table, failure function, and shift function. */ - for (curr = last = kwset->trie; curr; curr = curr->next) + /* Looking for the delta2 shift that we might make after a + backwards match has failed. Extract it from the trie. */ + if (kwset->mind > 1) { - /* Enqueue the immediate descendants in the level order queue. */ - enqueue(curr->links, &last); - - curr->shift = kwset->mind; - curr->maxshift = kwset->mind; - - /* Update the delta table for the descendants of this node. */ - treedelta(curr->links, curr->depth, delta); - - /* Compute the failure function for the descendants of this node. */ - treefails(curr->links, curr->fail, kwset->trie); - - /* Update the shifts at each node in the current node's chain - of fails back to the root. */ - for (fail = curr->fail; fail; fail = fail->fail) + kwset->shift = obstack_alloc(&kwset->obstack, + sizeof (*kwset->shift) * (kwset->mind - 1)); + if (!kwset->shift) + return _("memory exhausted"); + for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i) { - /* If the current node has some outgoing edge that the fail - doesn't, then the shift at the fail should be no larger - than the difference of their depths. */ - if (!hasevery(fail->links, curr->links)) - if (curr->depth - fail->depth < fail->shift) - fail->shift = curr->depth - fail->depth; - - /* If the current node is accepting then the shift at the - fail and its descendants should be no larger than the - difference of their depths. */ - if (curr->accepting && fail->maxshift > curr->depth - fail->depth) - fail->maxshift = curr->depth - fail->depth; + kwset->shift[i] = curr->shift; + curr = curr->next; } } - - /* Traverse the trie in level order again, fixing up all nodes whose - shift exceeds their inherited maxshift. */ - for (curr = kwset->trie->next; curr; curr = curr->next) - { - if (curr->maxshift > curr->parent->maxshift) - curr->maxshift = curr->parent->maxshift; - if (curr->shift > curr->maxshift) - curr->shift = curr->maxshift; - } - - /* Create a vector, indexed by character code, of the outgoing links - from the root node. */ - for (i = 0; i < NCHAR; ++i) - next[i] = NULL; - treenext(kwset->trie->links, next); - - if ((trans = kwset->trans) != NULL) - for (i = 0; i < NCHAR; ++i) - kwset->next[i] = next[U(trans[i])]; - else - memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); } /* Fix things up for any translation table. */ @@ -503,7 +503,8 @@ bmexec (kwset_t kws, char const *text, size_t size) struct kwset const *kwset; unsigned char const *d1; char const *ep, *sp, *tp; - int d, gc, i, len, md2; + int d, i, len, skip; + char gc1, gc2; kwset = (struct kwset const *) kws; len = kwset->mind; @@ -520,9 +521,10 @@ bmexec (kwset_t kws, char const *text, size_t size) d1 = kwset->delta; sp = kwset->target + len; - gc = U(sp[-2]); - md2 = kwset->mind2; + gc1 = sp[-1]; + gc2 = sp[-2]; tp = text + len; + skip = 0; /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ if (size > 12 * len) @@ -550,14 +552,33 @@ bmexec (kwset_t kws, char const *text, size_t size) } break; found: - if (U(tp[-2]) == gc) + d = len; + while (1) { - for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) - ; - if (i > len) - return tp - len - text; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } - tp += md2; } /* Now we have only a few characters left to search. We @@ -569,14 +590,33 @@ bmexec (kwset_t kws, char const *text, size_t size) d = d1[U((tp += d)[-1])]; if (d != 0) continue; - if (U(tp[-2]) == gc) + d = len; + while (1) { - for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) - ; - if (i > len) - return tp - len - text; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } - d = md2; } return -1; -- 1.9.1 From 9be91b2268d861ee6b16f281eac6873a7244f6fb Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Thu, 20 Mar 2014 08:07:25 +0900 Subject: [PATCH 2/2] grep: may also use Boyer-Moore algorithm for case-insensitive matching * src/kwset.c (bmexec): Use character translation table. (kwsexec): Call bmexec for case-insensitive matching. (kwsprep): Change the `if' condition. --- src/kwset.c | 157 +++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 38 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index e0bf6b2..920bc7a 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -459,7 +459,7 @@ kwsprep (kwset_t kws) /* Check if we can use the simple boyer-moore algorithm, instead of the hairy commentz-walter algorithm. */ - if (kwset->words == 1 && kwset->trans == NULL) + if (kwset->words == 1) { /* Looking for just one string. Extract it from the trie. */ kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); @@ -505,9 +505,11 @@ bmexec (kwset_t kws, char const *text, size_t size) char const *ep, *sp, *tp; int d, i, len, skip; char gc1, gc2; + char const *trans; kwset = (struct kwset const *) kws; len = kwset->mind; + trans = kwset->trans; if (len == 0) return 0; @@ -515,15 +517,30 @@ bmexec (kwset_t kws, char const *text, size_t size) return -1; if (len == 1) { + if (trans) + { + for (tp = text; tp < text + size; tp++) + if (trans[U(*tp)] == kwset->target[0]) + break; + return tp < text + size ? tp - text : -1; + } tp = memchr (text, kwset->target[0], size); return tp ? tp - text : -1; } d1 = kwset->delta; sp = kwset->target + len; - gc1 = sp[-1]; - gc2 = sp[-2]; tp = text + len; + if (trans) + { + gc1 = trans[U(sp[-1])]; + gc2 = trans[U(sp[-2])]; + } + else + { + gc1 = sp[-1]; + gc2 = sp[-2]; + } skip = 0; /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ @@ -553,30 +570,62 @@ bmexec (kwset_t kws, char const *text, size_t size) break; found: d = len; - while (1) + if (trans) { - if (tp[-2] == gc2) + while (1) { - for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) - ; - if (i > d) + if (trans[U(tp[-2])] == gc2) { - for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + for (i = 3; i <= d && trans[U(tp[-i])] == trans[U(sp[-i])]; ++i) ; - if (i > len) - return tp - len - text; + if (i > d) + { + for (i = d + skip + 1; i <= len && trans[U(tp[-i])] == trans[U(sp[-i])]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || trans[U(tp[-1])] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || trans[U(tp[-1])] != gc1) + break; + skip = 1; } - d = kwset->shift[i - 2]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = i - 1; } - else + } + else + { + while (1) { - d = kwset->shift[0]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = 1; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } } } @@ -591,30 +640,62 @@ bmexec (kwset_t kws, char const *text, size_t size) if (d != 0) continue; d = len; - while (1) + if (trans) { - if (tp[-2] == gc2) + while (1) { - for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) - ; - if (i > d) + if (trans[U(tp[-2])] == gc2) { - for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + for (i = 3; i <= d && trans[U(tp[-i])] == trans[U(sp[-i])]; ++i) ; - if (i > len) - return tp - len - text; + if (i > d) + { + for (i = d + skip + 1; i <= len && trans[U(tp[-i])] == trans[U(sp[-i])]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || trans[U(tp[-1])] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || trans[U(tp[-1])] != gc1) + break; + skip = 1; } - d = kwset->shift[i - 2]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = i - 1; } - else + } + else + { + while (1) { - d = kwset->shift[0]; tp += d; - if (tp > ep || tp[-1] != gc1) - break; - skip = 1; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } } } @@ -787,7 +868,7 @@ size_t kwsexec (kwset_t kws, char const *text, size_t size, struct kwsmatch *kwsmatch) { struct kwset const *kwset = (struct kwset *) kws; - if (kwset->words == 1 && kwset->trans == NULL) + if (kwset->words == 1) { size_t ret = bmexec (kws, text, size); if (ret != (size_t) -1) -- 1.9.1