[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH] Exclude optimization
From: |
Sergey Poznyakoff |
Subject: |
[PATCH] Exclude optimization |
Date: |
Mon, 10 Aug 2009 00:25:51 +0300 |
Hello,
The proposed patch considerably speeds up the exclude module
for large exclusion lists of non-wildcard patterns. OK to push?
>From 5421774438de3a67d89f988a0cd735e19a4cafd4 Mon Sep 17 00:00:00 2001
From: Sergey Poznyakoff <address@hidden>
Date: Mon, 10 Aug 2009 00:14:45 +0300
Subject: [PATCH] Optimize exclude: use hash tables for non-wildcard patterns.
* lib/exclude.c: Include hash.h and mbuiter.h
(struct exclude_pattern, exclude_segment): New data types.
(struct exclude): Rewrite.
(is_fnmatch_pattern): New function.
(new_exclude_segment, free_exclude_segment): New functions.
(excluded_file_pattern_p, excluded_file_name_p): New functions.
(excluded_file_name, add_exclude): Rewrite using new struct exclude.
* lib/exclude.h (is_fnmatch_pattern): New prototype.
* modules/exclude: Depend on hash and mbuiter.
---
lib/exclude.c | 335 +++++++++++++++++++++++++++++++++++++++++++++++++------
lib/exclude.h | 7 +-
modules/exclude | 2 +
3 files changed, 310 insertions(+), 34 deletions(-)
diff --git a/lib/exclude.c b/lib/exclude.c
index f38abf2..d3af524 100644
--- a/lib/exclude.c
+++ b/lib/exclude.c
@@ -1,7 +1,7 @@
/* exclude.c -- exclude file names
Copyright (C) 1992, 1993, 1994, 1997, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ 2004, 2005, 2006, 2007, 2009 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,7 +16,10 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
-/* Written by Paul Eggert <address@hidden> */
+/* Written by Paul Eggert <address@hidden>
+ and Sergey Poznyakoff <address@hidden>.
+ Thanks to Phil Proudman <address@hidden>
+ for improvement suggestions. */
#include <config.h>
@@ -30,6 +33,8 @@
#include <string.h>
#include "exclude.h"
+#include "hash.h"
+#include "mbuiter.h"
#include "fnmatch.h"
#include "xalloc.h"
#include "verify.h"
@@ -54,6 +59,14 @@ verify (((EXCLUDE_ANCHORED | EXCLUDE_INCLUDE | EXCLUDE_WILDCARDS)
| FNM_CASEFOLD | FNM_EXTMATCH))
== 0);
+
+/* Exclusion patterns are grouped into a singly-linked list of
+ "exclusion segments". Each segment represents a set of patterns
+ that can be matched using the same algorithm. Non-wildcard
+ patterns are kept in hash tables, to speed up searches. Wildcard
+ patterns are stored as arrays of patterns. */
+
+
/* An exclude pattern-options pair. The options are fnmatch options
ORed with EXCLUDE_* options. */
@@ -63,15 +76,56 @@ struct patopts
int options;
};
-/* An exclude list, of pattern-options pairs. */
+/* An array of pattern-options pairs. */
-struct exclude
+struct exclude_pattern
{
struct patopts *exclude;
size_t exclude_alloc;
size_t exclude_count;
};
+enum exclude_type
+ {
+ exclude_hash, /* a hash table of excluded names */
+ exclude_pattern /* an array of exclude patterns */
+ };
+
+struct exclude_segment
+ {
+ struct exclude_segment *next; /* next segment in list */
+ enum exclude_type type; /* type of this segment */
+ int options; /* common options for this segment */
+ union
+ {
+ Hash_table *table; /* for type == exclude_hash */
+ struct exclude_pattern pat; /* for type == exclude_pattern */
+ } v;
+ };
+
+/* The exclude structure keeps a singly-linked list of exclude segments */
+struct exclude
+ {
+ struct exclude_segment *head, *tail;
+ };
+
+/* Return true if str is a fnmatch pattern */
+bool
+is_fnmatch_pattern (const char *str)
+{
+ while (*str)
+ {
+ size_t n = strcspn (str, "?*[]");
+ if (str[n] == 0)
+ break;
+ else if (n > 0 && str[n-1] == '\\')
+ str += n + 1;
+ else
+ return true;
+ }
+ return false;
+}
+
/* Return a newly allocated and empty exclude list. */
struct exclude *
@@ -80,12 +134,122 @@ new_exclude (void)
return xzalloc (sizeof *new_exclude ());
}
-/* Free the storage associated with an exclude list. */
+/* Calculate the hash of string. */
+static size_t
+string_hasher (void const *data, size_t n_buckets)
+{
+ char const *p = data;
+ return hash_string (p, n_buckets);
+}
+
+/* Ditto, for case-insensitive hashes */
+static size_t
+string_hasher_ci (void const *data, size_t n_buckets)
+{
+ char const *p = data;
+ mbui_iterator_t iter;
+ size_t value = 0;
+
+ for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter))
+ {
+ mbchar_t m = mbui_cur (iter);
+ wchar_t wc;
+
+ if (m.wc_valid)
+ wc = towlower (m.wc);
+ else
+ wc = *m.ptr;
+
+ value = (value * 31 + wc) % n_buckets;
+ }
+
+ return value;
+}
+
+/* compare two strings for equality */
+static bool
+string_compare (void const *data1, void const *data2)
+{
+ char const *p1 = data1;
+ char const *p2 = data2;
+ return strcmp (p1, p2) == 0;
+}
+
+/* compare two strings for equality, case-insensitive */
+static bool
+string_compare_ci (void const *data1, void const *data2)
+{
+ char const *p1 = data1;
+ char const *p2 = data2;
+ return mbscasecmp (p1, p2) == 0;
+}
+
+static void
+string_free (void *data)
+{
+ free (data);
+}
+
+/* Create new exclude segment of given TYPE and OPTIONS, and attach it
+ to the tail of list in EX */
+struct exclude_segment *
+new_exclude_segment (struct exclude *ex, enum exclude_type type, int options)
+{
+ struct exclude_segment *sp = xzalloc (sizeof (struct exclude_segment));
+ sp->type = type;
+ sp->options = options;
+ switch (type)
+ {
+ case exclude_pattern:
+ break;
+
+ case exclude_hash:
+ sp->v.table = hash_initialize (0, NULL,
+ (options & FNM_CASEFOLD) ?
+ string_hasher_ci
+ : string_hasher,
+ (options & FNM_CASEFOLD) ?
+ string_compare_ci
+ : string_compare,
+ string_free);
+ break;
+ }
+ if (ex->tail)
+ ex->tail->next = sp;
+ else
+ ex->head = sp;
+ ex->tail = sp;
+ return sp;
+}
+
+/* Free a single exclude segment */
+static void
+free_exclude_segment (struct exclude_segment *seg)
+{
+ switch (seg->type)
+ {
+ case exclude_pattern:
+ free (seg->v.pat.exclude);
+ break;
+ case exclude_hash:
+ hash_free (seg->v.table);
+ break;
+ }
+ free (seg);
+}
+
+/* Free the storage associated with an exclude list. */
void
free_exclude (struct exclude *ex)
{
- free (ex->exclude);
+ struct exclude_segment *seg;
+ for (seg = ex->head; seg; )
+ {
+ struct exclude_segment *next = seg->next;
+ free_exclude_segment (seg);
+ seg = next;
+ }
free (ex);
}
@@ -155,36 +319,113 @@ exclude_fnmatch (char const *pattern, char const *f, int options)
return matched;
}
+/* Return true if the exclude_pattern segment SEG excludes F. */
+
+bool
+excluded_file_pattern_p (struct exclude_segment const *seg, char const *f)
+{
+ size_t exclude_count = seg->v.pat.exclude_count;
+ struct patopts const *exclude = seg->v.pat.exclude;
+ size_t i;
+ bool excluded = !! (exclude[0].options & EXCLUDE_INCLUDE);
+
+ /* Scan through the options, until they change excluded */
+ for (i = 0; i < exclude_count; i++)
+ {
+ char const *pattern = exclude[i].pattern;
+ int options = exclude[i].options;
+ if (excluded != exclude_fnmatch (pattern, f, options))
+ return !excluded;
+ }
+ return excluded;
+}
+
+/* Return true if the exclude_hash segment SEG excludes F.
+ BUFFER is an auxiliary storage of the same length as F (with nul
+ terminator included) */
+bool
+excluded_file_name_p (struct exclude_segment const *seg, char const *f,
+ char *buffer)
+{
+ int options = seg->options;
+ bool excluded = !! (options & EXCLUDE_INCLUDE);
+ Hash_table *table = seg->v.table;
+
+ do
+ {
+ /* initialize the pattern */
+ strcpy (buffer, f);
+
+ while (1)
+ {
+ if (hash_lookup (table, buffer))
+ return !excluded;
+ if (options & FNM_LEADING_DIR)
+ {
+ char *p = strrchr (buffer, '/');
+ if (p)
+ {
+ *p = 0;
+ continue;
+ }
+ }
+ break;
+ }
+
+ if (!(options & EXCLUDE_ANCHORED))
+ {
+ f = strchr (f, '/');
+ if (f)
+ f++;
+ }
+ else
+ break;
+ }
+ while (f);
+ return excluded;
+}
+
/* Return true if EX excludes F. */
bool
excluded_file_name (struct exclude const *ex, char const *f)
{
- size_t exclude_count = ex->exclude_count;
+ struct exclude_segment *seg;
+ bool excluded;
+ char *filename = NULL;
- /* If no options are given, the default is to include. */
- if (exclude_count == 0)
+ /* If no patterns are given, the default is to include. */
+ if (!ex->head)
return false;
- else
- {
- struct patopts const *exclude = ex->exclude;
- size_t i;
- /* Otherwise, the default is the opposite of the first option. */
- bool excluded = !! (exclude[0].options & EXCLUDE_INCLUDE);
+ /* Otherwise, the default is the opposite of the first option. */
+ excluded = !! (ex->head->options & EXCLUDE_INCLUDE);
+ /* Scan through the segments, seeing whether they change status from
+ excluded to included or vice versa. */
+ for (seg = ex->head; seg; seg = seg->next)
+ {
+ bool rc;
- /* Scan through the options, seeing whether they change F from
- excluded to included or vice versa. */
- for (i = 0; i < exclude_count; i++)
+ switch (seg->type)
{
- char const *pattern = exclude[i].pattern;
- int options = exclude[i].options;
- if (excluded == !! (options & EXCLUDE_INCLUDE))
- excluded ^= exclude_fnmatch (pattern, f, options);
+ case exclude_pattern:
+ rc = excluded_file_pattern_p (seg, f);
+ break;
+
+ case exclude_hash:
+ if (!filename)
+ filename = xmalloc (strlen (f) + 1);
+ rc = excluded_file_name_p (seg, f, filename);
+ break;
+ }
+ if (rc != excluded)
+ {
+ excluded = rc;
+ break;
}
-
- return excluded;
}
+ free (filename);
+ return excluded;
}
/* Append to EX the exclusion PATTERN with OPTIONS. */
@@ -192,15 +433,45 @@ excluded_file_name (struct exclude const *ex, char const *f)
void
add_exclude (struct exclude *ex, char const *pattern, int options)
{
- struct patopts *patopts;
-
- if (ex->exclude_count == ex->exclude_alloc)
- ex->exclude = x2nrealloc (ex->exclude, &ex->exclude_alloc,
- sizeof *ex->exclude);
+ struct exclude_segment *seg;
- patopts = &ex->exclude[ex->exclude_count++];
- patopts->pattern = pattern;
- patopts->options = options;
+ if ((options & EXCLUDE_WILDCARDS) && is_fnmatch_pattern (pattern))
+ {
+ struct exclude_pattern *pat;
+ struct patopts *patopts;
+
+ if (ex->tail && ex->tail->type == exclude_pattern
+ && ((ex->tail->options & EXCLUDE_INCLUDE) ==
+ (options & EXCLUDE_INCLUDE)))
+ seg = ex->tail;
+ else
+ seg = new_exclude_segment (ex, exclude_pattern, options);
+
+ pat = &seg->v.pat;
+ if (pat->exclude_count == pat->exclude_alloc)
+ pat->exclude = x2nrealloc (pat->exclude, &pat->exclude_alloc,
+ sizeof *pat->exclude);
+ patopts = &pat->exclude[pat->exclude_count++];
+ patopts->pattern = pattern;
+ patopts->options = options;
+ }
+ else
+ {
+ char *str, *p;
+#define EXCLUDE_HASH_FLAGS (EXCLUDE_INCLUDE|EXCLUDE_ANCHORED|\
+ FNM_LEADING_DIR|FNM_CASEFOLD)
+ if (ex->tail && ex->tail->type == exclude_hash
+ && ((ex->tail->options & EXCLUDE_HASH_FLAGS) ==
+ (options & EXCLUDE_HASH_FLAGS)))
+ seg = ex->tail;
+ else
+ seg = new_exclude_segment (ex, exclude_hash, options);
+
+ str = xstrdup (pattern);
+ p = hash_insert (seg->v.table, str);
+ if (p != str)
+ free (str);
+ }
}
/* Use ADD_FUNC to append to EX the patterns in FILE_NAME, each with
diff --git a/lib/exclude.h b/lib/exclude.h
index 7d03bc1..b15e6db 100644
--- a/lib/exclude.h
+++ b/lib/exclude.h
@@ -1,7 +1,7 @@
/* exclude.h -- declarations for excluding file names
Copyright (C) 1992, 1993, 1994, 1997, 1999, 2001, 2002, 2003, 2005,
- 2006 Free Software Foundation, Inc.
+ 2006, 2009 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,7 +16,8 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
-/* Written by Paul Eggert <address@hidden> */
+/* Written by Paul Eggert <address@hidden>
+ and Sergey Poznyakoff <address@hidden> */
/* Exclude options, which can be ORed with fnmatch options. */
@@ -33,6 +34,8 @@
struct exclude;
+bool is_fnmatch_pattern (const char *);
+
struct exclude *new_exclude (void);
void free_exclude (struct exclude *);
void add_exclude (struct exclude *, char const *, int);
diff --git a/modules/exclude b/modules/exclude
index 3d5aaef..7ff3c14 100644
--- a/modules/exclude
+++ b/modules/exclude
@@ -8,7 +8,9 @@ m4/exclude.m4
Depends-on:
fnmatch
+hash
mbscasecmp
+mbuiter
stdbool
verify
xalloc
--
1.6.0
- [PATCH] Exclude optimization,
Sergey Poznyakoff <=