LCOV - code coverage report
Current view: top level - lisp/emacs-lisp - regexp-opt.el (source / functions) Hit Total Coverage
Test: tramp-tests.info Lines: 97 114 85.1 %
Date: 2017-08-27 09:44:50 Functions: 6 7 85.7 %

          Line data    Source code
       1             : ;;; regexp-opt.el --- generate efficient regexps to match strings -*- lexical-binding: t -*-
       2             : 
       3             : ;; Copyright (C) 1994-2017 Free Software Foundation, Inc.
       4             : 
       5             : ;; Author: Simon Marshall <simon@gnu.org>
       6             : ;; Maintainer: emacs-devel@gnu.org
       7             : ;; Keywords: strings, regexps, extensions
       8             : 
       9             : ;; This file is part of GNU Emacs.
      10             : 
      11             : ;; GNU Emacs is free software: you can redistribute it and/or modify
      12             : ;; it under the terms of the GNU General Public License as published by
      13             : ;; the Free Software Foundation, either version 3 of the License, or
      14             : ;; (at your option) any later version.
      15             : 
      16             : ;; GNU Emacs is distributed in the hope that it will be useful,
      17             : ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             : ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19             : ;; GNU General Public License for more details.
      20             : 
      21             : ;; You should have received a copy of the GNU General Public License
      22             : ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
      23             : 
      24             : ;;; Commentary:
      25             : 
      26             : ;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i[sz]e\\)".
      27             : ;;
      28             : ;; This package generates a regexp from a given list of strings (which matches
      29             : ;; one of those strings) so that the regexp generated by:
      30             : ;;
      31             : ;; (regexp-opt strings)
      32             : ;;
      33             : ;; is equivalent to, but more efficient than, the regexp generated by:
      34             : ;;
      35             : ;; (mapconcat 'regexp-quote strings "\\|")
      36             : ;;
      37             : ;; For example:
      38             : ;;
      39             : ;; (let ((strings '("cond" "if" "when" "unless" "while"
      40             : ;;                  "let" "let*" "progn" "prog1" "prog2"
      41             : ;;                  "save-restriction" "save-excursion" "save-window-excursion"
      42             : ;;                  "save-current-buffer" "save-match-data"
      43             : ;;                  "catch" "throw" "unwind-protect" "condition-case")))
      44             : ;;   (concat "(" (regexp-opt strings t) "\\>"))
      45             : ;;  => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>"
      46             : ;;
      47             : ;; Searching using the above example `regexp-opt' regexp takes approximately
      48             : ;; two-thirds of the time taken using the equivalent `mapconcat' regexp.
      49             : 
      50             : ;; Since this package was written to produce efficient regexps, not regexps
      51             : ;; efficiently, it is probably not a good idea to in-line too many calls in
      52             : ;; your code, unless you use the following trick with `eval-when-compile':
      53             : ;;
      54             : ;; (defvar definition-regexp
      55             : ;;   (eval-when-compile
      56             : ;;     (concat "^("
      57             : ;;             (regexp-opt '("defun" "defsubst" "defmacro" "defalias"
      58             : ;;                           "defvar" "defconst") t)
      59             : ;;             "\\>")))
      60             : ;;
      61             : ;; The `byte-compile' code will be as if you had defined the variable thus:
      62             : ;;
      63             : ;; (defvar definition-regexp
      64             : ;;   "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>")
      65             : ;;
      66             : ;; Note that if you use this trick for all instances of `regexp-opt' and
      67             : ;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded
      68             : ;; at compile time.  But note also that using this trick means that should
      69             : ;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to
      70             : ;; improve the efficiency of `regexp-opt' regexps, you would have to recompile
      71             : ;; your code for such changes to have effect in your code.
      72             : 
      73             : ;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
      74             : ;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and
      75             : ;; Stefan Monnier.
      76             : ;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas
      77             : ;; or any other information to improve things are welcome.
      78             : ;;
      79             : ;; One possible improvement would be to compile '("aa" "ab" "ba" "bb")
      80             : ;; into "[ab][ab]" rather than "a[ab]\\|b[ab]".  I'm not sure it's worth
      81             : ;; it but if someone knows how to do it without going through too many
      82             : ;; contortions, I'm all ears.
      83             : 
      84             : ;;; Code:
      85             : 
      86             : ;;;###autoload
      87             : (defun regexp-opt (strings &optional paren)
      88             :   "Return a regexp to match a string in the list STRINGS.
      89             : Each string should be unique in STRINGS and should not contain
      90             : any regexps, quoted or not.  Optional PAREN specifies how the
      91             : returned regexp is surrounded by grouping constructs.
      92             : 
      93             : The optional argument PAREN can be any of the following:
      94             : 
      95             : a string
      96             :     the resulting regexp is preceded by PAREN and followed by
      97             :     \\), e.g.  use \"\\\\(?1:\" to produce an explicitly numbered
      98             :     group.
      99             : 
     100             : `words'
     101             :     the resulting regexp is surrounded by \\=\\<\\( and \\)\\>.
     102             : 
     103             : `symbols'
     104             :     the resulting regexp is surrounded by \\_<\\( and \\)\\_>.
     105             : 
     106             : non-nil
     107             :     the resulting regexp is surrounded by \\( and \\).
     108             : 
     109             : nil
     110             :     the resulting regexp is surrounded by \\(?: and \\), if it is
     111             :     necessary to ensure that a postfix operator appended to it will
     112             :     apply to the whole expression.
     113             : 
     114             : The resulting regexp is equivalent to but usually more efficient
     115             : than that of a simplified version:
     116             : 
     117             :  (defun simplified-regexp-opt (strings &optional paren)
     118             :    (let ((parens
     119             :           (cond ((stringp paren)       (cons paren \"\\\\)\"))
     120             :                 ((eq paren 'words)    '(\"\\\\\\=<\\\\(\" . \"\\\\)\\\\>\"))
     121             :                 ((eq paren 'symbols) '(\"\\\\_<\\\\(\" . \"\\\\)\\\\_>\"))
     122             :                 ((null paren)          '(\"\\\\(?:\" . \"\\\\)\"))
     123             :                 (t                       '(\"\\\\(\" . \"\\\\)\")))))
     124             :      (concat (car paren)
     125             :              (mapconcat 'regexp-quote strings \"\\\\|\")
     126             :              (cdr paren))))"
     127         207 :   (save-match-data
     128             :     ;; Recurse on the sorted list.
     129         207 :     (let* ((max-lisp-eval-depth 10000)
     130             :            (max-specpdl-size 10000)
     131             :            (completion-ignore-case nil)
     132             :            (completion-regexp-list nil)
     133         207 :            (open (cond ((stringp paren) paren) (paren "\\(")))
     134         207 :            (sorted-strings (delete-dups
     135         207 :                             (sort (copy-sequence strings) 'string-lessp)))
     136         207 :            (re (regexp-opt-group sorted-strings (or open t) (not open))))
     137         207 :       (cond ((eq paren 'words)
     138           0 :              (concat "\\<" re "\\>"))
     139         207 :             ((eq paren 'symbols)
     140           0 :              (concat "\\_<" re "\\_>"))
     141         207 :             (t re)))))
     142             : 
     143             : ;;;###autoload
     144             : (defun regexp-opt-depth (regexp)
     145             :   "Return the depth of REGEXP.
     146             : This means the number of non-shy regexp grouping constructs
     147             : \(parenthesized expressions) in REGEXP."
     148           0 :   (save-match-data
     149             :     ;; Hack to signal an error if REGEXP does not have balanced parentheses.
     150           0 :     (string-match regexp "")
     151             :     ;; Count the number of open parentheses in REGEXP.
     152           0 :     (let ((count 0) start last)
     153           0 :       (while (string-match "\\\\(\\(\\?[0-9]*:\\)?" regexp start)
     154           0 :         (setq start (match-end 0))            ; Start of next search.
     155           0 :         (when (and (not (match-beginning 1))
     156           0 :                    (subregexp-context-p regexp (match-beginning 0) last))
     157             :           ;; It's not a shy group and it's not inside brackets or after
     158             :           ;; a backslash: it's really a group-open marker.
     159           0 :           (setq last start)         ; Speed up next regexp-opt-re-context-p.
     160           0 :           (setq count (1+ count))))
     161           0 :       count)))
     162             : 
     163             : ;;; Workhorse functions.
     164             : 
     165             : (defun regexp-opt-group (strings &optional paren lax)
     166             :   "Return a regexp to match a string in the sorted list STRINGS.
     167             : If PAREN non-nil, output regexp parentheses around returned regexp.
     168             : If LAX non-nil, don't output parentheses if it doesn't require them.
     169             : Merges keywords to avoid backtracking in Emacs's regexp matcher."
     170             :   ;; The basic idea is to find the shortest common prefix or suffix, remove it
     171             :   ;; and recurse.  If there is no prefix, we divide the list into two so that
     172             :   ;; (at least) one half will have at least a one-character common prefix.
     173             : 
     174             :   ;; Also we delay the addition of grouping parenthesis as long as possible
     175             :   ;; until we're sure we need them, and try to remove one-character sequences
     176             :   ;; so we can use character sets rather than grouping parenthesis.
     177        4883 :   (let* ((open-group (cond ((stringp paren) paren) (paren "\\(?:") (t "")))
     178        4883 :          (close-group (if paren "\\)" ""))
     179        4883 :          (open-charset (if lax "" open-group))
     180        4883 :          (close-charset (if lax "" close-group)))
     181        4883 :     (cond
     182             :      ;;
     183             :      ;; If there are no strings, just return the empty string.
     184        4883 :      ((= (length strings) 0)
     185             :       "")
     186             :      ;;
     187             :      ;; If there is only one string, just return it.
     188        4883 :      ((= (length strings) 1)
     189        1569 :       (if (= (length (car strings)) 1)
     190         287 :           (concat open-charset (regexp-quote (car strings)) close-charset)
     191        1569 :         (concat open-group (regexp-quote (car strings)) close-group)))
     192             :      ;;
     193             :      ;; If there is an empty string, remove it and recurse on the rest.
     194        3314 :      ((= (length (car strings)) 0)
     195         259 :       (concat open-charset
     196         259 :               (regexp-opt-group (cdr strings) t t) "?"
     197         259 :               close-charset))
     198             :      ;;
     199             :      ;; If there are several one-char strings, use charsets
     200        3055 :      ((and (= (length (car strings)) 1)
     201         503 :            (let ((strs (cdr strings)))
     202        2157 :              (while (and strs (/= (length (car strs)) 1))
     203        3308 :                (pop strs))
     204        3055 :              strs))
     205         277 :       (let (letters rest)
     206             :         ;; Collect one-char strings
     207         277 :         (dolist (s strings)
     208        3752 :           (if (= (length s) 1) (push (string-to-char s) letters) (push s rest)))
     209             : 
     210         277 :         (if rest
     211             :             ;; several one-char strings: take them and recurse
     212             :             ;; on the rest (first so as to match the longest).
     213          56 :             (concat open-group
     214          56 :                     (regexp-opt-group (nreverse rest))
     215          56 :                     "\\|" (regexp-opt-charset letters)
     216          56 :                     close-group)
     217             :           ;; all are one-char strings: just return a character set.
     218         221 :           (concat open-charset
     219         221 :                   (regexp-opt-charset letters)
     220         277 :                   close-charset))))
     221             :      ;;
     222             :      ;; We have a list of different length strings.
     223             :      (t
     224        2778 :       (let ((prefix (try-completion "" strings)))
     225        2778 :         (if (> (length prefix) 0)
     226             :             ;; common prefix: take it and recurse on the suffixes.
     227         936 :             (let* ((n (length prefix))
     228        5492 :                    (suffixes (mapcar (lambda (s) (substring s n)) strings)))
     229         936 :               (concat open-group
     230         936 :                       (regexp-quote prefix)
     231         936 :                       (regexp-opt-group suffixes t t)
     232         936 :                       close-group))
     233             : 
     234        1842 :           (let* ((sgnirts (mapcar #'reverse strings))
     235        1842 :                  (xiffus (try-completion "" sgnirts)))
     236        1842 :             (if (> (length xiffus) 0)
     237             :                 ;; common suffix: take it and recurse on the prefixes.
     238         259 :                 (let* ((n (- (length xiffus)))
     239             :                        (prefixes
     240             :                         ;; Sorting is necessary in cases such as ("ad" "d").
     241         889 :                         (sort (mapcar (lambda (s) (substring s 0 n)) strings)
     242         259 :                               'string-lessp)))
     243         259 :                   (concat open-group
     244         259 :                           (regexp-opt-group prefixes t t)
     245         259 :                           (regexp-quote (nreverse xiffus))
     246         259 :                           close-group))
     247             : 
     248             :               ;; Otherwise, divide the list into those that start with a
     249             :               ;; particular letter and those that do not, and recurse on them.
     250        1583 :               (let* ((char (substring-no-properties (car strings) 0 1))
     251        1583 :                      (half1 (all-completions char strings))
     252        1583 :                      (half2 (nthcdr (length half1) strings)))
     253        1583 :                 (concat open-group
     254        1583 :                         (regexp-opt-group half1)
     255        1583 :                         "\\|" (regexp-opt-group half2)
     256        4883 :                         close-group))))))))))
     257             : 
     258             : 
     259             : (defun regexp-opt-charset (chars)
     260             :   "Return a regexp to match a character in CHARS.
     261             : CHARS should be a list of characters."
     262             :   ;; The basic idea is to find character ranges.  Also we take care in the
     263             :   ;; position of character set meta characters in the character set regexp.
     264             :   ;;
     265         284 :   (let* ((charmap (make-char-table 'regexp-opt-charset))
     266             :          (start -1) (end -2)
     267             :          (charset "")
     268             :          (bracket "") (dash "") (caret ""))
     269             :     ;;
     270             :     ;; Make a character map but extract character set meta characters.
     271         284 :     (dolist (char chars)
     272       21104 :       (cond
     273       21104 :        ((eq char ?\])
     274           0 :         (setq bracket "]"))
     275       21104 :        ((eq char ?^)
     276           0 :         (setq caret "^"))
     277       21104 :        ((eq char ?-)
     278           0 :         (setq dash "-"))
     279             :        (t
     280       21104 :         (aset charmap char t))))
     281             :     ;;
     282             :     ;; Make a character set from the map using ranges where applicable.
     283         284 :     (map-char-table
     284             :      (lambda (c v)
     285        2472 :        (when v
     286        2472 :          (if (consp c)
     287        1294 :              (if (= (1- (car c)) end) (setq end (cdr c))
     288        1294 :                (if (> end (+ start 2))
     289         488 :                    (setq charset (format "%s%c-%c" charset start end))
     290        1773 :                  (while (>= end start)
     291         967 :                    (setq charset (format "%s%c" charset start))
     292        1294 :                    (setq start (1+ start))))
     293        1294 :                (setq start (car c) end (cdr c)))
     294        1178 :            (if (= (1- c) end) (setq end c)
     295        1178 :              (if (> end (+ start 2))
     296         155 :                (setq charset (format "%s%c-%c" charset start end))
     297        2191 :              (while (>= end start)
     298        1168 :                (setq charset (format "%s%c" charset start))
     299        1178 :                (setq start (1+ start))))
     300        2472 :              (setq start c end c)))))
     301         284 :      charmap)
     302         284 :     (when (>= end start)
     303         284 :       (if (> end (+ start 2))
     304           4 :           (setq charset (format "%s%c-%c" charset start end))
     305         757 :         (while (>= end start)
     306         477 :           (setq charset (format "%s%c" charset start))
     307         477 :           (setq start (1+ start)))))
     308             :     ;;
     309             :     ;; Make sure a caret is not first and a dash is first or last.
     310         284 :     (if (and (string-equal charset "") (string-equal bracket ""))
     311           0 :         (if (string-equal dash "")
     312             :             "\\^"                       ; [^] is not a valid regexp
     313           0 :           (concat "[" dash caret "]"))
     314         284 :       (concat "[" bracket charset caret dash "]"))))
     315             : 
     316             : (provide 'regexp-opt)
     317             : 
     318             : ;;; regexp-opt.el ends here

Generated by: LCOV version 1.12