[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
emacs-27 49d3cd9: rx: Improve 'or' compositionality (bug#37659)
From: |
Mattias Engdegård |
Subject: |
emacs-27 49d3cd9: rx: Improve 'or' compositionality (bug#37659) |
Date: |
Sun, 1 Mar 2020 05:09:34 -0500 (EST) |
branch: emacs-27
commit 49d3cd90bd80a225d5ec26027318ffb4606ff513
Author: Mattias Engdegård <address@hidden>
Commit: Mattias Engdegård <address@hidden>
rx: Improve 'or' compositionality (bug#37659)
Perform 'regexp-opt' on nested 'or' forms, and after expansion of
user-defined and 'eval' forms. Characters are now turned into strings
for wider 'regexp-opt' scope. This preserves the longest-match
semantics for string in 'or' forms over composition.
* doc/lispref/searching.texi (Rx Constructs): Document.
* lisp/emacs-lisp/rx.el (rx--normalise-or-arg)
(rx--all-string-or-args): New.
(rx--translate-or): Normalise arguments first, and check for strings
in subforms.
(rx--expand-eval): Extracted from rx--translate-eval.
(rx--translate-eval): Call rx--expand-eval.
* test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests.
* etc/NEWS: Announce.
---
doc/lispref/searching.texi | 5 +--
etc/NEWS | 6 ++++
lisp/emacs-lisp/rx.el | 76 +++++++++++++++++++++++++---------------
test/lisp/emacs-lisp/rx-tests.el | 13 ++++++-
4 files changed, 69 insertions(+), 31 deletions(-)
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index a4d5a27..1a090eb 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -1086,8 +1086,9 @@ Corresponding string regexp: @samp{@var{A}@var{B}@dots{}}
@itemx @code{(| @var{rx}@dots{})}
@cindex @code{|} in rx
Match exactly one of the @var{rx}s.
-If all arguments are string literals, the longest possible match
-will always be used. Otherwise, either the longest match or the
+If all arguments are strings, characters, or @code{or} forms
+so constrained, the longest possible match will always be used.
+Otherwise, either the longest match or the
first (in left-to-right order) will be used.
Without arguments, the expression will not match anything at all.@*
Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}.
diff --git a/etc/NEWS b/etc/NEWS
index e9dfd26..6e2b1fe 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -2325,6 +2325,12 @@ expressions from simpler parts.
+++
*** 'not' argument can now be a character or single-char string.
++++
+*** Nested 'or' forms of strings guarantee a longest match.
+For example, (or (or "IN" "OUT") (or "INPUT" "OUTPUT")) now matches
+the whole string "INPUT" if present, not just "IN". Previously, this
+was only guaranteed inside a single 'or' form of string literals.
+
** Frames
+++
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 1ee5e82..a0b2444 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary
function F."
(setq l (cdr l)))
x)
+(defun rx--normalise-or-arg (form)
+ "Normalise the `or' argument FORM.
+Characters become strings, user-definitions and `eval' forms are expanded,
+and `or' forms are normalised recursively."
+ (cond ((characterp form)
+ (char-to-string form))
+ ((and (consp form) (memq (car form) '(or |)))
+ (cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
+ ((and (consp form) (eq (car form) 'eval))
+ (rx--normalise-or-arg (rx--expand-eval (cdr form))))
+ (t
+ (let ((expanded (rx--expand-def form)))
+ (if expanded
+ (rx--normalise-or-arg expanded)
+ form)))))
+
+(defun rx--all-string-or-args (body)
+ "If BODY only consists of strings or such `or' forms, return all the strings.
+Otherwise throw `rx--nonstring'."
+ (mapcan (lambda (form)
+ (cond ((stringp form) (list form))
+ ((and (consp form) (memq (car form) '(or |)))
+ (rx--all-string-or-args (cdr form)))
+ (t (throw 'rx--nonstring nil))))
+ body))
+
(defun rx--translate-or (body)
"Translate an or-pattern of zero or more rx items.
Return (REGEXP . PRECEDENCE)."
;; FIXME: Possible improvements:
;;
- ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
- ;; so that they can be candidates for regexp-opt.
- ;;
- ;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
- ;;
;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
- ;; in order to improve effectiveness of regexp-opt.
- ;; This would also help composability.
- ;;
- ;; - Use associativity to run regexp-opt on contiguous subsets of arguments
- ;; if not all of them are strings. Example:
+ ;; Then call regexp-opt on runs of string arguments. Example:
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
;;
@@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)."
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
;; -> (any "@" "%" digit "A-Z" space word)
;; -> "[A-Z@%[:digit:][:space:][:word:]]"
- ;;
- ;; Problem: If a subpattern is carefully written to be
- ;; optimizable by regexp-opt, how do we prevent the transforms
- ;; above from destroying that property?
- ;; Example: (or "a" (or "abc" "abd" "abe"))
(cond
((null body) ; No items: a never-matching regexp.
(rx--empty))
((null (cdr body)) ; Single item.
(rx--translate (car body)))
- ((rx--every #'stringp body) ; All strings.
- (cons (list (regexp-opt body nil))
- t))
- ((rx--every #'rx--charset-p body) ; All charsets.
- (rx--translate-union nil body))
(t
- (cons (append (car (rx--translate (car body)))
- (mapcan (lambda (item)
- (cons "\\|" (car (rx--translate item))))
- (cdr body)))
- nil))))
+ (let* ((args (mapcar #'rx--normalise-or-arg body))
+ (all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
+ (cond
+ (all-strings ; Only strings.
+ (cons (list (regexp-opt all-strings nil))
+ t))
+ ((rx--every #'rx--charset-p args) ; All charsets.
+ (rx--translate-union nil args))
+ (t
+ (cons (append (car (rx--translate (car args)))
+ (mapcan (lambda (item)
+ (cons "\\|" (car (rx--translate item))))
+ (cdr args)))
+ nil)))))))
(defun rx--charset-p (form)
"Whether FORM looks like a charset, only consisting of character intervals
@@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)."
(cons (list (list 'regexp-quote arg)) 'seq))
(t (error "rx `literal' form with non-string argument")))))
-(defun rx--translate-eval (body)
- "Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
+(defun rx--expand-eval (body)
+ "Expand `eval' arguments. Return a new rx form."
(unless (and body (null (cdr body)))
(error "rx `eval' form takes exactly one argument"))
- (rx--translate (eval (car body))))
+ (eval (car body)))
+
+(defun rx--translate-eval (body)
+ "Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
+ (rx--translate (rx--expand-eval body)))
(defvar rx--regexp-atomic-regexp nil)
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 2e34d65..4888e1d 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -42,13 +42,24 @@
(ert-deftest rx-or ()
(should (equal (rx (or "ab" (| "c" nonl) "de"))
"ab\\|c\\|.\\|de"))
- (should (equal (rx (or "ab" "abc" "a"))
+ (should (equal (rx (or "ab" "abc" ?a))
"\\(?:a\\(?:bc?\\)?\\)"))
+ (should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc")))
+ "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))
+ (should (equal (rx (or "a" (eval (string ?a ?b))))
+ "\\(?:ab?\\)"))
(should (equal (rx (| nonl "a") (| "b" blank))
"\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
(should (equal (rx (|))
"\\`a\\`")))
+(ert-deftest rx-def-in-or ()
+ (rx-let ((a b)
+ (b (or "abc" c))
+ (c ?a))
+ (should (equal (rx (or a (| "ab" "abcde") "abcd"))
+ "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))))
+
(ert-deftest rx-char-any ()
"Test character alternatives with `]' and `-' (Bug#25123)."
(should (equal
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- emacs-27 49d3cd9: rx: Improve 'or' compositionality (bug#37659),
Mattias Engdegård <=