emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: master ea93326: Add `union' and `intersection' to rx (bug#37849)


From: Stefan Monnier
Subject: Re: master ea93326: Add `union' and `intersection' to rx (bug#37849)
Date: Tue, 10 Dec 2019 16:52:11 -0500
User-agent: Gnus/5.13 (Gnus v5.13) Emacs/27.0.50 (gnu/linux)

>     Document `union' and `intersection'.

Why not (re)use `or` instead of `union`?  IOW make it an optimization?
(we can't similarly (re)use `and` for `intersection` so I'm OK with
`intersection`).


        Stefan


> ---
>  doc/lispref/searching.texi       |  14 +-
>  etc/NEWS                         |   7 +-
>  lisp/emacs-lisp/rx.el            | 309 
> +++++++++++++++++++++++++++------------
>  test/lisp/emacs-lisp/rx-tests.el |  57 ++++++++
>  4 files changed, 289 insertions(+), 98 deletions(-)
>
> diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
> index 0cb3001..5bf3c5b 100644
> --- a/doc/lispref/searching.texi
> +++ b/doc/lispref/searching.texi
> @@ -1214,11 +1214,21 @@ Corresponding string regexp: @samp{[@dots{}]}
>  @item @code{(not @var{charspec})}
>  @cindex @code{not} in rx
>  Match a character not included in @var{charspec}.  @var{charspec} can
> -be an @code{any}, @code{not}, @code{syntax} or @code{category} form, or a
> -character class.@*
> +be an @code{any}, @code{not}, @code{union}, @code{intersection},
> +@code{syntax} or @code{category} form, or a character class.@*
>  Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}},
>  @samp{\C@var{code}}
>  
> +@item @code{(union @var{charset}@dots{})}
> +@itemx @code{(intersection @var{charset}@dots{})}
> +@cindex @code{union} in rx
> +@cindex @code{intersection} in rx
> +Match a character that matches the union or intersection,
> +respectively, of the @var{charset}s.  Each @var{charset} can be an
> +@code{any} form without character classes, or a @code{union},
> +@code{intersection} or @code{not} form whose arguments are also
> +@var{charset}s.
> +
>  @item @code{not-newline}, @code{nonl}
>  @cindex @code{not-newline} in rx
>  @cindex @code{nonl} in rx
> diff --git a/etc/NEWS b/etc/NEWS
> index 923890d..69b51b7 100644
> --- a/etc/NEWS
> +++ b/etc/NEWS
> @@ -2110,9 +2110,14 @@ at run time, instead of a constant string.
>  These macros add new forms to the rx notation.
>  
>  +++
> -*** 'anychar' is now an alias for 'anything'
> +*** 'anychar' is now an alias for 'anything'.
>  Both match any single character; 'anychar' is more descriptive.
>  
> ++++
> +*** New 'union' and 'intersection' forms for character sets.
> +These permit composing character-matching expressions from simpler
> +parts.
> +
>  ** Frames
>  
>  +++
> diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
> index a92c613..d4b21c3 100644
> --- a/lisp/emacs-lisp/rx.el
> +++ b/lisp/emacs-lisp/rx.el
> @@ -246,6 +246,14 @@ Return (REGEXP . PRECEDENCE)."
>      (setq list (cdr list)))
>    (null list))
>  
> +(defun rx--foldl (f x l)
> +  "(F (F (F X L0) L1) L2) ...
> +Left-fold the list L, starting with X, by the binary function F."
> +  (while l
> +    (setq x (funcall f x (car l)))
> +    (setq l (cdr l)))
> +  x)
> +
>  (defun rx--translate-or (body)
>    "Translate an or-pattern of zero or more rx items.
>  Return (REGEXP . PRECEDENCE)."
> @@ -343,22 +351,11 @@ INTERVALS is a list of (START . END) with START ≤ END, 
> sorted by START."
>          (setq tail d)))
>      intervals))
>  
> -;; FIXME: Consider expanding definitions inside (any ...) and (not ...),
> -;; and perhaps allow (any ...) inside (any ...).
> -;; It would be benefit composability (build a character alternative by 
> pieces)
> -;; and be handy for obtaining the complement of a defined set of
> -;; characters.  (See, for example, python.el:421, `not-simple-operator'.)
> -;; (Expansion in other non-rx positions is probably not a good idea:
> -;; syntax, category, backref, and the integer parameters of group-n,
> -;; =, >=, **, repeat)
> -;; Similar effect could be attained by ensuring that
> -;; (or (any X) (any Y)) -> (any X Y), and find a way to compose negative
> -;; sets.  `and' is taken, but we could add
> -;; (intersection (not (any X)) (not (any Y))) -> (not (any X Y)).
> -
> -(defun rx--translate-any (negated body)
> -  "Translate an (any ...) construct.  Return (REGEXP . PRECEDENCE).
> -If NEGATED, negate the sense."
> +(defun rx--parse-any (body)
> +  "Parse arguments of an (any ...) construct.
> +Return (INTERVALS . CLASSES), where INTERVALS is a sorted list of
> +disjoint intervals (each a cons of chars), and CLASSES
> +a list of named character classes in the order they occur in BODY."
>    (let ((classes nil)
>          (strings nil)
>          (conses nil))
> @@ -380,81 +377,109 @@ If NEGATED, negate the sense."
>                           (or (memq class classes)
>                               (progn (push class classes) t))))))
>              (t (error "Invalid rx `any' argument: %s" arg))))
> -    (let ((items
> -           ;; Translate strings and conses into nonoverlapping intervals,
> -           ;; and add classes as symbols at the end.
> -           (append
> -            (rx--condense-intervals
> -             (sort (append conses
> -                           (mapcan #'rx--string-to-intervals strings))
> -                   #'car-less-than-car))
> -            (reverse classes))))
> -
> -      ;; Move lone ] and range ]-x to the start.
> -      (let ((rbrac-l (assq ?\] items)))
> -        (when rbrac-l
> -          (setq items (cons rbrac-l (delq rbrac-l items)))))
> -
> -      ;; Split x-] and move the lone ] to the start.
> -      (let ((rbrac-r (rassq ?\] items)))
> -        (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
> -          (setcdr rbrac-r ?\\)
> -          (setq items (cons '(?\] . ?\]) items))))
> -
> -      ;; Split ,-- (which would end up as ,- otherwise).
> -      (let ((dash-r (rassq ?- items)))
> -        (when (eq (car dash-r) ?,)
> -          (setcdr dash-r ?,)
> -          (setq items (nconc items '((?- . ?-))))))
> -
> -      ;; Remove - (lone or at start of interval)
> -      (let ((dash-l (assq ?- items)))
> -        (when dash-l
> -          (if (eq (cdr dash-l) ?-)
> -              (setq items (delq dash-l items))   ; Remove lone -
> -            (setcar dash-l ?.))                  ; Reduce --x to .-x
> -          (setq items (nconc items '((?- . ?-))))))
> -
> -      ;; Deal with leading ^ and range ^-x.
> -      (when (and (consp (car items))
> -                 (eq (caar items) ?^)
> -                 (cdr items))
> -        ;; Move ^ and ^-x to second place.
> -        (setq items (cons (cadr items)
> -                          (cons (car items) (cddr items)))))
> +    (cons (rx--condense-intervals
> +           (sort (append conses
> +                         (mapcan #'rx--string-to-intervals strings))
> +                 #'car-less-than-car))
> +          (reverse classes))))
> +
> +(defun rx--generate-alt (negated intervals classes)
> +  "Generate a character alternative.  Return (REGEXP . PRECEDENCE).
> +If NEGATED is non-nil, negate the result; INTERVALS is a sorted
> +list of disjoint intervals and CLASSES a list of named character
> +classes."
> +  (let ((items (append intervals classes)))
> +    ;; Move lone ] and range ]-x to the start.
> +    (let ((rbrac-l (assq ?\] items)))
> +      (when rbrac-l
> +        (setq items (cons rbrac-l (delq rbrac-l items)))))
> +
> +    ;; Split x-] and move the lone ] to the start.
> +    (let ((rbrac-r (rassq ?\] items)))
> +      (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
> +        (setcdr rbrac-r ?\\)
> +        (setq items (cons '(?\] . ?\]) items))))
> +
> +    ;; Split ,-- (which would end up as ,- otherwise).
> +    (let ((dash-r (rassq ?- items)))
> +      (when (eq (car dash-r) ?,)
> +        (setcdr dash-r ?,)
> +        (setq items (nconc items '((?- . ?-))))))
> +
> +    ;; Remove - (lone or at start of interval)
> +    (let ((dash-l (assq ?- items)))
> +      (when dash-l
> +        (if (eq (cdr dash-l) ?-)
> +            (setq items (delq dash-l items))   ; Remove lone -
> +          (setcar dash-l ?.))                  ; Reduce --x to .-x
> +        (setq items (nconc items '((?- . ?-))))))
> +
> +    ;; Deal with leading ^ and range ^-x.
> +    (when (and (consp (car items))
> +               (eq (caar items) ?^)
> +               (cdr items))
> +      ;; Move ^ and ^-x to second place.
> +      (setq items (cons (cadr items)
> +                        (cons (car items) (cddr items)))))
>  
> -      (cond
> -       ;; Empty set: if negated, any char, otherwise match-nothing.
> -       ((null items)
> -        (if negated
> -            (rx--translate-symbol 'anything)
> -          (rx--empty)))
> -       ;; Single non-negated character.
> -       ((and (null (cdr items))
> -             (consp (car items))
> -             (eq (caar items) (cdar items))
> -             (not negated))
> -        (cons (list (regexp-quote (char-to-string (caar items))))
> -              t))
> -       ;; At least one character or class, possibly negated.
> -       (t
> -        (cons
> -         (list
> -          (concat
> -           "["
> -           (and negated "^")
> -           (mapconcat (lambda (item)
> -                        (cond ((symbolp item)
> -                               (format "[:%s:]" item))
> -                              ((eq (car item) (cdr item))
> -                               (char-to-string (car item)))
> -                              ((eq (1+ (car item)) (cdr item))
> -                               (string (car item) (cdr item)))
> -                              (t
> -                               (string (car item) ?- (cdr item)))))
> -                      items nil)
> -           "]"))
> -         t))))))
> +    (cond
> +     ;; Empty set: if negated, any char, otherwise match-nothing.
> +     ((null items)
> +      (if negated
> +          (rx--translate-symbol 'anything)
> +        (rx--empty)))
> +     ;; Single non-negated character.
> +     ((and (null (cdr items))
> +           (consp (car items))
> +           (eq (caar items) (cdar items))
> +           (not negated))
> +      (cons (list (regexp-quote (char-to-string (caar items))))
> +            t))
> +     ;; At least one character or class, possibly negated.
> +     (t
> +      (cons
> +       (list
> +        (concat
> +         "["
> +         (and negated "^")
> +         (mapconcat (lambda (item)
> +                      (cond ((symbolp item)
> +                             (format "[:%s:]" item))
> +                            ((eq (car item) (cdr item))
> +                             (char-to-string (car item)))
> +                            ((eq (1+ (car item)) (cdr item))
> +                             (string (car item) (cdr item)))
> +                            (t
> +                             (string (car item) ?- (cdr item)))))
> +                    items nil)
> +         "]"))
> +       t)))))
> +
> +(defun rx--translate-any (negated body)
> +  "Translate an (any ...) construct.  Return (REGEXP . PRECEDENCE).
> +If NEGATED, negate the sense."
> +  (let ((parsed (rx--parse-any body)))
> +    (rx--generate-alt negated (car parsed) (cdr parsed))))
> +
> +(defun rx--intervals-to-alt (negated intervals)
> +  "Generate a character alternative from an interval set.
> +Return (REGEXP . PRECEDENCE).
> +INTERVALS is a sorted list of disjoint intervals.
> +If NEGATED, negate the sense."
> +  ;; Detect whether the interval set is better described in
> +  ;; complemented form.  This is not just a matter of aesthetics: any
> +  ;; range from ASCII to raw bytes will automatically exclude the
> +  ;; entire non-ASCII Unicode range by the regexp engine.
> +  (if (rx--every (lambda (iv) (not (<= (car iv) #x3ffeff (cdr iv))))
> +                 intervals)
> +      (rx--generate-alt negated intervals nil)
> +    (rx--generate-alt
> +     (not negated) (rx--complement-intervals intervals) nil)))
> +
> +;; FIXME: Consider turning `not' into a variadic operator, following SRE:
> +;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and
> +;; (not) = anychar.
> +;; Maybe allow singleton characters as arguments.
>  
>  (defun rx--translate-not (negated body)
>    "Translate a (not ...) construct.  Return (REGEXP . PRECEDENCE).
> @@ -472,10 +497,14 @@ If NEGATED, negate the sense (thus making it positive)."
>               ('category
>                (rx--translate-category (not negated) (cdr arg)))
>               ('not
> -              (rx--translate-not      (not negated) (cdr arg))))))
> +              (rx--translate-not      (not negated) (cdr arg)))
> +             ('union
> +              (rx--translate-union    (not negated) (cdr arg)))
> +             ('intersection
> +              (rx--translate-intersection (not negated) (cdr arg))))))
>       ((let ((class (cdr (assq arg rx--char-classes))))
>          (and class
> -             (rx--translate-any (not negated) (list class)))))
> +             (rx--generate-alt (not negated) nil (list class)))))
>       ((eq arg 'word-boundary)
>        (rx--translate-symbol
>         (if negated 'word-boundary 'not-word-boundary)))
> @@ -484,6 +513,91 @@ If NEGATED, negate the sense (thus making it positive)."
>               (rx--translate-not negated (list expanded)))))
>       (t (error "Illegal argument to rx `not': %S" arg)))))
>  
> +(defun rx--complement-intervals (intervals)
> +  "Complement of the interval list INTERVALS."
> +  (let ((compl nil)
> +        (c 0))
> +    (dolist (iv intervals)
> +      (when (< c (car iv))
> +        (push (cons c (1- (car iv))) compl))
> +      (setq c (1+ (cdr iv))))
> +    (when (< c (max-char))
> +      (push (cons c (max-char)) compl))
> +    (nreverse compl)))
> +
> +(defun rx--intersect-intervals (ivs-a ivs-b)
> +  "Intersection of the interval lists IVS-A and IVS-B."
> +  (let ((isect nil))
> +    (while (and ivs-a ivs-b)
> +      (let ((a (car ivs-a))
> +            (b (car ivs-b)))
> +        (cond
> +         ((< (cdr a) (car b)) (setq ivs-a (cdr ivs-a)))
> +         ((> (car a) (cdr b)) (setq ivs-b (cdr ivs-b)))
> +         (t
> +          (push (cons (max (car a) (car b))
> +                      (min (cdr a) (cdr b)))
> +                isect)
> +          (setq ivs-a (cdr ivs-a))
> +          (setq ivs-b (cdr ivs-b))
> +          (cond ((< (cdr a) (cdr b))
> +                 (push (cons (1+ (cdr a)) (cdr b))
> +                       ivs-b))
> +                ((> (cdr a) (cdr b))
> +                 (push (cons (1+ (cdr b)) (cdr a))
> +                       ivs-a)))))))
> +    (nreverse isect)))
> +
> +(defun rx--union-intervals (ivs-a ivs-b)
> +  "Union of the interval lists IVS-A and IVS-B."
> +  (rx--complement-intervals
> +   (rx--intersect-intervals
> +    (rx--complement-intervals ivs-a)
> +    (rx--complement-intervals ivs-b))))
> +
> +(defun rx--charset-intervals (charset)
> +  "Return a sorted list of non-adjacent disjoint intervals from CHARSET.
> +CHARSET is any expression allowed in a character set expression:
> +either `any' (no classes permitted), or `not', `union' or `intersection'
> +forms whose arguments are charsets."
> +  (pcase charset
> +    (`(,(or 'any 'in 'char) . ,body)
> +     (let ((parsed (rx--parse-any body)))
> +       (when (cdr parsed)
> +         (error
> +          "Character class not permitted in set operations: %S"
> +          (cadr parsed)))
> +       (car parsed)))
> +    (`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
> +    (`(union . ,xs) (rx--charset-union xs))
> +    (`(intersection . ,xs) (rx--charset-intersection xs))
> +    (_ (let ((expanded (rx--expand-def charset)))
> +         (if expanded
> +             (rx--charset-intervals expanded)
> +           (error "Bad character set: %S" charset))))))
> +
> +(defun rx--charset-union (charsets)
> +  "Union of CHARSETS, as a set of intervals."
> +  (rx--foldl #'rx--union-intervals nil
> +             (mapcar #'rx--charset-intervals charsets)))
> +
> +(defconst rx--charset-all (list (cons 0 (max-char))))
> +
> +(defun rx--charset-intersection (charsets)
> +  "Intersection of CHARSETS, as a set of intervals."
> +  (rx--foldl #'rx--intersect-intervals rx--charset-all
> +             (mapcar #'rx--charset-intervals charsets)))
> +
> +(defun rx--translate-union (negated body)
> +  "Translate a (union ...) construct.  Return (REGEXP . PRECEDENCE).
> +If NEGATED, negate the sense."
> +  (rx--intervals-to-alt negated (rx--charset-union body)))
> +
> +(defun rx--translate-intersection (negated body)
> +  "Translate an (intersection ...) construct.  Return (REGEXP . PRECEDENCE).
> +If NEGATED, negate the sense."
> +  (rx--intervals-to-alt negated (rx--charset-intersection body)))
> +
>  (defun rx--atomic-regexp (item)
>    "ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t."
>    (if (eq (cdr item) t)
> @@ -862,6 +976,8 @@ can expand to any number of values."
>        ((or 'any 'in 'char)      (rx--translate-any nil body))
>        ('not-char                (rx--translate-any t body))
>        ('not                     (rx--translate-not nil body))
> +      ('union                   (rx--translate-union nil body))
> +      ('intersection            (rx--translate-intersection nil body))
>  
>        ('repeat                  (rx--translate-repeat body))
>        ('=                       (rx--translate-= body))
> @@ -920,7 +1036,7 @@ can expand to any number of values."
>          (t (error "Unknown rx form `%s'" op)))))))
>  
>  (defconst rx--builtin-forms
> -  '(seq sequence : and or | any in char not-char not
> +  '(seq sequence : and or | any in char not-char not union intersection
>      repeat = >= **
>      zero-or-more 0+ *
>      one-or-more 1+ +
> @@ -1033,8 +1149,11 @@ CHAR           Match a literal character.
>                  character, a string, a range as string \"A-Z\" or cons
>                  (?A . ?Z), or a character class (see below).  Alias: in, 
> char.
>  (not CHARSPEC)  Match one character not matched by CHARSPEC.  CHARSPEC
> -                can be (any ...), (syntax ...), (category ...),
> -                or a character class.
> +                can be (any ...), (union ...), (intersection ...),
> +                (syntax ...), (category ...), or a character class.
> +(union CHARSET...)        Union of CHARSETs.
> +(intersection CHARSET...) Intersection of CHARSETs.
> +                CHARSET is (any...), (not...), (union...) or 
> (intersection...).
>  not-newline     Match any character except a newline.  Alias: nonl.
>  anychar         Match any character.  Alias: anything.
>  unmatchable     Never match anything at all.
> diff --git a/test/lisp/emacs-lisp/rx-tests.el 
> b/test/lisp/emacs-lisp/rx-tests.el
> index 317dae2..0cd2c95 100644
> --- a/test/lisp/emacs-lisp/rx-tests.el
> +++ b/test/lisp/emacs-lisp/rx-tests.el
> @@ -274,6 +274,63 @@
>    (should (equal (rx (not (not ascii)) (not (not (not (any "a-z")))))
>                   "[[:ascii:]][^a-z]")))
>  
> +(ert-deftest rx-union ()
> +  (should (equal (rx (union))
> +                 "\\`a\\`"))
> +  (should (equal (rx (union (any "ba")))
> +                 "[ab]"))
> +  (should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
> +                 "[a-krx-z]"))
> +  (should (equal (rx (union (not (any "a-m")) (not (any "f-p"))))
> +                 "[^f-m]"))
> +  (should (equal (rx (union (any "e-m") (not (any "a-z"))))
> +                 "[^a-dn-z]"))
> +  (should (equal (rx (union (not (any "g-r")) (not (any "t"))))
> +                 "[^z-a]"))
> +  (should (equal (rx (not (union (not (any "g-r")) (not (any "t")))))
> +                 "\\`a\\`"))
> +  (should (equal (rx (union (union (any "a-f") (any "u-z"))
> +                            (any "g-r")))
> +                 "[a-ru-z]"))
> +  (should (equal (rx (union (intersection (any "c-z") (any "a-g"))
> +                            (not (any "a-k"))))
> +                 "[^abh-k]")))
> +
> +(ert-deftest rx-def-in-union ()
> +  (rx-let ((a (any "badc"))
> +           (b (union a (any "def"))))
> +    (should (equal(rx (union b (any "q")))
> +                  "[a-fq]"))))
> +
> +(ert-deftest rx-intersection ()
> +  (should (equal (rx (intersection))
> +                 "[^z-a]"))
> +  (should (equal (rx (intersection (any "ba")))
> +                 "[ab]"))
> +  (should (equal (rx (intersection (any "a-j" "u-z") (any "c-k" ?y)
> +                                   (any "a-i" "x-z")))
> +                 "[c-iy]"))
> +  (should (equal (rx (intersection (not (any "a-m")) (not (any "f-p"))))
> +                 "[^a-p]"))
> +  (should (equal (rx (intersection (any "a-z") (not (any "g-q"))))
> +                 "[a-fr-z]"))
> +  (should (equal (rx (intersection (any "a-d") (any "e")))
> +                 "\\`a\\`"))
> +  (should (equal (rx (not (intersection (any "a-d") (any "e"))))
> +                 "[^z-a]"))
> +  (should (equal (rx (intersection (any "d-u")
> +                                   (intersection (any "e-z") (any "a-m"))))
> +                 "[e-m]"))
> +  (should (equal (rx (intersection (union (any "a-f") (any "f-t"))
> +                                   (any "e-w")))
> +                 "[e-t]")))
> +
> +(ert-deftest rx-def-in-intersection ()
> +  (rx-let ((a (any "a-g"))
> +           (b (intersection a (any "d-j"))))
> +    (should (equal(rx (intersection b (any "e-k")))
> +                  "[e-g]"))))
> +
>  (ert-deftest rx-group ()
>    (should (equal (rx (group nonl) (submatch "x")
>                       (group-n 3 "y") (submatch-n 13 "z") (backref 1))




reply via email to

[Prev in Thread] Current Thread [Next in Thread]