[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/xr e82efe8 6/9: Improved character class set relations
From: |
Mattias Engdegεrd |
Subject: |
[elpa] externals/xr e82efe8 6/9: Improved character class set relations |
Date: |
Sat, 29 Feb 2020 17:22:12 -0500 (EST) |
branch: externals/xr
commit e82efe8b3734bae04a62af745bd207af48d9b637
Author: Mattias EngdegΓ₯rd <address@hidden>
Commit: Mattias EngdegΓ₯rd <address@hidden>
Improved character class set relations
Several bugs fixed and better precision in superset and intersection
computations.
---
xr-test.el | 57 ++++++++++++++-
xr.el | 237 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------
2 files changed, 269 insertions(+), 25 deletions(-)
diff --git a/xr-test.el b/xr-test.el
index a0b5402..1bebbfa 100644
--- a/xr-test.el
+++ b/xr-test.el
@@ -410,6 +410,10 @@
'((2 . "Repetition of zero-width assertion")
(5 . "Repetition of zero-width assertion")
(13 . "Repetition of zero-width assertion"))))
+ ))
+
+(ert-deftest xr-lint-branch-subsumption ()
+ (let ((text-quoting-style 'grave))
(should (equal (xr-lint "a.cde*f?g\\|g\\|abcdefg")
'((14 . "Branch matches subset of a previous branch"))))
(should (equal (xr-lint "abcd\\|e\\|[aA].[^0-9z]d")
@@ -428,8 +432,6 @@
'((6 . "Branch matches superset of a previous branch"))))
(should (equal (xr-lint ".\\|[a\n]")
nil))
- (should (equal (xr-lint ".\\|[[:space:]\r]")
- '((3 . "Branch matches subset of a previous branch"))))
(should (equal (xr-lint "ab?c+\\|a?b*c*")
'((7 . "Branch matches superset of a previous branch"))))
(should (equal (xr-lint "\\(?:[aA]\\|b\\)\\|a")
@@ -442,6 +444,57 @@
'((4 . "Branch matches superset of a previous branch"))))
(should (equal (xr-lint "abc\\|abcd*e?")
'((5 . "Branch matches superset of a previous branch"))))
+ (should (equal (xr-lint "[a[:digit:]]\\|[a\n]")
+ nil))
+ (should (equal (xr-lint "[a[:ascii:]]\\|[a\n]")
+ '((14 . "Branch matches subset of a previous branch"))))
+
+ (should (equal (xr-lint "[[:alnum:]]\\|[[:alpha:]]")
+ '((13 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[[:alnum:]%]\\|[[:alpha:]%]")
+ '((14 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[[:xdigit:]%]\\|[[:alpha:]%]")
+ nil))
+ (should (equal (xr-lint "[[:alnum:]]\\|[^[:alpha:]]")
+ nil))
+ (should (equal (xr-lint "[^[:alnum:]]\\|[[:alpha:]]")
+ nil))
+ (should (equal (xr-lint "[[:digit:]]\\|[^[:punct:]]")
+ '((13 . "Branch matches superset of a previous branch"))))
+ (should (equal (xr-lint "[^[:digit:]]\\|[[:punct:]]")
+ '((14 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[^[:digit:]]\\|[^[:xdigit:]]")
+ '((14 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[^[:print:]]\\|[[:ascii:]]")
+ nil))
+ (should (equal (xr-lint "[[:print:]]\\|[^[:ascii:]]")
+ nil))
+ (should (equal (xr-lint "[^[:print:]]\\|[^[:ascii:]]")
+ nil))
+ (should (equal (xr-lint "[[:digit:][:cntrl:]]\\|[[:ascii:]]")
+ '((22 . "Branch matches superset of a previous branch"))))
+ (should (equal (xr-lint "[[:alpha:]]\\|A")
+ '((13 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[[:alpha:]]\\|[A-E]")
+ '((13 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[[:alpha:]3-7]\\|[A-E46]")
+ '((16 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[^[:alpha:]]\\|[123]")
+ '((14 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[!-@]\\|[[:digit:]]")
+ '((7 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[^a-z]\\|[[:digit:]]")
+ '((8 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[^[:punct:]]\\|[a-z]")
+ '((14 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[[:space:]]\\|[ \t\f]")
+ '((13 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[[:word:]]\\|[a-gH-P2357]")
+ '((12 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[^[:space:]]\\|[a-gH-P2357]")
+ '((14 . "Branch matches subset of a previous branch"))))
+ (should (equal (xr-lint "[^z-a]\\|[^0-9[:space:]]")
+ '((8 . "Branch matches subset of a previous branch"))))
))
(ert-deftest xr-lint-subsumed-repetition ()
diff --git a/xr.el b/xr.el
index c32ae7f..90c6384 100644
--- a/xr.el
+++ b/xr.el
@@ -726,28 +726,118 @@ UPPER may be nil, meaning infinity."
and classes (symbols)."
;; We know (since we built it) that x is either a symbol or
;; a string, and that the string does not mix ranges and chars.
- (cond ((symbolp arg) (list arg))
+ (cond ((symbolp arg)
+ ;; unibyte and multibyte are aliases of ascii and nonascii in
+ ;; practice; simplify.
+ (list (cond ((eq arg 'unibyte) 'ascii)
+ ((eq arg 'multibyte) 'nonascii)
+ (t arg))))
((and (>= (length arg) 3)
(eq (aref arg 1) ?-))
(xr--range-string-to-items arg))
(t (string-to-list arg))))
+;; Character class relation matrix
+;; Legend: = same
+;; β row subset of column
+;; β row superset of column
+;; x overlapping
+;; β
disjoint
+;; ? not certain but very likely
+;; * assuming `case-fold-search' is nil
+;;
+;; alp aln dig xdi cnt asc non bla gra pri pun upp low spa wor
+;; alpha = β β
x β
x x β
β β β
? β? β? β
? β?
+;; alnum β = β β β
x x β
β β β
? β? β? β
? β?
+;; digit β
β = β β
β β
β
β β β
β
? β
? β
? β?
+;; xdigit x β β = β
β β
β
β β β
x? x? β
? β?
+;; cntrl β
β
β
β
= β β
x β
β
β
β
? β
? x? β
?
+;; ascii x x β β β = β
x x x x x? x? x? x?
+;; nonascii x x β
β
β
β
= x x x x? x? x? x? x?
+;; blank β
β
β
β
x x x = β
x x? β
? β
? x? β
?
+;; graph β β β β β
x x β
= β β? β? β? β
? β?
+;; print β β β β β
x x x β = β? β? β? x? β?
+;; punct β
? β
? β
β
β
x x? x? β? β? = β
? β
? β
? x?
+;; upper β? β? β
? x? β
? x? x? β
? β? β? β
? = β
* β
? β?
+;; lower β? β? β
? x? β
? x? x? β
? β? β? β
? β
* = β
? β?
+;; space β
? β
? β
? β
? x? x? x? x? β
? x? β
? β
? β
? = β
+;; word β? β? β? β? β
? x? x? β
? β? β? x? β? β? β
=
+
(defun xr--any-item-superset-p (a b)
"Whether A is a superset of B, both being `any' items: a character,
a range (pair of chars), or a class (symbol)."
(cond
((symbolp a)
- (cond ((symbolp b) (eq a b))
- ((eq b ?\n)
- (memq a '(alnum alpha blank digit graph
- lower multibyte nonascii print punct space
- upper word xdigit)))))
+ (cond
+ ((symbolp b)
+ (or (eq a b)
+ (memq
+ b
+ (cdr (assq
+ a
+ ;; Class superset matrix: first class in each row is
+ ;; a superset of all the rest in that row.
+ ;; It is slightly approximative, since upper, lower
+ ;; and (partially) punct can be modified through case
+ ;; and syntax tables.
+ '((alpha upper lower)
+ (alnum alpha digit xdigit upper lower)
+ (xdigit digit)
+ (ascii digit xdigit cntrl)
+ (graph alpha alnum digit xdigit punct upper lower word)
+ (print alpha alnum digit xdigit graph punct
+ upper lower word)
+ (word alpha alnum digit xdigit upper lower)))))))
+
+ ((characterp b)
+ (cond
+ ;; Some reasonable subsets of `space' and `word'.
+ ((eq a 'space) (memq b '(?\s ?\t ?\f)))
+ ((eq a 'word)
+ (string-match-p (rx (any "0-9A-Za-z")) (char-to-string b)))
+ ;; Test for invariant classes only. `punct' is invariant for ASCII.
+ ;; `upper' and `lower' are not really invariant but mostly.
+ ((or (memq a '(digit xdigit cntrl ascii nonascii alpha alnum blank
+ graph print upper lower))
+ (and (eq a 'punct) (<= b 127)))
+ (string-match-p (format "[[:%s:]]" a) (char-to-string b)))))
+
+ (t ; b is a range.
+ ;; For simplicity, only check ASCII ranges.
+ (and (<= (cdr b) 127)
+ (cl-some
+ (lambda (a-range) (and (<= (car a-range) (car b))
+ (<= (cdr b) (cdr a-range))))
+ (cdr (assq a '((alpha (?A . ?Z) (?a . ?z))
+ (alnum (?0 . ?9) (?A . ?Z) (?a . ?z))
+ (digit (?0 . ?9))
+ (xdigit (?0 . ?9) (?A . ?F) (?a . ?f))
+ (cntrl (0 . 31))
+ (ascii (0 . 127))
+ (graph (33 . 126))
+ (print (32 . 126))
+ (punct (33 . 47) (58 . 64) (91 . 96) (123 . 126))
+ ;; Not-so-wild assumptions.
+ (upper (?A . ?Z))
+ (lower (?a . ?z))
+ (word (?0 . ?9) (?A . ?Z) (?a . ?z))
+ (space (?\s . ?\s) (?\t . ?\t) (?\f . ?\f))))))))))
+
((consp a)
- (or (and (characterp b)
- (<= (car a) b (cdr a)))
- (and (consp b)
- (<= (car a) (car b) (cdr b) (cdr a)))))
- (t
+ (cond
+ ((characterp b) (<= (car a) b (cdr a)))
+ ((consp b) (<= (car a) (car b) (cdr b) (cdr a)))
+ (t ; b is a class.
+ ;; Only consider classes with simple definitions.
+ (let ((b-hull (cdr (assq b '((digit . (?0 . ?9))
+ (xdigit . (?0 . ?f))
+ (cntrl . (0 . 31))
+ (ascii . (0 . 127))
+ (nonascii . (#x80 . #x10ffff)))))))
+ (and b-hull
+ (<= (car a) (car b-hull))
+ (<= (cdr b-hull) (cdr a)))))))
+ (t ; a is a character.
(and (characterp b) (eq a b)))))
(defun xr--any-item-may-intersect-p (a b)
@@ -755,18 +845,89 @@ a range (pair of chars), or a class (symbol)."
a range (pair of chars), or a class (symbol). If in doubt, return t."
(cond
((symbolp a)
- (cond ((eq b ?\n)
- (not (memq a '(alnum alpha blank digit graph
- lower multibyte nonascii print punct space
- upper word xdigit))))
- (t t)))
+ (cond
+ ((symbolp b)
+ (or (eq a b)
+ (memq
+ b
+ (cdr (assq
+ a
+ ;; Class intersection matrix: first class in each row
+ ;; intersects all the rest in that row.
+ ;; Again slightly approximate, since upper, lower,
+ ;; space, word and (partially) punct can be modified
+ ;; through syntax and case tables.
+ '((alpha alnum xdigit ascii nonascii graph print
+ upper lower word)
+ (alnum alpha digit xdigit ascii nonascii graph print
+ upper lower word)
+ (digit alnum xdigit ascii graph print word)
+ (xdigit alpha alnum digit ascii graph print
+ upper lower word)
+ (cntrl ascii blank space)
+ (ascii alpha alnum digit xdigit cntrl ascii blank
+ graph print punct upper lower space word)
+ (nonascii alpha alnum blank graph print punct
+ upper lower space word)
+ (blank cntrl ascii nonascii print punct space)
+ (graph alpha alnum digit xdigit ascii nonascii print punct
+ upper lower word)
+ (print alpha alnum digit xdigit ascii nonascii blank graph
+ punct upper lower space word)
+ (punct ascii nonascii blank graph print upper lower word)
+ (upper alpha alnum xdigit ascii nonascii graph print word)
+ (lower alpha alnum xdigit ascii nonascii graph print word)
+ (space cntrl ascii nonascii blank print)
+ (word alpha alnum digit xdigit ascii nonascii graph print
+ punct upper lower)))))))
+
+ ((characterp b)
+ (cond
+ ;; Some reasonably conservative subsets of `space' and `word'.
+ ((eq a 'space)
+ (not (string-match-p (rx (any (33 . 126))) (char-to-string b))))
+ ((eq a 'word)
+ (not (memq b '(?\s ?\t ?\f ?\r))))
+ (t
+ ;; Only some classes are invariant. `punct' is invariant for ASCII.
+ ;; `upper' and `lower' are not really invariant but mostly.
+ (or (and (eq a 'punct) (> b 127))
+ ;; This may be a tad slow.
+ (string-match-p (format "[[:%s:]]" a) (char-to-string b))))))
+
+ (t ; b is a range.
+ ;; For simplicity, only check ASCII ranges.
+ (cond
+ ((and (> (cdr b) 127)
+ (not (memq a '(cntrl ascii digit xdigit)))))
+ ((eq a 'space)
+ (not (cl-some (lambda (a-range) (and (<= (car a-range) (cdr b))
+ (<= (car b) (cdr a-range))))
+ '((?0 . ?9) (?A . ?Z) (?a . ?z)))))
+ ((eq a 'word))
+ (t
+ (cl-some
+ (lambda (a-range) (and (<= (car a-range) (cdr b))
+ (<= (car b) (cdr a-range))))
+ (cdr (assq a '((alpha (?A . ?Z) (?a . ?z))
+ (alnum (?0 . ?9) (?A . ?Z) (?a . ?z))
+ (digit (?0 . ?9))
+ (xdigit (?0 . ?9) (?A . ?F) (?a . ?f))
+ (cntrl (0 . 31))
+ (ascii (0 . 127))
+ (graph (33 . 126))
+ (print (32 . 126))
+ (punct (33 . 47) (58 . 64) (91 . 96) (123 . 126))
+ ;; Not-so-wild assumptions.
+ (upper (?A . ?Z))
+ (lower (?a . ?z)))))))))))
+
((consp a)
- (or (and (characterp b)
- (<= (car a) b (cdr a)))
- (and (consp b)
- (<= (car a) (cdr b))
- (<= (car b) (cdr a)))
- (symbolp b)))
+ (cond ((characterp b) (<= (car a) b (cdr a)))
+ ((consp b) (and (<= (car a) (cdr b))
+ (<= (car b) (cdr a))))
+ (t ; b is a class
+ (xr--any-item-may-intersect-p b a))))
;; Now a must be a character.
((characterp b) (eq a b))
(t (xr--any-item-may-intersect-p b a))))
@@ -798,7 +959,13 @@ A-SETS and B-SETS are arguments to `any'."
((or 'ascii 'alnum 'alpha 'blank 'cntrl 'digit 'graph
'lower 'multibyte 'nonascii 'print 'punct 'space
'unibyte 'upper 'word 'xdigit)
- (xr--char-superset-of-char-set-p sets negated `(any ,rx)))
+ (xr--char-superset-of-char-set-p sets negated (list rx)))
+ (`(not ,(and sym
+ (or 'ascii 'alnum 'alpha 'blank 'cntrl 'digit 'graph
+ 'lower 'multibyte 'nonascii 'print 'punct 'space
+ 'unibyte 'upper 'word 'xdigit)))
+ (and negated
+ (xr--char-superset-of-char-set-p (list sym) nil sets)))
((pred stringp)
(and (= (length rx) 1)
(xr--char-superset-of-char-set-p sets negated (list rx))))))
@@ -811,6 +978,18 @@ A-SETS and B-SETS are arguments to `any'."
(`(syntax ,s) (not (eq s ?>))) ; comment-end often matches newline
(_ (xr--char-superset-of-rx-p '("\n") t rx))))
+(defun xr--single-char-p (rx)
+ "Whether RX only matches single characters."
+ (or (memq rx '(nonl anything
+ ascii alnum alpha blank cntrl digit graph
+ lower multibyte nonascii print punct space
+ unibyte upper word xdigit))
+ (and (stringp rx) (= (length rx) 1))
+ (and (consp rx)
+ (or (memq (car rx) '(any category syntax))
+ (and (eq (car rx) 'not)
+ (xr--single-char-p (cadr rx)))))))
+
(defun xr--syntax-superset-of-rx-p (syntax negated rx)
"Whether SYNTAX, possibly NEGATED, is a superset of RX."
;; Syntax tables vary, but we make a (quite conservative) guess.
@@ -878,8 +1057,20 @@ single-character strings."
(xr--char-superset-of-rx-p sets nil b))
(`(not (any . ,sets))
(xr--char-superset-of-rx-p sets t b))
+ ((or 'ascii 'alnum 'alpha 'blank 'cntrl 'digit 'graph
+ 'lower 'multibyte 'nonascii 'print 'punct 'space
+ 'unibyte 'upper 'word 'xdigit)
+ (xr--char-superset-of-rx-p (list a) nil b))
+ (`(not ,(and sym
+ (or 'ascii 'alnum 'alpha 'blank 'cntrl 'digit 'graph
+ 'lower 'multibyte 'nonascii 'print 'punct 'space
+ 'unibyte 'upper 'word 'xdigit)))
+ (xr--char-superset-of-rx-p (list sym) t b))
+
('nonl (xr--single-non-newline-char-p b))
+ ('anything (xr--single-char-p b))
+
(`(seq . ,a-body)
(pcase b
(`(seq . ,b-body)
- [elpa] externals/xr updated (b5ae891 -> dcf5240), Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr 9a8bf08 2/9: Update copyright year to 2020, Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr 0fb29c8 1/9: Update copyright year to 2020, Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr 9d9debd 5/9: Remove package description in xr.el, Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr 3af4048 4/9: Suppress false positives in repetition subsumption check (bug #2), Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr 1932e3d 7/9: Avoid ambiguous regexp (relint/xr complaint), Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr d7a6480 8/9: Translate [^\n] into nonl, Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr dcf5240 9/9: Increment version to 1.16, Mattias Engdegεrd, 2020/02/29
- [elpa] externals/xr e82efe8 6/9: Improved character class set relations,
Mattias Engdegεrd <=
- [elpa] externals/xr c88fb0e 3/9: Use text quoting for all messages, Mattias Engdegεrd, 2020/02/29