[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Emacs-diffs] master 6c284c6: Make [:print:] support non-ASCII characters correctly
From: |
Eli Zaretskii |
Subject: |
[Emacs-diffs] master 6c284c6: Make [:print:] support non-ASCII characters correctly |
Date: |
Tue, 14 Apr 2015 15:47:48 +0000 |
branch: master
commit 6c284c6b5828bc4407f7201499e0507ce0e5a0a0
Author: Eli Zaretskii <address@hidden>
Commit: Eli Zaretskii <address@hidden>
Make [:print:] support non-ASCII characters correctly
* src/regex.c (ISPRINT): Call 'printablep' for multibyte characters.
(BIT_PRINT): New bit mask.
(re_wctype_to_bit): Return BIT_PRINT for RECC_PRINT.
* src/character.c (printablep): New function.
* src/character.h (printablep): Add prototype.
* lisp/emacs-lisp/rx.el (rx): Doc fix: document the new behavior
of 'print', 'alnum', and 'alphabetic'.
* doc/lispref/searching.texi (Char Classes): Document the new
behavior of [:print:].
* etc/NEWS: Mention the new behavior of [:print:].
---
doc/lispref/searching.texi | 7 +++++--
etc/NEWS | 8 ++++++++
lisp/emacs-lisp/rx.el | 12 ++++++------
src/character.c | 16 ++++++++++++++++
src/character.h | 1 +
src/regex.c | 9 ++++++---
6 files changed, 42 insertions(+), 11 deletions(-)
diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index 87513e8..238d814 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -569,8 +569,11 @@ This matches any multibyte character (@pxref{Text
Representations}).
@item [:nonascii:]
This matches any non-@acronym{ASCII} character.
@item [:print:]
-This matches printing characters---everything except @acronym{ASCII} control
-characters and the delete character.
+This matches printing characters---everything except @acronym{ASCII}
+and non-@acronym{ASCII} control characters (including the delete
+character), surrogates, and codepoints unassigned by Unicode, as
+indicated by the Unicode @samp{general-category} property
+(@pxref{Character Properties}).
@item [:punct:]
This matches any punctuation character. (At present, for multibyte
characters, it matches anything that has non-word syntax.)
diff --git a/etc/NEWS b/etc/NEWS
index 6d8b4c6..907787a 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -628,6 +628,14 @@ notifications, if Emacs is compiled with file notification
support.
---
*** gulp.el
++++
+** The character class [:print:] in regular expressions
+no longer matches any multibyte character. Instead, Emacs now
+consults the Unicode character properties to determine which
+characters are printable. In particular, surrogates and unassigned
+codepoints are now rejected by this class. If you want the old
+behavior, use [:multibyte:] instead.
+
* New Modes and Packages in Emacs 25.1
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 20af59f..a5a228e 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -969,16 +969,16 @@ CHAR
space, and DEL.
`printing', `print'
- matches printing characters--everything except ASCII control chars
- and DEL.
+ matches printing characters--everything except ASCII and non-ASCII
+ control characters, surrogates, and codepoints unassigned by Unicode.
`alphanumeric', `alnum'
- matches letters and digits. (But at present, for multibyte characters,
- it matches anything that has word syntax.)
+ matches alphabetic characters and digits. (For multibyte characters,
+ it matches according to Unicode character properties.)
`letter', `alphabetic', `alpha'
- matches letters. (But at present, for multibyte characters,
- it matches anything that has word syntax.)
+ matches alphabetic characters. (For multibyte characters,
+ it matches according to Unicode character properties.)
`ascii'
matches ASCII (unibyte) characters.
diff --git a/src/character.c b/src/character.c
index ad78f51..b357dd5 100644
--- a/src/character.c
+++ b/src/character.c
@@ -1022,6 +1022,22 @@ decimalnump (int c)
return gen_cat == UNICODE_CATEGORY_Nd;
}
+/* Return 'true' if C is a printable character as defined by its
+ Unicode properties. */
+bool
+printablep (int c)
+{
+ Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
+ if (! INTEGERP (category))
+ return false;
+ EMACS_INT gen_cat = XINT (category);
+
+ /* See UTS #18. */
+ return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
+ || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
+ || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
+}
+
void
syms_of_character (void)
{
diff --git a/src/character.h b/src/character.h
index 7d90295..1a5d2c8 100644
--- a/src/character.h
+++ b/src/character.h
@@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object);
extern bool alphabeticp (int);
extern bool decimalnump (int);
+extern bool printablep (int);
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \
diff --git a/src/regex.c b/src/regex.c
index 1afc503..b9d09d0 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
- : 1)
+ : printablep (c))
# define ISALNUM(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \
@@ -1865,7 +1865,8 @@ struct range_table_work_area
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
/* Bits used to implement the multibyte-part of the various character classes
- such as [:alnum:] in a charset's range table. */
+ such as [:alnum:] in a charset's range table. The code currently assumes
+ that only the low 16 bits are used. */
#define BIT_WORD 0x1
#define BIT_LOWER 0x2
#define BIT_PUNCT 0x4
@@ -1874,6 +1875,7 @@ struct range_table_work_area
#define BIT_MULTIBYTE 0x20
#define BIT_ALPHA 0x40
#define BIT_ALNUM 0x80
+#define BIT_PRINT 0x100
/* Set the bit for character C in a list. */
@@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc)
{
switch (cc)
{
- case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
+ case RECC_NONASCII: case RECC_GRAPH:
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM;
@@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc)
case RECC_UPPER: return BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
case RECC_SPACE: return BIT_SPACE;
+ case RECC_PRINT: return BIT_PRINT;
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
default:
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Emacs-diffs] master 6c284c6: Make [:print:] support non-ASCII characters correctly,
Eli Zaretskii <=