[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Emacs-diffs] /srv/bzr/emacs/trunk r112051: Optimize ASCII file reading
From: |
K. Handa |
Subject: |
[Emacs-diffs] /srv/bzr/emacs/trunk r112051: Optimize ASCII file reading with EOL format detection and decoding. |
Date: |
Sat, 16 Mar 2013 01:06:12 +0900 |
User-agent: |
Bazaar (2.5.0) |
------------------------------------------------------------
revno: 112051 [merge]
committer: K. Handa <address@hidden>
branch nick: trunk
timestamp: Sat 2013-03-16 01:06:12 +0900
message:
Optimize ASCII file reading with EOL format detection and decoding.
modified:
src/ChangeLog
src/coding.c
src/coding.h
src/insdel.c
src/lisp.h
=== modified file 'src/ChangeLog'
--- a/src/ChangeLog 2013-03-15 10:07:29 +0000
+++ b/src/ChangeLog 2013-03-15 16:04:47 +0000
@@ -1,3 +1,31 @@
+2013-03-15 handa <address@hidden>
+
+ * insdel.c (insert_from_gap): New arg text_at_gap_tail.
+ (adjust_after_replace): Make it back to static. Delete the third
+ arg text_at_gap_tail. Cancel the code for handling it.
+
+ * coding.h (struct coding_system): New member eol_seen.
+
+ * coding.c (detect_ascii): New function.
+ (detect_coding): Set coding->head_ascii and coding->eol_seen only
+ when the source bytes are actually scanned. On detecting for
+ coding_category_utf_8_auto, call detect_ascii instead of scanning
+ source bytes directly.
+ (produce_chars): Call insert_from_gap with the new arg 0.
+ (encode_coding): Likewise.
+ (decode_coding_gap): Control ASCII optimization by the variable
+ disable_ascii_optimization instead of #ifndef .. #endif.
+ Decode EOL format according to coding->eol_seen.
+ (syms_of_coding): Declare disable-ascii-optimization as a Lisp
+ variable.
+
+ * global.h (struct emacs_globals): New member
+ f_disable_ascii_optimization.
+ (disable_ascii_optimization): New macro.
+
+ * lisp.h (adjust_after_replace): Cancel externing it.
+ (insert_from_gap): Adjust prototype.
+
2013-03-15 Eli Zaretskii <address@hidden>
* w32term.c (w32fullscreen_hook): Swap FULLSCREEN_BOTH and
=== modified file 'src/coding.c'
--- a/src/coding.c 2013-03-10 22:55:25 +0000
+++ b/src/coding.c 2013-03-15 16:03:54 +0000
@@ -6071,6 +6071,93 @@
#define EOL_SEEN_CR 2
#define EOL_SEEN_CRLF 4
+
+static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int
eol_seen);
+
+
+/* Return 1 if all the source bytes are ASCII, and return 0 otherwise.
+ By side effects, set coding->head_ascii and coding->eol_seen. The
+ value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
+ EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
+ all the source bytes are ASCII.
+
+ The source is the coding->src_bytes bytes starting at
+ coding->source (refreshed by coding_set_source). A byte counts as
+ ASCII when its 0x80 bit is clear. coding->head_ascii is set to the
+ number of bytes scanned before the first non-ASCII byte (or to the
+ whole length when there is none).
+
+ The scan works in one of three modes:
+ - inhibit_eol_conversion is set: only look for a non-ASCII byte,
+ force the result to EOL_SEEN_LF and pass it to
+ adjust_coding_eol_type.
+ - CODING_ID_EOL_TYPE is not a vector (Qunix, Qdos, or anything
+ else, which is treated as CR): only look for a non-ASCII byte;
+ coding->eol_seen is then the value implied by that type, not
+ something observed in the data.
+ - CODING_ID_EOL_TYPE is a vector: collect every EOL form that
+ occurs in the ASCII prefix and pass the result to
+ adjust_coding_eol_type. */
+
+static bool
+detect_ascii (struct coding_system *coding)
+{
+ const unsigned char *src, *end;
+ Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
+ int eol_seen;
+
+ eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
+ : EQ (eol_type, Qunix) ? EOL_SEEN_LF
+ : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
+ : EOL_SEEN_CR);
+ coding_set_source (coding);
+ src = coding->source;
+ end = src + coding->src_bytes;
+
+ if (inhibit_eol_conversion)
+ {
+ /* We don't have to check EOL format: just find the first
+ non-ASCII byte and report the EOL format as LF. */
+ while (src < end && !( *src & 0x80)) src++;
+ eol_seen = EOL_SEEN_LF;
+ adjust_coding_eol_type (coding, eol_seen);
+ }
+ else if (eol_seen != EOL_SEEN_NONE)
+ {
+ /* We don't have to check EOL format either, because the EOL
+ type of the coding system already fixed EOL_SEEN above. */
+ while (src < end && !(*src & 0x80)) src++;
+ }
+ else
+ {
+ end--; /* We look ahead one byte, so stop one byte early. */
+ while (src < end)
+ {
+ int c = *src;
+
+ if (c & 0x80)
+ break;
+ src++;
+ if (c < 0x20)
+ {
+ if (c == '\r')
+ {
+ if (*src == '\n')
+ {
+ eol_seen |= EOL_SEEN_CRLF;
+ src++;
+ }
+ else
+ eol_seen |= EOL_SEEN_CR;
+ }
+ else if (c == '\n')
+ eol_seen |= EOL_SEEN_LF;
+ }
+ }
+ if (src > end)
+ /* SRC can pass END only by consuming a CR LF pair, so the last
+ two bytes are CR LF, which means that we have scanned all
+ bytes. Restore END to the real end of the source. */
+ end++;
+ else if (src == end)
+ {
+ end++;
+ if (! (*src & 0x80))
+ {
+ if (*src == '\r')
+ eol_seen |= EOL_SEEN_CR;
+ else if (*src == '\n')
+ eol_seen |= EOL_SEEN_LF;
+ src++;
+ }
+ }
+ adjust_coding_eol_type (coding, eol_seen);
+ }
+ coding->head_ascii = src - coding->source;
+ coding->eol_seen = eol_seen;
+ return (src == end);
+}
+
+
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
SOURCE is encoded. If CATEGORY is one of
coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@@ -6215,7 +6302,6 @@
coding_set_source (coding);
src_end = coding->source + coding->src_bytes;
- coding->head_ascii = 0;
/* If we have not yet decided the text encoding type, detect it
now. */
@@ -6225,6 +6311,8 @@
struct coding_detection_info detect_info;
bool null_byte_found = 0, eight_bit_found = 0;
+ coding->head_ascii = 0;
+ coding->eol_seen = EOL_SEEN_NONE;
detect_info.checked = detect_info.found = detect_info.rejected = 0;
for (src = coding->source; src < src_end; src++)
{
@@ -6263,6 +6351,26 @@
if (eight_bit_found)
break;
}
+ else if (! disable_ascii_optimization
+ && ! inhibit_eol_conversion)
+ {
+ /* Record which EOL forms occur while scanning, so that the
+ ASCII fast path can later decode the EOL format. */
+ if (c == '\r')
+ {
+ /* Look ahead only when a following byte exists; SRC is always
+ below SRC_END inside this loop, so testing SRC itself
+ would let src[1] read past the end of the source. */
+ if (src + 1 < src_end && src[1] == '\n')
+ {
+ coding->eol_seen |= EOL_SEEN_CRLF;
+ src++;
+ /* Count the LF we have just skipped, but only while we
+ are still inside the leading ASCII run. */
+ if (! eight_bit_found)
+ coding->head_ascii++;
+ }
+ else
+ coding->eol_seen |= EOL_SEEN_CR;
+ }
+ else if (c == '\n')
+ {
+ coding->eol_seen |= EOL_SEEN_LF;
+ }
+ }
+
if (! eight_bit_found)
coding->head_ascii++;
}
@@ -6353,19 +6461,20 @@
coding_systems
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0;
- for (src = coding->source; src < src_end; src++)
+ if (detect_ascii (coding))
{
- if (*src & 0x80)
- break;
+ /* All bytes are ASCII, so there is no BOM to look for. Take the
+ cdr of CODING_SYSTEMS as the non-ASCII branch does when no
+ signature is found, and check CONSP first as that branch does. */
+ if (CONSP (coding_systems))
+ setup_coding_system (XCDR (coding_systems), coding);
}
- coding->head_ascii = src - coding->source;
- if (CONSP (coding_systems)
- && detect_coding_utf_8 (coding, &detect_info))
+ else
{
- if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
- setup_coding_system (XCAR (coding_systems), coding);
- else
- setup_coding_system (XCDR (coding_systems), coding);
+ if (CONSP (coding_systems)
+ && detect_coding_utf_8 (coding, &detect_info))
+ {
+ if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
+ setup_coding_system (XCAR (coding_systems), coding);
+ else
+ setup_coding_system (XCDR (coding_systems), coding);
+ }
}
}
else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@@ -6378,6 +6487,7 @@
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0;
coding->head_ascii = 0;
+ coding->eol_seen = EOL_SEEN_NONE;
if (CONSP (coding_systems)
&& detect_coding_utf_16 (coding, &detect_info))
{
@@ -6815,7 +6925,7 @@
produced = dst - (coding->destination + coding->produced);
if (BUFFERP (coding->dst_object) && produced_chars > 0)
- insert_from_gap (produced_chars, produced);
+ insert_from_gap (produced_chars, produced, 0);
coding->produced += produced;
coding->produced_char += produced_chars;
return carryover;
@@ -7400,7 +7510,7 @@
} while (coding->consumed_char < coding->src_chars);
if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
- insert_from_gap (coding->produced_char, coding->produced);
+ insert_from_gap (coding->produced_char, coding->produced, 0);
SAFE_FREE ();
}
@@ -7510,39 +7620,45 @@
if (CODING_REQUIRE_DETECTION (coding))
detect_coding (coding);
attrs = CODING_ID_ATTRS (coding->id);
-#ifndef CODING_DISABLE_ASCII_OPTIMIZATION
- if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
- && NILP (CODING_ATTR_POST_READ (attrs))
- && NILP (get_translation_table (attrs, 0, NULL))
- && (inhibit_eol_conversion
- || EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
+ if (! disable_ascii_optimization)
{
- /* We can skip the conversion if all source bytes are ASCII. */
- if (coding->head_ascii < 0)
- {
- /* We have not yet counted the number of ASCII bytes at the
- head of the source. Do it now. */
- const unsigned char *src, *src_end;
-
- coding_set_source (coding);
- src_end = coding->source + coding->src_bytes;
- for (src = coding->source; src < src_end; src++)
- {
- if (*src & 0x80)
- break;
- }
- coding->head_ascii = src - coding->source;
- }
- if (coding->src_bytes == coding->head_ascii)
- {
- /* No need of conversion. Use the data in the gap as is. */
- coding->produced_char = chars;
- coding->produced = bytes;
- adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
+ if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
+ && NILP (CODING_ATTR_POST_READ (attrs))
+ && NILP (get_translation_table (attrs, 0, NULL))
+ && (coding->head_ascii >= 0 /* We've already called detect_coding */
+ ? coding->head_ascii == bytes
+ : detect_ascii (coding)))
+ {
+ if (coding->eol_seen == EOL_SEEN_CR)
+ {
+ /* Every EOL is a lone CR: rewrite each CR to LF in place. The
+ text is the last coding->src_bytes bytes before GAP_END_ADDR. */
+ unsigned char *src_end = GAP_END_ADDR;
+ unsigned char *src = src_end - coding->src_bytes;
+
+ while (src < src_end)
+ {
+ if (*src++ == '\r')
+ src[-1] = '\n';
+ }
+ }
+ else if (coding->eol_seen == EOL_SEEN_CRLF)
+ {
+ /* Every EOL is CR LF: copy the text backward onto itself,
+ dropping the CR of each pair, so that the shortened text
+ still ends at GAP_END_ADDR. */
+ unsigned char *src = GAP_END_ADDR;
+ unsigned char *src_beg = src - coding->src_bytes;
+ unsigned char *dst = src;
+
+ while (src_beg < src)
+ {
+ *--dst = *--src;
+ /* Skip the preceding byte only if it is really a CR inside
+ the text; eol_seen may come from the coding system's EOL
+ type rather than from a scan of the bytes. */
+ if (*src == '\n' && src > src_beg && src[-1] == '\r')
+ src--;
+ }
+ /* DST - SRC is the number of CRs that were dropped. */
+ bytes -= dst - src;
+ }
+ coding->produced_char = coding->produced = bytes;
+ insert_from_gap (bytes, bytes, 1);
return;
}
}
-#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */
code_conversion_save (0, 0);
coding->mode |= CODING_MODE_LAST_BLOCK;
@@ -10758,6 +10874,11 @@
decode text as usual. */);
inhibit_null_byte_detection = 0;
+ DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
+ doc: /* If non-nil, Emacs does not optimize code decoder for
ASCII files.
+Internal use only. Removed after the experimental optimizer gets stable. */);
+ disable_ascii_optimization = 0;
+
DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
doc: /* Char table for translating self-inserting characters.
This is applied to the result of input methods, not their input.
=== modified file 'src/coding.h'
--- a/src/coding.h 2013-03-10 22:55:25 +0000
+++ b/src/coding.h 2013-03-15 16:03:54 +0000
@@ -440,9 +440,13 @@
/* How may heading bytes we can skip for decoding. This is set to
-1 in setup_coding_system, and updated by detect_coding. So,
when this is equal to the byte length of the text being
- converted, we can skip the actual conversion process. */
+ converted, we can skip the actual conversion process except for
+ the eol format. */
ptrdiff_t head_ascii;
+ /* Used internally in coding.c. See the comment of detect_ascii. */
+ int eol_seen;
+
/* The following members are set by encoding/decoding routine. */
ptrdiff_t produced, produced_char, consumed, consumed_char;
=== modified file 'src/insdel.c'
--- a/src/insdel.c 2013-03-11 04:07:45 +0000
+++ b/src/insdel.c 2013-03-15 16:03:54 +0000
@@ -977,10 +977,11 @@
}
/* Insert a sequence of NCHARS chars which occupy NBYTES bytes
- starting at GPT_ADDR. */
+ starting at GAP_END_ADDR - NBYTES (if text_at_gap_tail) and at
+ GPT_ADDR (if not text_at_gap_tail). */
void
-insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
+insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes, bool text_at_gap_tail)
{
if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
nchars = nbytes;
@@ -989,10 +990,13 @@
MODIFF++;
GAP_SIZE -= nbytes;
- GPT += nchars;
+ if (! text_at_gap_tail)
+ {
+ GPT += nchars;
+ GPT_BYTE += nbytes;
+ }
ZV += nchars;
Z += nchars;
- GPT_BYTE += nbytes;
ZV_BYTE += nbytes;
Z_BYTE += nbytes;
if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
@@ -1010,7 +1014,7 @@
current_buffer, 0);
}
- if (GPT - nchars < PT)
+ if (! text_at_gap_tail && GPT - nchars < PT)
adjust_point (nchars, nbytes);
check_markers ();
@@ -1162,16 +1166,14 @@
/* Record undo information and adjust markers and position keepers for
a replacement of a text PREV_TEXT at FROM to a new text of LEN
- chars (LEN_BYTE bytes). If TEXT_AT_GAP_TAIL, the new text
- resides at the gap tail; i.e. at (GAP_END_ADDR - LEN_BYTE)
- Otherwise, the text resides in the gap just after GPT_BYTE.
+ chars (LEN_BYTE bytes) which resides in the gap just after
+ GPT_ADDR.
PREV_TEXT nil means the new text was just inserted. */
-void
+static void
adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte,
- Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte,
- bool text_at_gap_tail)
+ Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte)
{
ptrdiff_t nchars_del = 0, nbytes_del = 0;
@@ -1191,11 +1193,8 @@
GAP_SIZE -= len_byte;
ZV += len; Z+= len;
ZV_BYTE += len_byte; Z_BYTE += len_byte;
- if (! text_at_gap_tail)
- {
- GPT += len; GPT_BYTE += len_byte;
- if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
- }
+ GPT += len; GPT_BYTE += len_byte;
+ if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
if (nchars_del > 0)
adjust_markers_for_replace (from, from_byte, nchars_del, nbytes_del,
@@ -1250,7 +1249,7 @@
GPT -= len; GPT_BYTE -= len_byte;
ZV -= len; ZV_BYTE -= len_byte;
Z -= len; Z_BYTE -= len_byte;
- adjust_after_replace (from, from_byte, Qnil, newlen, len_byte, 0);
+ adjust_after_replace (from, from_byte, Qnil, newlen, len_byte);
}
/* Replace the text from character positions FROM to TO with NEW,
=== modified file 'src/lisp.h'
--- a/src/lisp.h 2013-03-11 04:02:06 +0000
+++ b/src/lisp.h 2013-03-15 16:03:54 +0000
@@ -2880,7 +2880,7 @@
extern void insert_and_inherit (const char *, ptrdiff_t);
extern void insert_1_both (const char *, ptrdiff_t, ptrdiff_t,
bool, bool, bool);
-extern void insert_from_gap (ptrdiff_t, ptrdiff_t);
+extern void insert_from_gap (ptrdiff_t, ptrdiff_t, bool text_at_gap_tail);
extern void insert_from_string (Lisp_Object, ptrdiff_t, ptrdiff_t,
ptrdiff_t, ptrdiff_t, bool);
extern void insert_from_buffer (struct buffer *, ptrdiff_t, ptrdiff_t, bool);
@@ -2900,8 +2900,6 @@
extern void modify_region_1 (ptrdiff_t, ptrdiff_t, bool);
extern void prepare_to_modify_buffer (ptrdiff_t, ptrdiff_t, ptrdiff_t *);
extern void signal_after_change (ptrdiff_t, ptrdiff_t, ptrdiff_t);
-extern void adjust_after_replace (ptrdiff_t, ptrdiff_t, Lisp_Object,
- ptrdiff_t, ptrdiff_t, bool);
extern void adjust_after_insert (ptrdiff_t, ptrdiff_t, ptrdiff_t,
ptrdiff_t, ptrdiff_t);
extern void adjust_markers_for_delete (ptrdiff_t, ptrdiff_t,
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Emacs-diffs] /srv/bzr/emacs/trunk r112051: Optimize ASCII file reading with EOL format detection and decoding.,
K. Handa <=