[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-71-g1260fd
From: |
Andy Wingo |
Subject: |
[Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-71-g1260fd0 |
Date: |
Wed, 30 Jan 2013 12:57:00 +0000 |
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".
http://git.savannah.gnu.org/cgit/guile.git/commit/?id=1260fd0b2c4ce1d0d7e7b17df924c245f67f9058
The branch, stable-2.0 has been updated
via 1260fd0b2c4ce1d0d7e7b17df924c245f67f9058 (commit)
via 7e0f26eb0d5a9316daad680f62168beffd050632 (commit)
via b2cb557d75e4daf8c7c8cd43313f4cc51d9a3f1b (commit)
from a8fa310b0493cd2e88a7d7f08b1ee3183a81b455 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 1260fd0b2c4ce1d0d7e7b17df924c245f67f9058
Author: Andy Wingo <address@hidden>
Date: Wed Jan 30 13:55:59 2013 +0100
quick fix to ssax.scm
* module/sxml/ssax.scm: Fix previous commit.
commit 7e0f26eb0d5a9316daad680f62168beffd050632
Author: Andy Wingo <address@hidden>
Date: Wed Jan 30 13:52:47 2013 +0100
fix ssax:warn to work as intended
* module/sxml/ssax.scm (ssax:warn): Fix to make more sense.
commit b2cb557d75e4daf8c7c8cd43313f4cc51d9a3f1b
Author: Andy Wingo <address@hidden>
Date: Wed Jan 30 10:17:25 2013 +0100
detect and consume byte-order marks for textual ports
* libguile/ports.h:
* libguile/ports.c (scm_consume_byte_order_mark): New procedure.
* libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we
are opening a file in "r" mode.
* libguile/read.c (scm_i_scan_for_encoding): Don't do anything about
byte-order marks.
* libguile/load.c (scm_primitive_load): Add a note about the duplicate
encoding scan.
* test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and
UTF-16LE BOM handling.
-----------------------------------------------------------------------
Summary of changes:
libguile/fports.c | 35 ++++++++++-------
libguile/load.c | 3 +
libguile/ports.c | 85 ++++++++++++++++++++++++++++++++++++++++-
libguile/ports.h | 3 +-
libguile/read.c | 14 +------
module/sxml/ssax.scm | 9 +++-
test-suite/tests/filesys.test | 59 ++++++++++++++++++++++++++++-
7 files changed, 175 insertions(+), 33 deletions(-)
diff --git a/libguile/fports.c b/libguile/fports.c
index 10cf671..fbc0530 100644
--- a/libguile/fports.c
+++ b/libguile/fports.c
@@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- * 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation,
Inc.
+ * 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software
Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@@ -399,7 +399,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
#define FUNC_NAME s_scm_open_file
{
SCM port;
- int fdes, flags = 0, use_encoding = 1;
+ int fdes, flags = 0, scan_for_encoding = 0, consume_bom = 0, binary = 0;
unsigned int retries;
char *file, *md, *ptr;
@@ -415,6 +415,8 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
{
case 'r':
flags |= O_RDONLY;
+ consume_bom = 1;
+ scan_for_encoding = 1;
break;
case 'w':
flags |= O_WRONLY | O_CREAT | O_TRUNC;
@@ -432,9 +434,12 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
{
case '+':
flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR;
+ consume_bom = 0;
break;
case 'b':
- use_encoding = 0;
+ scan_for_encoding = 0;
+ consume_bom = 0;
+ binary = 1;
#if defined (O_BINARY)
flags |= O_BINARY;
#endif
@@ -473,21 +478,21 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0,
port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode),
fport_canonicalize_filename (filename));
- if (use_encoding)
- {
- /* If this file has a coding declaration, use that as the port
- encoding. */
- if (SCM_INPUT_PORT_P (port))
- {
- char *enc = scm_i_scan_for_encoding (port);
- if (enc != NULL)
- scm_i_set_port_encoding_x (port, enc);
- }
- }
- else
+ if (consume_bom)
+ scm_consume_byte_order_mark (port);
+
+ if (binary)
/* If this is a binary file, use the binary-friendly ISO-8859-1
encoding. */
scm_i_set_port_encoding_x (port, NULL);
+ else if (scan_for_encoding)
+ /* If this is an input port and the file has a coding declaration,
+ use that as the port encoding. */
+ {
+ char *enc = scm_i_scan_for_encoding (port);
+ if (enc != NULL)
+ scm_i_set_port_encoding_x (port, enc);
+ }
scm_dynwind_end ();
diff --git a/libguile/load.c b/libguile/load.c
index 84b6705..476461c 100644
--- a/libguile/load.c
+++ b/libguile/load.c
@@ -106,6 +106,9 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
scm_i_dynwind_current_load_port (port);
+ /* FIXME: For better or for worse, scm_open_file already scans the
+ file for an encoding. This scans again; necessary for this
+ logic, but unnecessary overall. */
encoding = scm_i_scan_for_encoding (port);
if (encoding)
scm_i_set_port_encoding_x (port, encoding);
diff --git a/libguile/ports.c b/libguile/ports.c
index 55808e2..9b1be9b 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
- * 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ * 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation,
Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x,
"set-port-filename!", 2, 0, 0,
}
#undef FUNC_NAME
+SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0,
+ (SCM port),
+ "Peek ahead in @var{port} for a byte-order mark (\\uFEFF)
encoded\n"
+ "in UTF-8 or in UTF-16. If found, consume the byte-order mark\n"
+ "and set the port to the indicated encoding.\n"
+ "\n"
+ "As a special case, if the port's encoding is already UTF-16LE\n"
+ "or UTF-16BE (as opposed to UTF-16), we consider that the user\n"
+ "has already asked for an explicit byte order. In this case no\n"
+ "scan is performed, and the byte-order mark (if any) is left in\n"
+ "the port.\n"
+ "\n"
+ "Return @code{#t} if a byte-order mark was consumed, and\n"
+ "@code{#f} otherwise.")
+#define FUNC_NAME s_scm_consume_byte_order_mark
+{
+ scm_t_port *pt;
+ const char *enc;
+
+ SCM_VALIDATE_PORT (1, port);
+
+ pt = SCM_PTAB_ENTRY (port);
+ enc = pt->encoding;
+
+ if (enc && strcasecmp (enc, "UTF-16BE") == 0)
+ return SCM_BOOL_F;
+
+ if (enc && strcasecmp (enc, "UTF-16LE") == 0)
+ return SCM_BOOL_F;
+
+ switch (scm_peek_byte_or_eof (port))
+ {
+ case 0xEF:
+ scm_get_byte_or_eof (port);
+ switch (scm_peek_byte_or_eof (port))
+ {
+ case 0xBB:
+ scm_get_byte_or_eof (port);
+ switch (scm_peek_byte_or_eof (port))
+ {
+ case 0xBF:
+ scm_get_byte_or_eof (port);
+ scm_i_set_port_encoding_x (port, "UTF-8");
+ return SCM_BOOL_T;
+ default:
+ scm_unget_byte (0xBB, port);
+ scm_unget_byte (0xEF, port);
+ return SCM_BOOL_F;
+ }
+ default:
+ scm_unget_byte (0xEF, port);
+ return SCM_BOOL_F;
+ }
+ case 0xFE:
+ scm_get_byte_or_eof (port);
+ switch (scm_peek_byte_or_eof (port))
+ {
+ case 0xFF:
+ scm_get_byte_or_eof (port);
+ scm_i_set_port_encoding_x (port, "UTF-16BE");
+ return SCM_BOOL_T;
+ default:
+ scm_unget_byte (0xFE, port);
+ return SCM_BOOL_F;
+ }
+ case 0xFF:
+ scm_get_byte_or_eof (port);
+ switch (scm_peek_byte_or_eof (port))
+ {
+ case 0xFE:
+ scm_get_byte_or_eof (port);
+ scm_i_set_port_encoding_x (port, "UTF-16LE");
+ return SCM_BOOL_T;
+ default:
+ scm_unget_byte (0xFF, port);
+ return SCM_BOOL_F;
+ }
+ default:
+ return SCM_BOOL_F;
+ }
+}
+#undef FUNC_NAME
+
/* A fluid specifying the default encoding for newly created ports. If it is
a string, that is the encoding. If it is #f, it is in the "native"
(Latin-1) encoding. */
diff --git a/libguile/ports.h b/libguile/ports.h
index d4d59b7..2f32345 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -4,7 +4,7 @@
#define SCM_PORTS_H
/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
- * 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ * 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@@ -303,6 +303,7 @@ SCM_API SCM scm_port_column (SCM port);
SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
SCM_API SCM scm_port_filename (SCM port);
SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
+SCM_API SCM scm_consume_byte_order_mark (SCM port);
SCM_INTERNAL const char *scm_i_default_port_encoding (void);
SCM_INTERNAL void scm_i_set_default_port_encoding (const char *);
SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
diff --git a/libguile/read.c b/libguile/read.c
index 222891b..a8f7744 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
- * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ * 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@@ -1985,7 +1985,6 @@ scm_i_scan_for_encoding (SCM port)
char header[SCM_ENCODING_SEARCH_SIZE+1];
size_t bytes_read, encoding_length, i;
char *encoding = NULL;
- int utf8_bom = 0;
char *pos, *encoding_start;
int in_comment;
@@ -2027,13 +2026,9 @@ scm_i_scan_for_encoding (SCM port)
bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
header[bytes_read] = '\0';
- scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+ scm_seek (port, scm_from_int (-bytes_read), scm_from_int (SEEK_CUR));
}
- if (bytes_read > 3
- && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
- utf8_bom = 1;
-
/* search past "coding[:=]" */
pos = header;
while (1)
@@ -2102,11 +2097,6 @@ scm_i_scan_for_encoding (SCM port)
/* This wasn't in a comment */
return NULL;
- if (utf8_bom && strcmp(encoding, "UTF-8"))
- scm_misc_error (NULL,
- "the port input declares the encoding ~s but is encoded as
UTF-8",
- scm_list_1 (scm_from_locale_string (encoding)));
-
return encoding;
}
diff --git a/module/sxml/ssax.scm b/module/sxml/ssax.scm
index 474247b..f750c93 100644
--- a/module/sxml/ssax.scm
+++ b/module/sxml/ssax.scm
@@ -180,9 +180,12 @@
(parameterize ((current-ssax-error-port port))
(thunk)))
-(define (ssax:warn port msg . args)
- (format (current-ssax-error-port)
- ";;; SSAX warning: ~a ~a\n" msg args))
+(define (ssax:warn port . args)
+ (with-output-to-port (current-ssax-error-port)
+ (lambda ()
+ (display ";;; SSAX warning: ")
+ (for-each display args)
+ (newline))))
(define (ucscode->string codepoint)
(string (integer->char codepoint)))
diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test
index a6bfb6e..8bd974d 100644
--- a/test-suite/tests/filesys.test
+++ b/test-suite/tests/filesys.test
@@ -1,6 +1,6 @@
;;;; filesys.test --- test file system functions -*- scheme -*-
;;;;
-;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@@ -17,6 +17,8 @@
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
USA
(define-module (test-suite test-filesys)
+ #:use-module (ice-9 rdelim)
+ #:use-module (ice-9 binary-ports)
#:use-module (test-suite lib)
#:use-module (test-suite guile-test))
@@ -127,3 +129,58 @@
(delete-file (test-file))
(delete-file (test-symlink))
+
+(let ((s "\ufeffHello, world!"))
+ (define* (test-encoding encoding #:optional (ambient "ISO-8859-1"))
+ (with-fluids ((%default-port-encoding ambient))
+ (let* ((bytes (catch 'misc-error
+ (lambda ()
+ (call-with-values open-bytevector-output-port
+ (lambda (port get-bytevector)
+ (set-port-encoding! port encoding)
+ (display s port)
+ (get-bytevector))))
+ (lambda args
+ (throw 'unresolved))))
+ (name (string-copy "myfile-XXXXXX"))
+ (port (mkstemp! name)))
+ (put-bytevector port bytes)
+ (close-port port)
+ (let ((contents (call-with-input-file name read-string)))
+ (delete-file name)
+ contents))))
+
+ (pass-if "UTF-8"
+ (equal? (test-encoding "UTF-8")
+ "Hello, world!"))
+
+ (pass-if "UTF-16BE"
+ (equal? (test-encoding "UTF-16BE")
+ "Hello, world!"))
+
+ (pass-if "UTF-16LE"
+ (equal? (test-encoding "UTF-16LE")
+ "Hello, world!"))
+
+ (pass-if "UTF-8 (ambient)"
+ (equal? (test-encoding "UTF-8" "UTF-8")
+ "Hello, world!"))
+
+ (pass-if "UTF-8 (UTF-16 ambient)"
+ (equal? (test-encoding "UTF-8" "UTF-16")
+ "Hello, world!"))
+
+ ;; Unicode 6.2 section 16.8:
+ ;;
+ ;; For compatibility with versions of the Unicode Standard prior to
+ ;; Version 3.2, the code point U+FEFF has the word-joining semantics
+ ;; of zero width no-break space when it is not used as a BOM. [...]
+ ;;
+ ;; Where the byte order is explicitly specified, such as in UTF-16BE
+ ;; or UTF-16LE, then all U+FEFF characters -- even at the very
+ ;; beginning of the text -- are to be interpreted as zero width
+ ;; no-break spaces.
+ ;;
+ (pass-if "UTF-16LE (ambient)"
+ (equal? (test-encoding "UTF-16LE" "UTF-16LE")
+ "\ufeffHello, world!")))
hooks/post-receive
--
GNU Guile
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Guile-commits] GNU Guile branch, stable-2.0, updated. v2.0.7-71-g1260fd0,
Andy Wingo <=