branch master updated: Include file name encoding for XS parser

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: Include file name encoding for XS parser

From:	Gavin D. Smith
Subject:	branch master updated: Include file name encoding for XS parser
Date:	Thu, 24 Feb 2022 14:30:46 -0500
This is an automated email from the git hooks/post-receive script.

gavin pushed a commit to branch master
in repository texinfo.

The following commit(s) were added to refs/heads/master by this push:
     new 46732a3290 Include file name encoding for XS parser
46732a3290 is described below

commit 46732a329061e79e2b075494a4a07ec9ded49ac9
Author: Gavin Smith <gavinsmith0123@gmail.com>
AuthorDate: Thu Feb 24 19:30:33 2022 +0000

    Include file name encoding for XS parser
    
    * tp/Texinfo/XS/parsetexi/input.c
    (set_input_encoding): Save input encoding name.
    (encode_file_name): New function to re-encode using the input
    encoding.
    (encode_with_iconv): Split out from convert_to_utf8.
    
    * tp/Texinfo/XS/parsetexi/end_line.c (end_line_misc_line):
    Call it before calling locate_include_file.
    
    Suggestion from Patrice.
---
 ChangeLog                          |  15 +++++
 tp/Texinfo/XS/parsetexi/end_line.c |   5 +-
 tp/Texinfo/XS/parsetexi/input.c    | 124 ++++++++++++++++++++++++-------------
 tp/Texinfo/XS/parsetexi/input.h    |   2 +
 4 files changed, 102 insertions(+), 44 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ebe4ed66a7..3d46c554fc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2022-02-24  Gavin Smith  <gavinsmith0123@gmail.com>
+
+       Include file name encoding for XS parser
+
+       * tp/Texinfo/XS/parsetexi/input.c
+       (set_input_encoding): Save input encoding name.
+       (encode_file_name): New function to re-encode using the input
+       encoding.
+       (encode_with_iconv): Split out from convert_to_utf8.
+
+       * tp/Texinfo/XS/parsetexi/end_line.c (end_line_misc_line):
+       Call it before calling locate_include_file.
+
+       Suggestion from Patrice.
+
 2022-02-24  Gavin Smith  <gavinsmith0123@gmail.com>
 
        Fix XS configure check
diff --git a/tp/Texinfo/XS/parsetexi/end_line.c b/tp/Texinfo/XS/parsetexi/end_line.c
index 168d33304f..b1fd088ab2 100644
--- a/tp/Texinfo/XS/parsetexi/end_line.c
+++ b/tp/Texinfo/XS/parsetexi/end_line.c
@@ -1425,7 +1425,10 @@ end_line_misc_line (ELEMENT *current)
               char *fullpath;
               debug ("Include %s", text);
 
-              fullpath = locate_include_file (text);
+              char *sys_filename = encode_file_name (text);
+              fullpath = locate_include_file (sys_filename);
+              free (sys_filename);
+
               if (!fullpath)
                 {
                   command_error (current,
diff --git a/tp/Texinfo/XS/parsetexi/input.c b/tp/Texinfo/XS/parsetexi/input.c
index 2199db9aec..a33f1b2e63 100644
--- a/tp/Texinfo/XS/parsetexi/input.c
+++ b/tp/Texinfo/XS/parsetexi/input.c
@@ -52,9 +52,13 @@ typedef struct {
 
 enum character_encoding input_encoding;
 
+static char *input_encoding_name;
+
 void
 set_input_encoding (char *encoding)
 {
+  free (input_encoding_name); input_encoding_name = strdup (encoding);
+
   if (!strcasecmp (encoding, "utf-8"))
     input_encoding = ce_utf8;
   else if (!strcmp (encoding, "iso-8859-1")
@@ -162,16 +166,61 @@ text_buffer_iconv (TEXT *buf, iconv_t iconv_state,
 }
 
 
+static char *
+encode_with_iconv (iconv_t our_iconv,  char *s)
+{
+  static TEXT t;
+  ICONV_CONST char *inptr; size_t bytes_left;
+  size_t iconv_ret;
+
+  t.end = 0;
+  inptr = s;
+  bytes_left = strlen (s);
+  text_alloc (&t, 10);
+
+  while (1)
+    {
+      iconv_ret = text_buffer_iconv (&t, our_iconv,
+                                     &inptr, &bytes_left);
+
+      /* Make sure libiconv flushes out the last converted character.
+         This is required when the conversion is stateful, in which
+         case libiconv might not output the last character, waiting to
+         see whether it should be combined with the next one.  */
+      if (iconv_ret != (size_t) -1
+          && text_buffer_iconv (&t, our_iconv, 0, 0) != (size_t) -1)
+        /* Success: all of input converted. */
+        break;
+
+      if (bytes_left == 0)
+        break;
+
+      switch (errno)
+        {
+        case E2BIG:
+          text_alloc (&t, t.space + 20);
+          break;
+        case EILSEQ:
+        default:
+          fprintf(stderr, "%s:%d: encoding error at byte 0x%2x\n",
+            line_nr.file_name, line_nr.line_nr, *(unsigned char *)inptr);
+          inptr++; bytes_left--;
+          break;
+        }
+    }
+
+  t.text[t.end] = '\0';
+  return strdup (t.text);
+}
+
 /* Return conversion of S according to input_encoding.  This function
    frees S. */
 static char *
 convert_to_utf8 (char *s)
 {
   iconv_t our_iconv = (iconv_t) -1;
-  static TEXT t;
-  ICONV_CONST char *inptr; size_t bytes_left;
-  size_t iconv_ret;
   enum character_encoding enc;
+  char *ret;
 
   /* Convert from @documentencoding to UTF-8.
      It might be possible not to convert to UTF-8 and use an 8-bit encoding
@@ -180,7 +229,8 @@ convert_to_utf8 (char *s)
      file, then we'd have to keep track of which strings needed the UTF-8 flag
      and which didn't. */
 
-  /* Initialize conversions for the first time. */
+  /* Initialize conversions for the first time.  iconv_open returns
+     (iconv_t) -1 on failure so these should only be called once. */
   if (iconv_validate_utf8 == (iconv_t) 0)
     iconv_validate_utf8 = iconv_open ("UTF-8", "UTF-8");
   if (iconv_from_latin1 == (iconv_t) 0)
@@ -229,47 +279,37 @@ convert_to_utf8 (char *s)
       return s;
     }
 
-  t.end = 0;
-  inptr = s;
-  bytes_left = strlen (s);
-  text_alloc (&t, 10);
-
-  while (1)
-    {
-      iconv_ret = text_buffer_iconv (&t, our_iconv,
-                                     &inptr, &bytes_left);
-
-      /* Make sure libiconv flushes out the last converted character.
-         This is required when the conversion is stateful, in which
-         case libiconv might not output the last character, waiting to
-         see whether it should be combined with the next one.  */
-      if (iconv_ret != (size_t) -1
-          && text_buffer_iconv (&t, our_iconv, 0, 0) != (size_t) -1)
-        /* Success: all of input converted. */
-        break;
+  ret = encode_with_iconv (our_iconv, s);
+  free (s);
+  return ret;
+}
 
-      if (bytes_left == 0)
-        break;
+static iconv_t reverse_iconv;
 
-      switch (errno)
+/* Reverse the decoding of the filename to the input encoding, to retrieve
+   the bytes that were present in the original Texinfo file.  Return
+   value to be freed by caller. */
+char *
+encode_file_name (char *filename)
+{
+  if (input_encoding != ce_utf8 && !reverse_iconv)
+    {
+      if (input_encoding_name)
         {
-        case E2BIG:
-          text_alloc (&t, t.space + 20);
-          break;
-        case EILSEQ:
-        default:
-          fprintf(stderr, "%s:%d: encoding error at byte 0x%2x\n",
-            line_nr.file_name, line_nr.line_nr, *(unsigned char *)inptr);
-          inptr++; bytes_left--;
-          break;
+          reverse_iconv = iconv_open (input_encoding_name, "UTF-8");
         }
     }
-
-  free (s);
-  t.text[t.end] = '\0';
-  return strdup (t.text);
+  if (reverse_iconv && reverse_iconv != (iconv_t) -1)
+    {
+      return encode_with_iconv (reverse_iconv, filename);
+    }
+  else
+    {
+      return strdup (filename);
+    }
 }
 
+
 int
 expanding_macro (char *macro)
 {
@@ -537,11 +577,9 @@ locate_include_file (char *filename)
   struct stat dummy;
   int i, status;
 
-  /* Checks if filename is absolute or relative to current directory.
-     TODO: Could use macros in top-level config.h for this. */
-  /* TODO: The Perl code (in Common.pm, 'locate_include_file') handles 
-     a volume in a path (like "A:"), possibly more general treatment 
-     with File::Spec module. */
+  /* Checks if filename is absolute or relative to current directory. */
+  /* Note: the Perl code (in Common.pm, 'locate_include_file') handles 
+     a volume in a path (like "A:") using the File::Spec module. */
   if (!memcmp (filename, "/", 1)
       || !memcmp (filename, "../", 3)
       || !memcmp (filename, "./", 2))
diff --git a/tp/Texinfo/XS/parsetexi/input.h b/tp/Texinfo/XS/parsetexi/input.h
index 82dfc7a59d..9bfc7085ef 100644
--- a/tp/Texinfo/XS/parsetexi/input.h
+++ b/tp/Texinfo/XS/parsetexi/input.h
@@ -16,7 +16,9 @@ int input_push_file (char *filename);
 void input_reset_input_stack (void);
 int expanding_macro (char *macro);
 int top_file_index (void);
+
 char *locate_include_file (char *filename);
+char *encode_file_name (char *filename);
 void set_input_encoding (char *encoding);
 void add_include_directory (char *filename);
 void clear_include_directories (void);
[Prev in Thread]
Current Thread
[Next in Thread]
branch master updated: Include file name encoding for XS parser, Gavin D. Smith <=
Prev by Date: branch master updated: Fix XS configure check
Next by Date: branch master updated: One function in Texinfo::Common to handle file name encoding
Previous by thread: branch master updated: Fix XS configure check
Next by thread: branch master updated: One function in Texinfo::Common to handle file name encoding
Index(es):
- Date
- Thread