texinfo-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

branch master updated: UTF-8 flag on strings for XS parser


From: Gavin D. Smith
Subject: branch master updated: UTF-8 flag on strings for XS parser
Date: Fri, 25 Feb 2022 12:39:33 -0500

This is an automated email from the git hooks/post-receive script.

gavin pushed a commit to branch master
in repository texinfo.

The following commit(s) were added to refs/heads/master by this push:
     new eac8797369 UTF-8 flag on strings for XS parser
eac8797369 is described below

commit eac879736912372fba273b560a332e33c73bb2c8
Author: Gavin Smith <gavinsmith0123@gmail.com>
AuthorDate: Fri Feb 25 17:39:24 2022 +0000

    UTF-8 flag on strings for XS parser
    
    * tp/Texinfo/XS/parsetexi/api.c (newSVpv_utf8): New function.
    (element_to_perl_hash, build_single_index_data, build_line_nr_hash)
    (convert_error): Use it in many more places where the string being
    created should be "Perl-internal".  Suggestion from Patrice.
---
 ChangeLog                                          |  9 ++++
 tp/Texinfo/XS/parsetexi/api.c                      | 60 +++++++++++-----------
 tp/t/results/include/cpp_lines.pl                  |  2 +-
 .../non_ascii_command_line/Chapteur.html           |  3 +-
 .../non_ascii_command_line/os\303\251.2"           |  3 +-
 5 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 68261ceb60..d6cbab2945 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2022-02-24  Gavin Smith  <gavinsmith0123@gmail.com>
+
+       UTF-8 flag on strings for XS parser
+
+       * tp/Texinfo/XS/parsetexi/api.c (newSVpv_utf8): New function.
+       (element_to_perl_hash, build_single_index_data, build_line_nr_hash)
+       (convert_error): Use it in many more places where the string being
+       created should be "Perl-internal".  Suggestion from Patrice.
+
 2022-02-24  Patrice Dumas  <pertusus@free.fr>
 
        One function in Texinfo::Common to handle file name encoding
diff --git a/tp/Texinfo/XS/parsetexi/api.c b/tp/Texinfo/XS/parsetexi/api.c
index 1b06962e3e..3ed7314cbb 100644
--- a/tp/Texinfo/XS/parsetexi/api.c
+++ b/tp/Texinfo/XS/parsetexi/api.c
@@ -293,6 +293,19 @@ build_node_spec (NODE_SPEC_EXTRA *value)
   return newRV_inc ((SV *)hv);
 }
 
+/* Used to create a "Perl-internal" string that represents a sequence
+   of Unicode codepoints with no specific encoding. */
+static SV *
+newSVpv_utf8 (char *str, STRLEN len)
+{
+  SV *sv;
+  dTHX;
+
+  sv = newSVpv (str, len);
+  SvUTF8_on (sv);
+  return sv;
+}
+
 /* Set E->hv and 'hv' on E's descendants.  e->parent->hv is assumed
    to already exist. */
 static void
@@ -402,22 +415,11 @@ element_to_perl_hash (ELEMENT *e)
 
   if (e->text.space > 0)
     {
-      sv = newSVpv (e->text.text, e->text.end);
+      sv = newSVpv_utf8 (e->text.text, e->text.end);
       if (e->cmd != CM_value)
         hv_store (e->hv, "text", strlen ("text"), sv, 0);
       else
         hv_store (e->hv, "type", strlen ("type"), sv, 0);
-
-      SvUTF8_on (sv);
-      /* The strings here have to be in UTF-8 to start with.
-         This leads to an unnecessary round trip with "@documentencoding 
-         ISO-8859-1" for Info and plain text output, when we first convert the 
-         characters in the input file to UTF-8, and convert them back again for 
-         the output.
-      
-         The alternative is to leave the UTF-8 flag off, and hope that Perl 
-         interprets 8-bit encodings like ISO-8859-1 correctly.  See
-         "How does Perl store UTF-8 strings?" in "man perlguts". */
     }
 
   if (e->extra_number > 0)
@@ -483,7 +485,7 @@ element_to_perl_hash (ELEMENT *e)
             case extra_string:
               { /* A simple string. */
               char *value = (char *) f;
-              STORE(newSVpv (value, 0));
+              STORE(newSVpv_utf8 (value, 0));
               break;
               }
             case extra_integer:
@@ -505,15 +507,14 @@ element_to_perl_hash (ELEMENT *e)
                 {
                   if (f->contents.list[j]->text.end > 0)
                     {
-                      av_push (av,
-                               newSVpv (f->contents.list[j]->text.text,
-                                        f->contents.list[j]->text.end));
+                      SV *sv = newSVpv_utf8 (f->contents.list[j]->text.text,
+                                             f->contents.list[j]->text.end);
+                      av_push (av, sv);
                     }
                   else
                     {
                       /* Empty strings permitted. */
-                      av_push (av,
-                               newSVpv ("", 0));
+                      av_push (av, newSVpv ("", 0));
                     }
                 }
               break;
@@ -577,8 +578,10 @@ element_to_perl_hash (ELEMENT *e)
                 hv_store (type, "content", strlen ("content"),
                           build_perl_array (&eft->content->contents), 0);
               if (eft->normalized)
-                hv_store (type, "normalized", strlen ("normalized"),
-                          newSVpv (eft->normalized, 0), 0);
+                {
+                  SV *sv = newSVpv_utf8 (eft->normalized, 0);
+                  hv_store (type, "normalized", strlen ("normalized"), sv, 0);
+                }
               STORE(newRV_inc ((SV *)type));
               break;
               }
@@ -617,7 +620,7 @@ element_to_perl_hash (ELEMENT *e)
 
       if (line_nr->macro)
         {
-          STORE("macro", newSVpv (line_nr->macro, 0));
+          STORE("macro", newSVpv_utf8 (line_nr->macro, 0));
         }
       else
         STORE("macro", newSVpv ("", 0));
@@ -745,7 +748,7 @@ build_single_index_data (INDEX *i)
       hv = (HV *) i->hv;
     }
 
-  STORE("name", newSVpv (i->name, 0));
+  STORE("name", newSVpv_utf8 (i->name, 0));
   STORE("in_code", i->in_code ? newSViv(1) : newSViv(0));
 
   if (i->merged_in)
@@ -767,7 +770,7 @@ build_single_index_data (INDEX *i)
       hv_store (ultimate->contained_hv, i->name, strlen (i->name),
                 newSViv (1), 0);
 
-      STORE("merged_in", newSVpv (ultimate->name, 0));
+      STORE("merged_in", newSVpv_utf8 (ultimate->name, 0));
 
       if (i->contained_hv)
         {
@@ -809,7 +812,7 @@ build_single_index_data (INDEX *i)
       e = &i->index_entries[j];
       entry = newHV ();
 
-      STORE2("index_name", newSVpv (i->name, 0));
+      STORE2("index_name", newSVpv_utf8 (i->name, 0));
       STORE2("index_at_command",
              newSVpv (command_name(e->index_at_command), 0));
       STORE2("index_type_command",
@@ -860,7 +863,7 @@ build_single_index_data (INDEX *i)
       if (e->node)
         STORE2("node", newRV_inc ((SV *)e->node->hv));
       if (e->sortas)
-        STORE2("sortas", newSVpv (e->sortas, 0));
+        STORE2("sortas", newSVpv_utf8 (e->sortas, 0));
 
       /* Create ignored_chars hash. */
       {
@@ -1124,12 +1127,12 @@ build_line_nr_hash (LINE_NR line_nr)
   if (line_nr.macro)
     {
       hv_store (hv, "macro", strlen ("macro"),
-                newSVpv (line_nr.macro, 0), 0);
+                newSVpv_utf8 (line_nr.macro, 0), 0);
     }
   else
     {
       hv_store (hv, "macro", strlen ("macro"),
-                newSVpv ("", 0), 0);
+                newSVpv_utf8 ("", 0), 0);
     }
 
   return newRV_inc ((SV *) hv);
@@ -1147,8 +1150,7 @@ convert_error (int i)
   e = error_list[i];
   hv = newHV ();
 
-  msg = newSVpv (e.message, 0);
-  SvUTF8_on (msg);
+  msg = newSVpv_utf8 (e.message, 0);
 
   hv_store (hv, "message", strlen ("message"), msg, 0);
   hv_store (hv, "type", strlen ("type"),
diff --git a/tp/t/results/include/cpp_lines.pl b/tp/t/results/include/cpp_lines.pl
index 3b942f5488..2acca375d6 100644
--- a/tp/t/results/include/cpp_lines.pl
+++ b/tp/t/results/include/cpp_lines.pl
@@ -704,7 +704,7 @@ $result_trees{'cpp_lines'} = {
           'cmdname' => 'documentlanguage',
           'extra' => {
             'spaces_before_argument' => ' ',
-            'text_arg' => 'làng'
+            'text_arg' => "l\x{e0}ng"
           },
           'line_nr' => {
             'file_name' => 'accentêd',
diff --git a/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html b/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html
index 71f800ef1a..e7ee8b9acd 100644
--- a/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html
+++ b/tp/tests/formatting/res_parser/non_ascii_command_line/Chapteur.html
@@ -69,7 +69,8 @@ ul.mark-néni {list-style-type: "vàça"}
 <img class="image" src="dîrectory/imàge.êxt" alt="âlt">
 
 
-
+<pre class="verbatim">In included téxt.
+</pre>
 </div>
 <hr>
 <p>
diff --git "a/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2" "b/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2"
index 054aa9681a..eeddb28fa3 100644
--- "a/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2"
+++ "b/tp/tests/formatting/res_parser/non_ascii_command_line/os\303\251.2"
@@ -3,5 +3,4 @@ texi2any: warning: Destruktïw is not a valid language code
 texi2any: warning: unknown variable from command line: Kommandöh
 osé.texi:23: @include: could not find not_existïng.téxi
 osé.texi:21: warning: @image file `dîrectory/imàge' (for HTML) not found, using `dîrectory/imàge.êxt'
-osé.texi:25: @verbatiminclude: could not find included_akçentêd.texi
-osé.texi:27: @verbatiminclude: could not find vi_not_existïng.téxi
+osé.texi:27: @verbatiminclude: could not find vi_not_existïng.téxi



reply via email to

[Prev in Thread] Current Thread [Next in Thread]