[6051] parsetexi more complete node name normalization

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[6051] parsetexi more complete node name normalization

From:	Gavin D. Smith
Subject:	[6051] parsetexi more complete node name normalization
Date:	Mon, 19 Jan 2015 17:24:35 +0000
Revision: 6051
          http://svn.sv.gnu.org/viewvc/?view=rev&root=texinfo&revision=6051
Author:   gavin
Date:     2015-01-19 17:24:34 +0000 (Mon, 19 Jan 2015)
Log Message:
-----------
parsetexi more complete node name normalization

Modified Paths:
--------------
    trunk/parsetexi/ChangeLog
    trunk/parsetexi/command_data.awk
    trunk/parsetexi/command_ids.h
    trunk/parsetexi/convert.c

Modified: trunk/parsetexi/ChangeLog
===================================================================
--- trunk/parsetexi/ChangeLog   2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/ChangeLog   2015-01-19 17:24:34 UTC (rev 6051)
@@ -1,3 +1,11 @@
+2015-01-19  Gavin Smith  <address@hidden>
+
+       * command_data.awk: Output #define's into generated 
+       command_ids.h file to give more readable alternatives for the 
+       CM_hex_?? enum command_id values.
+       * convert.c (convert_to_normalized): More complete 
+       implementation.
+
 2015-01-18  Gavin Smith  <address@hidden>
 
        * tree_types.h (INDEX_ENTRY_REF): New type.

Modified: trunk/parsetexi/command_data.awk
===================================================================
--- trunk/parsetexi/command_data.awk    2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/command_data.awk    2015-01-19 17:24:34 UTC (rev 6051)
@@ -69,6 +69,31 @@
     }
 
     print "/* This file automatically generated by command_data.awk */"
+    print
+    print "/* Useful aliases */"
+    print "#define CM_hex_09 CM_TAB"
+    print "#define CM_hex_0a CM_NEWLINE"
+    print "#define CM_hex_20 CM_SPACE"
+    print "#define CM_hex_21 CM_EXCLAMATION_MARK"
+    print "#define CM_hex_22 CM_POUND_SIGN"
+    print "#define CM_hex_27 CM_BACKQUOTE"
+    print "#define CM_hex_2a CM_ASTERISK"
+    print "#define CM_hex_2c CM_COMMA"
+    print "#define CM_hex_2d CM_HYPHEN"
+    print "#define CM_hex_2e CM_FULL_STOP"
+    print "#define CM_hex_2f CM_SLASH"
+    print "#define CM_hex_3a CM_COLON"
+    print "#define CM_hex_3d CM_EQUALS"
+    print "#define CM_hex_3f CM_QUESTION_MARK"
+    print "#define CM_hex_40 CM_AT_SIGN"
+    print "#define CM_hex_5c CM_BACKSLASH"
+    print "#define CM_hex_5e CM_CIRCUMFLEX"
+    print "#define CM_hex_60 CM_BACKQUOTE"
+    print "#define CM_hex_7b CM_OPEN_BRACE"
+    print "#define CM_hex_7c CM_VERTICAL_BAR"
+    print "#define CM_hex_7d CM_CLOSE_BRACE"
+    print "#define CM_hex_7e CM_TILDE"
+    print
     print "enum command_id {"
     print "CM_NONE,"
     print

Modified: trunk/parsetexi/command_ids.h
===================================================================
--- trunk/parsetexi/command_ids.h       2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/command_ids.h       2015-01-19 17:24:34 UTC (rev 6051)
@@ -1,4 +1,29 @@
 /* This file automatically generated by command_data.awk */
+
+/* Useful aliases */
+#define CM_hex_09 CM_TAB
+#define CM_hex_0a CM_NEWLINE
+#define CM_hex_20 CM_SPACE
+#define CM_hex_21 CM_EXCLAMATION_MARK
+#define CM_hex_22 CM_POUND_SIGN
+#define CM_hex_27 CM_APOSTROPHE
+#define CM_hex_2a CM_ASTERISK
+#define CM_hex_2c CM_COMMA
+#define CM_hex_2d CM_HYPHEN
+#define CM_hex_2e CM_FULL_STOP
+#define CM_hex_2f CM_SLASH
+#define CM_hex_3a CM_COLON
+#define CM_hex_3d CM_EQUALS
+#define CM_hex_3f CM_QUESTION_MARK
+#define CM_hex_40 CM_AT_SIGN
+#define CM_hex_5c CM_BACKSLASH
+#define CM_hex_5e CM_CIRCUMFLEX
+#define CM_hex_60 CM_BACKQUOTE
+#define CM_hex_7b CM_OPEN_BRACE
+#define CM_hex_7c CM_VERTICAL_BAR
+#define CM_hex_7d CM_CLOSE_BRACE
+#define CM_hex_7e CM_TILDE
+
 enum command_id {
 CM_NONE,
 

Modified: trunk/parsetexi/convert.c
===================================================================
--- trunk/parsetexi/convert.c   2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/convert.c   2015-01-19 17:24:34 UTC (rev 6051)
@@ -17,6 +17,9 @@
 #include <stdlib.h>
 
 #include "tree_types.h"
+#include "tree.h"
+#include "commands.h"
+#include "text.h"
 
 /* Stub for Texinfo::Convert::Text::convert */
 char *
@@ -35,17 +38,238 @@
   return "bar";
 }
 
-/* Stub for Texinfo::Convert::NodeNameNormalization::normalize_node. */
-char *
-convert_to_normalized (ELEMENT *label)
+/* IN_UC is non-zero if we are converting to upper case. */
+static void
+convert_to_normalized_internal (ELEMENT *root, TEXT *result, int in_uc)
 {
-  int i;
+#define ADD(x) text_append (result, x)
 
-  /* Return text of the first contents child that has text. */
-  for (i = 0; i < label->contents.number; i++)
+  /* Empty if ignored type, or ignored brace command, or has a misc arg or misc
+     line arg argument. */
+
+  if (root->text.end > 0)
     {
-      if (label->contents.list[i]->text.end > 0)
-        return label->contents.list[i]->text.text;
+      text_append_n (result, root->text.text, root->text.end);
     }
-  return 0;
+
+  if (root->cmd != CM_NONE) // 228 NodeNameNormalization.pm
+    {
+      if (command_flags(root) & CF_nobrace || root->cmd == CM_ASTERISK)
+        {
+          switch (root->cmd) // 353 Common.pm
+            {
+            case CM_ASTERISK:
+              ADD("\n");
+              break;
+            case CM_SPACE:
+            case CM_TAB:
+            case CM_NEWLINE:
+              ADD(" ");
+              break;
+            case CM_HYPHEN:
+            case CM_VERTICAL_BAR:
+            case CM_SLASH:
+            case CM_COLON:
+              break;
+            case CM_EXCLAMATION_MARK:
+            case CM_QUESTION_MARK:
+            case CM_FULL_STOP:
+            case CM_AT_SIGN:
+            case CM_CLOSE_BRACE:
+            case CM_OPEN_BRACE:
+            case CM_BACKSLASH:
+              ADD(command_data(root->cmd).cmdname);
+              break;
+            default:
+              /* Shouldn't get here. */
+              break;
+            }
+          return;
+        }
+      else /* unicode_character_brace_no_arg_commands line 538 of Unicode.pm */
+        {
+          int not_processed = 0;
+          /* TODO: There are more commands there than are listed in
+             "(texinfo)HTML Xref Command Expansion", like @guillemetleft. */
+
+          /* All of these are non-ASCII characters. */
+
+          /* TODO: How exactly to normalize to Unicode Normalization Form C?
+             It would be better to leave it to Perl, so we don't have to find a
+             Unicode library for C.
+             In that case we should add the UTF-8 character, instead of its
+             _XXXX representation, and normalize, followed by the conversion to
+             the escaped form.
+             Read in Perl with "$characters = decode('UTF-8', $octets,     
+             Encode::FB_CROAK);" */
+
+          switch (root->cmd)
+            {
+            case CM_bullet:
+              ADD("_2022"); break;
+            case CM_copyright:
+              ADD("_00A9"); break;
+            case CM_registeredsymbol:
+              ADD("_00AE"); break;
+            case CM_dots:
+              ADD("_2026"); break;
+            case CM_equiv:
+              ADD("_2261"); break;
+            case CM_expansion:
+              ADD("_21A6"); break;
+            case CM_arrow:
+              ADD("_2192"); break;
+            case CM_minus:
+              ADD("_2212"); break;
+            case CM_point:
+              ADD("_2605"); break;
+            case CM_print:
+              ADD("_22A3"); break;
+            case CM_result:
+              ADD("_21D2"); break;
+            case CM_aa:
+              ADD("_00E5"); break;
+            case CM_AA:
+              ADD("_00C5"); break;
+            case CM_ae:
+              ADD("_00E6"); break;
+            case CM_oe:
+              ADD("_0153"); break;
+            case CM_AE:
+              ADD("_00C6"); break;
+            case CM_OE:
+              ADD("_0152"); break;
+            case CM_o:
+              ADD("_00F8"); break;
+            case CM_O:
+              ADD("_00D8"); break;
+            case CM_ss:
+              ADD("_00DF"); break;
+            case CM_DH:
+              ADD("_00D0"); break;
+            case CM_dh:
+              ADD("_00F0"); break;
+            case CM_TH:
+              ADD("_00DE"); break;
+            case CM_th:
+              ADD("_00FE"); break;
+            case CM_l:
+              ADD("_0142"); break;
+            case CM_L:
+              ADD("_0141"); break;
+            case CM_exclamdown:
+              ADD("_00A1"); break;
+            case CM_questiondown:
+              ADD("_00BF"); break;
+            case CM_pounds:
+              ADD("_00A3"); break;
+            case CM_ordf:
+              ADD("_00AA"); break;
+            case CM_ordm:
+              ADD("_00BA"); break;
+            case CM_comma:
+              ADD("_002C"); break;
+            case CM_atchar:
+              ADD("_0040"); break;
+            case CM_lbracechar:
+              ADD("_007B"); break;
+            case CM_rbracechar:
+              ADD("_007D"); break;
+            case CM_backslashchar:
+              ADD("_005C"); break;
+            case CM_hashchar:
+              ADD("_0023"); break;
+            case CM_euro:
+              ADD("_20AC"); break;
+            case CM_geq:
+              ADD("_2265"); break;
+            case CM_leq:
+              ADD("_2264"); break;
+            case CM_textdegree:
+              ADD("_00B0"); break;
+            case CM_quotedblleft:
+              ADD("_201C"); break;
+            case CM_quotedblright:
+              ADD("_201D"); break;
+            case CM_quoteleft:
+              ADD("_2018"); break;
+            case CM_quoteright:
+              ADD("_2019"); break;
+            case CM_quotedblbase:
+              ADD("_201E"); break;
+            case CM_quotesinglbase:
+              ADD("_201A"); break;
+            case CM_guillemetleft:
+              ADD("_00AB"); break;
+            case CM_guillemetright:
+              ADD("_00BB"); break;
+            case CM_guillemotleft:
+              ADD("_00AB"); break;
+            case CM_guillemotright:
+              ADD("_00BB"); break;
+            case CM_guilsinglleft:
+              ADD("_2039"); break;
+            case CM_guilsinglright:
+              ADD("_203A"); break;
+            case CM_click:
+              ADD("_2192"); break;
+            default:
+              not_processed = 1;
+            }
+          if (!not_processed)
+            return;
+        }
+
+      /* 300 bracketed. */
+      if (root->type == ET_bracketed)
+        text_append_n (result, "{", 1);
+
+      if (command_flags(root) & CF_accent) //243
+        {
+          /* TODO: Get the Unicode value for the accented character. */
+          return;
+        }
+      // else // 262 ref commands - why would we have these in node names??
+
+      /* For example, the @t command in "@t{makeinfo} Pointer Creation". */
+      else if (root->args.number >= 1 // 287
+               && (args_child_by_index(root, 0)->type == ET_brace_command_arg
+                   || root->cmd == CM_math))
+        {
+          if (root->cmd == CM_sc)
+            in_uc = 1;
+          convert_to_normalized_internal (args_child_by_index(root, 0),
+                                          result, in_uc);
+        }
+    } // 294
+
+  if (root->contents.number > 0) // 295
+    {
+      /* Concatenate the conversion of each contents child. */
+      int i;
+
+      for (i = 0; i < root->contents.number; i++)
+        {
+          convert_to_normalized_internal (root->contents.list[i],
+                                          result, in_uc);
+        }
+    }
+
+  /* 300 bracketed. */
+  if (root->type == ET_bracketed)
+    text_append_n (result, "}", 1);
+
+#undef ADD
 }
+
+/* Replacement for Texinfo::Convert::NodeNameNormalization::normalize_node. */
+char *
+convert_to_normalized (ELEMENT *label)
+{
+  TEXT result;
+
+  text_init (&result);
+  convert_to_normalized_internal (label, &result, 0);
+
+  return result.text;
+}
[Prev in Thread]
Current Thread
[Next in Thread]
[6051] parsetexi more complete node name normalization, Gavin D. Smith <=
Prev by Date: [6050] check for mknod usability
Next by Date: [6052] parsetexi better dumping
Previous by thread: [6050] check for mknod usability
Next by thread: [6052] parsetexi better dumping
Index(es):
- Date
- Thread