[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[6051] parsetexi more complete node name normalization
From: |
Gavin D. Smith |
Subject: |
[6051] parsetexi more complete node name normalization |
Date: |
Mon, 19 Jan 2015 17:24:35 +0000 |
Revision: 6051
http://svn.sv.gnu.org/viewvc/?view=rev&root=texinfo&revision=6051
Author: gavin
Date: 2015-01-19 17:24:34 +0000 (Mon, 19 Jan 2015)
Log Message:
-----------
parsetexi more complete node name normalization
Modified Paths:
--------------
trunk/parsetexi/ChangeLog
trunk/parsetexi/command_data.awk
trunk/parsetexi/command_ids.h
trunk/parsetexi/convert.c
Modified: trunk/parsetexi/ChangeLog
===================================================================
--- trunk/parsetexi/ChangeLog 2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/ChangeLog 2015-01-19 17:24:34 UTC (rev 6051)
@@ -1,3 +1,11 @@
+2015-01-19 Gavin Smith <address@hidden>
+
+ * command_data.awk: Output #define's into generated
+ command_ids.h file to give more readable alternatives for the
+ CM_hex_?? enum command_id values.
+ * convert.c (convert_to_normalized): More complete
+ implementation.
+
2015-01-18 Gavin Smith <address@hidden>
* tree_types.h (INDEX_ENTRY_REF): New type.
Modified: trunk/parsetexi/command_data.awk
===================================================================
--- trunk/parsetexi/command_data.awk 2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/command_data.awk 2015-01-19 17:24:34 UTC (rev 6051)
@@ -69,6 +69,31 @@
}
print "/* This file automatically generated by command_data.awk */"
+ print
+ print "/* Useful aliases */"
+ print "#define CM_hex_09 CM_TAB"
+ print "#define CM_hex_0a CM_NEWLINE"
+ print "#define CM_hex_20 CM_SPACE"
+ print "#define CM_hex_21 CM_EXCLAMATION_MARK"
+ print "#define CM_hex_22 CM_POUND_SIGN"
+ print "#define CM_hex_27 CM_APOSTROPHE"
+ print "#define CM_hex_2a CM_ASTERISK"
+ print "#define CM_hex_2c CM_COMMA"
+ print "#define CM_hex_2d CM_HYPHEN"
+ print "#define CM_hex_2e CM_FULL_STOP"
+ print "#define CM_hex_2f CM_SLASH"
+ print "#define CM_hex_3a CM_COLON"
+ print "#define CM_hex_3d CM_EQUALS"
+ print "#define CM_hex_3f CM_QUESTION_MARK"
+ print "#define CM_hex_40 CM_AT_SIGN"
+ print "#define CM_hex_5c CM_BACKSLASH"
+ print "#define CM_hex_5e CM_CIRCUMFLEX"
+ print "#define CM_hex_60 CM_BACKQUOTE"
+ print "#define CM_hex_7b CM_OPEN_BRACE"
+ print "#define CM_hex_7c CM_VERTICAL_BAR"
+ print "#define CM_hex_7d CM_CLOSE_BRACE"
+ print "#define CM_hex_7e CM_TILDE"
+ print
print "enum command_id {"
print "CM_NONE,"
print
Modified: trunk/parsetexi/command_ids.h
===================================================================
--- trunk/parsetexi/command_ids.h 2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/command_ids.h 2015-01-19 17:24:34 UTC (rev 6051)
@@ -1,4 +1,29 @@
/* This file automatically generated by command_data.awk */
+
+/* Useful aliases */
+#define CM_hex_09 CM_TAB
+#define CM_hex_0a CM_NEWLINE
+#define CM_hex_20 CM_SPACE
+#define CM_hex_21 CM_EXCLAMATION_MARK
+#define CM_hex_22 CM_POUND_SIGN
+#define CM_hex_27 CM_APOSTROPHE
+#define CM_hex_2a CM_ASTERISK
+#define CM_hex_2c CM_COMMA
+#define CM_hex_2d CM_HYPHEN
+#define CM_hex_2e CM_FULL_STOP
+#define CM_hex_2f CM_SLASH
+#define CM_hex_3a CM_COLON
+#define CM_hex_3d CM_EQUALS
+#define CM_hex_3f CM_QUESTION_MARK
+#define CM_hex_40 CM_AT_SIGN
+#define CM_hex_5c CM_BACKSLASH
+#define CM_hex_5e CM_CIRCUMFLEX
+#define CM_hex_60 CM_BACKQUOTE
+#define CM_hex_7b CM_OPEN_BRACE
+#define CM_hex_7c CM_VERTICAL_BAR
+#define CM_hex_7d CM_CLOSE_BRACE
+#define CM_hex_7e CM_TILDE
+
enum command_id {
CM_NONE,
Modified: trunk/parsetexi/convert.c
===================================================================
--- trunk/parsetexi/convert.c 2015-01-18 19:37:05 UTC (rev 6050)
+++ trunk/parsetexi/convert.c 2015-01-19 17:24:34 UTC (rev 6051)
@@ -17,6 +17,9 @@
#include <stdlib.h>
#include "tree_types.h"
+#include "tree.h"
+#include "commands.h"
+#include "text.h"
/* Stub for Texinfo::Convert::Text::convert */
char *
@@ -35,17 +38,238 @@
return "bar";
}
-/* Stub for Texinfo::Convert::NodeNameNormalization::normalize_node. */
-char *
-convert_to_normalized (ELEMENT *label)
+/* IN_UC is non-zero if we are converting to upper case. */
+static void
+convert_to_normalized_internal (ELEMENT *root, TEXT *result, int in_uc)
{
- int i;
+#define ADD(x) text_append (result, x)
- /* Return text of the first contents child that has text. */
- for (i = 0; i < label->contents.number; i++)
+ /* Empty if ignored type, or ignored brace command, or has a misc arg or misc
+ line arg argument. */
+
+ if (root->text.end > 0)
{
- if (label->contents.list[i]->text.end > 0)
- return label->contents.list[i]->text.text;
+ text_append_n (result, root->text.text, root->text.end);
}
- return 0;
+
+ if (root->cmd != CM_NONE) // 228 NodeNameNormalization.pm
+ {
+ if (command_flags(root) & CF_nobrace || root->cmd == CM_ASTERISK)
+ {
+ switch (root->cmd) // 353 Common.pm
+ {
+ case CM_ASTERISK:
+ ADD("\n");
+ break;
+ case CM_SPACE:
+ case CM_TAB:
+ case CM_NEWLINE:
+ ADD(" ");
+ break;
+ case CM_HYPHEN:
+ case CM_VERTICAL_BAR:
+ case CM_SLASH:
+ case CM_COLON:
+ break;
+ case CM_EXCLAMATION_MARK:
+ case CM_QUESTION_MARK:
+ case CM_FULL_STOP:
+ case CM_AT_SIGN:
+ case CM_CLOSE_BRACE:
+ case CM_OPEN_BRACE:
+ case CM_BACKSLASH:
+ ADD(command_data(root->cmd).cmdname);
+ break;
+ default:
+ /* Shouldn't get here. */
+ break;
+ }
+ return;
+ }
+ else /* unicode_character_brace_no_arg_commands line 538 of Unicode.pm */
+ {
+ int not_processed = 0;
+ /* TODO: There are more commands there than are listed in
+ "(texinfo)HTML Xref Command Expansion", like @guillemetleft. */
+
+ /* All of these are non-ASCII characters. */
+
+ /* TODO: How exactly to normalize to Unicode Normalization Form C?
+ It would be better to leave it to Perl, so we don't have to find a
+ Unicode library for C.
+ In that case we should add the UTF-8 character, instead of its
+ _XXXX representation, and normalize, followed by the conversion to
+ the escaped form.
+ Read in Perl with "$characters = decode('UTF-8', $octets,
+ Encode::FB_CROAK);" */
+
+ switch (root->cmd)
+ {
+ case CM_bullet:
+ ADD("_2022"); break;
+ case CM_copyright:
+ ADD("_00A9"); break;
+ case CM_registeredsymbol:
+ ADD("_00AE"); break;
+ case CM_dots:
+ ADD("_2026"); break;
+ case CM_equiv:
+ ADD("_2261"); break;
+ case CM_expansion:
+ ADD("_21A6"); break;
+ case CM_arrow:
+ ADD("_2192"); break;
+ case CM_minus:
+ ADD("_2212"); break;
+ case CM_point:
+ ADD("_2605"); break;
+ case CM_print:
+ ADD("_22A3"); break;
+ case CM_result:
+ ADD("_21D2"); break;
+ case CM_aa:
+ ADD("_00E5"); break;
+ case CM_AA:
+ ADD("_00C5"); break;
+ case CM_ae:
+ ADD("_00E6"); break;
+ case CM_oe:
+ ADD("_0153"); break;
+ case CM_AE:
+ ADD("_00C6"); break;
+ case CM_OE:
+ ADD("_0152"); break;
+ case CM_o:
+ ADD("_00F8"); break;
+ case CM_O:
+ ADD("_00D8"); break;
+ case CM_ss:
+ ADD("_00DF"); break;
+ case CM_DH:
+ ADD("_00D0"); break;
+ case CM_dh:
+ ADD("_00F0"); break;
+ case CM_TH:
+ ADD("_00DE"); break;
+ case CM_th:
+ ADD("_00FE"); break;
+ case CM_l:
+ ADD("_0142"); break;
+ case CM_L:
+ ADD("_0141"); break;
+ case CM_exclamdown:
+ ADD("_00A1"); break;
+ case CM_questiondown:
+ ADD("_00BF"); break;
+ case CM_pounds:
+ ADD("_00A3"); break;
+ case CM_ordf:
+ ADD("_00AA"); break;
+ case CM_ordm:
+ ADD("_00BA"); break;
+ case CM_comma:
+ ADD("_002C"); break;
+ case CM_atchar:
+ ADD("_0040"); break;
+ case CM_lbracechar:
+ ADD("_007B"); break;
+ case CM_rbracechar:
+ ADD("_007D"); break;
+ case CM_backslashchar:
+ ADD("_005C"); break;
+ case CM_hashchar:
+ ADD("_0023"); break;
+ case CM_euro:
+ ADD("_20AC"); break;
+ case CM_geq:
+ ADD("_2265"); break;
+ case CM_leq:
+ ADD("_2264"); break;
+ case CM_textdegree:
+ ADD("_00B0"); break;
+ case CM_quotedblleft:
+ ADD("_201C"); break;
+ case CM_quotedblright:
+ ADD("_201D"); break;
+ case CM_quoteleft:
+ ADD("_2018"); break;
+ case CM_quoteright:
+ ADD("_2019"); break;
+ case CM_quotedblbase:
+ ADD("_201E"); break;
+ case CM_quotesinglbase:
+ ADD("_201A"); break;
+ case CM_guillemetleft:
+ ADD("_00AB"); break;
+ case CM_guillemetright:
+ ADD("_00BB"); break;
+ case CM_guillemotleft:
+ ADD("_00AB"); break;
+ case CM_guillemotright:
+ ADD("_00BB"); break;
+ case CM_guilsinglleft:
+ ADD("_2039"); break;
+ case CM_guilsinglright:
+ ADD("_203A"); break;
+ case CM_click:
+ ADD("_2192"); break;
+ default:
+ not_processed = 1;
+ }
+ if (!not_processed)
+ return;
+ }
+
+ /* 300 bracketed. */
+ if (root->type == ET_bracketed)
+ text_append_n (result, "{", 1);
+
+ if (command_flags(root) & CF_accent) //243
+ {
+ /* TODO: Get the Unicode value for the accented character. */
+ return;
+ }
+ // else // 262 ref commands - why would we have these in node names??
+
+ /* For example, the @t command in "@t{makeinfo} Pointer Creation". */
+ else if (root->args.number >= 1 // 287
+ && (args_child_by_index(root, 0)->type == ET_brace_command_arg
+ || root->cmd == CM_math))
+ {
+ if (root->cmd == CM_sc)
+ in_uc = 1;
+ convert_to_normalized_internal (args_child_by_index(root, 0),
+ result, in_uc);
+ }
+ } // 294
+
+ if (root->contents.number > 0) // 295
+ {
+ /* Concatenate the conversion of each contents child. */
+ int i;
+
+ for (i = 0; i < root->contents.number; i++)
+ {
+ convert_to_normalized_internal (root->contents.list[i],
+ result, in_uc);
+ }
+ }
+
+ /* 300 bracketed. */
+ if (root->type == ET_bracketed)
+ text_append_n (result, "}", 1);
+
+#undef ADD
}
+
+/* Replacement for Texinfo::Convert::NodeNameNormalization::normalize_node. */
+char *
+convert_to_normalized (ELEMENT *label)
+{
+ TEXT result;
+
+ text_init (&result);
+ convert_to_normalized_internal (label, &result, 0);
+
+ return result.text;
+}
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [6051] parsetexi more complete node name normalization,
Gavin D. Smith <=