[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
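branch master updated: * tp/Texinfo/Convert/Unicode.pm (%unicode_to_eight_bit) (_eight_bit_and_unicode_point, encoded_accents) (unicode_point_decoded_in_encoding): use directly the input encodings names. Separate mappings of koi8-u and koi8-r.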
From: |
Patrice Dumas |
Subject: |
branch master updated: * tp/Texinfo/Convert/Unicode.pm (%unicode_to_eight_bit) (_eight_bit_and_unicode_point, encoded_accents) (unicode_point_decoded_in_encoding): use directly the input encodings names. Separate mappings of koi8-u and koi8-r. |
Date: |
Fri, 18 Feb 2022 16:26:10 -0500 |
This is an automated email from the git hooks/post-receive script.
pertusus pushed a commit to branch master
in repository texinfo.
The following commit(s) were added to refs/heads/master by this push:
new d57ee64a19 * tp/Texinfo/Convert/Unicode.pm (%unicode_to_eight_bit)
(_eight_bit_and_unicode_point, encoded_accents)
(unicode_point_decoded_in_encoding): use directly the input encodings names.
Separate mappings of koi8-u and koi8-r.
d57ee64a19 is described below
commit d57ee64a194e3043dd35e16616891f653c7f2785
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Fri Feb 18 22:26:00 2022 +0100
* tp/Texinfo/Convert/Unicode.pm (%unicode_to_eight_bit)
(_eight_bit_and_unicode_point, encoded_accents)
(unicode_point_decoded_in_encoding): use directly the input encodings
names. Separate mappings of koi8-u and koi8-r.
* tp/Texinfo/Encoding.pm (encoding_alias): remove
%eight_bit_encoding_aliases. Use my, not our for variables not used
elsewhere.
* util/txixml2texi.pl: use directly Encode::resolve_alias() instead of
Texinfo::Encoding::encoding_alias().
* tp/Texinfo/XS/parsetexi/end_line.c (end_line_misc_line): add latin-1
mapping. Remove impossible or incorrect mappings from encoding_map.
* doc/tp_api/Makefile.am: do not document Texinfo::Encoding.
---
ChangeLog | 19 +++++++++++++++++
doc/tp_api/Makefile.am | 1 -
tp/Texinfo/Convert/Unicode.pm | 42 ++++++++++++++++++++++----------------
tp/Texinfo/Encoding.pm | 32 ++++++++---------------------
tp/Texinfo/XS/parsetexi/end_line.c | 15 +++++++++-----
util/txixml2texi.pl | 5 +++--
6 files changed, 65 insertions(+), 49 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index ffd684842f..b10e737064 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2022-02-18 Patrice Dumas <pertusus@free.fr>
+
+ * tp/Texinfo/Convert/Unicode.pm (%unicode_to_eight_bit)
+ (_eight_bit_and_unicode_point, encoded_accents)
+ (unicode_point_decoded_in_encoding): use directly the input encodings
+ names. Separate mappings of koi8-u and koi8-r.
+
+ * tp/Texinfo/Encoding.pm (encoding_alias): remove
+ %eight_bit_encoding_aliases. Use my, not our for variables not used
+ elsewhere.
+
+ * util/txixml2texi.pl: use directly Encode::resolve_alias() instead of
+ Texinfo::Encoding::encoding_alias().
+
+ * tp/Texinfo/XS/parsetexi/end_line.c (end_line_misc_line): add latin-1
+ mapping. Remove impossible or incorrect mappings from encoding_map.
+
+ * doc/tp_api/Makefile.am: do not document Texinfo::Encoding.
+
2022-02-18 Patrice Dumas <pertusus@free.fr>
* Pod-Simple-Texinfo/pod2texi.pl: do not ignore an input file that
diff --git a/doc/tp_api/Makefile.am b/doc/tp_api/Makefile.am
index ca4fd0e2fd..46e0e87795 100644
--- a/doc/tp_api/Makefile.am
+++ b/doc/tp_api/Makefile.am
@@ -56,7 +56,6 @@ BUILT_SOURCES = tp_api.texi
# The order matters for the result.
tp_api_dependencies = \
$(top_srcdir)/tp/Texinfo/Common.pm \
- $(top_srcdir)/tp/Texinfo/Encoding.pm \
$(top_srcdir)/tp/Texinfo/ParserNonXS.pm \
$(top_srcdir)/tp/Texinfo/Structuring.pm \
$(top_srcdir)/tp/Texinfo/Report.pm \
diff --git a/tp/Texinfo/Convert/Unicode.pm b/tp/Texinfo/Convert/Unicode.pm
index d43b3c7671..7b1b7b3d91 100644
--- a/tp/Texinfo/Convert/Unicode.pm
+++ b/tp/Texinfo/Convert/Unicode.pm
@@ -652,7 +652,7 @@ foreach my $command (keys(%unicode_accented_letters)) {
}
my %unicode_to_eight_bit = (
- 'iso8859_1' => {
+ 'iso-8859-1' => {
'00A0' => 'A0',
'00A1' => 'A1',
'00A2' => 'A2',
@@ -751,7 +751,7 @@ my %unicode_to_eight_bit = (
'00FE' => 'FE',
'00FF' => 'FF',
},
- 'iso8859_15' => {
+ 'iso-8859-15' => {
'00A0' => 'A0',
'00A1' => 'A1',
'00A2' => 'A2',
@@ -849,7 +849,7 @@ my %unicode_to_eight_bit = (
'00FE' => 'FE',
'00FF' => 'FF',
},
- 'iso8859_2' => {
+ 'iso-8859-2' => {
'00A0' => 'A0',
'0104' => 'A1',
'02D8' => 'A2',
@@ -946,15 +946,9 @@ my %unicode_to_eight_bit = (
'0163' => 'FE',
'02D9' => 'FF',
},
- 'koi8' => {
+ 'koi8-r' => {
'0415' => 'A3',
- '0454' => 'A4',
- '0456' => 'A6',
- '0457' => 'A7',
'04D7' => 'B3',
- '0404' => 'B4',
- '0406' => 'B6',
- '0407' => 'B7',
'042E' => 'C0',
'0430' => 'C1',
'0431' => 'C2',
@@ -1020,8 +1014,23 @@ my %unicode_to_eight_bit = (
'0427' => 'FE',
'042A' => 'FF',
},
+ 'koi8-u' => {
+ '0454' => 'A4',
+ '0404' => 'B4',
+ '0456' => 'A6',
+ '0406' => 'B6',
+ '0457' => 'A7',
+ '0407' => 'B7',
+ '0491' => 'AD',
+ '0490' => 'BD',
+ }
);
+foreach my $unicode_point (keys(%{$unicode_to_eight_bit{'koi8-r'}})) {
+ $unicode_to_eight_bit{'koi8-u'}->{$unicode_point}
+ = $unicode_to_eight_bit{'koi8-r'}->{$unicode_point};
+}
+
# currently unused
my %makeinfo_transliterate_map = (
'0416' => 'ZH',
@@ -1277,8 +1286,6 @@ sub _eight_bit_and_unicode_point($$)
my $char = shift;
my $encoding = shift;
- my $encoding_map_name
- = $Texinfo::Encoding::eight_bit_encoding_aliases{$encoding};
my ($eight_bit, $codepoint);
if (ord($char) <= 128) {
# 7bit ascii characters, the same in every 8bit encodings
@@ -1286,9 +1293,8 @@ sub _eight_bit_and_unicode_point($$)
$codepoint = uc(sprintf("%04x",ord($char)));
} elsif (ord($char) <= hex(0xFFFF)) {
$codepoint = uc(sprintf("%04x",ord($char)));
- if (exists($unicode_to_eight_bit{$encoding_map_name}->{$codepoint})) {
- $eight_bit
- = $unicode_to_eight_bit{$encoding_map_name}->{$codepoint};
+ if (exists($unicode_to_eight_bit{$encoding}->{$codepoint})) {
+ $eight_bit = $unicode_to_eight_bit{$encoding}->{$codepoint};
}
}
return ($eight_bit, $codepoint);
@@ -1458,7 +1464,7 @@ sub encoded_accents($$$$$;$)
if ($encoding eq 'utf-8') {
return _format_unicode_accents_stack($converter, $text, $stack,
$format_accent, $set_case);
- } elsif ($Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}) {
+ } elsif ($unicode_to_eight_bit{$encoding}) {
return _format_eight_bit_accents_stack($converter, $text, $stack,
$encoding,
$format_accent, $set_case);
}
@@ -1474,8 +1480,8 @@ sub unicode_point_decoded_in_encoding($$) {
return 1 if ($encoding
and ($encoding eq 'utf-8'
- or ($Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}
- and $unicode_to_eight_bit{$Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}}->{$unicode_point})));
+ or ($unicode_to_eight_bit{$encoding}
+ and $unicode_to_eight_bit{$encoding}->{$unicode_point})));
return 0;
}
diff --git a/tp/Texinfo/Encoding.pm b/tp/Texinfo/Encoding.pm
index 67e33d410f..f70e4da229 100644
--- a/tp/Texinfo/Encoding.pm
+++ b/tp/Texinfo/Encoding.pm
@@ -33,7 +33,7 @@ use vars qw(@ISA @EXPORT_OK);
# charset related definitions.
-our %perl_charset_to_html = (
+my %perl_charset_to_html = (
'utf8' => 'utf-8',
'utf-8-strict' => 'utf-8',
'ascii' => 'us-ascii',
@@ -41,47 +41,32 @@ our %perl_charset_to_html = (
);
# encoding name normalization to html-compatible encoding names
-our %encoding_aliases = (
- 'latin1' => 'iso-8859-1',
-);
+my %encoding_aliases;
foreach my $perl_charset (keys(%perl_charset_to_html)) {
$encoding_aliases{$perl_charset} = $perl_charset_to_html{$perl_charset};
$encoding_aliases{$perl_charset_to_html{$perl_charset}}
= $perl_charset_to_html{$perl_charset};
}
-our %eight_bit_encoding_aliases = (
- "iso-8859-1", 'iso8859_1',
- "iso-8859-2", 'iso8859_2',
- "iso-8859-15", 'iso8859_15',
- "koi8-r", 'koi8',
- "koi8-u", 'koi8',
-);
-
-foreach my $encoding (keys(%eight_bit_encoding_aliases)) {
- $encoding_aliases{$encoding} = $encoding;
- $encoding_aliases{$eight_bit_encoding_aliases{$encoding}} = $encoding;
-}
-our %canonical_texinfo_encodings;
+my %canonical_texinfo_encodings;
# These are the encodings from the texinfo manual
-foreach my $canonical_encoding('us-ascii', 'utf-8', 'iso-8859-1',
- 'iso-8859-15','iso-8859-2','koi8-r', 'koi8-u') {
+foreach my $canonical_encoding ('us-ascii', 'utf-8', 'iso-8859-1',
+ 'iso-8859-15', 'iso-8859-2', 'koi8-r', 'koi8-u') {
$canonical_texinfo_encodings{$canonical_encoding} = 1;
+ $encoding_aliases{$canonical_encoding} = $canonical_encoding;
}
sub encoding_alias($)
{
my $encoding = shift;
- my $canonical_texinfo_encoding;
- $canonical_texinfo_encoding
- = $encoding if ($canonical_texinfo_encodings{lc($encoding)});
my $perl_encoding = Encode::resolve_alias($encoding);
my $canonical_output_encoding;
if ($perl_encoding) {
$canonical_output_encoding = $encoding_aliases{$perl_encoding};
}
- foreach my $possible_encoding($encoding, $canonical_output_encoding,
+ my $canonical_texinfo_encoding;
+ foreach my $possible_encoding ($encoding, $canonical_output_encoding,
$perl_encoding) {
if (defined($possible_encoding)
and $canonical_texinfo_encodings{lc($possible_encoding)}) {
@@ -120,6 +105,7 @@ Texinfo::Encoding takes care of encoding definition and aliasing.
=over
=item ($canonical_texinfo_encoding, $perl_encoding, $canonical_output_encoding) = encoding_alias($encoding)
+X<C<encoding_alias>>
Taking an encoding name as argument, the function returns the
corresponding canonical Texinfo encoding I<$canonical_texinfo_encoding>
diff --git a/tp/Texinfo/XS/parsetexi/end_line.c b/tp/Texinfo/XS/parsetexi/end_line.c
index b67c147247..625d51dc99 100644
--- a/tp/Texinfo/XS/parsetexi/end_line.c
+++ b/tp/Texinfo/XS/parsetexi/end_line.c
@@ -1503,6 +1503,8 @@ end_line_misc_line (ELEMENT *current)
static char *known_encodings[] = {
"shift_jis",
"latin1",
+ "latin-1",
+ "utf8",
0
};
for (i = 0; (known_encodings[i]); i++)
@@ -1521,11 +1523,15 @@ end_line_misc_line (ELEMENT *current)
struct encoding_map {
char *from; char *to;
};
+ /* The map mimics Encode::resolve_alias() result. Even when
+ the alias is not good, such as 'utf-8-strict' for 'utf-8'
+ use the same mapping for consistency with the perl Parser */
static struct encoding_map map[] = {
"utf-8", "utf-8-strict",
"us-ascii", "ascii",
"shift_jis", "shiftjis",
- "latin1", "iso-8859-1"
+ "latin1", "iso-8859-1",
+ "latin-1", "iso-8859-1"
};
for (i = 0; i < sizeof map / sizeof *map; i++)
{
@@ -1544,7 +1550,7 @@ end_line_misc_line (ELEMENT *current)
{
command_warn (current, "unrecognized encoding name `%s'",
text);
- /* Texinfo::Encoding calls Encode::Alias, so knows
+ /* Texinfo::Encoding calls Encode::resolve_alias, so knows
about more encodings than what we know about here.
TODO: Check when perl_encoding could be defined when
texinfo_encoding isn't.
@@ -1565,12 +1571,11 @@ end_line_misc_line (ELEMENT *current)
"utf-8-strict","utf-8",
"ascii", "us-ascii",
"shiftjis", "shift_jis",
- "latin-1", "iso-8859-1",
"iso-8859-1", "iso-8859-1",
"iso-8859-2", "iso-8859-2",
"iso-8859-15", "iso-8859-15",
- "koi8-r", "koi8",
- "koi8-u", "koi8",
+ "koi8-r", "koi8-r",
+ "koi8-u", "koi8-u",
};
input_encoding = perl_encoding;
for (i = 0; i < sizeof map / sizeof *map; i++)
diff --git a/util/txixml2texi.pl b/util/txixml2texi.pl
index ab585c4823..074a247e78 100755
--- a/util/txixml2texi.pl
+++ b/util/txixml2texi.pl
@@ -24,6 +24,7 @@ use Getopt::Long qw(GetOptions);
# for dirname.
use File::Basename;
use File::Spec;
+use Encode;
Getopt::Long::Configure("gnu_getopt");
@@ -294,8 +295,8 @@ while ($reader->read) {
}
if ($name eq 'documentencoding' and $reader->hasAttributes()
and defined($reader->getAttribute('encoding'))) {
- my ($texinfo_encoding, $perl_encoding, $output_encoding)
- = Texinfo::Encoding::encoding_alias($reader->getAttribute('encoding'));
+ my $perl_encoding
+ = Encode::resolve_alias($reader->getAttribute('encoding'));
if (defined($perl_encoding)) {
if ($debug) {
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- branch master updated: * tp/Texinfo/Convert/Unicode.pm (%unicode_to_eight_bit) (_eight_bit_and_unicode_point, encoded_accents) (unicode_point_decoded_in_encoding): use directly the input encodings names. Separate mappings of koi8-u and koi8-r.,
Patrice Dumas <=