diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index f188106..e579ce8 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,20 @@ +2013-02-08 Miguel Angel Arruga Vivas + + Add support for Unicode escape sequences in x-c.c based on + x-java.c Unicode support. + * x-c.c (po-charset.h): Included file. + (unistr.h): Likewise. + (P7_UNICODE): New macro. + (phase7_getc): Add new case for 'u'/'U'. + (skip_unicode_codepoint): New function. + (get_unicode_codepoint): New function. + (utf8_string_to_store): New boolean. + (phase5_get): Skip unicode codepoint in '' strings. + Store UTF-8 representation in "" strings. + (extract_parenthesized): Change 'xgettext_current_source_encoding' + to 'po_charset_utf8' when needed. From x-java.c code. + (extract_whole_file): Set 'utf8_string_to_store'. + 2013-02-06 Miguel Angel Arruga Vivas GtkBuilder support in xgettext. diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c index ea0a874..12d2f41 100644 --- a/gettext-tools/src/x-c.c +++ b/gettext-tools/src/x-c.c @@ -36,6 +36,8 @@ #include "xalloc.h" #include "xvasprintf.h" #include "hash.h" +#include "po-charset.h" +#include "unistr.h" #include "gettext.h" #define _(s) gettext(s) @@ -867,6 +869,7 @@ struct token_ty #define P7_QUOTES (1000 + '"') #define P7_QUOTE (1000 + '\'') #define P7_NEWLINE (1000 + '\n') +#define P7_UNICODE (1000 + 'u') static int phase7_getc () @@ -998,6 +1001,26 @@ phase7_getc () } phase3_ungetc (c); return n; + + /* Unicode escape. c (u/U) and the next character are pushed back. */ + case 'u': case 'U': + n = phase3_getc (); /* n stores the next character. */ + switch (n) + { + default: + j = '\\'; /* No hex digit follows: yield a plain backslash. 
*/ + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + j = P7_UNICODE; + break; + } + phase3_ungetc (n); + phase3_ungetc (c); + return j; } } @@ -1021,12 +1044,109 @@ free_token (token_ty *tp) } +/* Reads a Unicode escape from phase 3, the u followed by 4 hex digits + or the U followed by 8, and discards it. */ +static void +skip_unicode_codepoint () +{ + int num_bytes, j; + int c = phase3_getc (); + + switch (c) + { + default: + /* The caller must have seen P7_UNICODE, so c has to be u or U. */ + abort (); + case 'u': + num_bytes = 4; + break; + case 'U': + num_bytes = 8; + break; + } + + for (j = 0; j < num_bytes; ++j) + { + c = phase3_getc (); + switch (c) + { + default: + /* Too few hex digits: the escape is malformed. Do not + abort on bad input; push the offending character back, + so that it is read as ordinary text, and stop skipping + here. */ + phase3_ungetc (c); + return; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + break; + } + } +} + +/* Reads a Unicode escape (u or U, then 4 or 8 hex digits); returns its value. */ +static ucs4_t +get_unicode_codepoint () +{ + int num_bytes, j; + ucs4_t n = 0; + int c = phase3_getc (); + + switch (c) + { + default: + /* The caller must have seen P7_UNICODE, so c has to be u or U. */ + abort (); + + case 'u': + num_bytes = 4; + break; + case 'U': + num_bytes = 8; + break; + } + + for (j = 0; j < num_bytes; ++j) + { + c = phase3_getc (); + switch (c) + { + default: + /* Too few hex digits: the escape is malformed. Do not 
+ abort on bad input; push the offending character back, + so that it is read as ordinary text, and return the + value of the digits read so far. */ + phase3_ungetc (c); + return n; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = n * 16 + c - '0'; + break; + + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + n = n * 16 + 10 + c - 'A'; + break; + + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + n = n * 16 + 10 + c - 'a'; + break; + } + } + return n; +} + /* 5. Parse each resulting logical line as preprocessing tokens and white space. Preprocessing tokens and C tokens don't always match. */ static token_ty phase5_pushback[1]; static int phase5_pushback_length; +/* Set when a \u or \U escape was stored as multi-byte (non-ASCII) UTF-8. */ +static bool utf8_string_to_store; static void phase5_get (token_ty *tp) @@ -1228,6 +1348,8 @@ phase5_get (token_ty *tp) } if (c == EOF || c == P7_QUOTE) break; + if (c == P7_UNICODE) + skip_unicode_codepoint (); } tp->type = token_type_character_constant; return; @@ -1254,12 +1376,35 @@ phase5_get (token_ty *tp) break; if (c == P7_QUOTE) c = '\''; - if (bufpos >= bufmax) - { - bufmax = 2 * bufmax + 10; - buffer = xrealloc (buffer, bufmax); - } - buffer[bufpos++] = c; + if (c == P7_UNICODE) + { + unsigned char utf8buf[6]; + int count = u8_uctomb (utf8buf, get_unicode_codepoint (), 6); + + /* u8_uctomb returns a negative value for an invalid code + point (e.g. a surrogate or a value above 0x10FFFF); store + nothing rather than pass a negative length to memcpy. */ + if (count < 0) + count = 0; + if (bufpos + count >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + memcpy (buffer + bufpos, utf8buf, count); + if (count > 1) + utf8_string_to_store = true; + bufpos += count; + } + else + { + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } } if (bufpos >= bufmax) { @@ -1843,7 +1988,15 @@ extract_parenthesized (message_list_ty *mlp, arglist_parser_alloc (mlp, state ? 
next_shapes : NULL))) { + if (utf8_string_to_store) + xgettext_current_source_encoding = po_charset_utf8; arglist_parser_done (argparser, arg); + if (utf8_string_to_store) + { + utf8_string_to_store = false; + xgettext_current_source_encoding = + xgettext_global_source_encoding; + } return true; } next_context_iter = null_context_list_iterator; @@ -1852,7 +2005,15 @@ extract_parenthesized (message_list_ty *mlp, continue; case xgettext_token_type_rparen: + if (utf8_string_to_store) + xgettext_current_source_encoding = po_charset_utf8; arglist_parser_done (argparser, arg); + if (utf8_string_to_store) + { + utf8_string_to_store = false; + xgettext_current_source_encoding = + xgettext_global_source_encoding; + } return false; case xgettext_token_type_comma: @@ -1886,6 +2047,8 @@ extract_parenthesized (message_list_ty *mlp, continue; case xgettext_token_type_string_literal: + if (utf8_string_to_store) + xgettext_current_source_encoding = po_charset_utf8; if (extract_all) remember_a_message (mlp, NULL, token.string, inner_context, &token.pos, NULL, token.comment); @@ -1894,6 +2057,12 @@ extract_parenthesized (message_list_ty *mlp, inner_context, token.pos.file_name, token.pos.line_number, token.comment); + if (utf8_string_to_store) + { + utf8_string_to_store = false; + xgettext_current_source_encoding = + xgettext_global_source_encoding; + } drop_reference (token.comment); next_context_iter = null_context_list_iterator; selectorcall_context_iter = null_context_list_iterator; @@ -1907,7 +2076,15 @@ extract_parenthesized (message_list_ty *mlp, continue; case xgettext_token_type_eof: + if (utf8_string_to_store) + xgettext_current_source_encoding = po_charset_utf8; arglist_parser_done (argparser, arg); + if (utf8_string_to_store) + { + utf8_string_to_store = false; + xgettext_current_source_encoding = + xgettext_global_source_encoding; + } return true; default: @@ -1929,6 +2106,7 @@ extract_whole_file (FILE *f, real_file_name = real_filename; logical_file_name = xstrdup 
(logical_filename); line_number = 1; + utf8_string_to_store = false; newline_count = 0; last_comment_line = -1;