[bug-gettext] reversible recode-sr-latin?

bug-gettext

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[bug-gettext] reversible recode-sr-latin?

From:	Ineiev
Subject:	[bug-gettext] reversible recode-sr-latin?
Date:	Fri, 19 Apr 2013 18:46:56 +0000

Hello,

Recently I considered an implementation of a reverse recode-sr-latin
conversion.

I saw two problems with that: a minor one and a major one.

The minor one is that combinations of Cyrillic letters produce
ambiguous Latin sequences, e.g. "Lj" may be a result of "Љ" or
"Лј". This problem is minor because there is one usual case
(like "Љ"), and the other combination ("Лј") is uncommon.

The major one is that Cyrillic text may contain Latin letters
(for example, XML tags), and the reverse conversion would map them
to Cyrillic letters, while what we want is to restore the original
text.

So the idea was to introduce a "reversible" mode of recode-sr-latin;
in that mode Latin letters from "Cyrillic" text and ambiguous
combinations of Cyrillic letters are automatically marked
in the output; the reverse conversion then can accept the "reversible"
Latin text and exactly reproduce the original Cyrillic text.

For instance, the parts of text that correspond to Latin letters
in the Cyrillic texts could be marked with '{' and '}'; to distinguish
between those markers and the characters from the text itself,
an escape could be added to the output ("{" -> "\{").

The "uncommon" combinations could be marked with an empty "Latin"
span ("Lj" -> "Љ"; "L{}j" -> "Лј").

A draft implementation is attached for further reference; it adds
three options: "-d" for the reverse conversion, "-r" to output
in the "reversible" format, "-R" to treat the input as written
in the "reversible" format.

Any comments?

diff --git a/gettext-tools/src/filter-sr-latin.c b/gettext-tools/src/filter-sr-latin.c
index d6dbd95..029c11d 100644
--- a/gettext-tools/src/filter-sr-latin.c
+++ b/gettext-tools/src/filter-sr-latin.c
@@ -1,5 +1,5 @@
-/* Recode Serbian text from Cyrillic to Latin script.
-   Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
+/* Recode Serbian text from Cyrillic to Latin script and back.
+   Copyright (C) 2006-2007, 2009, 2013 Free Software Foundation, Inc.
    Written by Danilo Šegan <address@hidden>, 2006,
    and Bruno Haible <address@hidden>, 2006.
 
@@ -27,6 +27,11 @@
 
 #include "xalloc.h"
 
+#define TABLE_BASE 0x400
+
+#define OPEN_VERBATIM '{'
+#define CLOSE_VERBATIM '}'
+#define ESCAPE_CHAR '\\'
 
 /* Table for Serbian Cyrillic to Latin transcription.
    The table is indexed by the Unicode code point, in the range 0x0400..0x04ef.
@@ -275,6 +280,119 @@ static const char table[240][3 + 1] =
   /* U+04EF */ "\xC5\xAB" /* "ū" */
 };
 
+/* The reverse table is a hash where the index is the first byte
+   of the Latin-encoded sequence. */
+struct reverse_table_entry
+{
+  int allocated;   /* Number of mappings in the hash entry. */
+  char **bytes;    /* Array of mapped '\0'-terminated Latin sequences. */
+  char *cyrillics; /* Two-byte Cyrillic replacements, one by one. */
+};
+
+static struct reverse_table_entry reverse_table[256];
+static int reverse_table_ready;
+
+/* Get the index of the reverse table entry where a replacement for lat
+   should be looked for. */
+static int
+get_reverse_table_idx (const char *lat)
+{
+  return *lat % (sizeof (reverse_table) / sizeof (*reverse_table));
+}
+
+/* Insert two-byte Cyrillic replacement cyr for the Latin-encoded string
+   lat into the reverse table. */
+static void
+insert_table_string (const char *lat, const char *cyr)
+{
+  int i, j, al, idx;
+  char *bytes;
+
+  if (*lat == '\0')
+    return;
+  idx = get_reverse_table_idx (lat);
+  al = reverse_table[idx].allocated;
+  /* Check whether the sequence was already assigned a map. */
+  for(i = 0; i < al; i++)
+    {
+      bytes = reverse_table[idx].bytes[i];
+      for (j = 0; bytes[j] != '\0' && bytes[j] == lat[j]; j++);
+      if (bytes[j] == lat[j]) /* Don't override existing mappings. */
+        return;
+    }
+  /* Actually insert. */
+  for (j = 0; lat[j] != '\0'; j++);
+  j++;
+  al++;
+  reverse_table[idx].bytes =
+    (char **) xrealloc (reverse_table[idx].bytes,
+                        al * sizeof(*(reverse_table[idx].bytes)));
+  reverse_table[idx].cyrillics =
+    (char *) xrealloc (reverse_table[idx].cyrillics,
+                       al * 2 * sizeof(*(reverse_table[idx].cyrillics)));
+  reverse_table[idx].bytes[al - 1] = XNMALLOC (j, char);
+  for (i = 0; i < j; i++)
+    reverse_table[idx].bytes[al - 1][i] = lat[i];
+  reverse_table[idx].cyrillics[(al - 1) * 2] = cyr[0];
+  reverse_table[idx].cyrillics[(al - 1) * 2 + 1] = cyr[1];
+  reverse_table[idx].allocated = al;
+}
+
+/* Find a replacement for the longest sequence starting from lat, but
+   no longer than len.
+
+   Return the length of the sequence to be replaced (0 if no replacement
+   is found). */
+static size_t
+get_reverse_table_item (const char *lat, size_t len, const char **cyr)
+{
+  int i, j, i_max = 0, j_max = 0;
+  const struct reverse_table_entry *entry;
+
+  entry = reverse_table + get_reverse_table_idx (lat);
+  for (i = 0; i < entry->allocated; i++)
+    {
+      for (j = 0; entry->bytes[i][j] != '\0' && j < len; j++)
+        if (entry->bytes[i][j] != lat[j])
+          break;
+      if (entry->bytes[i][j] == '\0' && j > j_max)
+        {
+          j_max = j;
+          i_max = i;
+        }
+    }
+  *cyr = entry->cyrillics + i_max * 2;
+  return j_max;
+}
+
+static void
+fill_reverse_table(void)
+{
+  int i;
+  char cyr[2];
+
+  /* Revert mappings from the "forward" table. */
+  for (i = 0; i < sizeof (table) / sizeof (*table); i++)
+    {
+      cyr[0] = ((i + TABLE_BASE) >> 6) | 0xC0;
+      cyr[1] = ((i + TABLE_BASE) & 0x3f) | 0x80;
+      insert_table_string(table[i], cyr);
+    }
+
+  /* Insert additional mappings handled in serbian_to_latin()
+     as special cases. */
+  insert_table_string("LJ", "\xD0\x89"); /* LJ -> Љ */
+  insert_table_string("NJ", "\xD0\x8a"); /* NJ -> Њ */
+  insert_table_string("D\xC5\xBD", "\xD0\x8F"); /* DŽ -> Џ */
+
+  /* Not sure whether the next mappings are needed, but anyway. */
+  insert_table_string("lJ", "\xD1\x99"); /* lJ -> љ */
+  insert_table_string("nJ", "\xD1\x9a"); /* nJ -> њ */
+  insert_table_string("d\xC5\xBD", "\xD1\x9F"); /* dŽ -> џ */
+
+  reverse_table_ready = !0;
+}
+
 /* Quick test for an uppercase character in the range U+0041..U+005A.
    The argument must be a byte in the range 0..UCHAR_MAX.  */
 #define IS_UPPERCASE_LATIN(byte) \
@@ -287,9 +405,159 @@ static const char table[240][3 + 1] =
   (((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30) \
    || ((byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae)))
 
-void
-serbian_to_latin (const char *input, size_t input_len,
-                  char **output_p, size_t *output_len_p)
+/* Quick test for whether the byte is an ASCII letter or
+   a start of a non-ASCII UTF-8 sequence.
+   Used to decide about entering and exiting "verbatim" mode. */
+#define ASCII_LETTER_OR_NON_ASCII(byte) \
+  (0xc0 < (byte) \
+   || ((byte) >= 'A' && (byte) <= 'Z') || ((byte) >= 'a' && (byte) <= 'z'))
+
+/* Finalize output: exit verbatim mode, reallocate, check for array limits. */
+static void
+trim_output (char *op, char *output, size_t allocated,
+             char **output_p, size_t *output_len_p, int *out_verb)
+{
+  size_t output_len;
+
+  if (out_verb != NULL && *out_verb && op == output)
+    {
+      /* Exit "verbatim" mode so that output strings could be concatenated. */
+      *op++ = CLOSE_VERBATIM;
+      *out_verb = 0;
+    }
+
+  output_len = op - output;
+
+  /* Verify that the allocated size was not exceeded.  */
+  if (output_len > allocated)
+    abort ();
+  /* Shrink the result.  */
+  if (output_len < allocated)
+    output = (char *) xrealloc (output, output_len);
+
+  *output_p = output;
+  *output_len_p = output_len;
+}
+
+/* Process special characters for reversible modes;
+   Returns 1 when the current character is handled by process_escapes()
+   (and the caller should proceed with the next character),
+   and 0 when the current character should be processed subsequently. */
+static int
+process_escapes(const char **ip, const char *input_end,
+                char **op, int *in_verb, int *out_verb)
+{
+  if (**ip == ESCAPE_CHAR)
+    {
+      if ((*ip) + 1 == input_end)
+        {
+          *(*op)++ = *(*ip)++; /* Final character: pass as is. */
+          return 1;
+        }
+      switch ((*ip)[1])
+        {
+          case ESCAPE_CHAR: case OPEN_VERBATIM: case CLOSE_VERBATIM:
+            if (out_verb != NULL)
+              {
+                *(*op)++ = ESCAPE_CHAR; /* Escape the escape. */
+                *(*op)++ = ESCAPE_CHAR; /* Pass the escape. */
+                *(*op)++ = ESCAPE_CHAR; /* Escape the special character which
+                                           will pass further in default. */
+              }
+            (*ip)++; /* Skip the escaping character. */
+            /* Fall through... */
+          default:
+            /* Pass the character, which results in:
+
+               when it is an escape followed by non-special character,
+               the escape is just passed;
+
+               when it is an escape followed by a special character,
+               the special character is passed (escaped with the escape when
+               outputting in reversible mode). */
+            *(*op)++ = *(*ip)++;
+        }
+      return 1;
+    }
+  if (**ip == OPEN_VERBATIM)
+    {
+      if (in_verb != NULL && !*in_verb)
+        {
+          /* Switch to "verbatim" mode, suppress the special character. */
+          *in_verb = !0;
+          (*ip)++;
+          return 1;
+        }
+      if (out_verb == NULL)
+        return 0;
+      /* Pass special character escaped. */
+      *(*op)++ = ESCAPE_CHAR;
+      *(*op)++ = OPEN_VERBATIM;
+      (*ip)++;
+      return 1;
+    }
+  if (**ip == CLOSE_VERBATIM)
+    {
+      if (in_verb != NULL && *in_verb)
+        {
+          /* Switch from "verbatim" mode, suppress the special character. */
+          *in_verb = 0;
+          (*ip)++;
+          return 1;
+        }
+      if (out_verb == NULL)
+        return 0;
+      /* Pass special character escaped. */
+      *(*op)++ = ESCAPE_CHAR;
+      *(*op)++ = CLOSE_VERBATIM;
+      (*ip)++;
+      return 1;
+    }
+  if (in_verb != NULL && *in_verb)
+    {
+      /* Input is verbatim: pass character as is. */
+      *(*op)++ = *(*ip)++;
+      return 1;
+    }
+  return 0;
+}
+
+/* Check whether the sequence from input begins with a Cyrillic character.
+   If not, return 0; if so, return its code point. */
+static unsigned int
+test_for_cyrillic(const char *input, const char *input_end)
+{
+  unsigned char byte = (unsigned char) *input, second_byte;
+  unsigned int uc;
+
+  if (input + 1 >= input_end)
+    return 0;
+
+/* Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the
+   beginning of a character; the second and further bytes of a character are
+   all in the range \x80..\xBF.  */
+
+  if (byte < 0xd0 || byte > 0xd3)
+    return 0;
+
+  second_byte = (unsigned char) input[1];
+
+  /* Verify the second byte is valid.  */
+  if (second_byte < 0x80 || second_byte >= 0xc0)
+    return 0;
+
+  uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
+
+  if (uc < TABLE_BASE || uc >= TABLE_BASE + sizeof(table) / sizeof(*table))
+    return 0;
+
+  return uc;
+}
+
+static void
+to_latin (const char *input, size_t input_len,
+          char **output_p, size_t *output_len_p,
+          int *in_verb, int *out_verb)
 {
   /* Loop through the input string, producing a replacement for each character.
      Only characters in the range U+0400..U+04EF (\xD0\x80..\xD3\xAF) need to
@@ -297,14 +565,15 @@ serbian_to_latin (const char *input, size_t input_len,
      in the table.  Other characters are copied without modification.
      The characters U+0409, U+040A, U+040F are transliterated to uppercase or
     mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending
-     on the case of the surrounding characters.
-     Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the
-     beginning of a character; the second and further bytes of a character are
-     all in the range \x80..\xBF.  */
+     on the case of the surrounding characters.  */
 
   /* Since sequences of 2 bytes are mapped to sequences of at most 3 bytes,
-     the size of the output will be at most 1.5 * input_len.  */
-  size_t allocated = input_len + (input_len >> 1);
+     the size of the output will be at most 1.5 * input_len.
+     When in reversible mode, switching to "verbatim" mode and back
+     may map 1-byte character to 3 bytes (with minimum 2 bytes to close
+     "verbatim" mode). */
+  size_t allocated = out_verb != NULL? 3 * input_len + 2:
+                                       input_len + (input_len >> 1);
   char *output = XNMALLOC (allocated, char);
 
   const char *input_end = input + input_len;
@@ -313,88 +582,177 @@ serbian_to_latin (const char *input, size_t input_len,
 
   for (ip = input, op = output; ip < input_end; )
     {
-      unsigned char byte = (unsigned char) *ip;
+      unsigned int uc;
+
+      if ((in_verb != NULL || out_verb != NULL)
+          && process_escapes(&ip, input_end, &op, in_verb, out_verb))
+        continue;
 
-      /* Test for the first byte of a Cyrillic character.  */
-      if ((byte >= 0xd0 && byte <= 0xd3) && (ip + 1 < input_end))
+      uc = test_for_cyrillic (ip, input_end);
+
+      if (uc != 0)
         {
-          unsigned char second_byte = (unsigned char) ip[1];
+          const char *repl;
+          if (out_verb != NULL && op != output && !*out_verb)
+            {
+              /* Mark sequences that would be ambiguous
+                 in Latin script ("lj", "nj", "dž"). */
+              if((uc == 0x458 || uc == 0x408)
+                 && (op[-1] == 'l' || op[-1] == 'L'
+                     || op[-1] == 'n' || op[-1] == 'N'))
+                 {
+                   *op++ = OPEN_VERBATIM;
+                   *op++ = CLOSE_VERBATIM;
+                 }
+              if((uc == 0x416 || uc == 0x436)
+                 && (op[-1] == 'd' || op[-1] == 'D'))
+                 {
+                   *op++ = OPEN_VERBATIM;
+                   *op++ = CLOSE_VERBATIM;
+                 }
+            }
+          /* Look up replacement from the table.  */
+          repl = table[uc - TABLE_BASE];
 
-          /* Verify the second byte is valid.  */
-          if (second_byte >= 0x80 && second_byte < 0xc0)
+          if (repl[0] != '\0')
             {
-              unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
+              /* Found a replacement.
+                 Now handle the special cases.  */
+              if (uc == 0x0409 || uc == 0x040a || uc == 0x040f)
+                if ((ip + 2 < input_end
+                     && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
+                    || (ip + 3 < input_end
+                        && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
+                                                  (unsigned char) ip[3]))
+                    || (ip >= input + 1
+                        && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
+                    || (ip >= input + 2
+                        && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
+                                                  (unsigned char) ip[-1])))
+                  {
+                    /* Use the upper-case replacement instead of
+                       the mixed-case replacement.  */
+                    switch (uc)
+                      {
+                      case 0x0409:
+                        repl = "LJ"; break;
+                      case 0x040a:
+                        repl = "NJ"; break;
+                      case 0x040f:
+                        repl = "D\xC5\xBD"/* "DŽ" */; break;
+                      default:
+                        abort ();
+                      }
+                  }
 
-              if (uc >= 0x0400 && uc <= 0x04ef)
+              if (out_verb != NULL && *out_verb)
                 {
-                  /* Look up replacement from the table.  */
-                  const char *repl = table[uc - 0x0400];
-
-                  if (repl[0] != '\0')
+                  /* Exit "verbatim" mode when we find something
+                     with a replacement. */
+                  *out_verb = 0;
+                  *op++ = CLOSE_VERBATIM;
+                }
+              /* Use the replacement.  */
+              *op++ = *repl++;
+              if (*repl != '\0')
+                {
+                  *op++ = *repl++;
+                  if (*repl != '\0')
                     {
-                      /* Found a replacement.
-                         Now handle the special cases.  */
-                      if (uc == 0x0409 || uc == 0x040a || uc == 0x040f)
-                        if ((ip + 2 < input_end
-                             && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
-                            || (ip + 3 < input_end
-                                && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
-                                                          (unsigned char) ip[3]))
-                            || (ip >= input + 1
-                                && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
-                            || (ip >= input + 2
-                                && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
-                                                          (unsigned char) ip[-1])))
-                          {
-                            /* Use the upper-case replacement instead of
-                               the mixed-case replacement.  */
-                            switch (uc)
-                              {
-                              case 0x0409:
-                                repl = "LJ"; break;
-                              case 0x040a:
-                                repl = "NJ"; break;
-                              case 0x040f:
-                                repl = "D\xC5\xBD"/* "DŽ" */; break;
-                              default:
-                                abort ();
-                              }
-                          }
-
-                      /* Use the replacement.  */
                       *op++ = *repl++;
+                      /* All replacements have at most 3 bytes.  */
                       if (*repl != '\0')
-                        {
-                          *op++ = *repl++;
-                          if (*repl != '\0')
-                            {
-                              *op++ = *repl++;
-                              /* All replacements have at most 3 bytes.  */
-                              if (*repl != '\0')
-                                abort ();
-                            }
-                        }
-                      ip += 2;
-                      continue;
+                        abort ();
                     }
                 }
-            }
+              ip += 2;
+              continue;
+            } /* if (repl[0] != '\0') */
+        } /* if (uc != 0) */
+      if (out_verb != NULL && !*out_verb
+          && ASCII_LETTER_OR_NON_ASCII(((unsigned char)*ip)))
+        {
+          /* Enter "verbatim" mode. */
+          *out_verb = !0;
+          *op++ = OPEN_VERBATIM;
         }
       *op++ = *ip++;
-    }
+    } /* for (ip = input, op = output; ip < input_end; ) */
+
+  trim_output (op, output, allocated, output_p, output_len_p, out_verb);
+}
+
+static void
+to_cyrillic (const char *input, size_t input_len,
+             char **output_p, size_t *output_len_p,
+             int *in_verb, int *out_verb)
+{
+  /* Since sequences of 1 byte are mapped to sequences of at most 2 bytes,
+     the size of the output will be at most 2 * input_len.
+     When in reversible mode, switching to "verbatim" mode and back
+     may map 2-byte character to 4 bytes (with minimum 2 bytes to close
+     "verbatim" mode). */
+  size_t allocated = out_verb != NULL? 2 * input_len + 2: 2 * input_len, lat_len;
+  char *output = XNMALLOC (allocated, char);
 
-  {
-    size_t output_len = op - output;
+  const char *input_end = input + input_len;
+  const char *ip, *cyr;
+  char *op;
 
-    /* Verify that the allocated size was not exceeded.  */
-    if (output_len > allocated)
-      abort ();
-    /* Shrink the result.  */
-    if (output_len < allocated)
-      output = (char *) xrealloc (output, output_len);
+  if(!reverse_table_ready)
+    fill_reverse_table();
+
+  for (ip = input, op = output; ip < input_end; )
+    {
+      if ((in_verb != NULL || out_verb != NULL)
+          && process_escapes(&ip, input_end, &op, in_verb, out_verb))
+        continue;
+
+      if (out_verb)
+        {
+          unsigned int uc = test_for_cyrillic (ip, input_end);
 
-    /* Done.  */
-    *output_p = output;
-    *output_len_p = output_len;
-  }
+          if (uc != 0 && table[uc - TABLE_BASE][0] == '\0')
+            uc = 0;
+
+          if (!*out_verb && uc)
+            {
+              /* Enter "verbatim" mode on any Cyrillic letter that would
+                 be recoded in Cyrillic-to-Latin conversion. */
+              *out_verb = !0;
+              *op++ = OPEN_VERBATIM;
+            }
+          else if (*out_verb && !uc
+                   && ASCII_LETTER_OR_NON_ASCII(((unsigned char)*ip)))
+            {
+              /* Exit "verbatim" mode. */
+              *out_verb = 0;
+              *op++ = CLOSE_VERBATIM;
+            }
+        }
+      /* Look up replacement. */
+      lat_len = get_reverse_table_item (ip, input_end - ip, &cyr);
+      if (lat_len == 0)
+        {
+         /* No replacement found. */
+          *op++ = *ip++;
+          continue;
+        }
+      ip += lat_len;
+      /* All Cyrillic replacements are two-byte. */
+      *op++ = *cyr++;
+      *op++ = *cyr++;
+    } /* for (ip = input, op = output; ip < input_end; ) */
+
+  trim_output (op, output, allocated, output_p, output_len_p, out_verb);
+}
+
+void
+serbian_to_latin (const char *in, size_t in_len, char **out, size_t *out_len,
+                  int *in_verb, int *out_verb, int reverse)
+{
+  if (reverse)
+    to_cyrillic (in, in_len, out, out_len, in_verb, out_verb);
+  else
+    to_latin (in, in_len, out, out_len, in_verb, out_verb);
 }
diff --git a/gettext-tools/src/filters.h b/gettext-tools/src/filters.h
index 93128b0..0cd92ea 100644
--- a/gettext-tools/src/filters.h
+++ b/gettext-tools/src/filters.h
@@ -22,14 +22,19 @@ extern "C" {
 
 /* Convert a string INPUT of INPUT_LEN bytes containing Serbian input
    to Latin script (not Latin language :-)), converting Cyrillic letters to
-   Latin letters.
+   Latin letters (or, when REVERSE is not zero, from Latin to Cyrillic).
    Store the freshly allocated result in *OUTPUT_P and its length (in bytes)
    in *OUTPUT_LEN_P.
+   *IN_VERB is the variable that keeps the "verbatim" state in input,
+   i.e. whether the letters go to output untransformed.
+   *OUT_VERB is the variable that signals about "verbatim" mode in output,
+   i.e. the letters should go back untransformed.
+   Special case: when INPUT_LEN == 0 and *OUT_VERB is nonzero, a closing
+   sequence for the "verbatim" mode is emitted.
    Input and output are in UTF-8 encoding.  */
 extern void serbian_to_latin (const char *input, size_t input_len,
-                              char **output_p, size_t *output_len_p);
-
+                              char **output_p, size_t *output_len_p,
+                              int *in_verb, int *out_verb, int reverse);
 #ifdef __cplusplus
 }
 #endif
-
diff --git a/gettext-tools/src/recode-sr-latin.c b/gettext-tools/src/recode-sr-latin.c
index 25b88f6..6958190 100644
--- a/gettext-tools/src/recode-sr-latin.c
+++ b/gettext-tools/src/recode-sr-latin.c
@@ -49,6 +49,9 @@
 /* Long options.  */
 static const struct option long_options[] =
 {
+  { "decode", no_argument, NULL, 'd' },
+  { "reversible-output", no_argument, NULL, 'r' },
+  { "reversible-input", no_argument, NULL, 'R' },
   { "help", no_argument, NULL, 'h' },
   { "version", no_argument, NULL, 'V' },
   { NULL, 0, NULL, 0 }
@@ -60,7 +63,7 @@ static void usage (int status)
      __attribute__ ((noreturn))
 #endif
 ;
-static void process (FILE *stream);
+static void process (FILE *stream, bool rev_in, bool rev_out, bool backwards);
 
 int
 main (int argc, char *argv[])
@@ -68,6 +71,9 @@ main (int argc, char *argv[])
   /* Default values for command line options.  */
   bool do_help = false;
   bool do_version = false;
+  bool rev_in = false;
+  bool rev_out = false;
+  bool backwards = false;
 
   int opt;
 
@@ -87,11 +93,20 @@ main (int argc, char *argv[])
   atexit (close_stdout);
 
   /* Parse command line options.  */
-  while ((opt = getopt_long (argc, argv, "hV", long_options, NULL)) != EOF)
+  while ((opt = getopt_long (argc, argv, "drRhV", long_options, NULL)) != EOF)
     switch (opt)
     {
     case '\0':          /* Long option.  */
       break;
+    case 'd':
+      backwards = true;
+      break;
+    case 'r':
+      rev_out = true;
+      break;
+    case 'R':
+      rev_in = true;
+      break;
     case 'h':
       do_help = true;
       break;
@@ -129,7 +144,7 @@ There is NO WARRANTY, to the extent permitted by law.\n\
   if (argc - optind > 0)
     error (EXIT_FAILURE, 0, _("too many arguments"));
 
-  process (stdin);
+  process (stdin, rev_in, rev_out, backwards);
 
   exit (EXIT_SUCCESS);
 }
@@ -151,7 +166,7 @@ Usage: %s [OPTION]\n\
       printf ("\n");
       /* xgettext: no-wrap */
       printf (_("\
-Recode Serbian text from Cyrillic to Latin script.\n"));
+Recode Serbian text from Cyrillic to Latin script (or vice versa).\n"));
       /* xgettext: no-wrap */
       printf (_("\
 The input text is read from standard input.  The converted text is output to\n\
@@ -159,6 +174,19 @@ standard output.\n"));
       printf ("\n");
       /* xgettext: no-wrap */
       printf (_("\
+Output control:\n"));
+      /* xgettext: no-wrap */
+      printf (_("\
+  -d, --decode                convert backwards (Latin to Cyrillic)\n"));
+      /* xgettext: no-wrap */
+      printf (_("\
+  -r, --reversible-output     output in reversible mode\n"));
+      /* xgettext: no-wrap */
+      printf (_("\
+  -R, --reversible-input      input in reversible mode\n"));
+      printf ("\n");
+      /* xgettext: no-wrap */
+      printf (_("\
 Informative output:\n"));
       /* xgettext: no-wrap */
       printf (_("\
@@ -254,11 +282,12 @@ destroy_linebuffer (struct linebuffer *lb)
 
 /* Process the input and produce the output.  */
 static void
-process (FILE *stream)
+process (FILE *stream, bool rev_in, bool rev_out, bool backwards)
 {
   struct linebuffer lb;
   const char *locale_code = locale_charset ();
   bool need_code_conversion = (c_strcasecmp (locale_code, "UTF-8") != 0);
+  int out_verb = 0, in_verb = 0, finish = 0;
 #if HAVE_ICONV
   iconv_t conv_to_utf8 = (iconv_t)(-1);
   iconv_t conv_from_utf8 = (iconv_t)(-1);
@@ -310,7 +339,7 @@ This version was built without iconv()."),
      Processing it character by character is not possible, because some
      filters need to look at adjacent characters.  Processing the entire file
      in a whole chunk would take an excessive amount of memory.  */
-  for (;;)
+  while (!finish)
     {
       char *line;
       size_t line_len;
@@ -319,11 +348,21 @@ This version was built without iconv()."),
 
       /* Read a line.  */
       if (read_linebuffer (&lb, stream) == NULL)
-        break;
+        {
+          if (!(rev_out && out_verb))
+            break;
+          /* Run for the last time with empty input to close
+             "verbatim" mode, so concatenation of outputs
+             produces a result effectively the same as the output
+             of concatenated inputs. */
+          lb.buffer = NULL;
+          lb.length = 0;
+          finish = !0;
+        }
       line = lb.buffer;
       line_len = lb.length;
       /* read_linebuffer always returns a non-void result.  */
-      if (line_len == 0)
+      if (line_len == 0 && line != NULL)
         abort ();
 
 #if HAVE_ICONV
@@ -352,7 +391,9 @@ This version was built without iconv()."),
 #endif
 
       /* Apply the filter.  */
-      serbian_to_latin (line, line_len, &filtered_line, &filtered_line_len);
+      serbian_to_latin (line, line_len, &filtered_line, &filtered_line_len,
+                        rev_in? &in_verb: NULL, rev_out? &out_verb: NULL,
+                        backwards);
 
 #if HAVE_ICONV
       /* Convert it back to the original encoding.  */

[Prev in Thread]

Current Thread

[Next in Thread]

[bug-gettext] reversible recode-sr-latin?, Ineiev <=
- Re: [bug-gettext] reversible recode-sr-latin?, Daiki Ueno, 2013/04/23
  - Re: [bug-gettext] reversible recode-sr-latin?, Ineiev, 2013/04/25

Prev by Date: Re: [bug-gettext] JavaScript support
Next by Date: [bug-gettext] msgfilter: Rules-quot implicity depends on GNU Sed.
Previous by thread: [bug-gettext] [PATCH] Make header checking more reliable.
Next by thread: Re: [bug-gettext] reversible recode-sr-latin?
Index(es):
- Date
- Thread