gnunet-svn
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[GNUnet-SVN] r9810 - Extractor/src/plugins


From: gnunet
Subject: [GNUnet-SVN] r9810 - Extractor/src/plugins
Date: Sat, 19 Dec 2009 13:58:35 +0100

Author: grothoff
Date: 2009-12-19 13:58:35 +0100 (Sat, 19 Dec 2009)
New Revision: 9810

Modified:
   Extractor/src/plugins/translitextractor.c
Log:
dos2unix

Modified: Extractor/src/plugins/translitextractor.c
===================================================================
--- Extractor/src/plugins/translitextractor.c   2009-12-19 12:58:07 UTC (rev 
9809)
+++ Extractor/src/plugins/translitextractor.c   2009-12-19 12:58:35 UTC (rev 
9810)
@@ -1,129 +1,129 @@
-/*
-     This file is part of libextractor.
-     (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
- */  
-  
-/**
- * @brief Transliterate keywords that contain international characters
- * @author Nils Durner
- */ 
-  
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-  
-/* Language independent chars were taken from glibc's locale/C-translit.h.in
- * 
- * This extractor uses two tables: one contains the Unicode
- * characters and the other one contains the transliterations (since
- * transliterations are often used more than once: ä -> ae, æ -> ae).
- * The first table points to an appropriate transliteration stored in the
- * second table.
- * 
- * To generate the two tables, a relational database was prepared:
- *  create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer);
- *  create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer);
- * 
- * After that, the data from glibc was converted to a SQL script using
- * "awk -F '\t'":
- *   {
- *     transl = $2;
- *     gsub(/'/, "''", transl);
- *     print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, 
index($3, ">") - 6) "', '" transl "');";
- *     print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', 
(Select count(*) from TRANSL));";
- *   }
- * 
- * Then the SQL script was executed, "commit"ted and the relation between the
- * two tables established using:
- *   update TBL Set TRANSLID = (Select TRANSLID from TRANSL where 
TRANSL.TRANSL = TBL.TRANSL);
- *   commit;
- * 
- * The C arrays were then created with:
- *   Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI;
- *   Select TRANSL || ', '  from TRANSL order by TRANSLID;
- * and reformatted with:
- *   {
- *     a = $0;
- *     getline;
- *     b = $0;
- *     getline;
- *     c = $0;
- *     getline;
- *     printf("%s %s %s %s\n", a, b, c, $0);
- *   }
- * 
- * The unicode values for the other characters were taken from
- *   http://bigfield.ddo.jp/unicode/unicode0.html
- */ 
+/*
+     This file is part of libextractor.
+     (C) 2002 - 2005 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+ */  
+  
+/**
+ * @brief Transliterate keywords that contain international characters
+ * @author Nils Durner
+ */ 
+  
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+  
+/* Language independent chars were taken from glibc's locale/C-translit.h.in
+ * 
+ * This extractor uses two tables: one contains the Unicode
+ * characters and the other one contains the transliterations (since
+ * transliterations are often used more than once: ä -> ae, æ -> ae).
+ * The first table points to an appropriate transliteration stored in the
+ * second table.
+ * 
+ * To generate the two tables, a relational database was prepared:
+ *  create table TBL(UNI varchar(20), TRANSL varchar(10), TRANSLID integer);
+ *  create table TRANSL (TRANSL varchar(20) primary key, TRANSLID integer);
+ * 
+ * After that, the data from glibc was converted to a SQL script using
+ * "awk -F '\t'":
+ *   {
+ *     transl = $2;
+ *     gsub(/'/, "''", transl);
+ *     print "insert into TBL(UNI, TRANSL) values ('0x" substr($3, 6, 
index($3, ">") - 6) "', '" transl "');";
+ *     print "insert into TRANSL(TRANSL, TRANSLID) values ('" transl "', 
(Select count(*) from TRANSL));";
+ *   }
+ * 
+ * Then the SQL script was executed, "commit"ted and the relation between the
+ * two tables established using:
+ *   update TBL Set TRANSLID = (Select TRANSLID from TRANSL where 
TRANSL.TRANSL = TBL.TRANSL);
+ *   commit;
+ * 
+ * The C arrays were then created with:
+ *   Select '{' || UNI || ', ' || TRANSLID || '},' from TBL order by UNI;
+ *   Select TRANSL || ', '  from TRANSL order by TRANSLID;
+ * and reformatted with:
+ *   {
+ *     a = $0;
+ *     getline;
+ *     b = $0;
+ *     getline;
+ *     c = $0;
+ *     getline;
+ *     printf("%s %s %s %s\n", a, b, c, $0);
+ *   }
+ * 
+ * The unicode values for the other characters were taken from
+ *   http://bigfield.ddo.jp/unicode/unicode0.html
+ */ 
 
-unsigned int chars[][2] = { 
+unsigned int chars[][2] = { 
     {0x00C4, 444}, {0x00D6, 445}, {0x00DC, 446}, {0x00DF, 13},
-  /* Ä, Ö, Ü, ß */ 
-{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */ 
-{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */ 
-{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */ 
-{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */ 
-{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */ 
-{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */ 
-{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */ 
-{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */ 
-{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* �, �, �, � */ 
-  
-  /* Language independent */ 
-{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394}, 
-  {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36}, 
-  {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401}, 
-  {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6}, 
-  {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398}, 
-  {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408}, 
-  {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3}, 
-  {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127}, 
-  {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6}, 
-  {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399}, 
-  {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407}, 
-  {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401}, 
-  {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262}, 
-  {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410}, 
-  {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414}, 
-  {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418}, 
-  {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79}, 
-  {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419}, 
-  {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77}, 
-  {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63}, 
-  {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430}, 
-  {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69}, 
-  {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426}, 
-  {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428}, 
-  {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119}, 
-  {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431}, 
-  {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76}, 
-  {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80}, 
-  {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403}, 
-  {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438}, 
-  {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111}, 
-  {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2}, 
-  {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1}, 
-  {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10}, 
-  {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13}, 
-  {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68}, 
-  {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420}, 
-  {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433}, 
-  {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426}, 
+  /* Ä, Ö, Ü, ß */ 
+{0x00E4, 14}, {0x00F6, 19}, {0x00FC, 447}, {0x00C5, 448}, /* ä, ö, ü, Å */ 
+{0x00E5, 449}, {0x00C6, 444}, {0x00E6, 14}, {0x00D8, 445}, /* å, Æ, æ, Ø */ 
+{0x00F8, 19}, {0x00C0, 419}, {0x00C8, 77}, {0x00D9, 426}, /* ø, À, È, Ù */ 
+{0x00E0, 431}, {0x00E8, 76}, {0x00F9, 5}, {0x00C9, 77}, /* à, è, ù, É */ 
+{0x00E9, 76}, {0x00C2, 419}, {0x00CA, 77}, {0x00CE, 63}, /* é, Â, Ê, Î */ 
+{0x00D4, 423}, {0x00DB, 426}, {0x00E2, 431}, {0x00EA, 76}, /* Ô, Û, â, ê */ 
+{0x00EE, 80}, {0x00F4, 41}, {0x00FB, 5}, {0x00CB, 77}, /* î, ô, û, Ë */ 
+{0x00CF, 63}, {0x00EB, 76}, {0x00EF, 80}, {0x00C7, 57}, /* Ï, ë, ï, Ç */ 
+{0x00E7, 118}, {0x0152, 445}, {0x0053, 19}, {0x0080, 66}, /* �, �, �, � */ 
+  
+  /* Language independent */ 
+{0xFB00, 391}, {0xFB01, 392}, {0xFB02, 393}, {0xFB03, 394}, 
+  {0xFB04, 395}, {0xFB06, 396}, {0xFB29, 40}, {0xFEFF, 36}, 
+  {0xFE4D, 33}, {0xFE4E, 33}, {0xFE4F, 33}, {0xFE5A, 401}, 
+  {0xFE5B, 402}, {0xFE5C, 403}, {0xFE5F, 404}, {0xFE50, 6}, 
+  {0xFE52, 42}, {0xFE54, 397}, {0xFE55, 34}, {0xFE56, 398}, 
+  {0xFE57, 399}, {0xFE59, 400}, {0xFE6A, 407}, {0xFE6B, 408}, 
+  {0xFE60, 405}, {0xFE61, 128}, {0xFE62, 40}, {0xFE63, 3}, 
+  {0xFE64, 47}, {0xFE65, 48}, {0xFE66, 262}, {0xFE68, 127}, 
+  {0xFE69, 406}, {0xFF0A, 128}, {0xFF0B, 40}, {0xFF0C, 6}, 
+  {0xFF0D, 3}, {0xFF0E, 42}, {0xFF0F, 126}, {0xFF01, 399}, 
+  {0xFF02, 38}, {0xFF03, 404}, {0xFF04, 406}, {0xFF05, 407}, 
+  {0xFF06, 405}, {0xFF07, 30}, {0xFF08, 400}, {0xFF09, 401}, 
+  {0xFF1A, 34}, {0xFF1B, 397}, {0xFF1C, 47}, {0xFF1D, 262}, 
+  {0xFF1E, 48}, {0xFF1F, 398}, {0xFF10, 409}, {0xFF11, 410}, 
+  {0xFF12, 411}, {0xFF13, 412}, {0xFF14, 413}, {0xFF15, 414}, 
+  {0xFF16, 415}, {0xFF17, 416}, {0xFF18, 417}, {0xFF19, 418}, 
+  {0xFF2A, 421}, {0xFF2B, 422}, {0xFF2C, 64}, {0xFF2D, 79}, 
+  {0xFF2E, 66}, {0xFF2F, 423}, {0xFF20, 408}, {0xFF21, 419}, 
+  {0xFF22, 75}, {0xFF23, 57}, {0xFF24, 81}, {0xFF25, 77}, 
+  {0xFF26, 78}, {0xFF27, 420}, {0xFF28, 61}, {0xFF29, 63}, 
+  {0xFF3A, 73}, {0xFF3B, 429}, {0xFF3C, 127}, {0xFF3D, 430}, 
+  {0xFF3E, 31}, {0xFF3F, 33}, {0xFF30, 68}, {0xFF31, 69}, 
+  {0xFF32, 70}, {0xFF33, 424}, {0xFF34, 425}, {0xFF35, 426}, 
+  {0xFF36, 100}, {0xFF37, 427}, {0xFF38, 105}, {0xFF39, 428}, 
+  {0xFF4A, 83}, {0xFF4B, 434}, {0xFF4C, 65}, {0xFF4D, 119}, 
+  {0xFF4E, 435}, {0xFF4F, 41}, {0xFF40, 32}, {0xFF41, 431}, 
+  {0xFF42, 432}, {0xFF43, 118}, {0xFF44, 82}, {0xFF45, 76}, 
+  {0xFF46, 433}, {0xFF47, 60}, {0xFF48, 62}, {0xFF49, 80}, 
+  {0xFF5A, 442}, {0xFF5B, 402}, {0xFF5C, 129}, {0xFF5D, 403}, 
+  {0xFF5E, 35}, {0xFF50, 436}, {0xFF51, 437}, {0xFF52, 438}, 
+  {0xFF53, 20}, {0xFF54, 439}, {0xFF55, 5}, {0xFF56, 111}, 
+  {0xFF57, 440}, {0xFF58, 12}, {0xFF59, 441}, {0x00AB, 2}, 
+  {0x00AD, 3}, {0x00AE, 4}, {0x00A0, 0}, {0x00A9, 1}, 
+  {0x00BB, 7}, {0x00BC, 8}, {0x00BD, 9}, {0x00BE, 10}, 
+  {0x00B5, 5}, {0x00B8, 6}, {0x00C6, 11}, {0x00DF, 13}, 
+  {0x00D7, 12}, {0x00E6, 14}, {0x0001D4AA, 423}, {0x0001D4AB, 68}, 
+  {0x0001D4AC, 69}, {0x0001D4AE, 424}, {0x0001D4AF, 425}, {0x0001D4A2, 420}, 
+  {0x0001D4A5, 421}, {0x0001D4A6, 422}, {0x0001D4A9, 66}, {0x0001D4BB, 433}, 
+  {0x0001D4BD, 62}, {0x0001D4BE, 80}, {0x0001D4BF, 83}, {0x0001D4B0, 426}, 
   {0x0001D4B1, 100}, {0x0001D4B2, 427}, {0x0001D4B3, 105}, {0x0001D4B4, 428},
   
 {0x0001D4B5, 73}, {0x0001D4B6, 431}, {0x0001D4B7, 432}, {0x0001D4B8, 118},
@@ -839,9 +839,9 @@
 "Y", "[", "]", "a", 
 "b", "f", "k", "n", 
 "p", "q", "r", "t", 
-"w", "y", "z", "z", 
-    /* German */ "Ae", "Oe", "Ue", "ue", 
-    /* Scandinavian */ "Aa", "aa" 
+"w", "y", "z", "z", 
+    /* German */ "Ae", "Oe", "Ue", "ue", 
+    /* Scandinavian */ "Aa", "aa" 
 };
 
 
@@ -867,7 +867,7 @@
 }
 
 
-struct EXTRACTOR_Keywords *
+struct EXTRACTOR_Keywords *
 libextractor_translit_extract (const char *filename, 
 const char *data,
                                
@@ -892,7 +892,7 @@
 
 
 while (pos != NULL)
-    
+    
     {
       
 int charlen = 0;
@@ -916,28 +916,27 @@
           
 char *tr;
           
-
 
-            /* Get length of character */ 
+            /* Get length of character */ 
             c = srcdata[src];
           
 if ((c & 0xC0) == 0xC0)
-            
-              /* UTF-8 char */ 
+            
+              /* UTF-8 char */ 
               if ((c & 0xE0) == 0xE0)
               
 if ((c & 0xF0) == 0xF0)
                 
 charlen = 4;
-          
+          
               else
                 
 charlen = 3;
-          
+          
             else
               
 charlen = 2;
-          
+          
           else
             
 charlen = 1;
@@ -945,16 +944,15 @@
 
 if (src + charlen - 1 > len)
             {
-              
-                /* incomplete UTF-8 */ 
+              
+                /* incomplete UTF-8 */ 
                 src = len;
               
 continue;
             
 }
-          
-
-            /* Copy character to destination */ 
+          
+            /* Copy character to destination */ 
             if (charlen > 1)
             {
               
@@ -963,39 +961,38 @@
 
 if (charlen == 2)
                 {
-                  
-                    /* 5 bits from the first byte and 6 bits from the second.
-                       64 = 2^6 */ 
+                  
+                    /* 5 bits from the first byte and 6 bits from the second.
+                       64 = 2^6 */ 
                     unicode =
                     ((srcdata[src] & 0x1F) * 64) | (srcdata[src + 1] & 0x3F);
                 
 }
-              
+              
               else if (charlen == 3)
                 {
-                  
-                    /* 4 bits from the first byte and 6 bits from the second 
and third
-                       byte. 4096 = 2^12 */ 
-                    unicode = ((srcdata[src] & 0xF) * 4096) | 
+                  
+                    /* 4 bits from the first byte and 6 bits from the second 
and third
+                       byte. 4096 = 2^12 */ 
+                    unicode = ((srcdata[src] & 0xF) * 4096) | 
                     ((srcdata[src + 1] & 0x3F) *
                      64) | (srcdata[src + 2] & 0x3F);
                 
 }
-              
+              
               else if (charlen == 4)
                 {
-                  
-                    /* 3 bits from the first byte and 6 bits from the second, 
third
-                       and fourth byte. 262144 = 2^18 */ 
-                    unicode = ((srcdata[src] & 7) * 262144) | 
-                    ((srcdata[src] & 0xF) * 4096) | 
+                  
+                    /* 3 bits from the first byte and 6 bits from the second, 
third
+                       and fourth byte. 262144 = 2^18 */ 
+                    unicode = ((srcdata[src] & 7) * 262144) | 
+                    ((srcdata[src] & 0xF) * 4096) | 
                     ((srcdata[src + 1] & 0x3F) *
                      64) | (srcdata[src + 2] & 0x3F);
                 
 }
-              
-
-                /* Look it up */ 
+              
+                /* Look it up */ 
                 idx = 0;
               
 tr = srcdata + src;
@@ -1007,8 +1004,8 @@
                   
 if (unicode == chars[idx][0])
                     {
-                      
-                        /* Found it */ 
+                      
+                        /* Found it */ 
                         tr = translit[chars[idx][1]];
                       
 trlen = strlen (tr);
@@ -1022,7 +1019,7 @@
 }
             
 }
-          
+          
           else
             
 trlen = 1;
@@ -1040,12 +1037,12 @@
 
 if (charlen > 1)
             {
-              
-                /* Copy character to destination string */ 
+              
+                /* Copy character to destination string */ 
                 memcpy (transl + dest, tr, trlen);
             
 }
-          
+          
           else
             
 transl[dest] = c;
@@ -1076,4 +1073,4 @@
 
 }
 
-
+





reply via email to

[Prev in Thread] Current Thread [Next in Thread]