eliot-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Eliot-dev] eliot/dic compdic.cpp compdic.h


From: Olivier Teulière
Subject: [Eliot-dev] eliot/dic compdic.cpp compdic.h
Date: Sun, 16 May 2010 10:06:01 +0000

CVSROOT:        /cvsroot/eliot
Module name:    eliot
Changes by:     Olivier Teulière <ipkiss>       10/05/16 10:06:01

Modified files:
        dic            : compdic.cpp compdic.h 

Log message:
        Sort the word list before processing it. It allows a much better 
compression, and avoid problems when a word is a prefix of the word just before.
        
        Also, the word list is not a raw buffer anymore, which makes it a bit 
easier to understand the algorithm.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&r1=1.18&r2=1.19
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.h?cvsroot=eliot&r1=1.1&r2=1.2

Patches:
Index: compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/compdic.cpp,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -b -r1.18 -r1.19
--- compdic.cpp 15 May 2010 12:14:53 -0000      1.18
+++ compdic.cpp 16 May 2010 10:06:01 -0000      1.19
@@ -125,7 +125,7 @@
 }
 
 
-const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int 
&oDicSize)
+void CompDic::loadWordList(const string &iFileName, vector<wstring> &oWordList)
 {
     ifstream file(iFileName.c_str(), ios::in | ios::binary);
     if (!file.is_open())
@@ -135,42 +135,28 @@
     struct stat stat_buf;
     if (stat(iFileName.c_str(), &stat_buf) < 0)
         throw DicException((fmt(_("Could not open file '%1%'")) % 
iFileName).str());
-    oDicSize = (unsigned int)stat_buf.st_size;
+    int dicSize = (unsigned int)stat_buf.st_size;
 
-    // Place the buffer in a vector to avoid worrying about memory handling
-    vector<char> buffer(oDicSize);
-    // Load the file data, everything in one shot
-    file.read(&buffer.front(), oDicSize);
-    file.close();
-
-    // If there is a BOM in the file, use an offset to start reading after it
-    size_t bomOffset = 0;
-    if ((uint8_t)buffer[0] == 0xEF &&
-        (uint8_t)buffer[1] == 0xBB &&
-        (uint8_t)buffer[2] == 0xBF)
-    {
-        bomOffset = 3;
-    }
-
-    // Buffer for the wide characters (it will use at most as many characters
-    // as the utf-8 version)
-    wchar_t *wideBuf = new wchar_t[oDicSize];
+    // Reserve some space (heuristic: the average length of words is 11)
+    oWordList.reserve(dicSize / 11);
 
-    try
+    string line;
+    while (getline(file, line))
+    {
+        // If there is a BOM in the file, remove it from the first word
+        if (oWordList.empty() && line.size() >= 3 &&
+            (uint8_t)line[0] == 0xEF &&
+            (uint8_t)line[1] == 0xBB &&
+            (uint8_t)line[2] == 0xBF)
     {
-        unsigned int number = readFromUTF8(wideBuf, oDicSize,
-                                           (&buffer.front()) + bomOffset,
-                                           oDicSize - bomOffset,
-                                           "loadWordList");
-        oDicSize = number;
-        return wideBuf;
-    }
-    catch (...)
-    {
-        // Avoid leaks, and propagate the exception
-        delete[] wideBuf;
-        throw;
+            line = line.substr(3);
     }
+        oWordList.push_back(readFromUTF8(line.data(),
+                                        line.size(), "loadWordList"));
+    }
+
+    // Sort the word list, to perform a better compression
+    sort(oWordList.begin(), oWordList.end());
 }
 
 
@@ -229,8 +215,11 @@
 #endif
 
 
-unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
-                               const Header &iHeader)
+unsigned int CompDic::makeNode(ostream &outFile, const Header &iHeader,
+                               vector<wstring>::const_iterator &itCurrWord,
+                               const vector<wstring>::const_iterator 
&itLastWord,
+                               wstring::const_iterator &itPosInWord,
+                               const wchar_t *iPrefix)
 {
 #ifdef CHECK_RECURSION
     IncDec inc(m_currentRec);
@@ -256,7 +245,9 @@
         newEdge.last = 0;
         try
         {
-            newEdge.chr = iHeader.getCodeFromChar(*m_endString++ = *m_input++);
+            newEdge.chr = iHeader.getCodeFromChar(*m_endString = *itPosInWord);
+            ++m_endString;
+            ++itPosInWord;
         }
         catch (DicException &e)
         {
@@ -271,32 +262,31 @@
         edges.push_back(newEdge);
 
         // End of a word?
-        if (*m_input == L'\n' || *m_input == L'\r')
+        if (itPosInWord == itCurrWord->end())
         {
             m_headerInfo.nwords++;
             *m_endString = L'\0';
             // Mark edge as word
             edges.back().term = 1;
 
-            // Skip \r and/or \n
-            while (m_input != m_endOfInput &&
-                   (*m_input == L'\n' || *m_input == L'\r'))
-            {
-                ++m_input;
-            }
+            // Next word
+            ++itCurrWord;
             // At the end of input?
-            if (m_input == m_endOfInput)
+            if (itCurrWord == itLastWord)
                 break;
+            itPosInWord = itCurrWord->begin();
 
             m_endString = m_stringBuf;
-            while (*m_endString == *m_input)
+            // This assumes that a word cannot be a prefix of the previous one
+            while (*m_endString == *itPosInWord)
             {
-                m_endString++;
-                m_input++;
+                ++m_endString;
+                ++itPosInWord;
             }
         }
         // Make dawg pointed to by this edge
-        edges.back().ptr = makeNode(iPrefix + 1, outFile, iHeader);
+        edges.back().ptr = makeNode(outFile, iHeader, itCurrWord, itLastWord,
+                                    itPosInWord, iPrefix + 1);
     }
 
     int numedges = edges.size();
@@ -348,17 +338,16 @@
         throw DicException(oss.str());
     }
 
-    const wchar_t *wordList = NULL;
-    try
-    {
         const clock_t startLoadTime = clock();
-        unsigned int dicSize;
-        wordList = loadWordList(iWordListFile, dicSize);
+    vector<wstring> wordList;
+    loadWordList(iWordListFile, wordList);
         const clock_t endLoadTime = clock();
         m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
 
-        m_input = wordList;
-        m_endOfInput = m_input + dicSize;
+    if (wordList.empty())
+    {
+        throw DicException(_("The word list is empty!"));
+    }
 
         // Write the header a first time, to reserve the space in the file
         Header tempHeader = writeHeader(outFile);
@@ -370,12 +359,17 @@
         DicEdge *tmpPtr = &specialNode;
         writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
 
+    vector<wstring>::const_iterator firstWord = wordList.begin();
+    wstring::const_iterator initialPos = firstWord->begin();
+
         // Call makeNode with null (relative to stringbuf) prefix;
         // Initialize string to null; Put index of start node on output
         DicEdge rootNode = {0, 0, 0, 0};
         m_endString = m_stringBuf;
         const clock_t startBuildTime = clock();
-        rootNode.ptr = makeNode(m_endString, outFile, tempHeader);
+    rootNode.ptr = makeNode(outFile, tempHeader,
+                            firstWord, wordList.end(),
+                            initialPos, m_endString);
         // Reuse the temporary variable
         tmpPtr = &rootNode;
         writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
@@ -387,17 +381,8 @@
         const Header finalHeader = writeHeader(outFile);
 
         // Clean up
-        delete[] wordList;
         outFile.close();
 
         return finalHeader;
-    }
-    catch (std::exception &e)
-    {
-        // Avoid memory leaks
-        if (wordList != NULL)
-            delete[] wordList;
-        throw;
-    }
 }
 

Index: compdic.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/compdic.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- compdic.h   15 May 2010 12:14:53 -0000      1.1
+++ compdic.h   16 May 2010 10:06:01 -0000      1.2
@@ -94,10 +94,6 @@
     wchar_t m_stringBuf[MAX_STRING_LENGTH];
     /// Point to the end of the string
     wchar_t* m_endString;
-    /// Current position in the word list
-    const wchar_t *m_input;
-    /// Mark the end of the input
-    const wchar_t *m_endOfInput;
 #ifdef CHECK_RECURSION
     map<int, vector<DicEdge> > m_mapForDepth;
     int m_currentRec;
@@ -110,15 +106,12 @@
 
     /**
      * Read the word list stored in iFileName, convert it to wide chars,
-     * and return it. The oDicSize parameter contains the size of the
-     * returned array.
+     * and return it (in the oWordList argument).
      * In case of problem, an exception is thrown.
      * @param iFileName: Name (and path) of the file containing the word list.
-     * @param oDicSize: Size of the returned array
-     * @return Word list as a wchar_t array
+     * @param oWordList: Word list
      */
-    const wchar_t * loadWordList(const string &iFileName,
-                                 unsigned int &oDicSize);
+    void loadWordList(const string &iFileName, vector<wstring> &oWordList);
 
     Header writeHeader(ostream &outFile) const;
 
@@ -137,13 +130,20 @@
      * the words beginning with that prefix.  String is a pointer (relative
      * to m_stringBuf) indicating how much of iPrefix is matched in the
      * input.
-     * @param iPrefix: prefix to work on
      * @param outfile: stream where to write the nodes
      * @param iHeader: temporary header, used only to do the conversion between
      *      the (wide) chars and their corresponding internal code
+     * @param itCurrWord: iterator on the word list
+     * @param itLastWord: end of the word list
+     * @param itPosInWord: iterator on the letters of the current word
+     * @param iPrefix: prefix to work on
+     * @return the index of a DAWG matching all the words with prefix iPrefix
      */
-    unsigned int makeNode(const wchar_t *iPrefix, ostream &outFile,
-                          const Header &iHeader);
+    unsigned int makeNode(ostream &outFile, const Header &iHeader,
+                          vector<wstring>::const_iterator &itCurrWord,
+                          const vector<wstring>::const_iterator &itLastWord,
+                          wstring::const_iterator &itPosInWord,
+                          const wchar_t *iPrefix);
 
 };
 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]