[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Eliot-dev] eliot/dic compdic.cpp compdic.h
From: |
Olivier Teulière |
Subject: |
[Eliot-dev] eliot/dic compdic.cpp compdic.h |
Date: |
Sun, 16 May 2010 10:06:01 +0000 |
CVSROOT: /cvsroot/eliot
Module name: eliot
Changes by: Olivier Teulière <ipkiss> 10/05/16 10:06:01
Modified files:
dic : compdic.cpp compdic.h
Log message:
Sort the word list before processing it. It allows a much better
compression, and avoid problems when a word is a prefix of the word just before.
Also, the word list is not a raw buffer anymore, which makes it a bit
easier to understand the algorithm.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&r1=1.18&r2=1.19
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.h?cvsroot=eliot&r1=1.1&r2=1.2
Patches:
Index: compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/compdic.cpp,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -b -r1.18 -r1.19
--- compdic.cpp 15 May 2010 12:14:53 -0000 1.18
+++ compdic.cpp 16 May 2010 10:06:01 -0000 1.19
@@ -125,7 +125,7 @@
}
-const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int
&oDicSize)
+void CompDic::loadWordList(const string &iFileName, vector<wstring> &oWordList)
{
ifstream file(iFileName.c_str(), ios::in | ios::binary);
if (!file.is_open())
@@ -135,42 +135,28 @@
struct stat stat_buf;
if (stat(iFileName.c_str(), &stat_buf) < 0)
throw DicException((fmt(_("Could not open file '%1%'")) %
iFileName).str());
- oDicSize = (unsigned int)stat_buf.st_size;
+ int dicSize = (unsigned int)stat_buf.st_size;
- // Place the buffer in a vector to avoid worrying about memory handling
- vector<char> buffer(oDicSize);
- // Load the file data, everything in one shot
- file.read(&buffer.front(), oDicSize);
- file.close();
-
- // If there is a BOM in the file, use an offset to start reading after it
- size_t bomOffset = 0;
- if ((uint8_t)buffer[0] == 0xEF &&
- (uint8_t)buffer[1] == 0xBB &&
- (uint8_t)buffer[2] == 0xBF)
- {
- bomOffset = 3;
- }
-
- // Buffer for the wide characters (it will use at most as many characters
- // as the utf-8 version)
- wchar_t *wideBuf = new wchar_t[oDicSize];
+ // Reserve some space (heuristic: the average length of words is 11)
+ oWordList.reserve(dicSize / 11);
- try
+ string line;
+ while (getline(file, line))
+ {
+ // If there is a BOM in the file, remove it from the first word
+ if (oWordList.empty() && line.size() >= 3 &&
+ (uint8_t)line[0] == 0xEF &&
+ (uint8_t)line[1] == 0xBB &&
+ (uint8_t)line[2] == 0xBF)
{
- unsigned int number = readFromUTF8(wideBuf, oDicSize,
- (&buffer.front()) + bomOffset,
- oDicSize - bomOffset,
- "loadWordList");
- oDicSize = number;
- return wideBuf;
- }
- catch (...)
- {
- // Avoid leaks, and propagate the exception
- delete[] wideBuf;
- throw;
+ line = line.substr(3);
}
+ oWordList.push_back(readFromUTF8(line.data(),
+ line.size(), "loadWordList"));
+ }
+
+ // Sort the word list, to perform a better compression
+ sort(oWordList.begin(), oWordList.end());
}
@@ -229,8 +215,11 @@
#endif
-unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
- const Header &iHeader)
+unsigned int CompDic::makeNode(ostream &outFile, const Header &iHeader,
+ vector<wstring>::const_iterator &itCurrWord,
+ const vector<wstring>::const_iterator
&itLastWord,
+ wstring::const_iterator &itPosInWord,
+ const wchar_t *iPrefix)
{
#ifdef CHECK_RECURSION
IncDec inc(m_currentRec);
@@ -256,7 +245,9 @@
newEdge.last = 0;
try
{
- newEdge.chr = iHeader.getCodeFromChar(*m_endString++ = *m_input++);
+ newEdge.chr = iHeader.getCodeFromChar(*m_endString = *itPosInWord);
+ ++m_endString;
+ ++itPosInWord;
}
catch (DicException &e)
{
@@ -271,32 +262,31 @@
edges.push_back(newEdge);
// End of a word?
- if (*m_input == L'\n' || *m_input == L'\r')
+ if (itPosInWord == itCurrWord->end())
{
m_headerInfo.nwords++;
*m_endString = L'\0';
// Mark edge as word
edges.back().term = 1;
- // Skip \r and/or \n
- while (m_input != m_endOfInput &&
- (*m_input == L'\n' || *m_input == L'\r'))
- {
- ++m_input;
- }
+ // Next word
+ ++itCurrWord;
// At the end of input?
- if (m_input == m_endOfInput)
+ if (itCurrWord == itLastWord)
break;
+ itPosInWord = itCurrWord->begin();
m_endString = m_stringBuf;
- while (*m_endString == *m_input)
+ // This assumes that a word cannot be a prefix of the previous one
+ while (*m_endString == *itPosInWord)
{
- m_endString++;
- m_input++;
+ ++m_endString;
+ ++itPosInWord;
}
}
// Make dawg pointed to by this edge
- edges.back().ptr = makeNode(iPrefix + 1, outFile, iHeader);
+ edges.back().ptr = makeNode(outFile, iHeader, itCurrWord, itLastWord,
+ itPosInWord, iPrefix + 1);
}
int numedges = edges.size();
@@ -348,17 +338,16 @@
throw DicException(oss.str());
}
- const wchar_t *wordList = NULL;
- try
- {
const clock_t startLoadTime = clock();
- unsigned int dicSize;
- wordList = loadWordList(iWordListFile, dicSize);
+ vector<wstring> wordList;
+ loadWordList(iWordListFile, wordList);
const clock_t endLoadTime = clock();
m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
- m_input = wordList;
- m_endOfInput = m_input + dicSize;
+ if (wordList.empty())
+ {
+ throw DicException(_("The word list is empty!"));
+ }
// Write the header a first time, to reserve the space in the file
Header tempHeader = writeHeader(outFile);
@@ -370,12 +359,17 @@
DicEdge *tmpPtr = &specialNode;
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
+ vector<wstring>::const_iterator firstWord = wordList.begin();
+ wstring::const_iterator initialPos = firstWord->begin();
+
// Call makeNode with null (relative to stringbuf) prefix;
// Initialize string to null; Put index of start node on output
DicEdge rootNode = {0, 0, 0, 0};
m_endString = m_stringBuf;
const clock_t startBuildTime = clock();
- rootNode.ptr = makeNode(m_endString, outFile, tempHeader);
+ rootNode.ptr = makeNode(outFile, tempHeader,
+ firstWord, wordList.end(),
+ initialPos, m_endString);
// Reuse the temporary variable
tmpPtr = &rootNode;
writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
@@ -387,17 +381,8 @@
const Header finalHeader = writeHeader(outFile);
// Clean up
- delete[] wordList;
outFile.close();
return finalHeader;
- }
- catch (std::exception &e)
- {
- // Avoid memory leaks
- if (wordList != NULL)
- delete[] wordList;
- throw;
- }
}
Index: compdic.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/compdic.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- compdic.h 15 May 2010 12:14:53 -0000 1.1
+++ compdic.h 16 May 2010 10:06:01 -0000 1.2
@@ -94,10 +94,6 @@
wchar_t m_stringBuf[MAX_STRING_LENGTH];
/// Point to the end of the string
wchar_t* m_endString;
- /// Current position in the word list
- const wchar_t *m_input;
- /// Mark the end of the input
- const wchar_t *m_endOfInput;
#ifdef CHECK_RECURSION
map<int, vector<DicEdge> > m_mapForDepth;
int m_currentRec;
@@ -110,15 +106,12 @@
/**
* Read the word list stored in iFileName, convert it to wide chars,
- * and return it. The oDicSize parameter contains the size of the
- * returned array.
+ * and return it (in the oWordList argument).
* In case of problem, an exception is thrown.
* @param iFileName: Name (and path) of the file containing the word list.
- * @param oDicSize: Size of the returned array
- * @return Word list as a wchar_t array
+ * @param oWordList: Word list
*/
- const wchar_t * loadWordList(const string &iFileName,
- unsigned int &oDicSize);
+ void loadWordList(const string &iFileName, vector<wstring> &oWordList);
Header writeHeader(ostream &outFile) const;
@@ -137,13 +130,20 @@
* the words beginning with that prefix. String is a pointer (relative
* to m_stringBuf) indicating how much of iPrefix is matched in the
* input.
- * @param iPrefix: prefix to work on
* @param outfile: stream where to write the nodes
* @param iHeader: temporary header, used only to do the conversion between
* the (wide) chars and their corresponding internal code
+ * @param itCurrWord: iterator on the word list
+ * @param itLastWord: end of the word list
+ * @param itPosInWord: iterator on the letters of the current word
+ * @param iPrefix: prefix to work on
+ * @return the index of a DAWG matching all the words with prefix iPrefix
*/
- unsigned int makeNode(const wchar_t *iPrefix, ostream &outFile,
- const Header &iHeader);
+ unsigned int makeNode(ostream &outFile, const Header &iHeader,
+ vector<wstring>::const_iterator &itCurrWord,
+ const vector<wstring>::const_iterator &itLastWord,
+ wstring::const_iterator &itPosInWord,
+ const wchar_t *iPrefix);
};
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Eliot-dev] eliot/dic compdic.cpp compdic.h,
Olivier Teulière <=