[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Eliot-dev] eliot/dic Makefile.am compdic.cpp compdic.h com...
From: |
Olivier Teulière |
Subject: |
[Eliot-dev] eliot/dic Makefile.am compdic.cpp compdic.h com... |
Date: |
Sat, 15 May 2010 12:14:54 +0000 |
CVSROOT: /cvsroot/eliot
Module name: eliot
Changes by: Olivier Teulière <ipkiss> 10/05/15 12:14:53
Modified files:
dic : Makefile.am compdic.cpp
Added files:
dic : compdic.h compdicmain.cpp
Log message:
The dictionary creation is now encapsulated into a dedicated class, to
allow reusing it easily
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/Makefile.am?cvsroot=eliot&r1=1.21&r2=1.22
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&r1=1.17&r2=1.18
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.h?cvsroot=eliot&rev=1.1
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdicmain.cpp?cvsroot=eliot&rev=1.1
Patches:
Index: Makefile.am
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Makefile.am,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -b -r1.21 -r1.22
--- Makefile.am 1 May 2009 09:04:47 -0000 1.21
+++ Makefile.am 15 May 2010 12:14:53 -0000 1.22
@@ -32,7 +32,8 @@
encoding.cpp encoding.h \
automaton.cpp automaton.h \
regexp.cpp regexp.h \
- grammar.cpp grammar.h
+ grammar.cpp grammar.h \
+ compdic.cpp compdic.h
#####################################
if BUILD_DICTOOLS
@@ -42,7 +43,7 @@
listdic \
regexp
-compdic_SOURCES=compdic.cpp
+compdic_SOURCES=compdicmain.cpp
compdic_CPPFLAGS=$(AM_CPPFLAGS) @BOOST_CPPFLAGS@
compdic_LDADD=libdic.a @LIBINTL@
Index: compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/compdic.cpp,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -b -r1.17 -r1.18
--- compdic.cpp 20 Apr 2010 20:49:58 -0000 1.17
+++ compdic.cpp 15 May 2010 12:14:53 -0000 1.18
@@ -24,14 +24,10 @@
#include <fstream>
#include <sstream>
#include <iostream>
-#include <vector>
#include <map>
#include <boost/format.hpp>
#include <boost/foreach.hpp>
-#include <boost/tokenizer.hpp>
-#include <boost/unordered_map.hpp>
#include <boost/functional/hash.hpp>
-#include <getopt.h>
#include <ctime>
#include <sys/types.h>
#include <sys/stat.h>
@@ -42,6 +38,10 @@
#include <cerrno>
#include <cstring>
+#include "compdic.h"
+#include "encoding.h"
+#include "dic_exception.h"
+
// For htonl & Co.
#ifdef WIN32
# include <winsock2.h>
@@ -60,131 +60,36 @@
#else
# define _(String) String
#endif
-#ifdef WIN32
-# include <windows.h>
-#endif
-
-#include "encoding.h"
-#include "header.h"
-#include "dic_internals.h"
-#include "dic_exception.h"
-
-using namespace std;
// Useful shortcut
#define fmt(a) boost::format(a)
-//#define DEBUG_OUTPUT
-#define CHECK_RECURSION
-
-unsigned int getFileSize(const string &iFileName)
+CompDic::CompDic()
+ : m_currentRec(0), m_maxRec(0), m_loadTime(0), m_buildTime(0)
{
- struct stat stat_buf;
- if (stat(iFileName.c_str(), &stat_buf) < 0)
- throw DicException((fmt(_("Could not open file '%1%'")) %
iFileName).str());
- return (unsigned int)stat_buf.st_size;
-}
-
-const wchar_t* load_uncompressed(const string &iFileName, unsigned int
&ioDicSize)
-{
- ifstream file(iFileName.c_str(), ios::in | ios::binary);
- if (!file.is_open())
- throw DicException((fmt(_("Could not open file '%1%'")) %
iFileName).str());
-
- // Place the buffer in a vector to avoid worrying about memory handling
- vector<char> buffer(ioDicSize);
- // Load the file data, everything in one shot
- file.read(&buffer.front(), ioDicSize);
- file.close();
-
- // If there is a BOM in the file, use an offset to start reading after it
- size_t bomOffset = 0;
- if ((uint8_t)buffer[0] == 0xEF &&
- (uint8_t)buffer[1] == 0xBB &&
- (uint8_t)buffer[2] == 0xBF)
- {
- bomOffset = 3;
- }
-
- // Buffer for the wide characters (it will use at most as many characters
- // as the utf-8 version)
- wchar_t *wideBuf = new wchar_t[ioDicSize];
-
- try
- {
- unsigned int number = readFromUTF8(wideBuf, ioDicSize,
- (&buffer.front()) + bomOffset,
- ioDicSize - bomOffset,
- "load_uncompressed");
- ioDicSize = number;
- return wideBuf;
- }
- catch (...)
- {
- // Avoid leaks, and propagate the exception
- delete[] wideBuf;
- throw;
- }
+ m_headerInfo.root = 0;
+ m_headerInfo.nwords = 0;
+ m_headerInfo.nodesused = 1;
+ m_headerInfo.edgesused = 1;
+ m_headerInfo.nodessaved = 0;
+ m_headerInfo.edgessaved = 0;
}
-void readLetters(const string &iFileName, DictHeaderInfo &ioHeaderInfo)
+void CompDic::addLetter(wchar_t chr, int points, int frequency,
+ bool isVowel, bool isConsonant,
+ const vector<wstring> &iInputs)
{
- ifstream in(iFileName.c_str());
- if (!in.is_open())
- throw DicException((fmt(_("Could not open file '%1%'")) %
iFileName).str());
-
- // Use a more friendly type name
- typedef boost::tokenizer<boost::char_separator<wchar_t>,
- std::wstring::const_iterator,
- std::wstring> Tokenizer;
-
- int lineNb = 1;
- string line;
- while (getline(in, line))
- {
- // Ignore empty lines
- if (line == "" || line == "\r" || line == "\n")
- continue;
-
- // Convert the line to a wstring
- const wstring &wline = readFromUTF8(line.c_str(), line.size(),
"readLetters (1)");
- // Split the lines on space characters
- boost::char_separator<wchar_t> sep(L" ");
- Tokenizer tok(wline, sep);
- Tokenizer::iterator it;
- vector<wstring> tokens(tok.begin(), tok.end());
-
- // We expect at least 5 fields on the line
- if (tokens.size() < 5)
- {
- ostringstream ss;
- ss << fmt(_("readLetters: Not enough fields "
- "in %1% (line %2%)")) % iFileName % lineNb;
- throw DicException(ss.str());
- }
-
- // The first field is a single character
- wstring letter = tokens[0];
- if (letter.size() != 1)
- {
- ostringstream ss;
- ss << fmt(_("readLetters: Invalid letter at line %1% "
- "(only one character allowed)")) % lineNb;
- throw DicException(ss.str());
- }
-
// We don't support non-alphabetical characters in the dictionary
// apart from the joker '?'. For more explanations on the issue, see
// on the eliot-dev mailing-list the thread with the following title:
// re: Unable to show menus in Catalan, and some weird char "problem"
// (started on 2009/12/31)
- wchar_t chr = letter[0];
if (!iswalpha(chr) && chr != L'?')
{
ostringstream ss;
- ss << fmt(_("'%1%' is not a valid letter.")) % convertToMb(letter)
<< endl;
+ ss << fmt(_("'%1%' is not a valid letter.")) % convertToMb(chr) <<
endl;
ss << fmt(_("For technical reasons, Eliot currently only supports "
"alphabetical characters as internal character "
"representation, even if the tile has a display string
"
@@ -192,19 +97,19 @@
"word list accordingly."));
throw DicException(ss.str());
}
- wchar_t upChar = towupper(chr);
- ioHeaderInfo.letters += upChar;
- ioHeaderInfo.points.push_back(_wtoi(tokens[1].c_str()));
- ioHeaderInfo.frequency.push_back(_wtoi(tokens[2].c_str()));
- ioHeaderInfo.vowels.push_back(_wtoi(tokens[3].c_str()));
- ioHeaderInfo.consonants.push_back(_wtoi(tokens[4].c_str()));
+ const wchar_t upChar = towupper(chr);
+ m_headerInfo.letters += upChar;
+ m_headerInfo.points.push_back(points);
+ m_headerInfo.frequency.push_back(frequency);
+ m_headerInfo.vowels.push_back(isVowel);
+ m_headerInfo.consonants.push_back(isConsonant);
- if (tokens.size() > 5)
- {
- vector<wstring> inputs(tokens.begin() + 5, tokens.end());
// Ensure the input strings are in upper case
- BOOST_FOREACH(wstring &str, inputs)
+ if (!iInputs.empty())
+ {
+ vector<wstring> upperInputs = iInputs;
+ BOOST_FOREACH(wstring &str, upperInputs)
{
std::transform(str.begin(), str.end(), str.begin(), towupper);
}
@@ -212,50 +117,74 @@
// If the display string is identical to the internal char and if
// there is no other input, no need to save this information, as
// it is already the default.
- if (inputs.size() != 1 || inputs[0] != wstring(1, upChar))
+ if (upperInputs.size() != 1 || upperInputs[0] != wstring(1, upChar))
{
- ioHeaderInfo.displayInputData[upChar] = inputs;
+ m_headerInfo.displayInputData[upChar] = upperInputs;
}
}
-
- ++lineNb;
- }
}
-Header skip_init_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo)
+const wchar_t * CompDic::loadWordList(const string &iFileName, unsigned int
&oDicSize)
{
- ioHeaderInfo.root = 0;
- ioHeaderInfo.nwords = 0;
- ioHeaderInfo.nodesused = 1;
- ioHeaderInfo.edgesused = 1;
- ioHeaderInfo.nodessaved = 0;
- ioHeaderInfo.edgessaved = 0;
+ ifstream file(iFileName.c_str(), ios::in | ios::binary);
+ if (!file.is_open())
+ throw DicException((fmt(_("Could not open file '%1%'")) %
iFileName).str());
- Header aHeader(ioHeaderInfo);
- aHeader.write(outfile);
- return aHeader;
+ // Get the file size
+ struct stat stat_buf;
+ if (stat(iFileName.c_str(), &stat_buf) < 0)
+ throw DicException((fmt(_("Could not open file '%1%'")) %
iFileName).str());
+ oDicSize = (unsigned int)stat_buf.st_size;
+
+ // Place the buffer in a vector to avoid worrying about memory handling
+ vector<char> buffer(oDicSize);
+ // Load the file data, everything in one shot
+ file.read(&buffer.front(), oDicSize);
+ file.close();
+
+ // If there is a BOM in the file, use an offset to start reading after it
+ size_t bomOffset = 0;
+ if ((uint8_t)buffer[0] == 0xEF &&
+ (uint8_t)buffer[1] == 0xBB &&
+ (uint8_t)buffer[2] == 0xBF)
+ {
+ bomOffset = 3;
+ }
+
+ // Buffer for the wide characters (it will use at most as many characters
+ // as the utf-8 version)
+ wchar_t *wideBuf = new wchar_t[oDicSize];
+
+ try
+ {
+ unsigned int number = readFromUTF8(wideBuf, oDicSize,
+ (&buffer.front()) + bomOffset,
+ oDicSize - bomOffset,
+ "loadWordList");
+ oDicSize = number;
+ return wideBuf;
+ }
+ catch (...)
+ {
+ // Avoid leaks, and propagate the exception
+ delete[] wideBuf;
+ throw;
+ }
}
-void fix_header(ostream &outfile, DictHeaderInfo &ioHeaderInfo)
+Header CompDic::writeHeader(ostream &outFile) const
{
- ioHeaderInfo.root = ioHeaderInfo.edgesused;
- // Go back to the beginning of the stream to overwrite the header
- outfile.seekp(0, ios::beg);
-#if defined(WORDS_BIGENDIAN)
-#warning "**********************************************"
-#warning "compdic does not run yet on bigendian machines"
-#warning "**********************************************"
-#else
- Header aHeader(ioHeaderInfo);
- aHeader.write(outfile);
-#endif
+ // Go back to the beginning of the stream before writing the header
+ outFile.seekp(0, ios::beg);
+ Header aHeader(m_headerInfo);
+ aHeader.write(outFile);
+ return aHeader;
}
-// Change endianness of the pointed edges, and write them to the given ostream
-void write_node(uint32_t *ioEdges, unsigned int num, ostream &outfile)
+void CompDic::writeNode(uint32_t *ioEdges, unsigned int num, ostream &outFile)
{
// Handle endianness
for (unsigned int i = 0; i < num; ++i)
@@ -267,16 +196,13 @@
cout << fmt(_("writing %1% edges")) % num << endl;
for (int i = 0; i < num; i++)
{
- outfile.write((char*)(ioEdges + i), sizeof(DicEdge));
+ outFile.write((char*)(ioEdges + i), sizeof(DicEdge));
}
#else
- outfile.write((char*)ioEdges, num * sizeof(DicEdge));
+ outFile.write((char*)ioEdges, num * sizeof(DicEdge));
#endif
}
-#define MAX_STRING_LENGTH 200
-
-
#define MAX_EDGES 2000
/* ods3: ?? */
/* ods4: 1746 */
@@ -295,62 +221,24 @@
class IncDec
{
public:
- IncDec(int &ioCounter)
- : m_counter(ioCounter)
- {
- m_counter++;
- }
-
- ~IncDec()
- {
- m_counter--;
- }
+ IncDec(int &ioCounter) : m_counter(ioCounter) { ++m_counter; }
+ ~IncDec() { --m_counter; }
private:
int &m_counter;
};
-
-int current_rec = 0;
-int max_rec = 0;
#endif
-typedef boost::unordered_map<vector<DicEdge>, unsigned int> HashMap;
-
-/* global variables */
-HashMap global_hashmap;
-wchar_t global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
-wchar_t* global_endstring; /* Marks END of current string */
-const wchar_t* global_input;
-const wchar_t* global_endofinput;
-#ifdef CHECK_RECURSION
-map<int, vector<DicEdge> > global_mapfordepth;
-#endif
-
-/**
- * Makenode takes a prefix (as position relative to stringbuf) and
- * returns an index of the start node of a dawg that recognizes all
- * words beginning with that prefix. String is a pointer (relative
- * to stringbuf) indicating how much of iPrefix is matched in the
- * input.
- * @param iPrefix: prefix to work on
- * @param outfile: stream where to write the nodes
- * @param ioHeaderInfo: information needed to build the final header, updated
- * during the processing
- * @param iHeader: temporary header, used only to do the conversion between
- * the (wide) chars and their corresponding internal code
- */
-unsigned int makenode(const wchar_t *iPrefix, ostream &outfile,
- DictHeaderInfo &ioHeaderInfo, const Header &iHeader)
+unsigned int CompDic::makeNode(const wchar_t *iPrefix, ostream &outFile,
+ const Header &iHeader)
{
#ifdef CHECK_RECURSION
- IncDec inc(current_rec);
- if (current_rec > max_rec)
- max_rec = current_rec;
-#endif
+ IncDec inc(m_currentRec);
+ if (m_currentRec > m_maxRec)
+ m_maxRec = m_currentRec;
-#ifdef CHECK_RECURSION
// Instead of creating a vector, try to reuse an existing one
- vector<DicEdge> &edges = global_mapfordepth[current_rec];
+ vector<DicEdge> &edges = m_mapForDepth[m_currentRec];
edges.reserve(MAX_EDGES);
edges.clear();
#else
@@ -360,7 +248,7 @@
#endif
DicEdge newEdge;
- while (iPrefix == global_endstring)
+ while (iPrefix == m_endString)
{
// More edges out of node
newEdge.ptr = 0;
@@ -368,48 +256,47 @@
newEdge.last = 0;
try
{
- newEdge.chr = iHeader.getCodeFromChar(*global_endstring++ =
*global_input++);
+ newEdge.chr = iHeader.getCodeFromChar(*m_endString++ = *m_input++);
}
catch (DicException &e)
{
// If an invalid character is found, be specific about the problem
ostringstream oss;
oss << fmt(_("Error on line %1%, col %2%: %3%"))
- % (1 + ioHeaderInfo.nwords)
- % (global_endstring - global_stringbuf)
+ % (1 + m_headerInfo.nwords)
+ % (m_endString - m_stringBuf)
% e.what() << endl;
throw DicException(oss.str());
}
edges.push_back(newEdge);
// End of a word?
- if (*global_input == L'\n' || *global_input == L'\r')
+ if (*m_input == L'\n' || *m_input == L'\r')
{
- ioHeaderInfo.nwords++;
- *global_endstring = L'\0';
+ m_headerInfo.nwords++;
+ *m_endString = L'\0';
// Mark edge as word
edges.back().term = 1;
// Skip \r and/or \n
- while (global_input != global_endofinput &&
- (*global_input == L'\n' || *global_input == L'\r'))
+ while (m_input != m_endOfInput &&
+ (*m_input == L'\n' || *m_input == L'\r'))
{
- ++global_input;
+ ++m_input;
}
// At the end of input?
- if (global_input == global_endofinput)
+ if (m_input == m_endOfInput)
break;
- global_endstring = global_stringbuf;
- while (*global_endstring == *global_input)
+ m_endString = m_stringBuf;
+ while (*m_endString == *m_input)
{
- global_endstring++;
- global_input++;
+ m_endString++;
+ m_input++;
}
}
// Make dawg pointed to by this edge
- edges.back().ptr =
- makenode(iPrefix + 1, outfile, ioHeaderInfo, iHeader);
+ edges.back().ptr = makeNode(iPrefix + 1, outFile, iHeader);
}
int numedges = edges.size();
@@ -422,212 +309,95 @@
// Mark the last edge
edges.back().last = 1;
- HashMap::const_iterator itMap = global_hashmap.find(edges);
- if (itMap != global_hashmap.end())
+ HashMap::const_iterator itMap = m_hashMap.find(edges);
+ if (itMap != m_hashMap.end())
{
- ioHeaderInfo.edgessaved += numedges;
- ioHeaderInfo.nodessaved++;
+ m_headerInfo.edgessaved += numedges;
+ m_headerInfo.nodessaved++;
return itMap->second;
}
else
{
- unsigned int node_pos = ioHeaderInfo.edgesused;
- global_hashmap[edges] = ioHeaderInfo.edgesused;
- ioHeaderInfo.edgesused += numedges;
- ioHeaderInfo.nodesused++;
- write_node(reinterpret_cast<uint32_t*>(&edges.front()),
- numedges, outfile);
+ unsigned int node_pos = m_headerInfo.edgesused;
+ m_hashMap[edges] = m_headerInfo.edgesused;
+ m_headerInfo.edgesused += numedges;
+ m_headerInfo.nodesused++;
+ writeNode(reinterpret_cast<uint32_t*>(&edges.front()),
+ numedges, outFile);
return node_pos;
}
}
-void printUsage(const string &iBinaryName)
-{
- cout << "Usage: " << iBinaryName << " [options]" << endl
- << _("Mandatory options:") << endl
- << _(" -d, --dicname <string> Set the dictionary name and version")
<< endl
- << _(" -l, --letters <string> Path to the file containing the
letters (see below)") << endl
- << _(" -i, --input <string> Path to the uncompressed dictionary
file (encoded in UTF-8)") << endl
- << _(" The words must be in alphabetical
order, without duplicates") << endl
- << _(" -o, --output <string Path to the generated compressed
dictionary file") << endl
- << _("Other options:") << endl
- << _(" -h, --help Print this help and exit") << endl
- << _("Example:") << endl
- << " " << iBinaryName << _(" -d 'ODS 5.0' -l letters.txt -i ods5.txt
-o ods5.dawg") << endl
- << endl
- << _("The file containing the letters (--letters switch) must be
UTF-8 encoded.") << endl
- << _("Each line corresponds to one letter, and must contain at least
5 fields separated with "
- "one or more space(s).") << endl
- << _(" - 1st field: the letter itself, as stored in the input file
(single character)") << endl
- << _(" - 2nd field: the points of the letter") << endl
- << _(" - 3rd field: the frequency of the letter (how many letters of
this kind in the game)") << endl
- << _(" - 4th field: 1 if the letter is considered as a vowel in
Scrabble game, 0 otherwise") << endl
- << _(" - 5th field: 1 if the letter is considered as a consonant in
Scrabble game, 0 otherwise") << endl
- << _(" - 6th field (optional): display string for the letter
(default: the letter itself)") << endl
- << _(" - other fields (optional): input strings for the letter, in
addition to the display string") << endl
- << endl
- << _("Example for french:") << endl
- << "A 1 9 1 0" << endl
- << "[...]" << endl
- << "Z 10 1 0 1" << endl
- << "? 0 2 1 1" << endl
- << endl
- << _("Example for catalan:") << endl
- << "A 1 12 1 0" << endl
- << "[...]" << endl
- // TRANSLATORS: the first "L.L" must be translated "L·L",
- // and the last one translated "ĿL"
- << _("W 10 1 0 1 L.L L.L L-L L.L") << endl
- << "X 10 1 0 1" << endl
- << "Y 10 1 0 1 NY" << endl
- << "[...]" << endl;
-}
-
-
-int main(int argc, char* argv[])
-{
-#if HAVE_SETLOCALE
- // Set locale via LC_ALL
- setlocale(LC_ALL, "");
-#endif
-
-#if ENABLE_NLS
- // Set the message domain
-#ifdef WIN32
- // Get the absolute path, as returned by GetFullPathName()
- char baseDir[MAX_PATH];
- GetFullPathName(argv[0], MAX_PATH, baseDir, NULL);
- char *pos = strrchr(baseDir, L'\\');
- if (pos)
- *pos = '\0';
- const string localeDir = baseDir + string("\\locale");
-#else
- static const string localeDir = LOCALEDIR;
-#endif
- bindtextdomain(PACKAGE, localeDir.c_str());
- textdomain(PACKAGE);
-#endif
-
- static const struct option long_options[] =
- {
- {"help", no_argument, NULL, 'h'},
- {"dicname", required_argument, NULL, 'd'},
- {"letters", required_argument, NULL, 'l'},
- {"input", required_argument, NULL, 'i'},
- {"output", required_argument, NULL, 'o'},
- {0, 0, 0, 0}
- };
- static const char short_options[] = "hd:l:i:o:";
-
- bool found_d = false;
- bool found_l = false;
- bool found_i = false;
- bool found_o = false;
- string inFileName;
- string outFileName;
- DictHeaderInfo headerInfo;
-
- int res;
- int option_index = 1;
- try
- {
- while ((res = getopt_long(argc, argv, short_options,
- long_options, &option_index)) != -1)
- {
- switch (res)
- {
- case 'h':
- printUsage(argv[0]);
- exit(0);
- case 'd':
- found_d = true;
- headerInfo.dicName = convertToWc(optarg);
- break;
- case 'l':
- found_l = true;
- readLetters(optarg, headerInfo);
- break;
- case 'i':
- found_i = true;
- inFileName = optarg;
- break;
- case 'o':
- found_o = true;
- outFileName = optarg;
- break;
- }
- }
-
- // Check mandatory options
- if (!found_d || !found_l || !found_i || !found_o)
+Header CompDic::generateDawg(const string &iWordListFile,
+ const string &iDawgFile,
+ const string &iDicName)
+{
+ m_headerInfo.dicName = convertToWc(iDicName);
+ // We are not (yet) able to build the GADDAG format
+ m_headerInfo.dawg = true;
+
+ // Open the output file
+ ofstream outFile(iDawgFile.c_str(), ios::out | ios::binary | ios::trunc);
+ if (!outFile.is_open())
{
- cerr << _("A mandatory option is missing") << endl;
- printUsage(argv[0]);
- exit(1);
+ ostringstream oss;
+ oss << fmt(_("Cannot open output file '%1%'")) % iDawgFile;
+ throw DicException(oss.str());
}
- unsigned int dicSize = getFileSize(inFileName);
-
- ofstream outfile(outFileName.c_str(), ios::out | ios::binary |
ios::trunc);
- if (!outfile.is_open())
+ const wchar_t *wordList = NULL;
+ try
{
- cerr << fmt(_("Cannot open output file '%1%'")) % outFileName <<
endl;
- exit(1);
- }
-
- clock_t startLoadTime = clock();
- // FIXME: not exception safe
- const wchar_t *uncompressed = load_uncompressed(inFileName, dicSize);
- clock_t endLoadTime = clock();
+ const clock_t startLoadTime = clock();
+ unsigned int dicSize;
+ wordList = loadWordList(iWordListFile, dicSize);
+ const clock_t endLoadTime = clock();
+ m_loadTime = 1.0 * (endLoadTime - startLoadTime) / CLOCKS_PER_SEC;
- global_input = uncompressed;
- global_endofinput = global_input + dicSize;
+ m_input = wordList;
+ m_endOfInput = m_input + dicSize;
- headerInfo.dawg = true;
- Header tempHeader = skip_init_header(outfile, headerInfo);
+ // Write the header a first time, to reserve the space in the file
+ Header tempHeader = writeHeader(outFile);
- DicEdge specialnode = {0, 0, 0, 0};
- specialnode.last = 1;
+ DicEdge specialNode = {0, 0, 0, 0};
+ specialNode.last = 1;
// Temporary variable to avoid a warning when compiling with -O2
// (there is no warning with -O0... g++ bug?)
- DicEdge *tmpPtr = &specialnode;
- write_node(reinterpret_cast<uint32_t*>(tmpPtr), 1, outfile);
+ DicEdge *tmpPtr = &specialNode;
+ writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
- /*
- * Call makenode with null (relative to stringbuf) prefix;
- * Initialize string to null; Put index of start node on output
- */
- DicEdge rootnode = {0, 0, 0, 0};
- global_endstring = global_stringbuf;
- clock_t startBuildTime = clock();
- rootnode.ptr = makenode(global_endstring, outfile, headerInfo,
tempHeader);
- clock_t endBuildTime = clock();
+ // Call makeNode with null (relative to stringbuf) prefix;
+ // Initialize string to null; Put index of start node on output
+ DicEdge rootNode = {0, 0, 0, 0};
+ m_endString = m_stringBuf;
+ const clock_t startBuildTime = clock();
+ rootNode.ptr = makeNode(m_endString, outFile, tempHeader);
// Reuse the temporary variable
- tmpPtr = &rootnode;
- write_node(reinterpret_cast<uint32_t*>(tmpPtr), 1, outfile);
-
- fix_header(outfile, headerInfo);
-
- Header aHeader(headerInfo);
- aHeader.print();
-
- delete[] uncompressed;
- outfile.close();
+ tmpPtr = &rootNode;
+ writeNode(reinterpret_cast<uint32_t*>(tmpPtr), 1, outFile);
+ const clock_t endBuildTime = clock();
+ m_buildTime = 1.0 * (endBuildTime - startBuildTime) / CLOCKS_PER_SEC;
+
+ // Write the header again, now that it is complete
+ m_headerInfo.root = m_headerInfo.edgesused;
+ const Header finalHeader = writeHeader(outFile);
+
+ // Clean up
+ delete[] wordList;
+ outFile.close();
- printf(_(" Load time: %.3f s\n"), 1.0 * (endLoadTime - startLoadTime)
/ CLOCKS_PER_SEC);
- printf(_(" Compression time: %.3f s\n"), 1.0 * (endBuildTime -
startBuildTime) / CLOCKS_PER_SEC);
-#ifdef CHECK_RECURSION
- cout << fmt(_(" Maximum recursion level reached: %1%")) % max_rec <<
endl;
-#endif
- return 0;
+ return finalHeader;
}
catch (std::exception &e)
{
- cerr << fmt(_("Exception caught: %1%")) % e.what() << endl;
- return 1;
+ // Avoid memory leaks
+ if (wordList != NULL)
+ delete[] wordList;
+ throw;
}
}
Index: compdic.h
===================================================================
RCS file: compdic.h
diff -N compdic.h
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ compdic.h 15 May 2010 12:14:53 -0000 1.1
@@ -0,0 +1,150 @@
+/*****************************************************************************
+ * Eliot
+ * Copyright (C) 2005-2007 Antoine Fraboulet
+ * Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *****************************************************************************/
+
+#ifndef DIC_COMPDIC_H_
+#define DIC_COMPDIC_H_
+
+#include <vector>
+#include <string>
+#include <iosfwd>
+#include <boost/unordered_map.hpp>
+
+#include "header.h"
+#include "dic_internals.h"
+
+class DicEdge;
+class DictHeaderInfo;
+class Header;
+
+using namespace std;
+
+//#define DEBUG_OUTPUT
+#define CHECK_RECURSION
+
+
+class CompDic
+{
+ typedef boost::unordered_map<vector<DicEdge>, unsigned int> HashMap;
+
+public:
+ CompDic();
+
+ /**
+ * Define a new letter. The letter must be alphabetic (i.e. iswalpha()
+ * returns true for it).
+ * @param letter: Letter to add
+ * @param points: Points of the letter
+ * @param frequency: Number of occurrences of the letter in the game
+ * @param isVowel: True if the letter can be considered as a vowel,
+ * false otherwise
+ * @param isConsonant: True if the letter can be considered as a consonant,
+ * false otherwise
+ * @param iInputs: Vector containing the various ways to input the letter.
+ * If not empty, the first value corresponds to the display string.
+ */
+ void addLetter(wchar_t letter, int points, int frequency,
+ bool isVowel, bool isConsonant,
+ const vector<wstring> &iInputs);
+
+ /**
+ * Generate the dictionary. You must have called addLetter() before
+ * (once for each letter of the word list, and possibly once for the
+ * joker).
+ * @param iWordListFile: Name (and path) of the word list file
+ * @param iDawgFile: Name (and path) of the generated dawg file
+ * @param iDicName: Internal name of the dictionary
+ * @return The header of the generated dawg
+ */
+ Header generateDawg(const string &iWordListFile,
+ const string &iDawgFile,
+ const string &iDicName);
+
+ // Statistics
+ double getLoadTime() const { return m_loadTime; }
+ double getBuildTime() const { return m_buildTime; }
+#ifdef CHECK_RECURSION
+ double getMaxRecursion() const { return m_maxRec; }
+#endif
+
+private:
+ DictHeaderInfo m_headerInfo;
+
+ HashMap m_hashMap;
+
+#define MAX_STRING_LENGTH 200
+
+ /// Space for the current string
+ wchar_t m_stringBuf[MAX_STRING_LENGTH];
+ /// Point to the end of the string
+ wchar_t* m_endString;
+ /// Current position in the word list
+ const wchar_t *m_input;
+ /// Mark the end of the input
+ const wchar_t *m_endOfInput;
+#ifdef CHECK_RECURSION
+ map<int, vector<DicEdge> > m_mapForDepth;
+ int m_currentRec;
+ int m_maxRec;
+#endif
+
+ double m_loadTime;
+ double m_buildTime;
+
+
+ /**
+ * Read the word list stored in iFileName, convert it to wide chars,
+ * and return it. The oDicSize parameter contains the size of the
+ * returned array.
+ * In case of problem, an exception is thrown.
+ * @param iFileName: Name (and path) of the file containing the word list.
+ * @param oDicSize: Size of the returned array
+ * @return Word list as a wchar_t array
+ */
+ const wchar_t * loadWordList(const string &iFileName,
+ unsigned int &oDicSize);
+
+ Header writeHeader(ostream &outFile) const;
+
+ /**
+ * Change the endianness of the pointed edges (if needed),
+ * and write them to the given ostream.
+ * @param ioEdges: array of edges
+ * @param num: number of edges in the array
+ * @param outFile: stream where to write the edges
+ */
+ void writeNode(uint32_t *ioEdges, unsigned int num, ostream &outFile);
+
+ /**
+ * MakeNode takes a prefix (as position relative to m_stringBuf) and
+ * returns the index of the start node of a dawg that recognizes all
+ * the words beginning with that prefix. String is a pointer (relative
+ * to m_stringBuf) indicating how much of iPrefix is matched in the
+ * input.
+ * @param iPrefix: prefix to work on
+ * @param outFile: stream where to write the nodes
+ * @param iHeader: temporary header, used only to do the conversion between
+ * the (wide) chars and their corresponding internal code
+ */
+ unsigned int makeNode(const wchar_t *iPrefix, ostream &outFile,
+ const Header &iHeader);
+
+};
+
+#endif /* DIC_COMPDIC_H_ */
Index: compdicmain.cpp
===================================================================
RCS file: compdicmain.cpp
diff -N compdicmain.cpp
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ compdicmain.cpp 15 May 2010 12:14:53 -0000 1.1
@@ -0,0 +1,261 @@
+/*****************************************************************************
+ * Eliot
+ * Copyright (C) 1999-2007 Antoine Fraboulet & Olivier Teulière
+ * Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
+ * Olivier Teulière <ipkiss @@ gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *****************************************************************************/
+
+#include "config.h"
+
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <boost/format.hpp>
+#include <boost/foreach.hpp>
+#include <boost/tokenizer.hpp>
+#include <getopt.h>
+
+#if ENABLE_NLS
+# include <libintl.h>
+# define _(String) gettext(String)
+#else
+# define _(String) String
+#endif
+#ifdef WIN32
+# include <windows.h>
+#endif
+
+#include "compdic.h"
+#include "dic_exception.h"
+#include "encoding.h"
+#include "header.h"
+
+using namespace std;
+
+// Useful shortcut
+#define fmt(a) boost::format(a)
+
+
+/**
+ * Read the file describing the letters, and register each letter into
+ * the given dictionary builder.
+ * Each non-empty line is converted from UTF-8 and split on spaces. It must
+ * contain at least 5 fields: the letter itself (a single character),
+ * followed by 4 numeric fields (described in the usage text as the points,
+ * the frequency, the vowel flag and the consonant flag). The remaining
+ * fields, if any, are forwarded as a list of strings.
+ * In case of problem, an exception is thrown.
+ * @param iFileName: Name (and path) of the file containing the letters
+ * @param ioBuilder: Dictionary builder, filled through addLetter()
+ */
+void readLetters(const string &iFileName, CompDic &ioBuilder)
+{
+    ifstream in(iFileName.c_str());
+    if (!in.is_open())
+        throw DicException((fmt(_("Could not open file '%1%'")) % iFileName).str());
+
+    // Use a more friendly type name
+    typedef boost::tokenizer<boost::char_separator<wchar_t>,
+                             std::wstring::const_iterator,
+                             std::wstring> Tokenizer;
+    // The fields are separated with space characters
+    const boost::char_separator<wchar_t> sep(L" ");
+
+    // Number of the line being processed (1-based). Every line of the file
+    // is counted, including the ignored ones, so that the line numbers
+    // printed in the error messages match the ones of the file.
+    int lineNb = 0;
+    string line;
+    while (getline(in, line))
+    {
+        ++lineNb;
+
+        // getline() only removes the '\n' delimiter: drop the carriage
+        // return left by DOS line endings, so that it does not end up
+        // in the last field of the line
+        if (!line.empty() && line[line.size() - 1] == '\r')
+            line.erase(line.size() - 1);
+
+        // Ignore empty lines
+        if (line.empty())
+            continue;
+
+        // Convert the line to a wstring
+        const wstring &wline =
+            readFromUTF8(line.c_str(), line.size(), "readLetters (1)");
+        // Split the line on space characters
+        Tokenizer tok(wline, sep);
+        const vector<wstring> tokens(tok.begin(), tok.end());
+
+        // We expect at least 5 fields on the line
+        if (tokens.size() < 5)
+        {
+            ostringstream ss;
+            ss << fmt(_("readLetters: Not enough fields "
+                        "in %1% (line %2%)")) % iFileName % lineNb;
+            throw DicException(ss.str());
+        }
+
+        // The first field is a single character
+        const wstring &letter = tokens[0];
+        if (letter.size() != 1)
+        {
+            ostringstream ss;
+            ss << fmt(_("readLetters: Invalid letter at line %1% "
+                        "(only one character allowed)")) % lineNb;
+            throw DicException(ss.str());
+        }
+
+        // Optional fields following the 5 mandatory ones
+        // (the range is empty when the line has exactly 5 fields)
+        vector<wstring> inputs(tokens.begin() + 5, tokens.end());
+        ioBuilder.addLetter(letter[0], _wtoi(tokens[1].c_str()),
+                            _wtoi(tokens[2].c_str()), _wtoi(tokens[3].c_str()),
+                            _wtoi(tokens[4].c_str()), inputs);
+    }
+}
+
+
+void printUsage(const string &iBinaryName)
+{
+ cout << "Usage: " << iBinaryName << " [options]" << endl
+ << _("Mandatory options:") << endl
+ << _(" -d, --dicname <string> Set the dictionary name and version")
<< endl
+ << _(" -l, --letters <string> Path to the file containing the
letters (see below)") << endl
+ << _(" -i, --input <string> Path to the uncompressed dictionary
file (encoded in UTF-8)") << endl
+ << _(" The words must be in alphabetical
order, without duplicates") << endl
+ << _(" -o, --output <string Path to the generated compressed
dictionary file") << endl
+ << _("Other options:") << endl
+ << _(" -h, --help Print this help and exit") << endl
+ << _("Example:") << endl
+ << " " << iBinaryName << _(" -d 'ODS 5.0' -l letters.txt -i ods5.txt
-o ods5.dawg") << endl
+ << endl
+ << _("The file containing the letters (--letters switch) must be
UTF-8 encoded.") << endl
+ << _("Each line corresponds to one letter, and must contain at least
5 fields separated with "
+ "one or more space(s).") << endl
+ << _(" - 1st field: the letter itself, as stored in the input file
(single character)") << endl
+ << _(" - 2nd field: the points of the letter") << endl
+ << _(" - 3rd field: the frequency of the letter (how many letters of
this kind in the game)") << endl
+ << _(" - 4th field: 1 if the letter is considered as a vowel in
Scrabble game, 0 otherwise") << endl
+ << _(" - 5th field: 1 if the letter is considered as a consonant in
Scrabble game, 0 otherwise") << endl
+ << _(" - 6th field (optional): display string for the letter
(default: the letter itself)") << endl
+ << _(" - other fields (optional): input strings for the letter, in
addition to the display string") << endl
+ << endl
+ << _("Example for french:") << endl
+ << "A 1 9 1 0" << endl
+ << "[...]" << endl
+ << "Z 10 1 0 1" << endl
+ << "? 0 2 1 1" << endl
+ << endl
+ << _("Example for catalan:") << endl
+ << "A 1 12 1 0" << endl
+ << "[...]" << endl
+ // TRANSLATORS: the first "L.L" must be translated "L·L",
+ // and the last one translated "Ä¿L"
+ << _("W 10 1 0 1 L.L L.L L-L L.L") << endl
+ << "X 10 1 0 1" << endl
+ << "Y 10 1 0 1 NY" << endl
+ << "[...]" << endl;
+}
+
+
+/**
+ * Entry point of the program: parse the command line, generate the
+ * compressed dictionary, then print its header and some statistics.
+ * The -d, -l, -i and -o options are all mandatory.
+ * @param argc: Number of command line arguments
+ * @param argv: Command line arguments
+ * @return 0 in case of success (or if the help was requested),
+ *      1 if the command line is invalid or if an exception was caught
+ */
+int main(int argc, char* argv[])
+{
+#if HAVE_SETLOCALE
+    // Set locale via LC_ALL
+    setlocale(LC_ALL, "");
+#endif
+
+#if ENABLE_NLS
+    // Set the message domain
+#ifdef WIN32
+    // Get the absolute path, as returned by GetFullPathName()
+    char baseDir[MAX_PATH];
+    GetFullPathName(argv[0], MAX_PATH, baseDir, NULL);
+    // Keep only the directory part (baseDir is a narrow string,
+    // so a narrow backslash is searched)
+    char *pos = strrchr(baseDir, '\\');
+    if (pos)
+        *pos = '\0';
+    const string localeDir = baseDir + string("\\locale");
+#else
+    static const string localeDir = LOCALEDIR;
+#endif
+    bindtextdomain(PACKAGE, localeDir.c_str());
+    textdomain(PACKAGE);
+#endif
+
+    static const struct option long_options[] =
+    {
+        {"help", no_argument, NULL, 'h'},
+        {"dicname", required_argument, NULL, 'd'},
+        {"letters", required_argument, NULL, 'l'},
+        {"input", required_argument, NULL, 'i'},
+        {"output", required_argument, NULL, 'o'},
+        {0, 0, 0, 0}
+    };
+    static const char short_options[] = "hd:l:i:o:";
+
+    // Remember which mandatory options were given
+    bool found_d = false;
+    bool found_l = false;
+    bool found_i = false;
+    bool found_o = false;
+    string dicName;
+    string inFileName;
+    string outFileName;
+    CompDic builder;
+
+    int res;
+    // Only written by getopt_long(), when a long option is matched
+    int option_index = 0;
+    try
+    {
+        while ((res = getopt_long(argc, argv, short_options,
+                                  long_options, &option_index)) != -1)
+        {
+            switch (res)
+            {
+                case 'h':
+                    printUsage(argv[0]);
+                    // Return (instead of calling exit()) so that the
+                    // local objects are properly destroyed
+                    return 0;
+                case 'd':
+                    found_d = true;
+                    dicName = optarg;
+                    break;
+                case 'l':
+                    found_l = true;
+                    // The letters are loaded immediately (may throw)
+                    readLetters(optarg, builder);
+                    break;
+                case 'i':
+                    found_i = true;
+                    inFileName = optarg;
+                    break;
+                case 'o':
+                    found_o = true;
+                    outFileName = optarg;
+                    break;
+                default:
+                    // Unknown option or missing argument: getopt_long()
+                    // has already printed a diagnostic, do not go on
+                    // with an invalid command line
+                    printUsage(argv[0]);
+                    return 1;
+            }
+        }
+
+        // Check mandatory options
+        if (!found_d || !found_l || !found_i || !found_o)
+        {
+            cerr << _("A mandatory option is missing") << endl;
+            printUsage(argv[0]);
+            return 1;
+        }
+
+        // Generate the dictionary
+        const Header &header =
+            builder.generateDawg(inFileName, outFileName, dicName);
+
+        // Print the header
+        header.print();
+
+        cout << fmt(_(" Load time: %1% s")) % builder.getLoadTime() << endl;
+        cout << fmt(_(" Compression time: %1% s")) % builder.getBuildTime() << endl;
+#ifdef CHECK_RECURSION
+        cout << fmt(_(" Maximum recursion level reached: %1%")) % builder.getMaxRecursion() << endl;
+#endif
+        return 0;
+    }
+    catch (const std::exception &e)
+    {
+        cerr << fmt(_("Exception caught: %1%")) % e.what() << endl;
+        return 1;
+    }
+}
+
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Eliot-dev] eliot/dic Makefile.am compdic.cpp compdic.h com...,
Olivier Teulière <=