00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "qdawg.h"
00021 #include <qintdict.h>
00022 #include <qfile.h>
00023 #include <qtl.h>
00024
00025 #include <limits.h>
00026 #include <stdio.h>
00027
00028
00029 #include <sys/types.h>
00030 #include <sys/stat.h>
00031 #include <sys/mman.h>
00032 #include <fcntl.h>
00033 #include <errno.h>
00034 #include <unistd.h>
00035
00036 class QDawgPrivate;
00037 class QTrie;
00038
00039 typedef QValueList<QTrie*> TrieClub;
00040 typedef QIntDict<TrieClub> TrieClubDirectory;
00041
00042 class TriePtr {
00043 public:
00044 QChar letter;
00045 QTrie* p;
00046 int operator <(const TriePtr& o) const;
00047 int operator >(const TriePtr& o) const;
00048 int operator <=(const TriePtr& o) const;
00049 };
00050
00051 class TrieList : public QValueList<TriePtr> {
00052 bool sorted;
00053 public:
00054 TrieList()
00055 {
00056 sorted=TRUE;
00057 }
00058
00059 QTrie* findAdd(QChar c);
00060 bool equal(TrieList& l);
00061
00062 void sort()
00063 {
00064 if ( !sorted ) {
00065 qHeapSort(*this);
00066 sorted = TRUE;
00067 }
00068 }
00069 };
00070
00071
00072 class QTrie {
00073 public:
00074 QTrie();
00075 ~QTrie();
00076
00077 void insertWord(const QString& s, uint index=0);
00078 bool equal(QTrie* o);
00079 void dump(int indent=0);
00080
00081 private:
00082 TrieList children;
00083 bool isword;
00084
00085 friend class QDawgPrivate;
00086 int maxdepth;
00087 int decendants;
00088 int key;
00089 void distributeKeys(TrieClubDirectory& directory);
00090 QTrie* clubLeader(TrieClubDirectory& directory);
00091 int collectKeys();
00092 friend class TriePtr;
00093 friend class TrieList;
00094 };
00095
00096 QTrie::QTrie()
00097 {
00098 key = 0;
00099 isword = FALSE;
00100 }
00101
00102 QTrie::~QTrie()
00103 {
00104
00105
00106 }
00107
00108 void QTrie::insertWord(const QString& s, uint index)
00109 {
00110 if ( index == s.length() ) {
00111 isword = TRUE;
00112 } else {
00113 QTrie* t = children.findAdd(s[index]);
00114 t->insertWord(s,index+1);
00115 }
00116 }
00117
00118 bool QTrie::equal(QTrie* o)
00119 {
00120 if ( o == this ) return TRUE;
00121 if ( isword != o->isword )
00122 return FALSE;
00123 return children.equal(o->children);
00124 }
00125
00126 void QTrie::dump(int indent)
00127 {
00128 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00129 QTrie* s = (*it).p;
00130 for (int in=0; in<indent; in++)
00131 fputc(' ',stderr);
00132 fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(),
00133 s->key,s->isword?"word":"",s);
00134 s->dump(indent+2);
00135 }
00136 }
00137
00138 void QTrie::distributeKeys(TrieClubDirectory& directory)
00139 {
00140 maxdepth = INT_MIN;
00141 decendants = children.count();
00142 key = 0;
00143 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00144 QTrie* s = (*it).p;
00145 QChar l = (*it).letter;
00146 s->distributeKeys(directory);
00147 key = key*64+l.unicode()+s->key*5;
00148 decendants += s->decendants;
00149 if ( s->maxdepth+1 > maxdepth )
00150 maxdepth = s->maxdepth+1;
00151 }
00152 if ( decendants ) {
00153 key += decendants + maxdepth*256 + children.count() * 65536;
00154 if ( !key ) key++;
00155 }
00156 TrieClub* c = directory[key];
00157 if ( !c ) directory.insert(key, (c = new TrieClub) );
00158 c->prepend(this);
00159 }
00160
00161 QTrie* QTrie::clubLeader(TrieClubDirectory& directory)
00162 {
00163 if ( !key ) return directory[0]->first();
00164 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00165 QTrie* t= (*it).p->clubLeader(directory);
00166 (*it).p = t;
00167 }
00168 TrieClub *club = directory[key];
00169 for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00170 QTrie* o = *it;
00171 if ( o->equal(this) )
00172 return o;
00173 }
00174 return this;
00175 }
00176
00177 int QTrie::collectKeys()
00178 {
00179 int n=0;
00180 if ( key ) key=0,n+=children.count();
00181 for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it)
00182 n += (*it).p->collectKeys();
00183 return n;
00184 }
00185
00186 int TriePtr::operator <(const TriePtr& o) const
00187 { return letter < o.letter; }
00188 int TriePtr::operator >(const TriePtr& o) const
00189 { return letter > o.letter; }
00190 int TriePtr::operator <=(const TriePtr& o) const
00191 { return letter <= o.letter; }
00192
00193 bool TrieList::equal(TrieList& l)
00194 {
00195 if ( count() != l.count() )
00196 return FALSE;
00197 sort(); l.sort();
00198 ConstIterator it2 = begin();
00199 ConstIterator it = l.begin();
00200 for( ; it != l.end(); ++it, ++it2 )
00201 if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) )
00202 return FALSE;
00203 return TRUE;
00204 }
00205 QTrie* TrieList::findAdd(QChar c)
00206 {
00207 for (Iterator it=begin(); it!=end(); ++it) {
00208 if ( (*it).letter == c )
00209 return (*it).p;
00210 }
00211 TriePtr p;
00212 p.p = new QTrie;
00213 p.letter = c;
00214 prepend(p);
00215 sorted=FALSE;
00216 sort();
00217 return p.p;
00218 }
00219
00220 static const char* dawg_sig = "QDAWG100";
00221
00222 class QDawgPrivate {
00223 public:
00224 QDawgPrivate(QIODevice* dev)
00225 {
00226 QDataStream ds(dev);
00227 char sig[8];
00228 ds.readRawBytes(sig,8);
00229 if ( !strncmp(dawg_sig,sig,8) ) {
00230 uint n;
00231 char* nn;
00232 ds.readBytes(nn,n);
00233
00234
00235 node = (QDawg::Node*)nn;
00236 nodes = n / sizeof(QDawg::Node);
00237 } else {
00238 node = 0;
00239 }
00240 }
00241
00242 bool ok() const { return node; }
00243
00244 QDawgPrivate(uchar* mem)
00245 {
00246 if ( !strncmp(dawg_sig,(char*)mem,8) ) {
00247 mem += 8;
00248
00249 int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3];
00250 mem += 4;
00251
00252
00253 node = (QDawg::Node*)((char*)mem);
00254 nodes = n / sizeof(QDawg::Node);
00255 }
00256 }
00257
00258 QDawgPrivate(QTrie* t)
00259 {
00260 TrieClubDirectory directory(9973);
00261 t->distributeKeys(directory);
00262 QTrie* l = t->clubLeader(directory);
00263 ASSERT(l==t);
00264 generateArray(t);
00265
00266 TrieClub *club;
00267 for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit)
00268 {
00269 for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00270 delete *it;
00271 }
00272 delete club;
00273 }
00274 }
00275
00276 bool write(QIODevice* dev)
00277 {
00278 QDataStream ds(dev);
00279 ds.writeRawBytes(dawg_sig,8);
00280
00281 ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes);
00282 return dev->state() == IO_Ok;
00283 }
00284
00285 void dumpWords(int nid=0, int index=0)
00286 {
00287 static char word[256];
00288 int i=0;
00289 do {
00290 QDawg::Node& n = node[nid+i];
00291 word[index] = n.let;
00292 if ( n.isword )
00293 fprintf(stderr,"%.*s\n",index+1,word);
00294 if ( n.offset ) dumpWords(n.offset+nid+i,index+1);
00295 } while (!node[nid+i++].islast);
00296 }
00297
00298 void dump(int nid=0, int indent=0)
00299 {
00300 int i=0;
00301 do {
00302 QDawg::Node& n = node[nid+i];
00303 fprintf(stderr,"%d: ",nid+i);
00304 for (int in=0; in<indent; in++)
00305 fputc(' ',stderr);
00306 fprintf(stderr," %c %d %d %d\n",n.let,
00307 n.isword,n.islast,n.offset);
00308 if ( n.offset ) dump(n.offset+nid+i,indent+2);
00309 } while (!node[nid+i++].islast);
00310 }
00311
00312 int countWords(int nid=0)
00313 {
00314 int t=0;
00315 int i=0;
00316 do {
00317 QDawg::Node& n = node[nid+i];
00318 if ( n.isword )
00319 t++;
00320 if ( n.offset )
00321 t+=countWords(n.offset+nid+i);
00322 } while (!node[nid+i++].islast);
00323 return t;
00324 }
00325
00326 bool contains(const QString& s, int nid=0, int index=0) const
00327 {
00328 int i=0;
00329 do {
00330 QDawg::Node& n = node[nid+i];
00331 if ( s[index] == QChar((ushort)n.let) ) {
00332 if ( n.isword && index == (int)s.length()-1 )
00333 return TRUE;
00334 if ( n.offset )
00335 return contains(s,n.offset+nid+i,index+1);
00336 }
00337 } while (!node[nid+i++].islast);
00338 return FALSE;
00339 }
00340
00341 void appendAllWords(QStringList& list, int nid=0, QString s="") const
00342 {
00343 int i=0;
00344 int next = s.length();
00345 do {
00346 QDawg::Node& n = node[nid+i];
00347 s[next] = QChar((ushort)n.let);
00348 if ( n.isword )
00349 list.append(s);
00350 if ( n.offset )
00351 appendAllWords(list, n.offset+nid+i, s);
00352 } while (!node[nid+i++].islast);
00353 }
00354
00355 const QDawg::Node* root() { return node; }
00356
00357 private:
00358 void generateArray(QTrie* t)
00359 {
00360 nodes = 0;
00361 int n = t->collectKeys();
00362 node = new QDawg::Node[n];
00363 appendToArray(t);
00364 ASSERT(n == nodes);
00365 }
00366
00367 int appendToArray(QTrie* t)
00368 {
00369 if ( !t->key ) {
00370 if ( !t->children.count() )
00371 return 0;
00372 t->key = nodes;
00373 nodes += t->children.count();
00374 QDawg::Node* n = &node[t->key-1];
00375 int here = t->key;
00376 for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) {
00377 QTrie* s = (*it).p;
00378 ++n;
00379 n->let = (*it).letter.unicode();
00380 n->isword = s->isword;
00381 n->islast = 0;
00382 n->offset = appendToArray(s);
00383 if ( n->offset ) {
00384 int t = n->offset-here;
00385 n->offset=t;
00386 if ( n->offset != t )
00387 qWarning("Overflow: too many words");
00388 }
00389 here++;
00390 }
00391 n->islast = 1;
00392 }
00393 return t->key;
00394 }
00395
00396 private:
00397 int nodes;
00398 QDawg::Node *node;
00399 };
00400
00443 QDawg::QDawg()
00444 {
00445 d = 0;
00446 }
00447
00451 QDawg::~QDawg()
00452 {
00453 delete d;
00454 }
00455
00460 bool QDawg::createFromWords(QIODevice* dev)
00461 {
00462 delete d;
00463
00464 QTextStream i(dev);
00465 QTrie* trie = new QTrie;
00466 int n=0;
00467 while (!i.atEnd()) {
00468 trie->insertWord(QString::fromUtf8(i.readLine()));
00469 n++;
00470 }
00471 if ( n )
00472 d = new QDawgPrivate(trie);
00473 else
00474 d = 0;
00475 return TRUE;
00476 }
00477
00481 void QDawg::createFromWords(const QStringList& list)
00482 {
00483 delete d;
00484
00485 if ( list.count() ) {
00486 QTrie* trie = new QTrie;
00487 for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) {
00488 trie->insertWord(*it);
00489 }
00490 d = new QDawgPrivate(trie);
00491 } else {
00492 d = 0;
00493 }
00494 }
00495
00499 QStringList QDawg::allWords() const
00500 {
00501 QStringList result;
00502 if ( d ) d->appendAllWords(result);
00503 return result;
00504 }
00505
00506
00513 bool QDawg::readFile(const QString& filename)
00514 {
00515 delete d;
00516 d = 0;
00517 int f = ::open( QFile::encodeName(filename), O_RDONLY );
00518 if ( f < 0 )
00519 return FALSE;
00520 struct stat st;
00521 if ( !fstat( f, &st ) ) {
00522 char * tmp = (char*)mmap( 0, st.st_size,
00523 PROT_READ,
00524 MAP_FILE | MAP_PRIVATE,
00525 f, 0 );
00526 if ( tmp && tmp != (char*)MAP_FAILED )
00527 d = new QDawgPrivate((uchar*)tmp);
00528 }
00529 ::close( f );
00530 return d;
00531 }
00532
00539 bool QDawg::read(QIODevice* dev)
00540 {
00541 delete d;
00542 d = new QDawgPrivate(dev);
00543 if ( d->ok() )
00544 return TRUE;
00545 delete d;
00546 d = 0;
00547 return FALSE;
00548 }
00549
00553 bool QDawg::write(QIODevice* dev) const
00554 {
00555 return d ? d->write(dev) : TRUE;
00556 }
00557
00561 int QDawg::countWords() const
00562 {
00563 return d ? d->countWords() : 0;
00564 }
00565
00569 const QDawg::Node* QDawg::root() const
00570 {
00571 return d ? d->root() : 0;
00572 }
00573
00578 bool QDawg::contains(const QString& s) const
00579 {
00580 return d ? d->contains(s) : FALSE;
00581 }
00582
00588 void QDawg::dump() const
00589 {
00590 if ( d ) d->dump();
00591 }
00592