Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

qdawg.cpp

Go to the documentation of this file.
00001 /**********************************************************************
00002 ** Copyright (C) 2000-2002 Trolltech AS.  All rights reserved.
00003 **
00004 ** This file is part of the Qtopia Environment.
00005 **
00006 ** This file may be distributed and/or modified under the terms of the
00007 ** GNU General Public License version 2 as published by the Free Software
00008 ** Foundation and appearing in the file LICENSE.GPL included in the
00009 ** packaging of this file.
00010 **
00011 ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
00012 ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
00013 **
00014 ** See http://www.trolltech.com/gpl/ for GPL licensing information.
00015 **
00016 ** Contact info@trolltech.com if any conditions of this licensing are
00017 ** not clear to you.
00018 **
00019 **********************************************************************/
00020 #include "qdawg.h"
00021 #include <qintdict.h>
00022 #include <qfile.h>
00023 #include <qtl.h>
00024 
00025 #include <limits.h>
00026 #include <stdio.h>
00027 
00028 // for mmap
00029 #include <sys/types.h>
00030 #include <sys/stat.h>
00031 #include <sys/mman.h>
00032 #include <fcntl.h>
00033 #include <errno.h>
00034 #include <unistd.h>
00035 
00036 class QDawgPrivate;
00037 class QTrie;
00038 
00039 typedef QValueList<QTrie*> TrieClub;
00040 typedef QIntDict<TrieClub> TrieClubDirectory;
00041 
00042 class TriePtr {
00043 public:
00044     QChar letter;
00045     QTrie* p;
00046     int operator <(const TriePtr& o) const;
00047     int operator >(const TriePtr& o) const;
00048     int operator <=(const TriePtr& o) const;
00049 };
00050 
00051 class TrieList : public QValueList<TriePtr> {
00052     bool sorted;
00053 public:
00054     TrieList()
00055     {
00056         sorted=TRUE;
00057     }
00058 
00059     QTrie* findAdd(QChar c);
00060     bool equal(TrieList& l);
00061 
00062     void sort()
00063     {
00064         if ( !sorted ) {
00065             qHeapSort(*this);
00066             sorted = TRUE;
00067         }
00068     }
00069 };
00070 
00071 // A fast but memory-wasting temporary class.  The Dawg is the goal.
00072 class QTrie {
00073 public:
00074     QTrie();
00075     ~QTrie();
00076 
00077     void insertWord(const QString& s, uint index=0);
00078     bool equal(QTrie* o);
00079     void dump(int indent=0);
00080 
00081 private:
00082     TrieList children;
00083     bool isword;
00084 
00085     friend class QDawgPrivate;
00086     int maxdepth;
00087     int decendants;
00088     int key;
00089     void distributeKeys(TrieClubDirectory& directory);
00090     QTrie* clubLeader(TrieClubDirectory& directory);
00091     int collectKeys();
00092     friend class TriePtr;
00093     friend class TrieList;
00094 };
00095 
00096 QTrie::QTrie()
00097 {
00098     key = 0;
00099     isword = FALSE;
00100 }
00101 
00102 QTrie::~QTrie()
00103 {
00104     // NOTE: we do not delete the children - after conversion to DAWG
00105     // it's too difficult.  The QTrie's are deleted via the directory.
00106 }
00107 
00108 void QTrie::insertWord(const QString& s, uint index)
00109 {
00110     if ( index == s.length() ) {
00111         isword = TRUE;
00112     } else {
00113         QTrie* t = children.findAdd(s[index]);
00114         t->insertWord(s,index+1);
00115     }
00116 }
00117 
00118 bool QTrie::equal(QTrie* o)
00119 {
00120     if ( o == this ) return TRUE;
00121     if ( isword != o->isword )
00122         return FALSE;
00123     return children.equal(o->children);
00124 }
00125 
00126 void QTrie::dump(int indent)
00127 {
00128     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00129         QTrie* s = (*it).p;
00130         for (int in=0; in<indent; in++)
00131             fputc(' ',stderr);
00132         fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(),
00133             s->key,s->isword?"word":"",s);
00134         s->dump(indent+2);
00135     }
00136 }
00137 
00138 void QTrie::distributeKeys(TrieClubDirectory& directory)
00139 {
00140     maxdepth = INT_MIN;
00141     decendants = children.count();
00142     key = 0;
00143     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00144         QTrie* s = (*it).p;
00145         QChar l = (*it).letter;
00146         s->distributeKeys(directory);
00147         key = key*64+l.unicode()+s->key*5;
00148         decendants += s->decendants;
00149         if ( s->maxdepth+1 > maxdepth )
00150             maxdepth = s->maxdepth+1;
00151     }
00152     if ( decendants ) {
00153         key += decendants + maxdepth*256 + children.count() * 65536;
00154         if ( !key ) key++; // unlikely
00155     }
00156     TrieClub* c = directory[key];
00157     if ( !c ) directory.insert(key, (c = new TrieClub) );
00158     c->prepend(this);
00159 }
00160 
00161 QTrie* QTrie::clubLeader(TrieClubDirectory& directory)
00162 {
00163     if ( !key ) return directory[0]->first();
00164     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
00165         QTrie* t= (*it).p->clubLeader(directory);
00166         (*it).p = t;
00167     }
00168     TrieClub *club = directory[key];
00169     for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00170         QTrie* o = *it;
00171         if ( o->equal(this) )
00172             return o;
00173     }
00174     return this;
00175 }
00176 
00177 int QTrie::collectKeys()
00178 {
00179     int n=0;
00180     if ( key ) key=0,n+=children.count();
00181     for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it)
00182         n += (*it).p->collectKeys();
00183     return n;
00184 }
00185 
00186 int TriePtr::operator <(const TriePtr& o) const
00187     { return letter < o.letter; }
00188 int TriePtr::operator >(const TriePtr& o) const
00189     { return letter > o.letter; }
00190 int TriePtr::operator <=(const TriePtr& o) const
00191     { return letter <= o.letter; }
00192 
00193 bool TrieList::equal(TrieList& l)
00194 {
00195     if ( count() != l.count() )
00196         return FALSE;
00197     sort(); l.sort();
00198     ConstIterator it2 = begin();
00199     ConstIterator it = l.begin();
00200     for( ; it != l.end(); ++it, ++it2 )
00201         if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) )
00202             return FALSE;
00203     return TRUE;
00204 }
00205 QTrie* TrieList::findAdd(QChar c)
00206 {
00207     for (Iterator it=begin(); it!=end(); ++it) {
00208         if ( (*it).letter == c )
00209             return (*it).p;
00210     }
00211     TriePtr p;
00212     p.p = new QTrie;
00213     p.letter = c;
00214     prepend(p);
00215     sorted=FALSE;
00216     sort();
00217     return p.p;
00218 }
00219 
00220 static const char* dawg_sig = "QDAWG100";
00221 
00222 class QDawgPrivate {
00223 public:
00224     QDawgPrivate(QIODevice* dev)
00225     {
00226         QDataStream ds(dev);
00227         char sig[8];
00228         ds.readRawBytes(sig,8);
00229         if ( !strncmp(dawg_sig,sig,8) ) {
00230             uint n;
00231             char* nn;
00232             ds.readBytes(nn,n);
00233 
00234             // #### endianness problem ignored.
00235             node = (QDawg::Node*)nn;
00236             nodes = n / sizeof(QDawg::Node);
00237         } else {
00238             node = 0;
00239         }
00240     }
00241 
00242     bool ok() const { return node; }
00243 
00244     QDawgPrivate(uchar* mem)
00245     {
00246         if ( !strncmp(dawg_sig,(char*)mem,8) ) {
00247             mem += 8;
00248 
00249             int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3];
00250             mem += 4;
00251 
00252             // #### endianness problem ignored.
00253             node = (QDawg::Node*)((char*)mem);
00254             nodes = n / sizeof(QDawg::Node);
00255         }
00256     }
00257 
00258     QDawgPrivate(QTrie* t) // destroys the QTrie.
00259     {
00260         TrieClubDirectory directory(9973);
00261         t->distributeKeys(directory);
00262         QTrie* l = t->clubLeader(directory);
00263         ASSERT(l==t);
00264         generateArray(t);
00265 
00266         TrieClub *club;
00267         for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit)
00268         {
00269             for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
00270                 delete *it;
00271             }
00272             delete club;
00273         }
00274     }
00275 
00276     bool write(QIODevice* dev)
00277     {
00278         QDataStream ds(dev);
00279         ds.writeRawBytes(dawg_sig,8);
00280         // #### endianness problem ignored.
00281         ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes);
00282         return dev->state() == IO_Ok;
00283     }
00284 
00285     void dumpWords(int nid=0, int index=0)
00286     {
00287         static char word[256]; // ick latin1
00288         int i=0;
00289         do {
00290             QDawg::Node& n = node[nid+i];
00291             word[index] = n.let;
00292             if ( n.isword )
00293                 fprintf(stderr,"%.*s\n",index+1,word);
00294             if ( n.offset ) dumpWords(n.offset+nid+i,index+1);
00295         } while (!node[nid+i++].islast);
00296     }
00297 
00298     void dump(int nid=0, int indent=0)
00299     {
00300         int i=0;
00301         do {
00302             QDawg::Node& n = node[nid+i];
00303             fprintf(stderr,"%d: ",nid+i);
00304             for (int in=0; in<indent; in++)
00305                 fputc(' ',stderr);
00306             fprintf(stderr," %c %d %d %d\n",n.let,
00307                 n.isword,n.islast,n.offset);
00308             if ( n.offset ) dump(n.offset+nid+i,indent+2);
00309         } while (!node[nid+i++].islast);
00310     }
00311 
00312     int countWords(int nid=0)
00313     {
00314         int t=0;
00315         int i=0;
00316         do {
00317             QDawg::Node& n = node[nid+i];
00318             if ( n.isword )
00319                 t++;
00320             if ( n.offset )
00321                 t+=countWords(n.offset+nid+i);
00322         } while (!node[nid+i++].islast);
00323         return t;
00324     }
00325 
00326     bool contains(const QString& s, int nid=0, int index=0) const
00327     {
00328         int i=0;
00329         do {
00330             QDawg::Node& n = node[nid+i];
00331             if ( s[index] == QChar((ushort)n.let) ) {
00332                 if ( n.isword && index == (int)s.length()-1 )
00333                     return TRUE;
00334                 if ( n.offset )
00335                     return contains(s,n.offset+nid+i,index+1);
00336             }
00337         } while (!node[nid+i++].islast);
00338         return FALSE;
00339     }
00340 
00341     void appendAllWords(QStringList& list, int nid=0, QString s="") const
00342     {
00343         int i=0;
00344         int next = s.length();
00345         do {
00346             QDawg::Node& n = node[nid+i];
00347             s[next] = QChar((ushort)n.let);
00348             if ( n.isword )
00349                 list.append(s);
00350             if ( n.offset )
00351                 appendAllWords(list, n.offset+nid+i, s);
00352         } while (!node[nid+i++].islast);
00353     }
00354 
00355     const QDawg::Node* root() { return node; }
00356 
00357 private:
00358     void generateArray(QTrie* t)
00359     {
00360         nodes = 0;
00361         int n = t->collectKeys();
00362         node = new QDawg::Node[n];
00363         appendToArray(t);
00364         ASSERT(n == nodes);
00365     }
00366 
00367     int appendToArray(QTrie* t)
00368     {
00369         if ( !t->key ) {
00370             if ( !t->children.count() )
00371                 return 0;
00372             t->key = nodes;
00373             nodes += t->children.count();
00374             QDawg::Node* n = &node[t->key-1];
00375             int here = t->key;
00376             for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) {
00377                 QTrie* s = (*it).p;
00378                 ++n;
00379                 n->let = (*it).letter.unicode();
00380                 n->isword = s->isword;
00381                 n->islast = 0;
00382                 n->offset = appendToArray(s);
00383                 if ( n->offset ) {
00384                     int t = n->offset-here;
00385                     n->offset=t;
00386                     if ( n->offset != t )
00387                         qWarning("Overflow: too many words");
00388                 }
00389                 here++;
00390             }
00391             n->islast = 1;
00392         }
00393         return t->key;
00394     }
00395 
00396 private:
00397     int nodes;
00398     QDawg::Node *node;
00399 };
00400 
00443 QDawg::QDawg()
00444 {
00445     d = 0;
00446 }
00447 
00451 QDawg::~QDawg()
00452 {
00453     delete d;
00454 }
00455 
00460 bool QDawg::createFromWords(QIODevice* dev)
00461 {
00462     delete d;
00463 
00464     QTextStream i(dev);
00465     QTrie* trie = new QTrie;
00466     int n=0;
00467     while (!i.atEnd()) {
00468         trie->insertWord(QString::fromUtf8(i.readLine()));
00469         n++;
00470     }
00471     if ( n )
00472         d = new QDawgPrivate(trie);
00473     else
00474         d = 0;
00475     return TRUE;
00476 }
00477 
00481 void QDawg::createFromWords(const QStringList& list)
00482 {
00483     delete d;
00484 
00485     if ( list.count() ) {
00486         QTrie* trie = new QTrie;
00487         for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) {
00488             trie->insertWord(*it);
00489         }
00490         d = new QDawgPrivate(trie);
00491     } else {
00492         d = 0;
00493     }
00494 }
00495 
00499 QStringList QDawg::allWords() const
00500 {
00501     QStringList result;
00502     if ( d ) d->appendAllWords(result);
00503     return result;
00504 }
00505 
00506 
00513 bool QDawg::readFile(const QString& filename)
00514 {
00515     delete d;
00516     d = 0;
00517     int f = ::open( QFile::encodeName(filename), O_RDONLY );
00518     if ( f < 0 )
00519         return FALSE;
00520     struct stat st;
00521     if ( !fstat( f, &st ) ) {
00522         char * tmp = (char*)mmap( 0, st.st_size, // any address, whole file
00523                        PROT_READ, // read-only memory
00524                        MAP_FILE | MAP_PRIVATE, // swap-backed map from file
00525                        f, 0 ); // from offset 0 of f
00526         if ( tmp && tmp != (char*)MAP_FAILED )
00527             d = new QDawgPrivate((uchar*)tmp);
00528     }
00529     ::close( f );
00530     return d;
00531 }
00532 
00539 bool QDawg::read(QIODevice* dev)
00540 {
00541     delete d;
00542     d = new QDawgPrivate(dev);
00543     if ( d->ok() )
00544         return TRUE;
00545     delete d;
00546     d = 0;
00547     return FALSE;
00548 }
00549 
00553 bool QDawg::write(QIODevice* dev) const
00554 {
00555     return d ? d->write(dev) : TRUE;
00556 }
00557 
00561 int QDawg::countWords() const
00562 {
00563     return d ? d->countWords() : 0;
00564 }
00565 
00569 const QDawg::Node* QDawg::root() const
00570 {
00571     return d ? d->root() : 0;
00572 }
00573 
00578 bool QDawg::contains(const QString& s) const
00579 {
00580     return d ? d->contains(s) : FALSE;
00581 }
00582 
00588 void QDawg::dump() const
00589 {
00590     if ( d ) d->dump();
00591 }
00592 

Generated on Sat Nov 5 16:16:38 2005 for OPIE by  doxygen 1.4.2