Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

PPMLanguageModel.cpp

Go to the documentation of this file.
00001 // PPMLanguageModel.h
00002 //
00004 //
00005 // Copyright (c) 1999-2002 David Ward
00006 //
00008 
00009 #include <math.h>
00010 #include "PPMLanguageModel.h"
00011 
00012 using namespace Dasher;
00013 using namespace std;
00014 
00015 // static TCHAR debug[256];
00016 typedef unsigned long ulong;
00017 
00021 
00022 CPPMLanguageModel::CPPMnode *CPPMLanguageModel::CPPMnode::find_symbol(int sym)
00023 // see if symbol is a child of node
00024 {
00025         //  printf("finding symbol %d at node %d\n",sym,node->id);
00026         CPPMnode *found=child;
00027         while (found) {
00028                 if (found->symbol==sym)
00029                         return found;
00030                 found=found->next;
00031         }
00032         return 0;
00033 }
00034 
00035 
00036 CPPMLanguageModel::CPPMnode * CPPMLanguageModel::CPPMnode::add_symbol_to_node(int sym,int *update)
00037 {
00038         CPPMnode *born,*search;
00039         search=find_symbol(sym);
00040         if (!search) {
00041                 born = new CPPMnode(sym);
00042                 born->next=child;
00043                 child=born;
00044                 //   node->count=1;
00045                 return born;            
00046         } else {
00047                 if (*update) {   // perform update exclusions
00048                         search->count++;
00049                         *update=0;
00050                 }
00051                 return search;
00052         }
00053         
00054 }
00055 
00056 
00058 // CPPMLanguageModel defs
00060 
00061 CPPMLanguageModel::CPPMLanguageModel(CAlphabet *_alphabet,int _normalization)
00062         : CLanguageModel(_alphabet,_normalization)
00063 {
00064         root=new CPPMnode(-1);
00065         m_rootcontext=new CPPMContext(root,0);
00066 }
00067 
00068 
00069 CPPMLanguageModel::~CPPMLanguageModel()
00070 {
00071         delete root;
00072 }
00073 
00074 
00075 bool CPPMLanguageModel::GetProbs(CContext *context,vector<unsigned int> &probs,double )
00076         // get the probability distribution at the context
00077 {
00078         // seems like we have to have this hack for VC++
00079         CPPMContext *ppmcontext=static_cast<CPPMContext *> (context);
00080         
00081         
00082         int modelchars=GetNumberModelChars();
00083         int norm=CLanguageModel::normalization();
00084         probs.resize(modelchars);
00085         CPPMnode *temp,*s; 
00086         int loop,total;
00087         int sym; 
00088         // ulong spent=0; 
00089         ulong size_of_slice;
00090         bool *exclusions=new bool [modelchars];
00091         ulong uniform=modelchars;
00092         ulong tospend=norm-uniform;
00093         temp=ppmcontext->head;
00094         for (loop=0; loop <modelchars; loop++) {   /* set up the exclusions array */
00095                 probs[loop]=0;
00096                 exclusions[loop]=0;
00097         }
00098         while (temp!=0) {
00099                 //      Usprintf(debug,TEXT("tospend %u\n"),tospend);
00100                 //      DebugOutput(TEXT("round\n"));
00101                 total=0;
00102                 s=temp->child;
00103                 while (s) {
00104                         sym=s->symbol; 
00105                         if (!exclusions[s->symbol])
00106                                 total=total+s->count;
00107                         s=s->next;
00108                 }
00109                 if (total) {
00110                         //      Usprintf(debug,TEXT"escape %u\n"),tospend*
00111                         size_of_slice=tospend;
00112                         s=temp->child;
00113                         while (s) {
00114                                 if (!exclusions[s->symbol]) {
00115                                         exclusions[s->symbol]=1;
00116                                         ulong p=size_of_slice*(2*s->count-1)/2/ulong(total);
00117                                         probs[s->symbol]+=p;
00118                                         tospend-=p;             
00119                                 }
00120                                 //                              Usprintf(debug,TEXT("sym %u counts %d p %u tospend %u \n"),sym,s->count,p,tospend);      
00121                                 //                              DebugOutput(debug);
00122                                 s=s->next;
00123                         }
00124                 }
00125                 temp = temp->vine;
00126         }
00127         //      Usprintf(debug,TEXT("Norm %u tospend %u\n"),Norm,tospend);
00128         //      DebugOutput(debug);
00129         
00130         size_of_slice=tospend;
00131         int symbolsleft=0;
00132         for (sym=1;sym<modelchars;sym++)
00133                 if (!probs[sym])
00134                         symbolsleft++;
00135                 for (sym=1;sym<modelchars;sym++) 
00136                         if (!probs[sym]) {
00137                                 ulong p=size_of_slice/symbolsleft;
00138                                 probs[sym]+=p;
00139                                 tospend-=p;
00140                         }
00141                         
00142                         // distribute what's left evenly        
00143                         tospend+=uniform;
00144                         for (sym=1;sym<modelchars;sym++) {
00145                                 ulong p=tospend/(modelchars-sym);
00146                                 probs[sym]+=p;
00147                                 tospend-=p;
00148                         }
00149                         //      Usprintf(debug,TEXT("finaltospend %u\n"),tospend);
00150                         //      DebugOutput(debug);
00151                         
00152                         // free(exclusions); // !!!
00153                         // !!! NB by IAM: p577 Stroustrup 3rd Edition: "Allocating an object using new and deleting it using free() is asking for trouble"
00154                         delete[] exclusions;
00155                         return true;
00156 }
00157 
00158 
00159 void CPPMLanguageModel::AddSymbol(CPPMLanguageModel::CPPMContext &context,int symbol)
00160         // add symbol to the context
00161         // creates new nodes, updates counts
00162         // and leaves 'context' at the new context
00163 {
00164         // sanity check
00165         if (symbol==0 || symbol>=GetNumberModelChars())
00166                 return;
00167         
00168         CPPMnode *vineptr,*temp;
00169         int updatecnt=1;
00170         
00171         temp=context.head->vine;
00172         context.head=context.head->add_symbol_to_node(symbol,&updatecnt);
00173         vineptr=context.head;
00174         context.order++;
00175         
00176         while (temp!=0) {
00177                 vineptr->vine=temp->add_symbol_to_node(symbol,&updatecnt);    
00178                 vineptr=vineptr->vine;
00179                 temp=temp->vine;
00180         }
00181         vineptr->vine=root;
00182         if (context.order>MAX_ORDER){
00183                 context.head=context.head->vine;
00184                 context.order--;
00185         }
00186 }
00187 
00188 
00189 // update context with symbol 'Symbol'
00190 void CPPMLanguageModel::EnterSymbol(CContext* Context, modelchar Symbol)
00191 {
00192         CPPMLanguageModel::CPPMContext& context = * static_cast<CPPMContext *> (Context);
00193         
00194         CPPMnode *find;
00195         // CPPMnode *temp=context.head;
00196         
00197         while (context.head) {
00198                 find =context.head->find_symbol(Symbol);
00199                 if (find) {
00200                         context.order++;
00201                         context.head=find;
00202                         //      Usprintf(debug,TEXT("found context %x order %d\n"),head,order);
00203                         //      DebugOutput(debug);
00204                         return;
00205                 }
00206                 context.order--;
00207                 context.head=context.head->vine;
00208         }
00209         
00210         if (context.head==0) {
00211                 context.head=root;
00212                 context.order=0;
00213         }
00214         
00215 }
00216 
00217 
00218 void CPPMLanguageModel::LearnSymbol(CContext* Context, modelchar Symbol)
00219 {
00220         CPPMLanguageModel::CPPMContext& context = * static_cast<CPPMContext *> (Context);
00221         AddSymbol(context, Symbol);
00222 }
00223 
00224 
00225 void CPPMLanguageModel::dumpSymbol(int symbol)
00226 {
00227         if ((symbol <= 32) || (symbol >= 127))
00228                 printf( "<%d>", symbol );
00229         else
00230                 printf( "%c", symbol );
00231 }
00232 
00233 
00234 void CPPMLanguageModel::dumpString( char *str, int pos, int len )
00235         // Dump the string STR starting at position POS
00236 {
00237         char cc;
00238         int p;
00239         for (p = pos; p<pos+len; p++) {
00240                 cc = str [p];
00241                 if ((cc <= 31) || (cc >= 127))
00242                         printf( "<%d>", cc );
00243                 else
00244                         printf( "%c", cc );
00245         }
00246 }
00247 
00248 
00249 void CPPMLanguageModel::dumpTrie( CPPMLanguageModel::CPPMnode *, int )
00250         // diagnostic display of the PPM trie from node t and deeper
00251 {
00252 //TODO
00253 /*
00254         dchar debug[256];
00255         int sym;
00256         CPPMnode *s;
00257         Usprintf( debug,TEXT("%5d %7x "), d, t );
00258         //TODO: Uncomment this when headers sort out
00259         //DebugOutput(debug);
00260         if (t < 0) // pointer to input
00261                 printf( "                     <" );
00262         else {
00263                 Usprintf(debug,TEXT( " %3d %5d %7x  %7x  %7x    <"), t->symbol,t->count, t->vine, t->child, t->next );
00264                 //TODO: Uncomment this when headers sort out
00265                 //DebugOutput(debug);
00266         }
00267         
00268         dumpString( dumpTrieStr, 0, d );
00269         Usprintf( debug,TEXT(">\n") );
00270         //TODO: Uncomment this when headers sort out
00271         //DebugOutput(debug);
00272         if (t != 0) {
00273                 s = t->child;
00274                 while (s != 0) {
00275                         sym =s->symbol;
00276                         
00277                         dumpTrieStr [d] = sym;
00278                         dumpTrie( s, d+1 );
00279                         s = s->next;
00280                 }
00281         }
00282 */
00283 }
00284 
00285 
00286 void CPPMLanguageModel::dump()
00287         // diagnostic display of the whole PPM trie
00288 {
00289 // TODO:
00290 /*
00291         dchar debug[256];
00292         Usprintf(debug,TEXT(  "Dump of Trie : \n" ));
00293         //TODO: Uncomment this when headers sort out
00294         //DebugOutput(debug);
00295         Usprintf(debug,TEXT(   "---------------\n" ));
00296         //TODO: Uncomment this when headers sort out
00297         //DebugOutput(debug);
00298         Usprintf( debug,TEXT(  "depth node     symbol count  vine   child      next   context\n") );
00299         //TODO: Uncomment this when headers sort out
00300         //DebugOutput(debug);
00301         dumpTrie( root, 0 );
00302         Usprintf( debug,TEXT(  "---------------\n" ));
00303         //TODO: Uncomment this when headers sort out
00304         //DebugOutput(debug);
00305         Usprintf(debug,TEXT( "\n" ));
00306         //TODO: Uncomment this when headers sort out
00307         //DebugOutput(debug);
00308 */
00309 }

Generated on Sat Nov 5 16:16:00 2005 for OPIE by  doxygen 1.4.2