00001
00002
00004
00005
00006
00008
00009 #include <math.h>
00010 #include "PPMLanguageModel.h"
00011
00012 using namespace Dasher;
00013 using namespace std;
00014
00015
00016 typedef unsigned long ulong;
00017
00021
00022 CPPMLanguageModel::CPPMnode *CPPMLanguageModel::CPPMnode::find_symbol(int sym)
00023
00024 {
00025
00026 CPPMnode *found=child;
00027 while (found) {
00028 if (found->symbol==sym)
00029 return found;
00030 found=found->next;
00031 }
00032 return 0;
00033 }
00034
00035
00036 CPPMLanguageModel::CPPMnode * CPPMLanguageModel::CPPMnode::add_symbol_to_node(int sym,int *update)
00037 {
00038 CPPMnode *born,*search;
00039 search=find_symbol(sym);
00040 if (!search) {
00041 born = new CPPMnode(sym);
00042 born->next=child;
00043 child=born;
00044
00045 return born;
00046 } else {
00047 if (*update) {
00048 search->count++;
00049 *update=0;
00050 }
00051 return search;
00052 }
00053
00054 }
00055
00056
00058
00060
00061 CPPMLanguageModel::CPPMLanguageModel(CAlphabet *_alphabet,int _normalization)
00062 : CLanguageModel(_alphabet,_normalization)
00063 {
00064 root=new CPPMnode(-1);
00065 m_rootcontext=new CPPMContext(root,0);
00066 }
00067
00068
00069 CPPMLanguageModel::~CPPMLanguageModel()
00070 {
00071 delete root;
00072 }
00073
00074
00075 bool CPPMLanguageModel::GetProbs(CContext *context,vector<unsigned int> &probs,double )
00076
00077 {
00078
00079 CPPMContext *ppmcontext=static_cast<CPPMContext *> (context);
00080
00081
00082 int modelchars=GetNumberModelChars();
00083 int norm=CLanguageModel::normalization();
00084 probs.resize(modelchars);
00085 CPPMnode *temp,*s;
00086 int loop,total;
00087 int sym;
00088
00089 ulong size_of_slice;
00090 bool *exclusions=new bool [modelchars];
00091 ulong uniform=modelchars;
00092 ulong tospend=norm-uniform;
00093 temp=ppmcontext->head;
00094 for (loop=0; loop <modelchars; loop++) {
00095 probs[loop]=0;
00096 exclusions[loop]=0;
00097 }
00098 while (temp!=0) {
00099
00100
00101 total=0;
00102 s=temp->child;
00103 while (s) {
00104 sym=s->symbol;
00105 if (!exclusions[s->symbol])
00106 total=total+s->count;
00107 s=s->next;
00108 }
00109 if (total) {
00110
00111 size_of_slice=tospend;
00112 s=temp->child;
00113 while (s) {
00114 if (!exclusions[s->symbol]) {
00115 exclusions[s->symbol]=1;
00116 ulong p=size_of_slice*(2*s->count-1)/2/ulong(total);
00117 probs[s->symbol]+=p;
00118 tospend-=p;
00119 }
00120
00121
00122 s=s->next;
00123 }
00124 }
00125 temp = temp->vine;
00126 }
00127
00128
00129
00130 size_of_slice=tospend;
00131 int symbolsleft=0;
00132 for (sym=1;sym<modelchars;sym++)
00133 if (!probs[sym])
00134 symbolsleft++;
00135 for (sym=1;sym<modelchars;sym++)
00136 if (!probs[sym]) {
00137 ulong p=size_of_slice/symbolsleft;
00138 probs[sym]+=p;
00139 tospend-=p;
00140 }
00141
00142
00143 tospend+=uniform;
00144 for (sym=1;sym<modelchars;sym++) {
00145 ulong p=tospend/(modelchars-sym);
00146 probs[sym]+=p;
00147 tospend-=p;
00148 }
00149
00150
00151
00152
00153
00154 delete[] exclusions;
00155 return true;
00156 }
00157
00158
00159 void CPPMLanguageModel::AddSymbol(CPPMLanguageModel::CPPMContext &context,int symbol)
00160
00161
00162
00163 {
00164
00165 if (symbol==0 || symbol>=GetNumberModelChars())
00166 return;
00167
00168 CPPMnode *vineptr,*temp;
00169 int updatecnt=1;
00170
00171 temp=context.head->vine;
00172 context.head=context.head->add_symbol_to_node(symbol,&updatecnt);
00173 vineptr=context.head;
00174 context.order++;
00175
00176 while (temp!=0) {
00177 vineptr->vine=temp->add_symbol_to_node(symbol,&updatecnt);
00178 vineptr=vineptr->vine;
00179 temp=temp->vine;
00180 }
00181 vineptr->vine=root;
00182 if (context.order>MAX_ORDER){
00183 context.head=context.head->vine;
00184 context.order--;
00185 }
00186 }
00187
00188
00189
00190 void CPPMLanguageModel::EnterSymbol(CContext* Context, modelchar Symbol)
00191 {
00192 CPPMLanguageModel::CPPMContext& context = * static_cast<CPPMContext *> (Context);
00193
00194 CPPMnode *find;
00195
00196
00197 while (context.head) {
00198 find =context.head->find_symbol(Symbol);
00199 if (find) {
00200 context.order++;
00201 context.head=find;
00202
00203
00204 return;
00205 }
00206 context.order--;
00207 context.head=context.head->vine;
00208 }
00209
00210 if (context.head==0) {
00211 context.head=root;
00212 context.order=0;
00213 }
00214
00215 }
00216
00217
00218 void CPPMLanguageModel::LearnSymbol(CContext* Context, modelchar Symbol)
00219 {
00220 CPPMLanguageModel::CPPMContext& context = * static_cast<CPPMContext *> (Context);
00221 AddSymbol(context, Symbol);
00222 }
00223
00224
00225 void CPPMLanguageModel::dumpSymbol(int symbol)
00226 {
00227 if ((symbol <= 32) || (symbol >= 127))
00228 printf( "<%d>", symbol );
00229 else
00230 printf( "%c", symbol );
00231 }
00232
00233
00234 void CPPMLanguageModel::dumpString( char *str, int pos, int len )
00235
00236 {
00237 char cc;
00238 int p;
00239 for (p = pos; p<pos+len; p++) {
00240 cc = str [p];
00241 if ((cc <= 31) || (cc >= 127))
00242 printf( "<%d>", cc );
00243 else
00244 printf( "%c", cc );
00245 }
00246 }
00247
00248
00249 void CPPMLanguageModel::dumpTrie( CPPMLanguageModel::CPPMnode *, int )
00250
00251 {
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283 }
00284
00285
00286 void CPPMLanguageModel::dump()
00287
00288 {
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309 }