Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

CRegExp.cpp

Go to the documentation of this file.
00001 #include "CRegExp.h"
00002 
00003 //#include <stdio.h>
00004 #include <stdlib.h>
00005 //#include <string.h>
00006 
00007 
00008 tchar CRegExpFilt::escapedchar(tchar c)
00009 {
00010   switch (c)
00011     {
00012     case '\\':
00013       return '\\';
00014       break;
00015     case '"':
00016       return '\"';
00017       break;
00018     case 'a':
00019       return '\a';
00020       break;
00021     case 'b':
00022       return '\b';
00023       break;
00024     case 'f':
00025       return '\f';
00026       break;
00027     case 'n':
00028       return '\n';
00029       break;
00030     case 'r':
00031       return '\r';
00032       break;
00033     case 't':
00034       return '\t';
00035       break;
00036     case 'v':
00037       return '\v';
00038       break;
00039     default:
00040       return c;
00041       break;
00042     }
00043 }
00044 
00045 void CRegExpFilt::regchar(tchar c, bool insens)
00046 {
00047   if (insens)
00048     {
00049       tchar t = upper(c);
00050       CV[t] = 0;
00051       t = lower(c);
00052       CV[t] = 0;
00053     }
00054   else
00055     {
00056       CV[c] = 0;
00057     }
00058 }
00059 
00060 void CRegExpFilt::prepreprocessing(const QString& pat, bool insens)
00061 {
00062   for (unsigned int p = 0; p < pat.length(); p++)
00063     {
00064 #ifdef _WINDOWS
00065       switch (pat.at(p).unicode())
00066 #else
00067       switch (pat[p].unicode())
00068 #endif
00069           {
00070         case '{':
00071           {
00072             break;
00073           }
00074         case '}':
00075           {
00076             break;
00077           }
00078         case '^':
00079           {
00080             break;
00081           }
00082         case '.' :
00083           {
00084             break;
00085           }
00086         case '#':
00087           {
00088             p++;
00089 #ifdef _WINDOWS
00090             while ('0' <= pat.at(p).unicode() && pat.at(p).unicode() <= '9')
00091 #else
00092                 while ('0' <= pat[p].unicode() && pat[p].unicode() <= '9')
00093 #endif
00094               {
00095               }
00096             p--;
00097             break;
00098           }
00099         case '\\' :
00100           {
00101 #ifdef _WINDOWS
00102             tchar c = escapedchar(pat.at(++p).unicode());
00103 #else
00104             tchar c = escapedchar(pat[++p].unicode());
00105 #endif
00106                 regchar(c, insens);
00107             break;
00108           }
00109 
00110         case '[' :
00111           {
00112             tchar clast;
00113             bool invert = false;
00114             tchar c;
00115 #ifdef _WINDOWS
00116             if (pat.at(p+1).unicode() == '^')
00117 #else
00118             if (pat[p+1].unicode() == '^')
00119 #endif
00120                 {
00121                 p++;
00122                 invert = true;
00123               }
00124 #ifdef _WINDOWS
00125             while ((c = pat.at(++p).unicode()) != ']')
00126 #else
00127             while ((c = pat[++p].unicode()) != ']')
00128 #endif
00129                 {
00130                 if (c == '\\')
00131                   {
00132 #ifdef _WINDOWS
00133                     c = escapedchar(pat.at(++p).unicode());
00134 #else
00135                     c = escapedchar(pat[++p].unicode());
00136 #endif
00137                         if (c == ']') break;
00138                   }
00139                 if (c == '-')
00140                   {
00141 #ifdef _WINDOWS
00142                     c = pat.at(++p).unicode();
00143 #else
00144                     c = pat[++p].unicode();
00145 #endif
00146                         for (tchar j = clast; j <= c; j++)
00147                       {
00148                         regchar(j, insens);
00149                       }
00150                   }
00151                 else
00152                   {
00153                     regchar(c, insens);
00154                   }
00155                 clast = c;
00156               }
00157             break;
00158           }
00159         default :
00160           {
00161 #ifdef _WINDOWS
00162             regchar(pat.at(p).unicode(), insens);
00163 #else
00164             regchar(pat[p].unicode(), insens);
00165 #endif
00166                 break;
00167           }
00168         }
00169     }
00170   /*
00171   for (iter i = CV.begin(); i != CV.end(); ++i)
00172     {
00173       printf("Pre: [%u]\n", i.first());
00174     }
00175   */
00176   CV[0] = 0;
00177 }
00178 
00179 unsigned int CRegExpFilt::preprocessing(const QString& pat, bool insens)
00180 {
00181   prepreprocessing(pat, insens);
00182   qDebug("PrePreProcessing done");
00183   unsigned int p, m;
00184   bool inkeep = false;
00185   keep = 0;
00186   replace = 0;
00187   for (unsigned int j = 0; j < WORD_SIZE; j++)
00188     {
00189       bit[j] = (1 << (WORD_SIZE -j -1));
00190       lfcnt[j] = 0;
00191     }
00192 
00193   for (p = 0, m = 0; p < pat.length(); p++)
00194     {
00195       qDebug("m is %u", m);
00196       if (inkeep) keep |= bit[m];
00197 #ifdef _WINDOWS
00198       switch (pat.at(p).unicode())
00199 #else
00200       switch (pat[p].unicode())
00201 #endif
00202           {
00203         case '{':
00204           {
00205             inkeep = true;
00206             break;
00207           }
00208         case '}':
00209           {
00210             keep ^= bit[m];
00211             inkeep = false;
00212             break;
00213           }
00214         case '^':
00215           {
00216             replace |= bit[m];
00217             lfcnt[m]++;
00218             break;
00219           }
00220         case '.' :
00221           {
00222             for (iter j = CV.begin(); j != CV.end(); ++j) CV[j.first()] |= bit[m];
00223             m++;
00224             break;
00225           }
00226         case '#':
00227           {
00228             if (m > 0)
00229               {
00230                 p++;
00231                 int count = 0;
00232 #ifdef _WINDOWS
00233                 while ('0' <= pat.at(p).unicode() && pat.at(p).unicode() <= '9')
00234 #else
00235                 while ('0' <= pat[p].unicode() && pat[p].unicode() <= '9')
00236 #endif
00237                 {
00238 #ifdef _WINDOWS
00239                     count = 10*count + pat.at(p++).unicode() - '0';
00240 #else
00241                     count = 10*count + pat[p++].unicode() - '0';
00242 #endif
00243                   }
00244                 p--;
00245                 count = count-1;
00246                 unsigned int mask = 0;
00247                 for (unsigned int i = m; i < m+count; i++)
00248                   {
00249                     mask |= bit[i];
00250                   }
00251                
00252                 for (iter it = CV.begin(); it != CV.end(); ++it)
00253                   {
00254                     if (CV[it.first()] & bit[m-1])
00255                       {
00256                         CV[it.first()] |= mask;
00257                       }
00258                   }
00259                 if (keep & bit[m-1]) keep |= mask;
00260                 m += count;
00261               }
00262             else
00263               {
00264                 p++;
00265               }
00266             break;
00267           }
00268         case '\\' :
00269           {
00270 #ifdef _WINDOWS
00271             tchar c = escapedchar(pat.at(++p).unicode());
00272 #else
00273             tchar c = escapedchar(pat[++p].unicode());
00274 #endif
00275                 if (insens)
00276               {
00277                 CV[upper(c)] |= bit[m];
00278                 CV[lower(c)] |= bit[m];
00279               }
00280             else
00281               {
00282                 CV[c] |= bit[m];
00283               }
00284             m++;
00285             break;
00286           }
00287 
00288         case '[' :
00289           {
00290             tchar c, clast;
00291             bool invert = false;
00292 #ifdef _WINDOWS
00293             if (pat.at(p+1).unicode() == '^')
00294 #else
00295             if (pat[p+1].unicode() == '^')
00296 #endif
00297                 {
00298                 p++;
00299                 invert = true;
00300               }
00301 #ifdef _WINDOWS
00302             while ((c = pat.at(++p).unicode()) != ']')
00303 #else
00304             while ((c = pat[++p].unicode()) != ']')
00305 #endif
00306                 {
00307                 if (c == '\\')
00308                   {
00309 #ifdef _WINDOWS
00310                     c = escapedchar(pat.at(++p).unicode());
00311 #else
00312                     c = escapedchar(pat[++p].unicode());
00313 #endif
00314                         if (c == ']') break;
00315                   }
00316                 if (c == '-')
00317                   {
00318 #ifdef _WINDOWS
00319                     c = pat.at(++p).unicode();
00320 #else
00321                     c = pat[++p].unicode();
00322 #endif
00323                         for (tchar j = clast; j <= c; j++)
00324                       {
00325                         if (insens)
00326                           {
00327                             iter it;
00328                             if ((it = CV.find(upper(j))) != CV.end())
00329                               CV[it] |= bit[m];
00330                             else
00331                               CV[0] |= bit[m];
00332                             if ((it = CV.find(lower(j))) != CV.end())
00333                               CV[it] |= bit[m];
00334                             else
00335                               CV[0] |= bit[m];
00336                           }
00337                         else
00338                           {
00339                             iter it;
00340                             if ((it = CV.find(j)) != CV.end())
00341                               CV[it] |= bit[m];
00342                             else
00343                               {
00344                                 CV[0] |= bit[m];
00345                               }
00346                           }
00347                       }
00348                   }
00349                 else
00350                   {
00351                     if (insens)
00352                       {
00353                         iter it;
00354                         if ((it = CV.find(upper(c))) != CV.end())
00355                           CV[it] |= bit[m];
00356                         else
00357                           CV[0] |= bit[m];
00358                         if ((it = CV.find(lower(c))) != CV.end())
00359                           CV[it] |= bit[m];
00360                         else
00361                           CV[0] |= bit[m];
00362                       }
00363                     else
00364                       {
00365                         iter it;
00366                         if ((it = CV.find(c)) != CV.end())
00367                           CV[it] |= bit[m];
00368                         else
00369                           CV[0] |= bit[m];
00370                       }
00371                   }
00372                 clast = c;
00373               }
00374             if (invert)
00375               {
00376                 for (iter i = CV.begin(); i != CV.end(); ++i)
00377                   {
00378                     CV[i.first()] ^= bit[m];
00379                   }
00380               }
00381             m++;
00382             break;
00383           }
00384         default :
00385           {
00386 #ifdef _WINDOWS
00387             tchar c = pat.at(p).unicode();
00388 #else
00389             tchar c = pat[p].unicode();
00390 #endif
00391                 if (insens)
00392               {
00393                 CV[upper(c)] |= bit[m];
00394                 CV[lower(c)] |= bit[m];
00395               }
00396             else CV[c] |= bit[m];
00397             m++;
00398             break;
00399           }
00400         }
00401     }
00402   qDebug("Returning:%u",m);
00403   return m;
00404 }
00405 
00406 bool CRegExpFilt::empty()
00407 {
00408   return m_outQueue.empty();
00409 }
00410 
00411 tchar CRegExpFilt::pop()
00412 {
00413   return m_outQueue.pop();
00414 }
00415 
00416 bool CRegExpFilt::addch(tchar ch)
00417 {
00418   word[cur] = ch;
00419   cur = (cur+1)%patlength;
00420   if (len < patlength) len++;
00421 
00422   unsigned int cv = 0;
00423   iter it;
00424   if ((it = CV.find(ch)) == CV.end())
00425     {
00426       cv = CV[0];
00427     }
00428   else
00429     {
00430       cv = CV[it];
00431     }
00432 
00433   R = ((R >> 1) | bit_0) & cv; /* Exact matches */
00434   if (R & endpos)
00435     {
00436       for (unsigned int i = 0; i < patlength; i++)
00437         {
00438           if (replace & bit[i])
00439             {
00440               for (unsigned int j = 0; j < lfcnt[i]; j++)
00441                 {
00442                   m_outQueue.push(10);
00443                 }
00444             }
00445           if (keep & bit[i])
00446             {
00447               m_outQueue.push(word[(cur+i)%patlength]);
00448               //                putchar('*');
00449               //                putchar(i + '0');
00450             }
00451           len = 0;
00452         }
00453       return true;
00454     }
00455   else
00456     {
00457       if (len == patlength)
00458         {
00459           tchar ch = word[cur];
00460           if (ch == 10) ch = ' ';
00461           m_outQueue.push(ch);
00462         }
00463       return false;
00464     }
00465 }
00466 
00467 void CRegExpFilt::restart()
00468 {
00469   R = 0;
00470   len = 0;
00471 }
00472 
00473 CRegExpFilt::CRegExpFilt(const QString& pat, bool insensflag) : CV(300)
00474 {
00475   cur = 0;
00476   patlength = preprocessing(pat, insensflag);
00477   qDebug("Preprocesing done:%u", patlength);
00478   endpos = bit[patlength-1];
00479   bit_0 = bit[0];
00480 
00481   restart();
00482 
00483   qDebug("Pattern: %s:%u", (const char*)pat, patlength);
00484 
00485 }
00486 
00487 
00488 CRegExpFilt::~CRegExpFilt()
00489 {
00490 }
00491 
00492 #ifdef NOWAYISTHISDEFINED
00493 void reportmatch(tchar *line, /*tchar *text,*/ unsigned int mtype, unsigned int lino)
00494 {
00495   /*
00496     tchar *text = line + strlen(line);
00497 
00498     tchar *ptr = line;
00499     if (mtype == 0)
00500     printf("Exact match at line number %u.\n", lino);
00501     else
00502     printf("%u error match at line number %u.\n", mtype, lino);
00503     while (ptr < text) putchar(*ptr++);
00504     printf("%c[4m^%c[24m%s\n", 27, 27, ptr);
00505   */
00506 }
00507 
00508 
// Print the command-line synopsis of the test driver to standard output.
void usage()
{
  fputs("Usage: CRegExpFilt [-i] pattern/a file\n", stdout);
}
00513 
00514 int getline(tchar *s,int lim,FILE *f)
00515 {
00516   int c, i;
00517     
00518   for (i = 0; i < lim-1 && (c = getc(f)) != EOF && c != '\n'; )
00519     {
00520       s[i++] = (tchar)c;
00521     }
00522   s[i] = '\0';
00523   return ((c == EOF && i == 0) ? -1 : i);
00524 }
00525 
00526 #define BUF_SIZE        256
00527 
00528 int main(int argc, char **argv)
00529 {
00530   unsigned int lino = 0;
00531   unsigned int blino = 0;
00532   bool insens = false;
00533   int len;
00534   tchar line[BUF_SIZE];
00535   FILE *inf;
00536 
00537   /* Error checking of cmd ln args! */
00538   if (argc < 3)
00539     {usage(); return 10; }
00540   /* Corresponds to requiring a minimum of 3 matches */
00541   for (len = 1; len < argc-2; len++)
00542     {
00543       if (argv[len][0] != '-')
00544         {usage(); return 10; }
00545       else switch (argv[len][1])
00546         {
00547         case 'i' :
00548           {
00549             insens = true;
00550             break;
00551           }
00552         default :
00553           {usage(); return 10;}
00554         }
00555     }
00556 
00557   tchar* pattern = new tchar[strlen(argv[argc-2])+1];
00558 
00559   for (int i = 0; (pattern[i] = argv[argc-2][i]) != 0; i++);
00560 
00561 
00562 
00563   CRegExpFilt test(pattern, insens);
00564 
00565   delete [] pattern;
00566 
00567   inf = fopen(argv[argc-1], "r");
00568   if (!inf)
00569     {
00570       printf("file not found\n");
00571       return 10;
00572     }
00573 
00574   while ((len = getline(line, BUF_SIZE, inf)) >= 0)
00575     {
00576       lino++;
00577       bool ret = false;
00578    
00579       {
00580         tchar *textend = line+len;
00581         tchar *text = line;
00582         while (text < textend)
00583           {
00584             ret |= test.addch(*text++);
00585           }
00586         ret |= test.addch('\n');
00587         while (!test.empty())
00588           {
00589             putchar(test.pop());
00590           }
00591       }
00592       //    inswt = test.addch(line, len);
00593       if (ret) reportmatch(line, 0, lino);
00594     }
00595   fclose(inf);
00596   //    CloseSTDLIB();
00597   return 0;
00598 }
00599 #endif

Generated on Sat Nov 5 16:16:54 2005 for OPIE by  doxygen 1.4.2