00001 #include "CRegExp.h"
00002
00003
00004 #include <stdlib.h>
00005
00006
00007
00008 tchar CRegExpFilt::escapedchar(tchar c)
00009 {
00010 switch (c)
00011 {
00012 case '\\':
00013 return '\\';
00014 break;
00015 case '"':
00016 return '\"';
00017 break;
00018 case 'a':
00019 return '\a';
00020 break;
00021 case 'b':
00022 return '\b';
00023 break;
00024 case 'f':
00025 return '\f';
00026 break;
00027 case 'n':
00028 return '\n';
00029 break;
00030 case 'r':
00031 return '\r';
00032 break;
00033 case 't':
00034 return '\t';
00035 break;
00036 case 'v':
00037 return '\v';
00038 break;
00039 default:
00040 return c;
00041 break;
00042 }
00043 }
00044
00045 void CRegExpFilt::regchar(tchar c, bool insens)
00046 {
00047 if (insens)
00048 {
00049 tchar t = upper(c);
00050 CV[t] = 0;
00051 t = lower(c);
00052 CV[t] = 0;
00053 }
00054 else
00055 {
00056 CV[c] = 0;
00057 }
00058 }
00059
00060 void CRegExpFilt::prepreprocessing(const QString& pat, bool insens)
00061 {
00062 for (unsigned int p = 0; p < pat.length(); p++)
00063 {
00064 #ifdef _WINDOWS
00065 switch (pat.at(p).unicode())
00066 #else
00067 switch (pat[p].unicode())
00068 #endif
00069 {
00070 case '{':
00071 {
00072 break;
00073 }
00074 case '}':
00075 {
00076 break;
00077 }
00078 case '^':
00079 {
00080 break;
00081 }
00082 case '.' :
00083 {
00084 break;
00085 }
00086 case '#':
00087 {
00088 p++;
00089 #ifdef _WINDOWS
00090 while ('0' <= pat.at(p).unicode() && pat.at(p).unicode() <= '9')
00091 #else
00092 while ('0' <= pat[p].unicode() && pat[p].unicode() <= '9')
00093 #endif
00094 {
00095 }
00096 p--;
00097 break;
00098 }
00099 case '\\' :
00100 {
00101 #ifdef _WINDOWS
00102 tchar c = escapedchar(pat.at(++p).unicode());
00103 #else
00104 tchar c = escapedchar(pat[++p].unicode());
00105 #endif
00106 regchar(c, insens);
00107 break;
00108 }
00109
00110 case '[' :
00111 {
00112 tchar clast;
00113 bool invert = false;
00114 tchar c;
00115 #ifdef _WINDOWS
00116 if (pat.at(p+1).unicode() == '^')
00117 #else
00118 if (pat[p+1].unicode() == '^')
00119 #endif
00120 {
00121 p++;
00122 invert = true;
00123 }
00124 #ifdef _WINDOWS
00125 while ((c = pat.at(++p).unicode()) != ']')
00126 #else
00127 while ((c = pat[++p].unicode()) != ']')
00128 #endif
00129 {
00130 if (c == '\\')
00131 {
00132 #ifdef _WINDOWS
00133 c = escapedchar(pat.at(++p).unicode());
00134 #else
00135 c = escapedchar(pat[++p].unicode());
00136 #endif
00137 if (c == ']') break;
00138 }
00139 if (c == '-')
00140 {
00141 #ifdef _WINDOWS
00142 c = pat.at(++p).unicode();
00143 #else
00144 c = pat[++p].unicode();
00145 #endif
00146 for (tchar j = clast; j <= c; j++)
00147 {
00148 regchar(j, insens);
00149 }
00150 }
00151 else
00152 {
00153 regchar(c, insens);
00154 }
00155 clast = c;
00156 }
00157 break;
00158 }
00159 default :
00160 {
00161 #ifdef _WINDOWS
00162 regchar(pat.at(p).unicode(), insens);
00163 #else
00164 regchar(pat[p].unicode(), insens);
00165 #endif
00166 break;
00167 }
00168 }
00169 }
00170
00171
00172
00173
00174
00175
00176 CV[0] = 0;
00177 }
00178
00179 unsigned int CRegExpFilt::preprocessing(const QString& pat, bool insens)
00180 {
00181 prepreprocessing(pat, insens);
00182 qDebug("PrePreProcessing done");
00183 unsigned int p, m;
00184 bool inkeep = false;
00185 keep = 0;
00186 replace = 0;
00187 for (unsigned int j = 0; j < WORD_SIZE; j++)
00188 {
00189 bit[j] = (1 << (WORD_SIZE -j -1));
00190 lfcnt[j] = 0;
00191 }
00192
00193 for (p = 0, m = 0; p < pat.length(); p++)
00194 {
00195 qDebug("m is %u", m);
00196 if (inkeep) keep |= bit[m];
00197 #ifdef _WINDOWS
00198 switch (pat.at(p).unicode())
00199 #else
00200 switch (pat[p].unicode())
00201 #endif
00202 {
00203 case '{':
00204 {
00205 inkeep = true;
00206 break;
00207 }
00208 case '}':
00209 {
00210 keep ^= bit[m];
00211 inkeep = false;
00212 break;
00213 }
00214 case '^':
00215 {
00216 replace |= bit[m];
00217 lfcnt[m]++;
00218 break;
00219 }
00220 case '.' :
00221 {
00222 for (iter j = CV.begin(); j != CV.end(); ++j) CV[j.first()] |= bit[m];
00223 m++;
00224 break;
00225 }
00226 case '#':
00227 {
00228 if (m > 0)
00229 {
00230 p++;
00231 int count = 0;
00232 #ifdef _WINDOWS
00233 while ('0' <= pat.at(p).unicode() && pat.at(p).unicode() <= '9')
00234 #else
00235 while ('0' <= pat[p].unicode() && pat[p].unicode() <= '9')
00236 #endif
00237 {
00238 #ifdef _WINDOWS
00239 count = 10*count + pat.at(p++).unicode() - '0';
00240 #else
00241 count = 10*count + pat[p++].unicode() - '0';
00242 #endif
00243 }
00244 p--;
00245 count = count-1;
00246 unsigned int mask = 0;
00247 for (unsigned int i = m; i < m+count; i++)
00248 {
00249 mask |= bit[i];
00250 }
00251
00252 for (iter it = CV.begin(); it != CV.end(); ++it)
00253 {
00254 if (CV[it.first()] & bit[m-1])
00255 {
00256 CV[it.first()] |= mask;
00257 }
00258 }
00259 if (keep & bit[m-1]) keep |= mask;
00260 m += count;
00261 }
00262 else
00263 {
00264 p++;
00265 }
00266 break;
00267 }
00268 case '\\' :
00269 {
00270 #ifdef _WINDOWS
00271 tchar c = escapedchar(pat.at(++p).unicode());
00272 #else
00273 tchar c = escapedchar(pat[++p].unicode());
00274 #endif
00275 if (insens)
00276 {
00277 CV[upper(c)] |= bit[m];
00278 CV[lower(c)] |= bit[m];
00279 }
00280 else
00281 {
00282 CV[c] |= bit[m];
00283 }
00284 m++;
00285 break;
00286 }
00287
00288 case '[' :
00289 {
00290 tchar c, clast;
00291 bool invert = false;
00292 #ifdef _WINDOWS
00293 if (pat.at(p+1).unicode() == '^')
00294 #else
00295 if (pat[p+1].unicode() == '^')
00296 #endif
00297 {
00298 p++;
00299 invert = true;
00300 }
00301 #ifdef _WINDOWS
00302 while ((c = pat.at(++p).unicode()) != ']')
00303 #else
00304 while ((c = pat[++p].unicode()) != ']')
00305 #endif
00306 {
00307 if (c == '\\')
00308 {
00309 #ifdef _WINDOWS
00310 c = escapedchar(pat.at(++p).unicode());
00311 #else
00312 c = escapedchar(pat[++p].unicode());
00313 #endif
00314 if (c == ']') break;
00315 }
00316 if (c == '-')
00317 {
00318 #ifdef _WINDOWS
00319 c = pat.at(++p).unicode();
00320 #else
00321 c = pat[++p].unicode();
00322 #endif
00323 for (tchar j = clast; j <= c; j++)
00324 {
00325 if (insens)
00326 {
00327 iter it;
00328 if ((it = CV.find(upper(j))) != CV.end())
00329 CV[it] |= bit[m];
00330 else
00331 CV[0] |= bit[m];
00332 if ((it = CV.find(lower(j))) != CV.end())
00333 CV[it] |= bit[m];
00334 else
00335 CV[0] |= bit[m];
00336 }
00337 else
00338 {
00339 iter it;
00340 if ((it = CV.find(j)) != CV.end())
00341 CV[it] |= bit[m];
00342 else
00343 {
00344 CV[0] |= bit[m];
00345 }
00346 }
00347 }
00348 }
00349 else
00350 {
00351 if (insens)
00352 {
00353 iter it;
00354 if ((it = CV.find(upper(c))) != CV.end())
00355 CV[it] |= bit[m];
00356 else
00357 CV[0] |= bit[m];
00358 if ((it = CV.find(lower(c))) != CV.end())
00359 CV[it] |= bit[m];
00360 else
00361 CV[0] |= bit[m];
00362 }
00363 else
00364 {
00365 iter it;
00366 if ((it = CV.find(c)) != CV.end())
00367 CV[it] |= bit[m];
00368 else
00369 CV[0] |= bit[m];
00370 }
00371 }
00372 clast = c;
00373 }
00374 if (invert)
00375 {
00376 for (iter i = CV.begin(); i != CV.end(); ++i)
00377 {
00378 CV[i.first()] ^= bit[m];
00379 }
00380 }
00381 m++;
00382 break;
00383 }
00384 default :
00385 {
00386 #ifdef _WINDOWS
00387 tchar c = pat.at(p).unicode();
00388 #else
00389 tchar c = pat[p].unicode();
00390 #endif
00391 if (insens)
00392 {
00393 CV[upper(c)] |= bit[m];
00394 CV[lower(c)] |= bit[m];
00395 }
00396 else CV[c] |= bit[m];
00397 m++;
00398 break;
00399 }
00400 }
00401 }
00402 qDebug("Returning:%u",m);
00403 return m;
00404 }
00405
00406 bool CRegExpFilt::empty()
00407 {
00408 return m_outQueue.empty();
00409 }
00410
00411 tchar CRegExpFilt::pop()
00412 {
00413 return m_outQueue.pop();
00414 }
00415
00416 bool CRegExpFilt::addch(tchar ch)
00417 {
00418 word[cur] = ch;
00419 cur = (cur+1)%patlength;
00420 if (len < patlength) len++;
00421
00422 unsigned int cv = 0;
00423 iter it;
00424 if ((it = CV.find(ch)) == CV.end())
00425 {
00426 cv = CV[0];
00427 }
00428 else
00429 {
00430 cv = CV[it];
00431 }
00432
00433 R = ((R >> 1) | bit_0) & cv;
00434 if (R & endpos)
00435 {
00436 for (unsigned int i = 0; i < patlength; i++)
00437 {
00438 if (replace & bit[i])
00439 {
00440 for (unsigned int j = 0; j < lfcnt[i]; j++)
00441 {
00442 m_outQueue.push(10);
00443 }
00444 }
00445 if (keep & bit[i])
00446 {
00447 m_outQueue.push(word[(cur+i)%patlength]);
00448
00449
00450 }
00451 len = 0;
00452 }
00453 return true;
00454 }
00455 else
00456 {
00457 if (len == patlength)
00458 {
00459 tchar ch = word[cur];
00460 if (ch == 10) ch = ' ';
00461 m_outQueue.push(ch);
00462 }
00463 return false;
00464 }
00465 }
00466
00467 void CRegExpFilt::restart()
00468 {
00469 R = 0;
00470 len = 0;
00471 }
00472
00473 CRegExpFilt::CRegExpFilt(const QString& pat, bool insensflag) : CV(300)
00474 {
00475 cur = 0;
00476 patlength = preprocessing(pat, insensflag);
00477 qDebug("Preprocesing done:%u", patlength);
00478 endpos = bit[patlength-1];
00479 bit_0 = bit[0];
00480
00481 restart();
00482
00483 qDebug("Pattern: %s:%u", (const char*)pat, patlength);
00484
00485 }
00486
00487
00488 CRegExpFilt::~CRegExpFilt()
00489 {
00490 }
00491
00492 #ifdef NOWAYISTHISDEFINED
00493 void reportmatch(tchar *line, unsigned int mtype, unsigned int lino)
00494 {
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506 }
00507
00508
// Print the one-line command synopsis of the test driver to standard output.
void usage(void)
{
    fputs("Usage: CRegExpFilt [-i] pattern/a file\n", stdout);
}
00513
00514 int getline(tchar *s,int lim,FILE *f)
00515 {
00516 int c, i;
00517
00518 for (i = 0; i < lim-1 && (c = getc(f)) != EOF && c != '\n'; )
00519 {
00520 s[i++] = (tchar)c;
00521 }
00522 s[i] = '\0';
00523 return ((c == EOF && i == 0) ? -1 : i);
00524 }
00525
00526 #define BUF_SIZE 256
00527
00528 int main(int argc, char **argv)
00529 {
00530 unsigned int lino = 0;
00531 unsigned int blino = 0;
00532 bool insens = false;
00533 int len;
00534 tchar line[BUF_SIZE];
00535 FILE *inf;
00536
00537
00538 if (argc < 3)
00539 {usage(); return 10; }
00540
00541 for (len = 1; len < argc-2; len++)
00542 {
00543 if (argv[len][0] != '-')
00544 {usage(); return 10; }
00545 else switch (argv[len][1])
00546 {
00547 case 'i' :
00548 {
00549 insens = true;
00550 break;
00551 }
00552 default :
00553 {usage(); return 10;}
00554 }
00555 }
00556
00557 tchar* pattern = new tchar[strlen(argv[argc-2])+1];
00558
00559 for (int i = 0; (pattern[i] = argv[argc-2][i]) != 0; i++);
00560
00561
00562
00563 CRegExpFilt test(pattern, insens);
00564
00565 delete [] pattern;
00566
00567 inf = fopen(argv[argc-1], "r");
00568 if (!inf)
00569 {
00570 printf("file not found\n");
00571 return 10;
00572 }
00573
00574 while ((len = getline(line, BUF_SIZE, inf)) >= 0)
00575 {
00576 lino++;
00577 bool ret = false;
00578
00579 {
00580 tchar *textend = line+len;
00581 tchar *text = line;
00582 while (text < textend)
00583 {
00584 ret |= test.addch(*text++);
00585 }
00586 ret |= test.addch('\n');
00587 while (!test.empty())
00588 {
00589 putchar(test.pop());
00590 }
00591 }
00592
00593 if (ret) reportmatch(line, 0, lino);
00594 }
00595 fclose(inf);
00596
00597 return 0;
00598 }
00599 #endif