Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

CharCodeToUnicode.cc

Go to the documentation of this file.
00001 //========================================================================
00002 //
00003 // CharCodeToUnicode.cc
00004 //
00005 // Copyright 2001-2002 Glyph & Cog, LLC
00006 //
00007 //========================================================================
00008 
00009 #ifdef __GNUC__
00010 #pragma implementation
00011 #endif
00012 
00013 #include <aconf.h>
00014 #include <stdio.h>
00015 #include <string.h>
00016 #include "gmem.h"
00017 #include "gfile.h"
00018 #include "GString.h"
00019 #include "Error.h"
00020 #include "GlobalParams.h"
00021 #include "PSTokenizer.h"
00022 #include "CharCodeToUnicode.h"
00023 
00024 //------------------------------------------------------------------------
00025 
00026 #define maxUnicodeString 8
00027 
00028 struct CharCodeToUnicodeString {
00029   CharCode c;
00030   Unicode u[maxUnicodeString];
00031   int len;
00032 };
00033 
00034 //------------------------------------------------------------------------
00035 
// Character source that reads from a NUL-terminated string; it is handed to
// PSTokenizer (via parseCMap1) as the getCharFunc callback.
//
// data points to a `char *` cursor.  Each call returns the byte under the
// cursor and advances the cursor by one; once the terminating NUL is reached
// the cursor stays put and EOF is returned on every call.
static int getCharFromString(void *data) {
  char **cursor = (char **)data;
  char *p = *cursor;

  if (!*p) {
    return EOF;
  }
  *cursor = p + 1;
  // Go through unsigned char so that bytes >= 0x80 come back as 128..255
  // (the same convention fgetc() uses).  Widening a plain, possibly signed,
  // char would turn 0xFF into -1, which is indistinguishable from EOF.
  return (unsigned char)*p;
}
00049 
// Character source that reads from a stdio stream; data is the FILE * to
// read from.  Returns whatever fgetc() returns for that stream.
static int getCharFromFile(void *data) {
  FILE *stream = (FILE *)data;
  int ch = fgetc(stream);
  return ch;
}
00053 
00054 //------------------------------------------------------------------------
00055 
00056 CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(GString *collectionA) {
00057   FILE *f;
00058   Unicode *mapA;
00059   CharCode size, mapLenA;
00060   char buf[64];
00061   Unicode u;
00062   CharCodeToUnicode *ctu;
00063 
00064   if (!(f = globalParams->getCIDToUnicodeFile(collectionA))) {
00065     error(-1, "Couldn't find cidToUnicode file for the '%s' collection",
00066           collectionA->getCString());
00067     return NULL;
00068   }
00069 
00070   size = 32768;
00071   mapA = (Unicode *)gmalloc(size * sizeof(Unicode));
00072   mapLenA = 0;
00073 
00074   while (getLine(buf, sizeof(buf), f)) {
00075     if (mapLenA == size) {
00076       size *= 2;
00077       mapA = (Unicode *)grealloc(mapA, size * sizeof(Unicode));
00078     }
00079     if (sscanf(buf, "%x", &u) == 1) {
00080       mapA[mapLenA] = u;
00081     } else {
00082       error(-1, "Bad line (%d) in cidToUnicode file for the '%s' collection",
00083             (int)(mapLenA + 1), collectionA->getCString());
00084       mapA[mapLenA] = 0;
00085     }
00086     ++mapLenA;
00087   }
00088 
00089   ctu = new CharCodeToUnicode(collectionA->copy(), mapA, mapLenA, gTrue,
00090                               NULL, 0);
00091   gfree(mapA);
00092   return ctu;
00093 }
00094 
00095 CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) {
00096   return new CharCodeToUnicode(NULL, toUnicode, 256, gTrue, NULL, 0);
00097 }
00098 
00099 CharCodeToUnicode *CharCodeToUnicode::parseCMap(GString *buf, int nBits) {
00100   CharCodeToUnicode *ctu;
00101   char *p;
00102 
00103   ctu = new CharCodeToUnicode(NULL);
00104   p = buf->getCString();
00105   ctu->parseCMap1(&getCharFromString, &p, nBits);
00106   return ctu;
00107 }
00108 
00109 void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data,
00110                                    int nBits) {
00111   PSTokenizer *pst;
00112   char tok1[256], tok2[256], tok3[256];
00113   int nDigits, n1, n2, n3;
00114   CharCode oldLen, i;
00115   CharCode code1, code2;
00116   Unicode u;
00117   char uHex[5];
00118   int j;
00119   GString *name;
00120   FILE *f;
00121 
00122   nDigits = nBits / 4;
00123   pst = new PSTokenizer(getCharFunc, data);
00124   pst->getToken(tok1, sizeof(tok1), &n1);
00125   while (pst->getToken(tok2, sizeof(tok2), &n2)) {
00126     if (!strcmp(tok2, "usecmap")) {
00127       if (tok1[0] == '/') {
00128         name = new GString(tok1 + 1);
00129         if ((f = globalParams->findToUnicodeFile(name))) {
00130           parseCMap1(&getCharFromFile, f, nBits);
00131           fclose(f);
00132         } else {
00133           error(-1, "Couldn't find ToUnicode CMap file for '%s'",
00134                 name->getCString());
00135         }
00136         delete name;
00137       }
00138       pst->getToken(tok1, sizeof(tok1), &n1);
00139     } else if (!strcmp(tok2, "beginbfchar")) {
00140       while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00141         if (!strcmp(tok1, "endbfchar")) {
00142           break;
00143         }
00144         if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00145             !strcmp(tok2, "endbfchar")) {
00146           error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00147           break;
00148         }
00149         if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00150               tok2[0] == '<' && tok2[n2 - 1] == '>')) {
00151           error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00152           continue;
00153         }
00154         tok1[n1 - 1] = tok2[n2 - 1] = '\0';
00155         if (sscanf(tok1 + 1, "%x", &code1) != 1) {
00156           error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00157           continue;
00158         }
00159         if (code1 >= mapLen) {
00160           oldLen = mapLen;
00161           mapLen = (code1 + 256) & ~255;
00162           map = (Unicode *)grealloc(map, mapLen * sizeof(Unicode));
00163           for (i = oldLen; i < mapLen; ++i) {
00164             map[i] = 0;
00165           }
00166         }
00167         if (n2 == 6) {
00168           if (sscanf(tok2 + 1, "%x", &u) != 1) {
00169             error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00170             continue;
00171           }
00172           map[code1] = u;
00173         } else {
00174           map[code1] = 0;
00175           if (sMapLen == sMapSize) {
00176             sMapSize += 8;
00177             sMap = (CharCodeToUnicodeString *)
00178                 grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00179           }
00180           sMap[sMapLen].c = code1;
00181           sMap[sMapLen].len = (n2 - 2) / 4;
00182           for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) {
00183             strncpy(uHex, tok2 + 1 + j*4, 4);
00184             uHex[4] = '\0';
00185             if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
00186               error(-1, "Illegal entry in bfchar block in ToUnicode CMap");
00187             }
00188           }
00189           ++sMapLen;
00190         }
00191       }
00192       pst->getToken(tok1, sizeof(tok1), &n1);
00193     } else if (!strcmp(tok2, "beginbfrange")) {
00194       while (pst->getToken(tok1, sizeof(tok1), &n1)) {
00195         if (!strcmp(tok1, "endbfrange")) {
00196           break;
00197         }
00198         if (!pst->getToken(tok2, sizeof(tok2), &n2) ||
00199             !strcmp(tok2, "endbfrange") ||
00200             !pst->getToken(tok3, sizeof(tok3), &n3) ||
00201             !strcmp(tok3, "endbfrange")) {
00202           error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00203           break;
00204         }
00205         if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' &&
00206               n2 == 2 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>' &&
00207               tok3[0] == '<' && tok3[n3 - 1] == '>')) {
00208           error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00209           continue;
00210         }
00211         tok1[n1 - 1] = tok2[n2 - 1] = tok3[n3 - 1] = '\0';
00212         if (sscanf(tok1 + 1, "%x", &code1) != 1 ||
00213             sscanf(tok2 + 1, "%x", &code2) != 1) {
00214           error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00215           continue;
00216         }
00217         if (code2 >= mapLen) {
00218           oldLen = mapLen;
00219           mapLen = (code2 + 256) & ~255;
00220           map = (Unicode *)grealloc(map, mapLen * sizeof(Unicode));
00221           for (i = oldLen; i < mapLen; ++i) {
00222             map[i] = 0;
00223           }
00224         }
00225         if (n3 == 6) {
00226           if (sscanf(tok3 + 1, "%x", &u) != 1) {
00227             error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00228             continue;
00229           }
00230           for (; code1 <= code2; ++code1) {
00231             map[code1] = u++;
00232           }
00233         } else {
00234           if (sMapLen + (int)(code2 - code1 + 1) > sMapSize) {
00235             sMapSize = (sMapSize + (code2 - code1 + 1) + 7) & ~7;
00236             sMap = (CharCodeToUnicodeString *)
00237                 grealloc(sMap, sMapSize * sizeof(CharCodeToUnicodeString));
00238           }
00239           for (i = 0; code1 <= code2; ++code1, ++i) {
00240             map[code1] = 0;
00241             sMap[sMapLen].c = code1;
00242             sMap[sMapLen].len = (n3 - 2) / 4;
00243             for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) {
00244               strncpy(uHex, tok3 + 1 + j*4, 4);
00245               uHex[4] = '\0';
00246               if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) {
00247                 error(-1, "Illegal entry in bfrange block in ToUnicode CMap");
00248               }
00249             }
00250             sMap[sMapLen].u[sMap[sMapLen].len - 1] += i;
00251             ++sMapLen;
00252           }
00253         }
00254       }
00255       pst->getToken(tok1, sizeof(tok1), &n1);
00256     } else {
00257       strcpy(tok1, tok2);
00258     }
00259   }
00260   delete pst;
00261 }
00262 
00263 CharCodeToUnicode::CharCodeToUnicode(GString *collectionA) {
00264   CharCode i;
00265 
00266   collection = collectionA;
00267   mapLen = 256;
00268   map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00269   for (i = 0; i < mapLen; ++i) {
00270     map[i] = 0;
00271   }
00272   sMap = NULL;
00273   sMapLen = sMapSize = 0;
00274   refCnt = 1;
00275 }
00276 
00277 CharCodeToUnicode::CharCodeToUnicode(GString *collectionA, Unicode *mapA,
00278                                      CharCode mapLenA, GBool copyMap,
00279                                      CharCodeToUnicodeString *sMapA,
00280                                      int sMapLenA) {
00281   collection = collectionA;
00282   mapLen = mapLenA;
00283   if (copyMap) {
00284     map = (Unicode *)gmalloc(mapLen * sizeof(Unicode));
00285     memcpy(map, mapA, mapLen * sizeof(Unicode));
00286   } else {
00287     map = mapA;
00288   }
00289   sMap = sMapA;
00290   sMapLen = sMapSize = sMapLenA;
00291   refCnt = 1;
00292 }
00293 
00294 CharCodeToUnicode::~CharCodeToUnicode() {
00295   if (collection) {
00296     delete collection;
00297   }
00298   gfree(map);
00299   if (sMap) {
00300     gfree(sMap);
00301   }
00302 }
00303 
00304 void CharCodeToUnicode::incRefCnt() {
00305   ++refCnt;
00306 }
00307 
00308 void CharCodeToUnicode::decRefCnt() {
00309   if (--refCnt == 0) {
00310     delete this;
00311   }
00312 }
00313 
00314 GBool CharCodeToUnicode::match(GString *collectionA) {
00315   return collection && !collection->cmp(collectionA);
00316 }
00317 
00318 int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode *u, int size) {
00319   int i, j;
00320 
00321   if (c >= mapLen) {
00322     return 0;
00323   }
00324   if (map[c]) {
00325     u[0] = map[c];
00326     return 1;
00327   }
00328   for (i = 0; i < sMapLen; ++i) {
00329     if (sMap[i].c == c) {
00330       for (j = 0; j < sMap[i].len && j < size; ++j) {
00331         u[j] = sMap[i].u[j];
00332       }
00333       return j;
00334     }
00335   }
00336   return 0;
00337 }
00338 
00339 //------------------------------------------------------------------------
00340 
00341 CIDToUnicodeCache::CIDToUnicodeCache() {
00342   int i;
00343 
00344   for (i = 0; i < cidToUnicodeCacheSize; ++i) {
00345     cache[i] = NULL;
00346   }
00347 }
00348 
00349 CIDToUnicodeCache::~CIDToUnicodeCache() {
00350   int i;
00351 
00352   for (i = 0; i < cidToUnicodeCacheSize; ++i) {
00353     if (cache[i]) {
00354       cache[i]->decRefCnt();
00355     }
00356   }
00357 }
00358 
00359 CharCodeToUnicode *CIDToUnicodeCache::getCIDToUnicode(GString *collection) {
00360   CharCodeToUnicode *ctu;
00361   int i, j;
00362 
00363   if (cache[0] && cache[0]->match(collection)) {
00364     cache[0]->incRefCnt();
00365     return cache[0];
00366   }
00367   for (i = 1; i < cidToUnicodeCacheSize; ++i) {
00368     if (cache[i] && cache[i]->match(collection)) {
00369       ctu = cache[i];
00370       for (j = i; j >= 1; --j) {
00371         cache[j] = cache[j - 1];
00372       }
00373       cache[0] = ctu;
00374       ctu->incRefCnt();
00375       return ctu;
00376     }
00377   }
00378   if ((ctu = CharCodeToUnicode::parseCIDToUnicode(collection))) {
00379     if (cache[cidToUnicodeCacheSize - 1]) {
00380       cache[cidToUnicodeCacheSize - 1]->decRefCnt();
00381     }
00382     for (j = cidToUnicodeCacheSize - 1; j >= 1; --j) {
00383       cache[j] = cache[j - 1];
00384     }
00385     cache[0] = ctu;
00386     ctu->incRefCnt();
00387     return ctu;
00388   }
00389   return NULL;
00390 }

Generated on Sat Nov 5 16:18:13 2005 for OPIE by  doxygen 1.4.2