00001 // AlphabetMap.h 00002 // 00004 // 00005 // Copyright (c) 2002 Iain Murray 00006 // 00008 00009 00010 /* 00011 If I were just using GCC, which comes with the CGI "STL" implementation, I would 00012 use hash_map (which isn't part of the ANSI/ISO standard C++ STL, but hey it's nice). 00013 Using a plain map is just too slow for training on large files (or it is with certain 00014 STL implementations). I'm sure training could be made much faster still, but that's 00015 another matter... 00016 00017 While I could (and probably should) get a hash_map for VC++ from 00018 http://www.stlport.org I thought it would be nicer if people didn't have 00019 to download extra stuff and then have to get it working alongside the STL 00020 with VC++, especially for just one small part of Dasher. 00021 00022 The result is this: 00023 *************************************************** 00024 very much thrown together to get Dasher out ASAP. 00025 *************************************************** 00026 It is deliberately not like an STL container. 00027 However, as it has a tiny interface, it should still be easy to replace. 00028 Sorry if this seems really unprofressional. 00029 00030 Replacing it might be a good idea. On the other hand it could be customised 00031 to the needs of the alphabet, so that it works faster. For example, 00032 currently if I have a string "asdf", it might be that "a" is checked 00033 then "as" is checked then "asd" is checked. I shouldn't need to keep 00034 rehashing the leading characters. I plan to fix that here. Doing so with 00035 a standard hash_map would be hard. 00036 00037 00038 Usage: 00039 alphabet_map MyMap(NumberOfEntriesWeExpect); // Can omit NumberOfEntriesWeExpect 00040 MyMap.add("asdf", 15); 00041 symbol i = MyMap.get("asdf") // i=15 00042 symbol j = MyMap.get("fdsa") // j=0 00043 00044 You can't remove items once they are added as Dasher has no need for that. 00045 00046 IAM 08/2002 00047 */ 00048 00049 #ifndef __AlphabetMap_h__ 00050 #define __AlphabetMap_h__ 00051 00052 #include "MSVC_Unannoy.h" 00053 #include <vector> 00054 #include <string> 00055 00056 #include "DasherTypes.h" 00057 00058 namespace Dasher {class alphabet_map;} 00059 class Dasher::alphabet_map 00060 { 00061 public: 00062 alphabet_map(uint InitialTableSize=255); 00063 void Add(const std::string& Key, symbol Value); 00064 symbol Get(const std::string& Key, bool* KeyIsPrefix=0) const; 00065 00066 private: 00067 class Entry 00068 { 00069 public: 00070 Entry(std::string Key, symbol Symbol, Entry* Next) 00071 : Key(Key), KeyIsPrefix(false), Symbol(Symbol), Next(Next) {} 00072 00073 std::string Key; 00074 bool KeyIsPrefix; 00075 symbol Symbol; 00076 Entry* Next; 00077 }; 00078 00079 void RecursiveAdd(const std::string& Key, symbol Value, bool PrefixFlag); 00080 00081 // A standard hash -- could try and research something specific. 00082 inline uint Hash(const std::string& Input) const { 00083 uint Result = 0; 00084 00085 typedef std::string::const_iterator CI; 00086 CI Cur = Input.begin(); 00087 CI end = Input.end(); 00088 00089 while (Cur!=end) Result = (Result<<1)^*Cur++; 00090 Result %= HashTable.size(); 00091 00092 return Result; 00093 /* 00094 if (Input.size()==1) // Speedup for ASCII text 00095 return Input[0]; 00096 00097 for (int i=0; i<Input.size(); i++) 00098 Result = (Result<<1)^Input[i]; 00099 00100 00101 return Result%HashTable.size(); 00102 */ 00103 } 00104 00105 std::vector<Entry> Entries; 00106 std::vector<Entry*> HashTable; 00107 const symbol Undefined; 00108 }; 00109 00110 00111 #endif /* #ifndef __AlphabetMap_h__ */
1.4.2