MultiOnDiskHashTable.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file provides a hash table data structure suitable for incremental and
  11. // distributed storage across a set of files.
  12. //
  13. // Multiple hash tables from different files are implicitly merged to improve
  14. // performance, and on reload the merged table will override those from other
  15. // files.
  16. //
  17. //===----------------------------------------------------------------------===//
  18. #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
  19. #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
  20. #include "llvm/ADT/DenseMap.h"
  21. #include "llvm/ADT/DenseSet.h"
  22. #include "llvm/ADT/PointerUnion.h"
  23. #include "llvm/ADT/STLExtras.h"
  24. #include "llvm/ADT/SmallVector.h"
  25. #include "llvm/ADT/TinyPtrVector.h"
  26. #include "llvm/ADT/iterator_range.h"
  27. #include "llvm/Support/Endian.h"
  28. #include "llvm/Support/EndianStream.h"
  29. #include "llvm/Support/OnDiskHashTable.h"
  30. #include "llvm/Support/raw_ostream.h"
  31. #include <algorithm>
  32. #include <cstdint>
  33. #include <vector>
  34. namespace clang {
  35. namespace serialization {
  36. /// \brief A collection of on-disk hash tables, merged when relevant for performance.
  37. template<typename Info> class MultiOnDiskHashTable {
  38. public:
  39. /// A handle to a file, used when overriding tables.
  40. using file_type = typename Info::file_type;
  41. /// A pointer to an on-disk representation of the hash table.
  42. using storage_type = const unsigned char *;
  43. using external_key_type = typename Info::external_key_type;
  44. using internal_key_type = typename Info::internal_key_type;
  45. using data_type = typename Info::data_type;
  46. using data_type_builder = typename Info::data_type_builder;
  47. using hash_value_type = unsigned;
  48. private:
  49. /// The generator is permitted to read our merged table.
  50. template<typename ReaderInfo, typename WriterInfo>
  51. friend class MultiOnDiskHashTableGenerator;
  52. /// \brief A hash table stored on disk.
  53. struct OnDiskTable {
  54. using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
  55. file_type File;
  56. HashTable Table;
  57. OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
  58. storage_type Buckets, storage_type Payload, storage_type Base,
  59. const Info &InfoObj)
  60. : File(File),
  61. Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
  62. };
  63. struct MergedTable {
  64. std::vector<file_type> Files;
  65. llvm::DenseMap<internal_key_type, data_type> Data;
  66. };
  67. using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
  68. using TableVector = llvm::TinyPtrVector<void *>;
  69. /// \brief The current set of on-disk and merged tables.
  70. /// We manually store the opaque value of the Table because TinyPtrVector
  71. /// can't cope with holding a PointerUnion directly.
  72. /// There can be at most one MergedTable in this vector, and if present,
  73. /// it is the first table.
  74. TableVector Tables;
  75. /// \brief Files corresponding to overridden tables that we've not yet
  76. /// discarded.
  77. llvm::TinyPtrVector<file_type> PendingOverrides;
  78. struct AsOnDiskTable {
  79. using result_type = OnDiskTable *;
  80. result_type operator()(void *P) const {
  81. return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
  82. }
  83. };
  84. using table_iterator =
  85. llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
  86. using table_range = llvm::iterator_range<table_iterator>;
  87. /// \brief The current set of on-disk tables.
  88. table_range tables() {
  89. auto Begin = Tables.begin(), End = Tables.end();
  90. if (getMergedTable())
  91. ++Begin;
  92. return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
  93. llvm::map_iterator(End, AsOnDiskTable()));
  94. }
  95. MergedTable *getMergedTable() const {
  96. // If we already have a merged table, it's the first one.
  97. return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
  98. .template dyn_cast<MergedTable*>();
  99. }
  100. /// \brief Delete all our current on-disk tables.
  101. void clear() {
  102. for (auto *T : tables())
  103. delete T;
  104. if (auto *M = getMergedTable())
  105. delete M;
  106. Tables.clear();
  107. }
  108. void removeOverriddenTables() {
  109. llvm::DenseSet<file_type> Files;
  110. Files.insert(PendingOverrides.begin(), PendingOverrides.end());
  111. // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
  112. auto ShouldRemove = [&Files](void *T) -> bool {
  113. auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
  114. bool Remove = Files.count(ODT->File);
  115. if (Remove)
  116. delete ODT;
  117. return Remove;
  118. };
  119. Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
  120. ShouldRemove),
  121. Tables.end());
  122. PendingOverrides.clear();
  123. }
  124. void condense() {
  125. MergedTable *Merged = getMergedTable();
  126. if (!Merged)
  127. Merged = new MergedTable;
  128. // Read in all the tables and merge them together.
  129. // FIXME: Be smarter about which tables we merge.
  130. for (auto *ODT : tables()) {
  131. auto &HT = ODT->Table;
  132. Info &InfoObj = HT.getInfoObj();
  133. for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
  134. auto *LocalPtr = I.getItem();
  135. // FIXME: Don't rely on the OnDiskHashTable format here.
  136. auto L = InfoObj.ReadKeyDataLength(LocalPtr);
  137. const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
  138. data_type_builder ValueBuilder(Merged->Data[Key]);
  139. InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
  140. ValueBuilder);
  141. }
  142. Merged->Files.push_back(ODT->File);
  143. delete ODT;
  144. }
  145. Tables.clear();
  146. Tables.push_back(Table(Merged).getOpaqueValue());
  147. }
  148. public:
  149. MultiOnDiskHashTable() = default;
  150. MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
  151. : Tables(std::move(O.Tables)),
  152. PendingOverrides(std::move(O.PendingOverrides)) {
  153. O.Tables.clear();
  154. }
  155. MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
  156. if (&O == this)
  157. return *this;
  158. clear();
  159. Tables = std::move(O.Tables);
  160. O.Tables.clear();
  161. PendingOverrides = std::move(O.PendingOverrides);
  162. return *this;
  163. }
  164. ~MultiOnDiskHashTable() { clear(); }
  165. /// \brief Add the table \p Data loaded from file \p File.
  166. void add(file_type File, storage_type Data, Info InfoObj = Info()) {
  167. using namespace llvm::support;
  168. storage_type Ptr = Data;
  169. uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
  170. // Read the list of overridden files.
  171. uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
  172. // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
  173. // an additional copy.
  174. llvm::SmallVector<file_type, 16> OverriddenFiles;
  175. OverriddenFiles.reserve(NumFiles);
  176. for (/**/; NumFiles != 0; --NumFiles)
  177. OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
  178. PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
  179. OverriddenFiles.end());
  180. // Read the OnDiskChainedHashTable header.
  181. storage_type Buckets = Data + BucketOffset;
  182. auto NumBucketsAndEntries =
  183. OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
  184. // Register the table.
  185. Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
  186. NumBucketsAndEntries.second,
  187. Buckets, Ptr, Data, std::move(InfoObj));
  188. Tables.push_back(NewTable.getOpaqueValue());
  189. }
  190. /// \brief Find and read the lookup results for \p EKey.
  191. data_type find(const external_key_type &EKey) {
  192. data_type Result;
  193. if (!PendingOverrides.empty())
  194. removeOverriddenTables();
  195. if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
  196. condense();
  197. internal_key_type Key = Info::GetInternalKey(EKey);
  198. auto KeyHash = Info::ComputeHash(Key);
  199. if (MergedTable *M = getMergedTable()) {
  200. auto It = M->Data.find(Key);
  201. if (It != M->Data.end())
  202. Result = It->second;
  203. }
  204. data_type_builder ResultBuilder(Result);
  205. for (auto *ODT : tables()) {
  206. auto &HT = ODT->Table;
  207. auto It = HT.find_hashed(Key, KeyHash);
  208. if (It != HT.end())
  209. HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
  210. ResultBuilder);
  211. }
  212. return Result;
  213. }
  214. /// \brief Read all the lookup results into a single value. This only makes
  215. /// sense if merging values across keys is meaningful.
  216. data_type findAll() {
  217. data_type Result;
  218. data_type_builder ResultBuilder(Result);
  219. if (!PendingOverrides.empty())
  220. removeOverriddenTables();
  221. if (MergedTable *M = getMergedTable()) {
  222. for (auto &KV : M->Data)
  223. Info::MergeDataInto(KV.second, ResultBuilder);
  224. }
  225. for (auto *ODT : tables()) {
  226. auto &HT = ODT->Table;
  227. Info &InfoObj = HT.getInfoObj();
  228. for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
  229. auto *LocalPtr = I.getItem();
  230. // FIXME: Don't rely on the OnDiskHashTable format here.
  231. auto L = InfoObj.ReadKeyDataLength(LocalPtr);
  232. const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
  233. InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
  234. }
  235. }
  236. return Result;
  237. }
  238. };
  239. /// \brief Writer for the on-disk hash table.
  240. template<typename ReaderInfo, typename WriterInfo>
  241. class MultiOnDiskHashTableGenerator {
  242. using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
  243. using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
  244. Generator Gen;
  245. public:
  246. MultiOnDiskHashTableGenerator() : Gen() {}
  247. void insert(typename WriterInfo::key_type_ref Key,
  248. typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
  249. Gen.insert(Key, Data, Info);
  250. }
  251. void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
  252. const BaseTable *Base) {
  253. using namespace llvm::support;
  254. llvm::raw_svector_ostream OutStream(Out);
  255. // Write our header information.
  256. {
  257. endian::Writer<little> Writer(OutStream);
  258. // Reserve four bytes for the bucket offset.
  259. Writer.write<uint32_t>(0);
  260. if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
  261. // Write list of overridden files.
  262. Writer.write<uint32_t>(Merged->Files.size());
  263. for (const auto &F : Merged->Files)
  264. Info.EmitFileRef(OutStream, F);
  265. // Add all merged entries from Base to the generator.
  266. for (auto &KV : Merged->Data) {
  267. if (!Gen.contains(KV.first, Info))
  268. Gen.insert(KV.first, Info.ImportData(KV.second), Info);
  269. }
  270. } else {
  271. Writer.write<uint32_t>(0);
  272. }
  273. }
  274. // Write the table itself.
  275. uint32_t BucketOffset = Gen.Emit(OutStream, Info);
  276. // Replace the first four bytes with the bucket offset.
  277. endian::write32le(Out.data(), BucketOffset);
  278. }
  279. };
  280. } // namespace serialization
  281. } // namespace clang
  282. #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H