diff --git a/src/App/StringHasher.cpp b/src/App/StringHasher.cpp index 107b6d8216..e9eb92d2bb 100644 --- a/src/App/StringHasher.cpp +++ b/src/App/StringHasher.cpp @@ -180,6 +180,23 @@ StringHasher::~StringHasher() clear(); } +StringHasher::StringHasher([[maybe_unused]] StringHasher &&other) noexcept +{ + // Private: unimplemented +} + +StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &other) +{ + // Private: unimplemented + return *this; +} + +StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &&other) noexcept +{ + // Private: unimplemented + return *this; +} + void StringHasher::setSaveAll(bool enable) { if (_hashes->SaveAll == enable) { @@ -195,23 +212,31 @@ void StringHasher::compact() return; } + // Make a list of all the table entries that have only a single reference and are not marked + // "persistent" std::deque pendings; for (auto& hasher : _hashes->right) { if (!hasher.second->isPersistent() && hasher.second->getRefCount() == 1) { pendings.emplace_back(hasher.second); } } + + // Recursively remove the unused StringIDs while (!pendings.empty()) { StringIDRef sid = pendings.front(); pendings.pop_front(); + // Try to erase the map entry for this StringID if (_hashes->right.erase(sid.value()) == 0U) { - continue; + continue;// If nothing was erased, there's nothing more to do } sid._sid->_hasher = nullptr; sid._sid->unref(); for (auto& hasher : sid._sid->_sids) { if (hasher._sid->_hasher == this && !hasher._sid->isPersistent() && hasher._sid->getRefCount() == 2) { + // If the related StringID also uses this hasher, is not marked persistent, and has + // a current reference count of 2 (which will be its hasher reference and its entry + // in the related SIDs list), then prep it for removal as well. pendings.push_back(hasher); } } @@ -332,10 +357,10 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector_hasher == this) { - id._sids.push_back(hasher); + newStringID._sids.push_back(hasher); } } } - if (id._sids.size() > 10) { - std::sort(id._sids.begin() + extra, id._sids.end()); - id._sids.erase(std::unique(id._sids.begin() + extra, id._sids.end()), id._sids.end()); + if (newStringID._sids.size() > 10) { + std::sort(newStringID._sids.begin() + extra, newStringID._sids.end()); + newStringID._sids.erase(std::unique(newStringID._sids.begin() + extra, newStringID._sids.end()), + newStringID._sids.end()); } - if ((id._postfix.size() != 0) && !indexed) { - StringID::IndexID res = StringID::fromString(id._data); - if (res.id > 0) { - int offset = id.isPostfixEncoded() ? 1 : 0; - for (int i = offset; i < id._sids.size(); ++i) { - if (id._sids[i].value() == res.id) { + // If the new StringID has a postfix, but is not indexed, see if the data string itself + // contains an index. + if ((newStringID._postfix.size() != 0) && !indexed) { + // Use the fromString function to parse the new StringID's data field for a possible index + StringID::IndexID res = StringID::fromString(newStringID._data); + if (res.id > 0) { // If the data had an index + int offset = newStringID.isPostfixEncoded() ? 1 : 0; + // Search for the SID with that index + for (int i = offset; i < newStringID._sids.size(); ++i) { + if (newStringID._sids[i].value() == res.id) { if (i != offset) { - std::swap(id._sids[offset], id._sids[i]); + // If this SID is not already the first element in sids, move it there by + // swapping it with + std::swap(newStringID._sids[offset], newStringID._sids[i]); } if (res.index != 0) { - id._flags.setFlag(StringID::Flag::PrefixIDIndex); + newStringID._flags.setFlag(StringID::Flag::PrefixIDIndex); } else { - id._flags.setFlag(StringID::Flag::PrefixID); + newStringID._flags.setFlag(StringID::Flag::PrefixID); } break; } diff --git a/src/App/StringHasher.h b/src/App/StringHasher.h index a28884d869..a5c8952061 100644 --- a/src/App/StringHasher.h +++ b/src/App/StringHasher.h @@ -308,6 +308,22 @@ private: StringHasher* _hasher = nullptr; mutable Flags _flags; mutable QVector _sids; + +private: + StringID([[maybe_unused]] const StringID& other) + : _id(0), + _flags(StringID::Flag::None) {}; + StringID([[maybe_unused]] StringID&& other) noexcept + : _id(0), + _flags(StringID::Flag::None) {}; + StringID& operator=([[maybe_unused]] const StringID& rhs) + { + return *this; + };// NOLINT + StringID& operator=([[maybe_unused]] StringID&& rhs) noexcept + { + return *this; + }; }; ////////////////////////////////////////////////////////////////////////// @@ -317,7 +333,6 @@ private: class StringIDRef { public: - /// Default construction results in an empty StringIDRef object: it will evaluate to boolean /// "false" if queried. StringIDRef() @@ -600,7 +615,7 @@ public: /// Used predominantly by the unit test code to verify that index is set correctly. In general /// user code should not need to call this function. - int getIndex() + int getIndex() const { return _index; } @@ -612,7 +627,17 @@ private: int _index; }; -/// A String table to map string from/to a unique integer + +/// \brief A bidirectional map of strings and their integer identifier. +/// +/// Maps an arbitrary text string to a unique integer ID, maintaining a reference-counted shared +/// pointer for each. This permits elimination of unused strings based on their reference +/// count. If a duplicate string is added, no additional copy is made, and a new reference to the +/// original storage is returned (incrementing the reference counter of the instance). +/// +/// If the string is longer than a given threshold, instead of storing the string, instead its +/// SHA1 hash is stored (and the original string discarded). This allows an upper threshold on the +/// length of a stored string, while still effectively guaranteeing uniqueness in the table. class AppExport StringHasher: public Base::Persistence, public Base::Handled { @@ -633,38 +658,40 @@ public: /** Maps an arbitrary string to an integer * * @param text: input string. - * @param len: length of the string, or -1 if the string is 0 terminated. - * @param hashable: whether the string is hashable. - * @return Return a shared pointer to the internally stored StringID. + * @param len: length of the string: optional if the string is null-terminated. + * @param hashable: whether hashing the string is permitted. + * @return A shared pointer to the internally-stored StringID. * - * The function maps an arbitrary text string to a unique integer ID, which - * is returned as a shared pointer to reference count the ID so that it is - * possible to prune any unused strings. + * Maps an arbitrary text string to a unique integer ID, returning a reference-counted shared + * pointer to the StringID. This permits elimination of unused strings based on their reference + * count. If a duplicate string is added, no additional copy is made, and a new reference to the + * original storage is returned (incrementing the reference counter of the instance). * - * If \c hashable is true and the string is longer than the threshold - * setting of this StringHasher, it will be sha1 hashed before storing, and - * the original content of the string is discarded. If else, the string is - * copied and stored inside a StringID instance. + * If \c hashable is true and the string is longer than the threshold setting of this + * StringHasher, only the SHA1 hash of the string is stored: the original content of the string + * is discarded. If \c hashable is false, the string is copied and stored inside a StringID + * instance. * - * The purpose of function is to provide a short form of a stable string - * identification. + * The purpose of this function is to provide a short form of a stable string identification. */ StringIDRef getID(const char* text, int len = -1, bool hashable = false); - /// Option for string string data + /// Options for string string data enum class Option { - /// No option + /// No option is set None = 0, + /// The input data is binary Binary = 1 << 0, - /** The input data is hashable. If the data length is longer than the - * threshold setting of the StringHasher, it will be sha1 hashed before - * storing, and the original content of the string is discarded. - */ + + /// Hashing is permitted for this input data. If the data length is longer than the + /// threshold setting of the StringHasher, it will be sha1 hashed before storing, and the + /// original content of the string is discarded. Hashable = 1 << 1, - /// Do not copy the data, assuming the data is constant. If this option - // is not set, the data will be copied before storing. + + /// Do not copy the data: assume it is constant and exists for the lifetime of this hasher. + /// If this option is not set, the data will be copied before storing. NoCopy = 1 << 2, }; using Options = Base::Flags