App/Toponaming: StringHasher tests, commenting, and cleanup

2023-04-13 21:53:41 -05:00
parent 2ef98bfdc0
commit ec98b5e498
3 changed files with 471 additions and 167 deletions
--- a/src/App/StringHasher.cpp
+++ b/src/App/StringHasher.cpp
@@ -180,18 +180,18 @@ StringHasher::~StringHasher()
    clear();
 }

-StringHasher::StringHasher([[maybe_unused]] StringHasher &&other) noexcept
+StringHasher::StringHasher([[maybe_unused]] StringHasher&& other) noexcept
 {
    // Private: unimplemented
 }

-StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &other)
+StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher& other)
 {
    // Private: unimplemented
    return *this;
 }

-StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &&other) noexcept
+StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher&& other) noexcept
 {
    // Private: unimplemented
    return *this;
@@ -317,22 +317,28 @@ StringIDRef StringHasher::getID(const QByteArray& data, Options options)

 StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<StringIDRef>& sids)
 {
-    StringID anID;
-    anID._postfix = name.postfixBytes();
+    StringID tempID;
+    tempID._postfix = name.postfixBytes();

    Data::IndexedName indexed;
-    if (anID._postfix.size() == 0) {
+    if (tempID._postfix.size() == 0) {
+        // Only apply this optimization when there is no postfix: with a postfix present it causes
+        // some problems during recomputes. TODO: This needs to be investigated further.
        indexed = Data::IndexedName(name.dataBytes());
    }
    if (indexed) {
-        anID._data =
+        // If this is an IndexedName, then _data only stores the base part of the name, without the
+        // integer index
+        tempID._data =
            QByteArray::fromRawData(indexed.getType(), static_cast<int>(strlen(indexed.getType())));
    }
    else {
-        anID._data = name.dataBytes();
+        // Store the entire name in _data, but temporarily re-use the existing memory
+        tempID._data = name.dataBytes();
    }

-    auto it = _hashes->left.find(&anID);
+    // Check to see if there is already an entry in the hash table for this StringID
+    auto it = _hashes->left.find(&tempID);
    if (it != _hashes->left.end()) {
        auto res = StringIDRef(it->first);
        if (indexed) {
@@ -342,40 +348,47 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri
    }

    if (!indexed && name.isRaw()) {
-        anID._data = QByteArray(name.dataBytes().constData(), name.dataBytes().size());
+        // Make a copy of the memory if we didn't do so earlier
+        tempID._data = QByteArray(name.dataBytes().constData(), name.dataBytes().size());
    }

+    // If the postfix is not already encoded, use getID to encode it:
    StringIDRef postfixRef;
-    if ((anID._postfix.size() != 0) && anID._postfix.indexOf("#") < 0) {
-        postfixRef = getID(anID._postfix);
-        postfixRef.toBytes(anID._postfix);
+    if ((tempID._postfix.size() != 0) && tempID._postfix.indexOf("#") < 0) {
+        postfixRef = getID(tempID._postfix);
+        postfixRef.toBytes(tempID._postfix);
    }

+    // If _data is an IndexedName, use getID to encode it:
    StringIDRef indexRef;
    if (indexed) {
-        indexRef = getID(anID._data);
+        indexRef = getID(tempID._data);
    }

-    StringIDRef sid(new StringID(lastID() + 1, anID._data));
-    StringID& newStringID = *sid._sid;
-    if (anID._postfix.size() != 0) {
+    // The real StringID object that we are going to insert
+    StringIDRef newStringIDRef(new StringID(lastID() + 1, tempID._data));
+    StringID& newStringID = *newStringIDRef._sid;
+    if (tempID._postfix.size() != 0) {
        newStringID._flags.setFlag(StringID::Flag::Postfixed);
-        newStringID._postfix = anID._postfix;
+        newStringID._postfix = tempID._postfix;
    }

-    int count = 0;
-    for (const auto& hasher : sids) {
-        if (hasher && hasher._sid->_hasher == this) {
-            ++count;
+    // Count the related SIDs that use this hasher
+    int numSIDs = 0;
+    for (const auto& relatedID : sids) {
+        if (relatedID && relatedID._sid->_hasher == this) {
+            ++numSIDs;
        }
    }

-    int extra = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0);
-    if (count == sids.size() && !postfixRef && !indexRef) {
+    int numAddedSIDs = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0);
+    if (numSIDs == sids.size() && !postfixRef && !indexRef) {
+        // The simplest case: just copy the whole list
        newStringID._sids = sids;
    }
    else {
-        newStringID._sids.reserve(count + extra);
+        // Put the added SIDs at the front of the SID list
+        newStringID._sids.reserve(numSIDs + numAddedSIDs);
        if (postfixRef) {
            newStringID._flags.setFlag(StringID::Flag::PostfixEncoded);
            newStringID._sids.push_back(postfixRef);
@@ -384,15 +397,21 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri
            newStringID._flags.setFlag(StringID::Flag::Indexed);
            newStringID._sids.push_back(indexRef);
        }
-        for (const auto& hasher : sids) {
-            if (hasher && hasher._sid->_hasher == this) {
-                newStringID._sids.push_back(hasher);
+        // Append the sids from the input list whose hasher is this one
+        for (const auto& relatedID : sids) {
+            if (relatedID && relatedID._sid->_hasher == this) {
+                newStringID._sids.push_back(relatedID);
            }
        }
    }
-    if (newStringID._sids.size() > 10) {
-        std::sort(newStringID._sids.begin() + extra, newStringID._sids.end());
-        newStringID._sids.erase(std::unique(newStringID._sids.begin() + extra, newStringID._sids.end()),
+
+    // If the number of related IDs is larger than some threshold (hardcoded to 10 right now), then
+    // remove any duplicates (ignoring the new SIDs we may have just added)
+    const int relatedIDSizeThreshold {10};
+    if (newStringID._sids.size() > relatedIDSizeThreshold) {
+        std::sort(newStringID._sids.begin() + numAddedSIDs, newStringID._sids.end());
+        newStringID._sids.erase(
+            std::unique(newStringID._sids.begin() + numAddedSIDs, newStringID._sids.end()),
            newStringID._sids.end());
    }

@@ -401,14 +420,14 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri
    if ((newStringID._postfix.size() != 0) && !indexed) {
        // Use the fromString function to parse the new StringID's data field for a possible index
        StringID::IndexID res = StringID::fromString(newStringID._data);
-        if (res.id > 0) { // If the data had an index
+        if (res.id > 0) {// If the data had an index
            int offset = newStringID.isPostfixEncoded() ? 1 : 0;
            // Search for the SID with that index
            for (int i = offset; i < newStringID._sids.size(); ++i) {
                if (newStringID._sids[i].value() == res.id) {
                    if (i != offset) {
                        // If this SID is not already the first element in sids, move it there by
-                        // swapping it with
+                        // swapping it with whatever WAS there
                        std::swap(newStringID._sids[offset], newStringID._sids[i]);
                    }
                    if (res.index != 0) {
@@ -423,7 +442,7 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri
        }
    }

-    return {insert(sid), indexed.getIndex()};
+    return {insert(newStringIDRef), indexed.getIndex()};
 }

 StringIDRef StringHasher::getID(long id, int index) const
--- a/src/App/StringHasher.h
+++ b/src/App/StringHasher.h
@@ -74,7 +74,7 @@ using StringHasherRef = Base::Reference<StringHasher>;
 */
 class AppExport StringID: public Base::BaseClass, public Base::Handled
 {
-    TYPESYSTEM_HEADER_WITH_OVERRIDE();
+    TYPESYSTEM_HEADER_WITH_OVERRIDE();// NOLINT

 public:
    /// Flag of the stored string data
@@ -316,10 +316,10 @@ private:
    StringID([[maybe_unused]] StringID&& other) noexcept
        : _id(0),
          _flags(StringID::Flag::None) {};
-    StringID& operator=([[maybe_unused]] const StringID& rhs)
+    StringID& operator=([[maybe_unused]] const StringID& rhs)// NOLINT
    {
        return *this;
-    };// NOLINT
+    };
    StringID& operator=([[maybe_unused]] StringID&& rhs) noexcept
    {
        return *this;
@@ -590,12 +590,12 @@ public:

    bool isMarked() const
    {
-        return _sid && _sid->isMarked();
+        return _sid && _sid->isMarked();// NOLINT
    }

    bool isFromSameHasher(const StringHasherRef& hasher) const
    {
-        return _sid && _sid->isFromSameHasher(hasher);
+        return _sid && _sid->isFromSameHasher(hasher);// NOLINT
    }

    StringHasherRef getHasher() const
@@ -635,13 +635,13 @@ private:
 /// count. If a duplicate string is added, no additional copy is made, and a new reference to the
 /// original storage is returned (incrementing the reference counter of the instance).
 ///
-/// If the string is longer than a given threshold, instead of storing the string, instead its
-/// SHA1 hash is stored (and the original string discarded). This allows an upper threshold on the
-/// length of a stored string, while still effectively guaranteeing uniqueness in the table.
+/// If the string is longer than a given threshold, instead of storing the string, its SHA1 hash is
+/// stored (and the original string discarded). This allows an upper threshold on the length of a
+/// stored string, while still effectively guaranteeing uniqueness in the table.
 class AppExport StringHasher: public Base::Persistence, public Base::Handled
 {

-    TYPESYSTEM_HEADER_WITH_OVERRIDE();
+    TYPESYSTEM_HEADER_WITH_OVERRIDE();// NOLINT

 public:
    StringHasher();
--- a/tests/src/App/StringHasher.cpp
+++ b/tests/src/App/StringHasher.cpp