App/Toponaming: Stubs of tests for StringHasher

2023-04-09 19:19:04 -05:00
parent e461f1bc27
commit 2ef98bfdc0
3 changed files with 426 additions and 60 deletions
--- a/src/App/StringHasher.cpp
+++ b/src/App/StringHasher.cpp
@@ -180,6 +180,23 @@ StringHasher::~StringHasher()
    clear();
 }

+StringHasher::StringHasher([[maybe_unused]] StringHasher &&other) noexcept
+{
+    // Private, unimplemented stub: `other` is ignored and no state is transferred from it.
+}
+
+StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &other)
+{
+    // Private, unimplemented stub: copies nothing. NOTE(review): `other` is a non-const ref.
+    return *this;
+}
+
+StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &&other) noexcept
+{
+    // Private, unimplemented stub: `other` is ignored and *this is returned unchanged.
+    return *this;
+}
+
 void StringHasher::setSaveAll(bool enable)
 {
    if (_hashes->SaveAll == enable) {
@@ -195,23 +212,31 @@ void StringHasher::compact()
        return;
    }

+    // Make a list of all the table entries that have only a single reference and are not marked
+    // "persistent"
    std::deque<StringIDRef> pendings;
    for (auto& hasher : _hashes->right) {
        if (!hasher.second->isPersistent() && hasher.second->getRefCount() == 1) {
            pendings.emplace_back(hasher.second);
        }
    }
+
+    // Recursively remove the unused StringIDs
    while (!pendings.empty()) {
        StringIDRef sid = pendings.front();
        pendings.pop_front();
+        // Try to erase the map entry for this StringID
        if (_hashes->right.erase(sid.value()) == 0U) {
-            continue;
+            continue;// If nothing was erased, there's nothing more to do
        }
        sid._sid->_hasher = nullptr;
        sid._sid->unref();
        for (auto& hasher : sid._sid->_sids) {
            if (hasher._sid->_hasher == this && !hasher._sid->isPersistent()
                && hasher._sid->getRefCount() == 2) {
+                // If the related StringID also uses this hasher, is not marked persistent, and has
+                // a current reference count of 2 (which will be its hasher reference and its entry
+                // in the related SIDs list), then prep it for removal as well.
                pendings.push_back(hasher);
            }
        }
@@ -332,10 +357,10 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri
    }

    StringIDRef sid(new StringID(lastID() + 1, anID._data));
-    StringID& id = *sid._sid;
+    StringID& newStringID = *sid._sid;
    if (anID._postfix.size() != 0) {
-        id._flags.setFlag(StringID::Flag::Postfixed);
-        id._postfix = anID._postfix;
+        newStringID._flags.setFlag(StringID::Flag::Postfixed);
+        newStringID._postfix = anID._postfix;
    }

    int count = 0;
@@ -347,43 +372,50 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri

    int extra = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0);
    if (count == sids.size() && !postfixRef && !indexRef) {
-        id._sids = sids;
+        newStringID._sids = sids;
    }
    else {
-        id._sids.reserve(count + extra);
+        newStringID._sids.reserve(count + extra);
        if (postfixRef) {
-            id._flags.setFlag(StringID::Flag::PostfixEncoded);
-            id._sids.push_back(postfixRef);
+            newStringID._flags.setFlag(StringID::Flag::PostfixEncoded);
+            newStringID._sids.push_back(postfixRef);
        }
        if (indexRef) {
-            id._flags.setFlag(StringID::Flag::Indexed);
-            id._sids.push_back(indexRef);
+            newStringID._flags.setFlag(StringID::Flag::Indexed);
+            newStringID._sids.push_back(indexRef);
        }
        for (const auto& hasher : sids) {
            if (hasher && hasher._sid->_hasher == this) {
-                id._sids.push_back(hasher);
+                newStringID._sids.push_back(hasher);
            }
        }
    }
-    if (id._sids.size() > 10) {
-        std::sort(id._sids.begin() + extra, id._sids.end());
-        id._sids.erase(std::unique(id._sids.begin() + extra, id._sids.end()), id._sids.end());
+    if (newStringID._sids.size() > 10) {
+        std::sort(newStringID._sids.begin() + extra, newStringID._sids.end());
+        newStringID._sids.erase(std::unique(newStringID._sids.begin() + extra, newStringID._sids.end()),
+            newStringID._sids.end());
    }

-    if ((id._postfix.size() != 0) && !indexed) {
-        StringID::IndexID res = StringID::fromString(id._data);
-        if (res.id > 0) {
-            int offset = id.isPostfixEncoded() ? 1 : 0;
-            for (int i = offset; i < id._sids.size(); ++i) {
-                if (id._sids[i].value() == res.id) {
+    // If the new StringID has a postfix, but is not indexed, see if the data string itself
+    // contains an index.
+    if ((newStringID._postfix.size() != 0) && !indexed) {
+        // Use the fromString function to parse the new StringID's data field for a possible index
+        StringID::IndexID res = StringID::fromString(newStringID._data);
+        if (res.id > 0) { // If the data had an index
+            int offset = newStringID.isPostfixEncoded() ? 1 : 0;
+            // Search for the SID with that index
+            for (int i = offset; i < newStringID._sids.size(); ++i) {
+                if (newStringID._sids[i].value() == res.id) {
                    if (i != offset) {
-                        std::swap(id._sids[offset], id._sids[i]);
+                        // If this SID is not already at position `offset` in sids, move it there
+                        // by swapping it with the element currently at that position.
+                        std::swap(newStringID._sids[offset], newStringID._sids[i]);
                    }
                    if (res.index != 0) {
-                        id._flags.setFlag(StringID::Flag::PrefixIDIndex);
+                        newStringID._flags.setFlag(StringID::Flag::PrefixIDIndex);
                    }
                    else {
-                        id._flags.setFlag(StringID::Flag::PrefixID);
+                        newStringID._flags.setFlag(StringID::Flag::PrefixID);
                    }
                    break;
                }
--- a/src/App/StringHasher.h
+++ b/src/App/StringHasher.h
@@ -308,6 +308,22 @@ private:
    StringHasher* _hasher = nullptr;
    mutable Flags _flags;
    mutable QVector<StringIDRef> _sids;
+
+private:
+    StringID([[maybe_unused]] const StringID& other)
+        : _id(0),
+          _flags(StringID::Flag::None) {};
+    StringID([[maybe_unused]] StringID&& other) noexcept
+        : _id(0),
+          _flags(StringID::Flag::None) {};
+    StringID& operator=([[maybe_unused]] const StringID& rhs)
+    {
+        return *this;
+    };// NOLINT
+    StringID& operator=([[maybe_unused]] StringID&& rhs) noexcept
+    {
+        return *this;
+    };
 };

 //////////////////////////////////////////////////////////////////////////
@@ -317,7 +333,6 @@ private:
 class StringIDRef
 {
 public:
-
    /// Default construction results in an empty StringIDRef object: it will evaluate to boolean
    /// "false" if queried.
    StringIDRef()
@@ -600,7 +615,7 @@ public:

    /// Used predominantly by the unit test code to verify that index is set correctly. In general
    /// user code should not need to call this function.
-    int getIndex()
+    int getIndex() const
    {
        return _index;
    }
@@ -612,7 +627,17 @@ private:
    int _index;
 };

-/// A String table to map string from/to a unique integer
+
+/// \brief A bidirectional map of strings and their integer identifiers.
+///
+/// Maps an arbitrary text string to a unique integer ID, maintaining a reference-counted shared
+/// pointer for each. This permits elimination of unused strings based on their reference
+/// count. If a duplicate string is added, no additional copy is made, and a new reference to the
+/// original storage is returned (incrementing the reference counter of the instance).
+///
+/// If the string is longer than a given threshold, its SHA1 hash is stored instead of the string
+/// itself (and the original string discarded). This allows an upper threshold on the
+/// length of a stored string, while still effectively guaranteeing uniqueness in the table.
 class AppExport StringHasher: public Base::Persistence, public Base::Handled
 {

@@ -633,38 +658,40 @@ public:
    /** Maps an arbitrary string to an integer
     *
     * @param text: input string.
-     * @param len: length of the string, or -1 if the string is 0 terminated.
-     * @param hashable: whether the string is hashable.
-     * @return Return a shared pointer to the internally stored StringID.
+     * @param len: length of the string: optional if the string is null-terminated.
+     * @param hashable: whether hashing the string is permitted.
+     * @return A shared pointer to the internally-stored StringID.
     *
-     * The function maps an arbitrary text string to a unique integer ID, which
-     * is returned as a shared pointer to reference count the ID so that it is
-     * possible to prune any unused strings.
+     * Maps an arbitrary text string to a unique integer ID, returning a reference-counted shared
+     * pointer to the StringID. This permits elimination of unused strings based on their reference
+     * count. If a duplicate string is added, no additional copy is made, and a new reference to the
+     * original storage is returned (incrementing the reference counter of the instance).
     *
-     * If \c hashable is true and the string is longer than the threshold
-     * setting of this StringHasher, it will be sha1 hashed before storing, and
-     * the original content of the string is discarded. If else, the string is
-     * copied and stored inside a StringID instance.
+     * If \c hashable is true and the string is longer than the threshold setting of this
+     * StringHasher, only the SHA1 hash of the string is stored: the original content of the string
+     * is discarded. Otherwise (not hashable, or not longer than the threshold), the string is
+     * copied and stored inside a StringID instance.
     *
-     * The purpose of function is to provide a short form of a stable string
-     * identification.
+     * The purpose of this function is to provide a short form of a stable string identification.
     */
    StringIDRef getID(const char* text, int len = -1, bool hashable = false);

-    /// Option for string string data
+    /// Options for storing string data
    enum class Option
    {
-        /// No option
+        /// No option is set
        None = 0,
+
        /// The input data is binary
        Binary = 1 << 0,
-        /** The input data is hashable. If the data length is longer than the
-         * threshold setting of the StringHasher, it will be sha1 hashed before
-         * storing, and the original content of the string is discarded.
-         */
+
+        /// Hashing is permitted for this input data. If the data length is longer than the
+        /// threshold setting of the StringHasher, it will be sha1 hashed before storing, and the
+        /// original content of the string is discarded.
        Hashable = 1 << 1,
-        /// Do not copy the data, assuming the data is constant. If this option
-        // is not set, the data will be copied before storing.
+
+        /// Do not copy the data: assume it is constant and exists for the lifetime of this hasher.
+        /// If this option is not set, the data will be copied before storing.
        NoCopy = 1 << 2,
    };
    using Options = Base::Flags<Option>;
@@ -672,15 +699,10 @@ public:
    /** Map text or binary data to an integer
     *
     * @param data: input data.
-     * @param options: options describing how to store the data. @sa Option.
-     * @return Return a shared pointer to the internally stored StringID.
+     * @param options: options describing how to store the data.
+     * @return A shared pointer to the internally stored StringID.
     *
-     * The function maps an arbitrary text string to a unique integer ID, which
-     * is returned as a shared pointer to reference count the ID so that it is
-     * possible to prune any unused strings.
-     *
-     * The purpose of function is to provide a short form of a stable string
-     * identification.
+     * \sa getID(const char*, int, bool)
     */
    StringIDRef getID(const QByteArray& data, Options options = Option::Hashable);

@@ -724,14 +746,15 @@ public:

    /** Enable/disable saving all string ID
     *
-     * If disabled, then only save string ID that are used.
+     * If saveAll is true, then compact() does nothing even when called explicitly. Setting
+     * saveAll to false causes compact() to be run immediately.
     */
    void setSaveAll(bool enable);
    bool getSaveAll() const;

    /** Set threshold of string hashing
     *
-     * For hashable string that are longer than the threshold, the string will
+     * For hashable strings that are longer than this threshold, the string will
     * be replaced by its sha1 hash.
     */
    void setThreshold(int threshold);
@@ -744,7 +767,7 @@ public:
     */
    void clearMarks() const;

-    /// Compact string storage
+    /// Compact string storage by eliminating unused strings from the table.
    void compact();

    class HashMap;
@@ -758,8 +781,14 @@ protected:
    void restoreStreamNew(std::istream& stream, std::size_t count);

 private:
-    std::unique_ptr<HashMap> _hashes;
+    std::unique_ptr<HashMap> _hashes;///< Bidirectional map of StringID and its index (a long int).
    mutable std::string _filename;
+
+private:
+    StringHasher(const StringHasher&);
+    StringHasher(StringHasher&&) noexcept;
+    StringHasher& operator=(StringHasher& other);
+    StringHasher& operator=(StringHasher&& other) noexcept;
 };
 }// namespace App

--- a/tests/src/App/StringHasher.cpp
+++ b/tests/src/App/StringHasher.cpp
@@ -6,6 +6,7 @@
 #include <App/StringIDPy.h>

 #include <QCryptographicHash>
+#include <array>

 class StringIDTest: public ::testing::Test
 {
@@ -1058,6 +1059,310 @@ TEST_F(StringIDRefTest, setPersistent)
 class StringHasherTest: public ::testing::Test
 {
 protected:
-    // void SetUp() override {}
-    // void TearDown() override {}
+    void SetUp() override
+    {
+        _hasher = std::make_unique<App::StringHasher>();
+    }
+
+    void TearDown() override
+    {
+        _hasher.reset();
+    }
+
+    App::StringHasher* Hasher()
+    {
+        return _hasher.get();
+    };
+
+private:
+    std::unique_ptr<App::StringHasher> _hasher;
 };
+
+TEST_F(StringHasherTest, defaultConstructor)
+{
+    // Arrange
+    // Done in SetUp()
+
+    // Act
+    // Done in SetUp()
+
+    // Assert
+    EXPECT_EQ(0, Hasher()->size());
+}
+
+TEST_F(StringHasherTest, getMemSize)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, Save)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, Restore)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, SaveDocFile)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, RestoreDocFile)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, setPersistenceFileName)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getPersistenceFileName)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getIDFromQByteArrayShort)
+{
+    // Arrange
+    const std::array<char, 5> string {"data"};
+    QByteArray qba(string.data(), string.size());
+    Hasher()->setThreshold(string.size() + 1);
+
+    // Act
+    auto id = Hasher()->getID(qba, App::StringHasher::Option::Hashable);
+
+    // Assert
+    EXPECT_STREQ(string.data(), id.constData());
+    EXPECT_FALSE(id.isHashed());
+    EXPECT_NE(qba.constData(), id.constData());// A copy was made, the pointers differ
+    EXPECT_EQ(2, id.getRefCount());
+}
+
+TEST_F(StringHasherTest, getIDFromQByteArrayLongHashable)
+{
+    // Arrange
+    const std::array<char, 47> string {"data that is longer than our hasher threshold"};
+    QByteArray qba(string.data(), string.size());
+    Hasher()->setThreshold(string.size() - 1);
+
+    // Act
+    auto id = Hasher()->getID(qba, App::StringHasher::Option::Hashable);
+
+    // Assert
+    EXPECT_STRNE(string.data(), id.constData());
+    EXPECT_TRUE(id.isHashed());
+    EXPECT_NE(qba.constData(), id.constData());// A copy was made, the pointers differ
+}
+
+TEST_F(StringHasherTest, getIDFromQByteArrayLongUnhashable)
+{
+    // Arrange
+    const std::array<char, 47> string {"data that is longer than our hasher threshold"};
+    QByteArray qba(string.data(), string.size());
+    Hasher()->setThreshold(string.size() - 1);
+
+    // Act
+    auto id = Hasher()->getID(qba, App::StringHasher::Option::None);
+
+    // Assert
+    EXPECT_STREQ(string.data(), id.constData());
+    EXPECT_FALSE(id.isHashed());
+    EXPECT_NE(qba.constData(), id.constData());// A copy was made, the pointers differ
+}
+
+TEST_F(StringHasherTest, getIDFromQByteArrayNoCopy)
+{
+    // Arrange
+    const std::array<char, 5> string {"data"};
+    QByteArray qba(string.data(), string.size());
+    Hasher()->setThreshold(string.size() + 1);
+
+    // Act
+    auto id = Hasher()->getID(qba, App::StringHasher::Option::NoCopy);
+
+    // Assert
+    EXPECT_STREQ(string.data(), id.constData());
+    EXPECT_EQ(qba.constData(), id.constData());// No copy was made, the pointers are the same
+}
+
+TEST_F(StringHasherTest, getIDFromQByteArrayTwoDifferentStrings)
+{
+    // Arrange
+    const std::array<char, 6> stringA {"dataA"};
+    QByteArray qbaA(stringA.data(), stringA.size());
+    const std::array<char, 6> stringB {"dataB"};
+    QByteArray qbaB(stringB.data(), stringB.size());
+
+    // Act
+    auto idA = Hasher()->getID(qbaA);
+    auto idB = Hasher()->getID(qbaB);
+
+    // Assert
+    EXPECT_EQ(2, Hasher()->size());
+}
+
+TEST_F(StringHasherTest, getIDFromQByteArrayTwoIdenticalStrings)
+{
+    // Arrange
+    const std::array<char, 5> stringA {"data"};
+    QByteArray qbaA(stringA.data(), stringA.size());
+    const std::array<char, 5> stringB {"data"};
+    QByteArray qbaB(stringB.data(), stringB.size());
+
+    // Act
+    auto idA = Hasher()->getID(qbaA);
+    auto idB = Hasher()->getID(qbaB);
+
+    // Assert
+    EXPECT_EQ(1, Hasher()->size());
+}
+
+TEST_F(StringHasherTest, getIDFromQByteArrayBinaryFlag)
+{
+    // Arrange
+    const std::array<char, 5> string {"data"};
+    QByteArray qba(string.data(), string.size());
+
+    // Act
+    auto id = Hasher()->getID(qba, App::StringHasher::Option::Binary);
+
+    // Assert
+    EXPECT_TRUE(id.isBinary());
+}
+
+TEST_F(StringHasherTest, getIDFromCString)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+/*
+ * Things that have to be tested for getIDFromMappedName:
+ *   1. With and without postfix (every other path must test both)
+ *   2. Existing entry: short circuits
+ *   3. Raw data and non-raw
+ *   4. Postfix contains # and not
+ *   5. Indexed name and not
+ *   6. sids empty and sids with content
+ *   7. sids whose hasher==this and whose hasher is something else
+ *   8. If the new StringID holds more than 10 sids, they are sorted and de-duplicated
+ *
+ *
+ */
+
+TEST_F(StringHasherTest, getIDFromMappedName)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getIDFromIntegerID)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getIDFromIndexID)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getIDMap)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, clear)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, size)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, count)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getPyObject)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, setSaveAll)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getSaveAll)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, setThreshold)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, getThreshold)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, clearMarks)
+{
+    // Arrange
+    // Act
+    // Assert
+}
+
+TEST_F(StringHasherTest, compact)
+{
+    // Arrange
+    // Act
+    // Assert
+}