App/Toponaming: Stubs of tests for StringHasher

This commit is contained in:
Chris Hennes
2023-04-09 19:19:04 -05:00
committed by Chris Hennes
parent e461f1bc27
commit 2ef98bfdc0
3 changed files with 426 additions and 60 deletions

View File

@@ -180,6 +180,23 @@ StringHasher::~StringHasher()
clear();
}
/// Move constructor. Declared in the private section of the class and intentionally left
/// without an implementation: the body is empty, so nothing is taken from \a other.
/// NOTE(review): no member initializers are given here, so _hashes is presumably left as a
/// default-constructed (null) unique_ptr -- confirm this constructor is never actually invoked.
StringHasher::StringHasher([[maybe_unused]] StringHasher &&other) noexcept
{
// Private: unimplemented
}
/// Copy assignment. Declared in the private section of the class and intentionally left without
/// an implementation: \a other is ignored and this object is returned unchanged.
/// NOTE(review): the parameter is a non-const lvalue reference rather than the conventional
/// `const StringHasher&` -- confirm whether that is intended.
StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &other)
{
// Private: unimplemented
return *this;
}
/// Move assignment. Declared in the private section of the class and intentionally left without
/// an implementation: \a other is ignored and this object is returned unchanged.
StringHasher& StringHasher::operator=([[maybe_unused]] StringHasher &&other) noexcept
{
// Private: unimplemented
return *this;
}
void StringHasher::setSaveAll(bool enable)
{
if (_hashes->SaveAll == enable) {
@@ -195,23 +212,31 @@ void StringHasher::compact()
return;
}
// Make a list of all the table entries that have only a single reference and are not marked
// "persistent"
std::deque<StringIDRef> pendings;
for (auto& hasher : _hashes->right) {
if (!hasher.second->isPersistent() && hasher.second->getRefCount() == 1) {
pendings.emplace_back(hasher.second);
}
}
// Recursively remove the unused StringIDs
while (!pendings.empty()) {
StringIDRef sid = pendings.front();
pendings.pop_front();
// Try to erase the map entry for this StringID
if (_hashes->right.erase(sid.value()) == 0U) {
continue;
continue;// If nothing was erased, there's nothing more to do
}
sid._sid->_hasher = nullptr;
sid._sid->unref();
for (auto& hasher : sid._sid->_sids) {
if (hasher._sid->_hasher == this && !hasher._sid->isPersistent()
&& hasher._sid->getRefCount() == 2) {
// If the related StringID also uses this hasher, is not marked persistent, and has
// a current reference count of 2 (which will be its hasher reference and its entry
// in the related SIDs list), then prep it for removal as well.
pendings.push_back(hasher);
}
}
@@ -332,10 +357,10 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri
}
StringIDRef sid(new StringID(lastID() + 1, anID._data));
StringID& id = *sid._sid;
StringID& newStringID = *sid._sid;
if (anID._postfix.size() != 0) {
id._flags.setFlag(StringID::Flag::Postfixed);
id._postfix = anID._postfix;
newStringID._flags.setFlag(StringID::Flag::Postfixed);
newStringID._postfix = anID._postfix;
}
int count = 0;
@@ -347,43 +372,50 @@ StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector<Stri
int extra = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0);
if (count == sids.size() && !postfixRef && !indexRef) {
id._sids = sids;
newStringID._sids = sids;
}
else {
id._sids.reserve(count + extra);
newStringID._sids.reserve(count + extra);
if (postfixRef) {
id._flags.setFlag(StringID::Flag::PostfixEncoded);
id._sids.push_back(postfixRef);
newStringID._flags.setFlag(StringID::Flag::PostfixEncoded);
newStringID._sids.push_back(postfixRef);
}
if (indexRef) {
id._flags.setFlag(StringID::Flag::Indexed);
id._sids.push_back(indexRef);
newStringID._flags.setFlag(StringID::Flag::Indexed);
newStringID._sids.push_back(indexRef);
}
for (const auto& hasher : sids) {
if (hasher && hasher._sid->_hasher == this) {
id._sids.push_back(hasher);
newStringID._sids.push_back(hasher);
}
}
}
if (id._sids.size() > 10) {
std::sort(id._sids.begin() + extra, id._sids.end());
id._sids.erase(std::unique(id._sids.begin() + extra, id._sids.end()), id._sids.end());
if (newStringID._sids.size() > 10) {
std::sort(newStringID._sids.begin() + extra, newStringID._sids.end());
newStringID._sids.erase(std::unique(newStringID._sids.begin() + extra, newStringID._sids.end()),
newStringID._sids.end());
}
if ((id._postfix.size() != 0) && !indexed) {
StringID::IndexID res = StringID::fromString(id._data);
if (res.id > 0) {
int offset = id.isPostfixEncoded() ? 1 : 0;
for (int i = offset; i < id._sids.size(); ++i) {
if (id._sids[i].value() == res.id) {
// If the new StringID has a postfix, but is not indexed, see if the data string itself
// contains an index.
if ((newStringID._postfix.size() != 0) && !indexed) {
// Use the fromString function to parse the new StringID's data field for a possible index
StringID::IndexID res = StringID::fromString(newStringID._data);
if (res.id > 0) { // If the data had an index
int offset = newStringID.isPostfixEncoded() ? 1 : 0;
// Search for the SID with that index
for (int i = offset; i < newStringID._sids.size(); ++i) {
if (newStringID._sids[i].value() == res.id) {
if (i != offset) {
std::swap(id._sids[offset], id._sids[i]);
// If this SID is not already at position `offset` (the first slot after any
// postfix-encoded entry), move it there by swapping it with the element currently there.
std::swap(newStringID._sids[offset], newStringID._sids[i]);
}
if (res.index != 0) {
id._flags.setFlag(StringID::Flag::PrefixIDIndex);
newStringID._flags.setFlag(StringID::Flag::PrefixIDIndex);
}
else {
id._flags.setFlag(StringID::Flag::PrefixID);
newStringID._flags.setFlag(StringID::Flag::PrefixID);
}
break;
}

View File

@@ -308,6 +308,22 @@ private:
StringHasher* _hasher = nullptr;
mutable Flags _flags;
mutable QVector<StringIDRef> _sids;
private:
StringID([[maybe_unused]] const StringID& other)
: _id(0),
_flags(StringID::Flag::None) {};
/// Private move constructor. Nothing is taken from \a other: the new object is given an id
/// of 0 and its flags are set to Flag::None.
StringID([[maybe_unused]] StringID&& other) noexcept
    : _id(0)
    , _flags(Flag::None)
{}
/// Private copy assignment. \a rhs is ignored; this object is returned unmodified.
StringID& operator=([[maybe_unused]] const StringID& rhs)
{
    return *this;
}// NOLINT
/// Private move assignment. \a rhs is ignored; this object is returned unmodified.
StringID& operator=([[maybe_unused]] StringID&& rhs) noexcept
{
    return *this;
}
};
//////////////////////////////////////////////////////////////////////////
@@ -317,7 +333,6 @@ private:
class StringIDRef
{
public:
/// Default construction results in an empty StringIDRef object: it will evaluate to boolean
/// "false" if queried.
StringIDRef()
@@ -600,7 +615,7 @@ public:
/// Used predominantly by the unit test code to verify that index is set correctly. In general
/// user code should not need to call this function.
int getIndex()
int getIndex() const
{
return _index;
}
@@ -612,7 +627,17 @@ private:
int _index;
};
/// A String table to map string from/to a unique integer
/// \brief A bidirectional map of strings and their integer identifier.
///
/// Maps an arbitrary text string to a unique integer ID, maintaining a reference-counted shared
/// pointer for each. This permits elimination of unused strings based on their reference
/// count. If a duplicate string is added, no additional copy is made, and a new reference to the
/// original storage is returned (incrementing the reference counter of the instance).
///
/// If the string is longer than a given threshold, its SHA1 hash is stored instead of the string
/// itself (and the original string is discarded). This allows an upper threshold on the
/// length of a stored string, while still effectively guaranteeing uniqueness in the table.
class AppExport StringHasher: public Base::Persistence, public Base::Handled
{
@@ -633,38 +658,40 @@ public:
/** Maps an arbitrary string to an integer
*
* @param text: input string.
* @param len: length of the string, or -1 if the string is 0 terminated.
* @param hashable: whether the string is hashable.
* @return Return a shared pointer to the internally stored StringID.
* @param len: length of the string: optional if the string is null-terminated.
* @param hashable: whether hashing the string is permitted.
* @return A shared pointer to the internally-stored StringID.
*
* The function maps an arbitrary text string to a unique integer ID, which
* is returned as a shared pointer to reference count the ID so that it is
* possible to prune any unused strings.
* Maps an arbitrary text string to a unique integer ID, returning a reference-counted shared
* pointer to the StringID. This permits elimination of unused strings based on their reference
* count. If a duplicate string is added, no additional copy is made, and a new reference to the
* original storage is returned (incrementing the reference counter of the instance).
*
* If \c hashable is true and the string is longer than the threshold
* setting of this StringHasher, it will be sha1 hashed before storing, and
* the original content of the string is discarded. If else, the string is
* copied and stored inside a StringID instance.
* If \c hashable is true and the string is longer than the threshold setting of this
* StringHasher, only the SHA1 hash of the string is stored: the original content of the string
* is discarded. If \c hashable is false, the string is copied and stored inside a StringID
* instance.
*
* The purpose of function is to provide a short form of a stable string
* identification.
* The purpose of this function is to provide a short form of a stable string identification.
*/
StringIDRef getID(const char* text, int len = -1, bool hashable = false);
/// Option for string string data
/// Options for string data
enum class Option
{
/// No option
/// No option is set
None = 0,
/// The input data is binary
Binary = 1 << 0,
/** The input data is hashable. If the data length is longer than the
* threshold setting of the StringHasher, it will be sha1 hashed before
* storing, and the original content of the string is discarded.
*/
/// Hashing is permitted for this input data. If the data length is longer than the
/// threshold setting of the StringHasher, it will be sha1 hashed before storing, and the
/// original content of the string is discarded.
Hashable = 1 << 1,
/// Do not copy the data, assuming the data is constant. If this option
// is not set, the data will be copied before storing.
/// Do not copy the data: assume it is constant and exists for the lifetime of this hasher.
/// If this option is not set, the data will be copied before storing.
NoCopy = 1 << 2,
};
using Options = Base::Flags<Option>;
@@ -672,15 +699,10 @@ public:
/** Map text or binary data to an integer
*
* @param data: input data.
* @param options: options describing how to store the data. @sa Option.
* @return Return a shared pointer to the internally stored StringID.
* @param options: options describing how to store the data.
* @return A shared pointer to the internally stored StringID.
*
* The function maps an arbitrary text string to a unique integer ID, which
* is returned as a shared pointer to reference count the ID so that it is
* possible to prune any unused strings.
*
* The purpose of function is to provide a short form of a stable string
* identification.
* \sa getID (const char*, int, bool);
*/
StringIDRef getID(const QByteArray& data, Options options = Option::Hashable);
@@ -724,14 +746,15 @@ public:
/** Enable/disable saving all string ID
*
* If disabled, then only save string ID that are used.
* If saveAll is true, then compact() does nothing even when called explicitly. Setting
* saveAll to false causes compact() to be run immediately.
*/
void setSaveAll(bool enable);
bool getSaveAll() const;
/** Set threshold of string hashing
*
* For hashable string that are longer than the threshold, the string will
* For hashable strings that are longer than this threshold, the string will
* be replaced by its sha1 hash.
*/
void setThreshold(int threshold);
@@ -744,7 +767,7 @@ public:
*/
void clearMarks() const;
/// Compact string storage
/// Compact string storage by eliminating unused strings from the table.
void compact();
class HashMap;
@@ -758,8 +781,14 @@ protected:
void restoreStreamNew(std::istream& stream, std::size_t count);
private:
std::unique_ptr<HashMap> _hashes;
std::unique_ptr<HashMap> _hashes;///< Bidirectional map of StringID and its index (a long int).
mutable std::string _filename;
private:
/// Copy/move construction and assignment are declared in the private section, so code outside
/// the class cannot copy, move, or assign a StringHasher.
/// NOTE(review): the copy-assignment operator takes a non-const reference -- confirm intent.
StringHasher(const StringHasher&);
StringHasher(StringHasher&&) noexcept;
StringHasher& operator=(StringHasher& other);
StringHasher& operator=(StringHasher&& other) noexcept;
};
}// namespace App

View File

@@ -6,6 +6,7 @@
#include <App/StringIDPy.h>
#include <QCryptographicHash>
#include <array>
class StringIDTest: public ::testing::Test
{
@@ -1058,6 +1059,310 @@ TEST_F(StringIDRefTest, setPersistent)
/// Fixture for the StringHasher tests: every test case gets its own, newly constructed
/// App::StringHasher instance.
class StringHasherTest: public ::testing::Test
{
protected:
    /// Construct a new hasher before each test runs.
    void SetUp() override
    {
        _hasher = std::make_unique<App::StringHasher>();
    }

    /// Release the hasher once the test has finished.
    void TearDown() override
    {
        _hasher.reset();
    }

    /// Non-owning pointer to the hasher under test.
    App::StringHasher* Hasher()
    {
        return _hasher.get();
    }

private:
    std::unique_ptr<App::StringHasher> _hasher;
};
/// A hasher that has just been constructed by the fixture reports a size of zero.
TEST_F(StringHasherTest, defaultConstructor)
{
    // Arrange / Act: both already happened in the fixture's SetUp()
    const auto entryCount = Hasher()->size();

    // Assert
    EXPECT_EQ(0, entryCount);
}
/// Stub for the "getMemSize" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getMemSize)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getMemSize is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "Save" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, Save)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.Save is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "Restore" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, Restore)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.Restore is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "SaveDocFile" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, SaveDocFile)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.SaveDocFile is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "RestoreDocFile" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, RestoreDocFile)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.RestoreDocFile is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "setPersistenceFileName" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, setPersistenceFileName)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.setPersistenceFileName is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "getPersistenceFileName" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getPersistenceFileName)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getPersistenceFileName is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Hashable input whose length does not exceed the threshold is stored as-is (not hashed), in
/// storage separate from the input buffer.
TEST_F(StringHasherTest, getIDFromQByteArrayShort)
{
    // Arrange: the threshold is one more than the buffer size (which includes the terminating
    // null of the literal), so the input does not exceed it.
    const std::array<char, 5> text {"data"};
    const auto length = text.size();
    Hasher()->setThreshold(length + 1);
    QByteArray bytes(text.data(), length);

    // Act
    auto ref = Hasher()->getID(bytes, App::StringHasher::Option::Hashable);

    // Assert
    EXPECT_STREQ(text.data(), ref.constData());
    EXPECT_FALSE(ref.isHashed());
    EXPECT_NE(bytes.constData(), ref.constData());// Stored data does not alias the input
    EXPECT_EQ(2, ref.getRefCount());
}
/// Hashable input that is longer than the threshold is reported as hashed, and the stored data
/// no longer matches the original text.
TEST_F(StringHasherTest, getIDFromQByteArrayLongHashable)
{
    // Arrange: the threshold is one less than the buffer size, so the input exceeds it.
    const std::array<char, 47> text {"data that is longer than our hasher threshold"};
    const auto length = text.size();
    Hasher()->setThreshold(length - 1);
    QByteArray bytes(text.data(), length);

    // Act
    auto ref = Hasher()->getID(bytes, App::StringHasher::Option::Hashable);

    // Assert
    EXPECT_STRNE(text.data(), ref.constData());
    EXPECT_TRUE(ref.isHashed());
    EXPECT_NE(bytes.constData(), ref.constData());// Stored data does not alias the input
}
/// Input that is longer than the threshold but passed without the Hashable option is stored
/// unhashed, in storage separate from the input buffer.
TEST_F(StringHasherTest, getIDFromQByteArrayLongUnhashable)
{
    // Arrange: the threshold is one less than the buffer size, so the input exceeds it.
    const std::array<char, 47> text {"data that is longer than our hasher threshold"};
    const auto length = text.size();
    Hasher()->setThreshold(length - 1);
    QByteArray bytes(text.data(), length);

    // Act
    auto ref = Hasher()->getID(bytes, App::StringHasher::Option::None);

    // Assert
    EXPECT_STREQ(text.data(), ref.constData());
    EXPECT_FALSE(ref.isHashed());
    EXPECT_NE(bytes.constData(), ref.constData());// Stored data does not alias the input
}
/// With the NoCopy option the returned StringIDRef exposes the very same buffer as the input
/// QByteArray.
TEST_F(StringHasherTest, getIDFromQByteArrayNoCopy)
{
    // Arrange
    const std::array<char, 5> text {"data"};
    const auto length = text.size();
    Hasher()->setThreshold(length + 1);
    QByteArray bytes(text.data(), length);

    // Act
    auto ref = Hasher()->getID(bytes, App::StringHasher::Option::NoCopy);

    // Assert
    EXPECT_STREQ(text.data(), ref.constData());
    EXPECT_EQ(bytes.constData(), ref.constData());// Same pointer: the input buffer is shared
}
/// Two distinct inputs produce two entries in the hasher.
TEST_F(StringHasherTest, getIDFromQByteArrayTwoDifferentStrings)
{
    // Arrange
    const std::array<char, 6> firstText {"dataA"};
    const std::array<char, 6> secondText {"dataB"};
    QByteArray firstBytes(firstText.data(), firstText.size());
    QByteArray secondBytes(secondText.data(), secondText.size());

    // Act: both references stay in scope until the assertion has run
    auto firstRef = Hasher()->getID(firstBytes);
    auto secondRef = Hasher()->getID(secondBytes);

    // Assert
    EXPECT_EQ(2, Hasher()->size());
}
/// Two inputs with identical content, held in separate buffers, produce a single entry.
TEST_F(StringHasherTest, getIDFromQByteArrayTwoIdenticalStrings)
{
    // Arrange
    const std::array<char, 5> firstText {"data"};
    const std::array<char, 5> secondText {"data"};
    QByteArray firstBytes(firstText.data(), firstText.size());
    QByteArray secondBytes(secondText.data(), secondText.size());

    // Act: both references stay in scope until the assertion has run
    auto firstRef = Hasher()->getID(firstBytes);
    auto secondRef = Hasher()->getID(secondBytes);

    // Assert
    EXPECT_EQ(1, Hasher()->size());
}
/// An ID requested with the Binary option reports itself as binary.
TEST_F(StringHasherTest, getIDFromQByteArrayBinaryFlag)
{
    // Arrange
    const std::array<char, 5> text {"data"};
    QByteArray bytes(text.data(), text.size());

    // Act
    auto ref = Hasher()->getID(bytes, App::StringHasher::Option::Binary);

    // Assert
    EXPECT_TRUE(ref.isBinary());
}
/// Stub for the "getIDFromCString" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getIDFromCString)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getIDFromCString is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/*
* Things that have to be tested for getIDFromMappedName:
* 1. With and without postfix (every other path must test both)
* 2. Existing entry: short circuits
* 3. Raw data and non-raw
* 4. Postfix contains # and not
* 5. Indexed name and not
* 6. sids empty and sids with content
* 7. sids whose hasher==this and whose hasher is something else
* 8. If sids.size() > 10, something happens to sids
*
*
*/
/// Stub for the "getIDFromMappedName" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getIDFromMappedName)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getIDFromMappedName is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "getIDFromIntegerID" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getIDFromIntegerID)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getIDFromIntegerID is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "getIDFromIndexID" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getIDFromIndexID)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getIDFromIndexID is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "getIDMap" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getIDMap)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getIDMap is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "clear" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, clear)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.clear is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "size" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, size)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.size is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "count" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, count)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.count is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "getPyObject" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getPyObject)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getPyObject is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "setSaveAll" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, setSaveAll)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.setSaveAll is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "getSaveAll" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getSaveAll)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getSaveAll is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "setThreshold" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, setThreshold)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.setThreshold is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "getThreshold" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, getThreshold)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.getThreshold is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "clearMarks" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, clearMarks)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.clearMarks is not implemented yet";
    // Arrange
    // Act
    // Assert
}
/// Stub for the "compact" test case: no behavior is exercised yet.
TEST_F(StringHasherTest, compact)
{
    // Skip explicitly so the runner does not report a vacuous pass for an empty body; remove
    // this line once the steps below are filled in.
    GTEST_SKIP() << "StringHasherTest.compact is not implemented yet";
    // Arrange
    // Act
    // Assert
}