diff --git a/src/App/CMakeLists.txt b/src/App/CMakeLists.txt index 4326b562f0..80e4e06b4a 100644 --- a/src/App/CMakeLists.txt +++ b/src/App/CMakeLists.txt @@ -90,6 +90,8 @@ generate_from_xml(GeoFeatureGroupExtensionPy) generate_from_xml(MetadataPy) generate_from_xml(OriginGroupExtensionPy) generate_from_xml(PartPy) +generate_from_xml(StringHasherPy) +generate_from_xml(StringIDPy) generate_from_xml(ComplexGeoDataPy) generate_from_xml(PropertyContainerPy) @@ -115,6 +117,8 @@ SET(FreeCADApp_XML_SRCS PropertyContainerPy.xml ComplexGeoDataPy.xml MaterialPy.xml + StringHasherPy.xml + StringIDPy.xml ) SOURCE_GROUP("XML" FILES ${FreeCADApp_XML_SRCS}) @@ -270,6 +274,9 @@ SET(FreeCADApp_CPP_SRCS MaterialPyImp.cpp Metadata.cpp MetadataPyImp.cpp + StringHasher.cpp + StringHasherPyImp.cpp + StringIDPyImp.cpp ) SET(FreeCADApp_HPP_SRCS @@ -288,6 +295,7 @@ SET(FreeCADApp_HPP_SRCS MappedElement.h Material.h Metadata.h + StringHasher.h ) SET(FreeCADApp_SRCS diff --git a/src/App/StringHasher.cpp b/src/App/StringHasher.cpp new file mode 100644 index 0000000000..f4d0f94cb4 --- /dev/null +++ b/src/App/StringHasher.cpp @@ -0,0 +1,869 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later + +/*************************************************************************************************** + * * + * Copyright (c) 2022 Zheng, Lei (realthunder) * + * Copyright (c) 2023 FreeCAD Project Association * + * * + * This file is part of FreeCAD. * + * * + * FreeCAD is free software: you can redistribute it and/or modify it under the terms of the * + * GNU Lesser General Public License as published by the Free Software Foundation, either * + * version 2.1 of the License, or (at your option) any later version. * + * * + * FreeCAD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * + * See the GNU Lesser General Public License for more details. 
* + * * + * You should have received a copy of the GNU Lesser General Public License along with * + * FreeCAD. If not, see . * + * * + **************************************************************************************************/ + +#include "PreCompiled.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "MappedElement.h" +#include "StringHasher.h" +#include "StringHasherPy.h" +#include "StringIDPy.h" + + +FC_LOG_LEVEL_INIT("App", true, true) + +namespace bio = boost::iostreams; +using namespace App; + +/////////////////////////////////////////////////////////// + +struct StringIDHasher +{ + std::size_t operator()(const StringID* sid) const + { + if (!sid) { + return 0; + } + return qHash(sid->data(), qHash(sid->postfix())); + } + + bool operator()(const StringID* IDa, const StringID* IDb) const + { + if (IDa == IDb) { + return true; + } + if (!IDa || !IDb) { + return false; + } + return IDa->data() == IDb->data() && IDa->postfix() == IDb->postfix(); + } +}; + +using HashMapBase = + boost::bimap, + boost::bimaps::set_of>; + +class StringHasher::HashMap: public HashMapBase +{ +public: + bool SaveAll = false; + int Threshold = 0; +}; + +/////////////////////////////////////////////////////////// + +TYPESYSTEM_SOURCE_ABSTRACT(App::StringID, Base::BaseClass) + +StringID::~StringID() +{ + if (_hasher) { + _hasher->_hashes->right.erase(_id); + } +} + +PyObject* StringID::getPyObject() +{ + return new StringIDPy(this); +} + +PyObject* StringID::getPyObjectWithIndex(int index) +{ + auto res = new StringIDPy(this); + res->_index = index; + return res; +} + +std::string StringID::toString(int index) const +{ + std::ostringstream ss; + ss << '#' << std::hex << value(); + if (index != 0) { + ss << ':' << index; + } + return ss.str(); +} + +StringID::IndexID StringID::fromString(const char* name, bool eof, int size) +{ + IndexID res {}; + res.id = 0; + res.index = 0; + if 
(!name) { + res.id = -1; + return res; + } + if (size < 0) { + size = static_cast(std::strlen(name)); + } + bio::stream iss(name, size); + char sep = 0; + char sep2 = 0; + iss >> sep >> std::hex >> res.id >> sep2 >> res.index; + if ((eof && !iss.eof()) || sep != '#' || (sep2 != 0 && sep2 != ':')) { + res.id = -1; + return res; + } + return res; +} + +std::string StringID::dataToText(int index) const +{ + if (isHashed() || isBinary()) { + return _data.toBase64().constData(); + } + + std::string res(_data.constData()); + if (index != 0) { + res += std::to_string(index); + } + if (_postfix.size() != 0) { + res += _postfix.constData(); + } + return res; +} + +void StringID::mark() const +{ + if (isMarked()) { + return; + } + _flags.setFlag(Flag::Marked); + for (auto& sid : _sids) { + sid.deref().mark(); + } +} + +/////////////////////////////////////////////////////////// + +TYPESYSTEM_SOURCE(App::StringHasher, Base::Persistence) + +StringHasher::StringHasher() + : _hashes(new HashMap) +{} + +StringHasher::~StringHasher() +{ + clear(); +} + +void StringHasher::setSaveAll(bool enable) +{ + if (_hashes->SaveAll == enable) { + return; + } + _hashes->SaveAll = enable; + compact(); +} + +void StringHasher::compact() +{ + if (_hashes->SaveAll) { + return; + } + + // Make a list of all the table entries that have only a single reference and are not marked + // "persistent" + std::deque pendings; + for (auto& hasher : _hashes->right) { + if (!hasher.second->isPersistent() && hasher.second->getRefCount() == 1) { + pendings.emplace_back(hasher.second); + } + } + + // Recursively remove the unused StringIDs + while (!pendings.empty()) { + StringIDRef sid = pendings.front(); + pendings.pop_front(); + // Try to erase the map entry for this StringID + if (_hashes->right.erase(sid.value()) == 0U) { + continue;// If nothing was erased, there's nothing more to do + } + sid._sid->_hasher = nullptr; + sid._sid->unref(); + for (auto& hasher : sid._sid->_sids) { + if (hasher._sid->_hasher 
== this && !hasher._sid->isPersistent() + && hasher._sid->getRefCount() == 2) { + // If the related StringID also uses this hasher, is not marked persistent, and has + // a current reference count of 2 (which will be its hasher reference and its entry + // in the related SIDs list), then prep it for removal as well. + pendings.push_back(hasher); + } + } + } +} + +bool StringHasher::getSaveAll() const +{ + return _hashes->SaveAll; +} + +void StringHasher::setThreshold(int threshold) +{ + _hashes->Threshold = threshold; +} + +int StringHasher::getThreshold() const +{ + return _hashes->Threshold; +} + +long StringHasher::lastID() const +{ + if (_hashes->right.empty()) { + return 0; + } + auto it = _hashes->right.end(); + --it; + return it->first; +} + +StringIDRef StringHasher::getID(const char* text, int len, bool hashable) +{ + if (len < 0) { + len = static_cast(strlen(text)); + } + return getID(QByteArray::fromRawData(text, len), hashable ? Option::Hashable : Option::None); +} + +StringIDRef StringHasher::getID(const QByteArray& data, Options options) +{ + bool binary = options.testFlag(Option::Binary); + bool hashable = options.testFlag(Option::Hashable); + bool nocopy = options.testFlag(Option::NoCopy); + + bool hashed = hashable && _hashes->Threshold > 0 && (int)data.size() > _hashes->Threshold; + + StringID dataID; + if (hashed) { + QCryptographicHash hasher(QCryptographicHash::Sha1); + hasher.addData(data); + dataID._data = hasher.result(); + } + else { + dataID._data = data; + } + + auto it = _hashes->left.find(&dataID); + if (it != _hashes->left.end()) { + return {it->first}; + } + + if (!hashed && !nocopy) { + // if not hashed, make a deep copy of the data + dataID._data = QByteArray(data.constData(), data.size()); + } + + StringID::Flags flags(StringID::Flag::None); + if (binary) { + flags.setFlag(StringID::Flag::Binary); + } + if (hashed) { + flags.setFlag(StringID::Flag::Hashed); + } + StringIDRef sid(new StringID(lastID() + 1, dataID._data, flags)); + 
return {insert(sid)}; +} + +StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector& sids) +{ + StringID tempID; + tempID._postfix = name.postfixBytes(); + + Data::IndexedName indexed; + if (tempID._postfix.size() != 0) { + // Only check for IndexedName if there is postfix, because of the way + // we restore the StringID. See StringHasher::saveStream/restoreStreamNew() + indexed = Data::IndexedName(name.dataBytes()); + } + if (indexed) { + // If this is an IndexedName, then _data only stores the base part of the name, without the + // integer index + tempID._data = + QByteArray::fromRawData(indexed.getType(), static_cast(strlen(indexed.getType()))); + } + else { + // Store the entire name in _data, but temporarily re-use the existing memory + tempID._data = name.dataBytes(); + } + + // Check to see if there is already an entry in the hash table for this StringID + auto it = _hashes->left.find(&tempID); + if (it != _hashes->left.end()) { + auto res = StringIDRef(it->first); + if (indexed) { + res._index = indexed.getIndex(); + } + return res; + } + + if (!indexed && name.isRaw()) { + // Make a copy of the memory if we didn't do so earlier + tempID._data = QByteArray(name.dataBytes().constData(), name.dataBytes().size()); + } + + // If the postfix is not already encoded, use getID to encode it: + StringIDRef postfixRef; + if ((tempID._postfix.size() != 0) && tempID._postfix.indexOf("#") < 0) { + postfixRef = getID(tempID._postfix); + postfixRef.toBytes(tempID._postfix); + } + + // If _data is an IndexedName, use getID to encode it: + StringIDRef indexRef; + if (indexed) { + indexRef = getID(tempID._data); + } + + // The real StringID object that we are going to insert + StringIDRef newStringIDRef(new StringID(lastID() + 1, tempID._data)); + StringID& newStringID = *newStringIDRef._sid; + if (tempID._postfix.size() != 0) { + newStringID._flags.setFlag(StringID::Flag::Postfixed); + newStringID._postfix = tempID._postfix; + } + + // Count the related 
SIDs that use this hasher + int numSIDs = 0; + for (const auto& relatedID : sids) { + if (relatedID && relatedID._sid->_hasher == this) { + ++numSIDs; + } + } + + int numAddedSIDs = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0); + if (numSIDs == sids.size() && !postfixRef && !indexRef) { + // The simplest case: just copy the whole list + newStringID._sids = sids; + } + else { + // Put the added SIDs at the front of the SID list + newStringID._sids.reserve(numSIDs + numAddedSIDs); + if (postfixRef) { + newStringID._flags.setFlag(StringID::Flag::PostfixEncoded); + newStringID._sids.push_back(postfixRef); + } + if (indexRef) { + newStringID._flags.setFlag(StringID::Flag::Indexed); + newStringID._sids.push_back(indexRef); + } + // Append the sids from the input list whose hasher is this one + for (const auto& relatedID : sids) { + if (relatedID && relatedID._sid->_hasher == this) { + newStringID._sids.push_back(relatedID); + } + } + } + + // If the number of related IDs is larger than some threshold (hardcoded to 10 right now), then + // remove any duplicates (ignoring the new SIDs we may have just added) + const int relatedIDSizeThreshold {10}; + if (newStringID._sids.size() > relatedIDSizeThreshold) { + std::sort(newStringID._sids.begin() + numAddedSIDs, newStringID._sids.end()); + newStringID._sids.erase( + std::unique(newStringID._sids.begin() + numAddedSIDs, newStringID._sids.end()), + newStringID._sids.end()); + } + + // If the new StringID has a postfix, but is not indexed, see if the data string itself + // contains an index. + if ((newStringID._postfix.size() != 0) && !indexed) { + // Use the fromString function to parse the new StringID's data field for a possible index + StringID::IndexID res = StringID::fromString(newStringID._data); + if (res.id > 0) {// If the data had an index + if (res.index != 0) { + indexed.setIndex(res.index); + newStringID._data.resize(newStringID._data.lastIndexOf(':')+1); + } + int offset = newStringID.isPostfixEncoded() ? 
1 : 0; + // Search for the SID with that index + for (int i = offset; i < newStringID._sids.size(); ++i) { + if (newStringID._sids[i].value() == res.id) { + if (i != offset) { + // If this SID is not already the first element in sids, move it there by + // swapping it with whatever WAS there + std::swap(newStringID._sids[offset], newStringID._sids[i]); + } + if (res.index != 0) { + newStringID._flags.setFlag(StringID::Flag::PrefixIDIndex); + } + else { + newStringID._flags.setFlag(StringID::Flag::PrefixID); + } + break; + } + } + } + } + + return {insert(newStringIDRef), indexed.getIndex()}; +} + +StringIDRef StringHasher::getID(long id, int index) const +{ + if (id <= 0) { + return {}; + } + auto it = _hashes->right.find(id); + if (it == _hashes->right.end()) { + return {}; + } + StringIDRef res(it->second); + res._index = index; + return res; +} + +void StringHasher::setPersistenceFileName(const char* filename) const +{ + if (!filename) { + filename = ""; + } + _filename = filename; +} + +const std::string& StringHasher::getPersistenceFileName() const +{ + return _filename; +} + +void StringHasher::Save(Base::Writer& writer) const +{ + + size_t count = 0; + if (_hashes->SaveAll) { + count = _hashes->size(); + } + else { + count = 0; + for (auto& hasher : _hashes->right) { + if (hasher.second->isMarked() || hasher.second->isPersistent()) { + ++count; + } + } + } + + writer.Stream() << writer.ind() << "SaveAll + << "\" threshold=\"" << _hashes->Threshold << "\""; + + if (count == 0U) { + writer.Stream() << " count=\"0\">\n"; + return; + } + + writer.Stream() << " count=\"0\" new=\"1\"/>\n"; + + writer.Stream() << writer.ind() << "\n"; + return; + } + + writer.Stream() << " count=\"" << count << "\">\n"; + saveStream(writer.beginCharStream() << '\n'); + writer.endCharStream() << '\n'; + writer.Stream() << writer.ind() << "\n"; +} + +void StringHasher::SaveDocFile(Base::Writer& writer) const +{ + std::size_t count = _hashes->SaveAll ? 
this->size() : this->count(); + writer.Stream() << "StringTableStart v1 " << count << '\n'; + saveStream(writer.Stream()); +} + +void StringHasher::saveStream(std::ostream& stream) const +{ + boost::io::ios_flags_saver ifs(stream); + stream << std::hex; + + long anchor = 0; + const StringID* last = nullptr; + long lastID = 0; + bool relative = false; + + for (auto& hasher : _hashes->right) { + auto& d = *hasher.second; + long id = d._id; + if (!_hashes->SaveAll && !d.isMarked() && !d.isPersistent()) { + continue; + } + + // We use relative coding to save space. But in order to have some + // minimum protection against corruption, write an absolute value every + // once a while. + relative = (id - anchor) < 1000; + if (relative) { + stream << '-' << id - lastID; + } + else { + anchor = id; + stream << id; + } + lastID = id; + + int offset = d.isPostfixEncoded() ? 1 : 0; + + StringID::IndexID prefixID {}; + prefixID.id = 0; + prefixID.index = 0; + if (d.isPrefixID()) { + assert(d._sids.size() > offset); + prefixID.id = d._sids[offset].value(); + } + else if (d.isPrefixIDIndex()) { + prefixID = StringID::fromString(d._data); + assert(d._sids.size() > offset && d._sids[offset].value() == prefixID.id); + } + + auto flags = d._flags; + flags.setFlag(StringID::Flag::Marked, false); + stream << '.' << flags.toUnderlyingType(); + + int position = 0; + if (!relative) { + for (; position < d._sids.size(); ++position) { + stream << '.' << d._sids[position].value(); + } + } + else { + if (last) { + for (; position < d._sids.size() && position < last->_sids.size(); ++position) { + long m = last->_sids[position].value(); + long n = d._sids[position].value(); + if (n < m) { + stream << ".-" << m - n; + } + else { + stream << '.' << n - m; + } + } + } + for (; position < d._sids.size(); ++position) { + stream << '.' 
<< id - d._sids[position].value(); + } + } + + last = &d; + + // Having postfix means it is a geometry element name, which + // guarantees to be a single line without space. So it is safe to + // store in raw stream. + if (d.isPostfixed()) { + if (!d.isPrefixIDIndex() && !d.isIndexed() && !d.isPrefixID()) { + stream << ' ' << d._data.constData(); + } + + if (!d.isPostfixEncoded()) { + stream << ' ' << d._postfix.constData(); + } + stream << '\n'; + } + else { + // Reaching here means the string may contain space and newlines + stream << ' '; + stream << std::dec << d._data.constData() << std::hex; + } + } +} + +void StringHasher::RestoreDocFile(Base::Reader& reader) +{ + std::string marker; + std::string ver; + reader >> marker; + std::size_t count = 0; + _hashes->clear(); + if (marker == "StringTableStart") { + reader >> ver >> count; + if (ver != "v1") { + FC_WARN("Unknown string table format"); + } + restoreStreamNew(reader, count); + return; + } + count = atoi(marker.c_str()); + restoreStream(reader, count); +} + +void StringHasher::restoreStreamNew(std::istream& stream, std::size_t count) +{ + _hashes->clear(); + std::string content; + boost::io::ios_flags_saver ifs(stream); + stream >> std::hex; + std::vector tokens; + long lastid = 0; + const StringID* last = nullptr; + + std::string tmp; + + for (uint32_t i = 0; i < count; ++i) { + if (!(stream >> tmp)) { + FC_THROWM(Base::RuntimeError, "Invalid string table"); + } + + tokens.clear(); + boost::split(tokens, tmp, boost::is_any_of(".")); + if (tokens.size() < 2) { + FC_THROWM(Base::RuntimeError, "Invalid string table"); + } + + long id = 0; + bool relative = false; + if (tokens[0][0] == '-') { + relative = true; + id = lastid + strtol(tokens[0].c_str() + 1, nullptr, 16); + } + else { + id = strtol(tokens[0].c_str(), nullptr, 16); + } + + lastid = id; + + unsigned long flag = strtol(tokens[1].c_str(), nullptr, 16); + StringIDRef sid(new StringID(id, QByteArray(), static_cast(flag))); + + StringID& d = 
*sid._sid; + d._sids.reserve(tokens.size() - 2); + + int j = 2; + if (relative && last) { + for (; j < (int)tokens.size() && j - 2 < last->_sids.size(); ++j) { + long m = last->_sids[j - 2].value(); + long n; + if (tokens[j][0] == '-') { + n = -strtol(&tokens[j][1], nullptr, 16); + } + else { + n = strtol(&tokens[j][0], nullptr, 16); + } + StringIDRef sid = getID(m + n); + if (!sid) { + FC_THROWM(Base::RuntimeError, "Invalid string id reference"); + } + d._sids.push_back(sid); + } + } + for (; j < (int)tokens.size(); ++j) { + long n = strtol(tokens[j].data(), nullptr, 16); + StringIDRef sid = getID(relative ? id - n : n); + if (!sid) { + FC_THROWM(Base::RuntimeError, "Invalid string id reference"); + } + d._sids.push_back(sid); + } + + if (!d.isPostfixed()) { + stream >> content; + if (d.isHashed() || d.isBinary()) { + d._data = QByteArray::fromBase64(content.c_str()); + } + else { + d._data = content.c_str(); + } + } + else { + int offset = 0; + if (d.isPostfixEncoded()) { + offset = 1; + if (d._sids.empty()) { + FC_THROWM(Base::RuntimeError, "Missing string postfix"); + } + d._postfix = d._sids[0]._sid->_data; + } + if (d.isIndexed()) { + if (d._sids.size() <= offset) { + FC_THROWM(Base::RuntimeError, "Missing string prefix"); + } + d._data = d._sids[offset]._sid->_data; + } + else if (d.isPrefixID() || d.isPrefixIDIndex()) { + if (d._sids.size() <= offset) { + FC_THROWM(Base::RuntimeError, "Missing string prefix id"); + } + d._data = d._sids[offset]._sid->toString(0).c_str(); + if (d.isPrefixIDIndex()) + d._data += ":"; + } + else { + stream >> content; + d._data = content.c_str(); + } + if (!d.isPostfixEncoded()) { + stream >> content; + d._postfix = content.c_str(); + } + } + + last = insert(sid); + } +} + +StringID* StringHasher::insert(const StringIDRef& sid) +{ + assert(sid && sid._sid->_hasher == nullptr); + auto& hasher = *sid._sid; + hasher._hasher = this; + hasher.ref(); + auto res = _hashes->right.insert(_hashes->right.end(), + 
HashMap::right_map::value_type(sid.value(), &hasher)); + if (res->second != &hasher) { + hasher._hasher = nullptr; + hasher.unref(); + } + return res->second; +} + +void StringHasher::restoreStream(std::istream& stream, std::size_t count) +{ + _hashes->clear(); + std::string content; + for (uint32_t i = 0; i < count; ++i) { + int32_t id = 0; + uint8_t type = 0; + stream >> id >> type >> content; + StringIDRef sid = new StringID(id, QByteArray(), static_cast(type)); + if (sid.isHashed() || sid.isBinary()) { + sid._sid->_data = QByteArray::fromBase64(content.c_str()); + } + else { + sid._sid->_data = QByteArray(content.c_str()); + } + insert(sid); + } +} + +void StringHasher::clear() +{ + for (auto& hasher : _hashes->right) { + hasher.second->_hasher = nullptr; + hasher.second->unref(); + } + _hashes->clear(); +} + +size_t StringHasher::size() const +{ + return _hashes->size(); +} + +size_t StringHasher::count() const +{ + size_t count = 0; + for (auto& hasher : _hashes->right) { + if (hasher.second->getRefCount() > 1) { + ++count; + } + } + return count; +} + +void StringHasher::Restore(Base::XMLReader& reader) +{ + clear(); + reader.readElement("StringHasher"); + _hashes->SaveAll = reader.getAttributeAsInteger("saveall") != 0L; + _hashes->Threshold = static_cast(reader.getAttributeAsInteger("threshold")); + + bool newTag = false; + if (reader.hasAttribute("new") && reader.getAttributeAsInteger("new") > 0) { + reader.readElement("StringHasher2"); + newTag = true; + } + + if (reader.hasAttribute("file")) { + const char* file = reader.getAttribute("file"); + if (*file != '\0') { + reader.addFile(file, this); + } + return; + } + + std::size_t count = reader.getAttributeAsUnsigned("count"); + if (newTag) { + restoreStreamNew(reader.beginCharStream(), count); + reader.readEndElement("StringHasher2"); + return; + } + if ((count != 0U) && reader.FileVersion > 1) { + restoreStream(reader.beginCharStream(), count); + } + else { + for (std::size_t i = 0; i < count; ++i) { + 
reader.readElement("Item"); + StringIDRef sid; + long id = reader.getAttributeAsInteger("id"); + bool hashed = reader.hasAttribute("hash"); + if (hashed || reader.hasAttribute("data")) { + const char* value = + hashed ? reader.getAttribute("hash") : reader.getAttribute("data"); + sid = new StringID(id, QByteArray::fromBase64(value), StringID::Flag::Hashed); + } + else { + sid = new StringID(id, QByteArray(reader.getAttribute("text"))); + } + insert(sid); + } + } + reader.readEndElement("StringHasher"); +} + +unsigned int StringHasher::getMemSize() const +{ + return (_hashes->SaveAll ? size() : count()) * 10; +} + +PyObject* StringHasher::getPyObject() +{ + return new StringHasherPy(this); +} + +std::map StringHasher::getIDMap() const +{ + std::map ret; + for (auto& hasher : _hashes->right) { + ret.emplace_hint(ret.end(), hasher.first, StringIDRef(hasher.second)); + } + return ret; +} + +void StringHasher::clearMarks() const +{ + for (auto& hasher : _hashes->right) { + hasher.second->_flags.setFlag(StringID::Flag::Marked, false); + } +} diff --git a/src/App/StringHasher.h b/src/App/StringHasher.h new file mode 100644 index 0000000000..7c8f4f5dcf --- /dev/null +++ b/src/App/StringHasher.h @@ -0,0 +1,830 @@ +// SPDX-License-Identifier: LGPL-2.1-or-later + +/*************************************************************************************************** + * * + * Copyright (c) 2022 Zheng, Lei (realthunder) * + * Copyright (c) 2023 FreeCAD Project Association * + * * + * This file is part of FreeCAD. * + * * + * FreeCAD is free software: you can redistribute it and/or modify it under the terms of the * + * GNU Lesser General Public License as published by the Free Software Foundation, either * + * version 2.1 of the License, or (at your option) any later version. * + * * + * FreeCAD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
* + * See the GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License along with * + * FreeCAD. If not, see . * + * * + **************************************************************************************************/ + +#ifndef APP_STRING_ID_H +#define APP_STRING_ID_H + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + + +namespace Data +{ +class MappedName; +} + +namespace App +{ + +class StringHasher; +class StringID; +class StringIDRef; +using StringHasherRef = Base::Reference; + +/** Class to store a string + * + * The main purpose of this class is to provide an efficient storage of the + * mapped geometry element name (i.e. the new Topological Naming), but it can + * also be used as a general purpose string table. + * + * The StringID is to be stored in a string table (StringHasher), and be + * referred to by an integer ID. The stored data can be optionally divided into + * two parts, prefix and postfix. This is because a new mapped name is often + * created by adding some common postfix to an existing name, so data sharing + * can be improved using the following techniques: + * + * a) reference count (through QByteArray) the main data part, + * + * b) (recursively) encode prefix and/or postfix as an integer (in the + * format of #, e.g. #1b) that references another StringID, + * + * c) Check index based name in prefix, e.g. Edge1, Vertex2, and encode + * only the text part as StringID. The index is stored separately in + * reference class StringIDRef to maximize data sharing. 
+ */ +class AppExport StringID: public Base::BaseClass, public Base::Handled +{ + TYPESYSTEM_HEADER_WITH_OVERRIDE();// NOLINT + +public: + /// Flag of the stored string data + enum class Flag + { + /// No flag + None = 0, + /// The stored data is binary + Binary = 1 << 0, + /// The stored data is the sha1 hash of the original content + Hashed = 1 << 1, + /** Postfix is encoded as #, e.g. #1b, where the hex integer part + * refers to another StringID. + */ + PostfixEncoded = 1 << 2, + /// The data is split as prefix and postfix + Postfixed = 1 << 3, + /// The prefix data is split as text + index + Indexed = 1 << 4, + /** The prefix data is encoded as #, e.g. #1b, where the hex + * integer part refers to another StringID. + */ + PrefixID = 1 << 5, + /** The prefix split as text + index, where the text is encoded + * using another StringID. + */ + PrefixIDIndex = 1 << 6, + /// The string ID is persistent regardless of internal mark + Persistent = 1 << 7, + /// Internal marked used to check if the string ID is used + Marked = 1 << 8, + }; + using Flags = Base::Flags; + + /** Constructor + * @param id: integer ID of this StringID + * @param data: input data + * @param flags: flags describes the data + * + * User code is not supposed to create StringID directly, but through StringHasher::getID() + */ + StringID(long id, QByteArray data, const Flags& flags = Flag::None) + : _id(id), + _data(std::move(data)), + _flags(flags) + {} + + /// Constructs an empty StringID + StringID() + : _id(0), + _flags(Flag::None) + {} + + StringID(const StringID& other) = delete; + StringID(StringID&& other) noexcept = delete; + StringID& operator=(const StringID& rhs) = delete; + StringID& operator=(StringID&& rhs) noexcept = delete; + + ~StringID() override; + + /// Returns the ID of this StringID + long value() const + { + return _id; + } + + /// Returns all related StringIDs that used to encode this StringID + const QVector& relatedIDs() const + { + return _sids; + } + + /// @name Flag 
accessors + //@{ + bool isBinary() const; + bool isHashed() const; + bool isPostfixed() const; + bool isPostfixEncoded() const; + bool isIndexed() const; + bool isPrefixID() const; + bool isPrefixIDIndex() const; + bool isMarked() const; + bool isPersistent() const; + //@} + + /// Checks if this StringID is from the input hasher + bool isFromSameHasher(const StringHasherRef& hasher) const + { + return this->_hasher == hasher; + } + + /// Returns the owner hasher + StringHasherRef getHasher() const + { + return {_hasher}; + } + + /// Returns the data (prefix) + QByteArray data() const + { + return _data; + } + + /// Returns the postfix + QByteArray postfix() const + { + return _postfix; + } + + /// Sets the postfix + void setPostfix(QByteArray postfix) + { + _postfix = std::move(postfix); + } + + PyObject* getPyObject() override; + /// Returns a Python tuple containing both the text and index + PyObject* getPyObjectWithIndex(int index); + + /** Convert to string representation of this StringID + * @param index: optional index + * + * The format is #. And if index is non zero, then #:. Both + * and are in hex format. + */ + std::string toString(int index = 0) const; + + /// Light weight structure of holding a string ID and associated index + struct IndexID + { + long id; + int index; + + explicit operator bool() const + { + return id > 0; + } + + friend std::ostream& operator<<(std::ostream& stream, const IndexID& indexID) + { + stream << indexID.id; + if (indexID.index != 0) { + stream << ':' << indexID.index; + } + return stream; + } + }; + + /** Parse string to get ID and index + * @param name: input string + * @param eof: Whether to check the end of string. If true, then the input + * string must contain only the string representation of this + * StringID + * @param size: input string size, or -1 if the input string is zero terminated. + * @return Return the integer ID and index. 
+ * + * The input string is expected to be in the format of # or with index + * #:, where both id and index are in hex digits. + */ + static IndexID fromString(const char* name, bool eof = true, int size = -1); + + /** Parse string to get ID and index + * @param bytes: input data + * @param eof: Whether to check the end of string. If true, then the input + * string must contain only the string representation of this + * StringID + * + * The input string is expected to be in the format of # or with index + * #:, where both id and index are in hex digits. + */ + static IndexID fromString(const QByteArray& bytes, bool eof = true) + { + return fromString(bytes.constData(), eof, bytes.size()); + } + + /** Get the text content of this StringID + * @param index: optional index + * @return Return the text content of this StringID. If the data is binary, + * then output in base64 encoded string. + */ + std::string dataToText(int index = 0) const; + + /** Get the content of this StringID as QByteArray + * @param index: optional index. 
+ */ + QByteArray dataToBytes(int index = 0) const + { + QByteArray res(_data); + if (index != 0) { + res += QByteArray::number(index); + } + if (_postfix.size() != 0) { + res += _postfix; + } + return res; + } + + /// Mark this StringID as used + void mark() const; + + /// Mark the StringID as persistent regardless of usage mark + void setPersistent(bool enable); + + bool operator<(const StringID& other) const + { + return compare(other) < 0; + } + + /** Compare StringID + * @param other: the other StringID for comparison + * @return Returns -1 if less than the other StringID, 1 if greater, or 0 if equal + */ + int compare(const StringID& other) const + { + if (_hasher < other._hasher) { + return -1; + } + if (_hasher > other._hasher) { + return 1; + } + if (_id < other._id) { + return -1; + } + if (_id > other._id) { + return 1; + } + return 0; + } + + friend class StringHasher; + +private: + long _id; + QByteArray _data; + QByteArray _postfix; + StringHasher* _hasher = nullptr; + mutable Flags _flags; + mutable QVector _sids; +}; + +////////////////////////////////////////////////////////////////////////// + +/** Counted reference to a StringID instance + */ +class StringIDRef +{ +public: + /// Default construction results in an empty StringIDRef object: it will evaluate to boolean + /// "false" if queried. + StringIDRef() + : _sid(nullptr), + _index(0) + {} + + /// Standard construction from a heap-allocated StringID. This reference-counting class manages + /// the lifetime of the StringID, ensuring it is deallocated when its reference count goes to + /// zero. + /// \param stringID A pointer to a StringID allocated with "new" + /// \param index (optional) An index value to store along with the StringID. Defaults to zero. 
+ StringIDRef(StringID* stringID, int index = 0) + : _sid(stringID), + _index(index) + { + if (_sid) { + _sid->ref(); + } + } + + /// Copy construction results in an incremented reference count for the stored StringID + StringIDRef(const StringIDRef& other) + : _sid(other._sid), + _index(other._index) + { + if (_sid) { + _sid->ref(); + } + } + + /// Move construction does NOT increase the reference count of the StringID (instead, it + /// invalidates the pointer in the moved object). + StringIDRef(StringIDRef&& other) noexcept + : _sid(other._sid), + _index(other._index) + { + other._sid = nullptr; + } + + StringIDRef(const StringIDRef& other, int index) + : _sid(other._sid), + _index(index) + { + if (_sid) { + _sid->ref(); + } + } + + ~StringIDRef() + { + if (_sid) { + _sid->unref(); + } + } + + void reset(const StringIDRef& stringID = StringIDRef()) + { + *this = stringID; + } + + void reset(const StringIDRef& stringID, int index) + { + *this = stringID; + this->_index = index; + } + + void swap(StringIDRef& stringID) + { + if (*this != stringID) { + auto tmp = stringID; + stringID = *this; + *this = tmp; + } + } + + StringIDRef& operator=(StringID* stringID) + { + if (_sid == stringID) { + return *this; + } + if (_sid) { + _sid->unref(); + } + _sid = stringID; + if (_sid) { + _sid->ref(); + } + this->_index = 0; + return *this; + } + + StringIDRef& operator=(const StringIDRef& stringID) + { + if (&stringID == this) { + return *this; + } + if (_sid != stringID._sid) { + if (_sid) { + _sid->unref(); + } + _sid = stringID._sid; + if (_sid) { + _sid->ref(); + } + } + this->_index = stringID._index; + return *this; + } + + StringIDRef& operator=(StringIDRef&& stringID) noexcept + { + if (_sid != stringID._sid) { + if (_sid) { + _sid->unref(); + } + _sid = stringID._sid; + stringID._sid = nullptr; + } + this->_index = stringID._index; + return *this; + } + + bool operator<(const StringIDRef& stringID) const + { + if (!stringID._sid) { + return false; + } + if (!_sid) 
{ + return true; + } + int res = _sid->compare(*stringID._sid); + if (res < 0) { + return true; + } + if (res > 0) { + return false; + } + return _index < stringID._index; + } + + bool operator==(const StringIDRef& stringID) const + { + if (_sid && stringID._sid) { + return _sid->compare(*stringID._sid) == 0 && _index == stringID._index; + } + return _sid == stringID._sid; + } + + bool operator!=(const StringIDRef& stringID) const + { + return !(*this == stringID); + } + + explicit operator bool() const + { + return _sid != nullptr; + } + + int getRefCount() const + { + if (_sid) { + return _sid->getRefCount(); + } + return 0; + } + + std::string toString() const + { + if (_sid) { + return _sid->toString(_index); + } + return {}; + } + + std::string dataToText() const + { + if (_sid) { + return _sid->dataToText(_index); + } + return {}; + } + + /// Get a reference to the data: only makes sense if index and postfix are both empty, but + /// calling code is responsible for ensuring that. + const char* constData() const + { + if (_sid) { + assert(_index == 0); + assert(_sid->postfix().isEmpty()); + return _sid->data().constData(); + } + return ""; + } + + const StringID& deref() const + { + return *_sid; + } + + long value() const + { + if (_sid) { + return _sid->value(); + } + return 0; + } + + QVector relatedIDs() const + { + if (_sid) { + return _sid->relatedIDs(); + } + return {}; + } + + bool isBinary() const + { + if (_sid) { + return _sid->isBinary(); + } + return false; + } + + bool isHashed() const + { + if (_sid) { + return _sid->isHashed(); + } + return false; + } + + void toBytes(QByteArray& bytes) const + { + if (_sid) { + bytes = _sid->dataToBytes(_index); + } + } + + PyObject* getPyObject() + { + if (_sid) { + return _sid->getPyObjectWithIndex(_index); + } + Py_INCREF(Py_None); + return Py_None; + } + + void mark() const + { + if (_sid) { + _sid->mark(); + } + } + + bool isMarked() const + { + return _sid && _sid->isMarked();// NOLINT + } + + bool 
isFromSameHasher(const StringHasherRef& hasher) const + { + return _sid && _sid->isFromSameHasher(hasher);// NOLINT + } + + StringHasherRef getHasher() const + { + if (_sid) { + return _sid->getHasher(); + } + return {}; + } + + void setPersistent(bool enable) + { + if (_sid) { + _sid->setPersistent(enable); + } + } + + /// Used predominantly by the unit test code to verify that index is set correctly. In general + /// user code should not need to call this function. + int getIndex() const + { + return _index; + } + + friend class StringHasher; + +private: + StringID* _sid; + int _index; +}; + + +/// \brief A bidirectional map of strings and their integer identifier. +/// +/// Maps an arbitrary text string to a unique integer ID, maintaining a reference-counted shared +/// pointer for each. This permits elimination of unused strings based on their reference +/// count. If a duplicate string is added, no additional copy is made, and a new reference to the +/// original storage is returned (incrementing the reference counter of the instance). +/// +/// If the string is longer than a given threshold, instead of storing the string, its SHA1 hash is +/// stored (and the original string discarded). This allows an upper threshold on the length of a +/// stored string, while still effectively guaranteeing uniqueness in the table. 
class AppExport StringHasher: public Base::Persistence, public Base::Handled
{

    TYPESYSTEM_HEADER_WITH_OVERRIDE();// NOLINT

public:
    StringHasher();
    ~StringHasher() override;

    // A StringHasher can be neither copied nor moved.
    // NOTE(review): the deleted copy assignment takes a non-const reference; a
    // `const StringHasher&` parameter would be the conventional spelling — confirm intent.
    StringHasher(const StringHasher&) = delete;
    StringHasher(StringHasher&&) noexcept = delete;
    StringHasher& operator=(StringHasher& other) = delete;
    StringHasher& operator=(StringHasher&& other) noexcept = delete;

    // Overrides of virtual functions inherited from the base classes.
    unsigned int getMemSize() const override;
    void Save(Base::Writer& /*writer*/) const override;
    void Restore(Base::XMLReader& /*reader*/) override;
    void SaveDocFile(Base::Writer& /*writer*/) const override;
    void RestoreDocFile(Base::Reader& /*reader*/) override;

    // NOTE(review): the setter is declared const — presumably it writes a mutable member; verify
    // against the implementation.
    void setPersistenceFileName(const char* name) const;
    const std::string& getPersistenceFileName() const;

    /** Maps an arbitrary string to an integer
     *
     * @param text: input string.
     * @param len: length of the string: optional if the string is null-terminated.
     * @param hashable: whether hashing the string is permitted.
     * @return A shared pointer to the internally-stored StringID.
     *
     * Maps an arbitrary text string to a unique integer ID, returning a reference-counted shared
     * pointer to the StringID. This permits elimination of unused strings based on their reference
     * count. If a duplicate string is added, no additional copy is made, and a new reference to the
     * original storage is returned (incrementing the reference counter of the instance).
     *
     * If \c hashable is true and the string is longer than the threshold setting of this
     * StringHasher, only the SHA1 hash of the string is stored: the original content of the string
     * is discarded. If \c hashable is false, the string is copied and stored inside a StringID
     * instance.
     *
     * The purpose of this function is to provide a short form of a stable string identification.
     */
    StringIDRef getID(const char* text, int len = -1, bool hashable = false);

    /// Options for string data
    enum class Option
    {
        /// No option is set
        None = 0,

        /// The input data is binary
        Binary = 1 << 0,

        /// Hashing is permitted for this input data. If the data length is longer than the
        /// threshold setting of the StringHasher, it will be sha1 hashed before storing, and the
        /// original content of the string is discarded.
        Hashable = 1 << 1,

        /// Do not copy the data: assume it is constant and exists for the lifetime of this hasher.
        /// If this option is not set, the data will be copied before storing.
        NoCopy = 1 << 2,
    };
    using Options = Base::Flags