// SPDX-License-Identifier: LGPL-2.1-or-later /*************************************************************************************************** * * * Copyright (c) 2022 Zheng, Lei (realthunder) * * Copyright (c) 2023 FreeCAD Project Association * * * * This file is part of FreeCAD. * * * * FreeCAD is free software: you can redistribute it and/or modify it under the terms of the * * GNU Lesser General Public License as published by the Free Software Foundation, either * * version 2.1 of the License, or (at your option) any later version. * * * * FreeCAD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License along with * * FreeCAD. If not, see . * * * **************************************************************************************************/ #include "PreCompiled.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "MappedElement.h" #include "StringHasher.h" #include "StringHasherPy.h" #include "StringIDPy.h" FC_LOG_LEVEL_INIT("App", true, true) namespace bio = boost::iostreams; using namespace App; /////////////////////////////////////////////////////////// struct StringIDHasher { std::size_t operator()(const StringID* sid) const { if (!sid) { return 0; } return qHash(sid->data(), qHash(sid->postfix())); } bool operator()(const StringID* IDa, const StringID* IDb) const { if (IDa == IDb) { return true; } if (!IDa || !IDb) { return false; } return IDa->data() == IDb->data() && IDa->postfix() == IDb->postfix(); } }; using HashMapBase = boost::bimap, boost::bimaps::set_of>; class StringHasher::HashMap: public HashMapBase { public: bool SaveAll = false; int Threshold = 0; }; 
///////////////////////////////////////////////////////////

// NOTE(review): throughout the remainder of this file several angle-bracketed template arguments
// are absent from this text (e.g. on static_cast, bio::stream, std::deque, std::vector, std::map
// and QVector). The tokens below are reproduced as they appear; restore the missing arguments
// from version control before compiling.

TYPESYSTEM_SOURCE_ABSTRACT(App::StringID, Base::BaseClass)

// If this StringID is still attached to a hasher, erase its entry (keyed by _id) from that
// hasher's table.
StringID::~StringID()
{
    if (_hasher) {
        _hasher->_hashes->right.erase(_id);
    }
}

// Create a new StringIDPy wrapper for this StringID.
PyObject* StringID::getPyObject()
{
    return new StringIDPy(this);
}

// Create a new StringIDPy wrapper for this StringID and store `index` in the wrapper.
PyObject* StringID::getPyObjectWithIndex(int index)
{
    auto res = new StringIDPy(this);
    res->_index = index;
    return res;
}

// Format this ID as '#' followed by value() in hexadecimal. A non-zero `index` is appended after
// a ':'. std::hex is still in effect on the stream at that point, so the index is written in
// hexadecimal too.
std::string StringID::toString(int index) const
{
    std::ostringstream ss;
    ss << '#' << std::hex << value();
    if (index != 0) {
        ss << ':' << index;
    }
    return ss.str();
}

// Parse text of the form "#id" or "#id:index", with both numbers read in hexadecimal.
//
// name: text to parse; a null pointer yields a result whose id is -1.
// eof:  when true, parsing fails unless the stream reached end-of-input.
// size: number of characters of `name` to read; a negative value means std::strlen(name).
//
// Returns the parsed pair; id is set to -1 when the leading character is not '#', when the
// separator is neither absent nor ':', or when `eof` was requested and not reached.
StringID::IndexID StringID::fromString(const char* name, bool eof, int size)
{
    IndexID res {};
    res.id = 0;
    res.index = 0;
    if (!name) {
        res.id = -1;
        return res;
    }
    if (size < 0) {
        size = static_cast(std::strlen(name));
    }
    bio::stream iss(name, size);
    char sep = 0;
    char sep2 = 0;
    iss >> sep >> std::hex >> res.id >> sep2 >> res.index;
    if ((eof && !iss.eof()) || sep != '#' || (sep2 != 0 && sep2 != ':')) {
        res.id = -1;
        return res;
    }
    return res;
}

// Render the stored data as text. Hashed or binary data is returned Base64-encoded, without
// index or postfix. Otherwise the result is _data, then `index` in decimal if it is non-zero,
// then _postfix if it is non-empty.
std::string StringID::dataToText(int index) const
{
    if (isHashed() || isBinary()) {
        return _data.toBase64().constData();
    }
    std::string res(_data.constData());
    if (index != 0) {
        res += std::to_string(index);
    }
    if (_postfix.size() != 0) {
        res += _postfix.constData();
    }
    return res;
}

// Set the Marked flag on this StringID and then call mark() on every entry of _sids.
// Returns immediately when this StringID is already marked.
void StringID::mark() const
{
    if (isMarked()) {
        return;
    }
    _flags.setFlag(Flag::Marked);
    for (auto& sid : _sids) {
        sid.deref().mark();
    }
}

///////////////////////////////////////////////////////////

TYPESYSTEM_SOURCE(App::StringHasher, Base::Persistence)

// Construct a hasher with a freshly allocated, empty table.
StringHasher::StringHasher()
    : _hashes(new HashMap)
{}

// Detach and release every entry via clear().
StringHasher::~StringHasher()
{
    clear();
}

// Change the SaveAll setting. When the value actually changes, compact() is invoked afterwards
// (compact() itself returns immediately while SaveAll is true).
void StringHasher::setSaveAll(bool enable)
{
    if (_hashes->SaveAll == enable) {
        return;
    }
    _hashes->SaveAll = enable;
    compact();
}

// Drop table entries that nothing but the table refers to. Does nothing while SaveAll is set.
void StringHasher::compact()
{
    if (_hashes->SaveAll) {
        return;
    }

    // Make a list of all the table entries that have only a single reference and are not marked
    // "persistent"
    std::deque pendings;
    for (auto& hasher : _hashes->right) {
        if (!hasher.second->isPersistent() && hasher.second->getRefCount() == 1) {
            pendings.emplace_back(hasher.second);
        }
    }

    // Recursively remove the unused StringIDs
    while (!pendings.empty()) {
        StringIDRef sid = pendings.front();
        pendings.pop_front();
        // Try to erase the map entry for this StringID
        if (_hashes->right.erase(sid.value()) == 0U) {
            continue;// If nothing was erased, there's nothing more to do
        }
        // Detach the StringID from this hasher and drop the reference taken in insert()
        sid._sid->_hasher = nullptr;
        sid._sid->unref();
        for (auto& hasher : sid._sid->_sids) {
            if (hasher._sid->_hasher == this && !hasher._sid->isPersistent()
                && hasher._sid->getRefCount() == 2) {
                // If the related StringID also uses this hasher, is not marked persistent, and has
                // a current reference count of 2 (which will be its hasher reference and its entry
                // in the related SIDs list), then prep it for removal as well.
                pendings.push_back(hasher);
            }
        }
    }
}

// Return the current SaveAll setting.
bool StringHasher::getSaveAll() const
{
    return _hashes->SaveAll;
}

// Set the data-size threshold consulted by getID() when deciding whether to hash the data.
void StringHasher::setThreshold(int threshold)
{
    _hashes->Threshold = threshold;
}

// Return the current threshold.
int StringHasher::getThreshold() const
{
    return _hashes->Threshold;
}

// Return the key of the last entry of the table's `right` view, or 0 when the table is empty.
// NOTE(review): presumably this is the largest ID in use (the right view looks like an ordered
// set_of) — confirm against the HashMapBase definition.
long StringHasher::lastID() const
{
    if (_hashes->right.empty()) {
        return 0;
    }
    auto it = _hashes->right.end();
    --it;
    return it->first;
}

// Convenience overload: wrap `text` without copying (QByteArray::fromRawData) and forward to the
// QByteArray overload with Option::Hashable or Option::None.
//
// len: number of bytes to use; a negative value means strlen(text).
// NOTE(review): `text` is passed to strlen() unchecked when len < 0, so it must not be null.
StringIDRef StringHasher::getID(const char* text, int len, bool hashable)
{
    if (len < 0) {
        len = static_cast(strlen(text));
    }
    return getID(QByteArray::fromRawData(text, len), hashable ? Option::Hashable : Option::None);
}

// Look up `data` in the table and return the existing entry, or create and insert a new StringID
// with ID lastID() + 1.
//
// Options read here:
//   Binary   - the new StringID gets the Binary flag.
//   Hashable - permits storing a SHA-1 digest instead of the data, but only when Threshold > 0
//              and the data is longer than Threshold; the new StringID then gets the Hashed flag.
//   NoCopy   - skip the deep copy that is otherwise made of un-hashed data.
StringIDRef StringHasher::getID(const QByteArray& data, Options options)
{
    bool binary = options.testFlag(Option::Binary);
    bool hashable = options.testFlag(Option::Hashable);
    bool nocopy = options.testFlag(Option::NoCopy);

    bool hashed = hashable && _hashes->Threshold > 0 && (int)data.size() > _hashes->Threshold;

    // Temporary key used only for the table lookup
    StringID dataID;
    if (hashed) {
        QCryptographicHash hasher(QCryptographicHash::Sha1);
        hasher.addData(data);
        dataID._data = hasher.result();
    }
    else {
        dataID._data = data;
    }

    auto it = _hashes->left.find(&dataID);
    if (it != _hashes->left.end()) {
        return {it->first};
    }

    if (!hashed && !nocopy) {
        // if not hashed, make a deep copy of the data
        dataID._data = QByteArray(data.constData(), data.size());
    }

    StringID::Flags flags(StringID::Flag::None);
    if (binary) {
        flags.setFlag(StringID::Flag::Binary);
    }
    if (hashed) {
        flags.setFlag(StringID::Flag::Hashed);
    }
    StringIDRef sid(new StringID(lastID() + 1, dataID._data, flags));
    return {insert(sid)};
}

// Look up or create the StringID for a mapped element name.
//
// name: the mapped name; its dataBytes() and postfixBytes() form the lookup key.
// sids: related StringIDs; only those whose hasher is this object are recorded on a new entry.
//
// Returns a reference to the existing or newly inserted StringID, carrying the index extracted
// from the name (if any).
StringIDRef StringHasher::getID(const Data::MappedName& name, const QVector& sids)
{
    StringID tempID;
    tempID._postfix = name.postfixBytes();

    Data::IndexedName indexed;
    if (tempID._postfix.size() != 0) {
        // Only check for IndexedName if there is postfix, because of the way
        // we restore the StringID. See StringHasher::saveStream/restoreStreamNew()
        indexed = Data::IndexedName(name.dataBytes());
    }
    if (indexed) {
        // If this is an IndexedName, then _data only stores the base part of the name, without the
        // integer index
        tempID._data = QByteArray::fromRawData(indexed.getType(),
                                               static_cast(strlen(indexed.getType())));
    }
    else {
        // Store the entire name in _data, but temporarily re-use the existing memory
        tempID._data = name.dataBytes();
    }

    // Check to see if there is already an entry in the hash table for this StringID
    auto it = _hashes->left.find(&tempID);
    if (it != _hashes->left.end()) {
        auto res = StringIDRef(it->first);
        if (indexed) {
            res._index = indexed.getIndex();
        }
        return res;
    }

    if (!indexed && name.isRaw()) {
        // Make a copy of the memory if we didn't do so earlier
        tempID._data = QByteArray(name.dataBytes().constData(), name.dataBytes().size());
    }

    // If the postfix is not already encoded, use getID to encode it:
    StringIDRef postfixRef;
    if ((tempID._postfix.size() != 0) && tempID._postfix.indexOf("#") < 0) {
        postfixRef = getID(tempID._postfix);
        postfixRef.toBytes(tempID._postfix);
    }

    // If _data is an IndexedName, use getID to encode it:
    StringIDRef indexRef;
    if (indexed) {
        indexRef = getID(tempID._data);
    }

    // The real StringID object that we are going to insert
    StringIDRef newStringIDRef(new StringID(lastID() + 1, tempID._data));
    StringID& newStringID = *newStringIDRef._sid;
    if (tempID._postfix.size() != 0) {
        newStringID._flags.setFlag(StringID::Flag::Postfixed);
        newStringID._postfix = tempID._postfix;
    }

    // Count the related SIDs that use this hasher
    int numSIDs = 0;
    for (const auto& relatedID : sids) {
        if (relatedID && relatedID._sid->_hasher == this) {
            ++numSIDs;
        }
    }

    int numAddedSIDs = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0);
    if (numSIDs == sids.size() && !postfixRef && !indexRef) {
        // The simplest case: just copy the whole list
        newStringID._sids = sids;
    }
    else {
        // Put the added SIDs at the front of the SID list
        newStringID._sids.reserve(numSIDs + numAddedSIDs);
        if (postfixRef) {
            newStringID._flags.setFlag(StringID::Flag::PostfixEncoded);
            newStringID._sids.push_back(postfixRef);
        }
        if (indexRef) {
            newStringID._flags.setFlag(StringID::Flag::Indexed);
            newStringID._sids.push_back(indexRef);
        }
        // Append the sids from the input list whose hasher is this one
        for (const auto& relatedID : sids) {
            if (relatedID && relatedID._sid->_hasher == this) {
                newStringID._sids.push_back(relatedID);
            }
        }
    }

    // If the number of related IDs is larger than some threshold (hardcoded to 10 right now), then
    // remove any duplicates (ignoring the new SIDs we may have just added)
    const int relatedIDSizeThreshold {10};
    if (newStringID._sids.size() > relatedIDSizeThreshold) {
        std::sort(newStringID._sids.begin() + numAddedSIDs, newStringID._sids.end());
        newStringID._sids.erase(
            std::unique(newStringID._sids.begin() + numAddedSIDs, newStringID._sids.end()),
            newStringID._sids.end());
    }

    // If the new StringID has a postfix, but is not indexed, see if the data string itself
    // contains an index.
    if ((newStringID._postfix.size() != 0) && !indexed) {
        // Use the fromString function to parse the new StringID's data field for a possible index
        StringID::IndexID res = StringID::fromString(newStringID._data);
        if (res.id > 0) {// If the data had an index
            if (res.index != 0) {
                indexed.setIndex(res.index);
                // Keep the data up to and including the ':' and drop the index digits after it
                newStringID._data.resize(newStringID._data.lastIndexOf(':')+1);
            }
            // Slot 0 is taken by the encoded postfix when the PostfixEncoded flag is set
            int offset = newStringID.isPostfixEncoded() ? 1 : 0;
            // Search for the SID with that index
            for (int i = offset; i < newStringID._sids.size(); ++i) {
                if (newStringID._sids[i].value() == res.id) {
                    if (i != offset) {
                        // If this SID is not already the first element in sids, move it there by
                        // swapping it with whatever WAS there
                        std::swap(newStringID._sids[offset], newStringID._sids[i]);
                    }
                    if (res.index != 0) {
                        newStringID._flags.setFlag(StringID::Flag::PrefixIDIndex);
                    }
                    else {
                        newStringID._flags.setFlag(StringID::Flag::PrefixID);
                    }
                    break;
                }
            }
        }
    }

    return {insert(newStringIDRef), indexed.getIndex()};
}

// Look up an entry by numeric ID. Returns a null reference when `id` is not positive or not in
// the table; otherwise a reference to the entry with its _index set to `index`.
StringIDRef StringHasher::getID(long id, int index) const
{
    if (id <= 0) {
        return {};
    }
    auto it = _hashes->right.find(id);
    if (it == _hashes->right.end()) {
        return {};
    }
    StringIDRef res(it->second);
    res._index = index;
    return res;
}

// Store the file name used for persistence; a null pointer is stored as the empty string.
void StringHasher::setPersistenceFileName(const char* filename) const
{
    if (!filename) {
        filename = "";
    }
    _filename = filename;
}

// Return the file name set by setPersistenceFileName().
const std::string& StringHasher::getPersistenceFileName() const
{
    return _filename;
}

// Write the hasher to the document XML stream. The number of entries reported is the full table
// size when SaveAll is set, otherwise the number of entries that are marked or persistent.
//
// NOTE(review): the XML markup inside several string literals of this function is missing from
// this text, and the statement that opened the block closed by the stray "return; }" below is
// missing as well (Restore() reads the elements "StringHasher"/"StringHasher2" and the attributes
// "saveall", "threshold", "new", "file" and "count", which is presumably what was written here).
// Restore this function from version control; the tokens are reproduced unchanged.
void StringHasher::Save(Base::Writer& writer) const
{
    size_t count = 0;
    if (_hashes->SaveAll) {
        count = _hashes->size();
    }
    else {
        count = 0;
        for (auto& hasher : _hashes->right) {
            if (hasher.second->isMarked() || hasher.second->isPersistent()) {
                ++count;
            }
        }
    }

    writer.Stream() << writer.ind() << "SaveAll << "\" threshold=\"" << _hashes->Threshold << "\"";

    if (count == 0U) {
        writer.Stream() << " count=\"0\">\n";
        return;
    }

    writer.Stream() << " count=\"0\" new=\"1\"/>\n";

    writer.Stream() << writer.ind() << "\n";
    return;
    }

    writer.Stream() << " count=\"" << count << "\">\n";
    saveStream(writer.beginCharStream() << '\n');
    writer.endCharStream() << '\n';
    writer.Stream() << writer.ind() << "\n";
}

// Write the string table to a separate document file: a "StringTableStart v1 <count>" header
// line followed by the saveStream() output.
// NOTE(review): when SaveAll is off the header uses count(), which tests getRefCount() > 1,
// whereas saveStream() selects entries by isMarked()/isPersistent(). Confirm the two always
// agree, since restoreStreamNew() reads exactly `count` records.
void StringHasher::SaveDocFile(Base::Writer& writer) const
{
    std::size_t count = _hashes->SaveAll ? this->size() : this->count();
    writer.Stream() << "StringTableStart v1 " << count << '\n';
    saveStream(writer.Stream());
}

// Serialize the table entries to `stream`, one record per entry, with all numbers in hexadecimal
// (the stream's original format flags are restored on exit by the ios_flags_saver). Unless
// SaveAll is set, only marked or persistent entries are written.
//
// Each record starts with dot-separated fields: the ID (absolute, or '-' plus the delta from the
// previously written ID), the flags with Marked cleared, then one field per related SID. The
// text payload follows after a space; see the comments at the end of the loop.
void StringHasher::saveStream(std::ostream& stream) const
{
    Base::TextOutputStream textStreamWrapper(stream);
    boost::io::ios_flags_saver ifs(stream);
    stream << std::hex;

    long anchor = 0;                 // last ID that was written as an absolute value
    const StringID* last = nullptr;  // previously written entry, used for delta-coding SIDs
    long lastID = 0;                 // ID of the previously written entry
    bool relative = false;

    for (auto& hasher : _hashes->right) {
        auto& d = *hasher.second;
        long id = d._id;
        if (!_hashes->SaveAll && !d.isMarked() && !d.isPersistent()) {
            continue;
        }

        // We use relative coding to save space. But in order to have some
        // minimum protection against corruption, write an absolute value every
        // once a while.
        relative = (id - anchor) < 1000;
        if (relative) {
            stream << '-' << id - lastID;
        }
        else {
            anchor = id;
            stream << id;
        }
        lastID = id;

        int offset = d.isPostfixEncoded() ? 1 : 0;

        // prefixID is only consumed by the assertions below (consistency checks on _sids)
        StringID::IndexID prefixID {};
        prefixID.id = 0;
        prefixID.index = 0;
        if (d.isPrefixID()) {
            assert(d._sids.size() > offset);
            prefixID.id = d._sids[offset].value();
        }
        else if (d.isPrefixIDIndex()) {
            prefixID = StringID::fromString(d._data);
            assert(d._sids.size() > offset && d._sids[offset].value() == prefixID.id);
        }

        // The Marked flag is transient and is not written out
        auto flags = d._flags;
        flags.setFlag(StringID::Flag::Marked, false);
        stream << '.' << flags.toUnderlyingType();

        int position = 0;
        if (!relative) {
            // Absolute record: related SIDs are written as plain values
            for (; position < d._sids.size(); ++position) {
                stream << '.' << d._sids[position].value();
            }
        }
        else {
            if (last) {
                // Positions shared with the previous record are written as signed deltas from
                // the previous record's SID at the same position
                for (; position < d._sids.size() && position < last->_sids.size(); ++position) {
                    long m = last->_sids[position].value();
                    long n = d._sids[position].value();
                    if (n < m) {
                        stream << ".-" << m - n;
                    }
                    else {
                        stream << '.' << n - m;
                    }
                }
            }
            // Remaining positions are written as the distance back from this record's own ID
            for (; position < d._sids.size(); ++position) {
                stream << '.' << id - d._sids[position].value();
            }
        }

        last = &d;

        // Having postfix means it is a geometry element name, which
        // guarantees to be a single line without space. So it is safe to
        // store in raw stream.
        if (d.isPostfixed()) {
            // _data is omitted when the flags say it can be rebuilt from the related SIDs
            if (!d.isPrefixIDIndex() && !d.isIndexed() && !d.isPrefixID()) {
                stream << ' ' << d._data.constData();
            }
            // Likewise the postfix is omitted when it is stored as a related SID
            if (!d.isPostfixEncoded()) {
                stream << ' ' << d._postfix.constData();
            }
            stream << '\n';
        }
        else {
            // Reaching here means the string may contain space and newlines
            // We rely on OutputStream (i.e. textStreamWrapper) to save the string.
            stream << ' ';
            textStreamWrapper << d._data.constData();
        }
    }
}

// Load the string table from a separate document file. A leading "StringTableStart" token
// selects the new format (a version other than "v1" only produces a warning); otherwise the
// first token is taken as the entry count of the old format.
// NOTE(review): this calls _hashes->clear() directly, which skips the per-entry detach/unref
// done by StringHasher::clear() — confirm the table is always empty at this point.
void StringHasher::RestoreDocFile(Base::Reader& reader)
{
    std::string marker;
    std::string ver;
    reader >> marker;
    std::size_t count = 0;
    _hashes->clear();
    if (marker == "StringTableStart") {
        reader >> ver >> count;
        if (ver != "v1") {
            FC_WARN("Unknown string table format");
        }
        restoreStreamNew(reader, count);
        return;
    }
    count = atoi(marker.c_str());
    restoreStream(reader, count);
}

// Read `count` records in the format produced by saveStream() and insert them into the table,
// which is emptied first. Throws Base::RuntimeError (via FC_THROWM) on a malformed record, an
// unresolvable related-SID reference, or a record whose flags require a related SID that is
// not present.
// NOTE(review): like RestoreDocFile(), this uses _hashes->clear() rather than clear().
void StringHasher::restoreStreamNew(std::istream& stream, std::size_t count)
{
    Base::TextInputStream asciiStream (stream);
    _hashes->clear();
    std::string content;
    boost::io::ios_flags_saver ifs(stream);
    stream >> std::hex;
    std::vector tokens;
    long lastid = 0;                 // ID of the previously read record
    const StringID* last = nullptr;  // previously inserted entry, base for delta-coded SIDs

    std::string tmp;

    for (uint32_t i = 0; i < count; ++i) {
        if (!(stream >> tmp)) {
            FC_THROWM(Base::RuntimeError, "Invalid string table");
        }

        // Split the dot-separated header: tokens[0] = ID, tokens[1] = flags, rest = related SIDs
        tokens.clear();
        boost::split(tokens, tmp, boost::is_any_of("."));
        if (tokens.size() < 2) {
            FC_THROWM(Base::RuntimeError, "Invalid string table");
        }

        long id = 0;
        bool relative = false;
        if (tokens[0][0] == '-') {
            // A leading '-' marks an ID stored as a delta from the previous record's ID
            relative = true;
            id = lastid + strtol(tokens[0].c_str() + 1, nullptr, 16);
        }
        else {
            id = strtol(tokens[0].c_str(), nullptr, 16);
        }

        lastid = id;

        unsigned long flag = strtol(tokens[1].c_str(), nullptr, 16);
        StringIDRef sid(new StringID(id, QByteArray(), static_cast(flag)));

        StringID& d = *sid._sid;
        d._sids.reserve(tokens.size() - 2);

        int j = 2;
        if (relative && last) {
            // Positions shared with the previous record hold signed deltas from its SIDs
            for (; j < (int)tokens.size() && j - 2 < last->_sids.size(); ++j) {
                long m = last->_sids[j - 2].value();
                long n;
                if (tokens[j][0] == '-') {
                    n = -strtol(&tokens[j][1], nullptr, 16);
                }
                else {
                    n = strtol(&tokens[j][0], nullptr, 16);
                }
                StringIDRef sid = getID(m + n);
                if (!sid) {
                    FC_THROWM(Base::RuntimeError, "Invalid string id reference");
                }
                d._sids.push_back(sid);
            }
        }
        // Remaining positions: distance back from this record's ID (relative) or plain value
        for (; j < (int)tokens.size(); ++j) {
            long n = strtol(tokens[j].data(), nullptr, 16);
            StringIDRef sid = getID(relative ? id - n : n);
            if (!sid) {
                FC_THROWM(Base::RuntimeError, "Invalid string id reference");
            }
            d._sids.push_back(sid);
        }

        if (!d.isPostfixed()) {
            // Free-form text was written through the text stream wrapper; read it back likewise
            asciiStream >> content;
            if (d.isHashed() || d.isBinary()) {
                d._data = QByteArray::fromBase64(content.c_str());
            }
            else {
                d._data = content.c_str();
            }
        }
        else {
            int offset = 0;
            if (d.isPostfixEncoded()) {
                // The postfix is the data of the first related SID
                offset = 1;
                if (d._sids.empty()) {
                    FC_THROWM(Base::RuntimeError, "Missing string postfix");
                }
                d._postfix = d._sids[0]._sid->_data;
            }
            if (d.isIndexed()) {
                // The data is the data of the related SID at `offset`
                if (d._sids.size() <= offset) {
                    FC_THROWM(Base::RuntimeError, "Missing string prefix");
                }
                d._data = d._sids[offset]._sid->_data;
            }
            else if (d.isPrefixID() || d.isPrefixIDIndex()) {
                // The data is the "#id" text of the related SID at `offset`, plus a trailing ':'
                // when an index followed it
                if (d._sids.size() <= offset) {
                    FC_THROWM(Base::RuntimeError, "Missing string prefix id");
                }
                d._data = d._sids[offset]._sid->toString(0).c_str();
                if (d.isPrefixIDIndex())
                    d._data += ":";
            }
            else {
                stream >> content;
                d._data = content.c_str();
            }
            if (!d.isPostfixEncoded()) {
                stream >> content;
                d._postfix = content.c_str();
            }
        }

        last = insert(sid);
    }
}

// Attach `sid` to this hasher and add it to the table under its value(), using the end of the
// `right` view as the insertion hint. The StringID must not already belong to a hasher
// (asserted). If the table already holds a different StringID for that ID, the attachment is
// undone and the existing entry is returned instead.
StringID* StringHasher::insert(const StringIDRef& sid)
{
    assert(sid && sid._sid->_hasher == nullptr);
    auto& hasher = *sid._sid;
    hasher._hasher = this;
    hasher.ref();  // the table holds one reference; released in clear()/compact()
    auto res = _hashes->right.insert(_hashes->right.end(),
                                     HashMap::right_map::value_type(sid.value(), &hasher));
    if (res->second != &hasher) {
        hasher._hasher = nullptr;
        hasher.unref();
    }
    return res->second;
}

// Read `count` records of the old format ("id type content" per record) and insert them into the
// table, which is emptied first. Content of hashed or binary entries is Base64-decoded.
// NOTE(review): `type` is a uint8_t, so `stream >> type` presumably extracts a single character
// rather than a number — confirm this matches what the old writer produced.
// NOTE(review): this uses _hashes->clear() rather than clear(); see RestoreDocFile().
void StringHasher::restoreStream(std::istream& stream, std::size_t count)
{
    _hashes->clear();
    std::string content;
    for (uint32_t i = 0; i < count; ++i) {
        int32_t id = 0;
        uint8_t type = 0;
        stream >> id >> type >> content;
        StringIDRef sid = new StringID(id, QByteArray(), static_cast(type));
        if (sid.isHashed() || sid.isBinary()) {
            sid._sid->_data = QByteArray::fromBase64(content.c_str());
        }
        else {
            sid._sid->_data = QByteArray(content.c_str());
        }
        insert(sid);
    }
}

// Detach every entry from this hasher, release the table's reference to it, and empty the table.
void StringHasher::clear()
{
    for (auto& hasher : _hashes->right) {
        hasher.second->_hasher = nullptr;
        hasher.second->unref();
    }
    _hashes->clear();
}

// Number of entries in the table.
size_t StringHasher::size() const
{
    return _hashes->size();
}

// Number of entries whose reference count is greater than 1, i.e. entries referenced by
// something in addition to the single reference taken in insert().
size_t StringHasher::count() const
{
    size_t count = 0;
    for (auto& hasher : _hashes->right) {
        if (hasher.second->getRefCount() > 1) {
            ++count;
        }
    }
    return count;
}

// Restore the hasher from the document XML. Clears the table, reads the "saveall" and
// "threshold" attributes, then loads the entries from one of:
//   - an external file named by the "file" attribute (registered with reader.addFile; an empty
//     name is ignored),
//   - a "StringHasher2" element in the new stream format (a Base::Exception raised while
//     reading it is reported and logged, not propagated),
//   - the old character stream format (non-zero count and FileVersion > 1), or
//   - a sequence of "Item" elements.
void StringHasher::Restore(Base::XMLReader& reader)
{
    clear();
    reader.readElement("StringHasher");
    _hashes->SaveAll = reader.getAttributeAsInteger("saveall") != 0L;
    _hashes->Threshold = static_cast(reader.getAttributeAsInteger("threshold"));

    bool newTag = false;
    if (reader.hasAttribute("new") && reader.getAttributeAsInteger("new") > 0) {
        reader.readElement("StringHasher2");
        newTag = true;
    }

    if (reader.hasAttribute("file")) {
        const char* file = reader.getAttribute("file");
        if (*file != '\0') {
            reader.addFile(file, this);
        }
        return;
    }

    std::size_t count = reader.getAttributeAsUnsigned("count");
    if (newTag) {
        try {
            restoreStreamNew(reader.beginCharStream(), count);
        }
        catch (const Base::Exception &e) {
            e.ReportException();
            FC_ERR("Failed to restore string table: full-document recompute strongly recommended.");
        }
        reader.readEndElement("StringHasher2");
        return;
    }

    if ((count != 0U) && reader.FileVersion > 1) {
        restoreStream(reader.beginCharStream(), count);
    }
    else {
        for (std::size_t i = 0; i < count; ++i) {
            reader.readElement("Item");
            StringIDRef sid;
            long id = reader.getAttributeAsInteger("id");
            bool hashed = reader.hasAttribute("hash");
            if (hashed || reader.hasAttribute("data")) {
                // "hash" and "data" attributes are both Base64 text; either way the entry is
                // created with the Hashed flag
                const char* value =
                    hashed ? reader.getAttribute("hash") : reader.getAttribute("data");
                sid = new StringID(id, QByteArray::fromBase64(value), StringID::Flag::Hashed);
            }
            else {
                sid = new StringID(id, QByteArray(reader.getAttribute("text")));
            }
            insert(sid);
        }
    }
    reader.readEndElement("StringHasher");
}

// Memory size figure: 10 per entry, counting all entries when SaveAll is set and only count()
// entries otherwise.
unsigned int StringHasher::getMemSize() const
{
    return (_hashes->SaveAll ? size() : count()) * 10;
}

// Create a new StringHasherPy wrapper for this hasher.
PyObject* StringHasher::getPyObject()
{
    return new StringHasherPy(this);
}

// Return a copy of the table as a std::map from ID to StringIDRef. Entries are inserted with an
// end() hint while iterating the `right` view.
std::map StringHasher::getIDMap() const
{
    std::map ret;
    for (auto& hasher : _hashes->right) {
        ret.emplace_hint(ret.end(), hasher.first, StringIDRef(hasher.second));
    }
    return ret;
}

// Clear the Marked flag on every entry in the table.
void StringHasher::clearMarks() const
{
    for (auto& hasher : _hashes->right) {
        hasher.second->_flags.setFlag(StringID::Flag::Marked, false);
    }
}