From d9e171e5d08be438cefcd2771abc18aa7da7f20b Mon Sep 17 00:00:00 2001 From: Chris Hennes Date: Thu, 30 Mar 2023 09:43:53 -0500 Subject: [PATCH] App/Toponaming: Add original StringHasher Also includes StringIDPy. This is realthunder's original code, it does not compile in the current codebase yet. --- src/App/CMakeLists.txt | 8 + src/App/StringHasher.cpp | 749 ++++++++++++++++++++++++++++++++++ src/App/StringHasher.h | 680 ++++++++++++++++++++++++++++++ src/App/StringHasherPy.xml | 71 ++++ src/App/StringHasherPyImp.cpp | 148 +++++++ src/App/StringIDPy.xml | 65 +++ src/App/StringIDPyImp.cpp | 89 ++++ 7 files changed, 1810 insertions(+) create mode 100644 src/App/StringHasher.cpp create mode 100644 src/App/StringHasher.h create mode 100644 src/App/StringHasherPy.xml create mode 100644 src/App/StringHasherPyImp.cpp create mode 100644 src/App/StringIDPy.xml create mode 100644 src/App/StringIDPyImp.cpp diff --git a/src/App/CMakeLists.txt b/src/App/CMakeLists.txt index 4326b562f0..80e4e06b4a 100644 --- a/src/App/CMakeLists.txt +++ b/src/App/CMakeLists.txt @@ -90,6 +90,8 @@ generate_from_xml(GeoFeatureGroupExtensionPy) generate_from_xml(MetadataPy) generate_from_xml(OriginGroupExtensionPy) generate_from_xml(PartPy) +generate_from_xml(StringHasherPy) +generate_from_xml(StringIDPy) generate_from_xml(ComplexGeoDataPy) generate_from_xml(PropertyContainerPy) @@ -115,6 +117,8 @@ SET(FreeCADApp_XML_SRCS PropertyContainerPy.xml ComplexGeoDataPy.xml MaterialPy.xml + StringHasherPy.xml + StringIDPy.xml ) SOURCE_GROUP("XML" FILES ${FreeCADApp_XML_SRCS}) @@ -270,6 +274,9 @@ SET(FreeCADApp_CPP_SRCS MaterialPyImp.cpp Metadata.cpp MetadataPyImp.cpp + StringHasher.cpp + StringHasherPyImp.cpp + StringIDPyImp.cpp ) SET(FreeCADApp_HPP_SRCS @@ -288,6 +295,7 @@ SET(FreeCADApp_HPP_SRCS MappedElement.h Material.h Metadata.h + StringHasher.h ) SET(FreeCADApp_SRCS diff --git a/src/App/StringHasher.cpp b/src/App/StringHasher.cpp new file mode 100644 index 0000000000..879bc1b652 --- /dev/null +++ b/src/App/StringHasher.cpp @@ -0,0 +1,749 @@ +/**************************************************************************** +* Copyright (c) 2022 Zheng Lei (realthunder) * +* * +* This file is part of the FreeCAD CAx development system. * +* * +* This library is free software; you can redistribute it and/or * +* modify it under the terms of the GNU Library General Public * +* License as published by the Free Software Foundation; either * +* version 2 of the License, or (at your option) any later version. * +* * +* This library is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU Library General Public License for more details. * +* * +* You should have received a copy of the GNU Library General Public * +* License along with this library; see the file COPYING.LIB. If not, * +* write to the Free Software Foundation, Inc., 59 Temple Place, * +* Suite 330, Boston, MA 02111-1307, USA * +* * +****************************************************************************/ + +#include "PreCompiled.h" + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "StringHasher.h" +#include "StringHasherPy.h" +#include "DocumentParams.h" +#include "MappedElement.h" +#include "StringIDPy.h" + + +FC_LOG_LEVEL_INIT("App",true,true) + +namespace bio = boost::iostreams; +using namespace App; + +/////////////////////////////////////////////////////////// + +struct StringIDHasher { + std::size_t operator()(const StringID *sid) const { + if (!sid) + return 0; + return qHash(sid->data(), qHash(sid->postfix())); + } + + bool operator()(const StringID *IDa, const StringID *IDb) const { + if (IDa == IDb) + return true; + if (!IDa || !IDb) + return false; + return IDa->data() == IDb->data() && IDa->postfix() == IDb->postfix(); + } +}; + +typedef boost::bimap< + boost::bimaps::unordered_set_of, + boost::bimaps::set_of> + HashMapBase; + +class StringHasher::HashMap: public HashMapBase +{ +public: + bool SaveAll = false; + int Threshold = 0; +}; + +/////////////////////////////////////////////////////////// + +TYPESYSTEM_SOURCE_ABSTRACT(App::StringID, Base::BaseClass) + +StringID::~StringID() +{ + if (_hasher) + _hasher->_hashes->right.erase(_id); +} + +PyObject *StringID::getPyObject() { + return new StringIDPy(this); +} + +PyObject *StringID::getPyObjectWithIndex(int index) { + auto res = new StringIDPy(this); + res->_index = index; + return res; +} + +std::string StringID::toString(int index) const { + std::ostringstream ss; + ss << '#' << std::hex << value(); + if (index) + ss << ':' << index; + return ss.str(); +} + +StringID::IndexID StringID::fromString(const char *name, bool eof, int size) { + IndexID res; + res.id = 0; + res.index = 0; + if (!name) { + res.id = -1; + return res; + } + if (size < 0) + size = std::strlen(name); + bio::stream iss(name, size); + char sep = 0; + char sep2 = 0; + iss >> sep >> std::hex >> res.id >> sep2 >> res.index; + if ((eof && !iss.eof()) || sep != '#' || (sep2 != 0 && sep2 != ':')) { + res.id = -1; + return res; + } + return res; +} + +std::string StringID::dataToText(int index) const { + if(isHashed() || isBinary()) + return _data.toBase64().constData(); + + std::string res(_data.constData()); + if (index) + res += std::to_string(index); + if (_postfix.size()) + res += _postfix.constData(); + return res; +} + +void StringID::mark() const +{ + if (isMarked()) + return; + _flags.setFlag(Flag::Marked); + for (auto &sid : _sids) + sid.deref().mark(); +} + +/////////////////////////////////////////////////////////// + +TYPESYSTEM_SOURCE(App::StringHasher, Base::Persistence) + +StringHasher::StringHasher() + :_hashes(new HashMap) +{} + +StringHasher::~StringHasher() { + clear(); +} + +void StringHasher::setSaveAll(bool enable) { + if (_hashes->SaveAll == enable) + return; + _hashes->SaveAll = enable; + compact(); +} + +void StringHasher::compact() +{ + if (_hashes->SaveAll) + return; + + std::deque pendings; + for (auto &v : _hashes->right) { + if (!v.second->isPersistent() && v.second->getRefCount() == 1) + pendings.emplace_back(v.second); + } + while (pendings.size()) { + StringIDRef sid = pendings.front(); + pendings.pop_front(); + if (!_hashes->right.erase(sid.value())) + continue; + sid._sid->_hasher = nullptr; + sid._sid->unref(); + for (auto &s : sid._sid->_sids) { + if (s._sid->_hasher == this + && !s._sid->isPersistent() + && s._sid->getRefCount() == 2) + pendings.push_back(s); + } + } +} + +bool StringHasher::getSaveAll() const { + return _hashes->SaveAll; +} + +void StringHasher::setThreshold(int threshold) { + _hashes->Threshold = threshold; +} + +int StringHasher::getThreshold() const { + return _hashes->Threshold; +} + +long StringHasher::lastID() const { + if(_hashes->right.empty()) + return 0; + auto it = _hashes->right.end(); + --it; + return it->first; +} + +StringIDRef StringHasher::getID(const char *text, int len, bool hashable) { + if (len < 0) + len = strlen(text); + return getID(QByteArray::fromRawData(text, len), hashable ? Option::Hashable : Option::None); +} + +StringIDRef StringHasher::getID(const QByteArray &data, Options options) +{ + bool binary = options.testFlag(Option::Binary); + bool hashable = options.testFlag(Option::Hashable); + bool nocopy = options.testFlag(Option::NoCopy); + + bool hashed = hashable && _hashes->Threshold > 0 + && (int)data.size() > _hashes->Threshold; + + StringID dataID; + if (hashed) { + QCryptographicHash hasher(QCryptographicHash::Sha1); + hasher.addData(data); + dataID._data = hasher.result(); + } + else + dataID._data = data; + + auto it = _hashes->left.find(&dataID); + if (it != _hashes->left.end()) + return StringIDRef(it->first); + + if (!hashed && !nocopy) + // if not hashed, make a deep copy of the data + dataID._data = QByteArray(data.constData(), data.size()); + + StringID::Flags flags(StringID::Flag::None); + if (binary) + flags.setFlag(StringID::Flag::Binary); + if (hashed) + flags.setFlag(StringID::Flag::Hashed); + StringIDRef sid(new StringID(lastID() + 1, dataID._data, flags)); + return StringIDRef(insert(sid)); +} + +StringIDRef StringHasher::getID(const Data::MappedName &name, + const QVector & sids) +{ + StringID anID; + anID._postfix = name.postfixBytes(); + + Data::IndexedName indexed; + if (!anID._postfix.size()) + indexed = Data::IndexedName(name.dataBytes()); + if (indexed) + anID._data = QByteArray::fromRawData(indexed.getType(), strlen(indexed.getType())); + else + anID._data = name.dataBytes(); + + auto it = _hashes->left.find(&anID); + if (it != _hashes->left.end()) { + auto res = StringIDRef(it->first); + if (indexed) + res._index = indexed.getIndex(); + return res; + } + + if (!indexed && name.isRaw()) + anID._data = QByteArray(name.dataBytes().constData(), + name.dataBytes().size()); + + StringIDRef postfixRef; + if (anID._postfix.size() && anID._postfix.indexOf("#") < 0) { + postfixRef = getID(anID._postfix); + postfixRef.toBytes(anID._postfix); + } + + StringIDRef indexRef; + if (indexed) + indexRef = getID(anID._data); + + StringIDRef sid(new StringID(lastID() + 1, anID._data)); + StringID &id = *sid._sid; + if (anID._postfix.size()) { + id._flags.setFlag(StringID::Flag::Postfixed); + id._postfix = anID._postfix; + } + + int count = 0; + for (auto &sid : sids) { + if (sid && sid._sid->_hasher == this) + ++count; + } + + int extra = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0); + if (count == sids.size() && !postfixRef && !indexRef) + id._sids = sids; + else { + id._sids.reserve(count + extra); + if (postfixRef) { + id._flags.setFlag(StringID::Flag::PostfixEncoded); + id._sids.push_back(postfixRef); + } + if (indexRef) { + id._flags.setFlag(StringID::Flag::Indexed); + id._sids.push_back(indexRef); + } + for (auto &s : sids) { + if (s && s._sid->_hasher == this) + id._sids.push_back(s); + } + } + if (id._sids.size() > 10) { + std::sort(id._sids.begin() + extra, id._sids.end()); + id._sids.erase(std::unique(id._sids.begin() + extra, id._sids.end()), id._sids.end()); + } + + if (id._postfix.size() && !indexed) { + StringID::IndexID res = StringID::fromString(id._data); + if (res.id > 0) { + int offset = id.isPostfixEncoded() ? 1 : 0; + for (int i = offset; i < id._sids.size(); ++i) { + if (id._sids[i].value() == res.id) { + if (i!=offset) + std::swap(id._sids[offset], id._sids[i]); + if (res.index != 0) + id._flags.setFlag(StringID::Flag::PrefixIDIndex); + else + id._flags.setFlag(StringID::Flag::PrefixID); + break; + } + } + } + } + + return StringIDRef(insert(sid), indexed.getIndex()); +} + +StringIDRef StringHasher::getID(long id, int index) const { + if (id<=0) + return StringIDRef(); + auto it = _hashes->right.find(id); + if (it == _hashes->right.end()) + return StringIDRef(); + StringIDRef res(it->second); + res._index = index; + return res; +} + +void StringHasher::setPersistenceFileName(const char *filename) const { + if (!filename) + filename = ""; + _filename = filename; +} + +const std::string &StringHasher::getPersistenceFileName() const { + return _filename; +} + +void StringHasher::Save(Base::Writer &writer) const { + + size_t count; + if (_hashes->SaveAll) + count = _hashes->size(); + else { + count = 0; + for (auto &v : _hashes->right) { + if (v.second->isMarked() || v.second->isPersistent()) + ++count; + } + } + + writer.Stream() << writer.ind() + << "SaveAll + << "\" threshold=\"" << _hashes->Threshold << "\""; + + if (!count) { + writer.Stream() << " count=\"0\">\n"; + return; + } + + writer.Stream() << " count=\"0\" new=\"1\"/>\n"; + + writer.Stream() << writer.ind() << "\n"; + return; + } + + writer.Stream() << " count=\"" << count << "\">\n"; + saveStream(writer.beginCharStream(false) << '\n'); + writer.endCharStream() << '\n'; + writer.Stream() << writer.ind() << "\n"; +} + +void StringHasher::SaveDocFile(Base::Writer &writer) const { + std::size_t count = _hashes->SaveAll?this->size():this->count(); + writer.Stream() << "StringTableStart v1 " << count << '\n'; + saveStream(writer.Stream()); +} + +void StringHasher::saveStream(std::ostream &s) const { + Base::OutputStream str(s,false); + boost::io::ios_flags_saver ifs(s); + s << std::hex; + + bool allowRealtive = DocumentParams::getRelativeStringID(); + long anchor = 0; + const StringID *last = nullptr; + long lastid = 0; + bool relative = false; + + for (auto &v : _hashes->right) { + auto &d = *v.second; + long id = d._id; + if (!_hashes->SaveAll && !d.isMarked() && !d.isPersistent()) + continue; + + if (!allowRealtive) + s << id; + else { + // We use relative coding to save space. But in order to have some + // minimum protection against corruption, write an absolute value every + // once a while. + relative = (id - anchor) < 1000; + if (relative) + s << '-' << id - lastid; + else { + anchor = id; + s << id; + } + lastid = id; + } + + int offset = d.isPostfixEncoded() ? 1 : 0; + + StringID::IndexID prefixid; + prefixid.id = 0; + prefixid.index = 0; + if (d.isPrefixID()) { + assert(d._sids.size() > offset); + prefixid.id = d._sids[offset].value(); + } + else if (d.isPrefixIDIndex()) { + prefixid = StringID::fromString(d._data); + assert(d._sids.size() > offset && d._sids[offset].value() == prefixid.id); + } + + auto flags = d._flags; + flags.setFlag(StringID::Flag::Marked, false); + s << '.' << flags.toUnderlyingType(); + + int i = 0; + if (!relative) { + for (; i < d._sids.size(); ++i) + s << '.' << d._sids[i].value(); + } + else { + if (last) { + for (; i < d._sids.size() && i < last->_sids.size(); ++i) { + long m = last->_sids[i].value(); + long n = d._sids[i].value(); + if (n < m) + s << ".-" << m-n; + else + s << '.' << n - m; + } + } + for (; i < d._sids.size(); ++i) + s << '.' << id - d._sids[i].value(); + } + + last = &d; + + // Having postfix means it is a geometry element name, which + // guarantees to be a single line without space. So it is safe to + // store in raw stream. + if (d.isPostfixed()) { + if (d.isPrefixIDIndex()) + s << ' ' << prefixid.index; + else if (!d.isIndexed() && !d.isPrefixID()) + s << ' ' << d._data.constData(); + + if (!d.isPostfixEncoded()) + s << ' ' << d._postfix.constData(); + s << '\n'; + } + else { + // Reaching here means the string may contain space and newlines + // We rely on OutputStream (i.e. str) to save the string. + s << ' '; + str << d._data.constData(); + } + } +} + +void StringHasher::RestoreDocFile(Base::Reader &reader) { + std::string marker, ver; + reader >> marker; + std::size_t count; + _hashes->clear(); + if (marker == "StringTableStart") { + reader >> ver >> count; + if (ver != "v1") + FC_WARN("Unknown string table format"); + restoreStreamNew(reader, count); + return; + } + count = atoi(marker.c_str()); + restoreStream(reader, count); +} + +void StringHasher::restoreStreamNew(std::istream &s, std::size_t count) { + Base::InputStream str(s, false); + _hashes->clear(); + std::string content; + boost::io::ios_flags_saver ifs(s); + s >> std::hex; + std::vector tokens; + long lastid = 0; + const StringID *last = nullptr; + + std::string tmp; + + for (uint32_t i = 0; i < count; ++i) { + if (!(s >> tmp)) + FC_THROWM(Base::RuntimeError, "Invalid string table"); + + tokens.clear(); + boost::split(tokens, tmp, boost::is_any_of(".")); + if (tokens.size() < 2) + FC_THROWM(Base::RuntimeError, "Invalid string table"); + + long id; + bool relative = false; + if (tokens[0][0] == '-') { + relative = true; + id = lastid + strtol(tokens[0].c_str() + 1, nullptr, 16); + } + else + id = strtol(tokens[0].c_str(), nullptr, 16); + + lastid = id; + + unsigned long flag = strtol(tokens[1].c_str(), nullptr, 16); + StringIDRef sid(new StringID(id, QByteArray(), static_cast(flag))); + + StringID &d = *sid._sid; + d._sids.reserve(tokens.size() - 2); + + int j = 2; + if (relative && last) { + for (; j < (int)tokens.size() && j - 2 < last->_sids.size(); ++j) { + long m = last->_sids[j - 2].value(); + long n; + if (tokens[j][0] == '-') + n = -strtol(&tokens[j][1], nullptr, 16); + else + n = strtol(&tokens[j][0], nullptr, 16); + StringIDRef sid = getID(m + n); + if (!sid) + FC_THROWM(Base::RuntimeError, "Invalid string id reference"); + d._sids.push_back(sid); + } + } + for (; j < (int)tokens.size(); ++j) { + long n = strtol(&tokens[j][0], nullptr, 16); + StringIDRef sid = getID(relative ? id - n : n); + if (!sid) + FC_THROWM(Base::RuntimeError, "Invalid string id reference"); + d._sids.push_back(sid); + } + + if (!d.isPostfixed()) { + str >> content; + if (d.isHashed() || d.isBinary()) + d._data = QByteArray::fromBase64(content.c_str()); + else + d._data = content.c_str(); + } + else { + int offset = 0; + if (d.isPostfixEncoded()) { + offset = 1; + if (d._sids.empty()) + FC_THROWM(Base::RuntimeError, "Missing string postfix"); + d._postfix = d._sids[0]._sid->_data; + } + if (d.isIndexed()) { + if (d._sids.size() <= offset) + FC_THROWM(Base::RuntimeError, "Missing string prefix"); + d._data = d._sids[offset]._sid->_data; + } + else if (d.isPrefixID() || d.isPrefixIDIndex()) { + if (d._sids.size() <= offset) + FC_THROWM(Base::RuntimeError, "Missing string prefix id"); + int index = 0; + if (d.isPrefixIDIndex()) { + if (!(s >> index)) + FC_THROWM(Base::RuntimeError, "Missing string prefix index"); + } + d._data = d._sids[offset]._sid->toString(index).c_str(); + } + else { + s >> content; + d._data = content.c_str(); + } + if (!d.isPostfixEncoded()) { + s >> content; + d._postfix = content.c_str(); + } + } + + last = insert(sid); + } +} + +StringID *StringHasher::insert(const StringIDRef &sid) +{ + assert(sid && sid._sid->_hasher == nullptr); + auto &d = *sid._sid; + d._hasher = this; + d.ref(); + auto res = _hashes->right.insert(_hashes->right.end(), + HashMap::right_map::value_type(sid.value(), &d)); + if (res->second != &d) { + d._hasher = nullptr; + d.unref(); + } + return res->second; +} + +void StringHasher::restoreStream(std::istream &s, std::size_t count) { + Base::InputStream str(s, false); + _hashes->clear(); + std::string content; + for (uint32_t i = 0; i < count; ++i) { + int32_t id; + uint8_t type; + str >> id >> type >> content; + StringIDRef sid = new StringID(id, QByteArray(), static_cast(type)); + if (sid.isHashed() || sid.isBinary()) + sid._sid->_data = QByteArray::fromBase64(content.c_str()); + else + sid._sid->_data = QByteArray(content.c_str()); + insert(sid); + } +} + +void StringHasher::clear() { + for (auto & v : _hashes->right) { + v.second->_hasher = nullptr; + v.second->unref(); + } + _hashes->clear(); +} + +size_t StringHasher::size() const { + return _hashes->size(); +} + +size_t StringHasher::count() const { + size_t count = 0; + for (auto &v : _hashes->right) + if (v.second->getRefCount() > 1) + ++count; + return count; +} + +void StringHasher::Restore(Base::XMLReader &reader) { + clear(); + reader.readElement("StringHasher"); + _hashes->SaveAll = reader.getAttributeAsInteger("saveall") ? true : false; + _hashes->Threshold = reader.getAttributeAsInteger("threshold"); + + bool newtag = false; + if (reader.getAttributeAsInteger("new", "0") > 0) { + reader.readElement("StringHasher2"); + newtag = true; + } + + if (reader.hasAttribute("file")) { + const char *file = reader.getAttribute("file"); + if(*file) + reader.addFile(file, this); + return; + } + + std::size_t count = reader.getAttributeAsUnsigned("count"); + if (newtag) { + restoreStreamNew(reader.beginCharStream(false), count); + reader.readEndElement("StringHasher2"); + return; + } + else if (count && reader.FileVersion > 1) + restoreStream(reader.beginCharStream(false), count); + else { + for (std::size_t i = 0; i < count; ++i) { + reader.readElement("Item"); + StringIDRef sid; + long id = reader.getAttributeAsInteger("id"); + bool hashed = reader.hasAttribute("hash"); + if (hashed || reader.hasAttribute("data")) { + const char *value = hashed ? reader.getAttribute("hash") : reader.getAttribute("data"); + sid = new StringID(id, QByteArray::fromBase64(value), StringID::Flag::Hashed); + } + else + sid = new StringID(id, QByteArray(reader.getAttribute("text"))); + insert(sid); + } + } + reader.readEndElement("StringHasher"); +} + +unsigned int StringHasher::getMemSize (void) const { + return (_hashes->SaveAll?size():count()) * 10; +} + +PyObject *StringHasher::getPyObject() { + return new StringHasherPy(this); +} + +std::map StringHasher::getIDMap() const { + std::map ret; + for (auto &v : _hashes->right) + ret.emplace_hint(ret.end(), v.first, StringIDRef(v.second)); + return ret; +} + +void StringHasher::clearMarks() const +{ + for (auto &v : _hashes->right) + v.second->_flags.setFlag(StringID::Flag::Marked, false); +} diff --git a/src/App/StringHasher.h b/src/App/StringHasher.h new file mode 100644 index 0000000000..9c676c3076 --- /dev/null +++ b/src/App/StringHasher.h @@ -0,0 +1,680 @@ +/**************************************************************************** +* Copyright (c) 2022 Zheng Lei (realthunder) * +* * +* This file is part of the FreeCAD CAx development system. * +* * +* This library is free software; you can redistribute it and/or * +* modify it under the terms of the GNU Library General Public * +* License as published by the Free Software Foundation; either * +* version 2 of the License, or (at your option) any later version. * +* * +* This library is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU Library General Public License for more details. * +* * +* You should have received a copy of the GNU Library General Public * +* License along with this library; see the file COPYING.LIB. If not, * +* write to the Free Software Foundation, Inc., 59 Temple Place, * +* Suite 330, Boston, MA 02111-1307, USA * +* * +****************************************************************************/ + +#ifndef APP_STRINGID_H +#define APP_STRINGID_H + +#include +#include + +#include +#include + +#include +#include +#include +#include + + +namespace Data{ +class MappedName; +} + +namespace App { + +class StringHasher; +class StringID; +class StringIDRef; +typedef Base::Reference StringHasherRef; + +/** Class to store a string +* +* The main purpose of this class is to provide an efficient storage of the +* mapped geometry element name (i.e. the new Topological Naming), but it can +* also be used as a general purpose string table. +* +* The StringID is to be stored in a string table (StringHasher), and be +* referred to by an integer ID. The stored data can be optionally divided into +* two parts, prefix and postfix. This is because a new mapped name is often +* created by adding some common postfix to an existing name, so data sharing +* can be improved using the following techniques: +* +* a) reference count (through QByteArray) the main data part, +* +* b) (recursively) encode prefix and/or postfix as an integer (in the +* format of #, e.g. #1b) that references another StringID, +* +* c) Check index based name in prefix, e.g. Edge1, Vertex2, and encode +* only the text part as StringID. The index is stored separately in +* reference class StringIDRef to maximize data sharing. +*/ +class AppExport StringID: public Base::BaseClass, public Base::Handled { + TYPESYSTEM_HEADER_WITH_OVERRIDE(); +public: + /// Flag of the stored string data + enum class Flag { + /// No flag + None = 0, + /// The stored data is binary + Binary = 1 << 0, + /// The stored data is the sha1 hash of the original content + Hashed = 1 << 1, + /** Postfix is encoded as #, e.g. #1b, where the hex integer part + * refers to another StringID. + */ + PostfixEncoded = 1 << 2, + /// The data is splited as prefix and postfix + Postfixed = 1 << 3, + /// The prefix data is split as text + index + Indexed = 1 << 4, + /** The prefix data is encoded as #, e.g. #1b, where the hex + * integer part refers to another StringID. + */ + PrefixID = 1 << 5, + /** The prefix split as text + index, where the text is encoded + * using another StringID. + */ + PrefixIDIndex = 1 << 6, + /// The string ID is persistent regardless of internal mark */ + Persistent = 1 << 7, + /// Internal marked used to check if the string ID is used + Marked = 1 << 8, + }; + typedef Base::Flags Flags; + + /** Constructor + * @param id: integer ID of this StringID + * @param data: input data + * @param flags: flags describes the data + * + * User code is not supposed to create StringID directly, but through StringHasher::getID() + */ + StringID(long id, const QByteArray &data, const Flags &flags=Flag::None) + :_id(id),_data(data),_flags(flags) + {} + + /// Constructs an empty StringID + StringID() + :_id(0), _flags(Flag::None) + {} + + virtual ~StringID(); + + /// Returns the ID of this StringID + long value() const {return _id;} + + /// Returns all related StringIDs that used to encode this StringID + const QVector &relatedIDs() const {return _sids;} + + /// @name Flag accessors + //@{ + bool isBinary() const; + bool isHashed() const; + bool isPostfixed() const; + bool isPostfixEncoded() const; + bool isIndexed() const; + bool isPrefixID() const; + bool isPrefixIDIndex() const; + bool isMarked() const; + bool isPersistent() const; + //@} + + /// Checks if this StringID is from the input hasher + bool isFromSameHasher(const StringHasherRef & hasher) const + { + return this->_hasher == hasher; + } + + /// Returns the owner hasher + StringHasherRef getHasher() const + { + return StringHasherRef(_hasher); + } + + /// Returns the data (prefix) + const QByteArray data() const {return _data;} + /// Returns the postfix + const QByteArray postfix() const {return _postfix;} + + virtual PyObject *getPyObject() override; + /// Returns a Python tuple containing both the text and index + PyObject *getPyObjectWithIndex(int index); + + /** Convert to string represtation of this StringID + * @param index: optional index + * + * The format is #. And if index is non zero, then #:. Both + * and are in hex format. + */ + std::string toString(int index) const; + + /// Light weight structure of holding a string ID and associated index + struct IndexID { + long id; + int index; + + explicit operator bool() const { + return id > 0; + } + + friend std::ostream & operator << (std::ostream &s, const IndexID & id) { + s << id.id; + if (id.index) + s << ':' << id.index; + return s; + } + }; + + /** Parse string to get ID and index + * @param name: input string + * @param eof: Whether to check the end of string. If true, then the input + * string must contain only the string representation of this + * StringID + * @param size: input string size, or -1 if the input string is zero terminated. + * @return Return the integer ID and index. + * + * The input string is expected to be in the format of # or with index + * #:, where both id and index are in hex digits. + */ + static IndexID fromString(const char *name, bool eof=true, int size = -1); + + /** Parse string to get ID and index + * @param bytes: input data + * @param eof: Whether to check the end of string. If true, then the input + * string must contain only the string representation of this + * StringID + * + * The input string is expected to be in the format of # or with index + * #:, where both id and index are in hex digits. + */ + static IndexID fromString(const QByteArray &bytes, bool eof=true) { + return fromString(bytes.constData(), eof, bytes.size()); + } + + /** Get the text content of this StringID + * @param index: optional index + * @return Return the text content of this StringID. If the data is binary, + * then output in base64 encoded string. + */ + std::string dataToText(int index) const; + + /** Get the content of this StringID as QByteArray + * @param bytes: output bytes + * @param index: opttional index. + */ + void toBytes(QByteArray &bytes, int index) const { + if (_postfix.size()) + bytes = _data + _postfix; + else if (index) + bytes = _data + QByteArray::number(index); + else + bytes = _data; + } + + /// Mark this StringID as used + void mark() const; + + /// Mark the StringID as persistent regardless of usage mark + void setPersistent(bool enable); + + bool operator<(const StringID &other) const { + return compare(other) < 0; + } + + /** Compare StringID + * @param other: the other StringID for comparison + * @return Returns -1 if less than the other StringID, 1 if greater, or 0 if equal + */ + int compare(const StringID &other) const { + if (_hasher < other._hasher) + return -1; + if (_hasher > other._hasher) + return 1; + if (_id < other._id) + return -1; + if (_id > other._id) + return 1; + return 0; + } + + friend class StringHasher; + +private: + long _id; + QByteArray _data; + QByteArray _postfix; + StringHasher *_hasher = nullptr; + mutable Flags _flags; + mutable QVector _sids; +}; + +////////////////////////////////////////////////////////////////////////// + +/** Counted reference to a StringID instance +*/ +class StringIDRef +{ +public: + StringIDRef() + :_sid(nullptr), _index(0) + {} + + StringIDRef(StringID* p, int index=0) + : _sid(p), _index(index) + { + if (_sid) + _sid->ref(); + } + + StringIDRef(const StringIDRef & other) + : _sid(other._sid) + , _index(other._index) + { + if (_sid) + _sid->ref(); + } + + StringIDRef(StringIDRef && other) + : _sid(other._sid) + , _index(other._index) + { + other._sid = nullptr; + } + + StringIDRef(const StringIDRef & other, int index) + : _sid(other._sid) + , _index(index) + { + if (_sid) + _sid->ref(); + } + + ~StringIDRef() + { + if (_sid) + _sid->unref(); + } + + void reset(const StringIDRef & p = StringIDRef()) { + *this = p; + } + + void reset(const StringIDRef &p, int index) { + *this = p; + this->_index = index; + } + + void swap(StringIDRef &p) { + if(*this != p) { + auto tmp = p; + p = *this; + *this = tmp; + } + } + + StringIDRef & operator=(StringID* p) { + if (_sid == p) + return *this; + if (_sid) + _sid->unref(); + _sid = p; + if (_sid) + _sid->ref(); + this->_index = 0; + return *this; + } + + StringIDRef & operator=(const StringIDRef & p) { + if (_sid != p._sid) { + if (_sid) + _sid->unref(); + _sid = p._sid; + if (_sid) + _sid->ref(); + } + this->_index = p._index; + return *this; + } + + StringIDRef & operator=(StringIDRef && p) { + if (_sid != p._sid) { + if (_sid) + _sid->unref(); + _sid = p._sid; + p._sid = nullptr; + } + this->_index = p._index; + return *this; + } + + bool operator<(const StringIDRef & p) const { + if (!_sid) + return true; + if (!p._sid) + return false; + int res = _sid->compare(*p._sid); + if (res < 0) + return true; + if (res > 0) + return false; + return _index < p._index; + } + + bool operator==(const StringIDRef & p) const { + return _sid == p._sid && _index == p._index; + } + + bool operator!=(const StringIDRef & p) const { + return _sid != p._sid || _index != p._index; + } + + explicit operator bool() const { + return _sid != nullptr; + } + + int getRefCount(void) const { + if (_sid) + return _sid->getRefCount(); + return 0; + } + + std::string toString() const { + if (_sid) + return _sid->toString(_index); + return std::string(); + } + + std::string dataToText() const { + if (_sid) + return _sid->dataToText(_index); + return std::string(); + } + + const char * constData() const { + if (_sid) { + assert(_index == 0); + assert(_sid->postfix().isEmpty()); + return _sid->data().constData(); + } + return ""; + } + + const StringID & deref() const { + return *_sid; + } + + long value() const { + if (_sid) + return _sid->value(); + return 0; + } + + QVector relatedIDs() const { + if (_sid) + return _sid->relatedIDs(); + return QVector(); + } + + bool isBinary() const { + if (_sid) + return _sid->isBinary(); + return false; + } + + bool isHashed() const { + if (_sid) + return _sid->isHashed(); + return false; + } + + void toBytes(QByteArray &bytes) const { + if (_sid) + _sid->toBytes(bytes, _index); + } + + PyObject *getPyObject(void) { + if (_sid) + return _sid->getPyObjectWithIndex(_index); + Py_INCREF(Py_None); + return Py_None; + } + + void mark() const { + if (_sid) + _sid->mark(); + } + + bool isMarked() const { + return _sid && _sid->isMarked(); + } + + bool isFromSameHasher(const StringHasherRef & hasher) const + { + return _sid && _sid->isFromSameHasher(hasher); + } + + StringHasherRef getHasher() const + { + if (_sid) + return _sid->getHasher(); + return StringHasherRef(); + } + + void setPersistent(bool enable) + { + if (_sid) + _sid->setPersistent(enable); + } + + friend class StringHasher; + +private: + StringID *_sid; + int _index; +}; + +/// A String table to map string from/to a unique integer +class AppExport StringHasher: public Base::Persistence, public Base::Handled { + + TYPESYSTEM_HEADER_WITH_OVERRIDE(); + +public: + StringHasher(); + virtual ~StringHasher(); + + virtual unsigned int getMemSize (void) const override; + virtual void Save (Base::Writer &/*writer*/) const override; + virtual void Restore(Base::XMLReader &/*reader*/) override; + virtual void SaveDocFile (Base::Writer &/*writer*/) const override; + virtual void RestoreDocFile (Base::Reader &/*reader*/) override; + void setPersistenceFileName(const char *name) const; + const std::string &getPersistenceFileName() const; + + /** Maps an arbitrary string to an integer + * + * @param text: input string. + * @param len: length of the string, or -1 if the string is 0 terminated. + * @param hashable: whether the string is hashable. + * @return Return a shared pointer to the internally stored StringID. + * + * The function maps an arbitrary text string to a unique integer ID, which + * is returned as a shared pointer to reference count the ID so that it is + * possible to prune any unused strings. + * + * If \c hashable is true and the string is longer than the threshold + * setting of this StringHasher, it will be sha1 hashed before storing, and + * the original content of the string is discarded. If else, the string is + * copied and stored inside a StringID instance. + * + * The purpose of function is to provide a short form of a stable string + * identification. + */ + StringIDRef getID(const char *text, int len=-1, bool hashable=false); + + /// Option for string string data + enum class Option { + /// No option + None = 0, + /// The input data is binary + Binary = 1 << 0, + /** The input data is hashable. If the data length is longer than the + * threshold setting of the StringHasher, it will be sha1 hashed before + * storing, and the original content of the string is discarded. + */ + Hashable = 1 << 1, + /// Do not copy the data, assuming the data is constant. If this option + //is not set, the data will be copied before storing. + NoCopy = 1 << 2, + }; + typedef Base::Flags