App/Toponaming: Add original StringHasher

Also includes StringIDPy. This is realthunder's original code, it does not compile in the current codebase yet.
2023-03-30 09:43:53 -05:00
parent 0b29d5f338
commit d9e171e5d0
7 changed files with 1810 additions and 0 deletions
--- a/src/App/StringHasher.cpp
+++ b/src/App/StringHasher.cpp
@@ -0,0 +1,749 @@
+/****************************************************************************
+*   Copyright (c) 2022 Zheng Lei (realthunder) <realthunder.dev@gmail.com> *
+*                                                                          *
+*   This file is part of the FreeCAD CAx development system.               *
+*                                                                          *
+*   This library is free software; you can redistribute it and/or          *
+*   modify it under the terms of the GNU Library General Public            *
+*   License as published by the Free Software Foundation; either           *
+*   version 2 of the License, or (at your option) any later version.       *
+*                                                                          *
+*   This library  is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of         *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
+*   GNU Library General Public License for more details.                   *
+*                                                                          *
+*   You should have received a copy of the GNU Library General Public      *
+*   License along with this library; see the file COPYING.LIB. If not,     *
+*   write to the Free Software Foundation, Inc., 59 Temple Place,          *
+*   Suite 330, Boston, MA  02111-1307, USA                                 *
+*                                                                          *
+****************************************************************************/
+
+#include "PreCompiled.h"
+
+#include <deque>
+#include <QCryptographicHash>
+#include <QHash>
+
+#include <Base/Reader.h>
+#include <Base/Writer.h>
+
+#include <boost/algorithm/string/classification.hpp>
+#include <boost/algorithm/string/split.hpp>
+#include <boost/bimap.hpp>
+#include <boost/bimap/set_of.hpp>
+#include <boost/bimap/unordered_set_of.hpp>
+#include <boost/iostreams/stream.hpp>
+
+#include "StringHasher.h"
+#include "StringHasherPy.h"
+#include "DocumentParams.h"
+#include "MappedElement.h"
+#include "StringIDPy.h"
+
+
+FC_LOG_LEVEL_INIT("App",true,true)
+
+namespace bio = boost::iostreams;
+using namespace App;
+
+///////////////////////////////////////////////////////////
+
+struct StringIDHasher {
+   std::size_t operator()(const StringID *sid) const {
+       if (!sid)
+           return 0;
+       return qHash(sid->data(), qHash(sid->postfix()));
+   }
+
+   bool operator()(const StringID *IDa, const StringID *IDb) const {
+       if (IDa == IDb)
+           return true;
+       if (!IDa || !IDb)
+           return false;
+       return IDa->data() == IDb->data() && IDa->postfix() == IDb->postfix();
+   }
+};
+
+typedef boost::bimap<
+   boost::bimaps::unordered_set_of<StringID *,
+                                   StringIDHasher,
+                                   StringIDHasher>,
+   boost::bimaps::set_of<long>>
+   HashMapBase;
+
+class StringHasher::HashMap: public HashMapBase
+{
+public:
+   bool SaveAll = false;
+   int Threshold = 0;
+};
+
+///////////////////////////////////////////////////////////
+
+TYPESYSTEM_SOURCE_ABSTRACT(App::StringID, Base::BaseClass)
+
+StringID::~StringID()
+{
+   if (_hasher)
+       _hasher->_hashes->right.erase(_id);
+}
+
+PyObject *StringID::getPyObject() {
+   return new StringIDPy(this);
+}
+
+PyObject *StringID::getPyObjectWithIndex(int index) {
+   auto res = new StringIDPy(this);
+   res->_index = index;
+   return res;
+}
+
+std::string StringID::toString(int index) const {
+   std::ostringstream ss;
+   ss << '#' << std::hex << value();
+   if (index)
+       ss << ':' << index;
+   return ss.str();
+}
+
+StringID::IndexID StringID::fromString(const char *name, bool eof, int size) {
+   IndexID res;
+   res.id = 0;
+   res.index = 0;
+   if (!name) {
+       res.id = -1;
+       return res;
+   }
+   if (size < 0)
+       size = std::strlen(name);
+   bio::stream<bio::array_source> iss(name, size);
+   char sep = 0;
+   char sep2 = 0;
+   iss >> sep >> std::hex >> res.id >> sep2 >> res.index;
+   if ((eof && !iss.eof()) || sep != '#' || (sep2 != 0 && sep2 != ':')) {
+       res.id = -1;
+       return res;
+   }
+   return res;
+}
+
+std::string StringID::dataToText(int index) const {
+   if(isHashed() || isBinary())
+       return _data.toBase64().constData();
+
+   std::string res(_data.constData());
+   if (index)
+       res += std::to_string(index);
+   if (_postfix.size())
+       res += _postfix.constData();
+   return res;
+}
+
+void StringID::mark() const
+{
+   if (isMarked())
+       return;
+   _flags.setFlag(Flag::Marked);
+   for (auto &sid : _sids)
+       sid.deref().mark();
+}
+
+///////////////////////////////////////////////////////////
+
+TYPESYSTEM_SOURCE(App::StringHasher, Base::Persistence)
+
+StringHasher::StringHasher()
+   :_hashes(new HashMap)
+{}
+
+StringHasher::~StringHasher() {
+   clear();
+}
+
+void StringHasher::setSaveAll(bool enable) {
+   if (_hashes->SaveAll == enable)
+       return;
+   _hashes->SaveAll = enable;
+   compact();
+}
+
+void StringHasher::compact()
+{
+   if (_hashes->SaveAll)
+       return;
+
+   std::deque<StringIDRef> pendings;
+   for (auto &v : _hashes->right) {
+       if (!v.second->isPersistent() && v.second->getRefCount() == 1)
+           pendings.emplace_back(v.second);
+   }
+   while (pendings.size()) {
+       StringIDRef sid = pendings.front();
+       pendings.pop_front();
+       if (!_hashes->right.erase(sid.value()))
+           continue;
+       sid._sid->_hasher = nullptr;
+       sid._sid->unref();
+       for (auto &s : sid._sid->_sids) {
+           if (s._sid->_hasher == this
+               && !s._sid->isPersistent()
+               && s._sid->getRefCount() == 2)
+               pendings.push_back(s);
+       }
+   }
+}
+
+bool StringHasher::getSaveAll() const {
+   return _hashes->SaveAll;
+}
+
+void StringHasher::setThreshold(int threshold) {
+   _hashes->Threshold = threshold;
+}
+
+int StringHasher::getThreshold() const {
+   return _hashes->Threshold;
+}
+
+long StringHasher::lastID() const {
+   if(_hashes->right.empty())
+       return 0;
+   auto it = _hashes->right.end();
+   --it;
+   return it->first;
+}
+
+StringIDRef StringHasher::getID(const char *text, int len, bool hashable) {
+   if (len < 0)
+       len = strlen(text);
+   return getID(QByteArray::fromRawData(text, len), hashable ? Option::Hashable : Option::None);
+}
+
+StringIDRef StringHasher::getID(const QByteArray &data, Options options)
+{
+   bool binary = options.testFlag(Option::Binary);
+   bool hashable = options.testFlag(Option::Hashable);
+   bool nocopy = options.testFlag(Option::NoCopy);
+
+   bool hashed = hashable && _hashes->Threshold > 0
+       && (int)data.size() > _hashes->Threshold;
+
+   StringID dataID;
+   if (hashed) {
+       QCryptographicHash hasher(QCryptographicHash::Sha1);
+       hasher.addData(data);
+       dataID._data = hasher.result();
+   }
+   else
+       dataID._data = data;
+
+   auto it = _hashes->left.find(&dataID);
+   if (it != _hashes->left.end())
+       return StringIDRef(it->first);
+
+   if (!hashed && !nocopy)
+       // if not hashed, make a deep copy of the data
+       dataID._data = QByteArray(data.constData(), data.size());
+
+   StringID::Flags flags(StringID::Flag::None);
+   if (binary)
+       flags.setFlag(StringID::Flag::Binary);
+   if (hashed)
+       flags.setFlag(StringID::Flag::Hashed);
+   StringIDRef sid(new StringID(lastID() + 1, dataID._data, flags));
+   return StringIDRef(insert(sid));
+}
+
+StringIDRef StringHasher::getID(const Data::MappedName &name,
+                               const QVector<StringIDRef> & sids)
+{
+   StringID anID;
+   anID._postfix = name.postfixBytes();
+
+   Data::IndexedName indexed;
+   if (!anID._postfix.size())
+       indexed = Data::IndexedName(name.dataBytes());
+   if (indexed)
+       anID._data = QByteArray::fromRawData(indexed.getType(), strlen(indexed.getType()));
+   else
+       anID._data = name.dataBytes();
+
+   auto it = _hashes->left.find(&anID);
+   if (it != _hashes->left.end()) {
+       auto res = StringIDRef(it->first);
+       if (indexed)
+           res._index = indexed.getIndex();
+       return res;
+   }
+
+   if (!indexed && name.isRaw())
+       anID._data = QByteArray(name.dataBytes().constData(),
+                               name.dataBytes().size());
+
+   StringIDRef postfixRef;
+   if (anID._postfix.size() && anID._postfix.indexOf("#") < 0) {
+       postfixRef = getID(anID._postfix);
+       postfixRef.toBytes(anID._postfix);
+   }
+
+   StringIDRef indexRef;
+   if (indexed)
+       indexRef = getID(anID._data);
+
+   StringIDRef sid(new StringID(lastID() + 1, anID._data));
+   StringID &id = *sid._sid;
+   if (anID._postfix.size()) {
+       id._flags.setFlag(StringID::Flag::Postfixed);
+       id._postfix = anID._postfix;
+   }
+
+   int count = 0;
+   for (auto &sid : sids) {
+       if (sid && sid._sid->_hasher == this)
+           ++count;
+   }
+
+   int extra = (postfixRef ? 1 : 0) + (indexRef ? 1 : 0);
+   if (count == sids.size() && !postfixRef && !indexRef)
+       id._sids = sids;
+   else {
+       id._sids.reserve(count + extra);
+       if (postfixRef) {
+           id._flags.setFlag(StringID::Flag::PostfixEncoded);
+           id._sids.push_back(postfixRef);
+       }
+       if (indexRef) {
+           id._flags.setFlag(StringID::Flag::Indexed);
+           id._sids.push_back(indexRef);
+       }
+       for (auto &s : sids) {
+           if (s && s._sid->_hasher == this)
+               id._sids.push_back(s);
+       }
+   }
+   if (id._sids.size() > 10) {
+       std::sort(id._sids.begin() + extra, id._sids.end());
+       id._sids.erase(std::unique(id._sids.begin() + extra, id._sids.end()), id._sids.end());
+   }
+
+   if (id._postfix.size() && !indexed) {
+       StringID::IndexID res = StringID::fromString(id._data);
+       if (res.id > 0) {
+           int offset = id.isPostfixEncoded() ? 1 : 0;
+           for (int i = offset; i < id._sids.size(); ++i) {
+               if (id._sids[i].value() == res.id) {
+                   if (i!=offset)
+                       std::swap(id._sids[offset], id._sids[i]);
+                   if (res.index != 0)
+                       id._flags.setFlag(StringID::Flag::PrefixIDIndex);
+                   else
+                       id._flags.setFlag(StringID::Flag::PrefixID);
+                   break;
+               }
+           }
+       }
+   }
+
+   return StringIDRef(insert(sid), indexed.getIndex());
+}
+
+StringIDRef StringHasher::getID(long id, int index) const {
+   if (id<=0)
+       return StringIDRef();
+   auto it = _hashes->right.find(id);
+   if (it == _hashes->right.end())
+       return StringIDRef();
+   StringIDRef res(it->second);
+   res._index = index;
+   return res;
+}
+
+void StringHasher::setPersistenceFileName(const char *filename) const {
+   if (!filename)
+       filename = "";
+   _filename = filename;
+}
+
+const std::string &StringHasher::getPersistenceFileName() const {
+   return _filename;
+}
+
+void StringHasher::Save(Base::Writer &writer) const {
+
+   size_t count;
+   if (_hashes->SaveAll)
+       count = _hashes->size();
+   else {
+       count = 0;
+       for (auto &v : _hashes->right) {
+           if (v.second->isMarked() || v.second->isPersistent())
+               ++count;
+       }
+   }
+
+   writer.Stream() << writer.ind()
+                   << "<StringHasher saveall=\"" << _hashes->SaveAll
+                   << "\" threshold=\"" << _hashes->Threshold << "\"";
+
+   if (!count) {
+       writer.Stream() << " count=\"0\"></StringHasher>\n";
+       return;
+   }
+
+   writer.Stream() << " count=\"0\" new=\"1\"/>\n";
+
+   writer.Stream() << writer.ind() << "<StringHasher2 ";
+   if (_filename.size()) {
+       writer.Stream() << " file=\""
+                       << writer.addFile(_filename + ".txt", this)
+                       << "\"/>\n";
+       return;
+   }
+
+   writer.Stream() << " count=\"" << count << "\">\n";
+   saveStream(writer.beginCharStream(false) << '\n');
+   writer.endCharStream() << '\n';
+   writer.Stream() << writer.ind() << "</StringHasher2>\n";
+}
+
+void StringHasher::SaveDocFile(Base::Writer &writer) const {
+   std::size_t count = _hashes->SaveAll?this->size():this->count();
+   writer.Stream() << "StringTableStart v1 " << count << '\n';
+   saveStream(writer.Stream());
+}
+
+void StringHasher::saveStream(std::ostream &s) const {
+   Base::OutputStream str(s,false);
+   boost::io::ios_flags_saver ifs(s);
+   s << std::hex;
+
+   bool allowRealtive = DocumentParams::getRelativeStringID();
+   long anchor = 0;
+   const StringID *last = nullptr;
+   long lastid = 0;
+   bool relative = false;
+
+   for (auto &v : _hashes->right) {
+       auto &d = *v.second;
+       long id = d._id;
+       if (!_hashes->SaveAll && !d.isMarked() && !d.isPersistent())
+           continue;
+
+       if (!allowRealtive)
+           s << id;
+       else {
+           // We use relative coding to save space. But in order to have some
+           // minimum protection against corruption, write an absolute value every
+           // once a while.
+           relative = (id - anchor) < 1000;
+           if (relative)
+               s << '-' << id - lastid;
+           else  {
+               anchor = id;
+               s << id;
+           }
+           lastid = id;
+       }
+
+       int offset = d.isPostfixEncoded() ? 1 : 0;
+
+       StringID::IndexID prefixid;
+       prefixid.id = 0;
+       prefixid.index = 0;
+       if (d.isPrefixID()) {
+           assert(d._sids.size() > offset);
+           prefixid.id = d._sids[offset].value();
+       }
+       else if (d.isPrefixIDIndex()) {
+           prefixid = StringID::fromString(d._data);
+           assert(d._sids.size() > offset && d._sids[offset].value() == prefixid.id);
+       }
+
+       auto flags = d._flags;
+       flags.setFlag(StringID::Flag::Marked, false);
+       s << '.' << flags.toUnderlyingType();
+
+       int i = 0;
+       if (!relative) {
+           for (; i < d._sids.size(); ++i)
+               s << '.' << d._sids[i].value();
+       }
+       else {
+           if (last) {
+               for (; i < d._sids.size() && i < last->_sids.size(); ++i) {
+                   long m = last->_sids[i].value();
+                   long n = d._sids[i].value();
+                   if (n < m)
+                       s << ".-" << m-n;
+                   else
+                       s << '.' << n - m;
+               }
+           }
+           for (; i < d._sids.size(); ++i)
+               s << '.' << id - d._sids[i].value();
+       }
+
+       last = &d;
+
+       // Having postfix means it is a geometry element name, which
+       // guarantees to be a single line without space. So it is safe to
+       // store in raw stream.
+       if (d.isPostfixed()) {
+           if (d.isPrefixIDIndex())
+               s << ' ' << prefixid.index;
+           else if (!d.isIndexed() && !d.isPrefixID())
+               s << ' ' << d._data.constData();
+
+           if (!d.isPostfixEncoded())
+               s << ' ' << d._postfix.constData();
+           s << '\n';
+       }
+       else {
+           // Reaching here means the string may contain space and newlines
+           // We rely on OutputStream (i.e. str) to save the string.
+           s << ' ';
+           str << d._data.constData();
+       }
+   }
+}
+
+void StringHasher::RestoreDocFile(Base::Reader &reader) {
+   std::string marker, ver;
+   reader >> marker;
+   std::size_t count;
+   _hashes->clear();
+   if (marker == "StringTableStart") {
+       reader >> ver >> count;
+       if (ver != "v1")
+           FC_WARN("Unknown string table format");
+       restoreStreamNew(reader, count);
+       return;
+   }
+   count = atoi(marker.c_str());
+   restoreStream(reader, count);
+}
+
+void StringHasher::restoreStreamNew(std::istream &s, std::size_t count) {
+   Base::InputStream str(s, false);
+   _hashes->clear();
+   std::string content;
+   boost::io::ios_flags_saver ifs(s);
+   s >> std::hex;
+   std::vector<std::string> tokens;
+   long lastid = 0;
+   const StringID *last = nullptr;
+
+   std::string tmp;
+
+   for (uint32_t i = 0; i < count; ++i) {
+       if (!(s >> tmp))
+           FC_THROWM(Base::RuntimeError, "Invalid string table");
+
+       tokens.clear();
+       boost::split(tokens, tmp, boost::is_any_of("."));
+       if (tokens.size() < 2)
+           FC_THROWM(Base::RuntimeError, "Invalid string table");
+
+       long id;
+       bool relative = false;
+       if (tokens[0][0] == '-') {
+           relative = true;
+           id = lastid + strtol(tokens[0].c_str() + 1, nullptr, 16);
+       }
+       else
+           id = strtol(tokens[0].c_str(), nullptr, 16);
+
+       lastid = id;
+
+       unsigned long flag = strtol(tokens[1].c_str(), nullptr, 16);
+       StringIDRef sid(new StringID(id, QByteArray(), static_cast<StringID::Flag>(flag)));
+
+       StringID &d = *sid._sid;
+       d._sids.reserve(tokens.size() - 2);
+
+       int j = 2;
+       if (relative && last) {
+           for (; j < (int)tokens.size() && j - 2 < last->_sids.size(); ++j) {
+               long m = last->_sids[j - 2].value();
+               long n;
+               if (tokens[j][0] == '-')
+                   n = -strtol(&tokens[j][1], nullptr, 16);
+               else
+                   n = strtol(&tokens[j][0], nullptr, 16);
+               StringIDRef sid = getID(m + n);
+               if (!sid)
+                   FC_THROWM(Base::RuntimeError, "Invalid string id reference");
+               d._sids.push_back(sid);
+           }
+       }
+       for (; j < (int)tokens.size(); ++j) {
+           long n = strtol(&tokens[j][0], nullptr, 16);
+           StringIDRef sid = getID(relative ? id - n : n);
+           if (!sid)
+               FC_THROWM(Base::RuntimeError, "Invalid string id reference");
+           d._sids.push_back(sid);
+       }
+
+       if (!d.isPostfixed()) {
+           str >> content;
+           if (d.isHashed() || d.isBinary())
+               d._data = QByteArray::fromBase64(content.c_str());
+           else
+               d._data = content.c_str();
+       }
+       else {
+           int offset = 0;
+           if (d.isPostfixEncoded()) {
+               offset = 1;
+               if (d._sids.empty())
+                   FC_THROWM(Base::RuntimeError, "Missing string postfix");
+               d._postfix = d._sids[0]._sid->_data;
+           }
+           if (d.isIndexed()) {
+               if (d._sids.size() <= offset)
+                   FC_THROWM(Base::RuntimeError, "Missing string prefix");
+               d._data = d._sids[offset]._sid->_data;
+           }
+           else if (d.isPrefixID() || d.isPrefixIDIndex()) {
+               if (d._sids.size() <= offset)
+                   FC_THROWM(Base::RuntimeError, "Missing string prefix id");
+               int index = 0;
+               if (d.isPrefixIDIndex()) {
+                   if (!(s >> index))
+                       FC_THROWM(Base::RuntimeError, "Missing string prefix index");
+               }
+               d._data = d._sids[offset]._sid->toString(index).c_str();
+           }
+           else {
+               s >> content;
+               d._data = content.c_str();
+           }
+           if (!d.isPostfixEncoded()) {
+               s >> content;
+               d._postfix = content.c_str();
+           }
+       }
+
+       last = insert(sid);
+   }
+}
+
+StringID *StringHasher::insert(const StringIDRef &sid)
+{
+   assert(sid && sid._sid->_hasher == nullptr);
+   auto &d = *sid._sid;
+   d._hasher = this;
+   d.ref();
+   auto res = _hashes->right.insert(_hashes->right.end(),
+                                    HashMap::right_map::value_type(sid.value(), &d));
+   if (res->second != &d) {
+       d._hasher = nullptr;
+       d.unref();
+   }
+   return res->second;
+}
+
+void StringHasher::restoreStream(std::istream &s, std::size_t count) {
+   Base::InputStream str(s, false);
+   _hashes->clear();
+   std::string content;
+   for (uint32_t i = 0; i < count; ++i) {
+       int32_t id;
+       uint8_t type;
+       str >> id >> type >> content;
+       StringIDRef sid = new StringID(id, QByteArray(), static_cast<StringID::Flag>(type));
+       if (sid.isHashed() || sid.isBinary())
+           sid._sid->_data = QByteArray::fromBase64(content.c_str());
+       else
+           sid._sid->_data = QByteArray(content.c_str());
+       insert(sid);
+   }
+}
+
+void StringHasher::clear() {
+   for (auto & v : _hashes->right) {
+       v.second->_hasher = nullptr;
+       v.second->unref();
+   }
+   _hashes->clear();
+}
+
+size_t StringHasher::size() const {
+   return _hashes->size();
+}
+
+size_t StringHasher::count() const {
+   size_t count = 0;
+   for (auto &v : _hashes->right)
+       if (v.second->getRefCount() > 1)
+           ++count;
+   return count;
+}
+
+void StringHasher::Restore(Base::XMLReader &reader) {
+   clear();
+   reader.readElement("StringHasher");
+   _hashes->SaveAll = reader.getAttributeAsInteger("saveall") ? true : false;
+   _hashes->Threshold = reader.getAttributeAsInteger("threshold");
+
+   bool newtag = false;
+   if (reader.getAttributeAsInteger("new", "0") > 0) {
+       reader.readElement("StringHasher2");
+       newtag = true;
+   }
+
+   if (reader.hasAttribute("file")) {
+       const char *file = reader.getAttribute("file");
+       if(*file)
+           reader.addFile(file, this);
+       return;
+   }
+
+   std::size_t count = reader.getAttributeAsUnsigned("count");
+   if (newtag) {
+       restoreStreamNew(reader.beginCharStream(false), count);
+       reader.readEndElement("StringHasher2");
+       return;
+   }
+   else if (count && reader.FileVersion > 1)
+       restoreStream(reader.beginCharStream(false), count);
+   else {
+       for (std::size_t i = 0; i < count; ++i) {
+           reader.readElement("Item");
+           StringIDRef sid;
+           long id = reader.getAttributeAsInteger("id");
+           bool hashed = reader.hasAttribute("hash");
+           if (hashed || reader.hasAttribute("data")) {
+               const char *value = hashed ? reader.getAttribute("hash") : reader.getAttribute("data");
+               sid = new StringID(id, QByteArray::fromBase64(value), StringID::Flag::Hashed);
+           }
+           else
+               sid = new StringID(id, QByteArray(reader.getAttribute("text")));
+           insert(sid);
+       }
+   }
+   reader.readEndElement("StringHasher");
+}
+
+unsigned int StringHasher::getMemSize (void) const {
+   return (_hashes->SaveAll?size():count()) * 10;
+}
+
+PyObject *StringHasher::getPyObject() {
+   return new StringHasherPy(this);
+}
+
+std::map<long, StringIDRef> StringHasher::getIDMap() const {
+   std::map<long,StringIDRef> ret;
+   for (auto &v : _hashes->right)
+       ret.emplace_hint(ret.end(), v.first, StringIDRef(v.second));
+   return ret;
+}
+
+void StringHasher::clearMarks() const
+{
+   for (auto &v : _hashes->right)
+       v.second->_flags.setFlag(StringID::Flag::Marked, false);
+}