diff --git a/src/Base/Persistence.cpp b/src/Base/Persistence.cpp index c93b9b6545..927e4e78b9 100644 --- a/src/Base/Persistence.cpp +++ b/src/Base/Persistence.cpp @@ -37,6 +37,10 @@ /// Here the FreeCAD includes sorted by Base,App,Gui...... #include "Persistence.h" +#include +#include +#include + using namespace Base; @@ -115,13 +119,35 @@ std::string Persistence::encodeAttribute(const std::string& str) // clang-format off // https://www.w3.org/TR/xml/#charsets +// Nominally allowed ranges static constexpr std::array, 6> validRanges {{ - {0x9, 0x9}, - {0xA, 0xA}, - {0xD, 0xD}, + {0x9, 0x9}, // TAB -- explicitly allowed + {0xA, 0xA}, // LF -- explicitly allowed + {0xD, 0xD}, // CR -- explicitly allowed {0x20, 0xD7FF}, {0xE000, 0xFFFD}, - {0x10000, 0x10FFFF}, + {0x10000, 0x10FFFF} +}}; +static constexpr std::array, 19> discouragedRanges {{ + {0x7F, 0x84}, + {0x86, 0x9F}, + {0xFDD0, 0xFDEF}, + {0x1FFFE, 0x1FFFF}, + {0x2FFFE, 0x2FFFF}, + {0x3FFFE, 0x3FFFF}, + {0x4FFFE, 0x4FFFF}, + {0x5FFFE, 0x5FFFF}, + {0x6FFFE, 0x6FFFF}, + {0x7FFFE, 0x7FFFF}, + {0x8FFFE, 0x8FFFF}, + {0x9FFFE, 0x9FFFF}, + {0xAFFFE, 0xAFFFF}, + {0xBFFFE, 0xBFFFF}, + {0xCFFFE, 0xCFFFF}, + {0xDFFFE, 0xDFFFF}, + {0xEFFFE, 0xEFFFF}, + {0xFFFFE, 0xFFFFF}, + {0x10FFFE, 0x10FFFF} }}; // clang-format on @@ -131,21 +157,71 @@ static constexpr std::array, 6> validRanges {{ */ std::string Persistence::validateXMLString(const std::string& str) { - std::wstring_convert, char32_t> cvt; - std::u32string cp_in = cvt.from_bytes(str); - std::u32string cp_out; - cp_out.reserve(cp_in.size()); - for (auto cp : cp_in) { - if (std::any_of(validRanges.begin(), validRanges.end(), [cp](const auto& range) { - return cp >= range.first && cp <= range.second; - })) { - cp_out += cp; +#if QT_VERSION >= QT_VERSION_CHECK(6, 6, 0) + // In newer Qt we cannot use QString::toUcs4, so we have to do it the long way... 
+ + const QString input = QString::fromUtf8(str); + QString output; + output.reserve(input.size()); + + for (auto it = input.cbegin(); it != input.cend();) { + const auto cp = static_cast(it->unicode()); + const uint ch = it->unicode(); + ++it; + + if (QChar::isHighSurrogate(ch) && it != input.cend() + && QChar::isLowSurrogate(static_cast(it->unicode()))) { + // We are outside the "Basic Multilingual Plane (BMP)" (directly storable in a UTF-16 + // char, which is what QString uses internally). So now we have to use *two* chars, + // combine them into one for our check, and then run the validity check for XML output. + const uint low = it->unicode(); + ++it; + const char32_t full = QChar::surrogateToUcs4(ch, low); + const bool valid = std::ranges::any_of(validRanges, [full](const auto& r) { + return full >= r.first && full <= r.second; + }); + const bool discouraged = std::ranges::any_of(discouragedRanges, [full](const auto& r) { + return full >= r.first && full <= r.second; + }); + output.append((valid && !discouraged) ? QChar::fromUcs4(full) : QChar::fromUcs4('_')); } else { - cp_out += '_'; + // The character fits into 16 bits (a single UTF-16 code unit), so it can be checked directly + const bool valid = std::ranges::any_of(validRanges, [cp](const auto& r) { + return cp >= r.first && cp <= r.second; + }); + const bool discouraged = std::ranges::any_of(discouragedRanges, [cp](const auto& r) { + return cp >= r.first && cp <= r.second; + }); + output.append((valid && !discouraged) ? 
QChar(ch) : QChar::fromUcs2('_')); } } - return cvt.to_bytes(cp_out); + + const QByteArray utf8 = output.toUtf8(); + return {utf8.constData(), static_cast(utf8.size())}; +#else + // In older Qt we can directly use QString::toUcs4, which makes for a bit simpler code + const QString input = QString::fromStdString(str); + const QVector ucs4 = input.toUcs4(); + + QVector filtered; + filtered.reserve(ucs4.size()); + + for (uint cp : ucs4) { + const char32_t c32 = static_cast(cp); + const bool ok = std::ranges::any_of(validRanges, [c32](const auto& r) { + return c32 >= r.first && c32 <= r.second; + }); + const bool discouraged = std::ranges::any_of(discouragedRanges, [c32](const auto& r) { + return c32 >= r.first && c32 <= r.second; + }); + filtered.push_back((ok && !discouraged) ? cp : static_cast(U'_')); + } + + const QString output = QString::fromUcs4(filtered.constData(), filtered.size()); + const QByteArray utf8 = output.toUtf8(); + return {utf8.constData(), static_cast(utf8.size())}; +#endif } void Persistence::dumpToStream(std::ostream& stream, int compression) diff --git a/tests/src/Base/Reader.cpp b/tests/src/Base/Reader.cpp index b1a3e03802..ff01d72d25 100644 --- a/tests/src/Base/Reader.cpp +++ b/tests/src/Base/Reader.cpp @@ -463,6 +463,153 @@ TEST_F(ReaderTest, validDefaults) EXPECT_EQ(value20, TimesIGoToBed::Late); } +TEST_F(ReaderTest, AsciiUnchanged) +{ + std::string input = "Hello, world!"; + std::string result = Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, input); +} + +TEST_F(ReaderTest, AllowedWhitespacePreserved) +{ + std::string input = "a\tb\nc\rd"; + std::string result = Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, input); +} + +TEST_F(ReaderTest, DisallowedC0ControlsBecomeUnderscore) +{ + std::string input = "A"; + input.push_back(char(0x00)); + input.push_back(char(0x0F)); + input.push_back(char(0x1B)); + input += "Z"; + std::string expected = "A___Z"; + std::string result = 
Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, expected); +} + +TEST_F(ReaderTest, DelAndC1ControlsBecomeUnderscore) +{ + std::string input; + input.push_back('X'); + input.push_back(char(0x7F)); // DEL + // U+0086 (SSA) in UTF-8: 0xC2 0x86 + input += std::string("\xC2\x86", 2); + input.push_back('Y'); + std::string expected = "X__Y"; + std::string result = Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, expected); +} + +namespace +{ + +// Minimal UTF-8 encoder -- not valid for full production use, test suite only! +static void append_cp_utf8(std::string& s, char32_t cp) +{ + if (cp <= 0x7F) { + s.push_back(static_cast(cp)); + } + else if (cp <= 0x7FF) { + s.push_back(static_cast(0xC0 | (cp >> 6))); + s.push_back(static_cast(0x80 | (cp & 0x3F))); + } + else if (cp <= 0xFFFF) { + s.push_back(static_cast(0xE0 | (cp >> 12))); + s.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (cp & 0x3F))); + } + else { + s.push_back(static_cast(0xF0 | (cp >> 18))); + s.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + s.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + s.push_back(static_cast(0x80 | (cp & 0x3F))); + } +} + +static std::string make_utf8(std::initializer_list cps) +{ + std::string s; + for (char32_t cp : cps) { + append_cp_utf8(s, cp); + } + return s; +} + +} // namespace + +TEST_F(ReaderTest, BmpBoundaryAndNoncharacters) +{ + std::string input; + input += "X"; + input += make_utf8({0xFFFD}); // allowed + input += make_utf8({0xFFFE}); // disallowed + input += "Y"; + + std::string expected; + expected += "X"; + expected += make_utf8({0xFFFD}); + expected += "_"; + expected += "Y"; + + std::string result = Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, expected); +} + +TEST_F(ReaderTest, NonBmpEmojiPreserved) +{ + // 😀 U+1F600 + std::string emoji = make_utf8({0x1F600}); + std::string input = "A" + emoji + "B"; + std::string result = 
Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, input); +} + +TEST_F(ReaderTest, ZwjSequencePreserved) +{ + std::string family = make_utf8({0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F466}); + std::string input = "X" + family + "Y"; + std::string result = Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, input); +} + +TEST_F(ReaderTest, CombiningMarksPreserved) +{ + std::string decomposed = std::string("caf") + make_utf8({0x0065, 0x0301}); // "café" + std::string result = Base::Persistence::validateXMLString(decomposed); + EXPECT_EQ(result, decomposed); +} + +TEST_F(ReaderTest, PrivateUseAreaPreserved) +{ + // Yes, actually permitted by XML (as far as I can determine - chennes) + std::string pua = make_utf8({0xE000}); + std::string input = "A" + pua + "B"; + std::string result = Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, input); +} + +TEST_F(ReaderTest, MixedContentSanitization) +{ + std::string input; + input += "A"; + input.push_back(char(0x1F)); // disallowed control -> '_' + input += make_utf8({0x1F602}); // 😂 allowed + input.push_back(char(0x00)); // disallowed control -> '_' + input += "Z"; + + std::string expected; + expected += "A"; + expected += "_"; + expected += make_utf8({0x1F602}); + expected += "_"; + expected += "Z"; + + std::string result = Base::Persistence::validateXMLString(input); + EXPECT_EQ(result, expected); +} + TEST_F(ReaderTest, validateXmlString) { std::string input = "abcde";