Core: Switch away from deprecated wstring_convert and codecvt_utf8
This commit is contained in:
committed by
Kacper Donat
parent
9afebe3357
commit
0ce82c4159
@@ -37,6 +37,10 @@
|
||||
/// Here the FreeCAD includes sorted by Base,App,Gui......
|
||||
#include "Persistence.h"
|
||||
|
||||
#include <QChar>
|
||||
#include <QVector>
|
||||
#include <QByteArray>
|
||||
|
||||
|
||||
using namespace Base;
|
||||
|
||||
@@ -115,13 +119,35 @@ std::string Persistence::encodeAttribute(const std::string& str)
|
||||
|
||||
// clang-format off
|
||||
// https://www.w3.org/TR/xml/#charsets
|
||||
// Nominally allowed ranges
|
||||
/// Inclusive [first, second] code-point ranges accepted as XML characters
/// (see https://www.w3.org/TR/xml/#charsets). A code point that falls in none
/// of these ranges is replaced when a string is validated for XML output.
static constexpr std::array<std::pair<char32_t, char32_t>, 6> validRanges {{
    {0x9, 0x9},          // TAB -- explicitly allowed
    {0xA, 0xA},          // LF -- explicitly allowed
    {0xD, 0xD},          // CR -- explicitly allowed
    {0x20, 0xD7FF},      // from SPACE up to (not including) the UTF-16 surrogate block
    {0xE000, 0xFFFD},    // rest of the BMP, stopping before U+FFFE / U+FFFF
    {0x10000, 0x10FFFF}  // everything beyond the BMP
}};
|
||||
/// Inclusive [first, second] code-point ranges that are filtered out even though
/// they may lie inside validRanges (the XML specification linked above lists them
/// as discouraged -- https://www.w3.org/TR/xml/#charsets).
static constexpr std::array<std::pair<char32_t, char32_t>, 19> discouragedRanges {{
    // DEL and the C1 control block; U+0085 is deliberately left out of both entries
    {0x7F, 0x84},
    {0x86, 0x9F},
    // U+FDD0..U+FDEF inside the BMP
    {0xFDD0, 0xFDEF},
    // The last two code points of each of the planes 1 through 16
    {0x1FFFE, 0x1FFFF},
    {0x2FFFE, 0x2FFFF},
    {0x3FFFE, 0x3FFFF},
    {0x4FFFE, 0x4FFFF},
    {0x5FFFE, 0x5FFFF},
    {0x6FFFE, 0x6FFFF},
    {0x7FFFE, 0x7FFFF},
    {0x8FFFE, 0x8FFFF},
    {0x9FFFE, 0x9FFFF},
    {0xAFFFE, 0xAFFFF},
    {0xBFFFE, 0xBFFFF},
    {0xCFFFE, 0xCFFFF},
    {0xDFFFE, 0xDFFFF},
    {0xEFFFE, 0xEFFFF},
    {0xFFFFE, 0xFFFFF},
    {0x10FFFE, 0x10FFFF}
}};
|
||||
// clang-format on
|
||||
|
||||
@@ -131,21 +157,71 @@ static constexpr std::array<std::pair<char32_t, char32_t>, 6> validRanges {{
|
||||
*/
|
||||
std::string Persistence::validateXMLString(const std::string& str)
|
||||
{
|
||||
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> cvt;
|
||||
std::u32string cp_in = cvt.from_bytes(str);
|
||||
std::u32string cp_out;
|
||||
cp_out.reserve(cp_in.size());
|
||||
for (auto cp : cp_in) {
|
||||
if (std::any_of(validRanges.begin(), validRanges.end(), [cp](const auto& range) {
|
||||
return cp >= range.first && cp <= range.second;
|
||||
})) {
|
||||
cp_out += cp;
|
||||
#if QT_VERSION >= QT_VERSION_CHECK(6, 6, 0)
|
||||
// In newer Qt we cannot use QString::toUcs4, so we have to do it the long way...
|
||||
|
||||
const QString input = QString::fromUtf8(str);
|
||||
QString output;
|
||||
output.reserve(input.size());
|
||||
|
||||
for (auto it = input.cbegin(); it != input.cend();) {
|
||||
const auto cp = static_cast<char32_t>(it->unicode());
|
||||
const uint ch = it->unicode();
|
||||
++it;
|
||||
|
||||
if (QChar::isHighSurrogate(ch) && it != input.cend()
|
||||
&& QChar::isLowSurrogate(static_cast<char32_t>(it->unicode()))) {
|
||||
// We are outside the "Basic Multilingual Plane (BMP)" (directly storable in a UTF-16
|
||||
// char, which is what QString uses internally). So now we have to use *two* chars,
|
||||
// combine them into one for our check, and then run the validity check for XML output.
|
||||
const uint low = it->unicode();
|
||||
++it;
|
||||
const char32_t full = QChar::surrogateToUcs4(ch, low);
|
||||
const bool valid = std::ranges::any_of(validRanges, [full](const auto& r) {
|
||||
return full >= r.first && full <= r.second;
|
||||
});
|
||||
const bool discouraged = std::ranges::any_of(discouragedRanges, [full](const auto& r) {
|
||||
return full >= r.first && full <= r.second;
|
||||
});
|
||||
output.append((valid && !discouraged) ? QChar::fromUcs4(full) : QChar::fromUcs4('_'));
|
||||
}
|
||||
else {
|
||||
cp_out += '_';
|
||||
// The character fits into 16 bytes, it can be checked directly
|
||||
const bool valid = std::ranges::any_of(validRanges, [cp](const auto& r) {
|
||||
return cp >= r.first && cp <= r.second;
|
||||
});
|
||||
const bool discouraged = std::ranges::any_of(discouragedRanges, [cp](const auto& r) {
|
||||
return cp >= r.first && cp <= r.second;
|
||||
});
|
||||
output.append((valid && !discouraged) ? QChar(ch) : QChar::fromUcs2('_'));
|
||||
}
|
||||
}
|
||||
return cvt.to_bytes(cp_out);
|
||||
|
||||
const QByteArray utf8 = output.toUtf8();
|
||||
return {utf8.constData(), static_cast<size_t>(utf8.size())};
|
||||
#else
|
||||
// In older Qt we can directly use QString::toUcs4, which makes for a bit simpler code
|
||||
const QString input = QString::fromStdString(str);
|
||||
const QVector<uint> ucs4 = input.toUcs4();
|
||||
|
||||
QVector<uint> filtered;
|
||||
filtered.reserve(ucs4.size());
|
||||
|
||||
for (uint cp : ucs4) {
|
||||
const char32_t c32 = static_cast<char32_t>(cp);
|
||||
const bool ok = std::ranges::any_of(validRanges, [c32](const auto& r) {
|
||||
return c32 >= r.first && c32 <= r.second;
|
||||
});
|
||||
const bool discouraged = std::ranges::any_of(discouragedRanges, [c32](const auto& r) {
|
||||
return c32 >= r.first && c32 <= r.second;
|
||||
});
|
||||
filtered.push_back((ok && !discouraged) ? cp : static_cast<uint>(U'_'));
|
||||
}
|
||||
|
||||
const QString output = QString::fromUcs4(filtered.constData(), filtered.size());
|
||||
const QByteArray utf8 = output.toUtf8();
|
||||
return {utf8.constData(), static_cast<size_t>(utf8.size())};
|
||||
#endif
|
||||
}
|
||||
|
||||
void Persistence::dumpToStream(std::ostream& stream, int compression)
|
||||
|
||||
@@ -463,6 +463,153 @@ TEST_F(ReaderTest, validDefaults)
|
||||
EXPECT_EQ(value20, TimesIGoToBed::Late);
|
||||
}
|
||||
|
||||
// Printable ASCII text must come back from validateXMLString exactly as it went in.
TEST_F(ReaderTest, AsciiUnchanged)
{
    const std::string plainText {"Hello, world!"};

    const std::string sanitized = Base::Persistence::validateXMLString(plainText);

    EXPECT_EQ(sanitized, plainText);
}
|
||||
|
||||
// TAB, LF and CR are the only control characters listed in validRanges, so they must survive.
TEST_F(ReaderTest, AllowedWhitespacePreserved)
{
    const std::string withWhitespace {"a\tb\nc\rd"};

    const std::string sanitized = Base::Persistence::validateXMLString(withWhitespace);

    EXPECT_EQ(sanitized, withWhitespace);
}
|
||||
|
||||
// C0 control characters other than TAB/LF/CR (here NUL, 0x0F and ESC) are each replaced by '_'.
TEST_F(ReaderTest, DisallowedC0ControlsBecomeUnderscore)
{
    std::string raw {"A"};
    raw.push_back('\x00');
    raw.push_back('\x0F');
    raw.push_back('\x1B');
    raw += "Z";

    const std::string wanted {"A___Z"};
    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, wanted);
}
|
||||
|
||||
// DEL (U+007F) and a C1 control (U+0086) are both listed in discouragedRanges and must each
// collapse into a single '_', even though U+0086 occupies two bytes in UTF-8.
TEST_F(ReaderTest, DelAndC1ControlsBecomeUnderscore)
{
    std::string raw {"X"};
    raw.push_back('\x7F');  // DEL
    raw += std::string("\xC2\x86", 2);  // U+0086 (SSA) in UTF-8: 0xC2 0x86
    raw.push_back('Y');

    const std::string wanted {"X__Y"};
    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, wanted);
}
|
||||
|
||||
namespace
{

// Minimal UTF-8 encoder -- not valid for full production use, test suite only!
//
// Appends the UTF-8 encoding of the code point `cp` to `s`, using 1 to 4 bytes depending on
// the magnitude of `cp`. No validation is performed: surrogate values and values above
// U+10FFFF are encoded mechanically and yield byte sequences that are not well-formed UTF-8.
void append_cp_utf8(std::string& s, char32_t cp)
{
    if (cp <= 0x7F) {
        // 0xxxxxxx
        s.push_back(static_cast<char>(cp));
    }
    else if (cp <= 0x7FF) {
        // 110xxxxx 10xxxxxx
        s.push_back(static_cast<char>(0xC0 | (cp >> 6)));
        s.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    }
    else if (cp <= 0xFFFF) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s.push_back(static_cast<char>(0xE0 | (cp >> 12)));
        s.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
        s.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    }
    else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s.push_back(static_cast<char>(0xF0 | (cp >> 18)));
        s.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
        s.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
        s.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    }
}

// Returns the concatenated UTF-8 encoding of the given code points, in order.
// An empty list yields an empty string.
std::string make_utf8(std::initializer_list<char32_t> cps)
{
    std::string s;
    for (const char32_t cp : cps) {
        append_cp_utf8(s, cp);
    }
    return s;
}

}  // namespace
|
||||
|
||||
// U+FFFD is the last code point of the {0xE000, 0xFFFD} valid range and must be kept;
// U+FFFE lies just past it and must be replaced by '_'.
TEST_F(ReaderTest, BmpBoundaryAndNoncharacters)
{
    const std::string lastAllowed = make_utf8({0xFFFD});      // allowed
    const std::string firstDisallowed = make_utf8({0xFFFE});  // disallowed

    const std::string raw = "X" + lastAllowed + firstDisallowed + "Y";
    const std::string wanted = "X" + lastAllowed + "_" + "Y";

    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, wanted);
}
|
||||
|
||||
// A code point beyond the BMP (U+1F600, a four-byte UTF-8 sequence) must pass through intact.
TEST_F(ReaderTest, NonBmpEmojiPreserved)
{
    // 😀 U+1F600
    const std::string raw = "A" + make_utf8({0x1F600}) + "B";

    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, raw);
}
|
||||
|
||||
// Several non-BMP code points glued together with U+200D (zero-width joiner) must all be
// kept, in order, with the joiners untouched.
TEST_F(ReaderTest, ZwjSequencePreserved)
{
    const std::string joined =
        make_utf8({0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F466});
    const std::string raw = "X" + joined + "Y";

    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, raw);
}
|
||||
|
||||
// A base letter followed by a combining mark (U+0065 U+0301) must not be altered or merged.
TEST_F(ReaderTest, CombiningMarksPreserved)
{
    std::string raw {"caf"};
    raw += make_utf8({0x0065, 0x0301});  // "café" in decomposed form

    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, raw);
}
|
||||
|
||||
// U+E000 is the first code point of the {0xE000, 0xFFFD} valid range, so it must be kept.
TEST_F(ReaderTest, PrivateUseAreaPreserved)
{
    // Yes, actually permitted by XML (as far as I can determine - chennes)
    const std::string raw = "A" + make_utf8({0xE000}) + "B";

    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, raw);
}
|
||||
|
||||
// Disallowed controls on either side of an allowed non-BMP code point: each control becomes
// '_' while the emoji between them is preserved byte-for-byte.
TEST_F(ReaderTest, MixedContentSanitization)
{
    const std::string emoji = make_utf8({0x1F602});  // 😂 allowed

    std::string raw {"A"};
    raw.push_back('\x1F');  // disallowed control -> '_'
    raw += emoji;
    raw.push_back('\x00');  // disallowed control -> '_'
    raw += "Z";

    const std::string wanted = "A_" + emoji + "_Z";

    const std::string sanitized = Base::Persistence::validateXMLString(raw);

    EXPECT_EQ(sanitized, wanted);
}
|
||||
|
||||
TEST_F(ReaderTest, validateXmlString)
|
||||
{
|
||||
std::string input = "abcde";
|
||||
|
||||
Reference in New Issue
Block a user