Base: Allow all valid Python 3 identifier characters
Note: Does not check for keywords, only character classes (not a change from the original code).
This commit is contained in:
committed by
Kacper Donat
parent
7b85239093
commit
c26e3e9756
@@ -77,6 +77,7 @@ SetupDoxygen()
|
||||
SetupLibFmt()
|
||||
SetupYamlCpp()
|
||||
SetupZipIos()
|
||||
find_package(ICU REQUIRED COMPONENTS uc i18n)
|
||||
if(NOT FREECAD_LIBPACK_USE OR FREECAD_LIBPACK_CHECKFILE_CLBUNDLER OR FREECAD_LIBPACK_CHECKFILE_VERSION)
|
||||
SetupPython()
|
||||
SetupPCL()
|
||||
|
||||
@@ -141,6 +141,7 @@ macro(PrintFinalReport)
|
||||
conditional(pybind11 pybind11_FOUND "not enabled" ${pybind11_VERSION})
|
||||
simple(Boost ${Boost_VERSION})
|
||||
simple(XercesC "${XercesC_VERSION} [${XercesC_LIBRARIES}] [${XercesC_INCLUDE_DIRS}]")
|
||||
simple(ICU "${ICU_VERSION}")
|
||||
simple(ZLIB "${ZLIB_VERSION_STRING}")
|
||||
simple(OCC "${OCC_VERSION_STRING} [${OCC_LIBRARY_DIR}] [${OCC_INCLUDE_DIR}]")
|
||||
simple(OCC_Libs "[${OCC_LIBRARIES}]")
|
||||
|
||||
@@ -28,6 +28,7 @@ target_include_directories(
|
||||
${XercesC_INCLUDE_DIRS}
|
||||
${ZLIB_INCLUDE_DIR}
|
||||
${ZIPIOS_INCLUDES}
|
||||
${ICU_INCLUDE_DIRS}
|
||||
)
|
||||
|
||||
if(MSVC)
|
||||
@@ -35,6 +36,7 @@ if(MSVC)
|
||||
${Boost_LIBRARIES}
|
||||
${XercesC_LIBRARIES}
|
||||
${ZLIB_LIBRARIES}
|
||||
${ICU_LIBRARIES}
|
||||
debug MSVCRTD.LIB
|
||||
debug MSVCPRTD.LIB
|
||||
optimized MSVCRT.LIB
|
||||
@@ -59,6 +61,7 @@ elseif(MINGW)
|
||||
${XercesC_LIBRARIES}
|
||||
${Boost_LIBRARIES}
|
||||
${ZLIB_LIBRARIES}
|
||||
${ICU_LIBRARIES}
|
||||
Rpcrt4.lib
|
||||
)
|
||||
else(MSVC)
|
||||
@@ -66,6 +69,7 @@ else(MSVC)
|
||||
${XercesC_LIBRARIES}
|
||||
${Boost_LIBRARIES}
|
||||
${ZLIB_LIBRARIES}
|
||||
${ICU_LIBRARIES}
|
||||
)
|
||||
endif(MSVC)
|
||||
|
||||
|
||||
@@ -83,36 +83,33 @@
|
||||
#include <sstream>
|
||||
|
||||
// Xerces
|
||||
#include <xercesc/util/OutOfMemoryException.hpp>
|
||||
#include <xercesc/util/PlatformUtils.hpp>
|
||||
#include <xercesc/util/XercesVersion.hpp>
|
||||
#include <xercesc/dom/DOM.hpp>
|
||||
#include <xercesc/dom/DOMImplementation.hpp>
|
||||
#include <xercesc/dom/DOMImplementationLS.hpp>
|
||||
#include <xercesc/dom/DOMDocument.hpp>
|
||||
#include <xercesc/dom/DOMElement.hpp>
|
||||
#include <xercesc/dom/DOMImplementation.hpp>
|
||||
#include <xercesc/dom/DOMImplementationLS.hpp>
|
||||
#include <xercesc/dom/DOMText.hpp>
|
||||
#include <xercesc/framework/StdOutFormatTarget.hpp>
|
||||
#include <xercesc/framework/LocalFileFormatTarget.hpp>
|
||||
#include <xercesc/framework/LocalFileInputSource.hpp>
|
||||
#include <xercesc/framework/MemBufFormatTarget.hpp>
|
||||
#include <xercesc/framework/MemBufInputSource.hpp>
|
||||
#include <xercesc/framework/MemoryManager.hpp>
|
||||
#include <xercesc/framework/StdOutFormatTarget.hpp>
|
||||
#include <xercesc/parsers/XercesDOMParser.hpp>
|
||||
#include <xercesc/util/XMLUni.hpp>
|
||||
#include <xercesc/util/XMLUniDefs.hpp>
|
||||
#include <xercesc/util/XMLString.hpp>
|
||||
#include <xercesc/util/PlatformUtils.hpp>
|
||||
#include <xercesc/sax/SAXParseException.hpp>
|
||||
#include <xercesc/sax/EntityResolver.hpp>
|
||||
#include <xercesc/sax/ErrorHandler.hpp>
|
||||
#include <xercesc/sax/SAXParseException.hpp>
|
||||
#include <xercesc/sax/SAXException.hpp>
|
||||
#include <xercesc/sax/SAXParseException.hpp>
|
||||
#include <xercesc/sax2/Attributes.hpp>
|
||||
#include <xercesc/sax2/DefaultHandler.hpp>
|
||||
#include <xercesc/sax2/Attributes.hpp>
|
||||
#include <xercesc/sax2/SAX2XMLReader.hpp>
|
||||
#include <xercesc/sax2/XMLReaderFactory.hpp>
|
||||
#include <xercesc/util/OutOfMemoryException.hpp>
|
||||
#include <xercesc/util/PlatformUtils.hpp>
|
||||
#include <xercesc/util/XMLString.hpp>
|
||||
#include <xercesc/util/XMLUni.hpp>
|
||||
#include <xercesc/util/XMLUniDefs.hpp>
|
||||
#include <xercesc/util/XercesVersion.hpp>
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <boost/algorithm/string/predicate.hpp>
|
||||
@@ -120,6 +117,10 @@
|
||||
#include <boost/regex.hpp>
|
||||
#include <boost/tokenizer.hpp>
|
||||
|
||||
// Unicode
|
||||
#include <unicode/unistr.h>
|
||||
#include <unicode/uchar.h>
|
||||
|
||||
// QtCore
|
||||
#include <QBuffer>
|
||||
#include <QByteArray>
|
||||
|
||||
@@ -23,6 +23,8 @@
|
||||
|
||||
#include "PreCompiled.h"
|
||||
#ifndef _PreComp_
|
||||
#include <unicode/unistr.h>
|
||||
#include <unicode/uchar.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
@@ -34,26 +36,77 @@
|
||||
#include "Interpreter.h"
|
||||
#include "Tools.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
constexpr auto underscore = static_cast<UChar32>(U'_');
|
||||
|
||||
bool isValidFirstChar(UChar32 c)
|
||||
{
|
||||
auto category = static_cast<UCharCategory>(u_charType(c));
|
||||
|
||||
return (c == underscore)
|
||||
|| (category == U_UPPERCASE_LETTER || category == U_LOWERCASE_LETTER
|
||||
|| category == U_TITLECASE_LETTER || category == U_MODIFIER_LETTER
|
||||
|| category == U_OTHER_LETTER || category == U_LETTER_NUMBER);
|
||||
}
|
||||
|
||||
bool isValidSubsequentChar(UChar32 c)
|
||||
{
|
||||
auto category = static_cast<UCharCategory>(u_charType(c));
|
||||
return (c == underscore)
|
||||
|| (category == U_UPPERCASE_LETTER || category == U_LOWERCASE_LETTER
|
||||
|| category == U_TITLECASE_LETTER || category == U_MODIFIER_LETTER
|
||||
|| category == U_OTHER_LETTER || category == U_LETTER_NUMBER
|
||||
|| category == U_DECIMAL_DIGIT_NUMBER || category == U_NON_SPACING_MARK
|
||||
|| category == U_COMBINING_SPACING_MARK || category == U_CONNECTOR_PUNCTUATION);
|
||||
}
|
||||
|
||||
std::string unicodeCharToStdString(UChar32 c)
|
||||
{
|
||||
icu::UnicodeString uChar(c);
|
||||
std::string utf8Char;
|
||||
return uChar.toUTF8String(utf8Char);
|
||||
}
|
||||
|
||||
}; // namespace
|
||||
|
||||
std::string Base::Tools::getIdentifier(const std::string& name)
|
||||
{
|
||||
if (name.empty()) {
|
||||
return "_";
|
||||
}
|
||||
// check for first character whether it's a digit
|
||||
std::string CleanName = name;
|
||||
if (!CleanName.empty() && CleanName[0] >= 48 && CleanName[0] <= 57) {
|
||||
CleanName[0] = '_';
|
||||
|
||||
icu::UnicodeString uName = icu::UnicodeString::fromUTF8(name);
|
||||
std::stringstream result;
|
||||
|
||||
// Handle the first character independently, prepending an underscore if it is not a valid
|
||||
// first character, but *is* a valid later character
|
||||
UChar32 firstChar = uName.char32At(0);
|
||||
const int32_t firstCharLength = U16_LENGTH(firstChar);
|
||||
if (!isValidFirstChar(firstChar)) {
|
||||
result << "_";
|
||||
if (isValidSubsequentChar(firstChar)) {
|
||||
result << unicodeCharToStdString(firstChar);
|
||||
}
|
||||
}
|
||||
// strip illegal chars
|
||||
for (char& it : CleanName) {
|
||||
if (!((it >= 48 && it <= 57) || // number
|
||||
(it >= 65 && it <= 90) || // uppercase letter
|
||||
(it >= 97 && it <= 122))) { // lowercase letter
|
||||
it = '_'; // it's neither number nor letter
|
||||
else {
|
||||
result << unicodeCharToStdString(firstChar);
|
||||
}
|
||||
|
||||
for (int32_t i = firstCharLength; i < uName.length(); /* will increment by char length */) {
|
||||
UChar32 c = uName.char32At(i);
|
||||
int32_t charLength = U16_LENGTH(c);
|
||||
i += charLength;
|
||||
|
||||
if (isValidSubsequentChar(c)) {
|
||||
result << unicodeCharToStdString(c);
|
||||
}
|
||||
else {
|
||||
result << "_";
|
||||
}
|
||||
}
|
||||
|
||||
return CleanName;
|
||||
return result.str();
|
||||
}
|
||||
|
||||
std::wstring Base::Tools::widen(const std::string& str)
|
||||
|
||||
@@ -346,7 +346,14 @@ public:
|
||||
|
||||
struct BaseExport Tools
|
||||
{
|
||||
static std::string getIdentifier(const std::string&);
|
||||
/**
|
||||
* Given an arbitrary string, ensure that it conforms to Python3 identifier rules, replacing
|
||||
* invalid characters with an underscore. If the first character is invalid, prepends an
|
||||
* underscore to the name. See https://unicode.org/reports/tr31/ for complete naming rules.
|
||||
* @param name String to be checked and sanitized.
|
||||
* @return A std::string that is a valid Python 3 identifier.
|
||||
*/
|
||||
static std::string getIdentifier(const std::string& name);
|
||||
static std::wstring widen(const std::string& str);
|
||||
static std::string narrow(const std::wstring& str);
|
||||
static std::string escapedUnicodeFromUtf8(const char* s);
|
||||
|
||||
@@ -37,4 +37,5 @@ target_link_libraries(Base_tests_run PRIVATE
|
||||
GTest::gmock_main
|
||||
${Google_Tests_LIBS}
|
||||
FreeCADApp
|
||||
ICU::uc ICU::i18n
|
||||
)
|
||||
|
||||
@@ -113,4 +113,47 @@ TEST(BaseToolsSuite, TestEscapeQuotesFromString)
|
||||
EXPECT_EQ(Base::Tools::escapeQuotesFromString("\""), "\\\"");
|
||||
EXPECT_EQ(Base::Tools::escapeQuotesFromString("\\"), "\\");
|
||||
}
|
||||
// Checks Base::Tools::getIdentifier against ASCII input, each Unicode class that is allowed in
// an identifier, and characters that must be replaced by an underscore.
TEST(BaseToolsSuite, TestGetIdentifier)
{
    // ASCII and edge cases
    EXPECT_EQ(Base::Tools::getIdentifier("valid"), "valid");
    EXPECT_EQ(Base::Tools::getIdentifier("_valid"), "_valid");
    EXPECT_EQ(Base::Tools::getIdentifier("1invalid"), "_1invalid");
    EXPECT_EQ(Base::Tools::getIdentifier(""), "_");

    // Unicode letters (valid start and continue)
    EXPECT_EQ(Base::Tools::getIdentifier("πValue"), "πValue");  // Greek lowercase
    EXPECT_EQ(Base::Tools::getIdentifier("Δx"), "Δx");          // Greek uppercase
    EXPECT_EQ(Base::Tools::getIdentifier("Džz"), "Džz");        // Titlecase letter
    EXPECT_EQ(Base::Tools::getIdentifier("ʰindex"), "ʰindex");  // Modifier letter
    EXPECT_EQ(Base::Tools::getIdentifier("名字"), "名字");      // CJK characters (Lo)
    EXPECT_EQ(Base::Tools::getIdentifier("ⅨCount"), "ⅨCount");  // Letter number (Nl)

    // Digits not valid as first char
    EXPECT_EQ(Base::Tools::getIdentifier("٢ndPlace"), "_٢ndPlace");  // Arabic-Indic digit (Nd)

    // Connector punctuation
    EXPECT_EQ(Base::Tools::getIdentifier("valid_name"), "valid_name");
    EXPECT_EQ(Base::Tools::getIdentifier("valid‿name"), "valid‿name");
    EXPECT_EQ(Base::Tools::getIdentifier("valid﹍name"), "valid﹍name");

    // Combining marks (Mn, Mc)
    // 'e' followed by U+0301 COMBINING ACUTE ACCENT (Mn). The bytes are spelled out so the
    // decomposed form survives editors that normalise to the precomposed U+00E9; the literal
    // is split because 'c' would otherwise be read as part of the hex escape.
    EXPECT_EQ(Base::Tools::getIdentifier("e\xCC\x81" "clair"), "e\xCC\x81" "clair");
    EXPECT_EQ(Base::Tools::getIdentifier("devा"), "devा");  // Devanagari vowel sign (Mc)

    // Invalid symbols
    EXPECT_EQ(Base::Tools::getIdentifier("hello!"), "hello_");
    EXPECT_EQ(Base::Tools::getIdentifier("foo-bar"), "foo_bar");
    EXPECT_EQ(Base::Tools::getIdentifier("a🙂b"), "a_b");  // Emoji replaced
    EXPECT_EQ(Base::Tools::getIdentifier("a*b&c"), "a_b_c");

    // Edge: starts with underscore, includes mixed types
    EXPECT_EQ(Base::Tools::getIdentifier("_नमस्ते123"), "_नमस्ते123");

    // Starts with invalid character
    EXPECT_EQ(Base::Tools::getIdentifier("💡idea"), "_idea");

    // Full-width digit U+FF11 (looks like '1'), written as UTF-8 bytes so it cannot be
    // confused with, or silently replaced by, the ASCII digit.
    EXPECT_EQ(Base::Tools::getIdentifier("\xEF\xBC\x91" "start"), "_\xEF\xBC\x91" "start");
}
|
||||
// NOLINTEND(cppcoreguidelines-*,readability-*)
|
||||
|
||||
Reference in New Issue
Block a user