Base: Allow all valid Python 3 identifier characters

Note: Does not check for keywords, only character classes (not a change from the original code).
2025-07-14 09:55:32 -05:00
parent 7b85239093
commit c26e3e9756
8 changed files with 136 additions and 25 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,7 @@ SetupDoxygen()
 SetupLibFmt()
 SetupYamlCpp()
 SetupZipIos()
+find_package(ICU REQUIRED COMPONENTS uc i18n)
 if(NOT FREECAD_LIBPACK_USE OR FREECAD_LIBPACK_CHECKFILE_CLBUNDLER OR FREECAD_LIBPACK_CHECKFILE_VERSION)
    SetupPython()
    SetupPCL()
--- a/cMake/FreeCAD_Helpers/PrintFinalReport.cmake
+++ b/cMake/FreeCAD_Helpers/PrintFinalReport.cmake
@@ -141,6 +141,7 @@ macro(PrintFinalReport)
    conditional(pybind11 pybind11_FOUND "not enabled" ${pybind11_VERSION})
    simple(Boost ${Boost_VERSION})
    simple(XercesC "${XercesC_VERSION} [${XercesC_LIBRARIES}] [${XercesC_INCLUDE_DIRS}]")
+    simple(ICU "${ICU_VERSION}")
    simple(ZLIB "${ZLIB_VERSION_STRING}")
    simple(OCC "${OCC_VERSION_STRING} [${OCC_LIBRARY_DIR}] [${OCC_INCLUDE_DIR}]")
    simple(OCC_Libs "[${OCC_LIBRARIES}]")
--- a/src/Base/CMakeLists.txt
+++ b/src/Base/CMakeLists.txt
@@ -28,6 +28,7 @@ target_include_directories(
    ${XercesC_INCLUDE_DIRS}
    ${ZLIB_INCLUDE_DIR}
    ${ZIPIOS_INCLUDES}
+    ${ICU_INCLUDE_DIRS}
 )

 if(MSVC)
@@ -35,6 +36,7 @@ if(MSVC)
        ${Boost_LIBRARIES}
        ${XercesC_LIBRARIES}
        ${ZLIB_LIBRARIES}
+        ${ICU_LIBRARIES}
        debug MSVCRTD.LIB
        debug MSVCPRTD.LIB
        optimized MSVCRT.LIB
@@ -59,6 +61,7 @@ elseif(MINGW)
        ${XercesC_LIBRARIES}
        ${Boost_LIBRARIES}
        ${ZLIB_LIBRARIES}
+        ${ICU_LIBRARIES}
        Rpcrt4.lib
    )
 else(MSVC)
@@ -66,6 +69,7 @@ else(MSVC)
        ${XercesC_LIBRARIES}
        ${Boost_LIBRARIES}
        ${ZLIB_LIBRARIES}
+        ${ICU_LIBRARIES}
    )
 endif(MSVC)

--- a/src/Base/PreCompiled.h
+++ b/src/Base/PreCompiled.h
@@ -83,36 +83,33 @@
 #include <sstream>

 // Xerces
-#include <xercesc/util/OutOfMemoryException.hpp>
-#include <xercesc/util/PlatformUtils.hpp>
-#include <xercesc/util/XercesVersion.hpp>
 #include <xercesc/dom/DOM.hpp>
-#include <xercesc/dom/DOMImplementation.hpp>
-#include <xercesc/dom/DOMImplementationLS.hpp>
 #include <xercesc/dom/DOMDocument.hpp>
 #include <xercesc/dom/DOMElement.hpp>
+#include <xercesc/dom/DOMImplementation.hpp>
+#include <xercesc/dom/DOMImplementationLS.hpp>
 #include <xercesc/dom/DOMText.hpp>
-#include <xercesc/framework/StdOutFormatTarget.hpp>
 #include <xercesc/framework/LocalFileFormatTarget.hpp>
 #include <xercesc/framework/LocalFileInputSource.hpp>
 #include <xercesc/framework/MemBufFormatTarget.hpp>
 #include <xercesc/framework/MemBufInputSource.hpp>
 #include <xercesc/framework/MemoryManager.hpp>
+#include <xercesc/framework/StdOutFormatTarget.hpp>
 #include <xercesc/parsers/XercesDOMParser.hpp>
-#include <xercesc/util/XMLUni.hpp>
-#include <xercesc/util/XMLUniDefs.hpp>
-#include <xercesc/util/XMLString.hpp>
-#include <xercesc/util/PlatformUtils.hpp>
-#include <xercesc/sax/SAXParseException.hpp>
 #include <xercesc/sax/EntityResolver.hpp>
 #include <xercesc/sax/ErrorHandler.hpp>
-#include <xercesc/sax/SAXParseException.hpp>
 #include <xercesc/sax/SAXException.hpp>
+#include <xercesc/sax/SAXParseException.hpp>
 #include <xercesc/sax2/Attributes.hpp>
 #include <xercesc/sax2/DefaultHandler.hpp>
-#include <xercesc/sax2/Attributes.hpp>
 #include <xercesc/sax2/SAX2XMLReader.hpp>
 #include <xercesc/sax2/XMLReaderFactory.hpp>
+#include <xercesc/util/OutOfMemoryException.hpp>
+#include <xercesc/util/PlatformUtils.hpp>
+#include <xercesc/util/XMLString.hpp>
+#include <xercesc/util/XMLUni.hpp>
+#include <xercesc/util/XMLUniDefs.hpp>
+#include <xercesc/util/XercesVersion.hpp>

 #include <boost/algorithm/string.hpp>
 #include <boost/algorithm/string/predicate.hpp>
@@ -120,6 +117,10 @@
 #include <boost/regex.hpp>
 #include <boost/tokenizer.hpp>

+// Unicode
+#include <unicode/unistr.h>
+#include <unicode/uchar.h>
+
 // QtCore
 #include <QBuffer>
 #include <QByteArray>
--- a/src/Base/Tools.cpp
+++ b/src/Base/Tools.cpp
@@ -23,6 +23,8 @@

 #include "PreCompiled.h"
 #ifndef _PreComp_
+#include <unicode/unistr.h>
+#include <unicode/uchar.h>
 #include <vector>
 #include <string>
 #include <sstream>
@@ -34,26 +36,77 @@
 #include "Interpreter.h"
 #include "Tools.h"

+namespace
+{
+constexpr auto underscore = static_cast<UChar32>(U'_');
+
+// True if c may begin an identifier: '_' or a letter / letter-number general category.
+bool isValidFirstChar(UChar32 c)
+{
+    const auto category = static_cast<UCharCategory>(u_charType(c));
+    return (c == underscore)
+        || (category == U_UPPERCASE_LETTER || category == U_LOWERCASE_LETTER
+            || category == U_TITLECASE_LETTER || category == U_MODIFIER_LETTER
+            || category == U_OTHER_LETTER || category == U_LETTER_NUMBER);
+}
+
+// True if c may follow the first character: a valid first char, a digit, a mark or a connector.
+bool isValidSubsequentChar(UChar32 c)
+{
+    const auto category = static_cast<UCharCategory>(u_charType(c));
+    return isValidFirstChar(c)
+        || (category == U_DECIMAL_DIGIT_NUMBER || category == U_NON_SPACING_MARK
+            || category == U_COMBINING_SPACING_MARK || category == U_CONNECTOR_PUNCTUATION);
+}
+
+// Encodes the single code point c as a UTF-8 std::string.
+std::string unicodeCharToStdString(UChar32 c)
+{
+    icu::UnicodeString uChar(c);
+    std::string utf8Char;
+    uChar.toUTF8String(utf8Char);
+    return utf8Char;  // named local enables NRVO/move; the reference result would force a copy
+}
+
+}  // namespace
+
 std::string Base::Tools::getIdentifier(const std::string& name)
 {
    if (name.empty()) {
        return "_";
    }
-    // check for first character whether it's a digit
-    std::string CleanName = name;
-    if (!CleanName.empty() && CleanName[0] >= 48 && CleanName[0] <= 57) {
-        CleanName[0] = '_';
+
+    icu::UnicodeString uName = icu::UnicodeString::fromUTF8(name);  // decode the UTF-8 input
+    std::stringstream result;  // accumulates the sanitized identifier as UTF-8
+
+    // The first character is special-cased: if it cannot start an identifier, emit an underscore
+    // and then keep the character itself only when it is still valid in a later position.
+    UChar32 firstChar = uName.char32At(0);
+    const int32_t firstCharLength = U16_LENGTH(firstChar);  // UTF-16 code units it occupies
+    if (!isValidFirstChar(firstChar)) {
+        result << "_";
+        if (isValidSubsequentChar(firstChar)) {
+            result << unicodeCharToStdString(firstChar);
+        }
     }
-    // strip illegal chars
-    for (char& it : CleanName) {
-        if (!((it >= 48 && it <= 57) ||    // number
-              (it >= 65 && it <= 90) ||    // uppercase letter
-              (it >= 97 && it <= 122))) {  // lowercase letter
-            it = '_';                      // it's neither number nor letter
+    else {
+        result << unicodeCharToStdString(firstChar);
+    }
+
+    for (int32_t i = firstCharLength; i < uName.length(); /* i is advanced in the body */) {
+        UChar32 c = uName.char32At(i);
+        int32_t charLength = U16_LENGTH(c);
+        i += charLength;  // step over the whole code point, which may span two code units
+
+        if (isValidSubsequentChar(c)) {
+            result << unicodeCharToStdString(c);
+        }
+        else {
+            result << "_";  // each invalid character is replaced by one underscore
         }
    }

-    return CleanName;
+    return result.str();
 }

 std::wstring Base::Tools::widen(const std::string& str)
--- a/src/Base/Tools.h
+++ b/src/Base/Tools.h
@@ -346,7 +346,14 @@ public:

 struct BaseExport Tools
 {
-    static std::string getIdentifier(const std::string&);
+    /**
+     * Given an arbitrary string, ensure that it conforms to Python 3 identifier character rules,
+     * replacing each invalid character with an underscore. A first character that is only valid
+     * in a later position gets an underscore prepended. See https://unicode.org/reports/tr31/.
+     * @param name String to be checked and sanitized; an empty string yields "_".
+     * @return The sanitized identifier as a std::string (Python keywords are not rejected).
+     */
+    static std::string getIdentifier(const std::string& name);
    static std::wstring widen(const std::string& str);
    static std::string narrow(const std::wstring& str);
    static std::string escapedUnicodeFromUtf8(const char* s);
--- a/tests/src/Base/CMakeLists.txt
+++ b/tests/src/Base/CMakeLists.txt
@@ -37,4 +37,5 @@ target_link_libraries(Base_tests_run PRIVATE
    GTest::gmock_main
    ${Google_Tests_LIBS}
    FreeCADApp
+    ICU::uc ICU::i18n
 )
--- a/tests/src/Base/Tools.cpp
+++ b/tests/src/Base/Tools.cpp
@@ -113,4 +113,47 @@ TEST(BaseToolsSuite, TestEscapeQuotesFromString)
    EXPECT_EQ(Base::Tools::escapeQuotesFromString("\""), "\\\"");
    EXPECT_EQ(Base::Tools::escapeQuotesFromString("\\"), "\\");
 }
+TEST(BaseToolsSuite, TestGetIdentifier)
+{
+    // ASCII input and edge cases: a leading digit gains a "_" prefix, empty input yields "_"
+    EXPECT_EQ(Base::Tools::getIdentifier("valid"), "valid");
+    EXPECT_EQ(Base::Tools::getIdentifier("_valid"), "_valid");
+    EXPECT_EQ(Base::Tools::getIdentifier("1invalid"), "_1invalid");
+    EXPECT_EQ(Base::Tools::getIdentifier(""), "_");
+
+    // Unicode letters and letter-numbers are returned unchanged, also as the first character
+    EXPECT_EQ(Base::Tools::getIdentifier("πValue"), "πValue");  // Greek lowercase
+    EXPECT_EQ(Base::Tools::getIdentifier("Δx"), "Δx");          // Greek uppercase
+    EXPECT_EQ(Base::Tools::getIdentifier("ǅz"), "ǅz");          // Titlecase letter
+    EXPECT_EQ(Base::Tools::getIdentifier("ʰindex"), "ʰindex");  // Modifier letter
+    EXPECT_EQ(Base::Tools::getIdentifier("名字"), "名字");      // CJK characters (Lo)
+    EXPECT_EQ(Base::Tools::getIdentifier("ⅨCount"), "ⅨCount");  // Letter number (Nl)
+
+    // A decimal digit cannot start an identifier but is kept after the prepended underscore
+    EXPECT_EQ(Base::Tools::getIdentifier("٢ndPlace"), "_٢ndPlace");  // Arabic-Indic digit (Nd)
+
+    // Connector punctuation is accepted after the first character
+    EXPECT_EQ(Base::Tools::getIdentifier("valid_name"), "valid_name");
+    EXPECT_EQ(Base::Tools::getIdentifier("valid‿name"), "valid‿name");
+    EXPECT_EQ(Base::Tools::getIdentifier("valid﹍name"), "valid﹍name");
+
+    // Combining marks (Mn, Mc)
+    EXPECT_EQ(Base::Tools::getIdentifier("éclair"), "éclair");  // 'e' + combining acute accent (Mn)
+    EXPECT_EQ(Base::Tools::getIdentifier("devा"), "devा");      // Devanagari vowel sign (Mc)
+
+    // Characters outside the accepted categories are each replaced by an underscore
+    EXPECT_EQ(Base::Tools::getIdentifier("hello!"), "hello_");
+    EXPECT_EQ(Base::Tools::getIdentifier("foo-bar"), "foo_bar");
+    EXPECT_EQ(Base::Tools::getIdentifier("a🙂b"), "a_b");  // Emoji replaced
+    EXPECT_EQ(Base::Tools::getIdentifier("a*b&c"), "a_b_c");
+
+    // Edge: starts with underscore, includes mixed types
+    EXPECT_EQ(Base::Tools::getIdentifier("_नमस्ते123"), "_नमस्ते123");
+
+    // A first character that is invalid in any position is replaced by a single underscore
+    EXPECT_EQ(Base::Tools::getIdentifier("💡idea"), "_idea");
+
+    // Full-width digit (U+FF11, looks like '1')
+    EXPECT_EQ(Base::Tools::getIdentifier("１start"), "_１start");
+}
 // NOLINTEND(cppcoreguidelines-*,readability-*)