Base: Allow all valid Python 3 identifier characters

Note: Does not check for keywords, only character classes (not a change from the original code).
2025-07-14 09:55:32 -05:00
parent 7b85239093
commit c26e3e9756
8 changed files with 136 additions and 25 deletions
--- a/src/Base/Tools.cpp
+++ b/src/Base/Tools.cpp
@@ -23,6 +23,8 @@

 #include "PreCompiled.h"
 #ifndef _PreComp_
+#include <unicode/unistr.h>
+#include <unicode/uchar.h>
 #include <vector>
 #include <string>
 #include <sstream>
@@ -34,26 +36,77 @@
 #include "Interpreter.h"
 #include "Tools.h"

+namespace
+{
+constexpr auto underscore = static_cast<UChar32>(U'_');
+
+bool isValidFirstChar(UChar32 c)
+{
+    auto category = static_cast<UCharCategory>(u_charType(c));
+
+    return (c == underscore)
+        || (category == U_UPPERCASE_LETTER || category == U_LOWERCASE_LETTER
+            || category == U_TITLECASE_LETTER || category == U_MODIFIER_LETTER
+            || category == U_OTHER_LETTER || category == U_LETTER_NUMBER);
+}
+
+bool isValidSubsequentChar(UChar32 c)
+{
+    auto category = static_cast<UCharCategory>(u_charType(c));
+    return (c == underscore)
+        || (category == U_UPPERCASE_LETTER || category == U_LOWERCASE_LETTER
+            || category == U_TITLECASE_LETTER || category == U_MODIFIER_LETTER
+            || category == U_OTHER_LETTER || category == U_LETTER_NUMBER
+            || category == U_DECIMAL_DIGIT_NUMBER || category == U_NON_SPACING_MARK
+            || category == U_COMBINING_SPACING_MARK || category == U_CONNECTOR_PUNCTUATION);
+}
+
+std::string unicodeCharToStdString(UChar32 c)
+{
+    icu::UnicodeString uChar(c);
+    std::string utf8Char;
+    return uChar.toUTF8String(utf8Char);
+}
+
+};  // namespace
+
 std::string Base::Tools::getIdentifier(const std::string& name)
 {
    if (name.empty()) {
        return "_";
    }
-    // check for first character whether it's a digit
-    std::string CleanName = name;
-    if (!CleanName.empty() && CleanName[0] >= 48 && CleanName[0] <= 57) {
-        CleanName[0] = '_';
+
+    icu::UnicodeString uName = icu::UnicodeString::fromUTF8(name);
+    std::stringstream result;
+
+    // Handle the first character independently, prepending an underscore if it is not a valid
+    // first character, but *is* a valid later character
+    UChar32 firstChar = uName.char32At(0);
+    const int32_t firstCharLength = U16_LENGTH(firstChar);
+    if (!isValidFirstChar(firstChar)) {
+        result << "_";
+        if (isValidSubsequentChar(firstChar)) {
+            result << unicodeCharToStdString(firstChar);
+        }
    }
-    // strip illegal chars
-    for (char& it : CleanName) {
-        if (!((it >= 48 && it <= 57) ||    // number
-              (it >= 65 && it <= 90) ||    // uppercase letter
-              (it >= 97 && it <= 122))) {  // lowercase letter
-            it = '_';                      // it's neither number nor letter
+    else {
+        result << unicodeCharToStdString(firstChar);
+    }
+
+    for (int32_t i = firstCharLength; i < uName.length(); /* will increment by char length */) {
+        UChar32 c = uName.char32At(i);
+        int32_t charLength = U16_LENGTH(c);
+        i += charLength;
+
+        if (isValidSubsequentChar(c)) {
+            result << unicodeCharToStdString(c);
+        }
+        else {
+            result << "_";
        }
    }

-    return CleanName;
+    return result.str();
 }

 std::wstring Base::Tools::widen(const std::string& str)