diff --git a/CMakeLists.txt b/CMakeLists.txt index abced4aca7..35fb32f9d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,7 @@ SetupDoxygen() SetupLibFmt() SetupYamlCpp() SetupZipIos() +find_package(ICU REQUIRED COMPONENTS uc i18n) if(NOT FREECAD_LIBPACK_USE OR FREECAD_LIBPACK_CHECKFILE_CLBUNDLER OR FREECAD_LIBPACK_CHECKFILE_VERSION) SetupPython() SetupPCL() diff --git a/cMake/FreeCAD_Helpers/PrintFinalReport.cmake b/cMake/FreeCAD_Helpers/PrintFinalReport.cmake index bef4f83ebc..5d50f6b02f 100644 --- a/cMake/FreeCAD_Helpers/PrintFinalReport.cmake +++ b/cMake/FreeCAD_Helpers/PrintFinalReport.cmake @@ -141,6 +141,7 @@ macro(PrintFinalReport) conditional(pybind11 pybind11_FOUND "not enabled" ${pybind11_VERSION}) simple(Boost ${Boost_VERSION}) simple(XercesC "${XercesC_VERSION} [${XercesC_LIBRARIES}] [${XercesC_INCLUDE_DIRS}]") + simple(ICU "${ICU_VERSION}") simple(ZLIB "${ZLIB_VERSION_STRING}") simple(OCC "${OCC_VERSION_STRING} [${OCC_LIBRARY_DIR}] [${OCC_INCLUDE_DIR}]") simple(OCC_Libs "[${OCC_LIBRARIES}]") diff --git a/src/Base/CMakeLists.txt b/src/Base/CMakeLists.txt index 20adc29938..dc6357a450 100644 --- a/src/Base/CMakeLists.txt +++ b/src/Base/CMakeLists.txt @@ -28,6 +28,7 @@ target_include_directories( ${XercesC_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIR} ${ZIPIOS_INCLUDES} + ${ICU_INCLUDE_DIRS} ) if(MSVC) @@ -35,6 +36,7 @@ if(MSVC) ${Boost_LIBRARIES} ${XercesC_LIBRARIES} ${ZLIB_LIBRARIES} + ${ICU_LIBRARIES} debug MSVCRTD.LIB debug MSVCPRTD.LIB optimized MSVCRT.LIB @@ -59,6 +61,7 @@ elseif(MINGW) ${XercesC_LIBRARIES} ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} + ${ICU_LIBRARIES} Rpcrt4.lib ) else(MSVC) @@ -66,6 +69,7 @@ else(MSVC) ${XercesC_LIBRARIES} ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} + ${ICU_LIBRARIES} ) endif(MSVC) diff --git a/src/Base/PreCompiled.h b/src/Base/PreCompiled.h index dd7d0a2182..0705fa6167 100644 --- a/src/Base/PreCompiled.h +++ b/src/Base/PreCompiled.h @@ -83,36 +83,33 @@ #include // Xerces -#include -#include -#include #include -#include 
-#include #include #include +#include +#include #include -#include #include #include #include #include #include +#include #include -#include -#include -#include -#include -#include #include #include -#include #include +#include #include #include -#include #include #include #include +#include +#include +#include +#include +#include +#include #include #include @@ -120,6 +117,10 @@ #include #include +// Unicode +#include +#include + // QtCore #include #include diff --git a/src/Base/Tools.cpp b/src/Base/Tools.cpp index ae8ae8d1b9..7ddf08c094 100644 --- a/src/Base/Tools.cpp +++ b/src/Base/Tools.cpp @@ -23,6 +23,8 @@ #include "PreCompiled.h" #ifndef _PreComp_ +#include +#include #include #include #include @@ -34,26 +36,80 @@ #include "Interpreter.h" #include "Tools.h" +namespace +{ +constexpr auto underscore = static_cast<UChar32>(U'_'); + +// True if c may start an identifier: underscore, any letter category, or a letter number. +bool isValidFirstChar(UChar32 c) +{ + auto category = static_cast<UCharCategory>(u_charType(c)); + + return (c == underscore) + || (category == U_UPPERCASE_LETTER || category == U_LOWERCASE_LETTER + || category == U_TITLECASE_LETTER || category == U_MODIFIER_LETTER + || category == U_OTHER_LETTER || category == U_LETTER_NUMBER); +} + +// True if c may follow the first character: everything accepted as a first character, plus decimal digits, combining marks and connector punctuation. +bool isValidSubsequentChar(UChar32 c) +{ + auto category = static_cast<UCharCategory>(u_charType(c)); + return (c == underscore) + || (category == U_UPPERCASE_LETTER || category == U_LOWERCASE_LETTER + || category == U_TITLECASE_LETTER || category == U_MODIFIER_LETTER + || category == U_OTHER_LETTER || category == U_LETTER_NUMBER + || category == U_DECIMAL_DIGIT_NUMBER || category == U_NON_SPACING_MARK + || category == U_COMBINING_SPACING_MARK || category == U_CONNECTOR_PUNCTUATION); +} + +// Encodes a single code point as a UTF-8 std::string. +std::string unicodeCharToStdString(UChar32 c) +{ + icu::UnicodeString uChar(c); + std::string utf8Char; + return uChar.toUTF8String(utf8Char); +} + +} // namespace + std::string Base::Tools::getIdentifier(const std::string& name) { if (name.empty()) { return "_"; } - // check for first character whether it's a digit - std::string CleanName = name; - if
(!CleanName.empty() && CleanName[0] >= 48 && CleanName[0] <= 57) { - CleanName[0] = '_'; + + icu::UnicodeString uName = icu::UnicodeString::fromUTF8(name); + std::stringstream result; + + // Handle the first character independently: an invalid first character always yields an + // underscore, and the character itself is kept after it only if it *is* a valid later character + UChar32 firstChar = uName.char32At(0); + const int32_t firstCharLength = U16_LENGTH(firstChar); + if (!isValidFirstChar(firstChar)) { + result << "_"; + if (isValidSubsequentChar(firstChar)) { + result << unicodeCharToStdString(firstChar); + } } - // strip illegal chars - for (char& it : CleanName) { - if (!((it >= 48 && it <= 57) || // number - (it >= 65 && it <= 90) || // uppercase letter - (it >= 97 && it <= 122))) { // lowercase letter - it = '_'; // it's neither number nor letter + else { + result << unicodeCharToStdString(firstChar); + } + + for (int32_t i = firstCharLength; i < uName.length(); /* will increment by char length */) { + UChar32 c = uName.char32At(i); + int32_t charLength = U16_LENGTH(c); + i += charLength; + + if (isValidSubsequentChar(c)) { + result << unicodeCharToStdString(c); + } + else { + result << "_"; } } - return CleanName; + return result.str(); } std::wstring Base::Tools::widen(const std::string& str) diff --git a/src/Base/Tools.h b/src/Base/Tools.h index a0ddb2a17e..3042871958 100644 --- a/src/Base/Tools.h +++ b/src/Base/Tools.h @@ -346,7 +346,15 @@ public: struct BaseExport Tools { - static std::string getIdentifier(const std::string&); + /** + * Given an arbitrary string, ensure that it conforms to Python3 identifier rules, replacing + * invalid characters with an underscore. A first character that is only valid in a later + * position (e.g. a digit) is kept and prefixed with an underscore; an empty string yields "_". + * See https://unicode.org/reports/tr31/ for complete naming rules. + * @param name String to be checked and sanitized. + * @return A std::string that is a valid Python 3 identifier.
+ */ + static std::string getIdentifier(const std::string& name); static std::wstring widen(const std::string& str); static std::string narrow(const std::wstring& str); static std::string escapedUnicodeFromUtf8(const char* s); diff --git a/tests/src/Base/CMakeLists.txt b/tests/src/Base/CMakeLists.txt index 0bdc34ba5c..30be828d23 100644 --- a/tests/src/Base/CMakeLists.txt +++ b/tests/src/Base/CMakeLists.txt @@ -37,4 +37,5 @@ target_link_libraries(Base_tests_run PRIVATE GTest::gmock_main ${Google_Tests_LIBS} FreeCADApp + ICU::uc ICU::i18n ) diff --git a/tests/src/Base/Tools.cpp b/tests/src/Base/Tools.cpp index 39fc056fd9..e2dc868496 100644 --- a/tests/src/Base/Tools.cpp +++ b/tests/src/Base/Tools.cpp @@ -113,4 +113,47 @@ TEST(BaseToolsSuite, TestEscapeQuotesFromString) EXPECT_EQ(Base::Tools::escapeQuotesFromString("\""), "\\\""); EXPECT_EQ(Base::Tools::escapeQuotesFromString("\\"), "\\"); } +TEST(BaseToolsSuite, TestGetIdentifier) +{ + // ASCII and edge cases + EXPECT_EQ(Base::Tools::getIdentifier("valid"), "valid"); + EXPECT_EQ(Base::Tools::getIdentifier("_valid"), "_valid"); + EXPECT_EQ(Base::Tools::getIdentifier("1invalid"), "_1invalid"); + EXPECT_EQ(Base::Tools::getIdentifier(""), "_"); + + // Unicode letters (valid start and continue) + EXPECT_EQ(Base::Tools::getIdentifier("πValue"), "πValue"); // Greek lowercase + EXPECT_EQ(Base::Tools::getIdentifier("Δx"), "Δx"); // Greek uppercase + EXPECT_EQ(Base::Tools::getIdentifier("\xC7\x85" "z"), "\xC7\x85" "z"); // Titlecase letter U+01C5, spelled as UTF-8 bytes + EXPECT_EQ(Base::Tools::getIdentifier("ʰindex"), "ʰindex"); // Modifier letter + EXPECT_EQ(Base::Tools::getIdentifier("名字"), "名字"); // CJK characters (Lo) + EXPECT_EQ(Base::Tools::getIdentifier("ⅨCount"), "ⅨCount"); // Letter number (Nl) + + // Digits are not valid as first char: an underscore is prepended and the digit is kept + EXPECT_EQ(Base::Tools::getIdentifier("٢ndPlace"), "_٢ndPlace"); // Arabic-Indic digit (Nd) + + // Connector punctuation + EXPECT_EQ(Base::Tools::getIdentifier("valid_name"), "valid_name"); +
EXPECT_EQ(Base::Tools::getIdentifier("valid‿name"), "valid‿name"); + EXPECT_EQ(Base::Tools::getIdentifier("valid﹍name"), "valid﹍name"); + + // Combining marks (Mn, Mc) + EXPECT_EQ(Base::Tools::getIdentifier("e\xCC\x81" "clair"), "e\xCC\x81" "clair"); // 'e' + combining acute accent U+0301 (Mn), spelled as UTF-8 bytes + EXPECT_EQ(Base::Tools::getIdentifier("devा"), "devा"); // Devanagari vowel sign (Mc) + + // Invalid symbols + EXPECT_EQ(Base::Tools::getIdentifier("hello!"), "hello_"); + EXPECT_EQ(Base::Tools::getIdentifier("foo-bar"), "foo_bar"); + EXPECT_EQ(Base::Tools::getIdentifier("a🙂b"), "a_b"); // Emoji replaced + EXPECT_EQ(Base::Tools::getIdentifier("a*b&c"), "a_b_c"); + + // Edge: starts with underscore, includes mixed types + EXPECT_EQ(Base::Tools::getIdentifier("_नमस्ते123"), "_नमस्ते123"); + + // Starts with a character that is invalid in every position: it is replaced, not kept + EXPECT_EQ(Base::Tools::getIdentifier("💡idea"), "_idea"); + + // Full-width digit (U+FF11, looks like '1'), spelled as UTF-8 bytes so it cannot collapse to ASCII '1' + EXPECT_EQ(Base::Tools::getIdentifier("\xEF\xBC\x91" "start"), "_\xEF\xBC\x91" "start"); +} // NOLINTEND(cppcoreguidelines-*,readability-*)