Base: Allow all valid Python 3 identifier characters

Note: Does not check for keywords, only character classes (not a change from the original code).
This commit is contained in:
Chris Hennes
2025-07-14 09:55:32 -05:00
committed by Kacper Donat
parent e41c36a82c
commit 3ecdde8491
8 changed files with 136 additions and 25 deletions

View File

@@ -28,6 +28,7 @@ target_include_directories(
${XercesC_INCLUDE_DIRS}
${ZLIB_INCLUDE_DIR}
${ZIPIOS_INCLUDES}
${ICU_INCLUDE_DIRS}
)
if(MSVC)
@@ -35,6 +36,7 @@ if(MSVC)
${Boost_LIBRARIES}
${XercesC_LIBRARIES}
${ZLIB_LIBRARIES}
${ICU_LIBRARIES}
debug MSVCRTD.LIB
debug MSVCPRTD.LIB
optimized MSVCRT.LIB
@@ -59,6 +61,7 @@ elseif(MINGW)
${XercesC_LIBRARIES}
${Boost_LIBRARIES}
${ZLIB_LIBRARIES}
${ICU_LIBRARIES}
Rpcrt4.lib
)
else(MSVC)
@@ -66,6 +69,7 @@ else(MSVC)
${XercesC_LIBRARIES}
${Boost_LIBRARIES}
${ZLIB_LIBRARIES}
${ICU_LIBRARIES}
)
endif(MSVC)

View File

@@ -83,36 +83,33 @@
#include <sstream>
// Xerces
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XercesVersion.hpp>
#include <xercesc/dom/DOM.hpp>
#include <xercesc/dom/DOMImplementation.hpp>
#include <xercesc/dom/DOMImplementationLS.hpp>
#include <xercesc/dom/DOMDocument.hpp>
#include <xercesc/dom/DOMElement.hpp>
#include <xercesc/dom/DOMImplementation.hpp>
#include <xercesc/dom/DOMImplementationLS.hpp>
#include <xercesc/dom/DOMText.hpp>
#include <xercesc/framework/StdOutFormatTarget.hpp>
#include <xercesc/framework/LocalFileFormatTarget.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/framework/MemBufFormatTarget.hpp>
#include <xercesc/framework/MemBufInputSource.hpp>
#include <xercesc/framework/MemoryManager.hpp>
#include <xercesc/framework/StdOutFormatTarget.hpp>
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/util/XMLUni.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/sax/EntityResolver.hpp>
#include <xercesc/sax/ErrorHandler.hpp>
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/sax/SAXException.hpp>
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/sax2/Attributes.hpp>
#include <xercesc/sax2/DefaultHandler.hpp>
#include <xercesc/sax2/Attributes.hpp>
#include <xercesc/sax2/SAX2XMLReader.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/XMLUni.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/util/XercesVersion.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/predicate.hpp>
@@ -120,6 +117,10 @@
#include <boost/regex.hpp>
#include <boost/tokenizer.hpp>
// Unicode
#include <unicode/unistr.h>
#include <unicode/uchar.h>
// QtCore
#include <QBuffer>
#include <QByteArray>

View File

@@ -23,6 +23,8 @@
#include "PreCompiled.h"
#ifndef _PreComp_
#include <unicode/unistr.h>
#include <unicode/uchar.h>
#include <vector>
#include <string>
#include <sstream>
@@ -34,26 +36,77 @@
#include "Interpreter.h"
#include "Tools.h"
namespace
{

/// The ASCII underscore as a Unicode code point.
constexpr auto underscore = static_cast<UChar32>(U'_');

/**
 * Check whether a code point may begin a Python 3 identifier.
 *
 * Python defines the leading character of an identifier as the underscore or any character with
 * the Unicode XID_Start property. Querying the property directly (instead of enumerating general
 * categories) also covers the Other_ID_Start characters and leaves out the few letters that
 * XID_Start excludes.
 * @param c Code point to classify.
 * @return true if c is an underscore or has the XID_Start property.
 */
bool isValidFirstChar(UChar32 c)
{
    return c == underscore || u_hasBinaryProperty(c, UCHAR_XID_START) != 0;
}

/**
 * Check whether a code point may appear after the first character of a Python 3 identifier.
 *
 * Python defines these as the characters with the Unicode XID_Continue property, which adds
 * digits, combining marks, connector punctuation and the Other_ID_Continue characters to the
 * set of valid leading characters.
 * @param c Code point to classify.
 * @return true if c is an underscore or has the XID_Continue property.
 */
bool isValidSubsequentChar(UChar32 c)
{
    return c == underscore || u_hasBinaryProperty(c, UCHAR_XID_CONTINUE) != 0;
}

/**
 * Encode a single code point as UTF-8.
 * @param c Code point to encode.
 * @return The UTF-8 byte sequence for c.
 */
std::string unicodeCharToStdString(UChar32 c)
{
    const icu::UnicodeString single(c);
    std::string utf8Char;
    // toUTF8String appends to its argument and returns a reference to it; return the local
    // itself so that it can be moved/elided rather than copied through that reference.
    single.toUTF8String(utf8Char);
    return utf8Char;
}

}  // namespace
std::string Base::Tools::getIdentifier(const std::string& name)
{
if (name.empty()) {
return "_";
}
// check for first character whether it's a digit
std::string CleanName = name;
if (!CleanName.empty() && CleanName[0] >= 48 && CleanName[0] <= 57) {
CleanName[0] = '_';
icu::UnicodeString uName = icu::UnicodeString::fromUTF8(name);
std::stringstream result;
// Handle the first character independently, prepending an underscore if it is not a valid
// first character, but *is* a valid later character
UChar32 firstChar = uName.char32At(0);
const int32_t firstCharLength = U16_LENGTH(firstChar);
if (!isValidFirstChar(firstChar)) {
result << "_";
if (isValidSubsequentChar(firstChar)) {
result << unicodeCharToStdString(firstChar);
}
}
// strip illegal chars
for (char& it : CleanName) {
if (!((it >= 48 && it <= 57) || // number
(it >= 65 && it <= 90) || // uppercase letter
(it >= 97 && it <= 122))) { // lowercase letter
it = '_'; // it's neither number nor letter
else {
result << unicodeCharToStdString(firstChar);
}
for (int32_t i = firstCharLength; i < uName.length(); /* will increment by char length */) {
UChar32 c = uName.char32At(i);
int32_t charLength = U16_LENGTH(c);
i += charLength;
if (isValidSubsequentChar(c)) {
result << unicodeCharToStdString(c);
}
else {
result << "_";
}
}
return CleanName;
return result.str();
}
std::wstring Base::Tools::widen(const std::string& str)

View File

@@ -346,7 +346,14 @@ public:
struct BaseExport Tools
{
static std::string getIdentifier(const std::string&);
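/**
* Given an arbitrary UTF-8 string, ensure that it conforms to the Python 3 identifier character
* rules by replacing each invalid character with an underscore. If the first character is not
* valid at the start of an identifier but is valid elsewhere (e.g. a digit), an underscore is
* prepended and the character is kept. An empty string yields "_". Python keywords are not
* checked. See https://unicode.org/reports/tr31/ for complete naming rules.
* @param name String to be checked and sanitized.
* @return A std::string that contains only valid Python 3 identifier characters.
*/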
static std::string getIdentifier(const std::string& name);
static std::wstring widen(const std::string& str);
static std::string narrow(const std::wstring& str);
static std::string escapedUnicodeFromUtf8(const char* s);