Fix encoding error for import from older DXF files

The C++ importer incorrectly treated the contents of all TEXT and MTEXT
objects as beind encoded as UTF-8, but this is not true for DXF files
before AutoCAD 2007, where the encoding is "plain ASCII" plus some in-band
\U+dddd encoding. This would cause errors if the text contained non-ASCII
characters such as the Degree Sign.

This change causes the correct encoding to be used.
This commit is contained in:
Kevin Martin
2023-03-26 09:12:51 -04:00
parent f91ad00ed2
commit dbf17376b7
2 changed files with 165 additions and 5 deletions

View File

@@ -18,7 +18,7 @@
#include <Base/FileInfo.h>
#include <Base/Stream.h>
#include <Base/Vector3D.h>
#include <Base/Interpreter.h>
#include "dxf.h"
@@ -1766,11 +1766,16 @@ CDxfRead::CDxfRead(const char* filepath)
}
m_ifs->imbue(std::locale("C"));
m_version = RUnknown;
m_CodePage = NULL;
m_encoding = NULL;
}
CDxfRead::~CDxfRead()
{
delete m_ifs;
delete m_CodePage;
delete m_encoding;
}
double CDxfRead::mm( double value ) const
@@ -2418,8 +2423,12 @@ bool CDxfRead::ReadText()
// that ReadText() and all the other Read... methods return having already read a code 0.
get_line();
DerefACI();
textPrefix.append(m_str);
OnReadText(c, height * 25.4 / 72.0, textPrefix.c_str());
{
const char* utfStr = (this->*stringToUTF8)(m_str);
OnReadText(c, height * 25.4 / 72.0, utfStr);
if (utfStr != m_str)
delete utfStr;
}
return(true);
case 62:
@@ -3271,7 +3280,118 @@ bool CDxfRead::ReadLayer()
return false;
}
void CDxfRead::DoRead(const bool ignore_errors /* = false */ )
bool CDxfRead::ReadVersion()
{
static const std::vector<std::string> VersionNames = {
// This table is indexed by eDXFVersion_t - (ROlder+1)
"AC1006",
"AC1009",
"AC1012",
"AC1014",
"AC1015",
"AC1018",
"AC1021",
"AC1024",
"AC1027",
"AC1032"};
assert(VersionNames.size() == RNewer - ROlder - 1);
get_line();
get_line();
std::vector<std::string>::const_iterator first = VersionNames.cbegin();
std::vector<std::string>::const_iterator last = VersionNames.cend();
std::vector<std::string>::const_iterator found = std::lower_bound(first, last, m_str);
if (found == last)
m_version = RNewer;
else if (*found == m_str)
m_version = (eAutoCADVersion_t)(std::distance(first, found) + (ROlder + 1));
else if (found == first)
m_version = ROlder;
else
m_version = RUnknown;
return ResolveEncoding();
}
bool CDxfRead::ReadDWGCodePage()
{
get_line();
get_line();
assert(m_CodePage == NULL); // If not, we have found two DWGCODEPAGE variables or DoRead was called twice on the same CDxfRead object.
m_CodePage = new std::string(m_str);
return ResolveEncoding();
}
bool CDxfRead::ResolveEncoding()
{
if (m_encoding != NULL) {
delete m_encoding;
m_encoding = NULL;
}
if (m_version >= R2007) { // Note this does not include RUnknown, but does include RLater
m_encoding = new std::string("utf_8");
stringToUTF8 = &CDxfRead::UTF8ToUTF8;
}
else if (m_CodePage == NULL) {
// cp1252
m_encoding = new std::string("cp1252");
stringToUTF8 = &CDxfRead::GeneralToUTF8;
}
else {
// Names may be of the form "ansi_1252" which we map to "cp1252" but we don't map "ansi_x3xxxx" which means "ascii"
if (strncmp(m_CodePage->c_str(), "ansi_", 5) == 0
&& strncmp(m_CodePage->c_str(), "ansi_x3", 7) != 0) {
std::string* p = new std::string(*m_CodePage);
p->replace(0, 5, "cp");
m_encoding = p;
}
else
m_encoding = new std::string(*m_CodePage);
// At this point we want to recognize synonyms for "utf_8" and use the custom decoder function.
// This is because this is one of the common cases and our decoder function is a fast no-op.
// We don't actually use the decoder function we get from PyCodec_Decoder because to call it we have to convert the (char *) text into
// a 'bytes' object first so we can pass it to the function using PyObject_Callxxx(), getting the PYObject containing the
// Python string, which we then decode back to UTF-8. It is simpler to call PyUnicode_DecodeXxxx which takes a (const char *)
// and is just a direct c++ callable.
PyObject* pyDecoder = PyCodec_Decoder(m_encoding->c_str());
if (pyDecoder == NULL)
return false; // A key error exception will have been placed.
PyObject* pyUTF8Decoder = PyCodec_Decoder("utf_8");
assert(pyUTF8Decoder != NULL);
if (pyDecoder == pyUTF8Decoder)
stringToUTF8 = &CDxfRead::UTF8ToUTF8;
else
stringToUTF8 = &CDxfRead::GeneralToUTF8;
Py_DECREF(pyDecoder);
Py_DECREF(pyUTF8Decoder);
}
return m_encoding != NULL;
}
const char* CDxfRead::UTF8ToUTF8(const char* encoded) const
{
return encoded;
}
const char* CDxfRead::GeneralToUTF8(const char* encoded) const
{
PyObject* decoded = PyUnicode_Decode(encoded, strlen(encoded), m_encoding->c_str(), "strict");
if (decoded == NULL)
return NULL;
Py_ssize_t len;
const char* converted = PyUnicode_AsUTF8AndSize(decoded, &len);
char* result = NULL;
if (converted != NULL) {
// converted only has lifetime of decoded so we must save a copy.
result = (char *)malloc(len + 1);
if (result == NULL)
PyErr_SetString(PyExc_MemoryError, "Out of memory");
else
memcpy(result, converted, len + 1);
}
Py_DECREF(decoded);
return result;
}
void CDxfRead::DoRead(const bool ignore_errors /* = false */)
{
m_ignore_errors = ignore_errors;
if(m_fail)
@@ -3298,7 +3418,19 @@ void CDxfRead::DoRead(const bool ignore_errors /* = false */ )
continue;
} // End if - then
else if(!strcmp(m_str, "0"))
if (!strcmp(m_str, "$ACADVER")) {
if (!ReadVersion())
return;
continue;
}// End if - then
if (!strcmp(m_str, "$DWGCODEPAGE")) {
if (!ReadDWGCodePage())
return;
continue;
}// End if - then
if (!strcmp(m_str, "0"))
{
get_line();
if (!strcmp( m_str, "SECTION" )){

View File

@@ -118,6 +118,22 @@ struct LWPolyDataOut
std::vector<double> Bulge;
point3D Extr;
};
typedef enum
{
RUnknown,
ROlder,
R10,
R11_12,
R13,
R14,
R2000,
R2004,
R2007,
R2010,
R2013,
R2018,
RNewer,
} eAutoCADVersion_t;
//********************
class CDxfWrite{
@@ -277,6 +293,9 @@ private:
bool ReadInsert();
bool ReadDimension();
bool ReadBlockInfo();
bool ReadVersion();
bool ReadDWGCodePage();
bool ResolveEncoding();
void get_line();
void put_line(const char *value);
@@ -284,6 +303,15 @@ private:
protected:
Aci_t m_aci; // manifest color name or 256 for layer color
eAutoCADVersion_t m_version;// Version from $ACADVER variable in DXF
const char* (CDxfRead::*stringToUTF8)(const char*) const;
private:
const std::string* m_CodePage; // Code Page name from $DWGCODEPAGE or null if none/not read yet
// The following was going to be python's canonical name for the encoding, but this is (a) not easily found and (b) does not speed up finding the encoding object.
const std::string* m_encoding;// A name for the encoding implied by m_version and m_CodePage
const char* UTF8ToUTF8(const char* encoded) const;
const char* GeneralToUTF8(const char* encoded) const;
public:
ImportExport CDxfRead(const char* filepath); // this opens the file