Skip to content

Commit dc4b10e

Browse files
author
Jang Myeongho
committed
Added IfcCharacterDecoder::compatibility_mode
1 parent 1aa3a52 commit dc4b10e

2 files changed

Lines changed: 41 additions & 9 deletions

File tree

src/ifcparse/IfcCharacterDecoder.cpp

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ void IfcCharacterDecoder::addChar(std::stringstream& s,const UChar32& ch) {
8787
s.put(substitution_character);
8888
#endif
8989
}
90-
#include <iostream>
9190
IfcCharacterDecoder::IfcCharacterDecoder(IfcParse::File* f) {
9291
file = f;
9392
#ifdef HAVE_ICU
@@ -103,6 +102,10 @@ IfcCharacterDecoder::IfcCharacterDecoder(IfcParse::File* f) {
103102
} else if (mode == LATIN) {
104103
destination = ucnv_open("iso-8859-1", &status);
105104
}
105+
if (compatibility_charset.empty()) {
106+
compatibility_charset = ucnv_getDefaultName();
107+
}
108+
compatibility_converter = ucnv_open(compatibility_charset.c_str(), &status);
106109
#endif
107110
}
108111
IfcCharacterDecoder::~IfcCharacterDecoder() {
@@ -121,6 +124,8 @@ IfcCharacterDecoder::operator std::string() {
121124
int codepage = 1;
122125
unsigned int hex = 0;
123126
unsigned int hex_count = 0;
127+
unsigned int old_hex = 0; // for compatibility_mode
128+
124129
while ( current_char = file->Peek() ) {
125130
if ( EXPECTS_CHARACTER(parse_state) ) {
126131
#ifdef HAVE_ICU
@@ -171,14 +176,27 @@ IfcCharacterDecoder::operator std::string() {
171176
hex <<= 4;
172177
parse_state += HEX((++hex_count));
173178
hex += HEX_TO_INT(current_char);
174-
if ( (hex_count == 2 && !(parse_state & EXTENDED2)) ||
175-
(hex_count == 4 && !(parse_state & EXTENDED4)) ||
176-
(hex_count == 8) ) {
177-
addChar(s,(UChar32) hex);
178-
if ( hex_count == 2 ) parse_state = 0;
179-
else CLEAR_HEX(parse_state);
180-
hex = hex_count = 0;
181-
}
179+
if ( (hex_count == 2 && !(parse_state & EXTENDED2)) ||
180+
(hex_count == 4 && !(parse_state & EXTENDED4)) ||
181+
(hex_count == 8) ) {
182+
if (compatibility_mode) {
183+
if (old_hex == 0) {
184+
old_hex = hex;
185+
} else {
186+
char characters[3] = { old_hex, hex };
187+
const char* char_array = &characters[0];
188+
UChar32 ch = ucnv_getNextUChar(compatibility_converter,&char_array,char_array+2,&status);
189+
addChar(s,ch);
190+
old_hex = 0;
191+
}
192+
}
193+
else {
194+
addChar(s,(UChar32) hex);
195+
}
196+
if ( hex_count == 2 ) parse_state = 0;
197+
else CLEAR_HEX(parse_state);
198+
hex = hex_count = 0;
199+
}
182200
} else if ( parse_state && !(
183201
(current_char == '\\' && parse_state == FIRST_SOLIDUS) ||
184202
(current_char == '\'' && parse_state == APOSTROPHE)
@@ -256,12 +274,18 @@ void IfcCharacterDecoder::dryRun() {
256274
#ifdef HAVE_ICU
257275
UConverter* IfcCharacterDecoder::destination = 0;
258276
UConverter* IfcCharacterDecoder::converter = 0;
277+
UConverter* IfcCharacterDecoder::compatibility_converter = 0;
259278
int IfcCharacterDecoder::previous_codepage = -1;
260279
UErrorCode IfcCharacterDecoder::status = U_ZERO_ERROR;
261280
#endif
262281

263282
//#ifdef HAVE_ICU
264283
IfcCharacterDecoder::ConversionMode IfcCharacterDecoder::mode = IfcCharacterDecoder::JSON;
284+
285+
// Many BIM applications (e.g. Revit, ArchiCAD, ...) encode extended characters incorrectly.
286+
bool IfcCharacterDecoder::compatibility_mode = false;
287+
std::string IfcCharacterDecoder::compatibility_charset = "";
288+
265289
//#else
266290
char IfcCharacterDecoder::substitution_character = '_';
267291
//#endif

src/ifcparse/IfcCharacterDecoder.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ namespace IfcParse {
4646
#ifdef HAVE_ICU
4747
static UConverter* destination;
4848
static UConverter* converter;
49+
static UConverter* compatibility_converter;
4950
static int previous_codepage;
5051
static UErrorCode status;
5152
#endif
@@ -54,6 +55,13 @@ namespace IfcParse {
5455
//#ifdef HAVE_ICU
5556
enum ConversionMode {DEFAULT,UTF8,LATIN,JSON,PYTHON};
5657
static ConversionMode mode;
58+
59+
// Many BIM applications (e.g. Revit, ArchiCAD, ...) encode characters incorrectly:
60+
// they write extended strings in the system default code page instead of Unicode.
61+
// Set this to true to decode such strings.
62+
static bool compatibility_mode;
63+
static std::string compatibility_charset;
64+
5765
//#else
5866
static char substitution_character;
5967
//#endif

0 commit comments

Comments
 (0)