/******************************************************************************** * * * This file is part of IfcOpenShell. * * * * IfcOpenShell is free software: you can redistribute it and/or modify * * it under the terms of the Lesser GNU General Public License as published by * * the Free Software Foundation, either version 3.0 of the License, or * * (at your option) any later version. * * * * IfcOpenShell is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * Lesser GNU General Public License for more details. * * * * You should have received a copy of the Lesser GNU General Public License * * along with this program. If not, see . * * * ********************************************************************************/ /******************************************************************************** * * * Implementation of character decoding as described in ISO 10303-21 table 2 and * * table 4 * * * ********************************************************************************/ #include #include #include #include "../ifcparse/IfcCharacterDecoder.h" #include "../ifcparse/IfcException.h" #include "../ifcparse/IfcSpfStream.h" #define FIRST_SOLIDUS (1 << 1) #define PAGE (1 << 2) #define ALPHABET (1 << 3) #define SECOND_SOLIDUS (1 << 4) #define ALPHABET_DEFINITION (1 << 5) #define APOSTROPHE (1 << 6) #define ARBITRARY (1 << 7) #define EXTENDED2 (1 << 8) #define EXTENDED4 (1 << 9) #define HEX(N) (1 << (9+N)) #define THIRD_SOLIDUS (1 << 18) #define ENDEXTENDED_X (1 << 19) #define ENDEXTENDED_0 (1 << 20) #define FOURTH_SOLIDUS (1 << 21) #define IGNORED_DIRECTIVE (1 << 22) #define ENCOUNTERED_HEX (1 << 23) // FIXME: These probably need to be less forgiving in terms of wrongly defined sequences #define EXPECTS_ALPHABET(S) (S & FIRST_SOLIDUS) #define EXPECTS_PAGE(S) (S & FIRST_SOLIDUS) #define EXPECTS_ARBITRARY(S) (S & FIRST_SOLIDUS) #define EXPECTS_N_OR_F(S) (S & FIRST_SOLIDUS && !(S & ARBITRARY) ) #define EXPECTS_ARBITRARY2(S) (S & ARBITRARY && !(S & SECOND_SOLIDUS)) #define EXPECTS_ALPHABET_DEFINITION(S) (S & FIRST_SOLIDUS && S & ALPHABET) #define EXPECTS_SOLIDUS(S) (S & ALPHABET_DEFINITION || S & PAGE || S & ARBITRARY || S & EXTENDED2 || S & EXTENDED4 || S & ENDEXTENDED_0 || S & IGNORED_DIRECTIVE || (S & EXTENDED4 && S & HEX(8)) || (S & EXTENDED2 && S & HEX(4))) #define EXPECTS_CHARACTER(S) (S & PAGE && S & SECOND_SOLIDUS) #define EXPECTS_HEX(S) (S & HEX(1) || S & HEX(3) || S & HEX(5) || S & HEX(6) || S & HEX(7) || (S & ARBITRARY && S & SECOND_SOLIDUS) || (S & EXTENDED2 && S & HEX(2)) || (S & EXTENDED4 && S & HEX(4)) ) #define EXPECTS_ENDEXTENDED_X(S) (S & THIRD_SOLIDUS) #define EXPECTS_ENDEXTENDED_0(S) (S & ENDEXTENDED_X) #define IS_VALID_ALPHABET_DEFINITION(C) (C >= 0x40 && C <= 0x4A) #define IS_HEXADECIMAL(C) ((C >= 0x30 && C <= 0x39 ) || (C >= 0x41 && C <= 0x46 )) #define HEX_TO_INT(C) ((C >= 0x30 && C <= 0x39 ) ? C - 0x30 : (C+10) - 0x41) #define CLEAR_HEX(C) (C &= ~(HEX(1)|HEX(2)|HEX(3)|HEX(4)|HEX(5)|HEX(6)|HEX(7)|HEX(8))) using namespace IfcParse; using namespace IfcWrite; #ifdef HAVE_ICU #include #endif void IfcCharacterDecoder::addChar(std::stringstream& s,const UChar32& ch) { #ifdef HAVE_ICU if ( destination ) { /* Note: The extraction buffer is of size 5, because in the UTF-8 encoding the maximum length in bytes is 4. We add 1 for the NUL character. In other encodings the length could be higher, but we have not taken that into account. */ char extraction_buffer[5] = {}; icu::UnicodeString(ch).extract(extraction_buffer,5,destination,status); extraction_buffer[4] = '\0'; s << extraction_buffer; } else { std::stringstream s2; s2 << "\\u" << std::hex << std::setw(4) << std::setfill('0') << (int) ch; s << s2.str(); } #else (void)ch; s.put(substitution_character); #endif } IfcCharacterDecoder::IfcCharacterDecoder(IfcParse::IfcSpfStream* f) { file = f; #ifdef HAVE_ICU if (destination) ucnv_close(destination); if (compatibility_converter) ucnv_close(compatibility_converter); destination = 0; compatibility_converter = 0; if (mode == DEFAULT) { destination = ucnv_open(0, &status); } else if (mode == UTF8) { destination = ucnv_open("utf-8", &status); } else if (mode == LATIN) { destination = ucnv_open("iso-8859-1", &status); } if (compatibility_charset.empty()) { compatibility_charset = ucnv_getDefaultName(); } compatibility_converter = ucnv_open(compatibility_charset.c_str(), &status); #endif } IfcCharacterDecoder::~IfcCharacterDecoder() { #ifdef HAVE_ICU if ( destination ) ucnv_close(destination); if ( converter ) ucnv_close(converter); if ( compatibility_converter ) ucnv_close(compatibility_converter); destination = 0; converter = 0; compatibility_converter = 0; ucnv_flushCache(); #endif } IfcCharacterDecoder::operator std::string() { unsigned int parse_state = 0; std::stringstream s; s.put('\''); char current_char; int codepage = 1; unsigned int hex = 0; unsigned int hex_count = 0; #ifdef HAVE_ICU unsigned int old_hex = 0; // for compatibility_mode #endif while ( (current_char = file->Peek()) != 0 ) { if ( EXPECTS_CHARACTER(parse_state) ) { #ifdef HAVE_ICU if ( previous_codepage != codepage ) { if ( converter ) ucnv_close(converter); char encoder[11] = {'i','s','o','-','8','8','5','9','-', static_cast(codepage + 0x30) }; converter = ucnv_open(encoder, &status); } const char characters[2] = { static_cast(current_char + 0x80) }; const char* char_array = &characters[0]; UChar32 ch = ucnv_getNextUChar(converter,&char_array,char_array+1,&status); addChar(s,ch); #else UChar32 ch = 0; addChar(s,ch); #endif parse_state = 0; } else if ( current_char == '\'' && ! parse_state ) { parse_state = APOSTROPHE; } else if ( current_char == '\\' && ! parse_state ) { parse_state = FIRST_SOLIDUS; } else if ( current_char == '\\' && EXPECTS_SOLIDUS(parse_state) ) { if ( parse_state & ALPHABET_DEFINITION || parse_state & IGNORED_DIRECTIVE || parse_state & ENDEXTENDED_0 ) parse_state = hex = hex_count = 0; else if ( parse_state & ENCOUNTERED_HEX ) { parse_state += THIRD_SOLIDUS; parse_state -= ENCOUNTERED_HEX; } else parse_state += SECOND_SOLIDUS; } else if ( current_char == 'X' && EXPECTS_ENDEXTENDED_X(parse_state) ) { parse_state += ENDEXTENDED_X; } else if ( current_char == '0' && EXPECTS_ENDEXTENDED_0(parse_state) ) { parse_state += ENDEXTENDED_0; } else if ( current_char == 'X' && EXPECTS_ARBITRARY(parse_state) ) { parse_state += ARBITRARY; } else if ( current_char == '2' && EXPECTS_ARBITRARY2(parse_state) ) { parse_state += EXTENDED2; } else if ( current_char == '4' && EXPECTS_ARBITRARY2(parse_state) ) { parse_state += EXTENDED2 + EXTENDED4; } else if ( current_char == 'P' && EXPECTS_ALPHABET(parse_state) ) { parse_state += ALPHABET; } else if ( (current_char == 'N' || current_char == 'F') && EXPECTS_N_OR_F(parse_state) ) { parse_state += IGNORED_DIRECTIVE; } else if ( IS_VALID_ALPHABET_DEFINITION(current_char) && EXPECTS_ALPHABET_DEFINITION(parse_state) ) { codepage = current_char - 0x40; parse_state += ALPHABET_DEFINITION; } else if ( current_char == 'S' && EXPECTS_PAGE(parse_state) ) { parse_state += PAGE; } else if ( IS_HEXADECIMAL(current_char) && EXPECTS_HEX(parse_state) ) { hex <<= 4; parse_state += HEX((++hex_count)); hex += HEX_TO_INT(current_char); if ( (hex_count == 2 && !(parse_state & EXTENDED2)) || (hex_count == 4 && !(parse_state & EXTENDED4)) || (hex_count == 8) ) { #ifdef HAVE_ICU if (compatibility_mode) { if (old_hex == 0) { old_hex = hex; } else { char characters[3] = { (char)old_hex, (char)hex }; const char* char_array = &characters[0]; UChar32 ch = ucnv_getNextUChar(compatibility_converter,&char_array,char_array+2,&status); addChar(s,ch); old_hex = 0; } } else { #endif addChar(s,(UChar32) hex); #ifdef HAVE_ICU } #endif if ( hex_count == 2 ) parse_state = 0; else { CLEAR_HEX(parse_state); parse_state |= ENCOUNTERED_HEX; } hex = hex_count = 0; } } else if ( parse_state && !( (current_char == '\\' && parse_state == FIRST_SOLIDUS) || (current_char == '\'' && parse_state == APOSTROPHE) ) ) { if ( parse_state == APOSTROPHE && current_char != '\'' ) break; throw IfcInvalidTokenException(file->Tell(), current_char); } else { parse_state = hex = hex_count = 0; // NOTE: this is in fact wrong, this ought to be the representation of the character. // In UTF-8 this is the same, but we should not rely on that. s.put(current_char); } file->Inc(); } s.put('\''); return s.str(); } void IfcCharacterDecoder::dryRun() { unsigned int parse_state = 0; char current_char; unsigned int hex_count = 0; while ((current_char = file->Peek()) != 0) { if ( EXPECTS_CHARACTER(parse_state) ) { parse_state = 0; } else if ( current_char == '\'' && ! parse_state ) { parse_state = APOSTROPHE; } else if ( current_char == '\\' && ! parse_state ) { parse_state = FIRST_SOLIDUS; } else if ( current_char == '\\' && EXPECTS_SOLIDUS(parse_state) ) { if ( parse_state & ALPHABET_DEFINITION || parse_state & IGNORED_DIRECTIVE || parse_state & ENDEXTENDED_0 ) parse_state = hex_count = 0; else if ( parse_state & ENCOUNTERED_HEX ) { parse_state += THIRD_SOLIDUS; parse_state -= ENCOUNTERED_HEX; } else parse_state += SECOND_SOLIDUS; } else if ( current_char == 'X' && EXPECTS_ENDEXTENDED_X(parse_state) ) { parse_state += ENDEXTENDED_X; } else if ( current_char == '0' && EXPECTS_ENDEXTENDED_0(parse_state) ) { parse_state += ENDEXTENDED_0; } else if ( current_char == 'X' && EXPECTS_ARBITRARY(parse_state) ) { parse_state += ARBITRARY; } else if ( current_char == '2' && EXPECTS_ARBITRARY2(parse_state) ) { parse_state += EXTENDED2; } else if ( current_char == '4' && EXPECTS_ARBITRARY2(parse_state) ) { parse_state += EXTENDED2 + EXTENDED4; } else if ( current_char == 'P' && EXPECTS_ALPHABET(parse_state) ) { parse_state += ALPHABET; } else if ( (current_char == 'N' || current_char == 'F') && EXPECTS_N_OR_F(parse_state) ) { parse_state += IGNORED_DIRECTIVE; } else if ( IS_VALID_ALPHABET_DEFINITION(current_char) && EXPECTS_ALPHABET_DEFINITION(parse_state) ) { parse_state += ALPHABET_DEFINITION; } else if ( current_char == 'S' && EXPECTS_PAGE(parse_state) ) { parse_state += PAGE; } else if ( IS_HEXADECIMAL(current_char) && EXPECTS_HEX(parse_state) ) { parse_state += HEX((++hex_count)); if ( (hex_count == 2 && !(parse_state & EXTENDED2)) || (hex_count == 4 && !(parse_state & EXTENDED4)) || (hex_count == 8) ) { if ( hex_count == 2 ) parse_state = 0; else { CLEAR_HEX(parse_state); parse_state |= ENCOUNTERED_HEX; } hex_count = 0; } } else if ( parse_state && !( (current_char == '\\' && parse_state == FIRST_SOLIDUS) || (current_char == '\'' && parse_state == APOSTROPHE) ) ) { if ( parse_state == APOSTROPHE && current_char != '\'' ) break; throw IfcInvalidTokenException(file->Tell(), current_char); } else { parse_state = hex_count = 0; } file->Inc(); } } #ifdef HAVE_ICU UConverter* IfcCharacterDecoder::destination = 0; UConverter* IfcCharacterDecoder::converter = 0; UConverter* IfcCharacterDecoder::compatibility_converter = 0; int IfcCharacterDecoder::previous_codepage = -1; UErrorCode IfcCharacterDecoder::status = U_ZERO_ERROR; #endif #ifdef HAVE_ICU IfcCharacterDecoder::ConversionMode IfcCharacterDecoder::mode = IfcCharacterDecoder::UTF8; // Many BIM software (eg. Revit, ArchiCAD, ...) has wrong behavior bool IfcCharacterDecoder::compatibility_mode = false; std::string IfcCharacterDecoder::compatibility_charset = ""; #else char IfcCharacterDecoder::substitution_character = '_'; #endif IfcCharacterEncoder::IfcCharacterEncoder(const std::string& input) { #ifdef HAVE_ICU if ( !converter) converter = ucnv_open("utf-8", &status); #endif str = input; } IfcCharacterEncoder::~IfcCharacterEncoder() { #ifdef HAVE_ICU if ( converter) ucnv_close(converter); converter = 0; #endif } IfcCharacterEncoder::operator std::string() { std::ostringstream oss; oss.put('\''); #ifdef HAVE_ICU // Either 2 or 4 to uses \X2 or \X4 respectively. // Currently hardcoded to 4, but \X2 might be // sufficient for nearly all purposes. const int num_bytes = 4; const std::string num_bytes_str = std::string(1,num_bytes + 0x30); UChar32 ch; const char* source = str.c_str(); const char* limit = source + str.size(); bool in_extended = false; while(source < limit) { ch = ucnv_getNextUChar(converter, &source, limit, &status); const bool within_spf_range = ch >= 0x20 && ch <= 0x7e; if ( in_extended && within_spf_range ) { oss << "\\X0\\"; } else if ( !in_extended && !within_spf_range ) { oss << "\\X" << num_bytes_str << "\\"; } if ( within_spf_range ) { oss.put((char)ch); if ( ch == '\\' || ch == '\'' ) oss.put((char)ch); } else { oss << std::hex << std::setw(num_bytes*2) << std::uppercase << std::setfill('0') << (int) ch; } in_extended = !within_spf_range; } if ( in_extended ) oss << "\\X0\\"; #else for (std::string::const_iterator i = str.begin(); i != str.end(); ++i) { char ch = *i; const bool within_spf_range = ch >= 0x20 && ch <= 0x7e; if (within_spf_range) { if ( ch == '\\' || ch == '\'' ) { oss.put(ch); } oss.put(ch); } else { oss.put('_'); } } #endif oss.put('\''); return oss.str(); } #ifdef HAVE_ICU UErrorCode IfcCharacterEncoder::status = U_ZERO_ERROR; UConverter* IfcCharacterEncoder::converter = 0; #endif