|
| 1 | +/******************************************************************************** |
| 2 | + * * |
| 3 | + * This file is part of IfcOpenShell. * |
| 4 | + * * |
| 5 | + * IfcOpenShell is free software: you can redistribute it and/or modify * |
| 6 | + * it under the terms of the Lesser GNU General Public License as published by * |
| 7 | + * the Free Software Foundation, either version 3.0 of the License, or * |
| 8 | + * (at your option) any later version. * |
| 9 | + * * |
| 10 | + * IfcOpenShell is distributed in the hope that it will be useful, * |
| 11 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of * |
| 12 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
| 13 | + * Lesser GNU General Public License for more details. * |
| 14 | + * * |
| 15 | + * You should have received a copy of the Lesser GNU General Public License * |
| 16 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. * |
| 17 | + * * |
| 18 | + ********************************************************************************/ |
| 19 | + |
| 20 | + /******************************************************************************** |
| 21 | + * * |
| 22 | + * Implementation of character decoding as described in ISO 10303-21 table 2 and * |
| 23 | + * table 4 * |
| 24 | + * * |
| 25 | + ********************************************************************************/ |
| 26 | + |
| 27 | +#include <string> |
| 28 | +#include <sstream> |
| 29 | +#include <iomanip> |
| 30 | + |
| 31 | +#include "../ifcparse/IfcCharacterDecoder.h" |
| 32 | +#include "../ifcparse/IfcException.h" |
| 33 | +#include "../ifcparse/IfcFile.h" |
| 34 | + |
// Parser state flags. Every flag is a distinct bit so that several can be
// accumulated in one unsigned parse state while a control directive is read.
#define FIRST_SOLIDUS (1 << 1)
#define PAGE (1 << 2)
#define ALPHABET (1 << 3)
#define SECOND_SOLIDUS (1 << 4)
#define ALPHABET_DEFINITION (1 << 5)
#define APOSTROPHE (1 << 6)
#define ARBITRARY (1 << 7)
#define EXTENDED2 (1 << 8)
#define EXTENDED4 (1 << 9)
// HEX(N) is the flag for the N-th hexadecimal digit, N = 1..8 (bits 10..17).
#define HEX(N) (1 << (9+(N)))
#define THIRD_SOLIDUS (1 << 18)
#define ENDEXTENDED_X (1 << 19)
#define ENDEXTENDED_0 (1 << 20)
#define FOURTH_SOLIDUS (1 << 21)
#define IGNORED_DIRECTIVE (1 << 22)

// Predicates on the parse state S. The argument is parenthesized everywhere so
// that an expression (e.g. `a | b`) can be passed without precedence surprises.
// FIXME: These probably need to be less forgiving in terms of wrongly defined sequences
#define EXPECTS_ALPHABET(S) ((S) & FIRST_SOLIDUS)
#define EXPECTS_PAGE(S) ((S) & FIRST_SOLIDUS)
#define EXPECTS_ARBITRARY(S) ((S) & FIRST_SOLIDUS)
#define EXPECTS_N_OR_F(S) (((S) & FIRST_SOLIDUS) && !((S) & ARBITRARY))
#define EXPECTS_ARBITRARY2(S) (((S) & ARBITRARY) && !((S) & SECOND_SOLIDUS))
#define EXPECTS_ALPHABET_DEFINITION(S) (((S) & FIRST_SOLIDUS) && ((S) & ALPHABET))
#define EXPECTS_SOLIDUS(S) (((S) & ALPHABET_DEFINITION) || ((S) & PAGE) || ((S) & ARBITRARY) || ((S) & EXTENDED2) || ((S) & EXTENDED4) || ((S) & ENDEXTENDED_0) || ((S) & IGNORED_DIRECTIVE) || (((S) & EXTENDED4) && ((S) & HEX(8))) || (((S) & EXTENDED2) && ((S) & HEX(4))))
#define EXPECTS_CHARACTER(S) (((S) & PAGE) && ((S) & SECOND_SOLIDUS))
#define EXPECTS_HEX(S) (((S) & HEX(1)) || ((S) & HEX(3)) || ((S) & HEX(5)) || ((S) & HEX(6)) || ((S) & HEX(7)) || (((S) & ARBITRARY) && ((S) & SECOND_SOLIDUS)) || (((S) & EXTENDED2) && ((S) & HEX(2))) || (((S) & EXTENDED4) && ((S) & HEX(4))))
#define EXPECTS_ENDEXTENDED_X(S) ((S) & THIRD_SOLIDUS)
#define EXPECTS_ENDEXTENDED_0(S) ((S) & ENDEXTENDED_X)

// Character classification helpers. Only '@'..'J' (0x40..0x4A) are accepted as
// alphabet letters and only '0'..'9' / 'A'..'F' as hexadecimal digits.
#define IS_VALID_ALPHABET_DEFINITION(C) ((C) >= 0x40 && (C) <= 0x4A)
#define IS_HEXADECIMAL(C) (((C) >= 0x30 && (C) <= 0x39) || ((C) >= 0x41 && (C) <= 0x46))
// Numeric value of a character for which IS_HEXADECIMAL holds.
#define HEX_TO_INT(C) (((C) >= 0x30 && (C) <= 0x39) ? (C) - 0x30 : ((C) + 10) - 0x41)
// NOTE: the mask ANDs eight distinct single-bit values, which is 0, so ~0 keeps
// every bit and this macro does not actually clear anything. The decoder in
// this file tests `parse_state & HEX(3)` after invoking CLEAR_HEX in order to
// recognise the solidus that closes an extended sequence, so replacing the
// ANDs by ORs would change that behaviour. Deliberately left unchanged.
#define CLEAR_HEX(C) ((C) &= ~(HEX(1)&HEX(2)&HEX(3)&HEX(4)&HEX(5)&HEX(6)&HEX(7)&HEX(8)))
| 68 | + |
| 69 | +using namespace IfcParse; |
| 70 | + |
| 71 | +void IfcCharacterDecoder::addChar(std::stringstream& s,const UChar32& ch) { |
| 72 | + if ( destination ) { |
| 73 | + char* extraction_buffer = new char[4]; |
| 74 | + UnicodeString(ch).extract(extraction_buffer,4,destination,status); |
| 75 | + s << extraction_buffer; |
| 76 | + delete extraction_buffer; |
| 77 | + } else { |
| 78 | + std::stringstream s2; |
| 79 | + s2 << "\\u" << std::hex << std::setw(4) << std::setfill('0') << (int) ch; |
| 80 | + s << s2.str(); |
| 81 | + } |
| 82 | +} |
| 83 | +IfcCharacterDecoder::IfcCharacterDecoder(IfcParse::File* f) { |
| 84 | + file = f; |
| 85 | + if ( ! destination && mode == UTF8 ) { |
| 86 | + destination = ucnv_open("utf-8", &status); |
| 87 | + } else if ( ! destination && mode == LATIN ) { |
| 88 | + destination = ucnv_open("iso-8859-1", &status); |
| 89 | + } |
| 90 | +} |
| 91 | +IfcCharacterDecoder::~IfcCharacterDecoder() { |
| 92 | + if ( destination ) ucnv_close(destination); |
| 93 | + if ( converter ) ucnv_close(converter); |
| 94 | +} |
| 95 | +IfcCharacterDecoder::operator std::string() { |
| 96 | + unsigned int parse_state = 0; |
| 97 | + std::stringstream s; |
| 98 | + s.put('\''); |
| 99 | + char current_char; |
| 100 | + int codepage = 1; |
| 101 | + unsigned int hex = 0; |
| 102 | + unsigned int hex_count = 0; |
| 103 | + while ( current_char = file->Peek() ) { |
| 104 | + if ( EXPECTS_CHARACTER(parse_state) ) { |
| 105 | + if ( previous_codepage != codepage ) { |
| 106 | + if ( converter ) ucnv_close(converter); |
| 107 | + char encoder[11] = {'i','s','o','-','8','8','5','9','-',codepage + 0x30}; |
| 108 | + converter = ucnv_open(encoder, &status); |
| 109 | + } |
| 110 | + const char characters[2] = { current_char + 0x80 }; |
| 111 | + const char* char_array = &characters[0]; |
| 112 | + UChar32 ch = ucnv_getNextUChar(converter,&char_array,char_array+1,&status); |
| 113 | + addChar(s,ch); |
| 114 | + parse_state = 0; |
| 115 | + } else if ( current_char == '\'' && ! parse_state ) { |
| 116 | + parse_state = APOSTROPHE; |
| 117 | + } else if ( current_char == '\\' && ! parse_state ) { |
| 118 | + parse_state = FIRST_SOLIDUS; |
| 119 | + } else if ( current_char == '\\' && EXPECTS_SOLIDUS(parse_state) ) { |
| 120 | + if ( parse_state & ALPHABET_DEFINITION || |
| 121 | + parse_state & IGNORED_DIRECTIVE || |
| 122 | + parse_state & ENDEXTENDED_0 ) parse_state = hex = hex_count = 0; |
| 123 | + else if ( parse_state & HEX(3) ) parse_state += THIRD_SOLIDUS; |
| 124 | + else parse_state += SECOND_SOLIDUS; |
| 125 | + } else if ( current_char == 'X' && EXPECTS_ENDEXTENDED_X(parse_state) ) { |
| 126 | + parse_state += ENDEXTENDED_X; |
| 127 | + } else if ( current_char == '0' && EXPECTS_ENDEXTENDED_0(parse_state) ) { |
| 128 | + parse_state += ENDEXTENDED_0; |
| 129 | + } else if ( current_char == 'X' && EXPECTS_ARBITRARY(parse_state) ) { |
| 130 | + parse_state += ARBITRARY; |
| 131 | + } else if ( current_char == '2' && EXPECTS_ARBITRARY2(parse_state) ) { |
| 132 | + parse_state += EXTENDED2; |
| 133 | + } else if ( current_char == '4' && EXPECTS_ARBITRARY2(parse_state) ) { |
| 134 | + parse_state += EXTENDED2 + EXTENDED4; |
| 135 | + } else if ( current_char == 'P' && EXPECTS_ALPHABET(parse_state) ) { |
| 136 | + parse_state += ALPHABET; |
| 137 | + } else if ( (current_char == 'N' || current_char == 'F') && EXPECTS_N_OR_F(parse_state) ) { |
| 138 | + parse_state += IGNORED_DIRECTIVE; |
| 139 | + } else if ( IS_VALID_ALPHABET_DEFINITION(current_char) && EXPECTS_ALPHABET_DEFINITION(parse_state) ) { |
| 140 | + codepage = current_char - 0x40; |
| 141 | + parse_state += ALPHABET_DEFINITION; |
| 142 | + } else if ( current_char == 'S' && EXPECTS_PAGE(parse_state) ) { |
| 143 | + parse_state += PAGE; |
| 144 | + } else if ( IS_HEXADECIMAL(current_char) && EXPECTS_HEX(parse_state) ) { |
| 145 | + hex <<= 4; |
| 146 | + parse_state += HEX((++hex_count)); |
| 147 | + hex += HEX_TO_INT(current_char); |
| 148 | + if ( (hex_count == 2 && !(parse_state & EXTENDED2)) || |
| 149 | + (hex_count == 4 && !(parse_state & EXTENDED4)) || |
| 150 | + (hex_count == 8) ) { |
| 151 | + addChar(s,(UChar32) hex); |
| 152 | + if ( hex_count == 2 ) parse_state = 0; |
| 153 | + else CLEAR_HEX(parse_state); |
| 154 | + hex = hex_count = 0; |
| 155 | + } |
| 156 | + } else if ( parse_state && !( |
| 157 | + (current_char == '\\' && parse_state == FIRST_SOLIDUS) || |
| 158 | + (current_char == '\'' && parse_state == APOSTROPHE) |
| 159 | + ) ) { |
| 160 | + if ( parse_state == APOSTROPHE && current_char != '\'' ) break; |
| 161 | + throw IfcException("Invalid character encountered"); |
| 162 | + } else { |
| 163 | + parse_state = hex = hex_count = 0; |
| 164 | + s.put(current_char); |
| 165 | + } |
| 166 | + file->Inc(); |
| 167 | + } |
| 168 | + s.put('\''); |
| 169 | + return s.str(); |
| 170 | +} |
| 171 | + |
| 172 | +void IfcCharacterDecoder::dryRun() { |
| 173 | + unsigned int parse_state = 0; |
| 174 | + char current_char; |
| 175 | + unsigned int hex_count = 0; |
| 176 | + while ( current_char = file->Peek() ) { |
| 177 | + if ( EXPECTS_CHARACTER(parse_state) ) { |
| 178 | + parse_state = 0; |
| 179 | + } else if ( current_char == '\'' && ! parse_state ) { |
| 180 | + parse_state = APOSTROPHE; |
| 181 | + } else if ( current_char == '\\' && ! parse_state ) { |
| 182 | + parse_state = FIRST_SOLIDUS; |
| 183 | + } else if ( current_char == '\\' && EXPECTS_SOLIDUS(parse_state) ) { |
| 184 | + if ( parse_state & ALPHABET_DEFINITION || |
| 185 | + parse_state & IGNORED_DIRECTIVE || |
| 186 | + parse_state & ENDEXTENDED_0 ) parse_state = hex_count = 0; |
| 187 | + else if ( parse_state & HEX(3) ) parse_state += THIRD_SOLIDUS; |
| 188 | + else parse_state += SECOND_SOLIDUS; |
| 189 | + } else if ( current_char == 'X' && EXPECTS_ENDEXTENDED_X(parse_state) ) { |
| 190 | + parse_state += ENDEXTENDED_X; |
| 191 | + } else if ( current_char == '0' && EXPECTS_ENDEXTENDED_0(parse_state) ) { |
| 192 | + parse_state += ENDEXTENDED_0; |
| 193 | + } else if ( current_char == 'X' && EXPECTS_ARBITRARY(parse_state) ) { |
| 194 | + parse_state += ARBITRARY; |
| 195 | + } else if ( current_char == '2' && EXPECTS_ARBITRARY2(parse_state) ) { |
| 196 | + parse_state += EXTENDED2; |
| 197 | + } else if ( current_char == '4' && EXPECTS_ARBITRARY2(parse_state) ) { |
| 198 | + parse_state += EXTENDED2 + EXTENDED4; |
| 199 | + } else if ( current_char == 'P' && EXPECTS_ALPHABET(parse_state) ) { |
| 200 | + parse_state += ALPHABET; |
| 201 | + } else if ( (current_char == 'N' || current_char == 'F') && EXPECTS_N_OR_F(parse_state) ) { |
| 202 | + parse_state += IGNORED_DIRECTIVE; |
| 203 | + } else if ( IS_VALID_ALPHABET_DEFINITION(current_char) && EXPECTS_ALPHABET_DEFINITION(parse_state) ) { |
| 204 | + parse_state += ALPHABET_DEFINITION; |
| 205 | + } else if ( current_char == 'S' && EXPECTS_PAGE(parse_state) ) { |
| 206 | + parse_state += PAGE; |
| 207 | + } else if ( IS_HEXADECIMAL(current_char) && EXPECTS_HEX(parse_state) ) { |
| 208 | + parse_state += HEX((++hex_count)); |
| 209 | + if ( (hex_count == 2 && !(parse_state & EXTENDED2)) || |
| 210 | + (hex_count == 4 && !(parse_state & EXTENDED4)) || |
| 211 | + (hex_count == 8) ) { |
| 212 | + if ( hex_count == 2 ) parse_state = 0; |
| 213 | + else CLEAR_HEX(parse_state); |
| 214 | + hex_count = 0; |
| 215 | + } |
| 216 | + } else if ( parse_state && !( |
| 217 | + (current_char == '\\' && parse_state == FIRST_SOLIDUS) || |
| 218 | + (current_char == '\'' && parse_state == APOSTROPHE) |
| 219 | + ) ) { |
| 220 | + if ( parse_state == APOSTROPHE && current_char != '\'' ) break; |
| 221 | + throw IfcException("Invalid character encountered"); |
| 222 | + } else { |
| 223 | + parse_state = hex_count = 0; |
| 224 | + } |
| 225 | + file->Inc(); |
| 226 | + } |
| 227 | +} |
| 228 | + |
| 229 | +UConverter* IfcCharacterDecoder::destination = 0; |
| 230 | +UConverter* IfcCharacterDecoder::converter = 0; |
| 231 | +int IfcCharacterDecoder::previous_codepage = -1; |
| 232 | +UErrorCode IfcCharacterDecoder::status = U_ZERO_ERROR; |
| 233 | +IfcCharacterDecoder::ConversionMode IfcCharacterDecoder::mode = IfcCharacterDecoder::JSON; |
| 234 | + |
0 commit comments