// Copyright 2016 Mozilla Foundation. See the COPYRIGHT // file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. #include #include #include #include #include #include #include "encoding_rs_cpp.h" using namespace encoding_rs; const Encoding* get_encoding(const char* label) { const Encoding* enc = Encoding::for_label(gsl::cstring_span<>(label, strlen(label))); if (!enc) { fprintf(stderr, "%s is not a known encoding label; exiting.", label); exit(-2); } return enc; } void print_usage(const char* program) { printf( "Usage: %s [-f INPUT_ENCODING] [-t OUTPUT_ENCODING] [-o OUTFILE] [INFILE] " "[...]\n\n" "Options:\n" " -o, --output PATH\n" " set output file name (- for stdout; the default)\n" " -f, --from-code LABEL\n" " set input encoding (defaults to UTF-8)\n" " -t, --to-code LABEL\n" " set output encoding (defaults to UTF-8)\n" " -u, --utf16-intermediate\n" " use UTF-16 instead of UTF-8 as the intermediate\n" " encoding\n" " -h, --help print usage help\n", program); } #define INPUT_BUFFER_SIZE 2048 #define UTF8_INTERMEDIATE_BUFFER_SIZE 4096 #define UTF16_INTERMEDIATE_BUFFER_SIZE 2048 #define OUTPUT_BUFFER_SIZE 4096 void convert_via_utf8(Decoder& decoder, Encoder& encoder, FILE* read, FILE* write, bool last) { std::array input_buffer; std::array intermediate_buffer; std::array output_buffer; bool current_input_ended = false; while (!current_input_ended) { size_t decoder_input_end = fread(input_buffer.data(), 1, input_buffer.size(), read); if (ferror(read)) { fprintf(stderr, "Error reading input."); exit(-5); } current_input_ended = (decoder_input_end == 0); bool input_ended = last && current_input_ended; size_t decoder_input_start = 0; for (;;) { size_t decoder_read; size_t decoder_written; uint32_t decoder_result; std::tie(decoder_result, decoder_read, decoder_written, std::ignore) = decoder.decode_to_utf8( gsl::span(input_buffer) .subspan(decoder_input_start, decoder_input_end - decoder_input_start), intermediate_buffer, input_ended); decoder_input_start += decoder_read; bool last_output = (input_ended && (decoder_result == INPUT_EMPTY)); // Regardless of whether the intermediate buffer got full // or the input buffer was exhausted, let's process what's // in the intermediate buffer. if (encoder.encoding() == UTF_8_ENCODING) { // If the target is UTF-8, optimize out the encoder. size_t file_written = fwrite(intermediate_buffer.data(), 1, decoder_written, write); if (file_written != decoder_written) { fprintf(stderr, "Error writing output."); exit(-7); } } else { size_t encoder_input_start = 0; for (;;) { size_t encoder_read; size_t encoder_written; uint32_t encoder_result; std::tie(encoder_result, encoder_read, encoder_written, std::ignore) = encoder.encode_from_utf8( std::string_view( reinterpret_cast(intermediate_buffer.data()), intermediate_buffer.size()) .substr(encoder_input_start, decoder_written - encoder_input_start), output_buffer, last_output); encoder_input_start += encoder_read; size_t file_written = fwrite(output_buffer.data(), 1, encoder_written, write); if (file_written != encoder_written) { fprintf(stderr, "Error writing output."); exit(-6); } if (encoder_result == INPUT_EMPTY) { break; } } } // Now let's see if we should read again or process the // rest of the current input buffer. if (decoder_result == INPUT_EMPTY) { break; } } } } void convert_via_utf16(Decoder& decoder, Encoder& encoder, FILE* read, FILE* write, bool last) { std::array input_buffer; std::array intermediate_buffer; std::array output_buffer; bool current_input_ended = false; while (!current_input_ended) { size_t decoder_input_end = fread(input_buffer.data(), 1, input_buffer.size(), read); if (ferror(read)) { fprintf(stderr, "Error reading input."); exit(-5); } current_input_ended = (decoder_input_end == 0); bool input_ended = last && current_input_ended; size_t decoder_input_start = 0; for (;;) { size_t decoder_read; size_t decoder_written; uint32_t decoder_result; std::tie(decoder_result, decoder_read, decoder_written, std::ignore) = decoder.decode_to_utf16( gsl::span(input_buffer) .subspan(decoder_input_start, decoder_input_end - decoder_input_start), intermediate_buffer, input_ended); decoder_input_start += decoder_read; bool last_output = (input_ended && (decoder_result == INPUT_EMPTY)); // Regardless of whether the intermediate buffer got full // or the input buffer was exhausted, let's process what's // in the intermediate buffer. size_t encoder_input_start = 0; for (;;) { size_t encoder_read; size_t encoder_written; uint32_t encoder_result; std::tie(encoder_result, encoder_read, encoder_written, std::ignore) = encoder.encode_from_utf16( std::u16string_view(intermediate_buffer.data(), intermediate_buffer.size()) .substr(encoder_input_start, decoder_written - encoder_input_start), output_buffer, last_output); encoder_input_start += encoder_read; size_t file_written = fwrite(output_buffer.data(), 1, encoder_written, write); if (file_written != encoder_written) { fprintf(stderr, "Error writing output."); exit(-6); } if (encoder_result == INPUT_EMPTY) { break; } } // Now let's see if we should read again or process the // rest of the current input buffer. if (decoder_result == INPUT_EMPTY) { break; } } } } void convert(Decoder& decoder, Encoder& encoder, FILE* read, FILE* write, bool last, bool use_utf16) { if (use_utf16) { convert_via_utf16(decoder, encoder, read, write, last); } else { convert_via_utf8(decoder, encoder, read, write, last); } } int main(int argc, char** argv) { static struct option long_options[] = { { "output", required_argument, NULL, 'o' }, { "from-code", required_argument, NULL, 'f' }, { "to-code", required_argument, NULL, 't' }, { "utf16-intermediate", no_argument, NULL, 'u' }, { "help", no_argument, NULL, 'h' }, { 0, 0, 0, 0 } }; bool use_utf16 = false; const Encoding* input_encoding = UTF_8_ENCODING; const Encoding* output_encoding = UTF_8_ENCODING; FILE* output = stdout; for (;;) { int option_index = 0; int c = getopt_long(argc, argv, "o:f:t:uh", long_options, &option_index); if (c == -1) { break; } if (!c) { // Got a long option c = long_options[option_index].val; } switch (c) { case 'o': output = fopen(optarg, "wb"); if (!output) { fprintf(stderr, "Cannot open %s for writing; exiting.", optarg); exit(-3); } break; case 'f': input_encoding = get_encoding(optarg); break; case 't': output_encoding = get_encoding(optarg); break; case 'u': use_utf16 = true; break; case 'h': print_usage(argv[0]); exit(0); case '?': print_usage(argv[0]); exit(-1); default: break; } } std::unique_ptr decoder = input_encoding->new_decoder(); std::unique_ptr encoder = output_encoding->new_encoder(); if (optind == argc) { convert(*decoder, *encoder, stdin, output, true, use_utf16); } else { while (optind < argc) { const char* path = argv[optind++]; FILE* read = fopen(path, "rb"); if (!read) { fprintf(stderr, "Cannot open %s for reading; exiting.", path); exit(-4); } convert(*decoder, *encoder, read, output, (optind == argc), use_utf16); } } exit(0); }