#include "Data.h" #include #include // Note that these aren't all possible BOMs. But they are probably the most common ones. // The size is needed at least for the ones with character zero in them. static const QByteArray bom3("\xEF\xBB\xBF", 3); static const QByteArray bom2a("\xFE\xFF", 2); static const QByteArray bom2b("\xFF\xFE", 2); static const QByteArray bom4a("\x00\x00\xFE\xFF", 4); static const QByteArray bom4b("\xFF\xFE\x00\x00", 4); bool isTextOnly(QByteArray data, const QString& encoding, bool quickTest) { // If the data starts with a Unicode BOM, we always assume it is text if(startsWithBom(data)) return true; // Truncate to the first few bytes for quick testing int testSize = quickTest? std::min(512, data.size()) : data.size(); // If the quick test has been requested and we have to truncate the string, we have to use // an approach where truncated multibyte characters are not interpreted as invalid characters. if(quickTest && data.size() > testSize) { // We can assume that the default encoding (UTF-8) and all the ISO-8859 // cannot contain character zero. // This has to be checked explicitly because toUnicode() is using zero as // a terminator for these encodings. if((encoding.isEmpty() || encoding.startsWith("ISO-8859")) && data.contains('\0')) return false; QTextCodec::ConverterState state; QTextCodec *codec = encoding.isEmpty()? QTextCodec::codecForName("UTF-8") : QTextCodec::codecForName(encoding.toUtf8()); const QString text = codec->toUnicode(data.constData(), testSize, &state); return state.invalidChars == 0; } else { // Convert to Unicode if necessary data = decodeString(data, encoding); // Perform check return QString(data).toUtf8() == data; } } bool containsRightToLeft(const QString& text) { for(QChar ch : text) { switch(ch.direction()) { case QChar::DirR: case QChar::DirAL: case QChar::DirRLE: case QChar::DirRLO: case QChar::DirRLI: return true; } } return false; } bool startsWithBom(const QByteArray& data) { if(data.startsWith(bom3) || data.startsWith(bom2a) || data.startsWith(bom2b) || data.startsWith(bom4a) || data.startsWith(bom4b)) return true; else return false; } QByteArray removeBom(QByteArray& data) { if(data.startsWith(bom3)) { QByteArray bom = data.left(3); data.remove(0, 3); return bom; } else if(data.startsWith(bom2a) || data.startsWith(bom2b)) { QByteArray bom = data.left(2); data.remove(0, 2); return bom; } else if(data.startsWith(bom4a) || data.startsWith(bom4b)) { QByteArray bom = data.left(4); data.remove(0, 4); return bom; } else { return QByteArray(); } } QStringList toStringList(const QList& list) { QStringList strings; for (const QByteArray &item : list) { strings.append(QString::fromUtf8(item)); } return strings; } QByteArray encodeString(const QByteArray& str, const QString& encoding) { if(encoding.isEmpty()) return str; else return QTextCodec::codecForName(encoding.toUtf8())->fromUnicode(str); } QByteArray decodeString(const QByteArray& str, const QString& encoding) { if(encoding.isEmpty()) return str; else return QTextCodec::codecForName(encoding.toUtf8())->toUnicode(str).toUtf8(); }