diff --git a/Source/SConscript b/Source/SConscript index 16d8f4f9..ae0f02f0 100644 --- a/Source/SConscript +++ b/Source/SConscript @@ -23,7 +23,9 @@ makensis_files = Split(""" ShConstants.cpp strlist.cpp tokens.cpp + tstring.cpp util.cpp + validateunicode.cpp winchar.cpp writer.cpp """) diff --git a/Source/tstring.cpp b/Source/tstring.cpp index 9be781ee..7fcbd228 100644 --- a/Source/tstring.cpp +++ b/Source/tstring.cpp @@ -75,8 +75,6 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode) case CValidateUnicode::UTF_8: case CValidateUnicode::UTF_16LE: case CValidateUnicode::UTF_16BE: - //_ftprintf(g_output, _T("File '%s' has a BOM marked as %s.\n"), - // file, CValidateUnicode::TypeToName(ftype)); break; case CValidateUnicode::UTF_32LE: case CValidateUnicode::UTF_32BE: @@ -86,15 +84,12 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode) break; case CValidateUnicode::UNKNOWN: // If unknown, let's see if it's not just UTF_8 without a BOM. - if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size())) + if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()) == 2) { - ftype = CValidateUnicode::UTF_8; + // contains UTF-8 characters sequences _ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file); - } - else - { - _ftprintf(g_output, _T("File '%s' has no BOM and does not validate as UTF-8.\n"), file); - } + ftype = CValidateUnicode::UTF_8; + } break; default: _ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"), @@ -102,7 +97,7 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode) exit(-1); break; } - } + } } } @@ -112,16 +107,13 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode) { case CValidateUnicode::UTF_8: strMode.append(_T(", ccs=UTF-8")); - _ftprintf(g_output, _T("Opening '%s' as UTF-8.\n"), file); break; case CValidateUnicode::UTF_16LE: strMode.append(_T(", ccs=UTF-16LE")); - _ftprintf(g_output, _T("Opening '%s' as UTF-16LE.\n"), file); break; default: // Looks like fopen() doesn't support other encodings of Unicode. strMode.append(_T(", ccs=UNICODE")); - _ftprintf(g_output, _T("Opening '%s' as ANSI.\n"), file); break; } diff --git a/Source/validateunicode.cpp b/Source/validateunicode.cpp index bc6edc78..571d69f5 100644 --- a/Source/validateunicode.cpp +++ b/Source/validateunicode.cpp @@ -18,76 +18,41 @@ #include "validateunicode.h" #include -// anonymous namespace -namespace +int CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters) { - struct CUTF8BytesToFollow - { - unsigned char m_rShift; - unsigned char m_result; - unsigned char m_bytesToFollow; - }; - - const CUTF8BytesToFollow g_utf8BytesToFollow[] = - { - /* r-shift, result, length */ - { 7, 0x0, 0}, - { 5, 0x6, 1}, - { 4, 0xe, 2}, - { 3, 0x1e, 3}, - { 2, 0x3e, 4}, - { 1, 0x7e, 5} - }; -}; - -bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters) -{ - bool valid = true; + bool hasNonAscii = false; int bytesToFollow = 0; - while (valid && characters > 0) + for ( ; characters != 0 ; --characters) { - // Last character may be 0. - if (*buf == 0 && characters != 1) - { - valid = false; - } - else - { - bytesToFollow = GetBytesToFollow(*buf); - if (bytesToFollow > 0) - { - while (bytesToFollow) - { - ++buf; - --characters; - if (*buf >> 6 != 0x2) - { - valid = false; - } - --bytesToFollow; - } - } - } - ++buf; - --characters; + unsigned char ch = *buf++; + if (bytesToFollow != 0) // in the middle of a multi-byte sequence? + { + if ((ch & 0xC0) != 0x80) + return 0; // we expected a continuation byte + hasNonAscii = true; + --bytesToFollow; + } + else if (ch & 0x80) + { + if ((ch & 0xC0) == 0x80) + return 0; // continuation byte outside multi-byte sequence + else if ((ch & 0xE0) == 0xC0) + bytesToFollow = 1; + else if ((ch & 0xF0) == 0xE0) + bytesToFollow = 2; + else if ((ch & 0xF8) == 0xF0) + bytesToFollow = 3; + else + return 0; // byte is invalid UTF-8 (outside RFC 3629) + + } + else if (ch == 0 && characters != 1) + return 0; // NUL character in the middle of the buffer } - - return valid; -} - -int CValidateUnicode::GetBytesToFollow(unsigned char ch) -{ - int result = -1; - for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i) - { - if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result) - { - result = g_utf8BytesToFollow[i].m_bytesToFollow; - } - } - - return result; + if (bytesToFollow != 0) + return 0; // end of buffer in the middle of a multi-byte sequence + return hasNonAscii ? 2 : 1; } bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes) diff --git a/Source/validateunicode.h b/Source/validateunicode.h index e7de80bd..8fb652ff 100644 --- a/Source/validateunicode.h +++ b/Source/validateunicode.h @@ -34,7 +34,7 @@ class CValidateUnicode }; // Make sure that the buffer contains valid UTF-8 encoding. - static bool ValidateUTF8(unsigned char* buf, size_t characters); + static int ValidateUTF8(unsigned char* buf, size_t characters); // Make sure that the buffer contains valid UTF-16LE encoding. static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);