// validateunicode.cpp // // This file is a part of Unicode NSIS. // // Copyright (C) 2009 - Jim Park // // Licensed under the zlib/libpng license (the "License"); // you may not use this file except in compliance with the License. // // This software is provided 'as-is', without any expressed or implied // warranty. // // This class can be used to check a buffer to see if it has the expected // Unicode encoding and look for byte order marks. #ifdef _UNICODE #include "validateunicode.h" #include // anonymous namespace namespace { struct CUTF8BytesToFollow { unsigned char m_rShift; unsigned char m_result; unsigned char m_bytesToFollow; }; const CUTF8BytesToFollow g_utf8BytesToFollow[] = { /* r-shift, result, length */ { 7, 0x0, 0}, { 5, 0x6, 1}, { 4, 0xe, 2}, { 3, 0x1e, 3}, { 2, 0x3e, 4}, { 1, 0x7e, 5} }; }; bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters) { bool valid = true; int bytesToFollow = 0; while (valid && characters > 0) { // Last character may be 0. if (*buf == 0 && characters != 1) { valid = false; } else { bytesToFollow = GetBytesToFollow(*buf); if (bytesToFollow > 0) { while (bytesToFollow) { ++buf; --characters; if (*buf >> 6 != 0x2) { valid = false; } --bytesToFollow; } } } ++buf; --characters; } return valid; } int CValidateUnicode::GetBytesToFollow(unsigned char ch) { int result = -1; for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i) { if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result) { result = g_utf8BytesToFollow[i].m_bytesToFollow; } } return result; } bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes) { // We need to make sure the endianness matches the processor. // Intel x86 is little endian. return ValidateUTF16((unsigned short*)(buf), bytes/2); } bool CValidateUnicode::ValidateUTF16BE(unsigned char* buf, size_t bytes) { std::vector correctedBuf(bytes/2); for (size_t i = 0; i < bytes; i += 2) { correctedBuf[i/2] = buf[i] << 8 | buf[i+1]; } return ValidateUTF16(&correctedBuf[0], correctedBuf.size()); } bool CValidateUnicode::ValidateUTF16(unsigned short* buf, size_t characters) { unsigned short ch; bool valid = true; while (valid && characters > 0) { // Last character may be 0. if ((ch = *buf) == 0 && characters != 1) { valid = false; } else if (ch >= 0xd800 && ch <= 0xdbff) { unsigned short trailing = *(++buf); --characters; // Unpaired leading surrogate found? if (trailing < 0xdc00 || trailing > 0xdfff) { valid = false; } // Invalid surrogate pairs found? else if ((ch == 0xd83f || ch == 0xd87f || ch == 0xd8bf || ch == 0xd8ff || ch == 0xd93f || ch == 0xd97f || ch == 0xd9bf || ch == 0xd9ff || ch == 0xda3f || ch == 0xdA7f || ch == 0xdabf || ch == 0xdaff || ch == 0xdb3f || ch == 0xdb7f || ch == 0xdbbf || ch == 0xdbff) && (trailing == 0xdffe || trailing == 0xdfff)) { valid = false; } } // Unpaired trailing surrogate! else if (ch >= 0xdc00 && ch <= 0xdfff) { valid = false; } // Invalid values else if (ch == 0xfffe || ch == 0xffff || (ch >= 0xfdd0 && ch <= 0xfdef)) { valid = false; } ++buf; --characters; } return valid; } CValidateUnicode::FILE_TYPE CValidateUnicode::CheckBOM( unsigned char* buf, size_t bytes) { FILE_TYPE result = UNKNOWN; if (bytes >= 2) { if (buf[0] == 0xff && buf[1] == 0xfe) { result = UTF_16LE; } else if (buf[0] == 0xfe && buf[1] == 0xff) { result = UTF_16BE; } else if (bytes >= 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) { result = UTF_8; } else if (bytes >= 4) { if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xfe && buf[3] == 0xff) { result = UTF_32BE; } else if (buf[0] == 0xff && buf[1] == 0xfe && buf[2] == 0 && buf[3] == 0) { result = UTF_32LE; } } } return result; } const TCHAR* CValidateUnicode::TypeToName(CValidateUnicode::FILE_TYPE ftype) { static const TCHAR* names[] = { _T("UTF-8"), _T("UTF-16LE"), _T("UTF-16BE"), _T("UTF-32LE"), _T("UTF-32BE"), _T("UNKNOWN") }; return names[ftype]; } #endif