Unicode: Improved UTF8 detection, less verbose

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6067 212acab6-be3b-0410-9dea-997c60f758d6
This commit is contained in:
wizou 2010-04-20 15:29:55 +00:00
parent c8d77cd501
commit ca54cf728c
4 changed files with 38 additions and 79 deletions

View file

@ -23,7 +23,9 @@ makensis_files = Split("""
ShConstants.cpp ShConstants.cpp
strlist.cpp strlist.cpp
tokens.cpp tokens.cpp
tstring.cpp
util.cpp util.cpp
validateunicode.cpp
winchar.cpp winchar.cpp
writer.cpp writer.cpp
""") """)

View file

@ -75,8 +75,6 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
case CValidateUnicode::UTF_8: case CValidateUnicode::UTF_8:
case CValidateUnicode::UTF_16LE: case CValidateUnicode::UTF_16LE:
case CValidateUnicode::UTF_16BE: case CValidateUnicode::UTF_16BE:
//_ftprintf(g_output, _T("File '%s' has a BOM marked as %s.\n"),
// file, CValidateUnicode::TypeToName(ftype));
break; break;
case CValidateUnicode::UTF_32LE: case CValidateUnicode::UTF_32LE:
case CValidateUnicode::UTF_32BE: case CValidateUnicode::UTF_32BE:
@ -86,15 +84,12 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
break; break;
case CValidateUnicode::UNKNOWN: case CValidateUnicode::UNKNOWN:
// If unknown, let's see if it's not just UTF_8 without a BOM. // If unknown, let's see if it's not just UTF_8 without a BOM.
if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size())) if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()) == 2)
{ {
ftype = CValidateUnicode::UTF_8; // contains UTF-8 characters sequences
_ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file); _ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file);
} ftype = CValidateUnicode::UTF_8;
else }
{
_ftprintf(g_output, _T("File '%s' has no BOM and does not validate as UTF-8.\n"), file);
}
break; break;
default: default:
_ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"), _ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"),
@ -102,7 +97,7 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
exit(-1); exit(-1);
break; break;
} }
} }
} }
} }
@ -112,16 +107,13 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
{ {
case CValidateUnicode::UTF_8: case CValidateUnicode::UTF_8:
strMode.append(_T(", ccs=UTF-8")); strMode.append(_T(", ccs=UTF-8"));
_ftprintf(g_output, _T("Opening '%s' as UTF-8.\n"), file);
break; break;
case CValidateUnicode::UTF_16LE: case CValidateUnicode::UTF_16LE:
strMode.append(_T(", ccs=UTF-16LE")); strMode.append(_T(", ccs=UTF-16LE"));
_ftprintf(g_output, _T("Opening '%s' as UTF-16LE.\n"), file);
break; break;
default: default:
// Looks like fopen() doesn't support other encodings of Unicode. // Looks like fopen() doesn't support other encodings of Unicode.
strMode.append(_T(", ccs=UNICODE")); strMode.append(_T(", ccs=UNICODE"));
_ftprintf(g_output, _T("Opening '%s' as ANSI.\n"), file);
break; break;
} }

View file

@ -18,76 +18,41 @@
#include "validateunicode.h" #include "validateunicode.h"
#include <vector> #include <vector>
// anonymous namespace int CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
namespace
{ {
struct CUTF8BytesToFollow bool hasNonAscii = false;
{
unsigned char m_rShift;
unsigned char m_result;
unsigned char m_bytesToFollow;
};
const CUTF8BytesToFollow g_utf8BytesToFollow[] =
{
/* r-shift, result, length */
{ 7, 0x0, 0},
{ 5, 0x6, 1},
{ 4, 0xe, 2},
{ 3, 0x1e, 3},
{ 2, 0x3e, 4},
{ 1, 0x7e, 5}
};
};
bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
{
bool valid = true;
int bytesToFollow = 0; int bytesToFollow = 0;
while (valid && characters > 0) for ( ; characters != 0 ; --characters)
{ {
// Last character may be 0. unsigned char ch = *buf++;
if (*buf == 0 && characters != 1) if (bytesToFollow != 0) // in the middle of a multi-byte sequence?
{ {
valid = false; if ((ch & 0xC0) != 0x80)
} return 0; // we expected a continuation byte
else hasNonAscii = true;
{ --bytesToFollow;
bytesToFollow = GetBytesToFollow(*buf); }
if (bytesToFollow > 0) else if (ch & 0x80)
{ {
while (bytesToFollow) if ((ch & 0xC0) == 0x80)
{ return 0; // continuation byte outside multi-byte sequence
++buf; else if ((ch & 0xE0) == 0xC0)
--characters; bytesToFollow = 1;
if (*buf >> 6 != 0x2) else if ((ch & 0xF0) == 0xE0)
{ bytesToFollow = 2;
valid = false; else if ((ch & 0xF8) == 0xF0)
} bytesToFollow = 3;
--bytesToFollow; else
} return 0; // byte is invalid UTF-8 (outside RFC 3629)
}
} }
++buf; else if (ch == 0 && characters != 1)
--characters; return 0; // NUL character in the middle of the buffer
} }
if (bytesToFollow != 0)
return valid; return 0; // end of buffer in the middle of a multi-byte sequence
} return hasNonAscii ? 2 : 1;
int CValidateUnicode::GetBytesToFollow(unsigned char ch)
{
int result = -1;
for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i)
{
if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result)
{
result = g_utf8BytesToFollow[i].m_bytesToFollow;
}
}
return result;
} }
bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes) bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)

View file

@ -34,7 +34,7 @@ class CValidateUnicode
}; };
// Make sure that the buffer contains valid UTF-8 encoding. // Make sure that the buffer contains valid UTF-8 encoding.
static bool ValidateUTF8(unsigned char* buf, size_t characters); static int ValidateUTF8(unsigned char* buf, size_t characters);
// Make sure that the buffer contains valid UTF-16LE encoding. // Make sure that the buffer contains valid UTF-16LE encoding.
static bool ValidateUTF16LE(unsigned char* buf, size_t bytes); static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);