Unicode: Improved UTF8 detection, less verbose
git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6067 212acab6-be3b-0410-9dea-997c60f758d6
This commit is contained in:
parent
c8d77cd501
commit
ca54cf728c
4 changed files with 38 additions and 79 deletions
|
@ -23,7 +23,9 @@ makensis_files = Split("""
|
||||||
ShConstants.cpp
|
ShConstants.cpp
|
||||||
strlist.cpp
|
strlist.cpp
|
||||||
tokens.cpp
|
tokens.cpp
|
||||||
|
tstring.cpp
|
||||||
util.cpp
|
util.cpp
|
||||||
|
validateunicode.cpp
|
||||||
winchar.cpp
|
winchar.cpp
|
||||||
writer.cpp
|
writer.cpp
|
||||||
""")
|
""")
|
||||||
|
|
|
@ -75,8 +75,6 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
|
||||||
case CValidateUnicode::UTF_8:
|
case CValidateUnicode::UTF_8:
|
||||||
case CValidateUnicode::UTF_16LE:
|
case CValidateUnicode::UTF_16LE:
|
||||||
case CValidateUnicode::UTF_16BE:
|
case CValidateUnicode::UTF_16BE:
|
||||||
//_ftprintf(g_output, _T("File '%s' has a BOM marked as %s.\n"),
|
|
||||||
// file, CValidateUnicode::TypeToName(ftype));
|
|
||||||
break;
|
break;
|
||||||
case CValidateUnicode::UTF_32LE:
|
case CValidateUnicode::UTF_32LE:
|
||||||
case CValidateUnicode::UTF_32BE:
|
case CValidateUnicode::UTF_32BE:
|
||||||
|
@ -86,15 +84,12 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
|
||||||
break;
|
break;
|
||||||
case CValidateUnicode::UNKNOWN:
|
case CValidateUnicode::UNKNOWN:
|
||||||
// If unknown, let's see if it's not just UTF_8 without a BOM.
|
// If unknown, let's see if it's not just UTF_8 without a BOM.
|
||||||
if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()))
|
if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()) == 2)
|
||||||
{
|
{
|
||||||
ftype = CValidateUnicode::UTF_8;
|
// contains UTF-8 characters sequences
|
||||||
_ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file);
|
_ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file);
|
||||||
}
|
ftype = CValidateUnicode::UTF_8;
|
||||||
else
|
}
|
||||||
{
|
|
||||||
_ftprintf(g_output, _T("File '%s' has no BOM and does not validate as UTF-8.\n"), file);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
_ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"),
|
_ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"),
|
||||||
|
@ -102,7 +97,7 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
|
||||||
exit(-1);
|
exit(-1);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -112,16 +107,13 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
|
||||||
{
|
{
|
||||||
case CValidateUnicode::UTF_8:
|
case CValidateUnicode::UTF_8:
|
||||||
strMode.append(_T(", ccs=UTF-8"));
|
strMode.append(_T(", ccs=UTF-8"));
|
||||||
_ftprintf(g_output, _T("Opening '%s' as UTF-8.\n"), file);
|
|
||||||
break;
|
break;
|
||||||
case CValidateUnicode::UTF_16LE:
|
case CValidateUnicode::UTF_16LE:
|
||||||
strMode.append(_T(", ccs=UTF-16LE"));
|
strMode.append(_T(", ccs=UTF-16LE"));
|
||||||
_ftprintf(g_output, _T("Opening '%s' as UTF-16LE.\n"), file);
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
// Looks like fopen() doesn't support other encodings of Unicode.
|
// Looks like fopen() doesn't support other encodings of Unicode.
|
||||||
strMode.append(_T(", ccs=UNICODE"));
|
strMode.append(_T(", ccs=UNICODE"));
|
||||||
_ftprintf(g_output, _T("Opening '%s' as ANSI.\n"), file);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,76 +18,41 @@
|
||||||
#include "validateunicode.h"
|
#include "validateunicode.h"
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// anonymous namespace
|
int CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
|
||||||
namespace
|
|
||||||
{
|
{
|
||||||
struct CUTF8BytesToFollow
|
bool hasNonAscii = false;
|
||||||
{
|
|
||||||
unsigned char m_rShift;
|
|
||||||
unsigned char m_result;
|
|
||||||
unsigned char m_bytesToFollow;
|
|
||||||
};
|
|
||||||
|
|
||||||
const CUTF8BytesToFollow g_utf8BytesToFollow[] =
|
|
||||||
{
|
|
||||||
/* r-shift, result, length */
|
|
||||||
{ 7, 0x0, 0},
|
|
||||||
{ 5, 0x6, 1},
|
|
||||||
{ 4, 0xe, 2},
|
|
||||||
{ 3, 0x1e, 3},
|
|
||||||
{ 2, 0x3e, 4},
|
|
||||||
{ 1, 0x7e, 5}
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
|
|
||||||
{
|
|
||||||
bool valid = true;
|
|
||||||
int bytesToFollow = 0;
|
int bytesToFollow = 0;
|
||||||
|
|
||||||
while (valid && characters > 0)
|
for ( ; characters != 0 ; --characters)
|
||||||
{
|
{
|
||||||
// Last character may be 0.
|
unsigned char ch = *buf++;
|
||||||
if (*buf == 0 && characters != 1)
|
if (bytesToFollow != 0) // in the middle of a multi-byte sequence?
|
||||||
{
|
{
|
||||||
valid = false;
|
if ((ch & 0xC0) != 0x80)
|
||||||
}
|
return 0; // we expected a continuation byte
|
||||||
else
|
hasNonAscii = true;
|
||||||
{
|
--bytesToFollow;
|
||||||
bytesToFollow = GetBytesToFollow(*buf);
|
}
|
||||||
if (bytesToFollow > 0)
|
else if (ch & 0x80)
|
||||||
{
|
{
|
||||||
while (bytesToFollow)
|
if ((ch & 0xC0) == 0x80)
|
||||||
{
|
return 0; // continuation byte outside multi-byte sequence
|
||||||
++buf;
|
else if ((ch & 0xE0) == 0xC0)
|
||||||
--characters;
|
bytesToFollow = 1;
|
||||||
if (*buf >> 6 != 0x2)
|
else if ((ch & 0xF0) == 0xE0)
|
||||||
{
|
bytesToFollow = 2;
|
||||||
valid = false;
|
else if ((ch & 0xF8) == 0xF0)
|
||||||
}
|
bytesToFollow = 3;
|
||||||
--bytesToFollow;
|
else
|
||||||
}
|
return 0; // byte is invalid UTF-8 (outside RFC 3629)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
++buf;
|
else if (ch == 0 && characters != 1)
|
||||||
--characters;
|
return 0; // NUL character in the middle of the buffer
|
||||||
}
|
}
|
||||||
|
if (bytesToFollow != 0)
|
||||||
return valid;
|
return 0; // end of buffer in the middle of a multi-byte sequence
|
||||||
}
|
return hasNonAscii ? 2 : 1;
|
||||||
|
|
||||||
int CValidateUnicode::GetBytesToFollow(unsigned char ch)
|
|
||||||
{
|
|
||||||
int result = -1;
|
|
||||||
for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i)
|
|
||||||
{
|
|
||||||
if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result)
|
|
||||||
{
|
|
||||||
result = g_utf8BytesToFollow[i].m_bytesToFollow;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
|
bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
|
||||||
|
|
|
@ -34,7 +34,7 @@ class CValidateUnicode
|
||||||
};
|
};
|
||||||
|
|
||||||
// Make sure that the buffer contains valid UTF-8 encoding.
|
// Make sure that the buffer contains valid UTF-8 encoding.
|
||||||
static bool ValidateUTF8(unsigned char* buf, size_t characters);
|
static int ValidateUTF8(unsigned char* buf, size_t characters);
|
||||||
|
|
||||||
// Make sure that the buffer contains valid UTF-16LE encoding.
|
// Make sure that the buffer contains valid UTF-16LE encoding.
|
||||||
static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);
|
static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue