Unicode port: Support for Unicode/UTF8 input files by Jim Park.
git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6066 212acab6-be3b-0410-9dea-997c60f758d6
This commit is contained in:
parent
2d3bf19b0d
commit
c8d77cd501
4 changed files with 429 additions and 1 deletions
235
Source/validateunicode.cpp
Normal file
235
Source/validateunicode.cpp
Normal file
|
@ -0,0 +1,235 @@
|
|||
// validateunicode.cpp
|
||||
//
|
||||
// This file is a part of Unicode NSIS.
|
||||
//
|
||||
// Copyright (C) 2009 - Jim Park
|
||||
//
|
||||
// Licensed under the zlib/libpng license (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
//
|
||||
// This software is provided 'as-is', without any expressed or implied
|
||||
// warranty.
|
||||
//
|
||||
// This class can be used to check a buffer to see if it has the expected
|
||||
// Unicode encoding and look for byte order marks.
|
||||
|
||||
#ifdef _UNICODE
|
||||
|
||||
#include "validateunicode.h"
|
||||
#include <vector>
|
||||
|
||||
// anonymous namespace
|
||||
namespace
|
||||
{
|
||||
struct CUTF8BytesToFollow
|
||||
{
|
||||
unsigned char m_rShift;
|
||||
unsigned char m_result;
|
||||
unsigned char m_bytesToFollow;
|
||||
};
|
||||
|
||||
const CUTF8BytesToFollow g_utf8BytesToFollow[] =
|
||||
{
|
||||
/* r-shift, result, length */
|
||||
{ 7, 0x0, 0},
|
||||
{ 5, 0x6, 1},
|
||||
{ 4, 0xe, 2},
|
||||
{ 3, 0x1e, 3},
|
||||
{ 2, 0x3e, 4},
|
||||
{ 1, 0x7e, 5}
|
||||
};
|
||||
};
|
||||
|
||||
bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
|
||||
{
|
||||
bool valid = true;
|
||||
int bytesToFollow = 0;
|
||||
|
||||
while (valid && characters > 0)
|
||||
{
|
||||
// Last character may be 0.
|
||||
if (*buf == 0 && characters != 1)
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
bytesToFollow = GetBytesToFollow(*buf);
|
||||
if (bytesToFollow > 0)
|
||||
{
|
||||
while (bytesToFollow)
|
||||
{
|
||||
++buf;
|
||||
--characters;
|
||||
if (*buf >> 6 != 0x2)
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
--bytesToFollow;
|
||||
}
|
||||
}
|
||||
}
|
||||
++buf;
|
||||
--characters;
|
||||
}
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
int CValidateUnicode::GetBytesToFollow(unsigned char ch)
|
||||
{
|
||||
int result = -1;
|
||||
for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i)
|
||||
{
|
||||
if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result)
|
||||
{
|
||||
result = g_utf8BytesToFollow[i].m_bytesToFollow;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
|
||||
{
|
||||
// We need to make sure the endianness matches the processor.
|
||||
// Intel x86 is little endian.
|
||||
return ValidateUTF16((unsigned short*)(buf), bytes/2);
|
||||
}
|
||||
|
||||
bool CValidateUnicode::ValidateUTF16BE(unsigned char* buf, size_t bytes)
|
||||
{
|
||||
std::vector<unsigned short> correctedBuf(bytes/2);
|
||||
|
||||
for (size_t i = 0; i < bytes; i += 2)
|
||||
{
|
||||
correctedBuf[i/2] = buf[i] << 8 | buf[i+1];
|
||||
}
|
||||
|
||||
return ValidateUTF16(&correctedBuf[0], correctedBuf.size());
|
||||
}
|
||||
|
||||
bool CValidateUnicode::ValidateUTF16(unsigned short* buf, size_t characters)
|
||||
{
|
||||
unsigned short ch;
|
||||
bool valid = true;
|
||||
|
||||
while (valid && characters > 0)
|
||||
{
|
||||
// Last character may be 0.
|
||||
if ((ch = *buf) == 0 && characters != 1)
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
else if (ch >= 0xd800 && ch <= 0xdbff)
|
||||
{
|
||||
unsigned short trailing = *(++buf);
|
||||
--characters;
|
||||
// Unpaired leading surrogate found?
|
||||
if (trailing < 0xdc00 || trailing > 0xdfff)
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
// Invalid surrogate pairs found?
|
||||
else if ((ch == 0xd83f ||
|
||||
ch == 0xd87f ||
|
||||
ch == 0xd8bf ||
|
||||
ch == 0xd8ff ||
|
||||
ch == 0xd93f ||
|
||||
ch == 0xd97f ||
|
||||
ch == 0xd9bf ||
|
||||
ch == 0xd9ff ||
|
||||
ch == 0xda3f ||
|
||||
ch == 0xdA7f ||
|
||||
ch == 0xdabf ||
|
||||
ch == 0xdaff ||
|
||||
ch == 0xdb3f ||
|
||||
ch == 0xdb7f ||
|
||||
ch == 0xdbbf ||
|
||||
ch == 0xdbff)
|
||||
&&
|
||||
(trailing == 0xdffe || trailing == 0xdfff))
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
// Unpaired trailing surrogate!
|
||||
else if (ch >= 0xdc00 && ch <= 0xdfff)
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
// Invalid values
|
||||
else if (ch == 0xfffe || ch == 0xffff ||
|
||||
(ch >= 0xfdd0 && ch <= 0xfdef))
|
||||
{
|
||||
valid = false;
|
||||
}
|
||||
|
||||
++buf;
|
||||
--characters;
|
||||
}
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
CValidateUnicode::FILE_TYPE CValidateUnicode::CheckBOM(
|
||||
unsigned char* buf,
|
||||
size_t bytes)
|
||||
{
|
||||
FILE_TYPE result = UNKNOWN;
|
||||
|
||||
if (bytes >= 2)
|
||||
{
|
||||
if (buf[0] == 0xff && buf[1] == 0xfe)
|
||||
{
|
||||
result = UTF_16LE;
|
||||
}
|
||||
else if (buf[0] == 0xfe && buf[1] == 0xff)
|
||||
{
|
||||
result = UTF_16BE;
|
||||
}
|
||||
else if (bytes >= 3 &&
|
||||
buf[0] == 0xef &&
|
||||
buf[1] == 0xbb &&
|
||||
buf[2] == 0xbf)
|
||||
{
|
||||
result = UTF_8;
|
||||
}
|
||||
else if (bytes >= 4)
|
||||
{
|
||||
if (buf[0] == 0 &&
|
||||
buf[1] == 0 &&
|
||||
buf[2] == 0xfe &&
|
||||
buf[3] == 0xff)
|
||||
{
|
||||
result = UTF_32BE;
|
||||
}
|
||||
else if (buf[0] == 0xff &&
|
||||
buf[1] == 0xfe &&
|
||||
buf[2] == 0 &&
|
||||
buf[3] == 0)
|
||||
{
|
||||
result = UTF_32LE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
const TCHAR* CValidateUnicode::TypeToName(CValidateUnicode::FILE_TYPE ftype)
|
||||
{
|
||||
static const TCHAR* names[] =
|
||||
{
|
||||
_T("UTF-8"),
|
||||
_T("UTF-16LE"),
|
||||
_T("UTF-16BE"),
|
||||
_T("UTF-32LE"),
|
||||
_T("UTF-32BE"),
|
||||
_T("UNKNOWN")
|
||||
};
|
||||
|
||||
return names[ftype];
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue