Unicode port: Support for Unicode/UTF8 input files by Jim Park.

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6066 212acab6-be3b-0410-9dea-997c60f758d6
This commit is contained in:
wizou 2010-04-20 09:04:26 +00:00
parent 2d3bf19b0d
commit c8d77cd501
4 changed files with 429 additions and 1 deletions

131
Source/tstring.cpp Normal file
View file

@ -0,0 +1,131 @@
// tstring.cpp
//
// This file is a part of Unicode NSIS.
//
// Copyright (C) 2007-2009 Jim Park
//
// Licensed under the zlib/libpng license (the "License");
// you may not use this file except in compliance with the License.
//
// This software is provided 'as-is', without any expressed or implied
// warranty.
//
// Provides TSTRING support.
#ifdef _UNICODE
#include "tstring.h"
#include "validateunicode.h"
#include <vector>
// Minimal owning wrapper around a C stdio FILE*: the stream handed to the
// constructor is closed when the wrapper goes out of scope.
class ScopedFile
{
public:
  // Takes ownership of 'file'. NULL is allowed (e.g. a failed fopen()) and
  // yields a wrapper that tests false and closes nothing.
  // explicit: a raw FILE* must never silently become an owning wrapper.
  explicit ScopedFile(FILE* file) : m_file(file) {}

  // Closes the stream if one is held. fclose() flushes buffered output
  // itself, so no separate fflush() is needed.
  ~ScopedFile()
  {
    if (m_file != NULL)
    {
      fclose(m_file);
    }
  }

  // Lets the wrapper be passed directly to stdio calls (fseek, fread, ...).
  operator FILE*() const { return m_file; }

  // True when a stream is held, false when constructed from NULL.
  operator bool() const { return m_file != NULL; }

private:
  // Not copyable: two owners would fclose() the same stream twice.
  // Declared but intentionally never defined.
  ScopedFile(const ScopedFile&);
  ScopedFile& operator=(const ScopedFile&);

  FILE* m_file;
};
// Opens a text file through _tfopen() with a ", ccs=<encoding>" suffix
// appended to 'mode', where the encoding is chosen from the file's existing
// content.
//
//   file - path of the file to open.
//   mode - fopen-style mode string; it is passed on with the suffix appended.
//
// Returns whatever _tfopen() returns for the extended mode string.
//
// Side effects: progress/diagnostic messages are written to g_output, and
// the process is terminated with exit(-1) when the file starts with a
// UTF-32 BOM or CheckBOM() yields a value this function does not know.
FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
{
  extern FILE *g_output;

  // Encoding used when the content is not inspected (mode contains neither
  // "r" nor "w+") or cannot be inspected (file missing or unreadable).
  CValidateUnicode::FILE_TYPE ftype = CValidateUnicode::UTF_16LE;

  // If we are reading an existing file, check to see what type of file it
  // is first.
  if (_tcsstr(mode, _T("w+")) ||
      _tcsstr(mode, _T("r")))
  {
    ScopedFile fp(_tfopen(file, _T("rb")));
    if (fp)
    {
      // ftell() reports failure as -1L. Keep the value signed so an error is
      // never mistaken for an enormous file size.
      long fileSize = -1;
      if (fseek(fp, 0, SEEK_END) == 0)
      {
        fileSize = ftell(fp);
      }

      if (fileSize == 0)
      {
        // Empty files are treated as UTF-8.
        ftype = CValidateUnicode::UTF_8;
      }
      else if (fileSize > 0 && fseek(fp, 0, SEEK_SET) == 0)
      {
        std::vector<unsigned char> buffer(static_cast<size_t>(fileSize));
        // Classify only the bytes that were really read; a short read must
        // not leave zero-filled padding in the buffer.
        const size_t bytesRead =
          fread(&buffer[0], sizeof(unsigned char), buffer.size(), fp);
        buffer.resize(bytesRead);

        if (buffer.empty())
        {
          _ftprintf(g_output, _T("Could not read '%s' to detect its encoding.\n"), file);
        }
        else
        {
          ftype = CValidateUnicode::CheckBOM(&buffer[0], buffer.size());
          switch (ftype)
          {
            case CValidateUnicode::UTF_8:
            case CValidateUnicode::UTF_16LE:
            case CValidateUnicode::UTF_16BE:
              //_ftprintf(g_output, _T("File '%s' has a BOM marked as %s.\n"),
              //          file, CValidateUnicode::TypeToName(ftype));
              break;

            case CValidateUnicode::UTF_32LE:
            case CValidateUnicode::UTF_32BE:
              _ftprintf(g_output, _T("File '%s' has a BOM marked as %s which is not supported at this time.\n"),
                        file, CValidateUnicode::TypeToName(ftype));
              exit(-1);
              break;

            case CValidateUnicode::UNKNOWN:
              // If unknown, let's see if it's not just UTF_8 without a BOM.
              if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()))
              {
                ftype = CValidateUnicode::UTF_8;
                _ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file);
              }
              else
              {
                _ftprintf(g_output, _T("File '%s' has no BOM and does not validate as UTF-8.\n"), file);
              }
              break;

            default:
              _ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"),
                        file, static_cast<int>(ftype));
              exit(-1);
              break;
          }
        }
      }
      else
      {
        _ftprintf(g_output, _T("Could not read '%s' to detect its encoding.\n"), file);
      }
    }
  }

  // Extend the caller's mode with the encoding flag for the detected type.
  tstring strMode(mode);
  switch (ftype)
  {
    case CValidateUnicode::UTF_8:
      strMode.append(_T(", ccs=UTF-8"));
      _ftprintf(g_output, _T("Opening '%s' as UTF-8.\n"), file);
      break;

    case CValidateUnicode::UTF_16LE:
      strMode.append(_T(", ccs=UTF-16LE"));
      _ftprintf(g_output, _T("Opening '%s' as UTF-16LE.\n"), file);
      break;

    default:
      // Looks like fopen() doesn't support other encodings of Unicode.
      // Reached for UTF_16BE and for UNKNOWN (no BOM, not valid UTF-8).
      strMode.append(_T(", ccs=UNICODE"));
      _ftprintf(g_output, _T("Opening '%s' as ANSI.\n"), file);
      break;
  }

  return _tfopen(file, strMode.c_str());
}
#endif

View file

@ -29,7 +29,8 @@ typedef std::wstring tstring;
typedef std::wofstream tofstream;
typedef std::wifstream tifstream;
// Use the following macros to open text files.
#define FOPENTEXT(file, mode) _wfopen(file, mode)
FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode);
#define FOPENTEXT(file, mode) FileOpenUnicodeText(file, mode)
#else
typedef std::string tstring;
typedef std::ofstream tofstream;

235
Source/validateunicode.cpp Normal file
View file

@ -0,0 +1,235 @@
// validateunicode.cpp
//
// This file is a part of Unicode NSIS.
//
// Copyright (C) 2009 - Jim Park
//
// Licensed under the zlib/libpng license (the "License");
// you may not use this file except in compliance with the License.
//
// This software is provided 'as-is', without any expressed or implied
// warranty.
//
// This class can be used to check a buffer to see if it has the expected
// Unicode encoding and look for byte order marks.
#ifdef _UNICODE
#include "validateunicode.h"
#include <vector>
// File-local lookup data used to classify the first byte of a UTF-8
// sequence.
namespace
{
  // One classification rule: a byte matches when (byte >> m_rShift) equals
  // m_result, and m_bytesToFollow is then the number of bytes that must
  // come after it.
  struct CUTF8BytesToFollow
  {
    unsigned char m_rShift;
    unsigned char m_result;
    unsigned char m_bytesToFollow;
  };

  // The rules, ordered from shortest to longest sequence. The matched bit
  // prefixes do not overlap, so at most one row applies to any byte.
  const CUTF8BytesToFollow g_utf8BytesToFollow[] =
  {
    // r-shift, result, bytes to follow
    { 7, 0x00, 0 },   // 0xxxxxxx
    { 5, 0x06, 1 },   // 110xxxxx
    { 4, 0x0e, 2 },   // 1110xxxx
    { 3, 0x1e, 3 },   // 11110xxx
    { 2, 0x3e, 4 },   // 111110xx
    { 1, 0x7e, 5 }    // 1111110x
  };
}
// Checks that 'buf' holds structurally well-formed UTF-8.
//
//   buf        - bytes to examine; only dereferenced when characters > 0.
//   characters - number of bytes in 'buf'.
//
// Returns true when every lead byte is recognised by GetBytesToFollow() and
// is followed, inside the buffer, by the announced number of continuation
// bytes (10xxxxxx). A NUL byte is accepted only as the very last byte. An
// empty buffer is valid.
//
// Only the byte structure is checked: overlong forms and code-point ranges
// are not examined, and sequence lengths are whatever GetBytesToFollow()
// reports.
bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
{
  while (characters > 0)
  {
    const unsigned char lead = *buf;

    // Last character may be 0.
    if (lead == 0)
    {
      return characters == 1;
    }

    const int bytesToFollow = GetBytesToFollow(lead);
    if (bytesToFollow < 0)
    {
      // Not a legal first byte (e.g. a stray continuation byte).
      return false;
    }

    const size_t sequenceLength = static_cast<size_t>(bytesToFollow) + 1;
    if (sequenceLength > characters)
    {
      // The sequence is cut off by the end of the buffer; never read past it.
      return false;
    }

    for (size_t i = 1; i < sequenceLength; ++i)
    {
      // Every continuation byte must look like 10xxxxxx.
      if ((buf[i] >> 6) != 0x2)
      {
        return false;
      }
    }

    buf += sequenceLength;
    characters -= sequenceLength;
  }
  return true;
}
int CValidateUnicode::GetBytesToFollow(unsigned char ch)
{
int result = -1;
for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i)
{
if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result)
{
result = g_utf8BytesToFollow[i].m_bytesToFollow;
}
}
return result;
}
// Validates a byte buffer holding little-endian UTF-16.
//
//   buf   - raw bytes, low byte of each code unit first.
//   bytes - number of bytes in 'buf'; a trailing odd byte is ignored.
//
// Returns the result of ValidateUTF16() on the decoded code units.
bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
{
  // Assemble each code unit from its two bytes instead of reinterpreting the
  // buffer as unsigned short*: this gives the same answer on any host byte
  // order and makes no assumption about the alignment of 'buf'.
  const size_t units = bytes / 2;
  std::vector<unsigned short> decoded(units);
  for (size_t i = 0; i < units; ++i)
  {
    decoded[i] = static_cast<unsigned short>(buf[2 * i] | (buf[2 * i + 1] << 8));
  }

  // An empty vector has no element whose address could be taken.
  return ValidateUTF16(decoded.empty() ? 0 : &decoded[0], decoded.size());
}
// Validates a byte buffer holding big-endian UTF-16.
//
//   buf   - raw bytes, high byte of each code unit first.
//   bytes - number of bytes in 'buf'; a trailing odd byte is ignored.
//
// Returns the result of ValidateUTF16() on the decoded code units.
bool CValidateUnicode::ValidateUTF16BE(unsigned char* buf, size_t bytes)
{
  // Iterate over whole code units only, so an odd byte count can neither
  // write past the end of 'decoded' nor read past the end of 'buf'.
  const size_t units = bytes / 2;
  std::vector<unsigned short> decoded(units);
  for (size_t i = 0; i < units; ++i)
  {
    decoded[i] = static_cast<unsigned short>((buf[2 * i] << 8) | buf[2 * i + 1]);
  }

  // An empty vector has no element whose address could be taken.
  return ValidateUTF16(decoded.empty() ? 0 : &decoded[0], decoded.size());
}
// Checks a sequence of UTF-16 code units (already in host byte order).
//
//   buf        - code units to examine; only dereferenced when
//                characters > 0.
//   characters - number of code units in 'buf'.
//
// Returns false when the buffer contains
//   - a NUL code unit anywhere but the last position,
//   - a lead surrogate (D800-DBFF) not followed, inside the buffer, by a
//     trail surrogate (DC00-DFFF),
//   - a trail surrogate that is not preceded by a lead surrogate,
//   - FFFE, FFFF, anything in FDD0-FDEF, or a surrogate pair whose lead has
//     its low six bits set and whose trail is DFFE or DFFF.
// Returns true otherwise, including for an empty buffer.
bool CValidateUnicode::ValidateUTF16(unsigned short* buf, size_t characters)
{
  while (characters > 0)
  {
    const unsigned short ch = *buf;

    if (ch == 0)
    {
      // Last character may be 0.
      if (characters != 1)
      {
        return false;
      }
    }
    else if (ch >= 0xd800 && ch <= 0xdbff)
    {
      // A lead surrogate in the final position has no partner; stop here
      // rather than reading past the end of the buffer.
      if (characters < 2)
      {
        return false;
      }

      const unsigned short trailing = buf[1];

      // Unpaired leading surrogate found?
      if (trailing < 0xdc00 || trailing > 0xdfff)
      {
        return false;
      }

      // Invalid surrogate pairs found? Within D800-DBFF the leads with all
      // six low bits set are exactly D83F, D87F, D8BF, ... , DBFF.
      if ((ch & 0x3f) == 0x3f &&
          (trailing == 0xdffe || trailing == 0xdfff))
      {
        return false;
      }

      // Step over the lead here; the trail is consumed below.
      ++buf;
      --characters;
    }
    else if (ch >= 0xdc00 && ch <= 0xdfff)
    {
      // Unpaired trailing surrogate!
      return false;
    }
    else if (ch == 0xfffe || ch == 0xffff ||
             (ch >= 0xfdd0 && ch <= 0xfdef))
    {
      // Invalid values
      return false;
    }

    ++buf;
    --characters;
  }
  return true;
}
// Identifies the byte order mark, if any, at the start of 'buf'.
//
//   buf   - start of the data; only the first 'bytes' bytes (at most four)
//           are examined.
//   bytes - number of bytes available in 'buf'.
//
// Returns the FILE_TYPE matching the BOM, or UNKNOWN when the buffer does
// not start with one of the recognised marks.
CValidateUnicode::FILE_TYPE CValidateUnicode::CheckBOM(
  unsigned char* buf,
  size_t bytes)
{
  // The four-byte marks go first: the UTF-32LE mark (FF FE 00 00) starts
  // with the UTF-16LE mark (FF FE) and would otherwise never be reported.
  // The cost is that UTF-16LE data whose first character is U+0000 is
  // reported as UTF-32LE.
  if (bytes >= 4)
  {
    if (buf[0] == 0 &&
        buf[1] == 0 &&
        buf[2] == 0xfe &&
        buf[3] == 0xff)
    {
      return UTF_32BE;
    }
    if (buf[0] == 0xff &&
        buf[1] == 0xfe &&
        buf[2] == 0 &&
        buf[3] == 0)
    {
      return UTF_32LE;
    }
  }

  if (bytes >= 3 &&
      buf[0] == 0xef &&
      buf[1] == 0xbb &&
      buf[2] == 0xbf)
  {
    return UTF_8;
  }

  if (bytes >= 2)
  {
    if (buf[0] == 0xff && buf[1] == 0xfe)
    {
      return UTF_16LE;
    }
    if (buf[0] == 0xfe && buf[1] == 0xff)
    {
      return UTF_16BE;
    }
  }

  return UNKNOWN;
}
// Returns a static, human-readable name for 'ftype'.
//
// The table is indexed by the numeric value of FILE_TYPE, so its order must
// match the enum. Any value outside the table (negative or too large) is
// reported as "UNKNOWN" instead of indexing past the array.
const TCHAR* CValidateUnicode::TypeToName(CValidateUnicode::FILE_TYPE ftype)
{
  static const TCHAR* const names[] =
  {
    _T("UTF-8"),
    _T("UTF-16LE"),
    _T("UTF-16BE"),
    _T("UTF-32LE"),
    _T("UTF-32BE"),
    _T("UNKNOWN")
  };
  const size_t nameCount = sizeof(names) / sizeof(names[0]);

  // A negative enum value converts to a huge size_t and so also falls
  // through to the last entry.
  const size_t index = static_cast<size_t>(ftype);
  return names[index < nameCount ? index : nameCount - 1];
}
#endif

61
Source/validateunicode.h Normal file
View file

@ -0,0 +1,61 @@
// validateunicode.h
//
// This file is a part of Unicode NSIS.
//
// Copyright (C) 2009 Jim Park
//
// Licensed under the zlib/libpng license (the "License");
// you may not use this file except in compliance with the License.
//
// This software is provided 'as-is', without any expressed or implied
// warranty.
//
// This class can be used to check a buffer to see if it has the expected
// Unicode encoding and look for byte order marks.
#ifndef _VALIDATEUNICODE_
#define _VALIDATEUNICODE_
#include "tchar.h"
// Collection of static helpers for recognising and checking Unicode
// encodings in raw buffers. The class has no data members.
class CValidateUnicode
{
public:
  // The encodings that can be reported. TypeToName() uses the numeric value
  // as an index into its name table, so the values are spelled out.
  enum FILE_TYPE
  {
    UTF_8    = 0,
    UTF_16LE = 1,
    UTF_16BE = 2,
    UTF_32LE = 3,
    UTF_32BE = 4,
    UNKNOWN  = 5
  };

  // Is the buffer ('characters' is its length in bytes) valid UTF-8?
  static bool ValidateUTF8(unsigned char* buf, size_t characters);

  // Is the buffer of 'bytes' bytes valid little-endian UTF-16?
  static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);

  // Is the buffer of 'bytes' bytes valid big-endian UTF-16?
  static bool ValidateUTF16BE(unsigned char* buf, size_t bytes);

  // Is the array of 'characters' 16-bit code units valid UTF-16?
  static bool ValidateUTF16(unsigned short* buf, size_t characters);

  // Which byte order mark, if any, does the buffer start with?
  static FILE_TYPE CheckBOM(unsigned char* buf, size_t bytes);

  // Printable name of a FILE_TYPE value.
  static const TCHAR* TypeToName(FILE_TYPE ftype);

protected:
  // Number of bytes that must follow the given first byte of a UTF-8
  // sequence.
  static int GetBytesToFollow(unsigned char ch);
};
#endif