Unicode port: Support for Unicode/UTF8 input files by Jim Park.

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6066 212acab6-be3b-0410-9dea-997c60f758d6
2010-04-20 09:04:26 +00:00 · 2010-04-20 09:04:26 +00:00 · c8d77cd501
commit c8d77cd501
parent 2d3bf19b0d
4 changed files with 429 additions and 1 deletions
--- a/Source/tstring.cpp
+++ b/Source/tstring.cpp
@ -0,0 +1,131 @@
+// tstring.cpp
+//
+// This file is a part of Unicode NSIS.
+//
+// Copyright (C) 2007-2009 Jim Park
+//
+// Licensed under the zlib/libpng license (the "License");
+// you may not use this file except in compliance with the License.
+//
+// This software is provided 'as-is', without any expressed or implied
+// warranty.
+//
+// Provides TSTRING support.
+
+#ifdef _UNICODE
+
+#include "tstring.h"
+#include "validateunicode.h"
+#include <vector>
+
+// Scope guard for a C FILE*: the stream handed to the constructor is closed
+// when the guard is destroyed.  A NULL stream is tolerated, so the result of
+// a failed fopen() can be wrapped directly and then tested with if (guard).
+class ScopedFile
+{
+	public:
+		// Takes ownership of 'file', which may be NULL.
+		explicit ScopedFile(FILE* file) : m_file(file) {}
+
+		~ScopedFile()
+		{
+			if (this->m_file != NULL)
+			{
+				// fclose() writes out any pending data itself.  A separate
+				// fflush() is redundant for output streams and is not
+				// defined by ISO C for streams opened only for reading.
+				fclose(this->m_file);
+			}
+		}
+
+		// Lets the guard be passed straight to the C stdio functions.
+		operator FILE*() const { return this->m_file; }
+
+		// True when a stream is held.
+		operator bool() const { return this->m_file != NULL; }
+
+	private:
+		// Not copyable: two guards sharing one stream would fclose() it
+		// twice.  Declared but intentionally never defined.
+		ScopedFile(const ScopedFile&);
+		ScopedFile& operator=(const ScopedFile&);
+
+		FILE* m_file;
+};
+
+// Reads an already opened binary stream in full and works out which Unicode
+// encoding it holds.
+//
+// fp:   stream opened for binary reading.
+// file: name of the file, used only in the messages written to g_output.
+//
+// Returns the encoding named by the BOM; UTF_8 for an empty file or for a
+// BOM-less file that passes CValidateUnicode::ValidateUTF8(); UNKNOWN when
+// neither applies or when the contents cannot be read.  A UTF-32 BOM, or an
+// unexpected CheckBOM() result, is reported and ends the process with
+// exit(-1).
+static CValidateUnicode::FILE_TYPE DetectUnicodeFileType(FILE* fp, const TCHAR* file)
+{
+	extern FILE *g_output;
+
+	// ftell() signals failure with -1, so the size is kept signed until it
+	// has been checked; converting -1 to size_t would request a huge buffer.
+	long fileSize = -1;
+	if (fseek(fp, 0, SEEK_END) == 0)
+	{
+		fileSize = ftell(fp);
+	}
+
+	if (fileSize == 0)
+	{
+		// Empty files are treated as UTF-8.
+		return CValidateUnicode::UTF_8;
+	}
+
+	std::vector<unsigned char> buffer;
+	if (fileSize > 0 && fseek(fp, 0, SEEK_SET) == 0)
+	{
+		buffer.resize(static_cast<size_t>(fileSize));
+		// Keep only what was actually read so that a short read is not
+		// validated together with zero padding.
+		buffer.resize(fread(&buffer[0], sizeof(unsigned char), buffer.size(), fp));
+	}
+
+	if (buffer.empty())
+	{
+		_ftprintf(g_output, _T("Unable to read file '%s' to determine its encoding.\n"), file);
+		return CValidateUnicode::UNKNOWN;
+	}
+
+	CValidateUnicode::FILE_TYPE ftype = CValidateUnicode::CheckBOM(&buffer[0], buffer.size());
+
+	switch (ftype)
+	{
+		case CValidateUnicode::UTF_8:
+		case CValidateUnicode::UTF_16LE:
+		case CValidateUnicode::UTF_16BE:
+			//_ftprintf(g_output, _T("File '%s' has a BOM marked as %s.\n"),
+			//   file, CValidateUnicode::TypeToName(ftype));
+			break;
+		case CValidateUnicode::UTF_32LE:
+		case CValidateUnicode::UTF_32BE:
+			_ftprintf(g_output, _T("File '%s' has a BOM marked as %s which is not supported at this time.\n"),
+					file, CValidateUnicode::TypeToName(ftype));
+			exit(-1);
+			break;
+		case CValidateUnicode::UNKNOWN:
+			// If unknown, let's see if it's not just UTF_8 without a BOM.
+			if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()))
+			{
+				ftype = CValidateUnicode::UTF_8;
+				_ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file);
+			}
+			else
+			{
+				_ftprintf(g_output, _T("File '%s' has no BOM and does not validate as UTF-8.\n"), file);
+			}
+			break;
+		default:
+			_ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"),
+					file, ftype);
+			exit(-1);
+			break;
+	}
+
+	return ftype;
+}
+
+// Opens a text file with _tfopen(), appending a ", ccs=" encoding suffix to
+// the mode string.
+//
+// When 'mode' contains "r" or "w+" and the file can be opened for reading,
+// its current contents decide the encoding (see DetectUnicodeFileType()).
+// Otherwise the encoding defaults to UTF-16LE.
+//
+// file: path of the file to open.
+// mode: fopen()-style mode string.
+//
+// Returns the result of _tfopen() for the extended mode string.
+FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
+{
+	extern FILE *g_output;
+	CValidateUnicode::FILE_TYPE ftype = CValidateUnicode::UTF_16LE;
+
+	// If we are reading an existing file, check to see what type of file it
+	// is first.
+	if (_tcsstr(mode, _T("w+")) ||
+	    _tcsstr(mode, _T("r")))
+	{
+		// The probe stream is closed at the end of this scope, before the
+		// file is opened again below.
+		ScopedFile fp(_tfopen(file, _T("rb")));
+
+		if (fp)
+		{
+			ftype = DetectUnicodeFileType(fp, file);
+		}
+	}
+
+	tstring strMode(mode);
+
+	switch (ftype)
+	{
+		case CValidateUnicode::UTF_8:
+			strMode.append(_T(", ccs=UTF-8"));
+			_ftprintf(g_output, _T("Opening '%s' as UTF-8.\n"), file);
+			break;
+		case CValidateUnicode::UTF_16LE:
+			strMode.append(_T(", ccs=UTF-16LE"));
+			_ftprintf(g_output, _T("Opening '%s' as UTF-16LE.\n"), file);
+			break;
+		default:
+			// Looks like fopen() doesn't support other encodings of Unicode.
+			// NOTE(review): UTF-16BE files also end up here and are
+			// announced as ANSI -- confirm that this is intended.
+			strMode.append(_T(", ccs=UNICODE"));
+			_ftprintf(g_output, _T("Opening '%s' as ANSI.\n"), file);
+			break;
+	}
+
+	return _tfopen(file, strMode.c_str());
+}
+
+#endif
--- a/Source/tstring.h
+++ b/Source/tstring.h
@ -29,7 +29,8 @@ typedef std::wstring     tstring;
 typedef std::wofstream   tofstream;
 typedef std::wifstream   tifstream;
 // Use the following macros to open text files.
-#define FOPENTEXT(file, mode) _wfopen(file, mode)
+FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode);
+#define FOPENTEXT(file, mode) FileOpenUnicodeText(file, mode)
 #else
 typedef std::string      tstring;
 typedef std::ofstream    tofstream;
--- a/Source/validateunicode.cpp
+++ b/Source/validateunicode.cpp
@ -0,0 +1,235 @@
+// validateunicode.cpp
+//
+// This file is a part of Unicode NSIS.
+//
+// Copyright (C) 2009 - Jim Park
+//
+// Licensed under the zlib/libpng license (the "License");
+// you may not use this file except in compliance with the License.
+//
+// This software is provided 'as-is', without any expressed or implied
+// warranty.
+//
+// This class can be used to check a buffer to see if it has the expected
+// Unicode encoding and look for byte order marks.
+
+#ifdef _UNICODE
+
+#include "validateunicode.h"
+#include <vector>
+
+// File-local data describing the lead bytes of UTF-8 sequences.
+namespace
+{
+	// One table row.  A byte matches the row when shifting it right by
+	// m_rShift bits yields m_result; m_bytesToFollow is then the number of
+	// bytes that come after that lead byte.
+	struct CUTF8BytesToFollow
+	{
+		unsigned char m_rShift;
+		unsigned char m_result;
+		unsigned char m_bytesToFollow;
+	};
+
+	const CUTF8BytesToFollow g_utf8BytesToFollow[] =
+	{
+		/* r-shift, result, length */
+		{ 7, 0x00, 0 },   // 0xxxxxxx
+		{ 5, 0x06, 1 },   // 110xxxxx
+		{ 4, 0x0e, 2 },   // 1110xxxx
+		{ 3, 0x1e, 3 },   // 11110xxx
+		{ 2, 0x3e, 4 },   // 111110xx
+		{ 1, 0x7e, 5 }    // 1111110x
+	};
+}
+
+// Checks whether a buffer is structurally valid UTF-8.
+//
+// buf:        bytes to examine.
+// characters: number of bytes in buf (not code points).
+//
+// Returns false for a NUL byte anywhere but the last position, for a byte
+// that cannot start a sequence, for the obsolete 5- and 6-byte forms, for a
+// sequence cut off by the end of the buffer, and for a lead byte that is not
+// followed by the required continuation bytes.  Overlong encodings, encoded
+// surrogates and values above U+10FFFF are not diagnosed.
+bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
+{
+	while (characters > 0)
+	{
+		const unsigned char lead = *buf++;
+		--characters;
+
+		// Last character may be 0.
+		if (lead == 0 && characters != 0)
+		{
+			return false;
+		}
+
+		const int bytesToFollow = GetBytesToFollow(lead);
+
+		// A negative count means the byte is no lead byte at all (a stray
+		// continuation byte, 0xFE or 0xFF).  Counts above three belong to
+		// the 5- and 6-byte forms that UTF-8 no longer allows.
+		if (bytesToFollow < 0 || bytesToFollow > 3)
+		{
+			return false;
+		}
+
+		// The sequence must fit in what is left of the buffer; checking
+		// first keeps the loop below from reading past the end.
+		if (static_cast<size_t>(bytesToFollow) > characters)
+		{
+			return false;
+		}
+
+		for (int i = 0; i < bytesToFollow; ++i)
+		{
+			// Continuation bytes have the form 10xxxxxx.
+			if ((*buf >> 6) != 0x2)
+			{
+				return false;
+			}
+			++buf;
+			--characters;
+		}
+	}
+
+	return true;
+}
+
+// Looks up the first byte of a UTF-8 sequence in g_utf8BytesToFollow.
+//
+// Returns the m_bytesToFollow value of the matching row, or -1 when no row
+// matches.  Should several rows match, the one latest in the table wins.
+int CValidateUnicode::GetBytesToFollow(unsigned char ch)
+{
+	const size_t rowCount = sizeof(g_utf8BytesToFollow) / sizeof(g_utf8BytesToFollow[0]);
+
+	// Scan from the back so that the first hit is the last matching row.
+	for (size_t row = rowCount; row-- > 0; )
+	{
+		const CUTF8BytesToFollow& entry = g_utf8BytesToFollow[row];
+		if ((ch >> entry.m_rShift) == entry.m_result)
+		{
+			return entry.m_bytesToFollow;
+		}
+	}
+
+	return -1;
+}
+
+// Validates a buffer that holds UTF-16 text in little-endian byte order.
+//
+// buf:   raw bytes of the text.
+// bytes: number of bytes in buf; an odd trailing byte is ignored.
+//
+// The 16-bit code units are assembled from the individual bytes, so the
+// result depends neither on the byte order of the host nor on the alignment
+// of buf.
+bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
+{
+	const size_t units = bytes / 2;
+	if (units == 0)
+	{
+		// No complete code unit, so there is nothing that could be invalid.
+		return true;
+	}
+
+	std::vector<unsigned short> decoded(units);
+	for (size_t i = 0; i < units; ++i)
+	{
+		// Low byte first, high byte second.
+		decoded[i] = static_cast<unsigned short>(buf[2 * i] | (buf[2 * i + 1] << 8));
+	}
+
+	return ValidateUTF16(&decoded[0], decoded.size());
+}
+
+// Validates a buffer that holds UTF-16 text in big-endian byte order.
+//
+// buf:   raw bytes of the text.
+// bytes: number of bytes in buf; an odd trailing byte is ignored.
+//
+// The bytes are combined into 16-bit code units (high byte first) and the
+// units are then handed to ValidateUTF16().
+bool CValidateUnicode::ValidateUTF16BE(unsigned char* buf, size_t bytes)
+{
+	// Iterate over whole code units only.  Stepping through the bytes two at
+	// a time would, for an odd byte count, write one element past the vector
+	// and read one byte past buf.
+	const size_t units = bytes / 2;
+	if (units == 0)
+	{
+		// No complete code unit, so there is nothing that could be invalid.
+		return true;
+	}
+
+	std::vector<unsigned short> correctedBuf(units);
+	for (size_t i = 0; i < units; ++i)
+	{
+		correctedBuf[i] = static_cast<unsigned short>((buf[2 * i] << 8) | buf[2 * i + 1]);
+	}
+
+	return ValidateUTF16(&correctedBuf[0], correctedBuf.size());
+}
+
+// Checks a sequence of 16-bit code units for valid UTF-16.
+//
+// buf:        code units in the byte order of the host.
+// characters: number of code units in buf.
+//
+// Returns false for a NUL unit anywhere but the last position, for unpaired
+// or truncated surrogates, and for the noncharacters U+FDD0..U+FDEF and
+// U+xxFFFE / U+xxFFFF on every plane.
+bool CValidateUnicode::ValidateUTF16(unsigned short* buf, size_t characters)
+{
+	while (characters > 0)
+	{
+		const unsigned short ch = *buf++;
+		--characters;
+
+		if (ch == 0)
+		{
+			// Last character may be 0.
+			if (characters != 0)
+			{
+				return false;
+			}
+		}
+		else if (ch >= 0xd800 && ch <= 0xdbff)
+		{
+			// A leading surrogate at the very end has no partner; checking
+			// here also keeps the read below inside the buffer.
+			if (characters == 0)
+			{
+				return false;
+			}
+
+			const unsigned short trailing = *buf++;
+			--characters;
+
+			// Unpaired leading surrogate found?
+			if (trailing < 0xdc00 || trailing > 0xdfff)
+			{
+				return false;
+			}
+
+			// Invalid surrogate pairs found?  Leading surrogates whose low
+			// six bits are all set (0xd83f, 0xd87f, ... 0xdbff) combined
+			// with 0xdffe or 0xdfff encode U+xxFFFE or U+xxFFFF.
+			if ((ch & 0x003f) == 0x003f &&
+			    (trailing == 0xdffe || trailing == 0xdfff))
+			{
+				return false;
+			}
+		}
+		// Unpaired trailing surrogate!
+		else if (ch >= 0xdc00 && ch <= 0xdfff)
+		{
+			return false;
+		}
+		// Invalid values
+		else if (ch == 0xfffe || ch == 0xffff ||
+		         (ch >= 0xfdd0 && ch <= 0xfdef))
+		{
+			return false;
+		}
+	}
+
+	return true;
+}
+
+// Identifies the byte order mark at the start of a buffer.
+//
+// buf:   start of the data.
+// bytes: number of bytes available in buf.
+//
+// Returns the encoding named by the BOM, or UNKNOWN when the buffer does not
+// start with one of the recognized marks.
+CValidateUnicode::FILE_TYPE CValidateUnicode::CheckBOM(
+	unsigned char* buf,
+	size_t         bytes)
+{
+	// The UTF-32 marks are tested first: the UTF-32LE mark FF FE 00 00
+	// begins with the UTF-16LE mark FF FE and would otherwise never be
+	// reported.
+	if (bytes >= 4)
+	{
+		if (buf[0] == 0xff && buf[1] == 0xfe && buf[2] == 0 && buf[3] == 0)
+		{
+			return UTF_32LE;
+		}
+		if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xfe && buf[3] == 0xff)
+		{
+			return UTF_32BE;
+		}
+	}
+
+	if (bytes >= 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
+	{
+		return UTF_8;
+	}
+
+	if (bytes >= 2)
+	{
+		if (buf[0] == 0xff && buf[1] == 0xfe)
+		{
+			return UTF_16LE;
+		}
+		if (buf[0] == 0xfe && buf[1] == 0xff)
+		{
+			return UTF_16BE;
+		}
+	}
+
+	return UNKNOWN;
+}
+
+// Returns the display name of an encoding.
+//
+// ftype: encoding to name.
+//
+// The returned pointer refers to a string literal.  A value outside the
+// FILE_TYPE enumerators yields "UNKNOWN" instead of indexing past a table.
+const TCHAR* CValidateUnicode::TypeToName(CValidateUnicode::FILE_TYPE ftype)
+{
+	switch (ftype)
+	{
+		case UTF_8:    return _T("UTF-8");
+		case UTF_16LE: return _T("UTF-16LE");
+		case UTF_16BE: return _T("UTF-16BE");
+		case UTF_32LE: return _T("UTF-32LE");
+		case UTF_32BE: return _T("UTF-32BE");
+		case UNKNOWN:
+		default:       return _T("UNKNOWN");
+	}
+}
+
+#endif
--- a/Source/validateunicode.h
+++ b/Source/validateunicode.h
@ -0,0 +1,61 @@
+// validateunicode.h
+//
+// This file is a part of Unicode NSIS.
+//
+// Copyright (C) 2009 Jim Park
+//
+// Licensed under the zlib/libpng license (the "License");
+// you may not use this file except in compliance with the License.
+//
+// This software is provided 'as-is', without any expressed or implied
+// warranty.
+//
+// This class can be used to check a buffer to see if it has the expected
+// Unicode encoding and look for byte order marks.
+
+#ifndef _VALIDATEUNICODE_
+#define _VALIDATEUNICODE_
+
+#include "tchar.h"
+
+// Static helpers for detecting byte order marks and validating Unicode
+// encoded buffers.  The class has no data members.
+class CValidateUnicode
+{
+	public:
+
+		// Encodings that can be told apart.  The numeric values double as
+		// indices into the name table used by TypeToName().
+		enum FILE_TYPE
+		{
+			UTF_8    = 0,
+			UTF_16LE = 1,
+			UTF_16BE = 2,
+			UTF_32LE = 3,
+			UTF_32BE = 4,
+			UNKNOWN  = 5
+		};
+
+		// Reports the encoding named by the byte order mark at the start of
+		// buf, or UNKNOWN when no mark is recognized.  'bytes' is the number
+		// of bytes available in buf.
+		static FILE_TYPE CheckBOM(unsigned char* buf, size_t bytes);
+
+		// Returns the printable name of a FILE_TYPE value.
+		static const TCHAR* TypeToName(FILE_TYPE ftype);
+
+		// Is the buffer valid UTF-8?  Despite its name, 'characters' is the
+		// number of bytes in buf.
+		static bool ValidateUTF8(unsigned char* buf, size_t characters);
+
+		// Is the buffer valid UTF-16 stored in little-endian byte order?
+		// 'bytes' is the size of buf in bytes.
+		static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);
+
+		// Is the buffer valid UTF-16 stored in big-endian byte order?
+		// 'bytes' is the size of buf in bytes.
+		static bool ValidateUTF16BE(unsigned char* buf, size_t bytes);
+
+		// Is the buffer valid UTF-16?  'characters' is the number of 16-bit
+		// code units in buf.
+		static bool ValidateUTF16(unsigned short* buf, size_t characters);
+
+	protected:
+
+		// For the first byte of a UTF-8 sequence, returns how many more
+		// bytes belong to the sequence, or -1 if the byte matches no known
+		// lead-byte pattern.
+		static int GetBytesToFollow(unsigned char ch);
+};
+
+#endif