Unicode port: Support for Unicode/UTF8 input files by Jim Park.

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6066 212acab6-be3b-0410-9dea-997c60f758d6
2010-04-20 09:04:26 +00:00 · 2010-04-20 09:04:26 +00:00 · c8d77cd501
commit c8d77cd501
parent 2d3bf19b0d
4 changed files with 429 additions and 1 deletions
--- a/Source/validateunicode.cpp
+++ b/Source/validateunicode.cpp
@ -0,0 +1,235 @@
+// validateunicode.cpp
+//
+// This file is a part of Unicode NSIS.
+//
+// Copyright (C) 2009 - Jim Park
+//
+// Licensed under the zlib/libpng license (the "License");
+// you may not use this file except in compliance with the License.
+//
+// This software is provided 'as-is', without any expressed or implied
+// warranty.
+//
+// This class can be used to check a buffer to see if it has the expected
+// Unicode encoding and look for byte order marks.
+
+#ifdef _UNICODE
+
+#include "validateunicode.h"
+#include <vector>
+
+// anonymous namespace
+namespace
+{
+	struct CUTF8BytesToFollow
+	{
+		unsigned char m_rShift;
+		unsigned char m_result;
+		unsigned char m_bytesToFollow;
+	};
+
+	const CUTF8BytesToFollow g_utf8BytesToFollow[] =
+	{
+		 /* r-shift, result, length */
+		{ 7,  0x0, 0},
+		{ 5,  0x6, 1},
+		{ 4,  0xe, 2},
+		{ 3, 0x1e, 3},
+		{ 2, 0x3e, 4},
+		{ 1, 0x7e, 5}
+	};
+};
+
+bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
+{
+	bool valid = true;
+	int bytesToFollow = 0;
+
+	while (valid && characters > 0)
+	{
+		// Last character may be 0.
+		if (*buf == 0 && characters != 1)
+		{
+			valid = false;
+		}
+		else
+		{
+			bytesToFollow = GetBytesToFollow(*buf);
+			if (bytesToFollow > 0)
+			{
+				while (bytesToFollow)
+				{
+					++buf;
+					--characters;
+					if (*buf >> 6 != 0x2)
+					{
+						valid = false;
+					}
+					--bytesToFollow;
+				}
+			}
+		}
+		++buf;
+		--characters;
+	}
+
+	return valid;
+}
+
+int CValidateUnicode::GetBytesToFollow(unsigned char ch)
+{
+	int result = -1;
+	for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i)
+	{
+		if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result)
+		{
+			result = g_utf8BytesToFollow[i].m_bytesToFollow;
+		}
+	}
+
+	return result;
+}
+
+bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
+{
+	// We need to make sure the endianness matches the processor.
+	// Intel x86 is little endian.
+	return ValidateUTF16((unsigned short*)(buf), bytes/2);
+}
+
+bool CValidateUnicode::ValidateUTF16BE(unsigned char* buf, size_t bytes)
+{
+	std::vector<unsigned short> correctedBuf(bytes/2);
+
+	for (size_t i = 0; i < bytes; i += 2)
+	{
+		correctedBuf[i/2] = buf[i] << 8 | buf[i+1];
+	}
+
+	return ValidateUTF16(&correctedBuf[0], correctedBuf.size());
+}
+
+bool CValidateUnicode::ValidateUTF16(unsigned short* buf, size_t characters)
+{
+	unsigned short ch;
+	bool valid = true;
+
+	while (valid && characters > 0)
+	{
+		// Last character may be 0.
+		if ((ch = *buf) == 0 && characters != 1)
+		{
+			valid = false;
+		}
+		else if (ch >= 0xd800 && ch <= 0xdbff)
+		{
+			unsigned short trailing = *(++buf);
+			--characters;
+			// Unpaired leading surrogate found?
+			if (trailing < 0xdc00 || trailing > 0xdfff)
+			{
+				valid = false;
+			}
+			// Invalid surrogate pairs found?
+			else if ((ch == 0xd83f ||
+						 ch == 0xd87f ||
+						 ch == 0xd8bf ||
+						 ch == 0xd8ff ||
+						 ch == 0xd93f ||
+						 ch == 0xd97f ||
+						 ch == 0xd9bf ||
+						 ch == 0xd9ff ||
+						 ch == 0xda3f ||
+						 ch == 0xdA7f ||
+						 ch == 0xdabf ||
+						 ch == 0xdaff ||
+						 ch == 0xdb3f ||
+						 ch == 0xdb7f ||
+						 ch == 0xdbbf ||
+						 ch == 0xdbff)
+					  	&&
+				     	(trailing == 0xdffe || trailing == 0xdfff))
+			{
+				valid = false;
+			}
+		}
+		// Unpaired trailing surrogate!
+		else if (ch >= 0xdc00 && ch <= 0xdfff)
+		{
+			valid = false;
+		}
+		// Invalid values
+		else if (ch == 0xfffe || ch == 0xffff ||
+				   (ch >= 0xfdd0 && ch <= 0xfdef))
+		{
+			valid = false;
+		}
+
+		++buf;
+	   --characters;
+	}
+
+	return valid;
+}
+
+CValidateUnicode::FILE_TYPE CValidateUnicode::CheckBOM(
+	unsigned char* buf,
+	size_t         bytes)
+{
+	FILE_TYPE result = UNKNOWN;
+
+	if (bytes >= 2)
+	{
+		if (buf[0] == 0xff && buf[1] == 0xfe)
+		{
+			result = UTF_16LE;
+		}
+		else if (buf[0] == 0xfe && buf[1] == 0xff)
+		{
+			result = UTF_16BE;
+		}
+		else if (bytes >= 3 &&
+			    	buf[0] == 0xef &&
+					buf[1] == 0xbb &&
+					buf[2] == 0xbf)
+		{
+			result = UTF_8;
+		}
+		else if (bytes >= 4)
+		{
+			if (buf[0] == 0 &&
+				 buf[1] == 0 &&
+				 buf[2] == 0xfe &&
+				 buf[3] == 0xff)
+			{
+				result = UTF_32BE;
+			}
+			else if (buf[0] == 0xff &&
+					   buf[1] == 0xfe &&
+						buf[2] == 0 &&
+						buf[3] == 0)
+			{
+				result = UTF_32LE;
+			}
+		}
+	}
+
+	return result;
+}
+
+const TCHAR* CValidateUnicode::TypeToName(CValidateUnicode::FILE_TYPE ftype)
+{
+	static const TCHAR* names[] =
+	{
+		_T("UTF-8"),
+		_T("UTF-16LE"),
+		_T("UTF-16BE"),
+		_T("UTF-32LE"),
+		_T("UTF-32BE"),
+		_T("UNKNOWN")
+	};
+
+	return names[ftype];
+}
+
+#endif