Unicode: Improved UTF8 detection, less verbose

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6067 212acab6-be3b-0410-9dea-997c60f758d6
2010-04-20 15:29:55 +00:00 · 2010-04-20 15:29:55 +00:00 · ca54cf728c
commit ca54cf728c
parent c8d77cd501
4 changed files with 38 additions and 79 deletions
--- a/Source/SConscript
+++ b/Source/SConscript
@ -23,7 +23,9 @@ makensis_files = Split("""
 	ShConstants.cpp
 	strlist.cpp
 	tokens.cpp
 	tstring.cpp
 	util.cpp
 	validateunicode.cpp
 	winchar.cpp
 	writer.cpp
 """)
--- a/Source/tstring.cpp
+++ b/Source/tstring.cpp
@ -75,8 +75,6 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
 				   case CValidateUnicode::UTF_8:
 				   case CValidateUnicode::UTF_16LE:
 				   case CValidateUnicode::UTF_16BE:
 					   //_ftprintf(g_output, _T("File '%s' has a BOM marked as %s.\n"),
 					   //   file, CValidateUnicode::TypeToName(ftype));
 					   break;
 				   case CValidateUnicode::UTF_32LE:
 				   case CValidateUnicode::UTF_32BE:
@ -86,15 +84,12 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
 					   break;
 				   case CValidateUnicode::UNKNOWN:
 					   // If unknown, let's see if it's not just UTF_8 without a BOM.
-					   if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()))
+					   if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()) == 2)
 					   {
-						   ftype = CValidateUnicode::UTF_8;
+                           // contains UTF-8 character sequences
 						   _ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file);
-					   }
+						   ftype = CValidateUnicode::UTF_8;
-					   else
+                       }
 					   {
 						   _ftprintf(g_output, _T("File '%s' has no BOM and does not validate as UTF-8.\n"), file);
 					   }
 					   break;
 				   default:
 					   _ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"),
@ -102,7 +97,7 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
 					   exit(-1);
 					   break;
 			   }
-         }			   
+			}			   
 		}
 	}
@ -112,16 +107,13 @@ FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
 	{
 		case CValidateUnicode::UTF_8:
 			strMode.append(_T(", ccs=UTF-8"));
 			_ftprintf(g_output, _T("Opening '%s' as UTF-8.\n"), file);
 			break;
 		case CValidateUnicode::UTF_16LE:
 			strMode.append(_T(", ccs=UTF-16LE"));
 			_ftprintf(g_output, _T("Opening '%s' as UTF-16LE.\n"), file);
 			break;
 		default:
 			// Looks like fopen() doesn't support other encodings of Unicode.
 			strMode.append(_T(", ccs=UNICODE"));
 			_ftprintf(g_output, _T("Opening '%s' as ANSI.\n"), file);
 			break;
 	}
--- a/Source/validateunicode.cpp
+++ b/Source/validateunicode.cpp
@ -18,76 +18,41 @@
 #include "validateunicode.h"
 #include <vector>
-// anonymous namespace
+int CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
 namespace
 {
-	struct CUTF8BytesToFollow
+	bool hasNonAscii = false;
 	{
 		unsigned char m_rShift;
 		unsigned char m_result;
 		unsigned char m_bytesToFollow;
 	};
 	const CUTF8BytesToFollow g_utf8BytesToFollow[] =
 	{
 		 /* r-shift, result, length */
 		{ 7,  0x0, 0},
 		{ 5,  0x6, 1},
 		{ 4,  0xe, 2},
 		{ 3, 0x1e, 3},
 		{ 2, 0x3e, 4},
 		{ 1, 0x7e, 5}
 	};
 };
 bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
 {
 	bool valid = true;
 	int bytesToFollow = 0;
-	while (valid && characters > 0)
+	for ( ; characters != 0 ; --characters)
 	{
-		// Last character may be 0.
+        unsigned char ch = *buf++;
-		if (*buf == 0 && characters != 1)
+        if (bytesToFollow != 0) // in the middle of a multi-byte sequence?
-		{
+        {
-			valid = false;
+            if ((ch & 0xC0) != 0x80)
-		}
+		    	return 0; // we expected a continuation byte
-		else
+            hasNonAscii = true;
-		{
+            --bytesToFollow;
-			bytesToFollow = GetBytesToFollow(*buf);
+        }
-			if (bytesToFollow > 0)
+        else if (ch & 0x80)
-			{
+        {
-				while (bytesToFollow)
+            if ((ch & 0xC0) == 0x80)
-				{
+		    	return 0; // continuation byte outside multi-byte sequence
-					++buf;
+            else if ((ch & 0xE0) == 0xC0)
-					--characters;
+		    	bytesToFollow = 1;
-					if (*buf >> 6 != 0x2)
+            else if ((ch & 0xF0) == 0xE0)
-					{
+		    	bytesToFollow = 2;
-						valid = false;
+            else if ((ch & 0xF8) == 0xF0)
-					}
+		    	bytesToFollow = 3;
-					--bytesToFollow;
+            else
-				}
+                return 0; // byte is invalid UTF-8 (outside RFC 3629)
-			}
+
-		}
+        }
-		++buf;
+        else if (ch == 0 && characters != 1)
-		--characters;
+	        return 0; // NUL character in the middle of the buffer
 	}
-
+    if (bytesToFollow != 0)
-	return valid;
+        return 0; // end of buffer in the middle of a multi-byte sequence
-}
+    return hasNonAscii ? 2 : 1;
 // Looks up a byte in g_utf8BytesToFollow and reports how many bytes the
 // matching table entry says should follow it.
 //
 // ch: the byte to classify.
 // Returns the m_bytesToFollow value of the table entry whose m_result equals
 // ch shifted right by that entry's m_rShift, or -1 when no entry matches
 // (with the table as listed, that is bytes of the form 10xxxxxx, 0xFE and 0xFF).
 int CValidateUnicode::GetBytesToFollow(unsigned char ch)
 {
 	// -1 means "no table entry matched".
 	int result = -1;
 	// NOTE(review): i is a signed int compared against a size_t expression;
 	// harmless for a table this small, but compilers may warn about it.
 	for (int i = 0; i < sizeof(g_utf8BytesToFollow)/sizeof(CUTF8BytesToFollow); ++i)
 	{
 		// An entry matches when the high bits left after the shift equal its pattern.
 		if (ch >> g_utf8BytesToFollow[i].m_rShift == g_utf8BytesToFollow[i].m_result)
 		{
 			// No break here: the scan continues, so the last matching entry wins.
 			// The listed patterns are mutually exclusive, so at most one matches.
 			result = g_utf8BytesToFollow[i].m_bytesToFollow;
 		}
 	}
 	return result;
 }
 bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
--- a/Source/validateunicode.h
+++ b/Source/validateunicode.h
@ -34,7 +34,7 @@ class CValidateUnicode
 		};
 		// Make sure that the buffer contains valid UTF-8 encoding.
-		static bool ValidateUTF8(unsigned char* buf, size_t characters);
+		static int ValidateUTF8(unsigned char* buf, size_t characters);
 		// Make sure that the buffer contains valid UTF-16LE encoding.
 		static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);