From ef8a83bd41d0c801c3589b461ad487a789166ed1 Mon Sep 17 00:00:00 2001
From: anders_k <anders_k@212acab6-be3b-0410-9dea-997c60f758d6>
Date: Mon, 5 Dec 2011 23:44:26 +0000
Subject: [PATCH] Basic UTF-8 support in ansi build so it can read UTF-8 .nlf
 files and LangStrings

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6196 212acab6-be3b-0410-9dea-997c60f758d6
---
 Source/Platform.h |  3 ++
 Source/SConscript |  1 +
 Source/build.cpp  |  3 ++
 Source/build.h    |  6 ++++
 Source/lang.cpp   | 49 ++++++++++++++++++++++++++-
 Source/script.cpp | 19 ++++++++++-
 Source/utf.cpp    | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 Source/utf.h      | 43 ++++++++++++++++++++++++
 Source/util.cpp   | 20 +++++++++--
 Source/util.h     |  3 ++
 10 files changed, 227 insertions(+), 5 deletions(-)
 create mode 100644 Source/utf.cpp
 create mode 100644 Source/utf.h

diff --git a/Source/Platform.h b/Source/Platform.h
index 2f9371c6..e3135f9a 100644
--- a/Source/Platform.h
+++ b/Source/Platform.h
@@ -448,6 +448,9 @@ typedef DWORDLONG ULONGLONG,*PULONGLONG;
 #ifndef CP_ACP
 #  define CP_ACP 0
 #endif
+#ifndef CP_UTF8
+#  define CP_UTF8 65001
+#endif
 
 #ifndef COLOR_BTNFACE
 #  define COLOR_BTNFACE 15
diff --git a/Source/SConscript b/Source/SConscript
index b5ac6df5..d7e190ba 100644
--- a/Source/SConscript
+++ b/Source/SConscript
@@ -24,6 +24,7 @@ makensis_files = Split("""
 	strlist.cpp
 	tokens.cpp
 	tstring.cpp
+	utf.cpp
 	util.cpp
 	validateunicode.cpp
 	winchar.cpp
diff --git a/Source/build.cpp b/Source/build.cpp
index db86da36..5c23c1ca 100644
--- a/Source/build.cpp
+++ b/Source/build.cpp
@@ -121,6 +121,9 @@ CEXEBuild::CEXEBuild() :
   multiple_entries_instruction=0;
 
   build_include_depth=0;
+#ifndef _UNICODE
+  build_include_isutf8=false;
+#endif
 
   has_called_write_output=false;
 
diff --git a/Source/build.h b/Source/build.h
index cb77a175..1cf99dbf 100644
--- a/Source/build.h
+++ b/Source/build.h
@@ -328,6 +328,9 @@ class CEXEBuild {
      * this will return a PS_WARNING.
      */
     int SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL unicode);
+#ifndef _UNICODE
+    int SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8);
+#endif
 
     /**
      * Sets the user string to the specific NLF_STRINGS id.
@@ -424,6 +427,9 @@ class CEXEBuild {
     TCHAR build_output_filename[1024];
 
     int build_include_depth;
+#ifndef _UNICODE
+    bool build_include_isutf8; // UTF-8 LangString in .nsh hack for ANSI builds
+#endif
 
     // Added by ramon 6 jun 2003
 #ifdef NSIS_SUPPORT_VERSION_INFO
diff --git a/Source/lang.cpp b/Source/lang.cpp
index bda48b10..d1c486a3 100644
--- a/Source/lang.cpp
+++ b/Source/lang.cpp
@@ -26,6 +26,7 @@
 #include "exehead/resource.h"
 #include <nsis-version.h>
 #include "tstring.h"
+#include "utf.h"
 
 using namespace std;
 
@@ -492,6 +493,9 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un
 
   int sn;
 
+  if (_tcsclen(str) > NSIS_MAX_STRLEN-1) // Only warns; the string is still stored by the code below
+    warning_fl(_T("LangString \"%s\" longer than NSIS_MAX_STRLEN!"), name); // _T() keeps the format string a TCHAR literal, like the other warning_fl calls
+
   int pos = build_langstrings.get(name, &sn);
   if (pos < 0)
     pos = build_langstrings.add(name, &sn);
@@ -502,6 +506,21 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un
   return PS_OK;
 }
 
+#ifndef _UNICODE // ANSI builds only
+int CEXEBuild::SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8) // Converts stru8 from UTF-8 to the language's codepage and forwards it to SetLangString; PS_ERROR on any failure
+{
+  LanguageTable *table = GetLangTable(lang);
+  if (!table) return PS_ERROR; // GetLangTable gave us no table for lang
+  if (!Platform_SupportsUTF8Conversion()) return PS_ERROR;
+  // Convert into a temporary heap buffer, targeting the codepage stored in the table's NLF data
+  EXEHEADTCHAR_T *bufEHTStr = UTF8ToExeHeadTStr(stru8, table->nlf.m_uCodePage);
+  if (!bufEHTStr) return PS_ERROR; // Conversion or allocation failed
+  const int ret = SetLangString(name, lang, bufEHTStr, sizeof(EXEHEADTCHAR_T) > 1); // Last argument is the 'unicode' flag: true only if EXEHEADTCHAR_T is wider than one byte
+  ExeHeadTStrFree(bufEHTStr); // Release the temporary before returning SetLangString's result
+  return ret;
+}
+#endif
+
 // Sets the user string to the specific NLF_STRINGS id.
 //
 // @return If the id is invalid or the string is not valid, it will return a
@@ -925,6 +944,11 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
     return 0;
   }
 
+#ifndef _UNICODE
+  char fencoding = 0; // 0 = ansi, 8 = utf-8 (16/17 for utf-16le/be not supported)
+  if (IsUTF8BOM(f)) fencoding = 8; // IsUTF8BOM consumes the BOM when it finds one
+#endif
+
   // Check header
   TCHAR buf[NSIS_MAX_STRLEN];
   buf[0] = SkipComments(f);
@@ -1096,8 +1120,31 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
     buf[0] = SkipComments(f);
 
     _fgetts(buf+1, NSIS_MAX_STRLEN, f);
+#ifndef _UNICODE
+    if (8 == fencoding)
+    {
+      if (!Platform_SupportsUTF8Conversion()) {
+        ERROR_MSG(_T("Error: UTF-8 language files not supported on this OS!\n"));
+        return 0;
+      }
+      EXEHEADTCHAR_T *bufConv = UTF8ToExeHeadTStr(buf, nlf->m_uCodePage);
+      if (!bufConv) {
+        ERROR_MSG(_T("Error: Invalid UTF-8? (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
+        return 0;
+      }
+      else {
+        UINT cch = _tcslen(bufConv);
+        _tcsnccpy(buf, bufConv, NSIS_MAX_STRLEN);
+        if (cch >= NSIS_MAX_STRLEN-1) {
+          buf[NSIS_MAX_STRLEN-1] = _T('\0'); // Make sure we fail the "String too long" check
+        }
+      }
+      ExeHeadTStrFree(bufConv);
+    }
+#endif
+
     if (_tcslen(buf) == NSIS_MAX_STRLEN-1) {
-      ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")"), i, NLFStrings[i].szLangStringName);
+      ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
       return 0;
     }
     temp=_tcslen(buf);
diff --git a/Source/script.cpp b/Source/script.cpp
index eed1ef45..c02d0c30 100644
--- a/Source/script.cpp
+++ b/Source/script.cpp
@@ -34,6 +34,7 @@
 #include <cassert> // for assert(3)
 #include <time.h>
 #include "tstring.h"
+#include "utf.h"
 #include <algorithm>
 #include "boost/scoped_ptr.hpp"
 
@@ -813,6 +814,10 @@ int CEXEBuild::includeScript(TCHAR *f)
     return PS_ERROR;
   }
   build_include_depth++;
+#ifndef _UNICODE
+  const bool org_build_include_isutf8 = build_include_isutf8;
+  build_include_isutf8 = IsUTF8BOM(incfp);
+#endif
 
   int last_linecnt=linecnt;
   linecnt=0;
@@ -837,6 +842,10 @@ int CEXEBuild::includeScript(TCHAR *f)
   restore_timestamp_predefine(oldtimestamp);
 #endif
 
+#ifndef _UNICODE
+  build_include_isutf8 = org_build_include_isutf8;
+#endif
+
   int errlinecnt=linecnt;
 
   linecnt=last_linecnt;
@@ -1712,13 +1721,21 @@ int CEXEBuild::doCommand(int which_token, LineParser &line)
       TCHAR *name = line.gettoken_str(1);
       LANGID lang = line.gettoken_int(2);
       TCHAR *str = line.gettoken_str(3);
-      int ret = SetLangString(name, lang, str, curfile_unicode);
+      int ret;
+#ifndef _UNICODE
+        if (build_include_isutf8)
+          ret = SetUTF8LangString(name, lang, str);
+        else
+#endif
+          ret = SetLangString(name, lang, str, curfile_unicode);
+
       if (ret == PS_WARNING)
         warning_fl(_T("LangString \"%s\" set multiple times for %d, wasting space"), name, lang);
       else if (ret == PS_ERROR) {
         ERROR_MSG(_T("Error: can't set LangString \"%s\"!\n"), name);
         return PS_ERROR;
       }
+      // BUGBUG: Does not display UTF-8 properly.
       SCRIPT_MSG(_T("LangString: \"%s\" %d \"%s\"\n"), name, lang, str);
     }
     return PS_OK;
diff --git a/Source/utf.cpp b/Source/utf.cpp
new file mode 100644
index 00000000..1f6a4959
--- /dev/null
+++ b/Source/utf.cpp
@@ -0,0 +1,85 @@
+/*
+ * utf.cpp
+ * 
+ * This file is a part of NSIS.
+ * 
+ * Copyright (C) 2011 Anders Kjersem
+ * 
+ * Licensed under the zlib/libpng license (the "License");
+ * you may not use this file except in compliance with the License.
+ * 
+ * Licence details can be found in the file COPYING.
+ * 
+ * This software is provided 'as-is', without any express or implied
+ * warranty.
+ *
+ */
+
+#include "utf.h"
+
+// BUGBUG: We might want to use MB_ERR_INVALID_CHARS but it is not supported
+// on < WinXP or in our current POSIX implementation.
+static const int UTF8MBTWCFLAGS  = 0;
+
+
+#define ExeHeadWStrFree free
+static EXEHEADWCHAR_T* ExeHeadWStrAlloc(UINT cch) // Allocates exactly cch EXEHEADWCHAR_T units (no extra room for a terminator); NULL on failure, release with ExeHeadWStrFree
+{
+  EXEHEADWCHAR_T* s = (EXEHEADWCHAR_T*) malloc(cch*sizeof(EXEHEADWCHAR_T));
+#if 0
+  // TODO: We should add POSIX versions of  G/SetLastError
+  // if we want to tell _why_ UTF8ToExeHeadTStr failed...
+  if (!s) SetLastError(ERROR_OUTOFMEMORY);
+#endif
+  return s; // May be NULL; callers must check
+}
+
+#ifdef _UNICODE
+#else // !_UNICODE
+
+EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage) // UTF-8 -> wide -> Codepage. Returns a heap string (release with ExeHeadTStrFree) or NULL on any failure
+{
+  int cchW = MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,NULL,0); // Sizing call: no output buffer, returns the required count
+  if (!cchW) return NULL;
+  WCHAR *bufWStr = (WCHAR*) ExeHeadWStrAlloc(cchW);
+  if (!bufWStr) return NULL;
+  EXEHEADTCHAR_T *outstr = NULL;
+  if (MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,bufWStr,cchW)) // NOTE(review): with source length -1 the Win32 API includes the terminating NUL in cchW; confirm the POSIX replacement does the same
+  {
+    int cbA = WideCharToMultiByte(Codepage,0,bufWStr,cchW,NULL,0,NULL,NULL); // Sizing call for the target codepage
+    if (cbA && (outstr = ExeHeadTStrAlloc(cbA)))
+    {
+      if (!WideCharToMultiByte(Codepage,0,bufWStr,cchW,outstr,cbA,NULL,NULL))
+      {
+        ExeHeadTStrFree(outstr); // Second pass failed: drop the output buffer and report failure
+        outstr = NULL;
+      }
+    }
+  }
+  ExeHeadWStrFree(bufWStr); // The intermediate wide buffer is released on every path that allocated it
+  return outstr;
+}
+
+#endif // ?_UNICODE
+
+
+bool IsUTF8BOM(FILE*fstrm) // Returns true (leaving the three bytes consumed) when the stream starts with EF BB BF
+{
+  // ungetc is only guaranteed to support 1 pushback, 
+  // let's hope no ASCII file starts with 0xEF and is not a BOM!
+  const int c = fgetc(fstrm);
+  if (EOF == c) return false; // Nothing could be read, so there is nothing to push back
+  if (0xef == c)
+  {
+    const int c2 = fgetc(fstrm);
+    if (0xbb == c2)
+    {
+      const int c3 = fgetc(fstrm);
+      if (0xbf == c3) return true; // Full BOM matched; it stays consumed
+      ungetc(c3,fstrm); // Mismatch: bytes are pushed back in reverse order of reading
+    }
+    ungetc(c2,fstrm); // This and the next pushback rely on the C library accepting more than one
+  }
+  ungetc(c,fstrm);
+  return false;
+}
diff --git a/Source/utf.h b/Source/utf.h
new file mode 100644
index 00000000..96a227db
--- /dev/null
+++ b/Source/utf.h
@@ -0,0 +1,43 @@
+/*
+ * utf.h
+ * 
+ * This file is a part of NSIS.
+ * 
+ * Copyright (C) 2011 Anders Kjersem
+ * 
+ * Licensed under the zlib/libpng license (the "License");
+ * you may not use this file except in compliance with the License.
+ * 
+ * Licence details can be found in the file COPYING.
+ * 
+ * This software is provided 'as-is', without any express or implied
+ * warranty.
+ *
+ */
+#pragma once // This header defines an inline function, so including it twice in one translation unit must be prevented
+#include "Platform.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+typedef unsigned short EXEHEADWCHAR_T; // Character type for wide exehead strings
+
+
+#ifdef _UNICODE
+typedef EXEHEADWCHAR_T EXEHEADTCHAR_T;
+
+#else // !_UNICODE
+typedef char EXEHEADTCHAR_T;
+// The allocation helpers and the UTF-8 converter below are only declared for ANSI builds.
+#define ExeHeadTStrFree free
+inline EXEHEADTCHAR_T* ExeHeadTStrAlloc(UINT cb) {return (EXEHEADTCHAR_T*) malloc(cb);} // cb is a size in bytes; may return NULL
+extern EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage); // Returns NULL on failure; release the result with ExeHeadTStrFree
+
+#endif // ?_UNICODE
+
+
+/**
+ * Tries to peek at the first few bytes in the stream to determine if it is a UTF-8 BOM.
+ * If it is a UTF-8 BOM it will eat the BOM, 
+ * if it is not it tries its best to restore the data.
+ */
+extern bool IsUTF8BOM(FILE*fstrm);
diff --git a/Source/util.cpp b/Source/util.cpp
index dc3d488d..e2be0010 100644
--- a/Source/util.cpp
+++ b/Source/util.cpp
@@ -200,8 +200,15 @@ inline size_t nsis_iconv_adaptor
 }
 
 void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
-  if (code_page == CP_ACP)
+  switch(code_page)
+  {
+  case CP_ACP: // CP_ACP is replaced by 1252 and then formatted as "CP1252" below
     code_page = 1252;
+    break;
+  case CP_UTF8: // UTF-8 is written by name instead of the "CP<number>" form
+    _sntprintf(buf, len, _T("UTF-8"));
+    return;
+  }
 
   _sntprintf(buf, len, _T("CP%d"), code_page);
 }
@@ -209,7 +216,7 @@ void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
 int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
     int cchWideChar, LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
     LPBOOL lpUsedDefaultChar) {
-  static char buffer[4096];
+  static char buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?
 
   char cp[128];
   create_code_page_string(cp, sizeof(cp), CodePage);
@@ -245,7 +252,7 @@ int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
 
 int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
     int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) {
-  static WCHAR buffer[4096];
+  static WCHAR buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?
 
   char cp[128];
   create_code_page_string(cp, sizeof(cp), CodePage);
@@ -900,3 +907,10 @@ bool GetDLLVersion(const tstring& filepath, DWORD& high, DWORD& low)
 
   return false;
 }
+
+bool Platform_SupportsUTF8Conversion() // True when IsValidCodePage accepts CP_UTF8; probed on first call, then served from the cache
+{
+  static signed char cached = -1; // -1 = not probed yet. Must be signed: an unsigned char would hold 255, never equal -1, and the probe would be skipped forever
+  if (-1 == cached) cached = !!IsValidCodePage(CP_UTF8); // Stores 0 or 1
+  return cached != 0;
+}
\ No newline at end of file
diff --git a/Source/util.h b/Source/util.h
index 022d563b..f2f3be73 100644
--- a/Source/util.h
+++ b/Source/util.h
@@ -185,4 +185,7 @@ RM_DEFINE_FREEFUNC(my_convert_free);
 #  define PATH_CONVERT(x)
 #endif
 
+// Platform detection
+bool Platform_SupportsUTF8Conversion();
+
 #endif //_UTIL_H_