Basic UTF-8 support in ansi build so it can read UTF-8 .nlf files and LangStrings

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6196 212acab6-be3b-0410-9dea-997c60f758d6
2011-12-05 23:44:26 +00:00 · 2011-12-05 23:44:26 +00:00 · ef8a83bd41
commit ef8a83bd41
parent 359ad0a055
10 changed files with 227 additions and 5 deletions
--- a/Source/Platform.h
+++ b/Source/Platform.h
@ -448,6 +448,9 @@ typedef DWORDLONG ULONGLONG,*PULONGLONG;
 #ifndef CP_ACP
 #  define CP_ACP 0
 #endif
+#ifndef CP_UTF8
+#  define CP_UTF8 65001
+#endif

 #ifndef COLOR_BTNFACE
 #  define COLOR_BTNFACE 15
--- a/Source/SConscript
+++ b/Source/SConscript
@ -24,6 +24,7 @@ makensis_files = Split("""
 	strlist.cpp
 	tokens.cpp
 	tstring.cpp
+	utf.cpp
 	util.cpp
 	validateunicode.cpp
 	winchar.cpp
--- a/Source/build.cpp
+++ b/Source/build.cpp
@ -121,6 +121,9 @@ CEXEBuild::CEXEBuild() :
  multiple_entries_instruction=0;

  build_include_depth=0;
+#ifndef _UNICODE
+  build_include_isutf8=false;
+#endif

  has_called_write_output=false;

--- a/Source/build.h
+++ b/Source/build.h
@ -328,6 +328,9 @@ class CEXEBuild {
     * this will return a PS_WARNING.
     */
    int SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL unicode);
+#ifndef _UNICODE
+    int SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8);
+#endif

    /**
     * Sets the user string to the specific NLF_STRINGS id.
@ -424,6 +427,9 @@ class CEXEBuild {
    TCHAR build_output_filename[1024];

    int build_include_depth;
+#ifndef _UNICODE
+    bool build_include_isutf8; // UTF-8 LangString in .nsh hack for ANSI builds
+#endif

    // Added by ramon 6 jun 2003
 #ifdef NSIS_SUPPORT_VERSION_INFO
--- a/Source/lang.cpp
+++ b/Source/lang.cpp
@ -26,6 +26,7 @@
 #include "exehead/resource.h"
 #include <nsis-version.h>
 #include "tstring.h"
+#include "utf.h"

 using namespace std;

@ -492,6 +493,9 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un

  int sn;

+  if (_tcsclen(str) > NSIS_MAX_STRLEN-1)
+    warning_fl("LangString \"%s\" longer than NSIS_MAX_STRLEN!", name);
+
  int pos = build_langstrings.get(name, &sn);
  if (pos < 0)
    pos = build_langstrings.add(name, &sn);
@ -502,6 +506,21 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un
  return PS_OK;
 }

+#ifndef _UNICODE
+/**
+ * ANSI-build helper: stores a LangString whose text is encoded as UTF-8.
+ *
+ * The text is converted from UTF-8 to the code page recorded in the
+ * language table for `lang` and then handed to SetLangString().
+ *
+ * @param name  Name of the LangString.
+ * @param lang  Language the string belongs to.
+ * @param stru8 Zero terminated UTF-8 text.
+ * @return PS_ERROR when the language table cannot be obtained, when the
+ *         platform cannot convert UTF-8 or when the conversion itself fails;
+ *         otherwise whatever SetLangString() returns.
+ */
+int CEXEBuild::SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8)
+{
+  LanguageTable *langTable = GetLangTable(lang);
+  // Short-circuit keeps the original order: table lookup first, then the platform probe.
+  if (!langTable || !Platform_SupportsUTF8Conversion())
+    return PS_ERROR;
+
+  EXEHEADTCHAR_T *converted = UTF8ToExeHeadTStr(stru8, langTable->nlf.m_uCodePage);
+  if (converted)
+  {
+    // The "unicode" flag is derived from the exehead character width.
+    const int result = SetLangString(name, lang, converted, sizeof(EXEHEADTCHAR_T) > 1);
+    ExeHeadTStrFree(converted); // The converted copy is released here after the call.
+    return result;
+  }
+  return PS_ERROR;
+}
+#endif
+
 // Sets the user string to the specific NLF_STRINGS id.
 //
 // @return If the id is invalid or the string is not valid, it will return a
@ -925,6 +944,11 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
    return 0;
  }

+#ifndef _UNICODE
+  char fencoding = 0; // 0 = ansi, 8 = utf-8 (16/17 for utf-16le/be not supported)
+  if (IsUTF8BOM(f)) fencoding = 8;
+#endif
+
  // Check header
  TCHAR buf[NSIS_MAX_STRLEN];
  buf[0] = SkipComments(f);
@ -1096,8 +1120,31 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
    buf[0] = SkipComments(f);

    _fgetts(buf+1, NSIS_MAX_STRLEN, f);
+#ifndef _UNICODE
+    if (8 == fencoding)
+    {
+      if (!Platform_SupportsUTF8Conversion()) {
+        ERROR_MSG(_T("Error: UTF-8 language files not supported on this OS!\n"));
+        return 0;
+      }
+      EXEHEADTCHAR_T *bufConv = UTF8ToExeHeadTStr(buf, nlf->m_uCodePage);
+      if (!bufConv) {
+        ERROR_MSG(_T("Error: Invalid UTF-8? (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
+        return 0;
+      }
+      else {
+        UINT cch = _tcslen(bufConv);
+        _tcsnccpy(buf, bufConv, NSIS_MAX_STRLEN);
+        if (cch >= NSIS_MAX_STRLEN-1) {
+          buf[NSIS_MAX_STRLEN-1] = _T('\0'); // Make sure we fail the "String too long" check
+        }
+      }
+      ExeHeadTStrFree(bufConv);
+    }
+#endif
+
    if (_tcslen(buf) == NSIS_MAX_STRLEN-1) {
-      ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")"), i, NLFStrings[i].szLangStringName);
+      ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
      return 0;
    }
    temp=_tcslen(buf);
--- a/Source/script.cpp
+++ b/Source/script.cpp
@ -34,6 +34,7 @@
 #include <cassert> // for assert(3)
 #include <time.h>
 #include "tstring.h"
+#include "utf.h"
 #include <algorithm>
 #include "boost/scoped_ptr.hpp"

@ -813,6 +814,10 @@ int CEXEBuild::includeScript(TCHAR *f)
    return PS_ERROR;
  }
  build_include_depth++;
+#ifndef _UNICODE
+  const bool org_build_include_isutf8 = build_include_isutf8;
+  build_include_isutf8 = IsUTF8BOM(incfp);
+#endif

  int last_linecnt=linecnt;
  linecnt=0;
@ -837,6 +842,10 @@ int CEXEBuild::includeScript(TCHAR *f)
  restore_timestamp_predefine(oldtimestamp);
 #endif

+#ifndef _UNICODE
+  build_include_isutf8 = org_build_include_isutf8;
+#endif
+
  int errlinecnt=linecnt;

  linecnt=last_linecnt;
@ -1712,13 +1721,21 @@ int CEXEBuild::doCommand(int which_token, LineParser &line)
      TCHAR *name = line.gettoken_str(1);
      LANGID lang = line.gettoken_int(2);
      TCHAR *str = line.gettoken_str(3);
-      int ret = SetLangString(name, lang, str, curfile_unicode);
+      int ret;
+#ifndef _UNICODE
+        if (build_include_isutf8)
+          ret = SetUTF8LangString(name, lang, str);
+        else
+#endif
+          ret = SetLangString(name, lang, str, curfile_unicode);
+
      if (ret == PS_WARNING)
        warning_fl(_T("LangString \"%s\" set multiple times for %d, wasting space"), name, lang);
      else if (ret == PS_ERROR) {
        ERROR_MSG(_T("Error: can't set LangString \"%s\"!\n"), name);
        return PS_ERROR;
      }
+      // BUGBUG: Does not display UTF-8 properly.
      SCRIPT_MSG(_T("LangString: \"%s\" %d \"%s\"\n"), name, lang, str);
    }
    return PS_OK;
--- a/Source/utf.cpp
+++ b/Source/utf.cpp
@ -0,0 +1,85 @@
+/*
+ * utf.cpp
+ * 
+ * This file is a part of NSIS.
+ * 
+ * Copyright (C) 2011 Anders Kjersem
+ * 
+ * Licensed under the zlib/libpng license (the "License");
+ * you may not use this file except in compliance with the License.
+ * 
+ * Licence details can be found in the file COPYING.
+ * 
+ * This software is provided 'as-is', without any express or implied
+ * warranty.
+ *
+ */
+
+#include "utf.h"
+
+// BUGBUG: We might want to use MB_ERR_INVALID_CHARS but it is not supported
+// on < WinXP or in our current POSIX implementation.
+static const int UTF8MBTWCFLAGS  = 0;
+
+
+#define ExeHeadWStrFree free
+/**
+ * Allocates an uninitialized buffer with room for cch EXEHEADWCHAR_T
+ * characters. The memory comes from malloc and is released with
+ * ExeHeadWStrFree (which is defined as free).
+ *
+ * @param cch Number of characters (not bytes) to make room for.
+ * @return The new buffer, or NULL if malloc failed.
+ */
+static EXEHEADWCHAR_T* ExeHeadWStrAlloc(UINT cch) 
+{
+  void *mem = malloc(cch*sizeof(EXEHEADWCHAR_T));
+#if 0
+  // TODO: We should add POSIX versions of  G/SetLastError
+  // if we want to tell _why_ UTF8ToExeHeadTStr failed...
+  if (!mem) SetLastError(ERROR_OUTOFMEMORY);
+#endif
+  return (EXEHEADWCHAR_T*) mem;
+}
+
+#ifdef _UNICODE
+#else // !_UNICODE
+
+/**
+ * Converts a zero terminated UTF-8 string to the given code page by going
+ * through an intermediate wide-character buffer.
+ *
+ * @param StrU8    Zero terminated UTF-8 input.
+ * @param Codepage Code page of the returned string.
+ * @return A buffer from ExeHeadTStrAlloc that the caller must release with
+ *         ExeHeadTStrFree, or NULL if any allocation or conversion step fails.
+ */
+EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage) 
+{
+  // First pass only measures; the input length of -1 means "zero terminated".
+  // NOTE(review): on Win32 the reported size then includes the terminator - confirm for the POSIX implementation.
+  const int wideLen = MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,NULL,0);
+  if (0 == wideLen) return NULL;
+
+  WCHAR *wide = (WCHAR*) ExeHeadWStrAlloc(wideLen);
+  if (NULL == wide) return NULL;
+
+  EXEHEADTCHAR_T *result = NULL;
+  if (0 != MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,wide,wideLen))
+  {
+    // Same measure-then-convert scheme for the wide -> code page step.
+    const int outBytes = WideCharToMultiByte(Codepage,0,wide,wideLen,NULL,0,NULL,NULL);
+    if (0 != outBytes)
+    {
+      result = ExeHeadTStrAlloc(outBytes);
+      if (NULL != result && 0 == WideCharToMultiByte(Codepage,0,wide,wideLen,result,outBytes,NULL,NULL))
+      {
+        // The conversion failed after the allocation succeeded; do not leak the buffer.
+        ExeHeadTStrFree(result);
+        result = NULL;
+      }
+    }
+  }
+
+  ExeHeadWStrFree(wide); // The intermediate buffer is released on every path.
+  return result;
+}
+
+#endif // ?_UNICODE
+
+
+/**
+ * Checks whether the next three bytes of the stream are the UTF-8 BOM
+ * (0xEF 0xBB 0xBF).
+ *
+ * @param fstrm Stream to inspect.
+ * @return true if a BOM was found, in which case it has been consumed.
+ *         false otherwise, in which case the bytes that were read are pushed
+ *         back with ungetc in reverse order.
+ */
+bool IsUTF8BOM(FILE*fstrm) 
+{
+  // ungetc is only guaranteed to support one pushback, so restoring two or
+  // three bytes relies on the C library being generous. In practice this only
+  // matters for a non-BOM file that starts with 0xEF.
+  const int b0 = fgetc(fstrm);
+  if (EOF == b0) return false;
+  if (0xef != b0)
+  {
+    ungetc(b0,fstrm);
+    return false;
+  }
+
+  const int b1 = fgetc(fstrm);
+  if (0xbb == b1)
+  {
+    const int b2 = fgetc(fstrm);
+    if (0xbf == b2) return true; // Full BOM seen, leave it consumed.
+    ungetc(b2,fstrm);
+  }
+  ungetc(b1,fstrm);
+  ungetc(b0,fstrm);
+  return false;
+}
--- a/Source/utf.h
+++ b/Source/utf.h
@ -0,0 +1,43 @@
+/*
+ * utf.h
+ * 
+ * This file is a part of NSIS.
+ * 
+ * Copyright (C) 2011 Anders Kjersem
+ * 
+ * Licensed under the zlib/libpng license (the "License");
+ * you may not use this file except in compliance with the License.
+ * 
+ * Licence details can be found in the file COPYING.
+ * 
+ * This software is provided 'as-is', without any express or implied
+ * warranty.
+ *
+ */
+
+// Include guard: this header defines typedefs and an inline function, so it
+// must be safe to include more than once per translation unit.
+#ifndef NSIS_UTF_H
+#define NSIS_UTF_H
+
+#include "Platform.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+// Wide character type used for exehead strings.
+typedef unsigned short EXEHEADWCHAR_T;
+
+
+// EXEHEADTCHAR_T is the wide type in Unicode builds and plain char otherwise.
+#ifdef _UNICODE
+typedef EXEHEADWCHAR_T EXEHEADTCHAR_T;
+
+#else // !_UNICODE
+typedef char EXEHEADTCHAR_T;
+
+// Buffers handed out by ExeHeadTStrAlloc come from malloc, so free releases them.
+#define ExeHeadTStrFree free
+
+/**
+ * Allocates an uninitialized buffer of cb bytes.
+ * Returns NULL if malloc fails; release with ExeHeadTStrFree.
+ */
+inline EXEHEADTCHAR_T* ExeHeadTStrAlloc(UINT cb) {return (EXEHEADTCHAR_T*) malloc(cb);}
+
+/**
+ * Converts a zero terminated UTF-8 string to the given code page.
+ * Returns a buffer that must be released with ExeHeadTStrFree, 
+ * or NULL if the conversion fails.
+ */
+extern EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage);
+
+#endif // ?_UNICODE
+
+
+/**
+ * Tries to peek at the first few bytes in the stream to determine if it is a UTF-8 BOM.
+ * If it is a UTF-8 BOM it will eat the BOM, 
+ * if it is not it tries its best to restore the data.
+ */
+extern bool IsUTF8BOM(FILE*fstrm);
+
+#endif // NSIS_UTF_H
--- a/Source/util.cpp
+++ b/Source/util.cpp
@ -200,8 +200,15 @@ inline size_t nsis_iconv_adaptor
 }

 void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
-  if (code_page == CP_ACP)
+  switch(code_page)
+  {
+  case CP_ACP:
    code_page = 1252;
+    break;
+  case CP_UTF8:
+    _sntprintf(buf, len, _T("UTF-8"));
+    return;
+  }

  _sntprintf(buf, len, _T("CP%d"), code_page);
 }
@ -209,7 +216,7 @@ void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
 int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
    int cchWideChar, LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
    LPBOOL lpUsedDefaultChar) {
-  static char buffer[4096];
+  static char buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?

  char cp[128];
  create_code_page_string(cp, sizeof(cp), CodePage);
@ -245,7 +252,7 @@ int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,

 int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
    int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) {
-  static WCHAR buffer[4096];
+  static WCHAR buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?

  char cp[128];
  create_code_page_string(cp, sizeof(cp), CodePage);
@ -900,3 +907,10 @@ bool GetDLLVersion(const tstring& filepath, DWORD& high, DWORD& low)

  return false;
 }
+
+/**
+ * Reports whether CP_UTF8 is a valid code page on this platform.
+ * IsValidCodePage is queried on the first call only; the answer is cached
+ * in a function-local static for later calls.
+ *
+ * @return true if IsValidCodePage(CP_UTF8) reported success.
+ */
+bool Platform_SupportsUTF8Conversion()
+{
+  // Tri-state cache: -1 = not probed yet, 0 = unsupported, 1 = supported.
+  // The type has to be signed: an unsigned char initialized with -1 holds 255,
+  // which is promoted to int 255 in the comparison below and never equals -1.
+  // The probe would then never run and the function would always return true.
+  static int cached = -1;
+  if (-1 == cached) cached = IsValidCodePage(CP_UTF8) ? 1 : 0;
+  return cached != 0;
+}
--- a/Source/util.h
+++ b/Source/util.h
@ -185,4 +185,7 @@ RM_DEFINE_FREEFUNC(my_convert_free);
 #  define PATH_CONVERT(x)
 #endif

+// Platform detection
+bool Platform_SupportsUTF8Conversion();
+
 #endif //_UTIL_H_