Basic UTF-8 support in ansi build so it can read UTF-8 .nlf files and LangStrings

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6196 212acab6-be3b-0410-9dea-997c60f758d6
2011-12-05 23:44:26 +00:00 · 2011-12-05 23:44:26 +00:00 · ef8a83bd41
commit ef8a83bd41
parent 359ad0a055
10 changed files with 227 additions and 5 deletions
--- a/Source/Platform.h
+++ b/Source/Platform.h
@ -448,6 +448,9 @@ typedef DWORDLONG ULONGLONG,*PULONGLONG;
 #ifndef CP_ACP
 #  define CP_ACP 0
 #endif
 #ifndef CP_UTF8
 #  define CP_UTF8 65001
 #endif
 #ifndef COLOR_BTNFACE
 #  define COLOR_BTNFACE 15
--- a/Source/SConscript
+++ b/Source/SConscript
@ -24,6 +24,7 @@ makensis_files = Split("""
 	strlist.cpp
 	tokens.cpp
 	tstring.cpp
 	utf.cpp
 	util.cpp
 	validateunicode.cpp
 	winchar.cpp
--- a/Source/build.cpp
+++ b/Source/build.cpp
@ -121,6 +121,9 @@ CEXEBuild::CEXEBuild() :
  multiple_entries_instruction=0;
  build_include_depth=0;
 #ifndef _UNICODE
  build_include_isutf8=false;
 #endif
  has_called_write_output=false;
--- a/Source/build.h
+++ b/Source/build.h
@ -328,6 +328,9 @@ class CEXEBuild {
     * this will return a PS_WARNING.
     */
    int SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL unicode);
 #ifndef _UNICODE
    int SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8);
 #endif
    /**
     * Sets the user string to the specific NLF_STRINGS id.
@ -424,6 +427,9 @@ class CEXEBuild {
    TCHAR build_output_filename[1024];
    int build_include_depth;
 #ifndef _UNICODE
    bool build_include_isutf8; // UTF-8 LangString in .nsh hack for ANSI builds
 #endif
    // Added by ramon 6 jun 2003
 #ifdef NSIS_SUPPORT_VERSION_INFO
--- a/Source/lang.cpp
+++ b/Source/lang.cpp
@ -26,6 +26,7 @@
 #include "exehead/resource.h"
 #include <nsis-version.h>
 #include "tstring.h"
 #include "utf.h"
 using namespace std;
@ -492,6 +493,9 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un
  int sn;
  if (_tcsclen(str) > NSIS_MAX_STRLEN-1)
    warning_fl("LangString \"%s\" longer than NSIS_MAX_STRLEN!", name);
  int pos = build_langstrings.get(name, &sn);
  if (pos < 0)
    pos = build_langstrings.add(name, &sn);
@ -502,6 +506,21 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un
  return PS_OK;
 }
 #ifndef _UNICODE
 // Stores a LangString whose text is supplied as UTF-8.
 //
 // The text is converted with UTF8ToExeHeadTStr using the code page of the
 // language table for lang and then handed to SetLangString.
 //
 // @return PS_ERROR if there is no language table, if the platform cannot
 // convert UTF-8 or if the conversion fails; otherwise the result of
 // SetLangString.
 int CEXEBuild::SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8)
 {
  LanguageTable * const langTable = GetLangTable(lang);
  if (!langTable)
    return PS_ERROR;
  if (!Platform_SupportsUTF8Conversion())
    return PS_ERROR;

  EXEHEADTCHAR_T * const converted = UTF8ToExeHeadTStr(stru8, langTable->nlf.m_uCodePage);
  if (!converted)
    return PS_ERROR;

  // The last argument tells SetLangString whether the exehead character type is wider than one byte
  const bool wideExeHead = sizeof(EXEHEADTCHAR_T) > 1;
  const int result = SetLangString(name, lang, converted, wideExeHead);
  ExeHeadTStrFree(converted);
  return result;
 }
 #endif
 // Sets the user string to the specific NLF_STRINGS id.
 //
 // @return If the id is invalid or the string is not valid, it will return a
@ -925,6 +944,11 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
    return 0;
  }
 #ifndef _UNICODE
  char fencoding = 0; // 0 = ansi, 8 = utf-8 (16/17 for uft-16le/be not supported)
  if (IsUTF8BOM(f)) fencoding = 8;
 #endif
  // Check header
  TCHAR buf[NSIS_MAX_STRLEN];
  buf[0] = SkipComments(f);
@ -1096,8 +1120,31 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
    buf[0] = SkipComments(f);
    _fgetts(buf+1, NSIS_MAX_STRLEN, f);
 #ifndef _UNICODE
    if (8 == fencoding)
    {
      if (!Platform_SupportsUTF8Conversion()) {
        ERROR_MSG(_T("Error: UTF-8 language files not supported on this OS!\n"));
        return 0;
      }
      EXEHEADTCHAR_T *bufConv = UTF8ToExeHeadTStr(buf, nlf->m_uCodePage);
      if (!bufConv) {
        ERROR_MSG(_T("Error: Invalid UTF-8? (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
        return 0;
      }
      else {
        UINT cch = _tcslen(bufConv);
        _tcsnccpy(buf, bufConv, NSIS_MAX_STRLEN);
        if (cch >= NSIS_MAX_STRLEN-1) {
          buf[NSIS_MAX_STRLEN-1] = _T('\0'); // Make sure we fail the "String too long" check
        }
      }
      ExeHeadTStrFree(bufConv);
    }
 #endif
    if (_tcslen(buf) == NSIS_MAX_STRLEN-1) {
-      ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")"), i, NLFStrings[i].szLangStringName);
+      ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
      return 0;
    }
    temp=_tcslen(buf);
--- a/Source/script.cpp
+++ b/Source/script.cpp
@ -34,6 +34,7 @@
 #include <cassert> // for assert(3)
 #include <time.h>
 #include "tstring.h"
 #include "utf.h"
 #include <algorithm>
 #include "boost/scoped_ptr.hpp"
@ -813,6 +814,10 @@ int CEXEBuild::includeScript(TCHAR *f)
    return PS_ERROR;
  }
  build_include_depth++;
 #ifndef _UNICODE
  const bool org_build_include_isutf8 = build_include_isutf8;
  build_include_isutf8 = IsUTF8BOM(incfp);
 #endif
  int last_linecnt=linecnt;
  linecnt=0;
@ -837,6 +842,10 @@ int CEXEBuild::includeScript(TCHAR *f)
  restore_timestamp_predefine(oldtimestamp);
 #endif
 #ifndef _UNICODE
  build_include_isutf8 = org_build_include_isutf8;
 #endif
  int errlinecnt=linecnt;
  linecnt=last_linecnt;
@ -1712,13 +1721,21 @@ int CEXEBuild::doCommand(int which_token, LineParser &line)
      TCHAR *name = line.gettoken_str(1);
      LANGID lang = line.gettoken_int(2);
      TCHAR *str = line.gettoken_str(3);
-      int ret = SetLangString(name, lang, str, curfile_unicode);
+      int ret;
 #ifndef _UNICODE
        if (build_include_isutf8)
          ret = SetUTF8LangString(name, lang, str);
        else
 #endif
          ret = SetLangString(name, lang, str, curfile_unicode);
      if (ret == PS_WARNING)
        warning_fl(_T("LangString \"%s\" set multiple times for %d, wasting space"), name, lang);
      else if (ret == PS_ERROR) {
        ERROR_MSG(_T("Error: can't set LangString \"%s\"!\n"), name);
        return PS_ERROR;
      }
      // BUGBUG: Does not display UTF-8 properly.
      SCRIPT_MSG(_T("LangString: \"%s\" %d \"%s\"\n"), name, lang, str);
    }
    return PS_OK;
--- a/Source/utf.cpp
+++ b/Source/utf.cpp
@ -0,0 +1,85 @@
 /*
 * utf.cpp
 * 
 * This file is a part of NSIS.
 * 
 * Copyright (C) 2011 Anders Kjersem
 * 
 * Licensed under the zlib/libpng license (the "License");
 * you may not use this file except in compliance with the License.
 * 
 * Licence details can be found in the file COPYING.
 * 
 * This software is provided 'as-is', without any express or implied
 * warranty.
 *
 */
 #include "utf.h"
 // BUGBUG: We might want to use MB_ERR_INVALID_CHARS but it is not supported
 // on < WinXP or in our current POSIX implementation.
 static const int UTF8MBTWCFLAGS  = 0;
 #define ExeHeadWStrFree free
 // Allocates an uninitialized buffer with room for cch EXEHEADWCHAR_T
 // elements. Returns NULL if malloc fails; release with ExeHeadWStrFree.
 static EXEHEADWCHAR_T* ExeHeadWStrAlloc(UINT cch)
 {
  const size_t cbNeeded = cch*sizeof(EXEHEADWCHAR_T);
  EXEHEADWCHAR_T* const buf = (EXEHEADWCHAR_T*) malloc(cbNeeded);
 #if 0
  // TODO: We should add POSIX versions of  G/SetLastError
  // if we want to tell _why_ UTF8ToExeHeadTStr failed...
  if (!buf) SetLastError(ERROR_OUTOFMEMORY);
 #endif
  return buf;
 }
 #ifdef _UNICODE
 #else // !_UNICODE
 // Converts the NUL terminated UTF-8 string StrU8 to the exehead string type
 // (ANSI build: multi-byte text in Codepage) by going through a temporary
 // wide string.
 //
 // @return A buffer the caller must release with ExeHeadTStrFree, or NULL if
 // a conversion step or an allocation fails.
 EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage)
 {
  // First pass only measures; -1 means the input is NUL terminated
  const int wideLen = MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,NULL,0);
  if (0 == wideLen) return NULL;

  WCHAR *wide = (WCHAR*) ExeHeadWStrAlloc(wideLen);
  if (NULL == wide) return NULL;

  EXEHEADTCHAR_T *result = NULL;
  const bool decoded = 0 != MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,wide,wideLen);
  if (decoded)
  {
    // Same measure-then-convert dance for the wide -> code page step
    const int narrowLen = WideCharToMultiByte(Codepage,0,wide,wideLen,NULL,0,NULL,NULL);
    if (0 != narrowLen)
    {
      result = ExeHeadTStrAlloc(narrowLen);
      if (NULL != result && 0 == WideCharToMultiByte(Codepage,0,wide,wideLen,result,narrowLen,NULL,NULL))
      {
        // Conversion failed, do not hand back a half filled buffer
        ExeHeadTStrFree(result);
        result = NULL;
      }
    }
  }
  ExeHeadWStrFree(wide);
  return result;
 }
 #endif // ?_UNICODE
 /**
  * Peeks at the next bytes of the stream to see whether they are the UTF-8
  * BOM (0xEF 0xBB 0xBF).
  *
  * @param fstrm Stream to inspect.
  * @return true if the complete BOM was found (it is then consumed),
  *         false otherwise (the bytes that were read are put back).
  */
 bool IsUTF8BOM(FILE*fstrm)
 {
  static const int bom[3] = { 0xef, 0xbb, 0xbf };
  // ungetc is only guaranteed to support 1 pushback, so remember where we
  // started and prefer seeking back when more than one byte must be restored.
  fpos_t startpos;
  const bool havepos = 0 == fgetpos(fstrm, &startpos);
  int got[3];
  int matched = 0;
  for (; matched < 3; ++matched)
  {
    got[matched] = fgetc(fstrm);
    if (got[matched] != bom[matched]) break;
  }
  if (3 == matched) return true; // Complete BOM, leave it consumed
  if (0 == matched)
  {
    // At most one byte was read, a single pushback is always safe
    if (EOF != got[0]) ungetc(got[0], fstrm);
    return false;
  }
  if (havepos && 0 == fsetpos(fstrm, &startpos)) return false;
  // Could not seek back; fall back to several pushbacks and hope the
  // C library supports it (the byte read last has to go back first).
  for (int i = matched; i >= 0; --i)
  {
    if (EOF != got[i]) ungetc(got[i], fstrm);
  }
  return false;
 }
--- a/Source/utf.h
+++ b/Source/utf.h
@ -0,0 +1,43 @@
 /*
 * utf.h
 * 
 * This file is a part of NSIS.
 * 
 * Copyright (C) 2011 Anders Kjersem
 * 
 * Licensed under the zlib/libpng license (the "License");
 * you may not use this file except in compliance with the License.
 * 
 * Licence details can be found in the file COPYING.
 * 
 * This software is provided 'as-is', without any express or implied
 * warranty.
 *
 */
 #include "Platform.h"
 #include <stdlib.h>
 #include <stdio.h>
 typedef unsigned short EXEHEADWCHAR_T;
 #ifdef _UNICODE
 typedef EXEHEADWCHAR_T EXEHEADTCHAR_T;
 #else // !_UNICODE
 typedef char EXEHEADTCHAR_T;
 #define ExeHeadTStrFree free
 // Allocates cb bytes with malloc for an exehead string; the result may be
 // NULL and is released with ExeHeadTStrFree (which is free).
 inline EXEHEADTCHAR_T* ExeHeadTStrAlloc(UINT cb) {return (EXEHEADTCHAR_T*) malloc(cb);}
 extern EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage);
 #endif // ?_UNICODE
 /**
 * Tries to peek at the first few bytes in the stream to determine if it is a UTF-8 BOM.
 * If it is a UTF-8 BOM it will eat the BOM, 
 * if it is not it tries its best to restore the data.
 */
 extern bool IsUTF8BOM(FILE*fstrm);
--- a/Source/util.cpp
+++ b/Source/util.cpp
@ -200,8 +200,15 @@ inline size_t nsis_iconv_adaptor
 }
 // Writes a textual name for code_page into buf (len is passed to _sntprintf
 // as the size limit): CP_ACP is treated as 1252, CP_UTF8 produces "UTF-8"
 // and every other value is formatted as "CP<number>".
 void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
-  if (code_page == CP_ACP)
+  switch(code_page)
  {
  case CP_ACP:
    code_page = 1252;
    break;
  case CP_UTF8:
    _sntprintf(buf, len, _T("UTF-8"));
    return;
  }
  _sntprintf(buf, len, _T("CP%d"), code_page);
 }
@ -209,7 +216,7 @@ void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
 int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
    int cchWideChar, LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
    LPBOOL lpUsedDefaultChar) {
-  static char buffer[4096];
+  static char buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?
  char cp[128];
  create_code_page_string(cp, sizeof(cp), CodePage);
@ -245,7 +252,7 @@ int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
 int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
    int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) {
-  static WCHAR buffer[4096];
+  static WCHAR buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?
  char cp[128];
  create_code_page_string(cp, sizeof(cp), CodePage);
@ -900,3 +907,10 @@ bool GetDLLVersion(const tstring& filepath, DWORD& high, DWORD& low)
  return false;
 }
 /**
  * Tells whether UTF-8 conversion is available, i.e. whether
  * IsValidCodePage(CP_UTF8) reports the code page as valid. The platform is
  * probed on the first call only and the answer is cached.
  *
  * @return true if CP_UTF8 is a valid code page, false otherwise.
  */
 bool Platform_SupportsUTF8Conversion()
 {
  // 0 = not probed yet, 1 = not supported, 2 = supported.
  // A -1 sentinel cannot be used with an unsigned char: it is stored as 255
  // and promoted to int 255 in comparisons, so it never equals -1 and the
  // probe would never run.
  static unsigned char cached = 0;
  if (0 == cached) cached = IsValidCodePage(CP_UTF8) ? 2 : 1;
  return 2 == cached;
 }
--- a/Source/util.h
+++ b/Source/util.h
@ -185,4 +185,7 @@ RM_DEFINE_FREEFUNC(my_convert_free);
 #  define PATH_CONVERT(x)
 #endif
 // Platform detection
 bool Platform_SupportsUTF8Conversion();
 #endif //_UTIL_H_