From ef8a83bd41d0c801c3589b461ad487a789166ed1 Mon Sep 17 00:00:00 2001 From: anders_k Date: Mon, 5 Dec 2011 23:44:26 +0000 Subject: [PATCH] Basic UTF-8 support in ansi build so it can read UTF-8 .nlf files and LangStrings git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6196 212acab6-be3b-0410-9dea-997c60f758d6 --- Source/Platform.h | 3 ++ Source/SConscript | 1 + Source/build.cpp | 3 ++ Source/build.h | 6 ++++ Source/lang.cpp | 49 ++++++++++++++++++++++++++- Source/script.cpp | 19 ++++++++++- Source/utf.cpp | 85 +++++++++++++++++++++++++++++++++++++++++++++++ Source/utf.h | 43 ++++++++++++++++++++++++ Source/util.cpp | 20 +++++++++-- Source/util.h | 3 ++ 10 files changed, 227 insertions(+), 5 deletions(-) create mode 100644 Source/utf.cpp create mode 100644 Source/utf.h diff --git a/Source/Platform.h b/Source/Platform.h index 2f9371c6..e3135f9a 100644 --- a/Source/Platform.h +++ b/Source/Platform.h @@ -448,6 +448,9 @@ typedef DWORDLONG ULONGLONG,*PULONGLONG; #ifndef CP_ACP # define CP_ACP 0 #endif +#ifndef CP_UTF8 +# define CP_UTF8 65001 +#endif #ifndef COLOR_BTNFACE # define COLOR_BTNFACE 15 diff --git a/Source/SConscript b/Source/SConscript index b5ac6df5..d7e190ba 100644 --- a/Source/SConscript +++ b/Source/SConscript @@ -24,6 +24,7 @@ makensis_files = Split(""" strlist.cpp tokens.cpp tstring.cpp + utf.cpp util.cpp validateunicode.cpp winchar.cpp diff --git a/Source/build.cpp b/Source/build.cpp index db86da36..5c23c1ca 100644 --- a/Source/build.cpp +++ b/Source/build.cpp @@ -121,6 +121,9 @@ CEXEBuild::CEXEBuild() : multiple_entries_instruction=0; build_include_depth=0; +#ifndef _UNICODE + build_include_isutf8=false; +#endif has_called_write_output=false; diff --git a/Source/build.h b/Source/build.h index cb77a175..1cf99dbf 100644 --- a/Source/build.h +++ b/Source/build.h @@ -328,6 +328,9 @@ class CEXEBuild { * this will return a PS_WARNING. 
*/ int SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL unicode); +#ifndef _UNICODE + int SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8); +#endif /** * Sets the user string to the specific NLF_STRINGS id. @@ -424,6 +427,9 @@ class CEXEBuild { TCHAR build_output_filename[1024]; int build_include_depth; +#ifndef _UNICODE + bool build_include_isutf8; // UTF-8 LangString in .nsh hack for ANSI builds +#endif // Added by ramon 6 jun 2003 #ifdef NSIS_SUPPORT_VERSION_INFO diff --git a/Source/lang.cpp b/Source/lang.cpp index bda48b10..d1c486a3 100644 --- a/Source/lang.cpp +++ b/Source/lang.cpp @@ -26,6 +26,7 @@ #include "exehead/resource.h" #include #include "tstring.h" +#include "utf.h" using namespace std; @@ -492,6 +493,9 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un int sn; + if (_tcsclen(str) > NSIS_MAX_STRLEN-1) // non-fatal: only warn, then carry on with the lookup below + warning_fl(_T("LangString \"%s\" longer than NSIS_MAX_STRLEN!"), name); + int pos = build_langstrings.get(name, &sn); if (pos < 0) pos = build_langstrings.add(name, &sn); @@ -502,6 +506,21 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un return PS_OK; } +#ifndef _UNICODE +int CEXEBuild::SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8) +{ + LanguageTable *table = GetLangTable(lang); + if (!table) return PS_ERROR; + if (!Platform_SupportsUTF8Conversion()) return PS_ERROR; + + EXEHEADTCHAR_T *bufEHTStr = UTF8ToExeHeadTStr(stru8, table->nlf.m_uCodePage); + if (!bufEHTStr) return PS_ERROR; + const int ret = SetLangString(name, lang, bufEHTStr, sizeof(EXEHEADTCHAR_T) > 1); + ExeHeadTStrFree(bufEHTStr); + return ret; +} +#endif + // Sets the user string to the specific NLF_STRINGS id. 
// // @return If the id is invalid or the string is not valid, it will return a @@ -925,6 +944,11 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) { return 0; } +#ifndef _UNICODE + char fencoding = 0; // 0 = ansi, 8 = utf-8 (16/17 for utf-16le/be not supported) + if (IsUTF8BOM(f)) fencoding = 8; +#endif + // Check header TCHAR buf[NSIS_MAX_STRLEN]; buf[0] = SkipComments(f); @@ -1096,8 +1120,31 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) { buf[0] = SkipComments(f); _fgetts(buf+1, NSIS_MAX_STRLEN, f); +#ifndef _UNICODE + if (8 == fencoding) + { + if (!Platform_SupportsUTF8Conversion()) { + ERROR_MSG(_T("Error: UTF-8 language files not supported on this OS!\n")); + return 0; + } + EXEHEADTCHAR_T *bufConv = UTF8ToExeHeadTStr(buf, nlf->m_uCodePage); + if (!bufConv) { + ERROR_MSG(_T("Error: Invalid UTF-8? (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName); + return 0; + } + else { + UINT cch = _tcslen(bufConv); + _tcsnccpy(buf, bufConv, NSIS_MAX_STRLEN); + if (cch >= NSIS_MAX_STRLEN-1) { + buf[NSIS_MAX_STRLEN-1] = _T('\0'); // Make sure we fail the "String too long" check + } + } + ExeHeadTStrFree(bufConv); + } +#endif + if (_tcslen(buf) == NSIS_MAX_STRLEN-1) { - ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")"), i, NLFStrings[i].szLangStringName); + ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName); return 0; } temp=_tcslen(buf); diff --git a/Source/script.cpp b/Source/script.cpp index eed1ef45..c02d0c30 100644 --- a/Source/script.cpp +++ b/Source/script.cpp @@ -34,6 +34,7 @@ #include // for assert(3) #include #include "tstring.h" +#include "utf.h" #include #include "boost/scoped_ptr.hpp" @@ -813,6 +814,10 @@ int CEXEBuild::includeScript(TCHAR *f) return PS_ERROR; } build_include_depth++; +#ifndef _UNICODE + const bool org_build_include_isutf8 = build_include_isutf8; + build_include_isutf8 = IsUTF8BOM(incfp); +#endif int last_linecnt=linecnt; linecnt=0; @@ -837,6 +842,10 @@ 
int CEXEBuild::includeScript(TCHAR *f) restore_timestamp_predefine(oldtimestamp); #endif +#ifndef _UNICODE + build_include_isutf8 = org_build_include_isutf8; +#endif + int errlinecnt=linecnt; linecnt=last_linecnt; @@ -1712,13 +1721,21 @@ int CEXEBuild::doCommand(int which_token, LineParser &line) TCHAR *name = line.gettoken_str(1); LANGID lang = line.gettoken_int(2); TCHAR *str = line.gettoken_str(3); - int ret = SetLangString(name, lang, str, curfile_unicode); + int ret; +#ifndef _UNICODE + if (build_include_isutf8) + ret = SetUTF8LangString(name, lang, str); + else +#endif + ret = SetLangString(name, lang, str, curfile_unicode); + if (ret == PS_WARNING) warning_fl(_T("LangString \"%s\" set multiple times for %d, wasting space"), name, lang); else if (ret == PS_ERROR) { ERROR_MSG(_T("Error: can't set LangString \"%s\"!\n"), name); return PS_ERROR; } + // BUGBUG: Does not display UTF-8 properly. SCRIPT_MSG(_T("LangString: \"%s\" %d \"%s\"\n"), name, lang, str); } return PS_OK; diff --git a/Source/utf.cpp b/Source/utf.cpp new file mode 100644 index 00000000..1f6a4959 --- /dev/null +++ b/Source/utf.cpp @@ -0,0 +1,85 @@ +/* + * utf.cpp + * + * This file is a part of NSIS. + * + * Copyright (C) 2011 Anders Kjersem + * + * Licensed under the zlib/libpng license (the "License"); + * you may not use this file except in compliance with the License. + * + * Licence details can be found in the file COPYING. + * + * This software is provided 'as-is', without any express or implied + * warranty. + * + */ + +#include "utf.h" + +// BUGBUG: We might want to use MB_ERR_INVALID_CHARS but it is not supported +// on < WinXP or in our current POSIX implementation. +static const int UTF8MBTWCFLAGS = 0; + + +#define ExeHeadWStrFree free +static EXEHEADWCHAR_T* ExeHeadWStrAlloc(UINT cch) +{ + EXEHEADWCHAR_T* s = (EXEHEADWCHAR_T*) malloc(cch*sizeof(EXEHEADWCHAR_T)); +#if 0 + // TODO: We should add POSIX versions of G/SetLastError + // if we want to tell _why_ UTF8ToExeHeadTStr failed... 
+ if (!s) SetLastError(ERROR_OUTOFMEMORY); +#endif + return s; +} + +#ifdef _UNICODE +#else // !_UNICODE + +EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage) +{ + int cchW = MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,NULL,0); + if (!cchW) return NULL; + WCHAR *bufWStr = (WCHAR*) ExeHeadWStrAlloc(cchW); + if (!bufWStr) return NULL; + EXEHEADTCHAR_T *outstr = NULL; + if (MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,bufWStr,cchW)) + { + int cbA = WideCharToMultiByte(Codepage,0,bufWStr,cchW,NULL,0,NULL,NULL); + if (cbA && (outstr = ExeHeadTStrAlloc(cbA))) + { + if (!WideCharToMultiByte(Codepage,0,bufWStr,cchW,outstr,cbA,NULL,NULL)) + { + ExeHeadTStrFree(outstr); + outstr = NULL; + } + } + } + ExeHeadWStrFree(bufWStr); + return outstr; +} + +#endif // ?_UNICODE + + +bool IsUTF8BOM(FILE*fstrm) +{ + // ungetc is only guaranteed to support 1 pushback, + // lets hope no ASCII file starts with 0xEF and is not a BOM! + const int c = fgetc(fstrm); + if (EOF == c) return false; + if (0xef == c) + { + const int c2 = fgetc(fstrm); + if (0xbb == c2) + { + const int c3 = fgetc(fstrm); + if (0xbf == c3) return true; + ungetc(c3,fstrm); + } + ungetc(c2,fstrm); + } + ungetc(c,fstrm); + return false; +} diff --git a/Source/utf.h b/Source/utf.h new file mode 100644 index 00000000..96a227db --- /dev/null +++ b/Source/utf.h @@ -0,0 +1,43 @@ +/* + * utf.h + * + * This file is a part of NSIS. + * + * Copyright (C) 2011 Anders Kjersem + * + * Licensed under the zlib/libpng license (the "License"); + * you may not use this file except in compliance with the License. + * + * Licence details can be found in the file COPYING. + * + * This software is provided 'as-is', without any express or implied + * warranty. 
+ * + */ + +#include "Platform.h" +#include <stdlib.h> // malloc/free used by ExeHeadTStrAlloc/ExeHeadTStrFree +#include <stdio.h> // FILE used by IsUTF8BOM + +typedef unsigned short EXEHEADWCHAR_T; + + +#ifdef _UNICODE +typedef EXEHEADWCHAR_T EXEHEADTCHAR_T; + +#else // !_UNICODE +typedef char EXEHEADTCHAR_T; + +#define ExeHeadTStrFree free +inline EXEHEADTCHAR_T* ExeHeadTStrAlloc(UINT cb) {return (EXEHEADTCHAR_T*) malloc(cb);} +extern EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage); + +#endif // ?_UNICODE + + +/** + * Tries to peek at the first few bytes in the stream to determine if it is a UTF-8 BOM. + * If it is a UTF-8 BOM it will eat the BOM, + * if it is not it tries its best to restore the data. + */ +extern bool IsUTF8BOM(FILE*fstrm); diff --git a/Source/util.cpp b/Source/util.cpp index dc3d488d..e2be0010 100644 --- a/Source/util.cpp +++ b/Source/util.cpp @@ -200,8 +200,15 @@ inline size_t nsis_iconv_adaptor } void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) { - if (code_page == CP_ACP) + switch(code_page) + { + case CP_ACP: code_page = 1252; + break; + case CP_UTF8: + _sntprintf(buf, len, _T("UTF-8")); + return; + } _sntprintf(buf, len, _T("CP%d"), code_page); } @@ -209,7 +216,7 @@ void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) { int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar) { - static char buffer[4096]; + static char buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build? char cp[128]; create_code_page_string(cp, sizeof(cp), CodePage); @@ -245,7 +252,7 @@ int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) { - static WCHAR buffer[4096]; + static WCHAR buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build? 
char cp[128]; create_code_page_string(cp, sizeof(cp), CodePage); @@ -900,3 +907,10 @@ bool GetDLLVersion(const tstring& filepath, DWORD& high, DWORD& low) return false; } + +bool Platform_SupportsUTF8Conversion() // true when IsValidCodePage(CP_UTF8) succeeds; probed on first call, then cached +{ + static int cached = -1; // -1 = not probed yet; must be signed: an unsigned char would hold 255 and never compare equal to -1 + if (-1 == cached) cached = !!IsValidCodePage(CP_UTF8); + return cached != 0; +} \ No newline at end of file diff --git a/Source/util.h b/Source/util.h index 022d563b..f2f3be73 100644 --- a/Source/util.h +++ b/Source/util.h @@ -185,4 +185,7 @@ RM_DEFINE_FREEFUNC(my_convert_free); # define PATH_CONVERT(x) #endif +// Platform detection +bool Platform_SupportsUTF8Conversion(); + #endif //_UTIL_H_