Basic UTF-8 support in the ANSI build so that it can read UTF-8 .nlf files and LangStrings

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6196 212acab6-be3b-0410-9dea-997c60f758d6
This commit is contained in:
anders_k 2011-12-05 23:44:26 +00:00
parent 359ad0a055
commit ef8a83bd41
10 changed files with 227 additions and 5 deletions

View file

@ -448,6 +448,9 @@ typedef DWORDLONG ULONGLONG,*PULONGLONG;
#ifndef CP_ACP
# define CP_ACP 0
#endif
// Code page identifier for UTF-8 (65001). Only defined here when the
// headers included so far have not already provided it.
#ifndef CP_UTF8
# define CP_UTF8 65001
#endif
#ifndef COLOR_BTNFACE
# define COLOR_BTNFACE 15

View file

@ -24,6 +24,7 @@ makensis_files = Split("""
strlist.cpp
tokens.cpp
tstring.cpp
utf.cpp
util.cpp
validateunicode.cpp
winchar.cpp

View file

@ -121,6 +121,9 @@ CEXEBuild::CEXEBuild() :
multiple_entries_instruction=0;
build_include_depth=0;
#ifndef _UNICODE
build_include_isutf8=false;
#endif
has_called_write_output=false;

View file

@ -328,6 +328,9 @@ class CEXEBuild {
* this will return a PS_WARNING.
*/
int SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL unicode);
#ifndef _UNICODE
/**
 * ANSI builds only: sets a LangString from UTF-8 encoded text.
 *
 * The text is converted from UTF-8 to the code page of the language table
 * that belongs to lang and is then stored with SetLangString().
 *
 * @param name LangString name.
 * @param lang Language id whose table receives the string.
 * @param stru8 UTF-8 encoded string.
 * @return PS_ERROR if there is no language table for lang, if UTF-8
 * conversion is not available or if the conversion fails; otherwise the
 * value returned by SetLangString().
 */
int SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8);
#endif
/**
* Sets the user string to the specific NLF_STRINGS id.
@ -424,6 +427,9 @@ class CEXEBuild {
TCHAR build_output_filename[1024];
int build_include_depth;
#ifndef _UNICODE
bool build_include_isutf8; // UTF-8 LangString in .nsh hack for ANSI builds
#endif
// Added by ramon 6 jun 2003
#ifdef NSIS_SUPPORT_VERSION_INFO

View file

@ -26,6 +26,7 @@
#include "exehead/resource.h"
#include <nsis-version.h>
#include "tstring.h"
#include "utf.h"
using namespace std;
@ -492,6 +493,9 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un
int sn;
if (_tcsclen(str) > NSIS_MAX_STRLEN-1)
warning_fl("LangString \"%s\" longer than NSIS_MAX_STRLEN!", name);
int pos = build_langstrings.get(name, &sn);
if (pos < 0)
pos = build_langstrings.add(name, &sn);
@ -502,6 +506,21 @@ int CEXEBuild::SetLangString(TCHAR *name, LANGID lang, const TCHAR *str, BOOL un
return PS_OK;
}
#ifndef _UNICODE
// ANSI builds only: sets a LangString whose text is UTF-8 encoded.
//
// The text is first converted to the code page of the language table that
// belongs to lang and the converted copy is handed to SetLangString().
//
// @return PS_ERROR when the language table is missing, when UTF-8 conversion
// is unavailable or when the conversion fails; otherwise whatever
// SetLangString() returned.
int CEXEBuild::SetUTF8LangString(TCHAR *name, LANGID lang, const char* stru8)
{
  LanguageTable *const langtable = GetLangTable(lang);
  if (!langtable || !Platform_SupportsUTF8Conversion())
    return PS_ERROR;

  EXEHEADTCHAR_T *converted = UTF8ToExeHeadTStr(stru8, langtable->nlf.m_uCodePage);
  if (NULL == converted)
    return PS_ERROR;

  // The converted copy is only needed for the duration of the call.
  const int result = SetLangString(name, lang, converted, sizeof(EXEHEADTCHAR_T) > 1);
  ExeHeadTStrFree(converted);
  return result;
}
#endif
// Sets the user string to the specific NLF_STRINGS id.
//
// @return If the id is invalid or the string is not valid, it will return a
@ -925,6 +944,11 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
return 0;
}
#ifndef _UNICODE
char fencoding = 0; // 0 = ansi, 8 = utf-8 (16/17 for uft-16le/be not supported)
if (IsUTF8BOM(f)) fencoding = 8;
#endif
// Check header
TCHAR buf[NSIS_MAX_STRLEN];
buf[0] = SkipComments(f);
@ -1096,8 +1120,31 @@ LanguageTable * CEXEBuild::LoadLangFile(TCHAR *filename) {
buf[0] = SkipComments(f);
_fgetts(buf+1, NSIS_MAX_STRLEN, f);
#ifndef _UNICODE
if (8 == fencoding)
{
if (!Platform_SupportsUTF8Conversion()) {
ERROR_MSG(_T("Error: UTF-8 language files not supported on this OS!\n"));
return 0;
}
EXEHEADTCHAR_T *bufConv = UTF8ToExeHeadTStr(buf, nlf->m_uCodePage);
if (!bufConv) {
ERROR_MSG(_T("Error: Invalid UTF-8? (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
return 0;
}
else {
UINT cch = _tcslen(bufConv);
_tcsnccpy(buf, bufConv, NSIS_MAX_STRLEN);
if (cch >= NSIS_MAX_STRLEN-1) {
buf[NSIS_MAX_STRLEN-1] = _T('\0'); // Make sure we fail the "String too long" check
}
}
ExeHeadTStrFree(bufConv);
}
#endif
if (_tcslen(buf) == NSIS_MAX_STRLEN-1) {
ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")"), i, NLFStrings[i].szLangStringName);
ERROR_MSG(_T("Error: String too long (string #%d - \"%s\")\n"), i, NLFStrings[i].szLangStringName);
return 0;
}
temp=_tcslen(buf);

View file

@ -34,6 +34,7 @@
#include <cassert> // for assert(3)
#include <time.h>
#include "tstring.h"
#include "utf.h"
#include <algorithm>
#include "boost/scoped_ptr.hpp"
@ -813,6 +814,10 @@ int CEXEBuild::includeScript(TCHAR *f)
return PS_ERROR;
}
build_include_depth++;
#ifndef _UNICODE
const bool org_build_include_isutf8 = build_include_isutf8;
build_include_isutf8 = IsUTF8BOM(incfp);
#endif
int last_linecnt=linecnt;
linecnt=0;
@ -837,6 +842,10 @@ int CEXEBuild::includeScript(TCHAR *f)
restore_timestamp_predefine(oldtimestamp);
#endif
#ifndef _UNICODE
build_include_isutf8 = org_build_include_isutf8;
#endif
int errlinecnt=linecnt;
linecnt=last_linecnt;
@ -1712,13 +1721,21 @@ int CEXEBuild::doCommand(int which_token, LineParser &line)
TCHAR *name = line.gettoken_str(1);
LANGID lang = line.gettoken_int(2);
TCHAR *str = line.gettoken_str(3);
int ret = SetLangString(name, lang, str, curfile_unicode);
int ret;
#ifndef _UNICODE
if (build_include_isutf8)
ret = SetUTF8LangString(name, lang, str);
else
#endif
ret = SetLangString(name, lang, str, curfile_unicode);
if (ret == PS_WARNING)
warning_fl(_T("LangString \"%s\" set multiple times for %d, wasting space"), name, lang);
else if (ret == PS_ERROR) {
ERROR_MSG(_T("Error: can't set LangString \"%s\"!\n"), name);
return PS_ERROR;
}
// BUGBUG: Does not display UTF-8 properly.
SCRIPT_MSG(_T("LangString: \"%s\" %d \"%s\"\n"), name, lang, str);
}
return PS_OK;

85
Source/utf.cpp Normal file
View file

@ -0,0 +1,85 @@
/*
* utf.cpp
*
* This file is a part of NSIS.
*
* Copyright (C) 2011 Anders Kjersem
*
* Licensed under the zlib/libpng license (the "License");
* you may not use this file except in compliance with the License.
*
* Licence details can be found in the file COPYING.
*
* This software is provided 'as-is', without any express or implied
* warranty.
*
*/
#include "utf.h"
// Flags passed to MultiByteToWideChar when decoding UTF-8 input.
// BUGBUG: We might want to use MB_ERR_INVALID_CHARS but it is not supported
// on < WinXP or in our current POSIX implementation.
static const int UTF8MBTWCFLAGS = 0;
// Releases a buffer obtained from ExeHeadWStrAlloc (which allocates with malloc).
#define ExeHeadWStrFree free
// Allocates an uninitialized buffer with room for cch EXEHEADWCHAR_T units.
// Returns NULL when malloc fails; release the buffer with ExeHeadWStrFree.
static EXEHEADWCHAR_T* ExeHeadWStrAlloc(UINT cch)
{
  const size_t cbNeeded = cch*sizeof(EXEHEADWCHAR_T);
  EXEHEADWCHAR_T* const buffer = (EXEHEADWCHAR_T*) malloc(cbNeeded);
#if 0
  // TODO: We should add POSIX versions of G/SetLastError
  // if we want to tell _why_ UTF8ToExeHeadTStr failed...
  if (!buffer) SetLastError(ERROR_OUTOFMEMORY);
#endif
  return buffer;
}
#ifdef _UNICODE
#else // !_UNICODE
// Converts the NUL-terminated UTF-8 string StrU8 to the code page Codepage.
//
// The conversion goes through a temporary wide string: UTF-8 -> wide ->
// Codepage. Both conversions are first called with a NULL output buffer to
// obtain the required size. Because the input length is passed as -1, the
// wide length includes the terminator and the result is NUL-terminated.
//
// Returns a buffer that must be released with ExeHeadTStrFree, or NULL if an
// allocation or either conversion step fails.
EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage)
{
  const int wideLen = MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,NULL,0);
  if (0 == wideLen) return NULL;

  WCHAR *wideBuf = (WCHAR*) ExeHeadWStrAlloc(wideLen);
  if (NULL == wideBuf) return NULL;

  EXEHEADTCHAR_T *result = NULL;
  const bool decoded = 0 != MultiByteToWideChar(CP_UTF8,UTF8MBTWCFLAGS,StrU8,-1,wideBuf,wideLen);
  if (decoded)
  {
    const int byteLen = WideCharToMultiByte(Codepage,0,wideBuf,wideLen,NULL,0,NULL,NULL);
    if (0 != byteLen)
    {
      result = ExeHeadTStrAlloc(byteLen);
      if (NULL != result
        && 0 == WideCharToMultiByte(Codepage,0,wideBuf,wideLen,result,byteLen,NULL,NULL))
      {
        // Second pass failed, do not hand out a half-written buffer.
        ExeHeadTStrFree(result);
        result = NULL;
      }
    }
  }

  ExeHeadWStrFree(wideBuf);
  return result;
}
#endif // ?_UNICODE
/*
 * Checks whether the next bytes of fstrm are the UTF-8 BOM (EF BB BF).
 *
 * Returns true if the BOM was found; the BOM is then consumed and the stream
 * is positioned on the first byte after it.
 * Returns false otherwise; the bytes that were read while probing are put
 * back so the caller sees the stream unchanged.
 *
 * ungetc is only guaranteed to support a single byte of pushback, so the
 * probe is undone by restoring the saved stream position. Pushing the bytes
 * back with ungetc is only used as a best-effort fallback for streams whose
 * position cannot be saved or restored.
 */
bool IsUTF8BOM(FILE*fstrm)
{
  static const int bom[3] = { 0xef, 0xbb, 0xbf };

  // Remember where we started so a partial match can be undone reliably.
  fpos_t startpos;
  const bool havepos = (0 == fgetpos(fstrm, &startpos));

  int seen[3];
  int count = 0;
  bool match = true;
  while (match && count < 3)
  {
    seen[count] = fgetc(fstrm); // EOF never equals a BOM byte
    match = (seen[count] == bom[count]);
    ++count;
  }
  if (match) return true; // Complete BOM, leave it consumed

  if (havepos && 0 == fsetpos(fstrm, &startpos)) return false;

  // Fallback: push back what we read, last byte first.
  while (count > 0)
  {
    --count;
    if (EOF != seen[count]) ungetc(seen[count], fstrm);
  }
  return false;
}

43
Source/utf.h Normal file
View file

@ -0,0 +1,43 @@
/*
* utf.h
*
* This file is a part of NSIS.
*
* Copyright (C) 2011 Anders Kjersem
*
* Licensed under the zlib/libpng license (the "License");
* you may not use this file except in compliance with the License.
*
* Licence details can be found in the file COPYING.
*
* This software is provided 'as-is', without any express or implied
* warranty.
*
*/
// Include guard: this header defines an inline function, so including it
// twice in one translation unit would otherwise be a redefinition error.
#ifndef NSIS_UTF_H_INCLUDED
#define NSIS_UTF_H_INCLUDED

#include "Platform.h"
#include <stdlib.h>
#include <stdio.h>

// Wide character unit used by the string conversion helpers.
typedef unsigned short EXEHEADWCHAR_T;
#ifdef _UNICODE
typedef EXEHEADWCHAR_T EXEHEADTCHAR_T;
#else // !_UNICODE
typedef char EXEHEADTCHAR_T;

// Releases a buffer obtained from ExeHeadTStrAlloc or UTF8ToExeHeadTStr.
#define ExeHeadTStrFree free

// Allocates an uninitialized buffer of cb bytes; returns NULL on failure.
inline EXEHEADTCHAR_T* ExeHeadTStrAlloc(UINT cb) {return (EXEHEADTCHAR_T*) malloc(cb);}

/**
 * Converts the NUL-terminated UTF-8 string StrU8 to the code page Codepage.
 * @return A buffer that must be released with ExeHeadTStrFree,
 * or NULL if the conversion or an allocation fails.
 */
extern EXEHEADTCHAR_T* UTF8ToExeHeadTStr(LPCSTR StrU8,UINT Codepage);
#endif // ?_UNICODE

/**
 * Tries to peek at the first few bytes in the stream to determine if it is a UTF-8 BOM.
 * If it is a UTF-8 BOM it will eat the BOM,
 * if it is not it tries its best to restore the data.
 */
extern bool IsUTF8BOM(FILE*fstrm);

#endif // NSIS_UTF_H_INCLUDED

View file

@ -200,8 +200,15 @@ inline size_t nsis_iconv_adaptor
}
/*
 * Writes the textual name of code_page into buf (at most len characters,
 * as limited by _sntprintf).
 *
 *  - CP_ACP is treated as code page 1252 and yields "CP1252".
 *  - CP_UTF8 yields "UTF-8".
 *  - Every other value yields "CP<number>".
 *
 * NOTE(review): the resulting name is presumably handed to iconv by the
 * conversion wrappers in this file - confirm against the callers.
 */
void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
  // A stale "if (code_page == CP_ACP)" in front of this switch made the
  // CP_UTF8 case unreachable; the switch must see every code page.
  switch(code_page)
  {
  case CP_ACP:
    code_page = 1252;
    break;
  case CP_UTF8:
    _sntprintf(buf, len, _T("UTF-8"));
    return;
  }
  // code_page is unsigned, so format it as such.
  _sntprintf(buf, len, _T("CP%u"), code_page);
}
@ -209,7 +216,7 @@ void static create_code_page_string(TCHAR *buf, size_t len, UINT code_page) {
int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
int cchWideChar, LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
LPBOOL lpUsedDefaultChar) {
static char buffer[4096];
static char buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?
char cp[128];
create_code_page_string(cp, sizeof(cp), CodePage);
@ -245,7 +252,7 @@ int WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr,
int MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr,
int cbMultiByte, LPWSTR lpWideCharStr, int cchWideChar) {
static WCHAR buffer[4096];
static WCHAR buffer[4096]; // BUGBUG: Should this be 4*NSIS_MAX_STRLEN for large string build?
char cp[128];
create_code_page_string(cp, sizeof(cp), CodePage);
@ -900,3 +907,10 @@ bool GetDLLVersion(const tstring& filepath, DWORD& high, DWORD& low)
return false;
}
/*
 * Reports whether the platform accepts CP_UTF8 as a valid code page.
 *
 * IsValidCodePage is queried on the first call only; later calls return
 * the cached answer.
 *
 * @return true if IsValidCodePage(CP_UTF8) succeeded, false otherwise.
 */
bool Platform_SupportsUTF8Conversion()
{
  // -1 = not probed yet, 0 = unsupported, 1 = supported.
  // This must be a signed type: an unsigned char initialized with -1 holds
  // 255 and never compares equal to -1 after integral promotion, so the
  // probe below would never run and the function would always return true.
  static int cached = -1;
  if (-1 == cached) cached = IsValidCodePage(CP_UTF8) ? 1 : 0;
  return cached != 0;
}

View file

@ -185,4 +185,7 @@ RM_DEFINE_FREEFUNC(my_convert_free);
# define PATH_CONVERT(x)
#endif
// Platform detection

// Intended to report whether UTF-8 conversion (code page CP_UTF8) is
// available on this platform. Callers treat a false result as "UTF-8 is not
// supported on this OS" and refuse to process UTF-8 input.
bool Platform_SupportsUTF8Conversion();
#endif //_UTIL_H_