Use a custom file reader with UTF8 support for nsi/nsh/nlf files and store UTF16LE or MBCS (stringblock) strings in ExeHeadStringList
git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6289 212acab6-be3b-0410-9dea-997c60f758d6
This commit is contained in:
parent
e6ac4e6d9b
commit
dcddf977b2
18 changed files with 1208 additions and 623 deletions
364
Source/utf.cpp
364
Source/utf.cpp
|
@ -17,24 +17,7 @@
|
|||
|
||||
#include "utf.h"
|
||||
|
||||
// BUGBUG: We might want to use MB_ERR_INVALID_CHARS but it is not supported
|
||||
// on < WinXP or in our current POSIX implementation.
|
||||
static const int UTF8MBTWCFLAGS = 0;
|
||||
|
||||
|
||||
#define ExeHeadWStrFree free
|
||||
// Allocates an uninitialized buffer with room for cch EXEHEADWCHAR_T code units.
// Returns NULL if malloc fails. ExeHeadWStrFree (defined as free) releases it.
static EXEHEADWCHAR_T* ExeHeadWStrAlloc(UINT cch)
{
  void* const mem = malloc(cch*sizeof(EXEHEADWCHAR_T));
#if 0
  // TODO: We should add POSIX versions of G/SetLastError
  // if we want to tell _why_ UTF8ToExeHeadTStr failed...
  if (!mem) SetLastError(ERROR_OUTOFMEMORY);
#endif
  return (EXEHEADWCHAR_T*) mem;
}
|
||||
|
||||
#ifdef _UNICODE
|
||||
#define FIX_ENDIAN_INT16LETOHOST_INPLACE FIX_ENDIAN_INT16_INPLACE
|
||||
|
||||
void RawTStrToASCII(const TCHAR*in,char*out,UINT maxcch)
|
||||
{
|
||||
|
@ -43,51 +26,326 @@ void RawTStrToASCII(const TCHAR*in,char*out,UINT maxcch)
|
|||
if (!empty) *out = 0;
|
||||
}
|
||||
|
||||
#else // !_UNICODE
|
||||
|
||||
EXEHEADTCHAR_T* UTF8ToExeHeadTStrDup(LPCSTR StrU8,UINT Codepage)
|
||||
// Returns the length of the \0 terminated UTF16LE string at str, counted in
// 16-bit code units (not bytes and not code points); the terminator is not counted.
// str must be a valid pointer to a terminated string.
UINT StrLenUTF16LE(const void*str)
{
  const unsigned short *p = (const unsigned short *) str;
  UINT cch = 0;
  // Count code units directly; a byte distance would be twice the length.
  while (p[cch]) ++cch;
  return cch;
}
|
||||
|
||||
// Assigns the \0 terminated UTF16LE string at src to dest. Always returns true.
// Only implemented for _WIN32, where the data is assigned without conversion;
// every other platform stops the build with #error.
bool StrSetUTF16LE(tstring&dest, const void*src)
{
#ifndef _WIN32
#error TODO: UTF16LE to wchar_t
#else
  unsigned short *srcU16 = (unsigned short *) src;
  dest = srcU16;
#endif
  return true;
}
|
||||
|
||||
// Strict UTF-8 to wchar_t conversion: forwards to MultiByteToWideChar(CP_UTF8)
// with MB_ERR_INVALID_CHARS and returns its result as a UINT.
// StrU8/cbU8 is the source and its size in bytes, Buffer/cchBuf the destination.
inline UINT UTF8ToWC_Convert(LPCSTR StrU8,UINT cbU8,wchar_t*Buffer,UINT cchBuf)
{
#ifndef MB_ERR_INVALID_CHARS
  const UINT MB_ERR_INVALID_CHARS = 8; // MSDN says this flag is OK for CP_UTF8
#endif
  const UINT cchResult = (UINT) MultiByteToWideChar(CP_UTF8,MB_ERR_INVALID_CHARS,StrU8,cbU8,Buffer,cchBuf);
  return cchResult;
}
|
||||
// Runs UTF8ToWC_Convert without a destination buffer (null buffer, size 0)
// and returns whatever that call reports for the cbU8 bytes at StrU8.
inline UINT UTF8ToWC_Prepare(LPCSTR StrU8,UINT cbU8)
{
  wchar_t* const noBuffer = 0;
  const UINT noSize = 0;
  return UTF8ToWC_Convert(StrU8,cbU8,noBuffer,noSize);
}
|
||||
|
||||
wchar_t* DupWCFromBytes(void*Buffer,UINT cbBuffer,WORD SrcCP)
{
  /*\
  Duplicates cbBuffer bytes encoded with SrcCP as a \0 terminated wchar_t string.
  The result is malloc'ed; the caller releases it with free.
  Returns 0 if malloc failed or (wchar_t*)-1 if conversion to wchar_t failed.
  \*/
  NStreamEncoding srcenc(SrcCP);
#ifdef _WIN32
  if (srcenc.IsUTF16LE())
  {
    // Assuming wchar_t==UTF16LE: copy the bytes untouched and append a terminator
    char*raw = (char*) malloc(cbBuffer + 2);
    if (!raw) return 0;
    memcpy(raw, Buffer, cbBuffer);
    *((wchar_t*)(raw+cbBuffer)) = L'\0';
    return (wchar_t*) raw;
  }
  // TODO: MBTWC on Windows is lame, we are going to fail if SrcCP is UTF16BE or UTF32
#endif
  // First call only asks for the required number of wchar_t's
  const UINT cchW = MultiByteToWideChar(SrcCP,0,(char*)Buffer,cbBuffer,0,0);
  if (!cchW && NStreamEncoding::GetCodeUnitSize(SrcCP) <= cbBuffer)
  {
    // The input holds at least one code unit but nothing could be converted
    return (wchar_t*)-1;
  }
  wchar_t*dup = (wchar_t*) malloc((cchW+1)*sizeof(wchar_t));
  if (dup)
  {
    MultiByteToWideChar(SrcCP,0,(char*)Buffer,cbBuffer,dup,cchW);
    dup[cchW] = L'\0';
  }
  return dup;
}
|
||||
|
||||
UINT DetectUTFBOM(FILE*strm)
{
  /*\
  Looks for a byte order mark at the current position of strm.
  Returns the matching NStreamEncoding value and leaves the BOM consumed,
  or returns 0 after pushing the bytes it read back onto the stream.
  NOTE: ungetc is only guaranteed to support 1 pushback,
  lets hope no MBCS file starts with parts of a BOM.
  \*/
  const int first = fgetc(strm);
  if (EOF == first) return 0;
  switch(first)
  {
  case 0xef: // EF BB BF = UTF-8
    {
      const int second = fgetc(strm);
      if (0xbb == second)
      {
        const int third = fgetc(strm);
        if (0xbf == third) return NStreamEncoding::UTF8;
        ungetc(third,strm);
      }
      ungetc(second,strm);
    }
    break;
  case 0xfe:
  case 0xff:
  case 0x00:
    {
      const int second = fgetc(strm);
      const int third = fgetc(strm);
      // FF FE or FE FF
      const bool utf16bom = (0xff == first && 0xfe == second) || (0xfe == first && 0xff == second);
      if (utf16bom)
      {
        if (0xff == first && 0 == third)
        {
          const int fourth = fgetc(strm);
          if (0 == fourth) return NStreamEncoding::UTF32LE; // FF FE 00 00
          ungetc(fourth,strm);
        }
        ungetc(third,strm); // Not part of the 2 byte BOM
        return 0xff == first ? NStreamEncoding::UTF16LE : NStreamEncoding::UTF16BE;
      }
      if (0 == first && 0 == second && 0xfe == third)
      {
        const int fourth = fgetc(strm);
        if (0xff == fourth) return NStreamEncoding::UTF32BE; // 00 00 FE FF
        ungetc(fourth,strm);
      }
      ungetc(third,strm);
      ungetc(second,strm);
    }
    break;
  }
  ungetc(first,strm);
  return 0;
}
|
||||
|
||||
#endif // ?_UNICODE
|
||||
|
||||
|
||||
bool IsUTF8BOM(FILE*fstrm)
|
||||
// Maps an encoding name to a NStreamEncoding value.
// Accepts (ignoring case) ACP, OEM, UTF8, UTF16LE, UTF16BE and "CP<number>" with
// 0 < number < NStreamEncoding::CPCOUNT. Anything else gives NStreamEncoding::UNKNOWN.
WORD GetEncodingFromString(const TCHAR*s)
{
  static const struct { const TCHAR*name; WORD encoding; } known[] = {
    { _T("ACP"),     (WORD) NStreamEncoding::ACP },
    { _T("OEM"),     (WORD) NStreamEncoding::OEMCP },
    { _T("UTF8"),    (WORD) NStreamEncoding::UTF8 },
    { _T("UTF16LE"), (WORD) NStreamEncoding::UTF16LE },
    { _T("UTF16BE"), (WORD) NStreamEncoding::UTF16BE },
  };
  for (unsigned int i = 0; i < sizeof(known)/sizeof(known[0]); ++i)
  {
    if (0 == _tcsicmp(s,known[i].name)) return known[i].encoding;
  }
  // "CP" followed by a codepage number
  if (S7IsChEqualI('C',s[0]) && S7IsChEqualI('P',s[1]))
  {
    const int cp = _tstoi(s+2);
    if (cp > 0 && cp < NStreamEncoding::CPCOUNT) return (WORD) cp;
  }
  return NStreamEncoding::UNKNOWN;
}
|
||||
|
||||
// Copies a display name for the encoding CP into Buf:
// a fixed name for the special encodings, "CP<number>" for other values
// below CPCOUNT and "?" for everything else.
// Buf is written with _tcscpy, the caller must supply a large enough buffer.
void NStreamEncoding::GetCPDisplayName(WORD CP, TCHAR*Buf)
{
  TCHAR numbered[10];
  const TCHAR *name;
  switch(CP)
  {
  case ACP:     name = _T("ACP"); break;
  case OEMCP:   name = _T("OEM"); break;
  case UTF16LE: name = _T("UTF16LE"); break;
  case UTF16BE: name = _T("UTF16BE"); break;
  case UTF32LE: name = _T("UTF32LE"); break;
  case UTF32BE: name = _T("UTF32BE"); break;
  case UTF8:    name = _T("UTF8"); break;
  default:
    if (CP >= NStreamEncoding::CPCOUNT)
    {
      name = _T("?");
    }
    else
    {
      _stprintf(numbered,_T("CP%u"),CP);
      name = numbered;
    }
    break;
  }
  _tcscpy(Buf,name);
}
|
||||
|
||||
// Builds a "\n" terminated message for a NStream error code.
// ERR_BUFFEROVERFLOW and unrecognized codes get "<Filename>:<Line>" appended
// when Filename is not null; ERR_IOERROR and ERR_UNSUPPORTEDENCODING never do.
tstring NStreamLineReader::GetErrorMessage(UINT Error, const TCHAR*Filename, UINT Line)
{
  tstring msg;
  TCHAR buf[40];
  bool withLocation = 0 != Filename;

  if (NStream::ERR_BUFFEROVERFLOW == Error)
  {
    msg = _T("Line too long: ");
  }
  else if (NStream::ERR_IOERROR == Error)
  {
    msg = _T("I/O error");
    withLocation = false;
  }
  else if (NStream::ERR_UNSUPPORTEDENCODING == Error)
  {
    // Name the encoding of the stream being read
    StreamEncoding().GetCPDisplayName(buf);
    msg = tstring(buf) + _T(" is not supported");
    withLocation = false;
  }
  else
  {
    msg = _T("Bad text encoding: ");
  }

  if (withLocation)
  {
    _stprintf(buf,_T("%u"),Line);
    msg += Filename;
    msg += _T(":");
    msg += buf;
  }
  msg += _T("\n");
  return msg;
}
|
||||
|
||||
UINT NStreamLineReader::ReadLine(wchar_t*Buffer, UINT cchBuf)
{
  /*\
  Reads from the associated stream until it finds a new-line or
  the read fails (I/O error or EOF). It fails with ERR_BUFFEROVERFLOW if
  cchBuf-1 wchar_t's are read without finding the end of the line.
  Buffer MUST be a valid pointer, it will be \0 terminated as long as cchBuf > 0.

  Returns NStream::OK, ERR_IOERROR, ERR_BUFFEROVERFLOW, ERR_INVALIDENCODING
  or ERR_UNSUPPORTEDENCODING.

  NOTE(review): CompleteLine is defined elsewhere. The code below relies on it
  advancing Buffer and decreasing cchBuf for every stored character and
  returning true at the end of a line - confirm against its definition.
  \*/
  if (!cchBuf) return NStream::ERR_BUFFEROVERFLOW;
#ifndef MB_ERR_INVALID_CHARS
  const UINT MB_ERR_INVALID_CHARS = 8;
#endif
  const UINT cchFullBuf = cchBuf;
  NIStream&strm = GetStream();

l_restart:
  // Supports UTF-8, UTF-16LE (only under _WIN32) and MBCS codepages,
  // every other Unicode encoding is rejected as unsupported.
  if (StreamEncoding().IsUTF8())
  {
    for(;;)
    {
      BYTE cb = 0; // bytes in chU8 -1
      BYTE chU8[6];
      if (!strm.ReadOctet(&chU8[0])) goto l_ioerror;
      UINT cchWC;
#if defined(WIN32) // TODO: Is wchar_t==UTF16LE under cygwin?
      // Fast path if wchar_t == UTF16 and in ASCII range
      if (chU8[0] <= 127 && sizeof(wchar_t) == 2)
      {
        cchWC = ++cb;
        if (cchBuf <= cchWC) goto l_lineoverflow;
        *Buffer = (wchar_t) chU8[0];
      }
      else
#endif
      {
        // The number of leading 1 bits in the first byte gives the sequence length
        if (0xC0 == (0xC0 & chU8[0]))
        {
          ++cb;
          if (0xE0 == (0xE0 & chU8[0]))
          {
            ++cb;
            if (0xF0 == (0xF0 & chU8[0]))
            {
              ++cb;
              if (0xF8 == (0xF8 & chU8[0]))
              {
                ++cb;
                if (0xFC == (0xFE & chU8[0]))
                  ++cb;
                else
                  goto l_badutf;
              }
            }
          }
        }
        for(BYTE moreU8 = 0; moreU8 < cb;)
        {
          BYTE b;
          if (!strm.ReadOctet(&b)) goto l_ioerror;
          if (0x80 != (0xC0 & b)) goto l_badutf; // chU8[1..n] must be 0b10xxxxxx
          chU8[++moreU8] = b;
        }
        ++cb; // cb is now the full byte count of the sequence
        cchWC = UTF8ToWC_Prepare((LPCSTR)chU8,cb);
        if (!cchWC) goto l_badutf;
        if (cchBuf <= cchWC) goto l_lineoverflow;
        cchWC = UTF8ToWC_Convert((LPCSTR)chU8,cb,Buffer,cchWC);
      }
      if (CompleteLine(Buffer,cchWC,cchBuf,true)) goto l_success;
    }
  }
#ifdef _WIN32
  else if (StreamEncoding().IsUTF16LE())
  {
    unsigned short lead, trail, cchWC;
    for(;;)
    {
      if (!strm.ReadInt16(&lead)) goto l_ioerror;
      FIX_ENDIAN_INT16LETOHOST_INPLACE(lead);
      if (IsTrailSurrogateUTF16(lead)) goto l_badutf;
      UINT32 codpt = lead;
      // Same capacity rule as the other branches: room for the data + \0
      if (cchBuf <= 1) goto l_lineoverflow;
      Buffer[0] = lead;
      cchWC = 0;
      if (IsLeadSurrogateUTF16(lead))
      {
        if (!strm.ReadInt16(&trail)) goto l_ioerror;
        FIX_ENDIAN_INT16LETOHOST_INPLACE(trail);
        if (!IsTrailSurrogateUTF16(trail)) goto l_badutf;
        codpt = CodePointFromUTF16SurrogatePair(lead,trail);
        // A surrogate pair needs two wchar_t's + \0
        if (cchBuf <= 2) goto l_lineoverflow;
        Buffer[1] = trail;
        ++cchWC;
      }
      if (!IsValidUnicodeCodePoint(codpt)) goto l_badutf;
      if (CompleteLine(Buffer,++cchWC,cchBuf,true)) goto l_success;
    }
  }
#endif
  else if (StreamEncoding().IsUnicode())
  {
    goto l_unsupportedencoding;
  }
  else
  {
    const UINT cp = StreamEncoding().GetCodepage();
    UINT mbtowcflags = 0;
    // Strict conversion is only requested for these codepages
    if (cp < 50220 && cp != 42) mbtowcflags = MB_ERR_INVALID_CHARS;
    for(;;)
    {
      BYTE bufMB[2];
      BYTE mb = 0;
      if (!strm.ReadOctet(&bufMB[0])) goto l_ioerror;
      if (IsDBCSLeadByteEx(cp,bufMB[0]))
      {
        if (!strm.ReadOctet(&bufMB[++mb])) goto l_ioerror;
      }
      ++mb; // mb is now the byte count of this character
      UINT cchWC = MultiByteToWideChar(cp,mbtowcflags,(LPCSTR)bufMB,mb,0,0);
      if (!cchWC) goto l_badencoding;
      if (cchBuf <= cchWC) goto l_lineoverflow;
      cchWC = MultiByteToWideChar(cp,mbtowcflags,(LPCSTR)bufMB,mb,Buffer,cchWC);
      if (CompleteLine(Buffer,cchWC,cchBuf,false)) goto l_success;
    }
  }
l_ioerror:
  *Buffer = 0;
  return NStream::ERR_IOERROR;
l_lineoverflow:
  *Buffer = 0;
  return NStream::ERR_BUFFEROVERFLOW;
l_badutf:
l_badencoding:
  *Buffer = 0;
  return NStream::ERR_INVALIDENCODING;
l_unsupportedencoding:
  *Buffer = 0;
  return NStream::ERR_UNSUPPORTEDENCODING;
l_success:
  *Buffer = 0;
  // "Foo\r\nBar" is 2 and not 3 lines
  const wchar_t chThisNL = *--Buffer, chPrevNL = m_PrevNL;
  const bool onlyNL = ++cchBuf == cchFullBuf;
  m_PrevNL = chThisNL;
  if (onlyNL && (chPrevNL^chThisNL) == ('\r'^'\n'))
  {
    m_PrevNL = 0;
    goto l_restart; // Previous line was "Foo\r". This line was "\n", ignore it.
  }
  return NStream::OK;
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue