NSIS/Source/utf.cpp
2013-03-08 19:38:46 +00:00

366 lines
10 KiB
C++

/*
* utf.cpp
*
* This file is a part of NSIS.
*
* Copyright (C) 2011 Anders Kjersem
*
* Licensed under the zlib/libpng license (the "License");
* you may not use this file except in compliance with the License.
*
* Licence details can be found in the file COPYING.
*
* This software is provided 'as-is', without any express or implied
* warranty.
*
*/
#include "utf.h"
#define FIX_ENDIAN_INT16LETOHOST_INPLACE FIX_ENDIAN_INT16_INPLACE
void RawTStrToASCII(const TCHAR*in,char*out,UINT maxcch)
{
const bool empty = !maxcch;
for(; maxcch && *in; --maxcch) *out++ = (char) *in++;
if (!empty) *out = 0;
}
/*\
Counts the 16-bit code units stored at str before the first zero code unit.
The terminator is not part of the returned count.
\*/
UINT StrLenUTF16LE(const void*str)
{
  UINT count = 0;
  for (const unsigned short *unit = (const unsigned short *) str; *unit; ++unit)
  {
    ++count;
  }
  return count;
}
/*\
Assigns the string that src points to to dest.
On _WIN32 the pointer is passed to the tstring assignment as a
unsigned short string without any conversion (presumably src is \0 terminated,
verify against the callers). Other platforms are not implemented yet and
stop the build with #error.
Always returns true.
\*/
bool StrSetUTF16LE(tstring&dest, const void*src)
{
#ifdef _WIN32
// The data is used as-is, no conversion takes place
dest = (unsigned short *) src;
#else
#error TODO: UTF16LE to wchar_t
#endif
return true;
}
/*\
Thin wrapper around MultiByteToWideChar for CP_UTF8 that always passes
MB_ERR_INVALID_CHARS. StrU8/cbU8 is the input, Buffer/cchBuf the output.
Returns the MultiByteToWideChar result casted to UINT.
\*/
inline UINT UTF8ToWC_Convert(LPCSTR StrU8,UINT cbU8,wchar_t*Buffer,UINT cchBuf)
{
#ifndef MB_ERR_INVALID_CHARS
  const UINT MB_ERR_INVALID_CHARS = 8; // MSDN says this flag is OK for CP_UTF8
#endif
  const UINT strictFlags = MB_ERR_INVALID_CHARS;
  return (UINT) MultiByteToWideChar(CP_UTF8,strictFlags,StrU8,cbU8,Buffer,cchBuf);
}
/*\
Runs UTF8ToWC_Convert on StrU8/cbU8 without an output buffer (null pointer
and a size of 0) and returns its result. ReadLine uses the result as the
number of wchar_t's the real conversion is going to need.
\*/
inline UINT UTF8ToWC_Prepare(LPCSTR StrU8,UINT cbU8)
{
  wchar_t*const noOutput = 0;
  const UINT cchNeeded = UTF8ToWC_Convert(StrU8,cbU8,noOutput,0);
  return cchNeeded;
}
wchar_t* DupWCFromBytes(void*Buffer,UINT cbBuffer,WORD SrcCP)
{
  /*\
  Converts a buffer encoded with SrcCP to a \0 terminated wchar_t malloc'ed buffer.
  Buffer is cbBuffer bytes long, it does not have to be terminated.
  Returns 0 if malloc failed or -1 if conversion to wchar_t failed,
  the caller must check for both before using or free()'ing the result.
  \*/
  NStreamEncoding srcenc(SrcCP);
  wchar_t*pwc = 0;
#ifdef _WIN32
  if (srcenc.IsUTF16LE())
  {
    // Assuming wchar_t==UTF16LE
    // Round the data up to whole code units. With a odd cbBuffer the
    // terminator would otherwise start at a odd offset; the last data byte
    // would pair up with its first byte and the real terminating code unit
    // would straddle the end of the allocation.
    const size_t cchData = ((size_t) cbBuffer + (sizeof(wchar_t) - 1)) / sizeof(wchar_t);
    const size_t cbAlloc = (cchData + 1) * sizeof(wchar_t);
    pwc = (wchar_t*) malloc(cbAlloc);
    if (!pwc) return pwc;
    memcpy(pwc, Buffer, cbBuffer);
    // Zero everything after the copied bytes, this pads a trailing odd byte and terminates the string
    memset(((char*)pwc) + cbBuffer, 0, cbAlloc - cbBuffer);
    return pwc;
  }
  // TODO: MBTWC on Windows is lame, we are going to fail if SrcCP is UTF16BE or UTF32
#endif
  // First pass without a output buffer to get the required size
  UINT cchW = MultiByteToWideChar(SrcCP,0,(char*)Buffer,cbBuffer,0,0);
  // A count of 0 is only treated as a failure if the input is at least one code unit long
  if (!cchW && NStreamEncoding::GetCodeUnitSize(SrcCP) <= cbBuffer)
  {
    return (wchar_t*)-1;
  }
  pwc = (wchar_t*) malloc((cchW+1)*sizeof(wchar_t));
  if (!pwc) return pwc;
  // Second pass performs the conversion, don't hand out a uninitialized buffer if it fails
  if (cchW && !MultiByteToWideChar(SrcCP,0,(char*)Buffer,cbBuffer,pwc,cchW))
  {
    free(pwc);
    return (wchar_t*)-1;
  }
  pwc[cchW] = L'\0';
  return pwc;
}
UINT DetectUTFBOM(FILE*strm)
{
  /*\
  Looks for a byte order mark at the current read position of strm.
  A recognized BOM is eaten and the matching NStreamEncoding value is returned.
  Returns 0 if the stream is at EOF or no BOM was recognized, the bytes that
  were read are pushed back with ungetc in that case.
  NOTE: ungetc is only guaranteed to support 1 pushback,
  lets hope no MBCS file starts with parts of a BOM.
  \*/
  const int first = fgetc(strm);
  if (EOF == first) return 0;

  if (0xef == first)
  {
    // Could be the UTF-8 BOM: EF BB BF
    const int second = fgetc(strm);
    if (0xbb == second)
    {
      const int third = fgetc(strm);
      if (0xbf == third) return NStreamEncoding::UTF8;
      ungetc(third,strm);
    }
    ungetc(second,strm);
  }
  else if (0xfe == first || 0xff == first || 0x00 == first)
  {
    const int second = fgetc(strm);
    const int third = fgetc(strm);
    const bool utf16le = 0xff == first && 0xfe == second; // FF FE
    const bool utf16be = 0xfe == first && 0xff == second; // FE FF
    if (utf16le || utf16be)
    {
      if (utf16le && 0 == third)
      {
        // FF FE 00 00 is the UTF-32LE BOM
        const int fourth = fgetc(strm);
        if (0 == fourth) return NStreamEncoding::UTF32LE;
        ungetc(fourth,strm);
      }
      ungetc(third,strm); // The third byte is not part of the 2 byte BOM
      return utf16le ? NStreamEncoding::UTF16LE : NStreamEncoding::UTF16BE;
    }
    if (0 == first && 0 == second && 0xfe == third)
    {
      // 00 00 FE FF is the UTF-32BE BOM
      const int fourth = fgetc(strm);
      if (0xff == fourth) return NStreamEncoding::UTF32BE;
      ungetc(fourth,strm);
    }
    ungetc(third,strm);
    ungetc(second,strm);
  }
  ungetc(first,strm);
  return 0;
}
/*\
Translates a encoding name to a NStreamEncoding value.
The names ACP, OEM, UTF8, UTF16LE and UTF16BE are compared with _tcsicmp.
A string starting with CP (checked with S7IsChEqualI) has the text after
the prefix parsed with _tstoi, the number is returned if it is larger than 0
and smaller than NStreamEncoding::CPCOUNT.
Everything else returns NStreamEncoding::UNKNOWN.
\*/
WORD GetEncodingFromString(const TCHAR*s)
{
  if (0 == _tcsicmp(s,_T("ACP"))) return NStreamEncoding::ACP;
  if (0 == _tcsicmp(s,_T("OEM"))) return NStreamEncoding::OEMCP;
  if (0 == _tcsicmp(s,_T("UTF8"))) return NStreamEncoding::UTF8;
  if (0 == _tcsicmp(s,_T("UTF16LE"))) return NStreamEncoding::UTF16LE;
  if (0 == _tcsicmp(s,_T("UTF16BE"))) return NStreamEncoding::UTF16BE;

  // s[1] is only inspected when s[0] matched and therefore is not the terminator
  const bool hasCPPrefix = S7IsChEqualI('C',s[0]) && S7IsChEqualI('P',s[1]);
  if (hasCPPrefix)
  {
    const int number = _tstoi(s + 2);
    const bool inRange = number > 0 && number < NStreamEncoding::CPCOUNT;
    if (inRange) return (WORD) number;
  }
  return NStreamEncoding::UNKNOWN;
}
/*\
Copies a short display name for the code page CP to Buf.
The special encodings get a fixed name (ACP, OEM, UTF8, UTF16LE, ...),
other code pages below CPCOUNT are formatted as CP<number> and
everything else is displayed as ?.
The size of Buf is not checked; the fixed names need up to 8 TCHARs
including the terminator and the CP<number> form is built in a 10 TCHAR buffer.
\*/
void NStreamEncoding::GetCPDisplayName(WORD CP, TCHAR*Buf)
{
  TCHAR numbered[10];
  // Pointer to const, the literals below must not be reachable through a writable pointer
  const TCHAR *name = numbered;
  switch(CP)
  {
  case ACP: name = _T("ACP"); break;
  case OEMCP: name = _T("OEM"); break;
  case UTF16LE: name = _T("UTF16LE"); break;
  case UTF16BE: name = _T("UTF16BE"); break;
  case UTF32LE: name = _T("UTF32LE"); break;
  case UTF32BE: name = _T("UTF32BE"); break;
  case UTF8: name = _T("UTF8"); break;
  default:
    // Only format the number when it is actually going to be displayed
    if (CP >= NStreamEncoding::CPCOUNT)
      name = _T("?");
    else
      _stprintf(numbered,_T("CP%u"),CP);
  }
  _tcscpy(Buf,name);
}
tstring NStreamLineReader::GetErrorMessage(UINT Error, const TCHAR*Filename, UINT Line)
{
  /*\
  Builds the message for a NStream error code, the message always ends with \n.
  "Filename:Line" is appended for the line too long error and for the
  default case (bad text encoding) if Filename is not NULL. The I/O error and
  unsupported encoding messages never include the location.
  \*/
  TCHAR scratch[40]; // Holds the encoding name or the line number
  tstring text;
  bool withLocation = true;
  switch(Error)
  {
  case NStream::ERR_BUFFEROVERFLOW:
    text = _T("Line too long: ");
    break;
  case NStream::ERR_IOERROR:
    text = _T("I/O error");
    withLocation = false;
    break;
  case NStream::ERR_UNSUPPORTEDENCODING:
    StreamEncoding().GetCPDisplayName(scratch);
    text = scratch;
    text += _T(" is not supported");
    withLocation = false;
    break;
  default:
    text = _T("Bad text encoding: ");
    break;
  }
  if (withLocation && Filename)
  {
    _stprintf(scratch,_T("%u"),Line);
    text += Filename;
    text += _T(":");
    text += scratch;
  }
  text += _T("\n");
  return text;
}
UINT NStreamLineReader::ReadLine(wchar_t*Buffer, UINT cchBuf)
{
  /*\
  Reads from the associated stream until it finds a new-line or
  the read fails (I/O error or EOF). It fails with ERR_BUFFEROVERFLOW if
  cchBuf-1 wchar_t's are read without finding the end of the line.
  Buffer MUST be a valid pointer, it will be \0 terminated as long as cchBuf > 0.

  Returns NStream::OK when a line was completed, otherwise
  ERR_IOERROR (a read from the stream failed),
  ERR_BUFFEROVERFLOW (line too long or cchBuf is 0),
  ERR_INVALIDENCODING (malformed input) or
  ERR_UNSUPPORTEDENCODING (Unicode encodings other than UTF-8 and UTF-16LE).

  NOTE: The code relies on CompleteLine advancing Buffer and reducing cchBuf
  for every character that is stored, see l_success. (CompleteLine is not
  defined in this file, verify against its implementation.)
  \*/
  if (!cchBuf) return NStream::ERR_BUFFEROVERFLOW;
#ifndef MB_ERR_INVALID_CHARS
  const UINT MB_ERR_INVALID_CHARS = 8;
#endif
  const UINT cchFullBuf = cchBuf; // Original size, used to detect a line with only a new-line in it
  NIStream&strm = GetStream();
#ifndef _WIN32
  iconvdescriptor iconvd;
#endif
l_restart:
  // Only supports MBCS and UTF-8 for now...
  if (StreamEncoding().IsUTF8())
  {
    for(;;)
    {
      BYTE cb = 0; // bytes in chU8 -1
      BYTE chU8[6];
      if (!strm.ReadOctet(&chU8[0])) goto l_ioerror;
      UINT cchWC;
#if defined(WIN32) // TODO: Is wchar_t==UTF16LE under cygwin?
      // Fast path if wchar_t == UTF16 and in ASCII range
      if (chU8[0] <= 127 && sizeof(wchar_t) == 2)
      {
        cchWC = ++cb;
        if (cchBuf <= cchWC) goto l_lineoverflow;
        *Buffer = (wchar_t) chU8[0];
      }
      else
#endif
      {
        // Count the continuation bytes announced by the high bits of the lead byte
        if (0xC0 == (0xC0 & chU8[0]))
        {
          ++cb;
          if (0xE0 == (0xE0 & chU8[0]))
          {
            ++cb;
            if (0xF0 == (0xF0 & chU8[0]))
            {
              ++cb;
              if (0xF8 == (0xF8 & chU8[0]))
              {
                ++cb;
                if (0xFC == (0xFE & chU8[0]))
                  ++cb;
                else
                  goto l_badutf;
              }
            }
          }
        }
        for(BYTE moreU8 = 0; moreU8 < cb;)
        {
          BYTE b;
          if (!strm.ReadOctet(&b)) goto l_ioerror;
          if (0x80 != (0xC0 & b)) goto l_badutf; // chU8[1..n] must be 0b10xxxxxx
          chU8[++moreU8] = b;
        }
        ++cb; // cb is now the real number of bytes in chU8
        cchWC = UTF8ToWC_Prepare((LPCSTR)chU8,cb);
        if (!cchWC) goto l_badutf;
        if (cchBuf <= cchWC) goto l_lineoverflow;
        cchWC = UTF8ToWC_Convert((LPCSTR)chU8,cb,Buffer,cchWC);
      }
      if (CompleteLine(Buffer,cchWC,cchBuf,true)) goto l_success;
    }
  }
  else if (StreamEncoding().IsUTF16LE())
  {
#ifndef _WIN32
    // Fixed: The jump used to name a error code and not a label and the
    // static call was qualified with the local variable and not its type.
    if (!iconvd.Open("wchar_t", iconvdescriptor::GetHostEndianUCS4Code())) goto l_unsupportedencoding;
#endif
    unsigned short lead, trail, cchWC;
    for(;;)
    {
      if (!strm.ReadInt16(&lead)) goto l_ioerror;
      FIX_ENDIAN_INT16LETOHOST_INPLACE(lead);
      if (IsTrailSurrogateUTF16(lead)) goto l_badutf; // A trail surrogate cannot come first
      UINT32 codpt = lead;
      if (cchBuf <= 1) goto l_lineoverflow;
      Buffer[0] = lead, cchWC = 1;
      if (IsLeadSurrogateUTF16(lead))
      {
        if (!strm.ReadInt16(&trail)) goto l_ioerror;
        FIX_ENDIAN_INT16LETOHOST_INPLACE(trail);
        if (!IsTrailSurrogateUTF16(trail)) goto l_badutf;
        codpt = CodePointFromUTF16SurrogatePair(lead,trail);
#ifdef _WIN32
        if (cchBuf <= 2) goto l_lineoverflow;
        Buffer[1] = trail, ++cchWC;
#endif
      }
      if (!IsValidUnicodeCodePoint(codpt)) goto l_badutf;
#ifndef _WIN32
      char tmpdest[8]; // Should be plenty of space to store one UCS4 character as wchar_t(s)
      size_t inleft = 4;
      cchWC = iconvd.Convert(&codpt,&inleft,tmpdest,sizeof(tmpdest)) / sizeof(wchar_t);
      if (!cchWC) goto l_badutf;
      if (cchBuf <= cchWC) goto l_lineoverflow;
      for (UINT i = cchWC; i;) --i, Buffer[i] = ((wchar_t*)tmpdest)[i];
#endif
      if (CompleteLine(Buffer,cchWC,cchBuf,true)) goto l_success;
    }
  }
  else if (StreamEncoding().IsUnicode())
  {
    goto l_unsupportedencoding;
  }
  else
  {
    const UINT cp = StreamEncoding().GetCodepage();
    UINT mbtowcflags = 0;
    // Strict conversion is only requested for code pages below 50220 except 42
    if (cp < 50220 && cp != 42) mbtowcflags = MB_ERR_INVALID_CHARS;
    for(;;)
    {
      BYTE bufMB[2];
      BYTE mb = 0;
      if (!strm.ReadOctet(&bufMB[0])) goto l_ioerror;
      if (IsDBCSLeadByteEx(cp,bufMB[0]))
      {
        if (!strm.ReadOctet(&bufMB[++mb])) goto l_ioerror;
      }
      ++mb; // mb is now the number of bytes in bufMB
      UINT cchWC = MultiByteToWideChar(cp,mbtowcflags,(LPCSTR)bufMB,mb,0,0);
      if (!cchWC) goto l_badencoding;
      if (cchBuf <= cchWC) goto l_lineoverflow;
      cchWC = MultiByteToWideChar(cp,mbtowcflags,(LPCSTR)bufMB,mb,Buffer,cchWC);
      if (CompleteLine(Buffer,cchWC,cchBuf,false)) goto l_success;
    }
  }
  // Every exit path terminates the string at the current write position
l_ioerror:
  *Buffer = 0;
  return NStream::ERR_IOERROR;
l_lineoverflow:
  *Buffer = 0;
  return NStream::ERR_BUFFEROVERFLOW;
l_badutf:
l_badencoding:
  *Buffer = 0;
  return NStream::ERR_INVALIDENCODING;
l_unsupportedencoding:
  *Buffer = 0;
  return NStream::ERR_UNSUPPORTEDENCODING;
l_success:
  *Buffer = 0;
  // "Foo\r\nBar" is 2 and not 3 lines
  // Step back to the last stored character and give back its slot in cchBuf,
  // if that restores the original size the line was just a single new-line
  // character and Buffer/cchBuf are back at their initial values.
  const wchar_t chThisNL = *--Buffer, chPrevNL = m_PrevNL;
  const bool onlyNL = ++cchBuf == cchFullBuf;
  m_PrevNL = chThisNL;
  if (onlyNL && (chPrevNL^chThisNL) == ('\r'^'\n'))
  {
    m_PrevNL = 0;
    goto l_restart; // Previous line was "Foo\r". This line was "\n", ignore it.
  }
  return NStream::OK;
}