/* * utf.cpp * * This file is a part of NSIS. * * Copyright (C) 2011-2021 Anders Kjersem * * Licensed under the zlib/libpng license (the "License"); * you may not use this file except in compliance with the License. * * Licence details can be found in the file COPYING. * * This software is provided 'as-is', without any express or implied * warranty. * */ #include "utf.h" #include "util.h" #define FIX_ENDIAN_INT16LETOHOST_INPLACE FIX_ENDIAN_INT16_INPLACE UINT StrLenUTF16(const void*str) { return sizeof(wchar_t) == 2 ? (UINT)wcslen((wchar_t*)str) : InlineStrLenUTF16(str); } bool StrSetUTF16LE(tstring&dest, const void*src) { #ifndef _WIN32 CharEncConv cec; if (!cec.Initialize(-1,NStreamEncoding::UTF16LE)) return false; src = (const void*) cec.Convert(src); if (!src) return false; #endif #ifdef C_ASSERT C_ASSERT(sizeof(tstring::value_type) >= sizeof(wchar_t)); #endif try { dest = (wchar_t*) src; } catch(...) { return false; } return true; } void UTF16InplaceEndianSwap(void*Buffer, UINT cch) { unsigned short *p = (unsigned short *) Buffer; while(cch--) p[cch] = SWAP_ENDIAN_INT16(p[cch]); } inline UINT UTF8ToWC_Convert(LPCSTR StrU8,UINT cbU8,wchar_t*Buffer,UINT cchBuf) { #ifndef MB_ERR_INVALID_CHARS const UINT MB_ERR_INVALID_CHARS = 8; // MSDN says this flag is OK for CP_UTF8 #endif return (UINT) MultiByteToWideChar(CP_UTF8,MB_ERR_INVALID_CHARS,StrU8,cbU8,Buffer,cchBuf); } inline UINT UTF8ToWC_Prepare(LPCSTR StrU8,UINT cbU8) { return UTF8ToWC_Convert(StrU8,cbU8,0,0); } UINT WCFromCodePoint(wchar_t*Dest,UINT cchDest,UINT32 CodPt) { // Don't allow half surrogate pairs if (CodPt >= 0xd800 && CodPt <= 0xdfff) CodPt = UNICODE_REPLACEMENT_CHARACTER; #ifdef _WIN32 if (CodPt <= 0xffff && cchDest) { *Dest = (wchar_t) CodPt; return 1; } else if (cchDest >= 2) { const UINT32 lead_offset = 0xd800 - (0x10000 >> 10); UINT16 lead = lead_offset + (CodPt >> 10), trail = 0xdc00 + (CodPt & 0x3ff); Dest[0] = lead, Dest[1] = trail; return 2; } return 0; #else iconvdescriptor iconvd; if (!iconvd.Open("wchar_t",iconvdescriptor::GetHostEndianUCS4Code())) return 0; size_t inleft = 4; UINT cchW = iconvd.Convert(&CodPt,&inleft,Dest,cchDest*sizeof(wchar_t)) / sizeof(wchar_t); return !inleft ? cchW : 0; #endif } wchar_t* DupWCFromBytes(void*Buffer,UINT cbBuffer,UINT32 SrcCP) { /*\ Converts a buffer encoded with SrcCP to a \0 terminated wchar_t malloc'ed buffer. Returns 0 on failure. \*/ CharEncConv cec; cec.SetAllowOptimizedReturn(!!(SrcCP&DWCFBF_ALLOWOPTIMIZEDRETURN)); if (!cec.Initialize(-1, SrcCP&=~DWCFBF_ALLOWOPTIMIZEDRETURN)) return 0; wchar_t *pWC = (wchar_t*) cec.Convert(Buffer, cbBuffer); return pWC ? (wchar_t*) cec.Detach() : 0; } BOOL CharEncConv::IsValidCodePage(UINT cp) { #ifdef _WIN32 if (cp <= 1 || NStreamEncoding::IsUnicodeCodepage(cp)) return true; // Allow ACP/OEM/UTF* #endif return cp < NStreamEncoding::CPCOUNT && ::IsValidCodePage(cp); } bool CharEncConv::Initialize(UINT32 ToEnc, UINT32 FromEnc) { /* ** Initialize() with a Windows codepage or -1 for wchar_t */ const WORD UTF32LE = NStreamEncoding::UTF32LE; #ifdef _WIN32 if (NStreamEncoding::UTF16LE == ToEnc) ToEnc = -1; if (NStreamEncoding::UTF16LE == FromEnc) FromEnc = -1; #endif m_TE = (WORD) ToEnc, m_FE = (WORD) FromEnc; if ((UTF32LE|1) == (m_FE|1) || (UTF32LE|1) == (m_TE|1)) return false; // UTF-32 is a pain to deal with on Windows #ifdef _WIN32 return (IsWE(m_FE) || IsValidCodePage(FromEnc)) && (IsWE(m_TE) || IsValidCodePage(ToEnc)); #else char f[50], t[COUNTOF(f)]; if (IsWE(m_FE)) strcpy(f, "wchar_t"); else create_code_page_string(f, COUNTOF(f), m_FE); if (IsWE(m_TE)) strcpy(t, "wchar_t"); else create_code_page_string(t, COUNTOF(t), m_TE); return m_TE == m_FE || m_iconvd.Open(t, f); #endif } size_t CharEncConv::GuessOutputSize(size_t cbConverted) { UINT cus = IsWE(m_TE) ? sizeof(wchar_t) : NStreamEncoding::GetCodeUnitSize(m_TE); size_t zt = 1, cch = cbConverted / cus; if (!cch) return 0; switch(cus) { case 1: zt = !((char*)m_Result)[--cch]; break; case 2: zt = !((WORD*)m_Result)[--cch]; break; case 4: zt = !((UINT32*)m_Result)[--cch]; break; } return (cch + (zt ? 0 : 1)) * cus; } void* CharEncConv::Convert(const void*Src, size_t cbSrc, size_t*cbOut) { /* ** Convert() mallocs a buffer and converts Src (as m_FE) to m_TE. ** If cbSrc is -1 the size is calculated. cbOut can be NULL. ** Returns a pointer to the buffer on success or 0 on error. ** The buffer is valid until you call Close() or Convert(). */ #ifdef _WIN32 m_OptimizedReturn = false; #endif if ((size_t)-1 == cbSrc) { UINT cus = IsWE(m_FE) ? sizeof(wchar_t) : NStreamEncoding::GetCodeUnitSize(m_FE); switch(cus) { case 1: cbSrc = strlen((char*)Src); break; case 2: cbSrc = StrLenUTF16(Src); break; //case 4: // No UTF-32 support... default: if (sizeof(wchar_t) > 2 && sizeof(wchar_t) == cus) { cbSrc = wcslen((wchar_t*)Src); break; } assert(0); return 0; } cbSrc = (cbSrc + 1) * cus; } if (m_FE == m_TE) { #ifdef _WIN32 if (m_AllowOptimizedReturn && IsWE(m_FE)) { if (cbOut) { cbSrc /= sizeof(wchar_t); if (cbSrc && ((WORD*)Src)[--cbSrc]) ++cbSrc; *cbOut = cbSrc * sizeof(wchar_t); } m_OptimizedReturn = true; return (void*) (m_Result = (char*) Src); } #endif char *p = (char*) realloc(m_Result, cbSrc + sizeof(UINT32)); if (p) m_Result = p; else return 0; memcpy(p, Src, cbSrc); *((UINT32*)(p+cbSrc)) = 0; if (cbOut) *cbOut = GuessOutputSize(cbSrc); return m_Result; } #ifdef _WIN32 if (!IsWE(m_FE) && !IsWE(m_TE) && NStreamEncoding::UTF16BE != m_TE) { // We need a middle step: Src -> wchar_t -> Target CharEncConv cec; if (!cec.Initialize(-1, m_FE)) return 0; size_t cbConv; char *pWC = (char*) cec.Convert(Src, cbSrc, &cbConv); if (!pWC) return 0; this->m_FE = -1; return this->Convert(pWC, cbConv, cbOut); } if (IsWE(m_FE)) { if (NStreamEncoding::UTF16BE == m_TE) goto l_swapUTF16; cbSrc /= sizeof(wchar_t); UINT cbDest = WideCharToMultiByte(m_TE, 0, (wchar_t*)Src, (int)cbSrc, 0, 0, 0, 0); char *p = (char*) realloc(m_Result, (cbDest + 1) * sizeof(char)); if (p) m_Result = p; else return 0; if (!(cbDest = WideCharToMultiByte(m_TE, 0, (wchar_t*)Src, (int)cbSrc, p, (int)cbDest, 0, 0))) return 0; if (p[--cbDest]) p[++cbDest] = '\0'; // Always \0 terminate if (cbOut) *cbOut = cbDest; // cbOut never includes the \0 terminator } else { UINT cchDest; if (NStreamEncoding::UTF16BE == m_FE) // UTF16BE -> UTF16LE/wchar_t { l_swapUTF16: char *p = (char*) realloc(m_Result, cbSrc + sizeof(wchar_t)); if (p) m_Result = p; else return 0; memcpy(p, Src, cbSrc); cchDest = (UINT) (cbSrc / sizeof(wchar_t)); UTF16InplaceEndianSwap(p, cchDest); if (!cchDest) *((WORD*)p) = 0, ++cchDest; // For "--cchDest" during \0 termination } else { cchDest = MultiByteToWideChar(m_FE, 0, (char*)Src, (int)cbSrc, 0, 0); char *p = (char*) realloc(m_Result, (cchDest + 1) * sizeof(wchar_t)); if (p) m_Result = p; else return 0; if (!(cchDest = MultiByteToWideChar(m_FE, 0, (char*)Src, (int)cbSrc, (LPWSTR)p, (int)cchDest))) return 0; if (NStreamEncoding::UTF16BE == m_TE) UTF16InplaceEndianSwap(p, cchDest); } if (((WORD*)m_Result)[--cchDest]) ((WORD*)m_Result)[++cchDest] = '\0'; if (cbOut) *cbOut = cchDest * sizeof(wchar_t); } #else char *in = (char*) Src; size_t cbConv; if (!nsis_iconv_reallociconv(m_iconvd, &in, &cbSrc, &m_Result, cbConv)) return 0; if (cbOut) *cbOut = GuessOutputSize(cbConv); #endif return m_Result; } #if !defined(_WIN32) || !defined(_UNICODE) bool WCToUTF16LEHlpr::Create(const TCHAR*in, unsigned int codepage) { CharEncConv cec; if (!cec.Initialize(NStreamEncoding::UTF16LE, -1)) return false; if (!cec.Convert(in)) return false; m_s = (unsigned short*) cec.Detach(); return true; } #endif UINT DetectUTFBOM(void*Buffer, UINT cb) { unsigned char *b = (unsigned char*) Buffer; if (cb >= 3 && 0xef == b[0] && 0xbb == b[1] && 0xbf == b[2]) return NStreamEncoding::UTF8; if (cb >= 2) { if (cb >= 4 && !b[0] && !b[1] && 0xfe == b[2] && 0xff == b[3]) return NStreamEncoding::UTF32BE; if (0xff == b[0] && 0xfe == b[1]) return (cb >= 4 && !b[2] && !b[3]) ? NStreamEncoding::UTF32LE : NStreamEncoding::UTF16LE; if (0xfe == b[0] && 0xff == b[1]) return NStreamEncoding::UTF16BE; } return 0; } UINT DetectUTFBOM(FILE*strm) { /*\ Tries to detect a BOM at the current position in a stream. If a BOM is found it is eaten. NOTE: ungetc is only guaranteed to support 1 pushback, lets hope no MBCS file starts with parts of a BOM. \*/ const int b1 = fgetc(strm); if (EOF == b1) return 0; if (0xef == b1) { const int b2 = fgetc(strm); if (0xbb == b2) { const int b3 = fgetc(strm); if (0xbf == b3) return NStreamEncoding::UTF8; ungetc(b3,strm); } ungetc(b2,strm); } if (0xfe == b1 || 0xff == b1 || 0x00 == b1) { const int b2 = fgetc(strm), b3 = fgetc(strm); if (b1 && (b1^b2) == (0xfe^0xff)) { if (0xff == b1 && 0 == b3) { const int b4 = fgetc(strm); if (0 == b4) return NStreamEncoding::UTF32LE; ungetc(b4,strm); } ungetc(b3,strm); return 0xff == b1 ? NStreamEncoding::UTF16LE : NStreamEncoding::UTF16BE; } if (0 == b1 && 0 == b2) { if (0xfe == b3) { const int b4 = fgetc(strm); if (0xff == b4) return NStreamEncoding::UTF32BE; ungetc(b4,strm); } } ungetc(b3,strm); ungetc(b2,strm); } ungetc(b1,strm); return 0; } WORD GetEncodingFromString(const TCHAR*s, bool&BOM) { BOM = false; if (!_tcsicmp(s,_T("ACP"))) return NStreamEncoding::ACP; if (!_tcsicmp(s,_T("OEM"))) return NStreamEncoding::OEMCP; if (!_tcsicmp(s,_T("UTF8"))) return NStreamEncoding::UTF8; if ((!_tcsicmp(s,_T("UTF8SIG")) || !_tcsicmp(s,_T("UTF8BOM"))) && (BOM = true)) return NStreamEncoding::UTF8; if (!_tcsicmp(s,_T("UTF16LE")) || (!_tcsicmp(s,_T("UTF16LEBOM")) && (BOM = true))) return NStreamEncoding::UTF16LE; if (!_tcsicmp(s,_T("UTF16BE")) || (!_tcsicmp(s,_T("UTF16BEBOM")) && (BOM = true))) return NStreamEncoding::UTF16BE; if (S7IsChEqualI('C',*s++) && S7IsChEqualI('P',*s++)) { int cp = _tstoi(s); if (cp > 0 && cp < NStreamEncoding::CPCOUNT) return (WORD) cp; } return NStreamEncoding::UNKNOWN; } WORD GetEncodingFromString(const TCHAR*s) { bool bom; return GetEncodingFromString(s, bom); } void NStreamEncoding::GetCPDisplayName(WORD CP, TCHAR*Buf) { TCHAR mybuf[10]; const TCHAR *p = mybuf; switch(CP) { case ACP: p = _T("ACP"); break; case OEMCP: p = _T("OEM"); break; case UTF16LE: p = _T("UTF16LE"); break; case UTF16BE: p = _T("UTF16BE"); break; case UTF32LE: p = _T("UTF32LE"); break; case UTF32BE: p = _T("UTF32BE"); break; case UTF8: p = _T("UTF8"); break; case BINARY: p = _T("BIN"); break; default: _stprintf(mybuf,_T("CP%u"),CP); if (CP >= NStreamEncoding::CPCOUNT) p = _T("?"); } _tcscpy(Buf,p); } bool NBaseStream::Attach(FILE*hFile, WORD enc, bool Seek /*= true*/) { Close(); m_hFile = hFile; if (!m_hFile) return false; if (!NStream::SetBinaryMode(m_hFile) && m_hFile != stdin) return false; WORD cp = 0; if (enc != NStreamEncoding::BINARY) { fpos_t pos; if (Seek && !fgetpos(m_hFile, &pos)) rewind(m_hFile); else Seek = false; cp = DetectUTFBOM(m_hFile); if (Seek) { fsetpos(m_hFile, &pos); if (cp) DetectUTFBOM(m_hFile); // parseScript() etc does not like the BOM, make sure we skip past it } } if (!cp) cp = enc; m_Enc.SafeSetCodepage(cp); return true; } bool NOStream::WriteString(const wchar_t*Str, size_t cch /*= -1*/) { CharEncConv cec; if (!cec.Initialize(m_Enc.GetCodepage(), -1)) return false; cec.SetAllowOptimizedReturn(true); if ((size_t)-1 != cch) cch *= sizeof(wchar_t); // cec.Convert wants byte count size_t cbConv; char *p = (char*) cec.Convert(Str, cch, &cbConv); return p && WriteOctets(p, cbConv); } bool NOStream::WritePlatformNLString(const wchar_t*Str, size_t cch /*= -1*/) { #ifdef _WIN32 size_t cch2 = 0, nlcount = 0; for(; cch2 < cch && Str[cch2]; ++cch2) if (L'\n' == Str[cch2]) ++nlcount; if (nlcount) { cch = cch2 + nlcount; wchar_t chPrev = 0, *buf = (wchar_t*) malloc(cch * sizeof(wchar_t)); if (!buf) return false; for(size_t s = 0, d = 0; d < cch; ++s, ++d) { if (L'\n' == Str[s]) { if (L'\r' != chPrev) buf[d++] = L'\r'; else --cch; } buf[d] = chPrev = Str[s]; } bool retval = WriteString(buf, cch); free(buf); return retval; } #endif return WriteString(Str, cch); } tstring NStreamLineReader::GetErrorMessage(UINT Error, const TCHAR*Filename, UINT Line) { tstring msg; TCHAR buf[40]; switch(Error) { case NStream::ERR_BUFFEROVERFLOW: msg = _T("Line too long"); break; case NStream::ERR_IOERROR: msg = _T("I/O error"), Filename = 0; break; case NStream::ERR_UNSUPPORTEDENCODING: StreamEncoding().GetCPDisplayName(buf); msg = tstring(buf) + _T(" is not supported"), Filename = 0; break; default: msg = _T("Bad text encoding"); break; } if (Filename) { const TCHAR *filelinesep = *Filename ? _T(":") : _T(""); _stprintf(buf,_T("%") NPRIs _T("%u"),filelinesep,Line); msg = msg + _T(": ") + Filename + buf; } return msg + _T("\n"); } UINT NStreamLineReader::ReadLine(wchar_t*Buffer, UINT cchBuf) { /*\ Reads from the associated stream until it finds a new-line or the read fails (I/O error or EOF). It fails with ERR_BUFFEROVERFLOW if cchBuf-1 wchar_t's are read without finding the end of the line. Buffer MUST be a valid pointer, it will be \0 terminated as long as cchBuf > 0. \*/ if (!cchBuf) return NStream::ERR_BUFFEROVERFLOW; #ifndef MB_ERR_INVALID_CHARS const UINT MB_ERR_INVALID_CHARS = 8; #endif const UINT cchFullBuf = cchBuf; NIStream&strm = GetStream(); #ifndef _WIN32 iconvdescriptor iconvd; #endif l_restart: if (StreamEncoding().IsUTF8()) { for(;;) { BYTE cb = 0; // bytes in chU8 -1 BYTE chU8[6]; if (!strm.ReadOctet(&chU8[0])) goto l_ioerror; UINT cchWC; #if defined(_WIN32) || defined(__CYGWIN__) // wchar_t==UTF16LE on Cygwin: www.mail-archive.com/bug-gnulib@gnu.org/msg21543.html // Fast path if wchar_t == UTF16 and in ASCII range if (chU8[0] <= 127 && sizeof(wchar_t) == 2) { cchWC = ++cb; if (cchBuf <= cchWC) goto l_lineoverflow; *Buffer = (wchar_t) chU8[0]; } else #endif { if (!UTF8_GetTrailCount(chU8[0], cb)) goto l_badutf; for(BYTE moreU8 = 0; moreU8 < cb;) { BYTE b; if (!strm.ReadOctet(&b)) goto l_ioerror; if (0x80 != (0xC0 & b)) goto l_badutf; // chU8[1..n] must be 0b10xxxxxx chU8[++moreU8] = b; } ++cb; cchWC = UTF8ToWC_Prepare((LPCSTR)chU8,cb); if (!cchWC) goto l_badutf; if (cchBuf <= cchWC) goto l_lineoverflow; cchWC = UTF8ToWC_Convert((LPCSTR)chU8,cb,Buffer,cchWC); } if (CompleteLine(Buffer,cchWC,cchBuf,true)) goto l_success; } } else if (StreamEncoding().IsUTF16()) { #ifndef _WIN32 if (!iconvd.Open("wchar_t", iconvdescriptor::GetHostEndianUCS4Code())) goto l_unsupportedencoding; #endif const bool utf16be = StreamEncoding().IsUTF16BE(); unsigned short lead, trail, cchWC; for(;;) { if (!strm.ReadInt16(&lead)) goto l_ioerror; FIX_ENDIAN_INT16LETOHOST_INPLACE(lead); if (utf16be) lead = SWAP_ENDIAN_INT16(lead); if (IsTrailSurrogateUTF16(lead)) goto l_badutf; UINT32 codpt = lead; if (cchBuf <= 1) goto l_lineoverflow; Buffer[0] = lead, cchWC = 1; if (IsLeadSurrogateUTF16(lead)) { if (!strm.ReadInt16(&trail)) goto l_ioerror; FIX_ENDIAN_INT16LETOHOST_INPLACE(trail); if (utf16be) trail = SWAP_ENDIAN_INT16(trail); if (!IsTrailSurrogateUTF16(trail)) goto l_badutf; codpt = CodePointFromUTF16SurrogatePair(lead,trail); #ifdef _WIN32 if (cchBuf <= 2) goto l_lineoverflow; Buffer[1] = trail, ++cchWC; #endif } if (!IsValidUnicodeCodePoint(codpt)) goto l_badutf; #ifndef _WIN32 char tmpdest[8]; // Should be plenty of space to store one UCS4 character as wchar_t(s) size_t inleft = 4; cchWC = iconvd.Convert(&codpt,&inleft,tmpdest,sizeof(tmpdest)) / sizeof(wchar_t); if (!cchWC) goto l_badutf; if (cchBuf <= cchWC) goto l_lineoverflow; for (UINT i = cchWC; i;) --i, Buffer[i] = ((wchar_t*)tmpdest)[i]; #endif if (CompleteLine(Buffer,cchWC,cchBuf,true)) goto l_success; } } else if (StreamEncoding().IsUnicode()) { goto l_unsupportedencoding; } else { const UINT cp = StreamEncoding().GetCodepage(); UINT mbtowcflags = (cp < 50220 && cp != 42) ? MB_ERR_INVALID_CHARS : 0; for(;;) { BYTE bufMB[2]; BYTE mb = 0; if (!strm.ReadOctet(&bufMB[0])) goto l_ioerror; if (IsDBCSLeadByteEx(cp,bufMB[0])) { if (!strm.ReadOctet(&bufMB[++mb])) goto l_ioerror; } ++mb; UINT cchWC = MultiByteToWideChar(cp,mbtowcflags,(LPCSTR)bufMB,mb,0,0); if (!cchWC) goto l_badencoding; if (cchBuf <= cchWC) goto l_lineoverflow; cchWC = MultiByteToWideChar(cp,mbtowcflags,(LPCSTR)bufMB,mb,Buffer,cchWC); if (CompleteLine(Buffer,cchWC,cchBuf,false)) goto l_success; } } l_ioerror: *Buffer = 0; return NStream::ERR_IOERROR; l_lineoverflow: *Buffer = 0; return NStream::ERR_BUFFEROVERFLOW; l_badutf: l_badencoding: *Buffer = 0; return NStream::ERR_INVALIDENCODING; l_unsupportedencoding: *Buffer = 0; return NStream::ERR_UNSUPPORTEDENCODING; l_success: *Buffer = 0; // "Foo\r\nBar" is 2 and not 3 lines const wchar_t chThisNL = *--Buffer, chPrevNL = m_PrevNL; const bool onlyNL = ++cchBuf == cchFullBuf; m_PrevNL = chThisNL; if (onlyNL && (chPrevNL^chThisNL) == ('\r'^'\n')) { m_PrevNL = 0; goto l_restart; // Previous line was "Foo\r". This line was "\n", ignore it. } return NStream::OK; }