From d387a32658c0468928dc9c5decc1c57d2b3d90d1 Mon Sep 17 00:00:00 2001 From: anders_k Date: Sat, 27 Jul 2013 23:09:54 +0000 Subject: [PATCH] FileRead in Unicode installers can handle DBCS, conversion output is limited to UCS-2 git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6399 212acab6-be3b-0410-9dea-997c60f758d6 --- Docs/src/file.but | 2 +- Docs/src/history.but | 4 ++++ Source/exehead/exec.c | 30 +++++++++++++++++++----------- Source/script.cpp | 2 +- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/Docs/src/file.but b/Docs/src/file.but index 08802c30..ac4e926a 100644 --- a/Docs/src/file.but +++ b/Docs/src/file.but @@ -26,7 +26,7 @@ Reads a string (ANSI characters) from a file opened with \R{FileOpen}{FileOpen}. \NsisWarnBlockContainerBegin \NsisBlockHeaderExeheadU \#{This is a bug in exehead but it is probably a good idea to document it here...} -The \R{intro-unicode}{Unicode} version can only read text encoded with a single byte character set! The \NsisACPcp is used during the conversion. +DBCS text is supported but conversion output is limited to UCS-2/BMP, surrogate pairs are not supported. The \NsisACPcp is used during the conversion. \NsisWarnBlockContainerEnd diff --git a/Docs/src/history.but b/Docs/src/history.but index 3b76f880..6ae8145d 100644 --- a/Docs/src/history.but +++ b/Docs/src/history.but @@ -8,6 +8,10 @@ Released on ? \S2{} Minor Changes +\b FileRead in Unicode installers can handle DBCS, conversion output is limited to UCS-2. + +\b FileRead in Unicode installers now uses the Unicode replacement character (U+FFFD) for invalid characters and not '?'. + \b FileReadByte no longer performs a Unicode conversion on non-ASCII characters \H{v3.0a1} 3.0 Alpha 1 diff --git a/Source/exehead/exec.c b/Source/exehead/exec.c index 4a9a1062..2233ebb4 100644 --- a/Source/exehead/exec.c +++ b/Source/exehead/exec.c @@ -1406,7 +1406,7 @@ static int NSISCALL ExecuteEntry(entry *entry_) #endif { TCHAR *textout=var1; - int rpos=0; + int rpos=0, ungetseek=sizeof(TCHAR); TCHAR *hptr=var0; int maxlen=GetIntFromParm(2); if (maxlen<1) break; @@ -1419,30 +1419,38 @@ static int NSISCALL ExecuteEntry(entry *entry_) { TCHAR c; #ifdef _UNICODE + c=0; // Make sure high byte is 0 for FileReadByte if (which==EW_FGETS && !parm3) { - /* BUGBUG: - How is MBTWC supposed to be able to determine the correct WCHAR for a multibyte string when it only has 1 byte to look at? - And what if the multibyte character needs two WCHARs? - */ - char tmpc; - if (!myReadFile(h,&tmpc,1)) break; - if (0==MultiByteToWideChar(CP_ACP, 0, &tmpc, 1, &c, 1)) c = _T('?'); + char tmpc[2]; + DWORD mbtwcflags=MB_ERR_INVALID_CHARS, cbio; + if (!ReadFile(h,tmpc,2,&cbio,NULL) || !cbio) break; + ungetseek=cbio; + for(;;) // Try to parse as DBCS first, if that fails try again as a single byte + { + // BUGBUG: Limited to UCS-2/BMP, surrogate pairs are not supported. + if (MultiByteToWideChar(CP_ACP,mbtwcflags,tmpc,cbio,&c,1)) break; + c=0xfffd; // Unicode replacement character + // If we read 2 bytes and it was not a DBCS character, we need to seek -1 + if (--cbio) SetFilePointer(h,-(--ungetseek),NULL,FILE_CURRENT); else break; + } } else #endif { - if (!myReadFile(h,&c,1)) break; + // Read 1 TCHAR (FileReadUTF16LE and (Ansi)FileRead) or + // parm3 bytes (FileReadByte and (Unicode)FileReadWord) + if (!myReadFile(h,&c,!parm3 ? sizeof(TCHAR) : sizeof(TCHAR) > 1 ? parm3 : 1)) break; } if (parm3) { - myitoa(textout,(unsigned char)c); + myitoa(textout,(UINT)(_TUCHAR)c); return 0; } if (lc == _T('\r') || lc == _T('\n')) { if (lc == c || (c != _T('\r') && c != _T('\n'))) - SetFilePointer(h,-((int)(sizeof(c))),NULL,FILE_CURRENT); + SetFilePointer(h,-((int)ungetseek),NULL,FILE_CURRENT); else textout[rpos++]=c; break; diff --git a/Source/script.cpp b/Source/script.cpp index d153f2cb..57725b4b 100644 --- a/Source/script.cpp +++ b/Source/script.cpp @@ -5872,7 +5872,7 @@ int CEXEBuild::doCommand(int which_token, LineParser &line) ent.offsets[0]=GetUserVarIndex(line, 1); // file handle ent.offsets[1]=GetUserVarIndex(line, 2); // output string ent.offsets[2]=add_asciistring(_T("1")); - ent.offsets[3]=1; + ent.offsets[3]=2; if (ent.offsets[0]<0 || ent.offsets[1]<0) PRINTHELP() SCRIPT_MSG(_T("FileReadWord: %s->%s\n"),line.gettoken_str(1),line.gettoken_str(2)); return add_entry(&ent);