From d387a32658c0468928dc9c5decc1c57d2b3d90d1 Mon Sep 17 00:00:00 2001
From: anders_k <anders_k@212acab6-be3b-0410-9dea-997c60f758d6>
Date: Sat, 27 Jul 2013 23:09:54 +0000
Subject: [PATCH] FileRead in Unicode installers can handle DBCS, conversion
 output is limited to UCS-2

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6399 212acab6-be3b-0410-9dea-997c60f758d6
---
 Docs/src/file.but     |  2 +-
 Docs/src/history.but  |  4 ++++
 Source/exehead/exec.c | 30 +++++++++++++++++++-----------
 Source/script.cpp     |  2 +-
 4 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/Docs/src/file.but b/Docs/src/file.but
index 08802c30..ac4e926a 100644
--- a/Docs/src/file.but
+++ b/Docs/src/file.but
@@ -26,7 +26,7 @@ Reads a string (ANSI characters) from a file opened with \R{FileOpen}{FileOpen}.
 \NsisWarnBlockContainerBegin
 \NsisBlockHeaderExeheadU
 \#{This is a bug in exehead but it is probably a good idea to document it here...}
-The \R{intro-unicode}{Unicode} version can only read text encoded with a single byte character set! The \NsisACPcp is used during the conversion.
+DBCS text is supported, but conversion output is limited to UCS-2/BMP; surrogate pairs are not supported. The \NsisACPcp is used during the conversion.
 \NsisWarnBlockContainerEnd
 
 
diff --git a/Docs/src/history.but b/Docs/src/history.but
index 3b76f880..6ae8145d 100644
--- a/Docs/src/history.but
+++ b/Docs/src/history.but
@@ -8,6 +8,10 @@ Released on ?
 
 \S2{} Minor Changes
 
+\b FileRead in Unicode installers can handle DBCS; conversion output is limited to UCS-2.
+
+\b FileRead in Unicode installers now uses the Unicode replacement character (U+FFFD) instead of '?' for invalid characters.
+
 \b FileReadByte no longer performs a Unicode conversion on non-ASCII characters
 
 \H{v3.0a1} 3.0 Alpha 1
diff --git a/Source/exehead/exec.c b/Source/exehead/exec.c
index 4a9a1062..2233ebb4 100644
--- a/Source/exehead/exec.c
+++ b/Source/exehead/exec.c
@@ -1406,7 +1406,7 @@ static int NSISCALL ExecuteEntry(entry *entry_)
 #endif
       {
         TCHAR *textout=var1;
-        int rpos=0;
+        int rpos=0, ungetseek=sizeof(TCHAR);
         TCHAR *hptr=var0;
         int maxlen=GetIntFromParm(2);
         if (maxlen<1) break;
@@ -1419,30 +1419,38 @@ static int NSISCALL ExecuteEntry(entry *entry_)
           {
             TCHAR c;
 #ifdef _UNICODE
+            c=0; // Make sure high byte is 0 for FileReadByte
             if (which==EW_FGETS && !parm3)
             {
-              /* BUGBUG:
-              How is MBTWC supposed to be able to determine the correct WCHAR for a multibyte string when it only has 1 byte to look at?
-              And what if the multibyte character needs two WCHARs?
-              */
-              char tmpc;
-              if (!myReadFile(h,&tmpc,1)) break;
-              if (0==MultiByteToWideChar(CP_ACP, 0, &tmpc, 1, &c, 1)) c = _T('?');
+              char tmpc[2];
+              DWORD mbtwcflags=MB_ERR_INVALID_CHARS, cbio;
+              if (!ReadFile(h,tmpc,2,&cbio,NULL) || !cbio) break;
+              ungetseek=cbio;
+              for(;;) // Try to parse as DBCS first, if that fails try again as a single byte
+              {
+                // BUGBUG: Limited to UCS-2/BMP, surrogate pairs are not supported.
+                if (MultiByteToWideChar(CP_ACP,mbtwcflags,tmpc,cbio,&c,1)) break;
+                c=0xfffd; // Unicode replacement character
+                // If we read 2 bytes and it was not a DBCS character, we need to seek -1
+                if (--cbio) SetFilePointer(h,-(--ungetseek),NULL,FILE_CURRENT); else break;
+              }
             }
             else
 #endif
             {
-              if (!myReadFile(h,&c,1)) break;
+              // Read 1 TCHAR (FileReadUTF16LE and (Ansi)FileRead) or 
+              // parm3 bytes (FileReadByte and (Unicode)FileReadWord)
+              if (!myReadFile(h,&c,!parm3 ? sizeof(TCHAR) : sizeof(TCHAR) > 1 ? parm3 : 1)) break;
             }
             if (parm3)
             {
-              myitoa(textout,(unsigned char)c);
+              myitoa(textout,(UINT)(_TUCHAR)c);
               return 0;
             }
             if (lc == _T('\r') || lc == _T('\n'))
             {
               if (lc == c || (c != _T('\r') && c != _T('\n')))
-                SetFilePointer(h,-((int)(sizeof(c))),NULL,FILE_CURRENT);
+                SetFilePointer(h,-((int)ungetseek),NULL,FILE_CURRENT);
               else
                 textout[rpos++]=c;
               break;
diff --git a/Source/script.cpp b/Source/script.cpp
index d153f2cb..57725b4b 100644
--- a/Source/script.cpp
+++ b/Source/script.cpp
@@ -5872,7 +5872,7 @@ int CEXEBuild::doCommand(int which_token, LineParser &line)
       ent.offsets[0]=GetUserVarIndex(line, 1); // file handle
       ent.offsets[1]=GetUserVarIndex(line, 2); // output string
       ent.offsets[2]=add_asciistring(_T("1"));
-      ent.offsets[3]=1;
+      ent.offsets[3]=2;
       if (ent.offsets[0]<0 || ent.offsets[1]<0) PRINTHELP()
       SCRIPT_MSG(_T("FileReadWord: %s->%s\n"),line.gettoken_str(1),line.gettoken_str(2));
     return add_entry(&ent);