FileRead in Unicode installers can handle DBCS, conversion output is limited to UCS-2

git-svn-id: https://svn.code.sf.net/p/nsis/code/NSIS/trunk@6399 212acab6-be3b-0410-9dea-997c60f758d6
2013-07-27 23:09:54 +00:00 · 2013-07-27 23:09:54 +00:00 · d387a32658
commit d387a32658
parent 7ce021a376
4 changed files with 25 additions and 13 deletions
--- a/Docs/src/file.but
+++ b/Docs/src/file.but
@ -26,7 +26,7 @@ Reads a string (ANSI characters) from a file opened with \R{FileOpen}{FileOpen}.
 \NsisWarnBlockContainerBegin
 \NsisBlockHeaderExeheadU
 \#{This is a bug in exehead but it is probably a good idea to document it here...}
-The \R{intro-unicode}{Unicode} version can only read text encoded with a single byte character set! The \NsisACPcp is used during the conversion.
+DBCS text is supported, but conversion output is limited to UCS-2/BMP; surrogate pairs are not supported. The \NsisACPcp is used during the conversion.
 \NsisWarnBlockContainerEnd


--- a/Docs/src/history.but
+++ b/Docs/src/history.but
@ -8,6 +8,10 @@ Released on ?

 \S2{} Minor Changes

+\b FileRead in Unicode installers can handle DBCS, but conversion output is limited to UCS-2.
+
+\b FileRead in Unicode installers now uses the Unicode replacement character (U+FFFD) for invalid characters instead of '?'.
+
 \b FileReadByte no longer performs a Unicode conversion on non-ASCII characters

 \H{v3.0a1} 3.0 Alpha 1
--- a/Source/exehead/exec.c
+++ b/Source/exehead/exec.c
@ -1406,7 +1406,7 @@ static int NSISCALL ExecuteEntry(entry *entry_)
 #endif
      {
        TCHAR *textout=var1;
-        int rpos=0;
+        int rpos=0, ungetseek=sizeof(TCHAR);
        TCHAR *hptr=var0;
        int maxlen=GetIntFromParm(2);
        if (maxlen<1) break;
@ -1419,30 +1419,38 @@ static int NSISCALL ExecuteEntry(entry *entry_)
          {
            TCHAR c;
 #ifdef _UNICODE
+            c=0; // Make sure high byte is 0 for FileReadByte
            if (which==EW_FGETS && !parm3)
            {
-              /* BUGBUG:
-              How is MBTWC supposed to be able to determine the correct WCHAR for a multibyte string when it only has 1 byte to look at?
-              And what if the multibyte character needs two WCHARs?
-              */
-              char tmpc;
-              if (!myReadFile(h,&tmpc,1)) break;
-              if (0==MultiByteToWideChar(CP_ACP, 0, &tmpc, 1, &c, 1)) c = _T('?');
+              char tmpc[2];
+              DWORD mbtwcflags=MB_ERR_INVALID_CHARS, cbio;
+              if (!ReadFile(h,tmpc,2,&cbio,NULL) || !cbio) break;
+              ungetseek=cbio;
+              for(;;) // Try to parse as DBCS first, if that fails try again as a single byte
+              {
+                // BUGBUG: Limited to UCS-2/BMP, surrogate pairs are not supported.
+                if (MultiByteToWideChar(CP_ACP,mbtwcflags,tmpc,cbio,&c,1)) break;
+                c=0xfffd; // Unicode replacement character
+                // If we read 2 bytes and it was not a DBCS character, we need to seek -1
+                if (--cbio) SetFilePointer(h,-(--ungetseek),NULL,FILE_CURRENT); else break;
+              }
            }
            else
 #endif
            {
-              if (!myReadFile(h,&c,1)) break;
+              // Read 1 TCHAR (FileReadUTF16LE and (Ansi)FileRead) or 
+              // parm3 bytes (FileReadByte and (Unicode)FileReadWord)
+              if (!myReadFile(h,&c,!parm3 ? sizeof(TCHAR) : sizeof(TCHAR) > 1 ? parm3 : 1)) break;
            }
            if (parm3)
            {
-              myitoa(textout,(unsigned char)c);
+              myitoa(textout,(UINT)(_TUCHAR)c);
              return 0;
            }
            if (lc == _T('\r') || lc == _T('\n'))
            {
              if (lc == c || (c != _T('\r') && c != _T('\n')))
-                SetFilePointer(h,-((int)(sizeof(c))),NULL,FILE_CURRENT);
+                SetFilePointer(h,-((int)ungetseek),NULL,FILE_CURRENT);
              else
                textout[rpos++]=c;
              break;
--- a/Source/script.cpp
+++ b/Source/script.cpp
@ -5872,7 +5872,7 @@ int CEXEBuild::doCommand(int which_token, LineParser &line)
      ent.offsets[0]=GetUserVarIndex(line, 1); // file handle
      ent.offsets[1]=GetUserVarIndex(line, 2); // output string
      ent.offsets[2]=add_asciistring(_T("1"));
-      ent.offsets[3]=1;
+      ent.offsets[3]=2;
      if (ent.offsets[0]<0 || ent.offsets[1]<0) PRINTHELP()
      SCRIPT_MSG(_T("FileReadWord: %s->%s\n"),line.gettoken_str(1),line.gettoken_str(2));
    return add_entry(&ent);