/*
 * input.c: read the source form
 */

#include <stdio.h>
#include <assert.h>
#include <time.h>
#include "halibut.h"

#define TAB_STOP 8              /* for column number tracking */

/*
 * Point the input's current position at the start of file `fname':
 * line 1, and column 1 when column reporting is enabled (-1 if not).
 */
static void setpos(input * in, char *fname)
{
  int startcol = -1;

  if (in->reportcols)
    startcol = 1;

  in->pos.filename = fname;
  in->pos.line = 1;
  in->pos.col = startcol;
}

/*
 * Push character `c', together with the position it was read from,
 * back onto the input so that the next get() returns it again. The
 * pushback array grows in steps of 16 entries when it fills up.
 */
static void unget(input * in, int c, filepos * pos)
{
  int slot = in->npushback;

  if (slot >= in->pushbacksize) {
    in->pushbacksize = slot + 16;
    in->pushback = resize(in->pushback, in->pushbacksize);
  }

  in->pushback[slot].chr = c;
  in->pushback[slot].pos = *pos;        /* structure copy */
  in->npushback = slot + 1;
}

/* ---------------------------------------------------------------------- */
/*
 * Macro subsystem
 */
/*
 * One macro definition. macrocmp() orders these by `name'; `text' is
 * the body that get() replays when the macro is expanded. Both strings
 * are freed by macrocleanup().
 */
typedef struct macro_Tag macro;
struct macro_Tag {
  wchar_t *name, *text;
};
/*
 * One macro expansion in progress. macrolookup() pushes these onto
 * in->stack; get() pops one once its text has been used up.
 */
struct macrostack_Tag {
  macrostack *next;             /* expansion that was active before this one */
  wchar_t *text;                /* the macro's text; get() does not free it */
  int ptr, npushback;           /* ptr: index of next char to return;
                                 * npushback: in->npushback when the
                                 * expansion began (get()'s pushback floor) */
  filepos pos;                  /* position reported for every expanded char */
};
static int macrocmp(void *av, void *bv)
{
  macro *a = (macro *) av, *b = (macro *) bv;
  return ustrcmp(a->name, b->name);
}
/*
 * Define a macro called `name' expanding to `text', reporting the
 * definition as located at `fpos'.
 *
 * Both strings are handed over to this function: on success they are
 * stored in the tree (and later released by macrocleanup()); if a
 * macro of that name already exists, err_macroexists is reported and
 * the strings and the unused record are all freed here.
 */
static void
macrodef(tree234 * macros, wchar_t * name, wchar_t * text, filepos fpos)
{
  macro *m = mknew(macro);
  m->name = name;
  m->text = text;
  if (add234(macros, m) != m)
  {
    /* The tree kept its existing entry, so nothing else refers to m. */
    error(err_macroexists, &fpos, name);
    sfree(name);
    sfree(text);
    sfree(m);                   /* previously leaked */
  }
}
/*
 * Look `name' up in the macro tree. If it is a macro, arrange for its
 * text to be the next thing get() reads from `in' and return TRUE;
 * otherwise return FALSE and leave `in' untouched.
 *
 * `pos' is the position of the macro invocation; every character of
 * the expansion is reported as coming from there.
 */
static int
macrolookup(tree234 * macros, input * in, wchar_t * name, filepos * pos)
{
  macro m, *gotit;
  m.name = name;
  gotit = find234(macros, &m, NULL);
  if (!gotit)
    return FALSE;

  /*
   * get() unconditionally returns text[ptr] and only afterwards checks
   * whether the expansion is exhausted, so an expansion on the stack
   * must always hold at least one character. A macro with no text
   * therefore expands to nothing at all: it is recognised, but no
   * stack entry is pushed.
   */
  if (gotit->text && gotit->text[0] != L'\0')
  {
    macrostack *expansion = mknew(macrostack);
    expansion->next = in->stack;
    expansion->text = gotit->text;      /* borrowed from the macro record */
    expansion->pos = *pos;      /* structure copy */
    expansion->ptr = 0;
    /* Characters already pushed back belong to the outer context. */
    expansion->npushback = in->npushback;
    in->stack = expansion;
  }
  return TRUE;
}
/*
 * Free every macro record in the tree (name, text and the record
 * itself), then free the tree.
 */
static void macrocleanup(tree234 * macros)
{
  int idx = 0;
  macro *entry;

  while ((entry = (macro *) index234(macros, idx)) != NULL) {
    sfree(entry->name);
    sfree(entry->text);
    sfree(entry);
    idx++;
  }

  freetree234(macros);
}

/*
 * Fetch the next input character and, if `pos' is non-NULL, store the
 * position it came from in *pos.
 *
 * Sources are tried in this order:
 *   1. characters pushed back with unget() since the innermost macro
 *      expansion began (or all pushed-back characters if no macro is
 *      being expanded);
 *   2. the text of the innermost macro expansion;
 *   3. the current file, which is closed the first time it yields EOF.
 *
 * Can return EOF. When there is no source left at all, EOF is returned
 * and *pos is left untouched.
 */
static int get(input * in, filepos * pos)
{
  int pushbackpt = in->stack ? in->stack->npushback : 0;
  if (in->npushback > pushbackpt)
  {
    --in->npushback;
    if (pos)
      *pos = in->pushback[in->npushback].pos;   /* structure copy */
    return in->pushback[in->npushback].chr;
  } else if (in->stack)
  {
    wchar_t c = in->stack->text[in->stack->ptr];
    if (pos) *pos = in->stack->pos;
    /* Pop the expansion as soon as its last character has been taken. */
    if (in->stack->text[++in->stack->ptr] == L'\0')
    {
      macrostack *tmp = in->stack;
      in->stack = tmp->next;
      sfree(tmp);
    }
    return c;
  } else if (in->currfp)
  {
    int c = getc(in->currfp);

    if (c == EOF)
    {
      fclose(in->currfp);
      in->currfp = NULL;
    }
    /* Track line numbers, for error reporting */
    if (pos)
      *pos = in->pos;           /* position of the character just read */
    if (in->reportcols)
    {
      switch (c)
      {
      case '\t':
        /*
         * Advance to the column just after the next tab stop. Columns
         * are 1-based, so the stops are at 1, 1+TAB_STOP, 1+2*TAB_STOP...
         */
        in->pos.col += TAB_STOP - (in->pos.col - 1) % TAB_STOP;
        break;
      case '\n':
        in->pos.col = 1;
        in->pos.line++;
        break;
      default:
        in->pos.col++;
        break;
      }
    } else
    {
      in->pos.col = -1;
      if (c == '\n')
        in->pos.line++;
    }
    /* FIXME: do input charmap translation. We should be returning
     * Unicode here. */
    return c;
  } else
    return EOF;
}

/*
 * Lexical analysis of source files.
 */
/*
 * A single lexical token, as returned by get_token() and
 * get_codepar_token().
 */
typedef struct token_Tag token;
struct token_Tag {
  int type;                     /* one of the tok_* values below */
  int cmd, aux;                 /* cmd: c_* id filled in by match_kw();
                                 * aux: number after \S, hex value after
                                 * \u, or TRUE for a word ending in `-' */
  wchar_t *text;                /* allocated copy of the text, or NULL */
  filepos pos;                  /* where the token started */
};
/* Values for the `type' field of a token. */
enum {
  tok_eof,                      /* end of file */
  tok_eop,                      /* end of paragraph */
  tok_white,                    /* whitespace */
  tok_word,                     /* a word or word fragment */
  tok_cmd,                      /* \command */
  tok_lbrace,                   /* { */
  tok_rbrace                    /* } */
};

/* True if token `t' (a token object, not a pointer) is the command `c'.
 * Note that `t' is evaluated twice. */
#define tokiscmd(t,c) ( (t).type == tok_cmd && (t).cmd == (c) )

/* Halibut command keywords: values for the `cmd' field of a token, as
 * assigned by match_kw(). */
enum {
  c__invalid,                   /* invalid command */
  c__comment,                   /* comment command (\#) */
  c__escaped,                   /* escaped character */
  c__nop,                       /* no-op */
  c__nbsp,                      /* nonbreaking space */
  c__midparacmd_unixnow,        /* \hackunixnow: see handle_special_midpara_cmd() */
  c_A,                          /* appendix heading */
  c_B,                          /* bibliography entry */
  c_BR,                         /* bibliography rewrite */
  c_C,                          /* chapter heading */
  c_H,                          /* heading */
  c_I,                          /* invisible index mark */
  c_IM,                         /* index merge/rewrite */
  c_K,                          /* capitalised cross-reference */
  c_S,                          /* aux field is 0, 1, 2, ... */
  c_U,                          /* unnumbered-chapter heading */
  c_W,                          /* Web hyperlink */
  c_L,                          /* Relative/local hyperlink */
  c_b,                          /* bulletted list */
  c_c,                          /* code */
  c_cfg,                        /* configuration directive */
  c_copyright,                  /* copyright statement */
  c_cw,                         /* weak code */
  c_date,                       /* document processing date */
  c_define,                     /* macro definition */
  c_e,                          /* emphasis */
  c_i,                          /* visible index mark */
  c_ii,                         /* uncapitalised visible index mark */
  c_k,                          /* uncapitalised cross-reference */
  c_R,                          /* free text cross-reference */
  c_n,                          /* numbered list */
  c_nocite,                     /* bibliography trickery */
  c_preamble,                   /* document preamble text */
  c_q,                          /* quote marks */
  c_rule,                       /* horizontal rule */
  c_title,                      /* document title */
  c_u,                          /* aux field is char code */
  c_versionid                   /* document RCS id */
};

/*
 * Map a style command (\c, \cw, \e) to the corresponding word style;
 * anything else maps to word_Normal. The whole expansion is
 * parenthesised so that the macro can be used safely inside a larger
 * expression. `c' may be evaluated up to three times.
 */
#define getcmdstyle(c) ( \
  (c) == c_c ? word_Code : \
  (c) == c_cw ? word_WeakCode : \
  (c) == c_e ? word_Emph : \
  word_Normal )


/*
 * Character classification and conversion helpers. They recognise
 * ASCII code points only, and most of them evaluate their argument
 * more than once, so avoid arguments with side effects.
 */
/* Perhaps whitespace should be defined in a more Unicode-friendly way? */
/* space, tab, CR or LF */
#define iswhite(c) ( (c)==32 || (c)==9 || (c)==13 || (c)==10 )
/* LF only */
#define isnl(c) ( (c)==10 )
#define isdec(c) ( ((c)>='0'&&(c)<='9') )
/* value of a decimal digit; only meaningful if isdec(c) */
#define fromdec(c) ( (c)-'0' )
#define ishex(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='F') || ((c)>='a'&&(c)<='f'))
/* value of a hex digit; only meaningful if ishex(c). The &0xDF folds
 * lower-case letters to upper case. */
#define fromhex(c) ( (c)<='9' ? (c)-'0' : ((c)&0xDF) - ('A'-10) )
/* characters that may appear in a multi-character command name */
#define iscmd(c) ( ((c)>='0'&&(c)<='9') || ((c)>='A'&&(c)<='Z') || ((c)>='a'&&(c)<='z'))

/*
 * Keyword comparison function. Like strcmp, but between a wchar_t *
 * and a char *: returns zero if the strings are equal, otherwise the
 * difference between the first pair of characters that differ.
 */
static int kwcmp(wchar_t const *p, char const *q)
{
  for (;;) {
    int diff = *p - *q;

    /* Stop at the first mismatch or when either string runs out. */
    if (diff != 0 || *p == 0 || *q == 0)
      return diff;
    p++;
    q++;
  }
}

/*
 * Match a keyword.
 *
 * Examines tok->text and stores the matching command id in tok->cmd
 * (c__invalid if there is no match). For \S and \u the numeric
 * argument is also stored in tok->aux.
 */
static void match_kw(token * tok)
{
  /*
   * FIXME. The ids are explicit in here so as to allow long-name
   * equivalents to the various very short keywords.
   *
   * This list must be sorted, it's searched using binary search.
   */
  static const struct {
    char const *name;
    int id;
  } keywords[] = {
    { "#", c__comment },                /* comment command (\#) */
    { "-", c__escaped },                /* nonbreaking hyphen */
    { ".", c__nop },
    { "A", c_A },                       /* appendix heading */
    { "B", c_B },                       /* bibliography entry */
    { "BR", c_BR },                     /* bibliography rewrite */
    { "C", c_C },                       /* chapter heading */
    { "H", c_H },                       /* heading */
    { "I", c_I },                       /* invisible index mark */
    { "IM", c_IM },                     /* index merge/rewrite */
    { "K", c_K },                       /* capitalised cross-reference */
    { "L", c_L },                       /* Relative/local hyperlink */
    { "R", c_R },                       /* free text cross-reference */
    { "U", c_U },                       /* unnumbered-chapter heading */
    { "W", c_W },                       /* Web hyperlink */
    { "\\", c__escaped },               /* escaped backslash (\\) */
    { "_", c__nbsp },                   /* nonbreaking space (\_) */
    { "b", c_b },                       /* bulletted list */
    { "c", c_c },                       /* code */
    { "cfg", c_cfg },                   /* configuration directive */
    { "copyright", c_copyright },       /* copyright statement */
    { "cw", c_cw },                     /* weak code */
    { "date", c_date },                 /* document processing date */
    { "define", c_define },             /* macro definition */
    { "e", c_e },                       /* emphasis */
    { "hackunixnow", c__midparacmd_unixnow },
    { "i", c_i },                       /* visible index mark */
    { "ii", c_ii },                     /* uncapitalised visible index mark */
    { "k", c_k },                       /* uncapitalised cross-reference */
    { "n", c_n },                       /* numbered list */
    { "nocite", c_nocite },             /* bibliography trickery */
    { "preamble", c_preamble },         /* document preamble text */
    { "q", c_q },                       /* quote marks */
    { "rule", c_rule },                 /* horizontal rule */
    { "title", c_title },               /* document title */
    { "versionid", c_versionid },       /* document RCS id */
    { "{", c__escaped },                /* escaped lbrace (\{) */
    { "}", c__escaped },                /* escaped rbrace (\}) */
  };
  int lo, hi;

  /*
   * Special cases: \S{0,1,2,...} and \uABCD. If the syntax
   * doesn't match correctly, we just fall through to the
   * binary-search phase.
   */
  if (tok->text[0] == 'S') {
    /* Decimal digits should follow; a bare `S' counts as 1. */
    wchar_t *digits = tok->text + 1;
    int level = 0;

    if (*digits == 0)
      level = 1;
    for (; *digits && isdec(*digits); digits++)
      level = 10 * level + fromdec(*digits);

    if (*digits == 0) {
      tok->cmd = c_S;
      tok->aux = level;
      return;
    }
  } else if (tok->text[0] == 'u') {
    /* Hex digits should follow. */
    wchar_t *digits = tok->text + 1;
    int code = 0;

    for (; *digits && ishex(*digits); digits++)
      code = 16 * code + fromhex(*digits);

    if (*digits == 0) {
      tok->cmd = c_u;
      tok->aux = code;
      return;
    }
  }

  /*
   * Binary search. Invariant: any match lies strictly between
   * indices lo and hi.
   */
  lo = -1;
  hi = sizeof(keywords) / sizeof(*keywords);
  while (hi - lo > 1) {
    int mid = (lo + hi) / 2;
    int cmp = kwcmp(tok->text, keywords[mid].name);

    if (cmp == 0) {
      tok->cmd = keywords[mid].id;
      return;
    }
    if (cmp < 0)
      hi = mid;
    else
      lo = mid;
  }

  tok->cmd = c__invalid;
}


/*
 * Read a token from the input file, in the normal way (`normal' in
 * the sense that code paragraphs work a different way).
 *
 * The returned token's text, if any, is freshly allocated. Its pos is
 * the position of the first character read for the token.
 */
token get_token(input * in)
{
  token tk;
  rdstring buf = { 0, 0, NULL };
  filepos where;
  int ch;

  tk.cmd = c__invalid;
  tk.aux = FALSE;
  tk.text = NULL;               /* default */
  ch = get(in, &where);
  tk.pos = where;

  /* tok_white or tok_eop: swallow the whole run of whitespace. */
  if (iswhite(ch)) {
    int newlines = 0;

    while (ch != EOF && iswhite(ch)) {
      if (isnl(ch))
        newlines++;
      ch = get(in, &where);
    }
    if (ch == EOF) {
      /* Whitespace running up to end of file is just end of file. */
      tk.type = tok_eof;
      return tk;
    }
    unget(in, ch, &where);
    /* More than one newline means a paragraph break. */
    tk.type = (newlines > 1 ? tok_eop : tok_white);
    return tk;
  }

  if (ch == EOF) {              /* tok_eof */
    tk.type = tok_eof;
    return tk;
  }

  if (ch == '{') {              /* tok_lbrace */
    tk.type = tok_lbrace;
    return tk;
  }

  if (ch == '}') {              /* tok_rbrace */
    tk.type = tok_rbrace;
    return tk;
  }

  if (ch == '\\') {             /* tok_cmd */
    ch = get(in, &where);
    if (ch == '-' || ch == '\\' || ch == '_' ||
        ch == '#' || ch == '{' || ch == '}' || ch == '.') {
      /* single-char command */
      rdadd(&buf, (wchar_t)ch);
    } else if (ch == 'u') {
      /* `u' followed by hex digits, five characters at most in all */
      int len = 0;
      do {
        rdadd(&buf, (wchar_t)ch);
        len++;
        ch = get(in, &where);
      } while (ishex(ch) && len < 5);
      unget(in, ch, &where);
    } else if (iscmd(ch)) {
      /* ordinary command name: a run of letters and digits */
      do {
        rdadd(&buf, (wchar_t)ch);
        ch = get(in, &where);
      } while (iscmd(ch));
      unget(in, ch, &where);
    }
    /*
     * Now match the command against the list of available
     * ones.
     */
    tk.type = tok_cmd;
    tk.text = ustrdup(buf.text);
    match_kw(&tk);
    sfree(buf.text);
    return tk;
  }

  /*
   * tok_word.
   *
   * Read a word: the longest possible contiguous sequence of
   * things other than whitespace, backslash, braces and
   * hyphen. A hyphen terminates the word but is returned as
   * part of it; everything else is pushed back for the next
   * token. The `aux' field contains TRUE if the word ends in
   * a hyphen.
   */
  for (;;) {
    int stop = (iswhite(ch) || ch == '{' || ch == '}' ||
                ch == '\\' || ch == EOF);

    if (stop) {
      /* Put back the character that caused termination */
      unget(in, ch, &where);
      break;
    }
    rdadd(&buf, (wchar_t)ch);
    if (ch == '-') {
      tk.aux = TRUE;
      break;                    /* hyphen terminates word */
    }
    ch = get(in, &where);
  }
  tk.type = tok_word;
  tk.text = ustrdup(buf.text);
  sfree(buf.text);
  return tk;
}

/*
 * Peek at the next input character and report whether it is an open
 * brace (for telling code paragraphs from paragraphs which merely
 * start with code). The character is pushed straight back, so the
 * input is left as it was.
 */
int isbrace(input * in)
{
  filepos where;
  int next = get(in, &where);

  unget(in, next, &where);
  if (next == '{')
    return 1;
  return 0;
}

/*
 * Read the rest of a line that starts `\c'. Including nothing at
 * all (tok_word with empty text).
 *
 * One leading space, if present, is dropped. The terminating newline
 * (or EOF) is pushed back rather than consumed.
 */
token get_codepar_token(input * in)
{
  token line;
  rdstring buf = { 0, 0, NULL };
  filepos where;
  int ch;

  line.type = tok_word;
  ch = get(in, &where);         /* expect (and discard) one space */
  line.pos = where;
  if (ch == ' ') {
    ch = get(in, &where);
    line.pos = where;
  }

  while (ch != EOF && !isnl(ch)) {
    int prev = ch;

    ch = get(in, &where);
    /* Discard \r just before \n. */
    if (!(prev == 13 && isnl(ch)))
      rdadd(&buf, (wchar_t)prev);
  }
  unget(in, ch, &where);

  line.text = ustrdup(buf.text);
  sfree(buf.text);
  return line;
}

/*
 * Adds a new word to a linked list.
 *
 * `*hptrptr' points at the link field where the next word belongs. A
 * heap copy of `newword' is stored there, and `*hptrptr' is advanced
 * to the copy's own `next' field. Returns the copy, or NULL (doing
 * nothing) if `hptrptr' itself is NULL.
 */
static word *addword(word newword, word *** hptrptr)
{
  word *node;

  if (hptrptr == NULL)
    return NULL;

  node = mknew(word);
  *node = newword;              /* structure copy */
  node->next = NULL;

  **hptrptr = node;             /* link in at the tail */
  *hptrptr = &node->next;       /* the tail is now this node's link */
  return node;
}

/*
 * Adds a new paragraph to a linked list.
 *
 * Works like addword(): a heap copy of `newpara' is linked in at
 * `**hptrptr', and `*hptrptr' is advanced to the copy's `next' field.
 * Returns the copy.
 */
static paragraph *addpara(paragraph newpara, paragraph *** hptrptr)
{
  paragraph *node = mknew(paragraph);

  *node = newpara;              /* structure copy */
  node->next = NULL;

  **hptrptr = node;             /* link in at the tail */
  *hptrptr = &node->next;       /* the tail is now this node's link */
  return node;
}

/*
 * Destructor before token is reassigned; should catch most memory
 * leaks.
 *
 * Frees the token's text but does not clear the pointer, so it is
 * meant to be followed at once by an assignment to the token, as in
 * `dtor(t), t = get_token(in);'. The argument is not parenthesised in
 * the expansion, so pass a plain token variable.
 */
#define dtor(t) ( sfree(t.text) )

/*
 * Returns nonzero if the token is one of the special mid-paragraph
 * commands (at present only \hackunixnow).
 */
static int is_special_midpara_cmd(token*t)
{
  return (t->type == tok_cmd && t->cmd == c__midparacmd_unixnow);
}

/*
 * If `t' is a special mid-paragraph command, append its expansion to
 * `rs' and return 1; otherwise return 0 and leave `rs' alone.
 *
 * \hackunixnow expands to the value of getutcunixtime() as converted
 * to text by ultou(). `hptrptr' is currently unused.
 */
static int handle_special_midpara_cmd(token*t, rdstring*rs, paragraph ***hptrptr)
{
  wchar_t numbuf[100];
  paragraph par;

  if (t->type != tok_cmd)
    return 0;

  initpara(par);
  par.fpos = t->pos;

  if (t->cmd == c__midparacmd_unixnow) {
    ultou(getutcunixtime(), numbuf);
    rdadds(rs, numbuf);
    return 1;
  }

  return 0;
}

/*
 * Allocate a `struct stack_item', set its type to `sitype__' and push
 * it onto stack `stck__'. `struct stack_item' must be visible at the
 * point of use. The `while(!__LINE__)' is a do-once loop: __LINE__ is
 * never zero, so the condition is always false.
 */
#define stack_item_push(stck__, sitype__) do { \
    struct stack_item *si__ = mknew(struct stack_item); \
    si__->type = sitype__; \
    stk_push((stck__), si__); \
  } while(!__LINE__)


/*
 * Reads a single file (ie until get() returns EOF)
 */
static void read_file(paragraph *** ret, input * in, indexdata * idx, tree234 *macros)
{
  token t;
  paragraph par;
  word wd, **whptr, **idximplicit;
  wchar_t utext[2], *wdtext;
  int style, spcstyle, tmpstyle;
  int already;
  int iswhite, seenwhite;
  int type;
  struct stack_item {
    enum {
      stack_nop = 0,            /* do nothing (for error recovery) */
      stack_ualt = 1,           /* \u alternative */
      stack_style = 2,          /* \e, \c, \cw */
      stack_idx = 4,            /* \I, \i, \ii */
      stack_hyper = 8,          /* \W */
      stack_quote = 16,         /* \q */
    } type;
    word **whptr;               /* to restore from \u alternatives */
    word **idximplicit;         /* to restore from \u alternatives */
  } *sitem;
  stack parsestk;
  word *indexword=NULL, *uword=NULL, *iword=NULL;
  word *idxwordlist;
  rdstring indexstr;
  int index_downcase=0, index_visible=0, indexing=0;
  const rdstring nullrs = { 0, 0, NULL };
  wchar_t uchr;

  t.text = NULL;
  already = FALSE;

  /*
   * Loop on each paragraph.
   */
  while (1)
  {
    int start_cmd = c__invalid;
    par.words = NULL;
    par.keyword = NULL;
    whptr = &par.words;

    /*
     * Get a token.
     */
    if (!already)
    {
      dtor(t), t = get_token(in);
    }
    already = FALSE;
    if (t.type == tok_eof)
      break;

    /*
     * Parse code paragraphs separately.
     */
    if (t.type == tok_cmd && t.cmd == c_c && !isbrace(in))
    {
      par.type = para_Code;
      par.fpos = t.pos;
      while (1)
      {
        dtor(t), t = get_codepar_token(in);
        wd.type = word_WeakCode;
        wd.breaks = FALSE;      /* shouldn't need this... */
        wd.text = ustrdup(t.text);
        wd.alt = NULL;
        wd.fpos = t.pos;
        addword(wd, &whptr);
        dtor(t), t = get_token(in);
        if (t.type == tok_white)
        {
          /*
           * The newline after a code-paragraph line
           */
          dtor(t), t = get_token(in);
        }
        if (t.type == tok_eop || t.type == tok_eof)
          break;
        else if (t.type != tok_cmd || t.cmd != c_c)
        {
          error(err_brokencodepara, &t.pos);
          addpara(par, ret);
          while (t.type != tok_eop)     /* error recovery: */
            dtor(t), t = get_token(in); /* eat rest of paragraph */
          goto codeparabroken;  /* ick, but such is life */
        }
      }
      addpara(par, ret);
    codeparabroken:
      continue;
    }

    while (t.type == tok_cmd && macrolookup(macros, in, t.text, &t.pos))
    {
      dtor(t), t = get_token(in);
    }


    /*
     * This token begins a paragraph. See if it's one of the
     * special commands that define a paragraph type.
     *
     * (note that \# is special in a way, and \nocite takes no
     * text)
     */
    par.type = para_Normal;
    if (t.type == tok_cmd)
    {
      int needkw=0;
      int is_macro = FALSE;

      par.fpos = t.pos;
      switch (t.cmd)
      {
      default:
        needkw = -1;
        break;
      case c__invalid:
        error(err_badparatype, t.text, &t.pos);
        needkw = 4;
        break;
      case c__comment:
        if (isbrace(in))
        {
          needkw = -1; // Upstream 56b96573 (r8312)
          break;                /* `\#{': isn't a comment para */
        }
        do
        {
          dtor(t), t = get_token(in);
        }
        while (t.type != tok_eop && t.type != tok_eof);
        continue;               /* next paragraph */
        /*
         * `needkw' values:
         *
         *   1 -- exactly one keyword
         *   2 -- at least one keyword
         *   4 -- any number of keywords including zero
         *   8 -- at least one keyword and then nothing else
         *  16 -- nothing at all! no keywords, no body
         *  32 -- no keywords at all
         */
      case c_A:
        needkw = 2;
        par.type = para_Appendix;
        break;
      case c_B:
        needkw = 2;
        par.type = para_Biblio;
        break;
      case c_BR:
        needkw = 1;
        par.type = para_BR;
        start_cmd = c_BR;
        break;
      case c_C:
        needkw = 2;
        par.type = para_Chapter;
        break;
      case c_H:
        needkw = 2;
        par.type = para_Heading;
        par.aux = 0;
        break;
      case c_IM:
        needkw = 2;
        par.type = para_IM;
        start_cmd = c_IM;
        break;
      case c_S:
        needkw = 2;
        par.type = para_Subsect;
        par.aux = t.aux;
        break;
      case c_U:
        needkw = 32;
        par.type = para_UnnumberedChapter;
        break;
        /* For \b and \n the keyword is optional */
      case c_b:
        needkw = 4;
        par.type = para_Bullet;
        break;
      case c_n:
        needkw = 4;
        par.type = para_NumberedList;
        break;
      case c_cfg:
        needkw = 8;
        par.type = para_Config;
        start_cmd = c_cfg;
        break;
      case c_copyright:
        needkw = 32;
        par.type = para_Copyright;
        break;
      case c_define:
        is_macro = TRUE;
        needkw = 1;
        break;
        /* For \nocite the keyword is _everything_ */
      case c_nocite:
        needkw = 8;
        par.type = para_NoCite;
        break;
      case c_preamble:
        needkw = 32;
        par.type = para_Preamble;
        break;
      case c_rule:
        needkw = 16;
        par.type = para_Rule;
        break;
      case c_title:
        needkw = 32;
        par.type = para_Title;
        break;
      case c_versionid:
        needkw = 32;
        par.type = para_VersionID;
        break;
      }

      if (needkw > 0)
      {
        rdstring rs = { 0, 0, NULL };
        int nkeys = 0;
        filepos fp;

        /* Get keywords. */
        dtor(t), t = get_token(in);
        fp = t.pos;
        while (t.type == tok_lbrace)
        {
          /* This is a keyword. */
          nkeys++;
          /* FIXME: there will be bugs if anyone specifies an
           * empty keyword (\foo{}), so trap this case. */
          while (dtor(t), t = get_token(in),
                 t.type == tok_word ||
                 t.type == tok_white ||
                 (t.type == tok_cmd && t.cmd == c__nbsp) ||
                 (t.type == tok_cmd && t.cmd == c__escaped) ||
                 /* TODO: Merge from upstream?: (t.type == tok_cmd && t.cmd == c_u) || */
                is_special_midpara_cmd(&t))
          {
            if (t.type == tok_white ||
                (t.type == tok_cmd && t.cmd == c__nbsp))
              rdadd(&rs, ' ');
            else if (!handle_special_midpara_cmd(&t, &rs, ret))
              rdadds(&rs, t.text);
          }
          if (t.type != tok_rbrace)
          {
            error(err_kwunclosed, &t.pos);
            continue;
          }
          rdadd(&rs, 0);        /* add string terminator */
          dtor(t), t = get_token(in);   /* eat right brace */
        }

        rdadd(&rs, 0);          /* add string terminator */

        /* See whether we have the right number of keywords. */
        if ((needkw & 48) && nkeys > 0)
          error(err_kwillegal, &fp);
        if ((needkw & 11) && nkeys == 0)
          error(err_kwexpected, &fp);
        if ((needkw & 5) && nkeys > 1)
          error(err_kwtoomany, &fp);

        if (is_macro)
        {
          /*
           * Macro definition. Get the rest of the line
           * as a code-paragraph token, repeatedly until
           * there's nothing more left of it. Separate
           * with newlines.
           */
          rdstring macrotext = { 0, 0, NULL };
          while (1)
          {
            dtor(t), t = get_codepar_token(in);
            if (macrotext.pos > 0)
              rdadd(&macrotext, L'\n');
            rdadds(&macrotext, t.text);
            dtor(t), t = get_token(in);
            if (t.type == tok_eop || t.type == tok_eof)
              break;
          }
          macrodef(macros, rs.text, macrotext.text, fp);
          continue;             /* next paragraph */
        }

        par.keyword = rdtrim(&rs);

        /* Move to EOP in case of needkw==8 or 16 (no body) */
        if (needkw & 24)
        {
          /* We allow whitespace even when we expect no para body */
          while (t.type == tok_white)
            dtor(t), t = get_token(in);
          if (t.type != tok_eop && t.type != tok_eof &&
              (start_cmd == c__invalid ||
               t.type != tok_cmd || t.cmd != start_cmd))
          {
            error(err_bodyillegal, &t.pos);
            /* Error recovery: eat the rest of the paragraph */
            while (t.type != tok_eop && t.type != tok_eof &&
                   (start_cmd == c__invalid ||
                    t.type != tok_cmd || t.cmd != start_cmd))
              dtor(t), t = get_token(in);
          }
          if (t.type == tok_cmd)
            already = TRUE;     /* inhibit get_token at top of loop */
          addpara(par, ret);
          continue;             /* next paragraph */
        }
      }
    }

    /*
     * Now read the actual paragraph, word by word, adding to
     * the paragraph list.
     *
     * Mid-paragraph commands:
     *
     *  \K \k
     *  \c \cw
     *  \e
     *  \i \ii
     *  \I
     *  \u
     *  \W
     *  \date
     *  \\ \{ \}
     */
    parsestk = stk_new();
    style = word_Normal;
    spcstyle = word_WhiteSpace;
    indexing = FALSE;
    seenwhite = TRUE;
    while (t.type != tok_eop && t.type != tok_eof)
    {
      iswhite = FALSE;
      already = FALSE;

      /* Handle implicit paragraph breaks after \IM, \BR etc */
      if (start_cmd != c__invalid &&
          t.type == tok_cmd && t.cmd == start_cmd)
      {
        already = TRUE;         /* inhibit get_token at top of loop */
        break;
      }

      if (t.type == tok_cmd && t.cmd == c__nop) {
        dtor(t), t = get_token(in);
        continue;              /* do nothing! */
      }
      if (t.type == tok_cmd && t.cmd == c__escaped)
      {
        t.type = tok_word;      /* nice and simple */
        t.aux = 0;              /* even if `\-' - nonbreaking! */
      }
      if (t.type == tok_cmd && t.cmd == c__nbsp)
      {
        t.type = tok_word;      /* nice and simple */
        sfree(t.text);
        t.text = ustrdup(L" "); /* text is ` ' not `_' */
        t.aux = 0;              /* (nonbreaking) */
      }
      switch (t.type)
      {
      case tok_white:
        if (whptr == &par.words)
          break;                /* strip whitespace at start of para */
        wd.text = NULL;
        wd.type = spcstyle;
        wd.alt = NULL;
        wd.aux = 0;
        wd.fpos = t.pos;
        wd.breaks = FALSE;

        /*
         * Inhibit use of whitespace if it's (probably the
         * newline) before a repeat \IM / \BR type
         * directive.
         */
        if (start_cmd != c__invalid)
        {
          dtor(t), t = get_token(in);
          already = TRUE;
          if (t.type == tok_cmd && t.cmd == start_cmd)
            break;
        }

        if (indexing)
          rdadd(&indexstr, ' ');
        if (!indexing || index_visible)
          addword(wd, &whptr);
        if (indexing)
          addword(wd, &idximplicit);
        iswhite = TRUE;
        break;
      case tok_word:
        if (indexing)
          rdadds(&indexstr, t.text);
        wd.type = style;
        wd.alt = NULL;
        wd.aux = 0;
        wd.fpos = t.pos;
        wd.breaks = t.aux;
        if (!indexing || index_visible)
        {
          wd.text = ustrdup(t.text);
          addword(wd, &whptr);
        }
        if (indexing)
        {
          wd.text = ustrdup(t.text);
          addword(wd, &idximplicit);
        }
        break;
      case tok_lbrace:
        error(err_unexbrace, &t.pos);
        /* Error recovery: push nop */
        sitem = mknew(struct stack_item);
        sitem->type = stack_nop;
        stk_push(parsestk, sitem);
        break;
      case tok_rbrace:
        sitem = stk_pop(parsestk);
        if (!sitem)
          error(err_unexbrace, &t.pos);
        else
        {
          if (sitem->type & stack_ualt)
          {
            whptr = sitem->whptr;
            idximplicit = sitem->idximplicit;
          }
          if (sitem->type & stack_style)
          {
            style = word_Normal;
            spcstyle = word_WhiteSpace;
          }
          if (sitem->type & stack_idx )          {
            indexword->text = ustrdup(indexstr.text);
            if (index_downcase)
              ustrlow(indexword->text);
            indexing = FALSE;
            rdadd(&indexstr, L'\0');
            index_merge(idx, FALSE, indexstr.text, idxwordlist);
            sfree(indexstr.text);
          }
          if (sitem->type & stack_hyper)
          {
            wd.text = NULL;
            wd.type = word_HyperEnd;
            wd.alt = NULL;
            wd.aux = 0;
            wd.fpos = t.pos;
            wd.breaks = FALSE;
            if (!indexing || index_visible)
              addword(wd, &whptr);
            if (indexing)
              addword(wd, &idximplicit);
          }
          if (sitem->type & stack_quote)
          {
            wd.text = NULL;
            wd.type = toquotestyle(style);
            wd.alt = NULL;
            wd.aux = quote_Close;
            wd.fpos = t.pos;
            wd.breaks = FALSE;
            if (!indexing || index_visible)
              addword(wd, &whptr);
            if (indexing)
            {
              rdadd(&indexstr, L'"');
              addword(wd, &idximplicit);
            }
          }
        }
        sfree(sitem);
        break;
      case tok_cmd:
        switch (t.cmd)
        {
        case c__comment:
          /*
           * In-paragraph comment: \#{ balanced braces }
           *
           * Anything goes here; even tok_eop. We should
           * eat whitespace after the close brace _if_
           * there was whitespace before the \#.
           */
          dtor(t), t = get_token(in);
          if (t.type != tok_lbrace)
          {
            error(err_explbr, &t.pos);
          } else
          {
            int braces = 1;
            while (braces > 0)
            {
              dtor(t), t = get_token(in);
              if (t.type == tok_lbrace)
                braces++;
              else if (t.type == tok_rbrace)
                braces--;
              else if (t.type == tok_eof)
              {
                error(err_commenteof, &t.pos);
                break;
              }
            }
          }
          if (seenwhite)
          {
            already = TRUE;
            dtor(t), t = get_token(in);
            if (t.type == tok_white)
            {
              iswhite = TRUE;
              already = FALSE;
            }
          }
          break;
        case c_q:
          dtor(t), t = get_token(in);
          if (t.type != tok_lbrace)
          {
            error(err_explbr, &t.pos);
          } else
          {
            wd.text = NULL;
            wd.type = toquotestyle(style);
            wd.alt = NULL;
            wd.aux = quote_Open;
            wd.fpos = t.pos;
            wd.breaks = FALSE;
            if (!indexing || index_visible)
              addword(wd, &whptr);
            if (indexing)
            {
              rdadd(&indexstr, L'"');
              addword(wd, &idximplicit);
            }
            sitem = mknew(struct stack_item);
            sitem->type = stack_quote;
            stk_push(parsestk, sitem);
          }
          break;
        case c_K:
        case c_k:
        case c_R:
        case c_W:
        case c_L:
        case c_date:
          /*
           * Keyword, hyperlink, or \date. We expect a
           * left brace, some text, and then a right
           * brace. No nesting; no arguments.
           */
          wd.fpos = t.pos;
          wd.breaks = FALSE;
          if (t.cmd == c_K)
            wd.type = word_UpperXref;
          else if (t.cmd == c_k)
            wd.type = word_LowerXref;
          else if (t.cmd == c_R)
            wd.type = word_FreeTextXref;
          else if (t.cmd == c_W)
            wd.type = word_HyperLink;
          else if (t.cmd == c_L)
            wd.type = word_LocalHyperLink;
          else
            wd.type = word_Normal;
          dtor(t), t = get_token(in);
          if (t.type != tok_lbrace)
          {
            if (wd.type == word_Normal)
            {
              time_t thetime = time(NULL);
              struct tm *broken = localtime(&thetime);
              already = TRUE;
              wdtext = ustrftime(NULL, broken);
              wd.type = style;
            } else
            {
              error(err_explbr, &t.pos);
              wdtext = NULL;
            }
          } else
          {
            rdstring rs = { 0, 0, NULL };
            while (dtor(t), t = get_token(in),
                   t.type == tok_word || t.type == tok_white)
            {
              if (t.type == tok_white)
                rdadd(&rs, ' ');
              else
                rdadds(&rs, t.text);
            }
            if (wd.type == word_Normal)
            {
              time_t thetime = time(NULL);
              struct tm *broken = localtime(&thetime);
              wdtext = ustrftime(rs.text, broken);
              wd.type = style;
            } else
            {
              wdtext = ustrdup(rs.text);
            }
            sfree(rs.text);
            if (t.type != tok_rbrace)
            {
              error(err_kwexprbr, &t.pos);
            }
          }
          wd.alt = NULL;
          wd.aux = 0;
          if (!indexing || index_visible)
          {
            wd.text = ustrdup(wdtext);
            addword(wd, &whptr);
          }
          if (indexing)
          {
            wd.text = ustrdup(wdtext);
            addword(wd, &idximplicit);
          }
          sfree(wdtext);
          if (wd.type == word_FreeTextXref || wd.type == word_HyperLink || wd.type == word_LocalHyperLink)
          {
            /*
             * Hyperlinks are different: they then
             * expect another left brace, to begin
             * delimiting the text marked by the link.
             */
            dtor(t), t = get_token(in);
            /*
             * Special cases: \W{}\c, \W{}\e, \W{}\cw
             */
            sitem = mknew(struct stack_item);
            sitem->type = stack_hyper;
            if (t.type == tok_cmd && (tmpstyle = getcmdstyle(t.cmd)))
            {
              if (style != word_Normal)
                error(err_nestedstyles, &t.pos);
              else
              {
                style = tmpstyle;
                spcstyle = tospacestyle(style);
                sitem->type |= stack_style;
              }
              dtor(t), t = get_token(in);
            }
            if (t.type != tok_lbrace)
            {
              error(err_explbr, &t.pos);
              sfree(sitem);
            } else
            {
              stk_push(parsestk, sitem);
            }
          }
          break;
        case c_c:
        case c_cw:
        case c_e:
          type = t.cmd;
          if (style != word_Normal)
          {
            error(err_nestedstyles, &t.pos);
            /* Error recovery: eat lbrace, push nop. */
            dtor(t), t = get_token(in);
            sitem = mknew(struct stack_item);
            sitem->type = stack_nop;
            stk_push(parsestk, sitem);
          }
          dtor(t), t = get_token(in);
          if (t.type != tok_lbrace)
          {
            error(err_explbr, &t.pos);
          } else
          {
            style = getcmdstyle(type);
            spcstyle = tospacestyle(style);
            sitem = mknew(struct stack_item);
            sitem->type = stack_style;
            stk_push(parsestk, sitem);
          }
          break;
        case c_i:
        case c_ii:
        case c_I:
          type = t.cmd;
          if (indexing)
          {
            error(err_nestedindex, &t.pos);
            /* Error recovery: eat lbrace, push nop. */
            dtor(t), t = get_token(in);
            sitem = mknew(struct stack_item);
            sitem->type = stack_nop;
            stk_push(parsestk, sitem);
          }
          sitem = mknew(struct stack_item);
          sitem->type = stack_idx;
          dtor(t), t = get_token(in);
          /*
           * Special cases: \i\c, \i\e, \i\cw
           */
          wd.fpos = t.pos;
          if (t.type == tok_cmd && (tmpstyle = getcmdstyle(t.cmd)))
          {
            if (style != word_Normal)
              error(err_nestedstyles, &t.pos);
            else
            {
              style = tmpstyle;
              spcstyle = tospacestyle(style);
              sitem->type |= stack_style;
            }
            dtor(t), t = get_token(in);
          }
          if (t.type != tok_lbrace)
          {
            sfree(sitem);
            error(err_explbr, &t.pos);
          } else
          {
            /* Add an index-reference word with no text as yet */
            wd.type = word_IndexRef;
            wd.text = NULL;
            wd.alt = NULL;
            wd.aux = 0;
            wd.breaks = FALSE;
            indexword = addword(wd, &whptr);
            /* Set up a rdstring to read the index text */
            indexstr = nullrs;
            /* Flags so that we do the Right Things with text */
            index_visible = (type != c_I);
            index_downcase = (type == c_ii);
            indexing = TRUE;
            idxwordlist = NULL;
            idximplicit = &idxwordlist;
            /* Stack item to close the indexing on exit */
            stk_push(parsestk, sitem);
          }
          break;
        case c_u:
          uchr = t.aux;
          utext[0] = uchr;
          utext[1] = 0;
          wd.type = style;
          wd.breaks = FALSE;
          wd.alt = NULL;
          wd.aux = 0;
          wd.fpos = t.pos;
          if (!indexing || index_visible)
          {
            wd.text = ustrdup(utext);
            uword = addword(wd, &whptr);
          } else
            uword = NULL;
          if (indexing)
          {
            wd.text = ustrdup(utext);
            iword = addword(wd, &idximplicit);
          } else
            iword = NULL;
          dtor(t), t = get_token(in);
          if (t.type == tok_lbrace)
          {
            /*
             * \u with a left brace. Until the brace
             * closes, all further words go on a
             * sidetrack from the main thread of the
             * paragraph.
             */
            sitem = mknew(struct stack_item);
            sitem->type = stack_ualt;
            sitem->whptr = whptr;
            sitem->idximplicit = idximplicit;
            stk_push(parsestk, sitem);
            whptr = uword ? &uword->alt : NULL;
            idximplicit = iword ? &iword->alt : NULL;
          } else
          {
            if (indexing)
              rdadd(&indexstr, uchr);
            already = TRUE;
          }
          break;
        default:
          if (!macrolookup(macros, in, t.text, &t.pos))
            error(err_badmidcmd, t.text, &t.pos);
          break;
        }
      }
      if (!already)
        dtor(t), t = get_token(in);
      seenwhite = iswhite;
    }
    /* Check the stack is empty */
    if (NULL != (sitem = stk_pop(parsestk)))
    {
      do
      {
        sfree(sitem);
        sitem = stk_pop(parsestk);
      }
      while (sitem);
      error(err_missingrbrace, &t.pos);
    }
    stk_free(parsestk);
    addpara(par, ret);
  }

  /*
   * We break to here rather than returning, because otherwise
   * this cleanup doesn't happen.
   */
  dtor(t);
}

/*
 * Top-level entry point for reading the source form.
 *
 * Walks the file names in `in', from in->currindex up to (but not
 * including) in->nfiles. Each one is opened for reading, the
 * position tracker is reset via setpos(), and the stream is handed
 * to read_file() together with the tail pointer of the paragraph
 * list, the index data and the macro table.
 *
 * One macro table is created here and passed to every read_file()
 * call, so it is shared by all the input files; it is torn down
 * with macrocleanup() before returning.
 *
 * A file that cannot be opened is reported on stderr and skipped;
 * the remaining files are still processed.
 *
 * Returns the head of the paragraph list. This is NULL unless
 * read_file() stored something through the tail pointer.
 *
 * NOTE(review): nothing here closes in->currfp; presumably the
 * character reader closes it on reaching end of file - confirm.
 */
paragraph *read_input(input * in, indexdata * idx)
{
  paragraph *head = NULL;
  paragraph **hptr = &head;
  tree234 *macros;

  macros = newtree234(macrocmp);

  while (in->currindex < in->nfiles)
  {
    char *fname = in->filenames[in->currindex];

    in->currfp = fopen(fname, "r");
    if (in->currfp)
    {
      setpos(in, fname);
      read_file(&hptr, in, idx, macros);
    } else
    {
      /*
       * Previously this case was ignored without a word, so a
       * mistyped file name simply vanished from the output.
       * NOTE(review): if the project's error() has a suitable
       * code for this, route the message through it instead.
       */
      fprintf(stderr, "unable to open input file `%s'\n", fname);
    }
    in->currindex++;
  }

  macrocleanup(macros);
  return head;
}